#1426 Retry buildinstall tasks on losetup error
Merged 3 years ago by hlin. Opened 3 years ago by hlin.
hlin/pungi RHELCMP-1394 into master

@@ -801,6 +801,10 @@
                 weight=compose.conf["runroot_weights"].get("buildinstall"),
             )
         else:
+            try:
+                lorax_log_dir = _get_log_dir(compose, variant, arch)
+            except Exception:
+                lorax_log_dir = None
             runroot.run(
                 cmd,
                 log_file=log_file,
@@ -809,6 +813,7 @@
                 mounts=[compose.topdir],
                 weight=compose.conf["runroot_weights"].get("buildinstall"),
                 chown_paths=chown_paths,
+                log_dir=lorax_log_dir,
             )
 
         if final_output_dir != output_dir:
file modified: +39 -7
@@ -74,12 +74,38 @@
         run(command, show_cmd=True, logfile=log_file)
         self._result = True
 
+    def _has_losetup_error(self, log_dir):
+        """
+        Check if there's losetup error in log.
+
+        This error happens if the Koji builder runs out of loopback devices.
+        This can happen if too many tasks that require them are scheduled on
+        the same builder. A retried task might end up on a different builder,
+        or maybe some other task will have finished already.
+
+        :param str log_dir: path to buildinstall log dir,
+            e.g. logs/s390x/buildinstall-BaseOS-logs/
+        """
+        if not log_dir:
+            return False
+
+        log_file = os.path.join(log_dir, "program.log")
+        try:
+            with open(log_file) as f:
+                for line in f:
+                    if "losetup: cannot find an unused loop device" in line:
+                        return True
+        except Exception:
+            pass
+        return False
+
     def _run_koji(self, command, log_file=None, packages=None, arch=None, **kwargs):
         """
         Runs the runroot command in Koji.
         """
         runroot_channel = self.compose.conf.get("runroot_channel")
         runroot_tag = self.compose.conf["runroot_tag"]
+        log_dir = kwargs.pop("log_dir", None)
 
         koji_wrapper = kojiwrapper.KojiWrapper(self.compose.conf["koji_profile"])
         koji_cmd = koji_wrapper.get_runroot_cmd(
@@ -92,13 +118,19 @@
             **kwargs
         )
 
-        output = koji_wrapper.run_runroot_cmd(koji_cmd, log_file=log_file)
-        if output["retcode"] != 0:
-            raise RuntimeError(
-                "Runroot task failed: %s. See %s for more details."
-                % (output["task_id"], log_file)
-            )
-        self._result = output
+        attempt = 0
+        max_retries = 3
+        while True:
+            output = koji_wrapper.run_runroot_cmd(koji_cmd, log_file=log_file)
+            if output["retcode"] == 0:
+                self._result = output
+                return
+            elif attempt >= max_retries or not self._has_losetup_error(log_dir):
+                raise RuntimeError(
+                    "Runroot task failed: %s. See %s for more details."
+                    % (output["task_id"], log_file)
+                )
+            attempt += 1
 
     def _ssh_run(self, hostname, user, command, fmt_dict=None, log_file=None):
         """
file modified: +34
@@ -198,3 +198,37 @@
                 ),
             ]
         )
+
+
+class TestRunrootKoji(helpers.PungiTestCase):
+    def setUp(self):
+        super(TestRunrootKoji, self).setUp()
+        self.compose = helpers.DummyCompose(
+            self.topdir, {"runroot": True, "runroot_tag": "f28-build"},
+        )
+
+        self.runroot = Runroot(self.compose)
+
+    def test_has_losetup_error(self):
+        self.assertFalse(self.runroot._has_losetup_error(None))
+
+        with mock.patch("pungi.runroot.open", mock.mock_open(read_data="")):
+            self.assertFalse(self.runroot._has_losetup_error("/foo_log_dir"))
+
+        with mock.patch(
+            "pungi.runroot.open",
+            mock.mock_open(read_data="losetup: cannot find an unused loop device"),
+        ):
+            self.assertTrue(self.runroot._has_losetup_error("/bar_log_dir"))
+
+    @mock.patch("pungi.runroot.kojiwrapper.KojiWrapper")
+    def test_run_koji_retry(self, mock_kojiwrapper):
+        self.compose.conf["koji_profile"] = "test"
+        mock_kojiwrapper.return_value.get_runroot_cmd.return_value = ["df -h"]
+        mock_kojiwrapper.return_value.run_runroot_cmd.side_effect = [
+            {"retcode": 1, "task_id": 1},
+            {"retcode": 0, "task_id": 2},
+        ]
+        self.runroot._has_losetup_error = mock.Mock(side_effect=[True, False])
+        self.runroot._run_koji("")
+        self.assertEqual(mock_kojiwrapper.return_value.run_runroot_cmd.call_count, 2)

JIRA: RHELCMP-1394
Signed-off-by: Haibo Lin <hlin@redhat.com>

rebased onto 4ca7d8b8612366470b733aa062c53035a214d38a

3 years ago

rebased onto b19985917e90e158d84a379f7737e19ea8e3ffd6

3 years ago
return any(line for line in f if "losetup: cannot find ..." in line)

or some other structure without "re" would work. There is no complicated regular expression here. Or is there some reason for it?

I didn't find other issues.
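
For illustration, a minimal sketch of the suggested structure with the full message string from the patch spelled out; the standalone helper name is invented for the example, and no "re" is needed:

    import os


    def has_losetup_error(log_dir):
        # Same check as Runroot._has_losetup_error, expressed with any()
        # over the file iterator; lines are still read one at a time.
        if not log_dir:
            return False
        log_file = os.path.join(log_dir, "program.log")
        try:
            with open(log_file) as f:
                return any(
                    "losetup: cannot find an unused loop device" in line
                    for line in f
                )
        except Exception:
            return False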

rebased onto 75cf79c2df5317c0bdc5cc46b4b075cf2bc5b617

3 years ago

> return any(line for line in f if "losetup: cannot find ..." in line)
>
> or some other structure without "re" would work. There is no complicated regular expression here. Or is there some reason for it?

It's not necessary to use "re".

I'm not sure why the test failed here when it passed locally.

rebased onto f8846fb52269a42ac5726953bd5bd7405e970776

3 years ago

> I'm not sure why the test failed here when it passed locally.

It's Python 3 on my machine but Python 2 in Jenkins.
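
One plausible explanation, assuming the failure was in the mock_open-based test: older mock releases shipped for Python 2 do not support iterating a mock_open() handle with "for line in f", so the handle's __iter__ has to be configured by hand. A hedged sketch of such a workaround, written as the last few lines of test_has_losetup_error above might look (the trailing "\n" in the data is added here only so splitlines keeps realistic line endings):

    data = "losetup: cannot find an unused loop device\n"
    mocked_open = mock.mock_open(read_data=data)
    # Newer mock / Python 3 iterates the handle directly; on older mock
    # this makes "for line in f" yield the read_data lines.
    mocked_open.return_value.__iter__.return_value = iter(data.splitlines(True))

    with mock.patch("pungi.runroot.open", mocked_open):
        self.assertTrue(self.runroot._has_losetup_error("/bar_log_dir"))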

Can we add a comment describing the error and why retrying makes sense? Something like:

This error happens if the Koji builder runs out of loopback devices. This can happen if too many tasks that require them are scheduled on the same builder. A retried task might end up on a different builder, or maybe some other task will have finished already.

Could this work line by line and not read the entire file into memory?
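
For context, a small sketch of the difference this question is getting at; the function names and the needle parameter are illustrative only:

    def check_by_reading_everything(path, needle):
        # Loads the whole log file into memory before searching.
        with open(path) as f:
            return needle in f.read()


    def check_line_by_line(path, needle):
        # Streams the file; only one line is held in memory at a time,
        # which is the behaviour of the merged _has_losetup_error.
        with open(path) as f:
            return any(needle in line for line in f)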

rebased onto 0e7d7e5369eaf75af36561170b99c7057f6398a7

3 years ago

rebased onto 7e6bed9

3 years ago

Looks good to me. :thumbsup:

Pull-Request has been merged by hlin

3 years ago