From 7e6bed9713b8a011dccb5bc164084b3547d796cd Mon Sep 17 00:00:00 2001
From: Haibo Lin <hlin@redhat.com>
Date: Jul 21 2020 09:15:25 +0000
Subject: Retry buildinstall tasks on losetup error


JIRA: RHELCMP-1394
Signed-off-by: Haibo Lin <hlin@redhat.com>

---

diff --git a/pungi/phases/buildinstall.py b/pungi/phases/buildinstall.py
index 63dd7e3..dfaac45 100644
--- a/pungi/phases/buildinstall.py
+++ b/pungi/phases/buildinstall.py
@@ -801,6 +801,10 @@ class BuildinstallThread(WorkerThread):
                 weight=compose.conf["runroot_weights"].get("buildinstall"),
             )
         else:
+            try:
+                lorax_log_dir = _get_log_dir(compose, variant, arch)
+            except Exception:
+                lorax_log_dir = None
             runroot.run(
                 cmd,
                 log_file=log_file,
@@ -809,6 +813,7 @@ class BuildinstallThread(WorkerThread):
                 mounts=[compose.topdir],
                 weight=compose.conf["runroot_weights"].get("buildinstall"),
                 chown_paths=chown_paths,
+                log_dir=lorax_log_dir,
             )
 
         if final_output_dir != output_dir:
diff --git a/pungi/runroot.py b/pungi/runroot.py
index fb1a6df..322a1b8 100644
--- a/pungi/runroot.py
+++ b/pungi/runroot.py
@@ -74,12 +74,38 @@ class Runroot(kobo.log.LoggingBase):
         run(command, show_cmd=True, logfile=log_file)
         self._result = True
 
+    def _has_losetup_error(self, log_dir):
+        """
+        Tell whether program.log in log_dir reports a losetup failure.
+
+        The failure appears when the Koji builder runs out of loopback
+        devices, e.g. because too many tasks needing them were scheduled on
+        the same builder. Retrying can help: the new task might land on a
+        different builder, or a competing task may have finished already.
+
+        :param str log_dir: path to buildinstall log dir,
+            e.g. logs/s390x/buildinstall-BaseOS-logs/
+        :returns: True only if program.log was read and contains the
+            losetup message; False for empty log_dir or an unreadable log.
+        """
+        marker = "losetup: cannot find an unused loop device"
+        if not log_dir:
+            return False
+        program_log = os.path.join(log_dir, "program.log")
+        try:
+            with open(program_log) as log:
+                return any(marker in entry for entry in log)
+        except Exception:
+            # Best effort: a missing or unreadable log means "no known error".
+            return False
+
     def _run_koji(self, command, log_file=None, packages=None, arch=None, **kwargs):
         """
         Runs the runroot command in Koji.
         """
         runroot_channel = self.compose.conf.get("runroot_channel")
         runroot_tag = self.compose.conf["runroot_tag"]
+        log_dir = kwargs.pop("log_dir", None)
 
         koji_wrapper = kojiwrapper.KojiWrapper(self.compose.conf["koji_profile"])
         koji_cmd = koji_wrapper.get_runroot_cmd(
@@ -92,13 +118,19 @@ class Runroot(kobo.log.LoggingBase):
             **kwargs
         )
 
-        output = koji_wrapper.run_runroot_cmd(koji_cmd, log_file=log_file)
-        if output["retcode"] != 0:
-            raise RuntimeError(
-                "Runroot task failed: %s. See %s for more details."
-                % (output["task_id"], log_file)
-            )
-        self._result = output
+        attempt = 0
+        max_retries = 3
+        while True:
+            output = koji_wrapper.run_runroot_cmd(koji_cmd, log_file=log_file)
+            if output["retcode"] == 0:
+                self._result = output
+                return
+            elif attempt >= max_retries or not self._has_losetup_error(log_dir):
+                raise RuntimeError(
+                    "Runroot task failed: %s. See %s for more details."
+                    % (output["task_id"], log_file)
+                )
+            attempt += 1
 
     def _ssh_run(self, hostname, user, command, fmt_dict=None, log_file=None):
         """
diff --git a/tests/test_runroot.py b/tests/test_runroot.py
index 2328904..95f4ca1 100644
--- a/tests/test_runroot.py
+++ b/tests/test_runroot.py
@@ -198,3 +198,37 @@ class TestRunrootOpenSSH(helpers.PungiTestCase):
                 ),
             ]
         )
+
+
+class TestRunrootKoji(helpers.PungiTestCase):
+    """Tests for the Koji runroot path: losetup detection and retrying."""
+
+    def setUp(self):
+        super(TestRunrootKoji, self).setUp()
+        conf = {"runroot": True, "runroot_tag": "f28-build"}
+        self.compose = helpers.DummyCompose(self.topdir, conf)
+        self.runroot = Runroot(self.compose)
+
+    def test_has_losetup_error(self):
+        # No log dir at all means there is nothing to inspect.
+        self.assertFalse(self.runroot._has_losetup_error(None))
+
+        empty_log = mock.mock_open(read_data="")
+        with mock.patch("pungi.runroot.open", empty_log):
+            self.assertFalse(self.runroot._has_losetup_error("/foo_log_dir"))
+
+        message = "losetup: cannot find an unused loop device"
+        with mock.patch("pungi.runroot.open", mock.mock_open(read_data=message)):
+            self.assertTrue(self.runroot._has_losetup_error("/bar_log_dir"))
+
+    @mock.patch("pungi.runroot.kojiwrapper.KojiWrapper")
+    def test_run_koji_retry(self, mock_kojiwrapper):
+        # First task fails with a losetup error, the retried one succeeds.
+        wrapper = mock_kojiwrapper.return_value
+        wrapper.get_runroot_cmd.return_value = ["df -h"]
+        failed, passed = {"retcode": 1, "task_id": 1}, {"retcode": 0, "task_id": 2}
+        wrapper.run_runroot_cmd.side_effect = [failed, passed]
+        self.compose.conf["koji_profile"] = "test"
+        self.runroot._has_losetup_error = mock.Mock(side_effect=[True, False])
+        self.runroot._run_koji("")
+        self.assertEqual(wrapper.run_runroot_cmd.call_count, 2)