#12529 [WIP] Adding retry logic to the mass-rebuild script
Merged 18 days ago by jnsamyak. Opened 2 months ago by jnsamyak.
jnsamyak/releng f42_rebuild_retry into main

@@ -0,0 +1,46 @@ 
+ import unittest
+ from unittest.mock import patch, MagicMock
+ 
+ # Mock environment and constants
+ MAX_RETRIES = 3
+ RETRY_DELAY = 1
+ 
+ # Mock function to simulate command behavior
+ def mock_function(success_on_attempt):
+     """Simulates a function that succeeds on the given attempt."""
+     attempt = {'count': 0}
+ 
+     def inner_function(*args, **kwargs):
+         attempt['count'] += 1
+         if attempt['count'] >= success_on_attempt:
+             return 0  # Simulate success
+         return 1  # Simulate failure
+ 
+     return inner_function
+ 
+ class TestRetryMechanism(unittest.TestCase):
+ 
+     @patch('time.sleep', return_value=None)
+     def test_retry_successful(self, _):
+         """Test if retry succeeds within the allowed attempts."""
+         retryable_function = mock_function(success_on_attempt=2)
+         result = retry(retryable_function)
+         self.assertEqual(result, 0, "The retry mechanism should succeed when the function succeeds within retries.")
+ 
+     @patch('time.sleep', return_value=None)
+     def test_retry_failure(self, _):
+         """Test if retry fails after exhausting all attempts."""
+         retryable_function = mock_function(success_on_attempt=MAX_RETRIES + 1)
+         result = retry(retryable_function)
+         self.assertEqual(result, 1, "The retry mechanism should fail when the function never succeeds.")
+ 
+     @patch('time.sleep', return_value=None)
+     def test_retry_logging(self, mock_sleep):
+         """Test if retry logs correct retry attempts."""
+         retryable_function = mock_function(success_on_attempt=MAX_RETRIES + 1)
+         with patch('builtins.print') as mock_print:
+             retry(retryable_function)
+             self.assertGreaterEqual(mock_print.call_count, MAX_RETRIES, "Retry attempts should be logged.")
+ 
+ if __name__ == '__main__':
+     unittest.main()
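One gap in the test hunk above: the tests call retry() but never import it, so they fail with a NameError as written. Because the mass-rebuild script is not an importable module name, a minimal sketch of one way to wire this up would be to split the helper into a small importable module (retry_util below is a hypothetical name, not part of this PR):

    # Hedged sketch: make retry() importable for the tests.
    # "retry_util" is a hypothetical module name; in this PR the function
    # lives inside the mass-rebuild script itself.
    from retry_util import retry, MAX_RETRIES, RETRY_DELAY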

file modified
+62 -54
@@ -15,11 +15,16 @@ 
  import subprocess
  import sys
  import operator
+ import time
  
  # contains info about all rebuilds, add new rebuilds there and update rebuildid
  # here
  from massrebuildsinfo import MASSREBUILDS
  
+ # Configuration for retry logic
+ MAX_RETRIES = 3  # Number of retries
+ RETRY_DELAY = 5  # Delay in seconds between retries
+ 
  # Set some variables
  # Some of these could arguably be passed in as args.
  rebuildid = 'f42'
@@ -29,59 +34,65 @@ 
  workdir = os.path.expanduser('~/massbuild')
  enviro = os.environ
  
- 
- # Define functions
- 
- # This function needs a dry-run like option
+ # Retry logic wrapper function
+ def retry(func, *args, retries=MAX_RETRIES, delay=RETRY_DELAY, **kwargs):
+     """Retry logic wrapper function."""
+     for attempt in range(retries):
+         result = func(*args, **kwargs)
+         if result == 0:  # Success
+             return 0
+         print(f"Attempt {attempt + 1} failed. Retrying in {delay} seconds...")
+         time.sleep(delay)
+     print(f"All {retries} attempts failed.")
+     return 1
+ 
+ # Updated buildmeoutput with retry logic
  def buildmeoutput(cmd, action, pkg, env, cwd=workdir):
-     """Simple function to run a command and return 0 for success, 1 for
-        failure.  It also writes the taskID and name to a file and console.
-        cmd is the command and arguments, action is aname for the action (for
-        logging), pkg is the name of the packagebeing operated on, env is the
-        environment dict, and cwd is where the script should be executed from."""
- 
-     try:
-         output = subprocess.check_output(cmd, env=env, cwd=cwd).decode('utf-8').split()
-         with open(workdir+"/taskID_file", 'a') as task_file:
-             task_file.write('%s %s\n' % (pkg, output[2]))
-         sys.stdout.write('  Successful submission: %s  taskID: %s\n' % (pkg, output[2]))
-     except subprocess.CalledProcessError as e:
-         sys.stderr.write('%s failed %s: %s\n' % (pkg, action, e))
-         return 1
-     return 0
- 
- # This function needs a dry-run like option
+     def attempt():
+         try:
+             output = subprocess.check_output(cmd, env=env, cwd=cwd).decode('utf-8').split()
+             with open(workdir + "/taskID_file", 'a') as task_file:
+                 task_file.write('%s %s\n' % (pkg, output[2]))
+             sys.stdout.write('  Successful submission: %s  taskID: %s\n' % (pkg, output[2]))
+             return 0
+         except subprocess.CalledProcessError as e:
+             sys.stderr.write('%s failed %s: %s\n' % (pkg, action, e))
+             return 1
+     return retry(attempt)
+ 
+ # Updated runme with retry logic
  def runme(cmd, action, pkg, env, cwd=workdir):
-     """Simple function to run a command and return 0 for success, 1 for
-        failure.  cmd is a list of the command and arguments, action is a
-        name for the action (for logging), pkg is the name of the package
-        being operated on, env is the environment dict, and cwd is where
-        the script should be executed from."""
- 
-     try:
-         subprocess.check_call(cmd, env=env, cwd=cwd)
-     except subprocess.CalledProcessError as e:
-         sys.stderr.write('%s failed %s: %s\n' % (pkg, action, e))
-         return 1
-     return 0
- 
- # This function needs a dry-run like option
+     def attempt():
+         try:
+             subprocess.check_call(cmd, env=env, cwd=cwd)
+             return 0
+         except subprocess.CalledProcessError as e:
+             sys.stderr.write('%s failed %s: %s\n' % (pkg, action, e))
+             return 1
+     return retry(attempt)
+ 
+ # Updated runmeoutput with retry logic
  def runmeoutput(cmd, action, pkg, env, cwd=workdir):
-     """Simple function to run a command and return output if successful. 
-        cmd is a list of the command and arguments, action is a
-        name for the action (for logging), pkg is the name of the package
-        being operated on, env is the environment dict, and cwd is where
-        the script should be executed from.  Returns 0 for failure"""
- 
-     try:
-         pid = subprocess.Popen(cmd, env=env, cwd=cwd,
-                                stdout=subprocess.PIPE, encoding='utf8')
-     except BaseException as e:
-         sys.stderr.write('%s failed %s: %s\n' % (pkg, action, e))
-         return 0
-     result = pid.communicate()[0].rstrip('\n')
-     return result
- 
+     def attempt():
+         try:
+             pid = subprocess.Popen(cmd, env=env, cwd=cwd,
+                                    stdout=subprocess.PIPE, encoding='utf8')
+             result = pid.communicate()[0].rstrip('\n')
+             if pid.returncode == 0:
+                 return result
+             else:
+                 return 0
+         except BaseException as e:
+             sys.stderr.write('%s failed %s: %s\n' % (pkg, action, e))
+             return 0
+     for attempt_number in range(MAX_RETRIES):
+         result = attempt()
+         if result:
+             return result
+         print(f"Attempt {attempt_number + 1} failed. Retrying in {RETRY_DELAY} seconds...")
+         time.sleep(RETRY_DELAY)
+     print(f"All {MAX_RETRIES} attempts failed for {pkg}.")
+     return 0
  
  # Environment for using releng credentials for pushing and building
  enviro['GIT_SSH'] = '/usr/local/bin/relengpush'
@@ -110,7 +121,6 @@ 
          continue
  
      # Query to see if a build has already been attempted
-     # this version requires newer koji:
      builds = kojisession.listBuilds(id, createdAfter=massrebuild['epoch'])
      newbuild = False
      # Check the builds to make sure they were for the target we care about
@@ -119,7 +129,6 @@ 
              buildtarget = kojisession.getTaskInfo(build['task_id'],
                                         request=True)['request'][1]
              if buildtarget == massrebuild['target'] or buildtarget in massrebuild['targets']:
-                 # We've already got an attempt made, skip.
                  newbuild = True
                  break
          except:
@@ -142,7 +151,6 @@ 
  
      # Check for a noautobuild file
      if os.path.exists(os.path.join(workdir, name, 'noautobuild')):
-         # Maintainer does not want us to auto build.
          print('Skipping %s due to opt-out' % name)
          continue
  
@@ -182,6 +190,7 @@ 
      if runme(commit, 'commit', name, enviro,
                   cwd=os.path.join(workdir, name)):
          continue
+ 
      # git push
      push = ['git', 'push', '--no-verify']
      print('Pushing changes for %s' % name)
@@ -189,7 +198,6 @@ 
                   cwd=os.path.join(workdir, name)):
          continue
  
- 
      # get git url
      urlcmd = ['fedpkg', 'giturl']
      print('Getting git url for %s' % name)

In the updated script, we introduced a retry mechanism to enhance the robustness of the build process.
The retry logic ensures that temporary infrastructure or network issues do not cause permanent failures.

Here’s a summary of what we added:

  1. Retry Wrapper Function (retry):
     - A generic function that wraps other functions and retries failed operations.
     - Configurable retry count (MAX_RETRIES) and delay (RETRY_DELAY) between attempts.
  2. Integration into Core Functions:
     - buildmeoutput, runme, and runmeoutput were modified to use the retry mechanism.
     - Commands are retried on failure, and a message is logged for each attempt.
  3. Improved Logging and Error Handling:
     - Informative messages are printed for each retry attempt and for final failures.
     - All attempts are logged, making debugging easier.
  4. Reusability:
     - The retry mechanism is generic, so it can be reused in other parts of the script if needed (see the usage sketch after this list).
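For illustration, a minimal usage sketch of the wrapper outside the three wrapped functions, assuming it runs inside the script where subprocess, sys, enviro, workdir and MAX_RETRIES are already defined (the fedpkg command below is only an example, not something the PR adds):

    # Sketch: reuse retry() for any callable that returns 0 on success, 1 on failure.
    def check_verrel():
        try:
            subprocess.check_call(['fedpkg', 'verrel'], env=enviro, cwd=workdir)
            return 0  # success stops the retry loop
        except subprocess.CalledProcessError:
            return 1  # failure triggers another attempt

    if retry(check_verrel) != 0:
        sys.stderr.write('giving up after %d attempts\n' % MAX_RETRIES)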

Signed-off-by: Samyak Jain <samyak.jn11@gmail.com>

This looks pretty nice for making sure submitting builds doesn't fail as much...

The other place in the past we have seen problems is git pushing the rebuild commit. Would it be difficult to add retry for that if it fails?

Couple of minor points:

attempt() is a local function which returns 0 or 1 in buildmeoutput() / runme() and uses retry(), but returns 0 or "result" in runmeoutput() and can't use the retry() function.

retry() also has an attempt variable.

It's probably worth putting some randomization into the sleep() logic, and waiting a bit longer than 3 seconds (failures aren't that common, right?) ... so it waits somewhere between 30s and 300s (5m), say. That also means that if something temporary happens, it doesn't fail 3 times in less than 10s.
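A minimal sketch of that suggestion, keeping the shape of the PR's retry() but drawing a random delay between the bounds mentioned in this comment (the MIN_DELAY/MAX_DELAY constants are taken from the comment, not from the merged code):

    import random
    import time

    MAX_RETRIES = 3   # as in the PR
    MIN_DELAY = 30    # seconds; lower bound suggested here
    MAX_DELAY = 300   # seconds; upper bound suggested here (5 minutes)

    def retry(func, *args, retries=MAX_RETRIES, **kwargs):
        """Retry wrapper with a randomized delay between attempts."""
        for attempt_number in range(retries):
            if func(*args, **kwargs) == 0:
                return 0
            delay = random.uniform(MIN_DELAY, MAX_DELAY)
            print(f"Attempt {attempt_number + 1} failed. Retrying in {delay:.0f} seconds...")
            time.sleep(delay)
        print(f"All {retries} attempts failed.")
        return 1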

But AFAICS, it should be better than nothing as is.

To be clear, I'm not saying you should do a bunch of work to integrate all the code. If you want to clean up the first point, just rename one or both functions, and maybe add a comment.
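For instance, a hedged sketch of that cleanup on runme(), renaming the inner helper so it no longer shares a name with the attempt loop variable inside retry() (run_once is an arbitrary choice, not a name from the PR):

    # Sketch: same behaviour as the PR's runme(), with the inner helper renamed.
    def runme(cmd, action, pkg, env, cwd=workdir):
        def run_once():
            try:
                subprocess.check_call(cmd, env=env, cwd=cwd)
                return 0
            except subprocess.CalledProcessError as e:
                sys.stderr.write('%s failed %s: %s\n' % (pkg, action, e))
                return 1
        return retry(run_once)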

Going on a tangent from James’s comments, the signalling in that script looks wonky to me – it should use exceptions for flagging errors instead of special return values. However, this isn’t a fault of this PR and probably should be addressed outside of it.
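A hedged sketch of what exception-based signalling could look like, reusing the script's existing imports and constants; this only illustrates the comment above and is not code from the PR:

    # Sketch: let failures raise instead of returning 0/1.
    class RetriesExhausted(Exception):
        """Raised when a command keeps failing after all retry attempts."""

    def retry(func, *args, retries=MAX_RETRIES, delay=RETRY_DELAY, **kwargs):
        for attempt_number in range(retries):
            try:
                return func(*args, **kwargs)  # the wrapped call raises on failure
            except subprocess.CalledProcessError as e:
                print(f"Attempt {attempt_number + 1} failed: {e}. Retrying in {delay} seconds...")
                time.sleep(delay)
        raise RetriesExhausted(f"all {retries} attempts failed")

Callers would then wrap the call sites in try/except (or let the exception propagate) instead of checking a return value.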

If depending on a 3rd party Python package is okay for this script, take a look at backoff. It implements retrying with increasing backoff times, but works as a decorator on functions. You can specify that certain exceptions or return values flag a need for retry, so it should work with the existing code base.
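A hedged sketch of how that might look on runme(), assuming backoff's on_predicate decorator (which retries while the predicate holds on the return value) and keeping the existing 0/1 convention; the max_tries and max_time values are illustrative only:

    import subprocess
    import sys

    import backoff

    # Sketch only: retry runme() with exponential backoff while it returns non-zero.
    @backoff.on_predicate(backoff.expo, lambda result: result != 0,
                          max_tries=3, max_time=300)
    def runme(cmd, action, pkg, env, cwd=None):
        try:
            subprocess.check_call(cmd, env=env, cwd=cwd)
            return 0
        except subprocess.CalledProcessError as e:
            sys.stderr.write('%s failed %s: %s\n' % (pkg, action, e))
            return 1

Because it can key off return values as well as exceptions, it could drop in without changing how callers check the result, and it applies jitter to the wait times by default, which lines up with the randomization suggested above.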

So... the mass rebuild is ready to fire off now. Should we try and wait to address comments here? Or should we just merge this? Or should we just do the mass rebuild without it?

I would 100% merge it before running the script tomorrow morning. It can be tidied up later, but it's going to make the mass rebuild less of a pain as is.

rebased onto a0860e3

18 days ago

Merging this, and will handle the cleanups in the next PR! Thanks everyone for the reviews :D

Pull-Request has been merged by jnsamyak

18 days ago