#449 Retry while fetching from dist-git and revisit retry logic implementation
Merged 4 years ago by lucarval. Opened 4 years ago by yashn.

file modified
-4
@@ -25,10 +25,6 @@ 

      REQUESTS_TIMEOUT = (6.1, 15)

      REQUESTS_VERIFY = True

  

-     # General options for retrying failed operations (querying external services)

-     RETRY_TIMEOUT = 6

-     RETRY_INTERVAL = 2

- 

      POLICIES_DIR = '/etc/greenwave/policies'

  

      MESSAGING = 'fedmsg'

@@ -13,7 +13,6 @@ 

  import json

  

  import fedmsg.consumers

- import requests

  

  import greenwave.app_factory

  from greenwave.api_v1 import subject_type_identifier_to_list
@@ -23,6 +22,7 @@ 

      messaging_tx_sent_ok_counter, messaging_tx_failed_counter)

  from greenwave.policies import applicable_decision_context_product_version_pairs

  from greenwave.utils import right_before_this_time

+ from greenwave.request_session import get_requests_session

  

  try:

      import fedora_messaging.api
@@ -31,7 +31,7 @@ 

      pass

  

  

- requests_session = requests.Session()

+ requests_session = get_requests_session()

  

  

  log = logging.getLogger(__name__)

@@ -0,0 +1,24 @@ 

+ import requests

+ 

+ from requests.adapters import HTTPAdapter

+ from requests.packages.urllib3.util.retry import Retry

+ 

+ from greenwave import __version__

+ 

+ 

def get_requests_session():
    """Return a ``requests.Session`` configured with retry behaviour.

    The session retries failed requests (connection errors, read errors,
    and 500/502/503/504 responses) up to three times with exponential
    back-off, and identifies itself with a greenwave User-Agent header.
    """
    retry_strategy = Retry(
        total=3,
        read=3,
        connect=3,
        backoff_factor=1,
        status_forcelist=(500, 502, 503, 504),
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    http_session = requests.Session()
    # Attach the retrying adapter for both plain and TLS connections.
    for scheme in ('http://', 'https://'):
        http_session.mount(scheme, adapter)
    http_session.headers["User-Agent"] = f"greenwave {__version__}"
    return http_session

file modified
+12 -11
@@ -9,8 +9,6 @@ 

  import logging

  import re

  import json

- import requests

- import urllib3.exceptions

  from io import BytesIO

  import tarfile

  import subprocess
@@ -20,14 +18,12 @@ 

  from flask import current_app

  from werkzeug.exceptions import BadGateway

  

- from greenwave import __version__

  from greenwave.cache import cached

- import greenwave.utils

+ from greenwave.request_session import get_requests_session

  

  log = logging.getLogger(__name__)

  

- requests_session = requests.Session()

- requests_session.headers["User-Agent"] = f"greenwave {__version__}"

+ requests_session = get_requests_session()

  

  

  class ResultsRetriever(object):
@@ -85,7 +81,6 @@ 

  

  

  @cached

- @greenwave.utils.retry(wait_on=urllib3.exceptions.NewConnectionError)

  def retrieve_scm_from_koji(nvr):

      """ Retrieve cached rev and namespace from koji using the nvr """

      koji_url = current_app.config['KOJI_BASE_URL']
@@ -170,8 +165,16 @@ 

      dist_git_base_url = current_app.config['DIST_GIT_BASE_URL'].rstrip('/')

      dist_git_url = f'{dist_git_base_url}/{pkg_namespace}/{pkg_name}'

      cmd = ['git', 'archive', f'--remote={dist_git_url}', rev, 'gating.yaml']

-     git_archive = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

-     output, error_output = git_archive.communicate()

+     # Retry thrice if TimeoutExpired exception is raised

+     MAX_RETRY = 3

+     for tries in range(MAX_RETRY):

+         try:

+             git_archive = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

+             output, error_output = git_archive.communicate(timeout=30)

+             break

+         except subprocess.TimeoutExpired:

+             git_archive.kill()

+             continue

  

      if git_archive.returncode != 0:

          error_output = error_output.decode('utf-8')
@@ -190,7 +193,6 @@ 

  

  

  # NOTE - not cached, for now.

- @greenwave.utils.retry(wait_on=urllib3.exceptions.NewConnectionError)

  def retrieve_waivers(product_version, subject_type, subject_identifiers, when):

      if not subject_identifiers:

          return []
@@ -218,7 +220,6 @@ 

  

  

  # NOTE - not cached.

- @greenwave.utils.retry(timeout=300, interval=30, wait_on=urllib3.exceptions.NewConnectionError)

  def retrieve_decision(greenwave_url, data):

      timeout = current_app.config['REQUESTS_TIMEOUT']

      verify = current_app.config['REQUESTS_VERIFY']

file modified
+1 -32
@@ -9,38 +9,7 @@ 

  from werkzeug.exceptions import InternalServerError

  

  import greenwave.app_factory

- from greenwave.utils import json_error, retry

- 

- 

- def test_retry_passthrough():

-     """ Ensure that retry doesn't gobble exceptions. """

-     expected = "This is the exception."

- 

-     @retry(timeout=0.1, interval=0.1, wait_on=Exception)

-     def f():

-         raise Exception(expected)

- 

-     with pytest.raises(Exception) as actual:

-         f()

- 

-     assert expected in str(actual)

- 

- 

- def test_retry_count():

-     """ Ensure that retry doesn't gobble exceptions. """

-     expected = "This is the exception."

- 

-     calls = []

- 

-     @retry(timeout=0.3, interval=0.1, wait_on=Exception)

-     def f():

-         calls.append(1)

-         raise Exception(expected)

- 

-     with pytest.raises(Exception):

-         f()

- 

-     assert sum(calls) == 3

+ from greenwave.utils import json_error

  

  

  @pytest.mark.parametrize(('error, expected_status_code,'

file modified
-28
@@ -3,7 +3,6 @@ 

  import functools

  import logging

  import os

- import time

  import hashlib

  import datetime

  
@@ -114,33 +113,6 @@ 

      return response

  

  

- def retry(timeout=None, interval=None, wait_on=Exception):

-     """ A decorator that allows to retry a section of code...

-     ...until success or timeout.

- 

-     If omitted, the values for `timeout` and `interval` are

-     taken from the global configuration.

-     """

-     def wrapper(function):

-         @functools.wraps(function)

-         def inner(*args, **kwargs):

-             _timeout = timeout or current_app.config['RETRY_TIMEOUT']

-             _interval = interval or current_app.config['RETRY_INTERVAL']

-             # These can be configured per-function, or globally if omitted.

-             start = time.time()

-             while True:

-                 try:

-                     return function(*args, **kwargs)

-                 except wait_on as e:  # pylint: disable=broad-except

-                     log.warning("Exception %r raised from %r.  Retry in %rs",

-                                 e, function, _interval)

-                     time.sleep(_interval)

-                     if (time.time() - start) >= _timeout:

-                         raise  # This re-raises the last exception.

-         return inner

-     return wrapper

- 

- 

  def sha1_mangle_key(key):

      """

      Like dogpile.cache.util.sha1_mangle_key, but works correctly on

  • First commit adds logic to retry when fetching data from dist-git
  • Second commit adds retry logic to the session instead of retrying the entire method

The retry has to be introduced also in _retrieve_yaml_remote_rule_web.
At the moment the "git archive" mode is not used (internally), due to a bug -- and it will never be used in Fedora. So we need to add the logic to both mechanisms.

Is the timeout set somewhere?

The retry has to be introduced also in _retrieve_yaml_remote_rule_web.
At the moment the "git archive" mode is not used (internally), due to a bug -- and it will never be used in Fedora. So we need to add the logic to both mechanisms.

@yashn handles this in _retrieve_yaml_remote_rule_web by configuring the Python requests session to retry on connection errors.

It's unclear how much value there is in retrying the git archive mode, but I suppose some flaky network issue could prevent that from working. So I don't personally have a problem with it.

We don't actually supply a timeout to communicate, so this exception will never get raised. If this retry is implemented, perhaps it makes sense for it to retry on a different failure.

rebased onto 46d89e7

4 years ago

We don't actually supply a timeout to communicate, so this exception will never get raised. If this retry is implemented, perhaps it makes sense for it to retry on a different failure.

Nice catch! I added a timeout value and now it makes sure it kills the old process before starting a new one.

How about moving this to separate submodule request_session.py containing:

session = _get_requests_session()

BTW, remove the unused argument.

I think you can remove the retry decorator from utils submodule.

Can you also use REQUESTS_TIMEOUT variable and remove its redundant uses?

Actually, apart from REQUESTS_TIMEOUT, there are also options RETRY_TIMEOUT and RETRY_INTERVAL, this is very confusing.

How about moving this to separate submodule request_session.py containing:
session = _get_requests_session()

imho, I do not think we should have a separate submodule for this. I think it fits just fine in utils.py.

BTW, remove the unused argument.

I had missed this. Thanks for pointing it out. :)

Actually, apart from REQUESTS_TIMEOUT, there are also options RETRY_TIMEOUT and RETRY_INTERVAL, this is very confusing.

Sure, I will remove RETRY_TIMEOUT and RETRY_INTERVAL, but the usage of REQUESTS_TIMEOUT still seems relevant. Thoughts?

I did remove the retry decorator before but I think I missed it when I rebased it. Fixing it now :)

2 new commits added

  • Add retry logic to session instead of retrying the entire method
  • Add logic to retry when fetching data from dist-git
4 years ago

imho, I do not think we should have a separate submodule for this. I think it fits just fine in utils.py.

The problem with keeping submodules in a single file with a generic name (like utils.py or common.py) is that they can grow to a really huge size, containing a lot of unrelated stuff and lots of imports, which is difficult to read and maintain.

imho, I do not think we should have a separate submodule for this. I think it fits just fine in utils.py.

The problem with keeping submodules in a single file with a generic name (like utils.py or common.py) is that they can grow to a really huge size, containing a lot of unrelated stuff and lots of imports, which is difficult to read and maintain.

Agreed, but maybe we should move it to a separate file in case we end up with a huge file? At the same time, I don't feel it is required to have a separate submodule for a single method. But I would be happy to do it if you feel strongly about it. Thoughts? :)

I don't feel it is required to have a separate submodule for a single method.

Not required, but preferable. In small files it's easier to find stuff, make sense of the imports and see if something is unused.

But I would be happy to do it if you feel strongly about it.

I'm OK with merging this as is, but I have strong feeling that we will have to refactor the utils.py in future.

+1

2 new commits added

  • Add retry logic to session instead of retrying the entire method
  • Add logic to retry when fetching data from dist-git
4 years ago

made the change. :)

Just one thing I would like to bring up: the functional tests seem to fail when I use vagrant. The ones that interact with WaiverDB fail with requests.exceptions.HTTPError: 401 Client Error: UNAUTHORIZED for url: http://localhost:5004/api/v1.0/waivers/. I guess this is because of the access control we have recently added to waiverdb. Can I please ask if that's right, or am I missing something? Is there anything I can do to make it work using vagrant?

+1

Not sure what's wrong with vagrant (possibly the dependency list needs an update); I use only docker-compose.

@yashn I assume they fail due to the recent changes we made in waiverdb. I think we are good to merge this PR. I'm going to look now at how to solve the vagrant issue, and we can eventually make another PR for it.

Commit 54bf42d fixes this pull-request

Pull-Request has been merged by lucarval

4 years ago

Pull-Request has been merged by lucarval

4 years ago