#326 Improvements for downloading repodata
Merged 4 years ago by lsedlar. Opened 4 years ago by lsedlar.
lsedlar/odcs download-refactor  into  master

file modified
+3 -1
@@ -202,7 +202,9 @@ 

      headers = {

          'authorization': 'Bearer {0}'.format(token)

      }

-     r = requests.get(conf.auth_openidc_userinfo_uri, headers=headers)

+     r = requests.get(

+         conf.auth_openidc_userinfo_uri, headers=headers, timeout=conf.net_timeout

Is the timeout working the way you expect here? I've just briefly checked the documentation, and it seems it defines the timeout for the first response from the server. So if the server is sending data at 1 KB/s, the timeout won't be reached, because some data will actually arrive within each conf.net_timeout interval.

I think what you need is a timeout for the complete response, not just for the first bytes received.

But this could also improve the situation in case Pulp is really just sitting there and not answering at all, so I'm not against merging this. I just want to be sure what this code actually does.

+     )

      if r.status_code != 200:

          # In Fedora, the manually created service tokens can't be used with the UserInfo

          # endpoint. We treat this as an empty response - and hence an empty group list. An empty

@@ -47,17 +47,21 @@ 

          :param str path: Path to store the file to.

          :param str url: URL of file to download.

          :rtype: str

-         :return: content of downloaded file.

+         :return: path to the downloaded file

          """

          log.info("%r: Downloading %s", self.compose, url)

-         r = requests.get(url)

+         r = requests.get(url, timeout=conf.net_timeout, stream=True)

          r.raise_for_status()

  

          filename = os.path.basename(url)

          makedirs(os.path.join(path, "repodata"))

-         with open(os.path.join(path, "repodata", filename), "wb") as f:

-             f.write(r.content)

-         return r.content

+         outfile = os.path.join(path, "repodata", filename)

+         with open(outfile, "wb") as f:

+             for chunk in r.iter_content(chunk_size=None):

+                 f.write(chunk)

+         response_len = os.stat(outfile).st_size

+         log.info("%r: Downloaded %d bytes from %s", self.compose, response_len, url)

+         return outfile

  

      def _download_repodata(self, path, baseurl):

          """
@@ -77,7 +81,9 @@ 

  

          # Download the repomd.xml.

          repomd_url = os.path.join(baseurl, "repodata", "repomd.xml")

-         repomd = self._download_file(path, repomd_url)

+         repomd_path = self._download_file(path, repomd_url)

+         with open(repomd_path) as f:

+             repomd = f.read()

          tree = ElementTree.fromstring(repomd)

  

          # In case the repomd.xml did not change since the last compose, use

file modified
+4 -1
@@ -26,6 +26,7 @@ 

  import json

  import requests

  

+ from odcs.server import conf

  from odcs.server.mergerepo import MergeRepo

  from odcs.server.utils import retry

  
@@ -46,7 +47,9 @@ 

          r = requests.post(

              '{0}{1}'.format(self.rest_api_root, endpoint.lstrip('/')),

              query_data,

-             auth=(self.username, self.password))

+             auth=(self.username, self.password),

+             timeout=conf.net_timeout,

+         )

          r.raise_for_status()

          return r.json()

  

@@ -25,6 +25,8 @@ 

  import requests

  import productmd.common

  

+ from odcs.server import conf

+ 

  

  class PungiCompose(object):

      """Represents 3rd party Pungi Compose"""
@@ -47,7 +49,7 @@ 

          """

          Fetches the json file represented by `url`.

          """

-         r = requests.get(url)

+         r = requests.get(url, timeout=conf.net_timeout)

          r.raise_for_status()

          return r.json()

  

  • all requests.get and requests.post calls get a timeout
  • add a log message when download finishes
  • switch to streaming responses - we only care about content of repomd.xml, so we don't need to read potentially big filelists

Is the timeout working the way you expect here? I've just briefly checked the documentation, and it seems it defines the timeout for the first response from the server. So if the server is sending data at 1 KB/s, the timeout won't be reached, because some data will actually arrive within each conf.net_timeout interval.

I think what you need is a timeout for the complete response, not just for the first bytes received.

But this could also improve the situation in case Pulp is really just sitting there and not answering at all, so I'm not against merging this. I just want to be sure what this code actually does.

The timeout applies both to the initial connection to the server and to pauses between received chunks of data. You're right that it does not catch the case you describe, where data is received at a very slow but steady rate.

An overall timeout for the entire request doesn't seem to be supported by requests. Given that we haven't really seen many problems with this, I'm not sure it's a high-priority thing.

Pull-Request has been merged by lsedlar

4 years ago