#329 Fetch just container.yaml and content_sets.yml instead of cloning whole dist-git repo.
Merged 5 months ago by lucarval. Opened 5 months ago by jkaluza.
jkaluza/freshmaker fast-git  into  master

file modified
+42 -44

@@ -37,7 +37,7 @@ 

  import concurrent.futures

  from freshmaker import log, conf

  from freshmaker.kojiservice import koji_service

- from freshmaker.utils import sorted_by_nvr, clone_distgit_repo, temp_dir

+ from freshmaker.utils import sorted_by_nvr, get_distgit_files

  import koji

  

  

@@ -360,50 +360,48 @@ 

              namespace = "rpms"

              name = repository

  

-         prefix = "freshmaker-%s-%s-%s" % (namespace, name, commit)

-         with temp_dir(prefix=prefix) as repodir:

-             try:

-                 clone_distgit_repo(namespace, name, repodir, commit=commit,

-                                    ssh=False, logger=log)

-             except OSError as e:

-                 self.log_error("Error while cloning dist-git repo: %s" % e)

-                 return data

- 

-             content_sets_path = os.path.join(repodir, "content_sets.yml")

-             if not os.path.exists(content_sets_path):

-                 log.debug("%s: Should generate Pulp repo, %s does not exist.",

-                           nvr, content_sets_path)

-                 data["generate_pulp_repos"] = True

-                 return data

+         try:

+             files = get_distgit_files(

+                 namespace, name, commit, ["content_sets.yml", "container.yaml"],

+                 ssh=False, logger=log)

+         except OSError as e:

+             self.log_error("Error while fetching dist-git repo files: %s" % e)

+             return data

  

-             try:

-                 with open(content_sets_path, 'r') as f:

-                     content_sets_yaml = yaml.load(f)

-             except Exception as err:

-                 log.exception(err)

-                 data["generate_pulp_repos"] = True

-                 return data

- 

-             for content_sets in content_sets_yaml.values():

-                 data["content_sets"] += content_sets

- 

-             container_path = os.path.join(repodir, "container.yaml")

-             if not os.path.exists(container_path):

-                 log.debug("%s: Should generate Pulp repo, %s does not exist.",

-                           nvr, container_path)

-                 data["generate_pulp_repos"] = True

-                 return data

- 

-             with open(container_path, 'r') as f:

-                 container_yaml = yaml.load(f)

- 

-             if ("compose" not in container_yaml or

-                     "pulp_repos" not in container_yaml["compose"] or

-                     not container_yaml["compose"]["pulp_repos"]):

-                 log.debug("%s: Should generate Pulp repo, pulp_repos not "

-                           "enabled in %s.", nvr, container_path)

-                 data["generate_pulp_repos"] = True

-                 return data

+         content_sets_data = files["content_sets.yml"]

+         container_data = files["container.yaml"]

+ 

+         if content_sets_data is None:

+             log.debug("%s: Should generate Pulp repo, content_sets.yml does "

+                       "not exist.", nvr)

+             data["generate_pulp_repos"] = True

+             return data

+ 

+         try:

+             content_sets_yaml = yaml.load(content_sets_data)

Is this using PyYAML? I cannot find it in requirements.txt.

More importantly: yaml.safe_load() should be used instead if you don't trust source or don't need to load any fancy Python objects. [1]

[1] https://pyyaml.org/wiki/PyYAMLDocumentation

+         except Exception as err:

+             log.exception(err)

+             data["generate_pulp_repos"] = True

+             return data

+ 

+         for content_sets in content_sets_yaml.values():

+             data["content_sets"] += content_sets

+ 

+         if container_data is None:

+             log.debug("%s: Should generate Pulp repo, container.yaml does not "

+                       "exist.", nvr)

+             data["generate_pulp_repos"] = True

+             return data

+ 

+         container_yaml = yaml.load(container_data)

+ 

+         if ("compose" not in container_yaml or

+                 "pulp_repos" not in container_yaml["compose"] or

+                 not container_yaml["compose"]["pulp_repos"]):

+             log.debug("%s: Should generate Pulp repo, pulp_repos not "

+                       "enabled in containery.yaml.", nvr)

+             data["generate_pulp_repos"] = True

+             return data

  

          # This is workaround until OSBS-5919 is fixed.

          if "arches" in self and self["arches"] == "x86_64":

file modified
+71 -3

@@ -32,6 +32,8 @@ 

  import time

  import koji

  import kobo.rpmlib

+ import tarfile

+ import io

  

  from freshmaker import conf, app, log

  from freshmaker.types import ArtifactType

@@ -206,9 +208,19 @@ 

      return dest

  

  

- def clone_distgit_repo(namespace, name, dest, branch='master', ssh=True,

-                        user=None, logger=None, commit=None):

-     """clone a git repo"""

+ def get_distgit_url(namespace, name, ssh, user):

+     """

+     Returns the dist-git repository URL.

+ 

+     :param str namespace: Namespace in which the repository is located, for

+         example "rpms", "containers", "modules", ...

+     :param str name: Name of the repository inside the namespace.

+     :param bool ssh: If True, SSH auth will be used when fetching the files.

+     :param str user: If set, overrides the default user for SSH auth.

+ 

+     :rtype: str

+     :return: The dist-git repository URL.

+     """

      if ssh:

          if user is None:

              if hasattr(conf, 'git_user'):

@@ -220,10 +232,66 @@ 

          repo_url = conf.git_base_url

  

      repo_url = os.path.join(repo_url, namespace, name)

+     return repo_url

+ 

+ 

+ def clone_distgit_repo(namespace, name, dest, branch='master', ssh=True,

+                        user=None, logger=None, commit=None):

+     """clone a git repo"""

+     repo_url = get_distgit_url(namespace, name, ssh, user)

      return clone_repo(repo_url, dest, branch=branch, logger=logger,

                        commit=commit)

  

  

+ def get_distgit_files(

+         namespace, name, commit_or_branch, files, ssh=True, user=None,

+         logger=None):

+     """

+     Fetches the `files` from dist-git repository defined by `namespace`,

+     `name` and `commit_or_branch` and returns them.

+ 

+     This is much faster than cloning the dist-git repository and should be

+     preferred method to get the files from dist-git in case the full clone

+     of repository is not needed.

+ 

+     :param str namespace: Namespace in which the repository is located, for

+         example "rpms", "containers", "modules", ...

+     :param str name: Name of the repository inside the namespace.

+     :param str commit_or_branch: Commit hash or branch name.

+     :param list files: List of strings defining the files to fetch.

+     :param bool ssh: If True, SSH auth will be used when fetching the files.

+     :param str user: If set, overrides the default user for SSH auth.

+     :param freshmaker.log logger: Logger instance.

+ 

+     :rtype: dict

+     :return: Dictionary with file name as key and file content as value.

+         If the file does not exist in a dist-git repo, None is used as value.

+     """

+     repo_url = get_distgit_url(namespace, name, ssh, user)

+ 

+     # Use the "git archive" to get the files in tarball and then extract

+     # them and return in dict. We need to go file by file, because the

+     # "git archive" would fail completely in case any file does not exist

+     # in the git repo.

+     ret = {}

+     for f in files:

+         try:

+             cmd = ['git', 'archive', '--remote=%s' % repo_url,

I can't seem to get this to work with https only. It seems to require ssh.

UPDATE: That's because https is not supported. But that's ok, our configuration specifies the git base URL with the git:// protocol which is just fine.

+                    commit_or_branch, f]

You could fetch all the files at once. git archive accepts multiple path values.

+             tar_data = _run_command(cmd, logger=logger, return_output=True)

+             tar_bytes = io.BytesIO(tar_data)

+             tar = tarfile.open(fileobj=tar_bytes)

+             for member in tar.getmembers():

+                 ret[member.name] = tar.extractfile(member).read()

+         except OSError as e:

+             if "path not found" in str(e):

+                 ret[os.path.basename(f)] = None

+             else:

+                 raise

+ 

+     return ret

+ 

+ 

  def add_empty_commit(repo, msg="bump", author=None, logger=None):

      """Commit an empty commit to repo"""

      if author is None:

file modified
+51 -45

@@ -23,7 +23,7 @@ 

  import json

  import six

  

- from mock import call, patch, Mock, mock_open

+ from mock import call, patch, Mock

  from six.moves import http_client

  

  import freshmaker

@@ -147,32 +147,33 @@ 

  

          self.patcher = helpers.Patcher(

              "freshmaker.lightblue.")

-         self.clone_distgit_repo = self.patcher.patch("clone_distgit_repo")

-         self.path_exists = self.patcher.patch("os.path.exists")

-         self.patched_open = self.patcher.patch("open", create=True)

+         self.get_distgit_files = self.patcher.patch("get_distgit_files")

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": None,

+             "container.yaml": None,

+         }

  

      def tearDown(self):

          super(TestGetAdditionalDataFromDistGit, self).tearDown()

          self.patcher.unpatch_all()

  

      def test_generate(self):

-         self.path_exists.return_value = True

-         self.patched_open.side_effect = [

-             mock_open(read_data="x86_64:\n  - content_set").return_value,

-             mock_open(read_data="compose:\n  pulp_repos: True").return_value]

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": "x86_64:\n  - content_set",

+             "container.yaml": "compose:\n  pulp_repos: True",

+         }

  

          image = ContainerImage.create({"brew": {"build": "nvr"}})

          ret = image._get_additional_data_from_distgit(

              "rpms/foo-docker", "branch", "commit")

          self.assertEqual(ret["generate_pulp_repos"], False)

  

-         self.clone_distgit_repo.assert_called_once_with(

-             'rpms', 'foo-docker',

-             helpers.AnyStringWith('freshmaker-rpms-foo-docker'),

-             commit='commit', logger=log, ssh=False)

+         self.get_distgit_files.assert_called_once_with(

+             'rpms', 'foo-docker', "commit",

+             ["content_sets.yml", "container.yaml"], logger=log, ssh=False)

  

      def test_generate_os_error(self):

-         self.clone_distgit_repo.side_effect = OSError(

+         self.get_distgit_files.side_effect = OSError(

              "Got an error (128) from git: fatal: reference is not a tree: "

              "4d42e2009cec70d871c65de821396cd750d523f1")

  

@@ -183,35 +184,33 @@ 

  

          self.assertEqual(

              image["error"],

-             "Error while cloning dist-git repo: Got an error (128) from git: "

+             "Error while fetching dist-git repo files: Got an error (128) from git: "

              "fatal: reference is not a tree: 4d42e2009cec70d871c65de821396cd750d523f1")

  

-         self.clone_distgit_repo.assert_called_once_with(

-             'rpms', 'foo-docker',

-             helpers.AnyStringWith('freshmaker-rpms-foo-docker'),

-             commit='commit', logger=log, ssh=False)

+         self.get_distgit_files.assert_called_once_with(

+             'rpms', 'foo-docker', 'commit',

+             ["content_sets.yml", "container.yaml"], logger=log, ssh=False)

  

      def test_generate_no_namespace(self):

-         self.path_exists.return_value = True

-         self.patched_open.side_effect = [

-             mock_open(read_data="x86_64:\n  - content_set").return_value,

-             mock_open(read_data="compose:\n  pulp_repos: True").return_value]

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": "x86_64:\n  - content_set",

+             "container.yaml": "compose:\n  pulp_repos: True",

+         }

  

          image = ContainerImage.create({"brew": {"build": "nvr"}})

          ret = image._get_additional_data_from_distgit(

              "foo-docker", "branch", "commit")

          self.assertEqual(ret["generate_pulp_repos"], False)

  

-         self.clone_distgit_repo.assert_called_once_with(

-             'rpms', 'foo-docker',

-             helpers.AnyStringWith('freshmaker-rpms-foo-docker'),

-             commit='commit', logger=log, ssh=False)

+         self.get_distgit_files.assert_called_once_with(

+             'rpms', 'foo-docker', "commit",

+             ["content_sets.yml", "container.yaml"], logger=log, ssh=False)

  

      def test_generate_no_pulp_repos(self):

-         self.path_exists.return_value = True

-         self.patched_open.side_effect = [

-             mock_open(read_data="x86_64:\n  - content_set").return_value,

-             mock_open(read_data="compose:\n  pulp_repos_x: True").return_value]

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": "x86_64:\n  - content_set",

+             "container.yaml": "compose:\n  pulp_repos_x: True",

+         }

  

          image = ContainerImage.create({"brew": {"build": "nvr"}})

          ret = image._get_additional_data_from_distgit(

@@ -219,10 +218,10 @@ 

          self.assertEqual(ret["generate_pulp_repos"], True)

  

      def test_generate_pulp_repos_false(self):

-         self.path_exists.return_value = True

-         self.patched_open.side_effect = [

-             mock_open(read_data="x86_64:\n  - content_set").return_value,

-             mock_open(read_data="compose:\n  pulp_repos: False").return_value]

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": "x86_64:\n  - content_set",

+             "container.yaml": "compose:\n  pulp_repos: False",

+         }

  

          image = ContainerImage.create({"brew": {"build": "nvr"}})

          ret = image._get_additional_data_from_distgit(

@@ -230,9 +229,10 @@ 

          self.assertEqual(ret["generate_pulp_repos"], True)

  

      def test_generate_no_content_sets_yml(self):

-         def mocked_path_exists(path):

-             return not path.endswith("content_sets.yml")

-         self.path_exists.side_effect = mocked_path_exists

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": None,

+             "container.yaml": "compose:\n  pulp_repos: False",

+         }

  

          image = ContainerImage.create({"brew": {"build": "nvr"}})

          ret = image._get_additional_data_from_distgit(

@@ -240,12 +240,10 @@ 

          self.assertEqual(ret["generate_pulp_repos"], True)

  

      def test_generate_no_container_yaml(self):

-         def mocked_path_exists(path):

-             return not path.endswith("container.yaml")

-         self.path_exists.side_effect = mocked_path_exists

-         self.patched_open.side_effect = [

-             mock_open(read_data="x86_64:\n  - content_set").return_value,

-             mock_open(read_data="compose:\n  pulp_repos: False").return_value]

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": "x86_64:\n  - content_set",

+             "container.yaml": None,

+         }

  

          image = ContainerImage.create({"brew": {"build": "nvr"}})

          ret = image._get_additional_data_from_distgit(

@@ -260,7 +258,11 @@ 

  

          self.patcher = helpers.Patcher(

              'freshmaker.lightblue.')

-         self.clone_distgit_repo = self.patcher.patch("clone_distgit_repo")

+         self.get_distgit_files = self.patcher.patch("get_distgit_files")

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": None,

+             "container.yaml": None,

+         }

  

          self.dummy_image = ContainerImage.create({

              '_id': '1233829',

@@ -516,7 +518,11 @@ 

  

          self.patcher = helpers.Patcher(

              'freshmaker.lightblue.')

-         self.clone_distgit_repo = self.patcher.patch("clone_distgit_repo")

+         self.get_distgit_files = self.patcher.patch("get_distgit_files")

+         self.get_distgit_files.return_value = {

+             "content_sets.yml": None,

+             "container.yaml": None,

+         }

  

          self.fake_server_url = 'lightblue.localhost'

          self.fake_cert_file = 'path/to/cert'

Currently, Freshmaker is cloning the git repository of every image it is going
to rebuild to find out the content of container.yaml and content_sets.yml to
decide whether it needs to generate Pulp composes itself or if the OSBS can
do it iself.

Cloning whole repository is slow and downloads unnecessary files.

In this commit, new get_distgit_files method is introduce which uses
"git archive" instead of "git clone" to get only particular files which
are needed by Freshmaker from the dist-git repository.

This is much faster and also does not consume any extra resources.

rebased onto 51cc2ec

5 months ago

I can't seem to get this to work with https only. It seems to require ssh.

UPDATE: That's because https is not supported. But that's ok, our configuration specifies the git base URL with the git:// protocol which is just fine.

You could fetch all the files at once. git archive accepts multiple path values.

:+1: Minor nitpick comment, but nothing that should block it from being merged.

@cqi, could you take a look?

Is this using PyYAML? I cannot find it in requirements.txt.

More importantly: yaml.safe_load() should be used instead if you don't trust source or don't need to load any fancy Python objects. [1]

[1] https://pyyaml.org/wiki/PyYAMLDocumentation

@lholecek, I'll follow up with a different PR for that, thanks!

Commit e61f115 fixes this pull-request

Pull-Request has been merged by lucarval

5 months ago

Pull-Request has been merged by lucarval

5 months ago