#502 Try hard to find the repositories for an image
Merged 4 years ago by lucarval. Opened 4 years ago by lucarval.

file modified
+35 -6
@@ -419,6 +419,29 @@ 

              return

          return rpm_manifest["rpms"]

  

+     def get_registry_repositories(self, lb_instance):

+         if self['repositories']:

+             return self['repositories']

+ 

+         parsed_nvr = kobo.rpmlib.parse_nvr(self.nvr)

+ 

+         if '.' not in parsed_nvr['release']:

+             log.debug('There are no repositories for %s', self.nvr)

+             return []

+ 

+         original_release = parsed_nvr['release'].rsplit('.', 1)[0]

+         parsed_nvr['release'] = original_release

+         original_nvr = '{name}-{version}-{release}'.format(**parsed_nvr)

+         log.debug('Finding repositories for %s through %s', self.nvr, original_nvr)

+ 

+         previous_images = lb_instance.get_images_by_nvrs(

+             [original_nvr], published=None, include_rpm_manifest=False)

+         if not previous_images:

+             log.warning('original_nvr %s not found in Lightblue', original_nvr)

+             return []

+ 

+         return previous_images[0].get_registry_repositories(lb_instance)

+ 

  

  class LightBlue(object):

      """Interface to query lightblue"""
@@ -1248,12 +1271,7 @@ 

              for image_id, images in enumerate(to_rebuild):

                  for parent_id, image in enumerate(images):

                      nvr = image.nvr

-                     # Also include the sorted names of repositories in the image group

-                     # to handle the case when different releases of single name-version are

-                     # included in different container repositories.

-                     repository_key = "-".join(sorted([r["repository"] for r in image["repositories"]]))

-                     parsed_nvr = koji.parse_NVR(nvr)

-                     image_group = "%s-%s-%s" % (parsed_nvr["name"], parsed_nvr["version"], repository_key)

+                     image_group = self.describe_image_group(image)

                      if image_group not in image_group_to_nvrs:

                          image_group_to_nvrs[image_group] = []

                      if nvr not in image_group_to_nvrs[image_group]:
@@ -1349,6 +1367,17 @@ 

  

          return to_rebuild

  

+     # Cache to avoid multiple calls. We want one call per nvr, not one per arch

+     @region.cache_on_arguments(to_str=lambda image: image.nvr)

+     def describe_image_group(self, image):

+         # Also include the sorted names of repositories in the image group

+         # to handle the case when different releases of single name-version are

+         # included in different container repositories.

+         repositories = image.get_registry_repositories(self)

+         repository_key = sorted([r["repository"] for r in repositories])

+         parsed_nvr = koji.parse_NVR(image.nvr)

+         return "%s-%s-%s" % (parsed_nvr["name"], parsed_nvr["version"], repository_key)

+ 

      def _images_to_rebuild_to_batches(self, to_rebuild, directly_affected_nvrs):

          """

          Creates batches with images as defined by `find_images_to_rebuild`

@@ -20,13 +20,10 @@ 

  # SOFTWARE.

  

  import json

- import os

- import sys

  import unittest

  

  from unittest import mock

  

- sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa

  from tests import get_fedmsg, helpers

  

  from freshmaker import db, events, models

file modified
+101
@@ -1749,6 +1749,107 @@ 

          self.assertEqual(len(ret), 1)

          self.assertIsNotNone(ret[0][0].get("parent"))

  

+     @patch("freshmaker.lightblue.LightBlue.get_images_by_nvrs")

+     @patch('freshmaker.lightblue.LightBlue.find_images_with_packages_from_content_set')

+     @patch('freshmaker.lightblue.LightBlue.find_parent_images_with_package')

+     @patch('os.path.exists')

+     def test_dedupe_dependency_images_with_all_repositories(

+             self, exists, find_parent_images_with_package,

+             find_images_with_packages_from_content_set, get_images_by_nvrs):

+         exists.return_value = True

+ 

+         vulnerable_srpm_name = 'oh-noes'

+         vulnerable_srpm_nvr = '{}-1.0-1'.format(vulnerable_srpm_name)

+ 

+         ubi_image_template = {

+             "brew": {"package": "ubi8-container", "build": "ubi8-container-8.1-100"},

+             "parent_image_builds": {},

+             "repository": "containers/ubi8",

+             "commit": "2b868f757977782367bf624373a5fe3d8e6bacd6",

+             "repositories": [{"repository": "ubi8"}],

+             "rpm_manifest": [{

+                 "rpms": [

+                     {"srpm_name": vulnerable_srpm_name}

+                 ]

+             }]

+         }

+ 

+         directly_affected_ubi_image = ContainerImage.create(copy.deepcopy(ubi_image_template))

+ 

+         dependency_ubi_image_data = copy.deepcopy(ubi_image_template)

+         dependency_ubi_image_nvr = directly_affected_ubi_image.nvr + ".12345678"

+         dependency_ubi_image_data["brew"]["build"] = dependency_ubi_image_nvr

+         # A dependecy image is not directly published

+         dependency_ubi_image_data["repositories"] = []

+         dependency_ubi_image = ContainerImage.create(dependency_ubi_image_data)

+ 

+         python_image = ContainerImage.create({

+             "brew": {"package": "python-36-container", "build": "python-36-container-1-10"},

+             "parent_brew_build": directly_affected_ubi_image.nvr,

+             "parent_image_builds": {},

+             "repository": "containers/python-36",

+             "commit": "3a740231deab2abf335d5cad9a80d466c783be7d",

+             "repositories": [{"repository": "ubi8/python-36"}],

+             "rpm_manifest": [{

+                 "rpms": [

+                     {"srpm_name": vulnerable_srpm_name}

+                 ]

+             }]

+         })

+ 

+         nodejs_image = ContainerImage.create({

+             "brew": {"package": "nodejs-12-container", "build": "nodejs-12-container-1-20.45678"},

+             "parent_brew_build": dependency_ubi_image.nvr,

+             "repository": "containers/nodejs-12",

+             "commit": "97d57a9db975b58b43113e15d29e35de6c1a3f0b",

+             "repositories": [{"repository": "ubi8/nodejs-12"}],

+             "rpm_manifest": [{

+                 "rpms": [

+                     {"srpm_name": vulnerable_srpm_name}

+                 ]

+             }]

+         })

+ 

+         def fake_find_parent_images_with_package(image, *args, **kwargs):

+             parents = {

+                 directly_affected_ubi_image.nvr: directly_affected_ubi_image,

+                 dependency_ubi_image.nvr: dependency_ubi_image,

+             }

+             parent = parents.get(image.get("parent_brew_build"))

+             if parent:

+                 return [parent]

+             return []

+ 

+         find_parent_images_with_package.side_effect = fake_find_parent_images_with_package

+ 

+         find_images_with_packages_from_content_set.return_value = [

+             directly_affected_ubi_image, python_image, nodejs_image,

+         ]

+ 

+         def fake_get_images_by_nvrs(nvrs, **kwargs):

+             if nvrs == [dependency_ubi_image.nvr]:

+                 return [dependency_ubi_image]

+             elif nvrs == [directly_affected_ubi_image.nvr]:

+                 return [directly_affected_ubi_image]

+             raise ValueError("Unexpected test data, {}".format(nvrs))

+ 

+         get_images_by_nvrs.side_effect = fake_get_images_by_nvrs

+ 

+         lb = LightBlue(server_url=self.fake_server_url,

+                        cert=self.fake_cert_file,

+                        private_key=self.fake_private_key)

+         batches = lb.find_images_to_rebuild([vulnerable_srpm_nvr], [vulnerable_srpm_name])

+         expected_batches = [

+             # The dependency ubi image has a higher NVR and it should be used as

+             # the parent for both images.

+             {dependency_ubi_image.nvr},

+             {python_image.nvr, nodejs_image.nvr},

+         ]

+         for batch, expected_batch_nvrs in zip(batches, expected_batches):

+             batch_nvrs = set(image.nvr for image in batch)

+             self.assertEqual(batch_nvrs, expected_batch_nvrs)

+         self.assertEqual(len(batches), len(expected_batches))

+ 

      @patch("freshmaker.lightblue.ContainerImage.resolve_published")

      @patch("freshmaker.lightblue.LightBlue.get_images_by_nvrs")

      @patch("os.path.exists")

file modified
+4 -4
@@ -57,8 +57,8 @@ 

      def test_monitor_api_structure(self):

          resp = self.client.get('/api/1/monitor/metrics')

          self.assertEqual(

-             len([l for l in resp.get_data(as_text=True).splitlines()

-                  if l.startswith('# TYPE')]), num_of_metrics)

+             len([line for line in resp.get_data(as_text=True).splitlines()

+                  if line.startswith('# TYPE')]), num_of_metrics)

  

  

  class ConsumerTest(helpers.ConsumerBaseTest):
@@ -127,5 +127,5 @@ 

  

      r = requests.get('http://127.0.0.1:10040/metrics')

  

-     assert len([l for l in r.text.splitlines()

-                 if l.startswith('# TYPE')]) == num_of_metrics

+     assert len([line for line in r.text.splitlines()

+                 if line.startswith('# TYPE')]) == num_of_metrics

file modified
+1 -1
@@ -1185,7 +1185,7 @@ 

  

      def test_patch_event_not_allowed(self):

          with self.test_request_context(user='john_smith'):

-             resp = self.client.patch(f'/api/1/events/1', json={'action': 'cancel'})

+             resp = self.client.patch('/api/1/events/1', json={'action': 'cancel'})

          assert resp.status_code == 403

          assert resp.json['message'] == (

              'User john_smith does not have any of the following roles: admin, manual_rebuilder'

An accurate list of repositories is important to ensure the
deduplication code works as expected.

If Freshmaker has built an image in the past, it may not have been
directly published and Lightblue won't list any repositories for it.
In such case, try to get a list of repositories from the original NVR.

Signed-off-by: Luiz Carvalho lucarval@redhat.com

This needs a good amount of work. It's mostly for illustration purposes.

I tested this against one of the latest CVEs, and it deduplicated 5 additional images out of 213.

Is this published=None correct? Step 5 mentions to check "if the “original_nvr” image is published". Should it just keep the default value True?

Is this published=None correct? Step 5 mentions to check "if the “original_nvr” image is published". Should it just keep the default value True?

Right, but step 6 says "If not published, use the “original_nvr” as the nvr and jump back to step 1". By using published=None, we get the image regardless of whether or not it's been published. Then, we decide in the code how to treat it. Otherwise, we'd have to make two queries to Lightblue, published=True and published=False, which seems unnecessary.

rebased onto 63f5b469fdd80c687a44fd89cfcc119136045443

4 years ago

@gnaponie, @jkaluza, do we have an existing mechanism for querying Freshmaker from itself like this? From what I can see, lightblue.py does not use a db connection, nor it knows how to access Freshmaker's API.

I see a couple of options:

  1. Teach lightblue.py to access Freshmaker's API via HTTP requests.
  2. Pass in the db connection when creating the LightBlue object from freshmaker/handlers/koji/rebuild_images_on_rpm_advisory_change.py.

I think, ideally, we'd only have the web API interacting with the database, which would make option 1 more favorable. However, this is definitely not the case right now. Both the backend/consumers and the web API interact directly with the database. Option 2 seems to be more inline with the current architecture.

Thoughts?

rebased onto 2b868f757977782367bf624373a5fe3d8e6bacd6

4 years ago

rebased onto 4ed27ae

4 years ago

@gnaponie, @jkaluza, do we have an existing mechanism for querying Freshmaker from itself like this? From what I can see, lightblue.py does not use a db connection, nor it knows how to access Freshmaker's API.
I see a couple of options:

Teach lightblue.py to access Freshmaker's API via HTTP requests.
Pass in the db connection when creating the LightBlue object from freshmaker/handlers/koji/rebuild_images_on_rpm_advisory_change.py.

I think, ideally, we'd only have the web API interacting with the database, which would make option 1 more favorable. However, this is definitely not the case right now. Both the backend/consumers and the web API interact directly with the database. Option 2 seems to be more inline with the current architecture.
Thoughts?

I've taken the approach of reaching the DB directly.

@gnaponie, @jkaluza, do we have an existing mechanism for querying Freshmaker from itself like this? From what I can see, lightblue.py does not use a db connection, nor it knows how to access Freshmaker's API.
I see a couple of options:
Teach lightblue.py to access Freshmaker's API via HTTP requests.
Pass in the db connection when creating the LightBlue object from freshmaker/handlers/koji/rebuild_images_on_rpm_advisory_change.py.
I think, ideally, we'd only have the web API interacting with the database, which would make option 1 more favorable. However, this is definitely not the case right now. Both the backend/consumers and the web API interact directly with the database. Option 2 seems to be more inline with the current architecture.
Thoughts?

I've taken the approach of reaching the DB directly.

I'm gonna rethink this a bit. I think we can use a simple heuristic to simply strip off the .xxxx suffix from the release value, and use the modified NVR to query Lightblue. This would remove the dependency of having to query the Freshmaker API and should make things simpler.

I've finally understood what this PR is about :). It makes sense to me. It surprises me a bit that the image built by Freshmaker is not included in the repository, but I believe you that's the case :).

@lucarval Could you please resolve the conflicts here? Thank you.

Maybe these two lines could be simplified like {python_image.nvr, nodejs_image.nvr}.

A few test case classes are written based on ModelsTestCase which already has the code to create and drop db separated into setUp and tearDown. With autouse, does this mock_db also affect those tests as well? Looks like we can drop the same code from the setUp and tearDown:

    def setUp(self):
        super(ModelsTestCase, self).setUp()
        db.session.remove()
        db.drop_all()
        db.create_all()
        db.session.commit()

        self.user = User(username='tester1')
        db.session.add(self.user)
        db.session.commit()

    def tearDown(self):
        super(ModelsTestCase, self).tearDown()

        db.session.remove()
        db.drop_all()
        db.session.commit()

@lucarval what's the status of this PR?

@lucarval what's the status of this PR?

It's outdated. It needs to be reworked to no longer require access to the DB. I just haven't had the bandwidth. If there's urgency in getting this done, I suggest someone else picks it up. Otherwise, I'll try to have it cleaned up this week.

rebased onto e341a2cb8d69c28458aa8e897715b57b521438b6

4 years ago

Maybe these two lines could be simplified like {python_image.nvr, nodejs_image.nvr}.

Good idea! Done.

@cqi, I've reworked the changes to be more efficient and not require additional DB queries. This also simplified testing quite a bit.

I tested these changes by using dev_scripts/find_images_to_rebuild.py 53625. The diff with and without my changes are:

diff <(< master.json jq '.[].original_nvr' -r | sort) <(< modified.json jq '.[].original_nvr' -r | sort)
55d54
< s2i-base-container-1-131.1584463498

As you can see, with my changes, we wouldn't rebuild s2i-base-container image twice s2i-base-container-1-131.1584463498 and s2i-base-container-1-141.1584463496. They are properly deduplicated.

rebased onto 2ff7902

4 years ago

@lucarval there's no urgency, but I was trying to close old PRs and this was opened 2 months ago.
Let us know when this is ready for another review. Thank you.

We could just sorted(r["repository"] for r in repositories)

Since freshmaker is already Python 3 only, perhaps we could using f-strings to format the string instead. IMO, it could make code more compact.

LGTM. Leave two minor suggestions.

Additionally, get_registry_repositories will be called recursively through the for-loop of for parent_id, image in enumerate(images):, probably it could be tested to see if it would have potential performance issue based on the known cases of image builds and whether it is worth to convert the for-loop into a multi-threaded solution.

rebased onto 82c90c1

4 years ago

@cqi, I've incorporated your suggestion to change to use sorted(r["repository"] for r in repositories)

I've decided that using f-strings wouldn't improve things by much, so I left it as is.

Regarding the performance concern, the check for a . on the release value pretty much narrows it down to only the cases we should always check. I've ran this against a recent advisory and didn't notice a significant increase in computation time.

rebased onto c2840e8

4 years ago

rebased onto 4ed1b66

4 years ago

1 new commit added

  • Fix flake8 errors
4 years ago

2 new commits added

  • Fix flake8 errors
  • Try hard to find the repositories for an image
4 years ago

+1 on the new commit,
I trust you both for the other one :)

Pull-Request has been merged by lucarval

4 years ago