#1794 pkgset: refactor per-arch subset generation to speed it up
Closed 20 days ago by adamwill. Opened 24 days ago by adamwill.
adamwill/pungi pkgset-speedup-subsets  into  master

file modified
+9 -10
@@ -37,24 +37,23 @@ 

  

  

  def populate_arch_pkgsets(compose, path_prefix, global_pkgset):

-     result = {}

- 

+     arch_map = {}

      for arch in compose.get_arches():

          compose.log_info("Populating package set for arch: %s", arch)

          is_multilib = is_arch_multilib(compose.conf, arch)

          arches = get_valid_arches(arch, is_multilib, add_src=True)

-         pkgset = global_pkgset.subset(

-             arch,

-             arches,

-             exclusive_noarch=compose.conf["pkgset_exclusive_arch_considers_noarch"],

-             inherit_to_noarch=compose.conf["pkgset_inherit_exclusive_arch_to_noarch"],

-         )

+         arch_map[arch] = arches

+     pkgsets = global_pkgset.subsets(

+         arch_map,

+         exclusive_noarch=compose.conf["pkgset_exclusive_arch_considers_noarch"],

+         inherit_to_noarch=compose.conf["pkgset_inherit_exclusive_arch_to_noarch"],

+     )

+     for arch, pkgset in pkgsets.items():

          pkgset.save_file_list(

              compose.paths.work.package_list(arch=arch, pkgset=global_pkgset),

              remove_path_prefix=path_prefix,

          )

-         result[arch] = pkgset

-     return result

+     return pkgsets

  

  

  def get_create_global_repo_cmd(compose, path_prefix, repo_dir_global, pkgset):

file modified
+88 -64
@@ -205,86 +205,110 @@ 

  

          return self.rpms_by_arch

  

-     def subset(

-         self, primary_arch, arch_list, exclusive_noarch=True, inherit_to_noarch=True

-     ):

-         """Create a subset of this package set that only includes

-         packages compatible with"""

-         pkgset = PackageSetBase(

-             self.name, self.sigkey_ordering, logger=self._logger, arches=arch_list

-         )

-         pkgset.merge(

-             self,

-             primary_arch,

-             arch_list,

-             exclusive_noarch=exclusive_noarch,

-             inherit_to_noarch=inherit_to_noarch,

-         )

-         return pkgset

- 

-     def merge(

-         self,

-         other,

-         primary_arch,

-         arch_list,

-         exclusive_noarch=True,

-         inherit_to_noarch=True,

-     ):

-         """

-         Merge ``other`` package set into this instance.

+     def subsets(self, arch_map, exclusive_noarch=True, inherit_to_noarch=True):

+         """Create subsets of this package set according to arch_map

+         dict. Keys of the dict are primary arch names. Value for each

+         key is a list of compatible package arches. e.g.:

+         {

+             "x86_64": ["x86_64", "i686", "noarch", "src"],

+             "aarch64": ["aarch64", "noarch", "src"]

+         }

+         Will create one package set per primary arch which contains

+         all packages for any of the arches in its list, and return

+         a dict whose keys are the primary arch names and values are

+         the package sets.

          """

-         msg = "Merging package sets for %s: %s" % (primary_arch, arch_list)

+         msg = "Creating package subsets with arch map %s" % (arch_map)

          self.log_debug("[BEGIN] %s" % msg)

  

-         # if "src" is present, make sure "nosrc" is included too

-         if "src" in arch_list and "nosrc" not in arch_list:

-             arch_list.append("nosrc")

- 

-         # make sure sources are processed last

-         for i in ("nosrc", "src"):

-             if i in arch_list:

-                 arch_list.remove(i)

-                 arch_list.append(i)

- 

-         seen_sourcerpms = set()

-         # {Exclude,Exclusive}Arch must match *tree* arch + compatible native

-         # arches (excluding multilib arches)

-         if primary_arch:

-             exclusivearch_list = get_valid_arches(

-                 primary_arch, multilib=False, add_noarch=False, add_src=False

+         pkgsets = {}

+         eals = {}

+         seen_sourcerpms = {}

+ 

+         for arch in arch_map.keys():

+             # create a PackageSetBase per target primary arch

+             pkgsets[arch] = PackageSetBase(

+                 self.name,

+                 self.sigkey_ordering,

+                 logger=self._logger,

+                 arches=arch_map[arch],

+             )

+             # set up an exclusivearchlist per target primary arch

+             eals[arch] = get_valid_arches(

+                 arch, multilib=False, add_noarch=False, add_src=False

              )

              # We don't want to consider noarch: if a package is true noarch

              # build (not just a subpackage), it has to have noarch in

              # ExclusiveArch otherwise rpm will refuse to build it.

              # This should eventually become a default, but it could have a big

              # impact and thus it's hidden behind an option.

-             if not exclusive_noarch and "noarch" in exclusivearch_list:

-                 exclusivearch_list.remove("noarch")

-         else:

-             exclusivearch_list = None

-         for arch in arch_list:

-             self.rpms_by_arch.setdefault(arch, [])

-             for i in other.rpms_by_arch.get(arch, []):

-                 if i.file_path in self.file_cache:

-                     # TODO: test if it really works

-                     continue

-                 if inherit_to_noarch and exclusivearch_list and arch == "noarch":

-                     if is_excluded(i, exclusivearch_list, logger=self._logger):

-                         continue

+             if not exclusive_noarch and "noarch" in eals[arch]:

+                 eals[arch].remove("noarch")

+ 

+         for ourarch in self.rpms_by_arch:

+             if ourarch in ("src", "nosrc"):

+                 # we do these at the end, separately

+                 continue

+ 

+             # find the package sets we should put these packages into

+             tarches = [tarch for tarch in arch_map.keys() if ourarch in arch_map[tarch]]

+             if not tarches:

+                 continue

  

-                 if arch in ("nosrc", "src"):

-                     # include only sources having binary packages

-                     if i.name not in seen_sourcerpms:

+             self.log_debug("handling arch %s" % ourarch)

+             # main work loop, putting packages into appropriate sets

+             for i in self.rpms_by_arch[ourarch]:

+                 for tarch in tarches:

+                     target = pkgsets[tarch]

+                     if i.file_path in target.file_cache.file_cache:

+                         # TODO: test if it really works

+                         self.log_debug("cache hit (bin)")

                          continue

-                 else:

+                     if inherit_to_noarch and eals[tarch] and ourarch == "noarch":

+                         if is_excluded(i, eals[tarch], logger=target._logger):

+                             continue

+ 

                      sourcerpm_name = kobo.rpmlib.parse_nvra(i.sourcerpm)["name"]

-                     seen_sourcerpms.add(sourcerpm_name)

+                     # we do this check *after* the exclusion check and

+                     # use the target arch not the package arch so we

+                     # don't pull the source RPMs of excluded noarch

+                     # packages into the pkgsets for arches they're

+                     # excluded from

+                     if sourcerpm_name in seen_sourcerpms:

+                         seen_sourcerpms[sourcerpm_name].add(tarch)

+                     else:

+                         seen_sourcerpms[sourcerpm_name] = {tarch}

  

-                 self.file_cache.file_cache[i.file_path] = i

-                 self.rpms_by_arch[arch].append(i)

+                     target.file_cache.file_cache[i.file_path] = i

+                     if ourarch in target.rpms_by_arch:

+                         target.rpms_by_arch[ourarch].append(i)

+                     else:

+                         target.rpms_by_arch[ourarch] = [i]

+             self.log_debug("done with arch %s" % ourarch)

+ 

+         # now do src/nosrc

+         for srcarch in ("src", "nosrc"):

+             self.log_debug("handling src arch %s" % srcarch)

+             srcs = self.rpms_by_arch.get(srcarch, [])

+             tarches = [tarch for tarch in arch_map.keys() if "src" in arch_map[tarch]]

+             for i in srcs:

+                 for tarch in tarches:

+                     target = pkgsets[tarch]

+                     if i.file_path in target.file_cache.file_cache:

+                         self.log_debug("cache hit (src)")

+                         continue

+                     if tarch in seen_sourcerpms[i.name]:

+                         target.file_cache.file_cache[i.file_path] = i

+                         if srcarch in target.rpms_by_arch:

+                             target.rpms_by_arch[srcarch].append(i)

+                         else:

+                             target.rpms_by_arch[srcarch] = [i]

+             self.log_debug("done with src arch %s" % srcarch)

  

          self.log_debug("[DONE ] %s" % msg)

  

+         return pkgsets

+ 

      def save_file_list(self, file_path, remove_path_prefix=None):

          with open(file_path, "w") as f:

              for arch in sorted(self.rpms_by_arch):

file modified
+5 -9
@@ -75,15 +75,11 @@ 

          self.assertEqual(result["x86_64"], self.subsets["x86_64"])

          self.assertEqual(result["amd64"], self.subsets["amd64"])

  

-         self.pkgset.subset.assert_any_call(

-             "x86_64",

-             ["x86_64", "noarch", "src"],

-             exclusive_noarch=True,

-             inherit_to_noarch=True,

-         )

-         self.pkgset.subset.assert_any_call(

-             "amd64",

-             ["amd64", "x86_64", "noarch", "src"],

+         self.pkgset.subsets.assert_any_call(

+             {

+                 "x86_64": ["x86_64", "noarch", "src"],

+                 "amd64": ["amd64", "x86_64", "noarch", "src"],

+             },

              exclusive_noarch=True,

              inherit_to_noarch=True,

          )

This attempts to speed up the generation of per-arch subsets of
the global package set by refactoring how it happens. Instead of
generating one subset per primary arch at a time in sequence,
iterating over the global package list for each compatible arch
each time (which means in Fedora we go through noarch and src
four times each, since we have four primary arches and they all
list noarch and src as compatible arches), iterate over the global
package list for every arch only one time, and put each package
encountered into the subsets for all compatible primary arches.

Signed-off-by: Adam Williamson awilliam@redhat.com

This is kinda speculative, not tested yet as I can't get the test suite to run locally, it complains about not having createrepo_c. I think it ought to make things a bit faster, though.

In Fedora this process takes about 20 mins total - 4 mins each for aarch64, ppc64le, s390x and 8 mins for x86_64. I think that's largely because it has to loop over noarch and src for each one. I think it ought to be faster doing it this way, so we only loop over each arch package list once.

rebased onto bc60af7

24 days ago

rebased onto bc60af7

24 days ago

oh, grump, didn't notice the merge tests. will have to refactor those to test the new subsets instead. will do that tomorrow or monday.

another possibility here is to refactor things even harder to create the per-arch subsets at the same time we create the global set, when initially reading the package files. that would mean we ultimately only have to go through the packages once (and we do that threaded, which helps with performance). I haven't really looked to see how much work that would be yet, though.

rebased onto bc60af7

21 days ago

rebased onto bc60af7

21 days ago

rebased onto bc60af7

21 days ago

rebased onto bc60af7

21 days ago

so testing this out, it improves the time taken from 22:13:

2024-11-18 19:02:55 [DEBUG   ] [BEGIN] Merging package sets for aarch64: ['aarch64', 'noarch', 'src']
2024-11-18 19:25:08 [DEBUG   ] [DONE ] Merging package sets for x86_64: ['x86_64', 'athlon', 'i686', 'i586', 'i486', 'i386', 'noarch', 'src']

to 14:47:

2024-11-18 22:18:29 [DEBUG   ] [BEGIN] Creating package subsets with arch map {'aarch64': ['aarch64', 'noarch', 'src'], 'ppc64le': ['ppc64le', 'noarch', 'src'], 's390x': ['s390x', 'noarch', 'src'], 'x86_64': ['x86_64', 'athlon', 'i686', 'i586', 'i486', 'i386', 'noarch', 'src']}
2024-11-18 22:43:16 [DEBUG   ] [DONE ] Creating package subsets with arch map {'aarch64': ['aarch64', 'noarch', 'src'], 'ppc64le': ['ppc64le', 'noarch', 'src'], 's390x': ['s390x', 'noarch', 'src'], 'x86_64': ['x86_64', 'athlon', 'i686', 'i586', 'i486', 'i386', 'noarch', 'src']}

it's not world-ending, but it's something. I might try the 'do it all at once' approach too and see if it's a) feasible and b) faster. Haven't yet checked for 'correctness' or updated the tests.

rebased onto bc60af7

21 days ago

rebased onto bc60af7

21 days ago

rebased onto bc60af7

21 days ago

rebased onto bc60af7

21 days ago

Huh. With the latest version it suddenly seems to be incredibly fast. I'm kinda suspicious of what's going on. I was just poking around at little things in the code and suddenly it seemed to be going through subsets more or less instantly. It looks to be working correctly too. I'm suspicious something weird is going on, though. Trying to poke at it more. It looks like this:

2024-11-19 05:22:29 [INFO    ] Using old repodata from: /mnt/koji/compose/adamwtest-ignore/Fedora-Rawhide-20241119.n.0/work/global/repo/f42
2024-11-19 05:22:29 [INFO    ] [BEGIN] Running createrepo for the global package set
2024-11-19 05:22:29 [INFO    ] Populating package set for arch: aarch64
2024-11-19 05:22:29 [INFO    ] Populating package set for arch: ppc64le
2024-11-19 05:22:29 [INFO    ] Populating package set for arch: s390x
2024-11-19 05:22:29 [INFO    ] Populating package set for arch: x86_64
2024-11-19 05:22:29 [DEBUG   ] [BEGIN] Creating package subsets with arch map {'aarch64': ['aarch64', 'noarch', 'src'], 'ppc64le': ['ppc64le', 'noarch', 'src'], 's390x': ['s390x', 'noarch',
 'src'], 'x86_64': ['x86_64', 'athlon', 'i686', 'i586', 'i486', 'i386', 'noarch', 'src']}
2024-11-19 05:22:29 [DEBUG   ] handling arch noarch
2024-11-19 05:22:30 [DEBUG   ] done with arch noarch
2024-11-19 05:22:30 [DEBUG   ] handling arch i686
2024-11-19 05:22:30 [DEBUG   ] done with arch i686
2024-11-19 05:22:30 [DEBUG   ] handling arch x86_64
2024-11-19 05:22:30 [DEBUG   ] done with arch x86_64
2024-11-19 05:22:30 [DEBUG   ] handling arch aarch64
2024-11-19 05:22:30 [DEBUG   ] done with arch aarch64
2024-11-19 05:22:30 [DEBUG   ] handling arch ppc64le
2024-11-19 05:22:30 [DEBUG   ] done with arch ppc64le
2024-11-19 05:22:30 [DEBUG   ] handling arch s390x
2024-11-19 05:22:31 [DEBUG   ] done with arch s390x
2024-11-19 05:22:31 [DEBUG   ] handling src arch src
2024-11-19 05:22:31 [DEBUG   ] done with src arch src
2024-11-19 05:22:31 [DEBUG   ] handling src arch nosrc
2024-11-19 05:22:31 [DEBUG   ] done with src arch nosrc
2024-11-19 05:22:31 [DEBUG   ] [DONE ] Creating package subsets with arch map {'aarch64': ['aarch64', 'noarch', 'src'], 'ppc64le': ['ppc64le', 'noarch', 'src'], 's390x': ['s390x', 'noarch', 'src'], 'x86_64': ['x86_64', 'athlon', 'i686', 'i586', 'i486', 'i386', 'noarch', 'src']}
2024-11-19 05:25:17 [INFO    ] [DONE ] Running createrepo for the global package set
2024-11-19 05:25:17 [INFO    ] [BEGIN] Running createrepo for arch 'aarch64'
2024-11-19 05:25:17 [INFO    ] [BEGIN] Running createrepo for arch 's390x'
2024-11-19 05:25:17 [INFO    ] [BEGIN] Running createrepo for arch 'ppc64le'
2024-11-19 05:25:17 [INFO    ] [BEGIN] Running createrepo for arch 'x86_64'
2024-11-19 05:26:30 [INFO    ] [DONE ] Running createrepo for arch 's390x'
2024-11-19 05:26:30 [INFO    ] [DONE ] Running createrepo for arch 'ppc64le'
2024-11-19 05:26:31 [INFO    ] [DONE ] Running createrepo for arch 'aarch64'
2024-11-19 05:26:42 [INFO    ] [DONE ] Running createrepo for arch 'x86_64'
2024-11-19 05:26:42 [INFO    ] [DONE ] ---------- PHASE: PKGSET ----------
2024-11-19 05:26:42 [INFO    ] PHASE PKGSET took 993 seconds

compared to old code running in what should be the same scenario (a previous compose to base off of, which is how things are in production, there's always yesterday's compose):

2024-11-19 03:55:59 [INFO    ] Using old repodata from: /mnt/koji/compose/adamwtest-ignore/Fedora-Rawhide-20241119.n.0/work/global/repo/f42
2024-11-19 03:55:59 [INFO    ] [BEGIN] Running createrepo for the global package set
2024-11-19 03:55:59 [INFO    ] Populating package set for arch: aarch64
2024-11-19 03:55:59 [DEBUG   ] [BEGIN] Merging package sets for aarch64: ['aarch64', 'noarch', 'src']
2024-11-19 03:58:24 [INFO    ] [DONE ] Running createrepo for the global package set
2024-11-19 04:00:09 [DEBUG   ] [DONE ] Merging package sets for aarch64: ['aarch64', 'noarch', 'src']
2024-11-19 04:00:09 [INFO    ] Populating package set for arch: ppc64le
2024-11-19 04:00:09 [DEBUG   ] [BEGIN] Merging package sets for ppc64le: ['ppc64le', 'noarch', 'src']
2024-11-19 04:04:31 [DEBUG   ] [DONE ] Merging package sets for ppc64le: ['ppc64le', 'noarch', 'src']
2024-11-19 04:04:31 [INFO    ] Populating package set for arch: s390x
2024-11-19 04:04:31 [DEBUG   ] [BEGIN] Merging package sets for s390x: ['s390x', 'noarch', 'src']
2024-11-19 04:08:32 [DEBUG   ] [DONE ] Merging package sets for s390x: ['s390x', 'noarch', 'src']
2024-11-19 04:08:33 [INFO    ] Populating package set for arch: x86_64
2024-11-19 04:08:33 [DEBUG   ] [BEGIN] Merging package sets for x86_64: ['x86_64', 'athlon', 'i686', 'i586', 'i486', 'i386', 'noarch', 'src']
2024-11-19 04:16:40 [DEBUG   ] [DONE ] Merging package sets for x86_64: ['x86_64', 'athlon', 'i686', 'i586', 'i486', 'i386', 'noarch', 'src']
2024-11-19 04:16:40 [INFO    ] [BEGIN] Running createrepo for arch 's390x'
2024-11-19 04:16:40 [INFO    ] [BEGIN] Running createrepo for arch 'ppc64le'
2024-11-19 04:16:40 [INFO    ] [BEGIN] Running createrepo for arch 'x86_64'
2024-11-19 04:16:40 [INFO    ] [BEGIN] Running createrepo for arch 'aarch64'
2024-11-19 04:17:52 [INFO    ] [DONE ] Running createrepo for arch 's390x'
2024-11-19 04:17:53 [INFO    ] [DONE ] Running createrepo for arch 'ppc64le'
2024-11-19 04:17:55 [INFO    ] [DONE ] Running createrepo for arch 'aarch64'
2024-11-19 04:18:08 [INFO    ] [DONE ] Running createrepo for arch 'x86_64'
2024-11-19 04:18:08 [INFO    ] [DONE ] ---------- PHASE: PKGSET ----------
2024-11-19 04:18:08 [INFO    ] PHASE PKGSET took 2065 seconds

Will keep playing with it...

For more detail on the scenario, I created a 20241119.n.0 compose clean with the old code. Then I immediately ran another compose with the old code, which came out as 20241119.n.1 - that's the second log extract above. Then I moved that compose to a different directory and ran another compose with the new code, so both effectively started from the same ingredients, at least they should have done.

oh holy shit this is hilarious.

my whole rewrite? unnecessary, because it turns out the only reason this whole part of the process takes so freaking long is these three lines:

                if i.file_path in self.file_cache:
                    # TODO: test if it really works
                    continue

That TODO has been there since 2015, the very first commit to pungi 4. Apparently, nobody ever TODID it, and no, it doesn't work, and it has added 20 minutes to every compose ever run by pungi 4. So, there's that!

I was suspicious of that check the first time I saw it because it seems to be checking the wrong thing (I think it ought to be checking self.file_cache.file_cache). At some point today I changed my branch to check the right thing instead, and it looks like that's the point at which it became turbocharged. I just did a test run of the original code but with those three lines simply commented out, and that gives us:

2024-11-19 07:04:17 [INFO    ] PHASE PKGSET took 1004 seconds

so, uh, yeah, a much simpler PR is coming real soon now. :P

Closing in favor of #1796 , there's no need to rewrite this since fixing the cache check makes it very fast anyway.

Pull-Request has been closed by adamwill

20 days ago