#58 Issue #57: Cache metadata lookup tables
Merged 6 years ago by ncoghlan. Opened 6 years ago by ncoghlan.
modularity/ ncoghlan/fedmod issue-57-cache-reverse-lookup-tables  into  master

file modified
+69 -44
@@ -9,10 +9,11 @@ 

  import modulemd

  import solv

  import json

- from attr import attributes, attrib

- from requests_toolbelt.downloadutils.tee import tee_to_file

+ from collections import defaultdict

  from fnmatch import fnmatch

  from urllib.parse import urljoin

+ from attr import attributes, attrib

+ from requests_toolbelt.downloadutils.tee import tee_to_file

  from lxml import etree

  

  XDG_CACHE_HOME = os.environ.get("XDG_CACHE_HOME") or os.path.expanduser("~/.cache")
@@ -73,7 +74,13 @@ 

      _SOURCE_UPDATES_INFO,

  )

  _BOOTSTRAP_MODULEMD = os.path.join(CACHEDIR, "f27-bootstrap.yaml")

- _BOOTSTRAP_REVERSE_LOOKUP_CACHE = os.path.join(CACHEDIR, "f27-bootstrap-cache.json")

+ 

+ _LOOKUP_CACHES = {

+     "_BOOTSTRAP_COMPONENTS_CACHE": os.path.join(CACHEDIR, "f27-bootstrap-cache.json"),

+     "_MODULE_FORWARD_LOOKUP_CACHE": os.path.join(CACHEDIR, "f27-module-contents-cache.json"),

+     "_SRPM_REVERSE_LOOKUP_CACHE": os.path.join(CACHEDIR, "f27-srpm-to-module-cache.json"),

+     "_RPM_REVERSE_LOOKUP_CACHE": os.path.join(CACHEDIR, "f27-rpm-to-module-cache.json"),

+ }

  

  METADATA_SECTIONS = ("filelists", "primary", "modules")

  
@@ -133,6 +140,19 @@ 

          # TODO: Actually prune old metadata files

          pass

  

+ def _write_cache(cache_name, data):

+     """Write the given data to the nominated cache file"""

+     cache_fname = _LOOKUP_CACHES[cache_name]

+     with open(cache_fname, "w") as cache_file:

+         json.dump(data, cache_file)

+     print(f"  Added {cache_fname} to cache")

+ 

+ def _read_cache(cache_name):

+     """Read the parsed data from the nominated cache file"""

+     cache_fname = _LOOKUP_CACHES[cache_name]

+     with open(cache_fname, "r") as cache_file:

+         return json.load(cache_file)

+ 

  def _download_bootstrap_modulemd():

      from ._depchase import make_pool, get_rpms_for_srpms

      print("Downloading build bootstrap module details")
@@ -142,74 +162,79 @@ 

      mmd = modulemd.ModuleMetadata()

      mmd.load(_BOOTSTRAP_MODULEMD)

      pool = make_pool("x86_64")

-     bootstrap_rpms = {}

+     bootstrap_rpms = set()

      rpms = get_rpms_for_srpms(pool, mmd.components.rpms)

      for rpmname in rpms:

-         bootstrap_rpms[rpmname] = "bootstrap"

+         bootstrap_rpms.add(rpmname)

      for srpmname in mmd.components.rpms:

-         bootstrap_rpms[srpmname] = "bootstrap"

-     with open(_BOOTSTRAP_REVERSE_LOOKUP_CACHE, "w") as cachefile:

-         json.dump(bootstrap_rpms, cachefile)

-     print(f"  Added {_BOOTSTRAP_REVERSE_LOOKUP_CACHE} to cache")

+         bootstrap_rpms.add(srpmname)

+     _write_cache("_BOOTSTRAP_COMPONENTS_CACHE", list(bootstrap_rpms))

+ 

+ def _write_lookup_caches():

+     metadata_dir = os.path.join(_x86_64_MODULE_INFO.local_cache_path)

+     repomd_fname = os.path.join(metadata_dir, "repodata", "repomd.xml")

+     repomd_xml = etree.parse(repomd_fname)

+     repo_relative_modulemd = _read_repomd_location(repomd_xml, "modules")

+     repo_modulemd_fname = os.path.join(metadata_dir, repo_relative_modulemd)

+     with gzip.open(repo_modulemd_fname, "r") as modules_yaml_gz:

+         modules_yaml = modules_yaml_gz.read()

+     modules = modulemd.loads_all(modules_yaml)

+     module_forward_lookup = {}

+     srpm_reverse_lookup = defaultdict(list)

+     rpm_reverse_lookup = defaultdict(list)

+     for module in modules:

+         module_forward_lookup[module.name] = list(set(module.artifacts.rpms))

+         for srpmname in module.components.rpms:

+             srpm_reverse_lookup[srpmname].append(module.name)

+         for rpmname in module.artifacts.rpms:

+             rpmprefix = rpmname.split(":", 1)[0].rsplit("-", 1)[0]

+             rpm_reverse_lookup[rpmprefix].append(module.name)

+     # Cache the lookup tables as local JSON files

+     print("Caching lookup tables")

+     _write_cache("_MODULE_FORWARD_LOOKUP_CACHE", module_forward_lookup)

+     _write_cache("_SRPM_REVERSE_LOOKUP_CACHE", srpm_reverse_lookup)

+     _write_cache("_RPM_REVERSE_LOOKUP_CACHE", rpm_reverse_lookup)

+ 

  

  def download_repo_metadata():

      """Downloads the latest repo metadata"""

      for repo_definition in _ALL_REPOS:

          _download_metadata_files(repo_definition)

      _download_bootstrap_modulemd()

+     _write_lookup_caches()

+ 

  

  _SRPM_REVERSE_LOOKUP = {}  # SRPM name : [module names]

  _RPM_REVERSE_LOOKUP = {}   # RPM name : [module names]

  _BOOTSTRAP_COMPONENTS = set()

  _MODULE_FORWARD_LOOKUP = {}

  def _populate_module_reverse_lookup():

-     # TODO: Construct and cache the reverse mapping as a JSON file as part of

-     #       download_repo_metadata(), as with _BOOTSTRAP_REVERSE_LOOKUP_CACHE

      if _RPM_REVERSE_LOOKUP:

          return

+     # Check whether or not fetch-metadata has been run at all

      metadata_dir = os.path.join(_x86_64_MODULE_INFO.local_cache_path)

      repomd_fname = os.path.join(metadata_dir, "repodata", "repomd.xml")

      if not os.path.exists(repomd_fname):

          msg = f"{repomd_fname!r} does not exist. Run `fedmod fetch-metadata`."

          raise MissingMetadata(msg)

-     repomd_xml = etree.parse(repomd_fname)

-     repo_relative_modulemd = _read_repomd_location(repomd_xml, "modules")

-     if repo_relative_modulemd is None:

-         msg = (f"No 'modules' entry found in {repomd_fname!r}. "

-                 "Is the metadata for a non-modular repo?")

-         raise MissingMetadata(msg)

-     repo_modulemd_fname = os.path.join(metadata_dir, repo_relative_modulemd)

-     if not os.path.exists(repo_modulemd_fname):

-         msg = (f"{repo_modulemd_fname!r} does not exist. "

-                 "Try running `fedmod fetch-metadata` again.")

-         raise MissingMetadata(msg)

-     with gzip.open(repo_modulemd_fname, "r") as modules_yaml_gz:

-         modules_yaml = modules_yaml_gz.read()

-     modules = modulemd.loads_all(modules_yaml)

-     for module in modules:

-         _MODULE_FORWARD_LOOKUP[module.name] = module

-         for srpmname in module.components.rpms:

-             srpm_entry = _SRPM_REVERSE_LOOKUP.setdefault(srpmname, [])

-             srpm_entry.append(module.name)

-         for rpmname in module.artifacts.rpms:

-             rpmprefix = rpmname.split(":", 1)[0].rsplit("-", 1)[0]

-             rpm_entry = _RPM_REVERSE_LOOKUP.setdefault(rpmprefix, [])

-             rpm_entry.append(module.name)

-     # Read the extra RPM bootstrap metadata

-     if not os.path.exists(_BOOTSTRAP_REVERSE_LOOKUP_CACHE):

-         msg = (f"{_BOOTSTRAP_REVERSE_LOOKUP_CACHE!r} does not exist. "

-                 "Try running `fedmod fetch-metadata` again.")

-         raise MissingMetadata(msg)

-     with open(_BOOTSTRAP_REVERSE_LOOKUP_CACHE, "r") as cachefile:

-         _BOOTSTRAP_COMPONENTS.update(json.load(cachefile))

+     # Check whether or not fetch-metadata actually finished

+     for cache_entry in _LOOKUP_CACHES.values():

+         if not os.path.exists(cache_entry):

+             msg = (f"{cache_entry!r} does not exist. "

+                     "Try running `fedmod fetch-metadata` again.")

+             raise MissingMetadata(msg)

+     # Load the metadata

+     # TODO: Switch to lazy loading of the actual data

+     _SRPM_REVERSE_LOOKUP.update(_read_cache("_SRPM_REVERSE_LOOKUP_CACHE"))

+     _RPM_REVERSE_LOOKUP.update(_read_cache("_RPM_REVERSE_LOOKUP_CACHE"))

+     _BOOTSTRAP_COMPONENTS.update(_read_cache("_BOOTSTRAP_COMPONENTS_CACHE"))

+     _MODULE_FORWARD_LOOKUP.update(_read_cache("_MODULE_FORWARD_LOOKUP_CACHE"))

  

  def list_modules():

      return _MODULE_FORWARD_LOOKUP.keys()

  

  def get_rpms_in_module(module_name):

-     if module_name in _MODULE_FORWARD_LOOKUP:

-         return _MODULE_FORWARD_LOOKUP[module_name].artifacts.rpms

-     return set()

+     return _MODULE_FORWARD_LOOKUP.get(module_name, [])

  

  def get_modules_for_rpm(rpm_name):

      result = _RPM_REVERSE_LOOKUP.get(rpm_name)

Rather than deriving the required lookup tables
from the repo metadata every time fedmod runs,
run the derivation code as part of the
fetch-metadata command and cache the results as
JSON files.

There are still lots of opportunities for start-up
time optimisations here; this change is mainly aimed
at separating fedmod's notion of a "working data set"
from the raw repo-level metadata that DNF uses.

Pull-Request has been merged by ncoghlan

6 years ago
Metadata