From 5587d855e2cc296a0af953911a3cc56fff8aa7b7 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Holcroft Date: Sep 25 2023 21:03:27 +0000 Subject: create a script to run all extraction and processing let's move processing logic in a dedicated file to simply maintenance and optimization --- diff --git a/build.py b/build.py index 731ca7d..509d88f 100755 --- a/build.py +++ b/build.py @@ -80,6 +80,7 @@ def main(): srpm_regex = None if args.filter: + log.info(f"Filter used: {args.filter}") srpm_regex = re.compile(f"^{args.filter}$") packages_folder = f"./results/{args.results}/packages/" @@ -137,11 +138,11 @@ def main(): data = {} stats = dict( - total=0, - processed=0, - filtered=0, - skipped=0, - error=0) + total=0, + processed=0, + filtered=0, + skipped=0, + error=0) total_urls = len(url_list) lock = threading.Lock() @@ -151,139 +152,142 @@ def main(): result = [] with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - for line in url_list: - stats['total'] += 1 - url = urlparse(line.strip()) - if not url.scheme: - continue - result.append( - executor.submit( - process_srpm, - url, - srpms_path, - packages_folder, - srpm_regex, - lock, - data, - data_file, - force=args.force, - keep=args.keep, - max_conn=max_conn, - max_ext=max_ext)) - - for r in concurrent.futures.as_completed(result): - (code, is_ok) = r.result() - if is_ok: - stats['processed'] += 1 - elif code == 1: - stats['filtered'] += 1 - elif code == 2: - stats['skipped'] += 1 - elif code == 3: - stats['error'] += 1 + for line in url_list: + stats['total'] += 1 + url = urlparse(line.strip()) + if not url.scheme: + continue + result.append( + executor.submit( + process_srpm, + url, + srpms_path, + packages_folder, + srpm_regex, + lock, + data, + data_file, + force=args.force, + keep=args.keep, + max_conn=max_conn, + max_ext=max_ext)) + + for r in concurrent.futures.as_completed(result): + (code, is_ok) = r.result() + if is_ok: + stats['processed'] += 1 + elif code == 1: + 
stats['filtered'] += 1 + elif code == 2: + stats['skipped'] += 1 + elif code == 3: + stats['error'] += 1 log.info("{processed} new packages processed, {filtered} filtered, {skipped} known and {error} with errors, on {total} total".format(**stats)) -def process_srpm(url, srpms_path, packages_folder, srpm_regex=None, lock=None, data={}, data_file=None, force=False, keep=False, max_conn=None, max_ext=None): + +def process_srpm(url, srpms_path, packages_folder, srpm_regex=None, lock=None, data={}, data_file=None, force=False, + keep=False, max_conn=None, max_ext=None): log = logging.getLogger("srpmParser.process_srpm") try: - srpm_filename = os.path.basename(url.path) - srpm_data = dnf.subject.Subject(srpm_filename) - nevras = srpm_data.get_nevra_possibilities(forms=1) - try: - package = nevras[0] - except TypeError: - package = next(nevras) - - if srpm_regex and not srpm_regex.match(package.name): - return (1, False) - - already_processed = False - with lock: - if package.name in data and not force: - # Compare version - try: - known_package = dnf.subject.Subject( - data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] - except TypeError: - known_package = next(dnf.subject.Subject( - data[package.name]["srpm"]).get_nevra_possibilities(forms=1)) - if rpm.labelCompare( - (package.epoch, - package.version, - package.release), - (known_package.epoch, - known_package.version, - known_package.release)) <= 0: - already_processed = True - - if already_processed: - log.debug("%s: Already processed, skipping", package.name) - return (2, False) - - log.info("%s: Processing", package.name) - srpm_path = os.path.join(srpms_path, srpm_filename) - if not os.path.isfile(srpm_path): - with max_conn: - log.info("%s: Downloading", package.name) - if url.scheme == "rsync": - dl = subprocess.run( + srpm_filename = os.path.basename(url.path) + srpm_data = dnf.subject.Subject(srpm_filename) + nevras = srpm_data.get_nevra_possibilities(forms=1) + try: + package = nevras[0] + except 
TypeError: + package = next(nevras) + + if srpm_regex and not srpm_regex.match(package.name): + return (1, False) + + already_processed = False + with lock: + if package.name in data and not force: + # Compare version + try: + known_package = dnf.subject.Subject( + data[package.name]["srpm"]).get_nevra_possibilities(forms=1)[0] + except TypeError: + known_package = next(dnf.subject.Subject( + data[package.name]["srpm"]).get_nevra_possibilities(forms=1)) + if rpm.labelCompare( + (package.epoch, + package.version, + package.release), + (known_package.epoch, + known_package.version, + known_package.release)) <= 0: + already_processed = True + + if already_processed: + log.debug("%s: Already processed, skipping", package.name) + return (2, False) + + log.info("%s: Processing", package.name) + srpm_path = os.path.join(srpms_path, srpm_filename) + if not os.path.isfile(srpm_path): + with max_conn: + log.info("%s: Downloading", package.name) + if url.scheme == "rsync": + dl = subprocess.run( ["rsync", url.geturl(), srpms_path], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - else: - dl = subprocess.run( + else: + dl = subprocess.run( ["curl", "-L", "--remote-name", url.geturl()], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=srpms_path) - if dl.returncode: - log.error("%s: error downloading srpm:", package.name) - log.error(dl.stdout) - return (3, False) - - try: - with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmp: - with max_ext: - log.info("%s: Extracting", package.name) - extract_srpm(tmp, srpm_path, packages_folder) - log.info("%s: Discovering", package.name) - try: - (tsearch, tcopy, results) = discover_translations( - tmp, package.name, packages_folder) - except Exception as e: - log.exception(e) - return(3, False) - except OSError as e: - if "Directory not empty" in e.strerror: - log.error(e) - log.warning("%s: error cleaning tmpdir: %s", package.name, tmp) - else: - raise - - if not keep: - 
os.unlink(srpm_path) - - with lock: - # save processed srpm name & version - data[package.name] = { - "srpm": srpm_filename, - "tsearch": tsearch, - "tcopy": tcopy, - "results": results} - - with open(data_file, "w") as f: - json.dump(data, f, indent=2) - - log.info("%s: Completed", package.name) + if dl.returncode: + log.error("%s: error downloading srpm:", package.name) + log.error(dl.stdout) + return (3, False) + + try: + with tempfile.TemporaryDirectory(prefix="l10n-stats") as tmp: + with max_ext: + log.info("%s: Extracting", package.name) + extract_srpm(tmp, srpm_path, packages_folder) + log.info("%s: Discovering", package.name) + try: + (tsearch, tcopy, results) = discover_translations( + tmp, package.name, packages_folder) + except Exception as e: + log.exception(e) + return (3, False) + except OSError as e: + if "Directory not empty" in e.strerror: + log.error(e) + log.warning("%s: error cleaning tmpdir: %s", package.name, tmp) + else: + raise + + if not keep: + os.unlink(srpm_path) + + with lock: + # save processed srpm name & version + data[package.name] = { + "srpm": srpm_filename, + "tsearch": tsearch, + "tcopy": tcopy, + "results": results} + + with open(data_file, "w") as f: + json.dump(data, f, indent=2) + + log.info("%s: Completed", package.name) except Exception as e: - log.exception(e) + log.exception(e) return (0, True) + def extract_srpm(tmp, name, dest_folder): """extract srpm page""" log = logging.getLogger("srpmParser.extract_srpm") diff --git a/configuration.json b/configuration.json index 28b561b..830e9dc 100644 --- a/configuration.json +++ b/configuration.json @@ -1,4 +1,6 @@ { + "fedora_releases": ["f7", "f10", "f20", "f30", "f35", "f37", "f38", "f39"], + "staging_srpm_regex": "blue.*", "tm_for_versions_comment": "only build tm for this fedora version", "tm_for_versions": ["f38"] } \ No newline at end of file diff --git a/run_all.py b/run_all.py new file mode 100755 index 0000000..cb42001 --- /dev/null +++ b/run_all.py @@ -0,0 +1,68 @@ 
+#!/usr/bin/env python3
+
+import argparse
+import json
+import logging
+import os
+
+import utils
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run all scripts"
+    )
+    parser.add_argument(
+        "--env", required=True, choices=["staging", "production"], help="Set the results folder to use"
+    )
+    parser.add_argument(
+        "--scope", required=True, choices=["extract", "compute"], help="",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        default=False,
+        action="store_true",
+        dest="verbose",
+        help="Add verbosity"
+    )
+    args = parser.parse_args()
+
+    utils.set_logging(args.verbose, "all")
+    log = logging.getLogger("run_all")
+
+    with open("configuration.json", "r") as read_file:
+        configuration = json.load(read_file)
+
+    if args.env == "staging":
+        build_suffix = f" {configuration['staging_srpm_regex']}"
+    else:
+        build_suffix = ""
+
+    if args.scope == "extract":
+        for release in configuration["fedora_releases"]:
+            log.info(f"Starting extraction for {release}")
+            utils.set_logging(args.verbose, release)
+            command = f"podman run -it --rm -v ./:/src:z -v ./results:/src/results:z -v ./srpms:/srpms:z --tmpfs /tmp:size=4G fedlocstats:latest /src/build.py --results {release} {build_suffix}"
+            os.system(command)
+            log.info(f"Extraction for {release} is done")
+    elif args.scope == "compute":
+        for release in configuration["fedora_releases"]:
+            log.info(f"Starting processing for {release}")
+            utils.set_logging(args.verbose, release)
+            command = f"./build_language_list.py --results {release}"
+            os.system(command)
+            command = f"./build_stats.py --results {release}"
+            os.system(command)
+            command = f"LANG=C ./build_tm.py --results {release} --compress"
+            os.system(command)
+            command = f"./build_website.py --results {release}"
+            os.system(command)
+
+            log.info(f"Processing for {release} is done")
+
+    log.info("Done")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils.py b/utils.py
index baac636..8a13ce9 100644
--- a/utils.py
+++ b/utils.py
@@ -5,14
+5,15 @@ import yaml import os import sys + def set_logging(verbose=False, release=None): try: with open("logging.yml", 'r') as f: config = yaml.safe_load(f.read()) - for _,handlers in config['handlers'].items(): + for _, handlers in config['handlers'].items(): if 'filename' in handlers: if release: - handlers['filename'] = handlers['filename'].format(release=release) + handlers['filename'] = handlers['filename'].format(release=release) os.makedirs(os.path.dirname(handlers['filename']), exist_ok=True) if verbose: config['handlers']['console']['level'] = "DEBUG" @@ -20,10 +21,10 @@ def set_logging(verbose=False, release=None): print("logging config loaded") except FileNotFoundError: param = dict( - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + format=f"%(asctime)s - %(name)s - {release} - %(levelname)s - %(message)s", level=logging.INFO) if verbose: - param['level'] = logging.DEBUG - if sys.version_info >= (3,8,0): + param['level'] = logging.DEBUG + if sys.version_info >= (3, 8, 0): param['force'] = True logging.basicConfig(**param)