@@ -0,0 +1,240 @@
+ #!/usr/bin/python3
+ # pylint: disable=invalid-name
+
+ """
+ Download HTTP access logs from AWS S3 storage, parse them, increment the
+ hit counters on the frontend, and clean up.
+
+ AWS CLI cheatsheet:
+
+     aws s3 ls
+     aws s3 ls s3://fedora-copr
+     aws s3 cp s3://fedora-copr/cloudwatch/E2PUZIRCXCOXTG.2021-12-07-15.00d7a244.gz ./
+
+ Token permissions are required:
+ https://pagure.io/fedora-infrastructure/issue/10395
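+
+ Usage (options are defined by get_arg_parser() below; the installed script
+ name shown here is illustrative):
+
+     copr-aws-s3-hitcounter --dry-run --verbose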
+ """
+
+
+ import os
+ import argparse
+ import logging
+ import tempfile
+ import gzip
+ from datetime import datetime
+ from socket import gethostname
+ import boto3
+ from copr_common.request import SafeRequest
+ from copr_log_hitcounter import url_to_key_strings
+ from copr_backend.helpers import BackendConfigReader, setup_script_logger
+
+
+ # We will allow only this hostname to delete files from the S3 storage
+ PRODUCTION_HOSTNAME = "copr-be.aws.fedoraproject.org"
+
+
+ log = logging.getLogger(__name__)
+ setup_script_logger(log, "/var/log/copr-backend/hitcounter-s3.log")
+
+
+ class S3Bucket:
+     """
+     A high-level interface for interacting with files in an AWS S3 bucket
+     """
+
+     def __init__(self, bucket=None, directory=None, dry_run=False):
+         self.s3 = boto3.client("s3")
+         self.bucket = bucket or "fedora-copr"
+         self.directory = directory or "cloudwatch/"
+         self.dry_run = dry_run
+
+     def list_files(self):
+         """
+         List all files within our AWS S3 bucket
+         """
+         objects = self.s3.list_objects(
+             Bucket=self.bucket,
+             Prefix=self.directory)
+         # The "Contents" key is missing from the response when nothing
+         # matches the prefix
+         return [x["Key"] for x in objects.get("Contents", [])]
+
+     def download_file(self, s3file, dstdir):
+         """
+         Download a file from the AWS S3 bucket
+         """
+         dst = os.path.join(dstdir, os.path.basename(s3file))
+         self.s3.download_file(self.bucket, s3file, dst)
+         return dst
+
+     def delete_file(self, s3file):
+         """
+         Delete a file from the AWS S3 bucket
+         """
+         # Refuse to delete anything from development instances. We don't
+         # have separate S3 buckets / directories for production and devel,
+         # so removing a file on a devel instance would mean data loss for
+         # production. We therefore remove files only on production and let
+         # devel instances count the same files multiple times.
+         if gethostname() != PRODUCTION_HOSTNAME:
+             log.debug("Not deleting %s on a dev instance", s3file)
+             return
+         if self.dry_run:
+             log.info("Dry run: would delete %s", s3file)
+             return
+         self.s3.delete_object(Bucket=self.bucket, Key=s3file)
+
+
+ def gunzip(path):
+     """
+     Take a .gz file and uncompress it in the same directory
+     """
+     with gzip.open(path, "rb") as src:
+         # Strip the ".gz" suffix explicitly; str.rstrip() would treat its
+         # argument as a set of characters to remove, not as a suffix
+         with open(path[:-len(".gz")], "w") as dst:
+             dst.write(src.read().decode("utf-8"))
+     return dst.name
+
+
+ def parse_access_file(path):
+     """
+     Take a raw access file and return its contents as a list of dicts.
+     """
+     with open(path, "r") as fd:
+         content = fd.readlines()
+
+     # The file starts with meta information and thanks to #Fields, we know
+     # what each column means.
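+     # For illustration, a CloudFront access log typically begins like this
+     # (the exact field list varies):
+     #     #Version: 1.0
+     #     #Fields: date time x-edge-location sc-bytes c-ip ... cs-uri-stem ...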
+     assert content[0].startswith("#Version:")
+     assert content[1].startswith("#Fields:")
+     # Cut off the "#Fields:" prefix explicitly; lstrip() can't be used here
+     # because it strips a set of characters, not a prefix
+     keys = content[1][len("#Fields:"):].split()
+
+     accesses = []
+     for line in content[2:]:
+         # Make sure we are not parsing any more meta information
+         assert not line.startswith("#")
+
+         # Combine field names and this row's values to create a dict
+         values = line.split()
+         access = dict(zip(keys, values))
+         accesses.append(access)
+
+     return accesses
+
+
+ def get_hit_data(accesses):
+     """
+     Prepare body for the frontend request in the same format that
+     copr_log_hitcounter.py does.
+     """
+     hits = {}
+     timestamps = []
+     for access in accesses:
+         url = access["cs-uri-stem"]
+         key_strings = url_to_key_strings(url)
+
+         # We don't want to count every accessed URL, only those pointing to
+         # RPM files and repo files
+         if not key_strings:
+             log.debug("Skipping: %s", url)
+             continue
+
+         log.debug("Processing: %s", url)
+
+         # When counting RPM accesses, we want to increment both project hits
+         # and chroot hits, so one URL can produce multiple `key_strings`
+         for key_str in key_strings:
+             hits.setdefault(key_str, 0)
+             hits[key_str] += 1
+
+         # Remember this access timestamp
+         datetime_format = "%Y-%m-%d %H:%M:%S"
+         datetime_string = "{0} {1}".format(access["date"], access["time"])
+         datetime_object = datetime.strptime(datetime_string, datetime_format)
+         timestamps.append(int(datetime_object.timestamp()))
+
+     return {
+         "ts_from": min(timestamps),
+         "ts_to": max(timestamps),
+         "hits": hits,
+     } if hits else {}
+
+
+ def update_frontend(accesses, dry_run=False):
+     """
+     Increment frontend statistics based on these `accesses`
+     """
+     result = get_hit_data(accesses)
+     if not result:
+         log.debug("No recognizable hits among these accesses, skipping.")
+         return
+
+     log.info(
+         "Sending: %i results from %i to %i",
+         len(result["hits"]),
+         result["ts_from"],
+         result["ts_to"]
+     )
+     log.debug("Hits: %s", result["hits"])
+
+     opts = BackendConfigReader().read()
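+     # The resulting URL is e.g.
+     # "https://copr.fedorainfracloud.org/stats_rcv/from_backend"
+     # (base URL shown for illustration only; it comes from the backend
+     # config file)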
+     url = os.path.join(
+         opts.frontend_base_url,
+         "stats_rcv",
+         "from_backend",
+     )
+     if not dry_run:
+         SafeRequest(auth=opts.frontend_auth).post(url, result)
+
+
+ def get_arg_parser():
+     """
+     Generate the argument parser for this script
+     """
+     name = os.path.basename(__file__)
+     description = (
+         "Download HTTP access logs from AWS S3 storage, parse them, "
+         "increment the hit counters on the frontend, and clean up."
+     )
+     parser = argparse.ArgumentParser(name, description=description)
+     parser.add_argument(
+         "--dry-run",
+         action="store_true",
+         help=("Do not perform any destructive changes, only print what "
+               "would happen"))
+     parser.add_argument(
+         "--verbose",
+         action="store_true",
+         help="Print verbose information about what is going on")
+     return parser
+
+
+ def main():
+     """
+     Main function
+     """
+     parser = get_arg_parser()
+     args = parser.parse_args()
+     tmp = tempfile.mkdtemp(prefix="copr-aws-s3-hitcounter-")
+
+     if args.verbose:
+         log.setLevel(logging.DEBUG)
+
+     s3 = S3Bucket(dry_run=args.dry_run)
+     for s3file in s3.list_files():
+         gz = s3.download_file(s3file, dstdir=tmp)
+         raw = gunzip(gz)
+         accesses = parse_access_file(raw)
+
+         # Maybe we want to use some locking or transaction mechanism here:
+         # if we increment the accesses on the frontend but then leave the
+         # S3 file untouched, the next run would parse the same file and
+         # increment the counters again.
+         update_frontend(accesses, dry_run=args.dry_run)
+         s3.delete_file(s3file)
+
+         # Clean up all temporary files
+         for path in [gz, raw]:
+             os.remove(path)
+
+     os.removedirs(tmp)
+
+
+ if __name__ == "__main__":
+     main()