#2037 backend: add hitcounter script for AWS CDN
Merged 2 years ago by praiskup. Opened 2 years ago by frostyx.
copr/ frostyx/copr s3-hitcounter  into  main

@@ -96,6 +96,7 @@ 

  Requires:   python3-retask

  Requires:   python3-setproctitle

  Requires:   python3-tabulate

+ Requires:   python3-boto3

  Requires:   redis

  Requires:   rpm-sign

  Requires:   rsync

@@ -41,3 +41,5 @@ 

      '[%(asctime)s][%(levelname)6s][PID:%(process)d][%(name)10s][%(filename)s:%(funcName)s:%(lineno)d] %(message)s')

  build_log_format = Formatter(

      '[%(asctime)s][%(levelname)6s][PID:%(process)d] %(message)s')

+ script_log_format = Formatter(

+     "[%(asctime)s][%(thread)s][%(levelname)6s]: %(message)s")

@@ -552,6 +552,33 @@ 

      return logger

  

  

def setup_script_logger(log, path):
    """
    Configure a logger for standalone backend scripts.

    Backend scripts should simply do:

        log = logging.getLogger(__name__)
        setup_script_logger(log, "/var/log/copr-backend/foo.log")

    :param log: a `logging.Logger` instance to be configured in place
    :param path: path of the persistent log file
    :return: the same `log` instance, for convenience
    """

    # Don't read copr config, just use INFO. Scripts should implement
    # some --verbose parameter for debug information
    log.setLevel(logging.INFO)

    # Drop the default handler, we will create it ourselves
    log.handlers = []

    # Print to stdout (bare messages, for interactive use)
    stdout_log = logging.StreamHandler(sys.stdout)
    stdout_log.setFormatter(logging.Formatter("%(message)s"))
    log.addHandler(stdout_log)

    # Add file logging. Without an explicit formatter the file would contain
    # only bare messages with no timestamps or levels, which makes the
    # persistent log useless for later debugging.
    file_log = logging.FileHandler(path)
    file_log.setFormatter(logging.Formatter(
        "[%(asctime)s][%(thread)s][%(levelname)6s]: %(message)s"))
    log.addHandler(file_log)

    return log

+ 

+ 

  def create_file_logger(name, filepath, fmt=None):

      logger = logging.getLogger(name)

  

@@ -0,0 +1,240 @@ 

+ #!/usr/bin/python3

+ # pylint: disable=invalid-name

+ 

+ """

+ Download HTTP access logs from AWS s3 storage, parse them, increment on

+ frontend, and clean up.

+ 

+ AWS CLI cheatsheet:

+ 

+     aws s3 ls

+     aws s3 ls s3://fedora-copr

+     aws s3 cp s3://fedora-copr/cloudwatch/E2PUZIRCXCOXTG.2021-12-07-15.00d7a244.gz ./

+ 

+ Token permissions are required:

+ https://pagure.io/fedora-infrastructure/issue/10395

+ """

+ 

+ 

+ import os

+ import argparse

+ import logging

+ import tempfile

+ import gzip

+ from datetime import datetime

+ from socket import gethostname

+ import boto3

+ from copr_common.request import SafeRequest

+ from copr_log_hitcounter import url_to_key_strings

+ from copr_backend.helpers import BackendConfigReader, setup_script_logger

+ 

+ 

+ # We will allow only this hostname to delete files from the S3 storage

+ PRODUCTION_HOSTNAME = "copr-be.aws.fedoraproject.org"

+ 

+ 

+ log = logging.getLogger(__name__)

+ setup_script_logger(log, "/var/log/copr-backend/hitcounter-s3.log")

+ 

+ 

class S3Bucket:
    """
    A high-level interface for interacting with files in the AWS s3 buckets
    """

    def __init__(self, bucket=None, directory=None, dry_run=False):
        """
        :param bucket: s3 bucket name, defaults to the Fedora Copr bucket
        :param directory: key prefix to operate on
        :param dry_run: when `True`, never delete anything from the bucket
        """
        self.s3 = boto3.client("s3")
        self.bucket = bucket or "fedora-copr"
        self.directory = directory or "cloudwatch/"
        self.dry_run = dry_run

    def list_files(self):
        """
        List all files within our AWS s3 bucket.

        A plain `list_objects` call returns at most 1000 keys per request
        while the bucket holds many more files, so we have to paginate to
        avoid silently skipping the rest.
        """
        paginator = self.s3.get_paginator("list_objects_v2")
        keys = []
        for page in paginator.paginate(Bucket=self.bucket,
                                       Prefix=self.directory):
            # "Contents" is missing entirely when a page has no objects
            keys.extend(x["Key"] for x in page.get("Contents", []))
        return keys

    def download_file(self, s3file, dstdir):
        """
        Download a file from AWS s3 bucket

        :param s3file: key of the object within the bucket
        :param dstdir: local directory to download into
        :return: local path of the downloaded file
        """
        dst = os.path.join(dstdir, os.path.basename(s3file))
        self.s3.download_file(self.bucket, s3file, dst)
        return dst

    def delete_file(self, s3file):
        """
        Delete a file from AWS s3 bucket
        """
        # Refusing to delete anything from development instances. We don't have
        # separate S3 buckets / directories for production and devel so removing
        # a file on a devel instance would mean a data loss for production.
        # We will remove files only from production and make devel instances
        # count them incorrectly multiple times.
        if gethostname() != PRODUCTION_HOSTNAME:
            log.debug("Not deleting %s on a dev instance", s3file)
            return
        if self.dry_run:
            log.debug("Dry run, not deleting %s", s3file)
            return
        self.s3.delete_object(Bucket=self.bucket, Key=s3file)

+ 

+ 

def gunzip(path):
    """
    Take a `.gz` file and uncompress it in the same directory.

    :param path: path to a gzipped file, e.g. `/tmp/access.log.gz`
    :return: path of the uncompressed file, e.g. `/tmp/access.log`
    """
    # Don't use `path.rstrip(".gz")` here - `str.rstrip` removes a *set of
    # characters*, not a suffix, so it would corrupt filenames ending with
    # 'g', 'z' or '.' (e.g. "access.log.gz" -> "access.lo")
    dst_path = path[:-len(".gz")] if path.endswith(".gz") else path
    with gzip.open(path, "rb") as src:
        with open(dst_path, "w") as dst:
            dst.write(src.read().decode("utf-8"))
    return dst_path

+ 

+ 

def parse_access_file(path):
    """
    Take a raw CloudFront access file and return its contents as a list of
    dicts, one dict per access, mapping field names to their values.

    :param path: path to an uncompressed access log file
    :raises ValueError: when the file doesn't have the expected format
    """
    with open(path, "r") as fd:
        content = fd.readlines()

    # The file starts with meta information and thanks to #Fields, we know what
    # each column means. Use real exceptions instead of `assert` because
    # asserts are stripped when python runs with -O.
    if len(content) < 2 or not content[0].startswith("#Version:"):
        raise ValueError("Unexpected file format, missing #Version header")
    if not content[1].startswith("#Fields:"):
        raise ValueError("Unexpected file format, missing #Fields header")

    # Drop the "#Fields:" token itself, the remaining tokens are column names.
    # Don't use `lstrip("#Fields:")` - it strips a character *set*, not a
    # prefix, and could eat leading characters of the first field name.
    keys = content[1].split()[1:]

    accesses = []
    for line in content[2:]:
        # Make sure we are not parsing any more meta information
        if line.startswith("#"):
            raise ValueError("Unexpected meta line after the header")

        # Combine field names and this row values to create a dict
        values = line.split()
        access = dict(zip(keys, values))
        accesses.append(access)

    return accesses

+ 

+ 

def get_hit_data(accesses):
    """
    Prepare body for the frontend request in the same format that
    copr_log_hitcounter.py does.

    :param accesses: list of dicts as returned by `parse_access_file`
    :return: dict with "ts_from", "ts_to" and "hits" keys, or an empty dict
        when no countable hits were found
    """
    # CloudFront stores date and time in two separate columns; the format is
    # loop-invariant, define it only once
    datetime_format = "%Y-%m-%d %H:%M:%S"

    hits = {}
    timestamps = []
    for access in accesses:
        url = access["cs-uri-stem"]
        key_strings = url_to_key_strings(url)

        # We don't want to count every accessed URL, only those pointing to
        # RPM files and repo file
        if not key_strings:
            log.debug("Skipping: %s", url)
            continue

        log.debug("Processing: %s", url)

        # When counting RPM access, we want to iterate both project hits and
        # chroot hits. That way we can get multiple `key_strings` for one URL
        for key_str in key_strings:
            hits.setdefault(key_str, 0)
            hits[key_str] += 1

        # Remember this access timestamp
        datetime_string = "{0} {1}".format(access["date"], access["time"])
        datetime_object = datetime.strptime(datetime_string, datetime_format)
        timestamps.append(int(datetime_object.timestamp()))

    return {
        "ts_from": min(timestamps),
        "ts_to": max(timestamps),
        "hits": hits,
    } if hits else {}

+ 

+ 

def update_frontend(accesses, dry_run=False):
    """
    Increment frontend statistics based on these `accesses`

    :param accesses: list of access dicts, see `parse_access_file`
    :param dry_run: when `True`, compute and log everything but don't send
        the actual POST request to the frontend
    """
    data = get_hit_data(accesses)
    if not data:
        log.debug("No recognizable hits among these accesses, skipping.")
        return

    log.info(
        "Sending: %i results from %i to %i",
        len(data["hits"]),
        data["ts_from"],
        data["ts_to"]
    )
    log.debug("Hits: %s", data["hits"])

    opts = BackendConfigReader().read()
    url = os.path.join(
        opts.frontend_base_url,
        "stats_rcv",
        "from_backend",
    )
    if dry_run:
        return
    SafeRequest(auth=opts.frontend_auth).post(url, data)

+ 

+ 

def get_arg_parser():
    """
    Build and return the command-line argument parser for this script.
    """
    description = (
        "Download HTTP access logs from AWS s3 storage, parse them, increment "
        "on frontend, and clean up."
    )
    parser = argparse.ArgumentParser(
        os.path.basename(__file__),
        description=description,
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help=("Do not perform any destructive changes, only print what "
              "would happen"))
    parser.add_argument(
        "--verbose",
        action="store_true",
        help=("Print verbose information about what is going on"))
    return parser

+ 

+ 

def main():
    """
    Script entry point: download each access log from S3, count the hits,
    report them to the frontend, and remove the processed files.
    """
    args = get_arg_parser().parse_args()
    tmpdir = tempfile.mkdtemp(prefix="copr-aws-s3-hitcounter-")

    if args.verbose:
        log.setLevel(logging.DEBUG)

    s3 = S3Bucket(dry_run=args.dry_run)
    for s3file in s3.list_files():
        gz_path = s3.download_file(s3file, dstdir=tmpdir)
        raw_path = gunzip(gz_path)
        accesses = parse_access_file(raw_path)

        # Maybe we want to use some locking or transaction mechanism to avoid
        # a scenario when we increment the accesses on the frontend but then
        # leave the s3 file untouched, which would result in parsing and
        # incrementing from the same file again in the next run
        update_frontend(accesses, dry_run=args.dry_run)
        s3.delete_file(s3file)

        # Clean all temporary files
        os.remove(gz_path)
        os.remove(raw_path)

    os.removedirs(tmpdir)


if __name__ == "__main__":
    main()

@@ -19,17 +19,12 @@ 

  from netaddr import IPNetwork, IPAddress

  

  from collections import defaultdict

- from copr_backend.helpers import BackendConfigReader

+ from copr_backend.helpers import BackendConfigReader, setup_script_logger

  

  opts = BackendConfigReader().read()

  

- logging.basicConfig(

-     filename="/var/log/copr-backend/hitcounter.log",

-     format='[%(asctime)s][%(thread)s][%(levelname)6s]: %(message)s',

-     level=logging.INFO)

- 

  log = logging.getLogger(__name__)

- log.addHandler(logging.StreamHandler(sys.stdout))

+ setup_script_logger(log, "/var/log/copr-backend/hitcounter.log")

  

  spider_regex = re.compile('.*(ahrefs|bot/[0-9]|bingbot|borg|google|googlebot|yahoo|slurp|msnbot|msrbot'

                            '|openbot|archiver|netresearch|lycos|scooter|altavista|teoma|gigabot|baiduspider'
@@ -88,36 +83,9 @@ 

              if spider_regex.match(m.group('agent')):

                  continue

  

-             url_match = repomd_url_regex.match(m.group('url'))

-             if url_match:

-                 chroot_key = (

-                     'chroot_repo_metadata_dl_stat',

-                     url_match.group('owner'),

-                     url_match.group('project'),

-                     url_match.group('chroot')

-                 )

-                 chroot_key_str = '|'.join(chroot_key)

-                 hits[chroot_key_str] += 1

-                 continue

+             for key_str in url_to_key_strings(m.group('url')):

+                 hits[key_str] += 1

  

-             url_match = rpm_url_regex.match(m.group('url'))

-             if url_match:

-                 chroot_key = (

-                     'chroot_rpms_dl_stat',

-                     url_match.group('owner'),

-                     url_match.group('project'),

-                     url_match.group('chroot')

-                 )

-                 chroot_key_str = '|'.join(chroot_key)

-                 hits[chroot_key_str] += 1

-                 project_key = (

-                     'project_rpms_dl_stat',

-                     url_match.group('owner'),

-                     url_match.group('project')

-                 )

-                 project_key_str = '|'.join(project_key)

-                 hits[project_key_str] += 1

-                 continue

          last_line = logline

  

      return {
@@ -127,6 +95,41 @@ 

      }

  

  

def url_to_key_strings(url):
    """
    Take a full URL and return a list of unique strings representing it,
    that copr-frontend will understand.
    """
    # Downloads of repomd.xml count only towards the chroot repodata stat
    match = repomd_url_regex.match(url)
    if match:
        return ['|'.join((
            'chroot_repo_metadata_dl_stat',
            match.group('owner'),
            match.group('project'),
            match.group('chroot'),
        ))]

    # An RPM download counts towards both the chroot and the project stat
    match = rpm_url_regex.match(url)
    if match:
        owner = match.group('owner')
        project = match.group('project')
        chroot_key_str = '|'.join(
            ('chroot_rpms_dl_stat', owner, project, match.group('chroot')))
        project_key_str = '|'.join(
            ('project_rpms_dl_stat', owner, project))
        return [chroot_key_str, project_key_str]

    # Not an URL that we are interested in counting
    return []

+ 

+ 

  def get_timestamp(logline):

      if not logline:

          return None

@@ -40,12 +40,6 @@ 

      return ''.join(random.choice(string.ascii_lowercase) for x in range(size))

  

  

- REPO_DL_STAT_FMT = "repo_dl_stat::{copr_user}@{copr_project_name}:{copr_name_release}"

- CHROOT_REPO_MD_DL_STAT_FMT = "chroot_repo_metadata_dl_stat:hset::{copr_user}@{copr_project_name}:{copr_chroot}"

- CHROOT_RPMS_DL_STAT_FMT = "chroot_rpms_dl_stat:hset::{copr_user}@{copr_project_name}:{copr_chroot}"

- PROJECT_RPMS_DL_STAT_FMT = "project_rpms_dl_stat:hset::{copr_user}@{copr_project_name}"

- 

- 

  FINISHED_STATES = ["succeeded", "forked", "canceled", "skipped", "failed"]

  FINISHED_STATUSES = [StatusEnum(s) for s in FINISHED_STATES]

  
@@ -100,8 +94,100 @@ 

  

  

  class CounterStatType(object):

+     # When a .repo file is generated on the frontend.

+     # It is displayed in the "Repo Download" column on project overview pages.

      REPO_DL = "repo_dl"

  

+     # When an RPM or SRPM file is downloaded directly from the backend

+     # or from Amazon AWS CDN.

+     # It is displayed in the "Architectures" column on project overview pages.

+     CHROOT_RPMS_DL = "chroot_rpms_dl"

+ 

+     # When a repodata/repomd.xml RPM or SRPM file is downloaded directly from

+     # the backend or from Amazon AWS CDN.

+     # We are counting but not using this information anywhere.

+     CHROOT_REPO_MD_DL = "chroot_repo_metadata_dl"

+ 

+     # This should equal to the sum of all `CHROOT_RPMS_DL` stats within

+     # a project. It is a redundant information to be stored, since it can be

+     # calculated. But our `model.CounterStat` design isn't friendly for querying

+     # based on owner/project.

+     # We are counting but not using this information anywhere.

+     PROJECT_RPMS_DL = "project_rpms_dl"

+ 

+ 

+ def get_stat_name(stat_type, copr_dir=None, copr_chroot=None,

+                   name_release=None, key_string=None):

+     """

+     Generate a `models.CounterStat.name` value based on various optional

+     parameters. Only a subset of parameters needs to be set based on the context

+     and `stat_type`.

+ 

+     This method is too complicated and messy, we should either minimize the

+     number of input parameters if possible, or turn this into a class with

+     methods for each `stat_type` or use-case.

+     """

+ 

+     # TODO These are way too complicated

+     # We should get rid of the redis syntax (hset::)

+     #

+     # We should not start the value with `stat_type` because we already have

+     # `CounterStat.counter_type` for that.

+     #

+     # We should probably add `CounterStat.copr_id` to remove `copr_user`

+     # and `copr_project` from the strings

+     # pylint: disable=invalid-name

+     REPO_DL_STAT_FMT = "repo_dl_stat::{copr_user}@{copr_project_name}:{copr_name_release}"

+     CHROOT_REPO_MD_DL_STAT_FMT = "chroot_repo_metadata_dl_stat:hset::{copr_user}@{copr_project_name}:{copr_chroot}"

+     CHROOT_RPMS_DL_STAT_FMT = "chroot_rpms_dl_stat:hset::{copr_user}@{copr_project_name}:{copr_chroot}"

+     PROJECT_RPMS_DL_STAT_FMT = "project_rpms_dl_stat:hset::{copr_user}@{copr_project_name}"

+ 

+     stat_fmt = {

+         CounterStatType.REPO_DL: REPO_DL_STAT_FMT,

+         CounterStatType.CHROOT_REPO_MD_DL: CHROOT_REPO_MD_DL_STAT_FMT,

+         CounterStatType.CHROOT_RPMS_DL: CHROOT_RPMS_DL_STAT_FMT,

+         CounterStatType.PROJECT_RPMS_DL: PROJECT_RPMS_DL_STAT_FMT,

+     }.get(stat_type)

+ 

+     if not stat_fmt:

+         raise ValueError("Unexpected stat_type")

+ 

+     kwargs = {}

+     if name_release:

+         kwargs["copr_name_release"] = name_release

+ 

+     if copr_dir:

+         kwargs.update({

+             "copr_user": copr_dir.copr.owner_name,

+             "copr_project_name": copr_dir.copr.name,

+         })

+ 

+     if copr_chroot:

+         kwargs.update({

+             "copr_user": copr_chroot.copr.owner_name,

+             "copr_project_name": copr_chroot.copr.name,

+             "copr_chroot": copr_chroot.name,

+         })

+ 

+     # The key strings come from backend hitcounter scripts, e.g.

+     # 'project_rpms_dl_stat|jvanek|java17'

+     if key_string:

+         keys = key_string.split("|")

+         kwargs.update({

+             "copr_user": keys[0],

+             "copr_project_name": keys[1],

+         })

+ 

+     # Also from backend hitcounter scripts, e.g.

+     # 'chroot_repo_metadata_dl_stat|jose_exposito|touchegg|fedora-35-x86_64'

+     # 'chroot_rpms_dl_stat|packit|psss-tmt-896|fedora-rawhide-x86_64'

+     if key_string and stat_type in [CounterStatType.CHROOT_RPMS_DL,

+                                     CounterStatType.CHROOT_REPO_MD_DL]:

+         keys = key_string.split("|")

+         kwargs["copr_chroot"] = keys[2]

+ 

+     return stat_fmt.format(**kwargs)

+ 

  

  class PermissionEnum(metaclass=EnumType):

      # The text form is part of APIv3!

@@ -10,7 +10,6 @@ 

  from copr_common.enums import StatusEnum

  from coprs import app

  from coprs import db

- from coprs import rcp

  from coprs import helpers

  from coprs import models

  from coprs import exceptions
@@ -19,10 +18,10 @@ 

  from coprs.logic.batches_logic import BatchesLogic

  from coprs.logic.packages_logic import PackagesLogic

  from coprs.logic.actions_logic import ActionsLogic

+ from coprs.logic.stat_logic import CounterStatLogic

  

  from coprs.logic.users_logic import UsersLogic

  from coprs.models import User, Copr

- from coprs.rmodels import TimedStatEvents

  from coprs.logic.coprs_logic import (CoprsLogic, CoprDirsLogic, CoprChrootsLogic,

                                       PinnedCoprsLogic, MockChrootsLogic)

  
@@ -569,15 +568,13 @@ 

          """

          Calculate how many times a repo for given `copr_chroot` was downloaded

          """

-         chroot_rpms_dl_stat_key = helpers.CHROOT_RPMS_DL_STAT_FMT.format(

-             copr_user=copr_chroot.copr.owner_name,

-             copr_project_name=copr_chroot.copr.name,

-             copr_chroot=copr_chroot.name,

-         )

-         return TimedStatEvents.get_count(

-             rconnect=rcp.get_connection(),

-             name=chroot_rpms_dl_stat_key,

+         stat_type = helpers.CounterStatType.CHROOT_RPMS_DL

+         stat_name = helpers.get_stat_name(

+             stat_type=stat_type,

+             copr_chroot=copr_chroot,

          )

+         stat = CounterStatLogic.get(stat_name).first()

+         return stat.counter if stat else 0

  

      @classmethod

      def _delete_reason(cls, copr_chroots):

@@ -6,9 +6,6 @@ 

  from coprs import db

  from coprs.models import CounterStat

  from coprs import helpers

- from coprs.helpers import REPO_DL_STAT_FMT, CHROOT_REPO_MD_DL_STAT_FMT, \

-     CHROOT_RPMS_DL_STAT_FMT, PROJECT_RPMS_DL_STAT_FMT

- from coprs.rmodels import TimedStatEvents

  

  

  class CounterStatLogic(object):
@@ -36,16 +33,16 @@ 

          return csl

  

      @classmethod

-     def incr(cls, name, counter_type):

+     def incr(cls, name, counter_type, count=1):

          """

          Warning: dirty method: does commit if missing stat record.

          """

          try:

              csl = CounterStatLogic.get(name).one()

-             csl.counter = CounterStat.counter + 1

+             csl.counter = CounterStat.counter + count

          except NoResultFound:

              csl = CounterStatLogic.add(name, counter_type)

-             csl.counter = 1

+             csl.counter = count

  

          db.session.add(csl)

          return csl
@@ -55,12 +52,12 @@ 

          # chroot -> stat_name

          chroot_by_stat_name = {}

          for chroot in copr.active_chroots:

-             kwargs = {

-                 "copr_user": copr.user.name,

-                 "copr_project_name": copr.name,

-                 "copr_name_release": chroot.name_release

-             }

-             chroot_by_stat_name[REPO_DL_STAT_FMT.format(**kwargs)] = chroot.name_release

+             stat_name = helpers.get_stat_name(

+                 stat_type=helpers.CounterStatType.REPO_DL,

+                 copr_dir=copr.main_dir,

+                 name_release=chroot.name_release,

+             )

+             chroot_by_stat_name[stat_name] = chroot.name_release

  

          # [{counter: <value>, name: <stat_name>}, ...]

          stats = cls.get_multiply_same_type(counter_type=helpers.CounterStatType.REPO_DL,
@@ -74,59 +71,29 @@ 

          return repo_dl_stats

  

  

- def handle_be_stat_message(rc, stat_data):

+ def handle_be_stat_message(stat_data):

      """

-     :param rc: connection to redis

-     :type rc: StrictRedis

- 

      :param stat_data: stats from backend

      :type stat_data: dict

      """

      app.logger.debug('Got stat data: {}'.format(stat_data))

  

-     ts_from = int(stat_data['ts_from'])

-     ts_to = int(stat_data['ts_to'])

      hits = stat_data['hits']

+     for key_str, count in hits.items():

+         stat_type, key_string = key_str.split("|", 1)

  

-     if not ts_from or not ts_to or ts_from > ts_to or not hits:

-         raise Exception("Invalid or empty data received.")

- 

-     ts_from_stored = int(rc.get('handle_be_stat_message_ts_from') or 0)

-     ts_to_stored = int(rc.get('handle_be_stat_message_ts_to') or 0)

- 

-     app.logger.debug('ts_from: {}'.format(ts_from))

-     app.logger.debug('ts_to: {}'.format(ts_to))

-     app.logger.debug('ts_from_stored: {}'.format(ts_from_stored))

-     app.logger.debug('ts_to_stored: {}'.format(ts_to_stored))

+         # FIXME the keys from backend doesn't match CounterStatType exactly

+         stat_type = stat_type.rstrip("_stat")

  

-     if (ts_from < ts_to_stored and ts_to > ts_from_stored):

-         app.logger.debug('Time overlap with already stored data. Skipping.')

-         return

+         assert stat_type in [

+             helpers.CounterStatType.REPO_DL,

+             helpers.CounterStatType.CHROOT_REPO_MD_DL,

+             helpers.CounterStatType.CHROOT_RPMS_DL,

+             helpers.CounterStatType.PROJECT_RPMS_DL,

+         ]

  

-     hits_formatted = defaultdict(int)

-     for key_str, count in hits.items():

-         key = key_str.split('|')

-         if key[0] == 'chroot_repo_metadata_dl_stat':

-             redis_key = CHROOT_REPO_MD_DL_STAT_FMT.format(

-                 copr_user=key[1],

-                 copr_project_name=key[2],

-                 copr_chroot=key[3])

-         elif key[0] == 'chroot_rpms_dl_stat':

-             redis_key = CHROOT_RPMS_DL_STAT_FMT.format(

-                 copr_user=key[1],

-                 copr_project_name=key[2],

-                 copr_chroot=key[3])

-         elif key[0] == 'project_rpms_dl_stat':

-             redis_key = PROJECT_RPMS_DL_STAT_FMT.format(

-                 copr_user=key[1],

-                 copr_project_name=key[2])

-         else:

-             raise Exception('Unknown key {}'.format(key[0]))

- 

-         hits_formatted[redis_key] += count

- 

-     for redis_key, count in hits_formatted.items():

-         TimedStatEvents.add_event(rc, redis_key, count=count, timestamp=ts_to)

- 

-     rc.set('handle_be_stat_message_ts_from', ts_from)

-     rc.set('handle_be_stat_message_ts_to', ts_to)

+         stat_name = helpers.get_stat_name(

+             stat_type=stat_type,

+             key_string=key_string,

+         )

+         CounterStatLogic.incr(stat_name, stat_type, count)

@@ -49,7 +49,7 @@ 

  

  from coprs.logic import builds_logic, coprs_logic, actions_logic, users_logic

  from coprs.helpers import generate_repo_url, \

-     url_for_copr_view, REPO_DL_STAT_FMT, CounterStatType, generate_repo_name, \

+     url_for_copr_view, CounterStatType, generate_repo_name, \

      WorkList

  

  
@@ -998,11 +998,11 @@ 

      response.headers["Content-Disposition"] = \

          "filename={0}.repo".format(copr_dir.repo_name)

  

-     name = REPO_DL_STAT_FMT.format(**{

-         'copr_user': copr_dir.copr.user.name,

-         'copr_project_name': copr_dir.copr.name,

-         'copr_name_release': name_release,

-     })

+     name = helpers.get_stat_name(

+         CounterStatType.REPO_DL,

+         copr_dir=copr_dir,

+         name_release=name_release,

+     )

      CounterStatLogic.incr(name=name, counter_type=CounterStatType.REPO_DL)

      db.session.commit()

  

@@ -3,13 +3,12 @@ 

  import datetime

  import functools

  from functools import wraps, partial

+ from urllib.parse import urlparse

  

- from netaddr import IPAddress, IPNetwork

  import re

  import flask

  from flask import send_file

  

- from urllib.parse import urlparse

  from openid_teams.teams import TeamsRequest

  

  from copr_common.enums import RoleEnum
@@ -364,20 +363,6 @@ 

      return decorated_function

  

  

- def intranet_required(f):

-     @functools.wraps(f)

-     def decorated_function(*args, **kwargs):

-         ip_addr = IPAddress(flask.request.remote_addr)

-         accept_ranges = set(app.config.get("INTRANET_IPS", []))

-         accept_ranges.add("127.0.0.1")  # always accept from localhost

-         if not any(ip_addr in IPNetwork(addr_or_net) for addr_or_net in accept_ranges):

-             return ("Stats can be update only from intranet hosts, "

-                     "not {}, check config\n".format(flask.request.remote_addr)), 403

- 

-         return f(*args, **kwargs)

-     return decorated_function

- 

- 

  def req_with_copr(f):

      @wraps(f)

      def wrapper(**kwargs):

@@ -1,11 +1,10 @@ 

  # coding: utf-8

  

  import flask

- import json

- from coprs import rcp

  from coprs import app

  from coprs import db

- from ..misc import intranet_required

+ from coprs.exceptions import CoprHttpException

+ from coprs.views.misc import backend_authenticated

  from . import stats_rcv_ns

  from ...logic.stat_logic import CounterStatLogic, handle_be_stat_message

  
@@ -16,7 +15,7 @@ 

  

  

  @stats_rcv_ns.route("/<counter_type>/<name>/", methods=['POST'])

- @intranet_required

+ @backend_authenticated

  def increment(counter_type, name):

      app.logger.debug(flask.request.remote_addr)

  
@@ -26,11 +25,13 @@ 

  

  

  @stats_rcv_ns.route("/from_backend", methods=['POST'])

- @intranet_required

+ @backend_authenticated

  def backend_stat_message_handler():

      try:

-         handle_be_stat_message(rcp.get_connection(), json.loads(flask.request.json))

+         handle_be_stat_message(flask.request.json)

+         db.session.commit()

      except Exception as err:

          app.logger.exception(err)

+         raise CoprHttpException from err

  

      return "OK", 201

@@ -7,7 +7,6 @@ 

  from functools import wraps

  import datetime

  import uuid

- from unittest import mock

  

  import pytest

  import decorator
@@ -67,11 +66,6 @@ 

          #    os.makedirs(datadir)

          coprs.db.create_all()

          self.db.session.commit()

-         #coprs/views/coprs_ns/coprs_general.py

-         self.rmodel_TSE_coprs_general_patcher = mock.patch("coprs.logic.complex_logic.TimedStatEvents")

-         self.rmodel_TSE_coprs_general_mc = self.rmodel_TSE_coprs_general_patcher.start()

-         self.rmodel_TSE_coprs_general_mc.return_value.get_count.return_value = 0

-         self.rmodel_TSE_coprs_general_mc.return_value.add_event.return_value = None

  

          self.web_ui = WebUIRequests(self)

          self.api3 = API3Requests(self)
@@ -88,7 +82,6 @@ 

          # we actually need to drop it so the setup_method() re-creates the data

          self.db.engine.execute('drop table dist_git_instance')

  

-         self.rmodel_TSE_coprs_general_patcher.stop()

          self.app.config = self.original_config.copy()

          cache.clear()

  

Download HTTP access logs from AWS s3 storage, parse them, increment
on frontend, and clean up.

Metadata Update from @frostyx:
- Pull-request tagged with: wip

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

There are 3 known issues to me

  • It unintentionally logs into hitcounter.log instead of hitcounter-s3.log
  • It raises copr_common.request.RequestError: Request client error on https://copr-fe-dev.cloud.fedoraproject.org/stats_rcv/from_backend: 403 FORBIDDEN when trying to contact frontend. That is caused by @intranet_required on the frontend route but I haven't figured out how to fix it yet.
  • When I temporarily disable the decorator ^^ and successfully send the data to frontend, the values in a project overview don't increment. I am not yet sure why.

But I wanted to share the script as soon as possible even though it is not finished yet.

Thank you for the progress!

How large are the logs ATM? How many MB/day is stored?

How large are the logs ATM?

$ aws s3 ls s3://fedora-copr/cloudwatch/ |wc -l
15270

$ aws s3 ls s3://fedora-copr/cloudwatch/ | awk '{sum += $3}; {print sum}'
7469396044    # 7.4G

How many MB/day is stored?

$ aws s3 ls s3://fedora-copr/cloudwatch/ |grep 2022-01-16 |wc -l
410

$ aws s3 ls s3://fedora-copr/cloudwatch/ |grep 2022-01-16 | awk '{sum += $3}; {print sum}'
211062883    # ~200MB

rebased onto 935b886531aa533d4825f7c999dbb3d24243c5b6

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

I basically left the hitcounter script as-is and focused on the previously discussed frontend changes related to authentication and Redis. I added a lot of TODOs because I don't want to make this big change even more complicated, and rather do it in a follow-up PR.

From my POV, this PR is almost finished, I just want to properly document the types in helpers.CounterStatType. In the project overview, we have, e.g.

Fedora 35            x86_64 (0)*            (0 downloads)

And I am not really sure what number should be incremented when.

  • When clicking the, e.g. Fedora 35 button in the Repo Download field, the second value should be incremented.
  • When downloading RPM from backend (that we find by either of our hitcounter scripts), the first value should be incremented
  • What to do when repodata/repomd.xml file is downloaded from the backend?
  • For some reason we also used to store PROJECT_RPMS_DL_STAT_FMT into redis. I think whenever an RPM is downloaded, the chroot download count is incremented as well as project download count. Which is fine by me but do we show this value somewhere?

It unintentionally logs into hitcounter.log instead of hitcounter-s3.log

Oops, I should fix this as well

When clicking the, e.g. Fedora 35 button in the Repo Download field, the second value should be incremented.

But isn't this working even now?

When downloading RPM from backend (that we find by either of our hitcounter scripts), the first value should be incremented

Agreed.

What to do when repodata/repomd.xml file is downloaded from the backend?

Nothing?

For some reason we also used to store PROJECT_RPMS_DL_STAT_FMT into redis. I think whenever an RPM is downloaded, the chroot download count is incremented as well as project download count. Which is fine by me but do we show this value somewhere?

I don't think we do, but we could (why not).

But isn't this working even now?

It is, this is a frontend-only thing and it works even now. I just wanted to describe the whole picture.

What to do when repodata/repomd.xml file is downloaded from the backend?

Nothing?

Sounds reasonable to me :D
It's just that the previous hitcounter script parses and sends such accesses to frontend, where there is also some code related to it, see the CHROOT_REPO_MD_DL_STAT_FMT constant. I wasn't really sure what to do with it :D

Ad repomd.xml, the only special thing is that the CDN never caches it, it is always
downloaded (through CDN rewrite, not redirect) from the copr backend.

Which brings me to an important thing we should IMO do - try to
- ignore any CloudFronts (CDN) access from the lighttpd logs
- ignore accesses from the Copr builders (mock can set user_agent to a specific value)

4 new commits added

  • frontend: don't use redis as a middleman when updating hitcounter stats
  • frontend: use standard backend auth for updating stats
  • frontend: don't return 201 when the call fails with an exception
  • backend: add hitcounter script for AWS CDN
2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto 0a1a74b50d37a770562a7e387831d3177bafbdb2

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto 4c3f3336c1f365d1c26f8e95b497f35f4256bc2f

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

1 new commit added

  • backend: print only logging messages to the stdout
2 years ago

Merge Failed.

This change or one of its cross-repo dependencies was unable to be automatically merged with the current state of its repository. Please rebase the change and upload a new patchset.

rebased onto 7c3eb35db6d16438037ff70302f77bc78df7a131

2 years ago

Metadata Update from @frostyx:
- Pull-request untagged with: wip

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

7 new commits added

  • backend: print only logging messages to the stdout
  • backend: fix pylint import warnings
  • backend: add handler for logging into file in hitcounter scripts
  • frontend: don't use redis as a middleman when updating hitcounter stats
  • frontend: use standard backend auth for updating stats
  • frontend: don't return 201 when the call fails with an exception
  • backend: add hitcounter script for AWS CDN
2 years ago

Build succeeded.

Is the CI failure related to this PR?

This is pretty complicated to go one by one commit ... some commits seem to fix other commits. But overall, I think it is OK, so I'm +1 ... We still should take a better look at the useragent stuff, and others ... but that should be done later, before the next release.

This from ..misc import seems messy :-( can't we just from coprs.views.misc import?

Very nice contribution, btw.

7 new commits added

  • backend: print only logging messages to the stdout
  • backend: fix pylint import warnings
  • backend: add handler for logging into file in hitcounter scripts
  • frontend: don't use redis as a middleman when updating hitcounter stats
  • frontend: use standard backend auth for updating stats
  • frontend: don't return 201 when the call fails with an exception
  • backend: add hitcounter script for AWS CDN
2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto 75c2285d4ec86ebfa866f5fe99f3e122d799c5fa

2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

7 new commits added

  • backend: print only logging messages to the stdout
  • backend: fix pylint import warnings
  • backend: add handler for logging into file in hitcounter scripts
  • frontend: don't use redis as a middleman when updating hitcounter stats
  • frontend: use standard backend auth for updating stats
  • frontend: don't return 201 when the call fails with an exception
  • backend: add hitcounter script for AWS CDN
2 years ago

Build succeeded.

This from ..misc import seems messy :-( can't we just from coprs.views.misc import?

Good catch, thanks. I copy-pasted it from some other file.

Is the CI failure related to this PR?

It was, code was fine but tests were trying to mock something, that I got rid of. Fixed.

This is pretty complicated to go one by one commit ... some commits seem to fix other commits.

Sorry about that, I really tried to not do it this way but I cannot merge those commits into one because there is something in between that causes conflicts. However, only the commits related to logging should be this messy.

rebased onto 87bcb52

2 years ago

Pull-Request has been merged by praiskup

2 years ago

Build succeeded.