From 89f3cf49912dfa81da0ffc66bcd6f592c5953b6c Mon Sep 17 00:00:00 2001
From: Jakub Kadlcik
Date: May 11 2022 06:29:16 +0000
Subject: backend: consolidate the two hitcounter scripts


- I am moving the shared code into `hitcounter.py`
- Fixing the outdated parts of the original `copr_log_hitcounter.py` script, such as frontend authentication
- Using the bot filter even for copr-aws-s3-hitcounter

Merges: #2191

---

diff --git a/backend/copr_backend/hitcounter.py b/backend/copr_backend/hitcounter.py
new file mode 100644
index 0000000..4d16616
--- /dev/null
+++ b/backend/copr_backend/hitcounter.py
@@ -0,0 +1,141 @@
+"""
+Shared logic for hitcounter scripts
+"""
+
+import os
+import re
+from datetime import datetime
+from copr_common.request import SafeRequest
+from copr_backend.helpers import BackendConfigReader
+
+base_regex = "/results/(?P<owner>[^/]*)/(?P<project>[^/]*)/(?P<chroot>[^/]*)/"
+repomd_url_regex = re.compile(base_regex + "repodata/repomd.xml", re.IGNORECASE)
+rpm_url_regex = re.compile(
+    base_regex + r"(?P<build_dir>[^/]*)/(?P<rpm>[^/]*\.rpm)", re.IGNORECASE)
+
+spider_regex = re.compile(
+    '.*(ahrefs|bot/[0-9]|bingbot|borg|google|googlebot|yahoo|slurp|msnbot'
+    '|openbot|archiver|netresearch|lycos|scooter|altavista|teoma|gigabot'
+    '|blitzbot|oegp|charlotte|furlbot|http://client|polybot|htdig|ichiro'
+    '|larbin|pompos|scrubby|searchsight|seekbot|semanticdiscovery|silk|snappy'
+    '|spider|voila|vortex|voyager|zao|zeal|fast-webcrawler|converacrawler'
+    '|msrbot|baiduspider|mogimogi|speedy|dataparksearch'
+    '|findlinks|crawler|yandex|blexbot|semrushbot).*',
+    re.IGNORECASE)
+
+
+def url_to_key_strings(url):
+    """
+    Take a full URL and return a list of unique strings representing it
+    that copr-frontend will understand.
+    """
+    url_match = repomd_url_regex.match(url)
+    if url_match:
+        chroot_key = (
+            'chroot_repo_metadata_dl_stat',
+            url_match.group('owner'),
+            url_match.group('project'),
+            url_match.group('chroot')
+        )
+        chroot_key_str = '|'.join(chroot_key)
+        return [chroot_key_str]
+
+    url_match = rpm_url_regex.match(url)
+    if url_match:
+        chroot_key = (
+            'chroot_rpms_dl_stat',
+            url_match.group('owner'),
+            url_match.group('project'),
+            url_match.group('chroot')
+        )
+        chroot_key_str = '|'.join(chroot_key)
+        project_key = (
+            'project_rpms_dl_stat',
+            url_match.group('owner'),
+            url_match.group('project')
+        )
+        project_key_str = '|'.join(project_key)
+        return [chroot_key_str, project_key_str]
+
+    return []
+
+
+def update_frontend(accesses, log, dry_run=False):
+    """
+    Increment frontend statistics based on these `accesses`
+    """
+    result = get_hit_data(accesses, log)
+    if not result:
+        log.debug("No recognizable hits among these accesses, skipping.")
+        return
+
+    log.info(
+        "Sending: %i results from %i to %i",
+        len(result["hits"]),
+        result["ts_from"],
+        result["ts_to"]
+    )
+    if len(result["hits"]) < 100:
+        log.debug("Hits: %s", result["hits"])
+    else:
+        log.debug("Not logging the whole dict: %s hits", len(result["hits"]))
+
+    opts = BackendConfigReader().read()
+    url = os.path.join(
+        opts.frontend_base_url,
+        "stats_rcv",
+        "from_backend",
+    )
+    if not dry_run:
+        SafeRequest(auth=opts.frontend_auth, log=log).post(url, result)
+
+
+def get_hit_data(accesses, log):
+    """
+    Prepare body for the frontend request in the same format that
+    copr_log_hitcounter.py does.
+    """
+    hits = {}
+    timestamps = []
+    for access in accesses:
+        url = access["cs-uri-stem"]
+
+        if access["sc-status"] == "404":
+            log.debug("Skipping: %s (404 Not Found)", url)
+            continue
+
+        if access["cs(User-Agent)"].startswith("Mock"):
+            log.debug("Skipping: %s (user-agent: Mock)", url)
+            continue
+
+        bot = spider_regex.match(access["cs(User-Agent)"])
+        if bot:
+            log.debug("Skipping: %s (user-agent '%s' is a known bot)",
+                      url, bot.group(1))
+            continue
+
+        # We don't want to count every accessed URL, only those pointing to
+        # RPM files and repo file
+        key_strings = url_to_key_strings(url)
+        if not key_strings:
+            log.debug("Skipping: %s", url)
+            continue
+
+        log.debug("Processing: %s", url)
+
+        # When counting RPM access, we want to iterate both project hits and
+        # chroot hits. That way we can get multiple `key_strings` for one URL
+        for key_str in key_strings:
+            hits.setdefault(key_str, 0)
+            hits[key_str] += 1
+
+        # Remember this access timestamp
+        datetime_format = "%Y-%m-%d %H:%M:%S"
+        datetime_string = "{0} {1}".format(access["date"], access["time"])
+        datetime_object = datetime.strptime(datetime_string, datetime_format)
+        timestamps.append(int(datetime_object.timestamp()))
+
+    return {
+        "ts_from": min(timestamps),
+        "ts_to": max(timestamps),
+        "hits": hits,
+    } if hits else {}
diff --git a/backend/run/copr-aws-s3-hitcounter b/backend/run/copr-aws-s3-hitcounter
index 6c2b38d..b528589 100755
--- a/backend/run/copr-aws-s3-hitcounter
+++ b/backend/run/copr-aws-s3-hitcounter
@@ -21,12 +21,10 @@ import argparse
 import logging
 import tempfile
 import gzip
-from datetime import datetime
 from socket import gethostname
 
 import boto3
-from copr_common.request import SafeRequest
-from copr_log_hitcounter import url_to_key_strings
-from copr_backend.helpers import BackendConfigReader, setup_script_logger
+from copr_backend.hitcounter import update_frontend
+from copr_backend.helpers import setup_script_logger
 
 # We will allow only this hostname to delete files from the S3 storage
@@ -124,79 +122,6 @@ def parse_access_file(path):
     return accesses
 
 
-def get_hit_data(accesses):
-    """
-    Prepare body for the frontend request in the same format that
-    copr_log_hitcounter.py does.
-    """
-    hits = {}
-    timestamps = []
-    for access in accesses:
-        url = access["cs-uri-stem"]
-
-        if access["sc-status"] == "404":
-            log.debug("Skipping: %s (404 Not Found)", url)
-            continue
-
-        if access["cs(User-Agent)"].startswith("Mock"):
-            log.debug("Skipping: %s (user-agent: Mock)", url)
-            continue
-
-        # We don't want to count every accessed URL, only those pointing to
-        # RPM files and repo file
-        key_strings = url_to_key_strings(url)
-        if not key_strings:
-            log.debug("Skipping: %s", url)
-            continue
-
-        log.debug("Processing: %s", url)
-
-        # When counting RPM access, we want to iterate both project hits and
-        # chroot hits. That way we can get multiple `key_strings` for one URL
-        for key_str in key_strings:
-            hits.setdefault(key_str, 0)
-            hits[key_str] += 1
-
-        # Remember this access timestamp
-        datetime_format = "%Y-%m-%d %H:%M:%S"
-        datetime_string = "{0} {1}".format(access["date"], access["time"])
-        datetime_object = datetime.strptime(datetime_string, datetime_format)
-        timestamps.append(int(datetime_object.timestamp()))
-
-    return {
-        "ts_from": min(timestamps),
-        "ts_to": max(timestamps),
-        "hits": hits,
-    } if hits else {}
-
-
-def update_frontend(accesses, dry_run=False):
-    """
-    Increment frontend statistics based on these `accesses`
-    """
-    result = get_hit_data(accesses)
-    if not result:
-        log.debug("No recognizable hits among these accesses, skipping.")
-        return
-
-    log.info(
-        "Sending: %i results from %i to %i",
-        len(result["hits"]),
-        result["ts_from"],
-        result["ts_to"]
-    )
-    log.debug("Hits: %s", result["hits"])
-
-    opts = BackendConfigReader().read()
-    url = os.path.join(
-        opts.frontend_base_url,
-        "stats_rcv",
-        "from_backend",
-    )
-    if not dry_run:
-        SafeRequest(auth=opts.frontend_auth, log=log).post(url, result)
-
-
 def get_arg_parser():
     """
     Generate argument parser for this script
@@ -240,7 +165,7 @@ def main():
     # a scenario when we increment the accesses on the frontend but then
     # leave the s3 file untouched, which would result in parsing and
     # incrementing from the same file again in the next run
-    update_frontend(accesses, dry_run=args.dry_run)
+    update_frontend(accesses, log=log, dry_run=args.dry_run)
     s3.delete_file(s3file)
 
     # Clean all temporary files
diff --git a/backend/run/copr_log_hitcounter.py b/backend/run/copr_log_hitcounter.py
index 0248af7..8c2fef1 100755
--- a/backend/run/copr_log_hitcounter.py
+++ b/backend/run/copr_log_hitcounter.py
@@ -1,170 +1,91 @@
 #!/usr/bin/python3
-# This is script is supposed to be run daily from lighttpd logrotate, e.g.
-#   prerotate
-#     /usr/bin/copr_log_hitcounter.py /var/log/lighttpd/access.log --ignore-subnets 172.25.80.0/20 209.132.184.33/24 || :
-#   endscript
+"""
+This script is supposed to be run daily from lighttpd logrotate, e.g.
+    prerotate
+      /usr/bin/copr_log_hitcounter.py /var/log/lighttpd/access.log \
+          --ignore-subnets 172.25.80.0/20 209.132.184.33/24 || :
+    endscript
+"""
 
 import re
-import sys
-import requests
-import json
 import os
 import logging
 import argparse
-import netaddr
-import time
+from datetime import datetime
+from copr_backend.helpers import setup_script_logger
+from copr_backend.hitcounter import update_frontend
 
-from dateutil.parser import parse as dt_parse
-from netaddr import IPNetwork, IPAddress
-
-from collections import defaultdict
-from copr_backend.helpers import BackendConfigReader, setup_script_logger
 
-opts = BackendConfigReader().read()
 log = logging.getLogger(__name__)
 setup_script_logger(log, "/var/log/copr-backend/hitcounter.log")
 
-spider_regex = re.compile('.*(ahrefs|bot/[0-9]|bingbot|borg|google|googlebot|yahoo|slurp|msnbot|msrbot'
-                          '|openbot|archiver|netresearch|lycos|scooter|altavista|teoma|gigabot|baiduspider'
-                          '|blitzbot|oegp|charlotte|furlbot|http://client|polybot|htdig|ichiro|mogimogi'
-                          '|larbin|pompos|scrubby|searchsight|seekbot|semanticdiscovery|silk|snappy|speedy'
-                          '|spider|voila|vortex|voyager|zao|zeal|fast-webcrawler|converacrawler|dataparksearch'
-                          '|findlinks|crawler|yandex|blexbot|semrushbot).*', re.IGNORECASE)
-
 logline_regex = re.compile(
     r'(?P<ip_address>.*)\s+(?P<hostname>.*)\s+-\s+\[(?P<timestamp>.*)\]\s+'
     r'"GET (?P<url>.*)\s+(?P<protocol>.*)"\s+(?P<code>.*)\s+(?P<bytes_sent>.*)\s+'
    r'"(?P<referer>.*)"\s+"(?P<agent>.*)"', re.IGNORECASE)
 
-repomd_url_regex = re.compile("/results/(?P<owner>[^/]*)/(?P<project>[^/]*)/(?P<chroot>[^/]*)/repodata/repomd.xml", re.IGNORECASE)
-rpm_url_regex = re.compile("/results/(?P<owner>[^/]*)/(?P<project>[^/]*)/(?P<chroot>[^/]*)/(?P<build_dir>[^/]*)/(?P<rpm>[^/]*\.rpm)", re.IGNORECASE)
-
-datetime_regex = re.compile(".*\[(?P<date>[^:]*):(?P
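

For illustration, a minimal sketch of driving the consolidated copr_backend.hitcounter
module through the interface shown in the diff above. The owner/project/chroot values,
the RPM path, and the user-agent string are made-up sample data, and a real run needs a
copr-backend host because update_frontend() reads the backend config even with
dry_run=True.

    #!/usr/bin/python3
    """
    Usage sketch for copr_backend.hitcounter (sample data, dry-run only).
    """

    import logging

    from copr_backend.hitcounter import update_frontend, url_to_key_strings

    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger(__name__)

    # url_to_key_strings() maps result-repo URLs to the key strings that
    # copr-frontend aggregates on.
    print(url_to_key_strings(
        "/results/jdoe/hello/fedora-36-x86_64/repodata/repomd.xml"))
    # ['chroot_repo_metadata_dl_stat|jdoe|hello|fedora-36-x86_64']

    # update_frontend() consumes parsed access records keyed by the log fields
    # that get_hit_data() reads: cs-uri-stem, cs(User-Agent), sc-status, date, time.
    accesses = [{
        "cs-uri-stem": "/results/jdoe/hello/fedora-36-x86_64/"
                       "00000001-hello/hello-1.0-1.fc36.x86_64.rpm",
        "cs(User-Agent)": "dnf/4.10.0",
        "sc-status": "200",
        "date": "2022-05-11",
        "time": "06:29:16",
    }]
    update_frontend(accesses, log=log, dry_run=True)

Note that an RPM URL yields two key strings (a chroot_rpms_dl_stat key and a
project_rpms_dl_stat key), so one download increments both the per-chroot and the
per-project counters, while a repomd.xml URL yields only the per-chroot metadata key.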