From 89f3cf49912dfa81da0ffc66bcd6f592c5953b6c Mon Sep 17 00:00:00 2001
From: Jakub Kadlcik
Date: May 11 2022 06:29:16 +0000
Subject: backend: consolidate the two hitcounter scripts


- I am moving the shared code into `hitcounter.py`
- Fixing the outdated parts of the original `copr_log_hitcounter.py` script, such as frontend authentication
- Using the bot filter even for copr-aws-s3-hitcounter

Merges: #2191

---

diff --git a/backend/copr_backend/hitcounter.py b/backend/copr_backend/hitcounter.py
new file mode 100644
index 0000000..4d16616
--- /dev/null
+++ b/backend/copr_backend/hitcounter.py
@@ -0,0 +1,141 @@
+"""
+Shared logic for hitcounter scripts
+"""
+
+import os
+import re
+from datetime import datetime
+from copr_common.request import SafeRequest
+from copr_backend.helpers import BackendConfigReader
+
+base_regex = "/results/(?P<owner>[^/]*)/(?P<project>[^/]*)/(?P<chroot>[^/]*)/"
+repomd_url_regex = re.compile(base_regex + "repodata/repomd.xml", re.IGNORECASE)
+rpm_url_regex = re.compile(
+    base_regex + r"(?P<build_dir>[^/]*)/(?P<rpm>[^/]*\.rpm)", re.IGNORECASE)
+
+spider_regex = re.compile(
+    '.*(ahrefs|bot/[0-9]|bingbot|borg|google|googlebot|yahoo|slurp|msnbot'
+    '|openbot|archiver|netresearch|lycos|scooter|altavista|teoma|gigabot'
+    '|blitzbot|oegp|charlotte|furlbot|http://client|polybot|htdig|ichiro'
+    '|larbin|pompos|scrubby|searchsight|seekbot|semanticdiscovery|silk|snappy'
+    '|spider|voila|vortex|voyager|zao|zeal|fast-webcrawler|converacrawler'
+    '|msrbot|baiduspider|mogimogi|speedy|dataparksearch'
+    '|findlinks|crawler|yandex|blexbot|semrushbot).*',
+    re.IGNORECASE)
+
+
+def url_to_key_strings(url):
+    """
+    Take a full URL and return a list of unique strings representing it
+    that copr-frontend will understand.
+    """
+    url_match = repomd_url_regex.match(url)
+    if url_match:
+        chroot_key = (
+            'chroot_repo_metadata_dl_stat',
+            url_match.group('owner'),
+            url_match.group('project'),
+            url_match.group('chroot')
+        )
+        chroot_key_str = '|'.join(chroot_key)
+        return [chroot_key_str]
+
+    url_match = rpm_url_regex.match(url)
+    if url_match:
+        chroot_key = (
+            'chroot_rpms_dl_stat',
+            url_match.group('owner'),
+            url_match.group('project'),
+            url_match.group('chroot')
+        )
+        chroot_key_str = '|'.join(chroot_key)
+        project_key = (
+            'project_rpms_dl_stat',
+            url_match.group('owner'),
+            url_match.group('project')
+        )
+        project_key_str = '|'.join(project_key)
+        return [chroot_key_str, project_key_str]
+
+    return []
+
+
+def update_frontend(accesses, log, dry_run=False):
+    """
+    Increment frontend statistics based on these `accesses`
+    """
+    result = get_hit_data(accesses, log)
+    if not result:
+        log.debug("No recognizable hits among these accesses, skipping.")
+        return
+
+    log.info(
+        "Sending: %i results from %i to %i",
+        len(result["hits"]),
+        result["ts_from"],
+        result["ts_to"]
+    )
+    if len(result["hits"]) < 100:
+        log.debug("Hits: %s", result["hits"])
+    else:
+        log.debug("Not logging the whole dict: %s hits", len(result["hits"]))
+
+    opts = BackendConfigReader().read()
+    url = os.path.join(
+        opts.frontend_base_url,
+        "stats_rcv",
+        "from_backend",
+    )
+    if not dry_run:
+        SafeRequest(auth=opts.frontend_auth, log=log).post(url, result)
+
+
+def get_hit_data(accesses, log):
+    """
+    Prepare body for the frontend request in the same format that
+    copr_log_hitcounter.py does.
+    """
+    hits = {}
+    timestamps = []
+    for access in accesses:
+        url = access["cs-uri-stem"]
+
+        if access["sc-status"] == "404":
+            log.debug("Skipping: %s (404 Not Found)", url)
+            continue
+
+        if access["cs(User-Agent)"].startswith("Mock"):
+            log.debug("Skipping: %s (user-agent: Mock)", url)
+            continue
+
+        bot = spider_regex.match(access["cs(User-Agent)"])
+        if bot:
+            log.debug("Skipping: %s (user-agent '%s' is a known bot)",
+                      url, bot.group(1))
+            continue
+
+        # We don't want to count every accessed URL, only those pointing to
+        # RPM files and repo file
+        key_strings = url_to_key_strings(url)
+        if not key_strings:
+            log.debug("Skipping: %s", url)
+            continue
+
+        log.debug("Processing: %s", url)
+
+        # When counting RPM access, we want to iterate both project hits and
+        # chroot hits. That way we can get multiple `key_strings` for one URL
+        for key_str in key_strings:
+            hits.setdefault(key_str, 0)
+            hits[key_str] += 1
+
+        # Remember this access timestamp
+        datetime_format = "%Y-%m-%d %H:%M:%S"
+        datetime_string = "{0} {1}".format(access["date"], access["time"])
+        datetime_object = datetime.strptime(datetime_string, datetime_format)
+        timestamps.append(int(datetime_object.timestamp()))
+
+    return {
+        "ts_from": min(timestamps),
+        "ts_to": max(timestamps),
+        "hits": hits,
+    } if hits else {}
diff --git a/backend/run/copr-aws-s3-hitcounter b/backend/run/copr-aws-s3-hitcounter
index 6c2b38d..b528589 100755
--- a/backend/run/copr-aws-s3-hitcounter
+++ b/backend/run/copr-aws-s3-hitcounter
@@ -21,12 +21,10 @@ import argparse
 import logging
 import tempfile
 import gzip
-from datetime import datetime
 from socket import gethostname
 
 import boto3
-from copr_common.request import SafeRequest
-from copr_log_hitcounter import url_to_key_strings
-from copr_backend.helpers import BackendConfigReader, setup_script_logger
+from copr_backend.hitcounter import update_frontend
+from copr_backend.helpers import setup_script_logger
 
 # We will allow only this hostname to delete files from the S3 storage
@@ -124,79 +122,6 @@ def parse_access_file(path):
     return accesses
 
 
-def get_hit_data(accesses):
-    """
-    Prepare body for the frontend request in the same format that
-    copr_log_hitcounter.py does.
-    """
-    hits = {}
-    timestamps = []
-    for access in accesses:
-        url = access["cs-uri-stem"]
-
-        if access["sc-status"] == "404":
-            log.debug("Skipping: %s (404 Not Found)", url)
-            continue
-
-        if access["cs(User-Agent)"].startswith("Mock"):
-            log.debug("Skipping: %s (user-agent: Mock)", url)
-            continue
-
-        # We don't want to count every accessed URL, only those pointing to
-        # RPM files and repo file
-        key_strings = url_to_key_strings(url)
-        if not key_strings:
-            log.debug("Skipping: %s", url)
-            continue
-
-        log.debug("Processing: %s", url)
-
-        # When counting RPM access, we want to iterate both project hits and
-        # chroot hits. That way we can get multiple `key_strings` for one URL
-        for key_str in key_strings:
-            hits.setdefault(key_str, 0)
-            hits[key_str] += 1
-
-        # Remember this access timestamp
-        datetime_format = "%Y-%m-%d %H:%M:%S"
-        datetime_string = "{0} {1}".format(access["date"], access["time"])
-        datetime_object = datetime.strptime(datetime_string, datetime_format)
-        timestamps.append(int(datetime_object.timestamp()))
-
-    return {
-        "ts_from": min(timestamps),
-        "ts_to": max(timestamps),
-        "hits": hits,
-    } if hits else {}
-
-
-def update_frontend(accesses, dry_run=False):
-    """
-    Increment frontend statistics based on these `accesses`
-    """
-    result = get_hit_data(accesses)
-    if not result:
-        log.debug("No recognizable hits among these accesses, skipping.")
-        return
-
-    log.info(
-        "Sending: %i results from %i to %i",
-        len(result["hits"]),
-        result["ts_from"],
-        result["ts_to"]
-    )
-    log.debug("Hits: %s", result["hits"])
-
-    opts = BackendConfigReader().read()
-    url = os.path.join(
-        opts.frontend_base_url,
-        "stats_rcv",
-        "from_backend",
-    )
-    if not dry_run:
-        SafeRequest(auth=opts.frontend_auth, log=log).post(url, result)
-
-
 def get_arg_parser():
     """
     Generate argument parser for this script
@@ -240,7 +165,7 @@ def main():
     # a scenario when we increment the accesses on the frontend but then
     # leave the s3 file untouched, which would result in parsing and
     # incrementing from the same file again in the next run
-    update_frontend(accesses, dry_run=args.dry_run)
+    update_frontend(accesses, log=log, dry_run=args.dry_run)
     s3.delete_file(s3file)
 
     # Clean all temporary files
diff --git a/backend/run/copr_log_hitcounter.py b/backend/run/copr_log_hitcounter.py
index 0248af7..8c2fef1 100755
--- a/backend/run/copr_log_hitcounter.py
+++ b/backend/run/copr_log_hitcounter.py
@@ -1,170 +1,91 @@
 #!/usr/bin/python3
-# This is script is supposed to be run daily from lighttpd logrotate, e.g.
-#   prerotate
-#     /usr/bin/copr_log_hitcounter.py /var/log/lighttpd/access.log --ignore-subnets 172.25.80.0/20 209.132.184.33/24 || :
-#   endscript
+"""
+This script is supposed to be run daily from lighttpd logrotate, e.g.
+    prerotate
+      /usr/bin/copr_log_hitcounter.py /var/log/lighttpd/access.log \
+          --ignore-subnets 172.25.80.0/20 209.132.184.33/24 || :
+    endscript
+"""
 
 import re
-import sys
-import requests
-import json
 import os
 import logging
 import argparse
-import netaddr
-import time
+from datetime import datetime
+from copr_backend.helpers import setup_script_logger
+from copr_backend.hitcounter import update_frontend
 
-from dateutil.parser import parse as dt_parse
-from netaddr import IPNetwork, IPAddress
-
-from collections import defaultdict
-from copr_backend.helpers import BackendConfigReader, setup_script_logger
 
-opts = BackendConfigReader().read()
 log = logging.getLogger(__name__)
 setup_script_logger(log, "/var/log/copr-backend/hitcounter.log")
 
-spider_regex = re.compile('.*(ahrefs|bot/[0-9]|bingbot|borg|google|googlebot|yahoo|slurp|msnbot|msrbot'
-                          '|openbot|archiver|netresearch|lycos|scooter|altavista|teoma|gigabot|baiduspider'
-                          '|blitzbot|oegp|charlotte|furlbot|http://client|polybot|htdig|ichiro|mogimogi'
-                          '|larbin|pompos|scrubby|searchsight|seekbot|semanticdiscovery|silk|snappy|speedy'
-                          '|spider|voila|vortex|voyager|zao|zeal|fast-webcrawler|converacrawler|dataparksearch'
-                          '|findlinks|crawler|yandex|blexbot|semrushbot).*', re.IGNORECASE)
-
 logline_regex = re.compile(
     r'(?P<ip_address>.*)\s+(?P<hostname>.*)\s+-\s+\[(?P<timestamp>.*)\]\s+'
     r'"GET (?P<url>.*)\s+(?P<protocol>.*)"\s+(?P<code>.*)\s+(?P<bytes_sent>.*)\s+'
    r'"(?P<referer>.*)"\s+"(?P<agent>.*)"', re.IGNORECASE)
 
-repomd_url_regex = re.compile("/results/(?P<owner>[^/]*)/(?P<project>[^/]*)/(?P<chroot>[^/]*)/repodata/repomd.xml", re.IGNORECASE)
-rpm_url_regex = re.compile("/results/(?P<owner>[^/]*)/(?P<project>[^/]*)/(?P<chroot>[^/]*)/(?P<build_dir>[^/]*)/(?P<rpm>[^/]*\.rpm)", re.IGNORECASE)
-
-datetime_regex = re.compile(".*\[(?P<date>[^:]*):(?P
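

For illustration, a minimal sketch of driving the consolidated copr_backend.hitcounter
module through the interface shown in the diff above. The owner/project/chroot values,
the RPM path, and the user-agent string are made-up sample data, and a real run needs a
copr-backend host because update_frontend() reads the backend config even with
dry_run=True.

    #!/usr/bin/python3
    """
    Usage sketch for copr_backend.hitcounter (sample data, dry-run only).
    """

    import logging

    from copr_backend.hitcounter import update_frontend, url_to_key_strings

    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger(__name__)

    # url_to_key_strings() maps result-repo URLs to the key strings that
    # copr-frontend aggregates on.
    print(url_to_key_strings(
        "/results/jdoe/hello/fedora-36-x86_64/repodata/repomd.xml"))
    # ['chroot_repo_metadata_dl_stat|jdoe|hello|fedora-36-x86_64']

    # update_frontend() consumes parsed access records keyed by the log fields
    # that get_hit_data() reads: cs-uri-stem, cs(User-Agent), sc-status, date, time.
    accesses = [{
        "cs-uri-stem": "/results/jdoe/hello/fedora-36-x86_64/"
                       "00000001-hello/hello-1.0-1.fc36.x86_64.rpm",
        "cs(User-Agent)": "dnf/4.10.0",
        "sc-status": "200",
        "date": "2022-05-11",
        "time": "06:29:16",
    }]
    update_frontend(accesses, log=log, dry_run=True)

Note that an RPM URL yields two key strings (a chroot_rpms_dl_stat key and a
project_rpms_dl_stat key), so one download increments both the per-chroot and the
per-project counters, while a repomd.xml URL yields only the per-chroot metadata key.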