| |
@@ -29,6 +29,10 @@
|
| |
|
| |
# We will allow only this hostname to delete files from the S3 storage
PRODUCTION_HOSTNAME = "copr-be.aws.fedoraproject.org"
# Compared against gethostname() to recognize the devel instance
DEVEL_HOSTNAME = "copr-be-dev.aws.fedoraproject.org"

# CDN hostnames; each parsed access carries one of them in its
# "x-host-header" field, which tells devel hits from production hits
PRODUCTION_CDN_HOSTNAME = "download.copr.fedorainfracloud.org"
DEVEL_CDN_HOSTNAME = "download.copr-dev.fedorainfracloud.org"
|
| |
|
| |
|
| |
log = logging.getLogger(__name__)
|
| |
@@ -122,6 +126,37 @@
|
| |
return accesses
|
| |
|
| |
|
| |
def get_cdn_hostname(args):
    """
    The devel and production accesses are mixed together. Decide which CDN
    hostname this run should count accesses for.

    The first rule that applies wins:
    1. `args.cdn_hostname`, when it was given on the command line
    2. `DEVEL_CDN_HOSTNAME`, when the local hostname is `DEVEL_HOSTNAME`
    3. `PRODUCTION_CDN_HOSTNAME` everywhere else
    """
    # If a CDN hostname was explicitly specified when calling the script
    if args.cdn_hostname:
        return args.cdn_hostname

    # Count hits from devel CDN hostname on devel instance
    if gethostname() == DEVEL_HOSTNAME:
        return DEVEL_CDN_HOSTNAME

    # Default to production hits. Don't worry, we don't accidentally
    # remove them from any other instance
    return PRODUCTION_CDN_HOSTNAME
|
| |
+
|
| |
+
|
| |
def check_different_cdn_hostname(accesses, cdn_hostname):
    """
    If a list of HTTP accesses contains any access for a different CDN hostname
    (e.g. for devel instance when the script is running on production), return
    the "x-host-header" value of the first such access. Otherwise `None`.

    Every access is expected to be a mapping with an "x-host-header" key.
    """
    for access in accesses:
        host = access["x-host-header"]
        if host != cdn_hostname:
            # The first mismatch is enough, no need to scan the rest
            return host
    return None
|
| |
+
|
| |
+
|
| |
def get_arg_parser():
|
| |
"""
|
| |
Generate argument parser for this script
|
| |
@@ -141,6 +176,18 @@
|
| |
"--verbose",
|
| |
action="store_true",
|
| |
help=("Print verbose information about what is going on"))
|
| |
+ parser.add_argument(
|
| |
+ "--try-indefinitely",
|
| |
+ action="store_true",
|
| |
+ help=("If true, try infinite number of attempts when contacting the "
|
| |
+ "frontend. Do not use this option for cron tasks because the "
|
| |
+ "number of simultaneously running instances might go up"))
|
| |
+ parser.add_argument(
|
| |
+ "--cdn-hostname",
|
| |
+ help=("By default the devel instance counts only hits from devel, and "
|
| |
+ "the production instance from production. You can override this "
|
| |
+ "by explicitly specifying the CDN hostname of interest, e.g. {0}"
|
| |
+ .format(PRODUCTION_CDN_HOSTNAME)))
|
| |
return parser
|
| |
|
| |
|
| |
@@ -151,21 +198,34 @@
|
| |
parser = get_arg_parser()
|
| |
args = parser.parse_args()
|
| |
tmp = tempfile.mkdtemp(prefix="copr-aws-s3-hitcounter-")
|
| |
+ cdn_hostname = get_cdn_hostname(args)
|
| |
|
| |
if args.verbose:
|
| |
log.setLevel(logging.DEBUG)
|
| |
|
| |
s3 = S3Bucket(dry_run=args.dry_run)
|
| |
- for s3file in s3.list_files():
|
| |
+ files = s3.list_files()
|
| |
+
|
| |
+ for i, s3file in enumerate(files, start=1):
|
| |
gz = s3.download_file(s3file, dstdir=tmp)
|
| |
raw = gunzip(gz)
|
| |
accesses = parse_access_file(raw)
|
| |
|
| |
+ different_cdn = check_different_cdn_hostname(accesses, cdn_hostname)
|
| |
+ if different_cdn:
|
| |
+ log.debug("Skipping: %s (different hostname: %s)",
|
| |
+ s3file, different_cdn)
|
| |
+ continue
|
| |
+
|
| |
+ log.info("[%s/%s] %s (%s accesses)",
|
| |
+ i, len(files), s3file, len(accesses))
|
| |
+
|
| |
# Maybe we want to use some locking or transaction mechanism to avoid
|
| |
# a scenario when we increment the accesses on the frontend but then
|
| |
# leave the s3 file untouched, which would result in parsing and
|
| |
# incrementing from the same file again in the next run
|
| |
- update_frontend(accesses, log=log, dry_run=args.dry_run)
|
| |
+ update_frontend(accesses, log=log, dry_run=args.dry_run,
|
| |
+ try_indefinitely=args.try_indefinitely)
|
| |
s3.delete_file(s3file)
|
| |
|
| |
# Clean all temporary files
|
| |
I tried to run the hitcounter script on devel and it died after some time. There are currently 98905 log files in S3, so I made some adjustments for a better experience (progress logging and a `--try-indefinitely` option). Also, after a discussion with @praiskup, I made sure that each instance counts only its own accesses (devel on devel, production on production).