From d1106768c4e0cf2baa4634eb63b0c9550c915ee9 Mon Sep 17 00:00:00 2001 From: Jakub Kadlcik Date: May 02 2022 10:12:15 +0000 Subject: [PATCH 1/3] backend: use pagination for S3 objects By default only the first 1000 results are returned, so we need to use pagination. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/paginators.html#creating-paginators --- diff --git a/backend/run/copr-aws-s3-hitcounter b/backend/run/copr-aws-s3-hitcounter index 2b7520d..e521558 100755 --- a/backend/run/copr-aws-s3-hitcounter +++ b/backend/run/copr-aws-s3-hitcounter @@ -52,10 +52,16 @@ class S3Bucket: """ List all files within our AWS s3 bucket """ - objects = self.s3.list_objects( + paginator = self.s3.get_paginator("list_objects") + page_iterator = paginator.paginate( Bucket=self.bucket, Prefix=self.directory) - return [x["Key"] for x in objects["Contents"]] + + result = [] + for page in page_iterator: + for obj in page["Contents"]: + result.append(obj["Key"]) + return result def download_file(self, s3file, dstdir): """ From 62d11b749b5b6acd41b309bc7c76854d683dce7d Mon Sep 17 00:00:00 2001 From: Jakub Kadlcik Date: May 02 2022 10:12:15 +0000 Subject: [PATCH 2/3] backend: ignore 404 hits That provides validation that such a project and chroot either exist or at least existed in the past. I think we want to exclude 404 hits and count everything else. We could count only 200, but there might be redirects or a temporarily broken server returning 500, and I think we want to count those. 
--- diff --git a/backend/run/copr-aws-s3-hitcounter b/backend/run/copr-aws-s3-hitcounter index e521558..e8399ee 100755 --- a/backend/run/copr-aws-s3-hitcounter +++ b/backend/run/copr-aws-s3-hitcounter @@ -133,10 +133,14 @@ def get_hit_data(accesses): timestamps = [] for access in accesses: url = access["cs-uri-stem"] - key_strings = url_to_key_strings(url) + + if access["sc-status"] == "404": + log.debug("Skipping: %s (404 Not Found)", url) + continue # We don't want to count every accessed URL, only those pointing to # RPM files and repo file + key_strings = url_to_key_strings(url) if not key_strings: log.debug("Skipping: %s", url) continue From 06268ad165cf5b27cfa15508a419cb712f6537e7 Mon Sep 17 00:00:00 2001 From: Jakub Kadlcik Date: May 02 2022 10:12:15 +0000 Subject: [PATCH 3/3] backend: don't count RPMs downloaded from Mock --- diff --git a/backend/run/copr-aws-s3-hitcounter b/backend/run/copr-aws-s3-hitcounter index e8399ee..ee7512c 100755 --- a/backend/run/copr-aws-s3-hitcounter +++ b/backend/run/copr-aws-s3-hitcounter @@ -138,6 +138,10 @@ def get_hit_data(accesses): log.debug("Skipping: %s (404 Not Found)", url) continue + if access["cs(User-Agent)"].startswith("Mock"): + log.debug("Skipping: %s (user-agent: Mock)", url) + continue + # We don't want to count every accessed URL, only those pointing to # RPM files and repo file key_strings = url_to_key_strings(url)