@@ -52,10 +52,16 @@
         """
         List all files within our AWS s3 bucket
         """
-        objects = self.s3.list_objects(
+        paginator = self.s3.get_paginator("list_objects")
+        page_iterator = paginator.paginate(
             Bucket=self.bucket,
             Prefix=self.directory)
-        return [x["Key"] for x in objects["Contents"]]
+
+        result = []
+        for page in page_iterator:
+            for obj in page["Contents"]:
+                result.append(obj["Key"])
+        return result
 
     def download_file(self, s3file, dstdir):
         """
@@ -127,10 +133,18 @@
     timestamps = []
     for access in accesses:
         url = access["cs-uri-stem"]
-        key_strings = url_to_key_strings(url)
+
+        if access["sc-status"] == "404":
+            log.debug("Skipping: %s (404 Not Found)", url)
+            continue
+
+        if access["cs(User-Agent)"].startswith("Mock"):
+            log.debug("Skipping: %s (user-agent: Mock)", url)
+            continue
 
         # We don't want to count every accessed URL, only those pointing to
         # RPM files and repo file
+        key_strings = url_to_key_strings(url)
         if not key_strings:
             log.debug("Skipping: %s", url)
             continue
How much memory does the script eat for all the statistics we have in S3 now? This is an ideal candidate for some `yield` generator dance (both for memory optimization and speed).
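
For illustration, a minimal sketch of what that generator version of the paginated listing method could look like (the name `list_files` is only a guess from the docstring; `self.s3`, `self.bucket`, and `self.directory` are taken from the hunk above, and the `.get("Contents", [])` guard is my addition):

```python
def list_files(self):
    """
    Lazily yield the keys of all files within our AWS s3 bucket
    """
    paginator = self.s3.get_paginator("list_objects")
    page_iterator = paginator.paginate(
        Bucket=self.bucket,
        Prefix=self.directory)
    for page in page_iterator:
        # boto3 omits the "Contents" key entirely on pages with no
        # matching objects, so guard with .get() instead of indexing
        for obj in page.get("Contents", []):
            yield obj["Key"]
```

Callers that genuinely need the whole list can still do `list(self.list_files())`; everything else can consume one key at a time without ever holding all of them in memory.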