#2270 A couple of fixes for the AWS S3 hitcounter script
Closed 2 years ago by praiskup. Opened 2 years ago by frostyx.
copr/ frostyx/copr aws-hitcounter-fixups  into  main

@@ -2,3 +2,6 @@ 

  

  runuser -c "PYTHONUNBUFFERED=1 python3 /usr/bin/copr_prune_results.py &> /dev/null" - copr

  runuser -c "PYTHONUNBUFFERED=1 python3 /usr/bin/copr_prune_srpms.py &> /dev/null" - copr

+ 

+ # Optional - Counting downloaded RPMs through Amazon CDN

+ # runuser -c "PYTHONUNBUFFERED=1 python3 /usr/bin/copr-aws-s3-hitcounter &> /dev/null" - copr

@@ -59,7 +59,7 @@ 

      return []

  

  

- def update_frontend(accesses, log, dry_run=False):

+ def update_frontend(accesses, log, dry_run=False, try_indefinitely=False):

      """

      Increment frontend statistics based on these `accesses`

      """
@@ -68,7 +68,7 @@ 

          log.debug("No recognizable hits among these accesses, skipping.")

          return

  

-     log.info(

+     log.debug(

          "Sending: %i results from %i to %i",

          len(result["hits"]),

          result["ts_from"],
@@ -86,7 +86,12 @@ 

          "from_backend",

      )

      if not dry_run:

-         SafeRequest(auth=opts.frontend_auth, log=log).post(url, result)

+         request = SafeRequest(

+             auth=opts.frontend_auth,

+             log=log,

+             try_indefinitely=try_indefinitely,

+         )

+         request.post(url, result)

  

  

  def get_hit_data(accesses, log):

@@ -29,6 +29,10 @@ 

  

  # We will allow only this hostname to delete files from the S3 storage

  PRODUCTION_HOSTNAME = "copr-be.aws.fedoraproject.org"

+ DEVEL_HOSTNAME = "copr-be-dev.aws.fedoraproject.org"

+ 

+ PRODUCTION_CDN_HOSTNAME = "download.copr.fedorainfracloud.org"

+ DEVEL_CDN_HOSTNAME = "download.copr-dev.fedorainfracloud.org"

  

  

  log = logging.getLogger(__name__)
@@ -122,6 +126,37 @@ 

      return accesses

  

  

+ def get_cdn_hostname(args):

+     """

+     The devel and production accesses are mixed together. Which ones do we want

+     to count?

+     """

+     # If a CDN hostname was explicitly specified when calling the script

+     if args.cdn_hostname:

+         return args.cdn_hostname

+ 

+     # Count hits from devel CDN hostname on devel instance

+     hostname = gethostname()

+     if hostname == DEVEL_HOSTNAME:

+         return DEVEL_CDN_HOSTNAME

+ 

+     # Default to production hits. Don't worry, we don't accidentally

+     # remove them from any other instance

+     return PRODUCTION_CDN_HOSTNAME

+ 

+ 

+ def check_different_cdn_hostname(accesses, cdn_hostname):

+     """

+     If a list of HTTP accesses contain any access for a different CDN hostname

+     (e.g. for devel instance when the script is running on production), return

+     its value. Otherwise `None`.

+     """

+     for access in accesses:

+         if access["x-host-header"] != cdn_hostname:

+             return access["x-host-header"]

+     return None

+ 

+ 

  def get_arg_parser():

      """

      Generate argument parser for this script
@@ -141,6 +176,18 @@ 

          "--verbose",

          action="store_true",

          help=("Print verbose information about what is going on"))

+     parser.add_argument(

+         "--try-indefinitely",

+         action="store_true",

+         help=("If true, try infinite number of attempts when contacting the "

+               "frontend. Do not use this option for cron tasks because the "

+               "number of simultaneously running instances might go up"))

+     parser.add_argument(

+         "--cdn-hostname",

+         help=("By default the devel instance counts only hits from devel, and "

+               "the production instance from production. You can override this "

+               "by explicitly specifying the CDN hostname of interest, e.g. {0}"

+               .format(PRODUCTION_CDN_HOSTNAME)))

      return parser

  

  
@@ -151,21 +198,34 @@ 

      parser = get_arg_parser()

      args = parser.parse_args()

      tmp = tempfile.mkdtemp(prefix="copr-aws-s3-hitcounter-")

+     cdn_hostname = get_cdn_hostname(args)

  

      if args.verbose:

          log.setLevel(logging.DEBUG)

  

      s3 = S3Bucket(dry_run=args.dry_run)

-     for s3file in s3.list_files():

+     files = s3.list_files()

+ 

+     for i, s3file in enumerate(files, start=1):

          gz = s3.download_file(s3file, dstdir=tmp)

          raw = gunzip(gz)

          accesses = parse_access_file(raw)

  

+         different_cdn = check_different_cdn_hostname(accesses, cdn_hostname)

+         if different_cdn:

+             log.debug("Skipping: %s (different hostname: %s)",

+                       s3file, different_cdn)

+             continue

+ 

+         log.info("[%s/%s] %s (%s accesses)",

+                  i, len(files), s3file, len(accesses))

+ 

          # Maybe we want to use some locking or transaction mechanism to avoid

          # a scenario when we increment the accesses on the frontend but then

          # leave the s3 file untouched, which would result in parsing and

          # incrementing from the same file again in the next run

-         update_frontend(accesses, log=log, dry_run=args.dry_run)

+         update_frontend(accesses, log=log, dry_run=args.dry_run,

+                         try_indefinitely=args.try_indefinitely)

          s3.delete_file(s3file)

  

          # Clean all temporary files

I tried to run the hitcounter script on devel, and it died on me after some time. There are currently 98905 log files in the S3 bucket, so I made some adjustments for a better experience. Also, after a discussion with @praiskup, I made sure to count only accesses from the appropriate instance (devel on devel, production on production).

Metadata Update from @frostyx:
- Pull-request tagged with: wip

2 years ago

Build succeeded.

3 new commits added

  • backend: count only hits from an appropriate CDN hostname
  • backend: add option for infinite number of attempts to the hitcounter script
  • backend: print more reasonable output from AWS hitcounter script
2 years ago

Build succeeded.

Metadata Update from @frostyx:
- Pull-request untagged with: wip

2 years ago

A bit weird; I believe we have two distinct CloudFront distributions (prod and stage)...

I'm not really sure. Don't we actually want to try indefinitely even in cron jobs? What kind of issues can happen that are not temporary? If you are afraid of concurrently running cron jobs, we can use /bin/flock or something similar...

I would also prefer to have an example (commented-out?) command in the cron file...

But I don't have any major objections, thank you for the PR!

3 new commits added

  • backend: count only hits from an appropriate CDN hostname
  • backend: add option for infinite number of attempts to the hitcounter script
  • backend: print more reasonable output from AWS hitcounter script
2 years ago

Build succeeded.

Metadata Update from @praiskup:
- Pull-request tagged with: release-blocker

2 years ago

rebased onto 86377b5

2 years ago

Build succeeded.

Pull-Request has been closed by praiskup

2 years ago