From f0ae5aeb95f3d0213c30887a24db29c0ffa2d759 Mon Sep 17 00:00:00 2001 From: Patrick Uiterwijk Date: Oct 17 2017 18:53:02 +0000 Subject: Do not use datagrepper's contains, but filter locally, and submit a delta This moves compute time and memory usage from the datagrepper server to the system running the fedimg_vars scripts. Without delta, we were asking the datagrepper database to check every message since datanommer was started for the first time. With contains, we ask postgres to perform a heavy string comparison operation. Signed-off-by: Patrick Uiterwijk --- diff --git a/alt.fedoraproject.org/build/fedimg_vars.py b/alt.fedoraproject.org/build/fedimg_vars.py index 2eacf8d..e4e0d51 100755 --- a/alt.fedoraproject.org/build/fedimg_vars.py +++ b/alt.fedoraproject.org/build/fedimg_vars.py @@ -36,27 +36,28 @@ cache = dogpile.cache.make_region().configure( }, ) -def get_page(page, pages, target): +def get_page(page, pages): """ Retrieve the JSON for a particular page of datagrepper results """ log.debug("Getting page %i of %s", page, pages) response = session.get(base_url, params=dict( topic=topic, page=page, - contains=target, + # Get messages from eight weeks + delta=4838400, rows_per_page=100, )) return response.json() -def get_messages(target): +def retrieve_messages(): """ Generator that yields messages from datagrepper """ # Get the first page - data = get_page(1, 'unknown', target) + data = get_page(1, 'unknown') for message in data['raw_messages']: yield message - more = functools.partial(get_page, pages=data['pages'], target=target) + more = functools.partial(get_page, pages=data['pages']) # Get all subsequent pages (if there are any...) for page in range(1, data['pages']): @@ -65,6 +66,14 @@ def get_messages(target): for message in data['raw_messages']: yield message + +def get_messages(target): + """ Filter the messages on target. """ + for message in retrieve_messages(): + if target in str(message): + yield message + + # We cache this guy on disk for 500s @cache.cache_on_arguments() def collect(release): diff --git a/getfedora.org/build/fedimg_vars.py b/getfedora.org/build/fedimg_vars.py index 3ec6365..7a4c0cf 100755 --- a/getfedora.org/build/fedimg_vars.py +++ b/getfedora.org/build/fedimg_vars.py @@ -36,27 +36,28 @@ cache = dogpile.cache.make_region().configure( }, ) -def get_page(page, pages, target): +def get_page(page, pages): """ Retrieve the JSON for a particular page of datagrepper results """ log.debug("Getting page %i of %s", page, pages) response = session.get(base_url, params=dict( topic=topic, page=page, - contains=target, + # Get messages from 28 weeks (7 months) + delta=16934400, rows_per_page=100, )) return response.json() -def get_messages(target): +def retrieve_messages(): """ Generator that yields messages from datagrepper """ # Get the first page - data = get_page(1, 'unknown', target) + data = get_page(1, 'unknown') for message in data['raw_messages']: yield message - more = functools.partial(get_page, pages=data['pages'], target=target) + more = functools.partial(get_page, pages=data['pages']) # Get all subsequent pages (if there are any...) for page in range(1, data['pages']): @@ -65,6 +66,14 @@ def get_messages(target): for message in data['raw_messages']: yield message + +def get_messages(target): + """ Filter the messages on target. """ + for message in retrieve_messages(): + if target in str(message): + yield message + + # We cache this guy on disk for 500s @cache.cache_on_arguments() def collect(release):