#751 Do not use datagrepper's contains, but filter locally, and submit a delta
Merged 6 years ago by maxamillion. Opened 6 years ago by puiterwijk.
puiterwijk/fedora-websites adddeltas  into  master

@@ -36,27 +36,28 @@ 

      },

  )

  

- def get_page(page, pages, target):

+ def get_page(page, pages):

      """ Retrieve the JSON for a particular page of datagrepper results """

      log.debug("Getting page %i of %s", page, pages)

      response = session.get(base_url, params=dict(

          topic=topic,

          page=page,

-         contains=target,

+         # Get messages from eight weeks

+         delta=4838400,

          rows_per_page=100,

      ))

      return response.json()

  

  

- def get_messages(target):

+ def retrieve_messages():

      """ Generator that yields messages from datagrepper """

  

      # Get the first page

-     data = get_page(1, 'unknown', target)

+     data = get_page(1, 'unknown')

      for message in data['raw_messages']:

          yield message

  

-     more = functools.partial(get_page, pages=data['pages'], target=target)

+     more = functools.partial(get_page, pages=data['pages'])

  

      # Get all subsequent pages (if there are any...)

      for page in range(1, data['pages']):
@@ -65,6 +66,14 @@ 

          for message in data['raw_messages']:

              yield message

  

+ 

+ def get_messages(target):

+     """ Filter the messages on target. """

+     for message in retrieve_messages():

+         if target in str(message):

+             yield message

+ 

+ 

  # We cache this guy on disk for 500s

  @cache.cache_on_arguments()

  def collect(release):

@@ -36,27 +36,28 @@ 

      },

  )

  

- def get_page(page, pages, target):

+ def get_page(page, pages):

      """ Retrieve the JSON for a particular page of datagrepper results """

      log.debug("Getting page %i of %s", page, pages)

      response = session.get(base_url, params=dict(

          topic=topic,

          page=page,

-         contains=target,

+         # Get messages from 28 weeks (7 months)

+         delta=16934400,

          rows_per_page=100,

      ))

      return response.json()

  

  

- def get_messages(target):

+ def retrieve_messages():

      """ Generator that yields messages from datagrepper """

  

      # Get the first page

-     data = get_page(1, 'unknown', target)

+     data = get_page(1, 'unknown')

      for message in data['raw_messages']:

          yield message

  

-     more = functools.partial(get_page, pages=data['pages'], target=target)

+     more = functools.partial(get_page, pages=data['pages'])

  

      # Get all subsequent pages (if there are any...)

      for page in range(1, data['pages']):
@@ -65,6 +66,14 @@ 

          for message in data['raw_messages']:

              yield message

  

+ 

+ def get_messages(target):

+     """ Filter the messages on target. """

+     for message in retrieve_messages():

+         if target in str(message):

+             yield message

+ 

+ 

  # We cache this guy on disk for 500s

  @cache.cache_on_arguments()

  def collect(release):

This moves compute time and memory usage from the datagrepper server to the system running
the fedimg_vars scripts.
Without `delta`, we were asking the datagrepper database to check every message since
datanommer was first started.
With `contains`, we were asking postgres to perform a heavy string comparison operation.

Signed-off-by: Patrick Uiterwijk <patrick@puiterwijk.org>

Pull-Request has been merged by maxamillion

6 years ago