#20 Add filtering to the config for scraping.
Merged 6 years ago by mprahl. Opened 6 years ago by ralph.

@@ -26,6 +26,12 @@ 

          'legacy_matching': True,

  

          #'jira': { # See /etc/fedmsg.d/sync2jira-credentials.py },

+         'filters': {

+             'github': {

+                 # Only sync multi-type tickets from bodhi.

+                 'fedora-infra/bodhi': { 'state': 'open', 'milestone': 4, },

+             },

+         },

          'map': {

              'pagure': {

                  'pungi':  { 'project': 'COMPOSE', 'component': 'Pungi', },
@@ -52,6 +58,8 @@ 

                  'fedora-modularity/product-definition-center': { 'project': 'FACTORY', 'component': None, },

                  'fedora-modularity/BPO': { 'project': 'FACTORY', 'component': None, },

  

+                 'fedora-infra/bodhi': { 'project': 'FACTORY', 'component': None, },

+ 

                  # Handy utils

                  #'ralphbean/finishline': { 'project': 'FACTORY', 'component': None, },

  

file modified
+7
@@ -35,12 +35,19 @@ 

                  #'koji': { 'project': 'BREW', 'component': None, },

              },

              'github': {

+                 'fedora-infra/bodhi': { 'project': 'FACTORY', 'component': None, },

                  #'product-definition-center/product-definition-center': {

                  #    'project': 'PDC', 'component': 'General', },

                  #'product-definition-center/pdc-client': {

                  #    'project': 'PDC', 'component': 'General', },

              },

          },

+         'filters': {

+             'github': {

+                 # Only sync multi-type tickets from bodhi.

+                 'fedora-infra/bodhi': { 'state': 'open', 'milestone': 4, },

+             },

+         },

      },

      'logging': dict(

          version=1,

file modified
+72 -12
@@ -17,9 +17,13 @@ 

  #

  # Authors:  Ralph Bean <rbean@redhat.com>

  

- 

  import logging

  

+ try:

+     from urllib.parse import urlencode  # py3

+ except ImportError:

+     from urllib import urlencode  # py2

+ 

  import requests

  

  import sync2jira.intermediary as i
@@ -33,18 +37,44 @@ 

      repo = msg['msg']['repository']['name']

      upstream = '{owner}/{repo}'.format(owner=owner, repo=repo)

      mapped_repos = config['sync2jira']['map']['github']

+ 

      if upstream not in mapped_repos:

          log.info("%r not in github map: %r" % (upstream, mapped_repos.keys()))

          return None

+ 

+     filter = config['sync2jira']\

+         .get('filters', {})\

+         .get('github', {})\

+         .get(upstream, {'state': 'open'})

+ 

+     for key, expected in filter.items():

+         actual = msg['msg']['issue'].get(key)

+         if actual != expected:

+             log.info("Actual %r %r != expected %r" % (key, actual, expected))

+             return None

+ 

      return i.Issue.from_github(upstream, msg['msg']['issue'], config)

  

  

  def handle_pagure_message(msg, config):

      upstream = msg['msg']['project']['name']

      mapped_repos = config['sync2jira']['map']['pagure']

+ 

      if upstream not in mapped_repos:

          log.info("%r not in pagure map: %r" % (upstream, mapped_repos.keys()))

          return None

+ 

+     filter = config['sync2jira']\

+         .get('filters', {})\

+         .get('pagure', {})\

+         .get(upstream, {'status': 'Open'})

+ 

+     for key, expected in filter.items():

+         actual = msg['msg']['issue'].get(key)

+         if actual != expected:

+             log.info("Actual %r %r != expected %r" % (key, actual, expected))

+             return None

+ 

      return i.Issue.from_pagure(upstream, msg['msg']['issue'], config)

  

  
@@ -52,7 +82,10 @@ 

      base = config['sync2jira'].get('pagure_url', 'https://pagure.io')

      url = base + '/api/0/' + upstream + '/issues'

  

-     params = dict(status='Open')

+     params = config['sync2jira']\

+         .get('filters', {})\

+         .get('pagure', {})\

+         .get(upstream, {'status': 'Open'})

  

      response = requests.get(url, params=params)

      data = response.json()['issues']
@@ -62,22 +95,49 @@ 

  

  

  def github_issues(upstream, config):

-     url = 'https://api.github.com/repos/%s/issues' % upstream

- 

-     headers = {}

      token = config['sync2jira'].get('github_token')

      if not token:

+         headers = {}

          log.warning('No github_token found.  We will be rate-limited...')

      else:

-         headers['Authorization'] = 'token ' + token

+         headers = {'Authorization': 'token ' + token}

  

-     params = dict(per_page=100, state='open')

+     filter = config['sync2jira']\

+         .get('filters', {})\

+         .get('github', {})\

+         .get(upstream, {'state': 'open'})

+     url = 'https://api.github.com/repos/%s/issues' % upstream

+     url += '?' + urlencode(filter)

  

-     response = requests.get(url, params=params, headers=headers)

-     data = response.json()

-     issues = (

-         i.Issue.from_github(upstream, issue, config) for issue in data

+     issues = _get_all_github_issues(url, headers)

+     issues = list((

+         i.Issue.from_github(upstream, issue, config) for issue in issues

          if not 'pull_request' in issue  # We don't want to copy these around

-     )

+     ))

      for issue in issues:

          yield issue

+ 

+ 

+ def _get_all_github_issues(url, headers):

+     """ Pagination utility.  Obnoxious. """

+     link = dict(next=url)

+     while 'next' in link:

+         response = requests.get(link['next'], headers=headers)

+         for issue in response.json():

+             yield issue

+         link = _github_link_field_to_dict(response.headers.get('link', None))

+ 

+ 

+ def _github_link_field_to_dict(field):

+     """ Utility for ripping apart github's Link header field.

+     It's kind of ugly.

+     """

+ 

+     if not field:

+         return dict()

+     return dict([

+         (

+             part.split('; ')[1][5:-1],

+             part.split('; ')[0][1:-1],

+         ) for part in field.split(', ')

+     ])

file modified
+184 -18
@@ -5,24 +5,26 @@ 

  

  

  class TestUpstream(unittest.TestCase):

-     config = {

-         'sync2jira': {

-             'map': {

-                 'github': {

-                     'org/repo': {},

+     def setUp(self):

+         self.config = {

+             'sync2jira': {

+                 'map': {

+                     'github': {

+                         'org/repo': {},

+                     },

+                     'pagure': {

+                         'some_repo': {},

+                     },

                  },

-                 'pagure': {

-                     'some_repo': {},

+                 'jira': {

+                     # Nothing, really..

                  },

              },

-             'jira': {

-                 # Nothing, really..

-             },

-         },

-     }

+         }

  

      @mock.patch('sync2jira.upstream.i.Issue.from_github')

      def test_handle_github_message(self, from_github):

+         issue_dict = {'state': 'open'}

          message = {

              'msg': {

                  'repository': {
@@ -31,15 +33,16 @@ 

                          'login': 'org',

                      },

                  },

-                 'issue': 'some_issue_dict',

+                 'issue': issue_dict,

              },

          }

          u.handle_github_message(message, self.config)

          from_github.assert_called_once_with(

-             'org/repo', 'some_issue_dict', self.config)

+             'org/repo', issue_dict, self.config)

  

      @mock.patch('sync2jira.upstream.i.Issue.from_pagure')

      def test_handle_pagure_message(self, from_pagure):

+         issue_dict = {'status': 'Open'}

          message = {

              'msg': {

                  'project': {
@@ -48,12 +51,122 @@ 

                          'login': 'org',

                      },

                  },

-                 'issue': 'some_issue_dict',

+                 'issue': issue_dict,

              },

          }

          u.handle_pagure_message(message, self.config)

          from_pagure.assert_called_once_with(

-             'some_repo', 'some_issue_dict', self.config)

+             'some_repo', issue_dict, self.config)

+ 

+     @mock.patch('sync2jira.upstream.i.Issue.from_github')

+     def test_handle_github_filter_positive(self, from_github):

+         self.config['sync2jira']['filters'] = {

+             'github': {

+                 'org/repo': {

+                     'some_value': 'present',

+                 },

+             },

+         }

+         issue_dict = {

+             'state': 'open',

+             'some_value': 'present',

+         }

+         message = {

+             'msg': {

+                 'repository': {

+                     'name': 'repo',

+                     'owner': {

+                         'login': 'org',

+                     },

+                 },

+                 'issue': issue_dict,

+             },

+         }

+         u.handle_github_message(message, self.config)

+         from_github.assert_called_once_with(

+             'org/repo', issue_dict, self.config)

+ 

+     @mock.patch('sync2jira.upstream.i.Issue.from_pagure')

+     def test_handle_pagure_filter_positive(self, from_pagure):

+         self.config['sync2jira']['filters'] = {

+             'pagure': {

+                 'some_repo': {

+                     'some_value': 'present',

+                 },

+             },

+         }

+         issue_dict = {

+             'status': 'Open',

+             'some_value': 'present',

+         }

+         message = {

+             'msg': {

+                 'project': {

+                     'name': 'some_repo',

+                     'owner': {

+                         'login': 'org',

+                     },

+                 },

+                 'issue': issue_dict,

+             },

+         }

+         u.handle_pagure_message(message, self.config)

+         from_pagure.assert_called_once_with(

+             'some_repo', issue_dict, self.config)

+ 

+     @mock.patch('sync2jira.upstream.i.Issue.from_github')

+     def test_handle_github_filter_negative(self, from_github):

+         self.config['sync2jira']['filters'] = {

+             'github': {

+                 'org/repo': {

+                     'some_value': 'present',

+                 },

+             },

+         }

+         issue_dict = {

+             'state': 'open',

+             'some_value': 'absent',

+         }

+         message = {

+             'msg': {

+                 'repository': {

+                     'name': 'repo',

+                     'owner': {

+                         'login': 'org',

+                     },

+                 },

+                 'issue': issue_dict,

+             },

+         }

+         u.handle_github_message(message, self.config)

+         assert not from_github.called, 'should not have been called'

+ 

+     @mock.patch('sync2jira.upstream.i.Issue.from_pagure')

+     def test_handle_pagure_filter_negative(self, from_pagure):

+         self.config['sync2jira']['filters'] = {

+             'pagure': {

+                 'some_repo': {

+                     'some_value': 'present',

+                 },

+             },

+         }

+         issue_dict = {

+             'status': 'Open',

+             'some_value': 'absent',

+         }

+         message = {

+             'msg': {

+                 'project': {

+                     'name': 'some_repo',

+                     'owner': {

+                         'login': 'org',

+                     },

+                 },

+                 'issue': issue_dict,

+             },

+         }

+         u.handle_pagure_message(message, self.config)

+         assert not from_pagure.called, 'should not have been called'

  

      @mock.patch('sync2jira.upstream.requests')

      @mock.patch('sync2jira.upstream.i.Issue.from_pagure')
@@ -89,9 +202,62 @@ 

          list(generator)

  

          requests.get.assert_called_once_with(

-             'https://api.github.com/repos/some_repo/issues',

+             'https://api.github.com/repos/some_repo/issues?state=open',

+             headers={},

+         )

+         from_github.assert_called_once_with(

+             'some_repo', 'some_issue_dict', self.config)

+ 

+     @mock.patch('sync2jira.upstream.requests')

+     @mock.patch('sync2jira.upstream.i.Issue.from_pagure')

+     def test_get_filtered_pagure_issues(self, from_pagure, requests):

+         self.config['sync2jira']['filters'] = {

+             'pagure': {

+                 'some_repo': {

+                     'some_value': 'present',

+                 },

+             },

+         }

+         response = mock.MagicMock()

+         response.json.return_value = {

+             'issues': ['some_issue_dict'],

+         }

+         requests.get.return_value = response

+ 

+         generator = u.pagure_issues('some_repo', self.config)

+         # Step through that...

+         list(generator)

+ 

+         requests.get.assert_called_once_with(

+             'https://pagure.io/api/0/some_repo/issues',

+             params={'some_value': 'present'},

+         )

+         from_pagure.assert_called_once_with(

+             'some_repo', 'some_issue_dict', self.config)

+ 

+     @mock.patch('sync2jira.upstream.requests')

+     @mock.patch('sync2jira.upstream.i.Issue.from_github')

+     def test_get_filtered_github_issues(self, from_github, requests):

+         self.config['sync2jira']['filters'] = {

+             'github': {

+                 'some_repo': {

+                     'some_value': 'present',

+                 },

+             },

+         }

+         response = mock.MagicMock()

+         response.json.return_value = [

+             'some_issue_dict',

+         ]

+         requests.get.return_value = response

+ 

+         generator = u.github_issues('some_repo', self.config)

+         # Step through that...

+         list(generator)

+ 

+         requests.get.assert_called_once_with(

+             'https://api.github.com/repos/some_repo/issues?some_value=present',

              headers={},

-             params={'per_page': 100, 'state': 'open'},

          )

          from_github.assert_called_once_with(

              'some_repo', 'some_issue_dict', self.config)

This allows us to sync only special kinds of tickets from the bodhi
queue to the FACTORY project.

Additionally, I added pagination to our github scraping here which
previously only grabbed the latest 100 tickets.

Fixes #16.

Pull-Request has been merged by mprahl

6 years ago