#85 /results/latest: use a single query with subquery join
Merged 7 years ago by jskladan. Opened 7 years ago by dcallagh.
dcallagh/resultsdb branch issue-84 into taskotron/resultsdb develop

file modified: +37 -23
@@ -316,13 +316,14 @@
      # Checks if the sort parameter specified in the request is valid before querying.
      # Sorts by submit_time in a descending order if the sort parameter is absent or invalid.
      query_sorted = False
-     sort_match = re.match(r'^(?P<order>asc|desc):(?P<column>.+)$', _sort)
-     if sort_match:
-         if sort_match.group('column') == 'submit_time':
-             sort_order = {'asc': db.asc, 'desc': db.desc}[sort_match.group('order')]
-             sort_column = getattr(Result, sort_match.group('column'))
-             q = db.session.query(Result).order_by(sort_order(sort_column))
-             query_sorted = True
+     if _sort:
+         sort_match = re.match(r'^(?P<order>asc|desc):(?P<column>.+)$', _sort)
+         if sort_match:
+             if sort_match.group('column') == 'submit_time':
+                 sort_order = {'asc': db.asc, 'desc': db.desc}[sort_match.group('order')]
+                 sort_column = getattr(Result, sort_match.group('column'))
+                 q = db.session.query(Result).order_by(sort_order(sort_column))
+                 query_sorted = True
      if not query_sorted:
          q = db.session.query(Result).order_by(db.desc(Result.submit_time))

  
@@ -447,22 +448,35 @@
          return p['error']

      args = p['args']
-     q = select_testcases(','.join(args['testcases']), ','.join(args['testcases:like']))
-     testcases = q.all()
-
-     results = []
-     for testcase in testcases:
-         q = select_results(
-             since_start=args['since']['start'],
-             since_end=args['since']['end'],
-             groups=args['groups'],
-             testcases=[testcase.name],
-             result_data=p['result_data'],
-             _sort=args['_sort'],
-             )
-         result = q.first()
-         if result:
-             results.append(result)
+
+     q = select_results(
+         since_start=args['since']['start'],
+         since_end=args['since']['end'],
+         groups=args['groups'],
+         testcases=args['testcases'],
+         testcases_like=args['testcases:like'],
+         result_data=p['result_data'],
+         _sort=args['_sort'],
+     )
+
+     # Produce a subquery with the same filter criteria as above *except*
+     # test case name, which we group by and join on.
+     sq = select_results(
+         since_start=args['since']['start'],
+         since_end=args['since']['end'],
+         groups=args['groups'],
+         result_data=p['result_data'],
+         )\
+         .order_by(None)\
+         .with_entities(
+             Result.testcase_name.label('testcase_name'),
+             db.func.max(Result.submit_time).label('max_submit_time'))\
+         .group_by(Result.testcase_name)\
+         .subquery()
+     q = q.join(sq, db.and_(Result.testcase_name == sq.c.testcase_name,
+                            Result.submit_time == sq.c.max_submit_time))
+
+     results = q.all()

      return jsonify(dict(
          data=[SERIALIZE(o) for o in results],

file modified: +11 -11
@@ -788,9 +788,9 @@
          data = json.loads(r.data)

          assert len(data['data']) == 2
-         assert data['data'][0]['testcase']['name'] == self.ref_testcase_name
-         assert data['data'][0]['outcome'] == "FAILED"
-         assert data['data'][1]['testcase']['name'] == self.ref_testcase_name + ".1"
+         assert data['data'][0]['testcase']['name'] == self.ref_testcase_name + ".1"
+         assert data['data'][1]['testcase']['name'] == self.ref_testcase_name
+         assert data['data'][1]['outcome'] == "FAILED"

      def test_get_results_latest_modifiers(self):
          self.helper_create_testcase()
@@ -818,28 +818,28 @@
          data = json.loads(r.data)

          assert len(data['data']) == 2
-         assert data['data'][0]['testcase']['name'] == self.ref_testcase_name
+         assert data['data'][0]['testcase']['name'] == self.ref_testcase_name + ".1"
          assert data['data'][0]['outcome'] == "FAILED"
-         assert data['data'][1]['testcase']['name'] == self.ref_testcase_name + ".1"
+         assert data['data'][1]['testcase']['name'] == self.ref_testcase_name
          assert data['data'][1]['outcome'] == "FAILED"

          r = self.app.get('/api/v2.0/results/latest?testcases:like=*')
          data = json.loads(r.data)

          assert len(data['data']) == 2
-         assert data['data'][0]['testcase']['name'] == self.ref_testcase_name
+         assert data['data'][0]['testcase']['name'] == self.ref_testcase_name + ".1"
          assert data['data'][0]['outcome'] == "FAILED"
-         assert data['data'][1]['testcase']['name'] == self.ref_testcase_name + ".1"
+         assert data['data'][1]['testcase']['name'] == self.ref_testcase_name
          assert data['data'][1]['outcome'] == "FAILED"

          r = self.app.get('/api/v2.0/results/latest?groups=%s' % self.ref_group_uuid)
          data = json.loads(r.data)

          assert len(data['data']) == 2
-         assert data['data'][0]['testcase']['name'] == self.ref_testcase_name
-         assert data['data'][0]['outcome'] == "FAILED"
-         assert data['data'][1]['testcase']['name'] == self.ref_testcase_name + ".1"
-         assert data['data'][1]['outcome'] == "PASSED"
+         assert data['data'][0]['testcase']['name'] == self.ref_testcase_name + ".1"
+         assert data['data'][0]['outcome'] == "PASSED"
+         assert data['data'][1]['testcase']['name'] == self.ref_testcase_name
+         assert data['data'][1]['outcome'] == "FAILED"

      def test_message_publication(self):
          self.helper_create_result()


Ping @jskladan, please consider merging this, since it should make the Automated Tests tab in Bodhi much faster...

Absolutely!
Could you elaborate on how it works and why it is faster? What dataset was the change tested on?

I'm the last one to call myself a DB expert, so I'm pretty interested in the technical details here, especially since I was bitten in the rear by how the Postgres query planner makes decisions on the specific data we have, and optimizations that worked well on "testing data" ended up causing issues.

Not saying this is the same, but as I said earlier, I don't have a real understanding of the changes, and I would love to.

Thanks for the PR! I'm glad somebody other than me had a look at the DB-related code!

I wrote some details of my findings in #84; did you see that one? Testing was all done using a prod DB dump (as of a few weeks ago, when I wrote the patch).

@dcallagh missed it! I'll have to dig into the notification settings...

Cool! I'd still like to understand the magic behind the subquery join, but the patch is sound as it is.

If you could spare some time to break it down for me (or at least point me in the right direction), I'd be glad - I'm sure understanding this will be beneficial :D

I'll merge the PR, though. Thanks again!

Oh yeah, I didn't paste what the new version of the query looks like. That might help clarify.

The old version was issuing a large number of one-row queries like:

SELECT result.id AS result_id, result.testcase_name AS result_testcase_name, result.submit_time AS result_submit_time, result.outcome AS result_outcome, result.note AS result_note, result.ref_url AS result_ref_url
FROM result
JOIN result_data AS result_data_1 ON result.id = result_data_1.result_id
JOIN result_data AS result_data_2 ON result.id = result_data_2.result_id
WHERE result.testcase_name IN (%(testcase_name_1)s)
  AND result_data_1.key = %(key_1)s AND result_data_1.value IN (%(value_1)s)
  AND result_data_2.key = %(key_2)s AND result_data_2.value IN (%(value_2)s)
ORDER BY result.submit_time DESC
LIMIT 1

to find the latest result for each test case name.

The new query looks like this (just one query to return all results):

SELECT result.id AS result_id, result.testcase_name AS result_testcase_name, result.submit_time AS result_submit_time, result.outcome AS result_outcome, result.note AS result_note, result.ref_url AS result_ref_url
FROM result
JOIN (SELECT result.testcase_name AS testcase_name, max(result.submit_time) AS max_submit_time
      FROM result
      GROUP BY result.testcase_name) AS anon_1
  ON result.testcase_name = anon_1.testcase_name AND result.submit_time = anon_1.max_submit_time
ORDER BY result.submit_time DESC

Hmm, I just noticed that the new query drops the eager loading for result_data, which is suboptimal. If you want to hold off on merging this for a bit, I will take another look tomorrow and see if I can fix that too.

Anyway, to explain further...

The trick is a well-known solution to the problem of selecting the largest row from each group. (Actually not just the largest with MAX(), but also the smallest with MIN(), or any other aggregate function.) You will find it all over StackOverflow if you search for "sql max per group" or similar. I probably first read about it at https://www.xaprb.com/blog/2006/12/07/how-to-select-the-firstleastmax-row-per-group-in-sql/

Here we are selecting the "latest" result for each test case name: in other words, the row with the largest submit_time within each test case name group.

Selecting the largest submit_time for each test case is easy with a GROUP BY:

SELECT testcase_name, MAX(submit_time)
FROM result
GROUP BY testcase_name

But the trick is that we want to select each row with this maximum submit time. We want the other columns, not just the submit time itself.

So just take the above query, and join to it, with the join condition on the test case name and submit time. That's what the patch is essentially producing.

SELECT *
FROM result
JOIN
    (SELECT testcase_name, MAX(submit_time) AS max_submit_time
    FROM result
    GROUP BY testcase_name) AS x
ON x.testcase_name = result.testcase_name
AND x.max_submit_time = result.submit_time

Now we have all the other columns, and we can apply normal WHERE and ORDER BY and even other joins.

In general you can group by any columns: you just put them all in the GROUP BY of the inner subquery and then match them all in the join condition, as in the sketch below.
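For example (purely illustrative, not something this patch needs): to pick the latest result per test case name *and* outcome, both columns go into the GROUP BY and both appear in the join condition:

SELECT *
FROM result
JOIN
    (SELECT testcase_name, outcome, MAX(submit_time) AS max_submit_time
    FROM result
    GROUP BY testcase_name, outcome) AS x
ON x.testcase_name = result.testcase_name
AND x.outcome = result.outcome
AND x.max_submit_time = result.submit_time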

The other gotcha is that WHERE clauses on the outer query in general need to be repeated in the subquery (which this patch does, by calling select_results() twice).

For example, if we wanted the latest passing result, we would put WHERE outcome = 'PASSED' on the outer query... but we would also need to filter the rows we are joining to in the inner query. Otherwise, imagine there is a later failing result: the inner query would select that, but the outer query would not, and so no rows would be joined for that test case name. See the sketch below.
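Here is a sketch of that hypothetical latest-passing-result query (illustrative only, not part of the patch), with the outcome filter repeated in both places:

SELECT *
FROM result
JOIN
    (SELECT testcase_name, MAX(submit_time) AS max_submit_time
    FROM result
    WHERE outcome = 'PASSED'  -- same filter inside the subquery...
    GROUP BY testcase_name) AS x
ON x.testcase_name = result.testcase_name
AND x.max_submit_time = result.submit_time
WHERE result.outcome = 'PASSED'  -- ...and on the outer query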

Oh, never mind the bit about eager loading for result_data. I was misreading the old version of the query; it's just doing a WHERE condition on result_data. You can't eager load it anyway, since it's a one-to-many relationship and would potentially explode into too many rows.
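To illustrate the explosion (an assumed example, not a query from this code): joining result to its one-to-many result_data produces one output row per (result, result_data) pair, so a result with three key/value pairs comes back three times:

SELECT result.id, result_data.key, result_data.value
FROM result
JOIN result_data ON result_data.result_id = result.id
-- a result with three result_data rows appears in three output rows here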

Awesome, thanks for the thorough explanation! Seems like you won't be touching the code any more, so I'll merge shortly.
Thanks again!

Pull-Request has been merged by jskladan, 7 years ago.