#1930 Speedup for listing builds via APIv3
Merged 2 months ago by praiskup. Opened 2 months ago by frostyx.
copr/ frostyx/copr api-list-builds-speed  into  main

file modified
+1 -1
@@ -651,7 +651,7 @@ 

          :param args: argparse arguments provided by the user

          """

          ownername, projectname = self.parse_name(args.project)

-         pagination = {"limit": 100}

+         pagination = {"limit": 1000}

  

          builds_list = self.client.build_proxy.get_list(ownername, projectname,

                                                         pagination=pagination)

@@ -0,0 +1,21 @@ 

+ """

+ Add index for combination of build.id and build.copr_id

+ 

+ Revision ID: b630bad8a01e

+ Revises: d5990bd4aa46

+ Create Date: 2021-09-28 15:14:31.574563

+ """

+ 

+ from alembic import op

+ 

+ 

+ revision = 'b630bad8a01e'

+ down_revision = 'd5990bd4aa46'

+ 

+ 

+ def upgrade():

+     op.create_index('build_copr_id_build_id', 'build', ['copr_id', 'id'], unique=True)

+ 

+ 

+ def downgrade():

+     op.drop_index('build_copr_id_build_id', table_name='build')

@@ -594,7 +594,7 @@ 

      def repo_url(self):

          return "/".join([app.config["BACKEND_BASE_URL"],

                           u"results",

-                          self.main_dir.full_name])

+                          self.full_name])

  

      @property

      def repo_id(self):
@@ -1040,6 +1040,7 @@ 

          db.Index('build_canceled_is_background_source_status_id_idx',

                   'canceled', "is_background", "source_status", "id"),

          db.Index('build_copr_id_package_id', "copr_id", "package_id"),

+         db.Index("build_copr_id_build_id", "copr_id", "id", unique=True),

          db.Index("build_id_desc_per_copr_dir", id.desc(), "copr_dir_id"),

      )

  

@@ -118,8 +118,14 @@ 

              if self.order == 'name':

                  self.order_type = 'ASC'

  

- 

      def get(self):

+         return self.paginate_query(self.query)

+ 

+     def paginate_query(self, query):

+         """

+         Return `self.query` with all pagination parameters (limit, offset,

+         order) but do not run it.

+         """

          order_attr = getattr(self.model, self.order, None)

          if not order_attr:

              msg = "Cannot order by {}, {} doesn't have such property".format(
@@ -137,7 +143,7 @@ 

          elif self.order_type == 'DESC':

              order_fun = sqlalchemy.desc

  

-         return (self.query.order_by(order_fun(order_attr))

+         return (query.order_by(order_fun(order_attr))

                  .limit(self.limit)

                  .offset(self.offset))

  
@@ -152,6 +158,27 @@ 

          return [x.to_dict() for x in self.get()]

  

  

+ class SubqueryPaginator(Paginator):

+     """

+     Selecting rows with large offsets (400k+) is slower (~10 times) than

+     offset=0. There is not many options to get around it. To mitigate the

+     slowdown at least a little (~10%), we can filter, offset, and limit within

+     a subquery and then base the full-query on the subquery results.

+     """

+     def __init__(self, *args, **kwargs):

+         super().__init__(*args, **kwargs)

+         self.pk = getattr(self.model, "id")

+         self.subquery = self._create_subquery()

+ 

+     def get(self):

+         query = self.query.filter(self.pk.in_(self.subquery.subquery()))

+         return query.all()

+ 

+     def _create_subquery(self):

+         query = sqlalchemy.select(self.pk)

+         return self.paginate_query(query)

+ 

+ 

  class ListPaginator(Paginator):

      """

      The normal `Paginator` class works with a SQLAlchemy query object and

@@ -1,11 +1,10 @@ 

  import os

  import flask

+ from sqlalchemy.orm import joinedload

  

  from werkzeug.datastructures import MultiDict

  from werkzeug.utils import secure_filename

  

- from . import get_copr, file_upload, query_params, pagination, Paginator, json2form, GET, POST, PUT, DELETE

- from .json2form import get_form_compatible_data

  from copr_common.enums import StatusEnum

  from coprs import db, forms, models

  from coprs.exceptions import (BadRequest, AccessRestricted)
@@ -14,6 +13,20 @@ 

  from coprs.logic.complex_logic import ComplexLogic

  from coprs.logic.builds_logic import BuildsLogic

  

+ from . import (

+     get_copr,

+     file_upload,

+     query_params,

+     pagination,

+     SubqueryPaginator,

+     json2form,

+     GET,

+     POST,

+     PUT,

+     DELETE,

+ )

+ from .json2form import get_form_compatible_data

+ 

  

  def to_dict(build):

      return {
@@ -80,9 +93,6 @@ 

  @query_params()

  def get_build_list(ownername, projectname, packagename=None, status=None, **kwargs):

      copr = get_copr(ownername, projectname)

-     query = BuildsLogic.get_multiple_by_copr(copr)

-     if packagename:

-         query = BuildsLogic.filter_by_package_name(query, packagename)

  

      # WORKAROUND

      # We can't filter builds by status directly in the database, because we
@@ -93,7 +103,22 @@ 

      paginator_limit = None if status else kwargs["limit"]

      del kwargs["limit"]

  

-     paginator = Paginator(query, models.Build, limit=paginator_limit, **kwargs)

+     # Loading relationships straight away makes running `to_dict` somewhat

+     # faster, which adds up over time, and  brings a significant speedup for

+     # large projects

+     query = BuildsLogic.get_multiple()

+     query = query.options(

+         joinedload(models.Build.build_chroots),

+         joinedload(models.Build.package),

+         joinedload(models.Build.copr),

+     )

+ 

+     paginator = SubqueryPaginator(query, models.Build, limit=paginator_limit, **kwargs)

+     paginator.subquery = paginator.subquery.filter(models.Build.copr == copr)

+     if packagename:

+         paginator.subquery = BuildsLogic.filter_by_package_name(

+             paginator.subquery, packagename)

+ 

      builds = paginator.map(to_dict)

  

      if status:

See PR#1914

I found two bottlenecks

1) The to_dict function isn't extremely slow, but it can be made
faster by pre-loading the models.Build relationships. This adds up and
brings a significant speedup for large projects.

2) Querying the first pages is blazing fast, but it gets 10 times
slower when approaching large offsets. We can mitigate the slowdown
at least a little by offsetting and limiting within a subquery. It
is also very helpful to query more results per page, because the
time for converting database results to JSON remains the same but we
simply don't have to calculate the offset so often.

With these changes, I was able to reduce the time for completing

time copr-cli list-builds @rubygems/rubygems --output-format=json

from 42min to 13min.

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

See also PR#1931 which is a little bit slower but much less complex.
We need to decide whether the complexity is worth it and choose either this PR or #1931.

I think there is room for another speedup, so I will try to cut up the time more.

Another bottleneck is calculating the following dict item in the to_dict(build) function

"repo_url": build.copr.repo_url,

Removing it cuts off another ~5 minutes when listing rubygems builds. I think we need to preload it using joinedload or something similar, but I couldn't figure it out yet. If you have any tips, they will be appreciated.

Another bottleneck is on the client side and I fixed it in PR#1914. That saves another ~2-3 minutes and that gets us under 6 minutes for listing all rubygems builds. It should still be possible to cut down 1-2 minutes but it IMHO isn't necessary.

The speedup should be significant, IMO. Have you tried changing the subquery so it only returns the primary key? Not loading all fields speeds up both the database and the ORM.

The in_ operation doesn't keep the ordering of the subquery.

Have you tried to change the subquery so it only returns the primary key

Ah, you are already doing this. The reason why this isn't very fast is that we don't do index scans...

Can you please add another index, create index copr_id_build_id ON build (copr_id, id);?
This should make the queries much faster... (4x according to my local testing)

Can you please add another index, create index copr_id_build_id ON build (copr_id, id);?
This should make the queries much faster... (4x according to my local testing)

That's weird — it really helped you? I tried adding the index, but it doesn't seem to have any effect on the performance.

1 new commit added

  • frontend: add index for combination of build.id and build.copr_id
2 months ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto fdbbb8a

2 months ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

4 new commits added

  • frontend: avoid additional query for main_dir.full_name
  • frontend: add index for combination of build.id and build.copr_id
  • frontend: move the subquery hack into paginator
  • frontend: speedup for listing builds via APIv3
2 months ago

Build succeeded.

"repo_url": build.copr.repo_url,

We discussed it with @praiskup and found a solution. Listing all rubygems builds now takes ~6 minutes.

Pull-Request has been merged by praiskup

2 months ago