From fdbbb8a78509664df776e03d25339faa64e21fd3 Mon Sep 17 00:00:00 2001 From: Jakub Kadlcik Date: Sep 29 2021 08:31:32 +0000 Subject: [PATCH 1/4] frontend: speedup for listing builds via APIv3 See PR#1914 I found two bottlenecks 1) `to_dict` function isn't extremely slow but it can be made faster by pre-loading the `models.Build` relationships. This adds up and brings a significant speedup for large projects 2) Querying the first pages is blazing fast but it gets 10 times slower when approaching large offsets. We can mitigate the slowdown at least a little by offsetting and limiting within a subquery. It is also very helpful to query more results per page because the time for converting database results to JSON remains the same but we simply don't have to calculate the offset so often. With these changes, I was able to reduce the time for completing `time copr-cli list-builds @rubygems/rubygems --output-format=json` from 42min to 13min. --- diff --git a/cli/copr_cli/main.py b/cli/copr_cli/main.py index 7466e47..0f82d85 100644 --- a/cli/copr_cli/main.py +++ b/cli/copr_cli/main.py @@ -651,7 +651,7 @@ class Commands(object): :param args: argparse arguments provided by the user """ ownername, projectname = self.parse_name(args.project) - pagination = {"limit": 100} + pagination = {"limit": 1000} builds_list = self.client.build_proxy.get_list(ownername, projectname, pagination=pagination) diff --git a/frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_builds.py b/frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_builds.py index 1700afa..b0bdbb9 100644 --- a/frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_builds.py +++ b/frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_builds.py @@ -1,5 +1,7 @@ import os import flask +from sqlalchemy.orm import joinedload +from sqlalchemy import select from werkzeug.datastructures import MultiDict from werkzeug.utils import secure_filename @@ -80,9 +82,7 @@ def get_build(build_id): @query_params() def get_build_list(ownername, 
projectname, packagename=None, status=None, **kwargs): copr = get_copr(ownername, projectname) - query = BuildsLogic.get_multiple_by_copr(copr) - if packagename: - query = BuildsLogic.filter_by_package_name(query, packagename) + offset = kwargs.get("offset") or 0 # WORKAROUND # We can't filter builds by status directly in the database, because we @@ -93,13 +93,40 @@ def get_build_list(ownername, projectname, packagename=None, status=None, **kwar paginator_limit = None if status else kwargs["limit"] del kwargs["limit"] + # Selecting rows with large offsets (400k+) is slower (~10 times) than + # offset=0. There are not many options to get around it. To mitigate the + # slowdown at least a little (~10%), we can filter, offset, and limit within + # a subquery and then base the full-query on the subquery results. + subquery = ( + select(models.Build.id) + .filter(models.Build.copr == copr) + .limit(limit) + .offset(offset) + .subquery() + ) + + query = BuildsLogic.get_multiple().filter(models.Build.id.in_(subquery)) + if packagename: + query = BuildsLogic.filter_by_package_name(query, packagename) + + # Loading relationships straight away makes running `to_dict` somewhat + # faster, which adds up over time, and brings a significant speedup for + # large projects + query = query.options( + joinedload(models.Build.build_chroots), + joinedload(models.Build.package), + joinedload(models.Build.copr), + ) + paginator = Paginator(query, models.Build, limit=paginator_limit, **kwargs) + paginator.offset = 0 # We already applied offset within subquery builds = paginator.map(to_dict) if status: builds = [b for b in builds if b["state"] == status][:limit] paginator.limit = limit + paginator.offset = offset # So the client knows where to continue return flask.jsonify(items=builds, meta=paginator.meta) From b2a7448137c2c71527b454fb8ace0b9928c33f7d Mon Sep 17 00:00:00 2001 From: Jakub Kadlcik Date: Sep 29 2021 12:43:02 +0000 Subject: [PATCH 2/4] frontend: move the subquery hack into 
paginator --- diff --git a/frontend/coprs_frontend/coprs/views/apiv3_ns/__init__.py b/frontend/coprs_frontend/coprs/views/apiv3_ns/__init__.py index 5db8b0c..ffdaefc 100644 --- a/frontend/coprs_frontend/coprs/views/apiv3_ns/__init__.py +++ b/frontend/coprs_frontend/coprs/views/apiv3_ns/__init__.py @@ -118,8 +118,14 @@ class Paginator(object): if self.order == 'name': self.order_type = 'ASC' - def get(self): + return self.paginate_query(self.query) + + def paginate_query(self, query): + """ + Return the given query with all pagination parameters (limit, offset, + order) applied but do not run it. + """ order_attr = getattr(self.model, self.order, None) if not order_attr: msg = "Cannot order by {}, {} doesn't have such property".format( @@ -137,7 +143,7 @@ class Paginator(object): elif self.order_type == 'DESC': order_fun = sqlalchemy.desc - return (query.order_by(order_fun(order_attr)) + return (query.order_by(order_fun(order_attr)) .limit(self.limit) .offset(self.offset)) @@ -152,6 +158,27 @@ class Paginator(object): return [x.to_dict() for x in self.get()] +class SubqueryPaginator(Paginator): + """ + Selecting rows with large offsets (400k+) is slower (~10 times) than + offset=0. There are not many options to get around it. To mitigate the + slowdown at least a little (~10%), we can filter, offset, and limit within + a subquery and then base the full-query on the subquery results. 
+ """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.pk = getattr(self.model, "id") + self.subquery = self._create_subquery() + + def get(self): + query = self.query.filter(self.pk.in_(self.subquery.subquery())) + return query.all() + + def _create_subquery(self): + query = sqlalchemy.select(self.pk) + return self.paginate_query(query) + + class ListPaginator(Paginator): """ The normal `Paginator` class works with a SQLAlchemy query object and diff --git a/frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_builds.py b/frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_builds.py index b0bdbb9..772985c 100644 --- a/frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_builds.py +++ b/frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_builds.py @@ -1,13 +1,10 @@ import os import flask from sqlalchemy.orm import joinedload -from sqlalchemy import select from werkzeug.datastructures import MultiDict from werkzeug.utils import secure_filename -from . import get_copr, file_upload, query_params, pagination, Paginator, json2form, GET, POST, PUT, DELETE -from .json2form import get_form_compatible_data from copr_common.enums import StatusEnum from coprs import db, forms, models from coprs.exceptions import (BadRequest, AccessRestricted) @@ -16,6 +13,20 @@ from coprs.views.apiv3_ns import apiv3_ns from coprs.logic.complex_logic import ComplexLogic from coprs.logic.builds_logic import BuildsLogic +from . 
import ( + get_copr, + file_upload, + query_params, + pagination, + SubqueryPaginator, + json2form, + GET, + POST, + PUT, + DELETE, +) +from .json2form import get_form_compatible_data + def to_dict(build): return { @@ -82,7 +93,6 @@ def get_build(build_id): @query_params() def get_build_list(ownername, projectname, packagename=None, status=None, **kwargs): copr = get_copr(ownername, projectname) - offset = kwargs.get("offset") or 0 # WORKAROUND # We can't filter builds by status directly in the database, because we @@ -93,40 +103,28 @@ def get_build_list(ownername, projectname, packagename=None, status=None, **kwar paginator_limit = None if status else kwargs["limit"] del kwargs["limit"] - # Selecting rows with large offsets (400k+) is slower (~10 times) than - # offset=0. There is not many options to get around it. To mitigate the - # slowdown at least a little (~10%), we can filter, offset, and limit within - # a subquery and then base the full-query on the subquery results. - subquery = ( - select(models.Build.id) - .filter(models.Build.copr == copr) - .limit(limit) - .offset(offset) - .subquery() - ) - - query = BuildsLogic.get_multiple().filter(models.Build.id.in_(subquery)) - if packagename: - query = BuildsLogic.filter_by_package_name(query, packagename) - # Loading relationships straight away makes running `to_dict` somewhat # faster, which adds up over time, and brings a significant speedup for # large projects + query = BuildsLogic.get_multiple() query = query.options( joinedload(models.Build.build_chroots), joinedload(models.Build.package), joinedload(models.Build.copr), ) - paginator = Paginator(query, models.Build, limit=paginator_limit, **kwargs) - paginator.offset = 0 # We already applied offset within subquery + paginator = SubqueryPaginator(query, models.Build, limit=paginator_limit, **kwargs) + paginator.subquery = paginator.subquery.filter(models.Build.copr == copr) + if packagename: + paginator.subquery = BuildsLogic.filter_by_package_name( + 
paginator.subquery, packagename) + builds = paginator.map(to_dict) if status: builds = [b for b in builds if b["state"] == status][:limit] paginator.limit = limit - paginator.offset = offset # So the client knows where to continue return flask.jsonify(items=builds, meta=paginator.meta) From aca9d6abfb384fe17bc3869b311b466abd8ebd64 Mon Sep 17 00:00:00 2001 From: Jakub Kadlcik Date: Sep 29 2021 12:43:08 +0000 Subject: [PATCH 3/4] frontend: add index for combination of build.id and build.copr_id --- diff --git a/frontend/coprs_frontend/alembic/versions/b630bad8a01e_add_index_for_combination_of_build_id_.py b/frontend/coprs_frontend/alembic/versions/b630bad8a01e_add_index_for_combination_of_build_id_.py new file mode 100644 index 0000000..417b58e --- /dev/null +++ b/frontend/coprs_frontend/alembic/versions/b630bad8a01e_add_index_for_combination_of_build_id_.py @@ -0,0 +1,21 @@ +""" +Add index for combination of build.id and build.copr_id + +Revision ID: b630bad8a01e +Revises: d5990bd4aa46 +Create Date: 2021-09-28 15:14:31.574563 +""" + +from alembic import op + + +revision = 'b630bad8a01e' +down_revision = 'd5990bd4aa46' + + +def upgrade(): + op.create_index('build_copr_id_build_id', 'build', ['copr_id', 'id'], unique=True) + + +def downgrade(): + op.drop_index('build_copr_id_build_id', table_name='build') diff --git a/frontend/coprs_frontend/coprs/models.py b/frontend/coprs_frontend/coprs/models.py index 5c811d8..a50e80d 100644 --- a/frontend/coprs_frontend/coprs/models.py +++ b/frontend/coprs_frontend/coprs/models.py @@ -1040,6 +1040,7 @@ class Build(db.Model, helpers.Serializer): db.Index('build_canceled_is_background_source_status_id_idx', 'canceled', "is_background", "source_status", "id"), db.Index('build_copr_id_package_id', "copr_id", "package_id"), + db.Index("build_copr_id_build_id", "copr_id", "id", unique=True), db.Index("build_id_desc_per_copr_dir", id.desc(), "copr_dir_id"), ) From a2733bc201a1acb9168cad47ff16732de3cfd47c Mon Sep 17 00:00:00 2001 From: 
Jakub Kadlcik Date: Sep 29 2021 12:43:12 +0000 Subject: [PATCH 4/4] frontend: avoid additional query for main_dir.full_name AFAIK in all cases, full name of the main dir should be simply full name of the project itself. This change improves performance of listing builds via APIv3 and reduces time for listing all builds from the rubygems project from ~13min to ~6min. --- diff --git a/frontend/coprs_frontend/coprs/models.py b/frontend/coprs_frontend/coprs/models.py index a50e80d..446502c 100644 --- a/frontend/coprs_frontend/coprs/models.py +++ b/frontend/coprs_frontend/coprs/models.py @@ -594,7 +594,7 @@ class Copr(db.Model, helpers.Serializer): def repo_url(self): return "/".join([app.config["BACKEND_BASE_URL"], u"results", - self.main_dir.full_name]) + self.full_name]) @property def repo_id(self):