PR#2030: Paginate packages list in APIv3 - copr/copr

		`@@ -791,17 +791,35 @@`

		`def action_list_packages(self, args):`
		`ownername, projectname = self.parse_name(args.copr)`
		`- packages = self.client.package_proxy.get_list(ownername=ownername, projectname=projectname,`
		`- with_latest_build=args.with_latest_build,`
		`- with_latest_succeeded_build=args.with_latest_succeeded_build)`
		`- packages_with_builds = [self._package_with_builds(p, args) for p in packages]`
		`- fields = ["id", "auto_rebuild", "name", "ownername", "projectname", "source_dict",`
		`- "source_type", "latest_succeeded_build", "latest_build"]`
		`+ fields = [`
		`+ "id",`
		`+ "auto_rebuild",`
		`+ "name",`
		`+ "ownername",`
		`+ "projectname",`
		`+ "source_dict",`
		`+ "source_type",`
		`+ "latest_succeeded_build",`
		`+ "latest_build"`
		`+ ]`
		`if args.with_all_builds:`
		`fields.insert(1, "builds")`
		`+`
		`+ pagination = {"limit": 1000}`
		`+ packages = self.client.package_proxy.get_list(`
		`+ ownername=ownername, projectname=projectname,`
		`+ with_latest_build=args.with_latest_build,`
		`+ with_latest_succeeded_build=args.with_latest_succeeded_build,`
		`+ pagination=pagination,`
		`+ )`
		`+`
		`printer = get_printer(args.output_format, fields, True)`
		`- for data in packages_with_builds:`
		`- printer.add_data(data)`
		`+ while packages:`
		`+ packages_with_builds = [self._package_with_builds(p, args)`
		`+ for p in packages]`
		`+ for data in packages_with_builds:`
		`+ printer.add_data(data)`
		`+ packages = next_page(packages)`
		`printer.finish()`

		`def action_list_package_names(self, args):`

cli/tests/resources/list_packages_response.json

file modified

+1 -1

		`@@ -133,7 +133,7 @@`
		`}`
		`],`
		`"meta": {`
		`- "limit": null,`
		`+ "limit": 1000,`
		`"offset": 0,`
		`"order": "id",`
		`"order_type": "ASC"`

cli/tests/test_cli.py

file modified

+3 -1

		`@@ -218,8 +218,9 @@`
		`assert out.split("\n") == expected_output.split("\n")`

		`@responses.activate`
		`+ @mock.patch("copr_cli.main.next_page")`
		`@mock.patch('copr_cli.main.config_from_file')`
		`- def test_list_packages(config_from_file, capsys):`
		`+ def test_list_packages(config_from_file, next_page, capsys):`
		`response_data = json.loads(read_res('list_packages_response.json'))`
		`expected_output = read_res('list_packages_expected.json')`

		`@@ -230,6 +231,7 @@`

		`# no config`
		`config_from_file.side_effect = copr.v3.CoprNoConfigException("Dude, your config is missing")`
		`+ next_page.return_value = None`

		`main.main(argv=["list-packages", "praiskup/ping"])`
		`out, _ = capsys.readouterr()`

frontend/coprs_frontend/coprs/logic/packages_logic.py

file modified

+1 -1

		`@@ -59,7 +59,7 @@`
		`use the given 'packages' array.`
		`:return: array of Package objects, with assigned latest Build object`
		`"""`
		`- if not packages:`
		`+ if packages is None:`
		`packages = cls.get_all_ordered(copr_dir_id).all()`

		`pkg_ids = [package.id for package in packages]`

frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_packages.py

file modified

+37 -6

		`@@ -18,9 +18,13 @@`
		`# @TODO if we need to do this on several places, we should figure a better way to do it`
		`from coprs.views.apiv3_ns.apiv3_builds import to_dict as build_to_dict`

		`- from . import query_params, pagination, get_copr, ListPaginator, GET, POST, PUT, DELETE`
		`+ from . import query_params, pagination, get_copr, GET, POST, PUT, DELETE, Paginator`
		`from .json2form import get_form_compatible_data`

		`+`
		`+ MAX_PACKAGES_WITHOUT_PAGINATION = 10000`
		`+`
		`+`
		`def to_dict(package, with_latest_build=False, with_latest_succeeded_build=False):`
		`source_dict = package.source_json_dict`
		`if "srpm_build_method" in source_dict:`
		`@@ -28,7 +32,19 @@`

		`latest = None`
		`if with_latest_build:`
		`- latest = package.last_build()`
		`+ # If the package was obtained via performance-friendly method`
		+ # `PackagesLogic.get_packages_with_latest_builds_for_dir`,
		+ # the `package` is still a `models.Package` object but it has one more
		`+ # attribute to it (if the package indeed has at least one build).`
		+ # In such case, the `latest_build` value is already set and using it
		`+ # costs us nothing (no additional SQL query)`
		`+ latest = getattr(package, "latest_build", None)`
		`+`
		+ # If the performance-friendly `latest_build` variable wasn't set because
		+ # the `package` was obtained differently, e.g. `PackagesLogic.get`, we
		`+ # need to fetch it`
		`+ if not latest:`
		`+ latest = package.last_build()`
		`latest = build_to_dict(latest) if latest else None`

		`latest_succeeded = None`
		`@@ -106,11 +122,26 @@`
		`with_latest_succeeded_build = get_arg_to_bool(with_latest_succeeded_build)`

		`copr = get_copr(ownername, projectname)`
		`- packages = PackagesLogic.get_packages_with_latest_builds_for_dir(copr.main_dir.id, small_build=False)`
		`- paginator = ListPaginator(packages, models.Package, **kwargs)`
		`+ query = PackagesLogic.get_all(copr.main_dir.id)`
		`+ paginator = Paginator(query, models.Package, **kwargs)`
		`+ packages = paginator.get().all()`

		`- packages = paginator.map(lambda x: to_dict(x, with_latest_build, with_latest_succeeded_build))`
		`- return flask.jsonify(items=packages, meta=paginator.meta)`
		`+ if len(packages) > MAX_PACKAGES_WITHOUT_PAGINATION:`
		`+ raise ApiError("Too many packages, please use pagination. "`
		`+ "Requests are limited to only {0} packages at once."`
		`+ .format(MAX_PACKAGES_WITHOUT_PAGINATION))`
		`+`
		`+ # Query latest builds for all packages at once. We can't use this solution`
		`+ # for querying latest successfull builds, so that will be a little slower`
		`+ if with_latest_build:`
		`+ packages = PackagesLogic.get_packages_with_latest_builds_for_dir(`
		`+ copr.main_dir.id,`
		`+ small_build=False,`
		`+ packages=packages)`
		`+`
		`+ items = [to_dict(p, with_latest_build, with_latest_succeeded_build)`
		`+ for p in packages]`
		`+ return flask.jsonify(items=items, meta=paginator.meta)`


		`@apiv3_ns.route("/package/add/<ownername>/<projectname>/<package_name>/<source_type_text>", methods=POST)`

python/copr/v3/exceptions.py

file modified

		`@@ -7,6 +7,7 @@`
		`"""`
		`def __init__(self, msg=None, response=None):`
		`super(CoprException, self).__init__(msg)`
		`+ msg = msg or "Unspecified error"`
		`self.result = Munch(error=msg, __response__=response)`

python/copr/v3/requests.py

file modified

		`@@ -111,5 +111,14 @@`
		`raise CoprRequestException(response_json["error"], response=response)`

		`except ValueError:`
		`+ # When the request timeouted on the apache layer, we couldn't return a`
		`+ # nice JSON response and therefore its parsing fails.`
		`+ if response.status_code == 504:`
		`+ message = ("{0}\nConsider using pagination for large queries."`
		`+ .format(response.reason))`
		`+ # We can't raise-from because of EPEL7`
		`+ # pylint: disable=raise-missing-from`
		`+ raise CoprTimeoutException(message, response=response)`
		`+`
		`raise CoprRequestException("Request is not in JSON format, there is probably a bug in the API code.",`
		`response=response)`

frostyx commented 2 years ago

Fix RHBZ 2036631
See #757

The #757 was mainly fixed in PR#1433 and mainly PR#1914 and reduced
from 45 minutes on the @rubygems/rubygems project to ~2 minutes. The
last issue is that request isn't paginated and returns everything at
once, therefore there isn't any CLI output for minutes, and that it
timeouts in production (because apache timeouts after 60s by
default). This PR fixes it.

The speed of CLI command using --with-latest-succeeded-build remains
the same (2 minutes for @rubygems/rubygems) but the speed of
--with-latest-build is now twice as fast (1 minute for
@rubygems/rubygems).

zuul commented 2 years ago

Build succeeded.

csdiff-pylint : SUCCESS in 2m 23s

frostyx commented 2 years ago

I know we have streamed_json_array_response now but I wanted to stick with pagination because everything in APIv3 uses it, so I wanted to be consistent.

But I think in the future, maybe we can add some parameter to the API and let users pick either of those.

2 new commits added

frontend, cli: paginate packages list in APIv3
frontend: don't query all packages when empty list is specified

2 years ago

zuul commented 2 years ago

Build succeeded.

csdiff-pylint : SUCCESS in 1m 45s

2 new commits added

frontend, cli: paginate packages list in APIv3
frontend: don't query all packages when empty list is specified

2 years ago

zuul commented 2 years ago

Build succeeded.

csdiff-pylint : SUCCESS in 2m 58s

praiskup commented 2 years ago

Thank you for working on this. I would prefer to remove these one CLI query => multiple API request approach occurrences, then add new.

Also, is the old CLI compatible with new API, and vice versa?

Long term, I think we should have some kind of cursor-based pagination,
by example https://www.sitepoint.com/paginating-real-time-data-cursor-based-pagination/

Also, it doesn't answer the question of why "mutliple" API requests are less
expensive than asking all the in one.

Ok, I had to say it, but I'm afraid will have to accept this anyway :-) as
a better solution would cost us too much work, I bet.

Metadata Update from @frostyx:
- Pull-request tagged with: needs-work

2 years ago

2 new commits added

frontend, cli: paginate packages list in APIv3
frontend: don't query all packages when empty list is specified

2 years ago

zuul commented 2 years ago

Build succeeded.

csdiff-pylint : SUCCESS in 1m 53s

praiskup commented 2 years ago

Is this still WIP?

frostyx commented 2 years ago

Thank you for working on this. I would prefer to remove these one CLI query => multiple API request approach occurrences, then add new.

I added this one just to be consistent with the rest of the API methods. Maybe we can add support for streaming into all of them using streamed_json_array_response

Also, is the old CLI compatible with new API, and vice versa?

Worked for me

Long term, I think we should have some kind of cursor-based pagination,
by example https://www.sitepoint.com/paginating-real-time-data-cursor-based-pagination/

No problem with me

Also, it doesn't answer the question of why "mutliple" API requests are less
expensive than asking all the in one.

It's complicated and honestly, I don't truly understand it. I observed this:

Returning all packages without pagination takes too long and timeouts
Returning one page takes a second and therefore solves the issue
Returning all packages with pagination is not faster than the unpaginated version but at least it doesn't timeout.
Returning all packages without pagination spikes the RAM usage up 2GBs, this doesn't happen when using the pagination

I don't have better explanations though.

I am all for moving to some better solution than pagination in the future but we need to do that consistently for all API routes, not just this one :-)

Metadata Update from @frostyx:
- Pull-request untagged with: needs-work

2 years ago

frostyx commented 2 years ago

Is this still WIP?

There was a small issue that tracebacked. Fixed now,

Metadata Update from @praiskup:
- Request assigned

2 years ago

praiskup commented on line 15 of frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_packages.py 2 years ago

This got my attention :-) the package can be None, or why?

praiskup commented 2 years ago

Otherwise I think we should go with this, thanks.

frostyx commented on line 15 of frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_packages.py 2 years ago

The package cannot but the latest_build can (when the package was defined manually and not yet submitted to be built).

praiskup commented on line 15 of frontend/coprs_frontend/coprs/views/apiv3_ns/apiv3_packages.py 2 years ago

Hmm, I don't get how this is possible. Is it an object of frontend/coprs_frontend/coprs/models.py:Package class? If not I believe this deserves a comment at the very least, but /me shrugs.

2 new commits added

frontend, cli: paginate packages list in APIv3
frontend: don't query all packages when empty list is specified

2 years ago

frostyx commented 2 years ago

Hmm, I don't get how this is possible. Is it an object of
frontend/coprs_frontend/coprs/models.py:Package class? If not I
believe this deserves a comment at the very least, but /me shrugs.

Oh, I understand you now. I updated the code and added some explanation
in a comment (and also fixed a bug, so thanks). Does it make more
sense now?

zuul commented 2 years ago

Build succeeded.

csdiff-pylint : SUCCESS in 3m 08s

praiskup commented 2 years ago

What worries me is the large query done by the old copr-cli version (or via API). Perhaps, if the query is too large, we could raise some useful error that "this is too large to process, use pagination..."?

frostyx commented 2 years ago

What worries me is the large query done by the old copr-cli version (or via API). Perhaps, if the query is too large, we could raise some useful error that "this is too large to process, use pagination..."?

I know that it isn't what you are asking for but it should be fairly easy to raise some helpful exception in python-copr.

As for your suggestion, if I understand correctly, you want frontend to return some helpful message when the request timeouts? I don't know if this will be possible because IIRC the error doesn't come from copr-frontend code but from apache.

praiskup commented 2 years ago

I meant something like "E: this request has more than NNNN rows, cannot be processed at once. Use pagination.".

2 new commits added

frontend: limit max number of packages per request
python: raise user-friendly exception when on request timeout

2 years ago

zuul commented 2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

csdiff-pylint : FAILURE in 1m 49s

4 new commits added

frontend: limit max number of packages per request
python: raise user-friendly exception when on request timeout
frontend, cli: paginate packages list in APIv3
frontend: don't query all packages when empty list is specified

2 years ago

zuul commented 2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

csdiff-pylint : FAILURE in 3m 15s

4 new commits added

frontend: limit max number of packages per request
python: raise user-friendly exception when on request timeout
frontend, cli: paginate packages list in APIv3
frontend: don't query all packages when empty list is specified

2 years ago

zuul commented 2 years ago

Build succeeded.

csdiff-pylint : SUCCESS in 2m 11s

rebased onto dd3acf1

2 years ago

zuul commented 2 years ago

Build succeeded.

csdiff-pylint : SUCCESS in 3m 09s

1 new commit added

python: fix exception caused by default msg value

2 years ago

zuul commented 2 years ago

Build succeeded.

csdiff-pylint : SUCCESS in 1m 58s

frostyx commented 2 years ago

[copr-build]

frostyx commented 2 years ago

I added 3 new commits, PTAL.
I think the the copr-cli build fails because (for some reason) it doesn't install the python3-copr from this PR.

praiskup commented 2 years ago

I think the the copr-cli build fails because (for some reason) it doesn't install the python3-copr

Right. That's how it is done,... the packages are built only against the @copr/copr-dev
repo (not the PR directory). I don't really get why F35 ppc64le
succeeded, though:
https://copr.fedorainfracloud.org/coprs/g/copr/copr-dev/build/3246500/

I manually built the python3-copr package now in @copr/copr-dev.

[copr-build]

praiskup commented 2 years ago