#2030 Paginate packages list in APIv3
Merged 2 years ago by praiskup. Opened 2 years ago by frostyx.
copr/ frostyx/copr cli-paginated-packages-2  into  main

file modified
+26 -8
@@ -791,17 +791,35 @@ 

  

      def action_list_packages(self, args):

          ownername, projectname = self.parse_name(args.copr)

-         packages = self.client.package_proxy.get_list(ownername=ownername, projectname=projectname,

-                                                       with_latest_build=args.with_latest_build,

-                                                       with_latest_succeeded_build=args.with_latest_succeeded_build)

-         packages_with_builds = [self._package_with_builds(p, args) for p in packages]

-         fields = ["id", "auto_rebuild", "name", "ownername", "projectname", "source_dict",

-                   "source_type", "latest_succeeded_build", "latest_build"]

+         fields = [

+             "id",

+             "auto_rebuild",

+             "name",

+             "ownername",

+             "projectname",

+             "source_dict",

+             "source_type",

+             "latest_succeeded_build",

+             "latest_build"

+         ]

          if args.with_all_builds:

              fields.insert(1, "builds")

+ 

+         pagination = {"limit": 1000}

+         packages = self.client.package_proxy.get_list(

+             ownername=ownername, projectname=projectname,

+             with_latest_build=args.with_latest_build,

+             with_latest_succeeded_build=args.with_latest_succeeded_build,

+             pagination=pagination,

+         )

+ 

          printer = get_printer(args.output_format, fields, True)

-         for data in packages_with_builds:

-             printer.add_data(data)

+         while packages:

+             packages_with_builds = [self._package_with_builds(p, args)

+                                     for p in packages]

+             for data in packages_with_builds:

+                 printer.add_data(data)

+             packages = next_page(packages)

          printer.finish()

  

      def action_list_package_names(self, args):

@@ -133,7 +133,7 @@ 

      }

    ],

    "meta": {

-     "limit": null,

+     "limit": 1000,

      "offset": 0,

      "order": "id",

      "order_type": "ASC"

file modified
+3 -1
@@ -218,8 +218,9 @@ 

      assert out.split("\n") == expected_output.split("\n")

  

  @responses.activate

+ @mock.patch("copr_cli.main.next_page")

  @mock.patch('copr_cli.main.config_from_file')

- def test_list_packages(config_from_file, capsys):

+ def test_list_packages(config_from_file, next_page, capsys):

      response_data = json.loads(read_res('list_packages_response.json'))

      expected_output = read_res('list_packages_expected.json')

  
@@ -230,6 +231,7 @@ 

  

      # no config

      config_from_file.side_effect = copr.v3.CoprNoConfigException("Dude, your config is missing")

+     next_page.return_value = None

  

      main.main(argv=["list-packages", "praiskup/ping"])

      out, _ = capsys.readouterr()

@@ -59,7 +59,7 @@ 

              use the given 'packages' array.

          :return: array of Package objects, with assigned latest Build object

          """

-         if not packages:

+         if packages is None:

              packages = cls.get_all_ordered(copr_dir_id).all()

  

          pkg_ids = [package.id for package in packages]

@@ -18,9 +18,13 @@ 

  # @TODO if we need to do this on several places, we should figure a better way to do it

  from coprs.views.apiv3_ns.apiv3_builds import to_dict as build_to_dict

  

- from . import query_params, pagination, get_copr, ListPaginator, GET, POST, PUT, DELETE

+ from . import query_params, pagination, get_copr, GET, POST, PUT, DELETE, Paginator

  from .json2form import get_form_compatible_data

  

+ 

+ MAX_PACKAGES_WITHOUT_PAGINATION = 10000

+ 

+ 

  def to_dict(package, with_latest_build=False, with_latest_succeeded_build=False):

      source_dict = package.source_json_dict

      if "srpm_build_method" in source_dict:
@@ -28,7 +32,19 @@ 

  

      latest = None

      if with_latest_build:

-         latest = package.last_build()

+         # If the package was obtained via performance-friendly method

+         # `PackagesLogic.get_packages_with_latest_builds_for_dir`,

+         # the `package` is still a `models.Package` object but it has one more

+         # attribute to it (if the package indeed has at least one build).

+         # In such case, the `latest_build` value is already set and using it

+         # costs us nothing (no additional SQL query)

+         latest = getattr(package, "latest_build", None)

+ 

+         # If the performance-friendly `latest_build` variable wasn't set because

+         # the `package` was obtained differently, e.g. `PackagesLogic.get`, we

+         # need to fetch it

+         if not latest:

+             latest = package.last_build()

          latest = build_to_dict(latest) if latest else None

  

      latest_succeeded = None
@@ -106,11 +122,26 @@ 

      with_latest_succeeded_build = get_arg_to_bool(with_latest_succeeded_build)

  

      copr = get_copr(ownername, projectname)

-     packages = PackagesLogic.get_packages_with_latest_builds_for_dir(copr.main_dir.id, small_build=False)

-     paginator = ListPaginator(packages, models.Package, **kwargs)

+     query = PackagesLogic.get_all(copr.main_dir.id)

+     paginator = Paginator(query, models.Package, **kwargs)

+     packages = paginator.get().all()

  

-     packages = paginator.map(lambda x: to_dict(x, with_latest_build, with_latest_succeeded_build))

-     return flask.jsonify(items=packages, meta=paginator.meta)

+     if len(packages) > MAX_PACKAGES_WITHOUT_PAGINATION:

+         raise ApiError("Too many packages, please use pagination. "

+                        "Requests are limited to only {0} packages at once."

+                        .format(MAX_PACKAGES_WITHOUT_PAGINATION))

+ 

+     # Query latest builds for all packages at once. We can't use this solution

+     # for querying latest successfull builds, so that will be a little slower

+     if with_latest_build:

+         packages = PackagesLogic.get_packages_with_latest_builds_for_dir(

+             copr.main_dir.id,

+             small_build=False,

+             packages=packages)

+ 

+     items = [to_dict(p, with_latest_build, with_latest_succeeded_build)

+              for p in packages]

+     return flask.jsonify(items=items, meta=paginator.meta)

  

  

  @apiv3_ns.route("/package/add/<ownername>/<projectname>/<package_name>/<source_type_text>", methods=POST)

@@ -7,6 +7,7 @@ 

      """

      def __init__(self, msg=None, response=None):

          super(CoprException, self).__init__(msg)

+         msg = msg or "Unspecified error"

          self.result = Munch(error=msg, __response__=response)

  

  

@@ -111,5 +111,14 @@ 

          raise CoprRequestException(response_json["error"], response=response)

  

      except ValueError:

+         # When the request timeouted on the apache layer, we couldn't return a

+         # nice JSON response and therefore its parsing fails.

+         if response.status_code == 504:

+             message = ("{0}\nConsider using pagination for large queries."

+                        .format(response.reason))

+             # We can't raise-from because of EPEL7

+             # pylint: disable=raise-missing-from

+             raise CoprTimeoutException(message, response=response)

+ 

          raise CoprRequestException("Request is not in JSON format, there is probably a bug in the API code.",

                                     response=response)

Fix RHBZ 2036631
See #757

The #757 was mainly fixed in PR#1433 and mainly PR#1914 and reduced
from 45 minutes on the @rubygems/rubygems project to ~2 minutes. The
last issue is that request isn't paginated and returns everything at
once, therefore there isn't any CLI output for minutes, and that it
timeouts in production (because apache timeouts after 60s by
default). This PR fixes it.

The speed of CLI command using --with-latest-succeeded-build remains
the same (2 minutes for @rubygems/rubygems) but the speed of
--with-latest-build is now twice as fast (1 minute for
@rubygems/rubygems).

Build succeeded.

I know we have streamed_json_array_response now but I wanted to stick with pagination because everything in APIv3 uses it, so I wanted to be consistent.

But I think in the future, maybe we can add some parameter to the API and let users pick either of those.

2 new commits added

  • frontend, cli: paginate packages list in APIv3
  • frontend: don't query all packages when empty list is specified
2 years ago

Build succeeded.

2 new commits added

  • frontend, cli: paginate packages list in APIv3
  • frontend: don't query all packages when empty list is specified
2 years ago

Build succeeded.

Thank you for working on this. I would prefer to remove these one CLI query => multiple API request approach occurrences, then add new.

Also, is the old CLI compatible with new API, and vice versa?

Long term, I think we should have some kind of cursor-based pagination,
by example https://www.sitepoint.com/paginating-real-time-data-cursor-based-pagination/

Also, it doesn't answer the question of why "mutliple" API requests are less
expensive than asking all the in one.

Ok, I had to say it, but I'm afraid will have to accept this anyway :-) as
a better solution would cost us too much work, I bet.

Metadata Update from @frostyx:
- Pull-request tagged with: needs-work

2 years ago

2 new commits added

  • frontend, cli: paginate packages list in APIv3
  • frontend: don't query all packages when empty list is specified
2 years ago

Build succeeded.

Thank you for working on this. I would prefer to remove these one CLI query => multiple API request approach occurrences, then add new.

I added this one just to be consistent with the rest of the API methods. Maybe we can add support for streaming into all of them using streamed_json_array_response

Also, is the old CLI compatible with new API, and vice versa?

Worked for me

Long term, I think we should have some kind of cursor-based pagination,
by example https://www.sitepoint.com/paginating-real-time-data-cursor-based-pagination/

No problem with me

Also, it doesn't answer the question of why "mutliple" API requests are less
expensive than asking all the in one.

It's complicated and honestly, I don't truly understand it. I observed this:

  • Returning all packages without pagination takes too long and timeouts
  • Returning one page takes a second and therefore solves the issue
  • Returning all packages with pagination is not faster than the unpaginated version but at least it doesn't timeout.
  • Returning all packages without pagination spikes the RAM usage up 2GBs, this doesn't happen when using the pagination

I don't have better explanations though.

I am all for moving to some better solution than pagination in the future but we need to do that consistently for all API routes, not just this one :-)

Metadata Update from @frostyx:
- Pull-request untagged with: needs-work

2 years ago

Is this still WIP?

There was a small issue that tracebacked. Fixed now,

Metadata Update from @praiskup:
- Request assigned

2 years ago

This got my attention :-) the package can be None, or why?

Otherwise I think we should go with this, thanks.

The package cannot but the latest_build can (when the package was defined manually and not yet submitted to be built).

Hmm, I don't get how this is possible. Is it an object of frontend/coprs_frontend/coprs/models.py:Package class? If not I believe this deserves a comment at the very least, but /me shrugs.

2 new commits added

  • frontend, cli: paginate packages list in APIv3
  • frontend: don't query all packages when empty list is specified
2 years ago

Hmm, I don't get how this is possible. Is it an object of
frontend/coprs_frontend/coprs/models.py:Package class? If not I
believe this deserves a comment at the very least, but /me shrugs.

Oh, I understand you now. I updated the code and added some explanation
in a comment (and also fixed a bug, so thanks). Does it make more
sense now?

Build succeeded.

What worries me is the large query done by the old copr-cli version (or via API). Perhaps, if the query is too large, we could raise some useful error that "this is too large to process, use pagination..."?

What worries me is the large query done by the old copr-cli version (or via API). Perhaps, if the query is too large, we could raise some useful error that "this is too large to process, use pagination..."?

I know that it isn't what you are asking for but it should be fairly easy to raise some helpful exception in python-copr.

As for your suggestion, if I understand correctly, you want frontend to return some helpful message when the request timeouts? I don't know if this will be possible because IIRC the error doesn't come from copr-frontend code but from apache.

I meant something like "E: this request has more than NNNN rows, cannot be processed at once. Use pagination.".

2 new commits added

  • frontend: limit max number of packages per request
  • python: raise user-friendly exception when on request timeout
2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

4 new commits added

  • frontend: limit max number of packages per request
  • python: raise user-friendly exception when on request timeout
  • frontend, cli: paginate packages list in APIv3
  • frontend: don't query all packages when empty list is specified
2 years ago

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

4 new commits added

  • frontend: limit max number of packages per request
  • python: raise user-friendly exception when on request timeout
  • frontend, cli: paginate packages list in APIv3
  • frontend: don't query all packages when empty list is specified
2 years ago

Build succeeded.

rebased onto dd3acf1

2 years ago

Build succeeded.

1 new commit added

  • python: fix exception caused by default msg value
2 years ago

Build succeeded.

I added 3 new commits, PTAL.
I think the the copr-cli build fails because (for some reason) it doesn't install the python3-copr from this PR.

I think the the copr-cli build fails because (for some reason) it doesn't install the python3-copr

Right. That's how it is done,... the packages are built only against the @copr/copr-dev
repo (not the PR directory). I don't really get why F35 ppc64le
succeeded, though:
https://copr.fedorainfracloud.org/coprs/g/copr/copr-dev/build/3246500/

I manually built the python3-copr package now in @copr/copr-dev.

[copr-build]

Commit f47eb42 fixes this pull-request

Pull-Request has been merged by praiskup

2 years ago

Commit c182564 fixes this pull-request

Pull-Request has been merged by praiskup

2 years ago

Commit e169691 fixes this pull-request

Pull-Request has been merged by praiskup

2 years ago

Commit a99b81a fixes this pull-request

Pull-Request has been merged by praiskup

2 years ago

Commit a88095f fixes this pull-request

Pull-Request has been merged by praiskup

2 years ago