#1914 cli: print JSON list continuously, not at once when all items are fetched
Merged 2 months ago by frostyx. Opened 2 months ago by frostyx.
copr/ frostyx/copr paginated-json  into  main

file modified
+45 -25
@@ -186,40 +186,60 @@ 

  class JsonPrinter(AbstractPrinter):

      """The class takes care of printing the data in json format"""

  

-     def __init__(self, fields):

-         super(JsonPrinter, self).__init__(fields)

-         self.output_info = []

-         self.json_values_list = []

- 

-     def __process_fields(self, data):

-         desired_info = []

+     def _get_result_json(self, data):

+         result = {}

          for field in self.fields:

              if callable(field):

-                 desired_info.append(field(data))

+                 name = field.__code__.co_varnames[0]

+                 result[name] = field(data)

              else:

-                 desired_info.append(data[field])

-         self.output_info.append(desired_info)

+                 result[field] = data[field]

+         return json_dumps(result)

  

      def add_data(self, data):

-         json_values = {}

-         self.__process_fields(data)

-         for item in self.output_info:

-             for field, info in zip(self.fields, item):

-                 if callable(field):

-                     json_values[field.__code__.co_varnames[0]] = info

-                 else:

-                     json_values[field] = info

-         self.json_values_list.append(json_values)

+         print(self._get_result_json(data))

  

      def finish(self):

-         print(json_dumps(self.json_values_list[0]))

+         pass

  

  

  class JsonPrinterListCommand(JsonPrinter):

-     """The class takes care of printing the data in list in json format"""

+     """

+     The class takes care of printing the data in list in json format

+ 

+     For performance reasons we cannot simply add everything to a list and then

+     use `json_dumps` function to print all JSON at once. For large projects this

+     would mean minutes of no output, which is not user-friendly.

+ 

+     The approach here is to utilize `json_dumps` to convert each object to

+     a JSON string and print it immediately. We then manually take care of

+     opening and closing list brackets and separators.

+     """

+ 

+     first_line = True

+ 

+     def add_data(self, data):

+         self.start()

+         result = self._get_result_json(data)

+         for line in result.split("\n"):

+             print("    {0}".format(line))

+ 

+     def start(self):

+         """

+         This is called before printing the first object

+         """

+         if self.first_line:

+             self.first_line = False

+             print("[")

+         else:

+             print("    ,")

  

      def finish(self):

-         print(json_dumps(self.json_values_list))

+         """

+         This is called by the user after printing all objects

+         """

+         if not self.first_line:

+             print("]")

  

  

  def get_printer(output_format, fields, list_command=False):
@@ -640,13 +660,13 @@ 

              sys.stderr.write(

                  "The default setting will be changed from text-row to json in the following releases\n")

  

+         fields = ["id", lambda name: name["source_package"]["name"], "state"]

+         printer = get_printer(args.output_format, fields, True)

          while builds_list:

-             fields = ["id", lambda name: name["source_package"]["name"], "state"]

-             printer = get_printer(args.output_format, fields, True)

              for data in builds_list:

                  printer.add_data(data)

-             printer.finish()

              builds_list = next_page(builds_list)

+         printer.finish()

  

      def action_mock_config(self, args):

          """ Method called when the 'mock-config' action has been selected by the

For performance reasons we cannot simply add everything to a list and then
use json_dumps function to print all JSON at once. For large projects this
would mean (tens) of minutes of no output, which is not user-friendly.

Instead, let's utilize json_dumps to convert each object to a JSON
string and print it immediately. By doing so, we need to manually take
care of opening and closing list brackets and separators.

Within this commit, I also did two unrelated changes that were
necessary because the code is too tied together.

I simplified the JsonPrinter as much as possible since 1) it was really
hard to understand, and 2) there was a leak causing iteration over
the whole self.output_info over and over again each time a new item
was added, resulting in the printer being increasingly slower.

I also updated the printer usage for listing builds. It doesn't make
sense to create a new printer for each page of builds. Instead, we
should simply create a printer, use it for the whole loop and finish
after that.

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

rebased onto b21ba554d0de7c566cf5a7fa090ce6d26944c7b5

2 months ago

Build succeeded.

I tried your code and my code on the copr-dev project, there are 1869 builds and list-builds takes about 18 seconds and the result is loaded gradually in my implementation.

I agree that It doesn't make sense to create a new printer for each page of builds. However I'm not sure if the other changes are desirable and I'd rather wait for praiskup's statement, because see https://pagure.io/copr/copr/pull-request/1695#comment-142930.

I tried your code and my code on the copr-dev project, there are 1869 builds and list-builds takes about 18 seconds and the result is loaded gradually in my implementation.

Isn't it still too slow?

For performance reasons we cannot simply add everything to a list and then
use json_dumps function to print all JSON at once. For large projects this
would mean (tens) of minutes of no output, which is not user-friendly.

I'd rather wait for praiskup's statement

I am open to talk about this :-) but indeed don't think it is correct first step ...

Dumping large JSONs isn't that much expensive action. So I would say that we should have fast API first, ... JSON (the current format) isn't designed for streaming ... :-/ and implementing this is a bit of hack.... And the other formats (like tables, etc.) would have problems as well.

Thank you for the review

and the result is loaded gradually in my implementation.
However I'm not sure if the other changes are desirable

The problem is, that the current implementation doesn't produce a valid json

From main

$ copr-cli list-builds @copr/copr-dev --output-format=json > /tmp/list-builds-1.json
$ python -c "import json; f=open('/tmp/list-builds-1.json', 'r'); json.load(f); f.close();"
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/lib64/python3.9/json/__init__.py", line 293, in load
    return loads(fp.read(),
  File "/usr/lib64/python3.9/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/usr/lib64/python3.9/json/decoder.py", line 340, in decode
    raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 503 column 1 (char 9673)

From this PR

$ copr-cli list-builds @copr/copr-dev --output-format=json > /tmp/list-builds-2.json
$ python -c "import json; f=open('/tmp/list-builds-2.json', 'r'); json.load(f); f.close();"
$ echo $?
0

there are 1869 builds and list-builds takes about 18 seconds

Could this be some glitch? I am testing it and both main and this PR give me the same values (around 4.5s).

$ time copr-cli list-builds @copr/copr-dev --output-format=json
...
real    0m4.444s

So I would say that we should have fast API first, ...

Agreed, I am trying to figure out where is the bottleneck on the frontend side.

JSON (the current format) isn't designed for streaming ... :-/ and implementing this is a bit of hack....

However, IMHO no matter how fast the frontend is, it will never be that fast to return projects like rubygems within a couple of seconds and therefore streaming will be required anyway. I agree the implementation is hacky (and probably all of them will be) but I don't think there is another option.

I am trying to figure out where is the bottleneck on the frontend side.

I found two bottlenecks

  1. apiv3_builds.py:to_dict function is a bit slow and it seems to help if we add joinedload for relationships into the query.
  2. We are querying 100 results per page. It is blazing fast from the beginning but gets 10 times slower when approaching large offsets (e.g. 414000). As I understand it, it is a common limitation that we cannot easily deal with. Correct me if I am wrong. I found a nice workaround in setting the limit to 1000 results per page. We will search for offsets exactly as long as previously but then query much more results, resulting in a lot less total queries.

With this, I was able to reduce

time copr-cli list-builds @rubygems/rubygems --output-format=json

from 42m21.379s to 12m52.401s.

I will submit the changes in a separate PR though. In this one, I would like to focus on fixing the copr-cli to produce a valid JSON.

We are querying 100 results per page. It is blazing fast from the beginning but gets 10 times slower when approaching large offsets (e.g. 414000). As I understand it, it is a common limitation that we cannot easily deal with.

Several options in the following answer, the most trivial I like is #2, dunno if applicable:
https://stackoverflow.com/questions/6618366/improving-offset-performance-in-postgresql

Let me summarize the current state.

So I would say that we should have fast API first, ...
I found two bottlenecks

I created PR#1931 for listing builds speedup

Several options in the following answer, the most trivial I like is #2, dunno if applicable:
https://stackoverflow.com/questions/6618366/improving-offset-performance-in-postgresql

Thank you for the tip, I tested it and created PR#1930, which on rubygems project cuts two more minutes in comparison to PR#1931. Please see those PRs and decide which you like better - the other one will get canceled.

This means that this PR only deals with client-side performance and leaving frontend alone.

There are basically two things that this PR is trying to achieve.

1) The current implementation of JsonPrinterListCommand doesn't produce a valid JSON - see my previous comment for more details https://pagure.io/copr/copr/pull-request/1914#comment-157933

2) As I described in the commit message

there was a leak causing iteration over
the whole self.output_info over and over again each time a new item
was added, resulting in the printer being increasingly slower.

This is the main reason listing packages is slow. With this PR, listing rubygems packages goes from 40min to 35sec.

time copr-cli list-packages @rubygems/rubygems --output-format=json

I will update the commit message to describe those reasons more clearly.

rebased onto 45df92f49c60cfda23041af7aa1953deed29b653

2 months ago

+1 (pls change the commit message and merge)

Build succeeded.

rebased onto 842b259a76a6fc1495aadbff82c67322e08ea8c8

2 months ago

Build succeeded.

rebased onto bd5413c

2 months ago

Build succeeded.

Pull-Request has been merged by frostyx

2 months ago
Metadata