#1175 backend: attempt to send fedora-messaging message N-times
Merged 4 years ago by msuchy. Opened 4 years ago by praiskup.
Unknown source fm-api-retries  into  master

file modified
+14 -6
@@ -148,6 +148,8 @@

          # Fix bus_id soon enough.

          self.opts.bus_id = getattr(self.opts, 'bus_id', type(self).__name__)

  

+         self.opts.bus_publish_retries = getattr(self.opts, 'bus_publish_retries', 5)

+ 

          if not log:

              log = logging

              logging.basicConfig(level=logging.DEBUG)
@@ -172,10 +174,16 @@

          """

          try:

              message.validate()

-             self._send_message(message)

-         # pylint: disable=W0703

-         except Exception:

-             self.log.exception("Failed to publish message.")

+         except Exception: # pylint: disable=W0703

+             self.log.exception("Failed to validate a message")

+             return

+ 

+         for attempt in range(1, self.opts.bus_publish_retries + 1):

+             try:

+                 self._send_message(message)

+             except Exception: # pylint: disable=W0703

+                 # We don't want to halt the worker because of messaging.

+                 self.log.exception("Attempt %s to publish a message failed", attempt)

  

      def announce_job(self, msg_type, job, who, ip, pid):

          """
@@ -315,5 +323,5 @@

          os.environ['FEDORA_MESSAGING_CONF'] = opts.toml_config

  

      def _send_message(self, message):

-         from fedora_messaging import api

-         api.publish(message)

+         from fedora_messaging import api as fm_api, exceptions as fm_ex

+         fm_api.publish(message)

@@ -3,8 +3,8 @@

  """

  

  bus_type = 'fedora-messaging'

- 

  bus_id = 'fm'

+ bus_publish_retries = 5

  

  toml_config = '/etc/fedora-messaging/fedora.toml'

  

@@ -2,9 +2,9 @@

  Example configuration file for stomp message bus.

  """

  

- bus_id = 'ci_message_bus'

- 

  bus_type = 'stomp'

+ bus_id = 'ci_message_bus'

+ bus_publish_retries = 5

  

  # we use python-stomppy, see its documentation for more info

  hosts = [

With fedora-messaging-1.7.1-1.fc30.noarch we had an issue where
api.publish() never returned, so I updated the package to the latest
Fedora version, 2.0.0-1.fc30.noarch, and the problem disappeared.

However, as it turned out, the logs then started to contain tracebacks like:

Traceback (most recent call last):
  File "/usr/lib/python3.7/site-packages/fedora_messaging/api.py", line 313, in publish
    eventual_result.wait(timeout=timeout)
  File "/usr/lib/python3.7/site-packages/crochet/_eventloop.py", line 239, in wait
    result = self._result(timeout)
  File "/usr/lib/python3.7/site-packages/crochet/_eventloop.py", line 201, in _result
    raise TimeoutError()
crochet._eventloop.TimeoutError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/share/copr/backend/msgbus.py", line 175, in send_message
    self._send_message(message)
  File "/usr/share/copr/backend/msgbus.py", line 319, in _send_message
    api.publish(message)
  File "/usr/lib/python3.7/site-packages/fedora_messaging/api.py", line 319, in publish
    raise wrapper
fedora_messaging.exceptions.PublishTimeout: None

This means that some of the messages were lost because of the documented
PublishTimeout exception. Per the fedora-messaging documentation, it is up
to the caller to decide what to do about this, so let's retry for now, by
default 5 times.

rebased onto 794d01639be3c2ae9f5055ba32869f95555db900

4 years ago

rebased onto d8dc0828ac7748a66cc180ac886c7885d412e055

4 years ago

rebased onto 5d05fd207124fbab820f55382fb0050a054ce76e

4 years ago

rebased onto 5953194

4 years ago

Pull-Request has been merged by msuchy

4 years ago