From 678f79700d3a19a857c60d63b3b6e0f6f4d555ce Mon Sep 17 00:00:00 2001 From: Pavel Raiskup Date: Oct 01 2019 16:15:56 +0000 Subject: backend: more aggressive posting to Frontend Previously we attempted to contact Frontend 10 times with sleep(5) between attempts, and failed (in total 50s). This however created inconsistencies in Frontend database if Frontend was unavailable longer than 50s (e.g. because some build finished in Worker, but the process failed to notify frontend about this fact => then build stayed in 'running' state indefinitely until backend restarted the job). The new approach prolongs the "attempt" window from 50s to 120s, but also allows use to call 'FrontendClient.set_independent()' flag. If this is enabled (in background Worker processes), the process will keep re-trying to contact frontend till it (a) _is_ possible to send the data correctly or (b) there's some client error. Any other server/request error is considered to be a temporary issue. Fixes: #975 Merges: #1021 --- diff --git a/backend/backend/actions.py b/backend/backend/actions.py index 05d82d0..0505175 100644 --- a/backend/backend/actions.py +++ b/backend/backend/actions.py @@ -586,7 +586,7 @@ class ActionWorkerManager(WorkerManager): try: self.frontend_client.update({"actions": [result]}) - except RequestException: + except FrontendClientException: self.log.exception("can't post to frontend, retrying indefinitely") return False return True diff --git a/backend/backend/daemons/backend.py b/backend/backend/daemons/backend.py index afbc114..5bb1bba 100644 --- a/backend/backend/daemons/backend.py +++ b/backend/backend/daemons/backend.py @@ -6,10 +6,9 @@ import sys import lockfile from daemon import DaemonContext -from requests import RequestException from backend.frontend import FrontendClient -from ..exceptions import CoprBackendError +from ..exceptions import CoprBackendError, FrontendClientException from ..helpers import BackendConfigReader, get_redis_logger @@ -50,8 +49,8 @@ class CoprBackend(object): try: self.log.info("Rescheduling old unfinished builds") - self.frontend_client.reschedule_all_running(120) # 10 minutes - except RequestException as err: + self.frontend_client.reschedule_all_running() + except FrontendClientException as err: self.log.exception(err) raise CoprBackendError(err) diff --git a/backend/backend/daemons/build_dispatcher.py b/backend/backend/daemons/build_dispatcher.py index ae0133f..740dd61 100644 --- a/backend/backend/daemons/build_dispatcher.py +++ b/backend/backend/daemons/build_dispatcher.py @@ -11,7 +11,8 @@ from requests import get, RequestException from backend.frontend import FrontendClient from ..helpers import get_redis_logger -from ..exceptions import DispatchBuildError, NoVmAvailable +from ..exceptions import (DispatchBuildError, NoVmAvailable, + FrontendClientException) from ..job import BuildJob from ..vm_manage.manager import VmManager from ..constants import BuildStatus @@ -107,7 +108,7 @@ class BuildDispatcher(multiprocessing.Process): job.started_on = time.time() job.status = BuildStatus.STARTING can_build_start = self.frontend_client.starting_build(job.to_dict()) - except (RequestException, ValueError) as error: + except (FrontendClientException, ValueError) as error: self.log.exception("Communication with Frontend to confirm build start failed with error: %s", error) return False diff --git a/backend/backend/daemons/worker.py b/backend/backend/daemons/worker.py index 80a0b17..cd42a98 100644 --- a/backend/backend/daemons/worker.py +++ b/backend/backend/daemons/worker.py @@ -358,6 +358,7 @@ class Worker(multiprocessing.Process): def run(self): self.log.info("Starting worker") self.init_buses() + self.frontend_client.try_indefinitely = True try: self.do_job(self.job) diff --git a/backend/backend/exceptions.py b/backend/backend/exceptions.py index 4972e09..3316ab4 100644 --- a/backend/backend/exceptions.py +++ b/backend/backend/exceptions.py @@ -138,3 +138,7 @@ class CreateRepoError(CmdError): class DispatchBuildError(CoprBackendError): pass + + +class FrontendClientException(Exception): + pass diff --git a/backend/backend/frontend.py b/backend/backend/frontend.py index 8f265e0..4c6f61c 100644 --- a/backend/backend/frontend.py +++ b/backend/backend/frontend.py @@ -3,13 +3,25 @@ import time import logging from requests import post, get, RequestException -RETRY_TIMEOUT = 5 +from backend.exceptions import FrontendClientException + +# prolong the sleep time before asking frontend again +SLEEP_INCREMENT_TIME = 5 +# reasonable timeout for requests that block backend daemon +BACKEND_TIMEOUT = 2*60 + +class FrontendClientRetryError(Exception): + pass + class FrontendClient(object): """ Object to send data back to fronted """ + # do we block the main daemon process? + try_indefinitely = False + def __init__(self, opts, logger=None): super(FrontendClient, self).__init__() self.frontend_url = "{}/backend".format(opts.frontend_base_url) @@ -26,27 +38,34 @@ class FrontendClient(object): self.logger.addHandler(logging.NullHandler()) return self.logger - - def _post_to_frontend(self, data, url_path): - """ - Make a request to the frontend - """ - + def _frontend_request(self, url_path, data=None, authenticate=True): headers = {"content-type": "application/json"} url = "{}/{}/".format(self.frontend_url, url_path) - auth = ("user", self.frontend_auth) - - self.msg = None + auth = ("user", self.frontend_auth) if authenticate else None try: - response = post(url, data=json.dumps(data), auth=auth, headers=headers) - if response.status_code >= 400: - self.msg = "Failed to submit to frontend: {0}: {1}".format( - response.status_code, response.text) - raise RequestException(self.msg) - except RequestException as e: - self.msg = "Post request failed: {0}".format(e) - raise + # TODO: no data => use get() + response = post(url, data=json.dumps(data), auth=auth, + headers=headers) + except RequestException as ex: + raise FrontendClientRetryError( + "Requests error on {}: {}".format(url, str(ex))) + + if response.status_code >= 500: + # Server error. Hopefully this is only temporary problem, we wan't + # to re-try, and wait till the server works again. + raise FrontendClientRetryError( + "Request server error on {}: {} {}".format( + url, response.status_code, response.reason)) + + if response.status_code >= 400: + # Client error. The mistake is on our side, it doesn't make sense + # to continue with retries. + raise FrontendClientException( + "Request client error on {}: {} {}".format( + url, response.status_code, response.reason)) + + # TODO: Success, but tighten the redirects etc. return response def get_reliably(self, url_path): @@ -70,18 +89,29 @@ class FrontendClient(object): return response - def _post_to_frontend_repeatedly(self, data, url_path, max_repeats=10): + def _post_to_frontend_repeatedly(self, data, url_path): """ - Make a request max_repeats-time to the frontend + Repeat the request until it succeeds, or timeout is reached. """ - for i in range(max_repeats): + sleep = SLEEP_INCREMENT_TIME + start = time.time() + stop = start + BACKEND_TIMEOUT + + i = 0 + while True: + i += 1 + if not self.try_indefinitely and time.time() > stop: + raise FrontendClientException( + "Attempt to talk to frontend timeouted " + "(we gave it {} attempts)".format(i)) + try: - return self._post_to_frontend(data, url_path) - except RequestException: - self.log.warning("failed to post data to frontend, attempt #{0}".format(i)) - time.sleep(5) - else: - raise RequestException("Failed to post to frontend for {} times".format(max_repeats)) + return self._frontend_request(url_path, data=data) + except FrontendClientRetryError as ex: + self.log.warning("Retry request #%s on %s: %s", i, url_path, + str(ex)) + time.sleep(sleep) + sleep += SLEEP_INCREMENT_TIME def update(self, data): """ @@ -97,7 +127,7 @@ class FrontendClient(object): """ response = self._post_to_frontend_repeatedly(data, "starting_build") if "can_start" not in response.json(): - raise RequestException("Bad respond from the frontend") + raise FrontendClientException("Bad response from the frontend") return response.json()["can_start"] def reschedule_build(self, build_id, task_id, chroot_name): @@ -107,7 +137,7 @@ class FrontendClient(object): data = {"build_id": build_id, "task_id": task_id, "chroot": chroot_name} self._post_to_frontend_repeatedly(data, "reschedule_build_chroot") - def reschedule_all_running(self, attempts): - response = self._post_to_frontend_repeatedly({}, "reschedule_all_running", attempts) + def reschedule_all_running(self): + response = self._post_to_frontend_repeatedly({}, "reschedule_all_running") if response.status_code != 200: - raise RequestException("Failed to reschedule all running jobs") + raise FrontendClientException("Failed to reschedule builds") diff --git a/backend/tests/test_frontend.py b/backend/tests/test_frontend.py index 37885de..3056667 100644 --- a/backend/tests/test_frontend.py +++ b/backend/tests/test_frontend.py @@ -3,9 +3,11 @@ import multiprocessing from munch import Munch -from requests import RequestException +from requests import RequestException, Response -from backend.frontend import FrontendClient +from backend.frontend import (FrontendClient, FrontendClientRetryError, + SLEEP_INCREMENT_TIME) +from backend.exceptions import FrontendClientException from unittest import mock from unittest.mock import MagicMock @@ -43,55 +45,93 @@ class TestFrontendClient(object): self.chroot_name = "fedora-20-x86_64" @pytest.fixture - def mask_post_to_fe(self): - self.ptf = MagicMock() - self.fc._post_to_frontend = self.ptf + def mask_frontend_request(self): + self.f_r = MagicMock() + self.fc._frontend_request = self.f_r def test_post_to_frontend(self, post_req): post_req.return_value.status_code = 200 - self.fc._post_to_frontend(self.data, self.url_path) + self.fc._frontend_request(self.url_path, self.data) assert post_req.called def test_post_to_frontend_not_200(self, post_req): post_req.return_value.status_code = 501 - with pytest.raises(RequestException): - self.fc._post_to_frontend(self.data, self.url_path) + with pytest.raises(FrontendClientRetryError): + self.fc._frontend_request(self.url_path, self.data) assert post_req.called def test_post_to_frontend_post_error(self, post_req): post_req.side_effect = RequestException() - with pytest.raises(RequestException): - self.fc._post_to_frontend(self.data, self.url_path) + with pytest.raises(FrontendClientRetryError): + self.fc._frontend_request(self.url_path, self.data) assert post_req.called - def test_post_to_frontend_repeated_first_try_ok(self, mask_post_to_fe, mc_time): + def test_post_to_frontend_repeated_first_try_ok(self, mask_frontend_request, mc_time): response = "ok\n" - self.ptf.return_value = response + self.f_r.return_value = response + mc_time.time.return_value = 0 assert self.fc._post_to_frontend_repeatedly(self.data, self.url_path) == response assert not mc_time.sleep.called - def test_post_to_frontend_repeated_second_try_ok(self, mask_post_to_fe, mc_time): + def test_post_to_frontend_repeated_second_try_ok(self, mask_frontend_request, mc_time): response = "ok\n" - self.ptf.side_effect = [ - RequestException(), + self.f_r.side_effect = [ + FrontendClientRetryError(), response, ] - + mc_time.time.return_value = 0 assert self.fc._post_to_frontend_repeatedly(self.data, self.url_path) == response assert mc_time.sleep.called - def test_post_to_frontend_repeated_all_attempts_failed(self, mask_post_to_fe, mc_time): - self.ptf.side_effect = RequestException() + def test_post_to_frontend_err_400(self, post_req, mc_time): + response = Response() + response.status_code = 404 + response.reason = 'NOT FOUND' - with pytest.raises(RequestException): - self.fc._post_to_frontend_repeatedly(self.data, self.url_path) + post_req.side_effect = [ + FrontendClientRetryError(), + response, + ] + mc_time.time.return_value = 0 + with pytest.raises(FrontendClientException): + assert self.fc._post_to_frontend_repeatedly(self.data, self.url_path) == response assert mc_time.sleep.called + @mock.patch('backend.frontend.BACKEND_TIMEOUT', 100) + def test_post_to_frontend_repeated_all_attempts_failed(self, + mask_frontend_request, caplog, mc_time): + mc_time.time.side_effect = [0, 0, 5, 5+10, 5+10+15, 5+10+15+20, 1000] + self.f_r.side_effect = FrontendClientRetryError() + with pytest.raises(FrontendClientException): + self.fc._post_to_frontend_repeatedly(self.data, self.url_path) + assert mc_time.sleep.call_args_list == [mock.call(x) for x in [5, 10, 15, 20, 25]] + assert len(caplog.records) == 5 + + def test_post_to_frontend_repeated_indefinitely(self, + mask_frontend_request, caplog, mc_time): + mc_time.time.return_value = 1 + self.fc.try_indefinitely = True + self.f_r.side_effect = [FrontendClientRetryError() for _ in range(100)] \ + + [FrontendClientException()] # e.g. 501 eventually + with pytest.raises(FrontendClientException): + self.fc._post_to_frontend_repeatedly(self.data, self.url_path) + assert mc_time.sleep.called + assert len(caplog.records) == 100 + + def test_reschedule_300(self, mask_frontend_request, post_req): + response = Response() + response.status_code = 302 + response.reason = 'whatever' + post_req.side_effect = response + with pytest.raises(FrontendClientException) as ex: + self.fc.reschedule_all_running() + assert 'Failed to reschedule builds' in str(ex) + def test_update(self): ptfr = MagicMock() self.fc._post_to_frontend_repeatedly = ptfr @@ -110,7 +150,7 @@ class TestFrontendClient(object): ptfr = MagicMock() self.fc._post_to_frontend_repeatedly = ptfr - with pytest.raises(RequestException): + with pytest.raises(FrontendClientException): self.fc.starting_build(self.data) def test_starting_build_err_2(self): @@ -118,7 +158,7 @@ class TestFrontendClient(object): self.fc._post_to_frontend_repeatedly = ptfr ptfr.return_value.json.return_value = {} - with pytest.raises(RequestException): + with pytest.raises(FrontendClientException): self.fc.starting_build(self.data) def test_reschedule_build(self):