#193 [backend][frontend] queue redo
Merged 6 years ago by clime. Opened 6 years ago by clime.
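
In short: the frontend's single /backend/waiting/ endpoint (one action plus one build per poll) is split into /backend/waiting-action/ and /backend/waiting-jobs/, the priority columns on build and build_chroot are dropped in favour of ordering by is_background and id, the defer_build mechanism goes away on both ends, and the backend dispatcher now pulls the whole job queue every cycle, reattaching to a running VM or acquiring a new one per job. VM bookkeeping switches from used_by_pid to used_by_worker, the dead-builder reaper is removed, and the default backend sleeptime drops from 10 to 5 seconds.

Below is a minimal sketch of how the dispatcher's new load_jobs() loop consumes the jobs endpoint; the base URL, credentials and sleep time are placeholders, and error handling is reduced to the essentials:

    import time
    import requests

    FRONTEND_BASE_URL = "https://copr-fe.example.com"  # placeholder
    FRONTEND_AUTH = ("user", "backend-password")       # placeholder

    def poll_waiting_jobs(sleeptime=5):
        """Ask the frontend for the ordered job queue until it is non-empty."""
        while True:
            try:
                # /backend/waiting-jobs/ returns the whole queue as a JSON list of task dicts
                r = requests.get("{0}/backend/waiting-jobs/".format(FRONTEND_BASE_URL),
                                 auth=FRONTEND_AUTH)
                tasks = r.json()
            except (requests.RequestException, ValueError):
                tasks = None
            if tasks:
                return tasks
            time.sleep(sleeptime)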

@@ -49,9 +49,9 @@ 

              self.update_process_title("Waiting for an action task from frontend for {}s"

                                        .format(int(time.time() - get_action_init_time)))

              try:

-                 r = get("{0}/backend/waiting/".format(self.opts.frontend_base_url),

+                 r = get("{0}/backend/waiting-action/".format(self.opts.frontend_base_url),

                          auth=("user", self.opts.frontend_auth))

-                 action_task = r.json().get("action")

+                 action_task = r.json()

              except (RequestException, ValueError) as error:

                  self.log.exception("Retrieving an action task from {} failed with error: {}"

                                     .format(self.opts.frontend_base_url, error))

@@ -20,13 +20,14 @@ 

  from ..vm_manage.manager import VmManager

  from .worker import Worker

  

+ from collections import defaultdict

  

  class BuildDispatcher(multiprocessing.Process):

      """

      1) Fetch build task from frontend

-     2) Get a free VM for it

+     2) Get an available VM for it

      3) Create a worker for the job

-     4) Start it asynchronously and go to 1)

+     4) Start worker asynchronously and go to 1)

      """

  

      def __init__(self, opts):
@@ -39,16 +40,18 @@ 

          self.workers = []

          self.next_worker_id = 1

  

-         # Maps e.g. x86_64 && i386 => PC

-         self.arch_to_group = dict()

+         self.arch_to_groups = defaultdict(list)

          # PC => max N builders per user

          self.group_to_usermax = dict()

  

          self.init_internal_structures()

  

-     def get_vm_group_id(self, arch):

+     def get_vm_group_ids(self, arch):

+         if not arch:

+             return [group["id"] for group in self.opts.build_groups]

+ 

          try:

-             return self.arch_to_group[arch]

+             return self.arch_to_groups[arch]

          except KeyError:

              raise DispatchBuildError("Unknown architecture {0}".format(arch))
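
To illustrate the change above: get_vm_group_id() used to return a single group per architecture, while get_vm_group_ids() can now return several, because init_internal_structures() (in the following hunk) fills a defaultdict(list) instead of a plain dict. A standalone sketch with made-up group ids and architectures, following only the dict shape from this diff:

    from collections import defaultdict

    # illustrative build_groups config, in the shape the dispatcher reads from self.opts
    build_groups = [
        {"id": 0, "archs": ["i386", "x86_64"], "max_vm_per_user": 3},
        {"id": 1, "archs": ["x86_64", "ppc64le"], "max_vm_per_user": 2},
    ]

    arch_to_groups = defaultdict(list)
    for group in build_groups:
        for arch in group["archs"]:
            arch_to_groups[arch].append(group["id"])

    print(arch_to_groups["x86_64"])  # [0, 1] -- one arch can now map to several groups
    print(arch_to_groups["i386"])    # [0]
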

  
@@ -59,42 +62,40 @@ 

          setproctitle(proc_title)

  

      def init_internal_structures(self):

-            self.arch_to_group = dict()

-            self.group_to_usermax = dict()

-            for group in self.opts.build_groups:

-                group_id = group["id"]

-                for arch in group["archs"]:

-                    self.arch_to_group[arch] = group_id

-                    self.log.debug("mapping {0} to {1} group".format(arch, group_id))

- 

-                self.log.debug("user might use only {0}VMs for {1} group".format(group["max_vm_per_user"], group_id))

-                self.group_to_usermax[group_id] = group["max_vm_per_user"]

- 

-     def load_job(self):

+         for group in self.opts.build_groups:

+             group_id = group["id"]

+ 

+             for arch in group["archs"]:

+                 self.arch_to_groups[arch].append(group_id)

+                 self.log.debug("mapping {0} to {1} group".format(arch, group_id))

+ 

+             self.log.debug("user might use only {0} VMs for {1} group".format(group["max_vm_per_user"], group_id))

+             self.group_to_usermax[group_id] = group["max_vm_per_user"]

+ 

+     def load_jobs(self):

          """

          Retrieve a single build job from frontend.

          """

          self.log.info("Waiting for a job from frontend...")

          get_task_init_time = time.time()

+         tasks = None

  

-         task = None

-         while not task:

-             self.update_process_title("Waiting for a job from frontend for {} s"

+         while not tasks:

+             self.update_process_title("Waiting for jobs from frontend for {} s"

                                        .format(int(time.time() - get_task_init_time)))

              try:

-                 r = get("{0}/backend/waiting/".format(self.opts.frontend_base_url),

-                         auth=("user", self.opts.frontend_auth))

-                 task = r.json().get("build")

+                 tasks = get("{0}/backend/waiting-jobs/".format(self.opts.frontend_base_url),

+                            auth=("user", self.opts.frontend_auth)).json()

+ 

              except (RequestException, ValueError) as error:

-                 self.log.exception("Retrieving build job from {} failed with error: {}"

+                 self.log.exception("Retrieving build jobs from {} failed with error: {}"

                                     .format(self.opts.frontend_base_url, error))

              finally:

-                 if not task:

+                 if not tasks:

                      time.sleep(self.opts.sleeptime)

  

-         self.log.info("Got new build job {}".format(task.get("task_id")))

- 

-         return BuildJob(task, self.opts)

+         self.log.info("Got new build jobs: {}".format([task.get("task_id") for task in tasks]))

+         return [BuildJob(task, self.opts) for task in tasks]

  

      def can_build_start(self, job):

          """
@@ -125,6 +126,9 @@ 

                  self.log.info("Removed finished worker {} for job {}"

                                .format(worker.worker_id, worker.job.task_id))

  

+     def get_worker_ids(self):

+         return [worker.worker_id for worker in self.workers]

+ 

      def start_worker(self, vm, job, reattach=False):

          worker = Worker(

              opts=self.opts,
@@ -149,38 +153,37 @@ 

          while True:

              self.clean_finished_workers()

  

-             job = self.load_job()

- 

-             # now search db builder records for the job and

-             # if we found it, just spawn a worker to reattach

-             vm = self.vm_manager.get_vm_by_task_id(job.task_id)

-             if vm and vm.state == 'in_use':

-                 worker = self.start_worker(vm, job, reattach=True)

-                 worker.mark_started(job)

-                 vm.store_field(self.vm_manager.rc, "used_by_pid", os.getpid())

-                 self.log.info("Reattached new worker {} for job {}"

+             for job in self.load_jobs():

+                 # search db builder records for the job and

+                 # if we found it, spawn a worker to reattach

+                 vm = self.vm_manager.get_vm_by_task_id(job.task_id)

+                 if vm and vm.state == 'in_use':

+                     self.log.info("Reattaching to VM: "+str(vm))

+                     worker = self.start_worker(vm, job, reattach=True)

+                     worker.mark_started(job)

+                     vm.store_field(self.vm_manager.rc, "used_by_worker", worker.worker_id)

+                     self.log.info("Reattached new worker {} for job {}"

+                                   .format(worker.worker_id, worker.job.task_id))

+                     continue

+ 

+                 # ... and if the task is new to us,

+                 # allocate new vm and run full build

+                 try:

+                     vm_group_ids = self.get_vm_group_ids(job.arch)

+                     self.log.info("Picking VM from groups {} for job {}".format(vm_group_ids, job))

+                     vm = self.vm_manager.acquire_vm(vm_group_ids, job.project_owner, self.next_worker_id,

+                                                     job.task_id, job.build_id, job.chroot)

+                 except NoVmAvailable as error:

+                     self.log.info("No available resources for task {} (Reason: {}). Deferring job."

+                                   .format(job.task_id, error))

+                     continue

+                 else:

+                     self.log.info("VM {} for job {} successfully acquired".format(vm.vm_name, job.task_id))

+ 

+                 if not self.can_build_start(job):

+                     self.vm_manager.release_vm(vm.vm_name)

+                     continue

+ 

+                 worker = self.start_worker(vm, job)

+                 self.log.info("Started new worker {} for job {}"

                                .format(worker.worker_id, worker.job.task_id))

-                 continue

- 

-             # ... and if the task is new to us,

-             # allocate new vm and run full build

-             try:

-                 self.log.info("Acquiring VM for job {}...".format(str(job)))

-                 vm_group_id = None if not job.arch else self.get_vm_group_id(job.arch)

-                 vm = self.vm_manager.acquire_vm(vm_group_id, job.project_owner, os.getpid(),

-                                                 job.task_id, job.build_id, job.chroot)

-             except NoVmAvailable as error:

-                 self.log.info("No available resources for task {} (Reason: {}). Deferring job."

-                               .format(job.task_id, error))

-                 self.frontend_client.defer_build(job.build_id, job.chroot)

-                 continue

-             else:

-                 self.log.info("VM {} for job {} successfully acquired".format(vm.vm_name, job.task_id))

- 

-             if not self.can_build_start(job):

-                 self.vm_manager.release_vm(vm.vm_name)

-                 continue

- 

-             worker = self.start_worker(vm, job)

-             self.log.info("Started new worker {} for job {}"

-                           .format(worker.worker_id, worker.job.task_id))

@@ -54,69 +54,6 @@ 

                                .format(vmd.vm_name, not_re_acquired_in))

                  self.vmm.start_vm_termination(vmd.vm_name, allowed_pre_state=VmStates.READY)

  

- 

-     def check_one_vm_for_dead_builder(self, vmd):

-         # TODO: builder should renew lease periodically

-         # and we should use that time instead of in_use_since and pid checks

-         in_use_since = vmd.get_field(self.vmm.rc, "in_use_since")

-         pid = vmd.get_field(self.vmm.rc, "used_by_pid")

- 

-         if not in_use_since or not pid:

-             return

-         in_use_time_elapsed = time.time() - float(in_use_since)

- 

-         # give a minute for worker to set correct title

-         if in_use_time_elapsed < 60 and str(pid) == "None":

-             return

- 

-         pid = int(pid)

-         # try:

-         #     # here we can catch race condition: worker acquired VM but haven't set process title yet

-         #     if psutil.pid_exists(pid) and vmd.vm_name in psutil.Process(pid).cmdline[0]:

-         #         return

-         #

-         #     self.log.info("Process `{}` not exists anymore, doing second try. VM data: {}"

-         #                   .format(pid, vmd))

-         #     # dirty hack: sleep and check again

-         #     time.sleep(5)

-         #     if psutil.pid_exists(pid) and vmd.vm_name in psutil.Process(pid).cmdline[0]:

-         #         return

-         # except Exception:

-         #     self.log.exception("Failed do determine if process `{}` still alive for VM: {}, assuming alive"

-         #                        .format(pid, vmd))

-         #     return

- 

-         # psutil changed Process().cmdline from property to function between f20 and f22

-         # disabling more precise check for now

-         try:

-             # here we can catch race condition: worker acquired VM but haven't set process title yet

-             if psutil.pid_exists(pid):

-                 return

- 

-             self.log.info("Process `{}` not exists anymore, doing second try. VM data: {}"

-                           .format(pid, vmd))

-             # dirty hack: sleep and check again

-             time.sleep(5)

-             if psutil.pid_exists(pid):

-                 return

- 

-         except Exception:

-             self.log.exception("Failed do determine if process `{}` still alive for VM: {}, assuming alive"

-                                .format(pid, vmd))

-             return

- 

-         self.log.info("Process `{}` not exists anymore, terminating VM: {} ".format(pid, vmd.vm_name))

-         self.vmm.start_vm_termination(vmd.vm_name, allowed_pre_state=VmStates.IN_USE)

-         # TODO: build rescheduling ?

- 

-     def remove_vm_with_dead_builder(self):

-         # TODO: rewrite build manage at backend and move functionality there

-         # VMM shouldn't do this

- 

-         # check that process who acquired VMD still exists, otherwise release VM

-         for vmd in self.vmm.get_vm_by_group_and_state_list(None, [VmStates.IN_USE]):

-             self.check_one_vm_for_dead_builder(vmd)

- 

      def check_vms_health(self):

          # for machines in state ready and time.time() - vm.last_health_check > threshold_health_check_period

          states_to_check = [VmStates.CHECK_HEALTH_FAILED, VmStates.READY,
@@ -230,9 +167,6 @@ 

          self.check_vms_health()

          self.start_spawn_if_required()

  

-         # disable this now for detached builds

-         # self.remove_vm_with_dead_builder()

- 

          self.finalize_long_health_checks()

          self.terminate_again()

  

@@ -218,17 +218,15 @@ 

  

                  except SSHConnectionError as err:

                      self.log.exception(

-                         "SSH connection staled: {0}".format(str(err)))

+                         "SSH connection stalled: {0}".format(str(err)))

                      # The VM is unusable, don't wait for relatively slow

                      # garbage collector.

-                     self.vm_manager.start_vm_termination(

-                             self.vm.vm_name,

-                             allowed_pre_state=VmStates.IN_USE)

+                     self.vm_manager.start_vm_termination(self.vm.vm_name)

                      self.frontend_client.reschedule_build(

                              job.build_id, job.chroot)

                      raise VmError("SSH connection issue, build rescheduled")

  

-                 except: # programmer's failures

+                 except: # programmer's failure

                      self.log.exception("Unexpected error")

                      failed = True

  
@@ -272,8 +270,7 @@ 

              "done".format(pipes.quote(job.results_dir))

          )

          result = run_cmd(cmd, shell=True)

-         built_packages = result.stdout.split('\n')[0].strip()

- 

+         built_packages = result.stdout.strip()

          self.log.info("Built packages:\n{}".format(built_packages))

          return built_packages

  

@@ -70,23 +70,6 @@ 

              raise RequestException("Bad respond from the frontend")

          return response.json()["can_start"]

  

-     def defer_build(self, build_id, chroot_name):

-         """

-         Tell the frontend that the build task should be deferred

-         (put aside for some time until there are resources for it).

-         """

-         was_deferred = False

-         data = {"build_id": build_id, "chroot": chroot_name}

-         try:

-             response = self._post_to_frontend_repeatedly(data, "defer_build")

-             was_deferred = response.json()["was_deferred"]

-         except (RequestException, ValueError, KeyError) as e:

-             self.log.exception("Failed to defer build task (build_id {} and chroot {}) with error {}"

-                                .format(build_id, chroot_name, e))

-         if not was_deferred:

-             self.log.exception("Frontend refused to defer build task: build_id {} and chroot {}"

-                                .format(build_id, chroot_name))

- 

      def reschedule_build(self, build_id, chroot_name):

          """

          Announce to the frontend that a build should be rescheduled (set pending state).

file modified
+1 -1
@@ -260,7 +260,7 @@ 

          opts.fedmsg_enabled = _get_conf(

              cp, "backend", "fedmsg_enabled", False, mode="bool")

          opts.sleeptime = _get_conf(

-             cp, "backend", "sleeptime", 10, mode="int")

+             cp, "backend", "sleeptime", 5, mode="int")

          opts.timeout = _get_conf(

              cp, "builder", "timeout", DEF_BUILD_TIMEOUT, mode="int")

          opts.consecutive_failure_threshold = _get_conf(

file modified
-3
@@ -70,9 +70,6 @@ 

          else:

              self.chroot = 'srpm-builds'

  

-         if not self.task_id:

-             self.task_id = self.build_id

- 

          self.destdir = os.path.normpath(os.path.join(

              worker_opts.destdir,

              task_data["project_owner"],

@@ -5,7 +5,6 @@ 

  from __future__ import division

  from __future__ import absolute_import

  

- from itertools import chain

  import json

  import time

  import weakref
@@ -50,7 +49,7 @@ 

      local server_restart_time = tonumber(redis.call("HGET", KEYS[2], "server_start_timestamp"))

      if last_health_check and server_restart_time and last_health_check > server_restart_time  then

          redis.call("HMSET", KEYS[1], "state", "in_use", "bound_to_user", ARGV[1],

-                    "used_by_pid", ARGV[2], "in_use_since", ARGV[3],

+                    "used_by_worker", ARGV[2], "in_use_since", ARGV[3],

                     "task_id",  ARGV[4], "build_id", ARGV[5], "chroot", ARGV[6])

          return "OK"

      else
@@ -67,11 +66,11 @@ 

      return nil

  else

      redis.call("HMSET", KEYS[1], "state", "ready", "last_release", ARGV[1])

-     redis.call("HDEL", KEYS[1], "in_use_since", "used_by_pid", "task_id", "build_id", "chroot")

+     redis.call("HDEL", KEYS[1], "in_use_since", "used_by_worker", "task_id", "build_id", "chroot")

      redis.call("HINCRBY", KEYS[1], "builds_count", 1)

  

      local check_fails = tonumber(redis.call("HGET", KEYS[1], "check_fails"))

-     if check_fails > 0 then

+     if check_fails and check_fails > 0 then

          redis.call("HSET", KEYS[1], "state", "check_health_failed")

      end

  
@@ -85,10 +84,6 @@ 

  terminate_vm_lua = """

  local old_state = redis.call("HGET", KEYS[1], "state")

  

- if old_state == "in_use" and ARGV[1] ~= "in_use" then

-     return "Termination of VM in in_use state are forbidden"

- end

- 

  if ARGV[1] and ARGV[1] ~= "None" and old_state ~= ARGV[1] then

      return "Old state != `allowed_pre_state`"

  elseif old_state == "terminating" and ARGV[1] ~= "terminating" then
@@ -162,17 +157,16 @@ 

          :type group: int

          :rtype: VmDescriptor

          """

-         # print("\n ADD VM TO POOL")

          if self.rc.sismember(KEY_VM_POOL.format(group=group), vm_name):

              raise VmError("Can't add VM `{}` to the pool, such name already used".format(vm_name))

  

          vmd = VmDescriptor(vm_ip, vm_name, group, VmStates.GOT_IP)

-         # print("VMD: {}".format(vmd))

          pipe = self.rc.pipeline()

          pipe.sadd(KEY_VM_POOL.format(group=group), vm_name)

          pipe.hmset(KEY_VM_INSTANCE.format(vm_name=vm_name), vmd.to_dict())

          pipe.execute()

          self.log.info("registered new VM: {} {}".format(vmd.vm_name, vmd.vm_ip))

+ 

          return vmd

  

      def lookup_vms_by_ip(self, vm_ip):
@@ -199,64 +193,68 @@ 

          """

          vmd_list = self.get_all_vm_in_group(group)

          vm_count_used_by_user = len([

-             vmd for vmd in vmd_list if

-             vmd.bound_to_user == username and vmd.state == VmStates.IN_USE

+             vmd for vmd in vmd_list if vmd.bound_to_user == username

          ])

  

-         if group == None:

-             limit = 0

-             for gid in range(len(self.opts.build_groups)):

-                 limit += self.opts.build_groups[gid]["max_vm_per_user"]

-         else:

-             limit = self.opts.build_groups[group]["max_vm_per_user"]

- 

+         limit = self.opts.build_groups[group].get("max_vm_per_user")

          self.log.debug("# vm by user: {}, limit:{} ".format(

              vm_count_used_by_user, limit

          ))

-         if vm_count_used_by_user >= limit:

-             # TODO: this check isn't reliable, if two (or more) processes check VM list

-             # at the +- same time, they could acquire more VMs

-             #  proper solution: do this check inside lua script

+ 

+         if limit and vm_count_used_by_user >= limit:

              self.log.debug("No VM are available, user `{}` already acquired #{} VMs"

                             .format(username, vm_count_used_by_user))

              return False

          else:

              return True

  

-     def acquire_vm(self, group, username, pid, task_id=None, build_id=None, chroot=None):

+     def get_ready_vms(self, group):

+         vmd_list = self.get_all_vm_in_group(group)

+         return [vmd for vmd in vmd_list if vmd.state == VmStates.READY]

+ 

+     def get_dirty_vms(self, group):

+         vmd_list = self.get_all_vm_in_group(group)

+         return [vmd for vmd in vmd_list if vmd.bound_to_user is not None]

+ 

+     def acquire_vm(self, groups, ownername, pid, task_id=None, build_id=None, chroot=None):

          """

-         Try to acquire VM from pool

+         Try to acquire VM from pool.

  

-         :param group: builder group id, as defined in config

-         :type group: int

-         :param username: build owner username, VMM prefer to reuse an existing VM which was used by the same user

-         :param pid: builder pid to release VM after build process unhandled death

+         :param list groups: builder group ids where the build can be launched as defined in config

+         :param ownername: job owner, prefer to reuse an existing VM which was used by that same user before

+         :param pid: worker process id

  

          :rtype: VmDescriptor

-         :raises: NoVmAvailable  when manager couldn't find suitable VM for the given group and user

+         :raises: NoVmAvailable when manager couldn't find suitable VM for the given groups and owner

          """

-         vmd_list = self.get_all_vm_in_group(group)

-         if not self.can_user_acquire_more_vm(username, group):

-             raise NoVmAvailable("No VM are available, user `{}` already acquired too much VMs"

-                                 .format(username))

- 

-         ready_vmd_list = [vmd for vmd in vmd_list if vmd.state == VmStates.READY]

-         # trying to find VM used by this user

-         dirtied_by_user = [vmd for vmd in ready_vmd_list if vmd.bound_to_user == username]

-         clean_list = [vmd for vmd in ready_vmd_list if vmd.bound_to_user is None]

-         all_vms = list(chain(dirtied_by_user, clean_list))

- 

-         for vmd in all_vms:

-             if vmd.get_field(self.rc, "check_fails") != "0":

-                 self.log.debug("VM {} has check fails, skip acquire".format(vmd.vm_name))

-             vm_key = KEY_VM_INSTANCE.format(vm_name=vmd.vm_name)

-             if self.lua_scripts["acquire_vm"](keys=[vm_key, KEY_SERVER_INFO],

-                                               args=[username, pid, time.time(),

-                                                     task_id, build_id, chroot]) == "OK":

-                 self.log.info("Acquired VM :{} {} for pid: {}".format(vmd.vm_name, vmd.vm_ip, pid))

-                 return vmd

-         else:

-             raise NoVmAvailable("No VM are available, please wait in queue. Group: {}".format(group))

+         for group in groups:

+             ready_vmd_list = self.get_ready_vms(group)

+             # trying to find VM used by this user

+             dirtied_by_user = [vmd for vmd in ready_vmd_list if vmd.bound_to_user == ownername]

+ 

+             user_can_acquire_more_vm = self.can_user_acquire_more_vm(ownername, group)

+             if not dirtied_by_user and not user_can_acquire_more_vm:

+                 self.log.debug("User {} already acquired too many VMs in group {}"

+                                .format(ownername, group))

+                 continue

+ 

+             available_vms = dirtied_by_user

+             if user_can_acquire_more_vm:

+                 clean_list = [vmd for vmd in ready_vmd_list if vmd.bound_to_user is None]

+                 available_vms += clean_list

+ 

+             for vmd in available_vms:

+                 if vmd.get_field(self.rc, "check_fails") != "0":

+                     self.log.debug("VM {} has check fails, skip acquire".format(vmd.vm_name))

+                 vm_key = KEY_VM_INSTANCE.format(vm_name=vmd.vm_name)

+                 if self.lua_scripts["acquire_vm"](keys=[vm_key, KEY_SERVER_INFO],

+                                                   args=[ownername, pid, time.time(),

+                                                         task_id, build_id, chroot]) == "OK":

+                     self.log.info("Acquired VM :{} {} for pid: {}".format(vmd.vm_name, vmd.vm_ip, pid))

+                     return vmd

+ 

+         raise NoVmAvailable("No VMs are currently available for task {} of user {}"

+                             .format(task_id, ownername))
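
In isolation, the candidate ordering of the reworked acquire_vm() above: groups are walked in the order the dispatcher passes them, READY VMs already dirtied by the owner are tried before clean ones, and only when every group is exhausted does NoVmAvailable propagate. A self-contained sketch of just that ordering (plain dicts stand in for VmDescriptor; the per-user limit and the lua acquire step are left out):

    def pick_vm(groups, ownername, ready_vms_by_group):
        """Mimic the candidate ordering of acquire_vm() on plain dicts."""
        for group in groups:
            ready = ready_vms_by_group.get(group, [])
            dirtied_by_user = [vm for vm in ready if vm["bound_to_user"] == ownername]
            clean = [vm for vm in ready if vm["bound_to_user"] is None]
            for vm in dirtied_by_user + clean:
                # the real code additionally runs the acquire_vm lua script here and only
                # returns the first VM it manages to flip to in_use
                return vm
        return None

    vms = {0: [{"name": "vm_main", "bound_to_user": None},
               {"name": "vm_alt", "bound_to_user": "bob"}]}
    print(pick_vm([0, 1], "bob", vms)["name"])  # vm_alt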

  

      def release_vm(self, vm_name):

          """
@@ -268,7 +266,7 @@ 

          self.log.info("Releasing VM {}".format(vm_name))

          vm_key = KEY_VM_INSTANCE.format(vm_name=vm_name)

          lua_result = self.lua_scripts["release_vm"](keys=[vm_key], args=[time.time()])

-         self.log.debug("release vm result `{}`".format(lua_result))

+         self.log.info("Release vm result `{}`".format(lua_result))

          return lua_result == "OK"

  

      def start_vm_termination(self, vm_name, allowed_pre_state=None):
@@ -289,11 +287,8 @@ 

              }

              self.rc.publish(PUBSUB_MB, json.dumps(msg))

              self.log.info("VM {} queued for termination".format(vmd.vm_name))

- 

-             # TODO: remove when refactored build management

-             # self.rc.publish(PUBSUB_INTERRUPT_BUILDER.format(vmd.vm_ip), "vm died")

          else:

-             self.log.debug("VM  termination `{}` skipped due to: {} ".format(vm_name, lua_result))

+             self.log.info("VM termination `{}` skipped due to: {} ".format(vm_name, lua_result))

  

      def remove_vm_from_pool(self, vm_name):

          """
@@ -323,8 +318,6 @@ 

          """

          :rtype: list of VmDescriptor

          """

-         if group == None:

-             return self.get_all_vm()

          vm_name_list = self.rc.smembers(KEY_VM_POOL.format(group=group))

          return self._load_multi_safe(vm_name_list)

  

@@ -12,15 +12,9 @@ 

          self.state = state

          self.group = int(group)

  

-         self.check_fails = 0

- 

-         # self.last_health_check = None

-         # self.last_release = None

-         # self.in_use_since = None

-         # self.terminating_since = None

-         #

          self.bound_to_user = None

-         # self.used_by_pid = None

+         self.used_by_worker = None

+         self.task_id = None

  

      @property

      def vm_key(self):

@@ -13,11 +13,6 @@ 

  

  dist_git_url=distgitvm.example.com

  

- # comma-separated architectures 

- # default is i386,x86_64

- #architectures=i386,x86_64

- 

- 

  # Set a number of build groups (default is 1)

  build_groups=1

  

@@ -13,11 +13,6 @@ 

  

  dist_git_url=distgitvm.example.com

  

- # comma-separated architectures

- # default is i386,x86_64

- #architectures=i386,x86_64

- 

- 

  # Set a number of build groups (default is 1)

  build_groups=1

  

@@ -8,5 +8,5 @@ 

      local_action: command echo "Copr builder {{ 999999999 | random }}"

      register: vm_name

  

-   - debug: msg="IP=127.0.0.1"

+   - debug: msg="IP=127.0.0.{{7 | random}}"

    - debug: msg="vm_name={{vm_name.stdout}}"

file modified
+1 -1
@@ -25,7 +25,7 @@ 

  # tests/mockremote/test_mockremote.py that are currently failing due to complete code rewrite

  # TODO: prune tests (case-by-case) that are no longer relevant. We mostly rely on

  # integration & regression tests now.

- TESTS="tests/test_createrepo.py tests/test_frontend.py tests/test_helpers.py tests/test_sign.py"

+ TESTS="tests/test_createrepo.py tests/test_frontend.py tests/test_helpers.py tests/test_sign.py tests/vm_manager/test_manager.py"

  

  if [[ -n $@ ]]; then

  	TESTS=$@

@@ -229,7 +229,7 @@ 

          self.rc.hdel(self.vmd_b3.vm_key, "chroot")

  

          for idx, vmd in enumerate([self.vmd_a1, self.vmd_a2, self.vmd_b2, self.vmd_b3]):

-             vmd.store_field(self.rc, "used_by_pid", idx + 1)

+             vmd.store_field(self.rc, "used_by_worker", idx + 1)

  

          for vmd in [self.vmd_a3, self.vmd_a3]:

              vmd.store_field(self.rc, "state", VmStates.READY)

@@ -157,10 +157,10 @@ 

  

      def test_createrepo_generated_commands_existing_repodata(self, mc_run_cmd_unsafe):

          path_epel_5 = os.path.join(self.tmp_dir_name, "epel-5")

-         expected_epel_5 = ('/usr/bin/createrepo_c --database '

+         expected_epel_5 = ('/usr/bin/createrepo_c --database --ignore-lock --local-sqlite --cachedir /tmp/ --workers 8 '

                             '--update -s sha --checksum md5 ' + path_epel_5)

          path_fedora = os.path.join(self.tmp_dir_name, "fedora-21")

-         expected_fedora = ('/usr/bin/createrepo_c --database '

+         expected_fedora = ('/usr/bin/createrepo_c --database --ignore-lock --local-sqlite --cachedir /tmp/ --workers 8 '

                             '--update ' + path_fedora)

          for path, expected in [(path_epel_5, expected_epel_5), (path_fedora, expected_fedora)]:

              os.makedirs(path)
@@ -200,12 +200,12 @@ 

  

      def test_createrepo_devel_generated_commands_existing_repodata(self, mc_run_cmd_unsafe):

          path_epel_5 = os.path.join(self.tmp_dir_name, "epel-5")

-         expected_epel_5 = ("/usr/bin/createrepo_c --database "

+         expected_epel_5 = ("/usr/bin/createrepo_c --database --ignore-lock --local-sqlite --cachedir /tmp/ --workers 8 "

                             "-s sha --checksum md5 "

                             "--outputdir " + os.path.join(path_epel_5, "devel") + " "

                             "--baseurl " + self.base_url + " " + path_epel_5)

          path_fedora = os.path.join(self.tmp_dir_name, "fedora-21")

-         expected_fedora = ("/usr/bin/createrepo_c --database "

+         expected_fedora = ("/usr/bin/createrepo_c --database --ignore-lock --local-sqlite --cachedir /tmp/ --workers 8 "

                             "--outputdir " + os.path.join(path_fedora, "devel") + " "

                             "--baseurl " + self.base_url + " " + path_fedora)

          for path, expected in [(path_epel_5, expected_epel_5), (path_fedora, expected_fedora)]:
@@ -221,12 +221,12 @@ 

  

      def test_createrepo_devel_generated_commands(self, mc_run_cmd_unsafe):

          path_epel_5 = os.path.join(self.tmp_dir_name, "epel-5")

-         expected_epel_5 = ("/usr/bin/createrepo_c --database "

+         expected_epel_5 = ("/usr/bin/createrepo_c --database --ignore-lock --local-sqlite --cachedir /tmp/ --workers 8 "

                             "-s sha --checksum md5 "

                             "--outputdir " + os.path.join(path_epel_5, "devel") + " "

                             "--baseurl " + self.base_url + " " + path_epel_5)

          path_fedora = os.path.join(self.tmp_dir_name, "fedora-21")

-         expected_fedora = ("/usr/bin/createrepo_c --database "

+         expected_fedora = ("/usr/bin/createrepo_c --database --ignore-lock --local-sqlite --cachedir /tmp/ --workers 8 "

                             "--outputdir " + os.path.join(path_fedora, "devel") + " "

                             "--baseurl " + self.base_url + " " + path_fedora)

          for path, expected in [(path_epel_5, expected_epel_5), (path_fedora, expected_fedora)]:

@@ -38,11 +38,8 @@ 

      with mock.patch("{}.time".format(MODULE_REF)) as handle:

          yield handle

  

- 

- class TestCallback(object):

-     def log(self, msg):

-         print(msg)

- 

+ GID1 = 0

+ GID2 = 1

  

  class TestManager(object):

  
@@ -53,12 +50,16 @@ 

              ssh=Munch(

                  transport="ssh"

              ),

-             build_groups_count=1,

+             build_groups_count=2,

              build_groups={

-                 0: {

+                 GID1: {

                      "name": "base",

                      "archs": ["i386", "x86_64"],

                      "max_vm_per_user": 3,

+                 },

+                 GID2: {

+                     "name": "arm",

+                     "archs": ["armV7",]

                  }

              },

  
@@ -69,22 +70,20 @@ 

              # destdir=self.tmp_dir_path,

              results_baseurl="/tmp",

          )

-         self.queue = Queue()

  

          self.vm_ip = "127.0.0.1"

          self.vm_name = "localhost"

-         self.group = 0

-         self.username = "bob"

+ 

+         self.vm2_ip = "127.0.0.2"

+         self.vm2_name = "localhost2"

+ 

+         self.ownername = "bob"

  

          self.rc = get_redis_connection(self.opts)

          self.ps = None

          self.log_msg_list = []

  

-         self.callback = TestCallback()

- 

-         self.queue = Queue()

          self.vmm = VmManager(self.opts)

- 

          self.vmm.log = MagicMock()

          self.pid = 12345

  
@@ -93,31 +92,30 @@ 

          if keys:

              self.vmm.rc.delete(*keys)

  

-     @pytest.fixture

-     def f_second_group(self):

-         self.opts.build_groups_count = 2

-         self.opts.build_groups[1] = {

-             "name": "arm",

-             "archs": ["armV7",]

-         }

+     def test_manager_setup(self):

+         vmm = VmManager(self.opts)

+         assert GID1 in vmm.vm_groups

+         assert GID2 in vmm.vm_groups

+         assert len(vmm.vm_groups) == 2

  

      def test_add_vm_to_pool(self):

-         self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

  

-         vm_list = self.vmm.get_all_vm_in_group(self.group)

+         with pytest.raises(VmError):

+             self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

  

+         vm_list = self.vmm.get_all_vm_in_group(GID1)

          vm = self.vmm.get_vm_by_name(self.vm_name)

+ 

          assert len(vm_list) == 1

          assert vm_list[0].__dict__ == vm.__dict__

-         assert self.group in self.vmm.vm_groups

-         assert len(self.vmm.vm_groups) == 1

- 

-         with pytest.raises(VmError):

-             self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         assert vm.vm_ip == self.vm_ip

+         assert vm.vm_name == self.vm_name

+         assert vm.group == GID1

  

      def test_mark_vm_check_failed(self, mc_time):

          self.vmm.start_vm_termination = types.MethodType(MagicMock(), self.vmm)

-         self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

          vmd = self.vmm.get_vm_by_name(self.vm_name)

          vmd.store_field(self.rc, "state", VmStates.CHECK_HEALTH)

          vmd.store_field(self.rc, "last_health_check", 12345)
@@ -132,39 +130,39 @@ 

              assert vmd.get_field(self.rc, "state") == state

  

      def test_acquire_vm_no_vm_after_server_restart(self, mc_time):

-         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

          vmd.store_field(self.rc, "state", VmStates.READY)

  

          # undefined both last_health_check and server_start_timestamp

          with pytest.raises(NoVmAvailable):

-             self.vmm.acquire_vm(0, self.username, 42)

+             self.vmm.acquire_vm([GID1], self.ownername, 42)

  

          # only server start timestamp is defined

          mc_time.time.return_value = 1

          self.vmm.mark_server_start()

          with pytest.raises(NoVmAvailable):

-             self.vmm.acquire_vm(0, self.username, 42)

+             self.vmm.acquire_vm([GID1], self.ownername, 42)

  

          # only last_health_check defined

          self.rc.delete(KEY_SERVER_INFO)

          vmd.store_field(self.rc, "last_health_check", 0)

          with pytest.raises(NoVmAvailable):

-             self.vmm.acquire_vm(0, self.username, 42)

+             self.vmm.acquire_vm([GID1], self.ownername, 42)

  

          # both defined but last_health_check < server_start_time

          self.vmm.mark_server_start()

          with pytest.raises(NoVmAvailable):

-             self.vmm.acquire_vm(0, self.username, 42)

+             self.vmm.acquire_vm([GID1], self.ownername, 42)

  

          # and finally last_health_check > server_start_time

          vmd.store_field(self.rc, "last_health_check", 2)

-         vmd_res = self.vmm.acquire_vm(0, self.username, 42)

+         vmd_res = self.vmm.acquire_vm([GID1], self.ownername, 42)

          assert vmd.vm_name == vmd_res.vm_name

  

      def test_acquire_vm_extra_kwargs(self, mc_time):

          mc_time.time.return_value = 0

          self.vmm.mark_server_start()

-         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

          vmd.store_field(self.rc, "state", VmStates.READY)

          vmd.store_field(self.rc, "last_health_check", 2)

  
@@ -173,7 +171,7 @@ 

              "build_id": "20",

              "chroot": "fedora-20-x86_64"

          }

-         vmd_got = self.vmm.acquire_vm(self.group, self.username, self.pid, **kwargs)

+         vmd_got = self.vmm.acquire_vm([GID1], self.ownername, self.pid, **kwargs)

          for k, v in kwargs.items():

              assert vmd_got.get_field(self.rc, k) == v

  
@@ -181,83 +179,87 @@ 

          mc_time.time.return_value = 0

          self.vmm.mark_server_start()

  

-         vmd_main = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

-         vmd_alt = self.vmm.add_vm_to_pool(self.vm_ip, "alternative", self.group)

+         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

+         vmd_alt = self.vmm.add_vm_to_pool(self.vm_ip, "vm_alt", GID1)

+         vmd2 = self.vmm.add_vm_to_pool(self.vm2_ip, self.vm2_name, GID2)

  

-         vmd_main.store_field(self.rc, "state", VmStates.READY)

+         vmd.store_field(self.rc, "state", VmStates.READY)

          vmd_alt.store_field(self.rc, "state", VmStates.READY)

-         vmd_alt.store_field(self.vmm.rc, "bound_to_user", self.username)

-         vmd_main.store_field(self.rc, "last_health_check", 2)

+         vmd2.store_field(self.rc, "state", VmStates.READY)

+ 

+         vmd.store_field(self.rc, "last_health_check", 2)

          vmd_alt.store_field(self.rc, "last_health_check", 2)

+         vmd2.store_field(self.rc, "last_health_check", 2)

+ 

+         vmd_alt.store_field(self.vmm.rc, "bound_to_user", self.ownername)

  

-         vmd_got_first = self.vmm.acquire_vm(group=self.group, username=self.username, pid=self.pid)

-         assert vmd_got_first.vm_name == "alternative"

-         vmd_got_second = self.vmm.acquire_vm(group=self.group, username=self.username, pid=self.pid)

+         vmd_got_first = self.vmm.acquire_vm([GID1, GID2], ownername=self.ownername, pid=self.pid)

+         assert vmd_got_first.vm_name == "vm_alt"

+         vmd_got_second = self.vmm.acquire_vm([GID1, GID2], ownername=self.ownername, pid=self.pid)

          assert vmd_got_second.vm_name == self.vm_name

  

          with pytest.raises(NoVmAvailable):

-             self.vmm.acquire_vm(group=self.group, username=self.username, pid=self.pid)

+             self.vmm.acquire_vm(groups=[GID1], ownername=self.ownername, pid=self.pid)

+ 

+         vmd_got_third = self.vmm.acquire_vm(groups=[GID1, GID2], ownername=self.ownername, pid=self.pid)

+         assert vmd_got_third.vm_name == self.vm2_name

  

      def test_acquire_vm_per_user_limit(self, mc_time):

          mc_time.time.return_value = 0

          self.vmm.mark_server_start()

+         max_vm_per_user = self.opts.build_groups[GID1]["max_vm_per_user"]

  

-         max_vm_per_user = self.opts.build_groups[0]["max_vm_per_user"]

-         acquired_vmd_list = []

+         vmd_list = []

          for idx in range(max_vm_per_user + 1):

-             vmd = self.vmm.add_vm_to_pool("127.0.{}.1".format(idx), "vm_{}".format(idx), self.group)

+             vmd = self.vmm.add_vm_to_pool("127.0.{}.1".format(idx), "vm_{}".format(idx), GID1)

              vmd.store_field(self.rc, "state", VmStates.READY)

              vmd.store_field(self.rc, "last_health_check", 2)

-             acquired_vmd_list.append(vmd)

+             vmd_list.append(vmd)

  

          for idx in range(max_vm_per_user):

-             vmd = self.vmm.acquire_vm(0, self.username, idx)

+             self.vmm.acquire_vm([GID1], self.ownername, idx)

  

          with pytest.raises(NoVmAvailable):

-             self.vmm.acquire_vm(0, self.username, 42)

- 

-         acquired_vmd_list[-1].store_field(self.rc, "state", VmStates.READY)

-         self.vmm.acquire_vm(0, self.username, 42)

+             self.vmm.acquire_vm([GID1], self.ownername, 42)

  

      def test_acquire_only_ready_state(self, mc_time):

          mc_time.time.return_value = 0

          self.vmm.mark_server_start()

  

-         vmd_main = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         vmd_main = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

          vmd_main.store_field(self.rc, "last_health_check", 2)

  

          for state in [VmStates.IN_USE, VmStates.GOT_IP, VmStates.CHECK_HEALTH,

                        VmStates.TERMINATING, VmStates.CHECK_HEALTH_FAILED]:

              vmd_main.store_field(self.rc, "state", state)

              with pytest.raises(NoVmAvailable):

-                 self.vmm.acquire_vm(group=self.group, username=self.username, pid=self.pid)

+                 self.vmm.acquire_vm(groups=[GID1], ownername=self.ownername, pid=self.pid)

  

      def test_acquire_and_release_vm(self, mc_time):

          mc_time.time.return_value = 0

          self.vmm.mark_server_start()

  

- 

-         vmd_main = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

-         vmd_alt = self.vmm.add_vm_to_pool(self.vm_ip, "alternative", self.group)

+         vmd_main = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

+         vmd_alt = self.vmm.add_vm_to_pool(self.vm_ip, "vm_alt", GID1)

  

          vmd_main.store_field(self.rc, "state", VmStates.READY)

          vmd_alt.store_field(self.rc, "state", VmStates.READY)

-         vmd_alt.store_field(self.vmm.rc, "bound_to_user", self.username)

+         vmd_alt.store_field(self.vmm.rc, "bound_to_user", self.ownername)

          vmd_main.store_field(self.rc, "last_health_check", 2)

          vmd_alt.store_field(self.rc, "last_health_check", 2)

  

-         vmd_got_first = self.vmm.acquire_vm(group=self.group, username=self.username, pid=self.pid)

-         assert vmd_got_first.vm_name == "alternative"

+         vmd_got_first = self.vmm.acquire_vm(groups=[GID1], ownername=self.ownername, pid=self.pid)

+         assert vmd_got_first.vm_name == "vm_alt"

  

-         self.vmm.release_vm("alternative")

-         vmd_got_again = self.vmm.acquire_vm(group=self.group, username=self.username, pid=self.pid)

-         assert vmd_got_again.vm_name == "alternative"

+         self.vmm.release_vm("vm_alt")

+         vmd_got_again = self.vmm.acquire_vm(groups=[GID1], ownername=self.ownername, pid=self.pid)

+         assert vmd_got_again.vm_name == "vm_alt"

  

-         vmd_got_another = self.vmm.acquire_vm(group=self.group, username=self.username, pid=self.pid)

+         vmd_got_another = self.vmm.acquire_vm(groups=[GID1], ownername=self.ownername, pid=self.pid)

          assert vmd_got_another.vm_name == self.vm_name

  

      def test_release_only_in_use(self):

-         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

  

          for state in [VmStates.READY, VmStates.GOT_IP, VmStates.CHECK_HEALTH,

                        VmStates.TERMINATING, VmStates.CHECK_HEALTH_FAILED]:
@@ -278,7 +280,7 @@ 

      def test_start_vm_termination(self):

          self.ps = self.vmm.rc.pubsub(ignore_subscribe_messages=True)

          self.ps.subscribe(PUBSUB_MB)

-         self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

  

          self.vmm.start_vm_termination(self.vm_name)

          rcv_msg_list = self.rcv_from_ps_message_bus()
@@ -294,7 +296,7 @@ 

          self.ps = self.vmm.rc.pubsub(ignore_subscribe_messages=True)

          self.ps.subscribe(PUBSUB_MB)

  

-         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

          vmd.store_field(self.rc, "state", VmStates.TERMINATING)

          self.vmm.start_vm_termination(self.vm_name, allowed_pre_state=VmStates.TERMINATING)

          rcv_msg_list = self.rcv_from_ps_message_bus()
@@ -309,7 +311,7 @@ 

      def test_start_vm_termination_fail(self):

          self.ps = self.vmm.rc.pubsub(ignore_subscribe_messages=True)

          self.ps.subscribe(PUBSUB_MB)

-         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

          vmd.store_field(self.rc, "state", VmStates.TERMINATING)

  

          self.vmm.start_vm_termination(self.vm_name)
@@ -329,7 +331,7 @@ 

          assert vmd.get_field(self.rc, "state") == VmStates.TERMINATING

  

      def test_remove_vm_from_pool_only_terminated(self):

-         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, self.group)

+         vmd = self.vmm.add_vm_to_pool(self.vm_ip, self.vm_name, GID1)

          for state in [VmStates.IN_USE, VmStates.GOT_IP, VmStates.CHECK_HEALTH,

                        VmStates.READY, VmStates.CHECK_HEALTH_FAILED]:

  
@@ -339,11 +341,11 @@ 

  

          vmd.store_field(self.vmm.rc, "state", VmStates.TERMINATING)

          self.vmm.remove_vm_from_pool(self.vm_name)

-         assert self.vmm.rc.scard(KEY_VM_POOL.format(group=self.group)) == 0

+         assert self.vmm.rc.scard(KEY_VM_POOL.format(group=GID1)) == 0

  

-     def test_get_vms(self, f_second_group, capsys):

-         vmd_1 = self.vmm.add_vm_to_pool(self.vm_ip, "a1", self.group)

-         vmd_2 = self.vmm.add_vm_to_pool(self.vm_ip, "a2", self.group)

+     def test_get_vms(self, capsys):

+         vmd_1 = self.vmm.add_vm_to_pool(self.vm_ip, "a1", GID1)

+         vmd_2 = self.vmm.add_vm_to_pool(self.vm_ip, "a2", GID1)

          vmd_3 = self.vmm.add_vm_to_pool(self.vm_ip, "b1", 1)

          vmd_4 = self.vmm.add_vm_to_pool(self.vm_ip, "b2", 1)

          vmd_5 = self.vmm.add_vm_to_pool(self.vm_ip, "b3", 1)
@@ -366,13 +368,13 @@ 

  

          self.vmm.info()

  

-     def test_look_up_vms_by_ip(self, f_second_group, capsys):

-         self.vmm.add_vm_to_pool(self.vm_ip, "a1", self.group)

+     def test_look_up_vms_by_ip(self, capsys):

+         self.vmm.add_vm_to_pool(self.vm_ip, "a1", GID1)

          r1 = self.vmm.lookup_vms_by_ip(self.vm_ip)

          assert len(r1) == 1

          assert r1[0].vm_name == "a1"

  

-         self.vmm.add_vm_to_pool(self.vm_ip, "a2", self.group)

+         self.vmm.add_vm_to_pool(self.vm_ip, "a2", GID1)

          r2 = self.vmm.lookup_vms_by_ip(self.vm_ip)

          assert len(r2) == 2

          r2 = sorted(r2, key=lambda vmd: vmd.vm_name)

@@ -18,11 +18,6 @@ 

  dist_git_url=http://distgit/cgit

  #dist_git_url=http://cgit

  

- # comma-separated architectures

- # default is i386,x86_64

- architectures=i386,x86_64

- 

- 

  # Set a number of build groups (default is 1)

  build_groups=1

  

@@ -0,0 +1,24 @@ 

+ """remove_priority_columns

+ 

+ Revision ID: 4edb1ca2a13f

+ Revises: e183e12563ee

+ Create Date: 2018-01-01 16:34:13.196247

+ 

+ """

+ 

+ # revision identifiers, used by Alembic.

+ revision = '4edb1ca2a13f'

+ down_revision = 'e183e12563ee'

+ 

+ from alembic import op

+ import sqlalchemy as sa

+ 

+ 

+ def upgrade():

+     op.drop_column('build_chroot', 'priority')

+     op.drop_column('build', 'priority')

+ 

+ 

+ def downgrade():

+     op.add_column('build_chroot', sa.Column('priority', sa.BigInteger(), server_default='0',nullable=False))

+     op.add_column('build', sa.Column('priority', sa.BigInteger(), server_default='0', nullable=False))

@@ -13,7 +13,7 @@ 

  from alembic import op

  import sqlalchemy as sa

  from sqlalchemy import and_

- from rpmUtils.miscutils import splitFilename

+ from coprs.helpers import splitFilename

  import os

  import json

  

@@ -11,5 +11,3 @@ 

  DEFAULT_BUILD_TIMEOUT = 3600 * 18

  MIN_BUILD_TIMEOUT = 0

  MAX_BUILD_TIMEOUT = 86400

- 

- MAX_PRIO = 2 ** 40

@@ -227,6 +227,7 @@ 

      return "{}{}".format(os, version)

  

  

+ # TODO: is there something like python-rpm-utils or python-dnf-utils for this?

  def splitFilename(filename):

      """

      Pass in a standard style rpm fullname

@@ -21,7 +21,7 @@ 

  from coprs import exceptions

  from coprs import models

  from coprs import helpers

- from coprs.constants import DEFAULT_BUILD_TIMEOUT, MAX_BUILD_TIMEOUT, MAX_PRIO

+ from coprs.constants import DEFAULT_BUILD_TIMEOUT, MAX_BUILD_TIMEOUT

  from coprs.exceptions import MalformedArgumentException, ActionInProgressException, InsufficientRightsException

  from coprs.helpers import StatusEnum

  
@@ -117,32 +117,18 @@ 

          query = query.order_by(models.BuildChroot.build_id.asc())

          return query

  

- 

      @classmethod

-     def get_select_srpm_build_tasks_query(cls, background=False):

+     def get_waiting_srpm_build_tasks(cls):

          return (models.Build.query.join(models.BuildChroot)

                  .filter(models.Build.srpm_url.is_(None))

                  .filter(models.Build.canceled == false())

                  .filter(models.BuildChroot.status == helpers.StatusEnum("importing"))

-                 .filter(models.Build.is_background == (true() if background else false())))

- 

-     @classmethod

-     def select_srpm_build_task(cls, background=False):

-         return (cls.get_select_srpm_build_tasks_query(background)

-                 .order_by(models.Build.priority.asc(),

-                           models.Build.id.asc())

-                 .first())

- 

-     @classmethod

-     def get_srpm_build_task_lowest_priority(cls, background=False):

-         lowest_prio_task = (cls.get_select_srpm_build_tasks_query(background)

-                             .order_by(models.Build.priority.desc()).first())

-         return (lowest_prio_task.priority if lowest_prio_task else 0)

+                 .order_by(models.Build.is_background.asc(), models.Build.id.asc())

+                 .all())

  

      @classmethod

-     def get_select_build_tasks_query(cls, background=False):

+     def get_waiting_build_tasks(cls):

          return (models.BuildChroot.query.join(models.Build)

-                 .filter(models.Build.is_background == (true() if background else false()))

                  .filter(models.Build.canceled == false())

                  .filter(or_(

                      models.BuildChroot.status == helpers.StatusEnum("pending"),
@@ -151,26 +137,9 @@ 

                          models.BuildChroot.started_on < int(time.time() - 1.1 * MAX_BUILD_TIMEOUT),

                          models.BuildChroot.ended_on.is_(None)

                      )

-                 )))

- 

-     @classmethod

-     def select_build_task(cls, background=False):

-         return (cls.get_select_build_tasks_query(background)

-                 .order_by(models.BuildChroot.priority.asc(),

-                           models.BuildChroot.build_id.asc())

-                 .first())

- 

-     @classmethod

-     def get_build_task_lowest_priority(cls, background=False):

-         lowest_prio_task = (cls.get_select_build_tasks_query(background)

-                             .order_by(models.BuildChroot.priority.desc()).first())

-         return (lowest_prio_task.priority if lowest_prio_task else 0)

- 

-     @classmethod

-     def get_task_lowest_priority(cls, background=False):

-         prio1 = cls.get_build_task_lowest_priority(background)

-         prio2 = cls.get_srpm_build_task_lowest_priority(background)

-         return max(prio1, prio2)

+                 ))

+                 .order_by(models.Build.is_background.asc(), models.BuildChroot.build_id.asc())

+                 .all())

  

      @classmethod

      def get_build_task(cls, task_id):
@@ -187,15 +156,6 @@ 

          return BuildsLogic.get_by_id(build_id).first()

  

      @classmethod

-     def defer(cls, build_id):

-         build = cls.get(build_id).first()

-         if not build:

-             return None

-         build.priority = (BuildsLogic.get_task_lowest_priority(build.is_background)+1)%MAX_PRIO

-         db.session.add(build)

-         return build

- 

-     @classmethod

      def get_multiple(cls):

          return models.Build.query.order_by(models.Build.id.desc())

  
@@ -605,7 +565,6 @@ 

              is_background=bool(background),

              batch=batch,

              srpm_url=srpm_url,

-             priority=(cls.get_task_lowest_priority(bool(background))+1)%MAX_PRIO

          )

  

          if timeout:
@@ -620,10 +579,8 @@ 

  

          if skip_import:

              status = StatusEnum("pending")

-             priority = (cls.get_task_lowest_priority(bool(background))+1)%MAX_PRIO

          else:

              status = StatusEnum("importing")

-             priority = 0

  

          for chroot in chroots:

              git_hash = None
@@ -634,7 +591,6 @@ 

                  status=status,

                  mock_chroot=chroot,

                  git_hash=git_hash,

-                 priority=priority

              )

              db.session.add(buildchroot)

  
@@ -737,6 +693,7 @@ 

          log.info("Updating build: {} by: {}".format(build.id, upd_dict))

          if "chroot" in upd_dict:

              if upd_dict["chroot"] == "srpm-builds":

+ 

                  if upd_dict.get("status") == StatusEnum("failed") and not build.canceled:

                      build.fail_type = helpers.FailTypeEnum("srpm_build_error")

                      for ch in build.build_chroots:
@@ -874,15 +831,6 @@ 

          )

  

      @classmethod

-     def defer(cls, build_id, name):

-         chroot = cls.get_by_build_id_and_name(build_id, name).first()

-         if not chroot:

-             return None

-         chroot.priority = (BuildsLogic.get_task_lowest_priority(chroot.build.is_background)+1)%MAX_PRIO

-         db.session.add(chroot)

-         return chroot

- 

-     @classmethod

      def get_multiply(cls):

          query = (

              models.BuildChroot.query

@@ -537,7 +537,6 @@ 

      # background builds has lesser priority than regular builds.

      is_background = db.Column(db.Boolean, default=False, server_default="0", nullable=False)

  

-     priority = db.Column(db.BigInteger, default=0, server_default="0", nullable=False)

      srpm_url = db.Column(db.Text)

  

      # relations
@@ -586,7 +585,7 @@ 

              return self.repos.split()

  

      @property

-     def import_task_id(self):

+     def task_id(self):

          return str(self.id)

  

      @property
@@ -606,7 +605,7 @@ 

      def import_log_url_distgit(self):

          if app.config["COPR_DIST_GIT_LOGS_URL"]:

              return "{}/{}.log".format(app.config["COPR_DIST_GIT_LOGS_URL"],

-                                       self.import_task_id.replace('/', '_'))

+                                       self.task_id.replace('/', '_'))

          return None

  

      @property
@@ -989,8 +988,6 @@ 

      started_on = db.Column(db.Integer)

      ended_on = db.Column(db.Integer, index=True)

  

-     priority = db.Column(db.BigInteger, default=0, server_default="0", nullable=False)

- 

      build_requires = db.Column(db.Text)

  

      @property
@@ -998,7 +995,6 @@ 

          """

          Textual representation of name of this chroot

          """

- 

          return self.mock_chroot.name

  

      @property
@@ -1006,7 +1002,6 @@ 

          """

          Return text representation of status of this build chroot

          """

- 

          if self.status is not None:

              return StatusEnum(self.status)

  

@@ -12,16 +12,16 @@ 

  from coprs.logic.complex_logic import ComplexLogic

  from coprs.logic.coprs_logic import CoprChrootsLogic

  from coprs.logic.packages_logic import PackagesLogic

- from coprs.constants import MAX_PRIO

  

  from coprs.views import misc

  from coprs.views.backend_ns import backend_ns

  from sqlalchemy.sql import false, true

  

  import json

+ import urllib

  import logging

- log = logging.getLogger(__name__)

  

+ log = logging.getLogger(__name__)

  

  @backend_ns.route("/importing/")

  # FIXME I'm commented
@@ -42,7 +42,7 @@ 

              branches.add(b_ch.mock_chroot.distgit_branch_name)

  

          task_dict = {

-             "task_id": task.import_task_id,

+             "task_id": task.task_id,

              "owner": copr.owner_name,

              "project": copr.name,

              "branches": list(branches),
@@ -99,7 +99,6 @@ 

              for ch in build_chroots:

                  if ch.status == helpers.StatusEnum("importing"):

                      ch.status = helpers.StatusEnum("pending")

-                     ch.priority = (BuildsLogic.get_task_lowest_priority(build.is_background)+1)%MAX_PRIO

                  ch.git_hash = git_hash

  

          # Failed?
@@ -170,6 +169,7 @@ 

      try:

          build_record = {

              "build_id": task.id,

+             "task_id": task.id,

              "project_owner": task.copr.owner_name,

              "project_name": task.copr.name,

              "source_type": task.source_type,
@@ -182,44 +182,31 @@ 

      return build_record

  

  

- @backend_ns.route("/waiting/")

+ @backend_ns.route("/waiting-action/")

  #@misc.backend_authenticated

- def waiting():

+ def waiting_action():

      """

-     Return a single action and a single build.

+     Return a single action.

      """

      action_record = None

-     build_record = None

- 

      action = actions_logic.ActionsLogic.get_waiting().first()

      if action:

          action_record = action.to_dict(options={

              "__columns_except__": ["result", "message", "ended_on"]

          })

+     return flask.jsonify(action_record)

  

-     build_task = BuildsLogic.select_build_task(background=False)

-     srpm_build_task = BuildsLogic.select_srpm_build_task(background=False)

- 

-     if not build_task:

-         build_record = get_srpm_build_record(srpm_build_task)

-     elif srpm_build_task and srpm_build_task.priority < build_task.priority:

-         build_record = get_srpm_build_record(srpm_build_task)

-     else:

-         build_record = get_build_record(build_task)

- 

-     if not build_record:

-         build_task = BuildsLogic.select_build_task(background=True)

-         srpm_build_task = BuildsLogic.select_srpm_build_task(background=True)

- 

-         if not build_task:

-             build_record = get_srpm_build_record(srpm_build_task)

-         elif srpm_build_task and srpm_build_task.priority < build_task.priority:

-             build_record = get_srpm_build_record(srpm_build_task)

-         else:

-             build_record = get_build_record(build_task)

  

-     response_dict = {"action": action_record, "build": build_record}

-     return flask.jsonify(response_dict)

+ @backend_ns.route("/waiting-jobs/")

+ #@misc.backend_authenticated

+ def waiting_jobs():

+     """

+     Return the job queue.

+     """

+     build_records = ([get_build_record(task) for task in BuildsLogic.get_waiting_build_tasks()] +

+                      [get_srpm_build_record(task) for task in BuildsLogic.get_waiting_srpm_build_tasks()])

+     log.info('Selected build records: {}'.format(build_records))

+     return flask.jsonify(build_records)

  

  

  @backend_ns.route("/get-build-task/<task_id>")
@@ -307,33 +294,6 @@ 

      return flask.jsonify(result)

  

  

- @backend_ns.route("/defer_build/", methods=["POST", "PUT"])

- @misc.backend_authenticated

- def defer_build():

-     """

-     Defer build (lower job priority so that other jobs can be

-     considered for building meanwhile).

-     """

- 

-     result = {"was_deferred": False}

- 

-     if "build_id" in flask.request.json and "chroot" in flask.request.json:

-         build_id = flask.request.json["build_id"]

-         chroot_name = flask.request.json.get("chroot")

- 

-         if build_id and chroot_name:

-             log.info("Defer build {}, chroot {}".format(build_id, chroot_name))

-             if chroot_name == "srpm-builds":

-                 task = BuildsLogic.defer(build_id)

-             else:

-                 task = BuildChrootsLogic.defer(build_id, chroot_name)

- 

-             result["was_deferred"] = bool(task)

-             db.session.commit()

- 

-     return flask.jsonify(result)

- 

- 

  @backend_ns.route("/reschedule_all_running/", methods=["POST"])

  @misc.backend_authenticated

  def reschedule_all_running():

@@ -8,8 +8,8 @@ 

  class TestWaitingBuilds(CoprsTestCase):

  

      def test_no_waiting_builds(self):

-         assert b'"build": null' in self.tc.get(

-             "/backend/waiting/", headers=self.auth_header).data

+         assert b'[]' in self.tc.get(

+             "/backend/waiting-jobs/", headers=self.auth_header).data

  

      def test_waiting_build_only_lists_not_started_or_ended(

              self, f_users, f_coprs, f_mock_chroots, f_builds, f_db):
@@ -23,8 +23,8 @@ 

  

          self.db.session.commit()

  

-         r = self.tc.get("/backend/waiting/", headers=self.auth_header)

-         assert json.loads(r.data.decode("utf-8"))["build"] == None

+         r = self.tc.get("/backend/waiting-jobs/", headers=self.auth_header)

+         assert json.loads(r.data.decode("utf-8")) == []

  

          for build_chroot in self.b2_bc:

              build_chroot.status = 4 # pending
@@ -32,8 +32,8 @@ 

  

          self.db.session.commit()

  

-         r = self.tc.get("/backend/waiting/", headers=self.auth_header)

-         assert json.loads(r.data.decode("utf-8"))["build"] != None

+         r = self.tc.get("/backend/waiting-jobs/", headers=self.auth_header)

+         assert json.loads(r.data.decode("utf-8")) != []

  

  

      def test_waiting_bg_build(self, f_users, f_coprs, f_mock_chroots, f_builds, f_db):
@@ -43,9 +43,9 @@ 

                  build_chroot.status = 4  # pending

          self.db.session.commit()

  

-         r = self.tc.get("/backend/waiting/")

+         r = self.tc.get("/backend/waiting-jobs/")

          data = json.loads(r.data.decode("utf-8"))

-         assert data["build"]["build_id"] == 3

+         assert data[0]["build_id"] == 3

  

  

  # status = 0 # failure
@@ -164,8 +164,8 @@ 

  class TestWaitingActions(CoprsTestCase):

  

      def test_no_waiting_actions(self):

-         assert b'"action": null' in self.tc.get(

-             "/backend/waiting/", headers=self.auth_header).data

+         assert b'null' in self.tc.get(

+             "/backend/waiting-action/", headers=self.auth_header).data

  

      def test_waiting_actions_only_lists_not_started_or_ended(

              self, f_users, f_coprs, f_actions, f_db):
@@ -175,16 +175,16 @@ 

  

          self.db.session.commit()

  

-         r = self.tc.get("/backend/waiting/", headers=self.auth_header)

-         assert json.loads(r.data.decode("utf-8"))["action"] == None

+         r = self.tc.get("/backend/waiting-action/", headers=self.auth_header)

+         assert json.loads(r.data.decode("utf-8")) == None

  

          for a in [self.a1, self.a2, self.a3]:

              a.result = helpers.BackendResultEnum("waiting")

              self.db.session.add(a)

  

          self.db.session.commit()

-         r = self.tc.get("/backend/waiting/", headers=self.auth_header)

-         assert json.loads(r.data.decode("utf-8"))["action"] != None

+         r = self.tc.get("/backend/waiting-action/", headers=self.auth_header)

+         assert json.loads(r.data.decode("utf-8")) != None

  

  

  class TestUpdateActions(CoprsTestCase):

Need to write unittests. Posting this (prematurely) for feedback...

Backend now specifies what job (arch/owner) it prefers or rather...
what it does not prefer (=cannot allocate resources for at the moment).

Previous job deferring mechanism was returned back as fallback because
even with the job criteria sent to frontend, it might happen that frontend
will pick a job that backend has no resources for (e.g. all builders in group
READY but dirty and frontend will pick job of an owner that does not have a VM
assigned in the group). In that case, deferring might be useful to prevent
queue jamming.

The original backend/waiting interface was split into /backend/select-action
and /backend/select-build-task.

Frontend job filtering logic is done in Python, which should be OK considering that there are not that many pending/importing jobs at any point.

  • includes bugfix in acquire_vm for checking user limits on VMs
  • includes bugfix in can_user_acquire_more_vm for arch=None builds
  • includes bugfix for terminating IN_USE machines with failed health checks

update: frontend now presents the whole job queue to backend (see the comments)
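
For illustration only, a minimal sketch (not the actual dispatcher code) of how a backend-side consumer could poll the two new frontend endpoints; the base URL and credentials are placeholders standing in for the backend configuration:

    import requests

    FRONTEND_BASE_URL = "http://localhost:8080"   # placeholder
    AUTH = ("user", "backend-password")           # placeholder

    def fetch_waiting_action():
        """Return a single waiting action record, or None when there is none."""
        r = requests.get("{0}/backend/waiting-action/".format(FRONTEND_BASE_URL), auth=AUTH)
        return r.json()    # the endpoint returns one action dict or JSON null

    def fetch_waiting_jobs():
        """Return the whole job queue as a (possibly empty) list of build records."""
        r = requests.get("{0}/backend/waiting-jobs/".format(FRONTEND_BASE_URL), auth=AUTH)
        return r.json()

    if __name__ == "__main__":
        print(fetch_waiting_action())
        print(len(fetch_waiting_jobs()), "waiting build tasks")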

I'm not able to review now, and offhand saying it is good is hard... At least by this ...

Backend now specifies what job (arch/owner) it prefers or rather...
what it does not prefer (=cannot allocate resources for at the moment).

.. it doesn't look bad. I fail to see the "problem" statement in the git commit
message however. I already forgot where the starvation comes from :)
and why copr-backend can not simply pick appropriate task.

Previous job deferring mechanism was returned back as fallback because
even with the job criteria sent to frontend, it might happen that frontend
will pick a job that backend has no resources for (e.g. all builders in group
READY but dirty and frontend will pick job of an owner that does not have a VM
assigned in the group)

I am not very familiar with this topic, so I am sure that you considered this, but ...
is backend able to find to which users the dirty builders are assigned and use also these criteria while selecting the job? In such case maybe any deferring mechanism wouldn't be needed?

  • add an index to the last_deferred column
  • last_deferred can be NOT NULL DEFAULT 0 and then you can drop the or_() condition (see the sketch after this comment)
  • when deferring one job, we can defer all jobs of the same user and the same chroot
  • add a comment somewhere that last_deferred is a timestamp from the moment the job has been deferred (I actually thought it is the time it has been deferred to)

You can still starve if there is no builder free, you are deferred, and someone else comes in the meantime.
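
A hypothetical sketch of the NOT NULL DEFAULT 0 variant in plain SQLAlchemy (the model, the session handling, and the LAST_DEFERRED value are stand-ins, not the actual coprs models/logic code): with a non-nullable column defaulting to 0, the "never deferred" case needs no or_() branch in the query.

    import time
    from sqlalchemy import Column, Integer, BigInteger, create_engine
    from sqlalchemy.orm import declarative_base, Session

    Base = declarative_base()

    class BuildChrootSketch(Base):              # simplified stand-in model
        __tablename__ = "build_chroot_sketch"
        id = Column(Integer, primary_key=True)
        # indexed, NOT NULL, defaults to 0 -> queries never have to handle NULL
        last_deferred = Column(BigInteger, default=0, server_default="0",
                               nullable=False, index=True)

    LAST_DEFERRED = 60   # seconds a deferred job stays out of the queue (assumed)

    def waiting_chroots(session):
        """Chroots that were never deferred, or whose deferral has expired."""
        cutoff = time.time() - LAST_DEFERRED
        return session.query(BuildChrootSketch).filter(
            BuildChrootSketch.last_deferred < cutoff)

    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)
    with Session(engine) as s:
        s.add(BuildChrootSketch())
        s.commit()
        print(waiting_chroots(s).count())   # 1: never-deferred jobs pass the filter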

I'm not able to review now, and offhand saying it is good is hard... At least by this ...

Backend now specifies what job (arch/owner) it prefers or rather...
what it does not prefer (=cannot allocate resources for at the moment).

.. it doesn't look bad. I fail to see the "problem" statement in the git commit
message however. I already forgot where the starvation comes from :)
and why copr-backend can not simply pick appropriate task.

The problem is that frontend does not know the state of the builders. It picks jobs solely based on their order, but this can cause queue jamming and, consequently, job starvation.

The "fix" was job deferring (right now still present) where backend was able to tell frontend that it does not have resources for a given job right now so that frontend can pick some other job from the queue that is behind the deferred job in the frontend queue.

But it's not a fix that always worked. A job was deferred for a given period of time, which might have been "not enough". If there were 200 jobs in the queue that could not be allocated, frontend kept recycling only those jobs in the /backend/waiting interface, because the jobs at the beginning of the queue were meanwhile put back into the queue (their deferred status had expired). So a job at the end of the queue that could have been allocated was starving in the meantime. This was a problematic case where the 'last_deferred' solution (given the current LAST_DEFERRED time setting) was not a sufficient measure to solve the root problem: frontend does not have enough information to provide a startable job to backend.

Then there was a second problem, actually introduced by the 'last_deferred' workaround itself. A build arrives at frontend but backend cannot allocate it right now, so it is deferred (instead of keeping the job at the top of the queue and waiting until backend releases some resources). It gets back to the top of the queue again after LAST_DEFERRED seconds (as specified in the config), but it gets deferred again even though it could have been allocated in the meantime; it wasn't, because it was deferred, and another new job that had just arrived in the queue got the resources instead. So a job can be "pinged" out indefinitely.

So this pull request gives more information to frontend by specifying what archs, owners, or (owner, arch) pairs should be excluded from consideration when selecting the job. There is also provide_task_ids to implement job reattaching, but that's a special case. Submission order is still the main criterion for selection, but some jobs at the beginning of the queue can now be skipped if they are excluded by the criteria.
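
To make those criteria concrete, here is a made-up sketch of how backend could derive them; the input structures and the per-user limit handling are illustrative assumptions, not the real VmManager code:

    def build_exclusion_criteria(free_vms_per_arch, user_vms_in_use, max_vms_per_user):
        """
        free_vms_per_arch: {"x86_64": <number of clean READY builders>, ...}
        user_vms_in_use:   {("owner1", "x86_64"): 2, ...}
        max_vms_per_user:  per-user builder limit in the group
        (all three structures are made up for this sketch)
        """
        exclude_archs = [arch for arch, free in free_vms_per_arch.items() if free == 0]
        exclude_owner_arch_pairs = [
            [owner, arch] for (owner, arch), used in user_vms_in_use.items()
            if used >= max_vms_per_user
        ]
        return {"exclude_archs": exclude_archs,
                "exclude_owner_arch_pairs": exclude_owner_arch_pairs}

    # example: no clean x86_64 builder, owner1 already at the i386 limit
    print(build_exclusion_criteria({"x86_64": 0, "i386": 2},
                                   {("owner1", "i386"): 4},
                                   max_vms_per_user=4))
    # {'exclude_archs': ['x86_64'], 'exclude_owner_arch_pairs': [['owner1', 'i386']]}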

But there is a case where this PR still fails. Let's suppose there is a fedora-27-x86_64 job of owner1 waiting at the top of the queue. Now, let's say there is only one group on backend for x86_64, and all the builders there that are READY are also dirty, all of them allocated to the user set: owner2, owner3, and owner4.

In this case, the current exclusion criteria are not sufficient (none of exclude_archs, exclude_owner, or exclude_owner_arch_pairs covers this) and frontend will provide the fedora-27-x86_64 job of owner1 to backend. Backend cannot allocate the job, and this will cause queue jamming until some dirty VM is released and a new one is spawned.

That's why I brought back the 'last_deferred' mechanism to cover this case, but that mechanism does not always work (e.g. when there is a very long queue) and it introduces the "ping out" problem again (i.e. a job is repeatedly deferred till the end of time).

So job starvation should now happen much less thanks to the job-selection criteria that backend provides to frontend, but there is still a case where this can happen because the 'last_deferred' "feature" is still present (it can be disabled by setting LAST_DEFERRED to zero).

Now the question is what to do. Can we make this work for all cases with some simple improvement (I don't want to go into "let's rewrite it all" because this is pretty good right now)? Basically, by simply disabling the last_deferred feature we can prevent job starvation at the risk of queue jamming; last_deferred prevents queue jamming (though not completely) and introduces the job starvation problem.

We can solve the described build jamming by always preferring jobs for the dirty VMs that we currently have. But this can again cause build starvation. This is a problem of what matters more: "resources that we have allocated" or "order in the queue". And we need to find the best balance between these two.

Previous job deferring mechanism was returned back as fallback because
even with the job criteria sent to frontend, it might happen that frontend
will pick a job that backend has no resources for (e.g. all builders in group
READY but dirty and frontend will pick job of an owner that does not have a VM
assigned in the group)

I am not very familiar with this topic, so I am sure that you considered this, but ...
is backend able to find to which users the dirty builders are assigned and use also these criteria while selecting the job? In such case maybe any deferring mechanism wouldn't be needed?

That's true, but I am not sure that we want to go for a full "prefer jobs for dirty VMs". That again introduces job starvation, where users can block builders indefinitely (unintentionally). I would rather go for: keep dirty VMs around for some time and allocate one if it is available for the given user (which has the advantage of re-using a meanwhile-created mock root cache so that the buildroot does not need to be initialized from scratch, and of course it is much more efficient than immediately throwing the machine away after it has been used), but do not pick jobs for dirty VMs from the queue out of order.

Previous job deferring mechanism was returned back as fallback because
even with the job criteria sent to frontend, it might happen that frontend
will pick a job that backend has no resources for (e.g. all builders in group
READY but dirty and frontend will pick job of an owner that does not have a VM
assigned in the group)
I am not very familiar with this topic, so I am sure that you considered this, but ...
is backend able to find to which users the dirty builders are assigned and use also these criteria while selecting the job? In such case maybe any deferring mechanism wouldn't be needed?

That's true, but I am not sure that we want to go for a full "prefer jobs for dirty VMs". That again introduces job starvation, where users can block builders indefinitely (unintentionally). I would rather go for: keep dirty VMs around for some time and allocate one if it is available for the given user (which has the advantage of re-using a meanwhile-created mock root cache so that the buildroot does not need to be initialized from scratch, and of course it is much more efficient than immediately throwing the machine away after it has been used), but do not pick jobs for dirty VMs from the queue out of order.

I wasn't reacting very precisely here. We can give even more information to frontend (the archs for which all the ready VMs are dirty, and also the owners of these dirty VMs), but I don't think it will help us in any particular way. We would move the problem to frontend, which is good because frontend selects jobs and backend only gives "hints" to it, but frontend will still somehow need to decide what to do if there are jobs for which dirty VMs exist but these jobs are at the end of a long queue. It can either:
- block and wait until a new clean VM is spawned
- re-use dirty VMs, which means that jobs can skip the queue


Currently my view is: let's keep job deferring but allow it only once. That way some jobs can "skip" the queue if there is no clean VM for the job at the top, but a job cannot be skipped over all the time (only once). We can easily differentiate between was-deferred/was-not-deferred by comparing the field with NULL (or zero, as Mirek suggested), so we don't need to add any additional db columns to count how many times a job was deferred.
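
A tiny sketch of the "defer at most once" idea, using a stand-in object instead of the real BuildChroot model and an assumed LAST_DEFERRED value:

    import time
    from types import SimpleNamespace

    LAST_DEFERRED = 60   # seconds (assumed config value)

    def defer_once(chroot):
        """Defer a chroot, but only if it has never been deferred before."""
        if chroot.last_deferred:                  # non-zero (or non-NULL): already deferred once
            return False
        chroot.last_deferred = int(time.time())   # timestamp of the deferral itself
        return True

    def is_waiting(chroot, now=None):
        """The job is offered to backend again once its single deferral expires."""
        now = now or time.time()
        return chroot.last_deferred == 0 or now - chroot.last_deferred >= LAST_DEFERRED

    chroot = SimpleNamespace(last_deferred=0)
    print(defer_once(chroot))   # True:  first deferral succeeds
    print(defer_once(chroot))   # False: the job cannot be "pinged out" a second time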

Thanks for all the words @clime, appreciated!

The problem is that frontend does not know the state of the builders. It picks jobs solely based on their order, but this can cause queue jamming and, consequently, job starvation.

Just one question comes to my mind, what if frontend provided "/backend/<all-tasks-url>/", and backend simply picked what can be handled by "/backend/<take-concrete-task>/"?

Thanks for all the words @clime, appreciated!

The problem is that frontend does not know the state of the builders. It picks jobs solely based on their order, but this can cause queue jamming and, consequently, job starvation.

Just one question comes to my mind, what if frontend provided "/backend/<all-tasks-url>/", and backend simply picked what can be handled by "/backend/<take-concrete-task>/"?

This is a very nice idea! That basically means dropping lots of what I did here but maybe it's worth it.
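
This is essentially what the /backend/waiting-jobs/ interface introduced above makes possible. A rough sketch of the backend side of the idea, where can_start(), the free-VM bookkeeping, and the "arch" key in the task dicts are assumptions rather than the real dispatcher logic:

    import requests

    FRONTEND_BASE_URL = "http://localhost:8080"   # placeholder
    AUTH = ("user", "backend-password")           # placeholder

    def can_start(task, free_vms_per_arch):
        """True if there is currently a clean READY builder for the task's arch."""
        arch = task.get("arch")
        return arch is None or free_vms_per_arch.get(arch, 0) > 0

    def pick_startable_task(free_vms_per_arch):
        r = requests.get("{0}/backend/waiting-jobs/".format(FRONTEND_BASE_URL), auth=AUTH)
        for task in r.json():              # the queue is already ordered by frontend
            if can_start(task, free_vms_per_arch):
                return task                # take the first task we can actually serve
        return None                        # nothing startable right now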

1 new commit added

  • [frontend][backend] frontend now presents the whole job queue state to backend
6 years ago

3 new commits added

  • [backend] enable and update vm_manager tests, fix three minor bugs in the manager
  • [backend] fix createrepo tests
  • [backend] fix acquire_vm
6 years ago

rebased onto 30763f0

6 years ago

2 new commits added

  • [frontend] fix unit tests
  • [frontend] fix revision sequence
6 years ago

Hello, I made the proposed changes. Then I also fixed the frontend revision sequence, fixed the frontend unit tests, and re-enabled the backend unit tests for vm_manager (and also did some fixes there).

I am going to test on dev and probably merge in a short while unless you have some objections.

By the way, I would like to do some additional clean-up in the backend unit tests and also enable them during build (first enable them, then clean them up step-by-step). Just letting you know that it would be good to do.

Pull-Request has been merged by clime

6 years ago
Metadata
Changes Summary 25
+2 -2      backend/backend/daemons/action_dispatcher.py
+66 -63    backend/backend/daemons/build_dispatcher.py
+0 -66     backend/backend/daemons/vm_master.py
+4 -7      backend/backend/daemons/worker.py
+0 -17     backend/backend/frontend.py
+1 -1      backend/backend/helpers.py
+0 -3      backend/backend/job.py
+52 -59    backend/backend/vm_manage/manager.py
+2 -8      backend/backend/vm_manage/models.py
+0 -5      backend/conf/copr-be.conf.example
+0 -5      backend/conf/copr-be.local.conf
+1 -1      backend/conf/playbooks/spawn_local.yml
+1 -1      backend/run_test.sh
+1 -1      backend/tests/daemons/test_vm_master.py
+6 -6      backend/tests/test_createrepo.py
+81 -79    backend/tests/vm_manager/test_manager.py
+0 -5      docker/backend/files/etc/copr/copr-be.conf
+24 (new)  frontend/coprs_frontend/alembic/schema/versions/4edb1ca2a13f_remove_priority_columns.py
+1 -1      frontend/coprs_frontend/alembic/schema/versions/552455e5910e_new_table_packages.py
+0 -2      frontend/coprs_frontend/coprs/constants.py
+1 -0      frontend/coprs_frontend/coprs/helpers.py
+9 -61     frontend/coprs_frontend/coprs/logic/builds_logic.py
+2 -7      frontend/coprs_frontend/coprs/models.py
+18 -58    frontend/coprs_frontend/coprs/views/backend_ns/backend_general.py
+14 -14    frontend/coprs_frontend/tests/test_views/test_backend_ns/test_backend_general.py