
Source Code for Module backend.daemons.dispatcher

import re
import os
import sys
import time
import fcntl
import json
import subprocess
from subprocess import CalledProcessError
import multiprocessing

import ansible
import ansible.runner
import ansible.utils

from ansible.errors import AnsibleError

from setproctitle import setproctitle
from IPy import IP
from retask.queue import Queue


from ..mockremote.callback import CliLogCallBack

from ..exceptions import MockRemoteError, CoprWorkerError, CoprWorkerSpawnFailError
from ..job import BuildJob

from ..mockremote import MockRemote
from ..frontend import FrontendClient
from ..constants import BuildStatus
from ..helpers import register_build_result

ansible_playbook = "ansible-playbook"

try:
    import fedmsg
except ImportError:
    # fedmsg is optional
    fedmsg = None


def ans_extra_vars_encode(extra_vars, name):
    """ transform dict into --extra-vars="json string" """
    if not extra_vars:
        return ""
    return "--extra-vars='{{\"{0}\": {1}}}'".format(name, json.dumps(extra_vars))

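# Illustrative example (not part of the original module):
#   ans_extra_vars_encode({"vm_name": "builder01"}, "copr_task")
# returns the single shell-quoted argument
#   --extra-vars='{"copr_task": {"vm_name": "builder01"}}'
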

class WorkerCallback(object):
    """
    Callback class for worker. Now used only for message logging

    :param logfile: path to the log file
    """

    def __init__(self, logfile=None):
        self.logfile = logfile

    def log(self, msg):
        """
        Safely writes msg to the logfile

        :param str msg: message to be logged
        """
        if self.logfile:
            now = time.strftime("%F %T")
            try:
                with open(self.logfile, 'a') as lf:
                    fcntl.flock(lf, fcntl.LOCK_EX)
                    lf.write(str(now) + ': ' + msg + '\n')
                    fcntl.flock(lf, fcntl.LOCK_UN)
            except (IOError, OSError) as e:
                sys.stderr.write("Could not write to logfile {0} - {1}\n"
                                 .format(self.logfile, str(e)))


# TODO: Extract VmManager class
class Worker(multiprocessing.Process):

    """
    Worker process dispatches building tasks. The backend spins up multiple
    workers; each worker is associated with one group_id and processes one
    task at a time.

    Worker listens for new tasks on the :py:class:`retask.queue.Queue`
    associated with its group_id.

    :param Bunch opts: backend config
    :param events: (:py:class:`multiprocessing.Queue`) queue to announce new events
    :param int worker_num: worker number
    :param int group_id: group_id from the set of groups defined in config
    :param callback: callback object to handle internal workers events. Should implement method ``log(msg)``.
    :param lock: (:py:class:`multiprocessing.Lock`) global backend lock

    """

    def __init__(self, opts, events, worker_num, group_id,
                 callback=None, lock=None):

        # base class initialization
        multiprocessing.Process.__init__(self, name="worker-builder")

        self.opts = opts

        # job management stuff
        self.task_queue = Queue("copr-be-{0}".format(str(group_id)))
        self.task_queue.connect()
        # event queue for communicating back to dispatcher
        self.events = events
        self.worker_num = worker_num
        self.group_id = group_id

        self.kill_received = False
        self.lock = lock
        self.frontend_callback = FrontendClient(opts, events)
        self.callback = callback
        if not self.callback:
            log_name = "worker-{0}-{1}.log".format(
                self.group_name,
                self.worker_num)

            self.logfile = os.path.join(self.opts.worker_logdir, log_name)
            self.callback = WorkerCallback(logfile=self.logfile)

        self.vm_name = None
        self.vm_ip = None
        self.callback.log("creating worker: dynamic ip")

    @property
    def group_name(self):
        try:
            return self.opts.build_groups[self.group_id]["name"]
        except Exception as error:
            self.callback.log("Failed to get builder group name from config, "
                              "using group_id as name. "
                              "Original error: {}".format(error))
            return self.group_id

    def event(self, topic, template, content=None):
        """ Multi-purpose logging method.

        Logs messages to three different destinations:
            - To log file
            - The internal "events" queue for communicating back to the
              dispatcher.
            - The fedmsg bus. Messages are posted asynchronously to a
              zmq.PUB socket.

        """

        content = content or {}
        what = template.format(**content)
        who = "worker-{0}".format(self.worker_num)

        self.callback.log("event: who: {0}, what: {1}".format(who, what))
        self.events.put({"when": time.time(), "who": who, "what": what})

        if self.opts.fedmsg_enabled and fedmsg:
            content["who"] = who
            content["what"] = what
            try:
                fedmsg.publish(modname="copr", topic=topic, msg=content)
            # pylint: disable=W0703
            except Exception as e:
                # XXX - Maybe log traceback as well with traceback.format_exc()
                self.callback.log("failed to publish message: {0}".format(e))

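    # Illustrative example (not part of the original module; values are made up):
    #   self.event("build.start", "build start: user:{user}", {"user": "bob"})
    # logs "event: who: worker-1, what: build start: user:bob", puts
    # {"when": <timestamp>, "who": "worker-1", "what": "build start: user:bob"}
    # on the events queue and, when fedmsg is enabled, publishes the content
    # dict on the bus under modname "copr" and topic "build.start".
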
    def _announce_start(self, job):
        """
        Announce everywhere that a build process started now.
        """
        job.started_on = time.time()
        self.mark_started(job)

        template = "build start: user:{user} copr:{copr}" \
            " pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner, pkg=job.pkg_name,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid)
        self.event("build.start", template, content)

        template = "chroot start: chroot:{chroot} user:{user}" \
            " copr:{copr} pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(chroot=job.chroot, user=job.submitter,
                       owner=job.project_owner, pkg=job.pkg_name,
                       copr=job.project_name, build=job.build_id,
                       ip=self.vm_ip, pid=self.pid)

        self.event("chroot.start", template, content)

    def _announce_end(self, job):
        """
        Announce everywhere that a build process ended now.
        """
        job.ended_on = time.time()

        self.return_results(job)
        self.callback.log("worker finished build: {0}".format(self.vm_ip))
        template = "build end: user:{user} copr:{copr} build:{build}" \
            " pkg: {pkg} version: {version} ip:{ip} pid:{pid} status:{status}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner,
                       pkg=job.pkg_name, version=job.pkg_version,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid,
                       status=job.status, chroot=job.chroot)
        self.event("build.end", template, content)

    def run_ansible_playbook(self, args, name="running playbook", attempts=9):
        """
        Call ansible playbook:

            - well mostly we run out of space in OpenStack so we rather try
              multiple times (attempts param)
            - dump any attempt failure
        """

        # Ansible playbook python API does not work here, dunno why. See:
        # https://groups.google.com/forum/#!topic/ansible-project/DNBD2oHv5k8

        command = "{0} {1}".format(ansible_playbook, args)

        result = None
        for i in range(0, attempts):
            try:
                attempt_desc = ": retry: " if i > 0 else ": begin: "
                self.callback.log(name + attempt_desc + command)
                result = subprocess.check_output(command, shell=True)
                self.callback.log("Raw playbook output:\n{0}\n".format(result))
                break

            except CalledProcessError as e:
                self.callback.log("CalledProcessError: \n{0}\n".format(e.output))
                sys.stderr.write("{0}\n".format(e.output))
                # FIXME: this is not purpose of opts.sleeptime
                time.sleep(self.opts.sleeptime)

        self.callback.log(name + ": end")
        return result

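    # Illustrative usage (not part of the original module; the playbook path is
    # hypothetical): run_ansible_playbook("-c ssh /etc/copr/spawn.yml",
    # "spawning instance") runs the shell command
    #   ansible-playbook -c ssh /etc/copr/spawn.yml
    # and retries, up to `attempts` times, whenever it exits with a non-zero status.
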
    def validate_vm(self):
        """
        Test connectivity to the VM; the address is taken from ``self.vm_ip``.

        :raises: :py:class:`~backend.exceptions.CoprWorkerSpawnFailError`: validation fails
        """
        # we were getting some dead instances
        # that's why I'm testing the connectivity here
        runner_options = dict(
            remote_user="root",
            host_list="{},".format(self.vm_ip),
            pattern=self.vm_ip,
            forks=1,
            transport=self.opts.ssh.transport,
            timeout=500
        )
        connection = ansible.runner.Runner(**runner_options)
        connection.module_name = "shell"
        connection.module_args = "echo hello"

        try:
            res = connection.run()
        except Exception as exception:
            raise CoprWorkerSpawnFailError(
                "Failed to check created VM ({}) "
                "due to ansible error: {}".format(self.vm_ip, exception))

        if self.vm_ip not in res.get("contacted", {}):
            self.callback.log(
                "Worker is not responding to the testing playbook. Terminating it. "
                "Runner options: {} ".format(runner_options) +
                "Ansible raw response:\n{}".format(res))
            raise CoprWorkerSpawnFailError("Created VM ({}) was unresponsive "
                                           "and therefore terminated".format(self.vm_ip))

    def try_spawn(self, args):
        """
        Tries to spawn a new VM using ansible

        :param args: arguments for the ansible command which spawns the VM
        :return str: valid ip address of the new machine (machine availability is not guaranteed)
        """
        result = self.run_ansible_playbook(args, "spawning instance")
        if not result:
            raise CoprWorkerSpawnFailError("No result, trying again")
        match = re.search(r'IP=([^\{\}"]+)', result, re.MULTILINE)

        if not match:
            raise CoprWorkerSpawnFailError("No ip in the result, trying again")
        ipaddr = match.group(1)
        match = re.search(r'vm_name=([^\{\}"]+)', result, re.MULTILINE)

        if match:
            self.vm_name = match.group(1)
        self.callback.log("got instance ip: {0}".format(ipaddr))

        try:
            IP(ipaddr)
        except ValueError:
            # if we get here we're in trouble
            msg = "Invalid IP back from spawn_instance - dumping cache output\n"
            msg += str(result)
            raise CoprWorkerSpawnFailError(msg)

        return ipaddr

    def spawn_instance(self):
        """
        Spawn a new VM, executing the following steps:

            - call the spawn playbook to startup/provision a building instance
            - get an IP and test if the builder responds
            - repeat this until you get an IP of a working builder

        On success the address is stored in ``self.vm_ip``; returns None
        when no spawn playbook is configured for the group.
        """

        start = time.time()

        # Ansible playbook python API does not work here, dunno why. See:
        # https://groups.google.com/forum/#!topic/ansible-project/DNBD2oHv5k8

        try:
            spawn_playbook = self.opts.build_groups[self.group_id]["spawn_playbook"]
        except KeyError:
            return

        spawn_args = "-c ssh {}".format(spawn_playbook)

        # TODO: replace with for i in range(MAX_SPAWN_TRIES): ... else raise FatalError
        i = 0
        while self.vm_ip is None:
            i += 1
            try:
                self.callback.log("Spawning a builder. Try No. {0}".format(i))

                self.vm_ip = self.try_spawn(spawn_args)
                self.update_process_title()
                try:
                    self.validate_vm()
                except CoprWorkerSpawnFailError:
                    self.terminate_instance()
                    raise

                self.callback.log("Instance spawn/provision took {0} sec"
                                  .format(time.time() - start))

            except CoprWorkerSpawnFailError as exception:
                self.callback.log("VM Spawn attempt failed with message: {}"
                                  .format(exception.msg))

    def terminate_instance(self):
        """
        Call the terminate playbook to destroy the building instance
        """
        self.update_process_title(suffix="Terminating VM")
        term_args = {}
        if "ip" in self.opts.terminate_vars:
            term_args["ip"] = self.vm_ip
        if "vm_name" in self.opts.terminate_vars:
            term_args["vm_name"] = self.vm_name

        try:
            playbook = self.opts.build_groups[self.group_id]["terminate_playbook"]
        except KeyError:
            self.callback.log(
                "Fatal error: no terminate playbook for group_id: {}; exiting"
                .format(self.group_id))
            sys.exit(255)

        # args = "-c ssh -i '{0},' {1} {2}".format(
        args = "-c ssh {} {}".format(
            # self.vm_ip,
            playbook,
            ans_extra_vars_encode(term_args, "copr_task"))

        try:
            self.run_ansible_playbook(args, "terminate instance")
        except Exception as error:
            self.callback.log("Failed to terminate an instance: vm_name={}, vm_ip={}. Original error: {}"
                              .format(self.vm_name, self.vm_ip, error))

        # TODO: should we check that machine was destroyed?
        self.vm_ip = None
        self.vm_name = None
        self.update_process_title()

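    # Illustrative example (not part of the original module; the playbook path
    # and IP are hypothetical): with terminate_vars containing "ip", the shell
    # command built above is roughly
    #   ansible-playbook -c ssh /etc/copr/terminate.yml --extra-vars='{"copr_task": {"ip": "172.16.3.4"}}'
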
    def mark_started(self, job):
        """
        Send data about the started build to the frontend
        """

        job.status = 3  # running
        build = job.to_dict()
        self.callback.log("build: {}".format(build))

        data = {"builds": [build]}
        try:
            self.frontend_callback.update(data)
        except Exception:
            raise CoprWorkerError(
                "Could not communicate to front end to submit status info")

    def return_results(self, job):
        """
        Send the build results to the frontend
        """
        self.callback.log(
            "{0} status {1}. Took {2} seconds".format(
                job.build_id, job.status, job.ended_on - job.started_on))

        self.callback.log("build: {}".format(job.to_dict()))
        data = {"builds": [job.to_dict()]}

        try:
            self.frontend_callback.update(data)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to submit results: {}"
                .format(err)
            )

    def starting_build(self, job):
        """
        Announce to the frontend that a build is starting.

        :return True: if the build can start
        :return False: if the build can not start (build is cancelled)
        """

        try:
            can_start = self.frontend_callback.starting_build(job.build_id, job.chroot)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to submit results: {}"
                .format(err)
            )

        return can_start

    @classmethod
    def pkg_built_before(cls, pkg, chroot, destdir):
        """
        Check whether the package has already been built in this chroot.
        """
        s_pkg = os.path.basename(pkg)
        pdn = s_pkg.replace(".src.rpm", "")
        resdir = "{0}/{1}/{2}".format(destdir, chroot, pdn)
        resdir = os.path.normpath(resdir)
        if os.path.exists(resdir) and os.path.exists(os.path.join(resdir, "success")):
            return True
        return False

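    # Illustrative example (not part of the original module; paths are made up):
    #   pkg_built_before("/tmp/hello-2.8-1.fc20.src.rpm", "fedora-20-x86_64",
    #                    "/var/lib/copr/results/bob/proj")
    # returns True only if the marker file
    #   /var/lib/copr/results/bob/proj/fedora-20-x86_64/hello-2.8-1.fc20/success
    # already exists.
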
    def spawn_instance_with_check(self):
        """
        Wrapper around self.spawn_instance() with exception checking

        :raises:

            - :py:class:`~backend.exceptions.CoprWorkerError`: spawn function didn't provide an ip
            - :py:class:`AnsibleError`: failure during ansible command execution
        """
        self.update_process_title(suffix="Spawning a new VM")
        try:
            self.spawn_instance()
            if not self.vm_ip:
                # TODO: maybe add specific exception?
                raise CoprWorkerError(
                    "No IP found from creating instance")
        except AnsibleError as e:
            register_build_result(self.opts, failed=True)

            self.callback.log("failure to setup instance: {0}".format(e))
            raise

    def init_fedmsg(self):
        """
        Initialize Fedmsg
        (this assumes there are certs and a fedmsg config on disk)
        """

        if not (self.opts.fedmsg_enabled and fedmsg):
            return

        try:
            fedmsg.init(name="relay_inbound", cert_prefix="copr", active=True)
        except Exception as e:
            self.callback.log(
                "failed to initialize fedmsg: {0}".format(e))

    def on_pkg_skip(self, job):
        """
        Handle package skip
        """
        self._announce_start(job)
        self.callback.log(
            "Skipping: package {0} has already been built before.".format(job.pkg))
        job.status = BuildStatus.SKIPPED  # skipped
        self._announce_end(job)

    def obtain_job(self):
        """
        Retrieves a new build task from the queue.
        Checks whether the new job can be started or should be skipped.
        """
        self.update_process_title(suffix="No task")

        # this sometimes caused TypeError in a random worker
        # when another one picked up a task to build
        # why?
        try:
            task = self.task_queue.dequeue()
        except TypeError:
            return
        if not task:
            return

        # import ipdb; ipdb.set_trace()
        job = BuildJob(task.data, self.opts)

        self.update_process_title(suffix="Task: {} chroot: {}".format(job.build_id, job.chroot))

        # Checking whether the build is not cancelled
        if not self.starting_build(job):
            return

        # Checking whether to build or skip
        if self.pkg_built_before(job.pkg, job.chroot, job.destdir):
            self.on_pkg_skip(job)
            return

        # FIXME
        # this is our best place to sanity check the job before starting
        # up any longer process

        return job

    def do_job(self, job):
        """
        Executes new job.

        :param job: :py:class:`~backend.job.BuildJob`
        """
        self._announce_start(job)
        status = BuildStatus.SUCCEEDED
        chroot_destdir = os.path.normpath(job.destdir + '/' + job.chroot)

        # setup our target dir locally
        if not os.path.exists(chroot_destdir):
            try:
                os.makedirs(chroot_destdir)
            except (OSError, IOError) as e:
                msg = "Could not make results dir" \
                      " for job: {0} - {1}".format(chroot_destdir, str(e))

                self.callback.log(msg)
                status = BuildStatus.FAILURE

        if status == BuildStatus.SUCCEEDED:
            # FIXME
            # need a plugin hook or some mechanism to check random
            # info about the pkgs
            # this should use ansible to download the pkg on
            # the remote system
            # and run a series of checks on the package before we
            # start the build - most importantly license checks.

            self.callback.log(
                "Starting build: id={0} builder={1} timeout={2} destdir={3}"
                " chroot={4} repos={5}"
                .format(job.build_id, self.vm_ip, job.timeout, job.destdir,
                        job.chroot, str(job.repos)))

            self.callback.log("Building pkgs: {0}".format(job.pkg))

            chroot_repos = list(job.repos)
            chroot_repos.append(job.results + job.chroot + '/')
            chroot_repos.append(job.results + job.chroot + '/devel/')

            chroot_logfile = "{0}/build-{1}.log".format(
                chroot_destdir, job.build_id)

            macros = {
                "copr_username": job.project_owner,
                "copr_projectname": job.project_name,
                "vendor": "Fedora Project COPR ({0}/{1})".format(
                    job.project_owner, job.project_name)
            }

            try:
                mr = MockRemote(
                    builder_host=self.vm_ip, job=job, repos=chroot_repos,
                    macros=macros, opts=self.opts, lock=self.lock,
                    callback=CliLogCallBack(quiet=True, logfn=chroot_logfile),
                )
                mr.check()

                build_details = mr.build_pkg()
                job.update(build_details)

                if self.opts.do_sign:
                    mr.add_pubkey()

                register_build_result(self.opts)

            except MockRemoteError as e:
                # record and break
                self.callback.log("{0} - {1}".format(self.vm_ip, e))
                status = BuildStatus.FAILURE
                register_build_result(self.opts, failed=True)

        self.callback.log(
            "Finished build: id={0} builder={1} timeout={2} destdir={3}"
            " chroot={4} repos={5}"
            .format(job.build_id, self.vm_ip, job.timeout, job.destdir,
                    job.chroot, str(job.repos)))

        job.status = status
        self._announce_end(job)
        self.update_process_title(suffix="Task: {} chroot: {} done"
                                  .format(job.build_id, job.chroot))

    def check_vm_still_alive(self):
        """
        Ensure that if we have vm_ip it is alive.
        Terminates unresponsive instance.
        """
        if self.vm_ip:
            # TODO: extract method: check_vm_still_alive
            try:
                self.validate_vm()
            except CoprWorkerSpawnFailError:
                self.terminate_instance()

    def update_process_title(self, suffix=None):
        title = "worker-{} {} ".format(self.group_name, self.worker_num)
        if self.vm_ip:
            title += "VM_IP={} ".format(self.vm_ip)
        if self.vm_name:
            title += "VM_NAME={} ".format(self.vm_name)
        if suffix:
            title += str(suffix)

        setproctitle(title)

    def run(self):
        """
        Worker should start up, check that it can function, and then,
        for each job it takes from the jobs queue:

            - run opts.setup_playbook to create the instance
            - do the build (mockremote)
            - terminate the instance

        """
        self.init_fedmsg()

        while not self.kill_received:
            self.update_process_title()
            self.check_vm_still_alive()

            if self.opts.spawn_in_advance and not self.vm_ip:
                self.spawn_instance_with_check()

            job = self.obtain_job()
            if not job:
                time.sleep(self.opts.sleeptime)
                continue

            if not self.vm_ip:
                self.spawn_instance_with_check()

            try:
                self.do_job(job)
            except Exception as error:
                self.callback.log("Unhandled build error: {}".format(error))
            finally:
                # clean up the instance
                self.terminate_instance()