#44 backend: More robust ssh connection to builder
Merged 7 years ago by clime. Opened 7 years ago by praiskup.
Unknown source detect-ssh-issues into master

@@ -150,20 +150,6 @@

  

              job = self.load_job()

  

-             # first check if we do not have

-             # worker already running for the job

-             worker = None

-             for w in self.workers:

-                 if job.task_id == w.job.task_id:

-                     worker = w

-                     break

- 

-             if worker:

-                 self.log.warning("Frontend asked to run already running task '{}'...skipping..."

-                                  .format(job.task_id))

-                 worker.mark_started(job)

-                 continue
clime commented 7 years ago

Why remove this? If we get a job for which a worker is already running, we don't want to spawn another one. It should not normally happen, but it might, so we handle that case like this.

- 

              # now search db builder records for the job and

              # if we found it, just spawn a worker to reattach

              vm = self.vm_manager.get_vm_by_task_id(job.task_id)

@@ -13,8 +13,10 @@

  from ..constants import BuildStatus, build_log_format

  from ..helpers import register_build_result, get_redis_connection, get_redis_logger, \

      local_file_logger

+ from ..vm_manage import VmStates

  

  from ..msgbus import MsgBusStomp, MsgBusFedmsg

+ from ..sshcmd import SSHConnectionError

  

  

  # ansible_playbook = "ansible-playbook"
@@ -218,6 +220,19 @@

                      )

                      status = BuildStatus.FAILURE

                      register_build_result(self.opts, failed=True)

+ 

+                 except SSHConnectionError as err:

+                     self.log.exception(

+                         "SSH connection staled: {0}".format(str(err)))

+                     # The VM is unusable, don't wait for relatively slow

+                     # garbage collector.

+                     self.vm_manager.start_vm_termination(

+                             self.vm.vm_name,

+                             allowed_pre_state=VmStates.IN_USE)

+                     self.frontend_client.reschedule_build(

+                             job.build_id, job.chroot)

+                     raise VmError("SSH connection issue, build rescheduled")

+ 

                  except: # programmer's failures

                      self.log.exception("Unexpected error")

                      status = BuildStatus.FAILURE

file modified
+2 -4
@@ -298,10 +298,8 @@

  

          # ssh options

          opts.ssh = Munch()

-         opts.ssh.port = _get_conf(

-             cp, "ssh", "port", 22, mode="int")

-         opts.ssh.identity_file = _get_conf(

-             cp, "ssh", "identity_file", None, mode="path")

+         opts.ssh.builder_config = _get_conf(

+             cp, "builder_config", "builder_config", "/home/copr/.ssh/builder_config")

  

          opts.msg_buses = []

          for bus_config in glob.glob('/etc/copr/msgbuses/*.conf'):

@@ -289,14 +289,11 @@

          self.prepare_build_dir()

  

          try:

-             stdout, stderr = self.builder.build()

+             self.builder.build()

          except BuilderError as error:

              self.log.error(str(error))

              raise MockRemoteError("Builder error during job {}.".format(self.job))

  

-         self.log.info("builder.build stdout: {}, stderr: {}"

-                       .format(stdout, stderr))

- 

      def reattach_to_pkg_build(self):

          """

          :raises VmError, BuilderError
@@ -304,14 +301,11 @@

          self.log.info("Reattach to build: {}".format(self.job))

  

          try:

-             stdout, stderr = self.builder.reattach()

+             self.builder.reattach()

          except BuilderError as error:

              self.log.error(str(error))

              raise MockRemoteError("Builder error during job {}.".format(self.job))

  

-         self.log.info("builder.reattach stdout: {}, stderr: {}"

-                       .format(stdout, stderr))

- 

      def check_build_success(self):

          try:

              self.builder.check_build_success()
@@ -330,8 +324,20 @@

              self.log.error(str(error))

              raise MockRemoteError("Builder error during job {}.".format(self.job))

          finally:

-             self.builder.download_results(self.job.results_dir)

-             self.builder.download_configs(os.path.join(self.job.results_dir, "configs"))

+             # Don't ever raise exception here because such exception could

+             # overwrite some more important - not yet handled - exception.  But

+             # anyways *do our best* to download the build results and mock

+             # configuration.

+             try:

+                 self.builder.download_results(self.job.results_dir)

+             except Exception as err:

+                 self.log.exception(err)

+ 

+             try:

+                 self.builder.download_configs(os.path.join(self.job.results_dir, "configs"))

+             except Exception as err:

+                 self.log.exception(err)

+ 

              # self.add_log_symlinks()  # todo: add config option, need this for nginx

              self.log.info("End Build: {0}".format(self.job))

  

@@ -4,7 +4,6 @@

  from subprocess import Popen

  import time

  from urlparse import urlparse

- import paramiko

  import glob

  

  from backend.vm_manage import PUBSUB_INTERRUPT_BUILDER
@@ -13,6 +12,7 @@

  from ..exceptions import BuilderError, BuilderTimeOutError, RemoteCmdError, VmError

  

  from ..constants import mockchain, rsync, DEF_BUILD_TIMEOUT

+ from ..sshcmd import SSHConnectionError, SSHConnection

  

  import modulemd

  
@@ -33,9 +33,8 @@

          self._remote_basedir = self.opts.remote_basedir

          self._remote_pkg_path = None

  

-         # if we're at this point we've connected and done stuff on the host

-         self.conn = self._create_ssh_conn(username=self.opts.build_user)

-         self.root_conn = self._create_ssh_conn(username="root")

+         self.root_conn = SSHConnection(host=self.hostname, config_file=self.opts.ssh.builder_config)

+         self.conn = SSHConnection(user=self.opts.build_user, host=self.hostname, config_file=self.opts.ssh.builder_config)

  

          self.module_dist_tag = self._load_module_dist_tag()

  
@@ -77,13 +76,6 @@

      def tempdir(self, value):

          self._remote_tempdir = value

  

-     def _create_ssh_conn(self, username):

-         conn = paramiko.SSHClient()

-         conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())

-         conn.connect(hostname=self.hostname, port=self.opts.ssh.port,

-                      username=username, key_filename=self.opts.ssh.identity_file)

-         return conn

- 

      def _run_ssh_cmd(self, cmd, as_root=False):

          """

          Executes single shell command remotely
@@ -99,17 +91,9 @@

  

          self.log.info("BUILDER CMD: "+cmd)

  

-         try:

-             stdin, stdout, stderr = conn.exec_command(cmd)

-         except paramiko.SSHException as err:

-             raise RemoteCmdError("Paramiko failure.",

-                                  cmd, -1, as_root, str(err), "(none)")

- 

-         rc = stdout.channel.recv_exit_status()

-         out, err = stdout.read(), stderr.read()

- 

+         rc, out, err = conn.run_expensive(cmd)

          if rc != 0:

-             raise RemoteCmdError("Remote command error occurred.",

+             raise RemoteCmdError("Error running remote ssh command.",

                                   cmd, rc, as_root, err, out)

          return out, err

  
@@ -266,7 +250,17 @@

  

          buildcmd += self.remote_pkg_path

  

-         buildcmd_async = '{buildcmd} &>> {livelog} &'.format(

+         # To run something on background, we need to:

+         # - ignore SIGHUP, 'nohup' is racy -> sighup'ed by ssh before signal

+         #   handler is actually set by nohup

+         # - make sure to not have attached std{out,err} descriptors to the pty

+         #   provided by 'ssh -t' (redirect or close them!); otherwise 'ssh -t'

+         #   hangs here till the command finishes, OTOH ...

+         # - doing &>{livelog} means that the mockchain command has no terminal

+         #   on std{out,err}, which means that its output is not line-buffered

+         #   (which wouldn't make for a very useful live log), so let's use

+         #   `unbuffer` from expect.rpm to allocate a _persistent_ server-side

+         #   pseudo-terminal

+         buildcmd_async = 'trap "" SIGHUP; unbuffer {buildcmd} &>{livelog} &'.format(

              livelog=self.livelog_name, buildcmd=buildcmd)

  

          self._run_ssh_cmd(buildcmd_async)
@@ -308,14 +302,22 @@

          try:

              pidof_cmd = "/usr/bin/pgrep -u {user} {command}".format(

                  user=self.opts.build_user, command="mockchain")

-             out, err = self._run_ssh_cmd(pidof_cmd)

-         except RemoteCmdError as err:

+             out, _ = self._run_ssh_cmd(pidof_cmd)

+         except RemoteCmdError:

              self.log.info("Build is not running. Continuing...")

-             return None, None

+             return

+ 

+         ensure_dir_exists(self.job.results_dir, self.log)

+         live_log = os.path.join(self.job.results_dir, 'mockchain-live.log')

+ 

+         live_cmd = '/usr/bin/tail -f --pid={pid} {log}'.format(

+             pid=out.strip(), log=self.livelog_name)

+ 

+         self.log.info("Attaching to live build log: " + live_cmd)

+         with open(live_log, 'w') as logfile:

+             # Ignore the exit status.

+             self.conn.run(live_cmd, stdout=logfile, stderr=logfile)

  

-         self.log.info("Attaching to live build log...")

-         return self._run_ssh_cmd('/usr/bin/tail -f --pid={pid} {log}'

-                                  .format(pid=out.strip(), log=self.livelog_name))

  

      def build(self):

          # make mock config
@@ -328,13 +330,10 @@

          self.run_mockchain_async()

  

          # attach to building output

-         stdout, stderr = self.attach_to_build()

- 

-         return stdout, stderr

+         self.attach_to_build()

  

      def reattach(self):

-         stdout, stderr = self.attach_to_build()

-         return stdout, stderr

+         self.attach_to_build()

  

      def rsync_call(self, source_path, target_path):

          ensure_dir_exists(target_path, self.log)

@@ -0,0 +1,127 @@

+ import os

+ import subprocess

+ import select

+ import socket

+ 

+ class SSHConnectionError(Exception):

+     pass

+ 

+ class SSHConnection(object):

+     """

+     SSH connection representation.

+ 

+     This class heavily depends on the native openssh client configuration file

+     (man 5 ssh_config).  An example configuration might be:

+ 

+         $ cat /home/copr/.ssh/builder_config

+         Host *

+         # For dynamically started VMs.

+         StrictHostKeyChecking no

+         UserKnownHostsFile /dev/null

+ 

+         # For non-default paths to identity file.

+         IdentityFile ~/.ssh/id_rsa_copr

+ 

+         # Ensure the remote command uses proper line buffering, so that its

+         # output can be streamed in real time (so called live logs).

+         RequestTTY=force

+ 

+         # Keep control sockets open, to speedup subsequent command runs.

+         ControlPath=/home/copr/ssh_socket_%h_%p_%r

+         ControlMaster=auto

+         ControlPersist=900

+ 

+     Then just use:

+     SSHConnection(user='mockbuilder', host=vm_ip,

+                   config_file='/home/copr/.ssh/builder_config')

+ 

+     :param user:

+         Remote user name.  'root' by default.

+     :param host:

+         Remote hostname or IP.  'localhost' by default.

+     :param config_file:

+         Full (absolute) path to the ssh config file to be used.  None (the

+         default) means the default ssh configuration is used (/etc/ssh_config

+         and ~/.ssh/config).

+     """

+ 

+     def __init__(self, user=None, host=None, config_file=None):

+         self.config_file = config_file

+         self.user = user or 'root'

+         self.host = host or 'localhost'

+ 

+     def _ssh_base(self):

+         cmd = ['ssh']

+         if self.config_file:

+             cmd = cmd + ['-F', self.config_file]

+         cmd.append('{0}@{1}'.format(self.user, self.host))

+         return cmd

+ 

+     def _run(self, user_command, stdout, stderr):

+         real_command = self._ssh_base() + [user_command]

+         proc = subprocess.Popen(real_command, stdout=stdout, stderr=stderr)

+         retval = proc.wait()

+         if retval == 255:

+             # Because we don't manage the control path (that's done in ssh

+             # configuration), we can not really tell whether the 255 exit

+             # status means an ssh connection error or a remote command error.

+             raise SSHConnectionError("Connection broke.")

+ 

+         return retval

+ 

+     def run(self, user_command, stdout=None, stderr=None):

+         """

+         Run user_command (blocking) and redirect stdout and/or stderr into

+         pre-opened python file objects.  When stdout/stderr is not set, the

+         corresponding output is ignored.

+ 

+         :param user_command:

+             Command (string) to be executed (note: use pipes.quote).

+ 

+         :param stdout:

+             File descriptor to write standard output into.

+ 

+         :param stderr:

+             File descriptor to write standard error output into.

+ 

+         :returns:

+             Exit status (int) of the user_command.

+ 

+         :type user_command: str

+         :type stdout: file or None

+         :type stderr: file or None

+         :rtype: int

+ 

+         """

+         rc = -1

+         with open(os.devnull, "w") as devnull:

+             rc = self._run(user_command, stdout or devnull, stderr or devnull)

+         return rc

+ 

+     def run_expensive(self, user_command):

+         """

+         Run user_command (blocking) and return exit status together with

+         standard outputs in string variables.  Note that this can pretty easily

+         waste a lot of memory; run() is a better option.

+ 

+         :param user_command:

+             Command (string) to be run (note: use pipes.quote).

+ 

+         :returns:

+             Triple (rc, stdout, stderr).  Stdout and stderr are strings; those

+             might be pretty large.

+         """

+         real_command = self._ssh_base() + [user_command]

+         proc = subprocess.Popen(real_command, stdout=subprocess.PIPE,

+                                 stderr=subprocess.PIPE)

+         stdout, stderr = proc.communicate()

+         if proc.returncode == 255:

+             # Value 255 means either that 255 was returned by remote command or

+             # the ssh connection broke.  Because we need to handle "Connection

+             # broke" issues (resubmit build e.g.), please avoid situations when

+             # remote command returns value 255 (or -1).

+             raise SSHConnectionError(

+                 "Connection broke:\nOUT:\n{0}\nERR:\n{1}".format(

+                     stdout, stderr))

+ 

+         return proc.returncode, stdout, stderr

@@ -1,6 +1,5 @@

  # coding: utf-8

  import json

- import paramiko

  #from setproctitle import setproctitle

  # from multiprocessing import Process

  #from threading import Thread
@@ -10,6 +9,7 @@

  from backend.vm_manage.executor import Executor

  

  from ..helpers import get_redis_logger

+ from ..sshcmd import SSHConnection

  

  

  def check_health(opts, vm_name, vm_ip):
@@ -31,13 +31,10 @@

  

      err_msg = None

      try:

-         conn = paramiko.SSHClient()

-         conn.set_missing_host_key_policy(paramiko.AutoAddPolicy())

-         conn.connect(hostname=vm_ip, port=opts.ssh.port,

-                      username=opts.build_user or "root",

-                      key_filename=opts.ssh.identity_file)

-         stdin, stdout, stderr = conn.exec_command("echo hello")

-         stdout.channel.recv_exit_status()

+         conn = SSHConnection(opts.build_user or "root", vm_ip)

+         rc, stdout, _ = conn.run_expensive("echo hello")

+         if rc != 0 or stdout != "hello\n":

+             err_msg = "Unexpected check output"

      except Exception as error:

          err_msg = "Healtcheck failed for VM {} with error {}".format(vm_ip, error)

          log.exception(err_msg)

@@ -42,7 +42,6 @@

  BuildRequires: python-retask

  BuildRequires: python-copr >= 1.60

  BuildRequires: python-IPy

- BuildRequires: python-paramiko

  BuildRequires: python-psutil

  BuildRequires: python-futures

  BuildRequires: python-dateutil
@@ -83,7 +82,6 @@

  Requires:   gawk

  Requires:   crontabs

  Requires:   prunerepo

- Requires:   python-paramiko

  Requires:   python-lockfile

  Requires:   python2-modulemd

  # Requires:   python-ipdb

no initial comment

2 new commits added

  • [backend] reschedule build in case of SSH issues
  • [backend] allow using raw ssh instead of paramiko
7 years ago

It's very complex. Personally, I would opt to use only the pure ssh command-line client if paramiko cannot cover our current needs. Additionally, it would be better to rely only on ssh_config for setting up connections because, in the future, you could even have different settings for different hosts. The only backend config options could perhaps be port, config_path, identity_file_path. And we could also avoid those connects and disconnects altogether and just have something like a run method that captures output.

It's very complex.

Yup, sorry for that. OTOH, except for the inevitable bugs, as in every PR :) --
it does basically what it declares (plus it allows us to move from
run_expensive() to run() in some cases), so yes ...
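
For illustration, a minimal sketch of the run() vs run_expensive() difference
(the host, command, and log path here are made up, not part of this PR):

    from backend.sshcmd import SSHConnection, SSHConnectionError

    conn = SSHConnection(user='mockbuilder', host='192.168.1.10',
                         config_file='/home/copr/.ssh/builder_config')
    try:
        # Small output: capture it in memory.
        rc, out, err = conn.run_expensive('rpm -q mock')

        # Potentially huge output: stream it into a file instead.
        with open('/tmp/builder.log', 'w') as logfile:
            rc = conn.run('cat /var/log/messages',
                          stdout=logfile, stderr=logfile)
    except SSHConnectionError:
        # Exit status 255: the ssh transport (most likely) broke; the caller
        # should treat the VM as unusable and reschedule the build.
        pass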

Personally, I would opt to use only the pure ssh command-line client if paramiko
cannot cover our current needs.

... I could drop Paramiko and other parts. But because I'm not sure what
side-effects the command-line ssh client will have, I would rather keep the
paramiko option at least for some time period (so that you, and especially
Fedora's copr, can go back without too much hassle). Btw. the Paramiko stuff is
almost unchanged from 4028fea, except for some additions -- so we should have a
non-breaking path forward.

Additionally, it would be better to rely only on ssh_config for setting up
connections because, in the future, you could even have different settings for
different hosts.

Yes! Per-builder config (or per set-of-builders config) would make sense. But
in case we go that direction -- shouldn't this be configured in copr-be.conf
anyway?

The only backend config options could perhaps be port, config_path,
identity_file_path.

No problem to iterate, though there might be a requirement to have per-host
port configuration too, similarly to IdentityFile.

It would sound a bit awkward to have per-builder ssh configuration stored
somewhere on the backend -- or what is your plan? Some config format proposal?

And we could also avoid those connects and disconnects altogether and just
have something like a run method that captures output.

The point of having connect() and disconnect() is efficiency, though.
There's a delay after the first connection, but the following commands run
immediately. It is meant to be library code, originally developed at
http://github.com/praiskup/python-sshcmd (a playground basically, but it could
mature in the future).

So basically, I'll cut out whatever parts you request for copr purposes.

Perhaps I could connect() automatically after the first _run_ssh_cmd()?

Not doing disconnect() explicitly means that sshd needs to clean up the
connections somehow automagically, which is not a problem I guess (sshd is
pretty robust), but how should I set ControlPersist, which is now 7200?

rebased

7 years ago

It's very complex.

Yup, sorry for that. OTOH, except for the inevitable bugs, as in every PR :) --
it does basically what it declares (plus it allows us to move from
run_expensive() to run() in some cases), so yes ...

I don't know what run_expensive does from the name, which is not optimal.

Personally, I would opt to use only the pure ssh command-line client if paramiko
cannot cover our current needs.

... I could drop Paramiko and other parts. But because I'm not sure what
side-effects the command-line ssh client will have, I would rather keep the
paramiko option at least for some time period (so that you, and especially
Fedora's copr, can go back without too much hassle). Btw. the Paramiko stuff is
almost unchanged from 4028fea, except for some additions -- so we should have a
non-breaking path forward.

I think the ssh client will do the job just as well.

Additionally, it would be better to rely only on ssh_config for setting up
connections because, in the future, you could even have different settings for
different hosts.

Yes! Per-builder config (or per set-of-builders config) would make sense. But
in case we go that direction -- shouldn't this be configured in copr-be.conf
anyway?

I don't think so. We don't want to duplicate what can already be described in ssh_config.

The only backend config options could perhaps be port, config_path,
identity_file_path.

No problem to iterate, though there might be a requirement to have per-host
port configuration too, similarly to IdentityFile.

That's true. IdentityFile and Port can also be described in ssh_config. Cool!

It would sound a bit awkward to have per-builder ssh configuration stored
somewhere on the backend -- or what is your plan? Some config format proposal?

To store it in ~/copr/ssh_config ideally.

And we could also avoid those connects and disconnects altogether and just
have something like a run method that captures output.

The point of having connect() and disconnect() is efficiency, though.
There's a delay after the first connection, but the following commands run
immediately. It is meant to be library code, originally developed at
http://github.com/praiskup/python-sshcmd (a playground basically, but it could
mature in the future).
So basically, I'll cut out whatever parts you request for copr purposes.
Perhaps I could connect() automatically after the first _run_ssh_cmd()?

I think ssh will take care of itself (creating and keeping the socket open after the first remote command is run). Just ControlMaster, ControlPath, and ControlPersist need to be set up in the ssh_config for a host group.

Not doing disconnect() explicitly means that sshd needs to clean up the
connections somehow automagically, which is not a problem I guess (sshd is
pretty robust), but how should I set ControlPersist, which is now 7200?

I don't mind keeping the sockets around. We can also use ssh(1) “-O exit”, but that's just an optimization.

I don't know what run_expensive does from the name, which is not optimal.

Any proposal for a rename?

I think the ssh client will do the job just as well.

I hope so, ACK -> I'll drop the Paramiko part then.

To store it in ~/copr/ssh_config ideally.

There's no ~/copr directory, so I guess we can use the default .ssh/config.

I think ssh will take care of itself (creating and keeping the socket open
after the first remote command is run). Just ControlMaster, ControlPath, and
ControlPersist need to be set up in the ssh_config for a host group.

Ah, I'll try that.

I don't mind keeping the sockets around. We can also use ssh(1) “-O exit”,
but that's just an optimization.

Ok.

1 new commit added

  • [backend] drop paramiko support
7 years ago

I updated the patch, please take another look.

  • There's no extra ssh configuration option now (the default /etc/ssh_config and ~/.ssh/config are used).
  • Paramiko support dropped.
  • kept the connect() and disconnect() methods .. I'm not sure how to proceed without
    them. What would be the API and sequence of ssh client calls in a step-by-step example?
  • dunno how to rename the run_expensive call .. it is documented why it is expensive, but please advise

I updated the patch, please take another look.

There's no extra ssh configuration option now (the default /etc/ssh_config and ~/.ssh/config are used).

Cool, but I can still see some ssh options, control paths, IdentityFile, port, etc. in the code. That should all be contained in the ssh_config file. The only thing that can be kept configurable is the path to the ssh_config file.

Paramiko support dropped.

Great, but the whole inheritance hierarchy from Shell down to SSHConnectionRaw is not needed now. Just one class implementing a run method should be enough. In fact, we only need the run method. I will let you decide how much you want to simplify, but the more the better.

kept the connect() and disconnect() methods .. I'm not sure how to proceed without
them. What would be the API and sequence of ssh client calls in a step-by-step example?

Just a sequence of runs is not enough? ssh itself should keep the connection open for the subsequent runs according to man pages.

dunno how to rename the run_expensive call .. it is documented why it is expensive, but please advise

You can always capture the output. That means just run is good. Well... we can do this one later.

The first two points are good, but that would diverge from the original
python-sshcmd code too much, so I wouldn't be able to sync copr and
that project. Because copr is not the only place I plan to use
python-sshcmd... I can postpone this PR till I have it packaged separately,
or we can probably drop this PR.

Just a sequence of runs is not enough? ssh itself should keep the
connection open for the subsequent runs according to man pages.

I wasn't able to find the proper set of options. IMO the first call needs
to set up the control channel (because the cli command otherwise never exits,
even if it executed some remote command).

The last one needs to terminate it. So some hint is needed :).

The first two points are good, but that would diverge from the original
python-sshcmd code too much, so I wouldn't be able to sync copr and
that project. Because copr is not the only place I plan to use
python-sshcmd... I can postpone this PR till I have it packaged separately,
or we can probably drop this PR.

We might use python-sshcmd if it's simple enough. Now, however, I would like to pick the simplest solution available.

Just a sequence of runs is not enough? ssh itself should keep the
connection open for the subsequent runs according to man pages.

I wasn't able to find the proper set of options. IMO the first call needs
to set up the control channel (because the cli command otherwise never exits,
even if it executed some remote command).
The last one needs to terminate it. So some hint is needed :).

What about this then?

Host localhost
  Hostname localhost
  User=clime
  RequestTTY=auto
  ControlPath=/home/clime/%h_%p_%r
  ControlMaster=auto
  ControlPersist=7200

What about this then?
Host localhost
Hostname localhost
User=clime
RequestTTY=auto
ControlPath=/home/clime/%h_%p_%r
ControlMaster=auto
ControlPersist=7200

With this, I can do ssh localhost, the socket is created, and then I can do another ssh localhost already on that socket. At least if I understand it correctly.
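
The socket reuse is also easy to observe from the Python side; a sketch (the
user, host, config path, and timings are illustrative only):

    import time
    from backend.sshcmd import SSHConnection

    conn = SSHConnection(user='clime', host='localhost',
                         config_file='/home/clime/.ssh/config')
    for attempt in (1, 2):
        start = time.time()
        conn.run('true')
        # With ControlMaster/ControlPath/ControlPersist in effect, the first
        # run pays the full connection setup cost; the second one reuses the
        # control socket and returns almost immediately.
        print('run #{0} took {1:.2f}s'.format(attempt, time.time() - start))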

rebased

7 years ago

Looks like it could work, thanks! Please take another look.

Nice, how did you test it?

Something like:

IdentityFile ~/.ssh/rhcopr-backend.priv
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
ServerAliveInterval 60

Host *
RequestTTY=force
ControlPath=/home/copr/ssh_socket_%h_%p_%r
ControlMaster=auto
ControlPersist=900

rebased

7 years ago

Sorry, I fixed several bugs and added this option:

[ssh]
builder_config=

So now alloc.yml (which uses raw ssh in my case) can use a slightly different
(default) ssh configuration than the MockRemote code that follows. The tested
configuration is here:

[root@dev-coprbe backend]# cat /home/copr/.ssh/config
IdentityFile ~/.ssh/rhcopr-backend.priv
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
ServerAliveInterval 60
[root@dev-coprbe backend]# cat /home/copr/.ssh/builder_config 
Host *
IdentityFile ~/.ssh/rhcopr-backend.priv
RequestTTY=force
ControlPath=/home/copr/ssh_socket_%h_%p_%r
ControlMaster=auto
ControlPersist=900
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
ServerAliveInterval 60

I meant whether you used the automated regression test suite we have ready for the backend, or whether you just tested it "by hand". "By hand" is alright in this case, even though I recommend having a look at those regression tests at some point.

By hand, unfortunately, but next week I should be able to have a look at the automated testing (or at least at how to run the tests; so far I wasn't successful).

OK, anyway feel free to merge now.

rebased

7 years ago

After a chat with @clime, I'm waiting one more day with the merge. There is some ongoing parallel work which this merge could make harder. I'll rebase and merge later.

rebased

7 years ago

rebased

7 years ago

Uh, this became a debugging nightmare TBH (don't ask me how much time I spent on this ... :) ). The major issue is that we run that many commands against the builder without being sure what is actually happening. So as a follow-up to this PR, I really, really, pretty please, ask you to reconsider doing as much work as possible builder-side (pr#50).

@clime, can you please have a look? I've tested this, but there are some changes that need to be reviewed: namely the expect package on the builder, dropping a part of 7302518, and some function return value rework.

rebased

7 years ago

Why remove this? If we get a job for which a worker is already running, we don't want to spawn another one. It should not normally happen, but it might, so we handle that case like this.

Generally I like this. I need to play around with it for a while to find out if all that magic around running mockchain in the background is needed (isn't </dev/null on the mockchain command enough instead of that sigtrap?).

We discussed this with Michal f2f, but to sum it up for the record:

Why remove this?

Because the Worker kills the VM and re-submits the job to frontend, which leads to a brand new job request... which build_dispatcher immediately and asynchronously handles by this removed code -> but there's a race where build_dispatcher still thinks the worker is alive, so all that happens is that the build is marked as running on frontend and is never processed later.

The other answer is that frontend should never resubmit the same job again without an explicit request from backend; unless there's some bug we need to chase.

Shouldn't we use (opts.build_user or "root") as the SSHConnection user? The same as for running other ssh commands. I don't think we need to ping the remote machines as root.

Note that here it would be useful to log stdout and stderr, because we get here also if e.g. the /home/copr/.ssh/builder_config file is missing, which might be quite hard to debug otherwise.
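
A sketch of what that suggestion might look like in check_health (hypothetical,
variable names follow the hunk above):

    try:
        conn = SSHConnection(opts.build_user or "root", vm_ip)
        rc, stdout, stderr = conn.run_expensive("echo hello")
        if rc != 0 or stdout != "hello\n":
            # Include the captured output in the error message, so that the
            # underlying ssh failure shows up in the logs.
            err_msg = ("Unexpected check output, rc={0}, stdout={1!r}, "
                       "stderr={2!r}".format(rc, stdout, stderr))
    except Exception as error:
        err_msg = "Healthcheck failed for VM {} with error {}".format(
            vm_ip, error)
        log.exception(err_msg)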

rebased

7 years ago

Btw., while trying to minimize changes in the API, I kept the as_root option in _run_ssh_cmd, but that's probably not needed and the whole _run_ssh_cmd could be dropped. IMO this doesn't belong in this PR, but I'm not against doing it too... PTAL.

Btw., while trying to minimize changes in the API, I kept the as_root option in _run_ssh_cmd, but that's probably not needed and the whole _run_ssh_cmd could be dropped. IMO this doesn't belong in this PR, but I'm not against doing it too... PTAL.

Thank you, but I think we can improve on this later. I have tested that this pull request generally works. We can continue working on it while it is already in master. Thank you for the great work!

Pull-Request has been merged by clime

7 years ago