#1114 Try git archive first instead of clone
Closed 3 months ago by mprahl. Opened 3 months ago by vmaljulin.
vmaljulin/fm-orchestrator FACTORY-3634  into  master

file modified
+51 -2

@@ -33,6 +33,7 @@ 

  import tempfile

  import shutil

  import datetime

+ import tarfile

  

  from module_build_service import log, conf

  from module_build_service.errors import (

@@ -145,6 +146,26 @@ 

                  cmd, proc.returncode, stdout, stderr))

          return proc.returncode, stdout, stderr

  

+     @staticmethod

+     @retry(

+         timeout=conf.scm_net_timeout,

+         interval=conf.scm_net_retry_interval,

+         wait_on=UnprocessableEntity)

+     def _run_with_tar(cmd, chdir='.'):

+         """

+         Run a shell command, parsing it's tar output

+         :param cmd: shell command to run

+         :param chdir: directory command will be run and to which files will be extracted

+         :return: shell command return code

+         """

+         p = sp.Popen(cmd, bufsize=10240, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, cwd=chdir)

+         with tarfile.open(fileobj=p.stdout, mode='r|') as tar:

+             tar.extractall(chdir)

+             (so, se) = p.communicate()

+             if se:

+                 log.warning(se)

+         return p.returncode

+ 

      def checkout(self, scmdir):

          """Checkout the module from SCM.

  

@@ -154,8 +175,36 @@ 

          """

          # TODO: sanity check arguments

          if self.scheme == "git":

-             self.sourcedir = '%s/%s' % (scmdir, self.name)

- 

+             self.sourcedir = os.path.normpath(os.path.join(scmdir, self.name))

+             try:

+                 os.makedirs(self.sourcedir)

+             except OSError as e:    # Python 2 does not support exists_ok argument

+                 if 'File exists' not in str(e):

+                     raise

+             module_archive_cmd = ['git', 'archive', '--format=tar',

+                                   '--remote=%s' % self.repository]

+             if self.commit:

+                 module_archive_cmd.append(self.commit)

+             else:

+                 module_archive_cmd.append('HEAD')

+             try:

+                 archive_ret_code = self._run_with_tar(module_archive_cmd, chdir=self.sourcedir)

+                 if archive_ret_code:

+                     raise RuntimeError('git archive returned a non-zero exit code')

+             except Exception as e:

+                 log.exception('Error occurred while trying to use a git archive. '

+                               'Fallback to git clone.')

+             else:

+                 source_dir_cnt = os.listdir(self.sourcedir)

+                 if source_dir_cnt:

+                     timestamp = os.path.getmtime(os.path.join(self.sourcedir, source_dir_cnt[0]))

+                     dt = datetime.datetime.utcfromtimestamp(int(timestamp))

+                     self.version = dt.strftime("%Y%m%d%H%M%S")

+                 else:

+                     self.version = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

+                 return self.sourcedir

+ 

+             # Fallback to clone

              module_clone_cmd = ['git', 'clone', '-q']

              if self.commit:

                  module_checkout_cmd = ['git', 'checkout', '-q', self.commit]

file modified
+12

@@ -28,6 +28,7 @@ 

  

  import module_build_service.scm

  from module_build_service.errors import ValidationError, UnprocessableEntity

+ from mock import patch

  

  base_dir = os.path.join(os.path.dirname(__file__), 'scm_data')

  repo_url = 'file://' + base_dir + '/testrepo'

@@ -137,3 +138,14 @@ 

          scm = module_build_service.scm.SCM(repo_url)

          with pytest.raises(UnprocessableEntity):

              scm.get_latest('15481faa232d66589e660cc301179867fb00842c9')

+ 

+     @patch('module_build_service.scm.SCM._run_with_tar', return_value=2)

+     @patch('logging.Logger.exception')

+     def test_simple_local_checkout_clone(self, exception_log, run_pipe):

+         """ See if we can clone a local git repo. """

+         scm = module_build_service.scm.SCM(repo_url)

+         scm.checkout(self.tempdir)

+         files = os.listdir(self.repodir)

+         assert 'foo' in files, "foo not in %r" % files

+         exception_log.assert_called_once()

+         run_pipe.assert_called_once()

rebased onto b32818ea7ab867aa9a0140faccfb9ecd296321e3

3 months ago

I really would prefer to not have this method. Invoking external processes should only be done when absolutely required. The tar operation could be done using python libraries (StringIO/BytesIO and tarfile).

This will give you the date of the file when it was copied to the local system, instead of the date of the commit.
Won't this have any implications in the build output?

I would like to suggest to separate following lines for this if branch into two methods, one is to run git-archive and another is to run git-clone. Then, following lines could be simplified:

try:
    git archive method
except Exception as e:
    # Fallback to
    git clone method

This could also simplify the tests for each of the git operations individually.

Suggest to log the error message firstly, then fallback to git-clone method.

May I ask what problem could break git-archive but git-clone works? I just thought if it is necessary to have this fallback.

May I ask what problem could break git-archive but git-clone works? I just thought if it is necessary to have this fallback.

Because some repositories don't support git archive at all.

This will give you the date of the file when it was copied to the local system, instead of the date of the commit.
Won't this have any implications in the build output?

Any idea how to do it better?

Suggest to log the error message firstly, then fallback to git-clone method.

Probably I didn't understand you, but IMO this is the way it's being done now.

formate?

Oh, I see I forgot to merge it :(

I really would prefer to not have this method. Invoking external processes should only be done when absolutely required. The tar operation could be done using python libraries (StringIO/BytesIO and tarfile).

I'll check this

Probably I didn't understand you, but IMO this is the way it's being done now.

Sorry, I meant to log underlying error messages, probably str(e), so that we can know from the log what's the concrete problem that cause fallback to git-clone operation.

Because some repositories don't support git archive at all.

Got it. Thanks. :)

This will give you the date of the file when it was copied to the local system, instead of the date of the commit.
Won't this have any implications in the build output?

Any idea how to do it better?

I don't unfortunately...
@jkaluza, how important is this time stamp?

rebased onto b0421df8e43f49b3f6da7828bc85ffb6f6c3c65c

3 months ago

I really would prefer to not have this method. Invoking external processes should only be done when absolutely required. The tar operation could be done using python libraries (StringIO/BytesIO and tarfile).

Done

with tarfile.open(...) as tar:

Why is default value for chdir None?

Missing the description of return?

Maybe use log.exception here:

except Exception:
    log.exception('Error occured while trying to use a git archive. Fallback to git clone.')

It will dump the exception automatically.

You should also keep the same retry decorator here as it is used in _run(...):

    @retry(
        timeout=conf.scm_net_timeout,
        interval=conf.scm_net_retry_interval,
        wait_on=UnprocessableEntity)

And when the p.returncode is not 0, try running it again (again, the same thing is already done in _run(...)).

The reason is that sometimes the git server might have temporary issues (like networking problem, dns problem, whatever) and we still want to retry in this case few times instead of simply failing whole module build.

I'm OK with chdir None. It simply means no chdir change and current working dir is used instead. The true is that in our current use-case, we always set chdir. But this is done this way also in current _run(...) method, so I think it's fine here too.

Hm, but the true is you will just fallback to git clone in case the git archive fails because of these temporary issues. Maybe the current way is acceptable in the end, because fallback to git clone will try retrying few times.

What do other think?

Hm, but the true is you will just fallback to git clone in case the git archive fails because of these temporary issues. Maybe the current way is acceptable in the end, because fallback to git clone will try retrying few times.
What do other think?

Looks like git archive is supported only with git protocol (i.e. won't work if url starts https://).

If you expect any git repo URLs, I would avoid using git archive altogether.

rebased onto 7cb8523

3 months ago

with tarfile.open(...) as tar:

Done

Missing the description of return?

Done

Maybe use log.exception here:
log.exception('Error occured while trying to use a git archive. Fallback to git clone.')

Done

You should also keep the same retry decorator here as it is used in _run(...):

Done

I don't think passing None to TarFile.extractall() works.

This really shouldn't happen now as long as this parameter will be always filled. But for further usages, it will be better to change it and I've done it.

This will give you the date of the file when it was copied to the local system, instead of the date of the commit.
Won't this have any implications in the build output?

Any idea how to do it better?

This is a problem and I don't know how to do this without cloning the repo.

@lucarval any ideas?

This will give you the date of the file when it was copied to the local system, instead of the date of the commit.
Won't this have any implications in the build output?
Any idea how to do it better?

This is a problem and I don't know how to do this without cloning the repo.
@lucarval any ideas?

I don't know how to get around this either :(

Okay, I think git-archve isn't a viable solution. I'll close this PR and we can close the issue.

Pull-Request has been closed by mprahl

3 months ago