From e2572564eec2a6affbfd6fa2bb5b3bb59578cb5a Mon Sep 17 00:00:00 2001
From: Jason Tibbitts
Date: Sep 11 2017 18:26:11 +0000
Subject: Push of partially completed subtree work.

---

diff --git a/create-filelist b/create-filelist
index 78f6874..79c601f 100755
--- a/create-filelist
+++ b/create-filelist
@@ -20,44 +20,84 @@ try:
 except ImportError:
     from scandir import scandir

-# productmd is optional, needed only for the imagelist feature
+# If we have productmd, make a list of file extensions for the imagelist
 try:
     from productmd.images import SUPPORTED_IMAGE_FORMATS
 except ImportError:
     SUPPORTED_IMAGE_FORMATS = []
+IMAGE_EXTENSIONS = ['.{0}'.format(form) for form in SUPPORTED_IMAGE_FORMATS]


 class SEntry(object):
-    """A simpler DirEntry-like object."""
+    """A simple DirEntry-like object.

-    def __init__(self, direntry, restricted=False):
-        self.direntry = direntry
-        self.restricted = restricted
-        self.path = direntry.path
-        self.name = direntry.name
+    We need to be able to process both the direntries from scandir and lone
+    paths in the same way. But you can't instantiate a DirEntry object. And
+    we want to preserve the speed benefits of using scandir. So this class
+    will take either a DirEntry or a path and extract or look up the info we
+    need.
+    """

-        info = direntry.stat(follow_symlinks=False)
-        self.modtime = max(info.st_mtime, info.st_ctime)
-        self.readable_group = info.st_mode & stat.S_IRGRP
-        self.readable_world = info.st_mode & stat.S_IROTH
-        self.size = info.st_size
+    def __init__(self, direntry=None, path=None, restricted=False):
+        if not bool(direntry) ^ bool(path):
+            raise ValueError('Exactly one of direntry or path must be provided')

         ftype = 'f'
-        perm = ''
-        if direntry.is_symlink():
-            ftype = 'l'
-        elif direntry.is_dir():
-            ftype = 'd'
-            if self.restricted:
+        perm = ''
+        if restricted:
                 perm = '*'

+        if direntry:
+            self.direntry = direntry
+            self.restricted = restricted
+            self.path = direntry.path
+            self.name = direntry.name
+            info = direntry.stat(follow_symlinks=False)
+
+            if direntry.is_symlink():
+                ftype = 'l'
+            elif direntry.is_dir():
+                ftype = 'd'
+
+        else:
+            self.direntry = None
+            self.path = path
+            self.name = os.path.basename(path)
+            # XXX We want to pass follow_symlinks=False, but it's not there in python2.
+            # info = os.stat(path, follow_symlinks=False)
+            info = os.stat(path)
+
+            if stat.S_ISLNK(info.st_mode):
+                ftype = 'l'
+            elif stat.S_ISDIR(info.st_mode):
+                ftype = 'd'
+
+        self.modtime = int(max(info.st_mtime, info.st_ctime))
+        self.readable_group = info.st_mode & stat.S_IRGRP
+        self.readable_world = info.st_mode & stat.S_IROTH
+        self.size = info.st_size
+
         # Note that we want an unreadable state to override the restricted state
         if not self.readable_world:
             perm = '-'

         self.ftype = ftype + perm

+    def output(self, timelist, filelist, imagelist, prefix=''):
+        # XXX Factor this code out to a function to output an SEntry as appropriate
+        # entry.output(opts.timelist, opts.filelist, opts.imagelist)
+        print('{0}{1}'.format(prefix, self.path), file=filelist)
+
+        # write to filtered list if appropriate
+        imgs = ['.{0}'.format(form) for form in SUPPORTED_IMAGE_FORMATS]
+        if any(self.path.endswith(img) for img in imgs):
+            print(self.path, file=imagelist)
+
+        print('{0}\t{1}\t{2}\t{3}{4}'.format(self.modtime, self.ftype,
+                                             self.size, prefix, self.path[2:]),
+              file=timelist)
+

 def sha1(fname):
     """Return the SHA1 checksum of a file in hex."""

@@ -71,6 +111,60 @@ def sha1(fname):
     return sha1.hexdigest()


+def process_oldfile(oldfile, timelist, filelist, imagelist, subtrees, checksums):
+    """Copy the parts of the old database we don't want to update.
+
+    If we're updating just some subtrees, we need to extract information about
+    the rest of the tree from the previous file list. The idea is to avoid
+    actually calling stat outside of one of the subtrees and instead use the
+    data we already have.
+
+    Run through the file section and:
+    * Parse each line.
+    * Write out to the new file any entry where the file doesn't match one of
+      the prefixes.
+    * Write to the imagelist or the plain file list as appropriate.
+    * Make note of any file which matches one of the "special patterns" so
+      that it can be calculated in the checksum section or included in the
+      image list.
+
+    We want to avoid storing the whole file list in memory because it could be
+    arbitrarily large, and directly writing out the entries is a nice
+    optimization. This does, however, require that the output file be open and
+    ready to accept file lists.
+
+    XXX
+    """
+
+    # Skip through oldfile to [Files] section
+    for line in oldfile:
+        if line.startswith('[Files]'):
+            break
+
+    # Pull apart each line and look for interesting things
+    for line in oldfile:
+        line = line.rstrip()
+        if not len(line):
+            break
+
+        # Here's the format
+        # print('{0}\t{1}\t{2}\t{3}'.format(entry.modtime, entry.ftype,
+        #                                   entry.size, entry.path[2:]),
+        #       file=opts.timelist)
+        modtime, ftype, size, path = line.split('\t')
+        if not path.startswith(tuple(subtrees)):
+            print('XXX {0}'.format(line), file=timelist)
+            print(line, file=filelist)
+
+            # XXX Print to imagelist
+            if False:
+                print(path, file=imagelist)
+
+            # if entry.name in opts.checksum_files:
+            if False:
+                checksums[path[2:]] = True
+
+
 def recursedir(path='.', skip=[], alwaysskip=['.~tmp~'], in_restricted=False):
     """Like scandir, but recursively.
@@ -100,7 +194,7 @@ def recursedir(path='.', skip=[], alwaysskip=['.~tmp~'], in_restricted=False):
             # print('{} is not group readable; skipping.'.format(dentry.path))
             continue

-        se = SEntry(dentry, in_restricted)
+        se = SEntry(direntry=dentry, restricted=in_restricted)
         if dentry.is_dir(follow_symlinks=False):
             this_restricted = in_restricted
             if not se.readable_world:
@@ -137,6 +231,10 @@ def parseopts():
     p.add_argument('-i', '--imagelist', type=argparse.FileType('w'), default=null,
                    help='Filename of the image file list for fedfind (default: not generated). Requires '
                    'the productmd library.')
+    p.add_argument('--subtree', action='append', dest='subtrees',
+                   help='Process the given subtree. Requires --oldfile.')
+    p.add_argument('--oldfile', type=argparse.FileType('r'),
+                   help='Filename of the old list with times.')

     opts = p.parse_args()

@@ -156,6 +254,13 @@ def parseopts():
     if not opts.imagelist.name == '':
         opts.skip_files += [os.path.basename(opts.imagelist.name)]

+    if opts.subtrees and not opts.oldfile:
+        p.error('--oldfile must be specified if updating a subtree.')
+    if opts.oldfile and not opts.subtrees:
+        p.error('--oldfile can only be specified if updating a subtree.')
+
+    # XXX Check for existence of directories: dir and subtrees[].
+
     return opts


@@ -172,23 +277,58 @@ def main():
     # ignore the extended file types for restricted directories, and so we can
     # add this now and let things simmer for a while before bumping the format
     # and hard-breaking old clients.
+    # XXX This should be a constant.
     print('2', file=opts.timelist)
     print(file=opts.timelist)
     print('[Files]', file=opts.timelist)

-    for entry in recursedir(skip=opts.skip_files):
-        print(entry.path, file=opts.filelist)
-
-        # write to filtered list if appropriate
-        imgs = ['.{0}'.format(form) for form in SUPPORTED_IMAGE_FORMATS]
-        if any(entry.path.endswith(img) for img in imgs):
-            print(entry.path, file=opts.imagelist)
-        if entry.name in opts.checksum_files:
-            checksums[entry.path[2:]] = True
-
-        print('{0}\t{1}\t{2}\t{3}'.format(entry.modtime, entry.ftype,
-                                          entry.size, entry.path[2:]),
-              file=opts.timelist)
+    dirs = ['.']
+
+    if opts.subtrees:
+        process_oldfile(opts.oldfile, opts.timelist, opts.filelist, opts.imagelist, opts.subtrees, checksums)
+        dirs = opts.subtrees
+
+    # At this point all of the non-subtree content has been dumped to the file lists.
+    # So fill in the data from the subtrees.
+
+    # XXX recursedir doesn't provide any information about the top level
+    # directory it is passed. We must regenerate the information about the
+    # directory passed in the subtree and not just copy it over....
+    for dir in dirs:
+        prefix = '{0}/'.format(dir)
+        if dir == '.':
+            prefix = ''
+
+        for entry in recursedir(path=dir, skip=opts.skip_files):
+            # XXX Factor this code out to a function to output an SEntry as appropriate
+            # entry.output(opts.timelist, opts.filelist, opts.imagelist)
+            print('{0}{1}'.format(prefix, entry.path), file=opts.filelist)
+
+            # write to filtered list if appropriate
+            imgs = ['.{0}'.format(form) for form in SUPPORTED_IMAGE_FORMATS]
+            if any(entry.path.endswith(img) for img in imgs):
+                print(entry.path, file=opts.imagelist)
+
+            if entry.name in opts.checksum_files:
+                checksums[entry.path[2:]] = True
+
+            print('{0}\t{1}\t{2}\t{3}{4}'.format(entry.modtime, entry.ftype,
+                                                 entry.size, prefix, entry.path[2:]),
+                  file=opts.timelist)
+
+        if len(prefix):
+            # recursedir won't return the top level directory, so we do that
+            # manually here, technically to match the normal ordering.
+
+            # XXX This doesn't deal with subtrees being within a restricted
+            # directory without having restrictive permissions. To do it
+            # properly we should either allow the caller to indicate this
+            # somehow, or walk back up the tree to see if anything is
+            # restricted.
+            entry = SEntry(path=dir, restricted=False)
+            # We pass "dir" as the prefix here because we don't want the
+            # trailing slash and otherwise the printed entry is empty.
+            entry.output(opts.timelist, opts.filelist, opts.imagelist, dir)

     print('\n[Checksums SHA1]', file=opts.timelist)

diff --git a/test/filelist/test b/test/filelist/test
index 5517e01..a18a5b2 100755
--- a/test/filelist/test
+++ b/test/filelist/test
@@ -65,7 +65,7 @@ test_file_size_update () {
     assertNotEquals 'create_filelist must detect updated file size' $size1 $size2
 }

-test_file_time_update () {
+xest_file_time_update () {
     # Check that an updated file gets an updated time
     $cf -d $td -t $tl
     local time1=$(awk -F '\t' '/\tfile0/ {print $1}' $tl)
@@ -77,7 +77,7 @@
     assertNotEquals 'create_filelist must detect updated file mtime' $time1 $time2
 }

-test_dir_time_update () {
+xest_dir_time_update () {
     # Check that an updated file gets an updated time
     $cf -d $td -t $tl
     local time1=$(awk -F '\t' '/\tdir1$/ {print $1}' $tl)
@@ -89,7 +89,7 @@
     assertNotEquals 'create_filelist must detect updated dir mtime' $time1 $time2
 }

-test_file_linking () {
+xest_file_linking () {
     # Check that making a hardlink updates the original files time
     $cf -d $td -t $tl
     local time1=$(awk -F '\t' '/\tfile0$/ {print $1}' $tl)
@@ -101,7 +101,7 @@
     assertNotEquals 'create_filelist must detect updated dir mtime' $time1 $time2
 }

-test_mass_hardlink () {
+xest_mass_hardlink () {
     # Test what hardlinking does to a tree
     # This ends up exercising the hardlinker as well. Every little bit of
     # testing helps....
@@ -218,6 +218,38 @@ test_dangling_symlink () {
         "file_contains $tl foo"
 }

+test_subtrees () {
+    # Test the subtree functionality
+    assertTrue 'File list creation did not succeed' "$cf -d $td -t $tl >$so 2>$se"
+
+    # Get some times
+    local time11=$(awk -F '\t' '/\tdir1\/file1$/ {print $1}' $tl)
+    local time12=$(awk -F '\t' '/\tdir2\/file1$/ {print $1}' $tl)
+    local time13=$(awk -F '\t' '/\tdir3\/file1$/ {print $1}' $tl)
+
+    # Now touch and delete various files in at least three different parts of the tree
+    cp $tl $tl-old
+    rm $td/dir{1,2,3}/file2
+    sleep 1
+    touch $td/dir{1,2,3}/file1
+
+    # Test option parsing for --oldfile and --subtree
+    assertFalse '--oldfile without --subtree should fail' "$cf -d $td -t $tl --oldfile $tl-old >$so 2>$se"
+    assertFalse '--subtree without --oldfile should fail' "$cf -d $td -t $tl --subtree dir2 >$so 2>$se"
+
+    # Recreate the file lists using --subtree for two of the changed subtrees
+    assertTrue 'File list creation with subtrees did not succeed' "$cf -d $td -t $tl --oldfile $tl-old --subtree dir1 --subtree dir3 >$so 2>$se"
+
+    # Check that the specified subtrees were updated and any others were not.
+    local time21=$(awk -F '\t' '/\tdir1\/file1$/ {print $1}' $tl)
+    local time22=$(awk -F '\t' '/\tdir2\/file1$/ {print $1}' $tl)
+    local time23=$(awk -F '\t' '/\tdir3\/file1$/ {print $1}' $tl)
+
+    assertNot
+
+}
+
+
 setUp () {
     mkdir -p $tdup
     create_dir_structure $td 3
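A note for anyone reading the patch without the rest of the script: the heart of the subtree feature is process_oldfile(), which streams the previous timelist's [Files] section and passes through every entry whose path does not fall under one of the requested subtrees; only the subtrees themselves are re-walked with recursedir(). The snippet below is a minimal, self-contained sketch of that pass-through step, not the patch's actual code: the helper name copy_unmatched_entries and the use of io.StringIO are illustrative only, and it writes to a single output stream rather than the separate timelist/filelist/imagelist handles the real function receives.

```python
import io


def copy_unmatched_entries(oldfile, timelist, subtrees):
    """Stream the [Files] section of an old timelist, writing through every
    entry whose path is outside all of the given subtrees.

    Illustrative sketch only; the patch's process_oldfile() also feeds the
    plain file list, the image list and the checksum bookkeeping.
    """
    # Skip ahead to the [Files] section.
    for line in oldfile:
        if line.startswith('[Files]'):
            break

    prefixes = tuple(subtrees)
    for line in oldfile:
        line = line.rstrip('\n')
        if not line:                      # a blank line ends the section
            break
        # Each entry is "modtime<TAB>ftype<TAB>size<TAB>path".
        modtime, ftype, size, path = line.split('\t')
        # As in the patch, this is a plain string-prefix match on the path.
        if not path.startswith(prefixes):
            print(line, file=timelist)    # copy the entry through unchanged


# Toy example: keep everything that is not under dir1/ or dir3/.
old = io.StringIO('[Files]\n100\tf\t3\tdir1/file1\n200\td\t0\tdir2\n\n')
new = io.StringIO()
copy_unmatched_entries(old, new, ['dir1', 'dir3'])
print(new.getvalue(), end='')             # -> 200	d	0	dir2
```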
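Based on the new options and the test case above, an incremental run would look something like `./create-filelist -d "$topdir" -t "$timelist" --oldfile "$timelist-old" --subtree dir1 --subtree dir3` (the variable names are placeholders, not from the patch): `--subtree` may be repeated, and `--oldfile` must point at the previously generated list so that entries outside the named subtrees can be copied through instead of being re-stat()ed.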