#2699 Fix race handling in rmtree
Merged 3 years ago by tkopecek. Opened 3 years ago by mikem.
mikem/koji rmtree-updates  into  master

file modified
+65 -46
@@ -428,6 +428,11 @@ 

      setattr(object, name, value)

  

  

+ class _RetryRmtree(Exception):

+     """This exception is used internally by rmtree"""

+     # We raise this exception only when it makes sense for rmtree to retry from the top

+ 

+ 

  def rmtree(path, logger=None):

      """Delete a directory tree without crossing fs boundaries"""

      # implemented to avoid forming long paths
@@ -442,78 +447,91 @@ 

          raise koji.GenericError("Not a directory: %s" % path)

      dev = st.st_dev

      cwd = os.getcwd()

-     root = os.path.abspath(path)

+ 

      try:

-         try:

-             os.chdir(path)

-         except OSError as e:

-             if e.errno not in (errno.ENOENT, errno.ESTALE):

-                 return

-             raise

-         _rmtree(dev, root)

+         # retry loop

+         while True:

+             try:

+                 os.chdir(path)

+             except OSError as e:

+                 if e.errno in (errno.ENOENT, errno.ESTALE):

+                     # likely racing with another rmtree

+                     # if the dir doesn't exist, we're done

+                     return

+                 raise

+             try:

+                 _rmtree(dev)

+             except _RetryRmtree as e:

+                 # reset and retry

+                 os.chdir(cwd)

+                 continue

+             break

      finally:

          os.chdir(cwd)

+ 

+     # a successful _rmtree call should leave us with an empty directory

      try:

          os.rmdir(path)

      except OSError as e:

-         if e.errno == errno.ENOTEMPTY:

-             logger.warning('%s path is not empty, but it may be a phantom error caused by some'

-                            ' race condition', path, exc_info=True)

-         elif e.errno != errno.ENOENT:

+         if e.errno != errno.ENOENT:

              raise

  

  

- def _rmtree(dev, root):

+ def _rmtree(dev):

+     """Remove all contents of CWD"""

+     # This implementation avoids forming long paths and recursion. Otherwise

+     # we will have errors with very deep directory trees.

+     # - to avoid forming long paths we change directory as we go

+     # - to avoid recursion we maintain our own stack

      dirstack = []

+     # Each entry in dirstack is a list of subdirs for that level

+     # As we descend into the tree, we append a new entry to dirstack

+     # When we ascend back up after removal, we pop them off

      while True:

          dirs = _stripcwd(dev)

-         # if no dirs, walk back up until we find some

+ 

+         # if cwd has no subdirs, walk back up until we find some

          while not dirs and dirstack:

              try:

                  os.chdir('..')

-                 dirs = dirstack.pop()

-                 empty_dir = dirs.pop()

-                 try:

-                     os.rmdir(empty_dir)

-                 except OSError:

-                     # we'll still fail at the top level

-                     pass

              except OSError as e:

                  if e.errno in (errno.ENOENT, errno.ESTALE):

-                     # go back to root if chdir fails

-                     dirstack = []

-                     dirs = _stripcwd(dev)

-                     try:

-                         os.chdir(root)

-                     except OSError as e:

-                         # root has been deleted

-                         if e.errno == errno.ENOENT:

-                             return

-                         raise

-                 else:

-                     raise

+                     # likely in a race with another rmtree

+                     # however, we cannot proceed from here, so we return to the top

+                     raise _RetryRmtree(str(e))

+                 raise

+             dirs = dirstack.pop()

+ 

+             # now that we've ascended back up by one, the first dir entry is

+             # one we've just cleared, so we should remove it

+             empty_dir = dirs.pop()

+             try:

+                 os.rmdir(empty_dir)

+             except OSError:

+                 # If this happens, either something else is writing to the dir,

+                 # or there is a bug in our code.

+                 # For now, we ignore this and proceed, but we'll still fail at

+                 # the top level rmdir

+                 pass

+ 

          if not dirs:

              # we are done

              break

-         # otherwise go deeper

+ 

+         # otherwise we descend into the next subdir

          subdir = dirs[-1]

          # note: we do not pop here because we need to remember to remove subdir later

-         dirstack.append(dirs)

          try:

              os.chdir(subdir)

          except OSError as e:

-             # go back to root if subdir doesn't exist

              if e.errno == errno.ENOENT:

-                 dirstack = []

-                 try:

-                     os.chdir(root)

-                 except OSError as e:

-                     # root has been deleted

-                     if e.errno == errno.ENOENT:

-                         return

-                     raise

-             else:

-                 raise

+                 # likely in a race with another rmtree

+                 # we'll ignore this and continue

+                 # since subdir doesn't exist, we'll pop it off and forget about it

+                 dirs.pop()

+                 continue  # with dirstack unchanged

+             raise

+         dirstack.append(dirs)

  

  

  def _stripcwd(dev):
@@ -525,6 +543,7 @@ 

          # cwd has been removed by others, just return an empty list

          if e.errno in (errno.ENOENT, errno.ESTALE):

              return dirs

+         raise

      for fn in fdirs:

          try:

              st = os.lstat(fn)

file modified
+136 -9
@@ -10,6 +10,8 @@ 

  import six.moves.configparser

  import time

  import six

+ import shutil

+ import tempfile

  import unittest

  

  import requests_mock
@@ -1278,7 +1280,7 @@ 

  

          self.assertEquals(koji.util.rmtree(path), None)

          chdir.assert_called_with('cwd')

-         _rmtree.assert_called_once_with('dev', path)

+         _rmtree.assert_called_once_with('dev')

          rmdir.assert_called_once_with(path)

  

      @patch('koji.util._rmtree')
@@ -1309,7 +1311,7 @@ 

          dev = 'dev'

          stripcwd.return_value = []

  

-         koji.util._rmtree(dev, 'any')

+         koji.util._rmtree(dev)

  

          stripcwd.assert_called_once_with(dev)

          rmdir.assert_not_called()
@@ -1322,7 +1324,7 @@ 

          dev = 'dev'

          stripcwd.side_effect = (['a', 'b'], [], [])

  

-         koji.util._rmtree(dev, 'any')

+         koji.util._rmtree(dev)

  

          stripcwd.assert_has_calls([call(dev), call(dev), call(dev)])

          rmdir.assert_has_calls([call('b'), call('a')])
@@ -1337,7 +1339,7 @@ 

          rmdir.side_effect = OSError()

  

          # don't fail on anything

-         koji.util._rmtree(dev, 'any')

+         koji.util._rmtree(dev)

  

          stripcwd.assert_has_calls([call(dev), call(dev), call(dev)])

          rmdir.assert_has_calls([call('b'), call('a')])
@@ -1347,7 +1349,7 @@ 

      @patch('os.lstat')

      @patch('stat.S_ISDIR')

      @patch('os.unlink')

-     def test_stripcwd_empty(dev, unlink, isdir, lstat, listdir):

+     def test_stripcwd_empty(self, unlink, isdir, lstat, listdir):

          # simple empty directory

          dev = 'dev'

          listdir.return_value = []
@@ -1363,7 +1365,7 @@ 

      @patch('os.lstat')

      @patch('stat.S_ISDIR')

      @patch('os.unlink')

-     def test_stripcwd_all(dev, unlink, isdir, lstat, listdir):

+     def test_stripcwd_all(self, unlink, isdir, lstat, listdir):

          # test valid file + dir

          dev = 'dev'

          listdir.return_value = ['a', 'b']
@@ -1384,7 +1386,7 @@ 

      @patch('os.lstat')

      @patch('stat.S_ISDIR')

      @patch('os.unlink')

-     def test_stripcwd_diffdev(dev, unlink, isdir, lstat, listdir):

+     def test_stripcwd_diffdev(self, unlink, isdir, lstat, listdir):

          # ignore files on different devices

          dev = 'dev'

          listdir.return_value = ['a', 'b']
@@ -1408,7 +1410,7 @@ 

      @patch('os.lstat')

      @patch('stat.S_ISDIR')

      @patch('os.unlink')

-     def test_stripcwd_fails(dev, unlink, isdir, lstat, listdir):

+     def test_stripcwd_fails(self, unlink, isdir, lstat, listdir):

          # ignore all unlink errors

          dev = 'dev'

          listdir.return_value = ['a', 'b']
@@ -1430,7 +1432,7 @@ 

      @patch('os.lstat')

      @patch('stat.S_ISDIR')

      @patch('os.unlink')

-     def test_stripcwd_stat_fail(dev, unlink, isdir, lstat, listdir):

+     def test_stripcwd_stat_fail(self, unlink, isdir, lstat, listdir):

          # something else deletes a file in the middle of _stripcwd()

          dev = 'dev'

          listdir.return_value = ['will-not-exist.txt']
@@ -1444,6 +1446,131 @@ 

          isdir.assert_not_called()

  

  

+ class TestRmtree2(unittest.TestCase):

+ 

+     def setUp(self):

+         self.tempdir = tempfile.mkdtemp()

+ 

+     def tearDown(self):

+         shutil.rmtree(self.tempdir)

+ 

+     def test_rmtree_parallel_chdir_down_failure(self):

+         dirname = '%s/some-dir/' % self.tempdir

+         os.makedirs('%s/a/b/c/d/e/f/g/h/i/j/k' % dirname)

+         mock_data = {'n': 0, 'removed': False}

+         os_chdir = os.chdir

+         def my_chdir(*a, **kw):

+             # after 4 calls, remove the tree

+             # this should happen during the descent

+             # rmtree should gracefully handle this

+             mock_data['n'] += 1

+             if mock_data['n'] == 4:

+                 shutil.rmtree(dirname)

+                 mock_data['removed'] = True

+             return os_chdir(*a, **kw)

+         with mock.patch('os.chdir', new=my_chdir):

+             koji.util.rmtree(dirname)

+         if not mock_data['removed']:

+             raise Exception('mocked call not working')

+         if os.path.exists(dirname):

+             raise Exception('test directory not removed')

+ 

+     def test_rmtree_parallel_chdir_down_complex(self):

+         dirname = '%s/some-dir/' % self.tempdir

+         # For this test, we make a complex tree to remove

+         # We remove a subtree partway through to verify that the error is

+         # ignored without breaking the remaining traversal

+         for i in range(8):

+             for j in range(8):

+                 for k in range(8):

+                     os.makedirs('%s/a/%s/c/d/%s/e/f/%s/g/h' % (dirname, i, j, k))

+         mock_data = {'n': 0, 'removed': False}

+         os_chdir = os.chdir

+         def my_chdir(path):

+             mock_data['n'] += 1

+             if path == 'f':

+                 # when we hit the first f, remove the subtree

+                 shutil.rmtree(os.path.abspath(path))

+                 mock_data['removed'] = True

+             return os_chdir(path)

+         with mock.patch('os.chdir', new=my_chdir):

+             koji.util.rmtree(dirname)

+         if not mock_data['removed']:

+             raise Exception('mocked call not working')

+         if os.path.exists(dirname):

+             raise Exception('test directory not removed')

+ 

+     def test_rmtree_parallel_chdir_up_failure(self):

+         dirname = '%s/some-dir/' % self.tempdir

+         os.makedirs('%s/a/b/c/d/e/f/g/h/i/j/k' % dirname)

+         mock_data = {'n': 0, 'removed': False}

+         os_chdir = os.chdir

+         def my_chdir(path):

+             # remove the tree when we start ascending

+             # rmtree should gracefully handle this

+             mock_data['n'] += 1

+             if path == '..' and not mock_data['removed']:

+                 shutil.rmtree(dirname)

+                 mock_data['removed'] = True

+                 # os.chdir('..') might not error on normal filesystems

+                 # we'll raise ESTALE to simulate the nfs case

+                 e = OSError()

+                 e.errno = errno.ESTALE

+                 raise e

+             return os_chdir(path)

+         with mock.patch('os.chdir', new=my_chdir):

+             koji.util.rmtree(dirname)

+         if not mock_data['removed']:

+             raise Exception('mocked call not working')

+         if os.path.exists(dirname):

+             raise Exception('test directory not removed')

+ 

+     def test_rmtree_parallel_listdir_fails(self):

+         dirname = '%s/some-dir/' % self.tempdir

+         os.makedirs('%s/a/b/c/d/e/f/g/h/i/j/k' % dirname)

+         mock_data = {'n': 0, 'removed': False}

+         os_listdir = os.listdir

+         def my_listdir(*a, **kw):

+             # after 4 calls, remove the tree

+             # rmtree should gracefully handle this

+             mock_data['n'] += 1

+             if mock_data['n'] == 4:

+                 shutil.rmtree(dirname)

+                 mock_data['removed'] = True

+                 # os.listdir('.') might not error on normal filesystems

+                 # we'll raise ESTALE to simulate the nfs case

+                 e = OSError()

+                 e.errno = errno.ESTALE

+                 raise e

+             return os_listdir(*a, **kw)

+         with mock.patch('os.listdir', new=my_listdir):

+             koji.util.rmtree(dirname)

+         if not mock_data['removed']:

+             raise Exception('mocked call not working')

+         if os.path.exists(dirname):

+             raise Exception('test directory not removed')

+ 

+     def test_rmtree_parallel_new_file(self):

+         # Testing case where a separate process adds new files during after we have stripped a directory

+         # This should cause rmtree to fail

+         dirname = '%s/some-dir/' % self.tempdir

+         os.makedirs('%s/a/b/c/d/e/f/g/h/i/j/k' % dirname)

+         os_listdir = os.listdir

+         mock_data = {}

+         def my_listdir(path):

+             ret = os_listdir(path)

+             if 'b' in ret:

+                 mock_data['ping'] = 1

+                 with open('extra_file', 'w') as fo:

+                     fo.write('hello world\n')

+             return ret  # does not contain extra_file

+         with mock.patch('os.listdir', new=my_listdir):

+             with self.assertRaises(OSError):

+                 koji.util.rmtree(dirname)

+         if not mock_data.get('ping'):

+             raise Exception('mocked call not working')

+ 

+ 

  class TestMoveAndSymlink(unittest.TestCase):

      @mock.patch('koji.ensuredir')

      @mock.patch('koji.util.safer_move')

Fixes https://pagure.io/koji/issue/2698

Despite previous work on rmtree to mitigate race conditions, there are still cases where it fails.

This PR cleans up the code a bit, adds some clearer comments on the trickier bits, and expands the unit tests to cover some race conditions.

Previous work on rmtree:

Metadata Update from @tkopecek:
- Pull-request tagged with: testing-ready

3 years ago

It would be good if testing could include an nfs filesystem

I believe that nfs is likely required to replicate #2698. On local filesystems, calls to os.listdir('.') and os.chdir('..') appear to always succeed even when the directory is deleted out from under us. However, with nfs we'll see ESTALE errors

Commit 131c5fa fixes this pull-request

Pull-Request has been merged by tkopecek

3 years ago

Metadata Update from @tkopecek:
- Pull-request untagged with: testing-ready
- Pull-request tagged with: no_qe

3 years ago