From 9d5fe06e36c6bf7d2e6854b8f9c703a1779b81c2 Mon Sep 17 00:00:00 2001
From: Sylvie Gouverneyre <sgouvern@redhat.com>
Date: May 25 2020 09:51:24 +0000
Subject: Issue 49850 -ldbm_get_nonleaf_ids() painfully slow for databases with many non-leaf entries


Description:
With a very large database, gathering non-leaf IDs for creating the ancestorid index took an enormous amount of time.
This test is to verify the fix for this problem.

https://pagure.io/389-ds-base/issue/49850

Author: sgouvern

Reviewed by: firstyear, spichugi

---

diff --git a/dirsrvtests/tests/suites/import/regression_test.py b/dirsrvtests/tests/suites/import/regression_test.py
index e78d338..dedc991 100644
--- a/dirsrvtests/tests/suites/import/regression_test.py
+++ b/dirsrvtests/tests/suites/import/regression_test.py
@@ -5,6 +5,7 @@
 # See LICENSE for details.
 # --- END COPYRIGHT BLOCK ---
 #
+from decimal import *
 import ldap
 import logging
 import os
@@ -19,6 +20,8 @@ from lib389._constants import DEFAULT_SUFFIX
 from lib389.tasks import *
 from lib389.idm.user import UserAccounts
 from lib389.idm.directorymanager import DirectoryManager
+from lib389.dbgen import *
+from lib389.utils import *
 
 pytestmark = pytest.mark.tier1
 
@@ -298,6 +301,111 @@ ou: myDups00001
     log.info('Error log should have "Duplicated DN detected" message')
     assert standalone.ds_error_log.match('.*Duplicated DN detected.*')
 
+@pytest.mark.bz1749595
+@pytest.mark.tier2
+@pytest.mark.xfail(ds_is_older("1.3.10.1"), reason="bz1749595 not fixed on versions older than 1.3.10.1")
+def test_large_ldif2db_ancestorid_index_creation(topo):
+    """Import with ldif2db a large file - check that the ancestorid index creation phase has a correct performance 
+
+    :id: fe7f78f6-6e60-425d-ad47-b39b67e29113
+    :setup: Standalone instance
+    :steps:
+        1. Delete the previous errors log to start from a fresh one
+        2. Create test suffix and backend 
+        3. Create a large nested ldif file
+        4. Stop the server
+        5. Run an offline import
+        6. Restart the server
+        7. Check in the errors log that an independant ancestorid IDs sorting is done
+        8. Get the log of the starting of the ancestorid indexing process
+        9. Get the log of the end of the ancestorid indexing process
+        10. Get the start and end time for ancestorid index creation from these logs
+        11. Calculate the duration of the ancestorid indexing process
+    :expectedresults:
+        1. Success
+        2. Test suffix and backend successfully created
+        3. ldif file successfully created
+        4. Success
+        5. Import is successfully performed
+        6. Success
+        7. Log of ancestorid sorting start and end are present
+        8. Log of the beginning of gathering ancestorid is found
+        9. Log of the final ancestorid index creation is found
+        10. Start and end times are successfully extracted
+        11. The duration of the ancestorid index creation process should be less than 10s
+    """
+    
+    ldif_dir = topo.standalone.get_ldif_dir()
+    ldif_file = os.path.join(topo.standalone.ds_paths.ldif_dir, 'large_nested.ldif')
+
+    # Have a reasonable balance between the need for a large ldif file to import and the time of test execution
+    # total number of users
+    num_users = 100000
+
+    # Choose a limited number of users per node to get as much as possible non-leaf entries
+    node_limit = 5 
+
+    # top suffix
+    suffix = 'o=test'
+
+    # backend
+    backend = 'test'
+
+    log.info('Delete the previous errors logs')
+    topo.standalone.deleteErrorLogs()
+
+    log.info('Add suffix:{} and backend: {}...'.format(suffix, backend))
+                                                         
+    backends = Backends(topo.standalone)
+    backends.create(properties={'nsslapd-suffix': suffix,
+                                'name': backend})
+
+    props = {
+        'numUsers' : num_users,
+        'nodeLimit' : node_limit,
+        'suffix' : suffix
+    }
+    instance = topo.standalone
+
+    log.info('Create a large nested ldif file using dbgen : %s' % ldif_file)
+    dbgen_nested_ldif(instance, ldif_file, props)
+
+    log.info('Stop the server and run offline import...')
+    topo.standalone.stop()
+    assert topo.standalone.ldif2db(backend, None, None,
+                                   None, ldif_file)
+
+    log.info('Starting the server')
+    topo.standalone.start()
+
+    log.info('parse the errors logs to check lines with "Starting sort of ancestorid" are present')
+    start_sort_str = str(topo.standalone.ds_error_log.match(r'.*Starting sort of ancestorid non-leaf IDs*'))[1:-1]
+    assert len(start_sort_str) > 0
+
+    log.info('parse the errors logs to check lines with "Finished sort of ancestorid" are present')
+    end_sort_str = str(topo.standalone.ds_error_log.match(r'.*Finished sort of ancestorid non-leaf IDs*'))[1:-1]
+    assert len(end_sort_str) > 0
+
+    log.info('parse the error logs for the line with "Gathering ancestorid non-leaf IDs"')
+    start_ancestorid_indexing_op_str = str(topo.standalone.ds_error_log.match(r'.*Gathering ancestorid non-leaf IDs*'))[1:-1]
+    assert len(start_ancestorid_indexing_op_str) > 0
+
+    log.info('parse the error logs for the line with "Created ancestorid index"')
+    end_ancestorid_indexing_op_str = str(topo.standalone.ds_error_log.match(r'.*Created ancestorid index*'))[1:-1]
+    assert len(end_ancestorid_indexing_op_str) > 0
+
+    log.info('get the ancestorid non-leaf IDs indexing start and end time from the collected strings')
+    # Collected lines look like : '[15/May/2020:05:30:27.245967313 -0400] - INFO - bdb_get_nonleaf_ids - import userRoot: Gathering ancestorid non-leaf IDs...'
+    # We are getting the sec.nanosec part of the date, '27.245967313' in the above example
+    start_time = (start_ancestorid_indexing_op_str.split()[0]).split(':')[3]
+    end_time = (end_ancestorid_indexing_op_str.split()[0]).split(':')[3]
+  
+    log.info('Calculate the elapsed time for the ancestorid non-leaf IDs index creation')
+    etime = (Decimal(end_time) - Decimal(start_time))
+    # The time for the ancestorid index creation should be less than 10s for an offline import of an ldif file with 100000 entries / 5 entries per node
+    # Should be adjusted if these numbers are modified in the test
+    assert etime <= 10
+
 
 if __name__ == '__main__':
     # Run isolated