#50222 Ticket 49463 After cleanALLruv, replication is looping on keep alive DEL
Closed 3 years ago by spichugi. Opened 5 years ago by aadhikari.
aadhikari/389-ds-base ticket49463  into  master

@@ -0,0 +1,227 @@ 

+ import time

+ import ldap

+ import logging

+ import pytest

+ import os

+ import re

+ from lib389._constants import DEFAULT_SUFFIX, SUFFIX, LOG_REPLICA, LOG_DEFAULT


+ from lib389.topologies import topology_m4 as topo

+ from lib389.replica import Replicas, ReplicationManager

+ from lib389.idm.user import UserAccounts, UserAccount

+ from lib389.tasks import *

+ from lib389.utils import *

+ 

+ 

+ log = logging.getLogger(__name__)

+ 

+ USER_CN = "test_user"

+ 

+ 

+ def add_user(server, no, desc='dummy'):

+     users = UserAccounts(server, DEFAULT_SUFFIX)

+     user = users.create_test_user(uid=no)

+     user.add('description', [desc])

+     user.add('objectclass', 'userSecurityInformation')

+ 

+ 

+ def pattern_errorlog(server, log_pattern):

+     # Poll the error log for up to ~50s; return the match, or None if never found

+     for _ in range(10):

+         time.sleep(5)

+         found = server.ds_error_log.match(log_pattern)

+         if found:

+             return found

+     return None

+ 

+ 

+ def fractional_server_to_replica(server, replica):

+     repl = ReplicationManager(DEFAULT_SUFFIX)

+     repl.ensure_agreement(server, replica)

+     replica_server = Replicas(server).get(DEFAULT_SUFFIX)

+     agmt_server = replica_server.get_agreements().list()[0]

+     agmt_server.replace_many(

+         ('nsDS5ReplicatedAttributeListTotal', '(objectclass=*) $ EXCLUDE telephoneNumber'),

+         ('nsDS5ReplicatedAttributeList', '(objectclass=*) $ EXCLUDE telephoneNumber'),

+         ('nsds5ReplicaStripAttrs', 'modifiersname modifytimestamp'),

+     )

+ 

+ 

+ def count_pattern_accesslog(server, log_pattern):

+     server.config.set('nsslapd-accesslog-logbuffering', 'off')

+     # Count every matching line, so a looping DEL (count > 1) can be detected

+     count = len(server.ds_access_log.match(log_pattern))

+ 

+     return count

+ 

+ 

+ def test_ticket_49463(topo):

+     """Specify a test case purpose or name here

+ 

+     :id: d1aa2e8b-e6ab-4fc6-9c63-c6f622544f2d

+     :setup: Replication topology with four masters (M1 to M4)

+     :steps:

+         1. Enable fractional replication

+         2. Enable replication logging

+         3. Check that replication is working fine

+         4. Generate skipped updates to create keep alive entries

+         5. Remove M3 from the topology

+         6. Issue a cleanAllRuv FORCE task on M1; it then propagates to M2 and M4

+         7. Check that the number of DELs of keep alive entry '3' is <= 1

+         8. Check that M1 is the originator of cleanAllRuv and M2/M4 the propagated ones

+         9. Check that replication between M1, M2 and M4 can recover

+         10. Remove M4 from the topology

+         11. Issue a non-force cleanAllRuv while M2 is stopped (this hangs the cleanAllRuv task)

+         12. Check that nsds5ReplicaCleanRUV is correctly encoded on M1 (last value: 1)

+         13. Check that nsds5ReplicaCleanRUV encoding survives M1 restart

+         14. Check that nsds5ReplicaCleanRUV encoding is valid on M2 (last value: 0)

+         15. Check that (for M4 cleanAllRUV) M1 is Originator and M2 propagation

+     :expectedresults:

+         1. No report of failure when the RUV is updated

+     """

+ 

+     # Step 1 - Configure fractional (skip telephonenumber) replication

+     M1 = topo.ms["master1"]

+     M2 = topo.ms["master2"]

+     M3 = topo.ms["master3"]

+     M4 = topo.ms["master4"]

+     repl = ReplicationManager(DEFAULT_SUFFIX)

+     fractional_server_to_replica(M1, M2)

+     fractional_server_to_replica(M1, M3)

+     fractional_server_to_replica(M1, M4)

+ 

+     fractional_server_to_replica(M2, M1)

+     fractional_server_to_replica(M2, M3)

+     fractional_server_to_replica(M2, M4)

+ 

+     fractional_server_to_replica(M3, M1)

+     fractional_server_to_replica(M3, M2)

+     fractional_server_to_replica(M3, M4)

+ 

+     fractional_server_to_replica(M4, M1)

+     fractional_server_to_replica(M4, M2)

+     fractional_server_to_replica(M4, M3)

+ 

+     # Step 2 - enable internal op logging and replication debug

+     for i in (M1, M2, M3, M4):

+         i.config.loglevel(vals=[256 + 4], service='access')

+         i.config.loglevel(vals=[LOG_REPLICA, LOG_DEFAULT], service='error')

+ 

+     # Step 3 - Check that replication is working fine

+     add_user(M1, 11, desc="add to M1")

+     add_user(M2, 21, desc="add to M2")

+     add_user(M3, 31, desc="add to M3")

+     add_user(M4, 41, desc="add to M4")

+ 

+     for i in (M1, M2, M3, M4):

+         for j in (M1, M2, M3, M4):

+             if i == j:

+                 continue

+             repl.wait_for_replication(i, j)

+ 

+     # Step 4 - Generate skipped updates to create keep alive entries

+     for i in (M1, M2, M3, M4):

+         cn = '%s_%d' % (USER_CN, 11)

+         dn = 'uid=%s,ou=People,%s' % (cn, SUFFIX)

+         user = UserAccount(i, dn)

+         for j in range(110):

+             user.set('telephoneNumber', str(j))

+ 

+     # Step 5 - Remove M3 from the topology

+     M3.stop()

+     M1.agreement.delete(suffix=SUFFIX, consumer_host=M3.host, consumer_port=M3.port)

+     M2.agreement.delete(suffix=SUFFIX, consumer_host=M3.host, consumer_port=M3.port)

+     M4.agreement.delete(suffix=SUFFIX, consumer_host=M3.host, consumer_port=M3.port)

+ 

+     # Step 6 - Then issue cleanAllRuv FORCE that will run on M1, M2 and M4

+     M1.tasks.cleanAllRUV(suffix=SUFFIX, replicaid='3',

+                          force=True, args={TASK_WAIT: True})

+ 

+     # Step 7 - Count the number of received DEL of the keep alive 3

+     for i in (M1, M2, M4):

+         i.restart()

+     regex = re.compile(".*DEL dn=.cn=repl keep alive 3.*")

+     for i in (M1, M2, M4):

+         count = count_pattern_accesslog(i, regex)

+         log.debug("count on %s = %d" % (i, count))

+ 

+         # Check that the DEL is replicated once (if the fix keeps the DEL)

+         # or not replicated at all (if the fix no longer performs the DEL)

+         assert ((count == 1) or (count == 0))

+ 

+     # Step 8 - Check that M1 is Originator of cleanAllRuv and M2, M4 propagation

+     regex = re.compile(".*Original task deletes Keep alive entry .3.*")

+     assert pattern_errorlog(M1, regex)

+ 

+     regex = re.compile(".*Propagated task does not delete Keep alive entry .3.*")

+     assert pattern_errorlog(M2, regex)

+     assert pattern_errorlog(M4, regex)

+ 

+     # Step 9 - Check replication M1,M2 and M4 can recover

+     add_user(M1, 12, desc="add to M1")

+     add_user(M2, 22, desc="add to M2")

+     for i in (M1, M2, M4):

+         for j in (M1, M2, M4):

+             if i == j:

+                 continue

+             repl.wait_for_replication(i, j)

+ 

+     # Step 10 - Remove M4 from the topology

+     M4.stop()

+     M1.agreement.delete(suffix=SUFFIX, consumer_host=M4.host, consumer_port=M4.port)

+     M2.agreement.delete(suffix=SUFFIX, consumer_host=M4.host, consumer_port=M4.port)

+ 

+     # Step 11 - Issue cleanAllRuv not force  while M2 is stopped (that hangs the cleanAllRuv)

+     M2.stop()

+     M1.tasks.cleanAllRUV(suffix=SUFFIX, replicaid='4',

+                          force=False, args={TASK_WAIT: False})

+ 

+     # Step 12

+     # CleanAllRuv is hanging waiting for M2 to restart

+     # Check that nsds5ReplicaCleanRUV is correctly encoded on M1

+     replicas = Replicas(M1)

+     replica = replicas.list()[0]

+     time.sleep(0.5)

+     assert replica.present('nsds5ReplicaCleanRUV')

+     log.info("M1: nsds5ReplicaCleanRUV=%s" % replica.get_attr_val_utf8('nsds5replicacleanruv'))

+     regex = re.compile("^4:.*:no:1$")

+     assert regex.match(replica.get_attr_val_utf8('nsds5replicacleanruv'))

+ 

+     # Step 13

+     # Check that the encoding survives a restart

+     M1.restart()

+     assert replica.present('nsds5ReplicaCleanRUV')

+     assert regex.match(replica.get_attr_val_utf8('nsds5replicacleanruv'))

+ 

+     # Step 14 - Check that nsds5ReplicaCleanRUV encoding is valid on M2

+     M1.stop()

+     M2.start()

+     replicas = Replicas(M2)

+     replica = replicas.list()[0]

+     M1.start()

+     time.sleep(0.5)

+     if replica.present('nsds5ReplicaCleanRUV'):

+         log.info("M2: nsds5ReplicaCleanRUV=%s" % replica.get_attr_val_utf8('nsds5replicacleanruv'))

+         regex = re.compile("^4:.*:no:0$")

+         assert regex.match(replica.get_attr_val_utf8('nsds5replicacleanruv'))

+ 

+     # Give the cleanAllRuv task time to complete, then check replication recovers

+     for i in (M1, M2):

+         for j in (M1, M2):

+             if i == j:

+                 continue

+             repl.wait_for_replication(i, j)

+ 

+     # Step 15 - Check that M1 is Originator of cleanAllRuv and M2 propagation

+     regex = re.compile(".*Original task deletes Keep alive entry .4.*")

+     assert pattern_errorlog(M1, regex)

+ 

+     regex = re.compile(".*Propagated task does not delete Keep alive entry .4.*")

+     assert pattern_errorlog(M2, regex)

+ 

+ 

+ if __name__ == '__main__':

+     # Run isolated

+     # -s for DEBUG mode

+     CURRENT_FILE = os.path.realpath(__file__)

+     pytest.main(["-s", CURRENT_FILE])

+      

\ No newline at end of file

Bug Description: When cleanAllRuv is launched, it spawns cleanAllRuv tasks on all replicas.
Each replica will clean its changelog and database RUV, but in addition
will DEL the keep alive entry of the target ReplicaID.

Fix Description: The test case covers all the scenarios to be tested for the fix.

https://pagure.io/389-ds-base/issue/49463

Review by: ??

The test looks good. Should it be integrated into the replication test suite, or kept as a regression ticket test case?

I don't think you need any of this logging boilerplate, it's in topologies now.

There is already a userAccount create helper in the code you can use.

Please don't use getEntry, it's going to be removed.

There is already an errorlog parsing and manipulation API in lib389 that allows applying a search to the log to find errors.

Don't use raw modify_s. We also already have replica objects you can use to set up replication and fractional attributes, so you should use those.

Don't use raw modify; you should be updating the various objects. Don't use ensure_bytes outside of core lib389 (and even then it should be rare). Basically, if you ever need "ensure bytes", you are probably doing something in a "non-correct" way.

There is an access log searching API in the lib389 code already.

Why is this sleep present? If you are waiting for replication, there is a replication wait task. You should never need a sleep in tests.

Remove un-used boilerplate from copy-paste.

I want to point out generally that if you are adding "helpers" to make this test case easier, it's likely they will be needed to make other tests easier too. This means creating users, log checks, etc. It is also likely that these things already exist in lib389, because you probably aren't the first person to require this kind of interaction. There is lots of functionality in lib389 to make writing tests quick, simple, and compact. It's probably worth your time to have a quick read of the library to find many things that will help you. :)


@firstyear It would be really nice if you drafted a mail which consists of all the things which will either be removed or should not be done, like DOs and DON'Ts. It would really help anyone who is writing test cases. It's confusing when we are using modify_s practically everywhere in the tests, so it would be really nice to have an example or convention which we all can follow. Also, for example: getEntry will be removed, so what should its replacement be? We could probably fix many tests which are already upstream. Thanks!

I'm literally adding code-level warnings for this, so they can be found more easily during devel and testing. https://pagure.io/389-ds-base/pull-request/50226# Soon I'm going to start addressing these warnings in existing tests (but it will take ... a long time ....).
As well, @spichugi has a document on how to write tests with lib389, I think.

The tl;dr is basically that everything should be passed through a DSLdapObject(s) type, so you don't "get" an entry; you would do something like:

thing = DSLdapObjects().get(name)

And similar. There are plenty of reasons for this. It makes tests more compact, and it allows encapsulating logic into the code (IE what attributes can I set? there are get/set for that!). It handles python3 for you, it handles lots of sticky cases with renames and modifies, and it has a stateful version (ensure this thing exists rather than needing to check, etc).
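
For instance, instead of getEntry, a lookup through the typed objects might look like this minimal sketch (the instance name inst and the test user are illustrative):

from lib389.idm.user import UserAccounts
from lib389._constants import DEFAULT_SUFFIX

# Fetch an existing user through the object API rather than a raw search
users = UserAccounts(inst, DEFAULT_SUFFIX)
user = users.get('test_user_11')
# Attribute updates go through the object, not modify_s
user.replace('description', 'updated through the object API')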

Often lots of these things already exist, but it's a matter of familiarity. Part of it is you making an effort to read and understand what the libraries provide. Part of it is reviews like this, where I can explain how the library works (I wrote most of it, I think ...). Every time I give these reviews, they are a chance to learn, so that you don't make the mistake again.
Part of it is also that soon I'll start to embed warnings through the libraries, so we'll have an automated list of bad behaviours we can check.

@firstyear many times our whole focus is on quick porting, as this was already in the ticket (https://pagure.io/389-ds-base/issue/raw/files/a51cc3d9137740fe66aa5bb98ef78e38f8c7937d8d197cfb1cc86a1c2b23a1a5-0001-Ticket-49463-testcase.patch)

So we quickly wanted to have this, and I see your point; we will do the changes. But instead of you taking the effort while reviewing, we can save both effort and time if the information is passed along beforehand. I understand there are many cases, but at least some of the legacy ones can be mentioned, so we can have quality code for the first review itself :) BTW, the "how to write tests using lib389" doc is a little old and would require updating, as I can see statements like: standalone.modify_s(USER_DN, [(ldap.MOD_REPLACE, 'jpegPhoto', ensure_bytes(var_with_content))]) which I think we want to avoid. If there is any newer document other than this: https://fedorapeople.org/~spichugi/html/guidelines.html then please do share; it will be really helpful.

Just out of curiosity, what is the recommended way to retrieve an entry knowing its DN?

I agree that a sleep is a bad idea, as it slows down a full suite.
I do not recall exactly why it was needed. My guess is that the previous updates create keep_alive entries, and the sleep gives time to replicate them everywhere, so that later, during cleanallruv, each replica will do a DEL of the removed keep_alive (when the bug is not fixed).

So the right approach would be to do a search of the keep_alive entries; have we got a lib389 helper for that?
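
Absent a dedicated helper, a direct search is one way to list them (a sketch; it assumes the keep alive entries sit under the suffix as 'cn=repl keep alive <rid>', matching the access log pattern used in the test):

import ldap

# List whatever keep alive entries currently exist under the suffix
entries = server.search_s(SUFFIX, ldap.SCOPE_SUBTREE,
                          '(cn=repl keep alive *)', ['cn'])
for e in entries:
    print(e.dn)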

Here, to be honest, I do not recall or cannot guess why it is needed. The RA threads should be stopped when the DEL returns.
You may try removing it.

I agree that a sleep is a bad idea, as it slows down a full suite.
I do not recall exactly why it was needed. My guess is that the previous updates create keep_alive entries, and the sleep gives time to replicate them everywhere, so that later, during cleanallruv, each replica will do a DEL of the removed keep_alive (when the bug is not fixed).
So the right approach would be to do a search of the keep_alive entries; have we got a lib389 helper for that?

Sure I will remove the sleep there.

so we quickly wanted to have this, and I see your point; we will do the changes, but

"Quick" always leads to rushed results and mistakes. It is better to take your time, and you will have a better experience. There are long-term consequences to "quick" and "rush", one of which is that reviews take longer.

If there is any newer document other than this: https://fedorapeople.org/~spichugi/html/guidelines.html then please do share; it will be really helpful.

If you want to see "good" tests, look at src/lib389/lib389/tests/idm/
These all exclusively use lib389.

rebased onto 09be3e19c93e484204a95f4e3ec63eb0c60135d3

5 years ago

rebased onto 7f0b51298a5c38575e6196cadd6548407e66e641

5 years ago

@firstyear, we have made some of the changes you suggested. Can you help here with what would be the best way to avoid these sleeps? @tbordaz had asked about and explained the purpose of them. Thanks!

@aadhikari In the replication libraries of lib389, there is a "wait for replication" function you can call that waits for a replication event and then continues. Please use that.

More specifically:

repman = ReplicationManager(DEFAULT_SUFFIX)
repman.wait_for_replication(from_instance, to_instance)

IE

repman.wait_for_replication(topology.m1, topology.m2)

The entire ReplicationManager class may just help you in general to understand replication configuration; it's really good.

rebased onto ab6559c2cd1b9b5d3f714183c49e848776828f91

5 years ago

@firstyear changes have been made, please have a look

Why are you importing *? Can we only import what we need?

There is a user create helper in lib389; stop creating your own, please.

What is the purpose of this? If you are looping and waiting, it's like you are waiting on replication. Use replman.wait_for_replication, then just assert whether you have the value or not.

Also, I think you want UserAccount(server, dn).exists(). I think you are trying to do an existence test.

This is an infinite loop risk if the pattern isn't found. Set errorlog buffering to false, then you don't need a loop at all.

As above with the error log. Buffering to false, and just read once.

Is this needed for fractional replication to take effect ...?

This is where you should use replmanager.wait_for_replication(). I said this in my last review .....

You probably want: for a, b in permutations((M1, M2, M3, M4), 2): replman.wait_for_replication(a, b).
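
A sketch of that (assuming repl is a ReplicationManager for the suffix and M1..M4 are the topology masters):

from itertools import permutations

# Ordered pairs of distinct masters: (M1, M2), (M1, M3), ...
for a, b in permutations((M1, M2, M3, M4), 2):
    repl.wait_for_replication(a, b)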

You only use this once, so why make it a function? it only breaks the flow and clarity of the test. Only make it a function if you call it from multiple places in the function ....

I think replman has a way to ensure an agreement is removed?

Can you comment with an example string of what this is attempting to match in the assert positive and negative case?

There is a permutations module you want. This also seems to want to do "wait_for_replication(M1, M1)", which is invalid.

rebased onto 87c8b32717c342643492da80197d983ba21122ae

5 years ago

This is an infinite loop risk if the pattern isn't found. Set errorlog buffering to false, then you don't need a loop at all.

@firstyear there is no "buffering to false" for the error log, just for the access log. Also, with looping, it takes 318 iterations to actually have the results in the log. Restarting the instance either in the loop or outside didn't help, so I have used a small sleep time.

Remove boilerplate.

Do you mean the comments?

Can you comment with an example string of what this is attempting to match in the assert positive and negative case?

@firstyear a sample entry: 4:5c6fa6fe000000040000:no:1. Here 5c6fa6fe000000040000 will change, but the rest will stay the same, so we are checking whether we have a value which starts with "4:" and ends with ":no:1". replica.get_attr_val_utf8('nsds5replicacleanruv') does have the actual value; we are just comparing it against the regex. @tbordaz can you please give more details here? I might be missing something.
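
To make the positive and negative cases concrete, this snippet runs against the sample value above:

import re

regex = re.compile("^4:.*:no:1$")
# The middle timestamp varies; only the replica id and the trailing flags are fixed
assert regex.match('4:5c6fa6fe000000040000:no:1')
assert not regex.match('4:5c6fa6fe000000040000:no:0')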

@aadhikari I meant to put that into the code so we can see what you are trying to do and why...

Remove boilerplate.

Do you mean the comments?

Yes

Also, now that I check, I think the errorlog isn't buffered; only the access log is, so I'm still not sure this sleep is needed.

@aadhikari I meant to put that into the code so we can see what you are trying to do and why...
@firstyear we are already printing the value in the code.

This is an infinite loop risk if the pattern isn't found. Set errorlog buffering to false, then you don't need a loop at all.

@firstyear there is no "buffering to false" for the error log, just for the access log. Also, with looping, it takes 318 iterations to actually have the results in the log. Restarting the instance either in the loop or outside didn't help, so I have used a small sleep time.

@firstyear yeah, there is no error log buffering in the source; I talked to @spichugi about this issue and even he can't find it in the source. The above are some observations, please have a look. I didn't find any solution. Personally not a fan of sleep :)

Reading log.c in the main server core, the errorlog is not buffered. However, that doesn't mean that the content is flushed to disk after an operation. The right solution could be to fsync the errorlog after a write, but that imposes a performance penalty. But sleep doesn't fix the issue either. Perhaps we need to amend the lib389 errorlog parsing function to fsync before reading?

Reading log.c in the main server core, the errorlog is not buffered. However, that doesn't mean that the content is flushed to disk after an operation. The right solution could be to fsync the errorlog after a write, but that imposes a performance penalty. But sleep doesn't fix the issue either. Perhaps we need to amend the lib389 errorlog parsing function to fsync before reading?

@firstyear the sleep gives us a little time to read the content; also, it's only failing when we search for ".Original task deletes Keep alive entry .4". If you run the code you will see these logs are generated after a long pause.

Let me think about this for a bit. I'm wondering if the issue is some async replication behaviour and if there is another way to wait on it.

rebased onto 86359a5f69e4874b74bdcfd433fd2581ed3175b1

5 years ago

Maybe it is better to put the sleep after the .match()? So it checks "as soon" as possible.
An option to speed this up could be to have a back-off too. Right now this is 50 seconds max, so you could do something like:

time.sleep( (i/2.0)**2 )

That should give you a sleep timer of

>>> for i in range(0, 10):
...     ((i/2.0) ** 2)
...
0.0
0.25
1.0
2.25
4.0
6.25
9.0
12.25
16.0
20.25

The idea being you do many quicker checks to start, then "back off" if it's taking longer.
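
Putting the two ideas together, the polling helper could look like this sketch (pattern_errorlog_backoff is a hypothetical name, not an existing lib389 API):

import time

def pattern_errorlog_backoff(server, log_pattern, attempts=10):
    # Check immediately, then back off quadratically between retries
    for i in range(attempts):
        found = server.ds_error_log.match(log_pattern)
        if found:
            return found
        time.sleep((i / 2.0) ** 2)
    return None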

Hmmm, now that I think about it, you should not use this cleanAllRuv; you should use :100:

from lib389.tasks import CleanAllRUVTask
task = CleanAllRUVTask(M1).create(properties={'replica-force-cleaning': 'yes'})
task.wait()

Look at your pytest warnings: you should have a bunch of them from the cleanruv task, as it uses deprecated lib389 APIs. The above example should work without warnings.
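
For reference, a fuller invocation might look like this sketch (assuming the task also takes 'replica-id' and 'replica-base-dn' properties, as other cleanAllRUV tests in the tree use):

from lib389.tasks import CleanAllRUVTask

task = CleanAllRUVTask(M1)
task.create(properties={
    'replica-id': '3',
    'replica-base-dn': DEFAULT_SUFFIX,
    'replica-force-cleaning': 'yes',
})
task.wait()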

Why not set a timeout on the task and use pytest.raises()?

rebased onto 8db9c5cf6f2f926a39a2d7a87f8726253fa9d46d

5 years ago

@firstyear Changes have been made, but now the whole test is taking more time to complete.

Can you debug and print what timing event was the one taken for the error log?

Can you debug and print what timing event was the one taken for the error log?
@firstyear what exactly are you stating here? The overall execution time has increased. Earlier, with the loop, it used to take 1 iteration or 2 max, but now I can see 6 iterations for the error log to be generated. But I still think this must not be the real reason for the overall time increasing this much.

My understanding is that replica.present will do an active lookup (up to 20 SRCH operations on the replica) without pause.
It looks very intensive; couldn't we pause between each search (maybe 0.1s)?

My understanding is that replica.present will do an active lookup (up to 20 SRCH operations on the replica) without pause.
It looks very intensive; couldn't we pause between each search (maybe 0.1s)?

@tbordaz pause as in a time.sleep?

@aadhikari, yes. It is very convenient to put some delays during specific steps of a testcase. So far, having to wait 1s instead of a few msec was not a big deal, but as many testcases do the same, in the end it turns out to be slow.
I do not know how we can sleep for ms, but in this case it would be useful instead of an active search.

You can put in floats in python for the sleep, ie 0.5.

Generally though, I think it would be great if we could find a way to "block until the task is ready for check" to avoid sleeps.
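
For example, a sub-second pause is just a fractional argument:

import time

time.sleep(0.1)  # pause for 100 milliseconds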

You can put in floats in python for the sleep, ie 0.5.
Generally though, I think it would be great if we could find a way to "block until the task is ready for check" to avoid sleeps.
@firstyear that's why I used a loop, to avoid sleep, as per our discussion of not using it wherever we can; but as per @tbordaz's suggestion, I will be using sleep with some float value.

rebased onto d8936be2785c3e722de13f8b63edc7270e327b0a

5 years ago

@firstyear @tbordaz Changes have been made accordingly.

Looks good from here. @tbordaz, if you are happy, ack and merge, thanks :)

rebased onto 28a5ddb

5 years ago

Pull-Request has been merged by tbordaz

5 years ago

389-ds-base is moving from Pagure to Github. This means that new issues and pull requests
will be accepted only in 389-ds-base's github repository.

This pull request has been cloned to Github as an issue and is available here:
- https://github.com/389ds/389-ds-base/issues/3281

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for all inconvenience.

Pull-Request has been closed by spichugi

3 years ago