#48122 nunc-stans FD leak
Closed: Fixed None Opened 5 years ago by mreynolds.

When testing thousands of sequential connections (bind, search, unbind), some connections are not closed because the connection refcnt is greater than zero, even though the connection/ops have been processed by the server.

Nunc-stans then starts spinning, consuming 100% of the CPU, trying to close these connections via ns_handle_closure.


This shows the aftermath of the connection refcnt getting out of sync and the CPU going to 100%, with ns_handle_closure being called over and over:

{{{
#1 0x000000000041ec79 in ns_handle_closure (job=0x7f7e84003ea0) at ../ds/ldap/servers/slapd/daemon.c:2420
#2 0x00007f7ebc6f0733 in event_cb (job=0x7f7e84003ea0) at ../nunc-stans/ns_thrpool.c:328
#3 0x00007f7ebc6f1d97 in event_cb (fd=64, event=1, arg=0x7f7e84003ea0) at ../nunc-stans/ns_event_fw_event.c:74
#4 0x00007f7ebc4b5a14 in event_process_active_single_queue (activeq=0x1be9500, base=0x1be90c0) at event.c:1350
#5 event_process_active (base=<optimized out>) at event.c:1420
#6 event_base_loop (base=0x1be90c0, flags=1) at event.c:1621
#7 0x00007f7ebc6f227f in ns_event_fw_loop (ns_event_fw_ctx=0x1be90c0) at ../nunc-stans/ns_event_fw_event.c:279
#8 0x00007f7ebc6f06be in event_loop_thread_func (arg=0x1c22990) at ../nunc-stans/ns_thrpool.c:302
#9 0x00007f7eba2f9740 in _pt_root (arg=0x1c162a0) at ../../../nspr/pr/src/pthreads/ptthread.c:204
#10 0x00007f7eb9c9adf3 in start_thread (arg=0x7f7e8bfe7700) at pthread_create.c:308
#11 0x00007f7eb97b23dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:113

ns_handle_closure()

(gdb) p *c
$1 = {c_sb = 0x1c4b6d0, c_sd = 64, c_ldapversion = 3, c_dn = 0x7f7e80001250 "cn=dm", c_isroot = 1, c_isreplication_session = 0,
c_authtype = 0x7f7e80001cd0 "simple", c_external_dn = 0x0, c_external_authtype = 0x7f7ebca10711 "none", cin_addr = 0x7f7e84003a70,
cin_destaddr = 0x7f7e84004370, c_domain = 0x0, c_ops = 0x0, c_gettingber = 0, c_currentber = 0x0, c_starttime = 1426019022, c_connid = 626169,
c_maxthreadscount = 0, c_maxthreadsblocked = 0, c_opsinitiated = 3, c_opscompleted = 3, c_threadnumber = 0, c_refcnt = 3,
c_mutex = 0x7f7e84000f90, c_pdumutex = 0x7f7e84001040, c_idlesince = 1426019022, c_idletimeout = 0, c_idletimeout_handle = 8,
c_private = 0x7f7e84003c50, c_flags = 2, c_needpw = 0, c_client_cert = 0x0, c_prfd = 0x7f7e300008e0, c_ci = 64, c_fdi = -1, c_next = 0x0,
c_prev = 0x7f7ebcd87010, c_bi_backend = 0x0, c_extension = 0x7f7e84002cd0, c_sasl_conn = 0x7f7e84005d50, c_local_ssf = 0, c_sasl_ssf = 0,
c_ssl_ssf = 0, c_unix_local = 0, c_local_valid = 0, c_local_uid = 0, c_local_gid = 0, c_pagedresults = {prl_maxlen = 0, prl_count = 0,
prl_list = 0x0}, c_push_io_layer_cb = 0x0, c_pop_io_layer_cb = 0x0, c_io_layer_cb_data = 0x0, c_ct = 0x1be73f0, c_tp = 0x1c22990,
c_ns_close_jobs = 1}

(gdb) n
2417 connection_release_nolock(c); /* release ref acquired when job was added */
2418 if (connection_table_move_connection_out_of_active_list(c->c_ct, c)) {
(gdb) p *c
$2 = {c_sb = 0x1c4b6d0, c_sd = 64, c_ldapversion = 3, c_dn = 0x7f7e80001250 "cn=dm", c_isroot = 1, c_isreplication_session = 0,
c_authtype = 0x7f7e80001cd0 "simple", c_external_dn = 0x0, c_external_authtype = 0x7f7ebca10711 "none", cin_addr = 0x7f7e84003a70,
cin_destaddr = 0x7f7e84004370, c_domain = 0x0, c_ops = 0x0, c_gettingber = 0, c_currentber = 0x0, c_starttime = 1426019022, c_connid = 626169,
c_maxthreadscount = 0, c_maxthreadsblocked = 0, c_opsinitiated = 3, c_opscompleted = 3, c_threadnumber = 0, c_refcnt = 2,
c_mutex = 0x7f7e84000f90, c_pdumutex = 0x7f7e84001040, c_idlesince = 1426019022, c_idletimeout = 0, c_idletimeout_handle = 8,
c_private = 0x7f7e84003c50, c_flags = 2, c_needpw = 0, c_client_cert = 0x0, c_prfd = 0x7f7e300008e0, c_ci = 64, c_fdi = -1, c_next = 0x0,
c_prev = 0x7f7ebcd87010, c_bi_backend = 0x0, c_extension = 0x7f7e84002cd0, c_sasl_conn = 0x7f7e84005d50, c_local_ssf = 0, c_sasl_ssf = 0,
c_ssl_ssf = 0, c_unix_local = 0, c_local_valid = 0, c_local_uid = 0, c_local_gid = 0, c_pagedresults = {prl_maxlen = 0, prl_count = 0,
prl_list = 0x0}, c_push_io_layer_cb = 0x0, c_pop_io_layer_cb = 0x0, c_io_layer_cb_data = 0x0, c_ct = 0x1be73f0, c_tp = 0x1c22990,
c_ns_close_jobs = 0}

2420 ns_connection_post_io_or_closing(c);
2422 PR_Unlock(c->c_mutex);
(gdb) p *c
$3 = {c_sb = 0x1c4b6d0, c_sd = 64, c_ldapversion = 3, c_dn = 0x7f7e80001250 "cn=dm", c_isroot = 1, c_isreplication_session = 0,
c_authtype = 0x7f7e80001cd0 "simple", c_external_dn = 0x0, c_external_authtype = 0x7f7ebca10711 "none", cin_addr = 0x7f7e84003a70,
cin_destaddr = 0x7f7e84004370, c_domain = 0x0, c_ops = 0x0, c_gettingber = 0, c_currentber = 0x0, c_starttime = 1426019022, c_connid = 626169,
c_maxthreadscount = 0, c_maxthreadsblocked = 0, c_opsinitiated = 3, c_opscompleted = 3, c_threadnumber = 0, c_refcnt = 3,
c_mutex = 0x7f7e84000f90, c_pdumutex = 0x7f7e84001040, c_idlesince = 1426019022, c_idletimeout = 0, c_idletimeout_handle = 8,
c_private = 0x7f7e84003c50, c_flags = 2, c_needpw = 0, c_client_cert = 0x0, c_prfd = 0x7f7e300008e0, c_ci = 64, c_fdi = -1, c_next = 0x0,
c_prev = 0x7f7ebcd87010, c_bi_backend = 0x0, c_extension = 0x7f7e84002cd0, c_sasl_conn = 0x7f7e84005d50, c_local_ssf = 0, c_sasl_ssf = 0,
c_ssl_ssf = 0, c_unix_local = 0, c_local_valid = 0, c_local_uid = 0, c_local_gid = 0, c_pagedresults = {prl_maxlen = 0, prl_count = 0,
prl_list = 0x0}, c_push_io_layer_cb = 0x0, c_pop_io_layer_cb = 0x0, c_io_layer_cb_data = 0x0, c_ct = 0x1be73f0, c_tp = 0x1c22990,
c_ns_close_jobs = 1}

}}}

The access log shows this connection as being closed:

{{{
access:[10/Mar/2015:13:23:42 -0700] conn=626169 fd=64 slot=64 connection from ::1 to ::1
access:[10/Mar/2015:13:23:42 -0700] conn=626169 op=0 BIND dn="cn=dm" method=128 version=3
access:[10/Mar/2015:13:23:42 -0700] conn=626169 op=0 RESULT err=0 tag=97 nentries=0 etime=0 dn="cn=dm"
access:[10/Mar/2015:13:23:42 -0700] conn=626169 op=1 SRCH base="dc=example,dc=com" scope=2 filter="(uid=scarter)" attrs=ALL
access:[10/Mar/2015:13:23:42 -0700] conn=626169 op=1 RESULT err=0 tag=101 nentries=1 etime=0
access:[10/Mar/2015:13:23:42 -0700] conn=626169 op=2 UNBIND
access:[10/Mar/2015:13:23:42 -0700] conn=626169 op=2 fd=64 closed - U1
}}}

Is there another thread that has a reference to the conn? What is your reproducer?

Replying to [comment:4 rmeggins]:

Is there another thread that has a reference to the conn? What is your reproducer?

I don't believe there are any other threads active, but I'll reproduce it here in a moment (it takes about 30 minutes for it to break).

This is the reproducer script:

{{{
python ./nconns-long.py 'ldap://localhost:389' 'cn=directory manager' password 'dc=example,dc=com' 2000

import sys
import ldap
import time

# Reproducer: open `nconns` sequential connections (bind, search, unbind),
# sleep 5 seconds, and repeat forever. Usage:
#   python ./nconns-long.py URL BINDDN BINDPW BASEDN NCONNS
url, binddn, bindpw, basedn, nconns = sys.argv[1:]

count = 1
while url:  # url is always non-empty, so this loops until interrupted
    print('Run: %d ' % count)
    count += 1

    for ii in xrange(0, int(nconns)):
        # One short-lived connection at a time: bind, one search, unbind.
        conn = ldap.initialize(url)
        conn.simple_bind(binddn, bindpw)
        ents = conn.search_s(basedn, ldap.SCOPE_SUBTREE, "uid=scarter")
        conn.unbind_s()

    # Pause between batches so closures can drain before the next burst.
    time.sleep(5)

}}}

It does 2000 sequential connections (bind, search, unbind). So there is only one connection active at a time in the server (but it's possible the event thread is still processing a closure when a new connection comes in). Then it sleeps for 5 seconds and does another 2000 connections. It does this over and over, and at some point the server goes to 100% CPU — that's when a connection ref count gets out of sync (and 389 won't close the connection at that point).

I get a crash due to an assertion failure. Are you configuring ds with --enable-debug?

Replying to [comment:6 rmeggins]:

I get a crash due to an assertion failure. Are you configuring ds with --enable-debug?

I am using --enable-debug, and I was just getting random crashes as well after I get an error about acquiring a connection. Trying to catch the crash in a core file, but haven't been able to reproduce it again. Could be the same assertion you are getting.

commit 091dca10357d99a97e01be39c2ccf0cf5b841929
Author: Rich Megginson rmeggins@redhat.com
Date: Thu Mar 12 14:15:51 2015 -0600

pushed

To ssh://git.fedorahosted.org/git/389/ds.git
4051fe0..1bf67a4 master -> master
commit 1bf67a4
Author: Rich Megginson rmeggins@redhat.com
Date: Mon May 11 11:33:04 2015 -0600

Metadata Update from @mreynolds:
- Issue assigned to rmeggins
- Issue set to the milestone: 1.3.4 backlog

3 years ago

Login to comment on this ticket.

Metadata