On 12-05-2021 19:44, Thierry Bordaz wrote:
On 5/12/21 4:55 PM, Kees Bakker wrote:
> Hi Thierry,
>
> Just to be clear, changelogmaxage was changed to -1 by me after the
> upgrade and I've
> confirmed it is now set to -1.
>
> The reason for me to change the value was because of the deadlock.
> Apparently, it did not make much of a difference. It still gets into a
> deadlock
> with the value -1.
Did you set nsslapd-changelogmaxage=-1 in the retroCL config entry ?
I've used this input file for ldapmodify
[root@linge ~]# cat change-nsslapd-changelogmaxage.txt
dn: cn=Retro Changelog Plugin,cn=plugins,cn=config
changetype: modify
replace: nsslapd-changelogmaxage
nsslapd-changelogmaxage: -1
After that
[root@linge ~]# ldapsearch -H ldapi://%2fvar%2frun%2fslapd-GHS-NL.socket -LLL -o
ldif-wrap=no -b 'cn=Retro Changelog Plugin,cn=plugins,cn=config'
SASL/EXTERNAL authentication started
SASL username: gidNumber=0+uidNumber=0,cn=peercred,cn=external,cn=auth
SASL SSF: 0
dn: cn=Retro Changelog Plugin,cn=plugins,cn=config
cn: Retro Changelog Plugin
nsslapd-attribute: nsuniqueid:targetUniqueId
nsslapd-changelogmaxage: -1
nsslapd-include-suffix: cn=dns,dc=ghs,dc=nl
nsslapd-plugin-depends-on-named: Class of Service
nsslapd-plugin-depends-on-type: database
nsslapd-pluginDescription: Retrocl Plugin
nsslapd-pluginEnabled: on
nsslapd-pluginId: retrocl
nsslapd-pluginInitfunc: retrocl_plugin_init
nsslapd-pluginPath: libretrocl-plugin
nsslapd-pluginType: object
nsslapd-pluginVendor: 389 Project
nsslapd-pluginVersion: 1.3.10.2
nsslapd-pluginbetxn: on
nsslapd-pluginprecedence: 25
objectClass: top
objectClass: nsSlapdPlugin
objectClass: extensibleObject
>
> BTW. I've sent the whole stacktrace directly to you to avoid the 4000+
> lines to
> this mailing list.
>
> Here is the stack trace of one of the threads. The one that hangs in
> trim_changelog
Something weird is that the same hang occurs
https://bugzilla.redhat.com/show_bug.cgi?id=1751295.
If I am correct it is between thread 14th and 2nd.
As you are running with a fixed version (> slapi-nis-0.56.5), that means
the fix is incomplete. It would require to debug a dump core to know why
the fix fails.
[root@linge ~]# rpm -qa slapi\*
slapi-nis-0.56.5-3.el7_9.x86_64
So, that is not > 0.56.5, but it is the newest available for CentOS7. No?
At the moment I killed it and restarted. No deadlock yet, but the system is extreemly
slow. And there is just ns-slapd using CPU cycles.
regards
thierry
>
>
> Thread 2 (Thread 0x7f96ede68700 (LWP 2151)):
> #0 0x00007f96eaf3939e in pthread_rwlock_wrlock () at
> /lib64/libpthread.so.0
> #1 0x00007f96da9d281f in map_wrlock () at
> /usr/lib64/dirsrv/plugins/schemacompat-plugin.so
> #2 0x00007f96da9c1e58 in backend_shr_delete_cb.part.5 () at
> /usr/lib64/dirsrv/plugins/schemacompat-plugin.so
> #3 0x00007f96da9c1fd1 in backend_shr_betxn_post_delete_cb () at
> /usr/lib64/dirsrv/plugins/schemacompat-plugin.so
> #4 0x00007f96ed7ec688 in plugin_call_func (list=0x55cb03cdd8c0,
> operation=operation@entry=563, pb=pb@entry=0x55cb4c670fc0,
> call_one=call_one@entry=0) at ldap/servers/slapd/plugin.c:2028
> n = <optimized out>
> func = 0x7f96da9c1f70 <backend_shr_betxn_post_delete_cb>
> rc = <optimized out>
> return_value = 0
> count = 2
> #5 0x00007f96ed7ec943 in plugin_call_list (pb=0x55cb4c670fc0,
> operation=563, list=<optimized out>) at ldap/servers/slapd/plugin.c:1972
> p = 0x55cb03cb0000
> locked = <optimized out>
> plugin_list_number = 21
> rc = 0
> do_op = <optimized out>
> #6 0x00007f96ed7ec943 in plugin_call_plugins
> (pb=pb@entry=0x55cb4c670fc0, whichfunction=whichfunction@entry=563) at
> ldap/servers/slapd/plugin.c:442
> p = 0x55cb03cb0000
> locked = <optimized out>
> plugin_list_number = 21
> rc = 0
> do_op = <optimized out>
> #7 0x00007f96dc990def in ldbm_back_delete (pb=0x55cb4c670fc0) at
> ldap/servers/slapd/back-ldbm/ldbm_delete.c:1267
> be = 0x55cb03c31040
> inst = 0x55cb03a8c680
> li = 0x55cb03a017c0
> e = 0x55cb15499a40
> tombstone = 0x0
> original_tombstone = 0x0
> tmptombstone = 0x0
> dn = 0x55cb1742a600 "changenumber=891343,cn=changelog"
> txn = {back_txn_txn = 0x55cb2b7b6dc0}
> parent_txn = 0x0
> retval = 0
> msg = <optimized out>
> errbuf = 0x0
> retry_count = <optimized out>
> disk_full = 0
> parent_found = <optimized out>
> ruv_c_init = 0
> parent_modify_c = {old_entry = 0x55cb159982a0, new_entry =
> 0x55cb19e03030, smods = 0x55cb47da1ae0, attr_encrypt = 1}
> ruv_c = {old_entry = 0x0, new_entry = 0x0, smods = 0x0,
> attr_encrypt = 0}
> rc = 0
> ldap_result_code = 0
> ldap_result_message = 0x0
> sdnp = 0x55cb5fd93bc0
> e_uniqueid = 0x0
> nscpEntrySDN = {flag = 0 '\000', udn = 0x0, dn = 0x0, ndn =
> 0x0, ndn_len = 0}
> operation = 0x55cb5ef86000
> opcsn = 0x0
> is_fixup_operation = 0
> is_ruv = 0
> is_replicated_operation = 0
> is_tombstone_entry = <optimized out>
> delete_tombstone_entry = 0
> create_tombstone_entry = 0
> addr = 0x55cb5ef86100
> addordel_flags = 38
> entryusn_str = 0x0
> orig_entry = 0x0
> parentsdn = {flag = 2 '\002', udn = 0x0, dn = 0x55cb516c3580
> "cn=changelog", ndn = 0x0, ndn_len = 12}
> opreturn = 0
> free_delete_existing_entry = 1
> not_an_error = 0
> parent_switched = 0
> myrc = 0
> conn_id = 0
> tombstone_csn = <optimized out>
> deletion_csn_str =
> "\247\001\000\000\000\000\000\000\000T\334YHi\365\357\300\017g",
> <incomplete sequence \313>
> op_id = 0
> ep_id = <optimized out>
> tomb_ep_id = 0
> result_sent = 0
> pb_conn = 0x0
> parent_op = 1
> parent_time = {tv_sec = 183860, tv_nsec = 41743941}
> #8 0x00007f96ed79d3bb in op_shared_delete
> (pb=pb@entry=0x55cb4c670fc0) at ldap/servers/slapd/delete.c:324
> rc = 0
> rawdn = 0x55cb5b28de00 "changenumber=891343, cn=changelog"
> dn = <optimized out>
> be = 0x55cb03c31040
> internal_op = 32
> sdn = 0x55cb5fd93bc0
> operation = 0x55cb5ef86000
> referral = 0x0
> ecopy = 0x0
> errorbuf = "\000\066\062\062Z\nmodifyTime", '\000'
<repeats 16
> times>, "\071\063\066\062\062Z\nnsUniqueI", '\000' <repeats
16 times>,
>
"\327\320~\003ڇ\316c!\f=\037$z\215\301;\215\006\065\341\n\310V\300\006\365I\344|x\210\216\f\215\060e\243S\350\000T\334YHi\365\357\016\237V\202\332PG\347\300\070\314\003\313U\000\000\300\070\314\003\313U\000\000\000T\334YHi\365\357\000\000\000\000\000\000\000\000\300\070\314\003\313U\000\000\300\070\314\003\313U\000\000\240<\250\355\226\177\000\000\000\243\002\000\000\000\000\000p|\346\355\226\177\000\000ϙ\r\000\000\000\000\000\312\360"...
> err = <optimized out>
> proxydn = 0x0
> proxystr = 0x0
> proxy_err = <optimized out>
> errtext = 0x0
> #9 0x00007f96ed79d598 in delete_internal_pb
> (pb=pb@entry=0x55cb4c670fc0) at ldap/servers/slapd/delete.c:208
> controls = 0x0
> op = 0x55cb5ef86000
> opresult = 0
> #10 0x00007f96ed79d813 in slapi_delete_internal_pb
> (pb=pb@entry=0x55cb4c670fc0) at ldap/servers/slapd/delete.c:151
> #11 0x00007f96daff731e in delete_changerecord (cnum=cnum@entry=891343)
> at ldap/servers/plugins/retrocl/retrocl_trim.c:89
> pb = 0x55cb4c670fc0
> dnbuf = 0x55cb5b28de00 "changenumber=891343, cn=changelog"
> delrc = 32662
> #12 0x00007f96daff7605 in trim_changelog () at
> ldap/servers/plugins/retrocl/retrocl_trim.c:290
> did_delete = 0
> done = 0
> last_in_log = <optimized out>
> me = 172800
> lt = <optimized out>
> ldrc = <optimized out>
> now = 1620819600
> first_in_log = 891343
> num_deleted = 3655
> #13 0x00007f96daff7605 in changelog_trim_thread_fn (arg=<optimized
> out>) at ldap/servers/plugins/retrocl/retrocl_trim.c:333
> #14 0x00007f96eb595c5b in _pt_root () at /lib64/libnspr4.so
> #15 0x00007f96eaf35ea5 in start_thread () at /lib64/libpthread.so.0
> #16 0x00007f96ea5e19fd in clone () at /lib64/libc.so.6
>
> -- Kees
>
> On 12-05-2021 15:49, Thierry Bordaz wrote:
>> Hi Kees,
>>
>> Is changelogmaxage=-1 after the upgrade ?
>>
>> would you send a full pstack when it hangs ? If pthread_rwlock_wrlock is
>> trim_changelog then you may hit another flavor of [1] (without known
>> reason).
>>
>> regards
>> thierry
>>
>> On 5/12/21 2:40 PM, Kees Bakker wrote:
>>> Sorry to revive an old thread. I'm getting deadlocks again. See below
>>>
>>> On 20-04-2020 15:16, thierry bordaz wrote:
>>>> [...]This is a known bug [1].
>>>> With the same bug there are two deadlock scenario but only one is
>>>> fixed (for example in slapi-nis-0.56.4-1 [2]).
>>>> A fix for the second one is under tests.
>>>>
>>>> At the moment I would recommend the workaround [3]. The drawback is a
>>>> growth of the retroCL database but unless you have a very high rate
>>>> of update it should not be a concern.
>>>>
>>>> [1]
https://bugzilla.redhat.com/show_bug.cgi?id=1751295
>>>> [2]
https://koji.fedoraproject.org/koji/buildinfo?buildID=1457771
>>>> [3]
https://bugzilla.redhat.com/show_bug.cgi?id=1751295#c5
>>>>
>>>> best regards
>>>> thierry
>>>>
>>>
>>> I followed the recommendation to set nsslapd-changelogmaxage to -1.
>>> The system
>>> has been running successfully for a year.
>>>
>>> Recently I upgraded all packages in this CentOS7 system. Ever since
>>> that moment
>>> the server is quite unusable.
>>>
>>> [root@linge ~]# gdb -ex 'set confirm off' -ex 'set pagination
off' -ex
>>> 'thread apply all bt full' -ex 'quit' /usr/sbin/ns-slapd
`pidof
>>> ns-slapd` |& grep '^#0.*lock'
>>> #0 0x00007f96eaf3939e in pthread_rwlock_wrlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf39184 in pthread_rwlock_rdlock () at
>>> /lib64/libpthread.so.0
>>> #0 0x00007f96eaf3939e in pthread_rwlock_wrlock () at
>>> /lib64/libpthread.so.0
>>>
>>> [root@linge ~]# rpm -qa 389\*
>>> 389-ds-base-libs-1.3.10.2-10.el7_9.x86_64
>>> 389-ds-base-1.3.10.2-10.el7_9.x86_64
>>> 389-ds-base-debuginfo-1.3.10.2-10.el7_9.x86_64
>>>
>>> [root@linge ~]# rpm -qa slapi\*
>>> slapi-nis-0.56.5-3.el7_9.x86_64
>>>
>>> [root@linge ~]# rpm -qa centos-release
>>> centos-release-7-9.2009.1.el7.centos.x86_64
>>>
>>> Are there any new hints to avoid the deadlock?
>>
>