On Wed, May 16, 2018 at 2:38 PM, Jonathan Vaughn <jonathan@creatuity.com> wrote:

(gdb) up

#1 0xb6926b40 in cvt_s (flags=0, prec=<optimized out>, width=0, str=0x4 <error: Cannot access memory at address 0x4>, ss=<optimized out>)

at ../../.././nspr/pr/src/io/prprf.c:374

374 slen = strlen(str);

(gdb) up

#2 dosprintf (ss=ss@entry=0x9e06e4bc, fmt=0xb34b0df2 "", fmt@entry=0xb34da770 <agmt_set> "\360\317p\002", ap=...) at ../../.././nspr/pr/src/io/prprf.c:1018

1018 rv = cvt_s(ss, u.s, width, prec, flags);

(gdb) up

#3 0xb6926c8c in PR_vsmprintf (fmt=fmt@entry=0xb34da770 <agmt_set> "\360\317p\002", ap=..., ap@entry=...) at ../../.././nspr/pr/src/io/prprf.c:1184

1184 rv = dosprintf(&ss, fmt, ap);

(gdb) up

#4 0xb6de9d18 in slapi_ch_smprintf (fmt=0xb34b0de0 "%s;%s;%s;%ld;%d;%s") at ldap/servers/slapd/ch_malloc.c:331

331 p = PR_vsmprintf(fmt, ap);

(gdb) up

#5 0xb34625a4 in agmt_update_maxcsn (r=r@entry=0x2737810, sdn=0x38d9a40, sdn@entry=0x10, op=<optimized out>, mods=0x10, mods@entry=0x27d0100, csn=csn@entry=0x38de350)

at ldap/servers/plugins/replication/repl5_agmt.c:3036

3036 agmt->maxcsn = slapi_ch_smprintf("%s;%s;%s;%ld;%d;%s", slapi_sdn_get_dn(agmt->replarea),

(gdb) p *agmt

$1 = {hostname = 0x2757be0 "ipa-12.company.internal", port = 389, transport_flags = 0, binddn = 0x26ed650 "", creds = 0x26ed7a0, bindmethod = 3, replarea = 0x2757480,

frac_attrs = 0x27579c0, frac_attrs_total = 0x2757a40, frac_attr_total_defined = 1, schedule = 0x22dd0c0, auto_initialize = 502, dn = 0x2756d00, rdn = 0x2756c20,

long_name = 0x22dd100 "agmt=\"cn=meToipa-12.company.internal\" (ipa-12:5)", protocol = 0x2220930, changecounters = 0x20cb180, num_changecounters = 0,

max_changecounters = 256, last_update_start_time = 1526499214, last_update_end_time = 1526499214,

last_update_status = "Error (0) Replica acquired successfully: Incremental update succeeded", '\000' <repeats 954 times>, update_in_progress = 0, is_enabled = 1,

last_init_start_time = 0, last_init_end_time = 0, last_init_status = '\000' <repeats 1023 times>, lock = 0x2741740, consumerRUV = 0x2772e50,

consumerSchemaCSN = 0x39da520, consumerRID = 4, tmpConsumerRID = 0, timeout = 120, stop_in_progress = 0, busywaittime = 0, pausetime = 0, priv = 0x0,

attrs_to_strip = 0x2757ba0, agreement_type = 0, protocol_timeout = 0x26ed5f0, maxcsn = 0x0, flowControlWindow = 1000, flowControlPause = 2000, ignoreMissingChange = 0,

attr_lock = 0x2757c20, WaitForAsyncResults = 100}
On Tue, May 15, 2018 at 5:36 PM, Mark Reynolds <mreynolds@redhat.com> wrote:
This looks really familiar and I thought it was fixed. It should have been fixed in 1.3.7.10-1 (https://pagure.io/389-ds-base/issue/49618). In your debug session go "up" into agmt_maxcsn_update() and do:

(gdb) p *agmt

Then send us that output please.

Thanks,
Mark

On 05/15/2018 05:29 PM, Jonathan Vaughn via FreeIPA-users wrote:
Here is a backtrace from live gdb after the segfault. Looks like things went wrong somewhere during in the replication code ?

Thread 36 "ns-slapd" received signal SIGSEGV, Segmentation fault.

[Switching to Thread 0x9e0bc280 (LWP 4662)]

strlen () at ../sysdeps/arm/armv6t2/strlen.S:142

142 ldrd data1a, data1b, [src]

(gdb) bt

#0 0xb6728f2e in strlen () at ../sysdeps/arm/armv6t2/strlen.S:142

#1 0xb6973b40 in cvt_s (flags=0, prec=<optimized out>, width=0, str=0x4 <error: Cannot access memory at address 0x4>, ss=<optimized out>)

at ../../.././nspr/pr/src/io/prprf.c:374

#2 0xb6973b40 in dosprintf (ss=ss@entry=0x9e0bb4bc, fmt=0xb34fddf2 "", fmt@entry=0xb3527770 <agmt_set> "\360\357\305\002", ap=...)

at ../../.././nspr/pr/src/io/prprf.c:1018

#3 0xb6973c8c in PR_vsmprintf (fmt=fmt@entry=0xb3527770 <agmt_set> "\360\357\305\002", ap=..., ap@entry=...) at ../../.././nspr/pr/src/io/prprf.c:1184

#4 0xb6e36d18 in slapi_ch_smprintf (fmt=0xb34fdde0 "%s;%s;%s;%ld;%d;%s") at ldap/servers/slapd/ch_malloc.c:331

#5 0xb34af5a4 in agmt_update_maxcsn (r=r@entry=0x2c89810, sdn=0x3e2bac0, sdn@entry=0x10, op=<optimized out>, mods=0x10, mods@entry=0x3eec100, csn=csn@entry=0x3e30220)

at ldap/servers/plugins/replication/repl5_agmt.c:3036

#6 0xb34bd424 in write_changelog_and_ruv (pb=pb@entry=0x2cb54a0) at ldap/servers/plugins/replication/repl5_plugins.c:1124

#7 0xb34be7ec in multimaster_be_betxnpostop_add (pb=0x2cb54a0) at ldap/servers/plugins/replication/repl5_plugins.c:855

#8 0xb34be880 in multimaster_mmr_postop (pb=<optimized out>, flags=560) at ldap/servers/plugins/replication/repl5_plugins.c:616

#9 0xb6e8fcf0 in plugin_call_mmr_plugin_postop (pb=pb@entry=0x2cb54a0, e=e@entry=0x0, flags=flags@entry=560) at ldap/servers/slapd/plugin_mmr.c:65

#10 0xb35ba870 in ldbm_back_add (pb=0x2cb54a0) at ldap/servers/slapd/back-ldbm/ldbm_add.c:1218

#11 0xb6e2ce4c in op_shared_add (pb=pb@entry=0x2cb54a0) at ldap/servers/slapd/add.c:679

#12 0xb6e2d35c in add_internal_pb (pb=pb@entry=0x2cb54a0) at ldap/servers/slapd/add.c:407

#13 0xb6e2e0f8 in slapi_add_internal_pb (pb=pb@entry=0x2cb54a0) at ldap/servers/slapd/add.c:332

#14 0xb368db50 in ipa_topo_util_segment_write (tconf=tconf@entry=0x2cb5860, tsegm=tsegm@entry=0x3f2d9a0) at topology_util.c:1251

#15 0xb368e01c in ipa_topo_util_update_agmt_list (conf=0x2cb5860, repl_segments=<optimized out>) at topology_util.c:696

#16 0xb368891c in ipa_topo_apply_shared_config () at topology_init.c:165

#17 0xb6e4e65c in eq_call_all () at ldap/servers/slapd/eventq.c:278

#18 0xb6e4e65c in eq_loop (arg=<optimized out>) at ldap/servers/slapd/eventq.c:323

#19 0xb6982d68 in _pt_root (arg=0x2c8da40) at ../../.././nspr/pr/src/pthreads/ptthread.c:201

#20 0xb6906ee8 in start_thread (arg=0x9e0bc280) at pthread_create.c:465

#21 0xb6785da8 in None () at ../sysdeps/unix/sysv/linux/arm/clone.S:73

On Tue, May 15, 2018 at 3:01 AM, thierry bordaz <tbordaz@redhat.com> wrote:

Hi Jonathan,

This problem looks new to me and has something specific to your environment.
I think the best approach is to continue to debug on your system if you have the possibility to do so.

From strace we can see that DS started smoothly (created its pid file then notified systemd it was running fine). According to the pstack nunc-stans was running and was able to accept network events even if it appears it detected no incoming connection.
So the server should be ready to serve for some seconds (more than a minute), then it crashed with one thread dereferencing likely a wrong pointer.

Could you attach a debugger when the server is started and wait for the sigsegv to occur. Then confirm the crashing thread backstack.
If it is confirmed, I am afraid this is a stack corruption and valgrind could help (http://www.port389.org/docs/389ds/FAQ/faq.html#debugging-memory-growthinvalid-access-with-valgrind).

best regards
thierry

On 05/14/2018 10:20 PM, Jonathan Vaughn wrote:

Here's a strace from before it dies. Most of the elapsed time is it waiting on some futex call it looks like near the end, when it finally "returns" (from lack of strace output for duration of call I assume it didn't actually return but SIGSEGV in that call) and strace prints ' = ?' on the futex it then immediately reports SIGSEGV after. So maybe the problem is that futex call, which may mean the problem is not directly in 389DS / FreeIPA itself?

15:13:31.626587 (+ 0.000630) listen(8, 128) = 0 <0.000068>

15:13:31.626857 (+ 0.000235) listen(9, 128) = 0 <0.000048>

15:13:31.627111 (+ 0.000251) clock_gettime(CLOCK_MONOTONIC, {tv_sec=1464932, tv_nsec=41120614}) = 0 <0.000085>

15:13:31.627457 (+ 0.000356) clock_gettime(CLOCK_REALTIME, {tv_sec=1526328811, tv_nsec=627560772}) = 0 <0.000043>

15:13:31.631233 (+ 0.003839) clock_gettime(CLOCK_MONOTONIC, {tv_sec=1464932, tv_nsec=45286796}) = 0 <0.000077>

15:13:31.631720 (+ 0.000427) clock_gettime(CLOCK_MONOTONIC, {tv_sec=1464932, tv_nsec=45661430}) = 0 <0.000042>

15:13:31.631955 (+ 0.000220) clock_gettime(CLOCK_REALTIME, {tv_sec=1526328811, tv_nsec=632049036}) = 0 <0.000047>

15:13:31.635669 (+ 0.003785) clock_gettime(CLOCK_MONOTONIC, {tv_sec=1464932, tv_nsec=49725840}) = 0 <0.000146>

15:13:31.636484 (+ 0.000784) write(16, "a", 1) = 1 <0.000118>

15:13:31.636855 (+ 0.000341) sched_yield() = 0 <0.000252>

15:13:31.637322 (+ 0.000470) futex(0x1cb57a0, FUTEX_WAKE_PRIVATE, 1) = 1 <0.000088>

15:13:31.637897 (+ 0.000610) write(16, "a", 1) = 1 <0.000221>

15:13:31.638394 (+ 0.000467) sched_yield() = 0 <0.000047>

15:13:31.638619 (+ 0.000202) futex(0x1cb5710, FUTEX_WAKE_PRIVATE, 1) = 1 <0.000065>

15:13:31.638908 (+ 0.000298) openat(AT_FDCWD, "/var/run/dirsrv/slapd-COMPANY-INTERNAL.pid", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 33 <0.000831>

15:13:31.640260 (+ 0.001387) getpid() = 32353 <0.000077>

15:13:31.640558 (+ 0.000256) fstat64(33, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 <0.000119>

15:13:31.641106 (+ 0.000556) write(33, "32353\n", 6) = 6 <0.000127>

15:13:31.641472 (+ 0.000362) close(33) = 0 <0.000519>

15:13:31.642216 (+ 0.000758) chmod("/var/run/dirsrv/slapd-COMPANY-INTERNAL.pid", 0644) = 0 <0.000152>

15:13:31.642900 (+ 0.000679) clock_gettime(CLOCK_REALTIME, {tv_sec=1526328811, tv_nsec=643020294}) = 0 <0.000056>

15:13:31.643495 (+ 0.000590) write(2, "[14/May/2018:15:13:31.643020294 "..., 134) = 134 <0.002697>

15:13:31.646515 (+ 0.003052) clock_gettime(CLOCK_REALTIME, {tv_sec=1526328811, tv_nsec=646694394}) = 0 <0.000075>

15:13:31.646892 (+ 0.000337) write(4, "[14/May/2018:15:13:31.646694394 "..., 134) = 134 <0.000522>

15:13:31.647841 (+ 0.000973) fsync(4) = 0 <0.005967>

15:13:31.654425 (+ 0.006617) clock_gettime(CLOCK_REALTIME, {tv_sec=1526328811, tv_nsec=654598946}) = 0 <0.000253>

15:13:31.655137 (+ 0.000717) write(2, "[14/May/2018:15:13:31.654598946 "..., 136) = 136 <0.002427>

15:13:31.658312 (+ 0.003165) clock_gettime(CLOCK_REALTIME, {tv_sec=1526328811, tv_nsec=658486117}) = 0 <0.000251>

15:13:31.659032 (+ 0.000682) write(4, "[14/May/2018:15:13:31.658486117 "..., 136) = 136 <0.000346>

15:13:31.659623 (+ 0.000595) fsync(4) = 0 <0.003311>

15:13:31.663230 (+ 0.003642) getpid() = 32353 <0.000045>

15:13:31.663732 (+ 0.000454) socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 33 <0.000296>

15:13:31.664760 (+ 0.001048) getsockopt(33, SOL_SOCKET, SO_SNDBUF, [163840], [4]) = 0 <0.000108>

15:13:31.665141 (+ 0.000386) setsockopt(33, SOL_SOCKET, SO_SNDBUFFORCE, [8388608], 4) = -1 EPERM (Operation not permitted) <0.000051>

15:13:31.665500 (+ 0.000334) setsockopt(33, SOL_SOCKET, SO_SNDBUF, [8388608], 4) = 0 <0.000229>

15:13:31.665973 (+ 0.000468) sendmsg(33, {msg_name={sa_family=AF_UNIX, sun_path="/run/systemd/notify"}, msg_namelen=21, msg_iov=[{iov_base="READY=1\nSTATUS=slapd started: Re"..., iov_len=69}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 69 <0.000600>

15:13:31.667270 (+ 0.001334) close(33) = 0 <0.003140>

15:13:31.677320 (+ 0.010146) futex(0xb31122e8, FUTEX_WAIT, 32357, NULL) = ?

15:14:27.661809 (+ 55.984448) +++ killed by SIGSEGV (core dumped) +++

On Mon, May 14, 2018 at 10:27 AM, thierry bordaz <tbordaz@redhat.com> wrote:

Hi Jonathan,

This is weird as the crashing thread stack looks truncated (did you copy/paste all of it ?)

Thread 1 (Thread 0x9e13c280 (LWP 17245)):

#0 0xb67bbf2e in strlen () at /lib/libc.so.6

#1 0xb6a06b40 in dosprintf () at /lib/libnspr4.so

#2 0x00000000 in None ()

Did you install 389-ds-base-debuginfo ?
How did you get that backtrace ? from a core dumped, pstack ? Can you attach a debugger before the crash occurs ?

It looks it crashed soon at startup, could it be related to a broken dse.ldif. It should exists a dse.ldif.OK, is it possibly to try to start with it ?

best regards
thierry

On 05/12/2018 01:22 AM, Jonathan Vaughn via FreeIPA-users wrote:

Not sure if it makes a difference... I was looking into this again and realized I had a bunch of messages from gdb telling me to install more debuginfo. I've done that now, here it is again freshly run through gdb

GNU gdb (GDB) Fedora 8.0.1-36.fc27

Copyright (C) 2017 Free Software Foundation, Inc.

License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>

This is free software: you are free to change and redistribute it.

There is NO WARRANTY, to the extent permitted by law. Type "show copying"

and "show warranty" for details.

This GDB was configured as "armv7hl-redhat-linux-gnueabi".

Type "show configuration" for configuration details.

For bug reporting instructions, please see:

<http://www.gnu.org/software/gdb/bugs/>.

Find the GDB manual and other documentation resources online at:

<http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".

Type "apropos word" to search for commands related to "word"...

Reading symbols from /usr/sbin/ns-slapd...Reading symbols from /usr/lib/debug/usr/sbin/ns-slapd-1.3.7.10-1.fc27.arm.debug...done.

done.
...

Thread 1 (Thread 0x9e13c280 (LWP 17245)):

#0 0xb67bbf2e in strlen () at /lib/libc.so.6

#1 0xb6a06b40 in dosprintf () at /lib/libnspr4.so

#2 0x00000000 in None ()

On Tue, May 8, 2018 at 7:52 AM, Rob Crittenden <rcritten@redhat.com> wrote:

Jonathan Vaughn via FreeIPA-users wrote:

Still trying to figure this out. It looks like slapd is dying, I thought it was still running for some reason.

slapd is dying to segfault. strace of it happening doesn't seem to reveal much:

A stack trace would very much help trying to track down the cause.

http://directory.fedoraproject.org/docs/389ds/FAQ/faq.html#debugging-crashes

rob

18:32:41.543717 (+ 0.000801) openat(AT_FDCWD, "/var/run/dirsrv/slapd-COMPANY-INTERNAL.pid", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 32
18:32:41.544907 (+ 0.001195) getpid() = 16014
18:32:41.545269 (+ 0.000329) fstat64(32, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0
18:32:41.545799 (+ 0.000536) write(32, "16014\n", 6) = 6
18:32:41.546603 (+ 0.000818) close(32) = 0
18:32:41.547061 (+ 0.000448) chmod("/var/run/dirsrv/slapd-COMPANY-INTERNAL.pid", 0644) = 0
18:32:41.547741 (+ 0.000676) clock_gettime(CLOCK_REALTIME, {tv_sec=1525735961, tv_nsec=548030641}) = 0
18:32:41.548324 (+ 0.000587) write(2, "[07/May/2018:18:32:41.548030641 "..., 134) = 134
18:32:41.551096 (+ 0.002840) clock_gettime(CLOCK_REALTIME, {tv_sec=1525735961, tv_nsec=551287555}) = 0
18:32:41.551568 (+ 0.000406) write(4, "[07/May/2018:18:32:41.551287555 "..., 134) = 134
18:32:41.552360 (+ 0.000811) fsync(4) = 0
18:32:41.558499 (+ 0.006170) clock_gettime(CLOCK_REALTIME, {tv_sec=1525735961, tv_nsec=558678099}) = 0
18:32:41.558901 (+ 0.000350) write(2, "[07/May/2018:18:32:41.558678099 "..., 136) = 136
18:32:41.561537 (+ 0.002680) clock_gettime(CLOCK_REALTIME, {tv_sec=1525735961, tv_nsec=561718659}) = 0
18:32:41.562357 (+ 0.000793) write(4, "[07/May/2018:18:32:41.561718659 "..., 136) = 136
18:32:41.563293 (+ 0.001148) fsync(4) = 0
18:32:41.566928 (+ 0.003452) getpid() = 16014
18:32:41.567712 (+ 0.000752) socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 32
18:32:41.568628 (+ 0.000912) getsockopt(32, SOL_SOCKET, SO_SNDBUF, [163840], [4]) = 0
18:32:41.568972 (+ 0.000319) setsockopt(32, SOL_SOCKET, SO_SNDBUFFORCE, [8388608], 4) = -1 EPERM (Operation not permitted)
18:32:41.569548 (+ 0.000589) setsockopt(32, SOL_SOCKET, SO_SNDBUF, [8388608], 4) = 0
18:32:41.570064 (+ 0.000513) sendmsg(32, {msg_name={sa_family=AF_UNIX, sun_path="/run/systemd/notify"}, msg_namelen=21, msg_iov=[{iov_base="READY=1\nSTATUS=slapd started: Re"..., iov_len=69}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 69
18:32:41.570845 (+ 0.000789) close(32) = 0
18:32:41.576358 (+ 0.005575) futex(0xb30ed2e8, FUTEX_WAIT, 16016, NULL) = ?
18:33:01.730774 (+ 20.154428) +++ killed by SIGSEGV +++

_______________________________________________ FreeIPA-users mailing list -- freeipa-users@lists.fedorahosted.org To unsubscribe send an email to freeipa-users-leave@lists.fedorahosted.org