#!/usr/bin/env python
"""
Run a bunch of processes as root that call initgroups concurrently, with a
slight delay between them, to trigger a weird behaviour in sssd after a cold
start. It will print the number of bad answers from sssd.

Usage:

## log in as root (do not run any of these commands through sudo, because it
## would populate the sssd cache)

## check your number of secondary groups, using id for example
# id jbdenis
uid=21489(jbdenis) gid=110(sis) groups=110(sis),3044(CIB),19(floppy),1177(dump-projets),56(netadm),3125(vpn-ssl-admin)

Here I've got 5 secondary groups (sis is my primary group).

## clean up sssd state
/etc/init.d/sssd stop && rm -f /var/lib/sss/mc/* /var/lib/sss/db/* && /etc/init.d/sssd start

## run this program
# python initgroups.py jbdenis 110 5 24 200
wrong number of secondary groups in process 17145 : 0 instead of 5 (sleep 55ms)
wrong number of secondary groups in process 17149 : 0 instead of 5 (sleep 55ms)
2/24 failed

# first parameter is a login
# second parameter is your primary gid
# third parameter is your number of secondary groups
# fourth parameter is the number of processes you want to run concurrently
# the last parameter is the maximum delay in milliseconds before calling
#   initgroups; this delay is randomized up to this maximum

I've got good results with 24 processes and a randomized delay of 200ms
between startups. These parameters are somewhat dependent on the machine
you're running the script on, I guess. You may have to run this test
multiple times before triggering the bug. I'm unable to reproduce the bug
when I use a 0 delay.

!! DON'T FORGET TO CLEAN UP SSSD STATE BEFORE RUNNING THIS TEST AGAIN !!
"""
import sys
import os
import ctypes
import multiprocessing
import Queue
import time
import random


def initgroups(event, results, user, gid, sleep_ms):
    """
    Wrapper around initgroups.

    We will sleep up to sleep_ms milliseconds before calling the real libc
    initgroups. The results parameter is a queue which holds the results.
    """
    sleep = random.randint(0, sleep_ms)
    rsleep = sleep / 1000.0
    #event.wait()  # this synchronization is not necessary to trigger the bug
    time.sleep(rsleep)
    LIBC.initgroups(user, gid)
    # getgroups(0, NULL) returns the size of the supplementary group list;
    # initgroups also added the primary gid to that list, so subtract one
    # to count only the secondary groups.
    results.put((LIBC.getgroups(0, 0) - 1, os.getpid(), sleep))


def queue_to_list(queue):
    """
    Dummy helper: drain the results queue into an iterator.
    """
    try:
        while True:
            yield queue.get(block=False)
    except Queue.Empty:
        pass


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print "usage:", sys.argv[0], "user primary_gid nb_secondary_groups nb_processes max_delay_ms"
        sys.exit(1)

    user = sys.argv[1]
    gid = int(sys.argv[2])
    secondary_groups_nb = int(sys.argv[3])
    processes = int(sys.argv[4])
    randsleep = int(sys.argv[5])

    LIBC = ctypes.cdll.LoadLibrary("libc.so.6")

    worker_processes = list()
    results = multiprocessing.Queue()
    start = multiprocessing.Event()

    # the parent process takes part in the test as well, once before and
    # once after spawning the workers
    initgroups(start, results, user, gid, randsleep)

    for _ in range(processes):
        worker_processes.append(
            multiprocessing.Process(target=initgroups,
                                    args=(start, results, user, gid, randsleep)))
        worker_processes[-1].start()

    initgroups(start, results, user, gid, randsleep)

    start.set()  # synchronization primitive not useful to reproduce the bug

    for proc in worker_processes:
        proc.join()

    miss = 0
    qsize = 0
    for (nb, pid, sleep) in queue_to_list(results):
        qsize += 1
        if nb != secondary_groups_nb:
            miss += 1
            print "wrong number of secondary groups in process %d : %d instead of %d (sleep %dms)" % (pid, nb, secondary_groups_nb, sleep)

    if miss > 0:
        print "%d/%d failed" % (miss, qsize)
        sys.exit(1)
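
# Optional cross-check (a minimal sketch, not part of the reproduction steps
# above): compute the expected number of secondary groups with the standard
# pwd/grp modules instead of reading it off the output of id, so the third
# command-line parameter can be double-checked. Caveat: grp.getgrall()
# relies on group enumeration, which is often disabled in sssd, so this
# count is only an approximation of what initgroups/getgroups will report.
import pwd
import grp


def count_secondary_groups(login):
    """Number of groups listing login as a member, primary group excluded."""
    primary_gid = pwd.getpwnam(login).pw_gid
    return len([group for group in grp.getgrall()
                if login in group.gr_mem and group.gr_gid != primary_gid])

# Example (login taken from the docstring above):
#     print count_secondary_groups("jbdenis")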