better scheduling has uncovered a recent error in updating the Ref structure to use ainc rather than a lock. the symptom observed was that the authentication server's factotum was crashing at a rate of ~1x per 24 hrs. it was observed that factotum had a segment map with two Stack segments and no Text segment. further investigation showed that one of the Stack segments had the same address as the parent. charles suggested Ref counting, and it became apparent that writes to the bss and shared memory rfork were the keys to the failure. a program was written (see below) to tickle the bug. i was getting one crash per 4 runs. the bug was declared fixed when the program ran 5000 times with no failure. as it turned out, the segment code appears to be relying on locking to provide mutex. the fix is unsatisfying because the locking is hard to reason about. i am not convinced that the new conditions are either minimal or sufficient. thoughts on this would be appreciated. ---- #include <u.h> #include <libc.h> int countslots(int *tab, int ntab) { int i, c; c = 0; for(i = 0; i < ntab; i++) if(tab[i] != 0) c++; return c; } int findslot(int *tab, int ntab, int pid) { int i; for(i = 0; i < ntab; i++) if(tab[i] == pid) return i; return -1; } int emptyslot(int *tab, int ntab) { int i; if((i = findslot(tab, ntab, 0)) >= 0) return i; for(;;){ i = findslot(tab, ntab, waitpid()); if(i == -1) continue; tab[i] = 0; return i; } } static char data[10][4096]; void main(void) { char *p; int i, slot, pid, tab[10]; memset(tab, 0, sizeof tab); for(i = 0; i < 1000; i++){ slot = emptyslot(tab, nelem(tab)); switch(pid = rfork(RFPROC|RFMEM)){ case -1: sysfatal("rfork: %r"); case 0: p = data[slot]; memset(p, 0, 4096); exits(""); default: tab[slot] = pid; break; } } while(countslots(tab, nelem(tab)) > 0){ if((i = findslot(tab, nelem(tab), waitpid())) >= 0) tab[i] = 0; else break; } exits(""); } Reference: /n/atom/patch/applied/seguseafterfree Date: Mon Jun 16 08:21:49 CES 2014 Signed-off-by: quanstro@quanstro.net --- 
/sys/src/nix/port/segment.c Mon Jun 16 08:10:38 2014 +++ /sys/src/nix/port/segment.c Mon Jun 16 08:10:39 2014 @@ -91,12 +91,10 @@ return s; } -#define NHASH 101 -#define SHASH(np) (PTR2UINT(np)%NHASH) - void putseg(Segment *s) { + int n; Pte **pp, **emap; Image *i; @@ -107,19 +105,20 @@ if(i != 0) { lock(i); lock(s); - if(i->s == s && s->ref == 1) + n = decref(s); + if(i->s == s && n == 0) i->s = 0; unlock(i); + unlock(s); } - else + else{ lock(s); - - s->ref--; - if(s->ref != 0) { + n = decref(s); unlock(s); - return; } - unlock(s); + + if(n != 0) + return; qlock(&s->lk); if(i) @@ -135,6 +134,7 @@ free(s->map); if(s->profile != 0) free(s->profile); +// memset(s, 0x22, sizeof *s); free(s); }