# HG changeset patch # User Francisco J Ballesteros # Date 1317317544 0 # Node ID 25e87a450f81c22ad161f26ce030e54ac2ddf34c # Parent 001d20def327de8461ae1b0d513668fcc7ea6e60 memfixes: tagged physallocs and closer to smp This included a set of changes that - prepare the mmu for smp (in fact, it's all in there, but there's still a bug and smp is still disabled). - change physalloc so we can tag allocs with a void* (that's to be used to learn which Page* are used for particular physallocs, to replace pages to get big ones when there are none). R=nix-dev, rminnich, nemo CC=nix-dev http://codereview.appspot.com/5149042 diff -r 001d20def327 -r 25e87a450f81 sys/src/nix/k10/apic.c --- a/sys/src/nix/k10/apic.c Tue Sep 27 14:33:27 2011 -0700 +++ b/sys/src/nix/k10/apic.c Thu Sep 29 17:32:24 2011 +0000 @@ -303,11 +303,11 @@ microdelay((TK2MS(1)*1000/apmachno) * m->machno); apicrput(Tic, apic->max); - if(apic->machno == 0){ + if(apic->machno == 0) intrenable(IdtTIMER, apictimer, 0, -1, "APIC timer"); - apicrput(Tlvt, Periodic|IrqTIMER); + apicrput(Tlvt, Periodic|IrqTIMER); + if(m->machno == 0) apicrput(Tp, 0); - } xapicmachptr[apicno] = sys->machptr[m->machno]; diff -r 001d20def327 -r 25e87a450f81 sys/src/nix/k10/fns.h --- a/sys/src/nix/k10/fns.h Tue Sep 27 14:33:27 2011 -0700 +++ b/sys/src/nix/k10/fns.h Thu Sep 29 17:32:24 2011 +0000 @@ -34,6 +34,7 @@ int decref(Ref*); void delay(int); void dumpmmu(Proc*); +void dumpmmuwalk(u64int pa); void dumpptepg(int lvl,uintptr pa); #define evenaddr(x) /* x86 doesn't care */ int fpudevprocio(Proc*, void*, long, uintptr, int); diff -r 001d20def327 -r 25e87a450f81 sys/src/nix/k10/mem.h --- a/sys/src/nix/k10/mem.h Tue Sep 27 14:33:27 2011 -0700 +++ b/sys/src/nix/k10/mem.h Thu Sep 29 17:32:24 2011 +0000 @@ -44,8 +44,8 @@ * 2M pages * these defines must go. */ -#define BIGPGSZ (2*MiB) #define BIGPGSHFT 21 +#define BIGPGSZ (1ull<pml4->pa); } +void +dumpmmuwalk(u64int addr) +{ + int l; + PTE *pte, *pml4; + + pml4 = UINT2PTR(m->pml4->va); + if((l = mmuwalk(pml4, addr, 3, &pte, nil)) >= 0) + print("cpu%d: mmu l%d pte %#p = %llux\n", m->machno, l, pte, *pte); + if((l = mmuwalk(pml4, addr, 2, &pte, nil)) >= 0) + print("cpu%d: mmu l%d pte %#p = %llux\n", m->machno, l, pte, *pte); + if((l = mmuwalk(pml4, addr, 1, &pte, nil)) >= 0) + print("cpu%d: mmu l%d pte %#p = %llux\n", m->machno, l, pte, *pte); + if((l = mmuwalk(pml4, addr, 0, &pte, nil)) >= 0) + print("cpu%d: mmu l%d pte %#p = %llux\n", m->machno, l, pte, *pte); +} + static Page mmuptpfreelist; static Page* @@ -173,7 +190,9 @@ { PTE *pte; Page *page; + Mpl pl; + pl = splhi(); if(proc->newtlb){ /* * NIX: We cannot clear our page tables if they are going to @@ -198,6 +217,7 @@ tssrsp0(STACKALIGN(PTR2UINT(proc->kstack+KSTACK))); cr3put(m->pml4->pa); + splx(pl); } void @@ -226,6 +246,31 @@ cr3put(m->pml4->pa); } +static void +checkpte(uintmem ppn, void *a) +{ + int l; + PTE *pte, *pml4; + u64int addr; + + addr = PTR2UINT(a); + pml4 = UINT2PTR(m->pml4->va); + pte = 0; + if((l = mmuwalk(pml4, addr, 3, &pte, nil)) < 0 || (*pte&PteP) == 0) + goto Panic; + else if((l = mmuwalk(pml4, addr, 2, &pte, nil)) < 0 || (*pte&PteP) == 0) + goto Panic; + else if(*pte&PtePS) + return; + else if((l = mmuwalk(pml4, addr, 1, &pte, nil)) < 0 || (*pte&PteP) == 0) + goto Panic; + return; +Panic: + panic("cpu%d: checkpte l%d ppn %#ullx kadr %#ullx pte %#p = %llux\n", + m->machno, l, ppn, KADDR(ppn), pte, *pte); + +} + /* * pg->pgszi indicates the page size in m->pgsz[] used for the mapping. * For the user, it can be either 2*MiB or 1*GiB pages. @@ -241,6 +286,7 @@ Mpl pl; uintmem pa; +uintmem ppn; pa = pg->pa; DBG("up %#p mmuput %#p %#Px %#ux\n", up, va, pa, attr); @@ -261,7 +307,7 @@ if(user){ if(pgsz == 2*MiB && lvl == 1) /* use 2M */ break; - if(pgsz == 1*GiB && lvl == 2) /* use 1G */ + if(pgsz == 1ull*GiB && lvl == 2) /* use 1G */ break; } for(page = up->mmuptp[lvl]; page != nil; page = page->next){ @@ -284,11 +330,15 @@ m->pml4->daddr = x+1; } x = PTLX(va, lvl-1); - pte = UINT2PTR(KADDR(PPN(*pte))); +ppn = PPN(*pte); + + pte = UINT2PTR(KADDR(ppn)); pte += x; +ppn += x; prev = page; } +checkpte(ppn, pte); *pte = pa|PteU; if(user) switch(pgsz){ @@ -544,7 +594,11 @@ uintmem pa; PTE *pte; - DBG("mmuwalk%d: va %#p level %d\n", m->machno, va, level); + Mpl pl; + + pl = splhi(); + if(DBGFLG > 1) + DBG("mmuwalk%d: va %#p level %d\n", m->machno, va, level); pte = &pml4[PTLX(va, 3)]; for(l = 3; l >= 0; l--){ if(l == level) @@ -564,7 +618,7 @@ pte += PTLX(va, l-1); } *ret = pte; - + splx(pl); return l; } @@ -599,9 +653,7 @@ void mmuinit(void) { - int l; uchar *p; - PTE *pte, *pml4; Page *page; u64int o, pa, r, sz; @@ -678,15 +730,7 @@ assert((pdeget(PDMAP) & ~(PteD|PteA)) == (PADDR(sys->pd)|PteRW|PteP)); - pml4 = UINT2PTR(m->pml4->va); - if((l = mmuwalk(pml4, KZERO, 3, &pte, nil)) >= 0) - print("l %d %#p %llux\n", l, pte, *pte); - if((l = mmuwalk(pml4, KZERO, 2, &pte, nil)) >= 0) - print("l %d %#p %llux\n", l, pte, *pte); - if((l = mmuwalk(pml4, KZERO, 1, &pte, nil)) >= 0) - print("l %d %#p %llux\n", l, pte, *pte); - if((l = mmuwalk(pml4, KZERO, 0, &pte, nil)) >= 0) - print("l %d %#p %llux\n", l, pte, *pte); + dumpmmuwalk(KZERO); mmuphysaddr(PTR2UINT(end)); } diff -r 001d20def327 -r 25e87a450f81 sys/src/nix/k10/physalloc.c --- a/sys/src/nix/k10/physalloc.c Tue Sep 27 14:33:27 2011 -0700 +++ b/sys/src/nix/k10/physalloc.c Thu Sep 29 17:32:24 2011 +0000 @@ -19,6 +19,9 @@ BKmax = 30, /* Maximum lg2 */ Ndoms = 16, /* Max # of domains */ + + Used = 0, + Avail = 1, }; @@ -27,10 +30,11 @@ typedef struct Buddy Buddy; struct Buddy { - int tag; - int kval; + short tag; /* Used or Avail */ + short kval; uint next; uint prev; + void *p; }; /* @@ -100,6 +104,7 @@ * Find buddy. */ l = &blocks[BLOCK(b,i)]; + l->p = nil; DBG("\tbsl: BLOCK(b,i) %d index %ulld kval %d\n", BLOCK(b,i), BLOCK(b,i)/((1<kval)/b->bminsz), l->kval); if((BLOCK(b,i)/((1<kval)/b->bminsz)) & 1) /* simpler test? */ @@ -115,11 +120,11 @@ * buddy isn't free; * buddy has been subsequently split again. */ - if(l->kval == b->kmax || p->tag == 0 || (p->tag == 1 && p->kval != l->kval)){ + if(l->kval == b->kmax || p->tag == Used || (p->tag == Avail && p->kval != l->kval)){ /* * Put on list. */ - l->tag = 1; + l->tag = Avail; l->next = avail[l->kval].next; l->prev = 0; if(l->next != 0) @@ -129,8 +134,8 @@ b->nfree += size/b->bminsz; unlock(&budlock); - DBG("bsl: free @ i %d BLOCK(b,i) %d kval %d next %d tag %d\n", - i, BLOCK(b,i), l->kval, l->next, l->tag); + DBG("bsl: free @ i %d BLOCK(b,i) %d kval %d next %d %s\n", + i, BLOCK(b,i), l->kval, l->next, l->tag?"avail":"used"); return; } @@ -148,7 +153,7 @@ blocks[BLOCK(b,p->next)].prev = p->prev; p->next = 0; } - p->tag = 0; + p->tag = Used; /* * Now can try to merge this larger block. @@ -159,8 +164,8 @@ l = p; i = l - blocks + INDEX(b,b->memory); l->kval++; - DBG("bsl: merge @ i %d BLOCK(b,i) %d kval %d next %d tag %d\n", - i, BLOCK(b,i), l->kval, l->next, l->tag); + DBG("bsl: merge @ i %d BLOCK(b,i) %d kval %d next %d tag %s\n", + i, BLOCK(b,i), l->kval, l->next, l->tag?"avail":"used"); goto S1; } @@ -180,6 +185,37 @@ panic("physfree: no bal"); } +static void* +xphystag(Bal *b, uintmem data) +{ + uint i; + Buddy *l, *p; + Buddy *blocks, *avail; + + DBG("phystag\n"); + + blocks = b->blocks; + avail = b->avail; + + if(data == 0 /*|| !ALIGNED(data, b->bminsz)*/) + return; + i = INDEX(b,data); + return blocks[BLOCK(b,i)].p; +} + +void* +phystag(uintmem data) +{ + Bal *b; + int i; + + for(i = 0; i < Ndoms; i++){ + b = &bal[i]; + if(b->base <= data && data < b->base + b->size) + return xphystag(b, data); + } + return nil; +} static uchar lg2table[256] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, @@ -226,7 +262,7 @@ } static uintmem -xphysalloc(Bal *b, u64int size) +xphysalloc(Bal *b, u64int size, void *tag) { uint i, j, k; Buddy *l, *p; @@ -268,12 +304,12 @@ */ i = avail[j].next; l = &blocks[BLOCK(b,i)]; - DBG("bsr: block @ i %d BLOCK(b,i) %d kval %d next %d tag %d\n", - i, BLOCK(b,i), l->kval, l->next, l->tag); + DBG("bsr: block @ i %d BLOCK(b,i) %d kval %d next %d %s\n", + i, BLOCK(b,i), l->kval, l->next, l->tag?"avail":"used"); avail[j].next = l->next; blocks[avail[j].next].prev = 0; l->prev = l->next = 0; - l->tag = 0; + l->tag = Used; l->kval = k; /* @@ -285,26 +321,28 @@ */ j--; p = &blocks[BLOCK(b,i) + (UNO<bminsz)]; - p->tag = 1; + p->tag = Avail; p->kval = j; p->next = avail[j].next; p->prev = 0; if(p->next != 0) blocks[BLOCK(b,p->next)].prev = i + (UNO<bminsz); avail[j].next = i + (UNO<bminsz); - DBG("bsr: split @ i %d BLOCK(b,i) %ld j %d next %d (%d) tag %d\n", - i, p - blocks, j, p->next, BLOCK(b,p->next), p->tag); + DBG("bsr: split @ i %d BLOCK(b,i) %ld j %d next %d (%d) %s\n", + i, p - blocks, j, p->next, BLOCK(b,p->next), + p->tag?"avail":"used"); } b->nfree -= size/b->bminsz; unlock(&budlock); m = b->memory + b->bminsz*BLOCK(b,i); assert(m >= b->base && m < b->base + b->size); + blocks[BLOCK(b,i)].p = tag; return m; } uintmem -physalloc(u64int size, int *colorp) +physalloc(u64int size, int *colorp, void *tag) { int i, color; uintmem m; @@ -315,13 +353,13 @@ color %= ndoms; if(bal[color].kmin > 0){ *colorp = color; - m = xphysalloc(&bal[color], size); + m = xphysalloc(&bal[color], size, tag); } } if(m == 0) for(i = 0; i < ndoms; i++) if(bal[i].kmin > 0) - if((m = xphysalloc(&bal[i], size)) != 0){ + if((m = xphysalloc(&bal[i], size, tag)) != 0){ *colorp = i; return m; } @@ -336,7 +374,7 @@ blocks = b->blocks; for(i = 0; i < (UNO<<(b->kmax-b->kmin+1)); i++){ - if(blocks[i].tag == 0) + if(blocks[i].tag == Used) continue; print("blocks[%d]: size %d prev %d next %d\n", i, 1<blocks[i].kval, blocks[i].prev, blocks[i].next); diff -r 001d20def327 -r 25e87a450f81 sys/src/nix/k10/trap.c --- a/sys/src/nix/k10/trap.c Tue Sep 27 14:33:27 2011 -0700 +++ b/sys/src/nix/k10/trap.c Thu Sep 29 17:32:24 2011 +0000 @@ -296,7 +296,7 @@ * We cannot do this in trap() because application cores * would update m->cr2 with their cr2 values upon page faults, * and then call trap(). - * If we do this in trap(), we would overwrite that with our own one. + * If we do this in trap(), we would overwrite that with our own cr2. */ if(ureg->type == VectorPF) m->cr2 = cr2get(); @@ -319,9 +319,6 @@ vno = ureg->type; -if(m->machno != 0 && m->nixtype != NIXAC) -print("cpu%d trap %ulld\n", m->machno, ureg->type); - m->perf.intrts = perfticks(); user = userureg(ureg); if(user && (m->nixtype == NIXTC)){ @@ -607,6 +604,7 @@ */ if(!user && (!insyscall || up->nerrlab == 0)){ dumpregs(ureg); + dumpmmuwalk(m->cr2); panic("fault: %#llux\n", addr); } sprint(buf, "sys: trap: fault %s addr=%#llux", diff -r 001d20def327 -r 25e87a450f81 sys/src/nix/port/devcons.c --- a/sys/src/nix/port/devcons.c Tue Sep 27 14:33:27 2011 -0700 +++ b/sys/src/nix/port/devcons.c Thu Sep 29 17:32:24 2011 +0000 @@ -416,7 +416,7 @@ panicking = 1; pl = splhi(); - strcpy(buf, "panic: "); + seprint(buf, buf+sizeof buf, "panic: cpu%d: ", m->machno); va_start(arg, fmt); n = vseprint(buf+strlen(buf), buf+sizeof(buf), fmt, arg) - buf; va_end(arg); diff -r 001d20def327 -r 25e87a450f81 sys/src/nix/port/page.c --- a/sys/src/nix/port/page.c Tue Sep 27 14:33:27 2011 -0700 +++ b/sys/src/nix/port/page.c Thu Sep 29 17:32:24 2011 +0000 @@ -106,11 +106,12 @@ return nil; } memset(pg, 0, sizeof *pg); - if((pg->pa = physalloc(size, &color)) == 0){ + if((pg->pa = physalloc(size, &color, pg)) == 0){ DBG("pgalloc: physalloc failed for size %#ulx color %d\n", size, color); free(pg); return nil; } +assert(phystag(pg->pa) == pg); pg->pgszi = si; /* size index */ incref(&pga.pgsza[si].npages); pg->color = color; @@ -412,8 +413,22 @@ } pageunchain(np); - /* don't pagechaintail(np) here; see below */ - + pagechaintail(np); + /* + * XXX - here's a bug? - np is on the freelist but it's not really free. + * when we unlock palloc someone else can come in, decide to + * use np, and then try to lock it. they succeed after we've + * run copypage and cachepage and unlock(np). then what? + * they call pageunchain before locking(np), so it's removed + * from the freelist, but still in the cache because of + * cachepage below. if someone else looks in the cache + * before they remove it, the page will have a nonzero ref + * once they finally lock(np). + * + * What I know is that not doing the pagechaintail, but + * doing it at the end, to prevent the race, leads to a + * deadlock, even following the pga, pg lock ordering. -nemo + */ lock(np); unlock(&pga); @@ -426,22 +441,6 @@ unlock(np); uncachepage(p); - /* - * This is here to prevent a bug(?) - * np is on the freelist but it's not really free. - * when we unlock palloc someone else can come in, decide to - * use np, and then try to lock it. they succeed after we've - * run copypage and cachepage and unlock(np). then what? - * they call pageunchain before locking(np), so it's removed - * from the freelist, but still in the cache because of - * cachepage below. if someone else looks in the cache - * before they remove it, the page will have a nonzero ref - * once they finally lock(np). - * Because np was not chained until now, nobody could see it. - */ - lock(&pga); - pagechaintail(np); - unlock(&pga); return 0; } diff -r 001d20def327 -r 25e87a450f81 sys/src/nix/port/portfns.h --- a/sys/src/nix/port/portfns.h Tue Sep 27 14:33:27 2011 -0700 +++ b/sys/src/nix/port/portfns.h Thu Sep 29 17:32:24 2011 +0000 @@ -230,10 +230,11 @@ void pgfree(Page*); void pgrpcpy(Pgrp*, Pgrp*); void pgrpnote(ulong, char*, long, int); -uintmem physalloc(u64int, int*); +uintmem physalloc(u64int, int*, void*); void physdump(void); void physfree(uintmem, u64int); void physinit(uintmem, u64int); +void* phystag(uintmem); void pio(Segment*, uintptr, ulong, Page**); #define poperror() up->nerrlab-- int postnote(Proc*, int, char*, int);