Changed the name of nrun to nrunhz to make the difference with nrunning clearer. Reference: /n/patches.lsub.org/patch/newmwait Date: Mon Jul 2 10:41:34 CES 2012 Signed-off-by: paurea@lsub.org # rm /usr/paurea/portfns.h # rm /usr/paurea/taslock.c # rm /usr/paurea/tcklock.c # rm /usr/paurea/acore.c # rm /usr/paurea/arch.c # rm /usr/paurea/archk10.c # rm /usr/paurea/fns.h # rm /usr/paurea/l64v.s --- /sys/src/nix/port/portfns.h Thu Jun 14 09:36:25 2012 +++ /sys/src/nix/port/portfns.h Thu Jun 14 11:54:11 2012 @@ -193,7 +193,7 @@ ulong ms2tk(ulong); void mul64fract(uvlong*, uvlong, uvlong); void muxclose(Mnt*); -void (*mwait)(void *, int); +void (*waitwhile)(void *, uintptr); Chan* namec(char*, int, int, int); void nameerror(char*, char*); Chan* newchan(void); @@ -240,7 +240,7 @@ void* phystag(uintmem); void pio(Segment*, uintptr, ulong, Page**, int); #define poperror() up->nerrlab-- -void portmwait(void*, int); +void portwaitwhile(void*, uintptr); int postnote(Proc*, int, char*, int); int pprint(char*, ...); int preempted(void); --- /sys/src/nix/port/taslock.c Thu Jun 14 09:36:25 2012 +++ /sys/src/nix/port/taslock.c Fri Jun 22 11:18:02 2012 @@ -314,10 +314,13 @@ } void -portmwait(void *value, int val) +portwaitwhile(void *value, uintptr val) { - while (*(void**)value == (void *)val) - ; + int i; + /* it just waits for a little while */ + for(i = 0; i<100; i++) + if(*(uintptr *)value == val) + break; } -void (*mwait)(void *, int) = portmwait; +void (*waitwhile)(void *, uintptr) = portwaitwhile; --- /sys/src/nix/port/proc.c Thu Jun 14 09:36:25 2012 +++ /sys/src/nix/port/proc.c Mon Jul 2 10:39:08 2012 @@ -187,6 +187,8 @@ stackok(); + if(up->state != Exotic) + adec(&run.nrunning); procsave(up); mmuflushtlb(m->pml4->pa); if(setlabel(&up->sched)){ @@ -230,6 +232,12 @@ return run.runvec & ~((1<<(up->priority+1))-1); } +int +anyactive(void) +{ + return run.runvec || run.nrunning; +} + /* * here once per clock tick to see if we should resched */ @@ -496,6 +504,9 @@ void ready(Proc *p) { + if(p->state == Exotic) + adec(&run.nrunning); + schedready(procsched(p), p, 0); } @@ -644,7 +655,8 @@ goto found; } /* waste time or halt the CPU */ - idlehands(); + if(!anyactive()) + idlehands(); /* remember how much time we're here */ now = perfticks(); m->perf.inidle += now-start; @@ -730,7 +742,8 @@ } /* waste time or halt the CPU */ - idlehands(); + if(!anyactive()) + idlehands(); if(isbooting(m)) tcquiesce(); /* remember how much time we're here */ @@ -754,6 +767,7 @@ } if(p->trace) proctrace(p, SRun, 0); + ainc(&run.nrunning); return p; } @@ -789,7 +803,8 @@ break; tcquiesce(); } - idlehands(); + //idlehands(); + waitwhile(&m->proc, (uintptr)nil); } now = perfticks(); m->perf.inidle += now-start; @@ -806,6 +821,7 @@ } if(p->trace) proctrace(p, SRun, 0); + ainc(&run.nrunning); return p; } @@ -1835,7 +1851,7 @@ p = m->proc; if(p) { if(m->machno == 1) - run.nrun++; + run.nrunhz++; p->time[p->insyscall]++; } @@ -1868,8 +1884,8 @@ * approximately the load over the last second, * with a tail lasting about 5 seconds. 
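 * For example (assuming HZ = 100): each tick contributes
 * (run.nrdy+n)*1000, so a load of 1000 means one process
 * continuously runnable.  Once the queues drain, the old value
 * decays by (HZ-1)/HZ per tick, i.e. to about 1/e (37%) of its
 * value after one second and to about 1% after five.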
 */
-	n = run.nrun;
-	run.nrun = 0;
+	n = run.nrunhz;
+	run.nrunhz = 0;
 	n = (run.nrdy+n)*1000;
 	sys->load = (sys->load*(HZ-1)+n)/HZ;
 }
--- /usr/paurea/proc.c	Thu Jan  1 00:00:00 1970
+++ /usr/paurea/proc.c	Tue Jun 26 14:00:58 2012
@@ -0,0 +1,1897 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"../port/edf.h"
+#include	"errstr.h"
+#include	<trace.h>
+
+enum
+{
+	Scaling=2,
+
+	AMPmincores = 5,
+};
+
+Ref	noteidalloc;
+
+static Ref pidalloc;
+
+static Sched run;
+
+struct Procalloc procalloc;
+
+extern Proc* psalloc(void);
+extern void pshash(Proc*);
+extern void psrelease(Proc*);
+extern void psunhash(Proc*);
+
+static int reprioritize(Proc*);
+static void updatecpu(Proc*);
+
+static void rebalance(void);
+
+char *statename[] =
+{	/* BUG: generate automatically */
+	"Dead",
+	"Moribund",
+	"Ready",
+	"Scheding",
+	"Running",
+	"Queueing",
+	"QueueingR",
+	"QueueingW",
+	"Wakeme",
+	"Broken",
+	"Stopped",
+	"Rendez",
+	"Waitrelease",
+	"Exotic",
+	"Down",
+};
+
+Sched*
+procsched(Proc *)
+{
+	return &run;
+}
+
+/*
+ * bad planning, once more.
+ */
+void
+procinit0(void)
+{
+	run.schedgain = 30;
+}
+
+/*
+ * Always splhi()'ed.
+ */
+void
+schedinit(void)		/* never returns */
+{
+	Edf *e;
+
+	m->inidle = 1;
+	m->proc = nil;
+
+	setlabel(&m->sched);
+	if(up) {
+		if((e = up->edf) && (e->flags & Admitted))
+			edfrecord(up);
+		m->qstart = 0;
+		m->qexpired = 0;
+		coherence();
+		m->proc = 0;
+		switch(up->state) {
+		case Running:
+			ready(up);
+			break;
+		case Moribund:
+			up->state = Dead;
+			stopac();
+			edfstop(up);
+			if (up->edf)
+				free(up->edf);
+			up->edf = nil;
+
+			/*
+			 * Holding locks from pexit:
+			 *	procalloc
+			 *	pga
+			 */
+			mmurelease(up);
+			unlock(&pga);
+
+			psrelease(up);
+			unlock(&procalloc);
+			break;
+		}
+		adec(&run.nrunning);
+		up->mach = nil;
+		updatecpu(up);
+		up = nil;
+	}
+	sched();
+}
+
+/*
+ * Check if the stack has more than 4*KiB free.
+ * Do not call panic, the stack is gigantic.
+ */
+static void
+stackok(void)
+{
+	char dummy;
+
+	if(&dummy < (char*)up->kstack + 4*KiB){
+		print("tc kernel stack overflow, cpu%d stopped\n", m->machno);
+		DONE();
+	}
+}
+
+/*
+ * If changing this routine, look also at sleep().  It
+ * contains a copy of the guts of sched().
+ */
+void
+sched(void)
+{
+	Proc *p;
+
+	if(m->ilockdepth)
+		panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
+			m->machno,
+			m->ilockdepth,
+			up? up->lastilock: nil,
+			(up && up->lastilock)? up->lastilock->pc: 0,
+			getcallerpc(&p+2));
+
+	if(up){
+		/*
+		 * Delay the sched until the process gives up the locks
+		 * it is holding.  This avoids dumb lock loops.
+		 * Don't delay if the process is Moribund.
+		 * It called sched to die.
+		 * But do sched eventually.  This avoids a missing unlock
+		 * from hanging the entire kernel.
+		 * But don't reschedule procs holding palloc or procalloc.
+		 * Those are far too important to be holding while asleep.
+		 *
+		 * This test is not exact.  There can still be a few
+		 * instructions in the middle of taslock when a process
+		 * holds a lock but Lock.p has not yet been initialized.
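+		 *
+		 * The pattern this protects looks like the following
+		 * (a sketch; it assumes the usual port/taslock.c unlock(),
+		 * which itself calls sched() once nlocks drops back to 0
+		 * with delaysched set):
+		 *
+		 *	lock(&l);
+		 *	...		clock ticks here only set up->delaysched
+		 *	unlock(&l);	nlocks is 0 again: the deferred sched() runs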
+ */ + if(up->nlocks) + if(up->state != Moribund) + if(up->delaysched < 20 + || pga.Lock.p == up + || procalloc.Lock.p == up){ + up->delaysched++; + run.delayedscheds++; + ainc(&run.nrunning); + return; + } + up->delaysched = 0; + + splhi(); + /* statistics */ + if(up->nqtrap == 0 && up->nqsyscall == 0) + up->nfullq++; + m->cs++; + + stackok(); + procsave(up); + mmuflushtlb(m->pml4->pa); + if(setlabel(&up->sched)){ + procrestore(up); + spllo(); + return; + } + gotolabel(&m->sched); + } + + m->inidle = 1; + p = runproc(); /* core 0 never returns */ + m->inidle = 0; + + if(!p->edf){ + updatecpu(p); + p->priority = reprioritize(p); + } + up = p; + m->qstart = m->ticks; + up->nqtrap = 0; + up->nqsyscall = 0; + up->state = Running; + up->mach = m; + m->proc = up; + mmuswitch(up); + + assert(!up->wired || up->wired == m); + gotolabel(&up->sched); +} + +int +anyready(void) +{ + return run.runvec; +} + +int +anyhigher(void) +{ + return run.runvec & ~((1<<(up->priority+1))-1); +} + +int +anyactive(void) +{ + return run.runvec != 0 || run.nrunning != 0; +} + +/* + * here once per clock tick to see if we should resched + */ + +void +hzsched(void) +{ + /* once a second, rebalance will reprioritize ready procs */ + if(m->machno == 0){ + rebalance(); + return; + } + + /* with <= 4 cores, we use SMP and core 0 does not set qexpired for us */ + if(sys->nmach <= AMPmincores) + if(m->ticks - m->qstart >= HZ/10) + m->qexpired = 1; + + /* unless preempted, get to run */ + if(m->qexpired && anyready()) + up->delaysched++; + + /* BUG, not enough if the number of cores can change */ + if(isbooting(m) && sys->nmach > AMPmincores) + sched(); + +} + +/* + * here at the end of non-clock interrupts to see if we should preempt the + * current process. Returns 1 if preempted, 0 otherwise. + */ +int +preempted(void) +{ + if(up && up->state == Running) + if(up->preempted == 0) + if(anyhigher()) + if(!active.exiting){ + /* Core 0 is dispatching all interrupts, so no core + * actually running a user process is ever going call preempted, unless + * we consider IPIs for preemption or we distribute interrupts. + * But we are going to use SMP for machines with few cores. + panic("preemted used"); + */ + + up->preempted = 1; + sched(); + splhi(); + up->preempted = 0; + return 1; + } + return 0; +} + +/* + * Update the cpu time average for this particular process, + * which is about to change from up -> not up or vice versa. + * p->lastupdate is the last time an updatecpu happened. + * + * The cpu time average is a decaying average that lasts + * about D clock ticks. D is chosen to be approximately + * the cpu time of a cpu-intensive "quick job". A job has to run + * for approximately D clock ticks before we home in on its + * actual cpu usage. Thus if you manage to get in and get out + * quickly, you won't be penalized during your burst. Once you + * start using your share of the cpu for more than about D + * clock ticks though, your p->cpu hits 1000 (1.0) and you end up + * below all the other quick jobs. Interactive tasks, because + * they basically always use less than their fair share of cpu, + * will be rewarded. + * + * If the process has not been running, then we want to + * apply the filter + * + * cpu = cpu * (D-1)/D + * + * n times, yielding + * + * cpu = cpu * ((D-1)/D)^n + * + * but D is big enough that this is approximately + * + * cpu = cpu * (D-n)/D + * + * so we use that instead. + * + * If the process has been running, we apply the filter to + * 1 - cpu, yielding a similar equation. 
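+ * For instance (assuming HZ = 100, so D = schedgain*HZ*Scaling =
+ * 30*100*2 = 6000): a process at cpu = 1000 that then sleeps for one
+ * second (n = 200) decays to 1000*(6000-200)/6000, about 966; once it
+ * has slept for schedgain = 30 seconds, n is capped at D and cpu
+ * reaches 0 exactly.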
Note that cpu is
+ * stored in fixed point (* 1000).
+ *
+ * Updatecpu must be called before changing up, in order
+ * to maintain accurate cpu usage statistics.  It can be called
+ * at any time to bring the stats for a given proc up-to-date.
+ */
+static void
+updatecpu(Proc *p)
+{
+	int D, n, t, ocpu;
+
+	if(p->edf)
+		return;
+
+	t = sys->ticks*Scaling + Scaling/2;
+	n = t - p->lastupdate;
+	p->lastupdate = t;
+
+	if(n == 0)
+		return;
+	D = run.schedgain*HZ*Scaling;
+	if(n > D)
+		n = D;
+
+	ocpu = p->cpu;
+	if(p != up)
+		p->cpu = (ocpu*(D-n))/D;
+	else{
+		t = 1000 - ocpu;
+		t = (t*(D-n))/D;
+		p->cpu = 1000 - t;
+	}
+
+//iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
+}
+
+/*
+ * On average, p has used p->cpu of a cpu recently.
+ * Its fair share is nmach/m->load of a cpu.  If it has been getting
+ * too much, penalize it.  If it has been getting not enough, reward it.
+ * I don't think you can get much more than your fair share that
+ * often, so most of the queues are for using less.  Having a priority
+ * of 3 means you're just right.  Having a higher priority (up to p->basepri)
+ * means you're not using as much as you could.
+ */
+static int
+reprioritize(Proc *p)
+{
+	int fairshare, n, load, ratio;
+
+	load = sys->load;
+	if(load == 0)
+		return p->basepri;
+
+	/*
+	 * fairshare = 1.000 * conf.nproc * 1.000/load,
+	 * except the decimal point is moved three places
+	 * on both load and fairshare.
+	 */
+	fairshare = (sys->nmach*1000*1000)/load;
+	n = p->cpu;
+	if(n == 0)
+		n = 1;
+	ratio = (fairshare+n/2) / n;
+	if(ratio > p->basepri)
+		ratio = p->basepri;
+	if(ratio < 0)
+		panic("reprioritize");
+//iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
+	return ratio;
+}
+
+/*
+ * add a process to a scheduling queue
+ */
+static void
+queueproc(Sched *sch, Schedq *rq, Proc *p, int locked)
+{
+	int pri;
+
+	pri = rq - sch->runq;
+	if(!locked)
+		lock(sch);
+	else if(canlock(sch))
+		panic("queueproc: locked and can lock");
+	p->priority = pri;
+	p->rnext = 0;
+	if(rq->tail)
+		rq->tail->rnext = p;
+	else
+		rq->head = p;
+	rq->tail = p;
+	rq->n++;
+	sch->nrdy++;
+	sch->runvec |= 1<<pri;
+	if(!locked)
+		unlock(sch);
+}
+
+/*
+ * try to remove a process from a scheduling queue (called splhi)
+ */
+static Proc*
+dequeueproc(Sched *sch, Schedq *rq, Proc *tp)
+{
+	Proc *l, *p;
+
+	if(!canlock(sch))
+		return nil;
+
+	/*
+	 * the queue may have changed before we locked runq,
+	 * refind the target process.
+	 */
+	l = 0;
+	for(p = rq->head; p; p = p->rnext){
+		if(p == tp)
+			break;
+		l = p;
+	}
+
+	/*
+	 * p->mach==0 only when process state is saved
+	 */
+	if(p == 0 || p->mach){
+		unlock(sch);
+		return nil;
+	}
+	if(p->rnext == 0)
+		rq->tail = l;
+	if(l)
+		l->rnext = p->rnext;
+	else
+		rq->head = p->rnext;
+	if(rq->head == nil)
+		sch->runvec &= ~(1<<(rq-sch->runq));
+	rq->n--;
+	sch->nrdy--;
+	if(p->state != Ready)
+		print("dequeueproc %s %d %s\n", p->text, p->pid, statename[p->state]);
+
+	unlock(sch);
+	return p;
+}
+
+static void
+schedready(Sched *sch, Proc *p, int locked)
+{
+	Mpl pl;
+	int pri;
+	Schedq *rq;
+
+	pl = splhi();
+	if(edfready(p)){
+		splx(pl);
+		return;
+	}
+
+	updatecpu(p);
+	pri = reprioritize(p);
+	p->priority = pri;
+	rq = &sch->runq[pri];
+	p->state = Ready;
+	queueproc(sch, rq, p, locked);
+	if(p->trace)
+		proctrace(p, SReady, 0);
+	splx(pl);
+}
+
+/*
+ * ready(p) picks a new priority for a process and sticks it in the
+ * runq for that priority.
+ */
+void
+ready(Proc *p)
+{
+	if(p->state == Exotic)
+		adec(&run.nrunning);
+	schedready(procsched(p), p, 0);
+}
+
+/*
+ * yield the processor and drop our priority
+ */
+void
+yield(void)
+{
+	if(anyready()){
+		/* pretend we just used 1/2 tick */
+		up->lastupdate -= Scaling/2;
+		sched();
+	}
+}
+
+/*
+ * recalculate priorities once a second.  We need to do this
+ * since priorities will otherwise only be recalculated when
+ * the running process blocks.
+ */
+static void
+rebalance(void)
+{
+	Mpl pl;
+	int pri, npri, t;
+	Schedq *rq;
+	Proc *p;
+
+	t = m->ticks;
+	if(t - run.balancetime < HZ)
+		return;
+	run.balancetime = t;
+
+	for(pri=0, rq=run.runq; pri<Npriq; pri++, rq++){
+another:
+		p = rq->head;
+		if(p == nil)
+			continue;
+		if(p->mp != m)
+			continue;
+		if(pri == p->basepri)
+			continue;
+		updatecpu(p);
+		npri = reprioritize(p);
+		if(npri != pri){
+			pl = splhi();
+			p = dequeueproc(&run, rq, p);
+			if(p)
+				queueproc(&run, &run.runq[npri], p, 0);
+			splx(pl);
+			goto another;
+		}
+	}
+}
+
+/*
+ * Process p is ready to run, but there's no available core.
+ * Try to make a core available by
+ * 1. preempting a process with lower priority, or
+ * 2. preempting one with the same priority that has run for more than HZ/10, or
+ * 3. rescheduling one that has run for more than HZ, in the hope that its
+ *    priority gets lowered.
+ */
+static void
+preemptfor(Proc *p)
+{
+	ulong delta;
+	uint i, j, rr;
+	Proc *mup;
+	Mach *mp;
+
+	assert(m->machno == 0);
+	/*
+	 * try to preempt a lower priority process first, default back to
+	 * round robin otherwise.
+	 */
+	for(rr = 0; rr < 2; rr++)
+		for(i = 0; i < MACHMAX; i++){
+			j = pickcore(p->color, i);
+			if((mp = sys->machptr[j]) != nil && mp->nixrole == NIXTC){
+				if(mp == m)
+					continue;
+				if(isbooting(mp)){
+					print("isbooting\n");
+					continue;
+				}
+				/*
+				 * Caution here: mp->proc can change, even die.
+				 */
+				mup = mp->proc;
+				if(mup == nil)		/* one got idle */
+					return;
+				delta = mp->ticks - mp->qstart;
+				if(mup->priority < p->priority){
+					mp->qexpired = 1;
+					return;
+				}
+				if(rr && mup->priority == p->priority && delta > HZ/10){
+					mp->qexpired = 1;
+					return;
+				}
+				if(rr & delta > HZ){
+					mp->qexpired = 1;
+					return;
+				}
+			}
+		}
+}
+
+/*
+ * Scheduling thread, run as the main loop of cpu 0.
+ * Used in AMP sched.
+ */
+static void
+mach0sched(void)
+{
+	Schedq *rq;
+	Proc *p;
+	Mach *mp;
+	ulong start, now;
+	int n, i, j;
+
+	assert(m->machno == 0);
+	acmodeset(NIXKC);		/* we don't time share any more */
+	n = 0;
+	start = perfticks();
+loop:
+
+	/*
+	 * find a ready process that we might run.
+	 */
+	spllo();
+	for(rq = &run.runq[Nrq-1]; rq >= run.runq; rq--)
+		for(p = rq->head; p; p = p->rnext){
+			/*
+			 * wired processes may only run when their core is available.
+			 */
+			if(p->wired != nil){
+				if(p->wired->proc == nil)
+					goto found;
+				continue;
+			}
+			/*
+			 * find a ready process that did run at an available core
+			 * or one that has not moved for some time.
+			 */
+			if(p->mp == nil || p->mp->proc == nil || n>0)
+				goto found;
+		}
+	/* waste time or halt the CPU */
+	if(!anyactive())
+		idlehands();
+	/* remember how much time we're here */
+	now = perfticks();
+	m->perf.inidle += now-start;
+	start = now;
+	n++;
+	goto loop;
+
+found:
+	assert(m->machno == 0);
+	splhi();
+	/*
+	 * find a core for this process, but honor wiring.
+	 */
+	mp = p->wired;
+	if(mp != nil){
+		if(mp->proc != nil)
+			goto loop;
+	}else{
+		for(i = 0; i < MACHMAX; i++){
+			j = pickcore(p->color, i);
+			if((mp = sys->machptr[j]) != nil && mp->nixrole == NIXTC){
+				if(isbooting(mp))
+					continue;
+				if(mp != m && mp->proc == nil)
+					break;
+			}
+		}
+		if(i == MACHMAX){
+			preemptfor(p);
+			goto loop;
+		}
+	}
+
+	p = dequeueproc(&run, rq, p);
+	mp->proc = p;
+	if(p != nil){
+		p->state = Scheding;
+		p->mp = mp;
+	}
+
+	n = 0;
+	goto loop;
+}
+
+/*
+ * SMP performs better than AMP with few cores.
+ * So, leave this here for now.
We should probably + * write a unified version of runproc good enough for + * both SMP and AMP. + */ +static Proc* +smprunproc(void) +{ + Schedq *rq; + Proc *p; + ulong start, now; + int i; + + start = perfticks(); + run.preempts++; + +loop: + /* + * find a process that last ran on this processor (affinity), + * or one that hasn't moved in a while (load balancing). Every + * time around the loop affinity goes down. + */ + spllo(); + if(isbooting(m)) + tcquiesce(); + for(i = 0;; i++){ + /* + * find the highest priority target process that this + * processor can run given affinity constraints. + * + */ + for(rq = &run.runq[Nrq-1]; rq >= run.runq; rq--){ + for(p = rq->head; p; p = p->rnext){ + if(p->mp == nil || p->mp == sys->machptr[m->machno] + || (!p->wired && i > 0)) + goto found; + } + } + + /* waste time or halt the CPU */ + if(!anyactive()) + idlehands(); + if(isbooting(m)) + tcquiesce(); + /* remember how much time we're here */ + now = perfticks(); + m->perf.inidle += now-start; + start = now; + } + +found: + splhi(); + p = dequeueproc(&run, rq, p); + if(p == nil) + goto loop; + + p->state = Scheding; + p->mp = sys->machptr[m->machno]; + + if(edflock(p)){ + edfrun(p, rq == &run.runq[PriEdf]); /* start deadline timer and do admin */ + edfunlock(); + } + if(p->trace) + proctrace(p, SRun, 0); + ainc(&run.nrunning); + return p; +} + +/* + * pick a process to run. + * most of this is used in AMP sched. + * (on a quad core or less, we use SMP). + * In the case of core 0 we always return nil, but + * schedule the picked process at any other available TC. + * In the case of other cores we wait until a process is given + * by core 0. + */ +Proc* +runproc(void) +{ + Schedq *rq; + Proc *p; + ulong start, now; + + if(sys->nmach <= AMPmincores) + return smprunproc(); + + start = perfticks(); + run.preempts++; + rq = nil; + if(m->machno != 0){ + do{ + spllo(); + while(m->proc == nil){ + if(isbooting(m)){ + coherence(); + if(m->proc != nil) + break; + tcquiesce(); + } + waitwhile(&m->proc, (uintptr)nil); + } + now = perfticks(); + m->perf.inidle += now-start; + start = now; + splhi(); + p = m->proc; + }while(p == nil); + p->state = Scheding; + p->mp = sys->machptr[m->machno]; + + if(edflock(p)){ + edfrun(p, rq == &run.runq[PriEdf]); /* start deadline timer and do admin */ + edfunlock(); + } + if(p->trace) + proctrace(p, SRun, 0); + ainc(&run.nrunning); + return p; + } + + mach0sched(); + return nil; /* not reached */ +} + +int +canpage(Proc *p) +{ + int ok; + Sched *sch; + + splhi(); + sch = procsched(p); + lock(sch); + /* Only reliable way to see if we are Running */ + if(p->mach == 0) { + p->newtlb = 1; + ok = 1; + } + else + ok = 0; + unlock(sch); + spllo(); + + return ok; +} + +Proc* +newproc(void) +{ + Proc *p; + + p = psalloc(); + + p->state = Scheding; + p->psstate = "New"; + p->mach = 0; + p->qnext = 0; + p->nchild = 0; + p->nwait = 0; + p->waitq = 0; + p->parent = 0; + p->pgrp = 0; + p->egrp = 0; + p->fgrp = 0; + p->rgrp = 0; + p->pdbg = 0; + p->kp = 0; + if(up != nil && up->procctl == Proc_tracesyscall) + p->procctl = Proc_tracesyscall; + else + p->procctl = 0; + p->syscalltrace = nil; + p->notepending = 0; + p->ureg = 0; + p->privatemem = 0; + p->noswap = 0; + p->errstr = p->errbuf0; + p->syserrstr = p->errbuf1; + p->errbuf0[0] = '\0'; + p->errbuf1[0] = '\0'; + p->nlocks = 0; + p->delaysched = 0; + p->trace = 0; + kstrdup(&p->user, "*nouser"); + kstrdup(&p->text, "*notext"); + kstrdup(&p->args, ""); + p->nargs = 0; + p->setargs = 0; + memset(p->seg, 0, sizeof p->seg); + p->pid = 
incref(&pidalloc);
+	pshash(p);
+	p->noteid = incref(&noteidalloc);
+	if(p->pid <= 0 || p->noteid <= 0)
+		panic("pidalloc");
+	if(p->kstack == 0)
+		p->kstack = smalloc(KSTACK);
+
+	/* sched params */
+	p->mp = 0;
+	p->wired = 0;
+	procpriority(p, PriNormal, 0);
+	p->cpu = 0;
+	p->lastupdate = sys->ticks*Scaling;
+	p->edf = nil;
+
+	p->ntrap = 0;
+	p->nintr = 0;
+	p->nsyscall = 0;
+	p->nactrap = 0;
+	p->nacsyscall = 0;
+	p->nicc = 0;
+	p->actime = 0ULL;
+	p->tctime = 0ULL;
+	p->ac = nil;
+	p->nfullq = 0;
+	memset(&p->PMMU, 0, sizeof p->PMMU);
+	return p;
+}
+
+/*
+ * wire this proc to a machine
+ */
+void
+procwired(Proc *p, int bm)
+{
+	Proc *pp;
+	int i;
+	char nwired[MACHMAX];
+	Mach *wm;
+
+	if(bm < 0){
+		/* pick a machine to wire to */
+		memset(nwired, 0, sizeof(nwired));
+		p->wired = 0;
+		for(i=0; (pp = psincref(i)) != nil; i++){
+			wm = pp->wired;
+			if(wm && pp->pid)
+				nwired[wm->machno]++;
+			psdecref(pp);
+		}
+		bm = 0;
+		for(i=0; i<sys->nmach; i++)
+			if(nwired[i] < nwired[bm])
+				bm = i;
+	} else {
+		/* use the virtual machine requested */
+		bm = bm % sys->nmach;
+	}
+
+	p->wired = sys->machptr[bm];
+	p->mp = p->wired;
+
+	/*
+	 * adjust our color to the new domain.
+	 */
+	if(up == nil || p != up)
+		return;
+	up->color = corecolor(up->mp->machno);
+	qlock(&up->seglock);
+	for(i = 0; i < NSEG; i++)
+		if(up->seg[i])
+			up->seg[i]->color = up->color;
+	qunlock(&up->seglock);
+}
+
+void
+procpriority(Proc *p, int pri, int fixed)
+{
+	if(pri >= Npriq)
+		pri = Npriq - 1;
+	else if(pri < 0)
+		pri = 0;
+	p->basepri = pri;
+	p->priority = pri;
+	if(fixed){
+		p->fixedpri = 1;
+	} else {
+		p->fixedpri = 0;
+	}
+}
+
+/*
+ * sleep if a condition is not true.  Another process will
+ * awaken us after it sets the condition.  When we awaken
+ * the condition may no longer be true.
+ *
+ * we lock both the process and the rendezvous to keep r->p
+ * and p->r synchronized.
+ */
+void
+sleep(Rendez *r, int (*f)(void*), void *arg)
+{
+	Mpl pl;
+
+	pl = splhi();
+
+	if(up->nlocks)
+		print("process %d sleeps with %d locks held, last lock %#p locked at pc %#p, sleep called from %#p\n",
+			up->pid, up->nlocks, up->lastlock, up->lastlock->pc, getcallerpc(&r));
+	lock(r);
+	lock(&up->rlock);
+	if(r->p){
+		print("double sleep called from %#p, %d %d\n",
+			getcallerpc(&r), r->p->pid, up->pid);
+		dumpstack();
+	}
+
+	/*
+	 * Wakeup only knows there may be something to do by testing
+	 * r->p in order to get something to lock on.
+	 * Flush that information out to memory in case the sleep is
+	 * committed.
+	 */
+	r->p = up;
+
+	if((*f)(arg) || up->notepending){
+		/*
+		 * if condition happened or a note is pending
+		 * never mind
+		 */
+		r->p = nil;
+		unlock(&up->rlock);
+		unlock(r);
+	} else {
+		/*
+		 * now we are committed to
+		 * change state and call scheduler
+		 */
+		if(up->trace)
+			proctrace(up, SSleep, 0);
+		up->state = Wakeme;
+		up->r = r;
+
+		/* statistics */
+		m->cs++;
+
+		procsave(up);
+		mmuflushtlb(m->pml4->pa);
+		if(setlabel(&up->sched)) {
+			/*
+			 * here when the process is awakened
+			 */
+			procrestore(up);
+		} else {
+			/*
+			 * here to go to sleep (i.e.
stop Running) + */ + unlock(&up->rlock); + unlock(r); + gotolabel(&m->sched); + } + } + + if(up->notepending) { + up->notepending = 0; + splx(pl); + if(up->procctl == Proc_exitme && up->closingfgrp) + forceclosefgrp(); + error(Eintr); + } + + splx(pl); +} + +static int +tfn(void *arg) +{ + return up->trend == nil || up->tfn(arg); +} + +void +twakeup(Ureg*, Timer *t) +{ + Proc *p; + Rendez *trend; + + p = t->ta; + trend = p->trend; + p->trend = 0; + if(trend) + wakeup(trend); +} + +void +tsleep(Rendez *r, int (*fn)(void*), void *arg, long ms) +{ + if (up->tt){ + print("tsleep: timer active: mode %d, tf %#p\n", + up->tmode, up->tf); + timerdel(up); + } + up->tns = MS2NS(ms); + up->tf = twakeup; + up->tmode = Trelative; + up->ta = up; + up->trend = r; + up->tfn = fn; + timeradd(up); + + if(waserror()){ + timerdel(up); + nexterror(); + } + sleep(r, tfn, arg); + if (up->tt) + timerdel(up); + up->twhen = 0; + poperror(); +} + +/* + * Expects that only one process can call wakeup for any given Rendez. + * We hold both locks to ensure that r->p and p->r remain consistent. + * Richard Miller has a better solution that doesn't require both to + * be held simultaneously, but I'm a paranoid - presotto. + */ +Proc* +wakeup(Rendez *r) +{ + Mpl pl; + Proc *p; + + pl = splhi(); + + lock(r); + p = r->p; + + if(p != nil){ + lock(&p->rlock); + if(p->state != Wakeme || p->r != r) + panic("wakeup: state"); + r->p = nil; + p->r = nil; + ready(p); + unlock(&p->rlock); + } + unlock(r); + + splx(pl); + + return p; +} + +/* + * if waking a sleeping process, this routine must hold both + * p->rlock and r->lock. However, it can't know them in + * the same order as wakeup causing a possible lock ordering + * deadlock. We break the deadlock by giving up the p->rlock + * lock if we can't get the r->lock and retrying. + */ +int +postnote(Proc *p, int dolock, char *n, int flag) +{ + Mpl pl; + int ret; + Rendez *r; + Proc *d, **l; + + if(dolock) + qlock(&p->debug); + + if(flag != NUser && (p->notify == 0 || p->notified)) + p->nnote = 0; + + ret = 0; + if(p->nnote < NNOTE) { + strcpy(p->note[p->nnote].msg, n); + p->note[p->nnote++].flag = flag; + ret = 1; + } + p->notepending = 1; + + /* NIX */ + if(p->state == Exotic){ + /* it could be that the process is not running + * in the AC when we interrupt the AC, but then + * we'd only get an extra interrupt in the AC, and + * nothing should happen. + */ + intrac(p); + } + + if(dolock) + qunlock(&p->debug); + + /* this loop is to avoid lock ordering problems. */ + for(;;){ + pl = splhi(); + lock(&p->rlock); + r = p->r; + + /* waiting for a wakeup? 
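(A sketch of the protocol being raced
+		 * against here, with Ctlr and isdone hypothetical:
+		 *
+		 *	static int
+		 *	isdone(void *a)
+		 *	{
+		 *		return ((Ctlr*)a)->done;
+		 *	}
+		 *
+		 *	sleeper:	sleep(&c->r, isdone, c);
+		 *	waker:		c->done = 1; wakeup(&c->r);
+		 *
+		 * sleep() re-evaluates isdone() after publishing r->p, so a
+		 * wakeup cannot be lost between the check and the commit.)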
 */
+		if(r == nil)
+			break;	/* no */
+
+		/* try for the second lock */
+		if(canlock(r)){
+			if(p->state != Wakeme || r->p != p)
+				panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
+			p->r = nil;
+			r->p = nil;
+			ready(p);
+			unlock(r);
+			break;
+		}
+
+		/* give other process time to get out of critical section and try again */
+		unlock(&p->rlock);
+		splx(pl);
+		sched();
+	}
+	unlock(&p->rlock);
+	splx(pl);
+
+	if(p->state != Rendezvous){
+		if(p->state == Semdown || p->state == Semalt)
+			ready(p);
+		return ret;
+	}
+	/* Try and pull out of a rendezvous */
+	lock(p->rgrp);
+	if(p->state == Rendezvous) {
+		p->rendval = ~0;
+		l = &REND(p->rgrp, p->rendtag);
+		for(d = *l; d; d = d->rendhash) {
+			if(d == p) {
+				*l = p->rendhash;
+				break;
+			}
+			l = &d->rendhash;
+		}
+		ready(p);
+	}
+	unlock(p->rgrp);
+	return ret;
+}
+
+/*
+ * weird thing: keep at most NBROKEN around
+ */
+#define	NBROKEN 4
+struct
+{
+	QLock;
+	int	n;
+	Proc	*p[NBROKEN];
+}broken;
+
+void
+addbroken(Proc *p)
+{
+	qlock(&broken);
+	if(broken.n == NBROKEN) {
+		ready(broken.p[0]);
+		memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
+		--broken.n;
+	}
+	broken.p[broken.n++] = p;
+	qunlock(&broken);
+
+	stopac();
+	edfstop(up);
+	p->state = Broken;
+	p->psstate = 0;
+	sched();
+}
+
+void
+unbreak(Proc *p)
+{
+	int b;
+
+	qlock(&broken);
+	for(b=0; b < broken.n; b++)
+		if(broken.p[b] == p) {
+			broken.n--;
+			memmove(&broken.p[b], &broken.p[b+1],
+					sizeof(Proc*)*(NBROKEN-(b+1)));
+			ready(p);
+			break;
+		}
+	qunlock(&broken);
+}
+
+int
+freebroken(void)
+{
+	int i, n;
+
+	qlock(&broken);
+	n = broken.n;
+	for(i=0; i<n; i++) {
+		ready(broken.p[i]);
+		broken.p[i] = 0;
+	}
+	broken.n = 0;
+	qunlock(&broken);
+
+	return n;
+}
+
+void
+pexit(char *exitstr, int freemem)
+{
+	Proc *p;
+	Segment **s, **es;
+	long utime, stime;
+	Waitq *wq, *f, *next;
+	Fgrp *fgrp;
+	Egrp *egrp;
+	Rgrp *rgrp;
+	Pgrp *pgrp;
+	Chan *dot;
+
+	if(0 && up->nfullq > 0)
+		iprint(" %s=%d", up->text, up->nfullq);
+	if(0 && up->nicc > 0)
+		iprint(" [%s nicc %ud tctime %ulld actime %ulld]\n",
+			up->text, up->nicc, up->tctime, up->actime);
+	if(up->syscalltrace != nil)
+		free(up->syscalltrace);
+	up->syscalltrace = nil;
+	up->alarm = 0;
+
+	if (up->tt)
+		timerdel(up);
+	if(up->trace)
+		proctrace(up, SDead, 0);
+
+	/* nil out all the resources under lock (free later) */
+	qlock(&up->debug);
+	fgrp = up->fgrp;
+	up->fgrp = nil;
+	egrp = up->egrp;
+	up->egrp = nil;
+	rgrp = up->rgrp;
+	up->rgrp = nil;
+	pgrp = up->pgrp;
+	up->pgrp = nil;
+	dot = up->dot;
+	up->dot = nil;
+	qunlock(&up->debug);
+
+	if(fgrp)
+		closefgrp(fgrp);
+	if(egrp)
+		closeegrp(egrp);
+	if(rgrp)
+		closergrp(rgrp);
+	if(dot)
+		cclose(dot);
+	if(pgrp)
+		closepgrp(pgrp);
+
+	/*
+	 * if not a kernel process and have a parent,
+	 * do some housekeeping.
+	 */
+	if(up->kp == 0) {
+		p = up->parent;
+		if(p == 0) {
+			if(exitstr == 0)
+				exitstr = "unknown";
+			panic("boot process died: %s", exitstr);
+		}
+
+		while(waserror())
+			;
+
+		wq = smalloc(sizeof(Waitq));
+		poperror();
+
+		wq->w.pid = up->pid;
+		utime = up->time[TUser] + up->time[TCUser];
+		stime = up->time[TSys] + up->time[TCSys];
+		wq->w.time[TUser] = tk2ms(utime);
+		wq->w.time[TSys] = tk2ms(stime);
+		wq->w.time[TReal] = tk2ms(sys->ticks - up->time[TReal]);
+		if(exitstr && exitstr[0])
+			snprint(wq->w.msg, sizeof(wq->w.msg), "%s %d: %s",
+				up->text, up->pid, exitstr);
+		else
+			wq->w.msg[0] = '\0';
+
+		lock(&p->exl);
+		/*
+		 * Check that parent is still alive.
+		 */
+		if(p->pid == up->parentpid && p->state != Broken) {
+			p->nchild--;
+			p->time[TCUser] += utime;
+			p->time[TCSys] += stime;
+			/*
+			 * If there would be more than 128 wait records
+			 * pending for my parent, then don't leave a wait
+			 * record behind.  This helps prevent badly written
+			 * daemon processes from accumulating lots of wait
+			 * records.
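+			 *
+			 * The consumer of these records is pwait() below,
+			 * surfaced to user space as wait(2); a typical
+			 * reaper loop is (sketch):
+			 *
+			 *	Waitmsg *w;
+			 *
+			 *	while((w = wait()) != nil){
+			 *		print("%d: %s\n", w->pid, w->msg);
+			 *		free(w);
+			 *	}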
+ */ + if(p->nwait < 128) { + wq->next = p->waitq; + p->waitq = wq; + p->nwait++; + wq = nil; + wakeup(&p->waitr); + } + } + unlock(&p->exl); + if(wq) + free(wq); + } + + if(!freemem) + addbroken(up); + + qlock(&up->seglock); + es = &up->seg[NSEG]; + for(s = up->seg; s < es; s++) { + if(*s) { + putseg(*s); + *s = 0; + } + } + qunlock(&up->seglock); + + lock(&up->exl); /* Prevent my children from leaving waits */ + psunhash(up); + up->pid = 0; + wakeup(&up->waitr); + unlock(&up->exl); + + for(f = up->waitq; f; f = next) { + next = f->next; + free(f); + } + + /* release debuggers */ + qlock(&up->debug); + if(up->pdbg) { + wakeup(&up->pdbg->sleep); + up->pdbg = 0; + } + qunlock(&up->debug); + + /* Sched must not loop for these locks */ + lock(&procalloc); + lock(&pga); + + stopac(); + edfstop(up); + up->state = Moribund; + sched(); + panic("pexit"); +} + +int +haswaitq(void *x) +{ + Proc *p; + + p = (Proc *)x; + return p->waitq != 0; +} + +int +pwait(Waitmsg *w) +{ + int cpid; + Waitq *wq; + + if(!canqlock(&up->qwaitr)) + error(Einuse); + + if(waserror()) { + qunlock(&up->qwaitr); + nexterror(); + } + + lock(&up->exl); + if(up->nchild == 0 && up->waitq == 0) { + unlock(&up->exl); + error(Enochild); + } + unlock(&up->exl); + + sleep(&up->waitr, haswaitq, up); + + lock(&up->exl); + wq = up->waitq; + up->waitq = wq->next; + up->nwait--; + unlock(&up->exl); + + qunlock(&up->qwaitr); + poperror(); + + if(w) + memmove(w, &wq->w, sizeof(Waitmsg)); + cpid = wq->w.pid; + free(wq); + + return cpid; +} + +void +dumpaproc(Proc *p) +{ + uintptr bss; + char *s; + + if(p == 0) + return; + + bss = 0; + if(p->seg[HSEG]) + bss = p->seg[HSEG]->top; + else if(p->seg[BSEG]) + bss = p->seg[BSEG]->top; + + s = p->psstate; + if(s == 0) + s = statename[p->state]; + print("%3d:%10s pc %#p dbgpc %#p %8s (%s) ut %ld st %ld bss %#p qpc %#p nl %d nd %lud lpc %#p pri %lud\n", + p->pid, p->text, p->pc, dbgpc(p), s, statename[p->state], + p->time[0], p->time[1], bss, p->qpc, p->nlocks, + p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority); +} + +void +procdump(void) +{ + int i; + Proc *p; + + if(up) + print("up %d\n", up->pid); + else + print("no current process\n"); + for(i=0; (p = psincref(i)) != nil; i++) { + if(p->state != Dead) + dumpaproc(p); + psdecref(p); + } +} + +/* + * wait till all processes have flushed their mmu + * state about segement s + */ +void +procflushseg(Segment *s) +{ + int i, ns, nm, nwait; + Proc *p; + Mach *mp; + + /* + * tell all processes with this + * segment to flush their mmu's + */ + nwait = 0; + for(i=0; (p = psincref(i)) != nil; i++) { + if(p->state == Dead){ + psdecref(p); + continue; + } + for(ns = 0; ns < NSEG; ns++){ + if(p->seg[ns] == s){ + p->newtlb = 1; + for(nm = 0; nm < MACHMAX; nm++) + if((mp = sys->machptr[nm]) != nil && mp->nixrole != NIXUC) + if(mp->proc == p){ + mp->mmuflush = 1; + nwait++; + } + break; + } + } + psdecref(p); + } + + if(nwait == 0) + return; + + /* + * wait for all processors to take a clock interrupt + * and flush their mmu's. + * NIX BUG: this won't work if another core is in AC mode. + * In that case we must IPI it, but only if that core is + * using this segment. 
+ */ + for(i = 0; i < MACHMAX; i++) + if((mp = sys->machptr[i]) != nil && mp->nixrole != NIXUC) + if(mp != m) + while(mp->mmuflush) + sched(); +} + +void +scheddump(void) +{ + Proc *p; + Schedq *rq; + + for(rq = &run.runq[Nrq-1]; rq >= run.runq; rq--){ + if(rq->head == 0) + continue; + print("run[%ld]:", rq-run.runq); + for(p = rq->head; p; p = p->rnext) + print(" %d(%lud)", p->pid, m->ticks - p->readytime); + print("\n"); + delay(150); + } + print("nrdy %d\n", run.nrdy); +} + +void +kproc(char *name, void (*func)(void *), void *arg) +{ + Proc *p; + static Pgrp *kpgrp; + + p = newproc(); + p->psstate = 0; + p->procmode = 0640; + p->kp = 1; + p->noswap = 1; + + p->scallnr = up->scallnr; + memmove(p->arg, up->arg, sizeof(up->arg)); + p->nerrlab = 0; + p->slash = up->slash; + p->dot = up->dot; + if(p->dot) + incref(p->dot); + + memmove(p->note, up->note, sizeof(p->note)); + p->nnote = up->nnote; + p->notified = 0; + p->lastnote = up->lastnote; + p->notify = up->notify; + p->ureg = 0; + p->dbgreg = 0; + + procpriority(p, PriKproc, 0); + + kprocchild(p, func, arg); + + kstrdup(&p->user, eve); + kstrdup(&p->text, name); + if(kpgrp == 0) + kpgrp = newpgrp(); + p->pgrp = kpgrp; + incref(kpgrp); + + memset(p->time, 0, sizeof(p->time)); + p->time[TReal] = sys->ticks; + ready(p); + /* + * since the bss/data segments are now shareable, + * any mmu info about this process is now stale + * and has to be discarded. + */ + p->newtlb = 1; + mmuflush(); +} + +/* + * called splhi() by notify(). See comment in notify for the + * reasoning. + */ +void +procctl(Proc *p) +{ + Mpl pl; + char *state; + + switch(p->procctl) { + case Proc_exitbig: + spllo(); + pexit("Killed: Insufficient physical memory", 1); + + case Proc_exitme: + spllo(); /* pexit has locks in it */ + pexit("Killed", 1); + + case Proc_traceme: + if(p->nnote == 0) + return; + /* No break */ + + case Proc_stopme: + p->procctl = 0; + state = p->psstate; + p->psstate = "Stopped"; + /* free a waiting debugger */ + pl = spllo(); + qlock(&p->debug); + if(p->pdbg) { + wakeup(&p->pdbg->sleep); + p->pdbg = 0; + } + qunlock(&p->debug); + splhi(); + p->state = Stopped; + sched(); + p->psstate = state; + splx(pl); + return; + + case Proc_toac: + p->procctl = 0; + /* + * This pretends to return from the system call, + * by moving to a core, but never returns (unless + * the process gets moved back to a TC.) 
+ */ + spllo(); + if(p->ac == nil) + getac(p, -1); + runacore(); + return; + + case Proc_totc: + p->procctl = 0; + if(p != up) + panic("procctl: stopac: p != up"); + spllo(); + stopac(); + return; + } +} + +void +error(char *err) +{ + spllo(); + + assert(up->nerrlab < NERR); + kstrcpy(up->errstr, err, ERRMAX); + setlabel(&up->errlab[NERR-1]); + nexterror(); +} + +void +nexterror(void) +{ + gotolabel(&up->errlab[--up->nerrlab]); +} + +void +exhausted(char *resource) +{ + char buf[ERRMAX]; + + sprint(buf, "no free %s", resource); + iprint("%s\n", buf); + error(buf); +} + +void +killbig(char *why) +{ + int i, x; + Segment *s; + ulong l, max; + Proc *p, *kp; + + max = 0; + kp = nil; + for(x = 0; (p = psincref(x)) != nil; x++) { + if(p->state == Dead || p->kp){ + psdecref(p); + continue; + } + l = 0; + for(i=1; iseg[i]; + if(s != 0) + l += s->top - s->base; + } + if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) { + if(kp != nil) + psdecref(kp); + kp = p; + max = l; + } + else + psdecref(p); + } + if(kp == nil) + return; + + print("%d: %s killed: %s\n", kp->pid, kp->text, why); + for(x = 0; (p = psincref(x)) != nil; x++) { + if(p->state == Dead || p->kp){ + psdecref(p); + continue; + } + if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG]) + p->procctl = Proc_exitbig; + psdecref(p); + } + + kp->procctl = Proc_exitbig; + for(i = 0; i < NSEG; i++) { + s = kp->seg[i]; + if(s != 0 && canqlock(&s->lk)) { + mfreeseg(s, s->base, (s->top - s->base)/BIGPGSZ); + qunlock(&s->lk); + } + } + psdecref(kp); +} + +/* + * change ownership to 'new' of all processes owned by 'old'. Used when + * eve changes. + */ +void +renameuser(char *old, char *new) +{ + int i; + Proc *p; + + for(i = 0; (p = psincref(i)) != nil; i++){ + if(p->user!=nil && strcmp(old, p->user)==0) + kstrdup(&p->user, new); + psdecref(p); + } +} + +/* + * time accounting called by clock() splhi'd + * only cpu1 computes system load average + * but the system load average is accounted for cpu0. + */ +void +accounttime(void) +{ + Proc *p; + ulong n, per; + + p = m->proc; + if(p) { + if(m->machno == 1) + run.nrun++; + p->time[p->insyscall]++; + } + + /* calculate decaying duty cycles */ + n = perfticks(); + per = n - m->perf.last; + m->perf.last = n; + per = (m->perf.period*(HZ-1) + per)/HZ; + if(per != 0) + m->perf.period = per; + + m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ; + m->perf.inidle = 0; + + m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ; + m->perf.inintr = 0; + + /* only one processor gets to compute system load averages. + * it has to be mach 1 when we use AMP. + */ + if(sys->nmach > 1 && m->machno != 1) + return; + + /* + * calculate decaying load average. + * if we decay by (n-1)/n then it takes + * n clock ticks to go from load L to .36 L once + * things quiet down. it takes about 5 n clock + * ticks to go to zero. so using HZ means this is + * approximately the load over the last second, + * with a tail lasting about 5 seconds. 
+ */ + n = run.nrun; + run.nrun = 0; + n = (run.nrdy+n)*1000; + sys->load = (sys->load*(HZ-1)+n)/HZ; +} + +void +halt(void) +{ + if(run.nrdy != 0) + return; + hardhalt(); +} --- /sys/src/nix/port/tcklock.c Thu Jun 14 09:36:26 2012 +++ /sys/src/nix/port/tcklock.c Thu Jun 21 12:23:24 2012 @@ -178,7 +178,7 @@ lockstats.glare++; i = 0; while(getticket(l->key) != myticket(user)){ - if(conf.nmach < 2 && up && up->edf && (up->edf->flags & Admitted)){ + if(sys->nmach < 2 && up && up->edf && (up->edf->flags & Admitted)){ /* * Priority inversion, yield on a uniprocessor; on a * multiprocessor, the other processor will unlock @@ -321,10 +321,10 @@ } void -portmwait(void *value, int val) +portwaitwhile(void *value, uintptr val) { - while (*(void**)value == val) + while (*(uintptr*)value == val) ; } -void (*mwait)(void *, int) = portmwait; +void (*waitwhile)(void *, uintptr) = portwaitwhile; --- /sys/src/nix/k10/acore.c Thu Jun 14 09:36:22 2012 +++ /sys/src/nix/k10/acore.c Thu Jun 14 11:53:21 2012 @@ -65,7 +65,7 @@ snprint((char*)mp->icc->data, ICCLNSZ, "<%d>", i); coherence(); mp->icc->fn = testiccfn; - mwait(&mp->icc->fn, 0); + waitwhile(&mp->icc->fn, 0); } /* @@ -99,7 +99,7 @@ acmmuswitch(); for(;;){ acstackok(); - mwait(&m->icc->fn, 0); + waitwhile(&m->icc->fn, 0); if(m->icc->flushtlb) acmmuswitch(); DBG("acsched: cpu%d: fn %#p\n", m->machno, m->icc->fn); @@ -210,7 +210,7 @@ m->icc->fn = nil; ready(m->proc); - mwait(&m->icc->fn, 0); + waitwhile(&m->icc->fn, 0); if(m->icc->flushtlb) acmmuswitch(); --- /sys/src/nix/k10/arch.c Thu Apr 12 12:26:27 2012 +++ /sys/src/nix/k10/arch.c Thu Jun 14 11:53:28 2012 @@ -95,12 +95,10 @@ * an interrupt will get us going again. * The boot TC in nix can't halt, because it must stay alert in * case an AC makes a handler process ready. - * We should probably use mwait in that case. + * We should probably use waitwhile in that case. */ void idlehands(void) { -if(0) - if(m->machno != 0) - halt(); + halt(); } --- /sys/src/nix/k10/archk10.c Thu Apr 12 12:26:27 2012 +++ /sys/src/nix/k10/archk10.c Thu Jun 14 11:53:34 2012 @@ -31,7 +31,7 @@ /* is mnonitor supported? 
*/ if (m->cpuinfo[1][2] & 8) { cpuid(5, 0, m->cpuinfo[2]); - mwait = k10mwait; + waitwhile = k10waitwhile; } return 1; --- /sys/src/nix/k10/fns.h Thu Jun 14 09:36:22 2012 +++ /sys/src/nix/k10/fns.h Thu Jun 14 11:53:41 2012 @@ -229,7 +229,7 @@ * archk10.c */ extern void millidelay(int); -extern void k10mwait(void*, int); +extern void k10waitwhile(void*, uintptr); /* * i8259.c --- /sys/src/nix/k10/l64v.s Thu Jun 14 09:36:23 2012 +++ /sys/src/nix/k10/l64v.s Tue Jun 26 14:03:54 2012 @@ -366,12 +366,12 @@ BYTE $0x0f; BYTE $0x01; BYTE $0xc8 /* MONITOR */ RET -TEXT _mwait(SB), 1, $-4 /* void mwait(u32int); */ +TEXT _waitwhile(SB), 1, $-4 /* void waitwhile(u32int); */ MOVLQZX RARG, CX /* optional extensions */ BYTE $0x0f; BYTE $0x01; BYTE $0xc9 /* MWAIT */ RET -TEXT k10mwait+0(SB),0,$16 +TEXT k10waitwhile+0(SB),0,$16 k10mwloop: MOVQ RARG, CX MOVQ val+8(FP), DX --- /sys/src/nix/k10/tcore.c Mon Jun 25 17:02:06 2012 +++ /sys/src/nix/k10/tcore.c Mon Jun 25 17:05:13 2012 @@ -124,9 +124,8 @@ mp->nixrole = NIXSC; mp->icc->fn = acquiesce; coherence(); - while(mp->icc->fn == acquiesce){ - mwait(&mp->nixrole, NIXSC); - } + while(mp->icc->fn == acquiesce) + waitwhile(&mp->nixrole, NIXSC); if(role == NIXOC){ mp->nixrole = role; apicnipi(mp->apicno); --- /usr/paurea/tcore.c Thu Jan 1 00:00:00 1970 +++ /usr/paurea/tcore.c Mon Jun 11 12:50:38 2012 @@ -0,0 +1,468 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +#include +#include +#include "amd64.h" +#include "ureg.h" +#include "io.h" + +Lock nixaclock; /* NIX AC lock; held while assigning procs to cores */ + +/* + * NIX support for the time sharing core. + */ + +extern void actrapret(void); +extern void acsysret(void); + +Mach* +getac(Proc *p, int core) +{ + int i; + Mach *mp; + + mp = nil; + if(core == 0) + panic("can't getac for a %s", rolename[NIXTC]); + lock(&nixaclock); + if(waserror()){ + unlock(&nixaclock); + nexterror(); + } + if(core > 0){ + if(core >= MACHMAX) + error("no such core"); + mp = sys->machptr[core]; + if(mp == nil || mp->nixrole == NIXUC || mp->proc != nil) + error("core not online or busy"); + if(mp->nixrole != NIXAC) + error("core is not an AC"); + Found: + mp->proc = p; + }else{ + for(i = 0; i < MACHMAX; i++) + if((mp = sys->machptr[i]) != nil && mp->nixrole == NIXAC){ + if(isbooting(mp)) + continue; + if(mp->proc == nil) + goto Found; + } + error("not enough cores"); + } + unlock(&nixaclock); + poperror(); + return mp; +} + +/* + * BUG: + * The AC must not accept interrupts while in the kernel, + * or we must be prepared for nesting them, which we are not. + * This is important for note handling, because postnote() + * assumes that it's ok to send an IPI to an AC, no matter its + * state. The /proc interface also assumes that. + * + */ +void +intrac(Proc *p) +{ + Mach *ac; + + ac = p->ac; + if(ac == nil){ + DBG("intrac: Proc.ac is nil. no ipi sent.\n"); + return; + } + /* + * It's ok if the AC gets idle in the mean time. + */ + DBG("intrac: ipi to cpu%d\n", ac->machno); + apicipi(ac->apicno); +} + +void +putac(Mach *m) +{ + coherence(); + m->proc = nil; +} + +void +stopac(void) +{ + Mach *mp; + + mp = up->ac; + if(mp == nil) + return; + if(mp->proc != up) + panic("stopac"); + + lock(&nixaclock); + up->ac = nil; + mp->proc = nil; + unlock(&nixaclock); + + /* TODO: + * send sipi to up->ac, it would rerun squidboy(), and + * wait for us to give it a function to run. + */ +} + + + +static void +roleac(Mach *mp, int role) +{ + if(mp == nil) + return; + /* wake it up... 
*/ + mp->nixrole = NIXSC; + mp->icc->fn = acquiesce; + coherence(); + while(mp->icc->fn == acquiesce){ + mwait(&mp->nixrole, NIXSC); + } + if(role == NIXOC){ + mp->nixrole = role; + apicnipi(mp->apicno); + }else + sipicore(mp->machno); +} + + +void +tcquiesce(void) +{ + /* sync, suicide is not possible */ + m->proc = nil; + m->icc->fn = nil; + + /* signal we are done */ + m->nixrole= NIXQC; + coherence(); + wakeup(&m->sipir); + for(;;) + halt(); +} + +void +rolestable(Mach *mp) +{ + mp->nixrole = mp->nnixrole; + mp->nnixrole = NIXSTABLE; + coherence(); +} + +int +isbooting(Mach *mp) +{ + return mp->nnixrole != NIXSTABLE; +} + +static int +donequiesce(void *x) +{ + Mach *mp; + mp = (Mach *)x; + return mp->nixrole == NIXQC; +} + +/* + * what should happen if you are wired and the core dissapears? + * for now, this is for testing and it executes in the context of + * a process (it shouldn't) + */ +int +changerole(int role, int core) +{ + int apicno; + Mach *mp, *mpc, *w; + + /* + * 1 *has* to be a TC. + */ + mpc = sys->machptr[1]; + if(core == 1 || core >= MACHMAX || !mpc->nixrole == NIXTC) + return -1; + w = up->wired; + procwired(up, 1); + if(m != mpc) + sched(); + mp = sys->machptr[core]; + + lock(&mp->sipilock); + apicno = mp->apicno; + if(isbooting(mp) && mp->nixrole != NIXOC){ + print("core is already rebooting, nnixrole %#ux\n", mp->nnixrole); + unlock(&mp->sipilock); + return -1; + } + mp->nnixrole = role; + unlock(&mp->sipilock); + switch(mp->nixrole){ + case NIXAC: + lock(&nixaclock); /* so noone reassigns the core */ + if(mp->proc != nil){ + mp->proc->procctl = Proc_totc; + unlock(&nixaclock); + apicipi(apicno); + }else{ + unlock(&nixaclock); + roleac(mp, role); + } + break; + case NIXTC: + mp->nixrole = NIXSC; + coherence(); + /* when idlehands sleeps, this needs to send a wake it up IPI? */ + //apicipi(mp->apicno); + sleep(&mp->sipir, donequiesce, mp); + + /* fall */ + case NIXOC: + mp->nnixrole = role; + coherence(); + if(role == NIXOC){ + mp->proc = nil; + mp->nixrole = NIXOC; + apicnipi(mp->apicno); + }else + sipicore(mp->machno); + break; + default: + print("don't know how to change my role\n"); + } + up->wired = w; + return 0; +} + +/* + * Functions starting with ac... are run in the application core. + * All other functions are run by the time-sharing cores. + */ + +typedef void (*APfunc)(void); +extern int notify(Ureg*); + +/* + * run an arbitrary function with arbitrary args on an ap core + * first argument is always pml4 for process + * make a field and a struct for the args cache line. + * + * Returns the return-code for the ICC or -1 if the process was + * interrupted while issuing the ICC. 
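+ *
+ * A note on the run.nrunning accounting this patch adds (inferred
+ * from the proc.c hunks above): a proc that moves to its AC keeps
+ * its count, since sched() only does adec(&run.nrunning) when
+ * up->state != Exotic; when the AC hands the proc back, ready()
+ * performs the adec, and runproc() will ainc again at the next
+ * dispatch.  anyactive() -- run.runvec || run.nrunning -- therefore
+ * means "some core is running or could run a proc", and a TC calls
+ * idlehands() only when it is false: a core parked in
+ * waitwhile(&m->proc, (uintptr)nil) wakes on a plain store to
+ * m->proc, but a halted core needs an interrupt.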
+ */ +int +runac(Mach *mp, APfunc func, int flushtlb, void *a, long n) +{ + uchar *dpg, *spg; + + if (n > sizeof(mp->icc->data)) + panic("runac: args too long"); + + if(mp->nixrole == NIXUC) + panic("Bad core"); + if(mp->proc != nil && mp->proc != up) + panic("runapfunc: mach is busy with another proc?"); + + memmove(mp->icc->data, a, n); + if(flushtlb){ + DBG("runac flushtlb: cppml4 %#p %#p\n", mp->pml4->pa, m->pml4->pa); + dpg = UINT2PTR(mp->pml4->va); + spg = UINT2PTR(m->pml4->va); + /* We should copy less: + * memmove(dgp, spg, m->pml4->daddr * sizeof(PTE)); + */ + memmove(dpg, spg, PTSZ); + if(0){ + print("runac: upac pml4 %#p\n", up->ac->pml4->pa); + dumpptepg(4, up->ac->pml4->pa); + } + } + mp->icc->flushtlb = flushtlb; + mp->icc->rc = ICCOK; + + DBG("runac: exotic proc on cpu%d\n", mp->machno); + qlock(&up->debug); + up->nicc++; + up->state = Exotic; + up->psstate = 0; + qunlock(&up->debug); + coherence(); + mp->icc->fn = func; + sched(); + return mp->icc->rc; +} + +/* + * Cleanup done by runacore to pretend we are going back to user space. + * We won't return and won't do what syscall() would normally do. + * Do it here instead. + */ +static void +fakeretfromsyscall(Ureg *ureg) +{ + int s; + + poperror(); /* as syscall() would do if we would return */ + if(up->procctl == Proc_tracesyscall){ /* Would this work? */ + up->procctl = Proc_stopme; + s = splhi(); + procctl(up); + splx(s); + } + + up->insyscall = 0; + /* if we delayed sched because we held a lock, sched now */ + if(up->delaysched){ + sched(); + splhi(); + } + kexit(ureg); +} + +/* + * Move the current process to an application core. + * This is performed at the end of execac(), and + * we pretend to be returning to user-space, but instead we + * dispatch the process to another core. + * 1. We do the final bookkeeping that syscall() would do after + * a return from sysexec(), because we are not returning. + * 2. We dispatch the process to an AC using an ICC. + * + * This function won't return unless the process is reclaimed back + * to the time-sharing core, and is the handler for the process + * to deal with traps and system calls until the process dies. + * + * Remember that this function is the "line" between user and kernel + * space, it's not expected to raise|handle any error. + * + * We install a safety error label, just in case we raise errors, + * which we shouldn't. (noerrorsleft knows that for exotic processes + * there is an error label pushed by us). + */ +void +runacore(void) +{ + Ureg *ureg; + void (*fn)(void); + int rc, flush, s; + char *n; + uvlong t1; + Mach *ac; + + if(waserror()) + panic("runacore: error: %s\n", up->errstr); + ureg = up->dbgreg; + fakeretfromsyscall(ureg); + fpusysrfork(ureg); + + procpriority(up, PriKproc, 1); + ac = up->ac; + rc = runac(up->ac, actouser, 1, nil, 0); + procpriority(up, PriNormal, 0); + for(;;){ + t1 = fastticks(nil); + flush = 0; + fn = nil; + switch(rc){ + case ICCTRAP: + s = splhi(); + m->cr2 = up->ac->cr2; + DBG("runacore: trap %ulld cr2 %#ullx ureg %#p\n", + ureg->type, m->cr2, ureg); + switch(ureg->type){ + case IdtIPI: + if(up->procctl || up->nnote) + notify(up->dbgreg); + if(up->ac == nil) + goto ToTC; + kexit(up->dbgreg); + break; + case IdtNM: + case IdtMF: + case IdtXF: + /* these are handled in the AC; + * If we get here, they left in m->icc->data + * a note to be posted to the process. + * Post it, and make the vector a NOP. 
+ */ + n = up->ac->icc->note; + if(n != nil) + postnote(up, 1, n, NDebug); + ureg->type = IdtIPI; /* NOP */ + break; + default: + cr3put(m->pml4->pa); + if(0 && ureg->type == IdtPF){ + print("before PF:\n"); + print("AC:\n"); + dumpptepg(4, up->ac->pml4->pa); + print("\n%s:\n", rolename[NIXTC]); + dumpptepg(4, m->pml4->pa); + } + trap(ureg); + } + splx(s); + flush = 1; + fn = actrapret; + break; + case ICCSYSCALL: + DBG("runacore: syscall ax %#ullx ureg %#p\n", + ureg->ax, ureg); + cr3put(m->pml4->pa); + syscall(ureg->ax, ureg); + flush = 1; + fn = acsysret; + if(0) + if(up->nqtrap > 2 || up->nsyscall > 1) + goto ToTC; + if(up->ac == nil) + goto ToTC; + break; + default: + panic("runacore: unexpected rc = %d", rc); + } + up->tctime += fastticks2us(fastticks(nil) - t1); + procpriority(up, PriExtra, 1); + rc = runac(up->ac, fn, flush, nil, 0); + procpriority(up, PriNormal, 0); + } +ToTC: + /* + * to procctl, then syscall, to + * be back in the TC + */ + DBG("runacore: up %#p: return\n", up); + if(isbooting(ac)){ + roleac(ac, ac->nnixrole); + } +} + +extern ACVctl *acvctl[]; + +void +actrapenable(int vno, char* (*f)(Ureg*, void*), void* a, char *name) +{ + ACVctl *v; + + if(vno < 0 || vno >= 256) + panic("actrapenable: vno %d\n", vno); + v = malloc(sizeof(Vctl)); + v->f = f; + v->a = a; + v->vno = vno; + strncpy(v->name, name, KNAMELEN); + v->name[KNAMELEN-1] = 0; + + if(acvctl[vno]) + panic("AC traps can't be shared"); + acvctl[vno] = v; +} + + --- /sys/src/nix/port/portdat.h Fri Jun 8 11:35:43 2012 +++ /sys/src/nix/port/portdat.h Mon Jul 2 10:39:35 2012 @@ -678,7 +678,6 @@ EXXC, /* want an XC for the exec'd image */ }; - /* * process memory segments - NSEG always last ! * HSEG is a potentially huge bss segment. @@ -754,7 +753,8 @@ Schedq runq[Nrq]; ulong runvec; int nmach; /* # of cores with this color */ - ulong nrun; /* to compute load */ + ulong nrunhz; /* to compute load */ + int nrunning; }; typedef union Ar0 Ar0;