Changed the name of nrun to nrunhz to make the difference with nrunning clearer. Reference: /n/patches.lsub.org/patch/newmwait Date: Mon Jul 2 10:41:34 CES 2012 Signed-off-by: paurea@lsub.org # rm /usr/paurea/portfns.h # rm /usr/paurea/taslock.c # rm /usr/paurea/tcklock.c # rm /usr/paurea/acore.c # rm /usr/paurea/arch.c # rm /usr/paurea/archk10.c # rm /usr/paurea/fns.h # rm /usr/paurea/l64v.s --- /sys/src/nix/port/portfns.h Thu Jun 14 09:36:25 2012 +++ /sys/src/nix/port/portfns.h Thu Jun 14 11:54:11 2012 @@ -193,7 +193,7 @@ ulong ms2tk(ulong); void mul64fract(uvlong*, uvlong, uvlong); void muxclose(Mnt*); -void (*mwait)(void *, int); +void (*waitwhile)(void *, uintptr); Chan* namec(char*, int, int, int); void nameerror(char*, char*); Chan* newchan(void); @@ -240,7 +240,7 @@ void* phystag(uintmem); void pio(Segment*, uintptr, ulong, Page**, int); #define poperror() up->nerrlab-- -void portmwait(void*, int); +void portwaitwhile(void*, uintptr); int postnote(Proc*, int, char*, int); int pprint(char*, ...); int preempted(void); --- /sys/src/nix/port/taslock.c Thu Jun 14 09:36:25 2012 +++ /sys/src/nix/port/taslock.c Fri Jun 22 11:18:02 2012 @@ -314,10 +314,13 @@ } void -portmwait(void *value, int val) +portwaitwhile(void *value, uintptr val) { - while (*(void**)value == (void *)val) - ; + int i; + /* it just waits for a little while */ + for(i = 0; i<100; i++) + if(*(uintptr *)value == val) + break; } -void (*mwait)(void *, int) = portmwait; +void (*waitwhile)(void *, uintptr) = portwaitwhile; --- /sys/src/nix/port/proc.c Thu Jun 14 09:36:25 2012 +++ /sys/src/nix/port/proc.c Mon Jul 2 10:39:08 2012 @@ -187,6 +187,8 @@ stackok(); + if(up->state != Exotic) + adec(&run.nrunning); procsave(up); mmuflushtlb(m->pml4->pa); if(setlabel(&up->sched)){ @@ -230,6 +232,12 @@ return run.runvec & ~((1<<(up->priority+1))-1); } +int +anyactive(void) +{ + return run.runvec || run.nrunning; +} + /* * here once per clock tick to see if we should resched */ @@ -496,6 +504,9 @@ void ready(Proc *p) { + if(p->state == Exotic) + adec(&run.nrunning); + schedready(procsched(p), p, 0); } @@ -644,7 +655,8 @@ goto found; } /* waste time or halt the CPU */ - idlehands(); + if(!anyactive()) + idlehands(); /* remember how much time we're here */ now = perfticks(); m->perf.inidle += now-start; @@ -730,7 +742,8 @@ } /* waste time or halt the CPU */ - idlehands(); + if(!anyactive()) + idlehands(); if(isbooting(m)) tcquiesce(); /* remember how much time we're here */ @@ -754,6 +767,7 @@ } if(p->trace) proctrace(p, SRun, 0); + ainc(&run.nrunning); return p; } @@ -789,7 +803,8 @@ break; tcquiesce(); } - idlehands(); + //idlehands(); + waitwhile(&m->proc, (uintptr)nil); } now = perfticks(); m->perf.inidle += now-start; @@ -806,6 +821,7 @@ } if(p->trace) proctrace(p, SRun, 0); + ainc(&run.nrunning); return p; } @@ -1835,7 +1851,7 @@ p = m->proc; if(p) { if(m->machno == 1) - run.nrun++; + run.nrunhz++; p->time[p->insyscall]++; } @@ -1868,8 +1884,8 @@ * approximately the load over the last second, * with a tail lasting about 5 seconds. 
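 * For example (assuming HZ = 100): each tick contributes
 * (run.nrdy+n)*1000, so a load of 1000 means one process
 * continuously runnable.  Once the queues drain, the old value
 * decays by (HZ-1)/HZ per tick, i.e. to about 1/e (37%) of its
 * value after one second and to about 1% after five.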
 */
-	n = run.nrun;
-	run.nrun = 0;
+	n = run.nrunhz;
+	run.nrunhz = 0;
 	n = (run.nrdy+n)*1000;
 	sys->load = (sys->load*(HZ-1)+n)/HZ;
 }
--- /usr/paurea/proc.c	Thu Jan  1 00:00:00 1970
+++ /usr/paurea/proc.c	Tue Jun 26 14:00:58 2012
@@ -0,0 +1,1897 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	"../port/edf.h"
+#include	"errstr.h"
+#include	<trace.h>
+
+enum
+{
+	Scaling=2,
+
+	AMPmincores = 5,
+};
+
+Ref	noteidalloc;
+
+static Ref pidalloc;
+
+static Sched run;
+
+struct Procalloc procalloc;
+
+extern Proc* psalloc(void);
+extern void pshash(Proc*);
+extern void psrelease(Proc*);
+extern void psunhash(Proc*);
+
+static int reprioritize(Proc*);
+static void updatecpu(Proc*);
+
+static void rebalance(void);
+
+char *statename[] =
+{	/* BUG: generate automatically */
+	"Dead",
+	"Moribund",
+	"Ready",
+	"Scheding",
+	"Running",
+	"Queueing",
+	"QueueingR",
+	"QueueingW",
+	"Wakeme",
+	"Broken",
+	"Stopped",
+	"Rendez",
+	"Waitrelease",
+	"Exotic",
+	"Down",
+};
+
+Sched*
+procsched(Proc *)
+{
+	return &run;
+}
+
+/*
+ * bad planning, once more.
+ */
+void
+procinit0(void)
+{
+	run.schedgain = 30;
+}
+
+/*
+ * Always splhi()'ed.
+ */
+void
+schedinit(void)		/* never returns */
+{
+	Edf *e;
+
+	m->inidle = 1;
+	m->proc = nil;
+
+	setlabel(&m->sched);
+	if(up) {
+		if((e = up->edf) && (e->flags & Admitted))
+			edfrecord(up);
+		m->qstart = 0;
+		m->qexpired = 0;
+		coherence();
+		m->proc = 0;
+		switch(up->state) {
+		case Running:
+			ready(up);
+			break;
+		case Moribund:
+			up->state = Dead;
+			stopac();
+			edfstop(up);
+			if (up->edf)
+				free(up->edf);
+			up->edf = nil;
+
+			/*
+			 * Holding locks from pexit:
+			 *	procalloc
+			 *	pga
+			 */
+			mmurelease(up);
+			unlock(&pga);
+
+			psrelease(up);
+			unlock(&procalloc);
+			break;
+		}
+		adec(&run.nrunning);
+		up->mach = nil;
+		updatecpu(up);
+		up = nil;
+	}
+	sched();
+}
+
+/*
+ * Check if the stack has more than 4*KiB free.
+ * Do not call panic, the stack is gigantic.
+ */
+static void
+stackok(void)
+{
+	char dummy;
+
+	if(&dummy < (char*)up->kstack + 4*KiB){
+		print("tc kernel stack overflow, cpu%d stopped\n", m->machno);
+		DONE();
+	}
+}
+
+/*
+ * If changing this routine, look also at sleep().  It
+ * contains a copy of the guts of sched().
+ */
+void
+sched(void)
+{
+	Proc *p;
+
+	if(m->ilockdepth)
+		panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
+			m->machno,
+			m->ilockdepth,
+			up? up->lastilock: nil,
+			(up && up->lastilock)? up->lastilock->pc: 0,
+			getcallerpc(&p+2));
+
+	if(up){
+		/*
+		 * Delay the sched until the process gives up the locks
+		 * it is holding.  This avoids dumb lock loops.
+		 * Don't delay if the process is Moribund.
+		 * It called sched to die.
+		 * But do sched eventually.  This avoids a missing unlock
+		 * from hanging the entire kernel.
+		 * But don't reschedule procs holding palloc or procalloc.
+		 * Those are far too important to be holding while asleep.
+		 *
+		 * This test is not exact.  There can still be a few
+		 * instructions in the middle of taslock when a process
+		 * holds a lock but Lock.p has not yet been initialized.
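+		 *
+		 * The pattern this protects looks like the following
+		 * (a sketch; it assumes the usual port/taslock.c unlock(),
+		 * which itself calls sched() once nlocks drops back to 0
+		 * with delaysched set):
+		 *
+		 *	lock(&l);
+		 *	...		clock ticks here only set up->delaysched
+		 *	unlock(&l);	nlocks is 0 again: the deferred sched() runs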
+ */ + if(up->nlocks) + if(up->state != Moribund) + if(up->delaysched < 20 + || pga.Lock.p == up + || procalloc.Lock.p == up){ + up->delaysched++; + run.delayedscheds++; + ainc(&run.nrunning); + return; + } + up->delaysched = 0; + + splhi(); + /* statistics */ + if(up->nqtrap == 0 && up->nqsyscall == 0) + up->nfullq++; + m->cs++; + + stackok(); + procsave(up); + mmuflushtlb(m->pml4->pa); + if(setlabel(&up->sched)){ + procrestore(up); + spllo(); + return; + } + gotolabel(&m->sched); + } + + m->inidle = 1; + p = runproc(); /* core 0 never returns */ + m->inidle = 0; + + if(!p->edf){ + updatecpu(p); + p->priority = reprioritize(p); + } + up = p; + m->qstart = m->ticks; + up->nqtrap = 0; + up->nqsyscall = 0; + up->state = Running; + up->mach = m; + m->proc = up; + mmuswitch(up); + + assert(!up->wired || up->wired == m); + gotolabel(&up->sched); +} + +int +anyready(void) +{ + return run.runvec; +} + +int +anyhigher(void) +{ + return run.runvec & ~((1<<(up->priority+1))-1); +} + +int +anyactive(void) +{ + return run.runvec != 0 || run.nrunning != 0; +} + +/* + * here once per clock tick to see if we should resched + */ + +void +hzsched(void) +{ + /* once a second, rebalance will reprioritize ready procs */ + if(m->machno == 0){ + rebalance(); + return; + } + + /* with <= 4 cores, we use SMP and core 0 does not set qexpired for us */ + if(sys->nmach <= AMPmincores) + if(m->ticks - m->qstart >= HZ/10) + m->qexpired = 1; + + /* unless preempted, get to run */ + if(m->qexpired && anyready()) + up->delaysched++; + + /* BUG, not enough if the number of cores can change */ + if(isbooting(m) && sys->nmach > AMPmincores) + sched(); + +} + +/* + * here at the end of non-clock interrupts to see if we should preempt the + * current process. Returns 1 if preempted, 0 otherwise. + */ +int +preempted(void) +{ + if(up && up->state == Running) + if(up->preempted == 0) + if(anyhigher()) + if(!active.exiting){ + /* Core 0 is dispatching all interrupts, so no core + * actually running a user process is ever going call preempted, unless + * we consider IPIs for preemption or we distribute interrupts. + * But we are going to use SMP for machines with few cores. + panic("preemted used"); + */ + + up->preempted = 1; + sched(); + splhi(); + up->preempted = 0; + return 1; + } + return 0; +} + +/* + * Update the cpu time average for this particular process, + * which is about to change from up -> not up or vice versa. + * p->lastupdate is the last time an updatecpu happened. + * + * The cpu time average is a decaying average that lasts + * about D clock ticks. D is chosen to be approximately + * the cpu time of a cpu-intensive "quick job". A job has to run + * for approximately D clock ticks before we home in on its + * actual cpu usage. Thus if you manage to get in and get out + * quickly, you won't be penalized during your burst. Once you + * start using your share of the cpu for more than about D + * clock ticks though, your p->cpu hits 1000 (1.0) and you end up + * below all the other quick jobs. Interactive tasks, because + * they basically always use less than their fair share of cpu, + * will be rewarded. + * + * If the process has not been running, then we want to + * apply the filter + * + * cpu = cpu * (D-1)/D + * + * n times, yielding + * + * cpu = cpu * ((D-1)/D)^n + * + * but D is big enough that this is approximately + * + * cpu = cpu * (D-n)/D + * + * so we use that instead. + * + * If the process has been running, we apply the filter to + * 1 - cpu, yielding a similar equation. 
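+ * For instance (assuming HZ = 100, so D = schedgain*HZ*Scaling =
+ * 30*100*2 = 6000): a process at cpu = 1000 that then sleeps for one
+ * second (n = 200) decays to 1000*(6000-200)/6000, about 966; once it
+ * has slept for schedgain = 30 seconds, n is capped at D and cpu
+ * reaches 0 exactly.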
Note that cpu is
+ * stored in fixed point (* 1000).
+ *
+ * Updatecpu must be called before changing up, in order
+ * to maintain accurate cpu usage statistics.  It can be called
+ * at any time to bring the stats for a given proc up-to-date.
+ */
+static void
+updatecpu(Proc *p)
+{
+	int D, n, t, ocpu;
+
+	if(p->edf)
+		return;
+
+	t = sys->ticks*Scaling + Scaling/2;
+	n = t - p->lastupdate;
+	p->lastupdate = t;
+
+	if(n == 0)
+		return;
+	D = run.schedgain*HZ*Scaling;
+	if(n > D)
+		n = D;
+
+	ocpu = p->cpu;
+	if(p != up)
+		p->cpu = (ocpu*(D-n))/D;
+	else{
+		t = 1000 - ocpu;
+		t = (t*(D-n))/D;
+		p->cpu = 1000 - t;
+	}
+
+//iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
+}
+
+/*
+ * On average, p has used p->cpu of a cpu recently.
+ * Its fair share is nmach/m->load of a cpu.  If it has been getting
+ * too much, penalize it.  If it has been getting not enough, reward it.
+ * I don't think you can get much more than your fair share that
+ * often, so most of the queues are for using less.  Having a priority
+ * of 3 means you're just right.  Having a higher priority (up to p->basepri)
+ * means you're not using as much as you could.
+ */
+static int
+reprioritize(Proc *p)
+{
+	int fairshare, n, load, ratio;
+
+	load = sys->load;
+	if(load == 0)
+		return p->basepri;
+
+	/*
+	 * fairshare = 1.000 * conf.nproc * 1.000/load,
+	 * except the decimal point is moved three places
+	 * on both load and fairshare.
+	 */
+	fairshare = (sys->nmach*1000*1000)/load;
+	n = p->cpu;
+	if(n == 0)
+		n = 1;
+	ratio = (fairshare+n/2) / n;
+	if(ratio > p->basepri)
+		ratio = p->basepri;
+	if(ratio < 0)
+		panic("reprioritize");
+//iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
+	return ratio;
+}
+
+/*
+ * add a process to a scheduling queue
+ */
+static void
+queueproc(Sched *sch, Schedq *rq, Proc *p, int locked)
+{
+	int pri;
+
+	pri = rq - sch->runq;
+	if(!locked)
+		lock(sch);
+	else if(canlock(sch))
+		panic("queueproc: locked and can lock");
+	p->priority = pri;
+	p->rnext = 0;
+	if(rq->tail)
+		rq->tail->rnext = p;
+	else
+		rq->head = p;
+	rq->tail = p;
+	rq->n++;
+	sch->nrdy++;
+	sch->runvec |= 1<<pri;
+	if(!locked)
+		unlock(sch);
+}
+
+/*
+ * try to remove a process from a scheduling queue (called splhi)
+ */
+static Proc*
+dequeueproc(Sched *sch, Schedq *rq, Proc *tp)
+{
+	Proc *l, *p;
+
+	if(!canlock(sch))
+		return nil;
+
+	/*
+	 * the queue may have changed before we locked runq,
+	 * refind the target process.
+	 */
+	l = 0;
+	for(p = rq->head; p; p = p->rnext){
+		if(p == tp)
+			break;
+		l = p;
+	}
+
+	/*
+	 * p->mach==0 only when process state is saved
+	 */
+	if(p == 0 || p->mach){
+		unlock(sch);
+		return nil;
+	}
+	if(p->rnext == 0)
+		rq->tail = l;
+	if(l)
+		l->rnext = p->rnext;
+	else
+		rq->head = p->rnext;
+	if(rq->head == nil)
+		sch->runvec &= ~(1<<(rq-sch->runq));
+	rq->n--;
+	sch->nrdy--;
+	if(p->state != Ready)
+		print("dequeueproc %s %d %s\n", p->text, p->pid, statename[p->state]);
+
+	unlock(sch);
+	return p;
+}
+
+static void
+schedready(Sched *sch, Proc *p, int locked)
+{
+	Mpl pl;
+	int pri;
+	Schedq *rq;
+
+	pl = splhi();
+	if(edfready(p)){
+		splx(pl);
+		return;
+	}
+
+	updatecpu(p);
+	pri = reprioritize(p);
+	p->priority = pri;
+	rq = &sch->runq[pri];
+	p->state = Ready;
+	queueproc(sch, rq, p, locked);
+	if(p->trace)
+		proctrace(p, SReady, 0);
+	splx(pl);
+}
+
+/*
+ * ready(p) picks a new priority for a process and sticks it in the
+ * runq for that priority.
+ */
+void
+ready(Proc *p)
+{
+	if(p->state == Exotic)
+		adec(&run.nrunning);
+	schedready(procsched(p), p, 0);
+}
+
+/*
+ * yield the processor and drop our priority
+ */
+void
+yield(void)
+{
+	if(anyready()){
+		/* pretend we just used 1/2 tick */
+		up->lastupdate -= Scaling/2;
+		sched();
+	}
+}
+
+/*
+ * recalculate priorities once a second.  We need to do this
+ * since priorities will otherwise only be recalculated when
+ * the running process blocks.
+ */
+static void
+rebalance(void)
+{
+	Mpl pl;
+	int pri, npri, t;
+	Schedq *rq;
+	Proc *p;
+
+	t = m->ticks;
+	if(t - run.balancetime < HZ)
+		return;
+	run.balancetime = t;
+
+	for(pri=0, rq=run.runq; pri<Npriq; pri++, rq++){
+another:
+		p = rq->head;
+		if(p == nil)
+			continue;
+		if(p->mp != m)
+			continue;
+		if(pri == p->basepri)
+			continue;
+		updatecpu(p);
+		npri = reprioritize(p);
+		if(npri != pri){
+			pl = splhi();
+			p = dequeueproc(&run, rq, p);
+			if(p)
+				queueproc(&run, &run.runq[npri], p, 0);
+			splx(pl);
+			goto another;
+		}
+	}
+}
+
+/*
+ * Process p is ready to run, but there's no available core.
+ * Try to make a core available by
+ * 1. preempting a process with lower priority, or
+ * 2. preempting one with the same priority that has run for more than HZ/10, or
+ * 3. rescheduling one that has run for more than HZ, in the hope that its
+ *    priority gets lowered.
+ */
+static void
+preemptfor(Proc *p)
+{
+	ulong delta;
+	uint i, j, rr;
+	Proc *mup;
+	Mach *mp;
+
+	assert(m->machno == 0);
+	/*
+	 * try to preempt a lower priority process first, default back to
+	 * round robin otherwise.
+	 */
+	for(rr = 0; rr < 2; rr++)
+		for(i = 0; i < MACHMAX; i++){
+			j = pickcore(p->color, i);
+			if((mp = sys->machptr[j]) != nil && mp->nixrole == NIXTC){
+				if(mp == m)
+					continue;
+				if(isbooting(mp)){
+					print("isbooting\n");
+					continue;
+				}
+				/*
+				 * Caution here: mp->proc can change, even die.
+				 */
+				mup = mp->proc;
+				if(mup == nil)		/* one got idle */
+					return;
+				delta = mp->ticks - mp->qstart;
+				if(mup->priority < p->priority){
+					mp->qexpired = 1;
+					return;
+				}
+				if(rr && mup->priority == p->priority && delta > HZ/10){
+					mp->qexpired = 1;
+					return;
+				}
+				if(rr & delta > HZ){
+					mp->qexpired = 1;
+					return;
+				}
+			}
+		}
+}
+
+/*
+ * Scheduling thread, run as the main loop of cpu 0.
+ * Used in AMP sched.
+ */
+static void
+mach0sched(void)
+{
+	Schedq *rq;
+	Proc *p;
+	Mach *mp;
+	ulong start, now;
+	int n, i, j;
+
+	assert(m->machno == 0);
+	acmodeset(NIXKC);		/* we don't time share any more */
+	n = 0;
+	start = perfticks();
+loop:
+
+	/*
+	 * find a ready process that we might run.
+	 */
+	spllo();
+	for(rq = &run.runq[Nrq-1]; rq >= run.runq; rq--)
+		for(p = rq->head; p; p = p->rnext){
+			/*
+			 * wired processes may only run when their core is available.
+			 */
+			if(p->wired != nil){
+				if(p->wired->proc == nil)
+					goto found;
+				continue;
+			}
+			/*
+			 * find a ready process that did run at an available core
+			 * or one that has not moved for some time.
+			 */
+			if(p->mp == nil || p->mp->proc == nil || n>0)
+				goto found;
+		}
+	/* waste time or halt the CPU */
+	if(!anyactive())
+		idlehands();
+	/* remember how much time we're here */
+	now = perfticks();
+	m->perf.inidle += now-start;
+	start = now;
+	n++;
+	goto loop;
+
+found:
+	assert(m->machno == 0);
+	splhi();
+	/*
+	 * find a core for this process, but honor wiring.
+	 */
+	mp = p->wired;
+	if(mp != nil){
+		if(mp->proc != nil)
+			goto loop;
+	}else{
+		for(i = 0; i < MACHMAX; i++){
+			j = pickcore(p->color, i);
+			if((mp = sys->machptr[j]) != nil && mp->nixrole == NIXTC){
+				if(isbooting(mp))
+					continue;
+				if(mp != m && mp->proc == nil)
+					break;
+			}
+		}
+		if(i == MACHMAX){
+			preemptfor(p);
+			goto loop;
+		}
+	}
+
+	p = dequeueproc(&run, rq, p);
+	mp->proc = p;
+	if(p != nil){
+		p->state = Scheding;
+		p->mp = mp;
+	}
+
+	n = 0;
+	goto loop;
+}
+
+/*
+ * SMP performs better than AMP with few cores.
+ * So, leave this here for now.
We should probably + * write a unified version of runproc good enough for + * both SMP and AMP. + */ +static Proc* +smprunproc(void) +{ + Schedq *rq; + Proc *p; + ulong start, now; + int i; + + start = perfticks(); + run.preempts++; + +loop: + /* + * find a process that last ran on this processor (affinity), + * or one that hasn't moved in a while (load balancing). Every + * time around the loop affinity goes down. + */ + spllo(); + if(isbooting(m)) + tcquiesce(); + for(i = 0;; i++){ + /* + * find the highest priority target process that this + * processor can run given affinity constraints. + * + */ + for(rq = &run.runq[Nrq-1]; rq >= run.runq; rq--){ + for(p = rq->head; p; p = p->rnext){ + if(p->mp == nil || p->mp == sys->machptr[m->machno] + || (!p->wired && i > 0)) + goto found; + } + } + + /* waste time or halt the CPU */ + if(!anyactive()) + idlehands(); + if(isbooting(m)) + tcquiesce(); + /* remember how much time we're here */ + now = perfticks(); + m->perf.inidle += now-start; + start = now; + } + +found: + splhi(); + p = dequeueproc(&run, rq, p); + if(p == nil) + goto loop; + + p->state = Scheding; + p->mp = sys->machptr[m->machno]; + + if(edflock(p)){ + edfrun(p, rq == &run.runq[PriEdf]); /* start deadline timer and do admin */ + edfunlock(); + } + if(p->trace) + proctrace(p, SRun, 0); + ainc(&run.nrunning); + return p; +} + +/* + * pick a process to run. + * most of this is used in AMP sched. + * (on a quad core or less, we use SMP). + * In the case of core 0 we always return nil, but + * schedule the picked process at any other available TC. + * In the case of other cores we wait until a process is given + * by core 0. + */ +Proc* +runproc(void) +{ + Schedq *rq; + Proc *p; + ulong start, now; + + if(sys->nmach <= AMPmincores) + return smprunproc(); + + start = perfticks(); + run.preempts++; + rq = nil; + if(m->machno != 0){ + do{ + spllo(); + while(m->proc == nil){ + if(isbooting(m)){ + coherence(); + if(m->proc != nil) + break; + tcquiesce(); + } + waitwhile(&m->proc, (uintptr)nil); + } + now = perfticks(); + m->perf.inidle += now-start; + start = now; + splhi(); + p = m->proc; + }while(p == nil); + p->state = Scheding; + p->mp = sys->machptr[m->machno]; + + if(edflock(p)){ + edfrun(p, rq == &run.runq[PriEdf]); /* start deadline timer and do admin */ + edfunlock(); + } + if(p->trace) + proctrace(p, SRun, 0); + ainc(&run.nrunning); + return p; + } + + mach0sched(); + return nil; /* not reached */ +} + +int +canpage(Proc *p) +{ + int ok; + Sched *sch; + + splhi(); + sch = procsched(p); + lock(sch); + /* Only reliable way to see if we are Running */ + if(p->mach == 0) { + p->newtlb = 1; + ok = 1; + } + else + ok = 0; + unlock(sch); + spllo(); + + return ok; +} + +Proc* +newproc(void) +{ + Proc *p; + + p = psalloc(); + + p->state = Scheding; + p->psstate = "New"; + p->mach = 0; + p->qnext = 0; + p->nchild = 0; + p->nwait = 0; + p->waitq = 0; + p->parent = 0; + p->pgrp = 0; + p->egrp = 0; + p->fgrp = 0; + p->rgrp = 0; + p->pdbg = 0; + p->kp = 0; + if(up != nil && up->procctl == Proc_tracesyscall) + p->procctl = Proc_tracesyscall; + else + p->procctl = 0; + p->syscalltrace = nil; + p->notepending = 0; + p->ureg = 0; + p->privatemem = 0; + p->noswap = 0; + p->errstr = p->errbuf0; + p->syserrstr = p->errbuf1; + p->errbuf0[0] = '\0'; + p->errbuf1[0] = '\0'; + p->nlocks = 0; + p->delaysched = 0; + p->trace = 0; + kstrdup(&p->user, "*nouser"); + kstrdup(&p->text, "*notext"); + kstrdup(&p->args, ""); + p->nargs = 0; + p->setargs = 0; + memset(p->seg, 0, sizeof p->seg); + p->pid = 
incref(&pidalloc);
+	pshash(p);
+	p->noteid = incref(&noteidalloc);
+	if(p->pid <= 0 || p->noteid <= 0)
+		panic("pidalloc");
+	if(p->kstack == 0)
+		p->kstack = smalloc(KSTACK);
+
+	/* sched params */
+	p->mp = 0;
+	p->wired = 0;
+	procpriority(p, PriNormal, 0);
+	p->cpu = 0;
+	p->lastupdate = sys->ticks*Scaling;
+	p->edf = nil;
+
+	p->ntrap = 0;
+	p->nintr = 0;
+	p->nsyscall = 0;
+	p->nactrap = 0;
+	p->nacsyscall = 0;
+	p->nicc = 0;
+	p->actime = 0ULL;
+	p->tctime = 0ULL;
+	p->ac = nil;
+	p->nfullq = 0;
+	memset(&p->PMMU, 0, sizeof p->PMMU);
+	return p;
+}
+
+/*
+ * wire this proc to a machine
+ */
+void
+procwired(Proc *p, int bm)
+{
+	Proc *pp;
+	int i;
+	char nwired[MACHMAX];
+	Mach *wm;
+
+	if(bm < 0){
+		/* pick a machine to wire to */
+		memset(nwired, 0, sizeof(nwired));
+		p->wired = 0;
+		for(i=0; (pp = psincref(i)) != nil; i++){
+			wm = pp->wired;
+			if(wm && pp->pid)
+				nwired[wm->machno]++;
+			psdecref(pp);
+		}
+		bm = 0;
+		for(i=0; i<sys->nmach; i++)
+			if(nwired[i] < nwired[bm])
+				bm = i;
+	} else {
+		/* use the virtual machine requested */
+		bm = bm % sys->nmach;
+	}
+
+	p->wired = sys->machptr[bm];
+	p->mp = p->wired;
+
+	/*
+	 * adjust our color to the new domain.
+	 */
+	if(up == nil || p != up)
+		return;
+	up->color = corecolor(up->mp->machno);
+	qlock(&up->seglock);
+	for(i = 0; i < NSEG; i++)
+		if(up->seg[i])
+			up->seg[i]->color = up->color;
+	qunlock(&up->seglock);
+}
+
+void
+procpriority(Proc *p, int pri, int fixed)
+{
+	if(pri >= Npriq)
+		pri = Npriq - 1;
+	else if(pri < 0)
+		pri = 0;
+	p->basepri = pri;
+	p->priority = pri;
+	if(fixed){
+		p->fixedpri = 1;
+	} else {
+		p->fixedpri = 0;
+	}
+}
+
+/*
+ * sleep if a condition is not true.  Another process will
+ * awaken us after it sets the condition.  When we awaken
+ * the condition may no longer be true.
+ *
+ * we lock both the process and the rendezvous to keep r->p
+ * and p->r synchronized.
+ */
+void
+sleep(Rendez *r, int (*f)(void*), void *arg)
+{
+	Mpl pl;
+
+	pl = splhi();
+
+	if(up->nlocks)
+		print("process %d sleeps with %d locks held, last lock %#p locked at pc %#p, sleep called from %#p\n",
+			up->pid, up->nlocks, up->lastlock, up->lastlock->pc, getcallerpc(&r));
+	lock(r);
+	lock(&up->rlock);
+	if(r->p){
+		print("double sleep called from %#p, %d %d\n",
+			getcallerpc(&r), r->p->pid, up->pid);
+		dumpstack();
+	}
+
+	/*
+	 * Wakeup only knows there may be something to do by testing
+	 * r->p in order to get something to lock on.
+	 * Flush that information out to memory in case the sleep is
+	 * committed.
+	 */
+	r->p = up;
+
+	if((*f)(arg) || up->notepending){
+		/*
+		 * if condition happened or a note is pending
+		 * never mind
+		 */
+		r->p = nil;
+		unlock(&up->rlock);
+		unlock(r);
+	} else {
+		/*
+		 * now we are committed to
+		 * change state and call scheduler
+		 */
+		if(up->trace)
+			proctrace(up, SSleep, 0);
+		up->state = Wakeme;
+		up->r = r;
+
+		/* statistics */
+		m->cs++;
+
+		procsave(up);
+		mmuflushtlb(m->pml4->pa);
+		if(setlabel(&up->sched)) {
+			/*
+			 * here when the process is awakened
+			 */
+			procrestore(up);
+		} else {
+			/*
+			 * here to go to sleep (i.e.
stop Running) + */ + unlock(&up->rlock); + unlock(r); + gotolabel(&m->sched); + } + } + + if(up->notepending) { + up->notepending = 0; + splx(pl); + if(up->procctl == Proc_exitme && up->closingfgrp) + forceclosefgrp(); + error(Eintr); + } + + splx(pl); +} + +static int +tfn(void *arg) +{ + return up->trend == nil || up->tfn(arg); +} + +void +twakeup(Ureg*, Timer *t) +{ + Proc *p; + Rendez *trend; + + p = t->ta; + trend = p->trend; + p->trend = 0; + if(trend) + wakeup(trend); +} + +void +tsleep(Rendez *r, int (*fn)(void*), void *arg, long ms) +{ + if (up->tt){ + print("tsleep: timer active: mode %d, tf %#p\n", + up->tmode, up->tf); + timerdel(up); + } + up->tns = MS2NS(ms); + up->tf = twakeup; + up->tmode = Trelative; + up->ta = up; + up->trend = r; + up->tfn = fn; + timeradd(up); + + if(waserror()){ + timerdel(up); + nexterror(); + } + sleep(r, tfn, arg); + if (up->tt) + timerdel(up); + up->twhen = 0; + poperror(); +} + +/* + * Expects that only one process can call wakeup for any given Rendez. + * We hold both locks to ensure that r->p and p->r remain consistent. + * Richard Miller has a better solution that doesn't require both to + * be held simultaneously, but I'm a paranoid - presotto. + */ +Proc* +wakeup(Rendez *r) +{ + Mpl pl; + Proc *p; + + pl = splhi(); + + lock(r); + p = r->p; + + if(p != nil){ + lock(&p->rlock); + if(p->state != Wakeme || p->r != r) + panic("wakeup: state"); + r->p = nil; + p->r = nil; + ready(p); + unlock(&p->rlock); + } + unlock(r); + + splx(pl); + + return p; +} + +/* + * if waking a sleeping process, this routine must hold both + * p->rlock and r->lock. However, it can't know them in + * the same order as wakeup causing a possible lock ordering + * deadlock. We break the deadlock by giving up the p->rlock + * lock if we can't get the r->lock and retrying. + */ +int +postnote(Proc *p, int dolock, char *n, int flag) +{ + Mpl pl; + int ret; + Rendez *r; + Proc *d, **l; + + if(dolock) + qlock(&p->debug); + + if(flag != NUser && (p->notify == 0 || p->notified)) + p->nnote = 0; + + ret = 0; + if(p->nnote < NNOTE) { + strcpy(p->note[p->nnote].msg, n); + p->note[p->nnote++].flag = flag; + ret = 1; + } + p->notepending = 1; + + /* NIX */ + if(p->state == Exotic){ + /* it could be that the process is not running + * in the AC when we interrupt the AC, but then + * we'd only get an extra interrupt in the AC, and + * nothing should happen. + */ + intrac(p); + } + + if(dolock) + qunlock(&p->debug); + + /* this loop is to avoid lock ordering problems. */ + for(;;){ + pl = splhi(); + lock(&p->rlock); + r = p->r; + + /* waiting for a wakeup? 
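(A sketch of the protocol being raced
+		 * against here, with Ctlr and isdone hypothetical:
+		 *
+		 *	static int
+		 *	isdone(void *a)
+		 *	{
+		 *		return ((Ctlr*)a)->done;
+		 *	}
+		 *
+		 *	sleeper:	sleep(&c->r, isdone, c);
+		 *	waker:		c->done = 1; wakeup(&c->r);
+		 *
+		 * sleep() re-evaluates isdone() after publishing r->p, so a
+		 * wakeup cannot be lost between the check and the commit.)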
 */
+		if(r == nil)
+			break;	/* no */
+
+		/* try for the second lock */
+		if(canlock(r)){
+			if(p->state != Wakeme || r->p != p)
+				panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
+			p->r = nil;
+			r->p = nil;
+			ready(p);
+			unlock(r);
+			break;
+		}
+
+		/* give other process time to get out of critical section and try again */
+		unlock(&p->rlock);
+		splx(pl);
+		sched();
+	}
+	unlock(&p->rlock);
+	splx(pl);
+
+	if(p->state != Rendezvous){
+		if(p->state == Semdown || p->state == Semalt)
+			ready(p);
+		return ret;
+	}
+	/* Try and pull out of a rendezvous */
+	lock(p->rgrp);
+	if(p->state == Rendezvous) {
+		p->rendval = ~0;
+		l = &REND(p->rgrp, p->rendtag);
+		for(d = *l; d; d = d->rendhash) {
+			if(d == p) {
+				*l = p->rendhash;
+				break;
+			}
+			l = &d->rendhash;
+		}
+		ready(p);
+	}
+	unlock(p->rgrp);
+	return ret;
+}
+
+/*
+ * weird thing: keep at most NBROKEN around
+ */
+#define	NBROKEN 4
+struct
+{
+	QLock;
+	int	n;
+	Proc	*p[NBROKEN];
+}broken;
+
+void
+addbroken(Proc *p)
+{
+	qlock(&broken);
+	if(broken.n == NBROKEN) {
+		ready(broken.p[0]);
+		memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
+		--broken.n;
+	}
+	broken.p[broken.n++] = p;
+	qunlock(&broken);
+
+	stopac();
+	edfstop(up);
+	p->state = Broken;
+	p->psstate = 0;
+	sched();
+}
+
+void
+unbreak(Proc *p)
+{
+	int b;
+
+	qlock(&broken);
+	for(b=0; b < broken.n; b++)
+		if(broken.p[b] == p) {
+			broken.n--;
+			memmove(&broken.p[b], &broken.p[b+1],
+					sizeof(Proc*)*(NBROKEN-(b+1)));
+			ready(p);
+			break;
+		}
+	qunlock(&broken);
+}
+
+int
+freebroken(void)
+{
+	int i, n;
+
+	qlock(&broken);
+	n = broken.n;
+	for(i=0; i<n; i++) {
+		ready(broken.p[i]);
+		broken.p[i] = 0;
+	}
+	broken.n = 0;
+	qunlock(&broken);
+
+	return n;
+}
+
+void
+pexit(char *exitstr, int freemem)
+{
+	Proc *p;
+	Segment **s, **es;
+	long utime, stime;
+	Waitq *wq, *f, *next;
+	Fgrp *fgrp;
+	Egrp *egrp;
+	Rgrp *rgrp;
+	Pgrp *pgrp;
+	Chan *dot;
+
+	if(0 && up->nfullq > 0)
+		iprint(" %s=%d", up->text, up->nfullq);
+	if(0 && up->nicc > 0)
+		iprint(" [%s nicc %ud tctime %ulld actime %ulld]\n",
+			up->text, up->nicc, up->tctime, up->actime);
+	if(up->syscalltrace != nil)
+		free(up->syscalltrace);
+	up->syscalltrace = nil;
+	up->alarm = 0;
+
+	if (up->tt)
+		timerdel(up);
+	if(up->trace)
+		proctrace(up, SDead, 0);
+
+	/* nil out all the resources under lock (free later) */
+	qlock(&up->debug);
+	fgrp = up->fgrp;
+	up->fgrp = nil;
+	egrp = up->egrp;
+	up->egrp = nil;
+	rgrp = up->rgrp;
+	up->rgrp = nil;
+	pgrp = up->pgrp;
+	up->pgrp = nil;
+	dot = up->dot;
+	up->dot = nil;
+	qunlock(&up->debug);
+
+	if(fgrp)
+		closefgrp(fgrp);
+	if(egrp)
+		closeegrp(egrp);
+	if(rgrp)
+		closergrp(rgrp);
+	if(dot)
+		cclose(dot);
+	if(pgrp)
+		closepgrp(pgrp);
+
+	/*
+	 * if not a kernel process and have a parent,
+	 * do some housekeeping.
+	 */
+	if(up->kp == 0) {
+		p = up->parent;
+		if(p == 0) {
+			if(exitstr == 0)
+				exitstr = "unknown";
+			panic("boot process died: %s", exitstr);
+		}
+
+		while(waserror())
+			;
+
+		wq = smalloc(sizeof(Waitq));
+		poperror();
+
+		wq->w.pid = up->pid;
+		utime = up->time[TUser] + up->time[TCUser];
+		stime = up->time[TSys] + up->time[TCSys];
+		wq->w.time[TUser] = tk2ms(utime);
+		wq->w.time[TSys] = tk2ms(stime);
+		wq->w.time[TReal] = tk2ms(sys->ticks - up->time[TReal]);
+		if(exitstr && exitstr[0])
+			snprint(wq->w.msg, sizeof(wq->w.msg), "%s %d: %s",
+				up->text, up->pid, exitstr);
+		else
+			wq->w.msg[0] = '\0';
+
+		lock(&p->exl);
+		/*
+		 * Check that parent is still alive.
+		 */
+		if(p->pid == up->parentpid && p->state != Broken) {
+			p->nchild--;
+			p->time[TCUser] += utime;
+			p->time[TCSys] += stime;
+			/*
+			 * If there would be more than 128 wait records
+			 * pending for my parent, then don't leave a wait
+			 * record behind.  This helps prevent badly written
+			 * daemon processes from accumulating lots of wait
+			 * records.
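+			 *
+			 * The consumer of these records is pwait() below,
+			 * surfaced to user space as wait(2); a typical
+			 * reaper loop is (sketch):
+			 *
+			 *	Waitmsg *w;
+			 *
+			 *	while((w = wait()) != nil){
+			 *		print("%d: %s\n", w->pid, w->msg);
+			 *		free(w);
+			 *	}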
+ */ + if(p->nwait < 128) { + wq->next = p->waitq; + p->waitq = wq; + p->nwait++; + wq = nil; + wakeup(&p->waitr); + } + } + unlock(&p->exl); + if(wq) + free(wq); + } + + if(!freemem) + addbroken(up); + + qlock(&up->seglock); + es = &up->seg[NSEG]; + for(s = up->seg; s < es; s++) { + if(*s) { + putseg(*s); + *s = 0; + } + } + qunlock(&up->seglock); + + lock(&up->exl); /* Prevent my children from leaving waits */ + psunhash(up); + up->pid = 0; + wakeup(&up->waitr); + unlock(&up->exl); + + for(f = up->waitq; f; f = next) { + next = f->next; + free(f); + } + + /* release debuggers */ + qlock(&up->debug); + if(up->pdbg) { + wakeup(&up->pdbg->sleep); + up->pdbg = 0; + } + qunlock(&up->debug); + + /* Sched must not loop for these locks */ + lock(&procalloc); + lock(&pga); + + stopac(); + edfstop(up); + up->state = Moribund; + sched(); + panic("pexit"); +} + +int +haswaitq(void *x) +{ + Proc *p; + + p = (Proc *)x; + return p->waitq != 0; +} + +int +pwait(Waitmsg *w) +{ + int cpid; + Waitq *wq; + + if(!canqlock(&up->qwaitr)) + error(Einuse); + + if(waserror()) { + qunlock(&up->qwaitr); + nexterror(); + } + + lock(&up->exl); + if(up->nchild == 0 && up->waitq == 0) { + unlock(&up->exl); + error(Enochild); + } + unlock(&up->exl); + + sleep(&up->waitr, haswaitq, up); + + lock(&up->exl); + wq = up->waitq; + up->waitq = wq->next; + up->nwait--; + unlock(&up->exl); + + qunlock(&up->qwaitr); + poperror(); + + if(w) + memmove(w, &wq->w, sizeof(Waitmsg)); + cpid = wq->w.pid; + free(wq); + + return cpid; +} + +void +dumpaproc(Proc *p) +{ + uintptr bss; + char *s; + + if(p == 0) + return; + + bss = 0; + if(p->seg[HSEG]) + bss = p->seg[HSEG]->top; + else if(p->seg[BSEG]) + bss = p->seg[BSEG]->top; + + s = p->psstate; + if(s == 0) + s = statename[p->state]; + print("%3d:%10s pc %#p dbgpc %#p %8s (%s) ut %ld st %ld bss %#p qpc %#p nl %d nd %lud lpc %#p pri %lud\n", + p->pid, p->text, p->pc, dbgpc(p), s, statename[p->state], + p->time[0], p->time[1], bss, p->qpc, p->nlocks, + p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority); +} + +void +procdump(void) +{ + int i; + Proc *p; + + if(up) + print("up %d\n", up->pid); + else + print("no current process\n"); + for(i=0; (p = psincref(i)) != nil; i++) { + if(p->state != Dead) + dumpaproc(p); + psdecref(p); + } +} + +/* + * wait till all processes have flushed their mmu + * state about segement s + */ +void +procflushseg(Segment *s) +{ + int i, ns, nm, nwait; + Proc *p; + Mach *mp; + + /* + * tell all processes with this + * segment to flush their mmu's + */ + nwait = 0; + for(i=0; (p = psincref(i)) != nil; i++) { + if(p->state == Dead){ + psdecref(p); + continue; + } + for(ns = 0; ns < NSEG; ns++){ + if(p->seg[ns] == s){ + p->newtlb = 1; + for(nm = 0; nm < MACHMAX; nm++) + if((mp = sys->machptr[nm]) != nil && mp->nixrole != NIXUC) + if(mp->proc == p){ + mp->mmuflush = 1; + nwait++; + } + break; + } + } + psdecref(p); + } + + if(nwait == 0) + return; + + /* + * wait for all processors to take a clock interrupt + * and flush their mmu's. + * NIX BUG: this won't work if another core is in AC mode. + * In that case we must IPI it, but only if that core is + * using this segment. 
+ */ + for(i = 0; i < MACHMAX; i++) + if((mp = sys->machptr[i]) != nil && mp->nixrole != NIXUC) + if(mp != m) + while(mp->mmuflush) + sched(); +} + +void +scheddump(void) +{ + Proc *p; + Schedq *rq; + + for(rq = &run.runq[Nrq-1]; rq >= run.runq; rq--){ + if(rq->head == 0) + continue; + print("run[%ld]:", rq-run.runq); + for(p = rq->head; p; p = p->rnext) + print(" %d(%lud)", p->pid, m->ticks - p->readytime); + print("\n"); + delay(150); + } + print("nrdy %d\n", run.nrdy); +} + +void +kproc(char *name, void (*func)(void *), void *arg) +{ + Proc *p; + static Pgrp *kpgrp; + + p = newproc(); + p->psstate = 0; + p->procmode = 0640; + p->kp = 1; + p->noswap = 1; + + p->scallnr = up->scallnr; + memmove(p->arg, up->arg, sizeof(up->arg)); + p->nerrlab = 0; + p->slash = up->slash; + p->dot = up->dot; + if(p->dot) + incref(p->dot); + + memmove(p->note, up->note, sizeof(p->note)); + p->nnote = up->nnote; + p->notified = 0; + p->lastnote = up->lastnote; + p->notify = up->notify; + p->ureg = 0; + p->dbgreg = 0; + + procpriority(p, PriKproc, 0); + + kprocchild(p, func, arg); + + kstrdup(&p->user, eve); + kstrdup(&p->text, name); + if(kpgrp == 0) + kpgrp = newpgrp(); + p->pgrp = kpgrp; + incref(kpgrp); + + memset(p->time, 0, sizeof(p->time)); + p->time[TReal] = sys->ticks; + ready(p); + /* + * since the bss/data segments are now shareable, + * any mmu info about this process is now stale + * and has to be discarded. + */ + p->newtlb = 1; + mmuflush(); +} + +/* + * called splhi() by notify(). See comment in notify for the + * reasoning. + */ +void +procctl(Proc *p) +{ + Mpl pl; + char *state; + + switch(p->procctl) { + case Proc_exitbig: + spllo(); + pexit("Killed: Insufficient physical memory", 1); + + case Proc_exitme: + spllo(); /* pexit has locks in it */ + pexit("Killed", 1); + + case Proc_traceme: + if(p->nnote == 0) + return; + /* No break */ + + case Proc_stopme: + p->procctl = 0; + state = p->psstate; + p->psstate = "Stopped"; + /* free a waiting debugger */ + pl = spllo(); + qlock(&p->debug); + if(p->pdbg) { + wakeup(&p->pdbg->sleep); + p->pdbg = 0; + } + qunlock(&p->debug); + splhi(); + p->state = Stopped; + sched(); + p->psstate = state; + splx(pl); + return; + + case Proc_toac: + p->procctl = 0; + /* + * This pretends to return from the system call, + * by moving to a core, but never returns (unless + * the process gets moved back to a TC.) 
+ */ + spllo(); + if(p->ac == nil) + getac(p, -1); + runacore(); + return; + + case Proc_totc: + p->procctl = 0; + if(p != up) + panic("procctl: stopac: p != up"); + spllo(); + stopac(); + return; + } +} + +void +error(char *err) +{ + spllo(); + + assert(up->nerrlab < NERR); + kstrcpy(up->errstr, err, ERRMAX); + setlabel(&up->errlab[NERR-1]); + nexterror(); +} + +void +nexterror(void) +{ + gotolabel(&up->errlab[--up->nerrlab]); +} + +void +exhausted(char *resource) +{ + char buf[ERRMAX]; + + sprint(buf, "no free %s", resource); + iprint("%s\n", buf); + error(buf); +} + +void +killbig(char *why) +{ + int i, x; + Segment *s; + ulong l, max; + Proc *p, *kp; + + max = 0; + kp = nil; + for(x = 0; (p = psincref(x)) != nil; x++) { + if(p->state == Dead || p->kp){ + psdecref(p); + continue; + } + l = 0; + for(i=1; iseg[i]; + if(s != 0) + l += s->top - s->base; + } + if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) { + if(kp != nil) + psdecref(kp); + kp = p; + max = l; + } + else + psdecref(p); + } + if(kp == nil) + return; + + print("%d: %s killed: %s\n", kp->pid, kp->text, why); + for(x = 0; (p = psincref(x)) != nil; x++) { + if(p->state == Dead || p->kp){ + psdecref(p); + continue; + } + if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG]) + p->procctl = Proc_exitbig; + psdecref(p); + } + + kp->procctl = Proc_exitbig; + for(i = 0; i < NSEG; i++) { + s = kp->seg[i]; + if(s != 0 && canqlock(&s->lk)) { + mfreeseg(s, s->base, (s->top - s->base)/BIGPGSZ); + qunlock(&s->lk); + } + } + psdecref(kp); +} + +/* + * change ownership to 'new' of all processes owned by 'old'. Used when + * eve changes. + */ +void +renameuser(char *old, char *new) +{ + int i; + Proc *p; + + for(i = 0; (p = psincref(i)) != nil; i++){ + if(p->user!=nil && strcmp(old, p->user)==0) + kstrdup(&p->user, new); + psdecref(p); + } +} + +/* + * time accounting called by clock() splhi'd + * only cpu1 computes system load average + * but the system load average is accounted for cpu0. + */ +void +accounttime(void) +{ + Proc *p; + ulong n, per; + + p = m->proc; + if(p) { + if(m->machno == 1) + run.nrun++; + p->time[p->insyscall]++; + } + + /* calculate decaying duty cycles */ + n = perfticks(); + per = n - m->perf.last; + m->perf.last = n; + per = (m->perf.period*(HZ-1) + per)/HZ; + if(per != 0) + m->perf.period = per; + + m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ; + m->perf.inidle = 0; + + m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ; + m->perf.inintr = 0; + + /* only one processor gets to compute system load averages. + * it has to be mach 1 when we use AMP. + */ + if(sys->nmach > 1 && m->machno != 1) + return; + + /* + * calculate decaying load average. + * if we decay by (n-1)/n then it takes + * n clock ticks to go from load L to .36 L once + * things quiet down. it takes about 5 n clock + * ticks to go to zero. so using HZ means this is + * approximately the load over the last second, + * with a tail lasting about 5 seconds. 
+ */ + n = run.nrun; + run.nrun = 0; + n = (run.nrdy+n)*1000; + sys->load = (sys->load*(HZ-1)+n)/HZ; +} + +void +halt(void) +{ + if(run.nrdy != 0) + return; + hardhalt(); +} --- /sys/src/nix/port/tcklock.c Thu Jun 14 09:36:26 2012 +++ /sys/src/nix/port/tcklock.c Thu Jun 21 12:23:24 2012 @@ -178,7 +178,7 @@ lockstats.glare++; i = 0; while(getticket(l->key) != myticket(user)){ - if(conf.nmach < 2 && up && up->edf && (up->edf->flags & Admitted)){ + if(sys->nmach < 2 && up && up->edf && (up->edf->flags & Admitted)){ /* * Priority inversion, yield on a uniprocessor; on a * multiprocessor, the other processor will unlock @@ -321,10 +321,10 @@ } void -portmwait(void *value, int val) +portwaitwhile(void *value, uintptr val) { - while (*(void**)value == val) + while (*(uintptr*)value == val) ; } -void (*mwait)(void *, int) = portmwait; +void (*waitwhile)(void *, uintptr) = portwaitwhile; --- /sys/src/nix/k10/acore.c Thu Jun 14 09:36:22 2012 +++ /sys/src/nix/k10/acore.c Thu Jun 14 11:53:21 2012 @@ -65,7 +65,7 @@ snprint((char*)mp->icc->data, ICCLNSZ, "<%d>", i); coherence(); mp->icc->fn = testiccfn; - mwait(&mp->icc->fn, 0); + waitwhile(&mp->icc->fn, 0); } /* @@ -99,7 +99,7 @@ acmmuswitch(); for(;;){ acstackok(); - mwait(&m->icc->fn, 0); + waitwhile(&m->icc->fn, 0); if(m->icc->flushtlb) acmmuswitch(); DBG("acsched: cpu%d: fn %#p\n", m->machno, m->icc->fn); @@ -210,7 +210,7 @@ m->icc->fn = nil; ready(m->proc); - mwait(&m->icc->fn, 0); + waitwhile(&m->icc->fn, 0); if(m->icc->flushtlb) acmmuswitch(); --- /sys/src/nix/k10/arch.c Thu Apr 12 12:26:27 2012 +++ /sys/src/nix/k10/arch.c Thu Jun 14 11:53:28 2012 @@ -95,12 +95,10 @@ * an interrupt will get us going again. * The boot TC in nix can't halt, because it must stay alert in * case an AC makes a handler process ready. - * We should probably use mwait in that case. + * We should probably use waitwhile in that case. */ void idlehands(void) { -if(0) - if(m->machno != 0) - halt(); + halt(); } --- /sys/src/nix/k10/archk10.c Thu Apr 12 12:26:27 2012 +++ /sys/src/nix/k10/archk10.c Thu Jun 14 11:53:34 2012 @@ -31,7 +31,7 @@ /* is mnonitor supported? 
*/ if (m->cpuinfo[1][2] & 8) { cpuid(5, 0, m->cpuinfo[2]); - mwait = k10mwait; + waitwhile = k10waitwhile; } return 1; --- /sys/src/nix/k10/fns.h Thu Jun 14 09:36:22 2012 +++ /sys/src/nix/k10/fns.h Thu Jun 14 11:53:41 2012 @@ -229,7 +229,7 @@ * archk10.c */ extern void millidelay(int); -extern void k10mwait(void*, int); +extern void k10waitwhile(void*, uintptr); /* * i8259.c --- /sys/src/nix/k10/l64v.s Thu Jun 14 09:36:23 2012 +++ /sys/src/nix/k10/l64v.s Tue Jun 26 14:03:54 2012 @@ -366,12 +366,12 @@ BYTE $0x0f; BYTE $0x01; BYTE $0xc8 /* MONITOR */ RET -TEXT _mwait(SB), 1, $-4 /* void mwait(u32int); */ +TEXT _waitwhile(SB), 1, $-4 /* void waitwhile(u32int); */ MOVLQZX RARG, CX /* optional extensions */ BYTE $0x0f; BYTE $0x01; BYTE $0xc9 /* MWAIT */ RET -TEXT k10mwait+0(SB),0,$16 +TEXT k10waitwhile+0(SB),0,$16 k10mwloop: MOVQ RARG, CX MOVQ val+8(FP), DX --- /sys/src/nix/k10/tcore.c Mon Jun 25 17:02:06 2012 +++ /sys/src/nix/k10/tcore.c Mon Jun 25 17:05:13 2012 @@ -124,9 +124,8 @@ mp->nixrole = NIXSC; mp->icc->fn = acquiesce; coherence(); - while(mp->icc->fn == acquiesce){ - mwait(&mp->nixrole, NIXSC); - } + while(mp->icc->fn == acquiesce) + waitwhile(&mp->nixrole, NIXSC); if(role == NIXOC){ mp->nixrole = role; apicnipi(mp->apicno); --- /usr/paurea/tcore.c Thu Jan 1 00:00:00 1970 +++ /usr/paurea/tcore.c Mon Jun 11 12:50:38 2012 @@ -0,0 +1,468 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +#include +#include +#include "amd64.h" +#include "ureg.h" +#include "io.h" + +Lock nixaclock; /* NIX AC lock; held while assigning procs to cores */ + +/* + * NIX support for the time sharing core. + */ + +extern void actrapret(void); +extern void acsysret(void); + +Mach* +getac(Proc *p, int core) +{ + int i; + Mach *mp; + + mp = nil; + if(core == 0) + panic("can't getac for a %s", rolename[NIXTC]); + lock(&nixaclock); + if(waserror()){ + unlock(&nixaclock); + nexterror(); + } + if(core > 0){ + if(core >= MACHMAX) + error("no such core"); + mp = sys->machptr[core]; + if(mp == nil || mp->nixrole == NIXUC || mp->proc != nil) + error("core not online or busy"); + if(mp->nixrole != NIXAC) + error("core is not an AC"); + Found: + mp->proc = p; + }else{ + for(i = 0; i < MACHMAX; i++) + if((mp = sys->machptr[i]) != nil && mp->nixrole == NIXAC){ + if(isbooting(mp)) + continue; + if(mp->proc == nil) + goto Found; + } + error("not enough cores"); + } + unlock(&nixaclock); + poperror(); + return mp; +} + +/* + * BUG: + * The AC must not accept interrupts while in the kernel, + * or we must be prepared for nesting them, which we are not. + * This is important for note handling, because postnote() + * assumes that it's ok to send an IPI to an AC, no matter its + * state. The /proc interface also assumes that. + * + */ +void +intrac(Proc *p) +{ + Mach *ac; + + ac = p->ac; + if(ac == nil){ + DBG("intrac: Proc.ac is nil. no ipi sent.\n"); + return; + } + /* + * It's ok if the AC gets idle in the mean time. + */ + DBG("intrac: ipi to cpu%d\n", ac->machno); + apicipi(ac->apicno); +} + +void +putac(Mach *m) +{ + coherence(); + m->proc = nil; +} + +void +stopac(void) +{ + Mach *mp; + + mp = up->ac; + if(mp == nil) + return; + if(mp->proc != up) + panic("stopac"); + + lock(&nixaclock); + up->ac = nil; + mp->proc = nil; + unlock(&nixaclock); + + /* TODO: + * send sipi to up->ac, it would rerun squidboy(), and + * wait for us to give it a function to run. + */ +} + + + +static void +roleac(Mach *mp, int role) +{ + if(mp == nil) + return; + /* wake it up... 
*/ + mp->nixrole = NIXSC; + mp->icc->fn = acquiesce; + coherence(); + while(mp->icc->fn == acquiesce){ + mwait(&mp->nixrole, NIXSC); + } + if(role == NIXOC){ + mp->nixrole = role; + apicnipi(mp->apicno); + }else + sipicore(mp->machno); +} + + +void +tcquiesce(void) +{ + /* sync, suicide is not possible */ + m->proc = nil; + m->icc->fn = nil; + + /* signal we are done */ + m->nixrole= NIXQC; + coherence(); + wakeup(&m->sipir); + for(;;) + halt(); +} + +void +rolestable(Mach *mp) +{ + mp->nixrole = mp->nnixrole; + mp->nnixrole = NIXSTABLE; + coherence(); +} + +int +isbooting(Mach *mp) +{ + return mp->nnixrole != NIXSTABLE; +} + +static int +donequiesce(void *x) +{ + Mach *mp; + mp = (Mach *)x; + return mp->nixrole == NIXQC; +} + +/* + * what should happen if you are wired and the core dissapears? + * for now, this is for testing and it executes in the context of + * a process (it shouldn't) + */ +int +changerole(int role, int core) +{ + int apicno; + Mach *mp, *mpc, *w; + + /* + * 1 *has* to be a TC. + */ + mpc = sys->machptr[1]; + if(core == 1 || core >= MACHMAX || !mpc->nixrole == NIXTC) + return -1; + w = up->wired; + procwired(up, 1); + if(m != mpc) + sched(); + mp = sys->machptr[core]; + + lock(&mp->sipilock); + apicno = mp->apicno; + if(isbooting(mp) && mp->nixrole != NIXOC){ + print("core is already rebooting, nnixrole %#ux\n", mp->nnixrole); + unlock(&mp->sipilock); + return -1; + } + mp->nnixrole = role; + unlock(&mp->sipilock); + switch(mp->nixrole){ + case NIXAC: + lock(&nixaclock); /* so noone reassigns the core */ + if(mp->proc != nil){ + mp->proc->procctl = Proc_totc; + unlock(&nixaclock); + apicipi(apicno); + }else{ + unlock(&nixaclock); + roleac(mp, role); + } + break; + case NIXTC: + mp->nixrole = NIXSC; + coherence(); + /* when idlehands sleeps, this needs to send a wake it up IPI? */ + //apicipi(mp->apicno); + sleep(&mp->sipir, donequiesce, mp); + + /* fall */ + case NIXOC: + mp->nnixrole = role; + coherence(); + if(role == NIXOC){ + mp->proc = nil; + mp->nixrole = NIXOC; + apicnipi(mp->apicno); + }else + sipicore(mp->machno); + break; + default: + print("don't know how to change my role\n"); + } + up->wired = w; + return 0; +} + +/* + * Functions starting with ac... are run in the application core. + * All other functions are run by the time-sharing cores. + */ + +typedef void (*APfunc)(void); +extern int notify(Ureg*); + +/* + * run an arbitrary function with arbitrary args on an ap core + * first argument is always pml4 for process + * make a field and a struct for the args cache line. + * + * Returns the return-code for the ICC or -1 if the process was + * interrupted while issuing the ICC. 
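+ *
+ * A note on the run.nrunning accounting this patch adds (inferred
+ * from the proc.c hunks above): a proc that moves to its AC keeps
+ * its count, since sched() only does adec(&run.nrunning) when
+ * up->state != Exotic; when the AC hands the proc back, ready()
+ * performs the adec, and runproc() will ainc again at the next
+ * dispatch.  anyactive() -- run.runvec || run.nrunning -- therefore
+ * means "some core is running or could run a proc", and a TC calls
+ * idlehands() only when it is false: a core parked in
+ * waitwhile(&m->proc, (uintptr)nil) wakes on a plain store to
+ * m->proc, but a halted core needs an interrupt.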
+ */ +int +runac(Mach *mp, APfunc func, int flushtlb, void *a, long n) +{ + uchar *dpg, *spg; + + if (n > sizeof(mp->icc->data)) + panic("runac: args too long"); + + if(mp->nixrole == NIXUC) + panic("Bad core"); + if(mp->proc != nil && mp->proc != up) + panic("runapfunc: mach is busy with another proc?"); + + memmove(mp->icc->data, a, n); + if(flushtlb){ + DBG("runac flushtlb: cppml4 %#p %#p\n", mp->pml4->pa, m->pml4->pa); + dpg = UINT2PTR(mp->pml4->va); + spg = UINT2PTR(m->pml4->va); + /* We should copy less: + * memmove(dgp, spg, m->pml4->daddr * sizeof(PTE)); + */ + memmove(dpg, spg, PTSZ); + if(0){ + print("runac: upac pml4 %#p\n", up->ac->pml4->pa); + dumpptepg(4, up->ac->pml4->pa); + } + } + mp->icc->flushtlb = flushtlb; + mp->icc->rc = ICCOK; + + DBG("runac: exotic proc on cpu%d\n", mp->machno); + qlock(&up->debug); + up->nicc++; + up->state = Exotic; + up->psstate = 0; + qunlock(&up->debug); + coherence(); + mp->icc->fn = func; + sched(); + return mp->icc->rc; +} + +/* + * Cleanup done by runacore to pretend we are going back to user space. + * We won't return and won't do what syscall() would normally do. + * Do it here instead. + */ +static void +fakeretfromsyscall(Ureg *ureg) +{ + int s; + + poperror(); /* as syscall() would do if we would return */ + if(up->procctl == Proc_tracesyscall){ /* Would this work? */ + up->procctl = Proc_stopme; + s = splhi(); + procctl(up); + splx(s); + } + + up->insyscall = 0; + /* if we delayed sched because we held a lock, sched now */ + if(up->delaysched){ + sched(); + splhi(); + } + kexit(ureg); +} + +/* + * Move the current process to an application core. + * This is performed at the end of execac(), and + * we pretend to be returning to user-space, but instead we + * dispatch the process to another core. + * 1. We do the final bookkeeping that syscall() would do after + * a return from sysexec(), because we are not returning. + * 2. We dispatch the process to an AC using an ICC. + * + * This function won't return unless the process is reclaimed back + * to the time-sharing core, and is the handler for the process + * to deal with traps and system calls until the process dies. + * + * Remember that this function is the "line" between user and kernel + * space, it's not expected to raise|handle any error. + * + * We install a safety error label, just in case we raise errors, + * which we shouldn't. (noerrorsleft knows that for exotic processes + * there is an error label pushed by us). + */ +void +runacore(void) +{ + Ureg *ureg; + void (*fn)(void); + int rc, flush, s; + char *n; + uvlong t1; + Mach *ac; + + if(waserror()) + panic("runacore: error: %s\n", up->errstr); + ureg = up->dbgreg; + fakeretfromsyscall(ureg); + fpusysrfork(ureg); + + procpriority(up, PriKproc, 1); + ac = up->ac; + rc = runac(up->ac, actouser, 1, nil, 0); + procpriority(up, PriNormal, 0); + for(;;){ + t1 = fastticks(nil); + flush = 0; + fn = nil; + switch(rc){ + case ICCTRAP: + s = splhi(); + m->cr2 = up->ac->cr2; + DBG("runacore: trap %ulld cr2 %#ullx ureg %#p\n", + ureg->type, m->cr2, ureg); + switch(ureg->type){ + case IdtIPI: + if(up->procctl || up->nnote) + notify(up->dbgreg); + if(up->ac == nil) + goto ToTC; + kexit(up->dbgreg); + break; + case IdtNM: + case IdtMF: + case IdtXF: + /* these are handled in the AC; + * If we get here, they left in m->icc->data + * a note to be posted to the process. + * Post it, and make the vector a NOP. 
+ */ + n = up->ac->icc->note; + if(n != nil) + postnote(up, 1, n, NDebug); + ureg->type = IdtIPI; /* NOP */ + break; + default: + cr3put(m->pml4->pa); + if(0 && ureg->type == IdtPF){ + print("before PF:\n"); + print("AC:\n"); + dumpptepg(4, up->ac->pml4->pa); + print("\n%s:\n", rolename[NIXTC]); + dumpptepg(4, m->pml4->pa); + } + trap(ureg); + } + splx(s); + flush = 1; + fn = actrapret; + break; + case ICCSYSCALL: + DBG("runacore: syscall ax %#ullx ureg %#p\n", + ureg->ax, ureg); + cr3put(m->pml4->pa); + syscall(ureg->ax, ureg); + flush = 1; + fn = acsysret; + if(0) + if(up->nqtrap > 2 || up->nsyscall > 1) + goto ToTC; + if(up->ac == nil) + goto ToTC; + break; + default: + panic("runacore: unexpected rc = %d", rc); + } + up->tctime += fastticks2us(fastticks(nil) - t1); + procpriority(up, PriExtra, 1); + rc = runac(up->ac, fn, flush, nil, 0); + procpriority(up, PriNormal, 0); + } +ToTC: + /* + * to procctl, then syscall, to + * be back in the TC + */ + DBG("runacore: up %#p: return\n", up); + if(isbooting(ac)){ + roleac(ac, ac->nnixrole); + } +} + +extern ACVctl *acvctl[]; + +void +actrapenable(int vno, char* (*f)(Ureg*, void*), void* a, char *name) +{ + ACVctl *v; + + if(vno < 0 || vno >= 256) + panic("actrapenable: vno %d\n", vno); + v = malloc(sizeof(Vctl)); + v->f = f; + v->a = a; + v->vno = vno; + strncpy(v->name, name, KNAMELEN); + v->name[KNAMELEN-1] = 0; + + if(acvctl[vno]) + panic("AC traps can't be shared"); + acvctl[vno] = v; +} + + --- /sys/src/nix/port/portdat.h Fri Jun 8 11:35:43 2012 +++ /sys/src/nix/port/portdat.h Mon Jul 2 10:39:35 2012 @@ -678,7 +678,6 @@ EXXC, /* want an XC for the exec'd image */ }; - /* * process memory segments - NSEG always last ! * HSEG is a potentially huge bss segment. @@ -754,7 +753,8 @@ Schedq runq[Nrq]; ulong runvec; int nmach; /* # of cores with this color */ - ulong nrun; /* to compute load */ + ulong nrunhz; /* to compute load */ + int nrunning; }; typedef union Ar0 Ar0;