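use a single lock around the run queue scan to reduce contention on
runq.  this trades contention on many cache lines for contention on
a single lock, which is a good trade because we now have pretty
efficient locks.  also move Migratedelay into the top-level enum and
reduce it from 10ms to 500µs.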

Reference: /n/atom/patch/applied/nixsched2
Date: Thu May 29 07:14:13 CES 2014
Signed-off-by: quanstro@quanstro.net

--- /sys/src/nix/port/proc.c	Thu May 29 07:13:11 2014
+++ /sys/src/nix/port/proc.c	Thu May 29 07:13:13 2014
@@ -13,6 +13,7 @@
 {
 	Scaling		= 2,
 	Shortburst	= HZ,		/* 1 second */
+	Migratedelay	= 500*1000,	/* 500µs — very little affinity */
 
 	/*
 	 * number of schedulers used.
@@ -23,7 +24,8 @@
 
 Ref	noteidalloc;
 
-static Ref pidalloc;
+static	Ref	pidalloc;
+static	uvlong	affdelay;
 
 /*
  * Many multiprocessor machines are NUMA
@@ -98,6 +100,10 @@
 {
 	Edf *e;
 
+	affdelay = fastticks2ns(Migratedelay);
+	if(affdelay == 0)
+		affdelay++;
+
 	m->inidle = 1;
 	ainc(&m->sch->nmach);
 
@@ -256,7 +262,7 @@
 
 	/* unless preempted, get to run for at least 100ms */
 	if(anyhigher()
-	|| (!m->proc->fixedpri && tickscmp(m->ticks, m->schedticks) > 0 && anyready())){
+	|| (!m->proc->fixedpri && tickscmp(m->ticks, m->schedticks) >= 0 && anyready())){
 		m->readied = nil;	/* avoid cooperative scheduling */
 		m->proc->delaysched++;
 	}
@@ -475,7 +481,7 @@
 	 * BUG: if schedready is called to rebalance the scheduler,
 	 * for another core, then this is wrong.
 	 */
-	if(up != p && (p->wired == nil || p->wired->machno == m->machno))
+	if(m->proc != p && (p->wired == nil || p->wired->machno == m->machno))
 		m->readied = p;	/* group scheduling */
 
 	updatecpu(p);
@@ -604,7 +610,7 @@
 			continue;
 		for(rq = &sch->runq[Nrq-1]; rq >= sch->runq; rq--){
 			for(p = rq->head; p != nil; p = p->rnext)
-				if(!p->wired && p->priority < PriKproc)
+				if(p->wired == nil && p->priority < PriKproc)
 					break;
 			if(p != nil && dequeueproc(sch, rq, p) != nil)
 				return p;
@@ -613,16 +619,6 @@
 	return nil;
 }
 
-enum {
-	/*
-	 * if there are more runnable processes than mach, and nmach>1,
-	 * and nproc%nmach != 0, we need to migrate processes around to
-	 * avoid serious unfairness.  this is architecture sensitive.
-	 */
-//	Migratedelay	= 250*1000,	/* 250 µs */
-	Migratedelay	= 10*1000*1000,	/* 10 ms */
-};
-
 /*
  *  pick a process to run
  */
@@ -634,11 +630,12 @@
 	Proc *p;
 	uvlong start, now;
 	int i;
+	static Lock monitor;		/* fight over one cache line, not many */
 
 	start = perfticks();
 	sch = m->sch;
 	/* cooperative scheduling until the clock ticks */
-	if((p=m->readied) != nil && procsaved(p) && p->state==Ready
+	if((p=m->readied) != nil && procsaved(p) && p->state == Ready
 	&& sch->runq[Nrq-1].head == nil && sch->runq[Nrq-2].head == nil
 	&& (p->wired == nil || p->wired->machno == m->machno)){
 		sch->skipscheds++;
@@ -649,24 +646,23 @@
 	sch->preempts++;
 
 loop:
-	/*
-	 *  find a process that last ran on this processor (affinity),
-	 *  or one that hasn't moved in a while (load balancing).  Every
-	 *  time around the loop affinity goes down.
-	 */
 	spllo();
 	for(i = 0;; i++){
 		/*
 		 *  find the highest priority target process that this
 		 *  processor can run given affinity constraints.
 		 */
+		ilock(&monitor);
 		for(rq = &sch->runq[Nrq-1]; rq >= sch->runq; rq--){
 			for(p = rq->head; p != nil; p = p->rnext){
 				if(p->mp == nil || p->mp == m
-				|| p->wired == nil && (i>0 || fastticks2ns(fastticks(nil) - p->readytime) >= Migratedelay))
+				|| (p->wired == nil || p->wired->machno == m->machno) && fastticks(nil) - p->readytime >= affdelay){
+					iunlock(&monitor);
 					goto found;
+				}
 			}
 		}
+		iunlock(&monitor);
 
 		splhi();
 		p = steal();
@@ -674,8 +670,8 @@
 			goto stolen;
 		spllo();
 
-		while(sch->runvec == 0)
-			monmwait((int*)&sch->runvec, 0);
+		while(monmwait((int*)&sch->runvec, 0) == 0)
+			;
 
 		/* remember how much time we're here */
 		now = perfticks();
@@ -1385,7 +1381,7 @@
 	int i;
 	Proc *p;
 
-	if(up)
+	if(up != nil)
 		print("up %d\n", up->pid);
 	else
 		print("no current process\n");
@@ -1495,8 +1491,8 @@
 	p->notified = 0;
 	p->lastnote = up->lastnote;
 	p->notify = up->notify;
-	p->ureg = 0;
-	p->dbgreg = 0;
+	p->ureg = nil;
+	p->dbgreg = nil;
 
 	procpriority(p, PriKproc, 0);