1. Allow emulation of old arm7500 binaries and hardware execution of new vfp binaries in the same kernel. 2. Support VFPv2 as well as VFPv3 (so code can be shared by armv6 kernels). 3. Fix some kernel panics caused by confusion over vfp enabled/disabled state. 4. Safely preserve fpu condition codes across process context switches. 5. Don't enable interrupts during vfp exception handling (to prevent race condition if clock interrupt calls sched which causes fp context saving). Interrupts are still enabled for emulation of arm7500 code (safe because all fp context is already in the Proc structure). Reference: /n/sources/patch/applied/teg2-vfp Date: Fri Jan 11 21:11:15 CET 2013 Signed-off-by: miller@hamnavoe.com --- /sys/src/9/teg2/dat.h Fri Jan 11 20:59:16 2013 +++ /sys/src/9/teg2/dat.h Fri Jan 11 20:59:12 2013 @@ -95,13 +95,14 @@ }; /* - * FPsave.status + * FPsave.fpstate */ enum { FPinit, FPactive, FPinactive, + FPemu, /* bit or'd with the state */ FPillegal= 0x100, --- /sys/src/9/teg2/fpiarm.c Fri Jan 11 21:37:47 2013 +++ /sys/src/9/teg2/fpiarm.c Fri Jan 11 21:37:25 2013 @@ -475,9 +475,13 @@ * because all the emulated fp state is in the proc structure, * it need not be saved/restored */ - if(up->fpstate != FPactive){ + switch(up->fpstate){ + case FPactive: + case FPinactive: + error("illegal instruction: emulated fpu opcode in VFP mode"); + case FPinit: assert(sizeof(Internal) <= sizeof(ufp->regs[0])); - up->fpstate = FPactive; + up->fpstate = FPemu; ufp->control = 0; ufp->status = (0x01<<28)|(1<<12); /* sw emulation, alt. C flag */ for(n = 0; n < 8; n++) --- /sys/src/9/teg2/vfp3.c Fri Jan 11 20:59:22 2013 +++ /sys/src/9/teg2/vfp3.c Wed Jan 23 16:42:52 2013 @@ -1,5 +1,5 @@ /* - * VFPv3 floating point unit + * VFPv2 or VFPv3 floating point unit */ #include "u.h" #include "../port/lib.h" @@ -9,6 +9,12 @@ #include "ureg.h" #include "arm.h" +/* subarchitecture code in m->havefp */ +enum { + VFPv2 = 2, + VFPv3 = 3, +}; + /* fp control regs. most are read-only */ enum { Fpsid = 0, @@ -34,12 +40,16 @@ /* Fpscr bits; see u.h for more */ Stride = MASK(2) << 20, Len = MASK(3) << 16, + Dn= 1 << 25, + Fz= 1 << 24, /* trap exception enables (not allowed in vfp3) */ FPIDNRM = 1 << 15, /* input denormal */ Alltraps = FPIDNRM | FPINEX | FPUNFL | FPOVFL | FPZDIV | FPINVAL, /* pending exceptions */ FPAIDNRM = 1 << 7, /* input denormal */ -#define Allexc (FPAIDNRM | FPAINEX | FPAUNFL | FPAOVFL | FPAZDIV | FPAINVAL) + Allexc = FPAIDNRM | FPAINEX | FPAUNFL | FPAOVFL | FPAZDIV | FPAINVAL, + /* condition codes */ + Allcc = MASK(4) << 28, }; enum { /* CpCPaccess bits */ @@ -51,8 +61,8 @@ subarch(int impl, uint sa) { static char *armarchs[] = { - "VFPv1 (pre-armv7)", - "VFPv2 (pre-armv7)", + "VFPv1 (unsupported)", + "VFPv2", "VFPv3+ with common VFP subarch v2", "VFPv3+ with null subarch", "VFPv3+ with common VFP subarch v3", @@ -77,7 +87,7 @@ havefp(void) { int gotfp; - ulong acc; + ulong acc, sid; if (m->havefpvalid) return m->havefp; @@ -99,14 +109,24 @@ m->havefpvalid = 1; return 0; } - if (acc & Cpaccd16) + m->fpon = 1; /* don't panic */ + sid = fprd(Fpsid); + m->fpon = 0; + switch((sid >> 16) & MASK(7)){ + case 0: /* VFPv1 */ + break; + case 1: /* VFPv2 */ + m->havefp = VFPv2; m->fpnregs = 16; - else - m->fpnregs = 32; + break; + default: /* VFPv3 or later */ + m->havefp = VFPv3; + m->fpnregs = (acc & Cpaccd16) ? 16 : 32; + break; + } if (m->machno == 0) print("fp: %d registers,%s simd\n", m->fpnregs, (acc & Cpaccnosimd? " no": "")); - m->havefp = 1; m->havefpvalid = 1; return 1; } @@ -143,7 +163,7 @@ static int printed; /* clear pending exceptions; no traps in vfp3; all v7 ops are scalar */ - m->fpscr = FPRNR | (FPINVAL | FPZDIV | FPOVFL) & ~Alltraps; + m->fpscr = Dn | Fz | FPRNR | (FPINVAL | FPZDIV | FPOVFL) & ~Alltraps; fpwr(Fpscr, m->fpscr); m->fpconfiged = 1; @@ -151,7 +171,7 @@ return; sid = fprd(Fpsid); impl = sid >> 24; - print("fp: %s arch %s; r%ld\n", implement(impl), + print("fp: %s arch %s; rev %ld\n", implement(impl), subarch(impl, (sid >> 16) & MASK(7)), sid & MASK(4)); printed = 1; } @@ -171,7 +191,7 @@ if (havefp()) { fpononly(); if (m->fpconfiged) - fpwr(Fpscr, m->fpscr); + fpwr(Fpscr, (fprd(Fpscr) & Allcc) | m->fpscr); else fpcfg(); /* 1st time on this fpu; configure it */ } @@ -260,6 +280,19 @@ fpoff(); } +static void +fprestore(Proc *p) +{ + int n; + + fpon(); + fpwr(Fpscr, p->fpsave.control); + m->fpscr = fprd(Fpscr) & ~Allcc; + assert(m->fpnregs); + for (n = 0; n < m->fpnregs; n++) + fprestreg(n, *(uvlong *)p->fpsave.regs[n]); +} + /* * Called from sched() and sleep() via the machine-dependent * procsave() routine. @@ -295,18 +328,8 @@ * exception and the state will then be restored. */ void -fpuprocrestore(Proc *p) +fpuprocrestore(Proc *) { - int n; - - if (p->fpstate == FPactive) { - fpon(); - fpwr(Fpscr, p->fpsave.control); - m->fpscr = fprd(Fpscr); - assert(m->fpnregs); - for (n = 0; n < m->fpnregs; n++) - fprestreg(n, *(uvlong *)p->fpsave.regs[n]); - } } /* @@ -353,9 +376,9 @@ static void mathemu(Ureg *) { - if (!(fprd(Fpexc) & (Fpex|Fpdex))) - iprint("mathemu: not an FP exception but an unknown FP opcode\n"); switch(up->fpstate){ + case FPemu: + error("illegal instruction: VFP opcode in emulated mode"); case FPinit: fpinit(); up->fpstate = FPactive; @@ -372,7 +395,7 @@ mathnote(); break; } - fpuprocrestore(up); + fprestore(up); up->fpstate = FPactive; break; case FPactive: @@ -454,9 +477,7 @@ int cop, op; uintptr pc; - s = spllo(); if(waserror()){ - splx(s); postnote(up, 1, up->errstr, NDebug); return 1; } @@ -471,18 +492,25 @@ iprint("fpuemu: conditional instr shouldn't have got here\n"); op = (*(ulong *)pc >> 24) & MASK(4); cop = (*(ulong *)pc >> 8) & MASK(4); - fpstuck(pc); /* debugging; could move down 1 line */ + if(m->fpon) + fpstuck(pc); /* debugging; could move down 1 line */ if (ISFPAOP(cop, op)) { /* old arm 7500 fpa opcode? */ - iprint("fpuemu: fpa instr %#8.8lux at %#p\n", *(ulong *)pc, pc); - error("illegal instruction: old arm 7500 fpa opcode"); -// nfp = fpiarm(ureg); /* advances pc past emulated instr(s) */ -// if (nfp > 1) /* could adjust this threshold */ -// m->fppc = m->fpcnt = 0; +// iprint("fpuemu: fpa instr %#8.8lux at %#p\n", *(ulong *)pc, pc); +// error("illegal instruction: old arm 7500 fpa opcode"); + s = spllo(); + if(waserror()){ + splx(s); + nexterror(); + } + nfp = fpiarm(ureg); /* advances pc past emulated instr(s) */ + if (nfp > 1) /* could adjust this threshold */ + m->fppc = m->fpcnt = 0; + splx(s); + poperror(); } else if (ISVFPOP(cop, op)) { /* if vfp, fpu must be off */ mathemu(ureg); /* enable fpu & retry */ nfp = 1; } - splx(s); poperror(); return nfp; --- /sys/src/9/teg2/words Fri Jan 11 20:59:27 2013 +++ /sys/src/9/teg2/words Fri Jan 11 20:59:24 2013 @@ -5,7 +5,7 @@ linux believes that u-boot runs in the bottom 4MB. the l2 cache is a non-architectural bag nailed on the side. mp arm systems have a generic interrupt controller; this one is gic v1(!). -vfp 3 floating-point is present. 5l doesn't yet generate those instructions. +vfp 3 floating-point is present. section numbers (§) are in the tegra 2 tech. ref. man. for a minimal cpu server, need these devices to work: @@ -19,7 +19,7 @@ then add these: ☑ 2nd cpu (cortex.a9.mpcore.pdf), ☑ l2 cache (l2cache.pl310.pdf, errata), -☹ fpu (cortex.a9.fp.pdf): kernel done, 5l isn't, +☑ fpu (cortex.a9.fp.pdf), ☑ user profiling, kprof, in-line 64-bit arithmetic, --- /sys/src/9/teg2/ts Fri Jan 11 20:59:32 2013 +++ /sys/src/9/teg2/ts Fri Jan 11 20:59:29 2013 @@ -68,13 +68,13 @@ uarti8250 ucalloc ucallocb -# hardware fp; can't get 5l to generate the right opcodes -# vfp3 -# emulated fp +# include vfp3 to use hardware fp, otherwise include softfpu + vfp3 +# softfpu +# emulated arm7500 fp fpi fpiarm fpimem - softfpu port int cpuserver = 1; --- /sys/src/9/teg2/_announce Fri Jan 11 20:59:37 2013 +++ /sys/src/9/teg2/_announce Fri Jan 11 20:59:34 2013 @@ -31,7 +31,8 @@ instructions (among other changes). Attempts to transplant just that code into our 5l failed to generate correct code. Eventually someone will get this to work, and then we'll be able to use the hardware -floating-point. Even with only software emulation of floating-point, +floating-point. [Added 2013.01.11: eventually someone did -- see +patch 5l-vfp] Even with only software emulation of floating-point, astro runs in under 3 seconds. In-line 64-bit arithmetic in 5[cl].