additional case that needs the slow version

Reference: /n/atom/patch/applied/dodiv386
Date: Fri Mar 21 00:16:07 CET 2014
Signed-off-by: quanstro@quanstro.net

--- /sys/src/libc/386/vlrt.c	Thu Mar 20 20:27:35 2014
+++ /sys/src/libc/386/vlrt.c	Thu Mar 20 20:27:35 2014
@@ -199,8 +199,8 @@
 	if(den.hi != 0){
 		q.hi = 0;
 		n = num.hi/den.hi;
-		_mul64by32(&x, den, n);
-		if(x.hi > num.hi || (x.hi == num.hi && x.lo > num.lo))
+		if(_mul64by32(&x, den, n) || x.hi > num.hi ||
+		    (x.hi == num.hi && x.lo > num.lo))
 			slowdodiv(num, den, &q, &r);
 		else {
 			q.lo = n;
--- /sys/src/libc/386/vlop.s	Thu Mar 20 20:27:35 2014
+++ /sys/src/libc/386/vlop.s	Thu Mar 20 20:27:35 2014
@@ -13,16 +13,24 @@
 	MOVL	BX, 4(CX)
 	RET
 
+/*
+ * _mul64by32(uint64 *r, uint64 a, uint32 b)
+ * sets *r = low 64 bits of 96-bit product a*b; returns high 32 bits.
+ */
 TEXT	_mul64by32(SB), $0
 	MOVL	r+0(FP), CX
 	MOVL	a+4(FP), AX
 	MULL	b+12(FP)
-	MOVL	AX, 0(CX)
-	MOVL	DX, BX
+	MOVL	AX, 0(CX)	/* *r = low 32 bits of a*b */
+	MOVL	DX, BX		/* BX = high 32 bits of a*b */
+
 	MOVL	a+8(FP), AX
-	MULL	b+12(FP)
-	ADDL	AX, BX
-	MOVL	BX, 4(CX)
+	MULL	b+12(FP)	/* hi = (a>>32) * b */
+	ADDL	AX, BX		/* BX += low 32 bits of hi */
+	ADCL	$0, DX		/* DX = high 32 bits of hi + carry */
+	MOVL	BX, 4(CX)	/* *r |= (high 32 bits of a*b) << 32 */
+
+	MOVL	DX, AX		/* return hi>>32 */
 	RET
 
 TEXT	_div64by32(SB), $0