# HG changeset patch # User Noah Evans # Date 1315579787 -7200 # Node ID fb7d4fad4e57ae6e68544e73bfac1bd62638fd3f # Parent 2adaa665956e330e30ae7b476f984b3ca18e0d1b nix: megapatch, this might require a couple of iterations. R=nix-dev, rminnich CC=nix-dev http://codereview.appspot.com/4991048 diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/386/ether82563.c --- a/sys/src/nix/386/ether82563.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/386/ether82563.c Fri Sep 09 16:49:47 2011 +0200 @@ -791,7 +791,7 @@ static void i82563rbfree(Block* b) { - b->rp = b->wp = (uchar*)PGROUND((uintptr)b->base); + b->rp = b->wp = (uchar*)ROUNDUP((uintptr)b->base, 4*KiB); b->flag &= ~(Bpktck|Btcpck|Budpck|Bipck); ilock(&i82563rblock); b->next = i82563rbpool; @@ -1255,7 +1255,7 @@ error(Enomem); for(ctlr->nrb = 0; ctlr->nrb < Nrb; ctlr->nrb++){ - if((bp = allocb(ctlr->rbsz + BY2PG)) == nil) + if((bp = allocb(ctlr->rbsz + 4*KiB)) == nil) break; bp->free = i82563rbfree; freeb(bp); diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/386/etherm10g.c --- a/sys/src/nix/386/etherm10g.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/386/etherm10g.c Fri Sep 09 16:49:47 2011 +0200 @@ -880,7 +880,7 @@ { Bpool *p; - b->rp = b->wp = (uchar*)PGROUND((uintptr)b->base); + b->rp = b->wp = (uchar*)ROUNDUP((uintptr)b->base, 4*KiB); b->flag &= ~(Bpktck|Btcpck|Budpck|Bipck); p = &smpool; @@ -897,7 +897,7 @@ { Bpool *p; - b->rp = b->wp = (uchar*)PGROUND((uintptr)b->base); + b->rp = b->wp = (uchar*)ROUNDUP((uintptr)b->base, 4*KiB); b->flag &= ~(Bpktck|Btcpck|Budpck|Bipck); p = &bgpool; @@ -1000,14 +1000,14 @@ c->bg.m = entries-1; c->bg.host = emalign(entries * sizeof *c->bg.host); - sz = c->sm.pool->size + BY2PG; + sz = c->sm.pool->size + 4*KiB; for(i = 0; i < c->sm.n; i++){ if((b = allocb(sz)) == 0) break; b->free = smbfree; freeb(b); } - sz = c->bg.pool->size + BY2PG; + sz = c->bg.pool->size + 4*KiB; for(i = 0; i < c->bg.n; i++){ if((b = allocb(sz)) == 0) break; diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/386/pci.c --- a/sys/src/nix/386/pci.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/386/pci.c Fri Sep 09 16:49:47 2011 +0200 @@ -927,7 +927,7 @@ for(p=pciroot; p; p=p->list) for(i=0; imem); i++) if(p->mem[i].bar && (p->mem[i].bar&1) == 0) - upareserve(p->mem[i].bar&~0x0F, p->mem[i].size); + asmmapinit(p->mem[i].bar&~0x0F, p->mem[i].size, 5); } static int diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/DIFFS --- a/sys/src/nix/DIFFS Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/DIFFS Fri Sep 09 16:49:47 2011 +0200 @@ -1,97 +1,15 @@ - -diff -n /sys/src/nix/k8/dat.h /sys/src/9kronsc/k8/dat.h -/sys/src/nix/k8/dat.h:26,27 d /sys/src/9kronsc/k8/dat.h:25 -< typedef struct PmcCtr PmcCtr; -< typedef struct PmcCtl PmcCtl; -/sys/src/nix/k8/dat.h:188,216 d /sys/src/9kronsc/k8/dat.h:185 -< * hw perf counters -< */ -< struct PmcCtl { -< u32int coreno; -< int enab; -< int user; -< int os; -< int nodesc; -< char descstr[KNAMELEN]; -< int reset; -< }; -< -< struct PmcCtr{ -< int stale; -< Rendez r; -< u64int ctr; -< int ctrset; -< PmcCtl; -< int ctlset; -< }; -< -< enum { -< PmcMaxCtrs = 4, -< PmcIgn = 0, -< PmcGet = 1, -< PmcSet = 2, -< }; -< -< /* -/sys/src/nix/k8/dat.h:272,274 d /sys/src/9kronsc/k8/dat.h:240 -< Lock pmclock; -< PmcCtr pmc[PmcMaxCtrs]; -< -/sys/src/nix/k8/dat.h:328 c /sys/src/9kronsc/k8/dat.h:294 -< #define kmap(p) (KMap*)(KADDR((p)->pa)) +diff -n /sys/src/nix/port/cache.c /sys/src/nixpa/port/cache.c +/sys/src/nix/port/cache.c:114 c /sys/src/nixpa/port/cache.c:114 +< // if(conf.npage*BY2PG > 200*MB) --- -> #define 
kmap(p) (KMap*)((p)->pa|kseg0) -Only in /sys/src/9kronsc/k8: dev.6 -Only in /sys/src/9kronsc/k8: devarch.6 -diff -n /sys/src/nix/k8/devarch.c /sys/src/9kronsc/k8/devarch.c -/sys/src/nix/k8/devarch.c:37 d /sys/src/9kronsc/k8/devarch.c:36 -< Qmapram, -/sys/src/nix/k8/devarch.c:53 d /sys/src/9kronsc/k8/devarch.c:51 -< "mapram", { Qmapram, 0 }, 0, 0444, -/sys/src/nix/k8/devarch.c:352 d /sys/src/9kronsc/k8/devarch.c:349 -< Map *mp; -/sys/src/nix/k8/devarch.c:400,433 c /sys/src/9kronsc/k8/devarch.c:397,403 -< switch((ulong)c->qid.path){ -< case Qioalloc: -< lock(&iomap); -< for(map = iomap.map; n > 0 && map != nil; map = map->next){ -< if(offset-- > 0) -< continue; -< sprint(p, "%#8lux %#8lux %-12.12s\n", map->start, map->end-1, map->tag); -< p += Linelen; -< n--; -< } -< unlock(&iomap); -< break; -< case Qmapram: -< USED(mp); -< /* shit */ -< #ifdef NOTYET -< for(mp = rmapram.map; mp->size; mp++){ -< /* -< * Up to MemMinMiB is already set up. -< */ -< if(mp->addr < MemMinMiB*MiB){ -< if(mp->addr+mp->size <= MemMinMiB*MiB) -< continue; -< pa = MemMinMiB*MiB; -< size = mp->size - MemMinMiB*MiB-mp->addr; -< } -< else{ -< pa = mp->addr; -< size = mp->size; -< } -< #endif -< error("Not yet"); -< -< break; +> // if(conf.npage*PGSZ > 200*MB) +/sys/src/nix/port/cache.c:116 c /sys/src/nixpa/port/cache.c:116 +< // if(conf.npage*BY2PG > 400*MB) --- -> lock(&iomap); -> for(map = iomap.map; n > 0 && map != nil; map = map->next){ -> if(offset-- > 0) -> continue; -> sprint(p, "%#8lux %#8lux %-12.12s\n", map->start, map->end-1, map->tag); -> p += Linelen; -> n--; -/sys/src/nix/k8/devarch.c:434 a /sys/src/9kronsc/k8/devarch.c:405 -> unlock(&iomap); +> // if(conf.npage*PGSZ > 400*MB) +/sys/src/nix/port/cache.c:363 c /sys/src/nixpa/port/cache.c:363 +< p = auxpage(); +--- +> p = auxpage(BIGPGSZ); +diff -n /sys/src/nix/port/devcons.c /sys/src/nixpa/port/devcons.c +/sys/src/nix/po \ No newline at end of file diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/boot/libboot.a6 Binary file sys/src/nix/boot/libboot.a6 has changed diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/include/tos.h --- a/sys/src/nix/include/tos.h Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/include/tos.h Fri Sep 09 16:49:47 2011 +0200 @@ -1,8 +1,8 @@ typedef struct Callq Callq; typedef struct Nixcall Nixcall; typedef struct Nixret Nixret; +typedef struct Plink Plink; typedef struct Tos Tos; -typedef struct Plink Plink; #pragma incomplete Plink diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/ip/devip.c --- a/sys/src/nix/ip/devip.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/ip/devip.c Fri Sep 09 16:49:47 2011 +0200 @@ -1410,7 +1410,7 @@ ulong scalednconv(void) { - if(cpuserver && conf.npage*BY2PG >= 128*MB) + if(cpuserver && conf.npage*PGSZ >= 128*MB) return Nchans*4; return Nchans; } diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/acpi.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/acpi.h Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,396 @@ +typedef struct Atable Atable; +typedef struct Facs Facs; +typedef struct Fadt Fadt; +typedef struct Gas Gas; +typedef struct Gpe Gpe; +typedef struct Rsdp Rsdp; +typedef struct Sdthdr Sdthdr; +typedef struct Parse Parse; +typedef struct Xsdt Xsdt; +typedef struct Regio Regio; +typedef struct Reg Reg; +typedef struct Madt Madt; +typedef struct Msct Msct; +typedef struct Mdom Mdom; +typedef struct Apicst Apicst; +typedef struct Srat Srat; + +enum +{ + + Sdthdrsz = 36, /* size of SDT header */ + + /* ACPI regions. 
Gas ids */ + Rsysmem = 0, + Rsysio, + Rpcicfg, + Rembed, + Rsmbus, + Rcmos, + Rpcibar, + Ripmi, + Rfixedhw = 0x7f, + + /* ACPI PM1 control */ + Pm1SciEn = 0x1, /* Generate SCI and not SMI */ + + /* ACPI tbdf as encoded in acpi region base addresses */ + Rpciregshift = 0, + Rpciregmask = 0xFFFF, + Rpcifunshift = 16, + Rpcifunmask = 0xFFFF, + Rpcidevshift = 32, + Rpcidevmask = 0xFFFF, + Rpcibusshift = 48, + Rpcibusmask = 0xFFFF, + + /* Apic structure types */ + ASlapic = 0, /* processor local apic */ + ASioapic, /* I/O apic */ + ASintovr, /* Interrupt source override */ + ASnmi, /* NMI source */ + ASlnmi, /* local apic nmi */ + ASladdr, /* local apic address override */ + ASiosapic, /* I/O sapic */ + ASlsapic, /* local sapic */ + ASintsrc, /* platform interrupt sources */ + ASlx2apic, /* local x2 apic */ + ASlx2nmi, /* local x2 apic NMI */ + + /* Apic flags */ + AFbus = 0, /* polarity/trigger like in ISA */ + AFhigh = 1, /* active high */ + AFlow = 3, /* active low */ + AFpmask = 3, /* polarity bits */ + AFedge = 1<<2, /* edge triggered */ + AFlevel = 3<<2, /* level triggered */ + AFtmask = 3<<2, /* trigger bits */ + + /* SRAT types */ + SRlapic = 0, /* Local apic/sapic affinity */ + SRmem, /* Memory affinity */ + SRlx2apic, /* x2 apic affinity */ + + /* Arg for _PIC */ + Ppic = 0, /* PIC interrupt model */ + Papic, /* APIC interrupt model */ + Psapic, /* SAPIC interrupt model */ + + + CMregion = 0, /* regio name spc base len accsz*/ + CMgpe, /* gpe name id */ + + Qdir = 0, + Qctl, + Qtbl, + Qio, +}; + +/* + * ACPI table (sw) + */ +struct Atable +{ + Atable* next; /* next table in list */ + int is64; /* uses 64bits */ + char sig[5]; /* signature */ + char oemid[7]; /* oem id str. */ + char oemtblid[9]; /* oem tbl. id str. */ + uchar* tbl; /* pointer to table in memory */ + long dlen; /* size of data in table, after Stdhdr */ +}; + +struct Gpe +{ + uintptr stsio; /* port used for status */ + int stsbit; /* bit number */ + uintptr enio; /* port used for enable */ + int enbit; /* bit number */ + int nb; /* event number */ + char* obj; /* handler object */ + int id; /* id as supplied by user */ +}; + +struct Parse +{ + char* sig; + Atable* (*f)(uchar*, int); /* return nil to keep vmap */ +}; + +struct Regio{ + void *arg; + u8int (*get8)(uintptr, void*); + void (*set8)(uintptr, u8int, void*); + u16int (*get16)(uintptr, void*); + void (*set16)(uintptr, u16int, void*); + u32int (*get32)(uintptr, void*); + void (*set32)(uintptr, u32int, void*); + u64int (*get64)(uintptr, void*); + void (*set64)(uintptr, u64int, void*); +}; + +struct Reg +{ + char* name; + int spc; /* io space */ + u64int base; /* address, physical */ + uchar* p; /* address, kmapped */ + u64int len; + int tbdf; + int accsz; /* access size */ +}; + +/* Generic address structure. + */ +#pragma pack on +struct Gas +{ + u8int spc; /* address space id */ + u8int len; /* register size in bits */ + u8int off; /* bit offset */ + u8int accsz; /* 1: byte; 2: word; 3: dword; 4: qword */ + u64int addr; /* address (or acpi encoded tbdf + reg) */ +}; + +/* Root system description table pointer. + * Used to locate the root system description table RSDT + * (or the extended system description table from version 2) XSDT. + * The XDST contains (after the DST header) a list of pointers to tables: + * - FADT fixed acpi description table. + * It points to the DSDT, AML code making the acpi namespace. + * - SSDTs tables with AML code to add to the acpi namespace. + * - pointers to other tables for apics, etc. 
+ */ + +struct Rsdp +{ + u8int signature[8]; /* "RSD PTR " */ + u8int rchecksum; + u8int oemid[6]; + u8int revision; + u8int raddr[4]; /* RSDT */ + u8int length[4]; + u8int xaddr[8]; /* XSDT */ + u8int xchecksum; /* XSDT */ + u8int _33_[3]; /* reserved */ +}; + +/* Header for ACPI description tables + */ +struct Sdthdr +{ + u8int sig[4]; /* "FACP" or whatever */ + u8int length[4]; + u8int rev; + u8int csum; + u8int oemid[6]; + u8int oemtblid[8]; + u8int oemrev[4]; + u8int creatorid[4]; + u8int creatorrev[4]; +}; + +/* Firmware control structure + */ +struct Facs +{ + u32int hwsig; + u32int wakingv; + u32int glock; + u32int flags; + u64int xwakingv; + u8int vers; + u32int ospmflags; +}; + +#pragma pack off + +/* Maximum System Characteristics table + */ +struct Msct +{ + int ndoms; /* number of domains */ + int nclkdoms; /* number of clock domains */ + u64int maxpa; /* max physical address */ + + Mdom* dom; /* domain information list */ +}; + +struct Mdom +{ + Mdom* next; + int start; /* start dom id */ + int end; /* end dom id */ + int maxproc; /* max processor capacity */ + u64int maxmem; /* max memory capacity */ +}; + +/* Multiple APIC description table + * Interrupts are virtualized by ACPI and each APIC has + * a `virtual interrupt base' where its interrupts start. + * Addresses are processor-relative physical addresses. + * Only enabled devices are linked, others are filtered out. + */ +struct Madt +{ + u64int lapicpa; /* local APIC addr */ + int pcat; /* the machine has PC/AT 8259s */ + Apicst* st; /* list of Apic related structures */ +}; + +struct Apicst +{ + int type; + Apicst* next; + union{ + struct{ + int pid; /* processor id */ + int id; /* apic no */ + } lapic; + struct{ + int id; /* io apic id */ + u32int ibase; /* interrupt base addr. */ + u64int addr; /* base address */ + } ioapic, iosapic; + struct{ + int irq; /* bus intr. source (ISA only) */ + int intr; /* system interrupt */ + int flags; /* apic flags */ + } intovr; + struct{ + int intr; /* system interrupt */ + int flags; /* apic flags */ + } nmi; + struct{ + int pid; /* processor id */ + int flags; /* lapic flags */ + int lint; /* lapic LINTn for nmi */ + } lnmi; + struct{ + int pid; /* processor id */ + int id; /* apic id */ + int eid; /* apic eid */ + int puid; /* processor uid */ + char* puids; /* same thing */ + } lsapic; + struct{ + int pid; /* processor id */ + int peid; /* processor eid */ + int iosv; /* io sapic vector */ + int intr; /* global sys intr. */ + int type; /* intr type */ + int flags; /* apic flags */ + int any; /* err sts at any proc */ + } intsrc; + struct{ + int id; /* x2 apic id */ + int puid; /* processor uid */ + } lx2apic; + struct{ + int puid; + int flags; + int intr; + } lx2nmi; + }; +}; + +/* System resource affinity table + */ +struct Srat +{ + int type; + Srat* next; + union{ + struct{ + int dom; /* proximity domain */ + int apic; /* apic id */ + int sapic; /* sapic id */ + int clkdom; /* clock domain */ + } lapic; + struct{ + int dom; /* proximity domain */ + u64int addr; /* base address */ + u64int len; + int hplug; /* hot pluggable */ + int nvram; /* non volatile */ + } mem; + struct{ + int dom; /* proximity domain */ + int apic; /* x2 apic id */ + int clkdom; /* clock domain */ + } lx2apic; + }; +}; + +/* Fixed ACPI description table. + * Describes implementation and hardware registers. + * PM* blocks are low level functions. + * GPE* blocks refer to general purpose events. + * P_* blocks are for processor features. + * Has address for the DSDT. 
+ */ +struct Fadt +{ + u32int facs; + u32int dsdt; + /* 1 reserved */ + u8int pmprofile; + u16int sciint; + u32int smicmd; + u8int acpienable; + u8int acpidisable; + u8int s4biosreq; + u8int pstatecnt; + u32int pm1aevtblk; + u32int pm1bevtblk; + u32int pm1acntblk; + u32int pm1bcntblk; + u32int pm2cntblk; + u32int pmtmrblk; + u32int gpe0blk; + u32int gpe1blk; + u8int pm1evtlen; + u8int pm1cntlen; + u8int pm2cntlen; + u8int pmtmrlen; + u8int gpe0blklen; + u8int gpe1blklen; + u8int gp1base; + u8int cstcnt; + u16int plvl2lat; + u16int plvl3lat; + u16int flushsz; + u16int flushstride; + u8int dutyoff; + u8int dutywidth; + u8int dayalrm; + u8int monalrm; + u8int century; + u16int iapcbootarch; + /* 1 reserved */ + u32int flags; + Gas resetreg; + u8int resetval; + /* 3 reserved */ + u64int xfacs; + u64int xdsdt; + Gas xpm1aevtblk; + Gas xpm1bevtblk; + Gas xpm1acntblk; + Gas xpm1bcntblk; + Gas xpm2cntblk; + Gas xpmtmrblk; + Gas xgpe0blk; + Gas xgpe1blk; +}; + +/* XSDT/RSDT. 4/8 byte addresses starting at p. + */ +struct Xsdt +{ + int len; + int asize; + u8int* p; +}; + +extern uintmem acpimblocksize(uintmem, int*); diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/archk10.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/archk10.c Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,365 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +static int +cpuidinit(void) +{ + u32int eax, info[4]; + + /* + * Standard CPUID functions. + * Functions 0 and 1 will be needed multiple times + * so cache the info now. + */ + if((m->ncpuinfos = cpuid(0, 0, m->cpuinfo[0])) == 0) + return 0; + m->ncpuinfos++; + + if(memcmp(&m->cpuinfo[0][1], "GenuntelineI", 12) == 0) + m->isintelcpu = 1; + cpuid(1, 0, m->cpuinfo[1]); + + /* + * Extended CPUID functions. + */ + if((eax = cpuid(0x80000000, 0, info)) >= 0x80000000) + m->ncpuinfoe = (eax & ~0x80000000) + 1; + + return 1; +} + +static int +cpuidinfo(u32int eax, u32int ecx, u32int info[4]) +{ + if(m->ncpuinfos == 0 && cpuidinit() == 0) + return 0; + + if(!(eax & 0x80000000)){ + if(eax >= m->ncpuinfos) + return 0; + } + else if(eax >= (0x80000000|m->ncpuinfoe)) + return 0; + + cpuid(eax, ecx, info); + + return 1; +} + +static vlong +cpuidhz(u32int info[2][4]) +{ + int f, r; + vlong hz; + u64int msr; + + if(memcmp(&info[0][1], "GenuntelineI", 12) == 0){ + switch(info[1][0] & 0x0fff3ff0){ + default: + return 0; + case 0x00000f30: /* Xeon (MP), Pentium [4D] */ + case 0x00000f40: /* Xeon (MP), Pentium [4D] */ + case 0x00000f60: /* Xeon 7100, 5000 or above */ + msr = rdmsr(0x2c); + r = (msr>>16) & 0x07; + switch(r){ + default: + return 0; + case 0: + hz = 266666666666ll; + break; + case 1: + hz = 133333333333ll; + break; + case 2: + hz = 200000000000ll; + break; + case 3: + hz = 166666666666ll; + break; + case 4: + hz = 333333333333ll; + break; + } + + /* + * Hz is *1000 at this point. + * Do the scaling then round it. + * The manual is conflicting about + * the size of the msr field. + */ + hz = (((hz*(msr>>24))/100)+5)/10; + break; + case 0x00000690: /* Pentium M, Celeron M */ + case 0x000006d0: /* Pentium M, Celeron M */ + hz = ((rdmsr(0x2a)>>22) & 0x1f)*100 * 1000000ll; + break; + case 0x000006e0: /* Core Duo */ + case 0x000006f0: /* Core 2 Duo/Quad/Extreme */ + case 0x00010670: /* Core 2 Extreme */ + case 0x000006a0: /* i7 paurea... */ + /* + * Get the FSB frequemcy. + * If processor has Enhanced Intel Speedstep Technology + * then non-integer bus frequency ratios are possible. 
+ */ + if(info[1][2] & 0x00000080){ + msr = rdmsr(0x198); + r = (msr>>40) & 0x1f; + } + else{ + msr = 0; + r = rdmsr(0x2a) & 0x1f; + } + f = rdmsr(0xcd) & 0x07; + switch(f){ + default: + return 0; + case 5: + hz = 100000000000ll; + break; + case 1: + hz = 133333333333ll; + break; + case 3: + hz = 166666666666ll; + break; + case 2: + hz = 200000000000ll; + break; + case 0: + hz = 266666666666ll; + break; + case 4: + hz = 333333333333ll; + break; + case 6: + hz = 400000000000ll; + break; + } + + /* + * Hz is *1000 at this point. + * Do the scaling then round it. + */ + if(msr & 0x0000400000000000ll) + hz = hz*r + hz/2; + else + hz = hz*r; + hz = ((hz/100)+5)/10; + break; + } + DBG("cpuidhz: 0x2a: %#llux hz %lld\n", rdmsr(0x2a), hz); + } + else if(memcmp(&info[0][1], "AuthcAMDenti", 12) == 0){ + switch(info[1][0] & 0x0fff0ff0){ + default: + return 0; + case 0x00000f50: /* K8 */ + msr = rdmsr(0xc0010042); + if(msr == 0) + return 0; + hz = (800 + 200*((msr>>1) & 0x1f)) * 1000000ll; + break; + case 0x00100f90: /* K10 */ + case 0x00000620: /* QEMU64 */ + msr = rdmsr(0xc0010064); + r = (msr>>6) & 0x07; + hz = (((msr & 0x3f)+0x10)*100000000ll)/(1<ncpuinfos == 0 && cpuidinit() == 0) + return; + + for(i = 0; i < m->ncpuinfos; i++){ + cpuid(i, 0, info); + DBG("eax = %#8.8ux: %8.8ux %8.8ux %8.8ux %8.8ux\n", + i, info[0], info[1], info[2], info[3]); + } + for(i = 0; i < m->ncpuinfoe; i++){ + cpuid(0x80000000|i, 0, info); + DBG("eax = %#8.8ux: %8.8ux %8.8ux %8.8ux %8.8ux\n", + 0x80000000|i, info[0], info[1], info[2], info[3]); + } +} + +vlong +archhz(void) +{ + vlong hz; + u32int info[2][4]; + + if(!cpuidinfo(0, 0, info[0]) || !cpuidinfo(1, 0, info[1])) + return 0; + + hz = cpuidhz(info); + if(hz != 0 || m->machno != 0) + return hz; + + return i8254hz(info); +} + +int +archmmu(void) +{ + u32int info[4]; + + /* + * Should the check for m->machno != 0 be here + * or in the caller (mmuinit)? + * + * To do here: + * check and enable Pse; + * Pge; Nxe. + */ + + /* + * How many page sizes are there? + * Always have 4*KiB, but need to check + * configured correctly. + */ + assert(PGSZ == 4*KiB); + + m->pgszlg2[0] = 12; + m->pgszmask[0] = (1<<12)-1; + m->pgsz[0] = 1<<12; + m->npgsz = 1; + if(m->ncpuinfos == 0 && cpuidinit() == 0) + return 1; + + /* + * Check the Pse bit in function 1 DX for 2*MiB support; + * if false, only 4*KiB is available. + */ + if(!(m->cpuinfo[1][3] & 0x00000008)) + return 1; + m->pgszlg2[1] = 21; + m->pgszmask[1] = (1<<21)-1; + m->pgsz[1] = 1<<21; + m->npgsz = 2; + + /* + * Check the Page1GB bit in function 0x80000001 DX for 1*GiB support. 
+ */ + if(cpuidinfo(0x80000001, 0, info) && (info[3] & 0x04000000)){ + m->pgszlg2[2] = 30; + m->pgszmask[2] = (1<<30)-1; + m->pgsz[2] = 1<<30; + m->npgsz = 3; + } + + return m->npgsz; +} + +static int +fmtP(Fmt* f) +{ + uintmem pa; + + pa = va_arg(f->args, uintmem); + + if(f->flags & FmtSharp) + return fmtprint(f, "%#16.16llux", pa); + + return fmtprint(f, "%llud", pa); +} + +static int +fmtL(Fmt* f) +{ + Mpl pl; + + pl = va_arg(f->args, Mpl); + + return fmtprint(f, "%#16.16llux", pl); +} + +static int +fmtR(Fmt* f) +{ + u64int r; + + r = va_arg(f->args, u64int); + + return fmtprint(f, "%#16.16llux", r); +} + +/* virtual address fmt */ +static int +fmtW(Fmt *f) +{ + u64int va; + + va = va_arg(f->args, u64int); + return fmtprint(f, "%#ullx=0x[%ullx][%ullx][%ullx][%ullx][%ullx]", va, + PTLX(va, 3), PTLX(va, 2), PTLX(va, 1), PTLX(va, 0), + va & ((1<cpumhz*microsecs; r < t; r = rdtsc()) + ; +} + +void +millidelay(int millisecs) +{ + u64int r, t; + + r = rdtsc(); + for(t = r + m->cpumhz*1000ull*millisecs; r < t; r = rdtsc()) + ; +} diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/asm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/asm.c Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,436 @@ +/* + * To do: + * find a purpose for this... + */ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +#include "amd64.h" + +/* + * Address Space Map. + * Low duty cycle. + */ +typedef struct Asm Asm; +typedef struct Asm { + uintmem addr; + uintmem size; + int type; + int location; + Asm* next; +} Asm; + +enum { + AsmNONE = 0, + AsmMEMORY = 1, + AsmRESERVED = 2, + AsmACPIRECLAIM = 3, + AsmACPINVS = 4, + + AsmDEV = 5, +}; + +static Lock asmlock; +static Asm asmarray[64] = { + { 0, ~0, AsmNONE, nil, }, +}; +static int asmindex = 1; +static Asm* asmlist = &asmarray[0]; +static Asm* asmfreelist; + +/*static*/ void +asmdump(void) +{ + Asm* asm; + + print("asm: index %d:\n", asmindex); + for(asm = asmlist; asm != nil; asm = asm->next){ + print(" %#P %#P %d (%P)\n", + asm->addr, asm->addr+asm->size, + asm->type, asm->size); + } +} + +static Asm* +asmnew(uintmem addr, uintmem size, int type) +{ + Asm * asm; + + if(asmfreelist != nil){ + asm = asmfreelist; + asmfreelist = asm->next; + asm->next = nil; + } + else{ + if(asmindex >= nelem(asmarray)) + return nil; + asm = &asmarray[asmindex++]; + } + asm->addr = addr; + asm->size = size; + asm->type = type; + + return asm; +} + +int +asmfree(uintmem addr, uintmem size, int type) +{ + Asm *np, *pp, **ppp; + + DBG("asmfree: %#P@%#P, type %d\n", size, addr, type); + if(size == 0) + return 0; + + lock(&asmlock); + + /* + * Find either a map entry with an address greater + * than that being returned, or the end of the map. 
+ */ + pp = nil; + ppp = &asmlist; + for(np = *ppp; np != nil && np->addr <= addr; np = np->next){ + pp = np; + ppp = &np->next; + } + + if((pp != nil && pp->addr+pp->size > addr) + || (np != nil && addr+size > np->addr)){ + unlock(&asmlock); + DBG("asmfree: overlap %#Px@%#P, type %d\n", size, addr, type); + return -1; + } + + if(pp != nil && pp->type == type && pp->addr+pp->size == addr){ + pp->size += size; + if(np != nil && np->type == type && addr+size == np->addr){ + pp->size += np->size; + pp->next = np->next; + + np->next = asmfreelist; + asmfreelist = np; + } + + unlock(&asmlock); + return 0; + } + + if(np != nil && np->type == type && addr+size == np->addr){ + np->addr -= size; + np->size += size; + + unlock(&asmlock); + return 0; + } + + if((pp = asmnew(addr, size, type)) == nil){ + unlock(&asmlock); + DBG("asmfree: losing %#P@%#P, type %d\n", size, addr, type); + return -1; + } + *ppp = pp; + pp->next = np; + + unlock(&asmlock); + + return 0; +} + +uintmem +asmalloc(uintmem addr, uintmem size, int type, int align) +{ + uintmem a, o; + Asm *asm, *pp; + + DBG("asmalloc: %#P@%#P, type %d\n", size, addr, type); + lock(&asmlock); + for(pp = nil, asm = asmlist; asm != nil; pp = asm, asm = asm->next){ + if(asm->type != type) + continue; + a = asm->addr; + + if(addr != 0){ + /* + * A specific address range has been given: + * if the current map entry is greater then + * the address is not in the map; + * if the current map entry does not overlap + * the beginning of the requested range then + * continue on to the next map entry; + * if the current map entry does not entirely + * contain the requested range then the range + * is not in the map. + * The comparisons are strange to prevent + * overflow. + */ + if(a > addr) + break; + if(asm->size < addr - a) + continue; + if(addr - a > asm->size - size) + break; + a = addr; + } + + if(align > 0) + a = ((a+align-1)/align)*align; + if(asm->addr+asm->size-a < size) + continue; + + o = asm->addr; + asm->addr = a+size; + asm->size -= a-o+size; + if(asm->size == 0){ + if(pp != nil) + pp->next = asm->next; + asm->next = asmfreelist; + asmfreelist = asm; + } + + unlock(&asmlock); + if(o != a) + asmfree(o, a-o, type); + return a; + } + unlock(&asmlock); + + return 0; +} + +static void +asminsert(uintmem addr, uintmem size, int type) +{ + if(type == AsmNONE || asmalloc(addr, size, AsmNONE, 0) == 0) + return; + if(asmfree(addr, size, type) == 0) + return; + asmfree(addr, size, 0); +} + +void +asminit(void) +{ + sys->pmstart = ROUNDUP(PADDR(end), PGSZ); + sys->pmend = sys->pmstart; + asmalloc(0, sys->pmstart, AsmNONE, 0); +} + +/* + * Notes: + * asmmapinit and asmmodinit called from multiboot; + * subject to change; the numerology here is probably suspect. + * Multiboot defines the alignment of modules as 4096. + */ +void +asmmapinit(uintmem addr, uintmem size, int type) +{ + switch(type){ + default: + asminsert(addr, size, type); + break; + case AsmMEMORY: + /* + * Adjust things for the peculiarities of this + * architecture. + * Sys->pmend is the largest physical memory address found, + * there may be gaps between it and sys->pmstart, the range + * and how much of it is occupied, might need to be known + * for setting up allocators later. 
+ */ + if(addr < 1*MiB || addr+size < sys->pmstart) + break; + if(addr < sys->pmstart){ + size -= sys->pmstart - addr; + addr = sys->pmstart; + } + asminsert(addr, size, type); + sys->pmoccupied += size; + if(addr+size > sys->pmend) + sys->pmend = addr+size; + break; + } +} + +void +asmmodinit(u32int start, u32int end, char* s) +{ + DBG("asmmodinit: %#ux -> %#ux: <%s> %#ux\n", + start, end, s, ROUNDUP(end, 4096)); + + if(start < sys->pmstart) + return; + end = ROUNDUP(end, 4096); + if(end > sys->pmstart){ + asmalloc(sys->pmstart, end-sys->pmstart, AsmNONE, 0); + sys->pmstart = end; + } +} + +static int npg[4]; + +void* +asmbootalloc(usize size) +{ + uintptr va; + + assert(sys->vmunused+size <= sys->vmunmapped); + va = sys->vmunused; + sys->vmunused += size; + memset(UINT2PTR(va), 0, size); + return UINT2PTR(va); +} + +static PTE +asmwalkalloc(usize size) +{ + uintmem pa; + + assert(size == PTSZ && sys->vmunused+size <= sys->vmunmapped); + + if(!ALIGNED(sys->vmunused, PTSZ)){ + DBG("asmwalkalloc: %ulld wasted\n", + ROUNDUP(sys->vmunused, PTSZ) - sys->vmunused); + sys->vmunused = ROUNDUP(sys->vmunused, PTSZ); + } + if((pa = mmuphysaddr(sys->vmunused)) != ~0) + sys->vmunused += size; + + return pa; +} + +// still needed so iallocb gets initialised correctly. needs to go. +#define ConfCrap + +void +asmmeminit(void) +{ + int i, l; + Asm* asm; + PTE *pte, *pml4; + uintptr va; + uintmem hi, lo, mem, nextmem, pa; +#ifdef ConfCrap + int cx; +#endif /* ConfCrap */ + + assert(!((sys->vmunmapped|sys->vmend) & m->pgszmask[1])); + + if((pa = mmuphysaddr(sys->vmunused)) == ~0) + panic("asmmeminit 1"); + pa += sys->vmunmapped - sys->vmunused; + mem = asmalloc(pa, sys->vmend - sys->vmunmapped, 1, 0); + if(mem != pa) + panic("asmmeminit 2"); + DBG("pa %#llux mem %#llux\n", pa, mem); + + /* assume already 2MiB aligned*/ + assert(ALIGNED(sys->vmunmapped, 2*MiB)); + pml4 = UINT2PTR(m->pml4->va); + while(sys->vmunmapped < sys->vmend){ + l = mmuwalk(pml4, sys->vmunmapped, 1, &pte, asmwalkalloc); + DBG("%#p l %d\n", sys->vmunmapped, l); + *pte = pa|PtePS|PteRW|PteP; + sys->vmunmapped += 2*MiB; + pa += 2*MiB; + } + +#ifdef ConfCrap + cx = 0; +#endif /* ConfCrap */ + for(asm = asmlist; asm != nil; asm = asm->next){ + if(asm->type != AsmMEMORY) + continue; + va = KSEG2+asm->addr; + print(" %#P %#P %d (%P)\n", + asm->addr, asm->addr+asm->size, + asm->type, asm->size); + + lo = asm->addr; + hi = asm->addr+asm->size; + /* Convert a range into pages */ + for(mem = lo; mem < hi; mem = nextmem){ + nextmem = (mem + PGLSZ(0)) & ~m->pgszmask[0]; + + /* Try large pages first */ + for(i = m->npgsz - 1; i >= 0; i--){ + if((mem & m->pgszmask[i]) != 0) + continue; + if(mem + PGLSZ(i) > hi) + continue; + /* This page fits entirely within the range. */ + /* Mark it a usable */ +// print("%#P %d\n", mem, i); + + if((l = mmuwalk(pml4, va, i, &pte, asmwalkalloc)) < 0) + panic("asmmeminit 3"); + + *pte = mem|PteRW|PteP; + if(l > 0) + *pte |= PtePS; +//print("pte %#p *pte %#16.16llux l %d\n", pte, *pte, l); + + nextmem = mem + PGLSZ(i); +/* touch it */ +*((uintptr*)va) = 0; + va += PGLSZ(i); + npg[i]++; + + break; + } + } +// physinit(asm->addr, asm->size); + +#ifdef ConfCrap + /* + * Fill in conf crap. 
+ */ + if(cx >= nelem(conf.mem)) + continue; + lo = ROUNDUP(asm->addr, PGSZ); +//if(lo >= 600ull*MiB) +// continue; + conf.mem[cx].base = lo; + hi = ROUNDDN(hi, PGSZ); +//if(hi > 600ull*MiB) +// hi = 600*MiB; + conf.mem[cx].npage = (hi - lo)/PGSZ; + conf.npage += conf.mem[cx].npage; + print("cm %d: addr %#llux npage %lud\n", + cx, conf.mem[cx].base, conf.mem[cx].npage); + cx++; +#endif /* ConfCrap */ + } + print("%d %d %d\n", npg[0], npg[1], npg[2]); + +#ifdef ConfCrap + /* + * Fill in more conf crap. + * This is why I hate Plan 9. + */ + conf.upages = conf.npage; + i = (sys->vmend - sys->vmstart)/PGSZ; /* close enough */ + conf.ialloc = (i/2)*PGSZ; + print("npage %llud upage %lud kpage %d\n", + conf.npage, conf.upages, i); + +#endif /* ConfCrap */ +} + +void +asmumeminit(void) +{ + Asm *asm; + extern void physallocdump(void); + + for(asm = asmlist; asm != nil; asm = asm->next){ + if(asm->type != AsmMEMORY) + continue; + if(0)print("asmumeminit: addr %ullx size %ullx\n", asm->addr, asm->size); + physinit(asm->addr, asm->size); + } + physallocdump(); +} diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/bal.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/bal.c Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,254 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +/* + * Buddy allocator from forsyth, possible tweaked to + * be used in nix for physical memory allocation. + * + * - locks + * - auxilary structure allocation and sizing + * - largest size + * - could instead coalesce free items on demand (cf. Wulf) + * - or lazy buddy (cf. Barkley) + */ + +#define DPRINT if(0)iprint + +enum{ + MinK= 16, /* 64k */ + MinBsize= 1<>MinK) +#define IB(x) (((x)< round up */ + if((n>>8) == 0) + return log2v[n] + r; + if((n>>16) == 0) + return 8 + log2v[n>>8] + r; + if((n>>24) == 0) + return 16 + log2v[n>>16] + r; + return 24 + log2v[n>>24] + r; +} + +void +balinit(physaddr base, usize size) +{ + int k; + Blk *b; + + loginit(); + for(k = 0; k < nelem(blist); k++){ + b = &blist[k]; + b->forw = b->back = b; + } + memset(kofb, 0, sizeof(kofb)); + balfreephys(base, size); + DPRINT("Aspace=%#ux MaxBlocks=%#ux (%d) len bdesc=%d\n", + Aspace, MaxBlocks, MaxBlocks, nelem(blocks)); + baldump(); +} + +physaddr +bal(usize size) +{ + int j, k; + Blk *b, *b2; + physaddr a, a2; + uint bi; + + k = log2of(size); + if(k < MinK) + k = MinK; + DPRINT("size=%lud k=%d\n", size, k); + lock(&balk); + for(j = k;;){ + b = blist[j].forw; + if(b != &blist[j]) + break; + if(++j > MaxK){ + unlock(&balk); + return 0; /* out of space */ + } + } + b->forw->back = b->back; + b->back->forw = b->forw; + /* set busy state */ + bi = b-blocks; + a = IB(bi); + kofb[bi] = k | Busy; + while(j != k){ + /* split */ + j--; + a2 = a+((physaddr)1<forw = &blist[j]; + b2->back = blist[j].back; + blist[j].back = b2; + b2->back->forw = b2; + } + unlock(&balk); + return a; +} + +void +bfree(physaddr a, usize size) +{ + int k; + Blk *b, *b2; + physaddr a2; + uint bi, bi2; + + k = log2of(size); /* could look it up in kofb */ + if(k < MinK) + k = MinK; + DPRINT("free %#llux %d\n", a, k); + bi = BI(a); + lock(&balk); + if(kofb[bi] != 0 && kofb[bi] != (Busy|k)){ + unlock(&balk); + panic("balfree: busy %#llux odd k k=%d kfob=%#ux\n", a, k, kofb[bi]); + } + for(; k != MaxK; k++){ + a2 = a ^ ((physaddr)1<back->forw = b2->forw; + b2->forw->back = b2->back; + if(a2 < a) + a = a2; + } + kofb[bi] = k; /* sets size and resets Busy */ + b = &blocks[bi]; + b->forw = &blist[k]; + b->back = blist[k].back; + 
blist[k].back = b; + b->back->forw = b; + unlock(&balk); +} + +void +balfreephys(physaddr base, usize size) +{ + physaddr top, a, lim; + usize m; + int i; + + /* round base to min block size */ + if(base & (MinBsize-1)){ + i = MinBsize - (base & (MinBsize-1)); + base += i; + size -= i; + } + size &= ~(MinBsize-1); + if(size < MinBsize) + return; + DPRINT("%#.8llux %#lux (%lud) start\n", base, size, size); + if(BI(base+size) >= MaxBlocks) + panic("balfreephys: address space too large"); + /* split base and size into suitable chunks */ + for(top = MinBsize; top < base+size; top <<= 1) + {} + /* free maximal power-of-2 chunks below mid-point */ + lim = base+size; + a = top>>1; + m = a>>1; + DPRINT("a=%llux m=%#lux (%ld)\n", a, m, m); + if(m > ((usize)1<= MinBsize){ + DPRINT("a==%#llux m=%#lux base=%#llux\n", a, m, base); + if(a-m >= base && a <= lim){ + a -= m; + bfree(a, m); + }else + m >>= 1; + } + /* free chunks above mid-point */ + a = top>>1; + m = a>>1; + if(m > ((usize)1<= MinBsize){ + DPRINT("a[2]==%#llux m=%#lux base=%#llux\n", a, m, base); + if(a >= base && a+m <= lim){ + bfree(a, m); + a += m; + }else + m >>= 1; + } +} + +void +baldump(void) +{ + physaddr a; + uint bi; + int i, k; + Blk *b; + + for(i=0; iforw){ + bi = b-blocks; + a = IB(bi); + k = kofb[bi]; + print(" [%#llux %d %#ux b=%#llux]", a, k, 1<= 0 && id < nelem(regnames)) + return regnames[id]; + seprint(buf, buf+sizeof(buf), "spc:%#x", id); + return buf; +} + +static int +acpiregid(char *s) +{ + int i; + + for(i = 0; i < nelem(regnames); i++) + if(strcmp(regnames[i], s) == 0) + return i; + return -1; +} + +static u64int +l64get(u8int* p) +{ + /* + * Doing this as a define + * #define l64get(p) (((u64int)l32get(p+4)<<32)|l32get(p)) + * causes 8c to abort with "out of fixed registers" in + * rsdlink() below. 
+ */ + return (((u64int)l32get(p+4)<<32)|l32get(p)); +} + +static u8int +mget8(uintptr p, void*) +{ + u8int *cp = (u8int*)p; + return *cp; +} + +static void +mset8(uintptr p, u8int v, void*) +{ + u8int *cp = (u8int*)p; + *cp = v; +} + +static u16int +mget16(uintptr p, void*) +{ + u16int *cp = (u16int*)p; + return *cp; +} + +static void +mset16(uintptr p, u16int v, void*) +{ + u16int *cp = (u16int*)p; + *cp = v; +} + +static u32int +mget32(uintptr p, void*) +{ + u32int *cp = (u32int*)p; + return *cp; +} + +static void +mset32(uintptr p, u32int v, void*) +{ + u32int *cp = (u32int*)p; + *cp = v; +} + +static u64int +mget64(uintptr p, void*) +{ + u64int *cp = (u64int*)p; + return *cp; +} + +static void +mset64(uintptr p, u64int v, void*) +{ + u64int *cp = (u64int*)p; + *cp = v; +} + +static u8int +ioget8(uintptr p, void*) +{ + return inb(p); +} + +static void +ioset8(uintptr p, u8int v, void*) +{ + outb(p, v); +} + +static u16int +ioget16(uintptr p, void*) +{ + return ins(p); +} + +static void +ioset16(uintptr p, u16int v, void*) +{ + outs(p, v); +} + +static u32int +ioget32(uintptr p, void*) +{ + return inl(p); +} + +static void +ioset32(uintptr p, u32int v, void*) +{ + outl(p, v); +} + +static u8int +cfgget8(uintptr p, void* r) +{ + Reg *ro = r; + Pcidev d; + + d.tbdf = ro->tbdf; + return pcicfgr8(&d, p); +} + +static void +cfgset8(uintptr p, u8int v, void* r) +{ + Reg *ro = r; + Pcidev d; + + d.tbdf = ro->tbdf; + pcicfgw8(&d, p, v); +} + +static u16int +cfgget16(uintptr p, void* r) +{ + Reg *ro = r; + Pcidev d; + + d.tbdf = ro->tbdf; + return pcicfgr16(&d, p); +} + +static void +cfgset16(uintptr p, u16int v, void* r) +{ + Reg *ro = r; + Pcidev d; + + d.tbdf = ro->tbdf; + pcicfgw16(&d, p, v); +} + +static u32int +cfgget32(uintptr p, void* r) +{ + Reg *ro = r; + Pcidev d; + + d.tbdf = ro->tbdf; + return pcicfgr32(&d, p); +} + +static void +cfgset32(uintptr p, u32int v, void* r) +{ + Reg *ro = r; + Pcidev d; + + d.tbdf = ro->tbdf; + pcicfgw32(&d, p, v); +} + +static Regio memio = +{ + nil, + mget8, mset8, mget16, mset16, + mget32, mset32, mget64, mset64 +}; + +static Regio ioio = +{ + nil, + ioget8, ioset8, ioget16, ioset16, + ioget32, ioset32, nil, nil +}; + +static Regio cfgio = +{ + nil, + cfgget8, cfgset8, cfgget16, cfgset16, + cfgget32, cfgset32, nil, nil +}; + +/* + * Copy memory, 1/2/4/8-bytes at a time, to/from a region. + */ +static long +regcpy(Regio *dio, uintptr da, Regio *sio, uintptr sa, long len, int align) +{ + int n, i; + + DBG("regcpy %#ullx %#ullx %#ulx %#ux\n", da, sa, len, align); + if((len%align) != 0) + print("regcpy: bug: copy not aligned. truncated\n"); + n = len/align; + for(i = 0; i < n; i++){ + switch(align){ + case 1: + DBG("cpy8 %#p %#p\n", da, sa); + dio->set8(da, sio->get8(sa, sio->arg), dio->arg); + break; + case 2: + DBG("cpy16 %#p %#p\n", da, sa); + dio->set16(da, sio->get16(sa, sio->arg), dio->arg); + break; + case 4: + DBG("cpy32 %#p %#p\n", da, sa); + dio->set32(da, sio->get32(sa, sio->arg), dio->arg); + break; + case 8: + DBG("cpy64 %#p %#p\n", da, sa); + // dio->set64(da, sio->get64(sa, sio->arg), dio->arg); + break; + default: + panic("regcpy: align bug"); + } + da += align; + sa += align; + } + return n*align; +} + +/* + * Perform I/O within region in access units of accsz bytes. + * All units in bytes. + */ +static long +regio(Reg *r, void *p, ulong len, uintptr off, int iswr) +{ + Regio rio; + uintptr rp; + + DBG("reg%s %s %#p %#ullx %#lx sz=%d\n", + iswr ? 
"out" : "in", r->name, p, off, len, r->accsz); + rp = 0; + if(off + len > r->len){ + print("regio: access outside limits"); + len = r->len - off; + } + if(len <= 0){ + print("regio: zero len\n"); + return 0; + } + switch(r->spc){ + case Rsysmem: + // XXX should map only what we are going to use + // A region might be too large. + if(r->p == nil) + r->p = vmap(r->base, len); + if(r->p == nil) + error("regio: vmap failed"); + rp = (uintptr)r->p + off; + rio = memio; + break; + case Rsysio: + rp = r->base + off; + rio = ioio; + break; + case Rpcicfg: + rp = r->base + off; + rio = cfgio; + rio.arg = r; + break; + case Rpcibar: + case Rembed: + case Rsmbus: + case Rcmos: + case Ripmi: + case Rfixedhw: + print("regio: reg %s not supported\n", acpiregstr(r->spc)); + error("region not supported"); + } + if(iswr) + regcpy(&rio, rp, &memio, (uintptr)p, len, r->accsz); + else + regcpy(&memio, (uintptr)p, &rio, rp, len, r->accsz); + return len; +} + +static Atable* +newtable(uchar *p) +{ + Atable *t; + Sdthdr *h; + + t = malloc(sizeof(Atable)); + if(t == nil) + panic("no memory for more aml tables"); + t->tbl = p; + h = (Sdthdr*)t->tbl; + t->is64 = h->rev >= 2; + t->dlen = l32get(h->length) - Sdthdrsz; + memmove(t->sig, h->sig, sizeof(h->sig)); + t->sig[sizeof(t->sig)-1] = 0; + memmove(t->oemid, h->oemid, sizeof(h->oemid)); + t->oemtblid[sizeof(t->oemtblid)-1] = 0; + memmove(t->oemtblid, h->oemtblid, sizeof(h->oemtblid)); + t->oemtblid[sizeof(t->oemtblid)-1] = 0; + t->next = nil; + if(tfirst == nil) + tfirst = tlast = t; + else{ + tlast->next = t; + tlast = t; + } + return t; +} + +static void* +sdtchecksum(void* addr, int len) +{ + u8int *p, sum; + + sum = 0; + for(p = addr; len-- > 0; p++) + sum += *p; + if(sum == 0) + return addr; + + return nil; +} + +static void * +sdtmap(uintptr pa, int *n, int cksum) +{ + Sdthdr* sdt; + + sdt = vmap(pa, sizeof(Sdthdr)); + if(sdt == nil){ + DBG("acpi: vmap1: nil\n"); + return nil; + } + *n = l32get(sdt->length); + vunmap(sdt, sizeof(Sdthdr)); + if((sdt = vmap(pa, *n)) == nil){ + DBG("acpi: nil vmap\n"); + return nil; + } + if(cksum != 0 && sdtchecksum(sdt, *n) == nil){ + DBG("acpi: SDT: bad checksum\n"); + vunmap(sdt, sizeof(Sdthdr)); + return nil; + } + return sdt; +} + +static int +loadfacs(uintptr pa) +{ + int n; + + facs = sdtmap(pa, &n, 0); + if(facs == nil) + return -1; + if(memcmp(facs, "FACS", 4) != 0){ + vunmap(facs, n); + facs = nil; + return -1; + } + /* no unmap */ + + DBG("acpi: facs: hwsig: %#ux\n", facs->hwsig); + DBG("acpi: facs: wakingv: %#ux\n", facs->wakingv); + DBG("acpi: facs: flags: %#ux\n", facs->flags); + DBG("acpi: facs: glock: %#ux\n", facs->glock); + DBG("acpi: facs: xwakingv: %#llux\n", facs->xwakingv); + DBG("acpi: facs: vers: %#ux\n", facs->vers); + DBG("acpi: facs: ospmflags: %#ux\n", facs->ospmflags); + return 0; +} + +static void +loaddsdt(uintptr pa) +{ + int n; + uchar *dsdtp; + + dsdtp = sdtmap(pa, &n, 1); + if(dsdtp == nil) + return; + if(acpitable(dsdtp, n) == nil) + vunmap(dsdtp, n); +} + +static void +gasget(Gas *gas, uchar *p) +{ + gas->spc = p[0]; + gas->len = p[1]; + gas->off = p[2]; + gas->accsz = p[3]; + gas->addr = l64get(p+4); +} + +static void +dumpfadt(Fadt *fp) +{ + if(DBGFLG == 0) + return; + + DBG("acpi: fadt: facs: %#ux\n", fp->facs); + DBG("acpi: fadt: dsdt: %#ux\n", fp->dsdt); + DBG("acpi: fadt: pmprofile: %#ux\n", fp->pmprofile); + DBG("acpi: fadt: sciint: %#ux\n", fp->sciint); + DBG("acpi: fadt: smicmd: %#ux\n", fp->smicmd); + DBG("acpi: fadt: acpienable: %#ux\n", fp->acpienable); + DBG("acpi: fadt: 
acpidisable: %#ux\n", fp->acpidisable); + DBG("acpi: fadt: s4biosreq: %#ux\n", fp->s4biosreq); + DBG("acpi: fadt: pstatecnt: %#ux\n", fp->pstatecnt); + DBG("acpi: fadt: pm1aevtblk: %#ux\n", fp->pm1aevtblk); + DBG("acpi: fadt: pm1bevtblk: %#ux\n", fp->pm1bevtblk); + DBG("acpi: fadt: pm1acntblk: %#ux\n", fp->pm1acntblk); + DBG("acpi: fadt: pm1bcntblk: %#ux\n", fp->pm1bcntblk); + DBG("acpi: fadt: pm2cntblk: %#ux\n", fp->pm2cntblk); + DBG("acpi: fadt: pmtmrblk: %#ux\n", fp->pmtmrblk); + DBG("acpi: fadt: gpe0blk: %#ux\n", fp->gpe0blk); + DBG("acpi: fadt: gpe1blk: %#ux\n", fp->gpe1blk); + DBG("acpi: fadt: pm1evtlen: %#ux\n", fp->pm1evtlen); + DBG("acpi: fadt: pm1cntlen: %#ux\n", fp->pm1cntlen); + DBG("acpi: fadt: pm2cntlen: %#ux\n", fp->pm2cntlen); + DBG("acpi: fadt: pmtmrlen: %#ux\n", fp->pmtmrlen); + DBG("acpi: fadt: gpe0blklen: %#ux\n", fp->gpe0blklen); + DBG("acpi: fadt: gpe1blklen: %#ux\n", fp->gpe1blklen); + DBG("acpi: fadt: gp1base: %#ux\n", fp->gp1base); + DBG("acpi: fadt: cstcnt: %#ux\n", fp->cstcnt); + DBG("acpi: fadt: plvl2lat: %#ux\n", fp->plvl2lat); + DBG("acpi: fadt: plvl3lat: %#ux\n", fp->plvl3lat); + DBG("acpi: fadt: flushsz: %#ux\n", fp->flushsz); + DBG("acpi: fadt: flushstride: %#ux\n", fp->flushstride); + DBG("acpi: fadt: dutyoff: %#ux\n", fp->dutyoff); + DBG("acpi: fadt: dutywidth: %#ux\n", fp->dutywidth); + DBG("acpi: fadt: dayalrm: %#ux\n", fp->dayalrm); + DBG("acpi: fadt: monalrm: %#ux\n", fp->monalrm); + DBG("acpi: fadt: century: %#ux\n", fp->century); + DBG("acpi: fadt: iapcbootarch: %#ux\n", fp->iapcbootarch); + DBG("acpi: fadt: flags: %#ux\n", fp->flags); + DBG("acpi: fadt: resetreg: %G\n", &fp->resetreg); + DBG("acpi: fadt: resetval: %#ux\n", fp->resetval); + DBG("acpi: fadt: xfacs: %#llux\n", fp->xfacs); + DBG("acpi: fadt: xdsdt: %#llux\n", fp->xdsdt); + DBG("acpi: fadt: xpm1aevtblk: %G\n", &fp->xpm1aevtblk); + DBG("acpi: fadt: xpm1bevtblk: %G\n", &fp->xpm1bevtblk); + DBG("acpi: fadt: xpm1acntblk: %G\n", &fp->xpm1acntblk); + DBG("acpi: fadt: xpm1bcntblk: %G\n", &fp->xpm1bcntblk); + DBG("acpi: fadt: xpm2cntblk: %G\n", &fp->xpm2cntblk); + DBG("acpi: fadt: xpmtmrblk: %G\n", &fp->xpmtmrblk); + DBG("acpi: fadt: xgpe0blk: %G\n", &fp->xgpe0blk); + DBG("acpi: fadt: xgpe1blk: %G\n", &fp->xgpe1blk); +} + +static Atable* +acpifadt(uchar *p, int) +{ + Fadt *fp; + + fp = &fadt; + fp->facs = l32get(p + 36); + fp->dsdt = l32get(p + 40); + fp->pmprofile = p[45]; + fp->sciint = l16get(p+46); + fp->smicmd = l32get(p+48); + fp->acpienable = p[52]; + fp->acpidisable = p[53]; + fp->s4biosreq = p[54]; + fp->pstatecnt = p[55]; + fp->pm1aevtblk = l32get(p+56); + fp->pm1bevtblk = l32get(p+60); + fp->pm1acntblk = l32get(p+64); + fp->pm1bcntblk = l32get(p+68); + fp->pm2cntblk = l32get(p+72); + fp->pmtmrblk = l32get(p+76); + fp->gpe0blk = l32get(p+80); + fp->gpe1blk = l32get(p+84); + fp->pm1evtlen = p[88]; + fp->pm1cntlen = p[89]; + fp->pm2cntlen = p[90]; + fp->pmtmrlen = p[91]; + fp->gpe0blklen = p[92]; + fp->gpe1blklen = p[93]; + fp->gp1base = p[94]; + fp->cstcnt = p[95]; + fp->plvl2lat = l16get(p+96); + fp->plvl3lat = l16get(p+98); + fp->flushsz = l16get(p+100); + fp->flushstride = l16get(p+102); + fp->dutyoff = p[104]; + fp->dutywidth = p[105]; + fp->dayalrm = p[106]; + fp->monalrm = p[107]; + fp->century = p[108]; + fp->iapcbootarch = l16get(p+109); + fp->flags = l32get(p+112); + gasget(&fp->resetreg, p+116); + fp->resetval = p[128]; + fp->xfacs = l64get(p+132); + fp->xdsdt = l64get(p+140); + gasget(&fp->xpm1aevtblk, p+148); + gasget(&fp->xpm1bevtblk, p+160); + gasget(&fp->xpm1acntblk, 
p+172); + gasget(&fp->xpm1bcntblk, p+184); + gasget(&fp->xpm2cntblk, p+196); + gasget(&fp->xpmtmrblk, p+208); + gasget(&fp->xgpe0blk, p+220); + gasget(&fp->xgpe1blk, p+232); + + dumpfadt(fp); + if(fp->xfacs != 0) + loadfacs(fp->xfacs); + else + loadfacs(fp->facs); + if(fp->xdsdt != 0) + loaddsdt(fp->xdsdt); + else + loaddsdt(fp->dsdt); + + return nil; /* can be unmapped once parsed */ +} + +static void +dumpmsct(Msct *msct) +{ + Mdom *st; + + DBG("acpi: msct: %d doms %d clkdoms %#ullx maxpa\n", + msct->ndoms, msct->nclkdoms, msct->maxpa); + for(st = msct->dom; st != nil; st = st->next) + DBG("\t[%d:%d] %d maxproc %#ullx maxmmem\n", + st->start, st->end, st->maxproc, st->maxmem); + DBG("\n"); +} + +/* + * XXX: should perhaps update our idea of available memory. + * Else we should remove this code. + */ +static Atable* +acpimsct(uchar *p, int len) +{ + uchar *pe; + Mdom **stl, *st; + int off; + + msct = mallocz(sizeof(Msct), 1); + msct->ndoms = l32get(p+40) + 1; + msct->nclkdoms = l32get(p+44) + 1; + msct->maxpa = l64get(p+48); + msct->dom = nil; + stl = &msct->dom; + pe = p + len; + off = l32get(p+36); + for(p += off; p < pe; p += 22){ + st = mallocz(sizeof(Mdom), 1); + st->next = nil; + st->start = l32get(p+2); + st->end = l32get(p+6); + st->maxproc = l32get(p+10); + st->maxmem = l64get(p+14); + *stl = st; + stl = &st->next; + } + + dumpmsct(msct); + return nil; /* can be unmapped once parsed */ +} + +static void +dumpsrat(Srat *st) +{ + DBG("acpi: srat:\n"); + for(; st != nil; st = st->next) + switch(st->type){ + case SRlapic: + DBG("\tlapic: dom %d apic %d sapic %d clk %d\n", + st->lapic.dom, st->lapic.apic, + st->lapic.sapic, st->lapic.clkdom); + break; + case SRmem: + DBG("\tmem: dom %d %#ullx %#ullx %c%c\n", + st->mem.dom, st->mem.addr, st->mem.len, + st->mem.hplug?'h':'-', + st->mem.nvram?'n':'-'); + break; + case SRlx2apic: + DBG("\tlx2apic: dom %d apic %d clk %d\n", + st->lx2apic.dom, st->lx2apic.apic, + st->lx2apic.clkdom); + break; + default: + DBG("\t\n"); + } + DBG("\n"); +} + +static Atable* +acpisrat(uchar *p, int len) +{ + Srat **stl, *st; + uchar *pe; + int stlen, flags; + + if(srat != nil){ + print("acpi: two SRATs?\n"); + return nil; + } + + stl = &srat; + pe = p + len; + for(p += 48; p < pe; p += stlen){ + st = mallocz(sizeof(Srat), 1); + st->type = p[0]; + st->next = nil; + stlen = p[1]; + switch(st->type){ + case SRlapic: + st->lapic.dom = p[2] | p[9]<<24| p[10]<<16 | p[11]<<8; + st->lapic.apic = p[3]; + st->lapic.sapic = p[8]; + st->lapic.clkdom = l32get(p+12); + if(l32get(p+4) == 0){ + free(st); + st = nil; + } + break; + case SRmem: + st->mem.dom = l32get(p+2); + st->mem.addr = l64get(p+8); + st->mem.len = l64get(p+16); + flags = l32get(p+28); + if((flags&1) == 0){ /* not enabled */ + free(st); + st = nil; + }else{ + st->mem.hplug = flags & 2; + st->mem.nvram = flags & 4; + } + break; + case SRlx2apic: + st->lx2apic.dom = l32get(p+4); + st->lx2apic.apic = l32get(p+8); + st->lx2apic.clkdom = l32get(p+16); + if(l32get(p+12) == 0){ + free(st); + st = nil; + } + break; + default: + print("unknown SRAT structure\n"); + free(st); + st = nil; + } + if(st != nil){ + *stl = st; + stl = &st->next; + } + } + + dumpsrat(srat); + return nil; /* can be unmapped once parsed */ +} + +uintmem +acpimblocksize(uintmem addr, int *dom) +{ + Srat *sl; + + for(sl = srat; sl != nil; sl = sl->next) + if(sl->type == SRmem) + if(sl->mem.addr <= addr && sl->mem.addr + sl->mem.len > addr){ + *dom = sl->mem.dom; + return sl->mem.len - (addr - sl->mem.addr); + } + return 0; +} + +static void 
+dumpmadt(Madt *apics) +{ + Apicst *st; + + DBG("acpi: madt lapic paddr %llux pcat %d:\n", apics->lapicpa, apics->pcat); + for(st = apics->st; st != nil; st = st->next) + switch(st->type){ + case ASlapic: + DBG("\tlapic pid %d id %d\n", st->lapic.pid, st->lapic.id); + break; + case ASioapic: + case ASiosapic: + DBG("\tioapic id %d addr %#llux ibase %d\n", + st->ioapic.id, st->ioapic.addr, st->ioapic.ibase); + break; + case ASintovr: + DBG("\tintovr irq %d intr %d flags %#ux\n", + st->intovr.irq, st->intovr.intr,st->intovr.flags); + break; + case ASnmi: + DBG("\tnmi intr %d flags %#ux\n", + st->nmi.intr, st->nmi.flags); + break; + case ASlnmi: + DBG("\tlnmi pid %d lint %d flags %#ux\n", + st->lnmi.pid, st->lnmi.lint, st->lnmi.flags); + break; + case ASlsapic: + DBG("\tlsapic pid %d id %d eid %d puid %d puids %s\n", + st->lsapic.pid, st->lsapic.id, + st->lsapic.eid, st->lsapic.puid, + st->lsapic.puids); + break; + case ASintsrc: + DBG("\tintr type %d pid %d peid %d iosv %d intr %d %#x\n", + st->type, st->intsrc.pid, + st->intsrc.peid, st->intsrc.iosv, + st->intsrc.intr, st->intsrc.flags); + break; + case ASlx2apic: + DBG("\tlx2apic puid %d id %d\n", st->lx2apic.puid, st->lx2apic.id); + break; + case ASlx2nmi: + DBG("\tlx2nmi puid %d intr %d flags %#ux\n", + st->lx2nmi.puid, st->lx2nmi.intr, st->lx2nmi.flags); + break; + default: + DBG("\t\n"); + } + DBG("\n"); +} + +static Atable* +acpimadt(uchar *p, int len) +{ + uchar *pe; + Apicst *st, *l, **stl; + int stlen, id; + + apics = mallocz(sizeof(Madt), 1); + apics->lapicpa = l32get(p+36); + apics->pcat = l32get(p+40); + apics->st = nil; + stl = &apics->st; + pe = p + len; + for(p += 44; p < pe; p += stlen){ + st = mallocz(sizeof(Apicst), 1); + st->type = p[0]; + st->next = nil; + stlen = p[1]; + switch(st->type){ + case ASlapic: + st->lapic.pid = p[2]; + st->lapic.id = p[3]; + if(l32get(p+4) == 0){ + free(st); + st = nil; + } + break; + case ASioapic: + st->ioapic.id = id = p[2]; + st->ioapic.addr = l32get(p+4); + st->ioapic.ibase = l32get(p+8); + /* iosapic overrides any ioapic entry for the same id */ + for(l = apics->st; l != nil; l = l->next) + if(l->type == ASiosapic && l->iosapic.id == id){ + st->ioapic = l->iosapic; + /* we leave it linked; could be removed */ + break; + } + break; + case ASintovr: + st->intovr.irq = p[3]; + st->intovr.intr = l32get(p+4); + st->intovr.flags = l16get(p+8); + break; + case ASnmi: + st->nmi.flags = l16get(p+2); + st->nmi.intr = l32get(p+4); + break; + case ASlnmi: + st->lnmi.pid = p[2]; + st->lnmi.flags = l16get(p+3); + st->lnmi.lint = p[5]; + break; + case ASladdr: + /* This is for 64 bits, perhaps we should not + * honor it on 32 bits. 
+ */ + apics->lapicpa = l64get(p+8); + break; + case ASiosapic: + id = st->iosapic.id = p[2]; + st->iosapic.ibase = l32get(p+4); + st->iosapic.addr = l64get(p+8); + /* iosapic overrides any ioapic entry for the same id */ + for(l = apics->st; l != nil; l = l->next) + if(l->type == ASioapic && l->ioapic.id == id){ + l->ioapic = st->iosapic; + free(st); + st = nil; + break; + } + break; + case ASlsapic: + st->lsapic.pid = p[2]; + st->lsapic.id = p[3]; + st->lsapic.eid = p[4]; + st->lsapic.puid = l32get(p+12); + if(l32get(p+8) == 0){ + free(st); + st = nil; + }else + kstrdup(&st->lsapic.puids, (char*)p+16); + break; + case ASintsrc: + st->intsrc.flags = l16get(p+2); + st->type = p[4]; + st->intsrc.pid = p[5]; + st->intsrc.peid = p[6]; + st->intsrc.iosv = p[7]; + st->intsrc.intr = l32get(p+8); + st->intsrc.any = l32get(p+12); + break; + case ASlx2apic: + st->lx2apic.id = l32get(p+4); + st->lx2apic.puid = l32get(p+12); + if(l32get(p+8) == 0){ + free(st); + st = nil; + } + break; + case ASlx2nmi: + st->lx2nmi.flags = l16get(p+2); + st->lx2nmi.puid = l32get(p+4); + st->lx2nmi.intr = p[8]; + break; + default: + print("unknown APIC structure\n"); + free(st); + st = nil; + } + if(st != nil){ + *stl = st; + stl = &st->next; + } + } + + dumpmadt(apics); + return nil; /* can be unmapped once parsed */ +} + +/* + * Map the table and keep it there. + */ +static Atable* +acpitable(uchar *p, int len) +{ + if(len < Sdthdrsz) + return nil; + return newtable(p); +} + +static void +dumptable(char *sig, uchar *p, int l) +{ + int n, i; + + if(DBGFLG > 1){ + DBG("%s @ %#p\n", sig, p); + if(DBGFLG > 2) + n = l; + else + n = 256; + for(i = 0; i < n; i++){ + if((i % 16) == 0) + DBG("%x: ", i); + DBG(" %2.2ux", p[i]); + if((i % 16) == 15) + DBG("\n"); + } + DBG("\n"); + DBG("\n"); + } +} + +static char* +seprinttable(char *s, char *e, Atable *t) +{ + uchar *p; + int i, n; + + p = (uchar*)t->tbl; /* include header */ + n = Sdthdrsz + t->dlen; + s = seprint(s, e, "%s @ %#p\n", t->sig, p); + for(i = 0; i < n; i++){ + if((i % 16) == 0) + s = seprint(s, e, "%x: ", i); + s = seprint(s, e, " %2.2ux", p[i]); + if((i % 16) == 15) + s = seprint(s, e, "\n"); + } + return seprint(s, e, "\n\n"); +} + +/* + * process xsdt table and load tables with sig, or all if nil. + * (XXX: should be able to search for sig, oemid, oemtblid) + */ +static int +acpixsdtload(char *sig) +{ + int i, l, t, unmap, found; + uintptr dhpa; + uchar *sdt; + char tsig[5]; + + found = 0; + for(i = 0; i < xsdt->len; i += xsdt->asize){ + if(xsdt->asize == 8) + dhpa = l64get(xsdt->p+i); + else + dhpa = l32get(xsdt->p+i); + if((sdt = sdtmap(dhpa, &l, 1)) == nil) + continue; + unmap = 1; + memmove(tsig, sdt, 4); + tsig[4] = 0; + if(sig == nil || strcmp(sig, tsig) == 0){ + DBG("acpi: %s addr %#p\n", tsig, sdt); + for(t = 0; t < nelem(ptables); t++) + if(strcmp(tsig, ptables[t].sig) == 0){ + dumptable(tsig, sdt, l); + unmap = ptables[t].f(sdt, l) == nil; + found = 1; + break; + } + } + if(unmap) + vunmap(sdt, l); + } + return found; +} + +static void* +rsdscan(u8int* addr, int len, char* signature) +{ + int sl; + u8int *e, *p; + + e = addr+len; + sl = strlen(signature); + for(p = addr; p+sl < e; p += 16){ + if(memcmp(p, signature, sl)) + continue; + return p; + } + + return nil; +} + +static void* +rsdsearch(char* signature) +{ + uintptr p; + u8int *bda; + void *rsd; + + /* + * Search for the data structure signature: + * 1) in the first KB of the EBDA; + * 2) in the BIOS ROM between 0xE0000 and 0xFFFFF. 
+ */ + if(strncmp((char*)KADDR(0xFFFD9), "EISA", 4) == 0){ + bda = BIOSSEG(0x40); + if((p = (bda[0x0F]<<8)|bda[0x0E])){ + if(rsd = rsdscan(KADDR(p), 1024, signature)) + return rsd; + } + } + return rsdscan(BIOSSEG(0xE000), 0x20000, signature); +} + +static void +acpirsdptr(void) +{ + Rsdp *rsd; + int asize; + uintptr sdtpa; + + if((rsd = rsdsearch("RSD PTR ")) == nil) + return; + + assert(sizeof(Sdthdr) == 36); + + DBG("acpi: RSD PTR@ %#p, physaddr %#ux length %ud %#llux rev %d\n", + rsd, l32get(rsd->raddr), l32get(rsd->length), + l64get(rsd->xaddr), rsd->revision); + + if(rsd->revision >= 2){ + if(sdtchecksum(rsd, 36) == nil){ + DBG("acpi: RSD: bad checksum\n"); + return; + } + sdtpa = l64get(rsd->xaddr); + asize = 8; + } + else{ + if(sdtchecksum(rsd, 20) == nil){ + DBG("acpi: RSD: bad checksum\n"); + return; + } + sdtpa = l32get(rsd->raddr); + asize = 4; + } + + /* + * process the RSDT or XSDT table. + */ + xsdt = malloc(sizeof(Xsdt)); + if(xsdt == nil){ + DBG("acpi: malloc failed\n"); + return; + } + if((xsdt->p = sdtmap(sdtpa, &xsdt->len, 1)) == nil){ + DBG("acpi: sdtmap failed\n"); + return; + } + if((xsdt->p[0] != 'R' && xsdt->p[0] != 'X') || memcmp(xsdt->p+1, "SDT", 3) != 0){ + DBG("acpi: xsdt sig: %c%c%c%c\n", + xsdt->p[0], xsdt->p[1], xsdt->p[2], xsdt->p[3]); + free(xsdt); + xsdt = nil; + vunmap(xsdt, xsdt->len); + return; + } + xsdt->p += sizeof(Sdthdr); + xsdt->len -= sizeof(Sdthdr); + xsdt->asize = asize; + DBG("acpi: XSDT %#p\n", xsdt); + acpixsdtload(nil); + /* xsdt is kept and not unmapped */ + +} + +static int +acpigen(Chan *c, char*, Dirtab *tab, int ntab, int i, Dir *dp) +{ + Qid qid; + + if(i == DEVDOTDOT){ + mkqid(&qid, Qdir, 0, QTDIR); + devdir(c, qid, ".", 0, eve, 0555, dp); + return 1; + } + i++; /* skip first element for . itself */ + if(tab==0 || i>=ntab) + return -1; + tab += i; + qid = tab->qid; + qid.path &= ~Qdir; + qid.vers = 0; + devdir(c, qid, tab->name, tab->length, eve, tab->perm, dp); + return 1; +} + +static int +Gfmt(Fmt* f) +{ + static char* rnames[] = { + "mem", "io", "pcicfg", "embed", + "smb", "cmos", "pcibar", "ipmi"}; + Gas *g; + + g = va_arg(f->args, Gas*); + switch(g->spc){ + case Rsysmem: + case Rsysio: + case Rembed: + case Rsmbus: + case Rcmos: + case Rpcibar: + case Ripmi: + fmtprint(f, "[%s ", rnames[g->spc]); + break; + case Rpcicfg: + fmtprint(f, "[pci "); + fmtprint(f, "dev %#ulx ", (ulong)(g->addr >> 32) & 0xFFFF); + fmtprint(f, "fn %#ulx ", (ulong)(g->addr & 0xFFFF0000) >> 16); + fmtprint(f, "adr %#ulx ", (ulong)(g->addr &0xFFFF)); + break; + case Rfixedhw: + fmtprint(f, "[hw "); + break; + default: + fmtprint(f, "[spc=%#ux ", g->spc); + } + return fmtprint(f, "off %d len %d addr %#ullx sz%d]", + g->off, g->len, g->addr, g->accsz); +} + +static uint +getbanked(uintptr ra, uintptr rb, int sz) +{ + uint r; + + r = 0; + switch(sz){ + case 1: + if(ra != 0) + r |= inb(ra); + if(rb != 0) + r |= inb(rb); + break; + case 2: + if(ra != 0) + r |= ins(ra); + if(rb != 0) + r |= ins(rb); + break; + case 4: + if(ra != 0) + r |= inl(ra); + if(rb != 0) + r |= inl(rb); + break; + default: + print("getbanked: wrong size\n"); + } + return r; +} + +static uint +setbanked(uintptr ra, uintptr rb, int sz, int v) +{ + uint r; + + r = -1; + switch(sz){ + case 1: + if(ra != 0) + outb(ra, v); + if(rb != 0) + outb(rb, v); + break; + case 2: + if(ra != 0) + outs(ra, v); + if(rb != 0) + outs(rb, v); + break; + case 4: + if(ra != 0) + outl(ra, v); + if(rb != 0) + outl(rb, v); + break; + default: + print("setbanked: wrong size\n"); + } + return r; +} + +static uint 
+getpm1ctl(void) +{ + return getbanked(fadt.pm1acntblk, fadt.pm1bcntblk, fadt.pm1cntlen); +} + +static void +setpm1sts(uint v) +{ + DBG("acpi: setpm1sts %#ux\n", v); + setbanked(fadt.pm1aevtblk, fadt.pm1bevtblk, fadt.pm1evtlen/2, v); +} + +static uint +getpm1sts(void) +{ + return getbanked(fadt.pm1aevtblk, fadt.pm1bevtblk, fadt.pm1evtlen/2); +} + +static uint +getpm1en(void) +{ + int sz; + + sz = fadt.pm1evtlen/2; + return getbanked(fadt.pm1aevtblk+sz, fadt.pm1bevtblk+sz, sz); +} + +static int +getgpeen(int n) +{ + return inb(gpes[n].enio) & 1<ho in the + // aml process. + // enable it again when it returns. + } + sts = getpm1sts(); + en = getpm1en(); + print("acpiitr: pm1sts %#ux pm1en %#ux\n", sts, en); + if(sts&en) + print("have enabled events\n"); + if(sts&1) + print("power button\n"); + // XXX serve other interrupts here. + setpm1sts(sts); +} + +static void +initgpes(void) +{ + int i, n0, n1; + + n0 = fadt.gpe0blklen/2; + n1 = fadt.gpe1blklen/2; + ngpes = n0 + n1; + gpes = mallocz(sizeof(Gpe) * ngpes, 1); + for(i = 0; i < n0; i++){ + gpes[i].nb = i; + gpes[i].stsbit = i&7; + gpes[i].stsio = fadt.gpe0blk + (i>>3); + gpes[i].enbit = (n0 + i)&7; + gpes[i].enio = fadt.gpe0blk + ((n0 + i)>>3); + } + for(i = 0; i + n0 < ngpes; i++){ + gpes[i + n0].nb = fadt.gp1base + i; + gpes[i + n0].stsbit = i&7; + gpes[i + n0].stsio = fadt.gpe1blk + (i>>3); + gpes[i + n0].enbit = (n1 + i)&7; + gpes[i + n0].enio = fadt.gpe1blk + ((n1 + i)>>3); + } + for(i = 0; i < ngpes; i++){ + setgpeen(i, 0); + clrgpests(i); + } +} + +static void +acpiioalloc(uint addr, int len) +{ + if(addr != 0) + ioalloc(addr, len, 0, "acpi"); +} + +int +acpiinit(void) +{ + if(fadt.smicmd == 0){ + fmtinstall('G', Gfmt); + acpirsdptr(); + if(fadt.smicmd == 0) + return -1; + } + return 0; +} + +static Chan* +acpiattach(char *spec) +{ + int i; + + /* + * This was written for the stock kernel. + * This code must use 64 registers to be acpi ready in nix. + */ + if(1 || acpiinit() < 0) + error("no acpi"); + + /* + * should use fadt->xpm* and fadt->xgpe* registers for 64 bits. + * We are not ready in this kernel for that. + */ + DBG("acpi io alloc\n"); + acpiioalloc(fadt.smicmd, 1); + acpiioalloc(fadt.pm1aevtblk, fadt.pm1evtlen); + acpiioalloc(fadt.pm1bevtblk, fadt.pm1evtlen ); + acpiioalloc(fadt.pm1acntblk, fadt.pm1cntlen); + acpiioalloc(fadt.pm1bcntblk, fadt.pm1cntlen); + acpiioalloc(fadt.pm2cntblk, fadt.pm2cntlen); + acpiioalloc(fadt.pmtmrblk, fadt.pmtmrlen); + acpiioalloc(fadt.gpe0blk, fadt.gpe0blklen); + acpiioalloc(fadt.gpe1blk, fadt.gpe1blklen); + + DBG("acpi init gpes\n"); + initgpes(); + + /* + * This starts ACPI, which may require we handle + * power mgmt events ourselves. Use with care. 
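+	 * Writing fadt.acpienable to the SMI command port asks the
+	 * firmware to hand control to the OS; the loop below then polls
+	 * the PM1 control register until the SCI_EN bit (Pm1SciEn) comes on.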
+ */ + DBG("acpi starting\n"); + outb(fadt.smicmd, fadt.acpienable); + for(i = 0; i < 10; i++) + if(getpm1ctl() & Pm1SciEn) + break; + if(i == 10) + error("acpi: failed to enable\n"); + if(fadt.sciint != 0) + intrenable(fadt.sciint, acpiintr, 0, BUSUNKNOWN, "acpi"); + return devattach(L'α', spec); +} + +static Walkqid* +acpiwalk(Chan *c, Chan *nc, char **name, int nname) +{ + return devwalk(c, nc, name, nname, acpidir, nelem(acpidir), acpigen); +} + +static long +acpistat(Chan *c, uchar *dp, long n) +{ + return devstat(c, dp, n, acpidir, nelem(acpidir), acpigen); +} + +static Chan* +acpiopen(Chan *c, int omode) +{ + return devopen(c, omode, acpidir, nelem(acpidir), acpigen); +} + +static void +acpiclose(Chan *) +{ +} + +static char*ttext; +static int tlen; + +static long +acpiread(Chan *c, void *a, long n, vlong off) +{ + long q; + Atable *t; + char *ns, *s, *e, *ntext; + + q = c->qid.path; + switch(q){ + case Qdir: + return devdirread(c, a, n, acpidir, nelem(acpidir), acpigen); + case Qtbl: + if(ttext == nil){ + tlen = 1024; + ttext = malloc(tlen); + if(ttext == nil){ + print("acpi: no memory\n"); + return 0; + } + s = ttext; + e = ttext + tlen; + strcpy(s, "no tables\n"); + for(t = tfirst; t != nil; t = t->next){ + ns = seprinttable(s, e, t); + while(ns == e - 1){ + DBG("acpiread: allocated %d\n", tlen*2); + ntext = realloc(ttext, tlen*2); + if(ntext == nil) + panic("acpi: no memory\n"); + s = ntext + (ttext - s); + ttext = ntext; + tlen *= 2; + e = ttext + tlen; + ns = seprinttable(s, e, t); + } + s = ns; + } + + } + return readstr(off, a, n, ttext); + case Qio: + if(reg == nil) + error("region not configured"); + return regio(reg, a, n, off, 0); + } + error(Eperm); + return -1; +} + +static long +acpiwrite(Chan *c, void *a, long n, vlong off) +{ + Cmdtab *ct; + Cmdbuf *cb; + Reg *r; + uint rno, fun, dev, bus, i; + + if(c->qid.path == Qio){ + if(reg == nil) + error("region not configured"); + return regio(reg, a, n, off, 1); + } + if(c->qid.path != Qctl) + error(Eperm); + + cb = parsecmd(a, n); + if(waserror()){ + free(cb); + nexterror(); + } + ct = lookupcmd(cb, ctls, nelem(ctls)); + DBG("acpi ctl %s\n", cb->f[0]); + switch(ct->index){ + case CMregion: + r = reg; + if(r == nil){ + r = smalloc(sizeof(Reg)); + r->name = nil; + } + kstrdup(&r->name, cb->f[1]); + r->spc = acpiregid(cb->f[2]); + if(r->spc < 0){ + free(r); + reg = nil; + error("bad region type"); + } + if(r->spc == Rpcicfg || r->spc == Rpcibar){ + rno = r->base>>Rpciregshift & Rpciregmask; + fun = r->base>>Rpcifunshift & Rpcifunmask; + dev = r->base>>Rpcidevshift & Rpcidevmask; + bus = r->base>>Rpcibusshift & Rpcibusmask; + r->tbdf = MKBUS(BusPCI, bus, dev, fun); + r->base = rno; /* register ~ our base addr */ + } + r->base = strtoull(cb->f[3], nil, 0); + r->len = strtoull(cb->f[4], nil, 0); + r->accsz = strtoul(cb->f[5], nil, 0); + if(r->accsz < 1 || r->accsz > 4){ + free(r); + reg = nil; + error("bad region access size"); + } + reg = r; + DBG("region %s %s %llux %llux sz%d", + r->name, acpiregstr(r->spc), r->base, r->len, r->accsz); + break; + case CMgpe: + i = strtoul(cb->f[1], nil, 0); + if(i >= ngpes) + error("gpe out of range"); + kstrdup(&gpes[i].obj, cb->f[2]); + DBG("gpe %d %s\n", i, gpes[i].obj); + setgpeen(i, 1); + break; + default: + panic("acpi: unknown ctl"); + } + poperror(); + free(cb); + return n; +} + +Dev acpidevtab = { + L'α', + "acpi", + + devreset, + devinit, + devshutdown, + acpiattach, + acpiwalk, + acpistat, + acpiopen, + devcreate, + acpiclose, + acpiread, + devbread, + acpiwrite, + devbwrite, + 
devremove, + devwstat, +}; diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/l64cpuid.s --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/l64cpuid.s Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,26 @@ +/* + * The CPUID instruction is always supported on the amd64. + */ +TEXT cpuid(SB), $-4 + MOVL RARG, AX /* function in AX */ + MOVLQZX cx+8(FP), CX /* iterator/index/etc. */ + + CPUID + + MOVQ info+16(FP), BP + MOVL AX, 0(BP) + MOVL BX, 4(BP) + MOVL CX, 8(BP) + MOVL DX, 12(BP) + RET + +/* + * Basic timing loop to determine CPU frequency. + * The AAM instruction is not available in 64-bit mode. + */ +TEXT aamloop(SB), 1, $-4 + MOVLQZX RARG, CX +aaml1: + XORQ AX, AX /* close enough */ + LOOP aaml1 + RET diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/l64sipi.s --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/l64sipi.s Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,246 @@ +/* + * Start-up request IPI handler. + * + * This code is executed on an application processor in response to receiving + * a Start-up IPI (SIPI) from another processor. + * This must be placed on a 4KiB boundary + * somewhere in the 1st MiB of conventional memory. However, + * due to some shortcuts below it's restricted further to within the 1st 64KiB. + * The AP starts in real-mode, with + * CS selector set to the startup memory address/16; + * CS base set to startup memory address; + * CS limit set to 64KiB; + * CPL and IP set to 0. + */ +#include "mem.h" +#include "amd64l.h" + +/* + * Some machine instructions not handled well by [68][al]. + * This is a messy piece of code, requiring instructions in real mode, + * protected mode (+long mode on amd64). The MODE psuedo-op of 6[al] handles + * the latter two OK, but 'MODE $16' is incomplete, e.g. it does + * not truncate operands appropriately, hence the ugly 'rMOVAX' macro. + * Fortunately, the only other instruction executed in real mode that + * could cause a problem (ORL) is encoded such that it will work OK. + */ +#define DELAY BYTE $0xeb; /* JMP .+2 */ \ + BYTE $0x00 +#define NOP BYTE $0x90 /* NOP */ + +#define pFARJMP32(s, o) BYTE $0xea; /* far jmp ptr32:16 */ \ + LONG $o; WORD $s + +#define rFARJMP16(s, o) BYTE $0xea; /* far jump ptr16:16 */ \ + WORD $o; WORD $s; +#define rFARJMP32(s, o) BYTE $0x66; /* far jump ptr32:16 */ \ + pFARJMP32(s, o) +#define rLGDT(gdtptr) BYTE $0x0f; /* LGDT */ \ + BYTE $0x01; BYTE $0x16; \ + WORD $gdtptr +#define rMOVAX(i) BYTE $0xb8; /* i -> AX */ \ + WORD $i; + +/* + * Real mode. Welcome to 1978. + * Load a basic GDT, turn on protected mode and make + * inter-segment jump to the protected mode code. 
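+ * Roughly, in C-like terms, the sequence below is:
+ *	lgdt(&_gdtptr32p); cr0 |= Pe;
+ *	ds = es = fs = gs = ss = SSEL(SiDS, ...);
+ *	farjmp(SSEL(SiCS, ...), _protected);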
+ */ +MODE $16 + +TEXT _real<>(SB), 1, $-4 + rFARJMP16(0, _endofheader<>-KZERO(SB)) /* */ + +_startofheader: + NOP; NOP; NOP + QUAD $0xa5a5a5a5a5a5a5a5 + +TEXT _gdt32p<>(SB), 1, $-4 + QUAD $0x0000000000000000 /* NULL descriptor */ + QUAD $0x00cf9a000000ffff /* CS */ + QUAD $0x00cf92000000ffff /* DS */ + QUAD $0x0020980000000000 /* Long mode CS */ + +TEXT _gdtptr32p<>(SB), 1, $-4 + WORD $(4*8-1) /* includes long mode */ + LONG $_gdt32p<>-KZERO(SB) + +TEXT _gdt64<>(SB), 1, $-4 + QUAD $0x0000000000000000 /* NULL descriptor */ + QUAD $0x0020980000000000 /* CS */ + QUAD $0x0000800000000000 /* DS */ + +TEXT _gdtptr64v<>(SB), 1, $-4 + WORD $(3*8-1) + QUAD $_gdt64<>(SB) + +TEXT _endofheader<>(SB), 1, $-4 + MOVW CS, AX + MOVW AX, DS /* initialise DS */ + + rLGDT(_gdtptr32p<>-KZERO(SB)) /* load a basic gdt */ + + MOVL CR0, AX + ORL $Pe, AX + MOVL AX, CR0 /* turn on protected mode */ + DELAY /* JMP .+2 */ + + rMOVAX (SSEL(SiDS, SsTIGDT|SsRPL0)) /* */ + MOVW AX, DS + MOVW AX, ES + MOVW AX, FS + MOVW AX, GS + MOVW AX, SS + + rFARJMP32(SSEL(SiCS, SsTIGDT|SsRPL0), _protected<>-KZERO(SB)) + +/* + * Protected mode. Welcome to 1982. + * Get the local APIC ID from the memory mapped APIC; +#ifdef UseOwnPageTables + * load the PDB with the page table address, which is located + * in the word immediately preceeding _real<>-KZERO(SB); + * this is also the (physical) address of the top of stack; +#else + * load the PML4 with the shared page table address; +#endif + * make an identity map for the inter-segment jump below, + * using the stack space to hold a temporary PDP and PD; + * enable and activate long mode; + * make an inter-segment jump to the long mode code. + */ +MODE $32 + +/* + * Macros for accessing page table entries; must turn + * the C-style array-index macros into a page table byte + * offset. + */ +#define PML4O(v) ((PTLX((v), 3))<<3) +#define PDPO(v) ((PTLX((v), 2))<<3) +#define PDO(v) ((PTLX((v), 1))<<3) +#define PTO(v) ((PTLX((v), 0))<<3) + +TEXT _protected<>(SB), 1, $-4 + MOVL $0xfee00000, BP /* apicbase */ + MOVL 0x20(BP), BP /* Id */ + SHRL $24, BP /* becomes RARG later */ +//MOVL $_real<>-KZERO(SB), CX +//MOVL BX, -4(CX) +//_spin: JMP _spin + +#ifdef UseOwnPageTables + MOVL $_real<>-KZERO(SB), AX + MOVL -4(AX), SI /* page table PML4 */ +#else + MOVL $(0x00100000+MACHSTKSZ), SI /* page table PML4 */ +#endif + + MOVL SI, AX + MOVL AX, CR3 /* load the mmu */ + + MOVL AX, DX + SUBL $MACHSTKSZ, DX /* PDP for identity map */ + ADDL $(PteRW|PteP), DX + MOVL DX, PML4O(0)(AX) /* PML4E for identity map */ + + SUBL $MACHSTKSZ, AX /* PDP for identity map */ + ADDL $PTSZ, DX + MOVL DX, PDPO(0)(AX) /* PDPE for identity map */ + MOVL $(PtePS|PteRW|PteP), DX + ADDL $PTSZ, AX /* PD for identity map */ + MOVL DX, PDO(0)(AX) /* PDE for identity 0-[24]MiB */ + +/* + * Enable and activate Long Mode. From the manual: + * make sure Page Size Extentions are off, and Page Global + * Extensions and Physical Address Extensions are on in CR4; + * set Long Mode Enable in the Extended Feature Enable MSR; + * set Paging Enable in CR0; + * make an inter-segment jump to the Long Mode code. + * It's all in 32-bit mode until the jump is made. + */ +TEXT _lme<>(SB), 1, $-4 + MOVL CR4, AX + ANDL $~Pse, AX /* Page Size */ + ORL $(Pge|Pae), AX /* Page Global, Phys. 
Address */ + MOVL AX, CR4 + + MOVL $Efer, CX /* Extended Feature Enable */ + RDMSR + ORL $Lme, AX /* Long Mode Enable */ + WRMSR + + MOVL CR0, DX + ANDL $~(Cd|Nw|Ts|Mp), DX + ORL $(Pg|Wp), DX /* Paging Enable */ + MOVL DX, CR0 + + pFARJMP32(SSEL(3, SsTIGDT|SsRPL0), _identity<>-KZERO(SB)) + +/* + * Long mode. Welcome to 2003. + * Jump out of the identity map space; + * load a proper long mode GDT; + * zap the identity map; + * initialise the stack and call the + * C startup code in m->splpc. + */ +MODE $64 + +TEXT _identity<>(SB), 1, $-4 + MOVQ $_start64v<>(SB), AX + JMP* AX + +TEXT _start64v<>(SB), 1, $-4 + MOVQ $_gdtptr64v<>(SB), AX + MOVL (AX), GDTR + + XORQ DX, DX + MOVW DX, DS /* not used in long mode */ + MOVW DX, ES /* not used in long mode */ + MOVW DX, FS + MOVW DX, GS + MOVW DX, SS /* not used in long mode */ + + MOVLQZX SI, SI /* PML4-KZERO */ + MOVQ SI, AX + ADDQ $KZERO, AX /* PML4 and top of stack */ + + MOVQ AX, SP /* set stack */ + + MOVQ DX, PML4O(0)(AX) /* zap identity map */ + + MOVQ SI, CR3 /* flush TLB */ +#ifndef UseOwnPageTables + /* + * SI still points to the base of the bootstrap + * processor page tables. + * Want to use that for clearing the identity map, + * but want to use the passed-in address for + * setting up the stack and Mach. + */ + MOVQ $_real<>(SB), AX + MOVL -4(AX), SI /* PML4 */ + MOVLQZX SI, SI /* PML4-KZERO */ +#endif + MOVQ SI, AX + ADDQ $KZERO, AX /* PML4 and top of stack */ + + MOVQ AX, SP /* set stack */ + + ADDQ $(4*PTSZ+4*KiB), AX /* PML4+PDP+PD+PT+vsvm */ + MOVQ AX, RMACH /* Mach */ + MOVQ DX, RUSER + + PUSHQ DX /* clear flags */ + POPFQ + + MOVLQZX RARG, RARG /* APIC ID */ + PUSHQ RARG /* apicno */ + + MOVQ 8(RMACH), AX /* m->splpc */ + CALL* AX /* CALL squidboy(SB) */ + +_ndnr: + JMP _ndnr diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/linuxarchsys.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/linuxarchsys.c Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,44 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +#include "../port/error.h" +#include +#include "ureg.h" + +/* from linux */ +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 + +void +arch_prctl(Ar0*ar, va_list list) +{ + uintptr va; + int code; + code = va_arg(list, int); + va = va_arg(list, uintptr); + if (up->linux & 128) print("%d:arch_prctl code %x va %p: ", up->pid, code, va); + if (code < ARCH_SET_GS || code > ARCH_GET_GS) + error("Bad code!"); + /* always make sure it's a valid address, no matter what the command */ + validaddr((void *)va, 8, code > ARCH_SET_FS); + + if (code > ARCH_SET_FS) { + uvlong val; + val = rdmsr(code == ARCH_GET_FS ? 0xC0000100 : 0xC0000101); + memmove((void *)va, &val, sizeof(uvlong)); + if (up->linux & 128) print("get %#p\n", (void *)val); + } else { + if (code == ARCH_SET_GS) + error("Can't set GS yet"); + wrmsr(code == ARCH_SET_FS ? 0xC0000100 : 0xC0000101, va); + if (up->linux & 128) print("set\n"); + } + + ar->i = 0; +} + diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/linuxsyscall.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/linuxsyscall.c Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,241 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +#include "../port/error.h" + +#include "/sys/src/libc/9syscall/sys.h" +#include "linuxsystab.h" + +#include +#include "amd64.h" +#include "ureg.h" + +/* linux calling convention is callee-save. 
We are caller-save. + * But that issue is covered in trap, which saves everything. + * so we only need to know the calling conventions. + * when we call a(1,2,3,4,5,6). NO on-stack params. + movl $6, %r9d + movl $5, %r8d + movl $4, %r10 + movl $3, %edx + movl $2, %esi + movl $1, %edi + * syscall is in %ax however. + * return is in %ax + */ + +void +linuxsyscall(unsigned int, Ureg* ureg) +{ + void noted(Ureg*, uintptr); + unsigned int scallnr; + void notify(Ureg *); + char *e; + uintptr sp; + int s; + Ar0 ar0; + static Ar0 zar0; + int i; + uintptr linuxargs[6]; + +//print("linuxsyscall: wrong %d\n", wrong); +//dumpstack(); + if(!userureg(ureg)) + panic("syscall: cs %#llux\n", ureg->cs); + + cycles(&up->kentry); + + m->syscall++; + up->nsyscall++; + up->nqsyscall++; + up->insyscall = 1; + up->pc = ureg->ip; + up->dbgreg = ureg; + + if(up->procctl == Proc_tracesyscall){ + up->procctl = Proc_stopme; + procctl(up); + } + scallnr = ureg->ax; +//print("# %d\n", scallnr); + up->scallnr = scallnr; + + if(scallnr == 56) + fpusysrfork(ureg); + spllo(); + + sp = ureg->sp; + up->nerrlab = 0; + ar0 = zar0; + if(!waserror()){ + int printarg; + char *name = scallnr < nlinuxsyscall ? linuxsystab[scallnr].n : "Unknown"; + if(scallnr >= nlinuxsyscall || linuxsystab[scallnr].f == nil){ + pprint("bad linux sys call number %d(%s) pc %#ullx max %d\n", + scallnr, name, ureg->ip, nlinuxsyscall); + postnote(up, 1, "sys: bad sys call", NDebug); + error(Ebadarg); + } + + if(sp < (USTKTOP-BIGPGSZ) || sp > (USTKTOP-sizeof(up->arg)-BY2SE)) + validaddr(UINT2PTR(sp), sizeof(up->arg)+BY2SE, 0); + + up->psstate = linuxsystab[scallnr].n; + + linuxargs[0] = ureg->di; + linuxargs[1] = ureg->si; + linuxargs[2] = ureg->dx; + linuxargs[3] = ureg->r10; + linuxargs[4] = ureg->r8; + linuxargs[5] = ureg->r9; + + if (up->linux & 16) {print("%d:linux: %s: pc %#p ", up->pid, linuxsystab[scallnr].n,(void *)ureg->ip); + for(printarg = 0; printarg < linuxsystab[scallnr].narg; printarg++) + print("%p ", (void *)linuxargs[printarg]); + print("\n"); + } + if (up->linux&32) dumpregs(ureg); + linuxsystab[scallnr].f(&ar0, (va_list)linuxargs); + if (up->linux & 64){print("AFTER: ");dumpregs(ureg);} + poperror(); + }else{ + /* failure: save the error buffer for errstr */ + if (up->linux & 16){ + int i; + print("Error path in linuxsyscall: %#ux, %s\n", scallnr, up->syserrstr ? up->syserrstr : "no errstr"); + for(i = 0; i < nelem(linuxargs); i++) + print("%d: %#p\n", i, linuxargs[i]); + dumpregs(ureg); + } + e = up->syserrstr; + up->syserrstr = up->errstr; + up->errstr = e; + if (scallnr < nlinuxsyscall) + ar0 = linuxsystab[scallnr].r; + else + ar0.i = -1; + } + + /* normal amd64 kernel does not have this; remove? */ + if(up->nerrlab){ + print("bad errstack [%d]: %d extra\n", scallnr, up->nerrlab); + for(i = 0; i < NERR; i++) + print("sp=%#ullx pc=%#ullx\n", + up->errlab[i].sp, up->errlab[i].pc); + panic("error stack"); + } + + /* + * NIX: for the execac() syscall, what follows is done within + * the system call, because it never returns. + * See acore.c:/^retfromsyscall + */ + + noerrorsleft(); + /* + * Put return value in frame. 
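+	 * Linux expects the syscall result (or a negative errno) in RAX;
+	 * whatever ended up in ar0, either from the handler or from the
+	 * table's default error value, is stored into ureg->ax here.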
+ */ + ureg->ax = ar0.p; + if (up->linux & 16)print("%d:Ret from syscall %#lx\n", up->pid, (unsigned long) ar0.p); + if(up->procctl == Proc_tracesyscall){ + up->procctl = Proc_stopme; + s = splhi(); + procctl(up); + splx(s); + }else if(up->procctl == Proc_totc || up->procctl == Proc_toac) + procctl(up); + + + up->insyscall = 0; + up->psstate = 0; + + if(scallnr == NOTED) + noted(ureg, *(uintptr*)(sp+BY2SE)); + + splhi(); + + if(scallnr != 56 && (up->procctl || up->nnote)) + notify(ureg); + + /* if we delayed sched because we held a lock, sched now */ + if(up->delaysched){ + sched(); + splhi(); + } + kexit(ureg); +} + +void* +linuxsysexecregs(uintptr entry, ulong ssize, ulong nargs) +{ + int i; + uvlong *l; + Ureg *ureg; + uintptr *sp; + + if(!up->linux) + panic("linuxsysexecregs: up->linux %d\n", up->linux); + + /* need to figure out linux exec conventions :-( */ + sp = (uintptr*)(USTKTOP - ssize); + *--sp = nargs; + + ureg = up->dbgreg; + l = &ureg->bp; + print("Starting linux proc pc %#ullx sp %p nargs %ld\n", + ureg->ip, sp+1, nargs); + + /* set up registers for linux */ + /* we are dying in getenv. */ + /* because glibc does not follow the PPC ABI. */ + /* you have to push the env, then the args. */ + /* so to do this, well, we'll push an empty env on stack, i.e. shift + * the args down one. stack grows down. We already made space + * when we pushed nargs. + */ + memmove(sp, sp+1, nargs * sizeof(*sp)); + sp[nargs] = 0; + *--sp = nargs; + for(i = 7; i < 16; i++) + *l++ = 0xdeadbeef + (i*0x110); + + ureg->sp = PTR2UINT(sp); + ureg->ip = entry; + print("Starting linux proc pc %#ullx\n", ureg->ip); + + /* + */ + return UINT2PTR(nargs); +} + +void +linuxsysrforkchild(Proc* child, Proc* parent) +{ + Ureg *cureg; + + /* don't clear linux any more. linux procs can now fork */ + child->linuxexec = 0; + /* + * Add 3*BY2SE to the stack to account for + * - the return PC + * - trap's arguments (syscallnr, ureg) + */ + child->sched.sp = PTR2UINT(child->kstack+KSTACK-(sizeof(Ureg)+3*BY2SE)); + child->sched.pc = PTR2UINT(sysrforkret); + + cureg = (Ureg*)(child->sched.sp+3*BY2SE); + memmove(cureg, parent->dbgreg, sizeof(Ureg)); + + /* Things from bottom of syscall which were never executed */ + child->psstate = 0; + child->insyscall = 0; + + cureg->ax = 0; + child->hang = 1; + + dumpregs(cureg); + fpusysrforkchild(child, parent); +} diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/linuxsystab.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/linuxsystab.h Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,361 @@ +typedef void Syscall(Ar0*, va_list); + +Syscall linuxuname; +Syscall linuxbrk; +Syscall linuxopen; +Syscall syssegbrk; +Syscall linuxwritev; +Syscall linuxsocketcall; +Syscall linuxgeteuid; +Syscall linuxmmap; +Syscall linuxexit; +Syscall linuxsettid; +Syscall sigaction; +Syscall rt_sigprocmask; +Syscall fstat64; +Syscall returnok; +Syscall futex; +Syscall linuxgetpersonality; +Syscall linuxr2c; +Syscall linuxc2r; +Syscall linuxprocid; +Syscall linuxranks2coords; +Syscall linuxmprotect; +Syscall linuxclone; +Syscall gasm; +Syscall linuxcga; +Syscall getrusage; +Syscall arch_prctl; +extern Syscall sys_write; +extern Syscall sys_read; +extern Syscall sysopen; +extern Syscall syspread; + +struct syscall { + char* n; + Syscall*f; + int narg; + Ar0 r; +}; + +struct syscall linuxsystab[] = { + [0] {"read", sys_read, 3, {.i = 0}}, + [1] {"write", sys_write, 3, {.i = -1}}, + [2] {"linuxopen", sysopen, 2, {.i = -1}}, + [102] {"getuid", linuxgeteuid, 0, {.i = -1}}, + [12] {"linuxbrk", 
linuxbrk, 1, {.i = -1}}, + [104] {"getgid", linuxgeteuid, 0, {.i = -1}}, + [107] {"geteuid", linuxgeteuid, 0, {.i = -1}}, + [108] {"getegid", linuxgeteuid, 0, {.i = -1}}, + [98] {"getrusage", getrusage, 1, {.i = 0}}, + [9] {"mmap", linuxmmap, 5, {.i = -1}}, + [63] {"linuxuname", linuxuname, 1, {.i = -1}}, + [13] {"sigaction", sigaction, 2, {.i = -1}}, + [14] {"rt_sigprocmask", rt_sigprocmask, 4, {.l = -1}}, + [60] {"linuxexit", linuxexit, 1, {.i = -1}}, + [17] {"pread64", syspread, 5, {.i = 0}}, +// [102] {"socketcall", linuxsocketcall, 1, {.i = 0}}, + [56] {"clone", linuxclone, 1, {.i = 0}}, + [10] {"mprotect", linuxmprotect, 1, {.i = 0}}, + [20] {"writev", linuxwritev, 1, {.i = 0}}, +// [197] {"fstat64", fstat64, 1, {.i = -1}}, +// [221] {"futex", futex, 1, {.i = 0}}, + [158] {"arch_prctl", arch_prctl, 2, {.p = (void *)-1}}, + + [3] {"close", nil, 1, {.p = (void *)-1}}, + [4] {"stat", nil, 1, {.p = (void *)-1}}, + [5] {"fstat", nil, 1, {.p = (void *)-1}}, + [6] {"lstat", nil, 1, {.p = (void *)-1}}, + [7] {"poll", nil, 1, {.p = (void *)-1}}, + [8] {"lseek", nil, 1, {.p = (void *)-1}}, + + [11] {"munmap", nil, 1, {.p = (void *)-1}}, + + [15] {"rt_sigreturn", nil, 1, {.p = (void *)-1}}, + [16] {"ioctl", nil, 1, {.p = (void *)-1}}, + + [18] {"pwrite64", nil, 1, {.p = (void *)-1}}, + [19] {"readv", nil, 1, {.p = (void *)-1}}, + + [21] {"access", nil, 1, {.p = (void *)-1}}, + [22] {"pipe", nil, 1, {.p = (void *)-1}}, + [23] {"select", nil, 1, {.p = (void *)-1}}, + [24] {"sched_yield", nil, 1, {.p = (void *)-1}}, + [25] {"mremap", nil, 1, {.p = (void *)-1}}, + [26] {"msync", nil, 1, {.p = (void *)-1}}, + [27] {"mincore", nil, 1, {.p = (void *)-1}}, + [28] {"madvise", nil, 1, {.p = (void *)-1}}, + [29] {"shmget", nil, 1, {.p = (void *)-1}}, + [30] {"shmat", nil, 1, {.p = (void *)-1}}, + [31] {"shmctl", nil, 1, {.p = (void *)-1}}, + [32] {"dup", nil, 1, {.p = (void *)-1}}, + [33] {"dup2", nil, 1, {.p = (void *)-1}}, + [34] {"pause", nil, 1, {.p = (void *)-1}}, + [35] {"nanosleep", nil, 1, {.p = (void *)-1}}, + [36] {"getitimer", nil, 1, {.p = (void *)-1}}, + [37] {"alarm", nil, 1, {.p = (void *)-1}}, + [38] {"setitimer", nil, 1, {.p = (void *)-1}}, + [39] {"getpid", nil, 1, {.p = (void *)-1}}, + [40] {"sendfile", nil, 1, {.p = (void *)-1}}, + [41] {"socket", nil, 1, {.p = (void *)-1}}, + [42] {"connect", nil, 1, {.p = (void *)-1}}, + [43] {"accept", nil, 1, {.p = (void *)-1}}, + [44] {"sendto", nil, 1, {.p = (void *)-1}}, + [45] {"recvfrom", nil, 1, {.p = (void *)-1}}, + [46] {"sendmsg", nil, 1, {.p = (void *)-1}}, + [47] {"recvmsg", nil, 1, {.p = (void *)-1}}, + [48] {"shutdown", nil, 1, {.p = (void *)-1}}, + [49] {"bind", nil, 1, {.p = (void *)-1}}, + [50] {"listen", nil, 1, {.p = (void *)-1}}, + [51] {"getsockname", nil, 1, {.p = (void *)-1}}, + [52] {"getpeername", nil, 1, {.p = (void *)-1}}, + [53] {"socketpair", nil, 1, {.p = (void *)-1}}, + [54] {"setsockopt", nil, 1, {.p = (void *)-1}}, + [55] {"getsockopt", nil, 1, {.p = (void *)-1}}, + + [57] {"fork", nil, 1, {.p = (void *)-1}}, + [58] {"vfork", nil, 1, {.p = (void *)-1}}, + [59] {"execve", nil, 1, {.p = (void *)-1}}, + + [61] {"wait4", nil, 1, {.p = (void *)-1}}, + [62] {"kill", nil, 1, {.p = (void *)-1}}, + + [64] {"semget", nil, 1, {.p = (void *)-1}}, + [65] {"semop", nil, 1, {.p = (void *)-1}}, + [66] {"semctl", nil, 1, {.p = (void *)-1}}, + [67] {"shmdt", nil, 1, {.p = (void *)-1}}, + [68] {"msgget", nil, 1, {.p = (void *)-1}}, + [69] {"msgsnd", nil, 1, {.p = (void *)-1}}, + [70] {"msgrcv", nil, 1, {.p = (void *)-1}}, + [71] 
{"msgctl", nil, 1, {.p = (void *)-1}}, + [72] {"fcntl", nil, 1, {.p = (void *)-1}}, + [73] {"flock", nil, 1, {.p = (void *)-1}}, + [74] {"fsync", nil, 1, {.p = (void *)-1}}, + [75] {"fdatasync", nil, 1, {.p = (void *)-1}}, + [76] {"truncate", nil, 1, {.p = (void *)-1}}, + [77] {"ftruncate", nil, 1, {.p = (void *)-1}}, + [78] {"getdents", nil, 1, {.p = (void *)-1}}, + [79] {"getcwd", nil, 1, {.p = (void *)-1}}, + [80] {"chdir", nil, 1, {.p = (void *)-1}}, + [81] {"fchdir", nil, 1, {.p = (void *)-1}}, + [82] {"rename", nil, 1, {.p = (void *)-1}}, + [83] {"mkdir", nil, 1, {.p = (void *)-1}}, + [84] {"rmdir", nil, 1, {.p = (void *)-1}}, + [85] {"creat", nil, 1, {.p = (void *)-1}}, + [86] {"link", nil, 1, {.p = (void *)-1}}, + [87] {"unlink", nil, 1, {.p = (void *)-1}}, + [88] {"symlink", nil, 1, {.p = (void *)-1}}, + [89] {"readlink", nil, 1, {.p = (void *)-1}}, + [90] {"chmod", nil, 1, {.p = (void *)-1}}, + [91] {"fchmod", nil, 1, {.p = (void *)-1}}, + [92] {"chown", nil, 1, {.p = (void *)-1}}, + [93] {"fchown", nil, 1, {.p = (void *)-1}}, + [94] {"lchown", nil, 1, {.p = (void *)-1}}, + [95] {"umask", nil, 1, {.p = (void *)-1}}, + [96] {"gettimeofday", nil, 1, {.p = (void *)-1}}, + [97] {"getrlimit", nil, 1, {.p = (void *)-1}}, + + [99] {"sysinfo", nil, 1, {.p = (void *)-1}}, + [100] {"times", nil, 1, {.p = (void *)-1}}, + [101] {"ptrace", nil, 1, {.p = (void *)-1}}, + + [103] {"syslog", nil, 1, {.p = (void *)-1}}, + + [105] {"setuid", nil, 1, {.p = (void *)-1}}, + [106] {"setgid", nil, 1, {.p = (void *)-1}}, + + [109] {"setpgid", nil, 1, {.p = (void *)-1}}, + [110] {"getppid", nil, 1, {.p = (void *)-1}}, + [111] {"getpgrp", nil, 1, {.p = (void *)-1}}, + [112] {"setsid", nil, 1, {.p = (void *)-1}}, + [113] {"setreuid", nil, 1, {.p = (void *)-1}}, + [114] {"setregid", nil, 1, {.p = (void *)-1}}, + [115] {"getgroups", nil, 1, {.p = (void *)-1}}, + [116] {"setgroups", nil, 1, {.p = (void *)-1}}, + [117] {"setresuid", nil, 1, {.p = (void *)-1}}, + [118] {"getresuid", nil, 1, {.p = (void *)-1}}, + [119] {"setresgid", nil, 1, {.p = (void *)-1}}, + [120] {"getresgid", nil, 1, {.p = (void *)-1}}, + [121] {"getpgid", nil, 1, {.p = (void *)-1}}, + [122] {"setfsuid", nil, 1, {.p = (void *)-1}}, + [123] {"setfsgid", nil, 1, {.p = (void *)-1}}, + [124] {"getsid", nil, 1, {.p = (void *)-1}}, + [125] {"capget", nil, 1, {.p = (void *)-1}}, + [126] {"capset", nil, 1, {.p = (void *)-1}}, + [127] {"rt_sigpending", nil, 1, {.p = (void *)-1}}, + [128] {"rt_sigtimedwait", nil, 1, {.p = (void *)-1}}, + [129] {"rt_sigqueueinfo", nil, 1, {.p = (void *)-1}}, + [130] {"rt_sigsuspend", nil, 1, {.p = (void *)-1}}, + [131] {"sigaltstack", nil, 1, {.p = (void *)-1}}, + [132] {"utime", nil, 1, {.p = (void *)-1}}, + [133] {"mknod", nil, 1, {.p = (void *)-1}}, + [134] {"uselib", nil, 1, {.p = (void *)-1}}, + [135] {"personality", nil, 1, {.p = (void *)-1}}, + [136] {"ustat", nil, 1, {.p = (void *)-1}}, + [137] {"statfs", nil, 1, {.p = (void *)-1}}, + [138] {"fstatfs", nil, 1, {.p = (void *)-1}}, + [139] {"sysfs", nil, 1, {.p = (void *)-1}}, + [140] {"getpriority", nil, 1, {.p = (void *)-1}}, + [141] {"setpriority", nil, 1, {.p = (void *)-1}}, + [142] {"sched_setparam", nil, 1, {.p = (void *)-1}}, + [143] {"sched_getparam", nil, 1, {.p = (void *)-1}}, + [144] {"sched_setscheduler", nil, 1, {.p = (void *)-1}}, + [145] {"sched_getscheduler", nil, 1, {.p = (void *)-1}}, + [146] {"sched_get_priority_max", nil, 1, {.p = (void *)-1}}, + [147] {"sched_get_priority_min", nil, 1, {.p = (void *)-1}}, + [148] {"sched_rr_get_interval", 
nil, 1, {.p = (void *)-1}}, + [149] {"mlock", nil, 1, {.p = (void *)-1}}, + [150] {"munlock", nil, 1, {.p = (void *)-1}}, + [151] {"mlockall", nil, 1, {.p = (void *)-1}}, + [152] {"munlockall", nil, 1, {.p = (void *)-1}}, + [153] {"vhangup", nil, 1, {.p = (void *)-1}}, + [154] {"modify_ldt", nil, 1, {.p = (void *)-1}}, + [155] {"pivot_root", nil, 1, {.p = (void *)-1}}, + [156] {"_sysctl", nil, 1, {.p = (void *)-1}}, + [157] {"prctl", nil, 1, {.p = (void *)-1}}, + + [159] {"adjtimex", nil, 1, {.p = (void *)-1}}, + [160] {"setrlimit", nil, 1, {.p = (void *)-1}}, + [161] {"chroot", nil, 1, {.p = (void *)-1}}, + [162] {"sync", nil, 1, {.p = (void *)-1}}, + [163] {"acct", nil, 1, {.p = (void *)-1}}, + [164] {"settimeofday", nil, 1, {.p = (void *)-1}}, + [165] {"mount", nil, 1, {.p = (void *)-1}}, + [166] {"umount2", nil, 1, {.p = (void *)-1}}, + [167] {"swapon", nil, 1, {.p = (void *)-1}}, + [168] {"swapoff", nil, 1, {.p = (void *)-1}}, + [169] {"reboot", nil, 1, {.p = (void *)-1}}, + [170] {"sethostname", nil, 1, {.p = (void *)-1}}, + [171] {"setdomainname", nil, 1, {.p = (void *)-1}}, + [172] {"iopl", nil, 1, {.p = (void *)-1}}, + [173] {"ioperm", nil, 1, {.p = (void *)-1}}, + [174] {"create_module", nil, 1, {.p = (void *)-1}}, + [175] {"init_module", nil, 1, {.p = (void *)-1}}, + [176] {"delete_module", nil, 1, {.p = (void *)-1}}, + [177] {"get_kernel_syms", nil, 1, {.p = (void *)-1}}, + [178] {"query_module", nil, 1, {.p = (void *)-1}}, + [179] {"quotactl", nil, 1, {.p = (void *)-1}}, + [180] {"nfsservctl", nil, 1, {.p = (void *)-1}}, + [181] {"getpmsg", nil, 1, {.p = (void *)-1}}, + [182] {"putpmsg", nil, 1, {.p = (void *)-1}}, + [183] {"afs_syscall", nil, 1, {.p = (void *)-1}}, + [184] {"tuxcall", nil, 1, {.p = (void *)-1}}, + [185] {"security", nil, 1, {.p = (void *)-1}}, + [186] {"gettid", nil, 1, {.p = (void *)-1}}, + [187] {"readahead", nil, 1, {.p = (void *)-1}}, + [188] {"setxattr", nil, 1, {.p = (void *)-1}}, + [189] {"lsetxattr", nil, 1, {.p = (void *)-1}}, + [190] {"fsetxattr", nil, 1, {.p = (void *)-1}}, + [191] {"getxattr", nil, 1, {.p = (void *)-1}}, + [192] {"lgetxattr", nil, 1, {.p = (void *)-1}}, + [193] {"fgetxattr", nil, 1, {.p = (void *)-1}}, + [194] {"listxattr", nil, 1, {.p = (void *)-1}}, + [195] {"llistxattr", nil, 1, {.p = (void *)-1}}, + [196] {"flistxattr", nil, 1, {.p = (void *)-1}}, + [197] {"removexattr", nil, 1, {.p = (void *)-1}}, + [198] {"lremovexattr", nil, 1, {.p = (void *)-1}}, + [199] {"fremovexattr", nil, 1, {.p = (void *)-1}}, + [200] {"tkill", nil, 1, {.p = (void *)-1}}, + [201] {"time", nil, 1, {.p = (void *)-1}}, + [202] {"futex", nil, 1, {.p = (void *)-1}}, + [203] {"sched_setaffinity", nil, 1, {.p = (void *)-1}}, + [204] {"sched_getaffinity", nil, 1, {.p = (void *)-1}}, + [205] {"set_thread_area", nil, 1, {.p = (void *)-1}}, + [206] {"io_setup", nil, 1, {.p = (void *)-1}}, + [207] {"io_destroy", nil, 1, {.p = (void *)-1}}, + [208] {"io_getevents", nil, 1, {.p = (void *)-1}}, + [209] {"io_submit", nil, 1, {.p = (void *)-1}}, + [210] {"io_cancel", nil, 1, {.p = (void *)-1}}, + [211] {"get_thread_area", nil, 1, {.p = (void *)-1}}, + [212] {"lookup_dcookie", nil, 1, {.p = (void *)-1}}, + [213] {"epoll_create", nil, 1, {.p = (void *)-1}}, + [214] {"epoll_ctl_old", nil, 1, {.p = (void *)-1}}, + [215] {"epoll_wait_old", nil, 1, {.p = (void *)-1}}, + [216] {"remap_file_pages", nil, 1, {.p = (void *)-1}}, + [217] {"getdents64", nil, 1, {.p = (void *)-1}}, + [218] {"set_tid_address", nil, 1, {.p = (void *)-1}}, + [219] {"restart_syscall", nil, 1, {.p = 
(void *)-1}}, + [220] {"semtimedop", nil, 1, {.p = (void *)-1}}, + [221] {"fadvise64", nil, 1, {.p = (void *)-1}}, + [222] {"timer_create", nil, 1, {.p = (void *)-1}}, + [223] {"timer_settime", nil, 1, {.p = (void *)-1}}, + [224] {"timer_gettime", nil, 1, {.p = (void *)-1}}, + [225] {"timer_getoverrun", nil, 1, {.p = (void *)-1}}, + [226] {"timer_delete", nil, 1, {.p = (void *)-1}}, + [227] {"clock_settime", nil, 1, {.p = (void *)-1}}, + [228] {"clock_gettime", nil, 1, {.p = (void *)-1}}, + [229] {"clock_getres", nil, 1, {.p = (void *)-1}}, + [230] {"clock_nanosleep", nil, 1, {.p = (void *)-1}}, + [231] {"exit_group", nil, 1, {.p = (void *)-1}}, + [232] {"epoll_wait", nil, 1, {.p = (void *)-1}}, + [233] {"epoll_ctl", nil, 1, {.p = (void *)-1}}, + [234] {"tgkill", nil, 1, {.p = (void *)-1}}, + [235] {"utimes", nil, 1, {.p = (void *)-1}}, + [236] {"vserver", nil, 1, {.p = (void *)-1}}, + [237] {"mbind", nil, 1, {.p = (void *)-1}}, + [238] {"set_mempolicy", nil, 1, {.p = (void *)-1}}, + [239] {"get_mempolicy", nil, 1, {.p = (void *)-1}}, + [240] {"mq_open", nil, 1, {.p = (void *)-1}}, + [241] {"mq_unlink", nil, 1, {.p = (void *)-1}}, + [242] {"mq_timedsend", nil, 1, {.p = (void *)-1}}, + [243] {"mq_timedreceive", nil, 1, {.p = (void *)-1}}, + [244] {"mq_notify", nil, 1, {.p = (void *)-1}}, + [245] {"mq_getsetattr", nil, 1, {.p = (void *)-1}}, + [246] {"kexec_load", nil, 1, {.p = (void *)-1}}, + [247] {"waitid", nil, 1, {.p = (void *)-1}}, + [248] {"add_key", nil, 1, {.p = (void *)-1}}, + [249] {"request_key", nil, 1, {.p = (void *)-1}}, + [250] {"keyctl", nil, 1, {.p = (void *)-1}}, + [251] {"ioprio_set", nil, 1, {.p = (void *)-1}}, + [252] {"ioprio_get", nil, 1, {.p = (void *)-1}}, + [253] {"inotify_init", nil, 1, {.p = (void *)-1}}, + [254] {"inotify_add_watch", nil, 1, {.p = (void *)-1}}, + [255] {"inotify_rm_watch", nil, 1, {.p = (void *)-1}}, + [256] {"migrate_pages", nil, 1, {.p = (void *)-1}}, + [257] {"openat", nil, 1, {.p = (void *)-1}}, + [258] {"mkdirat", nil, 1, {.p = (void *)-1}}, + [259] {"mknodat", nil, 1, {.p = (void *)-1}}, + [260] {"fchownat", nil, 1, {.p = (void *)-1}}, + [261] {"futimesat", nil, 1, {.p = (void *)-1}}, + [262] {"newfstatat", nil, 1, {.p = (void *)-1}}, + [263] {"unlinkat", nil, 1, {.p = (void *)-1}}, + [264] {"renameat", nil, 1, {.p = (void *)-1}}, + [265] {"linkat", nil, 1, {.p = (void *)-1}}, + [266] {"symlinkat", nil, 1, {.p = (void *)-1}}, + [267] {"readlinkat", nil, 1, {.p = (void *)-1}}, + [268] {"fchmodat", nil, 1, {.p = (void *)-1}}, + [269] {"faccessat", nil, 1, {.p = (void *)-1}}, + [270] {"pselect6", nil, 1, {.p = (void *)-1}}, + [271] {"ppoll", nil, 1, {.p = (void *)-1}}, + [272] {"unshare", nil, 1, {.p = (void *)-1}}, + [273] {"set_robust_list", nil, 1, {.p = (void *)-1}}, + [274] {"get_robust_list", nil, 1, {.p = (void *)-1}}, + [275] {"splice", nil, 1, {.p = (void *)-1}}, + [276] {"tee", nil, 1, {.p = (void *)-1}}, + [277] {"sync_file_range", nil, 1, {.p = (void *)-1}}, + [278] {"vmsplice", nil, 1, {.p = (void *)-1}}, + [279] {"move_pages", nil, 1, {.p = (void *)-1}}, + [280] {"utimensat", nil, 1, {.p = (void *)-1}}, + [281] {"epoll_pwait", nil, 1, {.p = (void *)-1}}, + [282] {"signalfd", nil, 1, {.p = (void *)-1}}, + [283] {"timerfd_create", nil, 1, {.p = (void *)-1}}, + [284] {"eventfd", nil, 1, {.p = (void *)-1}}, + [285] {"fallocate", nil, 1, {.p = (void *)-1}}, + [286] {"timerfd_settime", nil, 1, {.p = (void *)-1}}, + [287] {"timerfd_gettime", nil, 1, {.p = (void *)-1}}, + [288] {"accept4", nil, 1, {.p = (void *)-1}}, + [289] 
{"signalfd4", nil, 1, {.p = (void *)-1}}, + [290] {"eventfd2", nil, 1, {.p = (void *)-1}}, + [291] {"epoll_create1", nil, 1, {.p = (void *)-1}}, + [292] {"dup3", nil, 1, {.p = (void *)-1}}, + [293] {"pipe2", nil, 1, {.p = (void *)-1}}, + [294] {"inotify_init1", nil, 1, {.p = (void *)-1}}, + [295] {"preadv", nil, 1, {.p = (void *)-1}}, + [296] {"pwritev", nil, 1, {.p = (void *)-1}}, + [297] {"rt_tgsigqueueinfo", nil, 1, {.p = (void *)-1}}, + [298] {"perf_event_open", nil, 1, {.p = (void *)-1}}, + [299] {"recvmmsg", nil, 1, {.p = (void *)-1}}, + +}; + +int nlinuxsyscall = nelem(linuxsystab); \ No newline at end of file diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/physalloc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/physalloc.c Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,488 @@ +/* + * Buddy allocator from Knuth. + * Written by jmk. + * being adapted by nemo so we could have one per + * ACPI affinity domain, to color pages depending on their + * NUMA location. + * + */ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "acpi.h" + +#define ISPOWEROF2(x) (((x) != 0) && !((x) & ((x)-1))) +#define UNO ((uintmem)1) + +enum { + BKmin = 21, /* Minimum lg2 */ + BKmax = 30, /* Maximum lg2 */ + + Ndoms = 16, /* Max # of domains */ +}; + + +#define INDEX(b, v) ((uint)(((v))/(b)->bminsz)) +#define BLOCK(b, i) ((i)-INDEX((b),(b)->memory)) + +typedef struct Buddy Buddy; +struct Buddy { + int tag; + int kval; + uint next; + uint prev; +}; + +/* + * Bals should allocate using its base address as 0. + * For now, all of them refer to the entire memory and we record + * the base and size for each one. + */ +typedef struct Bal Bal; +struct Bal { + uintmem base; + u64int size; + usize nfree; + usize nblocks; + int kmin; /* Minimum lg2 */ + int kmax; /* Maximum lg2 */ + uintmem bminsz; /* minimum block sz */ + uintmem memory; + uint kspan; + + Buddy* blocks; + Buddy* avail; +}; + +static Bal bal[Ndoms]; +static int ndoms; +static Lock budlock; + +char* +seprintphysstats(char *s, char *e) +{ + Bal *b; + int i; + + lock(&budlock); + for(i = 0; i < Ndoms; i++){ + b = &bal[i]; + if(b->size > 0) + s = seprint(s, e, "%uld/%uld %ulldK color %d blocks avail\n", + b->nfree, b->nblocks, b->bminsz/KiB, i); + } + unlock(&budlock); + return s; +} + +static void +xphysfree(Bal *b, uintmem data, u64int size) +{ + uint i; + Buddy *l, *p; + Buddy *blocks, *avail; + + DBG("physfree\n"); + + /* + * Knuth's Algorithm S (Buddy System Liberation). + */ + blocks = b->blocks; + avail = b->avail; + + if(data == 0 /*|| !ALIGNED(data, b->bminsz)*/) + return; + i = INDEX(b,data); + + lock(&budlock); +S1: + /* + * Find buddy. + */ + l = &blocks[BLOCK(b,i)]; + DBG("\tbsl: BLOCK(b,i) %d index %ulld kval %d\n", + BLOCK(b,i), BLOCK(b,i)/((1<kval)/b->bminsz), l->kval); + if((BLOCK(b,i)/((1<kval)/b->bminsz)) & 1) /* simpler test? */ + p = l - (1<kval)/b->bminsz; + else + p = l + (1<kval)/(b->bminsz); + DBG("\tbsl: l @ %ld buddy @ %ld\n", l - blocks, p - blocks); + + /* + * Is buddy available? + * Can't merge if: + * this is the largest block; + * buddy isn't free; + * buddy has been subsequently split again. + */ + if(l->kval == b->kmax || p->tag == 0 || (p->tag == 1 && p->kval != l->kval)){ + /* + * Put on list. 
+ */ + l->tag = 1; + l->next = avail[l->kval].next; + l->prev = 0; + if(l->next != 0) + blocks[BLOCK(b,l->next)].prev = i; + avail[l->kval].next = i; + + b->nfree += size/b->bminsz; + + unlock(&budlock); + DBG("bsl: free @ i %d BLOCK(b,i) %d kval %d next %d tag %d\n", + i, BLOCK(b,i), l->kval, l->next, l->tag); + return; + } + + /* + * Combine with buddy. + * This removes block P from the avail list. + */ + if(p->prev != 0){ + blocks[BLOCK(b,p->prev)].next = p->next; + p->prev = 0; + } + else + avail[p->kval].next = 0; + if(p->next != 0){ + blocks[BLOCK(b,p->next)].prev = p->prev; + p->next = 0; + } + p->tag = 0; + + /* + * Now can try to merge this larger block. + k++; + */ + DBG("\tbsl: l @ %ld p @ %ld\n", l - blocks, p - blocks); + if(p < l) + l = p; + i = l - blocks + INDEX(b,b->memory); + l->kval++; + DBG("bsl: merge @ i %d BLOCK(b,i) %d kval %d next %d tag %d\n", + i, BLOCK(b,i), l->kval, l->next, l->tag); + goto S1; +} + +void +physfree(uintmem data, u64int size) +{ + Bal *b; + int i; + + for(i = 0; i < Ndoms; i++){ + b = &bal[i]; + if(b->base <= data && data < b->base + b->size){ + xphysfree(b, data, size); + return; + } + } + panic("physfree: no bal"); +} + + +static uchar lg2table[256] = { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static int +lg2floor(u64int w) +{ + u64int hi, lo; + + if((lo = (w>>48)) != 0){ + if((hi = (lo>>8)) != 0) + return 56+lg2table[hi]; + return 48+lg2table[lo]; + } + if((lo = (w>>32)) != 0){ + if((hi = (lo>>8)) != 0) + return 40+lg2table[hi]; + return 32+lg2table[lo]; + } + if((lo = (w>>16)) != 0){ + if((hi = (lo>>8)) != 0) + return 24+lg2table[hi]; + return 16+lg2table[lo]; + } + if((hi = (w>>8)) != 0) + return 8+lg2table[hi]; + return lg2table[w]; +} + +static uintmem +xphysalloc(Bal *b, u64int size) +{ + uint i, j, k; + Buddy *l, *p; + Buddy *avail, *blocks; + uintmem m; + + DBG("physalloc\n"); + assert(b->size > 0); + + avail = b->avail; + blocks = b->blocks; + + /* + * Knuth's Algorithm R (Buddy System Reservation). + */ + if(size < b->bminsz) + size = b->bminsz; + + /* + * Find block. + */ + if(!ISPOWEROF2(size)) + return 0; + k = lg2floor(size); + + lock(&budlock); + for(j = k; j <= b->kmax; j++){ + if(avail[j].next != 0) + break; + } + DBG("bsr: size %#llud k %d j %d\n", size, k, j); + if(j > b->kmax){ + unlock(&budlock); + return 0; + } + + /* + * Remove from list. + */ + i = avail[j].next; + l = &blocks[BLOCK(b,i)]; + DBG("bsr: block @ i %d BLOCK(b,i) %d kval %d next %d tag %d\n", + i, BLOCK(b,i), l->kval, l->next, l->tag); + avail[j].next = l->next; + blocks[avail[j].next].prev = 0; + l->prev = l->next = 0; + l->tag = 0; + l->kval = k; + + /* + * Split required? + */ + while(j > k){ + /* + * Split. 
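+		 * Halve the block and put the upper buddy back on its
+		 * free list; e.g. with BKmin 21 (2MiB units), carving a
+		 * 2MiB request out of an 8MiB block frees a 4MiB and
+		 * then a 2MiB buddy before the bottom 2MiB is returned.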
+ */ + j--; + p = &blocks[BLOCK(b,i) + (UNO<bminsz)]; + p->tag = 1; + p->kval = j; + p->next = avail[j].next; + p->prev = 0; + if(p->next != 0) + blocks[BLOCK(b,p->next)].prev = i + (UNO<bminsz); + avail[j].next = i + (UNO<bminsz); + DBG("bsr: split @ i %d BLOCK(b,i) %ld j %d next %d (%d) tag %d\n", + i, p - blocks, j, p->next, BLOCK(b,p->next), p->tag); + } + b->nfree -= size/b->bminsz; + unlock(&budlock); + + m = b->memory + b->bminsz*BLOCK(b,i); + assert(m >= b->base && m < b->base + b->size); + return m; +} + +uintmem +physalloc(u64int size, int *colorp) +{ + int i, color; + uintmem m; + + m = 0; + color = *colorp; + if(color >= 0){ + color %= ndoms; + *colorp = color; + m = xphysalloc(&bal[color], size); + } + if(m == 0) + for(i = 0; i < ndoms; i++) + if((m = xphysalloc(&bal[i], size)) != 0){ + *colorp = i; + return m; + } + return m; +} + +static void +dump(Bal *b) +{ + uint bi, i, k; + Buddy *blocks; + + blocks = b->blocks; + for(i = 0; i < (UNO<<(b->kmax-b->kmin+1)); i++){ + if(blocks[i].tag == 0) + continue; + print("blocks[%d]: size %d prev %d next %d\n", + i, 1<blocks[i].kval, blocks[i].prev, blocks[i].next); + //i += (1<bminsz-1; + } + + for(k = 0; k <= b->kmax; k++){ + print("a[%d]:", k); + for(bi = b->avail[k].next; bi != 0; bi = blocks[BLOCK(b,bi)].next){ + print(" %d", bi); + } + print("\n"); + } +} + +void +physallocdump(void) +{ + int n; + + for(n = 0; n < Ndoms; n++) + if(bal[n].size > 0) + print("physalloc color=%d base=%#ullx size=%#ullx\n", + n, bal[n].base, bal[n].size); +} + +static int +plop(Bal *b, uintmem a, int k, int type) +{ + uint i; + Buddy *l; + + + DBG("plop(a %#p k %d type %d)\n", a, k, type); + + i = INDEX(b,a); + l = &b->blocks[BLOCK(b,i)]; + + l->kval = k; + xphysfree(b, a, 1<bminsz); + e = ROUNDDN(e, b->bminsz); + DBG("iimbchunk: start a %#P e %#P\n", a, e); + + b->nblocks += (e-a)/b->bminsz; + + for(k = b->kmin, s = b->bminsz; a+s < e && k < b->kmax; s <<= 1, k += 1){ + if(a & s){ + plop(b, a, k, type); + a += s; + } + } + DBG("done1 a %#P e %#P s %#ux %d\n", a, e, s, k); + + while(a+s <= e){ + plop(b, a, k, type); + a += s; + } + DBG("done2 a %#P e %#P s %#ux %d\n", a, e, s, k); + + for(k -= 1, s >>= 1; a < e; s >>= 1, k -= 1){ + if(a+s <= e){ + plop(b, a, k, type); + a += s; + } + } + DBG("done3 a %#P e %#P s %#ux %d\n", a, e, s, k); + + return 0; +} + +/* + * Called from umeminit to initialize user memory allocators. + */ +void +physinit(uintmem a, u64int size) +{ + uintmem dtsz; + Bal *b; + int i, dom; + uintmem addr, len; + + DBG("physinit %#ullx %#ullx\n", a, size); + + for(addr = a; addr < a+size; addr += len){ + len = acpimblocksize(addr, &dom); + /* len can be zero if there's on acpi information about addr */ + if(len == 0 || addr + len > a + size) + len = a + size - addr; + /* + * Each block belongs to a different domain (ie. cpu/mem socket) + * We must create a buddy allocator for each block, so we could + * allocate memory from different domains. + * + * This code assumes that a domain may be extended later and + * that there is no interleaving of domains. Ok by now. 
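+		 * acpimblocksize() reports each chunk's length and its ACPI
+		 * affinity domain; the first chunk seen for a domain sets up
+		 * its Bal, later chunks may only extend that Bal upwards.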
+ */ + DBG("physmem block dom %d addr %#ullx size %#ullx\n", dom, addr, len); + if(dom >= Ndoms) + panic("dom %d too large", dom); + b = &bal[dom]; + if(dom >= ndoms) + ndoms = dom+1; + if(b->kmin == 0){ + b->base = addr; + b->size = len; + b->kmin = BKmin; + b->kmax = BKmax; + b->bminsz = (UNO<kmin); + b->memory = sys->pmstart; + b->kspan = lg2floor(sys->pmend); + if(!ISPOWEROF2(sys->pmend)) + b->kspan++; + dtsz = sizeof(Buddy)*(UNO<<(b->kspan-b->kmin+1)); + DBG("kspan %ud (arrysz = %llud)\n", b->kspan, dtsz); + b->blocks = malloc(dtsz); + if(b->blocks == nil) + panic("physinit: no blocks"); + memset(b->blocks, 0, dtsz); + b->avail = malloc(sizeof(Buddy)*(b->kmax+1)); + if(b->avail == nil) + panic("physinit: no avail"); + memset(b->avail, 0, sizeof(Buddy)*(b->kmax+1)); + }else{ + if(addr < b->base) + panic("physinit: decreasing base"); + if(b->base+b->size < addr + len) + b->size = (addr-b->base) + len; + for(i = 0; i < Ndoms; i++) + if(bal[i].kmin && &bal[i] != b) + if(bal[i].base < b->base + b->size && + bal[i].base + bal[i].size > b->base + b->size) + panic("physinit: doms overlap"); + } + assert(addr >= b->base && addr+len <= b->base + b->size); + iimbchunk(b, addr, addr+len, 0); + } + + +} diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/k10/qmalloc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/k10/qmalloc.c Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,615 @@ +/* + * malloc + * + * Uses Quickfit (see SIGPLAN Notices October 1988) + * with allocator from Kernighan & Ritchie + * + * This is a placeholder. + */ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +typedef double Align; +typedef union Header Header; +typedef struct Qlist Qlist; + +union Header { + struct { + Header* next; + uint size; + } s; + Align al; +}; + +struct Qlist { + Lock lk; + Header* first; + + uint nalloc; +}; + +enum { + Unitsz = sizeof(Header), +}; + +#define NUNITS(n) (HOWMANY(n, Unitsz) + 1) +#define NQUICK ((512/Unitsz)+1) + +static Qlist quicklist[NQUICK+1]; +static Header misclist; +static Header *rover; +static unsigned tailsize; +static unsigned tailnunits; +static Header *tailbase; +static Header *tailptr; +static Header checkval; +static int morecore(unsigned); + +static void qfreeinternal(void*); +static int qstats[32]; + +static Lock mainlock; + +#define MLOCK ilock(&mainlock) +#define MUNLOCK iunlock(&mainlock) +#define QLOCK(l) ilock(l) +#define QUNLOCK(l) iunlock(l) + +#define tailalloc(p, n) ((p)=tailptr, tailsize -= (n), tailptr+=(n),\ + (p)->s.size=(n), (p)->s.next = &checkval) + +#define ISPOWEROF2(x) (/*((x) != 0) && */!((x) & ((x)-1))) +#define ALIGNHDR(h, a) (Header*)((((uintptr)(h))+((a)-1)) & ~((a)-1)) + +static void* +qmallocalign(usize nbytes, uintptr align, long offset, usize span) +{ + Qlist *qlist; + uintptr aligned; + Header **pp, *p, *q, *r; + uint naligned, nunits, n; + + if(nbytes == 0 || offset != 0 || span != 0) + return nil; + + if(!ISPOWEROF2(align) || align < sizeof(Header)) + return nil; + + qstats[5]++; + nunits = NUNITS(nbytes); + if(nunits <= NQUICK){ + /* + * Look for a conveniently aligned block + * on one of the quicklists. 
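+		 * The quick lists are never split or padded here, so an
+		 * entry is taken only if its payload already happens to
+		 * have the requested alignment.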
+ */ + qlist = &quicklist[nunits]; + QLOCK(&qlist->lk); + pp = &qlist->first; + for(p = *pp; p != nil; p = p->s.next){ + if(ALIGNED(p+1, align)){ + *pp = p->s.next; + p->s.next = &checkval; + QUNLOCK(&qlist->lk); + qstats[6]++; + return p+1; + } + pp = &p->s.next; + } + QUNLOCK(&qlist->lk); + } + + MLOCK; + if(nunits > tailsize) { + /* hard way */ + if((q = rover) != nil){ + do { + p = q->s.next; + if(p->s.size < nunits) + continue; + aligned = ALIGNED(p+1, align); + naligned = NUNITS(align)-1; + if(!aligned && p->s.size < nunits+naligned) + continue; + + /* + * This block is big enough, remove it + * from the list. + */ + q->s.next = p->s.next; + rover = q; + qstats[7]++; + + /* + * Free any runt in front of the alignment. + */ + if(!aligned){ + r = p; + p = ALIGNHDR(p+1, align) - 1; + n = p - r; + p->s.size = r->s.size - n; + + r->s.size = n; + r->s.next = &checkval; + qfreeinternal(r+1); + qstats[8]++; + } + + /* + * Free any residue after the aligned block. + */ + if(p->s.size > nunits){ + r = p+nunits; + r->s.size = p->s.size - nunits; + r->s.next = &checkval; + qstats[9]++; + qfreeinternal(r+1); + + p->s.size = nunits; + } + + p->s.next = &checkval; + MUNLOCK; + return p+1; + } while((q = p) != rover); + } + if((n = morecore(nunits)) == 0){ + MUNLOCK; + return nil; + } + tailsize += n; + } + + q = ALIGNHDR(tailptr+1, align); + if(q == tailptr+1){ + tailalloc(p, nunits); + qstats[10]++; + } + else{ + naligned = NUNITS(align)-1; + if(tailsize < nunits+naligned){ + /* + * There are at least nunits, + * get enough for alignment. + */ + if((n = morecore(naligned)) == 0){ + MUNLOCK; + return nil; + } + tailsize += n; + } + /* + * Save the residue before the aligned allocation + * and free it after the tail pointer has been bumped + * for the main allocation. 
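+		 * n is the residue size in header units; once both blocks
+		 * have been carved off the tail, the real allocation's
+		 * payload starts exactly at the aligned address q.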
+ */ + n = q-tailptr - 1; + tailalloc(r, n); + tailalloc(p, nunits); + qstats[11]++; + qfreeinternal(r+1); + } + MUNLOCK; + + return p+1; +} + +static void* +qmalloc(usize nbytes) +{ + Qlist *qlist; + Header *p, *q; + uint nunits, n; + +///* FIXME: (ignore for now) + if(nbytes == 0) + return nil; +//*/ + + qstats[0]++; + nunits = NUNITS(nbytes); + if(nunits <= NQUICK){ + qlist = &quicklist[nunits]; + QLOCK(&qlist->lk); + if((p = qlist->first) != nil){ + qlist->first = p->s.next; + qlist->nalloc++; + QUNLOCK(&qlist->lk); + p->s.next = &checkval; + return p+1; + } + QUNLOCK(&qlist->lk); + } + + MLOCK; + if(nunits > tailsize) { + /* hard way */ + if((q = rover) != nil){ + do { + p = q->s.next; + if(p->s.size >= nunits) { + if(p->s.size > nunits) { + p->s.size -= nunits; + p += p->s.size; + p->s.size = nunits; + } else + q->s.next = p->s.next; + p->s.next = &checkval; + rover = q; + qstats[2]++; + MUNLOCK; + return p+1; + } + } while((q = p) != rover); + } + if((n = morecore(nunits)) == 0){ + MUNLOCK; + return nil; + } + tailsize += n; + } + qstats[3]++; + tailalloc(p, nunits); + MUNLOCK; + + return p+1; +} + +static void +qfreeinternal(void* ap) +{ + Qlist *qlist; + Header *p, *q; + uint nunits; + + if(ap == nil) + return; + qstats[16]++; + + p = (Header*)ap - 1; + if((nunits = p->s.size) == 0 || p->s.next != &checkval) + panic("malloc: corrupt allocation arena\n"); + if(tailptr != nil && p+nunits == tailptr) { + /* block before tail */ + tailptr = p; + tailsize += nunits; + qstats[18]++; + return; + } + if(nunits <= NQUICK) { + qlist = &quicklist[nunits]; + QLOCK(&qlist->lk); + p->s.next = qlist->first; + qlist->first = p; + QUNLOCK(&qlist->lk); + qstats[18]++; + return; + } + if((q = rover) == nil) { + q = &misclist; + q->s.size = 0; + q->s.next = q; + } + for(; !(p > q && p < q->s.next); q = q->s.next) + if(q >= q->s.next && (p > q || p < q->s.next)) + break; + if(p+p->s.size == q->s.next) { + p->s.size += q->s.next->s.size; + p->s.next = q->s.next->s.next; + qstats[19]++; + } else + p->s.next = q->s.next; + if(q+q->s.size == p) { + q->s.size += p->s.size; + q->s.next = p->s.next; + qstats[20]++; + } else + q->s.next = p; + rover = q; +} + +ulong +msize(void* ap) +{ + Header *p; + uint nunits; + + if(ap == nil) + return 0; + + p = (Header*)ap - 1; + if((nunits = p->s.size) == 0 || p->s.next != &checkval) + panic("malloc: corrupt allocation arena\n"); + + return (nunits-1) * sizeof(Header); +} + +static void +mallocreadfmt(char* s, char* e) +{ + char *p; + Header *q; + int i, n, t; + Qlist *qlist; + + p = seprint(s, e, + "%llud memory\n" + "%d pagesize\n" + "%llud kernel\n", + (uvlong)conf.npage*PGSZ, + PGSZ, + (uvlong)conf.npage-conf.upages); + + t = 0; + for(i = 0; i <= NQUICK; i++) { + n = 0; + qlist = &quicklist[i]; + QLOCK(&qlist->lk); + for(q = qlist->first; q != nil; q = q->s.next){ +// if(q->s.size != i) +// p = seprint(p, e, "q%d\t%#p\t%ud\n", +// i, q, q->s.size); + n++; + } + QUNLOCK(&qlist->lk); + +// if(n != 0) +// p = seprint(p, e, "q%d %d\n", i, n); + t += n * i*sizeof(Header); + } + p = seprint(p, e, "quick: %ud bytes total\n", t); + + MLOCK; + if((q = rover) != nil){ + i = t = 0; + do { + t += q->s.size; + i++; +// p = seprint(p, e, "m%d\t%#p\n", q->s.size, q); + } while((q = q->s.next) != rover); + + p = seprint(p, e, "rover: %d blocks %ud bytes total\n", + i, t*sizeof(Header)); + } + p = seprint(p, e, "total allocated %lud, %ud remaining\n", + (tailptr-tailbase)*sizeof(Header), tailnunits*sizeof(Header)); + + for(i = 0; i < 32; i++){ + if(qstats[i] == 0) + continue; + p = 
seprint(p, e, "qstats[%d] %ud\n", i, qstats[i]); + } + MUNLOCK; +} + +long +mallocreadsummary(Chan*, void *a, long n, long offset) +{ + char *alloc; + + alloc = malloc(16*READSTR); + mallocreadfmt(alloc, alloc+16*READSTR); + n = readstr(offset, a, n, alloc); + free(alloc); + + return n; +} + +void +mallocsummary(void) +{ + Header *q; + int i, n, t; + Qlist *qlist; + + t = 0; + for(i = 0; i <= NQUICK; i++) { + n = 0; + qlist = &quicklist[i]; + QLOCK(&qlist->lk); + for(q = qlist->first; q != nil; q = q->s.next){ + if(q->s.size != i) + DBG("q%d\t%#p\t%ud\n", i, q, q->s.size); + n++; + } + QUNLOCK(&qlist->lk); + + t += n * i*sizeof(Header); + } + print("quick: %ud bytes total\n", t); + + MLOCK; + if((q = rover) != nil){ + i = t = 0; + do { + t += q->s.size; + i++; + } while((q = q->s.next) != rover); + } + MUNLOCK; + + if(i != 0){ + print("rover: %d blocks %ud bytes total\n", + i, t*sizeof(Header)); + } + print("total allocated %lud, %ud remaining\n", + (tailptr-tailbase)*sizeof(Header), tailnunits*sizeof(Header)); + + for(i = 0; i < 32; i++){ + if(qstats[i] == 0) + continue; + print("qstats[%d] %ud\n", i, qstats[i]); + } +} + +void +free(void* ap) +{ + MLOCK; + qfreeinternal(ap); + MUNLOCK; +} + +void* +malloc(ulong size) +{ + void* v; + + if((v = qmalloc(size)) != nil) + memset(v, 0, size); + + return v; +} + +void* +mallocz(ulong size, int clr) +{ + void *v; + + if((v = qmalloc(size)) != nil && clr) + memset(v, 0, size); + + return v; +} + +void* +mallocalign(ulong nbytes, ulong align, long offset, ulong span) +{ + void *v; + + /* + * Should this memset or should it be left to the caller? + */ + if((v = qmallocalign(nbytes, align, offset, span)) != nil) + memset(v, 0, nbytes); + + return v; +} + +void* +smalloc(ulong size) +{ + void *v; + + while((v = malloc(size)) == nil) + tsleep(&up->sleep, return0, 0, 100); + memset(v, 0, size); + + return v; +} + +void* +realloc(void* ap, ulong size) +{ + void *v; + Header *p; + ulong osize; + uint nunits, ounits; + + /* + * Easy stuff: + * free and return nil if size is 0 + * (implementation-defined behaviour); + * behave like malloc if ap is nil; + * check for arena corruption; + * do nothing if units are the same. + */ + if(size == 0){ + MLOCK; + qfreeinternal(ap); + MUNLOCK; + + return nil; + } + if(ap == nil) + return qmalloc(size); + + p = (Header*)ap - 1; + if((ounits = p->s.size) == 0 || p->s.next != &checkval) + panic("realloc: corrupt allocation arena\n"); + + if((nunits = NUNITS(size)) == ounits) + return ap; + + /* + * Slightly harder: + * if this allocation abuts the tail, try to just + * adjust the tailptr. + */ + MLOCK; + if(tailptr != nil && p+ounits == tailptr){ + if(ounits > nunits){ + p->s.size = nunits; + tailsize += ounits-nunits; + MUNLOCK; + return ap; + } + if(tailsize >= nunits-ounits){ + p->s.size = nunits; + tailsize -= nunits-ounits; + MUNLOCK; + return ap; + } + } + MUNLOCK; + + /* + * Worth doing if it's a small reduction? + * Do it anyway if <= NQUICK? + if((ounits-nunits) < 2) + return ap; + */ + + /* + * Too hard (or can't be bothered): + * allocate, copy and free. + * What does the standard say for failure here? 
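+	 * (C99 says a failed realloc leaves the original block intact and
+	 * returns a null pointer, which is what falls out below: ap is
+	 * only freed after a successful copy.)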
+ */ + if((v = qmalloc(size)) != nil){ + osize = (ounits-1)*sizeof(Header); + if(size < osize) + osize = size; + memmove(v, ap, osize); + MLOCK; + qfreeinternal(ap); + MUNLOCK; + } + + return v; +} + +void +setmalloctag(void*, ulong) +{ +} + +void +mallocinit(void) +{ + if(tailptr != nil) + return; + + tailbase = UINT2PTR(sys->vmunused); + tailptr = tailbase; + tailnunits = NUNITS(sys->vmend - sys->vmunused); + print("base %#p ptr %#p nunints %ud\n", tailbase, tailptr, tailnunits); +} + +static int +morecore(uint nunits) +{ + /* + * First (simple) cut. + * Pump it up when you don't really need it. + * Pump it up until you can feel it. + */ + if(nunits < NUNITS(128*KiB)) + nunits = NUNITS(128*KiB); + if(nunits > tailnunits) + nunits = tailnunits; + tailnunits -= nunits; + + return nunits; +} diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/libc/9sys/mkfile --- a/sys/src/nix/libc/9sys/mkfile Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/libc/9sys/mkfile Fri Sep 09 16:49:47 2011 +0200 @@ -56,9 +56,9 @@ werrstr.$O\ write.$O\ writev.$O\ + zp.$O\ zread.$O\ zwrite.$O\ - zp.$O\ HFILES=\ /sys/include/libc.h\ diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/cache.c --- a/sys/src/nix/port/cache.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/cache.c Fri Sep 09 16:49:47 2011 +0200 @@ -111,9 +111,9 @@ mc = cache.head; /* a better algorithm would be nice */ -// if(conf.npage*BY2PG > 200*MB) +// if(conf.npage*PGSZ > 200*MB) // maxcache = 10*MAXCACHE; -// if(conf.npage*BY2PG > 400*MB) +// if(conf.npage*PGSZ > 400*MB) // maxcache = 50*MAXCACHE; for(i = 0; i < NFILE-1; i++) { @@ -360,7 +360,7 @@ if(e == 0) break; - p = auxpage(); + p = auxpage(BIGPGSZ); if(p == 0) { extentfree(e); break; diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/devcons.c --- a/sys/src/nix/port/devcons.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/devcons.c Fri Sep 09 16:49:47 2011 +0200 @@ -376,12 +376,12 @@ int iprint(char *fmt, ...) { - Mreg s; + Mpl pl; int i, n, locked; va_list arg; char buf[PRINTSIZE]; - s = splhi(); + pl = splhi(); va_start(arg, fmt); n = vseprint(buf, buf+sizeof(buf), fmt, arg) - buf; va_end(arg); @@ -395,7 +395,7 @@ } if(locked) unlock(&iprintlock); - splx(s); + splx(pl); return n; } @@ -405,7 +405,7 @@ panic(char *fmt, ...) 
{ int n; - Mreg s; + Mpl pl; va_list arg; char buf[PRINTSIZE]; @@ -415,7 +415,7 @@ for(;;); panicking = 1; - s = splhi(); + pl = splhi(); strcpy(buf, "panic: "); va_start(arg, fmt); n = vseprint(buf+strlen(buf), buf+sizeof(buf), fmt, arg) - buf; @@ -423,11 +423,12 @@ iprint("%s\n", buf); if(consdebug) (*consdebug)(); - splx(s); + splx(pl); prflush(); buf[n] = '\n'; putstrn(buf, n+1); dumpstack(); + delay(1000); /* give time to consoles */ exit(1); } @@ -485,7 +486,7 @@ static void echo(char *buf, int n) { - Mreg s; + Mpl pl; static int ctrlt, pid; char *e, *p; @@ -515,10 +516,10 @@ ctrlt = 0; switch(*p){ case 'S': - s = splhi(); + pl = splhi(); dumpstack(); procdump(); - splx(s); + splx(pl); return; case 's': dumpstack(); @@ -542,9 +543,9 @@ consdebug(); return; case 'p': - s = spllo(); + pl = spllo(); procdump(); - splx(s); + splx(pl); return; case 'q': scheddump(); @@ -817,8 +818,8 @@ { ulong l; Mach *mp; - char *b, *bp, ch; - char tmp[256]; /* must be >= 18*NUMSIZE (Qswap) */ + char *b, *bp, ch, *s; + char tmp[512]; /* Qswap is 381 bytes at clu */ int i, k, id, send; long offset; @@ -939,7 +940,7 @@ return 0; case Qsysstat: - b = smalloc(MAXMACH*(NUMSIZE*10+2+1) + 1); /* +1 for NUL */ + b = smalloc(MACHMAX*(NUMSIZE*10+2+1) + 1); /* +1 for NUL */ bp = b; for(id = 0; id < MACHMAX; id++) { mp = sys->machptr[id]; @@ -992,19 +993,12 @@ return n; case Qswap: - l = snprint(tmp, sizeof tmp, - "%llud memory\n" - "%d pagesize\n" - "%llud kernel\n" - "%lud/%lud user\n" - "%lud/%lud swap\n", - (uvlong)conf.npage*BY2PG, - BY2PG, - (uvlong)conf.npage-conf.upages, - palloc.user-palloc.freecount, palloc.user, - conf.nswap-swapalloc.free, conf.nswap); + tmp[0] = 0; + s = seprintpagestats(tmp, tmp + sizeof tmp); + s = seprintphysstats(s, tmp + sizeof tmp); b = buf; - i = readstr(offset, b, n, tmp); + l = s - tmp; + i = readstr(offset, b, l, tmp); b += i; n -= i; if(offset > l) diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/devproc.c --- a/sys/src/nix/port/devproc.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/devproc.c Fri Sep 09 16:49:47 2011 +0200 @@ -1551,7 +1551,7 @@ break; case CMcore: core = atoi(cb->f[1]); - if(core >= MAXMACH) + if(core >= MACHMAX) error("wrong core number"); else if(core == 0){ if(p->ac == nil) @@ -1593,6 +1593,7 @@ Segment *s; uintptr soff, l; /* hmmmm */ uchar *b; + uintmem pgsz; for(;;) { s = seg(p, offset, 1); @@ -1620,11 +1621,12 @@ pte = s->map[soff/PTEMAPMEM]; if(pte == 0) panic("procctlmemio"); - pg = pte->pages[(soff&(PTEMAPMEM-1))/BIGPGSZ]; + pgsz = m->pgsz[s->pgszi]; + pg = pte->pages[(soff&(PTEMAPMEM-1))/pgsz]; if(pagedout(pg)) panic("procctlmemio1"); - l = BIGPGSZ - (offset&(BIGPGSZ-1)); + l = pgsz - (offset&(pgsz-1)); if(n > l) n = l; @@ -1635,7 +1637,7 @@ nexterror(); } b = (uchar*)VA(k); - b += offset&(BIGPGSZ-1); + b += offset&(pgsz-1); if(read == 1) memmove(va, b, n); /* This can fault */ else diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/devsegment.c --- a/sys/src/nix/port/devsegment.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/devsegment.c Fri Sep 09 16:49:47 2011 +0200 @@ -66,7 +66,6 @@ static int cmddone(void*); static void segmentkproc(void*); static void docmd(Globalseg*, int); -static Segment* mkseg(uintptr va, uintptr len); /* * returns with globalseg incref'd @@ -152,6 +151,10 @@ case Qdata: case Qfree: g = getgseg(c); + if(waserror()){ + putgseg(g); + nexterror(); + } q.vers = 0; q.type = QTFILE; switch(s){ @@ -173,9 +176,11 @@ break; default: + poperror(); putgseg(g); return -1; } + poperror(); putgseg(g); break; } 
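The devsegment.c hunk above wraps the Qdata/Qfree cases in waserror()/poperror() so the Globalseg reference taken by getgseg() is released on every exit path, including when an error is raised before the switch completes. A minimal sketch (not part of this changeset) of that Plan 9 kernel error-stack idiom, assuming the usual waserror/poperror/nexterror primitives and a hypothetical getres/putres reference pair standing in for getgseg/putgseg:

	Res *r;

	r = getres(c);			/* take a reference */
	if(waserror()){			/* handler runs if error() is raised below */
		putres(r);		/* drop the reference before propagating */
		nexterror();
	}
	work(r);			/* anything here may call error() */
	poperror();			/* success path: pop the handler... */
	putres(r);			/* ...then drop the reference */

The detail that matters is that poperror() must run on every non-error exit, which is why the patch pops the handler both in the default: case before returning -1 and again before the normal putgseg()/break.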
@@ -457,6 +462,7 @@ /* * BUG: we allocate virtual addresses but never give them * back when the segment is destroyed. + * BUG: what if we overlap other segments attached by the user? */ static ulong placeseg(ulong len) @@ -509,13 +515,13 @@ error("already has a virtual address"); if(cb->nf < 3) cmderror(cb, Ebadarg); - va = strtoull(cb->f[1], 0, 0); - len = strtoull(cb->f[2], 0, 0); + va = strtoul(cb->f[1], 0, 0); + len = strtoul(cb->f[2], 0, 0); if(va == 0) va = placeseg(len); - top = BIGPGROUND(va+len); + top = BIGPGROUND(va + len); va = va&~(BIGPGSZ-1); - len = (top-va)/BIGPGSZ; + len = (top - va) / BIGPGSZ; if(len == 0) cmderror(cb, "empty segment"); diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/fault.c --- a/sys/src/nix/port/fault.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/fault.c Fri Sep 09 16:49:47 2011 +0200 @@ -60,6 +60,7 @@ pexit(s, freemem); } + int fixfault(Segment *s, uintptr addr, int read, int dommuput) { @@ -67,18 +68,21 @@ int ref; Pte **p, *etp; uintptr soff; - physaddr mmuphys; + uintmem mmuphys, pgsz; + uint mmuattr; Page **pg, *lkp, *new; Page *(*fn)(Segment*, uintptr); - addr &= ~(BIGPGSZ-1); + pgsz = m->pgsz[s->pgszi]; + + addr &= ~(pgsz-1); soff = addr-s->base; p = &s->map[soff/PTEMAPMEM]; if(*p == 0) - *p = ptealloc(); + *p = ptealloc(s); etp = *p; - pg = &etp->pages[(soff&(PTEMAPMEM-1))/BIGPGSZ]; + pg = &etp->pages[(soff&(PTEMAPMEM-1))/pgsz]; type = s->type&SG_TYPE; if(pg < etp->first) @@ -87,6 +91,7 @@ etp->last = pg; mmuphys = 0; + mmuattr = 0; switch(type) { default: panic("fault"); @@ -96,7 +101,8 @@ if(pagedout(*pg)) pio(s, addr, soff, pg); - mmuphys = BIGPPN((*pg)->pa) | PTERONLY|PTEVALID; + mmuphys = segppn(s, (*pg)->pa); + mmuattr = PTERONLY|PTEVALID; (*pg)->modref = PG_REF; break; @@ -104,7 +110,7 @@ case SG_SHARED: /* Zero fill on demand */ case SG_STACK: if(*pg == 0) { - new = newpage(1, &s, addr, BIGPGSZ); + new = newpage(1, &s, addr, pgsz); if(s == 0) return -1; @@ -122,7 +128,8 @@ * we're the only user of the segment. 
*/ if(read && conf.copymode == 0 && s->ref == 1) { - mmuphys = BIGPPN((*pg)->pa)|PTERONLY|PTEVALID; + mmuphys = segppn(s, (*pg)->pa); + mmuattr = PTERONLY|PTEVALID; (*pg)->modref |= PG_REF; break; } @@ -130,14 +137,11 @@ lkp = *pg; lock(lkp); - if(lkp->image == &swapimage) - ref = lkp->ref + swapcount(lkp->daddr); - else - ref = lkp->ref; + ref = lkp->ref; if(ref > 1) { unlock(lkp); - new = newpage(0, &s, addr, BIGPGSZ); + new = newpage(0, &s, addr, pgsz); if(s == 0) return -1; *pg = new; @@ -151,7 +155,8 @@ unlock(lkp); } - mmuphys = BIGPPN((*pg)->pa) | PTEWRITE | PTEVALID; + mmuphys = segppn(s, (*pg)->pa); + mmuattr = PTEWRITE|PTEVALID; (*pg)->modref = PG_MOD|PG_REF; break; @@ -165,23 +170,24 @@ new->va = addr; new->pa = s->pseg->pa+(addr-s->base); new->ref = 1; - new->lgsize = s->pseg->lgpgsize; + new->pgszi = s->pseg->pgszi; *pg = new; } } - mmuphys = BIGPPN((*pg)->pa) |PTEVALID; + mmuphys = segppn(s, (*pg)->pa); + mmuattr = PTEVALID; if((s->pseg->attr & SG_RONLY) == 0) - mmuphys |= PTEWRITE; + mmuattr |= PTEWRITE; if((s->pseg->attr & SG_CACHED) == 0) - mmuphys |= PTEUNCACHED; + mmuattr |= PTEUNCACHED; (*pg)->modref = PG_MOD|PG_REF; break; } qunlock(&s->lk); if(dommuput) - mmuput(addr, mmuphys, *pg); + mmuput(addr, mmuphys, mmuattr, *pg); return 0; } @@ -193,12 +199,15 @@ KMap *k; Chan *c; int n, ask; + uintmem pgsz; char *kaddr; ulong daddr; Page *loadrec; -retry: loadrec = *p; + daddr = ask = 0; + c = nil; + pgsz = m->pgsz[s->pgszi]; if(loadrec == nil) { /* from a text/data image */ daddr = s->fstart+soff; new = lookpage(s->image, daddr); @@ -209,24 +218,15 @@ c = s->image->c; ask = s->flen-soff; - if(ask > BIGPGSZ) - ask = BIGPGSZ; + if(ask > pgsz) + ask = pgsz; } - else { /* from a swap image */ - daddr = swapaddr(loadrec); - new = lookpage(&swapimage, daddr); - if(new != nil) { - putswap(loadrec); - *p = new; - return; - } + else + panic("no swap"); - c = swapimage.c; - ask = BIGPGSZ; - } qunlock(&s->lk); - new = newpage(0, 0, addr, BIGPGSZ); + new = newpage(0, 0, addr, pgsz); k = kmap(new); kaddr = (char*)VA(k); @@ -241,8 +241,8 @@ n = c->dev->read(c, kaddr, ask, daddr); if(n != ask) faulterror(Eioload, c, 0); - if(ask < BIGPGSZ) - memset(kaddr+ask, 0, BIGPGSZ-ask); + if(ask < pgsz) + memset(kaddr+ask, 0, pgsz-ask); poperror(); kunmap(k); @@ -261,31 +261,9 @@ else putpage(new); } - else { /* This is paged out */ - /* - * race, another proc may have gotten here first - * (and the pager may have run on that page) while - * s->lk was unlocked - */ - if(*p != loadrec){ - if(!pagedout(*p)){ - /* another process did it for me */ - putpage(new); - goto done; - } else { - /* another process and the pager got in */ - putpage(new); - goto retry; - } - } + else + panic("no swap"); - new->daddr = daddr; - cachepage(new, &swapimage); - *p = new; - putswap(loadrec); - } - -done: if(s->flushme) memset((*p)->cachectl, PG_TXTFLUSH, sizeof((*p)->cachectl)); } @@ -329,7 +307,8 @@ /* * &s[0] is known to be a valid address. - * Assume 4K pages, so it works for both 4K and 2M pages. + * Assume 2M pages, so it works for both 2M and 1G pages. + * Note this won't work for 4*KiB pages! 
*/ void* vmemchr(void *s, int c, int n) @@ -339,9 +318,9 @@ void *t; a = PTR2UINT(s); - while(PGROUND(a) != PGROUND(a+n-1)){ + while(ROUNDUP(a, BIGPGSZ) != ROUNDUP(a+n-1, BIGPGSZ)){ /* spans pages; handle this page */ - m = PGSZ - (a & (PGSZ-1)); + m = BIGPGSZ - (a & (BIGPGSZ-1)); t = memchr(UINT2PTR(a), c, m); if(t) return t; diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/image.c --- a/sys/src/nix/port/image.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/image.c Fri Sep 09 16:49:47 2011 +0200 @@ -12,7 +12,8 @@ static struct Imagealloc { Lock; - Image *free; + Image *mru; /* head of LRU list */ + Image *lru; /* tail of LRU list */ Image *hash[IHASHSIZE]; QLock ireclaim; /* mutex on reclaiming free images */ @@ -27,37 +28,142 @@ int loops; /* times the main loop was run */ uvlong ticks; /* total time in the main loop */ uvlong maxt; /* longest time in main loop */ + int noluck; /* # of times we couldn't get one */ + int nolock; /* # of times we couldn't get the lock */ } irstats; +static void +dumplru(void) +{ + Image *i; + + print("lru:"); + for(i = imagealloc.mru; i != nil; i = i->next) + print(" %p(c%p,r%d)", i, i->c, i->ref); + print("\n"); +} + +/* + * imagealloc and i must be locked. + */ +static void +imageunused(Image *i) +{ + if(i->prev != nil) + i->prev->next = i->next; + else + imagealloc.mru = i->next; + if(i->next != nil) + i->next->prev = i->prev; + else + imagealloc.lru = i->prev; + i->next = i->prev = nil; +} + +/* + * imagealloc and i must be locked. + */ +static void +imageused(Image *i) +{ + imageunused(i); + i->next = imagealloc.mru; + i->next->prev = i; + imagealloc.mru = i; + if(imagealloc.lru == nil) + imagealloc.lru = i; +} + +/* + * imagealloc must be locked. + */ +static Image* +lruimage(void) +{ + Image *i; + + for(i = imagealloc.lru; i != nil; i = i->prev) + if(i->c == nil){ + /* + * i->c will be set before releasing the + * lock on imagealloc, which means it's in use. + */ + return i; + } + return nil; +} + +/* + * On clu, set conf.nimages = 10 to exercise reclaiming. + * It won't be able to get through all of cpurc, but will reclaim. 
+ */ void initimage(void) { Image *i, *ie; - imagealloc.free = malloc(conf.nimage*sizeof(Image)); - if(imagealloc.free == nil) + DBG("initimage: %uld images\n", conf.nimage); + imagealloc.mru = malloc(conf.nimage*sizeof(Image)); + if(imagealloc.mru == nil) panic("imagealloc: no memory"); - ie = &imagealloc.free[conf.nimage-1]; - for(i = imagealloc.free; i < ie; i++) + ie = &imagealloc.mru[conf.nimage]; + for(i = imagealloc.mru; i < ie; i++){ + i->c = nil; + i->ref = 0; + i->prev = i-1; i->next = i+1; - i->next = 0; + } + imagealloc.mru[0].prev = nil; + imagealloc.mru[conf.nimage-1].next = nil; + imagealloc.lru = &imagealloc.mru[conf.nimage-1]; imagealloc.freechan = malloc(NFREECHAN * sizeof(Chan*)); imagealloc.szfreechan = NFREECHAN; + } static void imagereclaim(void) { - uvlong ticks; + Image *i; + uvlong ticks0, ticks; irstats.calls++; /* Somebody is already cleaning the page cache */ if(!canqlock(&imagealloc.ireclaim)) return; + DBG("imagereclaim maxt %ulld noluck %d nolock %d\n", + irstats.maxt, irstats.noluck, irstats.nolock); + ticks0 = fastticks(nil); + if(!canlock(&imagealloc)){ + /* never happen in the experiments I made */ + qunlock(&imagealloc.ireclaim); + return; + } - ticks = pagereclaim(1000); + for(i = imagealloc.lru; i != nil; i = i->prev){ + if(canlock(i)){ + i->ref++; /* make sure it does not go away */ + unlock(i); + pagereclaim(i); + lock(i); + DBG("imagereclaim: image %p(c%p, r%d)\n", i, i->c, i->ref); + if(i->ref == 1){ /* no pages referring to it, it's ours */ + unlock(i); + unlock(&imagealloc); + putimage(i); + break; + }else + --i->ref; + unlock(i); + } + } + if(i == nil){ + irstats.noluck++; + unlock(&imagealloc); + } irstats.loops++; + ticks = fastticks(nil) - ticks0; irstats.ticks += ticks; if(ticks > irstats.maxt) irstats.maxt = ticks; @@ -128,15 +234,13 @@ * imagereclaim dumps pages from the free list which are cached by image * structures. This should free some image structures. 
*/ - while(!(i = imagealloc.free)) { + while(!(i = lruimage())) { unlock(&imagealloc); imagereclaim(); sched(); lock(&imagealloc); } - imagealloc.free = i->next; - lock(i); incref(c); i->c = c; @@ -149,6 +253,7 @@ i->hash = *l; *l = i; found: + imageused(i); unlock(&imagealloc); if(i->s == 0) { @@ -193,9 +298,6 @@ l = &f->hash; } - i->next = imagealloc.free; - imagealloc.free = i; - /* defer freeing channel till we're out of spin lock's */ if(imagealloc.nfreechan == imagealloc.szfreechan){ imagealloc.szfreechan += NFREECHAN; @@ -207,6 +309,7 @@ imagealloc.freechan = cp; } imagealloc.freechan[imagealloc.nfreechan++] = c; + i->c = nil; /* flag as unused in lru list */ unlock(&imagealloc); return; diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/lib.h --- a/sys/src/nix/port/lib.h Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/lib.h Fri Sep 09 16:49:47 2011 +0200 @@ -83,6 +83,25 @@ int prec; ulong flags; }; + +enum { + FmtWidth = 1, + FmtLeft = FmtWidth<<1, + FmtPrec = FmtLeft<<1, + FmtSharp = FmtPrec<<1, + FmtSpace = FmtSharp<<1, + FmtSign = FmtSpace<<1, + FmtZero = FmtSign<<1, + FmtUnsigned = FmtZero<<1, + FmtShort = FmtUnsigned<<1, + FmtLong = FmtShort<<1, + FmtVLong = FmtLong<<1, + FmtComma = FmtVLong<<1, + FmtByte = FmtComma<<1, + + FmtFlag = FmtByte<<1 +}; + extern int print(char*, ...); extern char* seprint(char*, char*, char*, ...); extern char* vseprint(char*, char*, char*, va_list); diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/linuxsysemu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/nix/port/linuxsysemu.c Fri Sep 09 16:49:47 2011 +0200 @@ -0,0 +1,747 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +#include "../port/error.h" +#include +#include "ureg.h" + +/* from linux */ + +struct iovec { + void *base; + int len; +}; + + +struct utsname { + char sysname[65]; + char nodename[65]; + char release[65]; + char version[65]; + char machine[65]; + char gnu[65]; +}; + + +struct timeval + { + u32int tv_sec; /* Seconds. */ + u32int tv_usec; /* Microseconds. 
*/ + }; + +struct rusage { + struct timeval ru_utime; /* user time used */ + struct timeval ru_stime; /* system time used */ + long ru_maxrss; /* maximum resident set size */ + long ru_ixrss; /* integral shared memory size */ + long ru_idrss; /* integral unshared data size */ + long ru_isrss; /* integral unshared stack size */ + long ru_minflt; /* page reclaims */ + long ru_majflt; /* page faults */ + long ru_nswap; /* swaps */ + long ru_inblock; /* block input operations */ + long ru_oublock; /* block output operations */ + long ru_msgsnd; /* messages sent */ + long ru_msgrcv; /* messages received */ + long ru_nsignals; /* signals received */ + long ru_nvcsw; /* voluntary context switches */ + long ru_nivcsw; /* involuntary " */ +}; + +/*Linux kerne.org 2.6.35-30-generic #54-Ubuntu SMP Tue Jun 7 18:41:54 UTC 2011 x86_64 GNU/Linux\n*/ +struct utsname linuxutsname = { + "Linux", "mynode", "2.6.35", "NIX", "x86_64", "GNUsucks" +}; + +void +linuxexit(Ar0*, va_list list) +{ + int val; + char exitstr[32] = ""; + + val = va_arg(list, int); + if (val) + snprint(exitstr, sizeof(exitstr), "%d", val); + if (up->linux & 128) print("%d:linuxexit %d\n", up->pid, val); + up->linux = 0; + pexit(exitstr, 1); +} + +void +linuxuname(Ar0*ar, va_list list) +{ + void *va; + va = va_arg(list, void *); + if (up->linux & 128) print("%d:linuxuname va %p\n", up->pid, va); + validaddr(va, 1, 1); + memmove(va, &linuxutsname, sizeof(linuxutsname)); + // if this does not work we will need a /proc for bgl + // uname is just such a piece of shit. Some systems want things of the size in the struct, + // others don't. Idiots. +//#define BULLSHIT "Linux\0 NIX\0 2.6.19\0NIX\0x86_64\0GNUsucks" +// memmove(va, BULLSHIT, strlen(BULLSHIT)+1); + if (up->linux&128) print("Returns %s\n", linuxutsname.release); + ar->i = 0; +} + +/* this was in port/sysseg.c and was copied here. */ +/* There are a few special bits for CNK needs. */ +void +linuxsbrk(Ar0* ar0, va_list list) +{ + uintptr addr; + uintptr ibrk(uintptr addr, int seg); + extern Segment *heapseg; + int i; + + addr = PTR2UINT(va_arg(list, void*)); + + if (! heapseg){ + print("linuxsbrk: no heap set up yet\n"); + error("No heap set up yet"); + } + + if(addr == 0){ + ar0->p = heapseg->top; + return; + } + + if (addr < heapseg->top){ + print("linuxsbrk: can't shrink heap\n"); + error("can't shrink heap"); + } + + /* now this is a hack ... but we're going to assume this thing is not + * only mapped but the TLB is set up for it. + * + heapseg->top = addr; + ar0->p = heapseg->top; + return; + */ + + /* find the index of the heap segment; call ibrk with that segment. */ + /* consider flagging heapseg by base address or p==v, but it's too soon to know + * if that is a universal test and I hate to do a strcmp on each linuxsbrk + */ + for(i = 0; i < NSEG; i++) { + if (heapseg == up->seg[i]) + break; + } + /* isn't life grand? The heap is already mapped. So just grow the end of heap pointer but no need to + * allocate a page. 
+ */ + if (i < NSEG) + ar0->p = ibrk(addr, i); + if (up->linux & 128) print("%d:linuxsbrk for %p returns %p\n", up->pid, addr, ar0->p); +} + +/* special case: interpretation of '0' is done in USER MODE on Plan 9 */ +/* but old deprecated sysbrk_ does what we need */ +void +linuxbrk(Ar0* ar0, va_list list) +{ +// void linuxsbrk(Ar0* ar0, va_list list); + uintptr ibrk(uintptr addr, int seg); + void sysbrk_(Ar0*, va_list); + uintptr va; + //void *arg[1]; + va = va_arg(list, uintptr); + if (up->linux & 128) print("%d:linuxbrk va %#p: ", up->pid, (void *)va); + //arg[0] = va; + //sysbrk_(ar0, (va_list) arg); + va = ibrk(va, BSEG); + /* it is possible, though unlikely, that libc wants exactly the value it asked for. Plan 9 is returning rounded-up-to-next-page values. */ + if (va) + ar0->v = (void *)va; + if (up->linux & 128) print("returns %#p\n", va); + +} + +void +linuxopen(Ar0 *ar0, va_list list) +{ + char *aname; + int omode; + void sysopen(Ar0 *, va_list); + aname = va_arg(list, char*); + omode = va_arg(list, int); + USED(aname,omode); + sysopen(ar0, list); +} + +void +linuxwritev(Ar0 *ar0, va_list list) +{ + void sys_write(Ar0* ar0, va_list list); + int fd; + struct iovec *iov; + int iovcnt; + int i; + fd = va_arg(list, int); + iov = va_arg(list, struct iovec *); + iovcnt = va_arg(list, int); + if (up->linux & 128) print("%d:linuxwritev (%d, %p, %d):", up->pid, fd, iov, iovcnt); + validaddr(iov, iovcnt * sizeof(*iov), 0); + /* don't validate all the pointers in the iov; sys_write does this */ + for(i = 0; i < iovcnt; i++) { + Ar0 war0; + uintptr arg[3]; + if (up->linux & 128) print("[%p,%d],", iov[i].base, iov[i].len); + arg[0] = fd; + arg[1] = (uintptr) iov[i].base; + arg[2] = iov[i].len; + sys_write(&war0, (va_list) arg); + if (war0.l < 0) + break; + /* if the first one fails, we get 0 */ + ar0->l += war0.l; + } + if (up->linux & 128) print("\n"); +} + + +void +linuxsocketcall(Ar0 *ar0, va_list list) +{ + int fd; + uintptr *args; + + USED(ar0); + + fd = va_arg(list, int); + args = va_arg(list, uintptr *); + if (up->linux & 128) print("%d:linuxsocketcall (%d, %p):", up->pid, fd, args); + validaddr(args,sizeof(*args), 0); + if (up->linux & 128) print("\n"); +} + + +void +linuxgeteuid(Ar0 *ar0, va_list) +{ + ar0->i = 0; +} + +/* ow this hurts. */ +typedef unsigned long int __ino_t; +typedef long long int __quad_t; +typedef unsigned int __mode_t; +typedef unsigned int __nlink_t; +typedef long int __off_t; +typedef unsigned int __uid_t; +typedef unsigned int __gid_t; +typedef long int __blksize_t; +typedef long int __time_t; +typedef long int __blkcnt_t; + +typedef unsigned long long int __u_quad_t; + +typedef __u_quad_t __dev_t; + +struct timespec + { + __time_t tv_sec; + long int tv_nsec; + }; +/* +# 103 "/bgsys/drivers/V1R2M0_200_2008-080513P/ppc/gnu-linux/lib/gcc/powerpc-bgp-linux/4.1.2/../../../../powerpc-bgp-linux/sys-include/sys/stat.h" 3 4 +*/ +/* how many stat structs does linux have? too many. 
*/ +struct stat { + __dev_t st_dev; + unsigned short int __pad1; + __ino_t st_ino; + __mode_t st_mode; + __nlink_t st_nlink; + __uid_t st_uid; + __gid_t st_gid; + __dev_t st_rdev; + unsigned short int __pad2; + __off_t st_size; + __blksize_t st_blksize; + __blkcnt_t st_blocks; + struct timespec st_atim; + struct timespec st_mtim; + struct timespec st_ctim; + unsigned long int __unused4; + unsigned long int __unused5; +} stupid = { + .st_blksize = 4096, + .st_dev = 1, + .st_gid = 0, + .st_ino = 0x12345, + .st_mode = 0664 | 020000, + .st_nlink = 1, + .st_rdev = 501 +}; + +void +fstat64(Ar0 *ar0, va_list list) +{ + void *v; + int fd; + fd = va_arg(list, int); + v = va_arg(list, void *); + validaddr(v, 1, 0); + switch(fd) { + case 0: + case 1: + case 2: + ar0->i = 0; + memmove(v, &stupid, sizeof(stupid)); + break; + } + +} + + +/* do nothing, succesfully */ +void +returnok(Ar0*, va_list) +{ + + return; +} + +/* void * mmap(void *start, size_t length, int prot , int flags, int fd, + off_t offset); */ +/* They are using this as a poor man's malloc. */ + +void linuxmmap(Ar0 *ar0, va_list list) +{ + void *v; + int length, prot, flags, fd; + ulong offset; + void linuxsbrk(Ar0* ar0, va_list list); + v = va_arg(list, void *); + length = va_arg(list, int); + prot = va_arg(list, int); + flags = va_arg(list, int); + fd = va_arg(list, int); + offset = va_arg(list, ulong); + if (up->linux & 128) print("%d:CNK: mmap %p %#x %#x %#x %d %#ulx\n", up->pid, v, length, prot, flags, fd, offset); + if (fd == -1){ + unsigned char *newv, *oldv; + uintptr args[1]; + args[0] = 0; + linuxsbrk(ar0, (va_list) args); + if (up->linux & 128) print("%d:mmap anon: current is %p\n", up->pid, ar0->v); + oldv =ar0->v; + newv = ((unsigned char *)oldv) + length; + if (up->linux & 128) print("%d:mmap anon: ask for %p\n", up->pid, newv); + args[0] = (uintptr) newv; + linuxsbrk(ar0, (va_list) args); + if (up->linux & 128) print("%d:mmap anon: new is %p\n", up->pid, ar0->v); + /* success means "return the old pointer" ... */ + ar0->v = oldv; + return; + } + + ar0->i = -1; + +} + + +void linuxprocid(Ar0 *ar0, va_list) +{ + ar0->i = 0; +} + +/*Kernel_Ranks2Coords((kernel_coords_t *)_mapcache, _fullSize);*/ + + +/* int sigaction(int sig, const struct sigaction *restrict act, + struct sigaction *restrict oact); */ + +void sigaction(Ar0 *ar0, va_list list) +{ + void *act, *oact; + act = va_arg(list, void *); + oact = va_arg(list, void *); + if (up->linux & 128) print("%d:sigaction, %p %p\n", up->pid, act, oact); + ar0->i = 0; +} + +/*long rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize); */ + +void rt_sigprocmask(Ar0 *ar0, va_list list) +{ + int how; + void *set, *oset; + int size; + how = va_arg(list,int); + set = va_arg(list, void *); + oset = va_arg(list, void *); + size = va_arg(list, int); + if (up->linux & 128) print("%d:sigaction, %d %p %p %d\n", up->pid, how, set, oset, size); + ar0->l = 0; +} + +/* damn. 
Did not want to do futtocks */ +#define FUTEX_WAIT 0 +#define FUTEX_WAKE 1 +#define FUTEX_FD 2 +#define FUTEX_REQUEUE 3 +#define FUTEX_CMP_REQUEUE 4 +#define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 +#define FUTEX_WAIT_BITSET 9 +#define FUTEX_WAKE_BITSET 10 + + +void futex(Ar0 *ar0, va_list list) +{ + int *uaddr, op, val; + int *uaddr2, val3; + struct timespec *timeout; + uaddr = va_arg(list,int *); + op = va_arg(list, int); + val = va_arg(list, int); + timeout = va_arg(list, struct timespec *); + uaddr2 = va_arg(list, int *); + val3 = va_arg(list, int); + USED(uaddr); + USED(op); + USED(val); + USED(timeout); + USED(uaddr2); + USED(val3); + if (up->linux & 128) print("%d:futex, uaddr %p op %x val %x uaddr2 %p timeout %p val3 %x\n", up->pid, + uaddr, op, val, uaddr2, timeout, val); + switch(op) { + default: + ar0->i = -1; + break; + case FUTEX_WAIT: + /* + * This operation atomically verifies that the futex address uaddr + * still contains the value val, and sleeps awaiting FUTEX_WAKE on + * this futex address. If the timeout argument is non-NULL, its + * contents describe the maximum duration of the wait, which is + * infinite otherwise. The arguments uaddr2 and val3 are ignored. + */ + validaddr(uaddr, sizeof(*uaddr), 0); + if (up->linux & 128) print("%d:futex: value at %p is %#x, val is %#x\n", up->pid, uaddr, *uaddr, val); + if (*uaddr != val) { + ar0->i = -11; + return; + } + if (timeout) { + validaddr(timeout, sizeof(*timeout), 0); + if (timeout->tv_sec == timeout->tv_nsec == 0) + return; + } + if (up->linux & 128) print("%d:Not going to sleep\n", up->pid); + break; + } +} + +/* mprotect is used to set a stack red zone. We may want to use + * segattach for anon page alloc and then use segdetach for the same purpose. + */ +void linuxmprotect(Ar0 *ar0, va_list list) +{ + u32int addr, len, prot; + addr = va_arg(list, u32int); + len = va_arg(list, u32int); + prot = va_arg(list, u32int); + if (up->linux & 128) print("%d:mprotect: %#x %#x %#x\n", up->pid, addr, len, prot); + ar0->i = 0; +} + +/* this is a hack. */ +Segment* +linuxdupseg(Segment **seg, int segno, int share) +{ + int i, size; + Pte *pte; + Segment *n, *s; + + SET(n); + s = seg[segno]; + + qlock(&s->lk); + if(waserror()){ + qunlock(&s->lk); + nexterror(); + } + switch(s->type&SG_TYPE) { + case SG_TEXT: /* New segment shares pte set */ + case SG_SHARED: + case SG_PHYSICAL: + goto sameseg; + + case SG_STACK: + /* linux wants to share the stack. 
*/ + if(share){ if (up->linux & 128) print("CLONE STACK IS SHARE\n"); + goto sameseg; + } + /* that is all the change */ +if (up->linux & 128) print("CLONE STACK IS NEW\n"); + n = newseg(s->type, s->base, s->size); + break; + + case SG_BSS: /* Just copy on write */ + if(share) + goto sameseg; +if (up->linux & 128) print("CLONE NEW BSS\n"); + n = newseg(s->type, s->base, s->size); + break; + + case SG_DATA: /* Copy on write plus demand load info */ + if(segno == TSEG){ + poperror(); + qunlock(&s->lk); + return data2txt(s); + } + + if(share) + goto sameseg; + n = newseg(s->type, s->base, s->size); + + incref(s->image); + n->image = s->image; + n->fstart = s->fstart; + n->flen = s->flen; + break; + } + size = s->mapsize; + for(i = 0; i < size; i++) + if(pte = s->map[i]) + n->map[i] = ptecpy(n, pte); + + n->flushme = s->flushme; + if(s->ref > 1) + procflushseg(s); + poperror(); + qunlock(&s->lk); + return n; + +sameseg: + incref(s); + poperror(); + qunlock(&s->lk); + return s; +} + +/* the big problem here is that linux clone wants to allow the user to set the + * stack. It's stupid but it's what they do. Linux NPTL pretty much requires it. + * we are going to be dumb here for now and assume we only use + * RFPROC|RFMEM. + * What do we do about stack longer term? It gets a bit weird. + * the child process stack is in the data segment. Should we make the + * child process stack segment a DATA segment, share the data segment, + * and make it a STACK for the child? If we did things right in linuxclone + * we could remove our private dupseg. + */ +void linuxclone(Ar0 *ar0, va_list list) +{ + void linuxsysrforkchild(Proc* child, Proc* parent, uintptr newsp); + u32int flags, stack; + Proc *p; + int flag, i, n, pid; + Mach *wm; + flags = va_arg(list, u32int); + stack = va_arg(list, u32int); + if (up->linux & 128) print("%d:CLONE: %#x %#x\n", up->pid, flags, stack); + if (flags != 0x7d0f00) { + print("%d:CLONE: don't know what to do with flags %#x\n", up->pid, flags); + ar0->i = -1; + return; + } + flag = RFPROC | RFMEM; + + p = newproc(); + + p->trace = up->trace; + p->scallnr = up->scallnr; + memmove(p->arg, up->arg, sizeof(up->arg)); + p->nerrlab = 0; + p->slash = up->slash; + p->dot = up->dot; + incref(p->dot); + + memmove(p->note, up->note, sizeof(p->note)); + p->privatemem = up->privatemem; + p->noswap = up->noswap; + p->nnote = up->nnote; + p->notified = 0; + p->lastnote = up->lastnote; + p->notify = up->notify; + p->ureg = up->ureg; + p->dbgreg = 0; + p->linux = 1; + + /* Make a new set of memory segments */ + n = flag & RFMEM; + qlock(&p->seglock); + if(waserror()){ + qunlock(&p->seglock); + nexterror(); + } + for(i = 0; i < NSEG; i++) + if(up->seg[i]) + p->seg[i] = linuxdupseg(up->seg, i, n); + qunlock(&p->seglock); + poperror(); + + /* File descriptors */ + if(flag & (RFFDG|RFCFDG)) { + if(flag & RFFDG) + p->fgrp = dupfgrp(up->fgrp); + else + p->fgrp = dupfgrp(nil); + } + else { + p->fgrp = up->fgrp; + incref(p->fgrp); + } + + /* Process groups */ + if(flag & (RFNAMEG|RFCNAMEG)) { + p->pgrp = newpgrp(); + if(flag & RFNAMEG) + pgrpcpy(p->pgrp, up->pgrp); + /* inherit noattach */ + p->pgrp->noattach = up->pgrp->noattach; + } + else { + p->pgrp = up->pgrp; + incref(p->pgrp); + } + if(flag & RFNOMNT) + up->pgrp->noattach = 1; + + if(flag & RFREND) + p->rgrp = newrgrp(); + else { + incref(up->rgrp); + p->rgrp = up->rgrp; + } + + /* Environment group */ + if(flag & (RFENVG|RFCENVG)) { + p->egrp = smalloc(sizeof(Egrp)); + p->egrp->ref = 1; + if(flag & RFENVG) + envcpy(p->egrp, up->egrp); + } + 
else { + p->egrp = up->egrp; + incref(p->egrp); + } + p->hang = up->hang; + p->procmode = up->procmode; + + /* Craft a return frame which will cause the child to pop out of + * the scheduler in user mode with the return register zero + */ + /* fix the stack for linux semantics */ + linuxsysrforkchild(p, up, stack); + + p->parent = up; + p->parentpid = up->pid; + if(flag&RFNOWAIT) + p->parentpid = 0; + else { + lock(&up->exl); + up->nchild++; + unlock(&up->exl); + } + if((flag&RFNOTEG) == 0) + p->noteid = up->noteid; + + pid = p->pid; + memset(p->time, 0, sizeof(p->time)); + p->time[TReal] = MACHP(0)->ticks; + + kstrdup(&p->text, up->text); + kstrdup(&p->user, up->user); + /* + * since the bss/data segments are now shareable, + * any mmu info about this process is now stale + * (i.e. has bad properties) and has to be discarded. + */ + mmuflush(); + p->basepri = up->basepri; + p->priority = up->basepri; + p->fixedpri = up->fixedpri; + p->mp = up->mp; + wm = up->wired; + if(wm) + procwired(p, wm->machno); + ready(p); + sched(); + + ar0->i = pid; +} + +/* get app segment mapping. Not the gasm you think, you dirty-minded person. + */ +/* we are going to deprecate this call. It was only there for libraries (dcmf, MPI) that needed + * the huge physical segment. I think nowadays that is a bad idea. + */ +void gasm(Ar0 *, va_list) +{ +#ifdef NOMORE + void seginfo(int seg, u32int *va, u64int *pa, u32int *len); + u64int *pa; + int whichseg; + int corenum; + u32int *va; + u32int *slen; + + whichseg = va_arg(list, int); + corenum = va_arg(list, int); + va = va_arg(list, u32int *); + pa = va_arg(list, u64int *); + slen = va_arg(list, u32int *); + validaddr(va, sizeof(*va), 1); + validaddr(pa, sizeof(*pa), 1); + validaddr(slen, sizeof(*slen), 1); + + if (up->linux & 128) print("%d:gasm: %#x %#x %p %p %p\n", up->pid, whichseg, corenum, va, pa, slen); + + /* we can not run any more without devsegment. Sorry. */ + seginfo(whichseg, va, pa, slen); + if (up->linux & 128) print("%d:gasm: %#x %#llx %#x\n", up->pid, *va, *pa, *slen); + ar0->i = 0; +#endif +} + +void timeconv(ulong l, struct timeval *t) +{ + u32int ms; + ms = TK2MS(l); + t->tv_sec += ms / 1000; + ms %= 1000; + t->tv_usec += ms * 1000; +} +void getrusage(Ar0 *ar0, va_list list) +{ + int what; + struct rusage *r; + + what = va_arg(list, int); + r = va_arg(list, struct rusage *); + validaddr(r, sizeof(*r), 1); + + if (up->linux & 128) print("%d:getrusage: %s %p:",up->pid, !what? "self" : "kids", r); + memset(r, 0, sizeof(*r)); + if (what) { + timeconv(up->time[3], &r->ru_utime); + timeconv(up->time[4], &r->ru_stime); + } else { + timeconv(up->time[0], &r->ru_utime); + timeconv(up->time[1], &r->ru_stime); + if (up->linux & 128) print("%#lx:%#lx, ", up->time[0], up->time[1]); + } + if (up->linux & 128) print("%#x:%#x\n", r->ru_utime.tv_sec, r->ru_stime.tv_sec); + + ar0->i = 0; +} diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/page.c --- a/sys/src/nix/port/page.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/page.c Fri Sep 09 16:49:47 2011 +0200 @@ -6,314 +6,228 @@ enum { - Min4kpages = 30 + Nstartpgs = 32, + Nminfree = 3, + Nfreepgs = 512, }; -/* - * Changes for 2M pages: - * - * A new allocator, bigpalloc contains all 2M pages and divides - * them into 4K pages as needed. - * - * Segment sizes are still in 4K pages. - * The first page to attach to a segment fixes the segment pg sz. 
- */ +typedef struct Pgnd Pgnd; +enum +{ + Punused = 0, + Pused, + Pfreed, +}; -#define pghash(daddr) palloc.hash[(daddr>>PGSHIFT)&(PGHSIZE-1)] +struct Pgnd +{ + uintmem pa; + int sts; +}; + +#define pghash(daddr) pga.hash[(daddr>>PGSHFT)&(PGHSIZE-1)] +Pgalloc pga; /* new allocator */ + +char* +seprintpagestats(char *s, char *e) +{ + int i; + + lock(&pga); + for(i = 0; i < m->npgsz; i++) + if(m->pgsz[i] != 0) + s = seprint(s, e, "%uld/%d %dK user pages avail\n", + pga.pgsza[i].freecount, + pga.pgsza[i].npages.ref, m->pgsz[i]/KiB); + unlock(&pga); + return s; +} /* - * Palloc contains PGSZ pages. - * bigpalloc contains BIGPGSZ pages. - * The first one is used for locking and hashing. - * The second one contains just aggregated pages. + * Preallocate some pages: + * some 2M ones will be used by the first process. + * some 1G ones will be allocated for each domain so processes may use them. */ - -struct Palloc palloc; -struct Palloc bigpalloc; -static void splitbigpage(void); - void pageinit(void) { - int color, i, j; - Page *p, *lastp, *lastbigp; - Pallocmem *pm; - uvlong pnp, np, pkb, nbigp; + int si, i, color; + Page *pg; - np = 0; - nbigp = 0; + pga.userinit = 1; + DBG("pageinit: npgsz = %d\n", m->npgsz); /* - * For each palloc memory map we have a bunch of 4K pages - * not aligned to make a full 2M page and then a bunch of 2M pages. - * BUG: We leak pages at the end if they are not enough to make a full 2M page. - * We also assume that each map is at least 2M. + * Don't pre-allocate 4K pages, we are not using them anymore. */ - for(i=0; inpage %uld pm->base %#p \n", i, pm->npage, pm->base); - if(pm->npage == 0) - continue; - pnp = (BIGPGROUND(pm->base) - pm->base) / PGSZ; - pm->nbigpage = pm->npage - pnp; - pm->npage = pnp; - pm->nbigpage /= PGSPERBIG; - if(1) - DBG("pnp %#ullx pm npage %#ulx nbigpage %#ulx\n", - pnp, pm->npage, pm->nbigpage); - np += pm->npage; - nbigp += pm->nbigpage; - } - palloc.pages = malloc(np*sizeof(Page)); - bigpalloc.pages = malloc(nbigp*sizeof(Page)); - if(palloc.pages == 0 || bigpalloc.pages == 0) - panic("pageinit"); - DBG("npages %#ullx nbigpages %#ullx pgsz %d\n", np, nbigp, sizeof(Page)); - color = 0; - lastp = nil; - palloc.head = palloc.tail = nil; - palloc.user = 0; - lastbigp = nil; - bigpalloc.head = bigpalloc.tail = nil; - - for(i=0; inpage, pm->nbigpage); - if(pm->npage == 0 && pm->nbigpage == 0) - continue; - if(lastp == nil) - p = palloc.pages; - else - p = palloc.tail+1; - for(j=0; jnpage; j++){ - assert(p >= palloc.pages && p < palloc.pages + np); - p->prev = lastp; - if(lastp != nil) - lastp->next = p; + for(si = 1; si < m->npgsz; si++){ + for(i = 0; i < Nstartpgs; i++){ + if(si < 2) + color = -1; else - palloc.head = p; - p->next = nil; - p->pa = pm->base+j*PGSZ; - p->color = color; - p->lgsize = PGSHIFT; - palloc.freecount++; - color = (color+1)%NCOLOR; - lastp = p++; - palloc.user++; + color = i; + pg = pgalloc(m->pgsz[si], color); + if(pg == nil){ + DBG("pageinit: pgalloc failed. 
breaking.\n"); + break; /* don't consume more memory */ + } + DBG("pageinit: alloced pa %#P sz %#ux color %d\n", + pg->pa, m->pgsz[si], pg->color); + lock(&pga); + pg->ref = 0; + pagechainhead(pg); + unlock(&pga); } - palloc.tail = lastp; - if(lastbigp == nil) - p = bigpalloc.pages; - else - p = bigpalloc.tail+1; - for(j = 0; j < pm->nbigpage; j++){ - assert(p >= bigpalloc.pages && p < bigpalloc.pages + nbigp); - p->prev = lastbigp; - if(lastbigp != nil) - lastbigp->next = p; - else - bigpalloc.head = p; - p->next = nil; - p->pa = pm->base+pm->npage*PGSZ+j*BIGPGSZ; - assert(p->pa == BIGPGROUND(p->pa)); - p->color = color; - p->lgsize = BIGPGSHFT; - bigpalloc.freecount++; - color = (color+1)%NCOLOR; - lastbigp = p++; - bigpalloc.user++; - } - bigpalloc.tail = lastbigp; } - DBG("%uld big pages; %uld small pages\n", - bigpalloc.freecount, palloc.freecount); - pkb = palloc.user*PGSZ/1024 + bigpalloc.user*BIGPGSZ/1024ULL; - - /* Paging numbers */ - swapalloc.highwater = (palloc.user*5)/100; - swapalloc.headroom = swapalloc.highwater + (swapalloc.highwater/4); - - /* How to compute total and kernel memory in this kernel? */ - print("%lldM user memory\n", pkb/1024ULL); - - /* - * This is not necessary, but it makes bugs in memory scan/page init - * show up right now, so we split now a big page into 4K pages. - */ - lock(&palloc); - splitbigpage(); - unlock(&palloc); - DBG("pageinit done\n"); + pga.userinit = 0; } -static Palloc* -getalloc(Page *p) +int +getpgszi(usize size) { - if(p->lgsize == PGSHIFT) - return &palloc; - if(p->lgsize != BIGPGSHFT) - panic("getalloc"); - return &bigpalloc; + int si; + + for(si = 0; si < m->npgsz; si++) + if(size == m->pgsz[si]) + return si; + print("getpgszi: size %#ulx not found\n", size); + return -1; } -static void +Page* +pgalloc(usize size, int color) +{ + Page *pg; + int si; + + si = getpgszi(size); + if((pg = malloc(sizeof(Page))) == nil){ + DBG("pgalloc: malloc failed\n"); + return nil; + } + memset(pg, 0, sizeof *pg); + if((pg->pa = physalloc(size, &color)) == 0){ + DBG("pgalloc: physalloc failed for size %#ulx color %d\n", size, color); + free(pg); + return nil; + } + pg->pgszi = si; /* size index */ + incref(&pga.pgsza[si].npages); + pg->color = color; + return pg; +} + +void +pgfree(Page* pg) +{ + decref(&pga.pgsza[pg->pgszi].npages); + physfree(pg->pa, m->pgsz[pg->pgszi]); + free(pg); +} + +void pageunchain(Page *p) { - Palloc *pp; + Pgsza *pa; - if(canlock(&palloc)) - panic("pageunchain (palloc %#p)", &palloc); - pp = getalloc(p); + if(canlock(&pga)) + panic("pageunchain"); + pa = &pga.pgsza[p->pgszi]; if(p->prev) p->prev->next = p->next; else - pp->head = p->next; + pa->head = p->next; if(p->next) p->next->prev = p->prev; else - pp->tail = p->prev; + pa->tail = p->prev; p->prev = p->next = nil; - pp->freecount--; + pa->freecount--; } void pagechaintail(Page *p) { - Palloc *pp; + Pgsza *pa; - if(canlock(&palloc)) + if(canlock(&pga)) panic("pagechaintail"); - pp = getalloc(p); - if(pp->tail) { - p->prev = pp->tail; - pp->tail->next = p; + pa = &pga.pgsza[p->pgszi]; + if(pa->tail) { + p->prev = pa->tail; + pa->tail->next = p; } else { - pp->head = p; + pa->head = p; p->prev = 0; } - pp->tail = p; + pa->tail = p; p->next = 0; - pp->freecount++; + pa->freecount++; } void pagechainhead(Page *p) { - Palloc *pp; + Pgsza *pa; - if(canlock(&palloc)) + if(canlock(&pga)) panic("pagechainhead"); - pp = getalloc(p); - if(pp->head) { - p->next = pp->head; - pp->head->prev = p; + pa = &pga.pgsza[p->pgszi]; + if(pa->head) { + p->next = pa->head; + pa->head->prev = 
p; } else { - pp->tail = p; + pa->tail = p; p->next = 0; } - pp->head = p; + pa->head = p; p->prev = 0; - pp->freecount++; + pa->freecount++; } /* - * allocator is locked. - * Low on pages, split a big one a release all its pages - * into the small page allocator. - * The Page structs for the new pages are allocated within - * the big page being split, so we don't have to allocate more memory. - * For a 2M page we need 512 Page structs (for new 4K pages). - * That's 80K if Page is 160 bytes. - * + * XXX: newpage could receive a hit regarding the color we prefer. + * fault calls newpage to do pio and install new pages. + * Also, processes could keep track of a preferred color, so + * that they try to allocate all their segments of the same color. */ -static void -splitbigpage(void) -{ - Page *bigp, *p; - ulong arrysz, npage; - KMap *k; - int color; - - if(canlock(&palloc)) - panic("splitbigpage"); - if(bigpalloc.freecount == 0) - panic("no big pages; no memory\n"); - bigp = bigpalloc.head; - pageunchain(bigp); - DBG("big page %#ullx split...\n", bigp->pa); - - arrysz = PGROUND(PGSPERBIG * sizeof(Page)); /* size consumed in Page array */ - npage = arrysz/PGSZ; - k = KADDR(bigp->pa); - memset(k, 0, BIGPGSZ); - p = k; - p += npage; - p->next = nil; - color = 0; - for(; npage < PGSPERBIG; npage++){ - p->prev = palloc.tail; - if(palloc.tail == nil) - palloc.head = p; - else - palloc.tail->next = p; - p->next = nil; - p->color = color; - color = (color+1)%NCOLOR; - p->lgsize = PGSHIFT; - p->pa = bigp->pa + npage*PGSZ; - palloc.tail = p; - palloc.freecount++; - } - - /* - * We leak the big page, we will never coallesce - * small pages into a big page. - * Also, we must leave the bigpage mapped, or we won't - * be able to access its Page structs for inner 4K pages. - */ - DBG("big page split %#ullx done\n", bigp->pa); -} - Page* -newpage(int clear, Segment **s, uintptr va, uintptr pgsz) +newpage(int clear, Segment **s, uintptr va, usize size) { Page *p; KMap *k; uchar ct; - int i, color, dontalloc; - Palloc *pp; - static int once, last; + Pgsza *pa; + int i, color, dontalloc, si; + static int once; - pp=&palloc; - if(pgsz == BIGPGSZ) - pp = &bigpalloc; - lock(&palloc); - color = getpgcolor(va); + si = getpgszi(size); + pa = &pga.pgsza[si]; + color = -1; + if(s && (*s)->color != NOCOLOR) + color = (*s)->color; - DBG("newpage up %#p va %#ullx pgsz %#ullx free %uld bigfree %uld\n", - up, va, pgsz, palloc.freecount, bigpalloc.freecount); - if(pp == &palloc && (pp->freecount % 100) == 0 && pp->freecount != last) - DBG("newpage: %uld free 4K pages\n", palloc.freecount); - if(pp == &bigpalloc && pp->freecount <= 5 && pp->freecount != last) - DBG("newpage: %uld free 2M pages\n", bigpalloc.freecount); - last = pp->freecount; + lock(&pga); + for(;;){ + if(pa->freecount > 1) + break; + unlock(&pga); - for(;;){ - if(pp == &palloc && pp->freecount < Min4kpages) - splitbigpage(); - if(pp->freecount > 1) - break; - - unlock(&palloc); dontalloc = 0; if(s && *s) { qunlock(&((*s)->lk)); *s = 0; dontalloc = 1; } - kickpager(); + + /* + * Tries 3) flusing images if size is <= 2M, + * 4) releasing bigger pages, and 5) releasing smaller pages. + * in that order. 
+ */ + kickpager(si, color); /* * If called from fault and we lost the segment from @@ -324,17 +238,17 @@ if(dontalloc) return 0; - lock(&palloc); + lock(&pga); } /* First try for our colour */ - for(p = pp->head; p; p = p->next) + for(p = pa->head; p; p = p->next) if(p->color == color) break; ct = PG_NOFLUSH; if(p == 0) { - p = pp->head; + p = pa->head; p->color = color; ct = PG_NEWCOL; } @@ -349,35 +263,36 @@ p->ref++; p->va = va; p->modref = 0; - for(i = 0; i < MAXMACH; i++) + for(i = 0; i < MACHMAX; i++) p->cachectl[i] = ct; unlock(p); - unlock(&palloc); + unlock(&pga); if(clear) { k = kmap(p); - memset((void*)VA(k), 0, 1<lgsize); + memset((void*)VA(k), 0, m->pgsz[p->pgszi]); kunmap(k); } + DBG("newpage: va %#p pa %#ullx pgsz %#ux\n", + p->va, p->pa, m->pgsz[p->pgszi]); return p; } -int -ispages(void *) -{ - return bigpalloc.freecount > 0; -} - +/* + * Caching/free policy imlemented in putpage. + * Make sure elsewhere that pg->pa low bits are not + * set e.g. with attribute bits. + * + * TODO: change mmuput to take pa from page argument. + */ void putpage(Page *p) { - if(onswap(p)) { - putswap(p); - return; - } + Pgsza *pa; + int rlse; - lock(&palloc); + lock(&pga); lock(p); if(p->ref == 0) @@ -385,42 +300,59 @@ if(--p->ref > 0) { unlock(p); - unlock(&palloc); + unlock(&pga); return; } - - if(p->image && p->image != &swapimage) + rlse = 0; + if(p->image != nil) pagechaintail(p); - else - pagechainhead(p); - - if(palloc.r.p != 0) - wakeup(&palloc.r); - + else{ + /* + * Free pages if we have plenty in the free list. + */ + pa = &pga.pgsza[p->pgszi]; + if(pa->freecount > Nfreepgs) + rlse = 1; + else + pagechainhead(p); + } + if(pga.r.p != nil) + wakeup(&pga.r); unlock(p); - unlock(&palloc); + if(rlse) + pgfree(p); + unlock(&pga); } +/* + * Get an auxiliary page. + * Don't do so if less than Nminfree pages. + * Only used by cache. + * The interface must specify page size. + */ Page* -auxpage(void) +auxpage(usize size) { Page *p; + Pgsza *pa; + int si; - lock(&palloc); - p = palloc.head; - if(palloc.freecount < swapalloc.highwater) { - unlock(&palloc); - return 0; + si = getpgszi(size); + lock(&pga); + pa = &pga.pgsza[si]; + p = pa->head; + if(pa->freecount < Nminfree){ + unlock(&pga); + return nil; } pageunchain(p); - lock(p); if(p->ref != 0) panic("auxpage"); p->ref++; uncachepage(p); unlock(p); - unlock(&palloc); + unlock(&pga); return p; } @@ -430,7 +362,7 @@ int duppage(Page *p) /* Always call with p locked */ { - Palloc *pp; + Pgsza *pa; Page *np; int color; int retries; @@ -454,11 +386,11 @@ /* * normal lock ordering is to call - * lock(&palloc) before lock(p). + * lock(&pga) before lock(p). * To avoid deadlock, we have to drop * our locks and try again. */ - if(!canlock(&palloc)){ + if(!canlock(&pga)){ unlock(p); if(up) sched(); @@ -466,42 +398,31 @@ goto retry; } - pp = getalloc(p); + pa = &pga.pgsza[p->pgszi]; /* No freelist cache when memory is very low */ - if(pp->freecount < swapalloc.highwater) { - unlock(&palloc); + if(pa->freecount < Nminfree){ + unlock(&pga); uncachepage(p); return 1; } - color = getpgcolor(p->va); - for(np = pp->head; np; np = np->next) + color = p->color; + for(np = pa->head; np; np = np->next) if(np->color == color) break; /* No page of the correct color */ - if(np == 0) { - unlock(&palloc); + if(np == 0){ + unlock(&pga); uncachepage(p); return 1; } pageunchain(np); - pagechaintail(np); + /* don't pagechaintail(np) here; see below */ -/* -* XXX - here's a bug? - np is on the freelist but it's not really free. 
-* when we unlock palloc someone else can come in, decide to -* use np, and then try to lock it. they succeed after we've -* run copypage and cachepage and unlock(np). then what? -* they call pageunchain before locking(np), so it's removed -* from the freelist, but still in the cache because of -* cachepage below. if someone else looks in the cache -* before they remove it, the page will have a nonzero ref -* once they finally lock(np). -*/ lock(np); - unlock(&palloc); + unlock(&pga); /* Cache the new version */ uncachepage(np); @@ -512,6 +433,22 @@ unlock(np); uncachepage(p); + /* + * This is here to prevent a bug(?) + * np is on the freelist but it's not really free. + * when we unlock palloc someone else can come in, decide to + * use np, and then try to lock it. they succeed after we've + * run copypage and cachepage and unlock(np). then what? + * they call pageunchain before locking(np), so it's removed + * from the freelist, but still in the cache because of + * cachepage below. if someone else looks in the cache + * before they remove it, the page will have a nonzero ref + * once they finally lock(np). + * Because np was not chained until now, nobody could see it. + */ + lock(&pga); + pagechaintail(np); + unlock(&pga); return 0; } @@ -520,11 +457,11 @@ { KMap *ks, *kd; - if(f->lgsize != t->lgsize) + if(f->pgszi != t->pgszi || t->pgszi < 0) panic("copypage"); ks = kmap(f); kd = kmap(t); - memmove((void*)VA(kd), (void*)VA(ks), 1<lgsize); + memmove((void*)VA(kd), (void*)VA(ks), m->pgsz[t->pgszi]); kunmap(ks); kunmap(kd); } @@ -537,16 +474,16 @@ if(p->image == 0) return; - lock(&palloc.hashlock); + lock(&pga.hashlock); l = &pghash(p->daddr); - for(f = *l; f; f = f->hash) { - if(f == p) { + for(f = *l; f; f = f->hash){ + if(f == p){ *l = p->hash; break; } l = &f->hash; } - unlock(&palloc.hashlock); + unlock(&pga.hashlock); putimage(p->image); p->image = 0; p->daddr = 0; @@ -566,12 +503,12 @@ panic("cachepage"); incref(i); - lock(&palloc.hashlock); + lock(&pga.hashlock); p->image = i; l = &pghash(p->daddr); p->hash = *l; *l = p; - unlock(&palloc.hashlock); + unlock(&pga.hashlock); } void @@ -579,15 +516,15 @@ { Page *f, **l; - lock(&palloc.hashlock); + lock(&pga.hashlock); l = &pghash(daddr); - for(f = *l; f; f = f->hash) { - if(f->image == i && f->daddr == daddr) { + for(f = *l; f; f = f->hash){ + if(f->image == i && f->daddr == daddr){ lock(f); if(f->image == i && f->daddr == daddr){ *l = f->hash; putimage(f->image); - f->image = 0; + f->image = nil; f->daddr = 0; } unlock(f); @@ -595,7 +532,7 @@ } l = &f->hash; } - unlock(&palloc.hashlock); + unlock(&pga.hashlock); } Page * @@ -603,38 +540,43 @@ { Page *f; - lock(&palloc.hashlock); - for(f = pghash(daddr); f; f = f->hash) { - if(f->image == i && f->daddr == daddr) { - unlock(&palloc.hashlock); + lock(&pga.hashlock); + for(f = pghash(daddr); f; f = f->hash){ + if(f->image == i && f->daddr == daddr){ + unlock(&pga.hashlock); - lock(&palloc); + lock(&pga); lock(f); - if(f->image != i || f->daddr != daddr) { + if(f->image != i || f->daddr != daddr){ unlock(f); - unlock(&palloc); + unlock(&pga); return 0; } if(++f->ref == 1) pageunchain(f); - unlock(&palloc); + unlock(&pga); unlock(f); return f; } } - unlock(&palloc.hashlock); + unlock(&pga.hashlock); - return 0; + return nil; } +/* + * Called from imagereclaim, to try to release Images. + * The argument shows the preferred image to release pages from. + * All images will be tried, from lru to mru. 
+ */ uvlong -pagereclaim(int npages) +pagereclaim(Image *i) { Page *p; uvlong ticks; - lock(&palloc); + lock(&pga); ticks = fastticks(nil); /* @@ -642,35 +584,34 @@ * end of the list (see putpage) so start there and work * backward. */ - for(p = palloc.tail; p && p->image && npages > 0; p = p->prev) { - if(p->ref == 0 && canlock(p)) { + for(p = pga.pgsza[0].tail; p && p->image == i; p = p->prev){ + if(p->ref == 0 && canlock(p)){ if(p->ref == 0) { - npages--; uncachepage(p); } unlock(p); } } ticks = fastticks(nil) - ticks; - unlock(&palloc); + unlock(&pga); return ticks; } Pte* -ptecpy(Pte *old) +ptecpy(Segment *s, Pte *old) { Pte *new; Page **src, **dst; - new = ptealloc(); + new = ptealloc(s); dst = &new->pages[old->first-old->pages]; new->first = dst; for(src = old->first; src <= old->last; src++, dst++) - if(*src) { + if(*src){ if(onswap(*src)) - dupswap(*src); - else { + panic("ptecpy: no swap"); + else{ lock(*src); (*src)->ref++; unlock(*src); @@ -683,12 +624,12 @@ } Pte* -ptealloc(void) +ptealloc(Segment *s) { Pte *new; - new = smalloc(sizeof(Pte)); - new->first = &new->pages[PTEPERTAB]; + new = smalloc(sizeof(Pte) + sizeof(Page*)*s->ptepertab); + new->first = &new->pages[s->ptepertab]; new->last = new->pages; return new; } @@ -703,7 +644,7 @@ switch(s->type&SG_TYPE) { case SG_PHYSICAL: fn = s->pseg->pgfree; - ptop = &p->pages[PTEPERTAB]; + ptop = &p->pages[s->ptepertab]; if(fn) { for(pg = p->pages; pg < ptop; pg++) { if(*pg == 0) diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/pager.c --- a/sys/src/nix/port/pager.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/pager.c Fri Sep 09 16:49:47 2011 +0200 @@ -5,18 +5,28 @@ #include "fns.h" #include "../port/error.h" +/* + * There's no pager process here. + * One process waiting for memory becomes the pager, + * during the call to kickpager() + */ + enum { Minpages = 2 }; -Image swapimage; -QLock pagerlck; +static QLock pagerlck; +static struct +{ + ulong ntext; + ulong nbig; + ulong nall; +} pstats; void swapinit(void) { - swapimage.notext = 1; } void @@ -125,29 +135,25 @@ return n; } -/* - * Someone thinks memory is needed. - * Try to page out some text pages and keep on doing so - * while needpages() and we could do something about it. - * We ignore prepaged processes in a first pass. - */ -void -kickpager(void) +static void +pageouttext(int pgszi, int color) { + Proc *p; + Pgsza *pa; int i, n, np, x; Segment *s; int prepaged; - static int nwant; - DBG("kickpager() %#p\n", up); - if(waserror()) - panic("error in kickpager"); - qlock(&pagerlck); - if(bigpalloc.freecount > Minpages) - goto Done; + USED(color); + pa = &pga.pgsza[pgszi]; n = x = 0; prepaged = 0; + + /* + * Try first to steal text pages from non-prepaged processes, + * then from anyone. 
+ */ Again: do{ if((p = psincref(x)) == nil) @@ -171,20 +177,131 @@ if(np > 0) DBG("pager: %d from proc #%d %#p\n", np, x, p); x++; - }while(bigpalloc.freecount < Minpages); - if(bigpalloc.freecount < Minpages){ - if(prepaged++ == 0) - goto Again; - panic("no physical memory"); + }while(pa->freecount < Minpages); + + if(pa->freecount < Minpages && prepaged++ == 0) + goto Again; +} + +static void +freepages(int si, int once) +{ + Pgsza *pa; + Page *p; + + for(; si < m->npgsz; si++){ + pa = &pga.pgsza[si]; + if(pa->freecount > 0){ + DBG("kickpager() up %#p: releasing %udK pages\n", + up, m->pgsz[si]/KiB); + lock(&pga); + if(pa->freecount == 0){ + unlock(&pga); + continue; + } + p = pa->head; + pageunchain(p); + unlock(&pga); + if(p->ref != 0) + panic("freepages pa %#ullx", p->pa); + pgfree(p); + if(once) + break; + } } +} + +static int +tryalloc(int pgszi, int color) +{ + Page *p; + + p = pgalloc(m->pgsz[pgszi], color); + if(p != nil){ + lock(&pga); + pagechainhead(p); + unlock(&pga); + return 0; + } + return -1; +} + +/* + * Someone thinks pages of size m->pgsz[pgszi] are needed + * and is trying to make them available. + * Many processes may be calling this at the same time, + * in which case they will enter one by one. Only when more than + * Minpages are available they will simply return. + */ +void +kickpager(int pgszi, int color) +{ + Pgsza *pa; + + if(DBGFLG>1) + DBG("kickpager() %#p\n", up); + if(waserror()) + panic("error in kickpager"); + qlock(&pagerlck); + pa = &pga.pgsza[pgszi]; + + /* + * First try allocating from physical memory. + */ + tryalloc(pgszi, color); + if(pa->freecount > Minpages) + goto Done; + + /* + * If pgszi is <= page size for text (assumed to be 2M) + * try to release text pages. + */ + if(m->pgsz[pgszi] <= 2*MiB){ + pstats.ntext++; + DBG("kickpager() up %#p: reclaiming text pages\n", up); + pageouttext(pgszi, color); + tryalloc(pgszi, color); + if(pa->freecount > Minpages){ + DBG("kickpager() found %uld free\n", pa->freecount); + goto Done; + } + } + + /* + * Try releasing memory from one bigger page, perhaps from text + * pages released in the previous step. + */ + pstats.nbig++; + freepages(pgszi+1, 1); + while(tryalloc(pgszi, color) != -1 && pa->freecount < Minpages) + ; + if(pa->freecount > 1){ + DBG("kickpager() found %uld free\n", pa->freecount); + goto Done; + } + /* + * Try releasing memory from all pages. 
+ */ + pstats.nall++; + DBG("kickpager() up %#p: releasing all pages\n", up); + freepages(0, 0); + tryalloc(pgszi, color); + if(pa->freecount > 1){ + DBG("kickpager() found %uld free\n", pa->freecount); + goto Done; + } + panic("kickpager(): no physical memory"); Done: + poperror(); qunlock(&pagerlck); - DBG("kickpager() done %#p\n", up); + if(DBGFLG>1) + DBG("kickpager() done %#p\n", up); } void pagersummary(void) { + print("ntext %uld nbig %uld nall %uld\n", + pstats.ntext, pstats.nbig, pstats.nall); print("no swap\n"); } - diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/portclock.c --- a/sys/src/nix/port/portclock.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/portclock.c Fri Sep 09 16:49:47 2011 +0200 @@ -12,10 +12,10 @@ Timer *head; }; -static Timers timers[MAXMACH]; +static Timers timers[MACHMAX]; -ulong intrcount[MAXMACH]; -ulong fcallcount[MAXMACH]; +ulong intrcount[MACHMAX]; +ulong fcallcount[MACHMAX]; static vlong tadd(Timers *tt, Timer *nt) diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/portdat.h --- a/sys/src/nix/port/portdat.h Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/portdat.h Fri Sep 09 16:49:47 2011 +0200 @@ -16,21 +16,22 @@ typedef struct Kzio Kzio; typedef struct Log Log; typedef struct Logflag Logflag; +typedef struct Mhead Mhead; +typedef struct Mnt Mnt; typedef struct Mntcache Mntcache; -typedef struct Mount Mount; typedef struct Mntrpc Mntrpc; typedef struct Mntwalk Mntwalk; -typedef struct Mnt Mnt; -typedef struct Mhead Mhead; +typedef struct Mount Mount; typedef struct Note Note; typedef struct Page Page; +typedef struct Pallocmem Pallocmem; typedef struct Path Path; -typedef struct Palloc Palloc; -typedef struct Pallocmem Pallocmem; typedef struct Perf Perf; +typedef struct Pgalloc Pgalloc; +typedef struct Pgrp Pgrp; +typedef struct Pgsza Pgsza; +typedef struct Physseg Physseg; typedef struct PhysUart PhysUart; -typedef struct Pgrp Pgrp; -typedef struct Physseg Physseg; typedef struct Proc Proc; typedef struct Procalloc Procalloc; typedef struct Pte Pte; @@ -42,9 +43,9 @@ typedef struct RWlock RWlock; typedef struct Schedq Schedq; typedef struct Segment Segment; -typedef struct Sems Sems; typedef struct Sem Sem; typedef struct Sema Sema; +typedef struct Sems Sems; typedef struct Timer Timer; typedef struct Timers Timers; typedef struct Uart Uart; @@ -63,9 +64,6 @@ #include -#ifndef physaddr -#define physaddr uintptr -#endif struct Ref { Lock; @@ -198,7 +196,6 @@ int malen; /* allocated length of mtpt */ }; - struct Dev { int dc; @@ -317,18 +314,18 @@ struct Page { Lock; - physaddr pa; /* Physical address in memory */ + uintmem pa; /* Physical address in memory */ uintptr va; /* Virtual address for user */ ulong daddr; /* Disc address on swap */ int ref; /* Reference count */ uchar modref; /* Simulated modify/reference bits */ uchar color; /* Cache coloring */ - uchar lgsize; /* log2(page size) */ - char cachectl[MAXMACH]; /* Cache flushing control for mmuput */ + char cachectl[MACHMAX]; /* Cache flushing control for mmuput */ Image *image; /* Associated text or swap image */ Page *next; /* Lru free list */ Page *prev; Page *hash; /* Image hash chains */ + int pgszi; /* size index in m->pgsz[] */ }; struct Swapalloc @@ -355,15 +352,32 @@ //subtype Segment *s; /* TEXT segment for image if running */ Image *hash; /* Qid hash chains */ - Image *next; /* Free list */ + Image *next; /* Free list or lru list */ + Image *prev; /* lru list */ int notext; /* no file associated */ }; +/* + * virtual MMU + */ +#define PTEMAPMEM (1ULL*GiB) 
+#define SEGMAPSIZE 1984 +#define SSEGMAPSIZE 16 /* XXX: shouldn't be 32 at least? */ + +/* + * Interface between fixfault and mmuput. + */ +#define PTEVALID (1<<0) +#define PTEWRITE (1<<1) +#define PTERONLY (0<<1) +#define PTEUSER (1<<2) +#define PTEUNCACHED (1<<4) + struct Pte { - Page *pages[PTEPERTAB]; /* Page map for this chunk of pte */ Page **first; /* First used entry */ - Page **last; /* Last used entry */ + Page **last; /* Last used entry */ + Page *pages[]; /* Page map for this chunk of pte */ }; /* Segment types */ @@ -395,9 +409,9 @@ { ulong attr; /* Segment attributes */ char *name; /* Attach name */ - physaddr pa; /* Physical address */ + uintmem pa; /* Physical address */ usize size; /* Maximum segment size in pages */ - uchar lgpgsize; /* log2(size of pages in segment) */ + int pgszi; /* Page size index in Mach */ Page *(*pgalloc)(Segment*, uintptr); /* Allocation if we need it */ void (*pgfree)(Page*); uintptr gva; /* optional global virtual address */ @@ -439,13 +453,17 @@ Rendez rr; /* process waiting to read free addresses */ }; +#define NOCOLOR ((uchar)~0) + struct Segment { Ref; QLock lk; ushort steal; /* Page stealer lock */ ushort type; /* segment type */ - uchar lgpgsize; /* log2(size of pages in segment) */ + int pgszi; /* page size index in Mach MMMU */ + uint ptepertab; + uchar color; uintptr base; /* virtual base */ uintptr top; /* virtual top */ usize size; /* size in pages */ @@ -538,24 +556,27 @@ struct Pallocmem { - physaddr base; + uintmem base; ulong npage; - ulong nbigpage; }; -struct Palloc +struct Pgsza +{ + ulong freecount; /* how many pages in the free list? */ + Ref npages; /* how many pages of this size? */ + Page *head; /* MRU */ + Page *tail; /* LRU */ +}; + +struct Pgalloc { Lock; - Pallocmem mem[8]; - Page *head; /* most recently used */ - Page *tail; /* least recently used */ - ulong freecount; /* how many pages on free list now */ - Page *pages; /* array of all pages */ - ulong user; /* how many user pages */ - Page *hash[PGHSIZE]; + int userinit; /* working in user init mode */ + Pgsza pgsza[NPGSZ]; /* allocs for m->npgsz page sizes */ + Page* hash[PGHSIZE]; /* only used for user pages */ Lock hashlock; - Rendez r; /* Sleep for free mem */ - QLock pwait; /* Queue of procs waiting for memory */ + Rendez r; /* sleep for free mem */ + QLock pwait; /* queue of procs waiting for this pgsz */ }; struct Waitq @@ -609,10 +630,11 @@ /* * process memory segments - NSEG always last ! + * HSEG is a potentially huge bss segment. */ enum { - SSEG, TSEG, DSEG, BSEG, ESEG, LSEG, SEG1, SEG2, SEG3, SEG4, NSEG + SSEG, TSEG, DSEG, BSEG, HSEG, ESEG, LSEG, SEG1, SEG2, SEG3, SEG4, NSEG }; enum @@ -832,6 +854,13 @@ int nqsyscall; /* # of syscalls in the last quantum */ int nfullq; + /* might want a struct someday but this is good for now. + * if that day comes, better use a pointer to a Linux struct, so + * we don't pay the price for all processes. + */ + int linux; /* bit 0 is "linux emulation". 
Others debug */ + int linuxexec; /* Plan 9 process starting a Linux process */ + /* * machine specific fpu, mmu and notify */ @@ -870,12 +899,11 @@ extern Ref noteidalloc; extern int nphysseg; extern int nsyscall; -extern Palloc palloc, bigpalloc; +extern Pgalloc pga; extern Physseg physseg[]; extern Procalloc procalloc; extern uint qiomaxatomic; extern char* statename[]; -extern Image swapimage; extern char* sysname; extern struct { char* n; @@ -1067,13 +1095,11 @@ vlong offset; }; - - #define DEVDOTDOT -1 #pragma varargck type "I" uchar* #pragma varargck type "V" uchar* #pragma varargck type "E" uchar* #pragma varargck type "M" uchar* +#pragma varargck type "W" u64int #pragma varargck type "Z" Kzio* -#pragma varargck type "m" Mreg diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/portfns.h --- a/sys/src/nix/port/portfns.h Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/portfns.h Fri Sep 09 16:49:47 2011 +0200 @@ -1,387 +1,394 @@ +void _assert(char*); +void accounttime(void); +void acsched(void); +void addbootfile(char*, uchar*, ulong); +Timer* addclock0link(void (*)(void), int); +int addconsdev(Queue*, void (*fn)(char*,int), int, int); +int addkbdq(Queue*, int); +int addphysseg(Physseg*); +void addwatchdog(Watchdog*); +int adec(int*); +Block* adjustblock(Block*, int); +int ainc(int*); +void alarmkproc(void*); +Block* allocb(int); +void* alloczio(Segment*, long); +int anyhigher(void); +int anyready(void); +Image* attachimage(int, Chan*, uintptr, usize); +Page* auxpage(usize); +Block* bl2mem(uchar*, Block*, int); +int blocklen(Block*); +void bootlinks(void); +void cachedel(Image*, ulong); +void cachepage(Page*, Image*); +void callwithureg(void (*)(Ureg*)); +int canlock(Lock*); +int canpage(Proc*); +int canqlock(QLock*); +int canrlock(RWlock*); +Chan* cclone(Chan*); +void cclose(Chan*); +void ccloseq(Chan*); +void chanfree(Chan*); +char* chanpath(Chan*); +void checkalarms(void); +void checkb(Block*, char*); +void closeegrp(Egrp*); +void closefgrp(Fgrp*); +void closepgrp(Pgrp*); +void closergrp(Rgrp*); +void cmderror(Cmdbuf*, char*); +int cmount(Chan**, Chan*, int, char*); +Block* concatblock(Block*); +void confinit(void); +int consactive(void); +void (*consdebug)(void); +void (*consputs)(char*, int); +Block* copyblock(Block*, int); +void copypage(Page*, Page*); +void cunmount(Chan*, Chan*); +Segment* data2txt(Segment*); +uintptr dbgpc(Proc*); +int decrypt(void*, void*, int); +void delay(int); +void delconsdevs(void); +Proc* dequeueproc(Schedq*, Proc*); +Chan* devattach(int, char*); +Block* devbread(Chan*, long, vlong); +long devbwrite(Chan*, Block*, vlong); +Chan* devclone(Chan*); +int devconfig(int, char *, DevConf *); +void devcreate(Chan*, char*, int, int); +void devdir(Chan*, Qid, char*, vlong, char*, long, Dir*); +long devdirread(Chan*, char*, long, Dirtab*, int, Devgen*); +Devgen devgen; +void devinit(void); +Chan* devopen(Chan*, int, Dirtab*, int, Devgen*); +void devpermcheck(char*, int, int); +void devpower(int); +void devremove(Chan*); +void devreset(void); +void devshutdown(void); +long devstat(Chan*, uchar*, long, Dirtab*, int, Devgen*); +Dev* devtabget(int, int); +void devtabinit(void); +long devtabread(Chan*, void*, long, vlong); +void devtabreset(void); +void devtabshutdown(void); +Walkqid* devwalk(Chan*, Chan*, char**, int, Dirtab*, int, Devgen*); +long devwstat(Chan*, uchar*, long); +int devzread(Chan*, Kzio*, int, usize, vlong); +int devzwrite(Chan*, Kzio*, int, vlong); +void drawactive(int); +void drawcmap(void); +void dumpaproc(Proc*); +void dumpregs(Ureg*); 
+void dumpstack(void); +void dumpzseg(Segment*); +Fgrp* dupfgrp(Fgrp*); +int duppage(Page*); +Segment* dupseg(Segment**, int, int); +void dupswap(Page*); +char* edfadmit(Proc*); +void edfinit(Proc*); +int edfready(Proc*); +void edfrecord(Proc*); +void edfrun(Proc*, int); +void edfstop(Proc*); +void edfyield(void); +int emptystr(char*); +int encrypt(void*, void*, int); +void envcpy(Egrp*, Egrp*); +int eqchanddq(Chan*, int, uint, Qid, int); +int eqqid(Qid, Qid); +void error(char*); +void exhausted(char*); +void exit(int); +uvlong fastticks(uvlong*); +uvlong fastticks2ns(uvlong); +uvlong fastticks2us(uvlong); +int fault(uintptr, int); +void fdclose(int, int); +Chan* fdtochan(int, int, int, int); +int findmount(Chan**, Mhead**, int, uint, Qid); +int fixfault(Segment*, uintptr, int, int); +void fmtinit(void); +void forceclosefgrp(void); +void free(void*); +void freeb(Block*); +void freeblist(Block*); +int freebroken(void); +void freepte(Segment*, Pte*); +void getcolor(ulong, ulong*, ulong*, ulong*); +char* getconfenv(void); +int getpgszi(ulong); +Segment* getzkseg(void); +void gotolabel(Label*); +int haswaitq(void*); +void hnputl(void*, uint); +void hnputs(void*, ushort); +void hnputv(void*, uvlong); +long hostdomainwrite(char*, long); +long hostownerwrite(char*, long); +void hzsched(void); +Block* iallocb(int); +void iallocsummary(void); +void ilock(Lock*); +void initimage(void); +int iprint(char*, ...); +void isdir(Chan*); +int iseve(void); +int islo(void); +Segment* isoverlap(Proc*, uintptr, usize); +int isphysseg(char*); +void iunlock(Lock*); +void ixsummary(void); +int kbdcr2nl(Queue*, int); +int kbdgetmap(int, int*, int*, Rune*); +int kbdputc(Queue*, int); +void kbdputmap(ushort, ushort, Rune); +void kickpager(int, int); +void killbig(char*); +void kproc(char*, void(*)(void*), void*); +void kprocchild(Proc*, void (*)(void*), void*); +void (*kproftimer)(uintptr); +void ksetenv(char*, char*, int); +void kstrcpy(char*, char*, int); +void kstrdup(char**, char*); +long latin1(Rune*, int); +int lock(Lock*); +void log(Log*, int, char*, ...); +void logclose(Log*); +char* logctl(Log*, int, char**, Logflag*); +void logn(Log*, int, void*, int); +void logopen(Log*); +long logread(Log*, void*, ulong, long); +Page* lookpage(Image*, ulong); +Cmdtab* lookupcmd(Cmdbuf*, Cmdtab*, int); +void mallocinit(void); +long mallocreadsummary(Chan*, void*, long, long); +void mallocsummary(void); +Block* mem2bl(uchar*, int); +void (*mfcinit)(void); +void (*mfcopen)(Chan*); +int (*mfcread)(Chan*, uchar*, int, vlong); +void (*mfcupdate)(Chan*, uchar*, int, vlong); +void (*mfcwrite)(Chan*, uchar*, int, vlong); +void mfreeseg(Segment*, uintptr, int); +void microdelay(int); +uvlong mk64fract(uvlong, uvlong); +void mkqid(Qid*, vlong, ulong, int); +void mmuflush(void); +void mmuput(uintptr, uintmem, uint, Page*); +void mmurelease(Proc*); +void mmuswitch(Proc*); +Chan* mntauth(Chan*, char*); +usize mntversion(Chan*, u32int, char*, usize); +void mountfree(Mount*); +uvlong ms2fastticks(ulong); +#define MS2NS(n) (((vlong)(n))*1000000LL) +ulong ms2tk(ulong); +void mul64fract(uvlong*, uvlong, uvlong); +void muxclose(Mnt*); +Chan* namec(char*, int, int, int); +void nameerror(char*, char*); +Chan* newchan(void); +int newfd(Chan*); +Mhead* newmhead(Chan*); +Mount* newmount(Mhead*, Chan*, int, char*); +Page* newpage(int, Segment **, uintptr, usize); +Path* newpath(char*); +Pgrp* newpgrp(void); +Proc* newproc(void); +Rgrp* newrgrp(void); +Segment* newseg(int, uintptr, u64int); +void newzmap(Segment*); +void nexterror(void); +uint 
nhgetl(void*); +ushort nhgets(void*); +uvlong nhgetv(void*); +void nixprepage(int); +int nrand(int); +uvlong ns2fastticks(uvlong); +int okaddr(uintptr, long, int); +int openmode(int); +Block* packblock(Block*); +Block* padblock(Block*, int); +void pagechainhead(Page*); +void pageinit(void); +ulong pagenumber(Page*); +uvlong pagereclaim(Image*); +void pagersummary(void); +void pageunchain(Page*); +void panic(char*, ...); +Cmdbuf* parsecmd(char *a, int n); +void pathclose(Path*); +ulong perfticks(void); +void pexit(char*, int); +Page* pgalloc(usize, int); +void pgfree(Page*); +void pgrpcpy(Pgrp*, Pgrp*); +void pgrpnote(ulong, char*, long, int); +uintmem physalloc(u64int, int*); +void physdump(void); +void physfree(uintmem, u64int); +void physinit(uintmem, u64int); +void pio(Segment*, uintptr, ulong, Page**); +#define poperror() up->nerrlab-- +int postnote(Proc*, int, char*, int); +int pprint(char*, ...); +int preempted(void); +void prflush(void); +void printinit(void); +ulong procalarm(ulong); +void procctl(Proc*); +void procdump(void); +int procfdprint(Chan*, int, int, char*, int); +void procflushseg(Segment*); +void procpriority(Proc*, int, int); +void procrestore(Proc*); +void procsave(Proc*); +void (*proctrace)(Proc*, int, vlong); +void procwired(Proc*, int); +void psdecref(Proc*); +Proc* psincref(int); +int psindex(int); +void psinit(int); +Pte* ptealloc(Segment*); +Pte* ptecpy(Segment*,Pte*); +int pullblock(Block**, int); +Block* pullupblock(Block*, int); +Block* pullupqueue(Queue*, int); +void putimage(Image*); +void putmhead(Mhead*); +void putpage(Page*); +void putseg(Segment*); +void putstrn(char*, int); +void putswap(Page*); +int pwait(Waitmsg*); +void qaddlist(Queue*, Block*); +Block* qbread(Queue*, int); +long qbwrite(Queue*, Block*); +Queue* qbypass(void (*)(void*, Block*), void*); +int qcanread(Queue*); +void qclose(Queue*); +int qconsume(Queue*, void*, int); +Block* qcopy(Queue*, int, ulong); +int qdiscard(Queue*, int); +void qflush(Queue*); +void qfree(Queue*); +int qfull(Queue*); +Block* qget(Queue*); +void qhangup(Queue*, char*); +int qisclosed(Queue*); +int qiwrite(Queue*, void*, int); +int qlen(Queue*); +void qlock(QLock*); +void qnoblock(Queue*, int); +Queue* qopen(int, int, void (*)(void*), void*); +int qpass(Queue*, Block*); +int qpassnolim(Queue*, Block*); +int qproduce(Queue*, void*, int); +void qputback(Queue*, Block*); +long qread(Queue*, void*, int); +Block* qremove(Queue*); +void qreopen(Queue*); +void qsetlimit(Queue*, int); +void qunlock(QLock*); +int qwindow(Queue*); +int qwrite(Queue*, void*, int); +int rand(void); +void randominit(void); +ulong randomread(void*, ulong); +void rdb(void); +int readnum(ulong, char*, ulong, ulong, int); +long readstr(long, char*, long, char*); +void ready(Proc*); +long readzio(Kzio[], int, void*, long); +void reboot(void*, void*, long); +void rebootcmd(int, char**); +void relocateseg(Segment*, uintptr); +void renameuser(char*, char*); +void resched(char*); +void resrcwait(char*); +int return0(void*); +void rlock(RWlock*); +long rtctime(void); +int runac(Mach *m, void(*func)(void), int flushtlb, void *a, long n); +void runlock(RWlock*); +Proc* runproc(void); +void sched(void); +void scheddump(void); +void schedinit(void); +long seconds(void); +Segment* seg(Proc*, uintptr, int); +void segclock(uintptr); +Sem* segmksem(Segment*, int*); +void segpage(Segment*, Page*); +uintmem segppn(Segment*, uintmem); +int semadec(int*); +int semainc(int*); +char* seprintpagestats(char*, char*); +char* seprintphysstats(char*, char*); +int 
setcolor(ulong, ulong, ulong, ulong); +void setkernur(Ureg*, Proc*); +int setlabel(Label*); +void setregisters(Ureg*, char*, char*, int); +char* skipslash(char*); +void sleep(Rendez*, int (*)(void*), void*); +void* smalloc(ulong); +char* srvname(Chan*); +void stopnixproc(void); +int swapcount(ulong); +void swapinit(void); +void syscallfmt(int, va_list list); +void sysretfmt(int, va_list, Ar0*, uvlong, uvlong); +void sysrforkchild(Proc*, Proc*); +void timeradd(Timer*); +void timerdel(Timer*); +void timerintr(Ureg*, vlong); +void timerset(uvlong); +void timersinit(void); +ulong tk2ms(ulong); +#define TK2MS(x) ((x)*(1000/HZ)) +uvlong tod2fastticks(vlong); +vlong todget(vlong*); +void todinit(void); +void todset(vlong, vlong, int); +void todsetfreq(vlong); +Block* trimblock(Block*, int, int); +void tsleep(Rendez*, int (*)(void*), void*, long); +Uart* uartconsole(int, char*); +int uartctl(Uart*, char*); +int uartgetc(void); +void uartkick(void*); +void uartputc(int); +void uartputs(char*, int); +void uartrecv(Uart*, char); +int uartstageoutput(Uart*); +void unbreak(Proc*); +void uncachepage(Page*); +void unlock(Lock*); +void userinit(void); +uintptr userpc(Ureg*); +long userwrite(char*, long); +void* validaddr(void*, long, int); +void validname(char*, int); +char* validnamedup(char*, int); +void validstat(uchar*, usize); +void* vmemchr(void*, int, int); +Proc* wakeup(Rendez*); +int walk(Chan**, char**, int, int, int*); +void wlock(RWlock*); +void wunlock(RWlock*); +void yield(void); +uintptr zgetaddr(Segment*); +void zgrow(Segment*); +int ziofmt(Fmt*); +int zputaddr(Segment*, uintptr); +ulong µs(void); -void accounttime(void); -void acsched(void); -void addbootfile(char*, uchar*, ulong); -Timer* addclock0link(void (*)(void), int); -int addconsdev(Queue*, void (*fn)(char*,int), int, int); -int addkbdq(Queue*, int); -int addphysseg(Physseg*); -void addwatchdog(Watchdog*); -int adec(int*); -Block* adjustblock(Block*, int); -int ainc(int*); -void alarmkproc(void*); -Block* allocb(int); -void* alloczio(Segment*, long); -int anyhigher(void); -int anyready(void); -Image* attachimage(int, Chan*, uintptr, usize); -Page* auxpage(void); -Block* bl2mem(uchar*, Block*, int); -int blocklen(Block*); -void bootlinks(void); -void cachedel(Image*, ulong); -void cachepage(Page*, Image*); -void callwithureg(void (*)(Ureg*)); -int canlock(Lock*); -int canpage(Proc*); -int canqlock(QLock*); -int canrlock(RWlock*); -Chan* cclone(Chan*); -void cclose(Chan*); -void ccloseq(Chan*); -void chanfree(Chan*); -char* chanpath(Chan*); -void checkalarms(void); -void checkb(Block*, char*); -void closeegrp(Egrp*); -void closefgrp(Fgrp*); -void closepgrp(Pgrp*); -void closergrp(Rgrp*); -void cmderror(Cmdbuf*, char*); -int cmount(Chan**, Chan*, int, char*); -Block* concatblock(Block*); -void confinit(void); -int consactive(void); -void (*consdebug)(void); -void (*consputs)(char*, int); -Block* copyblock(Block*, int); -void copypage(Page*, Page*); -void cunmount(Chan*, Chan*); -Segment* data2txt(Segment*); -uintptr dbgpc(Proc*); -int decrypt(void*, void*, int); -void delay(int); -void delconsdevs(void); -Proc* dequeueproc(Schedq*, Proc*); -Chan* devattach(int, char*); -Block* devbread(Chan*, long, vlong); -long devbwrite(Chan*, Block*, vlong); -Chan* devclone(Chan*); -int devconfig(int, char *, DevConf *); -void devcreate(Chan*, char*, int, int); -void devdir(Chan*, Qid, char*, vlong, char*, long, Dir*); -long devdirread(Chan*, char*, long, Dirtab*, int, Devgen*); -Devgen devgen; -void devinit(void); -Chan* devopen(Chan*, int, 
Dirtab*, int, Devgen*); -void devpermcheck(char*, int, int); -void devpower(int); -void devremove(Chan*); -void devreset(void); -void devshutdown(void); -long devstat(Chan*, uchar*, long, Dirtab*, int, Devgen*); -Dev* devtabget(int, int); -void devtabinit(void); -long devtabread(Chan*, void*, long, vlong); -void devtabreset(void); -void devtabshutdown(void); -Walkqid* devwalk(Chan*, Chan*, char**, int, Dirtab*, int, Devgen*); -long devwstat(Chan*, uchar*, long); -int devzread(Chan*, Kzio*, int, usize, vlong); -int devzwrite(Chan*, Kzio*, int, vlong); -void drawactive(int); -void drawcmap(void); -void dumpaproc(Proc*); -void dumpregs(Ureg*); -void dumpstack(void); -void dumpzseg(Segment*); -Fgrp* dupfgrp(Fgrp*); -int duppage(Page*); -Segment* dupseg(Segment**, int, int); -void dupswap(Page*); -char* edfadmit(Proc*); -void edfinit(Proc*); -int edfready(Proc*); -void edfrecord(Proc*); -void edfrun(Proc*, int); -void edfstop(Proc*); -void edfyield(void); -int emptystr(char*); -int encrypt(void*, void*, int); -void envcpy(Egrp*, Egrp*); -int eqchanddq(Chan*, int, uint, Qid, int); -int eqqid(Qid, Qid); -void error(char*); -void exhausted(char*); -void exit(int); -uvlong fastticks(uvlong*); -uvlong fastticks2ns(uvlong); -uvlong fastticks2us(uvlong); -int fault(uintptr, int); -void fdclose(int, int); -Chan* fdtochan(int, int, int, int); -int findmount(Chan**, Mhead**, int, uint, Qid); -int fixfault(Segment*, uintptr, int, int); -void fmtinit(void); -void forceclosefgrp(void); -void free(void*); -void freeb(Block*); -void freeblist(Block*); -int freebroken(void); -void freepte(Segment*, Pte*); -void getcolor(ulong, ulong*, ulong*, ulong*); -char* getconfenv(void); -Segment* getzkseg(void); -void gotolabel(Label*); -int haswaitq(void*); -void hnputl(void*, uint); -void hnputs(void*, ushort); -void hnputv(void*, uvlong); -long hostdomainwrite(char*, long); -long hostownerwrite(char*, long); -void hzsched(void); -Block* iallocb(int); -void iallocsummary(void); -void ilock(Lock*); -void initimage(void); -int iprint(char*, ...); -void isdir(Chan*); -int iseve(void); -int islo(void); -Segment* isoverlap(Proc*, uintptr, usize); -int ispages(void*); -int isphysseg(char*); -void iunlock(Lock*); -void ixsummary(void); -int kbdcr2nl(Queue*, int); -int kbdgetmap(int, int*, int*, Rune*); -int kbdputc(Queue*, int); -void kbdputmap(ushort, ushort, Rune); -void kickpager(void); -void killbig(char*); -void kproc(char*, void(*)(void*), void*); -void kprocchild(Proc*, void (*)(void*), void*); -void (*kproftimer)(uintptr); -void ksetenv(char*, char*, int); -void kstrcpy(char*, char*, int); -void kstrdup(char**, char*); -long latin1(Rune*, int); -int lock(Lock*); -void log(Log*, int, char*, ...); -void logclose(Log*); -char* logctl(Log*, int, char**, Logflag*); -void logn(Log*, int, void*, int); -void logopen(Log*); -long logread(Log*, void*, ulong, long); -Page* lookpage(Image*, ulong); -Cmdtab* lookupcmd(Cmdbuf*, Cmdtab*, int); -void mallocinit(void); -long mallocreadsummary(Chan*, void*, long, long); -void mallocsummary(void); -Block* mem2bl(uchar*, int); -void (*mfcinit)(void); -void (*mfcopen)(Chan*); -int (*mfcread)(Chan*, uchar*, int, vlong); -void (*mfcupdate)(Chan*, uchar*, int, vlong); -void (*mfcwrite)(Chan*, uchar*, int, vlong); -void mfreeseg(Segment*, uintptr, int); -void microdelay(int); -uvlong mk64fract(uvlong, uvlong); -void mkqid(Qid*, vlong, ulong, int); -void mmuflush(void); -void mmuput(uintptr, physaddr, Page*); -void mmurelease(Proc*); -void mmuswitch(Proc*); -Chan* mntauth(Chan*, char*); -usize 
mntversion(Chan*, u32int, char*, usize); -void mountfree(Mount*); -int mregfmt(Fmt*); -uvlong ms2fastticks(ulong); -#define MS2NS(n) (((vlong)(n))*1000000LL) -ulong ms2tk(ulong); -void mul64fract(uvlong*, uvlong, uvlong); -void muxclose(Mnt*); -Chan* namec(char*, int, int, int); -void nameerror(char*, char*); -Chan* newchan(void); -int newfd(Chan*); -Mhead* newmhead(Chan*); -Mount* newmount(Mhead*, Chan*, int, char*); -Page* newpage(int, Segment **, uintptr, uintptr); -Path* newpath(char*); -Pgrp* newpgrp(void); -Proc* newproc(void); -Rgrp* newrgrp(void); -Segment* newseg(int, uintptr, usize); -void newzmap(Segment*); -void nexterror(void); -uint nhgetl(void*); -ushort nhgets(void*); -uvlong nhgetv(void*); -void nixprepage(int); -int nrand(int); -uvlong ns2fastticks(uvlong); -int okaddr(uintptr, long, int); -int openmode(int); -Block* packblock(Block*); -Block* padblock(Block*, int); -void pagechainhead(Page*); -void pageinit(void); -ulong pagenumber(Page*); -uvlong pagereclaim(int); -void pagersummary(void); -void panic(char*, ...); -Cmdbuf* parsecmd(char *a, int n); -void pathclose(Path*); -ulong perfticks(void); -void pexit(char*, int); -void pgrpcpy(Pgrp*, Pgrp*); -void pgrpnote(ulong, char*, long, int); -void pio(Segment*, uintptr, ulong, Page**); -#define poperror() up->nerrlab-- -int postnote(Proc*, int, char*, int); -int pprint(char*, ...); -int preempted(void); -void prflush(void); -void printinit(void); -ulong procalarm(ulong); -void procctl(Proc*); -void procdump(void); -int procfdprint(Chan*, int, int, char*, int); -void procflushseg(Segment*); -void procpriority(Proc*, int, int); -void procrestore(Proc*); -void procsave(Proc*); -void (*proctrace)(Proc*, int, vlong); -void procwired(Proc*, int); -void psdecref(Proc*); -Proc* psincref(int); -int psindex(int); -void psinit(int); -Pte* ptealloc(void); -Pte* ptecpy(Pte*); -int pullblock(Block**, int); -Block* pullupblock(Block*, int); -Block* pullupqueue(Queue*, int); -void putimage(Image*); -void putmhead(Mhead*); -void putpage(Page*); -void putseg(Segment*); -void putstrn(char*, int); -void putswap(Page*); -int pwait(Waitmsg*); -void qaddlist(Queue*, Block*); -Block* qbread(Queue*, int); -long qbwrite(Queue*, Block*); -Queue* qbypass(void (*)(void*, Block*), void*); -int qcanread(Queue*); -void qclose(Queue*); -int qconsume(Queue*, void*, int); -Block* qcopy(Queue*, int, ulong); -int qdiscard(Queue*, int); -void qflush(Queue*); -void qfree(Queue*); -int qfull(Queue*); -Block* qget(Queue*); -void qhangup(Queue*, char*); -int qisclosed(Queue*); -int qiwrite(Queue*, void*, int); -int qlen(Queue*); -void qlock(QLock*); -void qnoblock(Queue*, int); -Queue* qopen(int, int, void (*)(void*), void*); -int qpass(Queue*, Block*); -int qpassnolim(Queue*, Block*); -int qproduce(Queue*, void*, int); -void qputback(Queue*, Block*); -long qread(Queue*, void*, int); -Block* qremove(Queue*); -void qreopen(Queue*); -void qsetlimit(Queue*, int); -void qunlock(QLock*); -int qwindow(Queue*); -int qwrite(Queue*, void*, int); -int rand(void); -void randominit(void); -ulong randomread(void*, ulong); -void rdb(void); -int readnum(ulong, char*, ulong, ulong, int); -long readstr(long, char*, long, char*); -void ready(Proc*); -long readzio(Kzio[], int, void*, long); -void reboot(void*, void*, long); -void rebootcmd(int, char**); -void relocateseg(Segment*, uintptr); -void renameuser(char*, char*); -void resched(char*); -void resrcwait(char*); -int return0(void*); -void rlock(RWlock*); -long rtctime(void); -int runac(Mach *m, void(*func)(void), int flushtlb, 
void *a, long n); -void runlock(RWlock*); -Proc* runproc(void); -void sched(void); -void scheddump(void); -void schedinit(void); -long seconds(void); -Segment* seg(Proc*, uintptr, int); -void segclock(uintptr); -Sem* segmksem(Segment*, int*); -void segpage(Segment*, Page*); -int semadec(int*); -int semainc(int*); -int setcolor(ulong, ulong, ulong, ulong); -void setkernur(Ureg*, Proc*); -int setlabel(Label*); -void setregisters(Ureg*, char*, char*, int); -char* skipslash(char*); -void sleep(Rendez*, int (*)(void*), void*); -void* smalloc(ulong); -char* srvname(Chan*); -void stopnixproc(void); -int swapcount(ulong); -void swapinit(void); -void syscallfmt(int, va_list list); -void sysretfmt(int, va_list, Ar0*, uvlong, uvlong); -void sysrforkchild(Proc*, Proc*); -void timeradd(Timer*); -void timerdel(Timer*); -void timerintr(Ureg*, vlong); -void timerset(uvlong); -void timersinit(void); -ulong tk2ms(ulong); -#define TK2MS(x) ((x)*(1000/HZ)) -uvlong tod2fastticks(vlong); -vlong todget(vlong*); -void todinit(void); -void todset(vlong, vlong, int); -void todsetfreq(vlong); -Block* trimblock(Block*, int, int); -void tsleep(Rendez*, int (*)(void*), void*, long); -Uart* uartconsole(int, char*); -int uartctl(Uart*, char*); -int uartgetc(void); -void uartkick(void*); -void uartputc(int); -void uartputs(char*, int); -void uartrecv(Uart*, char); -int uartstageoutput(Uart*); -void unbreak(Proc*); -void uncachepage(Page*); -void unlock(Lock*); -void userinit(void); -uintptr userpc(Ureg*); -long userwrite(char*, long); -void* validaddr(void*, long, int); -void validname(char*, int); -char* validnamedup(char*, int); -void validstat(uchar*, usize); -void* vmemchr(void*, int, int); -Proc* wakeup(Rendez*); -int walk(Chan**, char**, int, int, int*); -void wlock(RWlock*); -void wunlock(RWlock*); -void yield(void); -void zgrow(Segment*); -int ziofmt(Fmt*); -int zputaddr(Segment*, uintptr); -void _assert(char*); -ulong µs(void); -uintptr zgetaddr(Segment*); - - -#pragma varargck argpos iprint 1 -#pragma varargck argpos panic 1 -#pragma varargck argpos pprint 1 +#pragma varargck argpos iprint 1 +#pragma varargck argpos panic 1 +#pragma varargck argpos pprint 1 diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/print.c --- a/sys/src/nix/port/print.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/print.c Fri Sep 09 16:49:47 2011 +0200 @@ -24,20 +24,9 @@ return -1; } -int -mregfmt(Fmt* f) -{ - Mreg mreg; - - mreg = va_arg(f->args, Mreg); - if(sizeof(Mreg) == sizeof(uvlong)) - return fmtprint(f, "%#16.16llux", (uvlong)mreg); - return fmtprint(f, "%#8.8ux", (uint)mreg); -} - void fmtinit(void) { quotefmtinstall(); - fmtinstall('m', mregfmt); + archfmtinstall(); } diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/proc.c --- a/sys/src/nix/port/proc.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/proc.c Fri Sep 09 16:49:47 2011 +0200 @@ -87,10 +87,10 @@ /* * Holding locks from pexit: * procalloc - * palloc + * pga */ mmurelease(up); - unlock(&palloc); + unlock(&pga); psrelease(up); unlock(&procalloc); @@ -153,7 +153,7 @@ if(up->nlocks) if(up->state != Moribund) if(up->delaysched < 20 - || palloc.Lock.p == up + || pga.Lock.p == up || procalloc.Lock.p == up){ up->delaysched++; delayedscheds++; @@ -429,14 +429,14 @@ void ready(Proc *p) { - Mreg s; + Mpl pl; int pri; Schedq *rq; void (*pt)(Proc*, int, vlong); - s = splhi(); + pl = splhi(); if(edfready(p)){ - splx(s); + splx(pl); return; } @@ -454,7 +454,7 @@ pt = proctrace; if(pt) pt(p, SReady, 0); - splx(s); + splx(pl); } /* @@ -478,7 +478,7 @@ static 
void rebalance(void) { - Mreg s; + Mpl pl; int pri, npri, t; Schedq *rq; Proc *p; @@ -500,11 +500,11 @@ updatecpu(p); npri = reprioritize(p); if(npri != pri){ - s = splhi(); + pl = splhi(); p = dequeueproc(rq, p); if(p) queueproc(&runq[npri], p); - splx(s); + splx(pl); goto another; } } @@ -688,7 +688,7 @@ { Proc *pp; int i; - char nwired[MAXMACH]; + char nwired[MACHMAX]; Mach *wm; if(bm < 0){ @@ -741,10 +741,10 @@ void sleep(Rendez *r, int (*f)(void*), void *arg) { - Mreg s; + Mpl pl; void (*pt)(Proc*, int, vlong); - s = splhi(); + pl = splhi(); if(up->nlocks) print("process %d sleeps with %d locks held, last lock %#p locked at pc %#p, sleep called from %#p\n", @@ -806,13 +806,13 @@ if(up->notepending) { up->notepending = 0; - splx(s); + splx(pl); if(up->procctl == Proc_exitme && up->closingfgrp) forceclosefgrp(); error(Eintr); } - splx(s); + splx(pl); } static int @@ -870,10 +870,10 @@ Proc* wakeup(Rendez *r) { - Mreg s; + Mpl pl; Proc *p; - s = splhi(); + pl = splhi(); lock(r); p = r->p; @@ -889,7 +889,7 @@ } unlock(r); - splx(s); + splx(pl); return p; } @@ -904,7 +904,7 @@ int postnote(Proc *p, int dolock, char *n, int flag) { - Mreg s; + Mpl pl; int ret; Rendez *r; Proc *d, **l; @@ -938,7 +938,7 @@ /* this loop is to avoid lock ordering problems. */ for(;;){ - s = splhi(); + pl = splhi(); lock(&p->rlock); r = p->r; @@ -959,11 +959,11 @@ /* give other process time to get out of critical section and try again */ unlock(&p->rlock); - splx(s); + splx(pl); sched(); } unlock(&p->rlock); - splx(s); + splx(pl); if(p->state != Rendezvous){ if(p->state == Semdown) @@ -1199,7 +1199,7 @@ /* Sched must not loop for these locks */ lock(&procalloc); - lock(&palloc); + lock(&pga); stopac(); stopnixproc(); @@ -1268,7 +1268,9 @@ return; bss = 0; - if(p->seg[BSEG]) + if(p->seg[HSEG]) + bss = p->seg[HSEG]->top; + else if(p->seg[BSEG]) bss = p->seg[BSEG]->top; s = p->psstate; @@ -1421,7 +1423,7 @@ void procctl(Proc *p) { - Mreg s; + Mpl pl; char *state; switch(p->procctl) { @@ -1443,7 +1445,7 @@ state = p->psstate; p->psstate = "Stopped"; /* free a waiting debugger */ - s = spllo(); + pl = spllo(); qlock(&p->debug); if(p->pdbg) { wakeup(&p->pdbg->sleep); @@ -1454,7 +1456,7 @@ p->state = Stopped; sched(); p->psstate = state; - splx(s); + splx(pl); return; case Proc_toac: diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/segment.c --- a/sys/src/nix/port/segment.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/segment.c Fri Sep 09 16:49:47 2011 +0200 @@ -5,32 +5,56 @@ #include "fns.h" #include "../port/error.h" -extern void freezseg(Segment*); +uintmem +segppn(Segment *s, uintmem pa) +{ + uintmem pgsz; + pgsz = m->pgsz[s->pgszi]; + pa &= ~(pgsz-1); + return pa; +} + +/* + * Sizes are given in multiples of BIGPGSZ. + * The actual page size used is either BIGPGSZ or 1*GiB + * if base is aligned to 1G and size is >= 1G and we support 1G pages. 
+ */ Segment * -newseg(int type, uintptr base, usize size) +newseg(int type, uintptr base, u64int size) { Segment *s; int mapsize; + uint pgsz; - if(size > (SEGMAPSIZE*PTEPERTAB)) + if(size > SEGMAPSIZE*(PTEMAPMEM/BIGPGSZ)) error(Enovmem); + pgsz = BIGPGSZ; + if(size*BIGPGSZ >= 1*GiB && getpgszi(1*GiB) >= 0 && + (base&(1ULL*GiB-1)) == 0 && ((size*BIGPGSZ)&(1ULL*GiB-1)) == 0){ + DBG("newseg: using 1G pages\n"); + pgsz = 1*GiB; + } s = smalloc(sizeof(Segment)); s->ref = 1; s->type = type; s->base = base; + s->ptepertab = PTEMAPMEM/pgsz; s->top = base+(size*BIGPGSZ); s->size = size; - s->lgpgsize = 0; + s->pgszi = getpgszi(pgsz); + if(s->pgszi < 0) + panic("newseg: getpgszi %d", pgsz); s->sema.prev = &s->sema; s->sema.next = &s->sema; + s->color = NOCOLOR; - mapsize = HOWMANY(size, PTEPERTAB); + mapsize = HOWMANY(size*BIGPGSZ/pgsz, s->ptepertab); if(mapsize > nelem(s->ssegmap)){ mapsize *= 2; - if(mapsize > (SEGMAPSIZE*PTEPERTAB)) - mapsize = (SEGMAPSIZE*PTEPERTAB); + if(mapsize > (SEGMAPSIZE*s->ptepertab)) + mapsize = (SEGMAPSIZE*s->ptepertab); s->map = smalloc(mapsize*sizeof(Pte*)); s->mapsize = mapsize; } @@ -70,6 +94,7 @@ { Pte **pp, **emap; Image *i; + extern void freezseg(Segment*); if(s == 0) return; @@ -177,12 +202,15 @@ n->image = s->image; n->fstart = s->fstart; n->flen = s->flen; + n->pgszi = s->pgszi; + n->color = s->color; + n->ptepertab = s->ptepertab; break; } size = s->mapsize; for(i = 0; i < size; i++) if(pte = s->map[i]) - n->map[i] = ptecpy(pte); + n->map[i] = ptecpy(n, pte); n->flushme = s->flushme; if(s->ref > 1) @@ -203,12 +231,15 @@ { Pte **pte; uintptr soff; + uintmem pgsz; Page **pg; - if(s->lgpgsize == 0) - s->lgpgsize = p->lgsize; - if(s->lgpgsize != p->lgsize) - panic("segpage: s->lgpgsize != p->lgsize"); + if(s->pgszi < 0) + s->pgszi = p->pgszi; + if(s->color == NOCOLOR) + s->color = p->color; + if(s->pgszi != p->pgszi) + panic("segpage: s->pgszi != p->pgszi"); if(p->va < s->base || p->va >= s->top) panic("segpage: p->va < s->base || p->va >= s->top"); @@ -216,9 +247,9 @@ soff = p->va - s->base; pte = &s->map[soff/PTEMAPMEM]; if(*pte == 0) - *pte = ptealloc(); - - pg = &(*pte)->pages[(soff&(PTEMAPMEM-1))/BIGPGSZ]; + *pte = ptealloc(s); + pgsz = m->pgsz[s->pgszi]; + pg = &(*pte)->pages[(soff&(PTEMAPMEM-1))/pgsz]; *pg = p; if(pg < (*pte)->first) (*pte)->first = pg; @@ -234,11 +265,13 @@ { int i, j, size; uintptr soff; + uintmem pgsz; Page *pg; Page *list; + pgsz = m->pgsz[s->pgszi]; soff = start-s->base; - j = (soff&(PTEMAPMEM-1))/BIGPGSZ; + j = (soff&(PTEMAPMEM-1))/pgsz; size = s->mapsize; list = nil; @@ -246,11 +279,11 @@ if(pages <= 0) break; if(s->map[i] == 0) { - pages -= PTEPERTAB-j; + pages -= s->ptepertab-j; j = 0; continue; } - while(j < PTEPERTAB) { + while(j < s->ptepertab) { pg = s->map[i]->pages[j]; /* * We want to zero s->map[i]->page[j] and putpage(pg), @@ -329,13 +362,14 @@ prepageseg(int i) { Segment *s; - uintptr addr; + uintptr addr, pgsz; s = up->seg[i]; if(s == nil) return; DBG("prepage: base %#p top %#p\n", s->base, s->top); - for(addr = s->base; addr < s->top; addr += BIGPGSZ) + pgsz = m->pgsz[s->pgszi]; + for(addr = s->base; addr < s->top; addr += pgsz) fault(addr, i == TSEG); } diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/syscallfmt.c --- a/sys/src/nix/port/syscallfmt.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/syscallfmt.c Fri Sep 09 16:49:47 2011 +0200 @@ -57,6 +57,7 @@ syscallfmt(int syscallno, va_list list) { long l; + ulong ul; Fmt fmt; void *v; vlong vl; @@ -320,12 +321,26 @@ fmtprint(&fmt, " %lld", vl); } break; - 
case MREAD: - case MWRITE: + case ZPREAD: i[0] = va_arg(list, int); v = va_arg(list, void*); i[1] = va_arg(list, int); - fmtprint(&fmt, "%d %#p %d", i[0], v, i[1]); + ul = va_arg(list, usize); + vl = va_arg(list, vlong); + fmtprint(&fmt, "%d %#p %d %ld %ulld", i[0], v, i[1], ul, vl); + break; + case ZPWRITE: + i[0] = va_arg(list, int); + v = va_arg(list, void*); + i[1] = va_arg(list, int); + vl = va_arg(list, vlong); + fmtprint(&fmt, "%d %#p %d %ulld", i[0], v, i[1], vl); + break; + case ZFREE: + v = va_arg(list, void*); + i[1] = va_arg(list, int); + fmtprint(&fmt, "%#p %d", v, i[1]); + case NIXSYSCALL: break; } up->syscalltrace = fmtstrflush(&fmt); diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/sysfile.c --- a/sys/src/nix/port/sysfile.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/sysfile.c Fri Sep 09 16:49:47 2011 +0200 @@ -1577,4 +1577,3 @@ { error("old fwstat system call - recompile"); } - diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/sysproc.c --- a/sys/src/nix/port/sysproc.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/sysproc.c Fri Sep 09 16:49:47 2011 +0200 @@ -407,6 +407,13 @@ nexterror(); } up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BIGPGSZ); + /* + * The color for the new stack determines the colors for the new segments. + * Even a cached text image changes its color to that of the stack. + * This will make new pages allocated for these segments prefer the color + * for the core where the program will run. + */ + // up->seg[ESEG]->color = acpicorecolor(core); /* * Stack is a pointer into the temporary stack @@ -539,11 +546,11 @@ * Free old memory. * Special segments maintained across exec. */ - for(i = SSEG; i <= BSEG; i++) { + for(i = SSEG; i <= HSEG; i++) { putseg(up->seg[i]); up->seg[i] = nil; /* in case of error */ } - for(i = BSEG+1; i< NSEG; i++) { + for(i = HSEG+1; i< NSEG; i++) { s = up->seg[i]; if(s && (s->type&SG_CEXEC)) { putseg(s); @@ -557,6 +564,7 @@ img = attachimage(SG_TEXT|SG_RONLY, chan, UTZERO, (textlim-UTZERO)/BIGPGSZ); s = img->s; up->seg[TSEG] = s; + s->color = up->seg[ESEG]->color; s->flushme = 1; s->fstart = 0; s->flen = hdrsz+textsz; @@ -565,6 +573,7 @@ /* Data. Shared. */ s = newseg(SG_DATA, textlim, (datalim-textlim)/BIGPGSZ); up->seg[DSEG] = s; + s->color = up->seg[ESEG]->color; /* Attached by hand */ incref(img); @@ -574,6 +583,7 @@ /* BSS. Zero fill on demand for TS */ up->seg[BSEG] = newseg(SG_BSS, datalim, (bsslim-datalim)/BIGPGSZ); + up->seg[BSEG]->color= up->seg[ESEG]->color; /* * Move the stack @@ -581,6 +591,9 @@ s = up->seg[ESEG]; up->seg[ESEG] = nil; up->seg[SSEG] = s; + /* the color of the stack was decided when we created it before, + * it may have nothing to do with the color of other segments. 
+ */ qunlock(&up->seglock); poperror(); /* seglock */ diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/sysseg.c --- a/sys/src/nix/port/sysseg.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/sysseg.c Fri Sep 09 16:49:47 2011 +0200 @@ -30,9 +30,10 @@ return -1; } - if(new->lgpgsize == 0) - new->lgpgsize = BIGPGSHFT; - + if(new->pgszi < 0) + new->pgszi = getpgszi(2*MiB); /* 2M pages by default */ + if(new->pgszi < 0) + panic("addphysseg"); *ps = *new; unlock(&physseglock); @@ -62,10 +63,11 @@ ibrk(uintptr addr, int seg) { Segment *s, *ns; - uintptr newtop; + uintptr newtop, rtop; long newsize; int i, mapsize; Pte **map; + uintmem pgsz; s = up->seg[seg]; if(s == 0) @@ -75,42 +77,67 @@ return s->top; qlock(&s->lk); + if(waserror()) { + qunlock(&s->lk); + nexterror(); + } /* We may start with the bss overlapping the data */ if(addr < s->base) { - if(seg != BSEG || up->seg[DSEG] == 0 || addr < up->seg[DSEG]->base) { - qunlock(&s->lk); + if(seg != BSEG || up->seg[DSEG] == 0 || addr < up->seg[DSEG]->base) error(Enovmem); - } addr = s->base; } - newtop = BIGPGROUND(addr); - newsize = (newtop-s->base)/BIGPGSZ; + pgsz = m->pgsz[s->pgszi]; + if(seg == BSEG && addr >= ROUNDUP(s->top, 1*GiB) + 1*GiB) + newtop = ROUNDUP(addr, 1*GiB); + else + newtop = ROUNDUP(addr, pgsz); + newsize = (newtop-s->base)/pgsz; if(newtop < s->top) { - mfreeseg(s, newtop, (s->top-newtop)/BIGPGSZ); + mfreeseg(s, newtop, (s->top-newtop)/pgsz); s->top = newtop; s->size = newsize; + poperror(); qunlock(&s->lk); mmuflush(); return newtop; } + if(newsize > (SEGMAPSIZE*s->ptepertab)) + error(Enovmem); for(i = 0; i < NSEG; i++) { ns = up->seg[i]; if(ns == 0 || ns == s) continue; - if(newtop >= ns->base && newtop < ns->top) { - qunlock(&s->lk); + if(newtop >= ns->base && newtop < ns->top) error(Esoverlap); - } } - if(newsize > (SEGMAPSIZE*PTEPERTAB)) { - qunlock(&s->lk); - error(Enovmem); - } - mapsize = HOWMANY(newsize, PTEPERTAB); + if(seg == BSEG && newtop >= ROUNDUP(s->top, 1*GiB) + 1*GiB){ + DBG("segment using 1G pages\n"); + /* + * brk the bss up to the 1G boundary, and create + * a segment placed at that boundary, using 1G pages if it can. + * This is both back compatible, transparent, + * and permits using 1G pages. 
+ */ + rtop = ROUNDUP(newtop,1*GiB); + newtop = ROUNDUP(s->top, 1*GiB); + newsize -= (rtop-newtop)/BIGPGSZ; +assert(newsize >= 0); + DBG("ibrk: newseg %#ullx %ullx\n", newtop, (rtop-newtop)/BIGPGSZ); + ns = newseg(SG_BSS, newtop, (rtop-newtop)/BIGPGSZ); + ns->color= s->color; + up->seg[HSEG] = ns; + DBG("ibrk: newtop %#ullx newsize %#ulx \n", newtop, newsize); + /* now extend the bss up to newtop */ + }else + rtop = newtop; + + + mapsize = HOWMANY(newsize, s->ptepertab); if(mapsize > s->mapsize){ map = smalloc(mapsize*sizeof(Pte*)); memmove(map, s->map, s->mapsize*sizeof(Pte*)); @@ -122,9 +149,10 @@ s->top = newtop; s->size = newsize; + poperror(); qunlock(&s->lk); - return newtop; + return rtop; } void @@ -140,9 +168,21 @@ * void* segbrk(void* saddr, void* addr); */ addr = PTR2UINT(va_arg(list, void*)); + if(addr == 0){ + if(up->seg[HSEG]) + ar0->v = UINT2PTR(up->seg[HSEG]->top); + else + ar0->v = UINT2PTR(up->seg[BSEG]->top); + return; + } for(i = 0; i < NSEG; i++) { s = up->seg[i]; - if(s == nil || addr < s->base || addr >= s->top) + if(s == nil) + continue; + /* Ok to extend an empty segment */ + if(addr < s->base || addr > s->top) + continue; + if(addr == s->top && (s->base < s->top)) continue; switch(s->type&SG_TYPE) { case SG_TEXT: @@ -203,6 +243,8 @@ s = (*_globalsegattach)(p, name); if(s != nil){ p->seg[sno] = s; + if(p == up && up->prepagemem) + nixprepage(sno); return s->base; } } @@ -260,7 +302,7 @@ s->pseg = ps; p->seg[sno] = s; - if(up->prepagemem) + if(p == up && up->prepagemem) nixprepage(sno); return va; diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/syszio.c --- a/sys/src/nix/port/syszio.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/syszio.c Fri Sep 09 16:49:47 2011 +0200 @@ -276,7 +276,6 @@ nexterror(); } io[0].size = c->dev->read(c, io[0].data, tot, offset); - nio = 1; poperror(); return 1; } diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/taslock.c --- a/sys/src/nix/port/taslock.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/taslock.c Fri Sep 09 16:49:47 2011 +0200 @@ -109,13 +109,13 @@ void ilock(Lock *l) { - Mreg s; + Mpl pl; uintptr pc; pc = getcallerpc(&l); lockstats.locks++; - s = splhi(); + pl = splhi(); if(TAS(&l->key) != 0){ lockstats.glare++; /* @@ -125,10 +125,10 @@ */ for(;;){ lockstats.inglare++; - splx(s); + splx(pl); while(l->key) ; - s = splhi(); + pl = splhi(); if(TAS(&l->key) == 0) goto acquire; } @@ -137,7 +137,7 @@ m->ilockdepth++; if(up) up->lastilock = l; - l->sr = s; + l->pl = pl; l->pc = pc; l->p = up; l->isilock = 1; @@ -205,7 +205,7 @@ void iunlock(Lock *l) { - Mreg s; + Mpl pl; #ifdef LOCKCYCLES uvlong x; @@ -228,12 +228,12 @@ m->machno, l->m->machno, getcallerpc(&l), l->pc); } - s = l->sr; + pl = l->pl; l->m = nil; l->key = 0; coherence(); m->ilockdepth--; if(up) up->lastilock = nil; - splx(s); + splx(pl); } diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/port/xalloc.c --- a/sys/src/nix/port/xalloc.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/port/xalloc.c Fri Sep 09 16:49:47 2011 +0200 @@ -70,13 +70,13 @@ n = kpages; if(m->base >= maxkpa) n = 0; - else if(n > 0 && m->base+n*BY2PG >= maxkpa) - n = (maxkpa - m->base)/BY2PG; + else if(n > 0 && m->base+n*PGSZ >= maxkpa) + n = (maxkpa - m->base)/PGSZ; /* first give to kernel */ if(n > 0){ m->kbase = PTR2UINT(KADDR(m->base)); - m->klimit = PTR2UINT(KADDR(m->base+n*BY2PG)); - xhole(m->base, n*BY2PG); + m->klimit = PTR2UINT(KADDR(m->base+n*PGSZ)); + xhole(m->base, n*PGSZ); kpages -= n; } /* if anything left over, give to user */ @@ -85,7 +85,7 @@ print("xinit: 
losing %lud pages\n", m->npage-n); continue; } - pm->base = m->base+n*BY2PG; + pm->base = m->base+n*PGSZ; pm->npage = m->npage - n; pm++; } diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/root/bigloop.c --- a/sys/src/nix/root/bigloop.c Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/root/bigloop.c Fri Sep 09 16:49:47 2011 +0200 @@ -9,10 +9,10 @@ if (argc > 1) num = strtoul(argv[1], 0, 0); - print("Try to malloc %ulld bytes in %d loops\n", num*0x200000ULL, num); + print("Try to sbrk %ulld bytes in %ld loops\n", num*0x200000ULL, num); for(i = 0; i < num; i++) - if (mallocz(0x200000, i) == nil){ - print("%d malloc failed\n", i); + if (sbrk(0x200000) == nil){ + print("%d sbrk failed\n", i); break; } print("Did it\n"); diff -r 2adaa665956e -r fb7d4fad4e57 sys/src/nix/words --- a/sys/src/nix/words Fri Sep 09 11:03:29 2011 +0200 +++ b/sys/src/nix/words Fri Sep 09 16:49:47 2011 +0200 @@ -1,16 +1,20 @@ -To compile and use it you have to: +To compile and use it you have to do the binds shown in ./nix -! bind -c libc /sys/src/libc -! bind -b include /sys/include +This is the main nix source tree. +It uses 2M pages and supports 1G pages. +It also contains experimental frameworks for zero-copy, tubes, and more. +The main author was initially jmk, perhaps with others from Bell Labs, back when it was 9k. +Today it has been changed by many others, including (in sort order): +esoriano +forsyth +jmk +nemo +npe +paurea +rminnich -This is 9k, from jmk, changed by rminnich, and -further changed to use 2M pages. - -This is the first version that worked with such pgsz. -You might be also interested in a (non-working) version -that was a previous attempt, found in the dump -at /sys/src/9kron2M, and removed when this tree was created. - - +You can reach us at nix A_T lsub.org +But don't write A_T like that. +Fri Sep 2 22:03:15 CET 2011
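
For reference, a condensed standalone sketch (not code from this patch) of the order of attempts the new kickpager() in port/pager.c makes: allocate, page out text (only for page sizes up to 2M), break up one bigger free page, then release every free page before giving up. The helpers and the freecount counter below are invented stand-ins for pga.pgsza[pgszi] and the real kernel routines; the sketch also glosses over the point that the later steps accept as little as one free page instead of Minpages.

/*
 * Standalone sketch (not NIX code): the fallback order of the new
 * kickpager() in port/pager.c.  freecount and the helpers below are
 * placeholders for pga.pgsza[pgszi] and the real kernel routines.
 */
#include <stdio.h>

enum { Minpages = 2 };

static int freecount;	/* stands in for pga.pgsza[pgszi].freecount */

static int enough(void) { return freecount > Minpages; }
static void tryalloc(void) { /* pgalloc() + pagechainhead() in the kernel */ }
static void pageouttext(void) { freecount += Minpages + 1; }	/* steal text pages */
static void freeonebigger(void) { /* freepages(pgszi+1, 1) in the kernel */ }
static void freeall(void) { freecount += Minpages + 1; }	/* freepages(0, 0) */

/* pgszis2m: nonzero when m->pgsz[pgszi] <= 2M, so text pages can help */
static int
kickpagersketch(int pgszis2m)
{
	tryalloc();			/* 1. plain allocation */
	if(enough())
		return 0;
	if(pgszis2m){			/* 2. page out text segments */
		pageouttext();
		tryalloc();
		if(enough())
			return 0;
	}
	freeonebigger();		/* 3. split one bigger free page */
	tryalloc();
	if(enough())
		return 0;
	freeall();			/* 4. release every free page size */
	tryalloc();
	if(enough())
		return 0;
	return -1;			/* the kernel panics at this point */
}

int
main(void)
{
	freecount = 0;
	if(kickpagersketch(1) < 0)
		printf("no physical memory\n");
	else
		printf("freecount now %d\n", freecount);
	return 0;
}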
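
The reworked Pte in portdat.h sizes pages[] per segment: each Pte still maps PTEMAPMEM (1 GiB) of address space, so s->ptepertab is PTEMAPMEM divided by the segment's page size (512 entries at 2M, one at 1G). Below is a standalone sketch (not code from this patch) of the index arithmetic segpage() uses; plain C99 types and the invented base/va values stand in for the kernel's uintptr/uintmem.

/*
 * Standalone sketch (not NIX code): how a virtual address selects a
 * Pte map slot and a page slot within it, as in segpage().
 */
#include <stdio.h>
#include <stdint.h>

#define KiB		1024ULL
#define MiB		(1024ULL*KiB)
#define GiB		(1024ULL*MiB)
#define PTEMAPMEM	(1ULL*GiB)	/* span of one Pte, as in portdat.h */

int
main(void)
{
	uint64_t pgsz = 2*MiB;			/* BIGPGSZ; could also be 1*GiB */
	uint64_t ptepertab = PTEMAPMEM/pgsz;	/* 512 pages per Pte at 2M, 1 at 1G */
	uint64_t base = 0x200000;		/* invented segment base */
	uint64_t va = base + 1029*pgsz + 99;	/* invented address in the segment */
	uint64_t soff = va - base;

	printf("ptepertab %llu\n", (unsigned long long)ptepertab);
	printf("map slot %llu, page slot %llu\n",
		(unsigned long long)(soff/PTEMAPMEM),
		(unsigned long long)((soff & (PTEMAPMEM-1))/pgsz));
	return 0;
}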
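
The 1 GiB page support shows up in two places above: newseg() picks 1G pages only when the base is 1G aligned, the size is at least 1G and a multiple of 1G, and the machine supports them; and ibrk() grows a BSS past the next 1G boundary by extending the BSS only up to ROUNDUP(s->top, 1G) and putting the remainder in the new HSEG. Below is a standalone sketch (not code from this patch) of just that boundary arithmetic; the top and addr values are invented.

/*
 * Standalone sketch (not NIX code): the 1 GiB boundary arithmetic the
 * new ibrk() uses when it creates the HSEG for a huge BSS.
 */
#include <stdio.h>
#include <stdint.h>

#define MiB		(1024ULL*1024)
#define GiB		(1024ULL*MiB)
#define ROUNDUP(x, a)	(((x)+(a)-1) & ~((uint64_t)(a)-1))

int
main(void)
{
	uint64_t top = 3*GiB + 6*MiB;	/* current BSS top (invented) */
	uint64_t addr = 7*GiB + 10*MiB;	/* requested new break (invented) */
	uint64_t newtop, rtop;

	/* ibrk() only takes this path when addr >= ROUNDUP(top, 1G) + 1G */
	if(addr >= ROUNDUP(top, 1*GiB) + 1*GiB){
		newtop = ROUNDUP(top, 1*GiB);	/* BSS is extended up to here */
		rtop = ROUNDUP(addr, 1*GiB);	/* HSEG covers [newtop, rtop) */
		printf("bss grows to %lluM; hseg spans %lluM..%lluM (%llu GiB)\n",
			(unsigned long long)(newtop/MiB),
			(unsigned long long)(newtop/MiB),
			(unsigned long long)(rtop/MiB),
			(unsigned long long)((rtop-newtop)/GiB));
	}
	return 0;
}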