Support for Roget's Thesaurus from Project Gutenberg. The data files are (temporarily) in /n/sources/steve/roget; roget and rogetindex are the data and index respectively. /sys/src/cmd/dict/mkroget will rebuild these from roget-body.rtf (with the help of rtf2txt). I suggest the data and index should go in /n/sources/roget, similar to /n/sources/pgw. -Steve Reference: /n/sources/patch/applied/roget Date: Fri Nov 19 21:00:32 CET 2004 --- /sys/src/cmd/dict/utils.c Fri Nov 19 21:00:32 2004 +++ /sys/src/cmd/dict/utils.c Fri Nov 19 21:00:32 2004 @@ -16,6 +16,9 @@ {"thesaurus", "Collins Thesaurus", "/lib/dict/thesaurus", "/lib/dict/thesindex", thesnextoff, thesprintentry, thesprintkey}, + {"roget", "Project Gutenberg Roget's Thesaurus", + "/lib/dict/roget", "/lib/dict/rogetindex", + rogetnextoff, rogetprintentry, rogetprintkey}, {"ce", "Gendai Chinese->English", "/lib/dict/world/sansdata/sandic24.dat", --- /sys/src/cmd/dict/dict.h Fri Nov 19 21:00:32 2004 +++ /sys/src/cmd/dict/dict.h Fri Nov 19 21:00:32 2004 @@ -130,6 +130,9 @@ long pgwnextoff(long); void pgwprintentry(Entry,int); void pgwprintkey(void); +void rogetprintentry(Entry, int); +long rogetnextoff(long); +void rogetprintkey(void); long slangnextoff(long); void slangprintentry(Entry, int); void slangprintkey(void); --- /sys/src/cmd/dict/roget.c Thu Jan 1 00:00:00 1970 +++ /sys/src/cmd/dict/roget.c Fri Nov 19 21:00:32 2004 @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include "dict.h" + +/* Roget's Thesaurus from project Gutenberg */ + +static long Last = 0; + +void +rogetprintentry(Entry e, int cmd) +{ + int spc; + char c, *p; + + spc = 0; + p = e.start; + + if(cmd == 'h'){ + while(!isspace(*p) && p < e.end) + p++; + while(strncmp(p, " -- ", 4) != 0 && p < e.end){ + while(isspace(*p) && p < e.end) + p++; + if (*p == '[' || *p == '{'){ + c = (*p == '[')? 
']': '}'; + while(*p != c && p < e.end) + p++; + p++; + continue; + } + if (isdigit(*p) || ispunct(*p)){ + while(!isspace(*p) && p < e.end) + p++; + continue; + } + + + if (isspace(*p)) + spc = 1; + else + if (spc){ + outchar(' '); + spc = 0; + } + + while(!isspace(*p) && p < e.end) + outchar(*p++); + } + return; + } + + while(p < e.end && !isspace(*p)) + p++; + while(p < e.end && isspace(*p)) + p++; + + while (p < e.end){ + if (p < e.end -4 && strncmp(p, " -- ", 4) == 0){ /* first line */ + outnl(2); + p += 4; + spc = 0; + } + + if (p < e.end -2 && strncmp(p, "[ ", 4) == 0){ /* twiddle layout */ + outchars(" ["); + continue; + } + + if (p < e.end -4 && strncmp(p, "&c (", 4) == 0){ /* usefull xref */ + if (spc) + outchar(' '); + outchar('/'); + while(p < e.end && *p != '(') + p++; + p++; + while(p < e.end && *p != ')') + outchar(*p++); + p++; + while(p < e.end && isspace(*p)) + p++; + while(p < e.end && isdigit(*p)) + p++; + outchar('/'); + continue; + } + + if (p < e.end -3 && strncmp(p, "&c ", 3) == 0){ /* less usefull xref */ + while(p < e.end && !isdigit(*p)) + p++; + while(p < e.end && isdigit(*p)) + p++; + continue; + } + + if (*p == '\n' && p < (e.end -1)){ /* their newlines */ + spc = 0; + p++; + if (isspace(*p)){ /* their continuation line */ + while (isspace(*p)) + p++; + p--; + } + else{ + outnl(2); + } + } + if (spc && *p != ';' && *p != '.' 
&& + *p != ',' && !isspace(*p)){ /* drop spaces before punct */ + spc = 0; + outchar(' '); + } + if (isspace(*p)) + spc = 1; + else + outchar(*p); + p++; + } + outnl(0); +} + +long +rogetnextoff(long fromoff) +{ + int i; + vlong l; + char *p; + + Bseek(bdict, fromoff, 0); + Brdline(bdict, '\n'); + while ((p = Brdline(bdict, '\n')) != nil){ + l = Blinelen(bdict); + if (!isdigit(*p)) + continue; + for (i = 0; i < l-4; i++) + if (strncmp(p+i, " -- ", 4) == 0) + return Boffset(bdict)-l; + } + return Boffset(bdict); +} + +void +rogetprintkey(void) +{ + Bprint(bout, "No pronunciation key.\n"); +} --- /sys/src/cmd/dict/mkfile Fri Nov 19 21:00:32 2004 +++ /sys/src/cmd/dict/mkfile Fri Nov 19 21:00:32 2004 @@ -2,7 +2,7 @@ TARG=dict LFILES=oed.$O ahd.$O pcollins.$O pcollinsg.$O movie.$O slang.$O robert.$O\ - world.$O jis208.$O gb2312.$O thesaurus.$O simple.$O pgw.$O + world.$O jis208.$O gb2312.$O thesaurus.$O simple.$O pgw.$O roget.$O OFILES=dict.$O\ $LFILES\ --- /sys/src/cmd/dict/mkroget Thu Jan 1 00:00:00 1970 +++ /sys/src/cmd/dict/mkroget Fri Nov 19 21:00:32 2004 @@ -0,0 +1,23 @@ +#!/bin/rc + +rtf2txt roget-body.rtf | + sed ' + 1,12d + /^100. /{ + N + s/\n// + p + } + /^388a. /{ + N + s/\n// + p + } + ' > /lib/dict/roget + +mkindex -d roget | + sort -u -t' ' +0f -1 +0 -1 +1n -2 | + sed ' + s/[ ]+$//g + s/ / /g + ' > /lib/dict/rogetindex