114040Smckusick #ifndef lint 2*37636Sjak static char sccsid[] = "@(#)locate.code.c 4.3 (Berkeley) 05/04/89"; 314040Smckusick #endif not lint 414040Smckusick 514040Smckusick /* 614040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 714040Smckusick * to encode/decode a filename database) 814040Smckusick * 914051Smckusick * USAGE: bigram < list > bigrams 1014051Smckusick * process bigrams (see updatedb) > common_bigrams 1114051Smckusick * code common_bigrams < list > squozen_list 1214040Smckusick * 13*37636Sjak * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 14*37636Sjak * February/March 1983, p. 8 ). Output format is, per line, an 15*37636Sjak * offset differential count byte followed by a partially bigram- 16*37636Sjak * encoded ascii residue. A bigram is a two-character sequence, 17*37636Sjak * the first 128 most common of which are encoded in one byte. 1814040Smckusick * 19*37636Sjak * EXAMPLE: For simple front compression with no bigram encoding, 20*37636Sjak * if the input is... then the output is... 21*37636Sjak * 22*37636Sjak * /usr/src 0 /usr/src 23*37636Sjak * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 24*37636Sjak * /usr/src/cmd/armadillo.c 14 armadillo.c 25*37636Sjak * /usr/tmp/zoo 5 tmp/zoo 26*37636Sjak * 2714040Smckusick * The codes are: 2814040Smckusick * 2914040Smckusick * 0-28 likeliest differential counts + offset to make nonnegative 30*37636Sjak * 30 switch code for out-of-range count to follow in next word 31*37636Sjak * 128-255 bigram codes (128 most common, as determined by 'updatedb') 32*37636Sjak * 32-127 single character (printable) ascii residue (ie, literal) 3314040Smckusick * 3414051Smckusick * SEE ALSO: updatedb.csh, bigram.c, find.c 3514040Smckusick * 3614040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 3714040Smckusick * NASA Ames Research Center, 10/82 3814040Smckusick */ 3914040Smckusick 4014040Smckusick #include <stdio.h> 41*37636Sjak #include <sys/param.h> 42*37636Sjak #include "find.h" 4314040Smckusick 44*37636Sjak #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 4514040Smckusick 46*37636Sjak char buf1[MAXPATHLEN] = " "; 47*37636Sjak char buf2[MAXPATHLEN]; 48*37636Sjak char bigrams[BGBUFSIZE + 1] = { 0 }; 4914040Smckusick 5014040Smckusick main ( argc, argv ) 5114040Smckusick int argc; char *argv[]; 5214040Smckusick { 53*37636Sjak register char *cp, *oldpath = buf1, *path = buf2; 54*37636Sjak int code, count, diffcount, oldcount = 0; 5514040Smckusick FILE *fp; 5614040Smckusick 5714051Smckusick if ((fp = fopen(argv[1], "r")) == NULL) { 58*37636Sjak printf("Usage: code common_bigrams < list > squozen_list\n"); 5914051Smckusick exit(1); 6014051Smckusick } 61*37636Sjak /* first copy bigram array to stdout */ 62*37636Sjak fgets ( bigrams, BGBUFSIZE + 1, fp ); 63*37636Sjak fwrite ( bigrams, 1, BGBUFSIZE, stdout ); 64*37636Sjak fclose( fp ); 6514040Smckusick 66*37636Sjak /* every path will fit in path buffer, so safe to use gets */ 6714040Smckusick while ( gets ( path ) != NULL ) { 68*37636Sjak /* squelch characters that would botch the decoding */ 69*37636Sjak for ( cp = path; *cp != NULL; cp++ ) { 70*37636Sjak if ( *cp >= PARITY ) 71*37636Sjak *cp &= PARITY-1; 72*37636Sjak else if ( *cp <= SWITCH ) 73*37636Sjak *cp = '?'; 7414040Smckusick } 75*37636Sjak /* skip longest common prefix */ 76*37636Sjak for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 77*37636Sjak if ( *oldpath == NULL ) 78*37636Sjak break; 79*37636Sjak count = cp - path; 80*37636Sjak diffcount = count - oldcount + OFFSET; 81*37636Sjak oldcount = count; 82*37636Sjak if ( diffcount < 0 || diffcount > 2*OFFSET ) { 83*37636Sjak putc ( SWITCH, stdout ); 84*37636Sjak putw ( diffcount, stdout ); 8514040Smckusick } 8614040Smckusick else 87*37636Sjak putc ( diffcount, stdout ); 8814040Smckusick 89*37636Sjak while ( *cp != NULL ) { 90*37636Sjak if ( *(cp + 1) == NULL ) { 91*37636Sjak putchar ( *cp ); 9214040Smckusick break; 9314040Smckusick } 94*37636Sjak if ( (code = bgindex ( cp )) < 0 ) { 95*37636Sjak putchar ( *cp++ ); 96*37636Sjak putchar ( *cp++ ); 97*37636Sjak } 98*37636Sjak else { /* found, so mark byte with parity bit */ 99*37636Sjak putchar ( (code / 2) | PARITY ); 100*37636Sjak cp += 2; 101*37636Sjak } 10214040Smckusick } 103*37636Sjak if ( path == buf1 ) /* swap pointers */ 104*37636Sjak path = buf2, oldpath = buf1; 105*37636Sjak else 106*37636Sjak path = buf1, oldpath = buf2; 10714040Smckusick } 10814040Smckusick } 10914040Smckusick 110*37636Sjak bgindex ( bg ) /* return location of bg in bigrams or -1 */ 111*37636Sjak char *bg; 11214040Smckusick { 113*37636Sjak register char *p; 114*37636Sjak register char bg0 = bg[0], bg1 = bg[1]; 11514040Smckusick 116*37636Sjak for ( p = bigrams; *p != NULL; p++ ) 117*37636Sjak if ( *p++ == bg0 && *p == bg1 ) 118*37636Sjak break; 119*37636Sjak return ( *p == NULL ? -1 : --p - bigrams ); 12014040Smckusick } 121