137637Sbostic /* 237637Sbostic * Copyright (c) 1989 The Regents of the University of California. 337637Sbostic * All rights reserved. 437637Sbostic * 537637Sbostic * This code is derived from software contributed to Berkeley by 637637Sbostic * James A. Woods. 737637Sbostic * 8*42737Sbostic * %sccs.include.redist.c% 937637Sbostic */ 1037637Sbostic 1114040Smckusick #ifndef lint 1237637Sbostic char copyright[] = 1337637Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 1437637Sbostic All rights reserved.\n"; 1537637Sbostic #endif /* not lint */ 1614040Smckusick 1737637Sbostic #ifndef lint 18*42737Sbostic static char sccsid[] = "@(#)locate.code.c 4.8 (Berkeley) 06/01/90"; 1937637Sbostic #endif /* not lint */ 2037637Sbostic 2114040Smckusick /* 2214040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 2314040Smckusick * to encode/decode a filename database) 2414040Smckusick * 2514051Smckusick * USAGE: bigram < list > bigrams 2614051Smckusick * process bigrams (see updatedb) > common_bigrams 2714051Smckusick * code common_bigrams < list > squozen_list 2814040Smckusick * 2937636Sjak * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 3037636Sjak * February/March 1983, p. 8 ). Output format is, per line, an 3137636Sjak * offset differential count byte followed by a partially bigram- 3237636Sjak * encoded ascii residue. A bigram is a two-character sequence, 3337636Sjak * the first 128 most common of which are encoded in one byte. 3414040Smckusick * 3537636Sjak * EXAMPLE: For simple front compression with no bigram encoding, 3637636Sjak * if the input is... then the output is... 3737636Sjak * 3837636Sjak * /usr/src 0 /usr/src 3937636Sjak * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 4037636Sjak * /usr/src/cmd/armadillo.c 14 armadillo.c 4137636Sjak * /usr/tmp/zoo 5 tmp/zoo 4237636Sjak * 4314040Smckusick * The codes are: 4414040Smckusick * 4514040Smckusick * 0-28 likeliest differential counts + offset to make nonnegative 4637636Sjak * 30 switch code for out-of-range count to follow in next word 4737636Sjak * 128-255 bigram codes (128 most common, as determined by 'updatedb') 4837636Sjak * 32-127 single character (printable) ascii residue (ie, literal) 4914040Smckusick * 5014051Smckusick * SEE ALSO: updatedb.csh, bigram.c, find.c 5114040Smckusick * 5214040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 5314040Smckusick * NASA Ames Research Center, 10/82 5414040Smckusick */ 5514040Smckusick 5640302Sbostic #include <sys/param.h> 5714040Smckusick #include <stdio.h> 5840302Sbostic #include "locate.h" 5914040Smckusick 6037636Sjak #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 6114040Smckusick 6237636Sjak char buf1[MAXPATHLEN] = " "; 6337636Sjak char buf2[MAXPATHLEN]; 6437636Sjak char bigrams[BGBUFSIZE + 1] = { 0 }; 6514040Smckusick 6614040Smckusick main ( argc, argv ) 6714040Smckusick int argc; char *argv[]; 6814040Smckusick { 6937636Sjak register char *cp, *oldpath = buf1, *path = buf2; 7037636Sjak int code, count, diffcount, oldcount = 0; 7114040Smckusick FILE *fp; 7214040Smckusick 7314051Smckusick if ((fp = fopen(argv[1], "r")) == NULL) { 7437636Sjak printf("Usage: code common_bigrams < list > squozen_list\n"); 7514051Smckusick exit(1); 7614051Smckusick } 7737636Sjak /* first copy bigram array to stdout */ 7837636Sjak fgets ( bigrams, BGBUFSIZE + 1, fp ); 7937636Sjak fwrite ( bigrams, 1, BGBUFSIZE, stdout ); 8037636Sjak fclose( fp ); 8114040Smckusick 8238085Sbostic while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { 8339275Sbostic /* truncate newline */ 8439275Sbostic cp = path + strlen(path) - 1; 8539275Sbostic if (cp > path && *cp == '\n') 8639275Sbostic *cp = '\0'; 8737636Sjak /* squelch characters that would botch the decoding */ 8837636Sjak for ( cp = path; *cp != NULL; cp++ ) { 8939275Sbostic if ( (unsigned char)*cp >= PARITY ) 9037636Sjak *cp &= PARITY-1; 9137636Sjak else if ( *cp <= SWITCH ) 9237636Sjak *cp = '?'; 9314040Smckusick } 9437636Sjak /* skip longest common prefix */ 9537636Sjak for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 9637636Sjak if ( *oldpath == NULL ) 9737636Sjak break; 9837636Sjak count = cp - path; 9937636Sjak diffcount = count - oldcount + OFFSET; 10037636Sjak oldcount = count; 10137636Sjak if ( diffcount < 0 || diffcount > 2*OFFSET ) { 10237636Sjak putc ( SWITCH, stdout ); 10337636Sjak putw ( diffcount, stdout ); 10414040Smckusick } 10514040Smckusick else 10637636Sjak putc ( diffcount, stdout ); 10714040Smckusick 10837636Sjak while ( *cp != NULL ) { 10937636Sjak if ( *(cp + 1) == NULL ) { 11037636Sjak putchar ( *cp ); 11114040Smckusick break; 11214040Smckusick } 11337636Sjak if ( (code = bgindex ( cp )) < 0 ) { 11437636Sjak putchar ( *cp++ ); 11537636Sjak putchar ( *cp++ ); 11637636Sjak } 11737636Sjak else { /* found, so mark byte with parity bit */ 11837636Sjak putchar ( (code / 2) | PARITY ); 11937636Sjak cp += 2; 12037636Sjak } 12114040Smckusick } 12237636Sjak if ( path == buf1 ) /* swap pointers */ 12337636Sjak path = buf2, oldpath = buf1; 12437636Sjak else 12537636Sjak path = buf1, oldpath = buf2; 12614040Smckusick } 12714040Smckusick } 12814040Smckusick 12937636Sjak bgindex ( bg ) /* return location of bg in bigrams or -1 */ 13037636Sjak char *bg; 13114040Smckusick { 13237636Sjak register char *p; 13337636Sjak register char bg0 = bg[0], bg1 = bg[1]; 13414040Smckusick 13537636Sjak for ( p = bigrams; *p != NULL; p++ ) 13637636Sjak if ( *p++ == bg0 && *p == bg1 ) 13737636Sjak break; 13837636Sjak return ( *p == NULL ? -1 : --p - bigrams ); 13914040Smckusick } 140