1 #ifndef lint 2 static char sccsid[] = "@(#)locate.code.c 4.3 (Berkeley) 05/04/89"; 3 #endif not lint 4 5 /* 6 * PURPOSE: sorted list compressor (works with a modified 'find' 7 * to encode/decode a filename database) 8 * 9 * USAGE: bigram < list > bigrams 10 * process bigrams (see updatedb) > common_bigrams 11 * code common_bigrams < list > squozen_list 12 * 13 * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 14 * February/March 1983, p. 8 ). Output format is, per line, an 15 * offset differential count byte followed by a partially bigram- 16 * encoded ascii residue. A bigram is a two-character sequence, 17 * the first 128 most common of which are encoded in one byte. 18 * 19 * EXAMPLE: For simple front compression with no bigram encoding, 20 * if the input is... then the output is... 21 * 22 * /usr/src 0 /usr/src 23 * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 24 * /usr/src/cmd/armadillo.c 14 armadillo.c 25 * /usr/tmp/zoo 5 tmp/zoo 26 * 27 * The codes are: 28 * 29 * 0-28 likeliest differential counts + offset to make nonnegative 30 * 30 switch code for out-of-range count to follow in next word 31 * 128-255 bigram codes (128 most common, as determined by 'updatedb') 32 * 32-127 single character (printable) ascii residue (ie, literal) 33 * 34 * SEE ALSO: updatedb.csh, bigram.c, find.c 35 * 36 * AUTHOR: James A. Woods, Informatics General Corp., 37 * NASA Ames Research Center, 10/82 38 */ 39 40 #include <stdio.h> 41 #include <sys/param.h> 42 #include "find.h" 43 44 #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 45 46 char buf1[MAXPATHLEN] = " "; 47 char buf2[MAXPATHLEN]; 48 char bigrams[BGBUFSIZE + 1] = { 0 }; 49 50 main ( argc, argv ) 51 int argc; char *argv[]; 52 { 53 register char *cp, *oldpath = buf1, *path = buf2; 54 int code, count, diffcount, oldcount = 0; 55 FILE *fp; 56 57 if ((fp = fopen(argv[1], "r")) == NULL) { 58 printf("Usage: code common_bigrams < list > squozen_list\n"); 59 exit(1); 60 } 61 /* first copy bigram array to stdout */ 62 fgets ( bigrams, BGBUFSIZE + 1, fp ); 63 fwrite ( bigrams, 1, BGBUFSIZE, stdout ); 64 fclose( fp ); 65 66 /* every path will fit in path buffer, so safe to use gets */ 67 while ( gets ( path ) != NULL ) { 68 /* squelch characters that would botch the decoding */ 69 for ( cp = path; *cp != NULL; cp++ ) { 70 if ( *cp >= PARITY ) 71 *cp &= PARITY-1; 72 else if ( *cp <= SWITCH ) 73 *cp = '?'; 74 } 75 /* skip longest common prefix */ 76 for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 77 if ( *oldpath == NULL ) 78 break; 79 count = cp - path; 80 diffcount = count - oldcount + OFFSET; 81 oldcount = count; 82 if ( diffcount < 0 || diffcount > 2*OFFSET ) { 83 putc ( SWITCH, stdout ); 84 putw ( diffcount, stdout ); 85 } 86 else 87 putc ( diffcount, stdout ); 88 89 while ( *cp != NULL ) { 90 if ( *(cp + 1) == NULL ) { 91 putchar ( *cp ); 92 break; 93 } 94 if ( (code = bgindex ( cp )) < 0 ) { 95 putchar ( *cp++ ); 96 putchar ( *cp++ ); 97 } 98 else { /* found, so mark byte with parity bit */ 99 putchar ( (code / 2) | PARITY ); 100 cp += 2; 101 } 102 } 103 if ( path == buf1 ) /* swap pointers */ 104 path = buf2, oldpath = buf1; 105 else 106 path = buf1, oldpath = buf2; 107 } 108 } 109 110 bgindex ( bg ) /* return location of bg in bigrams or -1 */ 111 char *bg; 112 { 113 register char *p; 114 register char bg0 = bg[0], bg1 = bg[1]; 115 116 for ( p = bigrams; *p != NULL; p++ ) 117 if ( *p++ == bg0 && *p == bg1 ) 118 break; 119 return ( *p == NULL ? -1 : --p - bigrams ); 120 } 121