114040Smckusick #ifndef lint 2*14051Smckusick static char sccsid[] = "@(#)locate.code.c 4.2 (Berkeley) 07/21/83"; 314040Smckusick #endif not lint 414040Smckusick 514040Smckusick /* 614040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 714040Smckusick * to encode/decode a filename database) 814040Smckusick * 9*14051Smckusick * USAGE: bigram < list > bigrams 10*14051Smckusick * process bigrams (see updatedb) > common_bigrams 11*14051Smckusick * code common_bigrams < list > squozen_list 1214040Smckusick * 1314040Smckusick * METHOD: Uses 'front compression' (see ";login:", March 1983, p. 8 ). 1414040Smckusick * Output format is, per line, an offset differential count byte 1514040Smckusick * followed by a partially bigram-encoded ascii residue. 1614040Smckusick * 1714040Smckusick * The codes are: 1814040Smckusick * 1914040Smckusick * 0-28 likeliest differential counts + offset to make nonnegative 2014040Smckusick * 30 escape code for out-of-range count to follow in next word 21*14051Smckusick * 128-255 bigram codes, (128 most common, as determined by 'updatedb') 2214040Smckusick * 32-127 single character (printable) ascii residue 2314040Smckusick * 24*14051Smckusick * SEE ALSO: updatedb.csh, bigram.c, find.c 2514040Smckusick * 2614040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 2714040Smckusick * NASA Ames Research Center, 10/82 2814040Smckusick */ 2914040Smckusick 3014040Smckusick #include <stdio.h> 3114040Smckusick 3214040Smckusick #define MAXPATH 1024 /* maximum pathname length */ 3314040Smckusick #define RESET 30 /* switch code */ 3414040Smckusick 3514040Smckusick char path[MAXPATH]; 3614040Smckusick char oldpath[MAXPATH] = " "; 3714040Smckusick char bigrams[257] = { 0 }; 3814040Smckusick 3914040Smckusick main ( argc, argv ) 4014040Smckusick int argc; char *argv[]; 4114040Smckusick { 4214040Smckusick int count, oldcount, diffcount; 4314040Smckusick int j, code; 4414040Smckusick char bigram[3]; 4514040Smckusick FILE *fp; 4614040Smckusick 4714040Smckusick oldcount = 0; 4814040Smckusick bigram[2] = NULL; 4914040Smckusick 50*14051Smckusick if ((fp = fopen(argv[1], "r")) == NULL) { 51*14051Smckusick printf("Usage: code common_bigrams < list > coded_list\n"); 52*14051Smckusick exit(1); 53*14051Smckusick } 5414040Smckusick fgets ( bigrams, 257, fp ); 5514040Smckusick fwrite ( bigrams, 1, 256, stdout ); 5614040Smckusick 5714040Smckusick while ( gets ( path ) != NULL ) { 5814040Smckusick /* 5914040Smckusick squelch unprintable chars so as not to botch decoding 6014040Smckusick */ 6114040Smckusick for ( j = 0; path[j] != NULL; j++ ) { 6214040Smckusick path[j] &= 0177; 6314040Smckusick if ( path[j] < 040 || path[j] == 0177 ) 6414040Smckusick path[j] = '?'; 6514040Smckusick } 6614040Smckusick count = prefix_length ( oldpath, path ); 6714040Smckusick diffcount = count - oldcount; 6814040Smckusick if ( (diffcount < -14) || (diffcount > 14) ) { 6914040Smckusick putc ( RESET, stdout ); 7014040Smckusick putw ( diffcount + 14, stdout ); 7114040Smckusick } 7214040Smckusick else 7314040Smckusick putc ( diffcount + 14, stdout ); 7414040Smckusick 7514040Smckusick for ( j = count; path[j] != NULL; j += 2 ) { 7614040Smckusick if ( path[j + 1] == NULL ) { 7714040Smckusick putchar ( path[j] ); 7814040Smckusick break; 7914040Smckusick } 8014040Smckusick bigram[0] = path[j]; 8114040Smckusick bigram[1] = path[j + 1]; 8214040Smckusick /* 8314040Smckusick linear search for specific bigram in string table 8414040Smckusick */ 8514040Smckusick if ( (code = strindex ( bigrams, bigram )) % 2 == 0 ) 8614040Smckusick putchar ( (code / 2) | 0200 ); 8714040Smckusick else 8814040Smckusick fputs ( bigram, stdout ); 8914040Smckusick } 9014040Smckusick strcpy ( oldpath, path ); 9114040Smckusick oldcount = count; 9214040Smckusick } 9314040Smckusick } 9414040Smckusick 9514040Smckusick strindex ( string, pattern ) /* return location of pattern in string or -1 */ 9614040Smckusick char *string, *pattern; 9714040Smckusick { 9814040Smckusick register char *s, *p, *q; 9914040Smckusick 10014040Smckusick for ( s = string; *s != NULL; s++ ) 10114040Smckusick if ( *s == *pattern ) { /* fast first char check */ 10214040Smckusick for ( p = pattern + 1, q = s + 1; *p != NULL; p++, q++ ) 10314040Smckusick if ( *q != *p ) 10414040Smckusick break; 10514040Smckusick if ( *p == NULL ) 10614040Smckusick return ( q - strlen ( pattern ) - string ); 10714040Smckusick } 10814040Smckusick return ( -1 ); 10914040Smckusick } 11014040Smckusick 11114040Smckusick prefix_length ( s1, s2 ) /* return length of longest common prefix */ 11214040Smckusick char *s1, *s2; /* ... of strings s1 and s2 */ 11314040Smckusick { 11414040Smckusick register char *start; 11514040Smckusick 11614040Smckusick for ( start = s1; *s1 == *s2; s1++, s2++ ) 11714040Smckusick if ( *s1 == NULL ) 11814040Smckusick break; 11914040Smckusick return ( s1 - start ); 12014040Smckusick } 121