1*14040Smckusick #ifndef lint 2*14040Smckusick static char sccsid[] = "@(#)locate.code.c 4.1 (Berkeley) 07/21/83"; 3*14040Smckusick #endif not lint 4*14040Smckusick 5*14040Smckusick /* 6*14040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 7*14040Smckusick * to encode/decode a filename database) 8*14040Smckusick * 9*14040Smckusick * USAGE: find.bigram < list > bigrams 10*14040Smckusick * process bigrams (see find.squeeze) > common_bigrams 11*14040Smckusick * find.code common_bigrams < list > squozen_list 12*14040Smckusick * 13*14040Smckusick * METHOD: Uses 'front compression' (see ";login:", March 1983, p. 8 ). 14*14040Smckusick * Output format is, per line, an offset differential count byte 15*14040Smckusick * followed by a partially bigram-encoded ascii residue. 16*14040Smckusick * 17*14040Smckusick * The codes are: 18*14040Smckusick * 19*14040Smckusick * 0-28 likeliest differential counts + offset to make nonnegative 20*14040Smckusick * 30 escape code for out-of-range count to follow in next word 21*14040Smckusick * 128-255 bigram codes, (128 most common, as determined by 'find.squeeze') 22*14040Smckusick * 32-127 single character (printable) ascii residue 23*14040Smckusick * 24*14040Smckusick * SEE ALSO: squeeze, bigram.c, find.c 25*14040Smckusick * 26*14040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 27*14040Smckusick * NASA Ames Research Center, 10/82 28*14040Smckusick */ 29*14040Smckusick 30*14040Smckusick #include <stdio.h> 31*14040Smckusick 32*14040Smckusick #define MAXPATH 1024 /* maximum pathname length */ 33*14040Smckusick #define RESET 30 /* switch code */ 34*14040Smckusick 35*14040Smckusick char path[MAXPATH]; 36*14040Smckusick char oldpath[MAXPATH] = " "; 37*14040Smckusick char bigrams[257] = { 0 }; 38*14040Smckusick 39*14040Smckusick main ( argc, argv ) 40*14040Smckusick int argc; char *argv[]; 41*14040Smckusick { 42*14040Smckusick int count, oldcount, diffcount; 43*14040Smckusick int j, code; 44*14040Smckusick char bigram[3]; 45*14040Smckusick FILE *fp; 46*14040Smckusick 47*14040Smckusick oldcount = 0; 48*14040Smckusick bigram[2] = NULL; 49*14040Smckusick 50*14040Smckusick if ( (fp = fopen ( argv[1], "r" )) == NULL ) 51*14040Smckusick printf ( "Usage: find.code common_bigrams < list > coded_list\n" ), exit ( 1 ); 52*14040Smckusick fgets ( bigrams, 257, fp ); 53*14040Smckusick fwrite ( bigrams, 1, 256, stdout ); 54*14040Smckusick 55*14040Smckusick while ( gets ( path ) != NULL ) { 56*14040Smckusick /* 57*14040Smckusick squelch unprintable chars so as not to botch decoding 58*14040Smckusick */ 59*14040Smckusick for ( j = 0; path[j] != NULL; j++ ) { 60*14040Smckusick path[j] &= 0177; 61*14040Smckusick if ( path[j] < 040 || path[j] == 0177 ) 62*14040Smckusick path[j] = '?'; 63*14040Smckusick } 64*14040Smckusick count = prefix_length ( oldpath, path ); 65*14040Smckusick diffcount = count - oldcount; 66*14040Smckusick if ( (diffcount < -14) || (diffcount > 14) ) { 67*14040Smckusick putc ( RESET, stdout ); 68*14040Smckusick putw ( diffcount + 14, stdout ); 69*14040Smckusick } 70*14040Smckusick else 71*14040Smckusick putc ( diffcount + 14, stdout ); 72*14040Smckusick 73*14040Smckusick for ( j = count; path[j] != NULL; j += 2 ) { 74*14040Smckusick if ( path[j + 1] == NULL ) { 75*14040Smckusick putchar ( path[j] ); 76*14040Smckusick break; 77*14040Smckusick } 78*14040Smckusick bigram[0] = path[j]; 79*14040Smckusick bigram[1] = path[j + 1]; 80*14040Smckusick /* 81*14040Smckusick linear search for specific bigram in string table 82*14040Smckusick */ 83*14040Smckusick if ( (code = strindex ( bigrams, bigram )) % 2 == 0 ) 84*14040Smckusick putchar ( (code / 2) | 0200 ); 85*14040Smckusick else 86*14040Smckusick fputs ( bigram, stdout ); 87*14040Smckusick } 88*14040Smckusick strcpy ( oldpath, path ); 89*14040Smckusick oldcount = count; 90*14040Smckusick } 91*14040Smckusick } 92*14040Smckusick 93*14040Smckusick strindex ( string, pattern ) /* return location of pattern in string or -1 */ 94*14040Smckusick char *string, *pattern; 95*14040Smckusick { 96*14040Smckusick register char *s, *p, *q; 97*14040Smckusick 98*14040Smckusick for ( s = string; *s != NULL; s++ ) 99*14040Smckusick if ( *s == *pattern ) { /* fast first char check */ 100*14040Smckusick for ( p = pattern + 1, q = s + 1; *p != NULL; p++, q++ ) 101*14040Smckusick if ( *q != *p ) 102*14040Smckusick break; 103*14040Smckusick if ( *p == NULL ) 104*14040Smckusick return ( q - strlen ( pattern ) - string ); 105*14040Smckusick } 106*14040Smckusick return ( -1 ); 107*14040Smckusick } 108*14040Smckusick 109*14040Smckusick prefix_length ( s1, s2 ) /* return length of longest common prefix */ 110*14040Smckusick char *s1, *s2; /* ... of strings s1 and s2 */ 111*14040Smckusick { 112*14040Smckusick register char *start; 113*14040Smckusick 114*14040Smckusick for ( start = s1; *s1 == *s2; s1++, s2++ ) 115*14040Smckusick if ( *s1 == NULL ) 116*14040Smckusick break; 117*14040Smckusick return ( s1 - start ); 118*14040Smckusick } 119