114040Smckusick #ifndef lint
2*14051Smckusick static char sccsid[] = "@(#)locate.code.c	4.2	(Berkeley)	07/21/83";
314040Smckusick #endif not lint
414040Smckusick 
514040Smckusick /*
614040Smckusick  * PURPOSE:	sorted list compressor (works with a modified 'find'
714040Smckusick  *		to encode/decode a filename database)
814040Smckusick  *
9*14051Smckusick  * USAGE:	bigram < list > bigrams
10*14051Smckusick  *		process bigrams (see updatedb) > common_bigrams
11*14051Smckusick  *		code common_bigrams < list > squozen_list
1214040Smckusick  *
1314040Smckusick  * METHOD:	Uses 'front compression' (see ";login:", March 1983, p. 8 ).
1414040Smckusick  *		Output format is, per line, an offset differential count byte
1514040Smckusick  *		followed by a partially bigram-encoded ascii residue.
1614040Smckusick  *
1714040Smckusick  *  	The codes are:
1814040Smckusick  *
1914040Smckusick  *	0-28	likeliest differential counts + offset to make nonnegative
2014040Smckusick  *	30	escape code for out-of-range count to follow in next word
21*14051Smckusick  *	128-255 bigram codes, (128 most common, as determined by 'updatedb')
2214040Smckusick  *	32-127  single character (printable) ascii residue
2314040Smckusick  *
24*14051Smckusick  * SEE ALSO:	updatedb.csh, bigram.c, find.c
2514040Smckusick  *
2614040Smckusick  * AUTHOR:	James A. Woods, Informatics General Corp.,
2714040Smckusick  *		NASA Ames Research Center, 10/82
2814040Smckusick  */
2914040Smckusick 
3014040Smckusick #include <stdio.h>
3114040Smckusick 
3214040Smckusick #define MAXPATH 1024		/* maximum pathname length */
3314040Smckusick #define	RESET	30		/* switch code */
3414040Smckusick 
3514040Smckusick char path[MAXPATH];
3614040Smckusick char oldpath[MAXPATH] = " ";
3714040Smckusick char bigrams[257] = { 0 };
3814040Smckusick 
3914040Smckusick main ( argc, argv )
4014040Smckusick 	int argc; char *argv[];
4114040Smckusick {
4214040Smckusick   	int count, oldcount, diffcount;
4314040Smckusick 	int j, code;
4414040Smckusick 	char bigram[3];
4514040Smckusick 	FILE *fp;
4614040Smckusick 
4714040Smckusick 	oldcount = 0;
4814040Smckusick 	bigram[2] = NULL;
4914040Smckusick 
50*14051Smckusick 	if ((fp = fopen(argv[1], "r")) == NULL) {
51*14051Smckusick 		printf("Usage: code common_bigrams < list > coded_list\n");
52*14051Smckusick 		exit(1);
53*14051Smckusick 	}
5414040Smckusick 	fgets ( bigrams, 257, fp );
5514040Smckusick 	fwrite ( bigrams, 1, 256, stdout );
5614040Smckusick 
5714040Smckusick      	while ( gets ( path ) != NULL ) {
5814040Smckusick 		/*
5914040Smckusick 		   squelch unprintable chars so as not to botch decoding
6014040Smckusick 		*/
6114040Smckusick 		for ( j = 0; path[j] != NULL; j++ ) {
6214040Smckusick 			path[j] &= 0177;
6314040Smckusick 			if ( path[j] < 040 || path[j] == 0177 )
6414040Smckusick 				path[j] = '?';
6514040Smckusick 		}
6614040Smckusick 		count = prefix_length ( oldpath, path );
6714040Smckusick 		diffcount = count - oldcount;
6814040Smckusick 		if ( (diffcount < -14) || (diffcount > 14) ) {
6914040Smckusick 			putc ( RESET, stdout );
7014040Smckusick 			putw ( diffcount + 14, stdout );
7114040Smckusick 		}
7214040Smckusick 		else
7314040Smckusick 			putc ( diffcount + 14, stdout );
7414040Smckusick 
7514040Smckusick 		for ( j = count; path[j] != NULL; j += 2 ) {
7614040Smckusick 			if ( path[j + 1] == NULL ) {
7714040Smckusick 				putchar ( path[j] );
7814040Smckusick 				break;
7914040Smckusick 			}
8014040Smckusick 			bigram[0] = path[j];
8114040Smckusick 			bigram[1] = path[j + 1];
8214040Smckusick 			/*
8314040Smckusick 			    linear search for specific bigram in string table
8414040Smckusick 			*/
8514040Smckusick 			if ( (code = strindex ( bigrams, bigram )) % 2 == 0 )
8614040Smckusick 				putchar ( (code / 2) | 0200 );
8714040Smckusick 			else
8814040Smckusick 				fputs ( bigram, stdout );
8914040Smckusick 		}
9014040Smckusick 		strcpy ( oldpath, path );
9114040Smckusick 		oldcount = count;
9214040Smckusick 	}
9314040Smckusick }
9414040Smckusick 
9514040Smckusick strindex ( string, pattern )	/* return location of pattern in string or -1 */
9614040Smckusick 	char *string, *pattern;
9714040Smckusick {
9814040Smckusick 	register char *s, *p, *q;
9914040Smckusick 
10014040Smckusick 	for ( s = string; *s != NULL; s++ )
10114040Smckusick 		if ( *s == *pattern ) {		/* fast first char check */
10214040Smckusick 			for ( p = pattern + 1, q = s + 1; *p != NULL; p++, q++ )
10314040Smckusick 				if ( *q != *p )
10414040Smckusick 					break;
10514040Smckusick 			if ( *p == NULL )
10614040Smckusick 				return ( q - strlen ( pattern ) - string );
10714040Smckusick 		}
10814040Smckusick 	return ( -1 );
10914040Smckusick }
11014040Smckusick 
11114040Smckusick prefix_length ( s1, s2 )	/* return length of longest common prefix */
11214040Smckusick 	char *s1, *s2;		/* ... of strings s1 and s2 */
11314040Smckusick {
11414040Smckusick 	register char *start;
11514040Smckusick 
11614040Smckusick     	for ( start = s1; *s1 == *s2; s1++, s2++ )
11714040Smckusick 		if ( *s1 == NULL )
11814040Smckusick 	    		break;
11914040Smckusick     	return ( s1 - start );
12014040Smckusick }
121