1*14040Smckusick #ifndef lint
2*14040Smckusick static char sccsid[] = "@(#)locate.code.c	4.1	(Berkeley)	07/21/83";
3*14040Smckusick #endif not lint
4*14040Smckusick 
5*14040Smckusick /*
6*14040Smckusick  * PURPOSE:	sorted list compressor (works with a modified 'find'
7*14040Smckusick  *		to encode/decode a filename database)
8*14040Smckusick  *
9*14040Smckusick  * USAGE:	find.bigram < list > bigrams
10*14040Smckusick  *		process bigrams (see find.squeeze) > common_bigrams
11*14040Smckusick  *		find.code common_bigrams < list > squozen_list
12*14040Smckusick  *
13*14040Smckusick  * METHOD:	Uses 'front compression' (see ";login:", March 1983, p. 8 ).
14*14040Smckusick  *		Output format is, per line, an offset differential count byte
15*14040Smckusick  *		followed by a partially bigram-encoded ascii residue.
16*14040Smckusick  *
17*14040Smckusick  *  	The codes are:
18*14040Smckusick  *
19*14040Smckusick  *	0-28	likeliest differential counts + offset to make nonnegative
20*14040Smckusick  *	30	escape code for out-of-range count to follow in next word
21*14040Smckusick  *	128-255 bigram codes, (128 most common, as determined by 'find.squeeze')
22*14040Smckusick  *	32-127  single character (printable) ascii residue
23*14040Smckusick  *
24*14040Smckusick  * SEE ALSO:	squeeze, bigram.c, find.c
25*14040Smckusick  *
26*14040Smckusick  * AUTHOR:	James A. Woods, Informatics General Corp.,
27*14040Smckusick  *		NASA Ames Research Center, 10/82
28*14040Smckusick  */
29*14040Smckusick 
30*14040Smckusick #include <stdio.h>
31*14040Smckusick 
32*14040Smckusick #define MAXPATH 1024		/* maximum pathname length */
33*14040Smckusick #define	RESET	30		/* switch code */
34*14040Smckusick 
35*14040Smckusick char path[MAXPATH];
36*14040Smckusick char oldpath[MAXPATH] = " ";
37*14040Smckusick char bigrams[257] = { 0 };
38*14040Smckusick 
39*14040Smckusick main ( argc, argv )
40*14040Smckusick 	int argc; char *argv[];
41*14040Smckusick {
42*14040Smckusick   	int count, oldcount, diffcount;
43*14040Smckusick 	int j, code;
44*14040Smckusick 	char bigram[3];
45*14040Smckusick 	FILE *fp;
46*14040Smckusick 
47*14040Smckusick 	oldcount = 0;
48*14040Smckusick 	bigram[2] = NULL;
49*14040Smckusick 
50*14040Smckusick 	if ( (fp = fopen ( argv[1], "r" )) == NULL )
51*14040Smckusick 		printf ( "Usage: find.code common_bigrams < list > coded_list\n" ), exit ( 1 );
52*14040Smckusick 	fgets ( bigrams, 257, fp );
53*14040Smckusick 	fwrite ( bigrams, 1, 256, stdout );
54*14040Smckusick 
55*14040Smckusick      	while ( gets ( path ) != NULL ) {
56*14040Smckusick 		/*
57*14040Smckusick 		   squelch unprintable chars so as not to botch decoding
58*14040Smckusick 		*/
59*14040Smckusick 		for ( j = 0; path[j] != NULL; j++ ) {
60*14040Smckusick 			path[j] &= 0177;
61*14040Smckusick 			if ( path[j] < 040 || path[j] == 0177 )
62*14040Smckusick 				path[j] = '?';
63*14040Smckusick 		}
64*14040Smckusick 		count = prefix_length ( oldpath, path );
65*14040Smckusick 		diffcount = count - oldcount;
66*14040Smckusick 		if ( (diffcount < -14) || (diffcount > 14) ) {
67*14040Smckusick 			putc ( RESET, stdout );
68*14040Smckusick 			putw ( diffcount + 14, stdout );
69*14040Smckusick 		}
70*14040Smckusick 		else
71*14040Smckusick 			putc ( diffcount + 14, stdout );
72*14040Smckusick 
73*14040Smckusick 		for ( j = count; path[j] != NULL; j += 2 ) {
74*14040Smckusick 			if ( path[j + 1] == NULL ) {
75*14040Smckusick 				putchar ( path[j] );
76*14040Smckusick 				break;
77*14040Smckusick 			}
78*14040Smckusick 			bigram[0] = path[j];
79*14040Smckusick 			bigram[1] = path[j + 1];
80*14040Smckusick 			/*
81*14040Smckusick 			    linear search for specific bigram in string table
82*14040Smckusick 			*/
83*14040Smckusick 			if ( (code = strindex ( bigrams, bigram )) % 2 == 0 )
84*14040Smckusick 				putchar ( (code / 2) | 0200 );
85*14040Smckusick 			else
86*14040Smckusick 				fputs ( bigram, stdout );
87*14040Smckusick 		}
88*14040Smckusick 		strcpy ( oldpath, path );
89*14040Smckusick 		oldcount = count;
90*14040Smckusick 	}
91*14040Smckusick }
92*14040Smckusick 
93*14040Smckusick strindex ( string, pattern )	/* return location of pattern in string or -1 */
94*14040Smckusick 	char *string, *pattern;
95*14040Smckusick {
96*14040Smckusick 	register char *s, *p, *q;
97*14040Smckusick 
98*14040Smckusick 	for ( s = string; *s != NULL; s++ )
99*14040Smckusick 		if ( *s == *pattern ) {		/* fast first char check */
100*14040Smckusick 			for ( p = pattern + 1, q = s + 1; *p != NULL; p++, q++ )
101*14040Smckusick 				if ( *q != *p )
102*14040Smckusick 					break;
103*14040Smckusick 			if ( *p == NULL )
104*14040Smckusick 				return ( q - strlen ( pattern ) - string );
105*14040Smckusick 		}
106*14040Smckusick 	return ( -1 );
107*14040Smckusick }
108*14040Smckusick 
109*14040Smckusick prefix_length ( s1, s2 )	/* return length of longest common prefix */
110*14040Smckusick 	char *s1, *s2;		/* ... of strings s1 and s2 */
111*14040Smckusick {
112*14040Smckusick 	register char *start;
113*14040Smckusick 
114*14040Smckusick     	for ( start = s1; *s1 == *s2; s1++, s2++ )
115*14040Smckusick 		if ( *s1 == NULL )
116*14040Smckusick 	    		break;
117*14040Smckusick     	return ( s1 - start );
118*14040Smckusick }
119