1 #ifndef lint
2 static char sccsid[] = "@(#)locate.code.c	4.1	(Berkeley)	07/21/83";
3 #endif not lint
4 
5 /*
6  * PURPOSE:	sorted list compressor (works with a modified 'find'
7  *		to encode/decode a filename database)
8  *
9  * USAGE:	find.bigram < list > bigrams
10  *		process bigrams (see find.squeeze) > common_bigrams
11  *		find.code common_bigrams < list > squozen_list
12  *
13  * METHOD:	Uses 'front compression' (see ";login:", March 1983, p. 8 ).
14  *		Output format is, per line, an offset differential count byte
15  *		followed by a partially bigram-encoded ascii residue.
16  *
17  *  	The codes are:
18  *
19  *	0-28	likeliest differential counts + offset to make nonnegative
20  *	30	escape code for out-of-range count to follow in next word
21  *	128-255 bigram codes, (128 most common, as determined by 'find.squeeze')
22  *	32-127  single character (printable) ascii residue
23  *
24  * SEE ALSO:	squeeze, bigram.c, find.c
25  *
26  * AUTHOR:	James A. Woods, Informatics General Corp.,
27  *		NASA Ames Research Center, 10/82
28  */
29 
30 #include <stdio.h>
31 
32 #define MAXPATH 1024		/* maximum pathname length */
33 #define	RESET	30		/* switch code */
34 
35 char path[MAXPATH];
36 char oldpath[MAXPATH] = " ";
37 char bigrams[257] = { 0 };
38 
39 main ( argc, argv )
40 	int argc; char *argv[];
41 {
42   	int count, oldcount, diffcount;
43 	int j, code;
44 	char bigram[3];
45 	FILE *fp;
46 
47 	oldcount = 0;
48 	bigram[2] = NULL;
49 
50 	if ( (fp = fopen ( argv[1], "r" )) == NULL )
51 		printf ( "Usage: find.code common_bigrams < list > coded_list\n" ), exit ( 1 );
52 	fgets ( bigrams, 257, fp );
53 	fwrite ( bigrams, 1, 256, stdout );
54 
55      	while ( gets ( path ) != NULL ) {
56 		/*
57 		   squelch unprintable chars so as not to botch decoding
58 		*/
59 		for ( j = 0; path[j] != NULL; j++ ) {
60 			path[j] &= 0177;
61 			if ( path[j] < 040 || path[j] == 0177 )
62 				path[j] = '?';
63 		}
64 		count = prefix_length ( oldpath, path );
65 		diffcount = count - oldcount;
66 		if ( (diffcount < -14) || (diffcount > 14) ) {
67 			putc ( RESET, stdout );
68 			putw ( diffcount + 14, stdout );
69 		}
70 		else
71 			putc ( diffcount + 14, stdout );
72 
73 		for ( j = count; path[j] != NULL; j += 2 ) {
74 			if ( path[j + 1] == NULL ) {
75 				putchar ( path[j] );
76 				break;
77 			}
78 			bigram[0] = path[j];
79 			bigram[1] = path[j + 1];
80 			/*
81 			    linear search for specific bigram in string table
82 			*/
83 			if ( (code = strindex ( bigrams, bigram )) % 2 == 0 )
84 				putchar ( (code / 2) | 0200 );
85 			else
86 				fputs ( bigram, stdout );
87 		}
88 		strcpy ( oldpath, path );
89 		oldcount = count;
90 	}
91 }
92 
93 strindex ( string, pattern )	/* return location of pattern in string or -1 */
94 	char *string, *pattern;
95 {
96 	register char *s, *p, *q;
97 
98 	for ( s = string; *s != NULL; s++ )
99 		if ( *s == *pattern ) {		/* fast first char check */
100 			for ( p = pattern + 1, q = s + 1; *p != NULL; p++, q++ )
101 				if ( *q != *p )
102 					break;
103 			if ( *p == NULL )
104 				return ( q - strlen ( pattern ) - string );
105 		}
106 	return ( -1 );
107 }
108 
109 prefix_length ( s1, s2 )	/* return length of longest common prefix */
110 	char *s1, *s2;		/* ... of strings s1 and s2 */
111 {
112 	register char *start;
113 
114     	for ( start = s1; *s1 == *s2; s1++, s2++ )
115 		if ( *s1 == NULL )
116 	    		break;
117     	return ( s1 - start );
118 }
119