1 /* 2 * Copyright (c) 1989 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * James A. Woods. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 #ifndef lint 38 char copyright[] = 39 "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 40 All rights reserved.\n"; 41 #endif /* not lint */ 42 43 #ifndef lint 44 /*static char sccsid[] = "from: @(#)locate.code.c 4.8 (Berkeley) 6/1/90";*/ 45 static char rcsid[] = "$Id: locate.code.c,v 1.2 1993/08/01 18:13:52 mycroft Exp $"; 46 #endif /* not lint */ 47 48 /* 49 * PURPOSE: sorted list compressor (works with a modified 'find' 50 * to encode/decode a filename database) 51 * 52 * USAGE: bigram < list > bigrams 53 * process bigrams (see updatedb) > common_bigrams 54 * code common_bigrams < list > squozen_list 55 * 56 * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 57 * February/March 1983, p. 8 ). Output format is, per line, an 58 * offset differential count byte followed by a partially bigram- 59 * encoded ascii residue. A bigram is a two-character sequence, 60 * the first 128 most common of which are encoded in one byte. 61 * 62 * EXAMPLE: For simple front compression with no bigram encoding, 63 * if the input is... then the output is... 64 * 65 * /usr/src 0 /usr/src 66 * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 67 * /usr/src/cmd/armadillo.c 14 armadillo.c 68 * /usr/tmp/zoo 5 tmp/zoo 69 * 70 * The codes are: 71 * 72 * 0-28 likeliest differential counts + offset to make nonnegative 73 * 30 switch code for out-of-range count to follow in next word 74 * 128-255 bigram codes (128 most common, as determined by 'updatedb') 75 * 32-127 single character (printable) ascii residue (ie, literal) 76 * 77 * SEE ALSO: updatedb.csh, bigram.c, find.c 78 * 79 * AUTHOR: James A. Woods, Informatics General Corp., 80 * NASA Ames Research Center, 10/82 81 */ 82 83 #include <sys/param.h> 84 #include <stdio.h> 85 #include "locate.h" 86 87 #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 88 89 char buf1[MAXPATHLEN] = " "; 90 char buf2[MAXPATHLEN]; 91 char bigrams[BGBUFSIZE + 1] = { 0 }; 92 93 main ( argc, argv ) 94 int argc; char *argv[]; 95 { 96 register char *cp, *oldpath = buf1, *path = buf2; 97 int code, count, diffcount, oldcount = 0; 98 FILE *fp; 99 100 if ((fp = fopen(argv[1], "r")) == NULL) { 101 printf("Usage: code common_bigrams < list > squozen_list\n"); 102 exit(1); 103 } 104 /* first copy bigram array to stdout */ 105 fgets ( bigrams, BGBUFSIZE + 1, fp ); 106 fwrite ( bigrams, 1, BGBUFSIZE, stdout ); 107 fclose( fp ); 108 109 while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { 110 /* truncate newline */ 111 cp = path + strlen(path) - 1; 112 if (cp > path && *cp == '\n') 113 *cp = '\0'; 114 /* squelch characters that would botch the decoding */ 115 for ( cp = path; *cp != NULL; cp++ ) { 116 if ( (unsigned char)*cp >= PARITY ) 117 *cp &= PARITY-1; 118 else if ( *cp <= SWITCH ) 119 *cp = '?'; 120 } 121 /* skip longest common prefix */ 122 for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 123 if ( *oldpath == NULL ) 124 break; 125 count = cp - path; 126 diffcount = count - oldcount + OFFSET; 127 oldcount = count; 128 if ( diffcount < 0 || diffcount > 2*OFFSET ) { 129 putc ( SWITCH, stdout ); 130 putw ( diffcount, stdout ); 131 } 132 else 133 putc ( diffcount, stdout ); 134 135 while ( *cp != NULL ) { 136 if ( *(cp + 1) == NULL ) { 137 putchar ( *cp ); 138 break; 139 } 140 if ( (code = bgindex ( cp )) < 0 ) { 141 putchar ( *cp++ ); 142 putchar ( *cp++ ); 143 } 144 else { /* found, so mark byte with parity bit */ 145 putchar ( (code / 2) | PARITY ); 146 cp += 2; 147 } 148 } 149 if ( path == buf1 ) /* swap pointers */ 150 path = buf2, oldpath = buf1; 151 else 152 path = buf1, oldpath = buf2; 153 } 154 } 155 156 bgindex ( bg ) /* return location of bg in bigrams or -1 */ 157 char *bg; 158 { 159 register char *p; 160 register char bg0 = bg[0], bg1 = bg[1]; 161 162 for ( p = bigrams; *p != NULL; p++ ) 163 if ( *p++ == bg0 && *p == bg1 ) 164 break; 165 return ( *p == NULL ? -1 : --p - bigrams ); 166 } 167