1 /* 2 * $OpenBSD: locate.code.c,v 1.7 1997/01/15 23:42:42 millert Exp $ 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * James A. Woods. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * $Id: locate.code.c,v 1.7 1997/01/15 23:42:42 millert Exp $ 39 */ 40 41 #ifndef lint 42 static char copyright[] = 43 "@(#) Copyright (c) 1989, 1993\n\ 44 The Regents of the University of California. All rights reserved.\n"; 45 #endif /* not lint */ 46 47 #ifndef lint 48 #if 0 49 static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; 50 #else 51 static char rcsid[] = "$OpenBSD: locate.code.c,v 1.7 1997/01/15 23:42:42 millert Exp $"; 52 #endif 53 #endif /* not lint */ 54 55 /* 56 * PURPOSE: sorted list compressor (works with a modified 'find' 57 * to encode/decode a filename database) 58 * 59 * USAGE: bigram < list > bigrams 60 * process bigrams (see updatedb) > common_bigrams 61 * code common_bigrams < list > squozen_list 62 * 63 * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 64 * February/March 1983, p. 8). Output format is, per line, an 65 * offset differential count byte followed by a partially bigram- 66 * encoded ascii residue. A bigram is a two-character sequence, 67 * the first 128 most common of which are encoded in one byte. 68 * 69 * EXAMPLE: For simple front compression with no bigram encoding, 70 * if the input is... then the output is... 71 * 72 * /usr/src 0 /usr/src 73 * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 74 * /usr/src/cmd/armadillo.c 14 armadillo.c 75 * /usr/tmp/zoo 5 tmp/zoo 76 * 77 * The codes are: 78 * 79 * 0-28 likeliest differential counts + offset to make nonnegative 80 * 30 switch code for out-of-range count to follow in next word 81 * 31 an 8 bit char followed 82 * 128-255 bigram codes (128 most common, as determined by 'updatedb') 83 * 32-127 single character (printable) ascii residue (ie, literal) 84 * 85 * The locate database store any character except newline ('\n') 86 * and NUL ('\0'). The 8-bit character support don't wast extra 87 * space until you have characters in file names less than 32 88 * or greather than 127. 89 * 90 * 91 * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c 92 * 93 * AUTHOR: James A. Woods, Informatics General Corp., 94 * NASA Ames Research Center, 10/82 95 * 8-bit file names characters: 96 * Wolfram Schneider, Berlin September 1996 97 */ 98 99 #include <sys/param.h> 100 #include <err.h> 101 #include <errno.h> 102 #include <stdlib.h> 103 #include <string.h> 104 #include <stdio.h> 105 #include "locate.h" 106 107 #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 108 109 u_char buf1[MAXPATHLEN] = " "; 110 u_char buf2[MAXPATHLEN]; 111 u_char bigrams[BGBUFSIZE + 1] = { 0 }; 112 113 #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ 114 115 #ifdef LOOKUP 116 #define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) 117 typedef short bg_t; 118 bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; 119 #else 120 #define BGINDEX(x) bgindex(x) 121 typedef int bg_t; 122 int bgindex __P((char *)); 123 #endif /* LOOKUP */ 124 125 126 void usage __P((void)); 127 extern int optind; 128 extern int optopt; 129 130 int 131 main(argc, argv) 132 int argc; 133 char *argv[]; 134 { 135 register u_char *cp, *oldpath, *path; 136 int ch, code, count, diffcount, oldcount; 137 FILE *fp; 138 register int i, j; 139 140 while ((ch = getopt(argc, argv, "")) != -1) 141 switch(ch) { 142 default: 143 usage(); 144 } 145 argc -= optind; 146 argv += optind; 147 148 if (argc != 1) 149 usage(); 150 151 if ((fp = fopen(argv[0], "r")) == NULL) 152 err(1, "%s", argv[0]); 153 154 /* First copy bigram array to stdout. */ 155 (void)fgets(bigrams, BGBUFSIZE + 1, fp); 156 157 if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) 158 err(1, "stdout"); 159 (void)fclose(fp); 160 161 #ifdef LOOKUP 162 /* init lookup table */ 163 for (i = 0; i < UCHAR_MAX + 1; i++) 164 for (j = 0; j < UCHAR_MAX + 1; j++) 165 big[i][j] = (bg_t)-1; 166 167 for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) 168 big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; 169 170 #endif /* LOOKUP */ 171 172 oldpath = buf1; 173 path = buf2; 174 oldcount = 0; 175 176 while (fgets(path, sizeof(buf2), stdin) != NULL) { 177 178 /* skip empty lines */ 179 if (*path == '\n') 180 continue; 181 182 /* remove newline */ 183 for (cp = path; *cp != '\0'; cp++) { 184 /* chop newline */ 185 if (*cp == '\n') 186 *cp = '\0'; 187 } 188 189 /* Skip longest common prefix. */ 190 for (cp = path; *cp == *oldpath; cp++, oldpath++) 191 if (*cp == '\0') 192 break; 193 194 count = cp - path; 195 diffcount = count - oldcount + OFFSET; 196 oldcount = count; 197 if (diffcount < 0 || diffcount > 2 * OFFSET) { 198 if (putchar(SWITCH) == EOF || 199 putw(diffcount, stdout) == EOF) 200 err(1, "stdout"); 201 } else 202 if (putchar(diffcount) == EOF) 203 err(1, "stdout"); 204 205 while (*cp != '\0') { 206 /* print *two* characters */ 207 208 if ((code = BGINDEX(cp)) != (bg_t)-1) { 209 /* 210 * print *one* as bigram 211 * Found, so mark byte with 212 * parity bit. 213 */ 214 if (putchar((code / 2) | PARITY) == EOF) 215 err(1, "stdout"); 216 cp += 2; 217 } 218 219 else { 220 for (i = 0; i < 2; i++) { 221 if (*cp == '\0') 222 break; 223 224 /* print umlauts in file names */ 225 if (*cp < ASCII_MIN || 226 *cp > ASCII_MAX) { 227 if (putchar(UMLAUT) == EOF || 228 putchar(*cp++) == EOF) 229 err(1, "stdout"); 230 } 231 232 else { 233 /* normal character */ 234 if(putchar(*cp++) == EOF) 235 err(1, "stdout"); 236 } 237 } 238 239 } 240 } 241 242 if (path == buf1) { /* swap pointers */ 243 path = buf2; 244 oldpath = buf1; 245 } else { 246 path = buf1; 247 oldpath = buf2; 248 } 249 } 250 /* Non-zero status if there were errors */ 251 if (fflush(stdout) != 0 || ferror(stdout)) 252 exit(1); 253 exit(0); 254 } 255 256 #ifndef LOOKUP 257 int 258 bgindex(bg) /* Return location of bg in bigrams or -1. */ 259 char *bg; 260 { 261 register char bg0, bg1, *p; 262 263 bg0 = bg[0]; 264 bg1 = bg[1]; 265 for (p = bigrams; *p != NULL; p++) 266 if (*p++ == bg0 && *p == bg1) 267 break; 268 return (*p == NULL ? -1 : (--p - bigrams)); 269 } 270 #endif /* !LOOKUP */ 271 272 void 273 usage() 274 { 275 (void)fprintf(stderr, 276 "usage: locate.code common_bigrams < list > squozen_list\n"); 277 exit(1); 278 } 279