137637Sbostic /* 237637Sbostic * Copyright (c) 1989 The Regents of the University of California. 337637Sbostic * All rights reserved. 437637Sbostic * 537637Sbostic * This code is derived from software contributed to Berkeley by 637637Sbostic * James A. Woods. 737637Sbostic * 842737Sbostic * %sccs.include.redist.c% 937637Sbostic */ 1037637Sbostic 1114040Smckusick #ifndef lint 1237637Sbostic char copyright[] = 1337637Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 1437637Sbostic All rights reserved.\n"; 1537637Sbostic #endif /* not lint */ 1614040Smckusick 1737637Sbostic #ifndef lint 18*53692Selan static char sccsid[] = "@(#)locate.code.c 5.3 (Berkeley) 05/27/92"; 1937637Sbostic #endif /* not lint */ 2037637Sbostic 2114040Smckusick /* 2214040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 2314040Smckusick * to encode/decode a filename database) 2414040Smckusick * 2514051Smckusick * USAGE: bigram < list > bigrams 2614051Smckusick * process bigrams (see updatedb) > common_bigrams 2714051Smckusick * code common_bigrams < list > squozen_list 2814040Smckusick * 2937636Sjak * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 3052426Sbostic * February/March 1983, p. 8). Output format is, per line, an 3137636Sjak * offset differential count byte followed by a partially bigram- 3237636Sjak * encoded ascii residue. A bigram is a two-character sequence, 3337636Sjak * the first 128 most common of which are encoded in one byte. 3414040Smckusick * 3537636Sjak * EXAMPLE: For simple front compression with no bigram encoding, 3637636Sjak * if the input is... then the output is... 3737636Sjak * 3837636Sjak * /usr/src 0 /usr/src 3937636Sjak * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 4037636Sjak * /usr/src/cmd/armadillo.c 14 armadillo.c 4137636Sjak * /usr/tmp/zoo 5 tmp/zoo 4237636Sjak * 4352279Sleres * The codes are: 4414040Smckusick * 4552279Sleres * 0-28 likeliest differential counts + offset to make nonnegative 4637636Sjak * 30 switch code for out-of-range count to follow in next word 4737636Sjak * 128-255 bigram codes (128 most common, as determined by 'updatedb') 4837636Sjak * 32-127 single character (printable) ascii residue (ie, literal) 4914040Smckusick * 5052426Sbostic * SEE ALSO: updatedb.csh, bigram.c 5152279Sleres * 5214040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 5314040Smckusick * NASA Ames Research Center, 10/82 5414040Smckusick */ 5514040Smckusick 5640302Sbostic #include <sys/param.h> 5752426Sbostic #include <errno.h> 5852426Sbostic #include <stdlib.h> 5952426Sbostic #include <string.h> 6014040Smckusick #include <stdio.h> 6140302Sbostic #include "locate.h" 6214040Smckusick 6352426Sbostic #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 6452426Sbostic #define OERR err("stdout: %s", strerror(errno)) 6514040Smckusick 6637636Sjak char buf1[MAXPATHLEN] = " "; 6737636Sjak char buf2[MAXPATHLEN]; 6837636Sjak char bigrams[BGBUFSIZE + 1] = { 0 }; 6914040Smckusick 7052426Sbostic int bgindex __P((char *)); 7152426Sbostic void err __P((const char *, ...)); 7252426Sbostic void usage __P((void)); 7352426Sbostic 7452426Sbostic int 7552426Sbostic main(argc, argv) 7652426Sbostic int argc; 7752426Sbostic char *argv[]; 7814040Smckusick { 7952426Sbostic register char *cp, *oldpath, *path; 8052426Sbostic int ch, code, count, diffcount, oldcount; 8114040Smckusick FILE *fp; 8214040Smckusick 8352426Sbostic while ((ch = getopt(argc, argv, "")) != EOF) 8452426Sbostic switch(ch) { 8552426Sbostic case '?': 8652426Sbostic default: 8752426Sbostic usage(); 8852426Sbostic } 8952426Sbostic argc -= optind; 9052426Sbostic argv += optind; 9114040Smckusick 9252426Sbostic if (argc != 1) 9352426Sbostic usage(); 9452426Sbostic 9552426Sbostic if ((fp = fopen(argv[1], "r")) == NULL) 9652426Sbostic err("%s: %s", argv[1], strerror(errno)); 9752426Sbostic 9852426Sbostic /* First copy bigram array to stdout. */ 9952426Sbostic (void)fgets(bigrams, BGBUFSIZE + 1, fp); 10052426Sbostic if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) 10152426Sbostic err("stdout: %s", strerror(errno)); 10252426Sbostic (void)fclose(fp); 10352426Sbostic 10452426Sbostic oldpath = buf1; 10552426Sbostic path = buf2; 10652426Sbostic oldcount = 0; 10752426Sbostic while (fgets(path, sizeof(buf2), stdin) != NULL) { 10852426Sbostic /* Truncate newline. */ 10939275Sbostic cp = path + strlen(path) - 1; 11039275Sbostic if (cp > path && *cp == '\n') 11139275Sbostic *cp = '\0'; 11252426Sbostic 11352426Sbostic /* Squelch characters that would botch the decoding. */ 11452426Sbostic for (cp = path; *cp != NULL; cp++) { 11552426Sbostic if ((u_char)*cp >= PARITY) 11637636Sjak *cp &= PARITY-1; 11752426Sbostic else if (*cp <= SWITCH) 11837636Sjak *cp = '?'; 11914040Smckusick } 12052426Sbostic 12152426Sbostic /* Skip longest common prefix. */ 12252426Sbostic for (cp = path; *cp == *oldpath; cp++, oldpath++) 12352426Sbostic if (*oldpath == NULL) 12437636Sjak break; 12537636Sjak count = cp - path; 12637636Sjak diffcount = count - oldcount + OFFSET; 12737636Sjak oldcount = count; 128*53692Selan if (diffcount < 0 || diffcount > 2 * OFFSET) { 12952426Sbostic if (putchar(SWITCH) == EOF || 13052426Sbostic putw(diffcount, stdout) == EOF) 13152426Sbostic OERR; 132*53692Selan } else 13352426Sbostic if (putchar(diffcount) == EOF) 13452426Sbostic OERR; 13514040Smckusick 13652426Sbostic while (*cp != NULL) { 13752426Sbostic if (*(cp + 1) == NULL) { 13852426Sbostic if (putchar(*cp) == EOF) 13952426Sbostic OERR; 14014040Smckusick break; 14114040Smckusick } 14252426Sbostic if ((code = bgindex(cp)) < 0) { 14352426Sbostic if (putchar(*cp++) == EOF || 14452426Sbostic putchar(*cp++) == EOF) 14552426Sbostic OERR; 14652426Sbostic } else { 14752426Sbostic /* Found, so mark byte with parity bit. */ 14852426Sbostic if (putchar((code / 2) | PARITY) == EOF) 14952426Sbostic OERR; 15037636Sjak cp += 2; 15137636Sjak } 15214040Smckusick } 15352426Sbostic if (path == buf1) { /* swap pointers */ 15452426Sbostic path = buf2; 15552426Sbostic oldpath = buf1; 15652426Sbostic } else { 15752426Sbostic path = buf1; 15852426Sbostic oldpath = buf2; 15952426Sbostic } 16014040Smckusick } 16152775Smckusick /* Non-zero status if there were errors */ 16252775Smckusick if (fflush(stdout) != 0 || ferror(stdout)) 16352775Smckusick exit(1); 16452426Sbostic exit(0); 16514040Smckusick } 16614040Smckusick 16752426Sbostic int 16852426Sbostic bgindex(bg) /* Return location of bg in bigrams or -1. */ 16937636Sjak char *bg; 17014040Smckusick { 17152426Sbostic register char bg0, bg1, *p; 17214040Smckusick 17352426Sbostic bg0 = bg[0]; 17452426Sbostic bg1 = bg[1]; 17552426Sbostic for (p = bigrams; *p != NULL; p++) 17652426Sbostic if (*p++ == bg0 && *p == bg1) 17737636Sjak break; 17852426Sbostic return (*p == NULL ? -1 : --p - bigrams); 17914040Smckusick } 18052426Sbostic 18152426Sbostic void 18252426Sbostic usage() 18352426Sbostic { 18452426Sbostic (void)fprintf(stderr, 18552426Sbostic "usage: locate.code common_bigrams < list > squozen_list\n"); 18652426Sbostic exit(1); 18752426Sbostic } 18852426Sbostic 18952426Sbostic #if __STDC__ 19052426Sbostic #include <stdarg.h> 19152426Sbostic #else 19252426Sbostic #include <varargs.h> 19352426Sbostic #endif 19452426Sbostic 19552426Sbostic void 19652426Sbostic #if __STDC__ 19752426Sbostic err(const char *fmt, ...) 19852426Sbostic #else 19952426Sbostic err(fmt, va_alist) 20052426Sbostic char *fmt; 20152426Sbostic va_dcl 20252426Sbostic #endif 20352426Sbostic { 20452426Sbostic va_list ap; 20552426Sbostic #if __STDC__ 20652426Sbostic va_start(ap, fmt); 20752426Sbostic #else 20852426Sbostic va_start(ap); 20952426Sbostic #endif 21052426Sbostic (void)fprintf(stderr, "locate.code: "); 21152426Sbostic (void)vfprintf(stderr, fmt, ap); 21252426Sbostic va_end(ap); 21352426Sbostic (void)fprintf(stderr, "\n"); 21452426Sbostic exit(1); 21552426Sbostic /* NOTREACHED */ 21652426Sbostic } 217