137637Sbostic /* 262062Sbostic * Copyright (c) 1989, 1993 362062Sbostic * The Regents of the University of California. All rights reserved. 437637Sbostic * 537637Sbostic * This code is derived from software contributed to Berkeley by 637637Sbostic * James A. Woods. 737637Sbostic * 842737Sbostic * %sccs.include.redist.c% 937637Sbostic */ 1037637Sbostic 1114040Smckusick #ifndef lint 1262062Sbostic static char copyright[] = 1362062Sbostic "@(#) Copyright (c) 1989, 1993\n\ 1462062Sbostic The Regents of the University of California. All rights reserved.\n"; 1537637Sbostic #endif /* not lint */ 1614040Smckusick 1737637Sbostic #ifndef lint 18*69017Sbostic static char sccsid[] = "@(#)locate.code.c 8.3 (Berkeley) 04/28/95"; 1937637Sbostic #endif /* not lint */ 2037637Sbostic 2114040Smckusick /* 2214040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 2314040Smckusick * to encode/decode a filename database) 2414040Smckusick * 2514051Smckusick * USAGE: bigram < list > bigrams 2614051Smckusick * process bigrams (see updatedb) > common_bigrams 2714051Smckusick * code common_bigrams < list > squozen_list 2814040Smckusick * 2937636Sjak * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 3052426Sbostic * February/March 1983, p. 8). Output format is, per line, an 3137636Sjak * offset differential count byte followed by a partially bigram- 3237636Sjak * encoded ascii residue. A bigram is a two-character sequence, 3337636Sjak * the first 128 most common of which are encoded in one byte. 3414040Smckusick * 3537636Sjak * EXAMPLE: For simple front compression with no bigram encoding, 3637636Sjak * if the input is... then the output is... 3737636Sjak * 3837636Sjak * /usr/src 0 /usr/src 3937636Sjak * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 4037636Sjak * /usr/src/cmd/armadillo.c 14 armadillo.c 4137636Sjak * /usr/tmp/zoo 5 tmp/zoo 4237636Sjak * 4352279Sleres * The codes are: 4414040Smckusick * 4552279Sleres * 0-28 likeliest differential counts + offset to make nonnegative 4637636Sjak * 30 switch code for out-of-range count to follow in next word 4737636Sjak * 128-255 bigram codes (128 most common, as determined by 'updatedb') 4837636Sjak * 32-127 single character (printable) ascii residue (ie, literal) 4914040Smckusick * 5052426Sbostic * SEE ALSO: updatedb.csh, bigram.c 5152279Sleres * 5214040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 5314040Smckusick * NASA Ames Research Center, 10/82 5414040Smckusick */ 5514040Smckusick 5640302Sbostic #include <sys/param.h> 5759059Storek #include <err.h> 5852426Sbostic #include <errno.h> 5952426Sbostic #include <stdlib.h> 6052426Sbostic #include <string.h> 6114040Smckusick #include <stdio.h> 6240302Sbostic #include "locate.h" 6314040Smckusick 6452426Sbostic #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 6514040Smckusick 6637636Sjak char buf1[MAXPATHLEN] = " "; 6737636Sjak char buf2[MAXPATHLEN]; 6837636Sjak char bigrams[BGBUFSIZE + 1] = { 0 }; 6914040Smckusick 7052426Sbostic int bgindex __P((char *)); 7152426Sbostic void usage __P((void)); 7252426Sbostic 7352426Sbostic int 7452426Sbostic main(argc, argv) 7552426Sbostic int argc; 7652426Sbostic char *argv[]; 7714040Smckusick { 7852426Sbostic register char *cp, *oldpath, *path; 7952426Sbostic int ch, code, count, diffcount, oldcount; 8014040Smckusick FILE *fp; 8114040Smckusick 8252426Sbostic while ((ch = getopt(argc, argv, "")) != EOF) 8352426Sbostic switch(ch) { 8452426Sbostic case '?': 8552426Sbostic default: 8652426Sbostic usage(); 8752426Sbostic } 8852426Sbostic argc -= optind; 8952426Sbostic argv += optind; 9014040Smckusick 9152426Sbostic if (argc != 1) 9252426Sbostic usage(); 9352426Sbostic 9459059Storek if ((fp = fopen(argv[0], "r")) == NULL) 9559059Storek err(1, "%s", argv[0]); 9652426Sbostic 9752426Sbostic /* First copy bigram array to stdout. */ 9852426Sbostic (void)fgets(bigrams, BGBUFSIZE + 1, fp); 9952426Sbostic if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) 10059059Storek err(1, "stdout"); 10152426Sbostic (void)fclose(fp); 10252426Sbostic 10352426Sbostic oldpath = buf1; 10452426Sbostic path = buf2; 10552426Sbostic oldcount = 0; 10652426Sbostic while (fgets(path, sizeof(buf2), stdin) != NULL) { 10752426Sbostic /* Truncate newline. */ 10839275Sbostic cp = path + strlen(path) - 1; 10939275Sbostic if (cp > path && *cp == '\n') 11039275Sbostic *cp = '\0'; 11152426Sbostic 11252426Sbostic /* Squelch characters that would botch the decoding. */ 113*69017Sbostic for (cp = path; *cp != '\0'; cp++) { 114*69017Sbostic if ((u_char)*cp >= PARITY || *cp <= SWITCH) 11537636Sjak *cp = '?'; 11614040Smckusick } 11752426Sbostic 11852426Sbostic /* Skip longest common prefix. */ 11952426Sbostic for (cp = path; *cp == *oldpath; cp++, oldpath++) 120*69017Sbostic if (*oldpath == '\0') 12137636Sjak break; 12237636Sjak count = cp - path; 12337636Sjak diffcount = count - oldcount + OFFSET; 12437636Sjak oldcount = count; 12553692Selan if (diffcount < 0 || diffcount > 2 * OFFSET) { 12652426Sbostic if (putchar(SWITCH) == EOF || 12752426Sbostic putw(diffcount, stdout) == EOF) 12859059Storek err(1, "stdout"); 12953692Selan } else 13052426Sbostic if (putchar(diffcount) == EOF) 13159059Storek err(1, "stdout"); 13214040Smckusick 133*69017Sbostic while (*cp != '\0') { 134*69017Sbostic if (*(cp + 1) == '\0') { 13552426Sbostic if (putchar(*cp) == EOF) 13659059Storek err(1, "stdout"); 13714040Smckusick break; 13814040Smckusick } 13952426Sbostic if ((code = bgindex(cp)) < 0) { 14052426Sbostic if (putchar(*cp++) == EOF || 14152426Sbostic putchar(*cp++) == EOF) 14259059Storek err(1, "stdout"); 14352426Sbostic } else { 14452426Sbostic /* Found, so mark byte with parity bit. */ 14552426Sbostic if (putchar((code / 2) | PARITY) == EOF) 14659059Storek err(1, "stdout"); 14737636Sjak cp += 2; 14837636Sjak } 14914040Smckusick } 15052426Sbostic if (path == buf1) { /* swap pointers */ 15152426Sbostic path = buf2; 15252426Sbostic oldpath = buf1; 15352426Sbostic } else { 15452426Sbostic path = buf1; 15552426Sbostic oldpath = buf2; 15652426Sbostic } 15714040Smckusick } 15852775Smckusick /* Non-zero status if there were errors */ 15952775Smckusick if (fflush(stdout) != 0 || ferror(stdout)) 16052775Smckusick exit(1); 16152426Sbostic exit(0); 16214040Smckusick } 16314040Smckusick 16452426Sbostic int 16552426Sbostic bgindex(bg) /* Return location of bg in bigrams or -1. */ 16637636Sjak char *bg; 16714040Smckusick { 16852426Sbostic register char bg0, bg1, *p; 16914040Smckusick 17052426Sbostic bg0 = bg[0]; 17152426Sbostic bg1 = bg[1]; 172*69017Sbostic for (p = bigrams; *p != '\0'; p++) 17352426Sbostic if (*p++ == bg0 && *p == bg1) 17437636Sjak break; 175*69017Sbostic return (*p == '\0' ? -1 : --p - bigrams); 17614040Smckusick } 17752426Sbostic 17852426Sbostic void 17952426Sbostic usage() 18052426Sbostic { 18152426Sbostic (void)fprintf(stderr, 18252426Sbostic "usage: locate.code common_bigrams < list > squozen_list\n"); 18352426Sbostic exit(1); 18452426Sbostic } 185