137637Sbostic /* 2*62062Sbostic * Copyright (c) 1989, 1993 3*62062Sbostic * The Regents of the University of California. All rights reserved. 437637Sbostic * 537637Sbostic * This code is derived from software contributed to Berkeley by 637637Sbostic * James A. Woods. 737637Sbostic * 842737Sbostic * %sccs.include.redist.c% 937637Sbostic */ 1037637Sbostic 1114040Smckusick #ifndef lint 12*62062Sbostic static char copyright[] = 13*62062Sbostic "@(#) Copyright (c) 1989, 1993\n\ 14*62062Sbostic The Regents of the University of California. All rights reserved.\n"; 1537637Sbostic #endif /* not lint */ 1614040Smckusick 1737637Sbostic #ifndef lint 18*62062Sbostic static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 06/06/93"; 1937637Sbostic #endif /* not lint */ 2037637Sbostic 2114040Smckusick /* 2214040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 2314040Smckusick * to encode/decode a filename database) 2414040Smckusick * 2514051Smckusick * USAGE: bigram < list > bigrams 2614051Smckusick * process bigrams (see updatedb) > common_bigrams 2714051Smckusick * code common_bigrams < list > squozen_list 2814040Smckusick * 2937636Sjak * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 3052426Sbostic * February/March 1983, p. 8). Output format is, per line, an 3137636Sjak * offset differential count byte followed by a partially bigram- 3237636Sjak * encoded ascii residue. A bigram is a two-character sequence, 3337636Sjak * the first 128 most common of which are encoded in one byte. 3414040Smckusick * 3537636Sjak * EXAMPLE: For simple front compression with no bigram encoding, 3637636Sjak * if the input is... then the output is... 3737636Sjak * 3837636Sjak * /usr/src 0 /usr/src 3937636Sjak * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 4037636Sjak * /usr/src/cmd/armadillo.c 14 armadillo.c 4137636Sjak * /usr/tmp/zoo 5 tmp/zoo 4237636Sjak * 4352279Sleres * The codes are: 4414040Smckusick * 4552279Sleres * 0-28 likeliest differential counts + offset to make nonnegative 4637636Sjak * 30 switch code for out-of-range count to follow in next word 4737636Sjak * 128-255 bigram codes (128 most common, as determined by 'updatedb') 4837636Sjak * 32-127 single character (printable) ascii residue (ie, literal) 4914040Smckusick * 5052426Sbostic * SEE ALSO: updatedb.csh, bigram.c 5152279Sleres * 5214040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 5314040Smckusick * NASA Ames Research Center, 10/82 5414040Smckusick */ 5514040Smckusick 5640302Sbostic #include <sys/param.h> 5759059Storek #include <err.h> 5852426Sbostic #include <errno.h> 5952426Sbostic #include <stdlib.h> 6052426Sbostic #include <string.h> 6114040Smckusick #include <stdio.h> 6240302Sbostic #include "locate.h" 6314040Smckusick 6452426Sbostic #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 6514040Smckusick 6637636Sjak char buf1[MAXPATHLEN] = " "; 6737636Sjak char buf2[MAXPATHLEN]; 6837636Sjak char bigrams[BGBUFSIZE + 1] = { 0 }; 6914040Smckusick 7052426Sbostic int bgindex __P((char *)); 7152426Sbostic void usage __P((void)); 7252426Sbostic 7352426Sbostic int 7452426Sbostic main(argc, argv) 7552426Sbostic int argc; 7652426Sbostic char *argv[]; 7714040Smckusick { 7852426Sbostic register char *cp, *oldpath, *path; 7952426Sbostic int ch, code, count, diffcount, oldcount; 8014040Smckusick FILE *fp; 8114040Smckusick 8252426Sbostic while ((ch = getopt(argc, argv, "")) != EOF) 8352426Sbostic switch(ch) { 8452426Sbostic case '?': 8552426Sbostic default: 8652426Sbostic usage(); 8752426Sbostic } 8852426Sbostic argc -= optind; 8952426Sbostic argv += optind; 9014040Smckusick 9152426Sbostic if (argc != 1) 9252426Sbostic usage(); 9352426Sbostic 9459059Storek if ((fp = fopen(argv[0], "r")) == NULL) 9559059Storek err(1, "%s", argv[0]); 9652426Sbostic 9752426Sbostic /* First copy bigram array to stdout. */ 9852426Sbostic (void)fgets(bigrams, BGBUFSIZE + 1, fp); 9952426Sbostic if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) 10059059Storek err(1, "stdout"); 10152426Sbostic (void)fclose(fp); 10252426Sbostic 10352426Sbostic oldpath = buf1; 10452426Sbostic path = buf2; 10552426Sbostic oldcount = 0; 10652426Sbostic while (fgets(path, sizeof(buf2), stdin) != NULL) { 10752426Sbostic /* Truncate newline. */ 10839275Sbostic cp = path + strlen(path) - 1; 10939275Sbostic if (cp > path && *cp == '\n') 11039275Sbostic *cp = '\0'; 11152426Sbostic 11252426Sbostic /* Squelch characters that would botch the decoding. */ 11352426Sbostic for (cp = path; *cp != NULL; cp++) { 11452426Sbostic if ((u_char)*cp >= PARITY) 11537636Sjak *cp &= PARITY-1; 11652426Sbostic else if (*cp <= SWITCH) 11737636Sjak *cp = '?'; 11814040Smckusick } 11952426Sbostic 12052426Sbostic /* Skip longest common prefix. */ 12152426Sbostic for (cp = path; *cp == *oldpath; cp++, oldpath++) 12252426Sbostic if (*oldpath == NULL) 12337636Sjak break; 12437636Sjak count = cp - path; 12537636Sjak diffcount = count - oldcount + OFFSET; 12637636Sjak oldcount = count; 12753692Selan if (diffcount < 0 || diffcount > 2 * OFFSET) { 12852426Sbostic if (putchar(SWITCH) == EOF || 12952426Sbostic putw(diffcount, stdout) == EOF) 13059059Storek err(1, "stdout"); 13153692Selan } else 13252426Sbostic if (putchar(diffcount) == EOF) 13359059Storek err(1, "stdout"); 13414040Smckusick 13552426Sbostic while (*cp != NULL) { 13652426Sbostic if (*(cp + 1) == NULL) { 13752426Sbostic if (putchar(*cp) == EOF) 13859059Storek err(1, "stdout"); 13914040Smckusick break; 14014040Smckusick } 14152426Sbostic if ((code = bgindex(cp)) < 0) { 14252426Sbostic if (putchar(*cp++) == EOF || 14352426Sbostic putchar(*cp++) == EOF) 14459059Storek err(1, "stdout"); 14552426Sbostic } else { 14652426Sbostic /* Found, so mark byte with parity bit. */ 14752426Sbostic if (putchar((code / 2) | PARITY) == EOF) 14859059Storek err(1, "stdout"); 14937636Sjak cp += 2; 15037636Sjak } 15114040Smckusick } 15252426Sbostic if (path == buf1) { /* swap pointers */ 15352426Sbostic path = buf2; 15452426Sbostic oldpath = buf1; 15552426Sbostic } else { 15652426Sbostic path = buf1; 15752426Sbostic oldpath = buf2; 15852426Sbostic } 15914040Smckusick } 16052775Smckusick /* Non-zero status if there were errors */ 16152775Smckusick if (fflush(stdout) != 0 || ferror(stdout)) 16252775Smckusick exit(1); 16352426Sbostic exit(0); 16414040Smckusick } 16514040Smckusick 16652426Sbostic int 16752426Sbostic bgindex(bg) /* Return location of bg in bigrams or -1. */ 16837636Sjak char *bg; 16914040Smckusick { 17052426Sbostic register char bg0, bg1, *p; 17114040Smckusick 17252426Sbostic bg0 = bg[0]; 17352426Sbostic bg1 = bg[1]; 17452426Sbostic for (p = bigrams; *p != NULL; p++) 17552426Sbostic if (*p++ == bg0 && *p == bg1) 17637636Sjak break; 17752426Sbostic return (*p == NULL ? -1 : --p - bigrams); 17814040Smckusick } 17952426Sbostic 18052426Sbostic void 18152426Sbostic usage() 18252426Sbostic { 18352426Sbostic (void)fprintf(stderr, 18452426Sbostic "usage: locate.code common_bigrams < list > squozen_list\n"); 18552426Sbostic exit(1); 18652426Sbostic } 187