137637Sbostic /* 237637Sbostic * Copyright (c) 1989 The Regents of the University of California. 337637Sbostic * All rights reserved. 437637Sbostic * 537637Sbostic * This code is derived from software contributed to Berkeley by 637637Sbostic * James A. Woods. 737637Sbostic * 842737Sbostic * %sccs.include.redist.c% 937637Sbostic */ 1037637Sbostic 1114040Smckusick #ifndef lint 1237637Sbostic char copyright[] = 1337637Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 1437637Sbostic All rights reserved.\n"; 1537637Sbostic #endif /* not lint */ 1614040Smckusick 1737637Sbostic #ifndef lint 18*52426Sbostic static char sccsid[] = "@(#)locate.code.c 5.1 (Berkeley) 02/06/92"; 1937637Sbostic #endif /* not lint */ 2037637Sbostic 2114040Smckusick /* 2214040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 2314040Smckusick * to encode/decode a filename database) 2414040Smckusick * 2514051Smckusick * USAGE: bigram < list > bigrams 2614051Smckusick * process bigrams (see updatedb) > common_bigrams 2714051Smckusick * code common_bigrams < list > squozen_list 2814040Smckusick * 2937636Sjak * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 30*52426Sbostic * February/March 1983, p. 8). Output format is, per line, an 3137636Sjak * offset differential count byte followed by a partially bigram- 3237636Sjak * encoded ascii residue. A bigram is a two-character sequence, 3337636Sjak * the first 128 most common of which are encoded in one byte. 3414040Smckusick * 3537636Sjak * EXAMPLE: For simple front compression with no bigram encoding, 3637636Sjak * if the input is... then the output is... 3737636Sjak * 3837636Sjak * /usr/src 0 /usr/src 3937636Sjak * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 4037636Sjak * /usr/src/cmd/armadillo.c 14 armadillo.c 4137636Sjak * /usr/tmp/zoo 5 tmp/zoo 4237636Sjak * 4352279Sleres * The codes are: 4414040Smckusick * 4552279Sleres * 0-28 likeliest differential counts + offset to make nonnegative 4637636Sjak * 30 switch code for out-of-range count to follow in next word 4737636Sjak * 128-255 bigram codes (128 most common, as determined by 'updatedb') 4837636Sjak * 32-127 single character (printable) ascii residue (ie, literal) 4914040Smckusick * 50*52426Sbostic * SEE ALSO: updatedb.csh, bigram.c 5152279Sleres * 5214040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 5314040Smckusick * NASA Ames Research Center, 10/82 5414040Smckusick */ 5514040Smckusick 5640302Sbostic #include <sys/param.h> 57*52426Sbostic #include <errno.h> 58*52426Sbostic #include <stdlib.h> 59*52426Sbostic #include <string.h> 6014040Smckusick #include <stdio.h> 6140302Sbostic #include "locate.h" 6214040Smckusick 63*52426Sbostic #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 64*52426Sbostic #define OERR err("stdout: %s", strerror(errno)) 6514040Smckusick 6637636Sjak char buf1[MAXPATHLEN] = " "; 6737636Sjak char buf2[MAXPATHLEN]; 6837636Sjak char bigrams[BGBUFSIZE + 1] = { 0 }; 6914040Smckusick 70*52426Sbostic int bgindex __P((char *)); 71*52426Sbostic void err __P((const char *, ...)); 72*52426Sbostic void usage __P((void)); 73*52426Sbostic 74*52426Sbostic int 75*52426Sbostic main(argc, argv) 76*52426Sbostic int argc; 77*52426Sbostic char *argv[]; 7814040Smckusick { 79*52426Sbostic register char *cp, *oldpath, *path; 80*52426Sbostic int ch, code, count, diffcount, oldcount; 8114040Smckusick FILE *fp; 8214040Smckusick 83*52426Sbostic while ((ch = getopt(argc, argv, "")) != EOF) 84*52426Sbostic switch(ch) { 85*52426Sbostic case '?': 86*52426Sbostic default: 87*52426Sbostic usage(); 88*52426Sbostic } 89*52426Sbostic argc -= optind; 90*52426Sbostic argv += optind; 9114040Smckusick 92*52426Sbostic if (argc != 1) 93*52426Sbostic usage(); 94*52426Sbostic 95*52426Sbostic if ((fp = fopen(argv[1], "r")) == NULL) 96*52426Sbostic err("%s: %s", argv[1], strerror(errno)); 97*52426Sbostic 98*52426Sbostic /* First copy bigram array to stdout. */ 99*52426Sbostic (void)fgets(bigrams, BGBUFSIZE + 1, fp); 100*52426Sbostic if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) 101*52426Sbostic err("stdout: %s", strerror(errno)); 102*52426Sbostic (void)fclose(fp); 103*52426Sbostic 104*52426Sbostic oldpath = buf1; 105*52426Sbostic path = buf2; 106*52426Sbostic oldcount = 0; 107*52426Sbostic while (fgets(path, sizeof(buf2), stdin) != NULL) { 108*52426Sbostic /* Truncate newline. */ 10939275Sbostic cp = path + strlen(path) - 1; 11039275Sbostic if (cp > path && *cp == '\n') 11139275Sbostic *cp = '\0'; 112*52426Sbostic 113*52426Sbostic /* Squelch characters that would botch the decoding. */ 114*52426Sbostic for (cp = path; *cp != NULL; cp++) { 115*52426Sbostic if ((u_char)*cp >= PARITY) 11637636Sjak *cp &= PARITY-1; 117*52426Sbostic else if (*cp <= SWITCH) 11837636Sjak *cp = '?'; 11914040Smckusick } 120*52426Sbostic 121*52426Sbostic /* Skip longest common prefix. */ 122*52426Sbostic for (cp = path; *cp == *oldpath; cp++, oldpath++) 123*52426Sbostic if (*oldpath == NULL) 12437636Sjak break; 12537636Sjak count = cp - path; 12637636Sjak diffcount = count - oldcount + OFFSET; 12737636Sjak oldcount = count; 128*52426Sbostic if (diffcount < 0 || diffcount > 2 * OFFSET) 129*52426Sbostic if (putchar(SWITCH) == EOF || 130*52426Sbostic putw(diffcount, stdout) == EOF) 131*52426Sbostic OERR; 13214040Smckusick else 133*52426Sbostic if (putchar(diffcount) == EOF) 134*52426Sbostic OERR; 13514040Smckusick 136*52426Sbostic while (*cp != NULL) { 137*52426Sbostic if (*(cp + 1) == NULL) { 138*52426Sbostic if (putchar(*cp) == EOF) 139*52426Sbostic OERR; 14014040Smckusick break; 14114040Smckusick } 142*52426Sbostic if ((code = bgindex(cp)) < 0) { 143*52426Sbostic if (putchar(*cp++) == EOF || 144*52426Sbostic putchar(*cp++) == EOF) 145*52426Sbostic OERR; 146*52426Sbostic } else { 147*52426Sbostic /* Found, so mark byte with parity bit. */ 148*52426Sbostic if (putchar((code / 2) | PARITY) == EOF) 149*52426Sbostic OERR; 15037636Sjak cp += 2; 15137636Sjak } 15214040Smckusick } 153*52426Sbostic if (path == buf1) { /* swap pointers */ 154*52426Sbostic path = buf2; 155*52426Sbostic oldpath = buf1; 156*52426Sbostic } else { 157*52426Sbostic path = buf1; 158*52426Sbostic oldpath = buf2; 159*52426Sbostic } 16014040Smckusick } 161*52426Sbostic exit(0); 16214040Smckusick } 16314040Smckusick 164*52426Sbostic int 165*52426Sbostic bgindex(bg) /* Return location of bg in bigrams or -1. */ 16637636Sjak char *bg; 16714040Smckusick { 168*52426Sbostic register char bg0, bg1, *p; 16914040Smckusick 170*52426Sbostic bg0 = bg[0]; 171*52426Sbostic bg1 = bg[1]; 172*52426Sbostic for (p = bigrams; *p != NULL; p++) 173*52426Sbostic if (*p++ == bg0 && *p == bg1) 17437636Sjak break; 175*52426Sbostic return (*p == NULL ? -1 : --p - bigrams); 17614040Smckusick } 177*52426Sbostic 178*52426Sbostic void 179*52426Sbostic usage() 180*52426Sbostic { 181*52426Sbostic (void)fprintf(stderr, 182*52426Sbostic "usage: locate.code common_bigrams < list > squozen_list\n"); 183*52426Sbostic exit(1); 184*52426Sbostic } 185*52426Sbostic 186*52426Sbostic #if __STDC__ 187*52426Sbostic #include <stdarg.h> 188*52426Sbostic #else 189*52426Sbostic #include <varargs.h> 190*52426Sbostic #endif 191*52426Sbostic 192*52426Sbostic void 193*52426Sbostic #if __STDC__ 194*52426Sbostic err(const char *fmt, ...) 195*52426Sbostic #else 196*52426Sbostic err(fmt, va_alist) 197*52426Sbostic char *fmt; 198*52426Sbostic va_dcl 199*52426Sbostic #endif 200*52426Sbostic { 201*52426Sbostic va_list ap; 202*52426Sbostic #if __STDC__ 203*52426Sbostic va_start(ap, fmt); 204*52426Sbostic #else 205*52426Sbostic va_start(ap); 206*52426Sbostic #endif 207*52426Sbostic (void)fprintf(stderr, "locate.code: "); 208*52426Sbostic (void)vfprintf(stderr, fmt, ap); 209*52426Sbostic va_end(ap); 210*52426Sbostic (void)fprintf(stderr, "\n"); 211*52426Sbostic exit(1); 212*52426Sbostic /* NOTREACHED */ 213*52426Sbostic } 214