1*37637Sbostic /* 2*37637Sbostic * Copyright (c) 1989 The Regents of the University of California. 3*37637Sbostic * All rights reserved. 4*37637Sbostic * 5*37637Sbostic * This code is derived from software contributed to Berkeley by 6*37637Sbostic * James A. Woods. 7*37637Sbostic * 8*37637Sbostic * Redistribution and use in source and binary forms are permitted 9*37637Sbostic * provided that the above copyright notice and this paragraph are 10*37637Sbostic * duplicated in all such forms and that any documentation, 11*37637Sbostic * advertising materials, and other materials related to such 12*37637Sbostic * distribution and use acknowledge that the software was developed 13*37637Sbostic * by the University of California, Berkeley. The name of the 14*37637Sbostic * University may not be used to endorse or promote products derived 15*37637Sbostic * from this software without specific prior written permission. 16*37637Sbostic * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 17*37637Sbostic * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 18*37637Sbostic * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 19*37637Sbostic */ 20*37637Sbostic 2114040Smckusick #ifndef lint 22*37637Sbostic char copyright[] = 23*37637Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 24*37637Sbostic All rights reserved.\n"; 25*37637Sbostic #endif /* not lint */ 2614040Smckusick 27*37637Sbostic #ifndef lint 28*37637Sbostic static char sccsid[] = "@(#)locate.code.c 4.4 (Berkeley) 05/04/89"; 29*37637Sbostic #endif /* not lint */ 30*37637Sbostic 3114040Smckusick /* 3214040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 3314040Smckusick * to encode/decode a filename database) 3414040Smckusick * 3514051Smckusick * USAGE: bigram < list > bigrams 3614051Smckusick * process bigrams (see updatedb) > common_bigrams 3714051Smckusick * code common_bigrams < list > squozen_list 3814040Smckusick * 3937636Sjak * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 4037636Sjak * February/March 1983, p. 8 ). Output format is, per line, an 4137636Sjak * offset differential count byte followed by a partially bigram- 4237636Sjak * encoded ascii residue. A bigram is a two-character sequence, 4337636Sjak * the first 128 most common of which are encoded in one byte. 4414040Smckusick * 4537636Sjak * EXAMPLE: For simple front compression with no bigram encoding, 4637636Sjak * if the input is... then the output is... 4737636Sjak * 4837636Sjak * /usr/src 0 /usr/src 4937636Sjak * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 5037636Sjak * /usr/src/cmd/armadillo.c 14 armadillo.c 5137636Sjak * /usr/tmp/zoo 5 tmp/zoo 5237636Sjak * 5314040Smckusick * The codes are: 5414040Smckusick * 5514040Smckusick * 0-28 likeliest differential counts + offset to make nonnegative 5637636Sjak * 30 switch code for out-of-range count to follow in next word 5737636Sjak * 128-255 bigram codes (128 most common, as determined by 'updatedb') 5837636Sjak * 32-127 single character (printable) ascii residue (ie, literal) 5914040Smckusick * 6014051Smckusick * SEE ALSO: updatedb.csh, bigram.c, find.c 6114040Smckusick * 6214040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 6314040Smckusick * NASA Ames Research Center, 10/82 6414040Smckusick */ 6514040Smckusick 6614040Smckusick #include <stdio.h> 6737636Sjak #include <sys/param.h> 6837636Sjak #include "find.h" 6914040Smckusick 7037636Sjak #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 7114040Smckusick 7237636Sjak char buf1[MAXPATHLEN] = " "; 7337636Sjak char buf2[MAXPATHLEN]; 7437636Sjak char bigrams[BGBUFSIZE + 1] = { 0 }; 7514040Smckusick 7614040Smckusick main ( argc, argv ) 7714040Smckusick int argc; char *argv[]; 7814040Smckusick { 7937636Sjak register char *cp, *oldpath = buf1, *path = buf2; 8037636Sjak int code, count, diffcount, oldcount = 0; 8114040Smckusick FILE *fp; 8214040Smckusick 8314051Smckusick if ((fp = fopen(argv[1], "r")) == NULL) { 8437636Sjak printf("Usage: code common_bigrams < list > squozen_list\n"); 8514051Smckusick exit(1); 8614051Smckusick } 8737636Sjak /* first copy bigram array to stdout */ 8837636Sjak fgets ( bigrams, BGBUFSIZE + 1, fp ); 8937636Sjak fwrite ( bigrams, 1, BGBUFSIZE, stdout ); 9037636Sjak fclose( fp ); 9114040Smckusick 9237636Sjak /* every path will fit in path buffer, so safe to use gets */ 9314040Smckusick while ( gets ( path ) != NULL ) { 9437636Sjak /* squelch characters that would botch the decoding */ 9537636Sjak for ( cp = path; *cp != NULL; cp++ ) { 9637636Sjak if ( *cp >= PARITY ) 9737636Sjak *cp &= PARITY-1; 9837636Sjak else if ( *cp <= SWITCH ) 9937636Sjak *cp = '?'; 10014040Smckusick } 10137636Sjak /* skip longest common prefix */ 10237636Sjak for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 10337636Sjak if ( *oldpath == NULL ) 10437636Sjak break; 10537636Sjak count = cp - path; 10637636Sjak diffcount = count - oldcount + OFFSET; 10737636Sjak oldcount = count; 10837636Sjak if ( diffcount < 0 || diffcount > 2*OFFSET ) { 10937636Sjak putc ( SWITCH, stdout ); 11037636Sjak putw ( diffcount, stdout ); 11114040Smckusick } 11214040Smckusick else 11337636Sjak putc ( diffcount, stdout ); 11414040Smckusick 11537636Sjak while ( *cp != NULL ) { 11637636Sjak if ( *(cp + 1) == NULL ) { 11737636Sjak putchar ( *cp ); 11814040Smckusick break; 11914040Smckusick } 12037636Sjak if ( (code = bgindex ( cp )) < 0 ) { 12137636Sjak putchar ( *cp++ ); 12237636Sjak putchar ( *cp++ ); 12337636Sjak } 12437636Sjak else { /* found, so mark byte with parity bit */ 12537636Sjak putchar ( (code / 2) | PARITY ); 12637636Sjak cp += 2; 12737636Sjak } 12814040Smckusick } 12937636Sjak if ( path == buf1 ) /* swap pointers */ 13037636Sjak path = buf2, oldpath = buf1; 13137636Sjak else 13237636Sjak path = buf1, oldpath = buf2; 13314040Smckusick } 13414040Smckusick } 13514040Smckusick 13637636Sjak bgindex ( bg ) /* return location of bg in bigrams or -1 */ 13737636Sjak char *bg; 13814040Smckusick { 13937636Sjak register char *p; 14037636Sjak register char bg0 = bg[0], bg1 = bg[1]; 14114040Smckusick 14237636Sjak for ( p = bigrams; *p != NULL; p++ ) 14337636Sjak if ( *p++ == bg0 && *p == bg1 ) 14437636Sjak break; 14537636Sjak return ( *p == NULL ? -1 : --p - bigrams ); 14614040Smckusick } 147