137637Sbostic /* 237637Sbostic * Copyright (c) 1989 The Regents of the University of California. 337637Sbostic * All rights reserved. 437637Sbostic * 537637Sbostic * This code is derived from software contributed to Berkeley by 637637Sbostic * James A. Woods. 737637Sbostic * 837637Sbostic * Redistribution and use in source and binary forms are permitted 937637Sbostic * provided that the above copyright notice and this paragraph are 1037637Sbostic * duplicated in all such forms and that any documentation, 1137637Sbostic * advertising materials, and other materials related to such 1237637Sbostic * distribution and use acknowledge that the software was developed 1337637Sbostic * by the University of California, Berkeley. The name of the 1437637Sbostic * University may not be used to endorse or promote products derived 1537637Sbostic * from this software without specific prior written permission. 1637637Sbostic * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 1737637Sbostic * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 1837637Sbostic * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 1937637Sbostic */ 2037637Sbostic 2114040Smckusick #ifndef lint 2237637Sbostic char copyright[] = 2337637Sbostic "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ 2437637Sbostic All rights reserved.\n"; 2537637Sbostic #endif /* not lint */ 2614040Smckusick 2737637Sbostic #ifndef lint 28*40302Sbostic static char sccsid[] = "@(#)locate.code.c 4.7 (Berkeley) 03/06/90"; 2937637Sbostic #endif /* not lint */ 3037637Sbostic 3114040Smckusick /* 3214040Smckusick * PURPOSE: sorted list compressor (works with a modified 'find' 3314040Smckusick * to encode/decode a filename database) 3414040Smckusick * 3514051Smckusick * USAGE: bigram < list > bigrams 3614051Smckusick * process bigrams (see updatedb) > common_bigrams 3714051Smckusick * code common_bigrams < list > squozen_list 3814040Smckusick * 3937636Sjak * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 4037636Sjak * February/March 1983, p. 8 ). Output format is, per line, an 4137636Sjak * offset differential count byte followed by a partially bigram- 4237636Sjak * encoded ascii residue. A bigram is a two-character sequence, 4337636Sjak * the first 128 most common of which are encoded in one byte. 4414040Smckusick * 4537636Sjak * EXAMPLE: For simple front compression with no bigram encoding, 4637636Sjak * if the input is... then the output is... 4737636Sjak * 4837636Sjak * /usr/src 0 /usr/src 4937636Sjak * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 5037636Sjak * /usr/src/cmd/armadillo.c 14 armadillo.c 5137636Sjak * /usr/tmp/zoo 5 tmp/zoo 5237636Sjak * 5314040Smckusick * The codes are: 5414040Smckusick * 5514040Smckusick * 0-28 likeliest differential counts + offset to make nonnegative 5637636Sjak * 30 switch code for out-of-range count to follow in next word 5737636Sjak * 128-255 bigram codes (128 most common, as determined by 'updatedb') 5837636Sjak * 32-127 single character (printable) ascii residue (ie, literal) 5914040Smckusick * 6014051Smckusick * SEE ALSO: updatedb.csh, bigram.c, find.c 6114040Smckusick * 6214040Smckusick * AUTHOR: James A. Woods, Informatics General Corp., 6314040Smckusick * NASA Ames Research Center, 10/82 6414040Smckusick */ 6514040Smckusick 66*40302Sbostic #include <sys/param.h> 6714040Smckusick #include <stdio.h> 68*40302Sbostic #include "locate.h" 6914040Smckusick 7037636Sjak #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 7114040Smckusick 7237636Sjak char buf1[MAXPATHLEN] = " "; 7337636Sjak char buf2[MAXPATHLEN]; 7437636Sjak char bigrams[BGBUFSIZE + 1] = { 0 }; 7514040Smckusick 7614040Smckusick main ( argc, argv ) 7714040Smckusick int argc; char *argv[]; 7814040Smckusick { 7937636Sjak register char *cp, *oldpath = buf1, *path = buf2; 8037636Sjak int code, count, diffcount, oldcount = 0; 8114040Smckusick FILE *fp; 8214040Smckusick 8314051Smckusick if ((fp = fopen(argv[1], "r")) == NULL) { 8437636Sjak printf("Usage: code common_bigrams < list > squozen_list\n"); 8514051Smckusick exit(1); 8614051Smckusick } 8737636Sjak /* first copy bigram array to stdout */ 8837636Sjak fgets ( bigrams, BGBUFSIZE + 1, fp ); 8937636Sjak fwrite ( bigrams, 1, BGBUFSIZE, stdout ); 9037636Sjak fclose( fp ); 9114040Smckusick 9238085Sbostic while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { 9339275Sbostic /* truncate newline */ 9439275Sbostic cp = path + strlen(path) - 1; 9539275Sbostic if (cp > path && *cp == '\n') 9639275Sbostic *cp = '\0'; 9737636Sjak /* squelch characters that would botch the decoding */ 9837636Sjak for ( cp = path; *cp != NULL; cp++ ) { 9939275Sbostic if ( (unsigned char)*cp >= PARITY ) 10037636Sjak *cp &= PARITY-1; 10137636Sjak else if ( *cp <= SWITCH ) 10237636Sjak *cp = '?'; 10314040Smckusick } 10437636Sjak /* skip longest common prefix */ 10537636Sjak for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 10637636Sjak if ( *oldpath == NULL ) 10737636Sjak break; 10837636Sjak count = cp - path; 10937636Sjak diffcount = count - oldcount + OFFSET; 11037636Sjak oldcount = count; 11137636Sjak if ( diffcount < 0 || diffcount > 2*OFFSET ) { 11237636Sjak putc ( SWITCH, stdout ); 11337636Sjak putw ( diffcount, stdout ); 11414040Smckusick } 11514040Smckusick else 11637636Sjak putc ( diffcount, stdout ); 11714040Smckusick 11837636Sjak while ( *cp != NULL ) { 11937636Sjak if ( *(cp + 1) == NULL ) { 12037636Sjak putchar ( *cp ); 12114040Smckusick break; 12214040Smckusick } 12337636Sjak if ( (code = bgindex ( cp )) < 0 ) { 12437636Sjak putchar ( *cp++ ); 12537636Sjak putchar ( *cp++ ); 12637636Sjak } 12737636Sjak else { /* found, so mark byte with parity bit */ 12837636Sjak putchar ( (code / 2) | PARITY ); 12937636Sjak cp += 2; 13037636Sjak } 13114040Smckusick } 13237636Sjak if ( path == buf1 ) /* swap pointers */ 13337636Sjak path = buf2, oldpath = buf1; 13437636Sjak else 13537636Sjak path = buf1, oldpath = buf2; 13614040Smckusick } 13714040Smckusick } 13814040Smckusick 13937636Sjak bgindex ( bg ) /* return location of bg in bigrams or -1 */ 14037636Sjak char *bg; 14114040Smckusick { 14237636Sjak register char *p; 14337636Sjak register char bg0 = bg[0], bg1 = bg[1]; 14414040Smckusick 14537636Sjak for ( p = bigrams; *p != NULL; p++ ) 14637636Sjak if ( *p++ == bg0 && *p == bg1 ) 14737636Sjak break; 14837636Sjak return ( *p == NULL ? -1 : --p - bigrams ); 14914040Smckusick } 150