113111Srrh #ifndef lint 2*24674Smckusick static char sccsid[] = "@(#)invert.c 2.5 09/10/85"; 313111Srrh #endif not lint 415177Sgarrison # 512911Sgarrison /* input: records of lines, separated by blank lines 612911Sgarrison output: key:file1 start/length ... start/length:file2 start/length ... 712911Sgarrison */ 812911Sgarrison 912911Sgarrison # include "stdio.h" 1012911Sgarrison # include "streams.h" 1112911Sgarrison # include "bib.h" 1212911Sgarrison # define isnull(x) (*(x) == NULL) 1312911Sgarrison # define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c) 1412911Sgarrison 1512911Sgarrison int max_kcnt = 100; /* max number of keys */ 1612911Sgarrison int max_klen = 6; /* max length of keys */ 1712911Sgarrison char *ignore = /* string of line starts to ignore */ 1812911Sgarrison "CNOPVX"; 1912911Sgarrison char *common = /* name of file of common words */ 2012911Sgarrison COMFILE; 2112911Sgarrison char *INDEX= /* name of output file */ 2212911Sgarrison INDXFILE; 2312911Sgarrison 2412911Sgarrison char *tmpfile = /* name of temporary file */ 2512911Sgarrison INVTEMPFILE; 2612911Sgarrison 2712911Sgarrison int silent = 0; /* 0 => statistics printed */ 2812911Sgarrison /* 1 => no statisitics printed */ 2912911Sgarrison 3012911Sgarrison char *sort_it = 3112911Sgarrison "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s"; 3212911Sgarrison char sortcmd[maxstr]; 3312911Sgarrison 3412911Sgarrison int argc; 3512911Sgarrison char **argv; 3612911Sgarrison 3712911Sgarrison main(argcount,arglist) 3812911Sgarrison int argcount; 3912911Sgarrison char **arglist; 4012911Sgarrison { char *filename; 4112911Sgarrison FILE *input, *output; 4212911Sgarrison long int start,length; 4312911Sgarrison char word[maxstr]; 4412911Sgarrison int kcnt; 4512911Sgarrison char tag_line[maxstr]; 4612911Sgarrison 4712911Sgarrison long int records = 0; /* number of records read */ 4812911Sgarrison long int keys = 0; /* number of keys read (occurences) */ 4912911Sgarrison long int distinct; /* number of distinct keys */ 5012911Sgarrison long int shorten(); 5112911Sgarrison 5217250Srrh strcpy(COMFILE, N_COMFILE); 5317250Srrh strcpy(BMACLIB, N_BMACLIB); 5417250Srrh 5512911Sgarrison argc= argcount-1; 5612911Sgarrison argv= arglist+1; 5712911Sgarrison mktemp(tmpfile); 5812911Sgarrison output= fopen(tmpfile,"w"); 5912911Sgarrison 6012911Sgarrison for ( flags() ; argc>0 ; argc--, argv++ ,flags() ) 6112911Sgarrison { /* open input file */ 6212911Sgarrison filename= *argv; 6312911Sgarrison input= fopen(filename,"r"); 6412911Sgarrison if (input==NULL) 6512911Sgarrison { fprintf(stderr, "invert: error in open of %s\n", filename); 6612911Sgarrison continue; 6712911Sgarrison } 6812911Sgarrison start= 0L; 6912911Sgarrison length= 0L; 7012911Sgarrison 7112911Sgarrison for(;;) /* each record */ 7212911Sgarrison { /* find start of next record (exit if none) */ 7312911Sgarrison start= nextrecord(input,start+length); 7412911Sgarrison if (start==EOF) break; 7512911Sgarrison records++; 7612911Sgarrison kcnt= 0; 7712911Sgarrison length= recsize(input,start); 7815177Sgarrison sprintf(tag_line, " %s %d %d\n", filename, start, length); 7912911Sgarrison 8012911Sgarrison while (ftell(input) < start+length && kcnt < max_kcnt) 8112911Sgarrison { getword(input,word,ignore); 8212911Sgarrison makekey(word,max_klen,common); 8312911Sgarrison if (!isnull(word)) 8412911Sgarrison { fputs(word,output); fputs(tag_line,output); 8512911Sgarrison kcnt++; keys++; 8612911Sgarrison } 8712911Sgarrison } 8812911Sgarrison } 8912911Sgarrison fclose(input); 9012911Sgarrison } 9112911Sgarrison fclose(output); 9212911Sgarrison 9312911Sgarrison sprintf(sortcmd, sort_it, tmpfile, tmpfile); 9412911Sgarrison system(sortcmd); 9512911Sgarrison 9612911Sgarrison distinct = shorten(tmpfile,INDEX); 9712911Sgarrison if( silent == 0 ) 9812911Sgarrison fprintf(stderr, 9915177Sgarrison "%d documents %d distinct keys %d key occurrences\n", 10012911Sgarrison records, distinct, keys); 10117301Sralph exit(0); 10212911Sgarrison } 10312911Sgarrison 10412911Sgarrison 10512911Sgarrison 10612911Sgarrison /* Flag Meaning Default 10712911Sgarrison -ki Keys per record 100 10812911Sgarrison -li max Length of keys 6 10912911Sgarrison -%str ignore lines that begin with %x CNOPVX 11012911Sgarrison where x is in str 11112911Sgarrison str is a seq of chars 112*24674Smckusick -cfile file contains Common words /usr/new/lib/bib/common 11312911Sgarrison do not use common words as keys 11412911Sgarrison -pfile name of output file INDEX 11512911Sgarrison -s do not print statistics statistics printed 11612911Sgarrison */ 11712911Sgarrison 11812911Sgarrison # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) 11912911Sgarrison 12012911Sgarrison flags() 12117250Srrh { 12217250Srrh char *p; 12317250Srrh for (; argc>0 && *argv[0]=='-'; argc--,argv++) 12412911Sgarrison { switch ((*argv)[1]) 12512911Sgarrison { case 'k': max_kcnt= atoi(operand); 12612911Sgarrison break; 12712911Sgarrison case 'l': max_klen= atoi(operand); 12812911Sgarrison break; 12912911Sgarrison case 'c': common= operand; 13012911Sgarrison break; 13112911Sgarrison case '%': ignore= *argv+2; 13212911Sgarrison break; 13312911Sgarrison case 'p': INDEX= operand; 13412911Sgarrison break; 13512911Sgarrison case 's': silent= 1; 13612911Sgarrison break; 13717250Srrh case 'd': 13817250Srrh p = &argv[0][2]; 13917250Srrh if (!p) { 14017250Srrh argv++; 14117250Srrh p = &argv[0][0]; 14217250Srrh } 14317250Srrh strreplace(COMFILE, BMACLIB, p); 14417250Srrh strcpy(BMACLIB, p); 14517250Srrh break; 14612911Sgarrison default: fprintf(stderr, "unknown flag '%s'\n", *argv); 14712911Sgarrison } 14812911Sgarrison } 14912911Sgarrison } 15012911Sgarrison 15112911Sgarrison 15212911Sgarrison /* shorten(inf,outf): file "inf" consists of lines of the form: 15312911Sgarrison key file start length 15412911Sgarrison sorted by key and file. replace lines with the same key 15512911Sgarrison with one line of the form: 15612911Sgarrison key:file1 start/length ... start/length:file2 start/length ... 15712911Sgarrison rename as file "outf" 15812911Sgarrison returns number of lines in output 15912911Sgarrison */ 16012911Sgarrison long shorten(inf,outf) 16112911Sgarrison char *inf, *outf; 16212911Sgarrison { FILE *in, *out; 16312911Sgarrison char line[maxstr]; 16412911Sgarrison char key[maxstr], newkey[maxstr], 16512911Sgarrison file[maxstr], newfile[maxstr]; 16612911Sgarrison long int start, length; 16712911Sgarrison long int lines = 0; 16812911Sgarrison 16912911Sgarrison in= fopen(inf, "r"); 17012911Sgarrison out= fopen(outf, "w"); 17112911Sgarrison if (in==NULL || out==NULL) 17212911Sgarrison { fprintf(stderr, "invert: error in opening file for compression\n"); 17312911Sgarrison return(0); 17412911Sgarrison } 17512911Sgarrison 17612911Sgarrison getline(in,line); 17715177Sgarrison sscanf(line,"%s%s%d%d", key, file, &start, &length); 17815177Sgarrison fprintf(out, "%s :%s %d/%d", key, file, start, length); 17912911Sgarrison for ( getline(in, line) ; !feof(in); getline(in, line)) 18015177Sgarrison { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length); 18112911Sgarrison if (strcmp(key,newkey)!=0) 18212911Sgarrison { strcpy(key, newkey); 18312911Sgarrison strcpy(file, newfile); 18415177Sgarrison fprintf(out, "\n%s :%s %d/%d", key, file, start, length); 18512911Sgarrison lines++; 18612911Sgarrison } 18712911Sgarrison else if (strcmp(file,newfile)!=0) 18812911Sgarrison { strcpy(file,newfile); 18915177Sgarrison fprintf(out, ":%s %d/%d", file, start, length); 19012911Sgarrison } 19112911Sgarrison else 19215177Sgarrison fprintf(out, " %d/%d", start, length); 19312911Sgarrison } 19412911Sgarrison fprintf(out, "\n"); 19512911Sgarrison lines++; 19612911Sgarrison 19712911Sgarrison fclose(in); fclose(out); 19812911Sgarrison unlink(inf); 19912911Sgarrison return (lines); 20012911Sgarrison } 201