1*12911Sgarrison # 2*12911Sgarrison /* input: records of lines, separated by blank lines 3*12911Sgarrison output: key:file1 start/length ... start/length:file2 start/length ... 4*12911Sgarrison */ 5*12911Sgarrison 6*12911Sgarrison # include "stdio.h" 7*12911Sgarrison # include "streams.h" 8*12911Sgarrison # include "bib.h" 9*12911Sgarrison # define isnull(x) (*(x) == NULL) 10*12911Sgarrison # define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c) 11*12911Sgarrison 12*12911Sgarrison int max_kcnt = 100; /* max number of keys */ 13*12911Sgarrison int max_klen = 6; /* max length of keys */ 14*12911Sgarrison char *ignore = /* string of line starts to ignore */ 15*12911Sgarrison "CNOPVX"; 16*12911Sgarrison char *common = /* name of file of common words */ 17*12911Sgarrison COMFILE; 18*12911Sgarrison char *INDEX= /* name of output file */ 19*12911Sgarrison INDXFILE; 20*12911Sgarrison 21*12911Sgarrison char *tmpfile = /* name of temporary file */ 22*12911Sgarrison INVTEMPFILE; 23*12911Sgarrison 24*12911Sgarrison int silent = 0; /* 0 => statistics printed */ 25*12911Sgarrison /* 1 => no statisitics printed */ 26*12911Sgarrison 27*12911Sgarrison char *sort_it = 28*12911Sgarrison "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s"; 29*12911Sgarrison char sortcmd[maxstr]; 30*12911Sgarrison 31*12911Sgarrison int argc; 32*12911Sgarrison char **argv; 33*12911Sgarrison 34*12911Sgarrison main(argcount,arglist) 35*12911Sgarrison int argcount; 36*12911Sgarrison char **arglist; 37*12911Sgarrison { char *filename; 38*12911Sgarrison FILE *input, *output; 39*12911Sgarrison long int start,length; 40*12911Sgarrison char word[maxstr]; 41*12911Sgarrison int kcnt; 42*12911Sgarrison char tag_line[maxstr]; 43*12911Sgarrison 44*12911Sgarrison long int records = 0; /* number of records read */ 45*12911Sgarrison long int keys = 0; /* number of keys read (occurences) */ 46*12911Sgarrison long int distinct; /* number of distinct keys */ 47*12911Sgarrison long int shorten(); 48*12911Sgarrison 49*12911Sgarrison argc= argcount-1; 50*12911Sgarrison argv= arglist+1; 51*12911Sgarrison mktemp(tmpfile); 52*12911Sgarrison output= fopen(tmpfile,"w"); 53*12911Sgarrison 54*12911Sgarrison for ( flags() ; argc>0 ; argc--, argv++ ,flags() ) 55*12911Sgarrison { /* open input file */ 56*12911Sgarrison filename= *argv; 57*12911Sgarrison input= fopen(filename,"r"); 58*12911Sgarrison if (input==NULL) 59*12911Sgarrison { fprintf(stderr, "invert: error in open of %s\n", filename); 60*12911Sgarrison continue; 61*12911Sgarrison } 62*12911Sgarrison start= 0L; 63*12911Sgarrison length= 0L; 64*12911Sgarrison 65*12911Sgarrison for(;;) /* each record */ 66*12911Sgarrison { /* find start of next record (exit if none) */ 67*12911Sgarrison start= nextrecord(input,start+length); 68*12911Sgarrison if (start==EOF) break; 69*12911Sgarrison records++; 70*12911Sgarrison kcnt= 0; 71*12911Sgarrison length= recsize(input,start); 72*12911Sgarrison sprintf(tag_line, " %s %D %D\n", filename, start, length); 73*12911Sgarrison 74*12911Sgarrison while (ftell(input) < start+length && kcnt < max_kcnt) 75*12911Sgarrison { getword(input,word,ignore); 76*12911Sgarrison makekey(word,max_klen,common); 77*12911Sgarrison if (!isnull(word)) 78*12911Sgarrison { fputs(word,output); fputs(tag_line,output); 79*12911Sgarrison kcnt++; keys++; 80*12911Sgarrison } 81*12911Sgarrison } 82*12911Sgarrison } 83*12911Sgarrison fclose(input); 84*12911Sgarrison } 85*12911Sgarrison fclose(output); 86*12911Sgarrison 87*12911Sgarrison sprintf(sortcmd, sort_it, tmpfile, tmpfile); 88*12911Sgarrison system(sortcmd); 89*12911Sgarrison 90*12911Sgarrison distinct = shorten(tmpfile,INDEX); 91*12911Sgarrison if( silent == 0 ) 92*12911Sgarrison fprintf(stderr, 93*12911Sgarrison "%D documents %D distinct keys %D key occurrences\n", 94*12911Sgarrison records, distinct, keys); 95*12911Sgarrison } 96*12911Sgarrison 97*12911Sgarrison 98*12911Sgarrison 99*12911Sgarrison /* Flag Meaning Default 100*12911Sgarrison -ki Keys per record 100 101*12911Sgarrison -li max Length of keys 6 102*12911Sgarrison -%str ignore lines that begin with %x CNOPVX 103*12911Sgarrison where x is in str 104*12911Sgarrison str is a seq of chars 105*12911Sgarrison -cfile file contains Common words /usr/src/local/bib/common 106*12911Sgarrison do not use common words as keys 107*12911Sgarrison -pfile name of output file INDEX 108*12911Sgarrison -s do not print statistics statistics printed 109*12911Sgarrison */ 110*12911Sgarrison 111*12911Sgarrison # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) 112*12911Sgarrison 113*12911Sgarrison flags() 114*12911Sgarrison { for (; argc>0 && *argv[0]=='-'; argc--,argv++) 115*12911Sgarrison { switch ((*argv)[1]) 116*12911Sgarrison { case 'k': max_kcnt= atoi(operand); 117*12911Sgarrison break; 118*12911Sgarrison case 'l': max_klen= atoi(operand); 119*12911Sgarrison break; 120*12911Sgarrison case 'c': common= operand; 121*12911Sgarrison break; 122*12911Sgarrison case '%': ignore= *argv+2; 123*12911Sgarrison break; 124*12911Sgarrison case 'p': INDEX= operand; 125*12911Sgarrison break; 126*12911Sgarrison case 's': silent= 1; 127*12911Sgarrison break; 128*12911Sgarrison default: fprintf(stderr, "unknown flag '%s'\n", *argv); 129*12911Sgarrison } 130*12911Sgarrison } 131*12911Sgarrison } 132*12911Sgarrison 133*12911Sgarrison 134*12911Sgarrison /* shorten(inf,outf): file "inf" consists of lines of the form: 135*12911Sgarrison key file start length 136*12911Sgarrison sorted by key and file. replace lines with the same key 137*12911Sgarrison with one line of the form: 138*12911Sgarrison key:file1 start/length ... start/length:file2 start/length ... 139*12911Sgarrison rename as file "outf" 140*12911Sgarrison returns number of lines in output 141*12911Sgarrison */ 142*12911Sgarrison long shorten(inf,outf) 143*12911Sgarrison char *inf, *outf; 144*12911Sgarrison { FILE *in, *out; 145*12911Sgarrison char line[maxstr]; 146*12911Sgarrison char key[maxstr], newkey[maxstr], 147*12911Sgarrison file[maxstr], newfile[maxstr]; 148*12911Sgarrison long int start, length; 149*12911Sgarrison long int lines = 0; 150*12911Sgarrison 151*12911Sgarrison in= fopen(inf, "r"); 152*12911Sgarrison out= fopen(outf, "w"); 153*12911Sgarrison if (in==NULL || out==NULL) 154*12911Sgarrison { fprintf(stderr, "invert: error in opening file for compression\n"); 155*12911Sgarrison return(0); 156*12911Sgarrison } 157*12911Sgarrison 158*12911Sgarrison getline(in,line); 159*12911Sgarrison sscanf(line,"%s%s12/10/14D", key, file, &start, &length); 160*12911Sgarrison fprintf(out, "%s :%s %D/%D", key, file, start, length); 161*12911Sgarrison for ( getline(in, line) ; !feof(in); getline(in, line)) 162*12911Sgarrison { sscanf(line,"%s%s12/10/14D", newkey, newfile, &start, &length); 163*12911Sgarrison if (strcmp(key,newkey)!=0) 164*12911Sgarrison { strcpy(key, newkey); 165*12911Sgarrison strcpy(file, newfile); 166*12911Sgarrison fprintf(out, "\n%s :%s %D/%D", key, file, start, length); 167*12911Sgarrison lines++; 168*12911Sgarrison } 169*12911Sgarrison else if (strcmp(file,newfile)!=0) 170*12911Sgarrison { strcpy(file,newfile); 171*12911Sgarrison fprintf(out, ":%s %D/%D", file, start, length); 172*12911Sgarrison } 173*12911Sgarrison else 174*12911Sgarrison fprintf(out, " %D/%D", start, length); 175*12911Sgarrison } 176*12911Sgarrison fprintf(out, "\n"); 177*12911Sgarrison lines++; 178*12911Sgarrison 179*12911Sgarrison fclose(in); fclose(out); 180*12911Sgarrison unlink(inf); 181*12911Sgarrison return (lines); 182*12911Sgarrison } 183