#ifndef lint
/* SCCS identification string; omitted when lint is defined */
static char sccsid[] = "@(#)invert.c 2.7 05/27/93";
#endif /* not lint */
415177Sgarrison #
/*  input:  records of lines, separated by blank lines
    output: key :file1 start/length ... start/length:file2 start/length ...
*/
812911Sgarrison
912911Sgarrison # include "stdio.h"
1012911Sgarrison # include "streams.h"
1112911Sgarrison # include "bib.h"
/* makelow(c): the lower-case form of c when c is in 'A'..'Z', otherwise c
   itself.  The argument is expanded more than once, so it should be free
   of side effects that must happen exactly once.                         */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
1312911Sgarrison
1412911Sgarrison int max_kcnt = 100; /* max number of keys */
1512911Sgarrison int max_klen = 6; /* max length of keys */
1612911Sgarrison char *ignore = /* string of line starts to ignore */
1712911Sgarrison "CNOPVX";
1812911Sgarrison char *INDEX= /* name of output file */
1912911Sgarrison INDXFILE;
2012911Sgarrison
2130591Sgarrison char *bibtmpfile = /* name of temporary file */
2212911Sgarrison INVTEMPFILE;
2312911Sgarrison
2412911Sgarrison int silent = 0; /* 0 => statistics printed */
2512911Sgarrison /* 1 => no statisitics printed */
2612911Sgarrison
2712911Sgarrison char *sort_it =
2812911Sgarrison "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
2912911Sgarrison char sortcmd[maxstr];
3012911Sgarrison
3112911Sgarrison int argc;
3212911Sgarrison char **argv;
3312911Sgarrison
main(argcount,arglist)3412911Sgarrison main(argcount,arglist)
3512911Sgarrison int argcount;
3612911Sgarrison char **arglist;
3712911Sgarrison { char *filename;
3812911Sgarrison FILE *input, *output;
3912911Sgarrison long int start,length;
4012911Sgarrison char word[maxstr];
4112911Sgarrison int kcnt;
4212911Sgarrison char tag_line[maxstr];
43*60507Sbostic int bol = 1; /* at beginning of line */
4412911Sgarrison
4512911Sgarrison long int records = 0; /* number of records read */
4612911Sgarrison long int keys = 0; /* number of keys read (occurences) */
4712911Sgarrison long int distinct; /* number of distinct keys */
4812911Sgarrison long int shorten();
4912911Sgarrison
50*60507Sbostic InitDirectory(BMACLIB,N_BMACLIB);
51*60507Sbostic InitDirectory(COMFILE,N_COMFILE);
5217250Srrh
5312911Sgarrison argc= argcount-1;
5412911Sgarrison argv= arglist+1;
5530591Sgarrison mktemp(bibtmpfile);
5630591Sgarrison output= fopen(bibtmpfile,"w");
5712911Sgarrison
5812911Sgarrison for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
5912911Sgarrison { /* open input file */
6012911Sgarrison filename= *argv;
6112911Sgarrison input= fopen(filename,"r");
6212911Sgarrison if (input==NULL)
63*60507Sbostic { fprintf(stderr,"invert: error in open of %s\n", filename);
6412911Sgarrison continue;
6512911Sgarrison }
66*60507Sbostic start= 0L;
67*60507Sbostic length= 0L;
6812911Sgarrison
69*60507Sbostic for(;;) /* each record */ {
70*60507Sbostic /* find start of next record (exit if none) */
71*60507Sbostic start= nextrecord(input,start+length);
72*60507Sbostic if (start==EOF) break;
73*60507Sbostic records++;
74*60507Sbostic kcnt= 0;
75*60507Sbostic length= recsize(input,start);
76*60507Sbostic sprintf(tag_line, " %s %d %d\n", filename, start, length);
7712911Sgarrison
78*60507Sbostic while (ftell(input) < start+length && kcnt < max_kcnt) {
79*60507Sbostic getword(input,word,ignore,&bol);
80*60507Sbostic makekey(word,max_klen,COMFILE);
81*60507Sbostic if (*word != NULL) {
82*60507Sbostic fputs(word,output); fputs(tag_line,output);
83*60507Sbostic kcnt++; keys++;
84*60507Sbostic }
85*60507Sbostic }
86*60507Sbostic }
87*60507Sbostic fclose(input);
88*60507Sbostic }
8912911Sgarrison fclose(output);
9012911Sgarrison
9130591Sgarrison sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
9212911Sgarrison system(sortcmd);
9312911Sgarrison
9430591Sgarrison distinct = shorten(bibtmpfile,INDEX);
9512911Sgarrison if( silent == 0 )
9612911Sgarrison fprintf(stderr,
9715177Sgarrison "%d documents %d distinct keys %d key occurrences\n",
9812911Sgarrison records, distinct, keys);
9917301Sralph exit(0);
10012911Sgarrison }
10112911Sgarrison
10212911Sgarrison
10312911Sgarrison
10412911Sgarrison /* Flag Meaning Default
10512911Sgarrison -ki Keys per record 100
10612911Sgarrison -li max Length of keys 6
10712911Sgarrison -%str ignore lines that begin with %x CNOPVX
10812911Sgarrison where x is in str
10912911Sgarrison str is a seq of chars
11024674Smckusick -cfile file contains Common words /usr/new/lib/bib/common
11112911Sgarrison do not use common words as keys
11212911Sgarrison -pfile name of output file INDEX
11312911Sgarrison -s do not print statistics statistics printed
11412911Sgarrison */
11512911Sgarrison
11612911Sgarrison # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
11712911Sgarrison
flags()11812911Sgarrison flags()
11917250Srrh {
12017250Srrh char *p;
12117250Srrh for (; argc>0 && *argv[0]=='-'; argc--,argv++)
12212911Sgarrison { switch ((*argv)[1])
12312911Sgarrison { case 'k': max_kcnt= atoi(operand);
12412911Sgarrison break;
12512911Sgarrison case 'l': max_klen= atoi(operand);
12612911Sgarrison break;
127*60507Sbostic case 'c': strcpy(COMFILE,operand);
12812911Sgarrison break;
12912911Sgarrison case '%': ignore= *argv+2;
13012911Sgarrison break;
13112911Sgarrison case 'p': INDEX= operand;
13212911Sgarrison break;
13312911Sgarrison case 's': silent= 1;
13412911Sgarrison break;
13517250Srrh case 'd':
13617250Srrh p = &argv[0][2];
13717250Srrh if (!p) {
13817250Srrh argv++;
13917250Srrh p = &argv[0][0];
14017250Srrh }
14117250Srrh strreplace(COMFILE, BMACLIB, p);
14217250Srrh strcpy(BMACLIB, p);
14317250Srrh break;
144*60507Sbostic default: fprintf(stderr,"unknown flag '%s'\n", *argv);
14512911Sgarrison }
14612911Sgarrison }
14712911Sgarrison }
14812911Sgarrison
14912911Sgarrison
15012911Sgarrison /* shorten(inf,outf): file "inf" consists of lines of the form:
15112911Sgarrison key file start length
15212911Sgarrison sorted by key and file. replace lines with the same key
15312911Sgarrison with one line of the form:
15412911Sgarrison key:file1 start/length ... start/length:file2 start/length ...
15512911Sgarrison rename as file "outf"
15612911Sgarrison returns number of lines in output
15712911Sgarrison */
shorten(inf,outf)15812911Sgarrison long shorten(inf,outf)
15912911Sgarrison char *inf, *outf;
16012911Sgarrison { FILE *in, *out;
16112911Sgarrison char line[maxstr];
16212911Sgarrison char key[maxstr], newkey[maxstr],
16312911Sgarrison file[maxstr], newfile[maxstr];
16412911Sgarrison long int start, length;
16512911Sgarrison long int lines = 0;
16612911Sgarrison
16712911Sgarrison in= fopen(inf, "r");
16812911Sgarrison out= fopen(outf, "w");
16912911Sgarrison if (in==NULL || out==NULL)
170*60507Sbostic { fprintf(stderr,"invert: error in opening file for compression\n");
17112911Sgarrison return(0);
17212911Sgarrison }
17312911Sgarrison
17412911Sgarrison getline(in,line);
17515177Sgarrison sscanf(line,"%s%s%d%d", key, file, &start, &length);
17615177Sgarrison fprintf(out, "%s :%s %d/%d", key, file, start, length);
17712911Sgarrison for ( getline(in, line) ; !feof(in); getline(in, line))
17815177Sgarrison { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
17912911Sgarrison if (strcmp(key,newkey)!=0)
18012911Sgarrison { strcpy(key, newkey);
18112911Sgarrison strcpy(file, newfile);
18215177Sgarrison fprintf(out, "\n%s :%s %d/%d", key, file, start, length);
18312911Sgarrison lines++;
18412911Sgarrison }
18512911Sgarrison else if (strcmp(file,newfile)!=0)
18612911Sgarrison { strcpy(file,newfile);
18715177Sgarrison fprintf(out, ":%s %d/%d", file, start, length);
18812911Sgarrison }
18912911Sgarrison else
19015177Sgarrison fprintf(out, " %d/%d", start, length);
19112911Sgarrison }
19212911Sgarrison fprintf(out, "\n");
19312911Sgarrison lines++;
19412911Sgarrison
19512911Sgarrison fclose(in); fclose(out);
19612911Sgarrison unlink(inf);
19712911Sgarrison return (lines);
19812911Sgarrison }
199