xref: /csrg-svn/contrib/bib/src/invert.c (revision 12911)
1*12911Sgarrison #
2*12911Sgarrison /*  input:  records of lines, separated by blank lines
3*12911Sgarrison     output: key:file1 start/length ... start/length:file2 start/length ...
4*12911Sgarrison */
5*12911Sgarrison 
6*12911Sgarrison # include "stdio.h"
7*12911Sgarrison # include "streams.h"
8*12911Sgarrison # include "bib.h"
/*  isnull(x):  true when the string x is empty.  The first character is
                compared with the NUL character, not the pointer constant
                NULL, so the test is well-typed on every compiler.
    makelow(c): lower-case form of an ASCII upper-case letter; any other
                value is returned unchanged.  c is evaluated more than
                once, so avoid arguments with side effects such as i++.  */
# define isnull(x)  (*(x) == '\0')
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
11*12911Sgarrison 
12*12911Sgarrison int     max_kcnt = 100;     /*  max number of keys                      */
13*12911Sgarrison int     max_klen =   6;     /*  max length of keys                      */
14*12911Sgarrison char    *ignore =           /*  string of line starts to ignore         */
15*12911Sgarrison             "CNOPVX";
16*12911Sgarrison char    *common =           /*  name of file of common words            */
17*12911Sgarrison             COMFILE;
18*12911Sgarrison char    *INDEX=             /*  name of output file                     */
19*12911Sgarrison             INDXFILE;
20*12911Sgarrison 
21*12911Sgarrison char    *tmpfile =          /*  name of temporary file                  */
22*12911Sgarrison             INVTEMPFILE;
23*12911Sgarrison 
24*12911Sgarrison int	silent = 0;	    /*  0 => statistics printed			*/
25*12911Sgarrison 			    /*  1 => no statisitics printed		*/
26*12911Sgarrison 
27*12911Sgarrison char *sort_it =
28*12911Sgarrison         "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
29*12911Sgarrison char sortcmd[maxstr];
30*12911Sgarrison 
31*12911Sgarrison int     argc;
32*12911Sgarrison char    **argv;
33*12911Sgarrison 
34*12911Sgarrison main(argcount,arglist)
35*12911Sgarrison int argcount;
36*12911Sgarrison char **arglist;
37*12911Sgarrison {   char            *filename;
38*12911Sgarrison     FILE            *input, *output;
39*12911Sgarrison     long int        start,length;
40*12911Sgarrison     char            word[maxstr];
41*12911Sgarrison     int             kcnt;
42*12911Sgarrison     char            tag_line[maxstr];
43*12911Sgarrison 
44*12911Sgarrison     long int	    records = 0;  /*  number of records read           */
45*12911Sgarrison     long int	    keys    = 0;  /*  number of keys read (occurences) */
46*12911Sgarrison     long int	    distinct;     /*  number of distinct keys          */
47*12911Sgarrison     long int	    shorten();
48*12911Sgarrison 
49*12911Sgarrison     argc= argcount-1;
50*12911Sgarrison     argv= arglist+1;
51*12911Sgarrison     mktemp(tmpfile);
52*12911Sgarrison     output= fopen(tmpfile,"w");
53*12911Sgarrison 
54*12911Sgarrison     for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
55*12911Sgarrison     {   /* open input file              */
56*12911Sgarrison             filename=   *argv;
57*12911Sgarrison             input=      fopen(filename,"r");
58*12911Sgarrison             if (input==NULL)
59*12911Sgarrison             {   fprintf(stderr, "invert: error in open of %s\n", filename);
60*12911Sgarrison                 continue;
61*12911Sgarrison             }
62*12911Sgarrison             start=      0L;
63*12911Sgarrison             length=     0L;
64*12911Sgarrison 
65*12911Sgarrison         for(;;) /* each record  */
66*12911Sgarrison         {   /* find start of next record (exit if none)     */
67*12911Sgarrison                 start= nextrecord(input,start+length);
68*12911Sgarrison                 if (start==EOF)   break;
69*12911Sgarrison             records++;
70*12911Sgarrison 	    kcnt= 0;
71*12911Sgarrison             length= recsize(input,start);
72*12911Sgarrison             sprintf(tag_line, " %s %D %D\n", filename, start, length);
73*12911Sgarrison 
74*12911Sgarrison             while (ftell(input) < start+length && kcnt < max_kcnt)
75*12911Sgarrison             {   getword(input,word,ignore);
76*12911Sgarrison                 makekey(word,max_klen,common);
77*12911Sgarrison                 if (!isnull(word))
78*12911Sgarrison                 {   fputs(word,output); fputs(tag_line,output);
79*12911Sgarrison                     kcnt++; keys++;
80*12911Sgarrison                 }
81*12911Sgarrison             }
82*12911Sgarrison         }
83*12911Sgarrison         fclose(input);
84*12911Sgarrison     }
85*12911Sgarrison     fclose(output);
86*12911Sgarrison 
87*12911Sgarrison     sprintf(sortcmd, sort_it, tmpfile, tmpfile);
88*12911Sgarrison     system(sortcmd);
89*12911Sgarrison 
90*12911Sgarrison     distinct = shorten(tmpfile,INDEX);
91*12911Sgarrison     if( silent == 0 )
92*12911Sgarrison 	fprintf(stderr,
93*12911Sgarrison 	    "%D documents   %D distinct keys  %D key occurrences\n",
94*12911Sgarrison 	    records, distinct, keys);
95*12911Sgarrison }
96*12911Sgarrison 
97*12911Sgarrison 
98*12911Sgarrison 
99*12911Sgarrison /*  Flag    Meaning                             Default
100*12911Sgarrison     -ki     Keys per record                     100
101*12911Sgarrison     -li     max Length of keys                  6
102*12911Sgarrison     -%str   ignore lines that begin with %x     CNOPVX
103*12911Sgarrison             where x is in str
104*12911Sgarrison             str is a seq of chars
105*12911Sgarrison     -cfile  file contains Common words          /usr/src/local/bib/common
106*12911Sgarrison             do not use common words as keys
107*12911Sgarrison     -pfile  name of output file                 INDEX
108*12911Sgarrison     -s	    do not print statistics		statistics printed
109*12911Sgarrison */
110*12911Sgarrison 
111*12911Sgarrison # define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
112*12911Sgarrison 
113*12911Sgarrison flags()
114*12911Sgarrison {   for (; argc>0 && *argv[0]=='-';  argc--,argv++)
115*12911Sgarrison     {   switch ((*argv)[1])
116*12911Sgarrison         {   case 'k':   max_kcnt= atoi(operand);
117*12911Sgarrison                         break;
118*12911Sgarrison             case 'l':   max_klen= atoi(operand);
119*12911Sgarrison                         break;
120*12911Sgarrison             case 'c':   common=  operand;
121*12911Sgarrison                         break;
122*12911Sgarrison             case '%':   ignore=  *argv+2;
123*12911Sgarrison                         break;
124*12911Sgarrison             case 'p':   INDEX=  operand;
125*12911Sgarrison                         break;
126*12911Sgarrison 	    case 's':	silent= 1;
127*12911Sgarrison 			break;
128*12911Sgarrison             default:    fprintf(stderr, "unknown flag '%s'\n", *argv);
129*12911Sgarrison         }
130*12911Sgarrison     }
131*12911Sgarrison }
132*12911Sgarrison 
133*12911Sgarrison 
134*12911Sgarrison /*  shorten(inf,outf): file "inf" consists of lines of the form:
135*12911Sgarrison         key file start length
136*12911Sgarrison     sorted by key and file.  replace lines with the same key
137*12911Sgarrison     with one line of the form:
138*12911Sgarrison         key:file1 start/length ... start/length:file2 start/length ...
139*12911Sgarrison     rename as file "outf"
140*12911Sgarrison     returns number of lines in output
141*12911Sgarrison */
142*12911Sgarrison long shorten(inf,outf)
143*12911Sgarrison char *inf, *outf;
144*12911Sgarrison {   FILE *in, *out;
145*12911Sgarrison     char line[maxstr];
146*12911Sgarrison     char key[maxstr],  newkey[maxstr],
147*12911Sgarrison          file[maxstr], newfile[maxstr];
148*12911Sgarrison     long int start, length;
149*12911Sgarrison     long int lines = 0;
150*12911Sgarrison 
151*12911Sgarrison     in=  fopen(inf, "r");
152*12911Sgarrison     out= fopen(outf, "w");
153*12911Sgarrison     if (in==NULL || out==NULL)
154*12911Sgarrison     {   fprintf(stderr, "invert: error in opening file for compression\n");
155*12911Sgarrison         return(0);
156*12911Sgarrison     }
157*12911Sgarrison 
158*12911Sgarrison     getline(in,line);
159*12911Sgarrison     sscanf(line,"%s%s12/10/14D", key, file, &start, &length);
160*12911Sgarrison     fprintf(out, "%s :%s %D/%D", key, file, start, length);
161*12911Sgarrison     for ( getline(in, line) ; !feof(in);  getline(in, line))
162*12911Sgarrison     {   sscanf(line,"%s%s12/10/14D", newkey, newfile, &start, &length);
163*12911Sgarrison         if (strcmp(key,newkey)!=0)
164*12911Sgarrison         {   strcpy(key, newkey);
165*12911Sgarrison             strcpy(file, newfile);
166*12911Sgarrison             fprintf(out, "\n%s :%s %D/%D",  key, file, start, length);
167*12911Sgarrison 	    lines++;
168*12911Sgarrison         }
169*12911Sgarrison         else if (strcmp(file,newfile)!=0)
170*12911Sgarrison         {   strcpy(file,newfile);
171*12911Sgarrison             fprintf(out, ":%s %D/%D", file, start, length);
172*12911Sgarrison         }
173*12911Sgarrison         else
174*12911Sgarrison             fprintf(out, " %D/%D", start, length);
175*12911Sgarrison     }
176*12911Sgarrison     fprintf(out, "\n");
177*12911Sgarrison     lines++;
178*12911Sgarrison 
179*12911Sgarrison     fclose(in); fclose(out);
180*12911Sgarrison     unlink(inf);
181*12911Sgarrison     return (lines);
182*12911Sgarrison }
183