xref: /csrg-svn/contrib/bib/src/invert.c (revision 15177)
113111Srrh #ifndef lint
2*15177Sgarrison static char sccsid[] = "@(#)invert.c	2.2	10/06/83";
313111Srrh #endif not lint
4*15177Sgarrison #
/*  input:  records of lines, separated by blank lines
    output: key :file1 start/length ... start/length:file2 start/length ...
*/
812911Sgarrison 
912911Sgarrison # include "stdio.h"
1012911Sgarrison # include "streams.h"
1112911Sgarrison # include "bib.h"
/*  isnull(x):  true when the string x is empty.  The first character is
                compared against the character '\0'; NULL is a pointer
                constant and must not be compared with a char.
    makelow(c): lower-case equivalent of c when c is an ASCII upper-case
                letter, otherwise c itself.  c is evaluated more than once,
                so do not pass an argument with side effects that must
                happen exactly once.                                       */
# define isnull(x)  (*(x) == '\0')
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
1412911Sgarrison 
1512911Sgarrison int     max_kcnt = 100;     /*  max number of keys                      */
1612911Sgarrison int     max_klen =   6;     /*  max length of keys                      */
1712911Sgarrison char    *ignore =           /*  string of line starts to ignore         */
1812911Sgarrison             "CNOPVX";
1912911Sgarrison char    *common =           /*  name of file of common words            */
2012911Sgarrison             COMFILE;
2112911Sgarrison char    *INDEX=             /*  name of output file                     */
2212911Sgarrison             INDXFILE;
2312911Sgarrison 
2412911Sgarrison char    *tmpfile =          /*  name of temporary file                  */
2512911Sgarrison             INVTEMPFILE;
2612911Sgarrison 
2712911Sgarrison int	silent = 0;	    /*  0 => statistics printed			*/
2812911Sgarrison 			    /*  1 => no statisitics printed		*/
2912911Sgarrison 
3012911Sgarrison char *sort_it =
3112911Sgarrison         "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
3212911Sgarrison char sortcmd[maxstr];
3312911Sgarrison 
3412911Sgarrison int     argc;
3512911Sgarrison char    **argv;
3612911Sgarrison 
3712911Sgarrison main(argcount,arglist)
3812911Sgarrison int argcount;
3912911Sgarrison char **arglist;
4012911Sgarrison {   char            *filename;
4112911Sgarrison     FILE            *input, *output;
4212911Sgarrison     long int        start,length;
4312911Sgarrison     char            word[maxstr];
4412911Sgarrison     int             kcnt;
4512911Sgarrison     char            tag_line[maxstr];
4612911Sgarrison 
4712911Sgarrison     long int	    records = 0;  /*  number of records read           */
4812911Sgarrison     long int	    keys    = 0;  /*  number of keys read (occurences) */
4912911Sgarrison     long int	    distinct;     /*  number of distinct keys          */
5012911Sgarrison     long int	    shorten();
5112911Sgarrison 
5212911Sgarrison     argc= argcount-1;
5312911Sgarrison     argv= arglist+1;
5412911Sgarrison     mktemp(tmpfile);
5512911Sgarrison     output= fopen(tmpfile,"w");
5612911Sgarrison 
5712911Sgarrison     for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
5812911Sgarrison     {   /* open input file              */
5912911Sgarrison             filename=   *argv;
6012911Sgarrison             input=      fopen(filename,"r");
6112911Sgarrison             if (input==NULL)
6212911Sgarrison             {   fprintf(stderr, "invert: error in open of %s\n", filename);
6312911Sgarrison                 continue;
6412911Sgarrison             }
6512911Sgarrison             start=      0L;
6612911Sgarrison             length=     0L;
6712911Sgarrison 
6812911Sgarrison         for(;;) /* each record  */
6912911Sgarrison         {   /* find start of next record (exit if none)     */
7012911Sgarrison                 start= nextrecord(input,start+length);
7112911Sgarrison                 if (start==EOF)   break;
7212911Sgarrison             records++;
7312911Sgarrison 	    kcnt= 0;
7412911Sgarrison             length= recsize(input,start);
75*15177Sgarrison             sprintf(tag_line, " %s %d %d\n", filename, start, length);
7612911Sgarrison 
7712911Sgarrison             while (ftell(input) < start+length && kcnt < max_kcnt)
7812911Sgarrison             {   getword(input,word,ignore);
7912911Sgarrison                 makekey(word,max_klen,common);
8012911Sgarrison                 if (!isnull(word))
8112911Sgarrison                 {   fputs(word,output); fputs(tag_line,output);
8212911Sgarrison                     kcnt++; keys++;
8312911Sgarrison                 }
8412911Sgarrison             }
8512911Sgarrison         }
8612911Sgarrison         fclose(input);
8712911Sgarrison     }
8812911Sgarrison     fclose(output);
8912911Sgarrison 
9012911Sgarrison     sprintf(sortcmd, sort_it, tmpfile, tmpfile);
9112911Sgarrison     system(sortcmd);
9212911Sgarrison 
9312911Sgarrison     distinct = shorten(tmpfile,INDEX);
9412911Sgarrison     if( silent == 0 )
9512911Sgarrison 	fprintf(stderr,
96*15177Sgarrison 	    "%d documents   %d distinct keys  %d key occurrences\n",
9712911Sgarrison 	    records, distinct, keys);
9812911Sgarrison }
9912911Sgarrison 
10012911Sgarrison 
10112911Sgarrison 
10212911Sgarrison /*  Flag    Meaning                             Default
10312911Sgarrison     -ki     Keys per record                     100
10412911Sgarrison     -li     max Length of keys                  6
10512911Sgarrison     -%str   ignore lines that begin with %x     CNOPVX
10612911Sgarrison             where x is in str
10712911Sgarrison             str is a seq of chars
10812911Sgarrison     -cfile  file contains Common words          /usr/src/local/bib/common
10912911Sgarrison             do not use common words as keys
11012911Sgarrison     -pfile  name of output file                 INDEX
11112911Sgarrison     -s	    do not print statistics		statistics printed
11212911Sgarrison */
11312911Sgarrison 
11412911Sgarrison # define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
11512911Sgarrison 
11612911Sgarrison flags()
11712911Sgarrison {   for (; argc>0 && *argv[0]=='-';  argc--,argv++)
11812911Sgarrison     {   switch ((*argv)[1])
11912911Sgarrison         {   case 'k':   max_kcnt= atoi(operand);
12012911Sgarrison                         break;
12112911Sgarrison             case 'l':   max_klen= atoi(operand);
12212911Sgarrison                         break;
12312911Sgarrison             case 'c':   common=  operand;
12412911Sgarrison                         break;
12512911Sgarrison             case '%':   ignore=  *argv+2;
12612911Sgarrison                         break;
12712911Sgarrison             case 'p':   INDEX=  operand;
12812911Sgarrison                         break;
12912911Sgarrison 	    case 's':	silent= 1;
13012911Sgarrison 			break;
13112911Sgarrison             default:    fprintf(stderr, "unknown flag '%s'\n", *argv);
13212911Sgarrison         }
13312911Sgarrison     }
13412911Sgarrison }
13512911Sgarrison 
13612911Sgarrison 
13712911Sgarrison /*  shorten(inf,outf): file "inf" consists of lines of the form:
13812911Sgarrison         key file start length
13912911Sgarrison     sorted by key and file.  replace lines with the same key
14012911Sgarrison     with one line of the form:
14112911Sgarrison         key:file1 start/length ... start/length:file2 start/length ...
14212911Sgarrison     rename as file "outf"
14312911Sgarrison     returns number of lines in output
14412911Sgarrison */
14512911Sgarrison long shorten(inf,outf)
14612911Sgarrison char *inf, *outf;
14712911Sgarrison {   FILE *in, *out;
14812911Sgarrison     char line[maxstr];
14912911Sgarrison     char key[maxstr],  newkey[maxstr],
15012911Sgarrison          file[maxstr], newfile[maxstr];
15112911Sgarrison     long int start, length;
15212911Sgarrison     long int lines = 0;
15312911Sgarrison 
15412911Sgarrison     in=  fopen(inf, "r");
15512911Sgarrison     out= fopen(outf, "w");
15612911Sgarrison     if (in==NULL || out==NULL)
15712911Sgarrison     {   fprintf(stderr, "invert: error in opening file for compression\n");
15812911Sgarrison         return(0);
15912911Sgarrison     }
16012911Sgarrison 
16112911Sgarrison     getline(in,line);
162*15177Sgarrison     sscanf(line,"%s%s%d%d", key, file, &start, &length);
163*15177Sgarrison     fprintf(out, "%s :%s %d/%d", key, file, start, length);
16412911Sgarrison     for ( getline(in, line) ; !feof(in);  getline(in, line))
165*15177Sgarrison     {   sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
16612911Sgarrison         if (strcmp(key,newkey)!=0)
16712911Sgarrison         {   strcpy(key, newkey);
16812911Sgarrison             strcpy(file, newfile);
169*15177Sgarrison             fprintf(out, "\n%s :%s %d/%d",  key, file, start, length);
17012911Sgarrison 	    lines++;
17112911Sgarrison         }
17212911Sgarrison         else if (strcmp(file,newfile)!=0)
17312911Sgarrison         {   strcpy(file,newfile);
174*15177Sgarrison             fprintf(out, ":%s %d/%d", file, start, length);
17512911Sgarrison         }
17612911Sgarrison         else
177*15177Sgarrison             fprintf(out, " %d/%d", start, length);
17812911Sgarrison     }
17912911Sgarrison     fprintf(out, "\n");
18012911Sgarrison     lines++;
18112911Sgarrison 
18212911Sgarrison     fclose(in); fclose(out);
18312911Sgarrison     unlink(inf);
18412911Sgarrison     return (lines);
18512911Sgarrison }
186