xref: /csrg-svn/contrib/bib/src/invert.c (revision 24674)
/*  SCCS identification string; left out when compiling under lint.  */
#ifndef lint
static char sccsid[] = "@(#)invert.c	2.5	09/10/85";
#endif /* not lint */
415177Sgarrison #
512911Sgarrison /*  input:  records of lines, separated by blank lines
612911Sgarrison     output: key:file1 start/length ... start/length:file2 start/length ...
712911Sgarrison */
812911Sgarrison 
912911Sgarrison # include "stdio.h"
1012911Sgarrison # include "streams.h"
1112911Sgarrison # include "bib.h"
/*  isnull(x):  true when the string x is empty, i.e. its first character
                is the terminating NUL.  (Compares against '\0', a char,
                rather than the pointer constant NULL.)
    makelow(c): the lower-case form of an upper-case ASCII letter; any
                other value is returned unchanged.  c is evaluated more
                than once, so it must not have side effects.             */
# define isnull(x)  (*(x) == '\0')
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
1412911Sgarrison 
/*  Tunable settings.  Each has a compiled-in default that flags() may
    override from the command line.                                      */

/*  -k: main() stops extracting keys from a record once this many have
    been written for it.                                                 */
int     max_kcnt = 100;     /*  max number of keys                      */
/*  -l: handed to makekey() as the key length limit.                     */
int     max_klen =   6;     /*  max length of keys                      */
/*  -%: handed to getword() for every word read.                         */
char    *ignore =           /*  string of line starts to ignore         */
            "CNOPVX";
/*  -c: handed to makekey() together with each candidate word.           */
char    *common =           /*  name of file of common words            */
            COMFILE;
/*  -p: the file shorten() writes the finished index to.                 */
char    *INDEX=             /*  name of output file                     */
            INDXFILE;

/*  Passed to mktemp() in main(), which edits a template in place, so
    this must point at writable storage.
    NOTE(review): INVTEMPFILE is defined outside this file -- confirm it
    is not a read-only string literal on the target compiler.            */
char    *tmpfile =          /*  name of temporary file                  */
            INVTEMPFILE;

/*  -s sets this to 1.                                                   */
int	silent = 0;	    /*  0 => statistics printed			*/
			    /*  1 => no statistics printed		*/

/*  Format for the sort command; main() fills both %s slots with the
    temporary file name (sort input and -o output) and builds the
    command in sortcmd.                                                  */
char *sort_it =
        "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
char sortcmd[maxstr];

/*  Remaining command line arguments (count and cursor).  Shared by
    main() and flags(), both of which advance them.                      */
int     argc;
char    **argv;
3612911Sgarrison 
3712911Sgarrison main(argcount,arglist)
3812911Sgarrison int argcount;
3912911Sgarrison char **arglist;
4012911Sgarrison {   char            *filename;
4112911Sgarrison     FILE            *input, *output;
4212911Sgarrison     long int        start,length;
4312911Sgarrison     char            word[maxstr];
4412911Sgarrison     int             kcnt;
4512911Sgarrison     char            tag_line[maxstr];
4612911Sgarrison 
4712911Sgarrison     long int	    records = 0;  /*  number of records read           */
4812911Sgarrison     long int	    keys    = 0;  /*  number of keys read (occurences) */
4912911Sgarrison     long int	    distinct;     /*  number of distinct keys          */
5012911Sgarrison     long int	    shorten();
5112911Sgarrison 
5217250Srrh     strcpy(COMFILE, N_COMFILE);
5317250Srrh     strcpy(BMACLIB, N_BMACLIB);
5417250Srrh 
5512911Sgarrison     argc= argcount-1;
5612911Sgarrison     argv= arglist+1;
5712911Sgarrison     mktemp(tmpfile);
5812911Sgarrison     output= fopen(tmpfile,"w");
5912911Sgarrison 
6012911Sgarrison     for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
6112911Sgarrison     {   /* open input file              */
6212911Sgarrison             filename=   *argv;
6312911Sgarrison             input=      fopen(filename,"r");
6412911Sgarrison             if (input==NULL)
6512911Sgarrison             {   fprintf(stderr, "invert: error in open of %s\n", filename);
6612911Sgarrison                 continue;
6712911Sgarrison             }
6812911Sgarrison             start=      0L;
6912911Sgarrison             length=     0L;
7012911Sgarrison 
7112911Sgarrison         for(;;) /* each record  */
7212911Sgarrison         {   /* find start of next record (exit if none)     */
7312911Sgarrison                 start= nextrecord(input,start+length);
7412911Sgarrison                 if (start==EOF)   break;
7512911Sgarrison             records++;
7612911Sgarrison 	    kcnt= 0;
7712911Sgarrison             length= recsize(input,start);
7815177Sgarrison             sprintf(tag_line, " %s %d %d\n", filename, start, length);
7912911Sgarrison 
8012911Sgarrison             while (ftell(input) < start+length && kcnt < max_kcnt)
8112911Sgarrison             {   getword(input,word,ignore);
8212911Sgarrison                 makekey(word,max_klen,common);
8312911Sgarrison                 if (!isnull(word))
8412911Sgarrison                 {   fputs(word,output); fputs(tag_line,output);
8512911Sgarrison                     kcnt++; keys++;
8612911Sgarrison                 }
8712911Sgarrison             }
8812911Sgarrison         }
8912911Sgarrison         fclose(input);
9012911Sgarrison     }
9112911Sgarrison     fclose(output);
9212911Sgarrison 
9312911Sgarrison     sprintf(sortcmd, sort_it, tmpfile, tmpfile);
9412911Sgarrison     system(sortcmd);
9512911Sgarrison 
9612911Sgarrison     distinct = shorten(tmpfile,INDEX);
9712911Sgarrison     if( silent == 0 )
9812911Sgarrison 	fprintf(stderr,
9915177Sgarrison 	    "%d documents   %d distinct keys  %d key occurrences\n",
10012911Sgarrison 	    records, distinct, keys);
10117301Sralph     exit(0);
10212911Sgarrison }
10312911Sgarrison 
10412911Sgarrison 
10512911Sgarrison 
/*  Flag    Meaning                             Default
    -ki     Keys per record                     100
    -li     max Length of keys                  6
    -%str   ignore lines that begin with %x     CNOPVX
            where x is in str
            str is a seq of chars
    -cfile  file contains Common words          /usr/new/lib/bib/common
            do not use common words as keys
    -pfile  name of output file                 INDEX
    -s	    do not print statistics		statistics printed
    -ddir   use dir as the library directory    N_BMACLIB
            (BMACLIB); the common words file
            name is adjusted to match
*/
11712911Sgarrison 
11812911Sgarrison # define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
11912911Sgarrison 
12012911Sgarrison flags()
12117250Srrh {
12217250Srrh     char *p;
12317250Srrh     for (; argc>0 && *argv[0]=='-';  argc--,argv++)
12412911Sgarrison     {   switch ((*argv)[1])
12512911Sgarrison         {   case 'k':   max_kcnt= atoi(operand);
12612911Sgarrison                         break;
12712911Sgarrison             case 'l':   max_klen= atoi(operand);
12812911Sgarrison                         break;
12912911Sgarrison             case 'c':   common=  operand;
13012911Sgarrison                         break;
13112911Sgarrison             case '%':   ignore=  *argv+2;
13212911Sgarrison                         break;
13312911Sgarrison             case 'p':   INDEX=  operand;
13412911Sgarrison                         break;
13512911Sgarrison 	    case 's':	silent= 1;
13612911Sgarrison 			break;
13717250Srrh 	    case 'd':
13817250Srrh 		p = &argv[0][2];
13917250Srrh 		if (!p) {
14017250Srrh 			argv++;
14117250Srrh 			p = &argv[0][0];
14217250Srrh 		}
14317250Srrh 		strreplace(COMFILE, BMACLIB, p);
14417250Srrh 		strcpy(BMACLIB, p);
14517250Srrh 		break;
14612911Sgarrison             default:    fprintf(stderr, "unknown flag '%s'\n", *argv);
14712911Sgarrison         }
14812911Sgarrison     }
14912911Sgarrison }
15012911Sgarrison 
15112911Sgarrison 
15212911Sgarrison /*  shorten(inf,outf): file "inf" consists of lines of the form:
15312911Sgarrison         key file start length
15412911Sgarrison     sorted by key and file.  replace lines with the same key
15512911Sgarrison     with one line of the form:
15612911Sgarrison         key:file1 start/length ... start/length:file2 start/length ...
15712911Sgarrison     rename as file "outf"
15812911Sgarrison     returns number of lines in output
15912911Sgarrison */
16012911Sgarrison long shorten(inf,outf)
16112911Sgarrison char *inf, *outf;
16212911Sgarrison {   FILE *in, *out;
16312911Sgarrison     char line[maxstr];
16412911Sgarrison     char key[maxstr],  newkey[maxstr],
16512911Sgarrison          file[maxstr], newfile[maxstr];
16612911Sgarrison     long int start, length;
16712911Sgarrison     long int lines = 0;
16812911Sgarrison 
16912911Sgarrison     in=  fopen(inf, "r");
17012911Sgarrison     out= fopen(outf, "w");
17112911Sgarrison     if (in==NULL || out==NULL)
17212911Sgarrison     {   fprintf(stderr, "invert: error in opening file for compression\n");
17312911Sgarrison         return(0);
17412911Sgarrison     }
17512911Sgarrison 
17612911Sgarrison     getline(in,line);
17715177Sgarrison     sscanf(line,"%s%s%d%d", key, file, &start, &length);
17815177Sgarrison     fprintf(out, "%s :%s %d/%d", key, file, start, length);
17912911Sgarrison     for ( getline(in, line) ; !feof(in);  getline(in, line))
18015177Sgarrison     {   sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
18112911Sgarrison         if (strcmp(key,newkey)!=0)
18212911Sgarrison         {   strcpy(key, newkey);
18312911Sgarrison             strcpy(file, newfile);
18415177Sgarrison             fprintf(out, "\n%s :%s %d/%d",  key, file, start, length);
18512911Sgarrison 	    lines++;
18612911Sgarrison         }
18712911Sgarrison         else if (strcmp(file,newfile)!=0)
18812911Sgarrison         {   strcpy(file,newfile);
18915177Sgarrison             fprintf(out, ":%s %d/%d", file, start, length);
19012911Sgarrison         }
19112911Sgarrison         else
19215177Sgarrison             fprintf(out, " %d/%d", start, length);
19312911Sgarrison     }
19412911Sgarrison     fprintf(out, "\n");
19512911Sgarrison     lines++;
19612911Sgarrison 
19712911Sgarrison     fclose(in); fclose(out);
19812911Sgarrison     unlink(inf);
19912911Sgarrison     return (lines);
20012911Sgarrison }
201