xref: /csrg-svn/contrib/bib/src/invert.c (revision 60507)
/* SCCS identification string; compiled out when running lint. */
#ifndef lint
static char sccsid[] = "@(#)invert.c	2.7	05/27/93";
#endif /* not lint */
415177Sgarrison #
512911Sgarrison /*  input:  records of lines, separated by blank lines
612911Sgarrison     output: key:file1 start/length ... start/length:file2 start/length ...
712911Sgarrison */
812911Sgarrison 
912911Sgarrison # include "stdio.h"
1012911Sgarrison # include "streams.h"
1112911Sgarrison # include "bib.h"
/*  makelow(c): ASCII upper-case letter -> lower case, anything else unchanged.
    The argument is evaluated more than once, so it must not have side
    effects that matter when repeated (e.g. c++).                            */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
1312911Sgarrison 
1412911Sgarrison int     max_kcnt = 100;     /*  max number of keys                      */
1512911Sgarrison int     max_klen =   6;     /*  max length of keys                      */
1612911Sgarrison char    *ignore =           /*  string of line starts to ignore         */
1712911Sgarrison             "CNOPVX";
1812911Sgarrison char    *INDEX=             /*  name of output file                     */
1912911Sgarrison             INDXFILE;
2012911Sgarrison 
2130591Sgarrison char    *bibtmpfile =          /*  name of temporary file                  */
2212911Sgarrison             INVTEMPFILE;
2312911Sgarrison 
2412911Sgarrison int	silent = 0;	    /*  0 => statistics printed			*/
2512911Sgarrison 			    /*  1 => no statisitics printed		*/
2612911Sgarrison 
2712911Sgarrison char *sort_it =
2812911Sgarrison         "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
2912911Sgarrison char sortcmd[maxstr];
3012911Sgarrison 
3112911Sgarrison int     argc;
3212911Sgarrison char    **argv;
3312911Sgarrison 
main(argcount,arglist)3412911Sgarrison main(argcount,arglist)
3512911Sgarrison int argcount;
3612911Sgarrison char **arglist;
3712911Sgarrison {   char            *filename;
3812911Sgarrison     FILE            *input, *output;
3912911Sgarrison     long int        start,length;
4012911Sgarrison     char            word[maxstr];
4112911Sgarrison     int             kcnt;
4212911Sgarrison     char            tag_line[maxstr];
43*60507Sbostic     int 	    bol = 1; /* at beginning of line */
4412911Sgarrison 
4512911Sgarrison     long int	    records = 0;  /*  number of records read           */
4612911Sgarrison     long int	    keys    = 0;  /*  number of keys read (occurences) */
4712911Sgarrison     long int	    distinct;     /*  number of distinct keys          */
4812911Sgarrison     long int	    shorten();
4912911Sgarrison 
50*60507Sbostic     InitDirectory(BMACLIB,N_BMACLIB);
51*60507Sbostic     InitDirectory(COMFILE,N_COMFILE);
5217250Srrh 
5312911Sgarrison     argc= argcount-1;
5412911Sgarrison     argv= arglist+1;
5530591Sgarrison     mktemp(bibtmpfile);
5630591Sgarrison     output= fopen(bibtmpfile,"w");
5712911Sgarrison 
5812911Sgarrison     for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
5912911Sgarrison     {   /* open input file              */
6012911Sgarrison             filename=   *argv;
6112911Sgarrison             input=      fopen(filename,"r");
6212911Sgarrison             if (input==NULL)
63*60507Sbostic             {   fprintf(stderr,"invert: error in open of %s\n", filename);
6412911Sgarrison                 continue;
6512911Sgarrison             }
66*60507Sbostic       start=      0L;
67*60507Sbostic       length=     0L;
6812911Sgarrison 
69*60507Sbostic       for(;;) /* each record  */ {
70*60507Sbostic 	 /* find start of next record (exit if none)     */
71*60507Sbostic 	 start= nextrecord(input,start+length);
72*60507Sbostic 	 if (start==EOF)   break;
73*60507Sbostic 	 records++;
74*60507Sbostic 	 kcnt= 0;
75*60507Sbostic 	 length= recsize(input,start);
76*60507Sbostic 	 sprintf(tag_line, " %s %d %d\n", filename, start, length);
7712911Sgarrison 
78*60507Sbostic 	 while (ftell(input) < start+length && kcnt < max_kcnt) {
79*60507Sbostic 	    getword(input,word,ignore,&bol);
80*60507Sbostic 	    makekey(word,max_klen,COMFILE);
81*60507Sbostic 	    if (*word != NULL) {
82*60507Sbostic 	       fputs(word,output); fputs(tag_line,output);
83*60507Sbostic 	       kcnt++; keys++;
84*60507Sbostic 	       }
85*60507Sbostic 	    }
86*60507Sbostic 	 }
87*60507Sbostic        fclose(input);
88*60507Sbostic        }
8912911Sgarrison     fclose(output);
9012911Sgarrison 
9130591Sgarrison     sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
9212911Sgarrison     system(sortcmd);
9312911Sgarrison 
9430591Sgarrison     distinct = shorten(bibtmpfile,INDEX);
9512911Sgarrison     if( silent == 0 )
9612911Sgarrison 	fprintf(stderr,
9715177Sgarrison 	    "%d documents   %d distinct keys  %d key occurrences\n",
9812911Sgarrison 	    records, distinct, keys);
9917301Sralph     exit(0);
10012911Sgarrison }
10112911Sgarrison 
10212911Sgarrison 
10312911Sgarrison 
10412911Sgarrison /*  Flag    Meaning                             Default
10512911Sgarrison     -ki     Keys per record                     100
10612911Sgarrison     -li     max Length of keys                  6
10712911Sgarrison     -%str   ignore lines that begin with %x     CNOPVX
10812911Sgarrison             where x is in str
10912911Sgarrison             str is a seq of chars
11024674Smckusick     -cfile  file contains Common words          /usr/new/lib/bib/common
11112911Sgarrison             do not use common words as keys
11212911Sgarrison     -pfile  name of output file                 INDEX
11312911Sgarrison     -s	    do not print statistics		statistics printed
11412911Sgarrison */
11512911Sgarrison 
11612911Sgarrison # define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
11712911Sgarrison 
flags()11812911Sgarrison flags()
11917250Srrh {
12017250Srrh     char *p;
12117250Srrh     for (; argc>0 && *argv[0]=='-';  argc--,argv++)
12212911Sgarrison     {   switch ((*argv)[1])
12312911Sgarrison         {   case 'k':   max_kcnt= atoi(operand);
12412911Sgarrison                         break;
12512911Sgarrison             case 'l':   max_klen= atoi(operand);
12612911Sgarrison                         break;
127*60507Sbostic             case 'c':   strcpy(COMFILE,operand);
12812911Sgarrison                         break;
12912911Sgarrison             case '%':   ignore=  *argv+2;
13012911Sgarrison                         break;
13112911Sgarrison             case 'p':   INDEX=  operand;
13212911Sgarrison                         break;
13312911Sgarrison 	    case 's':	silent= 1;
13412911Sgarrison 			break;
13517250Srrh 	    case 'd':
13617250Srrh 		p = &argv[0][2];
13717250Srrh 		if (!p) {
13817250Srrh 			argv++;
13917250Srrh 			p = &argv[0][0];
14017250Srrh 		}
14117250Srrh 		strreplace(COMFILE, BMACLIB, p);
14217250Srrh 		strcpy(BMACLIB, p);
14317250Srrh 		break;
144*60507Sbostic             default:    fprintf(stderr,"unknown flag '%s'\n", *argv);
14512911Sgarrison         }
14612911Sgarrison     }
14712911Sgarrison }
14812911Sgarrison 
14912911Sgarrison 
15012911Sgarrison /*  shorten(inf,outf): file "inf" consists of lines of the form:
15112911Sgarrison         key file start length
15212911Sgarrison     sorted by key and file.  replace lines with the same key
15312911Sgarrison     with one line of the form:
15412911Sgarrison         key:file1 start/length ... start/length:file2 start/length ...
15512911Sgarrison     rename as file "outf"
15612911Sgarrison     returns number of lines in output
15712911Sgarrison */
shorten(inf,outf)15812911Sgarrison long shorten(inf,outf)
15912911Sgarrison char *inf, *outf;
16012911Sgarrison {   FILE *in, *out;
16112911Sgarrison     char line[maxstr];
16212911Sgarrison     char key[maxstr],  newkey[maxstr],
16312911Sgarrison          file[maxstr], newfile[maxstr];
16412911Sgarrison     long int start, length;
16512911Sgarrison     long int lines = 0;
16612911Sgarrison 
16712911Sgarrison     in=  fopen(inf, "r");
16812911Sgarrison     out= fopen(outf, "w");
16912911Sgarrison     if (in==NULL || out==NULL)
170*60507Sbostic     {   fprintf(stderr,"invert: error in opening file for compression\n");
17112911Sgarrison         return(0);
17212911Sgarrison     }
17312911Sgarrison 
17412911Sgarrison     getline(in,line);
17515177Sgarrison     sscanf(line,"%s%s%d%d", key, file, &start, &length);
17615177Sgarrison     fprintf(out, "%s :%s %d/%d", key, file, start, length);
17712911Sgarrison     for ( getline(in, line) ; !feof(in);  getline(in, line))
17815177Sgarrison     {   sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
17912911Sgarrison         if (strcmp(key,newkey)!=0)
18012911Sgarrison         {   strcpy(key, newkey);
18112911Sgarrison             strcpy(file, newfile);
18215177Sgarrison             fprintf(out, "\n%s :%s %d/%d",  key, file, start, length);
18312911Sgarrison 	    lines++;
18412911Sgarrison         }
18512911Sgarrison         else if (strcmp(file,newfile)!=0)
18612911Sgarrison         {   strcpy(file,newfile);
18715177Sgarrison             fprintf(out, ":%s %d/%d", file, start, length);
18812911Sgarrison         }
18912911Sgarrison         else
19015177Sgarrison             fprintf(out, " %d/%d", start, length);
19112911Sgarrison     }
19212911Sgarrison     fprintf(out, "\n");
19312911Sgarrison     lines++;
19412911Sgarrison 
19512911Sgarrison     fclose(in); fclose(out);
19612911Sgarrison     unlink(inf);
19712911Sgarrison     return (lines);
19812911Sgarrison }
199