xref: /netbsd-src/external/gpl2/groff/dist/src/libs/libbib/index.cpp (revision 89a07cf815a29524268025a1139fac4c5190f765)
1 /*	$NetBSD: index.cpp,v 1.1.1.1 2016/01/13 18:41:48 christos Exp $	*/
2 
3 // -*- C++ -*-
4 /* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2004
5    Free Software Foundation, Inc.
6      Written by James Clark (jjc@jclark.com)
7 
8 This file is part of groff.
9 
10 groff is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free
12 Software Foundation; either version 2, or (at your option) any later
13 version.
14 
15 groff is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
18 for more details.
19 
20 You should have received a copy of the GNU General Public License along
21 with groff; see the file COPYING.  If not, write to the Free Software
22 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
23 
24 #include "lib.h"
25 
26 #include <stdlib.h>
27 #include <errno.h>
28 
29 #include "posix.h"
30 #include "cset.h"
31 #include "cmap.h"
32 #include "errarg.h"
33 #include "error.h"
34 
35 #include "refid.h"
36 #include "search.h"
37 #include "index.h"
38 #include "defs.h"
39 
40 #include "nonposix.h"
41 
42 // Interface to mmap.
43 extern "C" {
44   void *mapread(int fd, int len);
45   int unmap(void *, int len);
46 }
47 
48 #if 0
49 const
50 #endif
51 int minus_one = -1;
52 
53 int verify_flag = 0;
54 
55 struct word_list;
56 
57 class index_search_item : public search_item {
58   search_item *out_of_date_files;
59   index_header header;
60   char *buffer;
61   void *map_addr;
62   int map_len;
63   tag *tags;
64   int *table;
65   int *lists;
66   char *pool;
67   char *key_buffer;
68   char *filename_buffer;
69   int filename_buflen;
70   char **common_words_table;
71   int common_words_table_size;
72   const char *ignore_fields;
73   time_t mtime;
74 
75   const char *do_verify();
76   const int *search1(const char **pp, const char *end);
77   const int *search(const char *ptr, int length, int **temp_listp);
78   const char *munge_filename(const char *);
79   void read_common_words_file();
80   void add_out_of_date_file(int fd, const char *filename, int fid);
81 public:
82   index_search_item(const char *, int);
83   ~index_search_item();
84   int load(int fd);
85   search_item_iterator *make_search_item_iterator(const char *);
86   int verify();
87   void check_files();
88   int next_filename_id() const;
89   friend class index_search_item_iterator;
90 };
91 
92 class index_search_item_iterator : public search_item_iterator {
93   index_search_item *indx;
94   search_item_iterator *out_of_date_files_iter;
95   search_item *next_out_of_date_file;
96   const int *found_list;
97   int *temp_list;
98   char *buf;
99   int buflen;
100   linear_searcher searcher;
101   char *query;
102   int get_tag(int tagno, const linear_searcher &, const char **, int *,
103 	      reference_id *);
104 public:
105   index_search_item_iterator(index_search_item *, const char *);
106   ~index_search_item_iterator();
107   int next(const linear_searcher &, const char **, int *, reference_id *);
108 };
109 
110 
index_search_item(const char * filename,int fid)111 index_search_item::index_search_item(const char *filename, int fid)
112 : search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0),
113   map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0),
114   common_words_table(0)
115 {
116 }
117 
~index_search_item()118 index_search_item::~index_search_item()
119 {
120   if (buffer)
121     free(buffer);
122   if (map_addr) {
123     if (unmap(map_addr, map_len) < 0)
124       error("unmap: %1", strerror(errno));
125   }
126   while (out_of_date_files) {
127     search_item *tem = out_of_date_files;
128     out_of_date_files = out_of_date_files->next;
129     delete tem;
130   }
131   a_delete filename_buffer;
132   a_delete key_buffer;
133   if (common_words_table) {
134     for (int i = 0; i < common_words_table_size; i++)
135       a_delete common_words_table[i];
136     a_delete common_words_table;
137   }
138 }
139 
140 class file_closer {
141   int *fdp;
142 public:
file_closer(int & fd)143   file_closer(int &fd) : fdp(&fd) { }
~file_closer()144   ~file_closer() { close(*fdp); }
145 };
146 
147 // Tell the compiler that a variable is intentionally unused.
unused(void *)148 inline void unused(void *) { }
149 
load(int fd)150 int index_search_item::load(int fd)
151 {
152   file_closer fd_closer(fd);	// close fd on return
153   unused(&fd_closer);
154   struct stat sb;
155   if (fstat(fd, &sb) < 0) {
156     error("can't fstat `%1': %2", name, strerror(errno));
157     return 0;
158   }
159   if (!S_ISREG(sb.st_mode)) {
160     error("`%1' is not a regular file", name);
161     return 0;
162   }
163   mtime = sb.st_mtime;
164   int size = int(sb.st_size);
165   char *addr;
166   map_addr = mapread(fd, size);
167   if (map_addr) {
168     addr = (char *)map_addr;
169     map_len = size;
170   }
171   else {
172     addr = buffer = (char *)malloc(size);
173     if (buffer == 0) {
174       error("can't allocate buffer for `%1'", name);
175       return 0;
176     }
177     char *ptr = buffer;
178     int bytes_to_read = size;
179     while (bytes_to_read > 0) {
180       int nread = read(fd, ptr, bytes_to_read);
181       if (nread == 0) {
182 	error("unexpected EOF on `%1'", name);
183 	return 0;
184       }
185       if (nread < 0) {
186 	error("read error on `%1': %2", name, strerror(errno));
187 	return 0;
188       }
189       bytes_to_read -= nread;
190       ptr += nread;
191     }
192   }
193   header = *(index_header *)addr;
194   if (header.magic != INDEX_MAGIC) {
195     error("`%1' is not an index file: wrong magic number", name);
196     return 0;
197   }
198   if (header.version != INDEX_VERSION) {
199     error("version number in `%1' is wrong: was %2, should be %3",
200 	  name, header.version, INDEX_VERSION);
201     return 0;
202   }
203   int sz = (header.tags_size * sizeof(tag)
204 	    + header.lists_size * sizeof(int)
205 	    + header.table_size * sizeof(int)
206 	    + header.strings_size
207 	    + sizeof(header));
208   if (sz != size) {
209     error("size of `%1' is wrong: was %2, should be %3",
210 	  name, size, sz);
211     return 0;
212   }
213   tags = (tag *)(addr + sizeof(header));
214   lists = (int *)(tags + header.tags_size);
215   table = (int *)(lists + header.lists_size);
216   pool = (char *)(table + header.table_size);
217   ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1;
218   key_buffer = new char[header.truncate];
219   read_common_words_file();
220   return 1;
221 }
222 
do_verify()223 const char *index_search_item::do_verify()
224 {
225   if (tags == 0)
226     return "not loaded";
227   if (lists[header.lists_size - 1] >= 0)
228     return "last list element not negative";
229   int i;
230   for (i = 0; i < header.table_size; i++) {
231     int li = table[i];
232     if (li >= header.lists_size)
233       return "bad list index";
234     if (li >= 0) {
235       for (int *ptr = lists + li; *ptr >= 0; ptr++) {
236 	if (*ptr >= header.tags_size)
237 	  return "bad tag index";
238 	if (*ptr >= ptr[1] && ptr[1] >= 0)
239 	  return "list not ordered";
240       }
241     }
242   }
243   for (i = 0; i < header.tags_size; i++) {
244     if (tags[i].filename_index >= header.strings_size)
245       return "bad index in tags";
246     if (tags[i].length < 0)
247       return "bad length in tags";
248     if (tags[i].start < 0)
249       return "bad start in tags";
250   }
251   if (pool[header.strings_size - 1] != '\0')
252     return "last character in pool not nul";
253   return 0;
254 }
255 
verify()256 int index_search_item::verify()
257 {
258   const char *reason = do_verify();
259   if (!reason)
260     return 1;
261   error("`%1' is bad: %2", name, reason);
262   return 0;
263 }
264 
next_filename_id() const265 int index_search_item::next_filename_id() const
266 {
267   return filename_id + header.strings_size + 1;
268 }
269 
make_search_item_iterator(const char * query)270 search_item_iterator *index_search_item::make_search_item_iterator(
271   const char *query)
272 {
273   return new index_search_item_iterator(this, query);
274 }
275 
make_index_search_item(const char * filename,int fid)276 search_item *make_index_search_item(const char *filename, int fid)
277 {
278   char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)];
279   strcpy(index_filename, filename);
280   strcat(index_filename, INDEX_SUFFIX);
281   int fd = open(index_filename, O_RDONLY | O_BINARY);
282   if (fd < 0)
283     return 0;
284   index_search_item *item = new index_search_item(index_filename, fid);
285   a_delete index_filename;
286   if (!item->load(fd)) {
287     close(fd);
288     delete item;
289     return 0;
290   }
291   else if (verify_flag && !item->verify()) {
292     delete item;
293     return 0;
294   }
295   else {
296     item->check_files();
297     return item;
298   }
299 }
300 
301 
index_search_item_iterator(index_search_item * ind,const char * q)302 index_search_item_iterator::index_search_item_iterator(index_search_item *ind,
303 						       const char *q)
304 : indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0),
305   buf(0), buflen(0),
306   searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate),
307   query(strsave(q))
308 {
309   found_list = indx->search(q, strlen(q), &temp_list);
310   if (!found_list) {
311     found_list = &minus_one;
312     warning("all keys would have been discarded in constructing index `%1'",
313 	    indx->name);
314   }
315 }
316 
~index_search_item_iterator()317 index_search_item_iterator::~index_search_item_iterator()
318 {
319   a_delete temp_list;
320   a_delete buf;
321   a_delete query;
322   delete out_of_date_files_iter;
323 }
324 
next(const linear_searcher &,const char ** pp,int * lenp,reference_id * ridp)325 int index_search_item_iterator::next(const linear_searcher &,
326 				     const char **pp, int *lenp,
327 				     reference_id *ridp)
328 {
329   if (found_list) {
330     for (;;) {
331       int tagno = *found_list;
332       if (tagno == -1)
333 	break;
334       found_list++;
335       if (get_tag(tagno, searcher, pp, lenp, ridp))
336 	return 1;
337     }
338     found_list = 0;
339     next_out_of_date_file = indx->out_of_date_files;
340   }
341   while (next_out_of_date_file) {
342     if (out_of_date_files_iter == 0)
343       out_of_date_files_iter
344 	= next_out_of_date_file->make_search_item_iterator(query);
345     if (out_of_date_files_iter->next(searcher, pp, lenp, ridp))
346       return 1;
347     delete out_of_date_files_iter;
348     out_of_date_files_iter = 0;
349     next_out_of_date_file = next_out_of_date_file->next;
350   }
351   return 0;
352 }
353 
get_tag(int tagno,const linear_searcher & searchr,const char ** pp,int * lenp,reference_id * ridp)354 int index_search_item_iterator::get_tag(int tagno,
355 					const linear_searcher &searchr,
356 					const char **pp, int *lenp,
357 					reference_id *ridp)
358 {
359   if (tagno < 0 || tagno >= indx->header.tags_size) {
360     error("bad tag number");
361     return 0;
362   }
363   tag *tp = indx->tags + tagno;
364   const char *filename = indx->munge_filename(indx->pool + tp->filename_index);
365   int fd = open(filename, O_RDONLY | O_BINARY);
366   if (fd < 0) {
367     error("can't open `%1': %2", filename, strerror(errno));
368     return 0;
369   }
370   struct stat sb;
371   if (fstat(fd, &sb) < 0) {
372     error("can't fstat: %1", strerror(errno));
373     close(fd);
374     return 0;
375   }
376   time_t mtime = sb.st_mtime;
377   if (mtime > indx->mtime) {
378     indx->add_out_of_date_file(fd, filename,
379 			       indx->filename_id + tp->filename_index);
380     return 0;
381   }
382   int res = 0;
383   FILE *fp = fdopen(fd, FOPEN_RB);
384   if (!fp) {
385     error("fdopen failed");
386     close(fd);
387     return 0;
388   }
389   if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0)
390     error("can't seek on `%1': %2", filename, strerror(errno));
391   else {
392     int length = tp->length;
393     int err = 0;
394     if (length == 0) {
395       if (fstat(fileno(fp), &sb) < 0) {
396 	error("can't stat `%1': %2", filename, strerror(errno));
397 	err = 1;
398       }
399       else if (!S_ISREG(sb.st_mode)) {
400 	error("`%1' is not a regular file", filename);
401 	err = 1;
402       }
403       else
404 	length = int(sb.st_size);
405     }
406     if (!err) {
407       if (length + 2 > buflen) {
408 	a_delete buf;
409 	buflen = length + 2;
410 	buf = new char[buflen];
411       }
412       if (fread(buf + 1, 1, length, fp) != (size_t)length)
413 	error("fread on `%1' failed: %2", filename, strerror(errno));
414       else {
415 	buf[0] = '\n';
416 	// Remove the CR characters from CRLF pairs.
417 	int sidx = 1, didx = 1;
418 	for ( ; sidx < length + 1; sidx++, didx++)
419 	  {
420 	    if (buf[sidx] == '\r')
421 	      {
422 		if (buf[++sidx] != '\n')
423 		  buf[didx++] = '\r';
424 		else
425 		  length--;
426 	      }
427 	    if (sidx != didx)
428 	      buf[didx] = buf[sidx];
429 	  }
430 	buf[length + 1] = '\n';
431 	res = searchr.search(buf + 1, buf + 2 + length, pp, lenp);
432 	if (res && ridp)
433 	  *ridp = reference_id(indx->filename_id + tp->filename_index,
434 			       tp->start);
435       }
436     }
437   }
438   fclose(fp);
439   return res;
440 }
441 
munge_filename(const char * filename)442 const char *index_search_item::munge_filename(const char *filename)
443 {
444   if (IS_ABSOLUTE(filename))
445     return filename;
446   const char *cwd = pool;
447   int need_slash = (cwd[0] != 0
448 		    && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0);
449   int len = strlen(cwd) + strlen(filename) + need_slash + 1;
450   if (len > filename_buflen) {
451     a_delete filename_buffer;
452     filename_buflen = len;
453     filename_buffer = new char[len];
454   }
455   strcpy(filename_buffer, cwd);
456   if (need_slash)
457     strcat(filename_buffer, "/");
458   strcat(filename_buffer, filename);
459   return filename_buffer;
460 }
461 
search1(const char ** pp,const char * end)462 const int *index_search_item::search1(const char **pp, const char *end)
463 {
464   while (*pp < end && !csalnum(**pp))
465     *pp += 1;
466   if (*pp >= end)
467     return 0;
468   const char *start = *pp;
469   while (*pp < end && csalnum(**pp))
470     *pp += 1;
471   int len = *pp - start;
472   if (len < header.shortest)
473     return 0;
474   if (len > header.truncate)
475     len = header.truncate;
476   int is_number = 1;
477   for (int i = 0; i < len; i++)
478     if (csdigit(start[i]))
479       key_buffer[i] = start[i];
480     else {
481       key_buffer[i] = cmlower(start[i]);
482       is_number = 0;
483     }
484   if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9'))
485     return 0;
486   unsigned hc = hash(key_buffer, len);
487   if (common_words_table) {
488     for (int h = hc % common_words_table_size;
489 	 common_words_table[h];
490 	 --h) {
491       if (strlen(common_words_table[h]) == (size_t)len
492 	  && memcmp(common_words_table[h], key_buffer, len) == 0)
493 	return 0;
494       if (h == 0)
495 	h = common_words_table_size;
496     }
497   }
498   int li = table[int(hc % header.table_size)];
499   return li < 0 ? &minus_one : lists + li;
500 }
501 
merge(int * result,const int * s1,const int * s2)502 static void merge(int *result, const int *s1, const int *s2)
503 {
504   for (; *s1 >= 0; s1++) {
505     while (*s2 >= 0 && *s2 < *s1)
506       s2++;
507     if (*s2 == *s1)
508       *result++ = *s2;
509   }
510   *result++ = -1;
511 }
512 
search(const char * ptr,int length,int ** temp_listp)513 const int *index_search_item::search(const char *ptr, int length,
514 				     int **temp_listp)
515 {
516   const char *end = ptr + length;
517   if (*temp_listp) {
518     a_delete *temp_listp;
519     *temp_listp = 0;
520   }
521   const int *first_list = 0;
522   while (ptr < end && (first_list = search1(&ptr, end)) == 0)
523     ;
524   if (!first_list)
525     return 0;
526   if (*first_list < 0)
527     return first_list;
528   const int *second_list = 0;
529   while (ptr < end && (second_list = search1(&ptr, end)) == 0)
530     ;
531   if (!second_list)
532     return first_list;
533   if (*second_list < 0)
534     return second_list;
535   const int *p;
536   for (p = first_list; *p >= 0; p++)
537     ;
538   int len = p - first_list;
539   for (p = second_list; *p >= 0; p++)
540     ;
541   if (p - second_list < len)
542     len = p - second_list;
543   int *matches = new int[len + 1];
544   merge(matches, first_list, second_list);
545   while (ptr < end) {
546     const int *list = search1(&ptr, end);
547     if (list != 0) {
548       if (*list < 0) {
549 	a_delete matches;
550 	return list;
551       }
552       merge(matches, matches, list);
553       if (*matches < 0) {
554 	a_delete matches;
555 	return &minus_one;
556       }
557     }
558   }
559   *temp_listp = matches;
560   return matches;
561 }
562 
read_common_words_file()563 void index_search_item::read_common_words_file()
564 {
565   if (header.common <= 0)
566     return;
567   const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
568   errno = 0;
569   FILE *fp = fopen(common_words_file, "r");
570   if (!fp) {
571     error("can't open `%1': %2", common_words_file, strerror(errno));
572     return;
573   }
574   common_words_table_size = 2*header.common + 1;
575   while (!is_prime(common_words_table_size))
576     common_words_table_size++;
577   common_words_table = new char *[common_words_table_size];
578   for (int i = 0; i < common_words_table_size; i++)
579     common_words_table[i] = 0;
580   int count = 0;
581   int key_len = 0;
582   for (;;) {
583     int c = getc(fp);
584     while (c != EOF && !csalnum(c))
585       c = getc(fp);
586     if (c == EOF)
587       break;
588     do {
589       if (key_len < header.truncate)
590 	key_buffer[key_len++] = cmlower(c);
591       c = getc(fp);
592     } while (c != EOF && csalnum(c));
593     if (key_len >= header.shortest) {
594       int h = hash(key_buffer, key_len) % common_words_table_size;
595       while (common_words_table[h]) {
596 	if (h == 0)
597 	  h = common_words_table_size;
598 	--h;
599       }
600       common_words_table[h] = new char[key_len + 1];
601       memcpy(common_words_table[h], key_buffer, key_len);
602       common_words_table[h][key_len] = '\0';
603     }
604     if (++count >= header.common)
605       break;
606     key_len = 0;
607     if (c == EOF)
608       break;
609   }
610   fclose(fp);
611 }
612 
add_out_of_date_file(int fd,const char * filename,int fid)613 void index_search_item::add_out_of_date_file(int fd, const char *filename,
614 					     int fid)
615 {
616   search_item **pp;
617   for (pp = &out_of_date_files; *pp; pp = &(*pp)->next)
618     if ((*pp)->is_named(filename))
619       return;
620   *pp = make_linear_search_item(fd, filename, fid);
621   warning("`%1' modified since `%2' created", filename, name);
622 }
623 
check_files()624 void index_search_item::check_files()
625 {
626   const char *pool_end = pool + header.strings_size;
627   for (const char *ptr = strchr(ignore_fields, '\0') + 1;
628        ptr < pool_end;
629        ptr = strchr(ptr, '\0') + 1) {
630     const char *path = munge_filename(ptr);
631     struct stat sb;
632     if (stat(path, &sb) < 0)
633       error("can't stat `%1': %2", path, strerror(errno));
634     else if (sb.st_mtime > mtime) {
635       int fd = open(path, O_RDONLY | O_BINARY);
636       if (fd < 0)
637 	error("can't open `%1': %2", path, strerror(errno));
638       else
639 	add_out_of_date_file(fd, path, filename_id + (ptr - pool));
640     }
641   }
642 }
643