xref: /netbsd-src/external/gpl2/groff/dist/src/libs/libbib/linear.cpp (revision 89a07cf815a29524268025a1139fac4c5190f765)
1 /*	$NetBSD: linear.cpp,v 1.1.1.1 2016/01/13 18:41:48 christos Exp $	*/
2 
3 // -*- C++ -*-
4 /* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001
5    Free Software Foundation, Inc.
6      Written by James Clark (jjc@jclark.com)
7 
8 This file is part of groff.
9 
10 groff is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free
12 Software Foundation; either version 2, or (at your option) any later
13 version.
14 
15 groff is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
18 for more details.
19 
20 You should have received a copy of the GNU General Public License along
21 with groff; see the file COPYING.  If not, write to the Free Software
22 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
23 
24 #include "lib.h"
25 
26 #include <stdlib.h>
27 #include <assert.h>
28 #include <errno.h>
29 
30 #include "posix.h"
31 #include "errarg.h"
32 #include "error.h"
33 #include "cset.h"
34 #include "cmap.h"
35 #include "nonposix.h"
36 
37 #include "refid.h"
38 #include "search.h"
39 
40 class file_buffer {
41   char *buffer;
42   char *bufend;
43 public:
44   file_buffer();
45   ~file_buffer();
46   int load(int fd, const char *filename);
47   const char *get_start() const;
48   const char *get_end() const;
49 };
50 
51 typedef unsigned char uchar;
52 
53 static uchar map[256];
54 static uchar inv_map[256][3];
55 
56 struct map_init {
57   map_init();
58 };
59 
60 static map_init the_map_init;
61 
map_init()62 map_init::map_init()
63 {
64   int i;
65   for (i = 0; i < 256; i++)
66     map[i] = csalnum(i) ? cmlower(i) : '\0';
67   for (i = 0; i < 256; i++) {
68     if (cslower(i)) {
69       inv_map[i][0] = i;
70       inv_map[i][1] = cmupper(i);
71       inv_map[i][2] = '\0';
72     }
73     else if (csdigit(i)) {
74       inv_map[i][0] = i;
75       inv_map[i][1] = 0;
76     }
77     else
78       inv_map[i][0] = '\0';
79   }
80 }
81 
82 
83 class bmpattern {
84   char *pat;
85   int len;
86   int delta[256];
87 public:
88   bmpattern(const char *pattern, int pattern_length);
89   ~bmpattern();
90   const char *search(const char *p, const char *end) const;
91   int length() const;
92 };
93 
bmpattern(const char * pattern,int pattern_length)94 bmpattern::bmpattern(const char *pattern, int pattern_length)
95 : len(pattern_length)
96 {
97   pat = new char[len];
98   int i;
99   for (i = 0; i < len; i++)
100     pat[i] = map[uchar(pattern[i])];
101   for (i = 0; i < 256; i++)
102     delta[i] = len;
103   for (i = 0; i < len; i++)
104     for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
105       delta[*inv] = len - i - 1;
106 }
107 
search(const char * buf,const char * end) const108 const char *bmpattern::search(const char *buf, const char *end) const
109 {
110   int buflen = end - buf;
111   if (len > buflen)
112     return 0;
113   const char *strend;
114   if (buflen > len*4)
115     strend = end - len*4;
116   else
117     strend = buf;
118   const char *k = buf + len - 1;
119   const int *del = delta;
120   const char *pattern = pat;
121   for (;;) {
122     while (k < strend) {
123       int t = del[uchar(*k)];
124       if (!t)
125 	break;
126       k += t;
127       k += del[uchar(*k)];
128       k += del[uchar(*k)];
129     }
130     while (k < end && del[uchar(*k)] != 0)
131       k++;
132     if (k == end)
133       break;
134     int j = len - 1;
135     const char *s = k;
136     for (;;) {
137       if (j == 0)
138 	return s;
139       if (map[uchar(*--s)] != uchar(pattern[--j]))
140 	break;
141     }
142     k++;
143   }
144   return 0;
145 }
146 
~bmpattern()147 bmpattern::~bmpattern()
148 {
149   a_delete pat;
150 }
151 
length() const152 inline int bmpattern::length() const
153 {
154   return len;
155 }
156 
157 
158 static const char *find_end(const char *bufend, const char *p);
159 
search_and_check(const bmpattern * key,const char * buf,const char * bufend,const char ** start) const160 const char *linear_searcher::search_and_check(const bmpattern *key,
161   const char *buf, const char *bufend, const char **start) const
162 {
163   assert(buf[-1] == '\n');
164   assert(bufend[-1] == '\n');
165   const char *ptr = buf;
166   for (;;) {
167     const char *found = key->search(ptr, bufend);
168     if (!found)
169       break;
170     if (check_match(buf, bufend, found, key->length(), &ptr, start))
171       return found;
172   }
173   return 0;
174 }
175 
skip_field(const char * end,const char * p)176 static const char *skip_field(const char *end, const char *p)
177 {
178   for (;;)
179     if (*p++ == '\n') {
180       if (p == end || *p == '%')
181 	break;
182       const char *q;
183       for (q = p; *q == ' ' || *q == '\t'; q++)
184 	;
185       if (*q == '\n')
186 	break;
187       p = q + 1;
188     }
189   return p;
190 }
191 
find_end(const char * bufend,const char * p)192 static const char *find_end(const char *bufend, const char *p)
193 {
194   for (;;)
195     if (*p++ == '\n') {
196       if (p == bufend)
197 	break;
198       const char *q;
199       for (q = p; *q == ' ' || *q == '\t'; q++)
200 	;
201       if (*q == '\n')
202 	break;
203       p = q + 1;
204     }
205   return p;
206 }
207 
208 
check_match(const char * buf,const char * bufend,const char * match,int matchlen,const char ** cont,const char ** start) const209 int linear_searcher::check_match(const char *buf, const char *bufend,
210 				 const char *match, int matchlen,
211 				 const char **cont, const char **start) const
212 {
213   *cont = match + 1;
214   // The user is required to supply only the first truncate_len characters
215   // of the key.  If truncate_len <= 0, he must supply all the key.
216   if ((truncate_len <= 0 || matchlen < truncate_len)
217       && map[uchar(match[matchlen])] != '\0')
218     return 0;
219 
220   // The character before the match must not be an alphanumeric
221   // character (unless the alphanumeric character follows one or two
222   // percent characters at the beginning of the line), nor must it be
223   // a percent character at the beginning of a line, nor a percent
224   // character following a percent character at the beginning of a
225   // line.
226 
227   switch (match - buf) {
228   case 0:
229     break;
230   case 1:
231     if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
232       return 0;
233     break;
234   case 2:
235     if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
236       return 0;
237     if (match[-1] == '%'
238 	&& (match[-2] == '\n' || match[-2] == '%'))
239       return 0;
240     break;
241   default:
242     if (map[uchar(match[-1])] != '\0'
243 	&& !(match[-2] == '%'
244 	     && (match[-3] == '\n'
245 		 || (match[-3] == '%' && match[-4] == '\n'))))
246       return 0;
247     if (match[-1] == '%'
248 	&& (match[-2] == '\n'
249 	    || (match[-2] == '%' && match[-3] == '\n')))
250       return 0;
251   }
252 
253   const char *p = match;
254   int had_percent = 0;
255   for (;;) {
256     if (*p == '\n') {
257       if (!had_percent && p[1] == '%') {
258 	if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
259 	  *cont = skip_field(bufend, match + matchlen);
260 	  return 0;
261 	}
262 	if (!start)
263 	  break;
264 	had_percent = 1;
265       }
266       if (p <= buf) {
267 	if (start)
268 	  *start = p + 1;
269 	return 1;
270       }
271       const char *q;
272       for (q = p - 1; *q == ' ' || *q == '\t'; q--)
273 	;
274       if (*q == '\n') {
275 	if (start)
276 	  *start = p + 1;
277 	break;
278       }
279       p = q;
280     }
281     p--;
282   }
283   return 1;
284 }
285 
file_buffer()286 file_buffer::file_buffer()
287 : buffer(0), bufend(0)
288 {
289 }
290 
~file_buffer()291 file_buffer::~file_buffer()
292 {
293   a_delete buffer;
294 }
295 
get_start() const296 const char *file_buffer::get_start() const
297 {
298   return buffer ? buffer + 4 : 0;
299 }
300 
get_end() const301 const char *file_buffer::get_end() const
302 {
303   return bufend;
304 }
305 
load(int fd,const char * filename)306 int file_buffer::load(int fd, const char *filename)
307 {
308   struct stat sb;
309   if (fstat(fd, &sb) < 0)
310     error("can't fstat `%1': %2", filename, strerror(errno));
311   else if (!S_ISREG(sb.st_mode))
312     error("`%1' is not a regular file", filename);
313   else {
314     // We need one character extra at the beginning for an additional newline
315     // used as a sentinel.  We get 4 instead so that the read buffer will be
316     // word-aligned.  This seems to make the read slightly faster.  We also
317     // need one character at the end also for an additional newline used as a
318     // sentinel.
319     int size = int(sb.st_size);
320     buffer = new char[size + 4 + 1];
321     int nread = read(fd, buffer + 4, size);
322     if (nread < 0)
323       error("error reading `%1': %2", filename, strerror(errno));
324     else if (nread != size)
325       error("size of `%1' decreased", filename);
326     else {
327       char c;
328       nread = read(fd, &c, 1);
329       if (nread != 0)
330 	error("size of `%1' increased", filename);
331       else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
332 	error("database `%1' is a binary file", filename);
333       else {
334 	close(fd);
335 	buffer[3] = '\n';
336 	int sidx = 4, didx = 4;
337 	for ( ; sidx < size + 4; sidx++, didx++)
338 	  {
339 	    if (buffer[sidx] == '\r')
340 	      {
341 		if (buffer[++sidx] != '\n')
342 		  buffer[didx++] = '\r';
343 		else
344 		  size--;
345 	      }
346 	    if (sidx != didx)
347 	      buffer[didx] = buffer[sidx];
348 	  }
349 	bufend = buffer + 4 + size;
350 	if (bufend[-1] != '\n')
351 	  *bufend++ = '\n';
352 	return 1;
353       }
354     }
355     a_delete buffer;
356     buffer = 0;
357   }
358   close(fd);
359   return 0;
360 }
361 
linear_searcher(const char * query,int query_len,const char * ign,int trunc)362 linear_searcher::linear_searcher(const char *query, int query_len,
363 				 const char *ign, int trunc)
364 : ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
365 {
366   const char *query_end = query + query_len;
367   int nk = 0;
368   const char *p;
369   for (p = query; p < query_end; p++)
370     if (map[uchar(*p)] != '\0'
371 	&& (p[1] == '\0' || map[uchar(p[1])] == '\0'))
372       nk++;
373   if (nk == 0)
374     return;
375   keys = new bmpattern*[nk];
376   p = query;
377   for (;;) {
378     while (p < query_end && map[uchar(*p)] == '\0')
379       p++;
380     if (p == query_end)
381       break;
382     const char *start = p;
383     while (p < query_end && map[uchar(*p)] != '\0')
384       p++;
385     keys[nkeys++] = new bmpattern(start, p - start);
386   }
387   assert(nkeys <= nk);
388   if (nkeys == 0) {
389     a_delete keys;
390     keys = 0;
391   }
392 }
393 
~linear_searcher()394 linear_searcher::~linear_searcher()
395 {
396   for (int i = 0; i < nkeys; i++)
397     delete keys[i];
398   a_delete keys;
399 }
400 
search(const char * buffer,const char * bufend,const char ** startp,int * lengthp) const401 int linear_searcher::search(const char *buffer, const char *bufend,
402 			    const char **startp, int *lengthp) const
403 {
404   assert(bufend - buffer > 0);
405   assert(buffer[-1] == '\n');
406   assert(bufend[-1] == '\n');
407   if (nkeys == 0)
408     return 0;
409   for (;;) {
410     const char *refstart;
411     const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
412     if (!found)
413       break;
414     const char *refend = find_end(bufend, found + keys[0]->length());
415     int i;
416     for (i = 1; i < nkeys; i++)
417       if (!search_and_check(keys[i], refstart, refend))
418 	break;
419     if (i >= nkeys) {
420       *startp = refstart;
421       *lengthp = refend - refstart;
422       return 1;
423     }
424     buffer = refend;
425   }
426   return 0;
427 }
428 
429 class linear_search_item : public search_item {
430   file_buffer fbuf;
431 public:
432   linear_search_item(const char *filename, int fid);
433   ~linear_search_item();
434   int load(int fd);
435   search_item_iterator *make_search_item_iterator(const char *);
436   friend class linear_search_item_iterator;
437 };
438 
439 class linear_search_item_iterator : public search_item_iterator {
440   linear_search_item *lsi;
441   int pos;
442 public:
443   linear_search_item_iterator(linear_search_item *, const char *query);
444   ~linear_search_item_iterator();
445   int next(const linear_searcher &, const char **ptr, int *lenp,
446 	   reference_id *ridp);
447 };
448 
make_linear_search_item(int fd,const char * filename,int fid)449 search_item *make_linear_search_item(int fd, const char *filename, int fid)
450 {
451   linear_search_item *item = new linear_search_item(filename, fid);
452   if (!item->load(fd)) {
453     delete item;
454     return 0;
455   }
456   else
457     return item;
458 }
459 
linear_search_item(const char * filename,int fid)460 linear_search_item::linear_search_item(const char *filename, int fid)
461 : search_item(filename, fid)
462 {
463 }
464 
~linear_search_item()465 linear_search_item::~linear_search_item()
466 {
467 }
468 
load(int fd)469 int linear_search_item::load(int fd)
470 {
471   return fbuf.load(fd, name);
472 }
473 
make_search_item_iterator(const char * query)474 search_item_iterator *linear_search_item::make_search_item_iterator(
475   const char *query)
476 {
477   return new linear_search_item_iterator(this, query);
478 }
479 
linear_search_item_iterator(linear_search_item * p,const char *)480 linear_search_item_iterator::linear_search_item_iterator(
481   linear_search_item *p, const char *)
482 : lsi(p), pos(0)
483 {
484 }
485 
~linear_search_item_iterator()486 linear_search_item_iterator::~linear_search_item_iterator()
487 {
488 }
489 
next(const linear_searcher & searcher,const char ** startp,int * lengthp,reference_id * ridp)490 int linear_search_item_iterator::next(const linear_searcher &searcher,
491 				      const char **startp, int *lengthp,
492 				      reference_id *ridp)
493 {
494   const char *bufstart = lsi->fbuf.get_start();
495   const char *bufend = lsi->fbuf.get_end();
496   const char *ptr = bufstart + pos;
497   if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
498     pos = *startp + *lengthp - bufstart;
499     if (ridp)
500       *ridp = reference_id(lsi->filename_id, *startp - bufstart);
501     return 1;
502   }
503   else
504     return 0;
505 }
506