1 /* $NetBSD: linear.cpp,v 1.1.1.1 2016/01/13 18:41:48 christos Exp $ */ 2 3 // -*- C++ -*- 4 /* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001 5 Free Software Foundation, Inc. 6 Written by James Clark (jjc@jclark.com) 7 8 This file is part of groff. 9 10 groff is free software; you can redistribute it and/or modify it under 11 the terms of the GNU General Public License as published by the Free 12 Software Foundation; either version 2, or (at your option) any later 13 version. 14 15 groff is distributed in the hope that it will be useful, but WITHOUT ANY 16 WARRANTY; without even the implied warranty of MERCHANTABILITY or 17 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 18 for more details. 19 20 You should have received a copy of the GNU General Public License along 21 with groff; see the file COPYING. If not, write to the Free Software 22 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 23 24 #include "lib.h" 25 26 #include <stdlib.h> 27 #include <assert.h> 28 #include <errno.h> 29 30 #include "posix.h" 31 #include "errarg.h" 32 #include "error.h" 33 #include "cset.h" 34 #include "cmap.h" 35 #include "nonposix.h" 36 37 #include "refid.h" 38 #include "search.h" 39 40 class file_buffer { 41 char *buffer; 42 char *bufend; 43 public: 44 file_buffer(); 45 ~file_buffer(); 46 int load(int fd, const char *filename); 47 const char *get_start() const; 48 const char *get_end() const; 49 }; 50 51 typedef unsigned char uchar; 52 53 static uchar map[256]; 54 static uchar inv_map[256][3]; 55 56 struct map_init { 57 map_init(); 58 }; 59 60 static map_init the_map_init; 61 62 map_init::map_init() 63 { 64 int i; 65 for (i = 0; i < 256; i++) 66 map[i] = csalnum(i) ? cmlower(i) : '\0'; 67 for (i = 0; i < 256; i++) { 68 if (cslower(i)) { 69 inv_map[i][0] = i; 70 inv_map[i][1] = cmupper(i); 71 inv_map[i][2] = '\0'; 72 } 73 else if (csdigit(i)) { 74 inv_map[i][0] = i; 75 inv_map[i][1] = 0; 76 } 77 else 78 inv_map[i][0] = '\0'; 79 } 80 } 81 82 83 class bmpattern { 84 char *pat; 85 int len; 86 int delta[256]; 87 public: 88 bmpattern(const char *pattern, int pattern_length); 89 ~bmpattern(); 90 const char *search(const char *p, const char *end) const; 91 int length() const; 92 }; 93 94 bmpattern::bmpattern(const char *pattern, int pattern_length) 95 : len(pattern_length) 96 { 97 pat = new char[len]; 98 int i; 99 for (i = 0; i < len; i++) 100 pat[i] = map[uchar(pattern[i])]; 101 for (i = 0; i < 256; i++) 102 delta[i] = len; 103 for (i = 0; i < len; i++) 104 for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++) 105 delta[*inv] = len - i - 1; 106 } 107 108 const char *bmpattern::search(const char *buf, const char *end) const 109 { 110 int buflen = end - buf; 111 if (len > buflen) 112 return 0; 113 const char *strend; 114 if (buflen > len*4) 115 strend = end - len*4; 116 else 117 strend = buf; 118 const char *k = buf + len - 1; 119 const int *del = delta; 120 const char *pattern = pat; 121 for (;;) { 122 while (k < strend) { 123 int t = del[uchar(*k)]; 124 if (!t) 125 break; 126 k += t; 127 k += del[uchar(*k)]; 128 k += del[uchar(*k)]; 129 } 130 while (k < end && del[uchar(*k)] != 0) 131 k++; 132 if (k == end) 133 break; 134 int j = len - 1; 135 const char *s = k; 136 for (;;) { 137 if (j == 0) 138 return s; 139 if (map[uchar(*--s)] != uchar(pattern[--j])) 140 break; 141 } 142 k++; 143 } 144 return 0; 145 } 146 147 bmpattern::~bmpattern() 148 { 149 a_delete pat; 150 } 151 152 inline int bmpattern::length() const 153 { 154 return len; 155 } 156 157 158 static const char *find_end(const char *bufend, const char *p); 159 160 const char *linear_searcher::search_and_check(const bmpattern *key, 161 const char *buf, const char *bufend, const char **start) const 162 { 163 assert(buf[-1] == '\n'); 164 assert(bufend[-1] == '\n'); 165 const char *ptr = buf; 166 for (;;) { 167 const char *found = key->search(ptr, bufend); 168 if (!found) 169 break; 170 if (check_match(buf, bufend, found, key->length(), &ptr, start)) 171 return found; 172 } 173 return 0; 174 } 175 176 static const char *skip_field(const char *end, const char *p) 177 { 178 for (;;) 179 if (*p++ == '\n') { 180 if (p == end || *p == '%') 181 break; 182 const char *q; 183 for (q = p; *q == ' ' || *q == '\t'; q++) 184 ; 185 if (*q == '\n') 186 break; 187 p = q + 1; 188 } 189 return p; 190 } 191 192 static const char *find_end(const char *bufend, const char *p) 193 { 194 for (;;) 195 if (*p++ == '\n') { 196 if (p == bufend) 197 break; 198 const char *q; 199 for (q = p; *q == ' ' || *q == '\t'; q++) 200 ; 201 if (*q == '\n') 202 break; 203 p = q + 1; 204 } 205 return p; 206 } 207 208 209 int linear_searcher::check_match(const char *buf, const char *bufend, 210 const char *match, int matchlen, 211 const char **cont, const char **start) const 212 { 213 *cont = match + 1; 214 // The user is required to supply only the first truncate_len characters 215 // of the key. If truncate_len <= 0, he must supply all the key. 216 if ((truncate_len <= 0 || matchlen < truncate_len) 217 && map[uchar(match[matchlen])] != '\0') 218 return 0; 219 220 // The character before the match must not be an alphanumeric 221 // character (unless the alphanumeric character follows one or two 222 // percent characters at the beginning of the line), nor must it be 223 // a percent character at the beginning of a line, nor a percent 224 // character following a percent character at the beginning of a 225 // line. 226 227 switch (match - buf) { 228 case 0: 229 break; 230 case 1: 231 if (match[-1] == '%' || map[uchar(match[-1])] != '\0') 232 return 0; 233 break; 234 case 2: 235 if (map[uchar(match[-1])] != '\0' && match[-2] != '%') 236 return 0; 237 if (match[-1] == '%' 238 && (match[-2] == '\n' || match[-2] == '%')) 239 return 0; 240 break; 241 default: 242 if (map[uchar(match[-1])] != '\0' 243 && !(match[-2] == '%' 244 && (match[-3] == '\n' 245 || (match[-3] == '%' && match[-4] == '\n')))) 246 return 0; 247 if (match[-1] == '%' 248 && (match[-2] == '\n' 249 || (match[-2] == '%' && match[-3] == '\n'))) 250 return 0; 251 } 252 253 const char *p = match; 254 int had_percent = 0; 255 for (;;) { 256 if (*p == '\n') { 257 if (!had_percent && p[1] == '%') { 258 if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) { 259 *cont = skip_field(bufend, match + matchlen); 260 return 0; 261 } 262 if (!start) 263 break; 264 had_percent = 1; 265 } 266 if (p <= buf) { 267 if (start) 268 *start = p + 1; 269 return 1; 270 } 271 const char *q; 272 for (q = p - 1; *q == ' ' || *q == '\t'; q--) 273 ; 274 if (*q == '\n') { 275 if (start) 276 *start = p + 1; 277 break; 278 } 279 p = q; 280 } 281 p--; 282 } 283 return 1; 284 } 285 286 file_buffer::file_buffer() 287 : buffer(0), bufend(0) 288 { 289 } 290 291 file_buffer::~file_buffer() 292 { 293 a_delete buffer; 294 } 295 296 const char *file_buffer::get_start() const 297 { 298 return buffer ? buffer + 4 : 0; 299 } 300 301 const char *file_buffer::get_end() const 302 { 303 return bufend; 304 } 305 306 int file_buffer::load(int fd, const char *filename) 307 { 308 struct stat sb; 309 if (fstat(fd, &sb) < 0) 310 error("can't fstat `%1': %2", filename, strerror(errno)); 311 else if (!S_ISREG(sb.st_mode)) 312 error("`%1' is not a regular file", filename); 313 else { 314 // We need one character extra at the beginning for an additional newline 315 // used as a sentinel. We get 4 instead so that the read buffer will be 316 // word-aligned. This seems to make the read slightly faster. We also 317 // need one character at the end also for an additional newline used as a 318 // sentinel. 319 int size = int(sb.st_size); 320 buffer = new char[size + 4 + 1]; 321 int nread = read(fd, buffer + 4, size); 322 if (nread < 0) 323 error("error reading `%1': %2", filename, strerror(errno)); 324 else if (nread != size) 325 error("size of `%1' decreased", filename); 326 else { 327 char c; 328 nread = read(fd, &c, 1); 329 if (nread != 0) 330 error("size of `%1' increased", filename); 331 else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0) 332 error("database `%1' is a binary file", filename); 333 else { 334 close(fd); 335 buffer[3] = '\n'; 336 int sidx = 4, didx = 4; 337 for ( ; sidx < size + 4; sidx++, didx++) 338 { 339 if (buffer[sidx] == '\r') 340 { 341 if (buffer[++sidx] != '\n') 342 buffer[didx++] = '\r'; 343 else 344 size--; 345 } 346 if (sidx != didx) 347 buffer[didx] = buffer[sidx]; 348 } 349 bufend = buffer + 4 + size; 350 if (bufend[-1] != '\n') 351 *bufend++ = '\n'; 352 return 1; 353 } 354 } 355 a_delete buffer; 356 buffer = 0; 357 } 358 close(fd); 359 return 0; 360 } 361 362 linear_searcher::linear_searcher(const char *query, int query_len, 363 const char *ign, int trunc) 364 : ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0) 365 { 366 const char *query_end = query + query_len; 367 int nk = 0; 368 const char *p; 369 for (p = query; p < query_end; p++) 370 if (map[uchar(*p)] != '\0' 371 && (p[1] == '\0' || map[uchar(p[1])] == '\0')) 372 nk++; 373 if (nk == 0) 374 return; 375 keys = new bmpattern*[nk]; 376 p = query; 377 for (;;) { 378 while (p < query_end && map[uchar(*p)] == '\0') 379 p++; 380 if (p == query_end) 381 break; 382 const char *start = p; 383 while (p < query_end && map[uchar(*p)] != '\0') 384 p++; 385 keys[nkeys++] = new bmpattern(start, p - start); 386 } 387 assert(nkeys <= nk); 388 if (nkeys == 0) { 389 a_delete keys; 390 keys = 0; 391 } 392 } 393 394 linear_searcher::~linear_searcher() 395 { 396 for (int i = 0; i < nkeys; i++) 397 delete keys[i]; 398 a_delete keys; 399 } 400 401 int linear_searcher::search(const char *buffer, const char *bufend, 402 const char **startp, int *lengthp) const 403 { 404 assert(bufend - buffer > 0); 405 assert(buffer[-1] == '\n'); 406 assert(bufend[-1] == '\n'); 407 if (nkeys == 0) 408 return 0; 409 for (;;) { 410 const char *refstart; 411 const char *found = search_and_check(keys[0], buffer, bufend, &refstart); 412 if (!found) 413 break; 414 const char *refend = find_end(bufend, found + keys[0]->length()); 415 int i; 416 for (i = 1; i < nkeys; i++) 417 if (!search_and_check(keys[i], refstart, refend)) 418 break; 419 if (i >= nkeys) { 420 *startp = refstart; 421 *lengthp = refend - refstart; 422 return 1; 423 } 424 buffer = refend; 425 } 426 return 0; 427 } 428 429 class linear_search_item : public search_item { 430 file_buffer fbuf; 431 public: 432 linear_search_item(const char *filename, int fid); 433 ~linear_search_item(); 434 int load(int fd); 435 search_item_iterator *make_search_item_iterator(const char *); 436 friend class linear_search_item_iterator; 437 }; 438 439 class linear_search_item_iterator : public search_item_iterator { 440 linear_search_item *lsi; 441 int pos; 442 public: 443 linear_search_item_iterator(linear_search_item *, const char *query); 444 ~linear_search_item_iterator(); 445 int next(const linear_searcher &, const char **ptr, int *lenp, 446 reference_id *ridp); 447 }; 448 449 search_item *make_linear_search_item(int fd, const char *filename, int fid) 450 { 451 linear_search_item *item = new linear_search_item(filename, fid); 452 if (!item->load(fd)) { 453 delete item; 454 return 0; 455 } 456 else 457 return item; 458 } 459 460 linear_search_item::linear_search_item(const char *filename, int fid) 461 : search_item(filename, fid) 462 { 463 } 464 465 linear_search_item::~linear_search_item() 466 { 467 } 468 469 int linear_search_item::load(int fd) 470 { 471 return fbuf.load(fd, name); 472 } 473 474 search_item_iterator *linear_search_item::make_search_item_iterator( 475 const char *query) 476 { 477 return new linear_search_item_iterator(this, query); 478 } 479 480 linear_search_item_iterator::linear_search_item_iterator( 481 linear_search_item *p, const char *) 482 : lsi(p), pos(0) 483 { 484 } 485 486 linear_search_item_iterator::~linear_search_item_iterator() 487 { 488 } 489 490 int linear_search_item_iterator::next(const linear_searcher &searcher, 491 const char **startp, int *lengthp, 492 reference_id *ridp) 493 { 494 const char *bufstart = lsi->fbuf.get_start(); 495 const char *bufend = lsi->fbuf.get_end(); 496 const char *ptr = bufstart + pos; 497 if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) { 498 pos = *startp + *lengthp - bufstart; 499 if (ridp) 500 *ridp = reference_id(lsi->filename_id, *startp - bufstart); 501 return 1; 502 } 503 else 504 return 0; 505 } 506