1 /* $NetBSD: indxbib.cpp,v 1.1.1.1 2016/01/13 18:41:49 christos Exp $ */ 2 3 // -*- C++ -*- 4 /* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004 5 Free Software Foundation, Inc. 6 Written by James Clark (jjc@jclark.com) 7 8 This file is part of groff. 9 10 groff is free software; you can redistribute it and/or modify it under 11 the terms of the GNU General Public License as published by the Free 12 Software Foundation; either version 2, or (at your option) any later 13 version. 14 15 groff is distributed in the hope that it will be useful, but WITHOUT ANY 16 WARRANTY; without even the implied warranty of MERCHANTABILITY or 17 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 18 for more details. 19 20 You should have received a copy of the GNU General Public License along 21 with groff; see the file COPYING. If not, write to the Free Software 22 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 23 24 #include "lib.h" 25 26 #include <stdlib.h> 27 #include <assert.h> 28 #include <errno.h> 29 30 #include "posix.h" 31 #include "errarg.h" 32 #include "error.h" 33 #include "stringclass.h" 34 #include "cset.h" 35 #include "cmap.h" 36 37 #include "defs.h" 38 #include "index.h" 39 40 #include "nonposix.h" 41 42 extern "C" const char *Version_string; 43 44 #define DEFAULT_HASH_TABLE_SIZE 997 45 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX" 46 47 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc(). 48 49 #define MALLOC_OVERHEAD 16 50 51 #ifdef BLOCK_SIZE 52 #undef BLOCK_SIZE 53 #endif 54 55 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *) 56 - sizeof(int)) / sizeof(int)); 57 struct block { 58 block *next; 59 int used; 60 int v[BLOCK_SIZE]; 61 62 block(block *p = 0) : next(p), used(0) { } 63 }; 64 65 struct block; 66 67 union table_entry { 68 block *ptr; 69 int count; 70 }; 71 72 struct word_list { 73 word_list *next; 74 char *str; 75 int len; 76 word_list(const char *, int, word_list *); 77 }; 78 79 table_entry *hash_table; 80 int hash_table_size = DEFAULT_HASH_TABLE_SIZE; 81 // We make this the same size as hash_table so we only have to do one 82 // mod per key. 83 static word_list **common_words_table = 0; 84 char *key_buffer; 85 86 FILE *indxfp; 87 int ntags = 0; 88 string filenames; 89 char *temp_index_file = 0; 90 91 const char *ignore_fields = "XYZ"; 92 const char *common_words_file = COMMON_WORDS_FILE; 93 int n_ignore_words = 100; 94 int truncate_len = 6; 95 int shortest_len = 3; 96 int max_keys_per_item = 100; 97 98 static void usage(FILE *stream); 99 static void write_hash_table(); 100 static void init_hash_table(); 101 static void read_common_words_file(); 102 static int store_key(char *s, int len); 103 static void possibly_store_key(char *s, int len); 104 static int do_whole_file(const char *filename); 105 static int do_file(const char *filename); 106 static void store_reference(int filename_index, int pos, int len); 107 static void check_integer_arg(char opt, const char *arg, int min, int *res); 108 static void store_filename(const char *); 109 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp); 110 static char *get_cwd(); 111 112 extern "C" { 113 void cleanup(); 114 void catch_fatal_signals(); 115 void ignore_fatal_signals(); 116 } 117 118 int main(int argc, char **argv) 119 { 120 program_name = argv[0]; 121 static char stderr_buf[BUFSIZ]; 122 setbuf(stderr, stderr_buf); 123 124 const char *base_name = 0; 125 typedef int (*parser_t)(const char *); 126 parser_t parser = do_file; 127 const char *directory = 0; 128 const char *foption = 0; 129 int opt; 130 static const struct option long_options[] = { 131 { "help", no_argument, 0, CHAR_MAX + 1 }, 132 { "version", no_argument, 0, 'v' }, 133 { NULL, 0, 0, 0 } 134 }; 135 while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw", 136 long_options, NULL)) 137 != EOF) 138 switch (opt) { 139 case 'c': 140 common_words_file = optarg; 141 break; 142 case 'd': 143 directory = optarg; 144 break; 145 case 'f': 146 foption = optarg; 147 break; 148 case 'h': 149 check_integer_arg('h', optarg, 1, &hash_table_size); 150 if (!is_prime(hash_table_size)) { 151 while (!is_prime(++hash_table_size)) 152 ; 153 warning("%1 not prime: using %2 instead", optarg, hash_table_size); 154 } 155 break; 156 case 'i': 157 ignore_fields = optarg; 158 break; 159 case 'k': 160 check_integer_arg('k', optarg, 1, &max_keys_per_item); 161 break; 162 case 'l': 163 check_integer_arg('l', optarg, 0, &shortest_len); 164 break; 165 case 'n': 166 check_integer_arg('n', optarg, 0, &n_ignore_words); 167 break; 168 case 'o': 169 base_name = optarg; 170 break; 171 case 't': 172 check_integer_arg('t', optarg, 1, &truncate_len); 173 break; 174 case 'w': 175 parser = do_whole_file; 176 break; 177 case 'v': 178 printf("GNU indxbib (groff) version %s\n", Version_string); 179 exit(0); 180 break; 181 case CHAR_MAX + 1: // --help 182 usage(stdout); 183 exit(0); 184 break; 185 case '?': 186 usage(stderr); 187 exit(1); 188 break; 189 default: 190 assert(0); 191 break; 192 } 193 if (optind >= argc && foption == 0) 194 fatal("no files and no -f option"); 195 if (!directory) { 196 char *path = get_cwd(); 197 store_filename(path); 198 a_delete path; 199 } 200 else 201 store_filename(directory); 202 init_hash_table(); 203 store_filename(common_words_file); 204 store_filename(ignore_fields); 205 key_buffer = new char[truncate_len]; 206 read_common_words_file(); 207 if (!base_name) 208 base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME; 209 const char *p = strrchr(base_name, DIR_SEPS[0]), *p1; 210 const char *sep = &DIR_SEPS[1]; 211 while (*sep) { 212 p1 = strrchr(base_name, *sep); 213 if (p1 && (!p || p1 > p)) 214 p = p1; 215 sep++; 216 } 217 size_t name_max; 218 if (p) { 219 char *dir = strsave(base_name); 220 dir[p - base_name] = '\0'; 221 name_max = file_name_max(dir); 222 a_delete dir; 223 } 224 else 225 name_max = file_name_max("."); 226 const char *filename = p ? p + 1 : base_name; 227 if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max) 228 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX); 229 if (p) { 230 p++; 231 temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)]; 232 memcpy(temp_index_file, base_name, p - base_name); 233 strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE); 234 } 235 else { 236 temp_index_file = strsave(TEMP_INDEX_TEMPLATE); 237 } 238 catch_fatal_signals(); 239 int fd = mkstemp(temp_index_file); 240 if (fd < 0) 241 fatal("can't create temporary index file: %1", strerror(errno)); 242 indxfp = fdopen(fd, FOPEN_WB); 243 if (indxfp == 0) 244 fatal("fdopen failed"); 245 if (fseek(indxfp, sizeof(index_header), 0) < 0) 246 fatal("can't seek past index header: %1", strerror(errno)); 247 int failed = 0; 248 if (foption) { 249 FILE *fp = stdin; 250 if (strcmp(foption, "-") != 0) { 251 errno = 0; 252 fp = fopen(foption, "r"); 253 if (!fp) 254 fatal("can't open `%1': %2", foption, strerror(errno)); 255 } 256 string path; 257 int lineno = 1; 258 for (;;) { 259 int c; 260 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) { 261 if (c == '\0') 262 error_with_file_and_line(foption, lineno, 263 "nul character in pathname ignored"); 264 else 265 path += c; 266 } 267 if (path.length() > 0) { 268 path += '\0'; 269 if (!(*parser)(path.contents())) 270 failed = 1; 271 path.clear(); 272 } 273 if (c == EOF) 274 break; 275 lineno++; 276 } 277 if (fp != stdin) 278 fclose(fp); 279 } 280 for (int i = optind; i < argc; i++) 281 if (!(*parser)(argv[i])) 282 failed = 1; 283 write_hash_table(); 284 if (fclose(indxfp) < 0) 285 fatal("error closing temporary index file: %1", strerror(errno)); 286 char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)]; 287 strcpy(index_file, base_name); 288 strcat(index_file, INDEX_SUFFIX); 289 #ifdef HAVE_RENAME 290 #ifdef __EMX__ 291 if (access(index_file, R_OK) == 0) 292 unlink(index_file); 293 #endif /* __EMX__ */ 294 if (rename(temp_index_file, index_file) < 0) { 295 #ifdef __MSDOS__ 296 // RENAME could fail on plain MSDOS filesystems because 297 // INDEX_FILE is an invalid filename, e.g. it has multiple dots. 298 char *fname = p ? index_file + (p - base_name) : 0; 299 char *dot = 0; 300 301 // Replace the dot with an underscore and try again. 302 if (fname 303 && (dot = strchr(fname, '.')) != 0 304 && strcmp(dot, INDEX_SUFFIX) != 0) 305 *dot = '_'; 306 if (rename(temp_index_file, index_file) < 0) 307 #endif 308 fatal("can't rename temporary index file: %1", strerror(errno)); 309 } 310 #else /* not HAVE_RENAME */ 311 ignore_fatal_signals(); 312 if (unlink(index_file) < 0) { 313 if (errno != ENOENT) 314 fatal("can't unlink `%1': %2", index_file, strerror(errno)); 315 } 316 if (link(temp_index_file, index_file) < 0) 317 fatal("can't link temporary index file: %1", strerror(errno)); 318 if (unlink(temp_index_file) < 0) 319 fatal("can't unlink temporary index file: %1", strerror(errno)); 320 #endif /* not HAVE_RENAME */ 321 temp_index_file = 0; 322 return failed; 323 } 324 325 static void usage(FILE *stream) 326 { 327 fprintf(stream, 328 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n" 329 " [-l n] [-n n] [-o base] [-t n] [files...]\n", 330 program_name); 331 } 332 333 static void check_integer_arg(char opt, const char *arg, int min, int *res) 334 { 335 char *ptr; 336 long n = strtol(arg, &ptr, 10); 337 if (n == 0 && ptr == arg) 338 error("argument to -%1 not an integer", opt); 339 else if (n < min) 340 error("argument to -%1 must not be less than %2", opt, min); 341 else { 342 if (n > INT_MAX) 343 error("argument to -%1 greater than maximum integer", opt); 344 else if (*ptr != '\0') 345 error("junk after integer argument to -%1", opt); 346 *res = int(n); 347 } 348 } 349 350 static char *get_cwd() 351 { 352 char *buf; 353 int size = 12; 354 355 for (;;) { 356 buf = new char[size]; 357 if (getcwd(buf, size)) 358 break; 359 if (errno != ERANGE) 360 fatal("cannot get current working directory: %1", strerror(errno)); 361 a_delete buf; 362 if (size == INT_MAX) 363 fatal("current working directory longer than INT_MAX"); 364 if (size > INT_MAX/2) 365 size = INT_MAX; 366 else 367 size *= 2; 368 } 369 return buf; 370 } 371 372 word_list::word_list(const char *s, int n, word_list *p) 373 : next(p), len(n) 374 { 375 str = new char[n]; 376 memcpy(str, s, n); 377 } 378 379 static void read_common_words_file() 380 { 381 if (n_ignore_words <= 0) 382 return; 383 errno = 0; 384 FILE *fp = fopen(common_words_file, "r"); 385 if (!fp) 386 fatal("can't open `%1': %2", common_words_file, strerror(errno)); 387 common_words_table = new word_list * [hash_table_size]; 388 for (int i = 0; i < hash_table_size; i++) 389 common_words_table[i] = 0; 390 int count = 0; 391 int key_len = 0; 392 for (;;) { 393 int c = getc(fp); 394 while (c != EOF && !csalnum(c)) 395 c = getc(fp); 396 if (c == EOF) 397 break; 398 do { 399 if (key_len < truncate_len) 400 key_buffer[key_len++] = cmlower(c); 401 c = getc(fp); 402 } while (c != EOF && csalnum(c)); 403 if (key_len >= shortest_len) { 404 int h = hash(key_buffer, key_len) % hash_table_size; 405 common_words_table[h] = new word_list(key_buffer, key_len, 406 common_words_table[h]); 407 } 408 if (++count >= n_ignore_words) 409 break; 410 key_len = 0; 411 if (c == EOF) 412 break; 413 } 414 n_ignore_words = count; 415 fclose(fp); 416 } 417 418 static int do_whole_file(const char *filename) 419 { 420 errno = 0; 421 FILE *fp = fopen(filename, "r"); 422 if (!fp) { 423 error("can't open `%1': %2", filename, strerror(errno)); 424 return 0; 425 } 426 int count = 0; 427 int key_len = 0; 428 int c; 429 while ((c = getc(fp)) != EOF) { 430 if (csalnum(c)) { 431 key_len = 1; 432 key_buffer[0] = c; 433 while ((c = getc(fp)) != EOF) { 434 if (!csalnum(c)) 435 break; 436 if (key_len < truncate_len) 437 key_buffer[key_len++] = c; 438 } 439 if (store_key(key_buffer, key_len)) { 440 if (++count >= max_keys_per_item) 441 break; 442 } 443 if (c == EOF) 444 break; 445 } 446 } 447 store_reference(filenames.length(), 0, 0); 448 store_filename(filename); 449 fclose(fp); 450 return 1; 451 } 452 453 static int do_file(const char *filename) 454 { 455 errno = 0; 456 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on 457 // byte counts to be consistent with fseek. 458 FILE *fp = fopen(filename, FOPEN_RB); 459 if (fp == 0) { 460 error("can't open `%1': %2", filename, strerror(errno)); 461 return 0; 462 } 463 int filename_index = filenames.length(); 464 store_filename(filename); 465 466 enum { 467 START, // at the start of the file; also in between references 468 BOL, // in the middle of a reference, at the beginning of the line 469 PERCENT, // seen a percent at the beginning of the line 470 IGNORE, // ignoring a field 471 IGNORE_BOL, // at the beginning of a line ignoring a field 472 KEY, // in the middle of a key 473 DISCARD, // after truncate_len bytes of a key 474 MIDDLE // in between keys 475 } state = START; 476 477 // In states START, BOL, IGNORE_BOL, space_count how many spaces at 478 // the beginning have been seen. In states PERCENT, IGNORE, KEY, 479 // MIDDLE space_count must be 0. 480 int space_count = 0; 481 int byte_count = 0; // bytes read 482 int key_len = 0; 483 int ref_start = -1; // position of start of current reference 484 for (;;) { 485 int c = getc(fp); 486 if (c == EOF) 487 break; 488 // We opened the file in binary mode, so we need to skip 489 // every CR character before a Newline. 490 if (c == '\r') { 491 int peek = getc(fp); 492 if (peek == '\n') { 493 byte_count++; 494 c = peek; 495 } 496 else 497 ungetc(peek, fp); 498 } 499 #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__) 500 else if (c == 0x1a) // ^Z means EOF in text files 501 break; 502 #endif 503 byte_count++; 504 switch (state) { 505 case START: 506 if (c == ' ' || c == '\t') { 507 space_count++; 508 break; 509 } 510 if (c == '\n') { 511 space_count = 0; 512 break; 513 } 514 ref_start = byte_count - space_count - 1; 515 space_count = 0; 516 if (c == '%') 517 state = PERCENT; 518 else if (csalnum(c)) { 519 state = KEY; 520 key_buffer[0] = c; 521 key_len = 1; 522 } 523 else 524 state = MIDDLE; 525 break; 526 case BOL: 527 switch (c) { 528 case '%': 529 if (space_count > 0) { 530 space_count = 0; 531 state = MIDDLE; 532 } 533 else 534 state = PERCENT; 535 break; 536 case ' ': 537 case '\t': 538 space_count++; 539 break; 540 case '\n': 541 store_reference(filename_index, ref_start, 542 byte_count - 1 - space_count - ref_start); 543 state = START; 544 space_count = 0; 545 break; 546 default: 547 space_count = 0; 548 if (csalnum(c)) { 549 state = KEY; 550 key_buffer[0] = c; 551 key_len = 1; 552 } 553 else 554 state = MIDDLE; 555 } 556 break; 557 case PERCENT: 558 if (strchr(ignore_fields, c) != 0) 559 state = IGNORE; 560 else if (c == '\n') 561 state = BOL; 562 else 563 state = MIDDLE; 564 break; 565 case IGNORE: 566 if (c == '\n') 567 state = IGNORE_BOL; 568 break; 569 case IGNORE_BOL: 570 switch (c) { 571 case '%': 572 if (space_count > 0) { 573 state = IGNORE; 574 space_count = 0; 575 } 576 else 577 state = PERCENT; 578 break; 579 case ' ': 580 case '\t': 581 space_count++; 582 break; 583 case '\n': 584 store_reference(filename_index, ref_start, 585 byte_count - 1 - space_count - ref_start); 586 state = START; 587 space_count = 0; 588 break; 589 default: 590 space_count = 0; 591 state = IGNORE; 592 } 593 break; 594 case KEY: 595 if (csalnum(c)) { 596 if (key_len < truncate_len) 597 key_buffer[key_len++] = c; 598 else 599 state = DISCARD; 600 } 601 else { 602 possibly_store_key(key_buffer, key_len); 603 key_len = 0; 604 if (c == '\n') 605 state = BOL; 606 else 607 state = MIDDLE; 608 } 609 break; 610 case DISCARD: 611 if (!csalnum(c)) { 612 possibly_store_key(key_buffer, key_len); 613 key_len = 0; 614 if (c == '\n') 615 state = BOL; 616 else 617 state = MIDDLE; 618 } 619 break; 620 case MIDDLE: 621 if (csalnum(c)) { 622 state = KEY; 623 key_buffer[0] = c; 624 key_len = 1; 625 } 626 else if (c == '\n') 627 state = BOL; 628 break; 629 default: 630 assert(0); 631 } 632 } 633 switch (state) { 634 case START: 635 break; 636 case DISCARD: 637 case KEY: 638 possibly_store_key(key_buffer, key_len); 639 // fall through 640 case BOL: 641 case PERCENT: 642 case IGNORE_BOL: 643 case IGNORE: 644 case MIDDLE: 645 store_reference(filename_index, ref_start, 646 byte_count - ref_start - space_count); 647 break; 648 default: 649 assert(0); 650 } 651 fclose(fp); 652 return 1; 653 } 654 655 static void store_reference(int filename_index, int pos, int len) 656 { 657 tag t; 658 t.filename_index = filename_index; 659 t.start = pos; 660 t.length = len; 661 fwrite_or_die(&t, sizeof(t), 1, indxfp); 662 ntags++; 663 } 664 665 static void store_filename(const char *fn) 666 { 667 filenames += fn; 668 filenames += '\0'; 669 } 670 671 static void init_hash_table() 672 { 673 hash_table = new table_entry[hash_table_size]; 674 for (int i = 0; i < hash_table_size; i++) 675 hash_table[i].ptr = 0; 676 } 677 678 static void possibly_store_key(char *s, int len) 679 { 680 static int last_tagno = -1; 681 static int key_count; 682 if (last_tagno != ntags) { 683 last_tagno = ntags; 684 key_count = 0; 685 } 686 if (key_count < max_keys_per_item) { 687 if (store_key(s, len)) 688 key_count++; 689 } 690 } 691 692 static int store_key(char *s, int len) 693 { 694 if (len < shortest_len) 695 return 0; 696 int is_number = 1; 697 for (int i = 0; i < len; i++) 698 if (!csdigit(s[i])) { 699 is_number = 0; 700 s[i] = cmlower(s[i]); 701 } 702 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9')) 703 return 0; 704 int h = hash(s, len) % hash_table_size; 705 if (common_words_table) { 706 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next) 707 if (len == ptr->len && memcmp(s, ptr->str, len) == 0) 708 return 0; 709 } 710 table_entry *pp = hash_table + h; 711 if (!pp->ptr) 712 pp->ptr = new block; 713 else if (pp->ptr->v[pp->ptr->used - 1] == ntags) 714 return 1; 715 else if (pp->ptr->used >= BLOCK_SIZE) 716 pp->ptr = new block(pp->ptr); 717 pp->ptr->v[(pp->ptr->used)++] = ntags; 718 return 1; 719 } 720 721 static void write_hash_table() 722 { 723 const int minus_one = -1; 724 int li = 0; 725 for (int i = 0; i < hash_table_size; i++) { 726 block *ptr = hash_table[i].ptr; 727 if (!ptr) 728 hash_table[i].count = -1; 729 else { 730 hash_table[i].count = li; 731 block *rev = 0; 732 while (ptr) { 733 block *tem = ptr; 734 ptr = ptr->next; 735 tem->next = rev; 736 rev = tem; 737 } 738 while (rev) { 739 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp); 740 li += rev->used; 741 block *tem = rev; 742 rev = rev->next; 743 delete tem; 744 } 745 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp); 746 li += 1; 747 } 748 } 749 if (sizeof(table_entry) == sizeof(int)) 750 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp); 751 else { 752 // write it out word by word 753 for (int i = 0; i < hash_table_size; i++) 754 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp); 755 } 756 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp); 757 if (fseek(indxfp, 0, 0) < 0) 758 fatal("error seeking on index file: %1", strerror(errno)); 759 index_header h; 760 h.magic = INDEX_MAGIC; 761 h.version = INDEX_VERSION; 762 h.tags_size = ntags; 763 h.lists_size = li; 764 h.table_size = hash_table_size; 765 h.strings_size = filenames.length(); 766 h.truncate = truncate_len; 767 h.shortest = shortest_len; 768 h.common = n_ignore_words; 769 fwrite_or_die(&h, sizeof(h), 1, indxfp); 770 } 771 772 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp) 773 { 774 if (fwrite(ptr, size, nitems, fp) != (size_t)nitems) 775 fatal("fwrite failed: %1", strerror(errno)); 776 } 777 778 void fatal_error_exit() 779 { 780 cleanup(); 781 exit(3); 782 } 783 784 extern "C" { 785 786 void cleanup() 787 { 788 if (temp_index_file) 789 unlink(temp_index_file); 790 } 791 792 } 793