utils/indxbib/indxbib.cpp

/*	$NetBSD: indxbib.cpp,v 1.1.1.1 2016/01/13 18:41:49 christos Exp $	*/

// -*- C++ -*-
/* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004
   Free Software Foundation, Inc.
     Written by James Clark (jjc@jclark.com)

This file is part of groff.

groff is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

groff is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with groff; see the file COPYING.  If not, write to the Free Software
Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */

#include "lib.h"

#include <stdlib.h>
#include <assert.h>
#include <errno.h>

#include "posix.h"
#include "errarg.h"
#include "error.h"
#include "stringclass.h"
#include "cset.h"
#include "cmap.h"

#include "defs.h"
#include "index.h"

#include "nonposix.h"

// Version text printed by the -v / --version option in main().
extern "C" const char *Version_string;

// Number of hash buckets used unless overridden with -h.
#define DEFAULT_HASH_TABLE_SIZE 997
// mkstemp() template for the temporary index file; main() places it in
// the same directory as the final index so it can be renamed into place.
#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"

// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().

#define MALLOC_OVERHEAD 16

// Drop any BLOCK_SIZE macro that an included header may have defined, so
// the name can be used for the constant declared below.
#ifdef BLOCK_SIZE
#undef BLOCK_SIZE
#endif

// Number of tag numbers held in one block.  Chosen so that
// sizeof(block) comes to roughly 1024 - MALLOC_OVERHEAD bytes: the
// `next' pointer and the `used' counter are subtracted before dividing
// by the size of one entry.
const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
			 - sizeof(int)) / sizeof(int));
// One node of the singly linked list of tag numbers kept per hash
// bucket.  store_key() pushes new blocks at the head of the list, so
// the list runs from newest to oldest until write_hash_table()
// reverses it.
struct block {
  block *next;		// next (older) block, or 0
  int used;		// number of valid entries in v[]
  int v[BLOCK_SIZE];	// tag numbers, in the order they were stored

  // Create an empty block chained in front of p.
  block(block *p = 0) : next(p), used(0) { }
};

// Redundant forward declaration: block is already fully defined above.
struct block;

// One hash bucket.  While the index is being built the `ptr' member is
// used (head of the bucket's block list, or 0 when empty).
// write_hash_table() then overwrites each entry with `count', the
// offset of the bucket's list in the written list area, or -1 for an
// empty bucket.
union table_entry {
  block *ptr;
  int count;
};

// Node in a chain of common words that share a hash bucket.  The word
// is stored as `len' bytes with no terminating NUL (see the
// constructor, which copies exactly that many bytes).
struct word_list {
  word_list *next;	// next word in the same bucket, or 0
  char *str;		// the word's bytes; not NUL-terminated
  int len;		// number of bytes in str
  word_list(const char *, int, word_list *);
};

// Bucket array of hash_table_size entries, allocated by init_hash_table().
table_entry *hash_table;
// Number of buckets; may be changed with -h (rounded up to a prime).
int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
// We make this the same size as hash_table so we only have to do one
// mod per key.
static word_list **common_words_table = 0;
// Scratch buffer of truncate_len bytes in which keys are accumulated;
// allocated in main().
char *key_buffer;

// Stream for the temporary index file being written.
FILE *indxfp;
// Number of tags (references) written so far; also the number of the
// reference currently being scanned.
int ntags = 0;
// Concatenation of NUL-terminated strings written near the end of the
// index; see store_filename().
string filenames;
// Name of the temporary index file, or 0 when there is nothing for
// cleanup() to remove.
char *temp_index_file = 0;

// Field letters whose contents are not indexed (-i).
const char *ignore_fields = "XYZ";
// File listing common words that are not indexed (-c).
const char *common_words_file = COMMON_WORDS_FILE;
// Maximum number of words read from common_words_file (-n).
int n_ignore_words = 100;
// Keys are truncated to this many bytes (-t).
int truncate_len = 6;
// Keys shorter than this are discarded (-l).
int shortest_len = 3;
// Maximum number of keys stored per reference (-k).
int max_keys_per_item = 100;

// Forward declarations for the helpers defined later in this file.
static void usage(FILE *stream);
static void write_hash_table();
static void init_hash_table();
static void read_common_words_file();
static int store_key(char *s, int len);
static void possibly_store_key(char *s, int len);
static int do_whole_file(const char *filename);
static int do_file(const char *filename);
static void store_reference(int filename_index, int pos, int len);
static void check_integer_arg(char opt, const char *arg, int min, int *res);
static void store_filename(const char *);
static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
static char *get_cwd();

// C-linkage hooks.  cleanup() is defined at the end of this file; the
// two signal helpers are defined elsewhere.
extern "C" {
  void cleanup();
  void catch_fatal_signals();
  void ignore_fatal_signals();
}

// Entry point: build an inverted index for the bibliographic databases
// named on the command line and/or listed in the file given with -f.
//
// Outline:
//   1. Parse options.
//   2. Seed the string table with the directory, the common words file
//      name and the ignored field letters.
//   3. Create a temporary index file next to the final index.
//   4. Parse every input file, writing one tag per reference.
//   5. Write the hash table and the header, then move the temporary
//      file into place.
//
// Returns 1 if any input file could not be processed, 0 otherwise
// (fatal errors exit through fatal()).
int main(int argc, char **argv)
{
  program_name = argv[0];
  static char stderr_buf[BUFSIZ];
  setbuf(stderr, stderr_buf);

  const char *base_name = 0;
  typedef int (*parser_t)(const char *);
  parser_t parser = do_file;
  const char *directory = 0;
  const char *foption = 0;
  int opt;
  static const struct option long_options[] = {
    { "help", no_argument, 0, CHAR_MAX + 1 },
    { "version", no_argument, 0, 'v' },
    { NULL, 0, 0, 0 }
  };
  // Each option letter is listed exactly once.
  while ((opt = getopt_long(argc, argv, "c:d:f:h:i:k:l:n:o:t:vw",
                            long_options, NULL))
         != EOF)
    switch (opt) {
    case 'c':
      common_words_file = optarg;
      break;
    case 'd':
      directory = optarg;
      break;
    case 'f':
      foption = optarg;
      break;
    case 'h':
      check_integer_arg('h', optarg, 1, &hash_table_size);
      // The table size must be prime; round up to the next prime.
      if (!is_prime(hash_table_size)) {
        while (!is_prime(++hash_table_size))
          ;
        warning("%1 not prime: using %2 instead", optarg, hash_table_size);
      }
      break;
    case 'i':
      ignore_fields = optarg;
      break;
    case 'k':
      check_integer_arg('k', optarg, 1, &max_keys_per_item);
      break;
    case 'l':
      check_integer_arg('l', optarg, 0, &shortest_len);
      break;
    case 'n':
      check_integer_arg('n', optarg, 0, &n_ignore_words);
      break;
    case 'o':
      base_name = optarg;
      break;
    case 't':
      check_integer_arg('t', optarg, 1, &truncate_len);
      break;
    case 'w':
      parser = do_whole_file;
      break;
    case 'v':
      printf("GNU indxbib (groff) version %s\n", Version_string);
      exit(0);
      break;
    case CHAR_MAX + 1: // --help
      usage(stdout);
      exit(0);
      break;
    case '?':
      usage(stderr);
      exit(1);
      break;
    default:
      assert(0);
      break;
    }
  if (optind >= argc && foption == 0)
    fatal("no files and no -f option");

  // The first three entries of the string table are, in order: the
  // directory, the common words file name and the ignored field letters.
  if (!directory) {
    char *path = get_cwd();
    store_filename(path);
    a_delete path;
  }
  else
    store_filename(directory);
  init_hash_table();
  store_filename(common_words_file);
  store_filename(ignore_fields);
  key_buffer = new char[truncate_len];
  read_common_words_file();

  if (!base_name)
    base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
  // Find the last directory separator of any kind in base_name.
  const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
  const char *sep = &DIR_SEPS[1];
  while (*sep) {
    p1 = strrchr(base_name, *sep);
    if (p1 && (!p || p1 > p))
      p = p1;
    sep++;
  }
  // Make sure the final index name fits in the target directory.
  size_t name_max;
  if (p) {
    char *dir = strsave(base_name);
    dir[p - base_name] = '\0';
    name_max = file_name_max(dir);
    a_delete dir;
  }
  else
    name_max = file_name_max(".");
  const char *filename = p ? p + 1 : base_name;
  // The index name is base_name with INDEX_SUFFIX appended verbatim (see
  // the construction of index_file below), so report exactly that name
  // without inserting an extra dot.
  if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
    fatal("`%1%2' is too long for a filename", filename, INDEX_SUFFIX);

  // Build the mkstemp() template in the same directory as the index.
  if (p) {
    p++;
    temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
    memcpy(temp_index_file, base_name, p - base_name);
    strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
  }
  else {
    temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
  }
  catch_fatal_signals();
  int fd = mkstemp(temp_index_file);
  if (fd < 0)
    fatal("can't create temporary index file: %1", strerror(errno));
  indxfp = fdopen(fd, FOPEN_WB);
  if (indxfp == 0)
    fatal("fdopen failed");
  // Leave room for the header; write_hash_table() seeks back to fill it.
  if (fseek(indxfp, sizeof(index_header), 0) < 0)
    fatal("can't seek past index header: %1", strerror(errno));

  int failed = 0;
  if (foption) {
    // Read one pathname per line from the -f file ("-" means stdin).
    FILE *fp = stdin;
    if (strcmp(foption, "-") != 0) {
      errno = 0;
      fp = fopen(foption, "r");
      if (!fp)
        fatal("can't open `%1': %2", foption, strerror(errno));
    }
    string path;
    int lineno = 1;
    for (;;) {
      int c;
      for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
        if (c == '\0')
          error_with_file_and_line(foption, lineno,
                                   "nul character in pathname ignored");
        else
          path += c;
      }
      // Empty lines are skipped.
      if (path.length() > 0) {
        path += '\0';
        if (!(*parser)(path.contents()))
          failed = 1;
        path.clear();
      }
      if (c == EOF)
        break;
      lineno++;
    }
    if (fp != stdin)
      fclose(fp);
  }
  for (int i = optind; i < argc; i++)
    if (!(*parser)(argv[i]))
      failed = 1;
  write_hash_table();
  if (fclose(indxfp) < 0)
    fatal("error closing temporary index file: %1", strerror(errno));

  // Move the finished temporary file to its final name.
  char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
  strcpy(index_file, base_name);
  strcat(index_file, INDEX_SUFFIX);
#ifdef HAVE_RENAME
#ifdef __EMX__
  if (access(index_file, R_OK) == 0)
    unlink(index_file);
#endif /* __EMX__ */
  if (rename(temp_index_file, index_file) < 0) {
#ifdef __MSDOS__
    // RENAME could fail on plain MSDOS filesystems because
    // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
    char *fname = p ? index_file + (p - base_name) : 0;
    char *dot = 0;

    // Replace the dot with an underscore and try again.
    if (fname
        && (dot = strchr(fname, '.')) != 0
        && strcmp(dot, INDEX_SUFFIX) != 0)
      *dot = '_';
    if (rename(temp_index_file, index_file) < 0)
#endif
    fatal("can't rename temporary index file: %1", strerror(errno));
  }
#else /* not HAVE_RENAME */
  ignore_fatal_signals();
  if (unlink(index_file) < 0) {
    if (errno != ENOENT)
      fatal("can't unlink `%1': %2", index_file, strerror(errno));
  }
  if (link(temp_index_file, index_file) < 0)
    fatal("can't link temporary index file: %1", strerror(errno));
  if (unlink(temp_index_file) < 0)
    fatal("can't unlink temporary index file: %1", strerror(errno));
#endif /* not HAVE_RENAME */
  // The temporary name no longer exists; stop cleanup() from unlinking it.
  temp_index_file = 0;
  return failed;
}

// Print the command-line synopsis, prefixed with the program name, on
// the given stream.
static void usage(FILE *stream)
{
  static const char synopsis[] =
    "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
    "       [-l n] [-n n] [-o base] [-t n] [files...]\n";

  fprintf(stream, synopsis, program_name);
}

// Parse the decimal integer argument of option -OPT.
//
//   opt  option letter, used only in diagnostics
//   arg  the option's argument text
//   min  smallest acceptable value
//   res  receives the parsed value on success
//
// A diagnostic is issued through error() and *res is left unchanged
// when the argument is not a number, is smaller than MIN, or does not
// fit in an int.  Trailing junk after a valid number is reported, but
// the number itself is still accepted.
static void check_integer_arg(char opt, const char *arg, int min, int *res)
{
  char *ptr;
  errno = 0;	// so that an ERANGE left by strtol() can be recognised
  long n = strtol(arg, &ptr, 10);
  if (ptr == arg)
    error("argument to -%1 not an integer", opt);
  else if (n < min)
    error("argument to -%1 must not be less than %2", opt, min);
  else if (n > INT_MAX || errno == ERANGE)
    // Never store a value that would be truncated when narrowed to int.
    error("argument to -%1 greater than maximum integer", opt);
  else {
    if (*ptr != '\0')
      error("junk after integer argument to -%1", opt);
    *res = int(n);
  }
}

// Return the current working directory in a freshly allocated array
// that the caller releases with a_delete.  The buffer starts at 12
// bytes and is doubled (capped at INT_MAX) for as long as getcwd()
// reports ERANGE; any other failure is fatal.
static char *get_cwd()
{
  int capacity = 12;
  char *cwd = new char[capacity];

  while (!getcwd(cwd, capacity)) {
    if (errno != ERANGE)
      fatal("cannot get current working directory: %1", strerror(errno));
    a_delete cwd;
    if (capacity == INT_MAX)
      fatal("current working directory longer than INT_MAX");
    // Double the buffer without overflowing int.
    capacity = (capacity > INT_MAX/2) ? INT_MAX : capacity * 2;
    cwd = new char[capacity];
  }
  return cwd;
}

// Build a node holding a private copy of the N bytes at S (no NUL is
// appended) and link it in front of P.
word_list::word_list(const char *s, int n, word_list *p)
: next(p), str(new char[n]), len(n)
{
  memcpy(str, s, n);
}

static void read_common_words_file()
{
  if (n_ignore_words <= 0)
    return;
  errno = 0;
  FILE *fp = fopen(common_words_file, "r");
  if (!fp)
    fatal("can't open `%1': %2", common_words_file, strerror(errno));
  common_words_table = new word_list * [hash_table_size];
  for (int i = 0; i < hash_table_size; i++)
    common_words_table[i] = 0;
  int count = 0;
  int key_len = 0;
  for (;;) {
    int c = getc(fp);
    while (c != EOF && !csalnum(c))
      c = getc(fp);
    if (c == EOF)
      break;
    do {
      if (key_len < truncate_len)
	key_buffer[key_len++] = cmlower(c);
      c = getc(fp);
    } while (c != EOF && csalnum(c));
    if (key_len >= shortest_len) {
      int h = hash(key_buffer, key_len) % hash_table_size;
      common_words_table[h] = new word_list(key_buffer, key_len,
					    common_words_table[h]);
    }
    if (++count >= n_ignore_words)
      break;
    key_len = 0;
    if (c == EOF)
      break;
  }
  n_ignore_words = count;
  fclose(fp);
}

// Index FILENAME as a single reference (the -w parser): every
// alphanumeric word in the file is offered to store_key() until
// max_keys_per_item keys have been accepted.  One tag with start 0 and
// length 0 is then written for the file.
// Returns 1 on success, 0 if the file cannot be opened.
static int do_whole_file(const char *filename)
{
  errno = 0;
  FILE *in = fopen(filename, "r");
  if (in == 0) {
    error("can't open `%1': %2", filename, strerror(errno));
    return 0;
  }

  int keys_stored = 0;
  int ch = getc(in);
  while (ch != EOF) {
    if (!csalnum(ch)) {
      // Between words: just move on.
      ch = getc(in);
      continue;
    }
    // Gather one word, keeping at most truncate_len bytes of it.
    int n = 0;
    key_buffer[n++] = ch;
    for (ch = getc(in); ch != EOF && csalnum(ch); ch = getc(in))
      if (n < truncate_len)
        key_buffer[n++] = ch;
    if (store_key(key_buffer, n) && ++keys_stored >= max_keys_per_item)
      break;
  }

  // The tag refers to the file name that is about to be appended.
  store_reference(filenames.length(), 0, 0);
  store_filename(filename);
  fclose(in);
  return 1;
}

// Index the references in FILENAME (the default parser).
//
// The file is scanned byte by byte with a small state machine.  A
// reference starts at the first line that is not blank and ends at the
// next blank line (one containing only spaces and tabs) or at end of
// file.  A line beginning with `%' in column 0 starts a field; fields
// whose letter appears in ignore_fields are skipped, and alphanumeric
// words in all other text are offered to possibly_store_key().  For
// each reference one tag (file name index, start offset, length) is
// written with store_reference().
//
// Returns 1 on success, 0 if the file cannot be opened.
static int do_file(const char *filename)
{
  errno = 0;
  // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
  // byte counts to be consistent with fseek.
  FILE *fp = fopen(filename, FOPEN_RB);
  if (fp == 0) {
    error("can't open `%1': %2", filename, strerror(errno));
    return 0;
  }
  // Offset of this file's name within the string table; recorded in
  // every tag written for this file.
  int filename_index = filenames.length();
  store_filename(filename);

  enum {
    START,	// at the start of the file; also in between references
    BOL,	// in the middle of a reference, at the beginning of the line
    PERCENT,	// seen a percent at the beginning of the line
    IGNORE,	// ignoring a field
    IGNORE_BOL,	// at the beginning of a line ignoring a field
    KEY,	// in the middle of a key
    DISCARD,	// after truncate_len bytes of a key
    MIDDLE	// in between keys
  } state = START;

  // In states START, BOL, IGNORE_BOL, space_count how many spaces at
  // the beginning have been seen.  In states PERCENT, IGNORE, KEY,
  // MIDDLE space_count must be 0.
  int space_count = 0;
  int byte_count = 0;		// bytes read
  int key_len = 0;
  int ref_start = -1;		// position of start of current reference
  for (;;) {
    int c = getc(fp);
    if (c == EOF)
      break;
    // We opened the file in binary mode, so we need to skip
    // every CR character before a Newline.
    if (c == '\r') {
      int peek = getc(fp);
      if (peek == '\n') {
	// Count the CR here; the LF that replaces it is counted below.
	byte_count++;
	c = peek;
      }
      else
	ungetc(peek, fp);
    }
#if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
    else if (c == 0x1a)	// ^Z means EOF in text files
      break;
#endif
    byte_count++;
    switch (state) {
    case START:
      // Skip leading blanks and blank lines until a reference begins.
      if (c == ' ' || c == '\t') {
	space_count++;
	break;
      }
      if (c == '\n') {
	space_count = 0;
	break;
      }
      // byte_count already includes c, so this is the offset of the
      // first leading blank on this line (or of c if there were none).
      ref_start = byte_count - space_count - 1;
      space_count = 0;
      if (c == '%')
	state = PERCENT;
      else if (csalnum(c)) {
	state = KEY;
	key_buffer[0] = c;
	key_len = 1;
      }
      else
	state = MIDDLE;
      break;
    case BOL:
      switch (c) {
      case '%':
	// A `%' only introduces a field when it is in column 0; after
	// leading blanks it is treated as ordinary text.
	if (space_count > 0) {
	  space_count = 0;
	  state = MIDDLE;
	}
	else
	  state = PERCENT;
	break;
      case ' ':
      case '\t':
	space_count++;
	break;
      case '\n':
	// Blank line: the reference ends just before this line, so the
	// newline and any blanks preceding it are excluded.
	store_reference(filename_index, ref_start,
			byte_count - 1 - space_count - ref_start);
	state = START;
	space_count = 0;
	break;
      default:
	space_count = 0;
	if (csalnum(c)) {
	  state = KEY;
	  key_buffer[0] = c;
	  key_len = 1;
	}
	else
	  state = MIDDLE;
      }
      break;
    case PERCENT:
      // c is the field letter following `%'.
      // NOTE(review): strchr() also matches the terminating NUL, so a
      // NUL byte after `%' would select IGNORE -- confirm whether that
      // matters for real input.
      if (strchr(ignore_fields, c) != 0)
	state = IGNORE;
      else if (c == '\n')
	state = BOL;
      else
	state = MIDDLE;
      break;
    case IGNORE:
      // Skip the rest of the line of an ignored field.
      if (c == '\n')
	state = IGNORE_BOL;
      break;
    case IGNORE_BOL:
      // Start of a line while ignoring a field: the field continues
      // unless a new field or a blank line begins here.
      switch (c) {
      case '%':
	if (space_count > 0) {
	  state = IGNORE;
	  space_count = 0;
	}
	else
	  state = PERCENT;
	break;
      case ' ':
      case '\t':
	space_count++;
	break;
      case '\n':
	store_reference(filename_index, ref_start,
			byte_count - 1 - space_count - ref_start);
	state = START;
	space_count = 0;
	break;
      default:
	space_count = 0;
	state = IGNORE;
      }
      break;
    case KEY:
      if (csalnum(c)) {
	// Keep at most truncate_len bytes; the first byte beyond that
	// switches to DISCARD, which skips the rest of the word.
	if (key_len < truncate_len)
	  key_buffer[key_len++] = c;
	else
	  state = DISCARD;
      }
      else {
	possibly_store_key(key_buffer, key_len);
	key_len = 0;
	if (c == '\n')
	  state = BOL;
	else
	  state = MIDDLE;
      }
      break;
    case DISCARD:
      // The truncated key is stored once the word ends.
      if (!csalnum(c)) {
	possibly_store_key(key_buffer, key_len);
	key_len = 0;
	if (c == '\n')
	  state = BOL;
	else
	  state = MIDDLE;
      }
      break;
    case MIDDLE:
      if (csalnum(c)) {
	state = KEY;
	key_buffer[0] = c;
	key_len = 1;
      }
      else if (c == '\n')
	state = BOL;
      break;
    default:
      assert(0);
    }
  }
  // End of input: flush a pending key and close an unfinished reference.
  switch (state) {
  case START:
    break;
  case DISCARD:
  case KEY:
    possibly_store_key(key_buffer, key_len);
    // fall through
  case BOL:
  case PERCENT:
  case IGNORE_BOL:
  case IGNORE:
  case MIDDLE:
    // Trailing blanks on the last line are not part of the reference.
    store_reference(filename_index, ref_start,
		    byte_count - ref_start - space_count);
    break;
  default:
    assert(0);
  }
  fclose(fp);
  return 1;
}

static void store_reference(int filename_index, int pos, int len)
{
  tag t;
  t.filename_index = filename_index;
  t.start = pos;
  t.length = len;
  fwrite_or_die(&t, sizeof(t), 1, indxfp);
  ntags++;
}

// Append FN and its terminating NUL to the `filenames' string table.
static void store_filename(const char *fn)
{
  const char terminator = '\0';

  filenames += fn;
  filenames += terminator;
}

static void init_hash_table()
{
  hash_table = new table_entry[hash_table_size];
  for (int i = 0; i < hash_table_size; i++)
    hash_table[i].ptr = 0;
}

// Offer the key S[0..LEN-1] to store_key(), but only while fewer than
// max_keys_per_item keys have been accepted for the current reference.
// The per-reference counter restarts whenever ntags has changed since
// the previous call.
static void possibly_store_key(char *s, int len)
{
  static int counted_for_tag = -1;	// value of ntags the counter belongs to
  static int accepted;			// keys accepted for that tag

  if (counted_for_tag != ntags) {
    counted_for_tag = ntags;
    accepted = 0;
  }
  if (accepted >= max_keys_per_item)
    return;
  if (store_key(s, len))
    accepted++;
}

static int store_key(char *s, int len)
{
  if (len < shortest_len)
    return 0;
  int is_number = 1;
  for (int i = 0; i < len; i++)
    if (!csdigit(s[i])) {
      is_number = 0;
      s[i] = cmlower(s[i]);
    }
  if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
    return 0;
  int h = hash(s, len) % hash_table_size;
  if (common_words_table) {
    for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
      if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
	return 0;
  }
  table_entry *pp =  hash_table + h;
  if (!pp->ptr)
    pp->ptr = new block;
  else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
    return 1;
  else if (pp->ptr->used >= BLOCK_SIZE)
    pp->ptr = new block(pp->ptr);
  pp->ptr->v[(pp->ptr->used)++] = ntags;
  return 1;
}

static void write_hash_table()
{
  const int minus_one = -1;
  int li = 0;
  for (int i = 0; i < hash_table_size; i++) {
    block *ptr = hash_table[i].ptr;
    if (!ptr)
      hash_table[i].count = -1;
    else {
      hash_table[i].count = li;
      block *rev = 0;
      while (ptr) {
	block *tem = ptr;
	ptr = ptr->next;
	tem->next = rev;
	rev = tem;
      }
      while (rev) {
	fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
	li += rev->used;
	block *tem = rev;
	rev = rev->next;
	delete tem;
      }
      fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
      li += 1;
    }
  }
  if (sizeof(table_entry) == sizeof(int))
    fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
  else {
    // write it out word by word
    for (int i = 0; i < hash_table_size; i++)
      fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
  }
  fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
  if (fseek(indxfp, 0, 0) < 0)
    fatal("error seeking on index file: %1", strerror(errno));
  index_header h;
  h.magic = INDEX_MAGIC;
  h.version = INDEX_VERSION;
  h.tags_size = ntags;
  h.lists_size = li;
  h.table_size = hash_table_size;
  h.strings_size = filenames.length();
  h.truncate = truncate_len;
  h.shortest = shortest_len;
  h.common = n_ignore_words;
  fwrite_or_die(&h, sizeof(h), 1, indxfp);
}

// Write NITEMS objects of SIZE bytes each from PTR to FP; a short
// write is a fatal error.
static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
{
  const size_t written = fwrite(ptr, size, nitems, fp);
  if (written == (size_t)nitems)
    return;
  fatal("fwrite failed: %1", strerror(errno));
}

// Remove the temporary index file (if any) and terminate the program
// with exit status 3.
void fatal_error_exit()
{
  const int fatal_exit_status = 3;

  cleanup();
  exit(fatal_exit_status);
}

extern "C" {

// Delete the temporary index file if one is currently recorded in
// temp_index_file; otherwise do nothing.
void cleanup()
{
  if (temp_index_file == 0)
    return;
  unlink(temp_index_file);
}

}