xref: /netbsd-src/external/gpl2/groff/dist/src/preproc/refer/ref.cpp (revision 89a07cf815a29524268025a1139fac4c5190f765)
1 /*	$NetBSD: ref.cpp,v 1.1.1.1 2016/01/13 18:41:49 christos Exp $	*/
2 
3 // -*- C++ -*-
4 /* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2003
5    Free Software Foundation, Inc.
6 Written by James Clark (jjc@jclark.com)
7 
8 This file is part of groff.
9 
10 groff is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free
12 Software Foundation; either version 2, or (at your option) any later
13 version.
14 
15 groff is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
18 for more details.
19 
20 You should have received a copy of the GNU General Public License along
21 with groff; see the file COPYING.  If not, write to the Free Software
22 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
23 
24 #include "refer.h"
25 #include "refid.h"
26 #include "ref.h"
27 #include "token.h"
28 
// Forward declarations of file-local helpers that are defined further
// down in this file.
static const char *find_day(const char *, const char *, const char **);
static int find_month(const char *start, const char *end);
static void abbreviate_names(string &);

// Default list of articles; the individual words are separated by NUL
// bytes.
#define DEFAULT_ARTICLES "the\000a\000an"

// Constructed with sizeof rather than strlen so that the embedded NULs
// and the literal's terminating NUL are all part of the string;
// sortify_title() walks this list one NUL-terminated word at a time.
string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));

// Multiple occurrences of fields are separated by FIELD_SEPARATOR.
const char FIELD_SEPARATOR = '\0';

// Field letters whose repeated occurrences are accumulated (joined with
// FIELD_SEPARATOR) instead of replacing the earlier value.
const char MULTI_FIELD_NAMES[] = "AE";
// Field letters that are tried, in this order, when looking for authors.
const char *AUTHOR_FIELDS = "AQ";

// Classifications returned by reference::classify(); each value is also
// the index of its name in reference_types[].
enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };

const char *reference_types[] = {
  "other",
  "journal-article",
  "book",
  "article-in-book",
  "tech-report",
  "bell-tm",
};

// Scratch strings indexed by field letter.  The constructor and merge()
// collect field text here and then move it into the `field' array.
static string temp_fields[256];
55 
// Build a reference from the LEN bytes of raw text at START.  The text
// is a sequence of fields, each introduced by `%' followed by the field
// letter.  If RIDP is non-null the reference id is copied from it.  A
// null START or a non-positive LEN produces a reference with no fields.
//
// Field text is first collected in the file-scope temp_fields[] array
// (indexed by field letter) and then moved into the compact `field'
// array; field_index[] maps a letter to its slot in `field'.
reference::reference(const char *start, int len, reference_id *ridp)
: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
  computed_authors(0), last_needed_author(-1), nauthors(-1)
{
  int i;
  for (i = 0; i < 256; i++)
    field_index[i] = NULL_FIELD_INDEX;
  if (ridp)
    rid = *ridp;
  if (start == 0)
    return;
  if (len <= 0)
    return;
  const char *end = start + len;
  const char *ptr = start;
  assert(*ptr == '%');
  // Invariant: at the top of each iteration ptr is at a `%' that starts
  // a field (every branch below stops at end or at a line starting
  // with `%').
  while (ptr < end) {
    // Case 1: the annotation field (`%X' with X == annotation_field) or
    // a `%%X' field whose letter is not in discard_fields.  The text is
    // copied verbatim, newlines included, and appended to whatever was
    // already collected for that letter.
    if (ptr + 1 < end && ptr[1] != '\0'
	&& ((ptr[1] != '%' && ptr[1] == annotation_field)
	    || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
		&& discard_fields.search(ptr[2]) < 0))) {
      // For `%%X' step over the first `%' so that ptr[1] is the letter.
      if (ptr[1] == '%')
	ptr++;
      string &f = temp_fields[(unsigned char)ptr[1]];
      ptr += 2;
      while (ptr < end && csspace(*ptr))
	ptr++;
      // Outer loop: one pass per line, until the next line starts with
      // `%' or the input ends.
      for (;;) {
	// Inner loop: copy one line including its newline; a newline is
	// supplied if the input ends without one.
	for (;;) {
	  if (ptr >= end) {
	    f += '\n';
	    break;
	  }
	  f += *ptr;
	  if (*ptr++ == '\n')
	    break;
	}
	if (ptr >= end || *ptr == '%')
	  break;
      }
    }
    // Case 2: an ordinary `%X' field whose letter is not in
    // discard_fields.
    else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
	     && discard_fields.search(ptr[1]) < 0) {
      string &f = temp_fields[(unsigned char)ptr[1]];
      // A repeated field is appended after a FIELD_SEPARATOR if its
      // letter is in MULTI_FIELD_NAMES; otherwise the later occurrence
      // replaces the earlier one.
      if (f.length() > 0) {
	if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
	  f += FIELD_SEPARATOR;
	else
	  f.clear();
      }
      ptr += 2;
      if (ptr < end) {
	// Only a single space after the field letter is skipped.
	if (*ptr == ' ')
	  ptr++;
	// One pass per line; continuation lines are joined with a
	// single space.
	for (;;) {
	  const char *p = ptr;
	  while (ptr < end && *ptr != '\n')
	    ptr++;
	  // strip trailing white space
	  const char *q = ptr;
	  while (q > p && q[-1] != '\n' && csspace(q[-1]))
	    q--;
	  while (p < q)
	    f += *p++;
	  if (ptr >= end)
	    break;
	  // Step over the newline.
	  ptr++;
	  if (ptr >= end)
	    break;
	  if (*ptr == '%')
	    break;
	  f += ' ';
	}
      }
    }
    // Case 3: anything else (discarded letter, NUL letter, truncated
    // input) is skipped up to the next line that starts with `%'.
    else {
      // skip this field
      for (;;) {
	while (ptr < end && *ptr++ != '\n')
	  ;
	if (ptr >= end || *ptr == '%')
	  break;
      }
    }
  }
  // Count the non-empty fields, then move them into a compact array in
  // ascending order of field letter.
  for (i = 0; i < 256; i++)
    if (temp_fields[i].length() > 0)
      nfields++;
  field = new string[nfields];
  int j = 0;
  for (i = 0; i < 256; i++)
    if (temp_fields[i].length() > 0) {
      field[j].move(temp_fields[i]);
      // Letters listed in abbreviate_fields get their names abbreviated.
      if (abbreviate_fields.search(i) >= 0)
	abbreviate_names(field[j]);
      field_index[i] = j;
      j++;
    }
}
155 
~reference()156 reference::~reference()
157 {
158   if (nfields > 0)
159     ad_delete(nfields) field;
160 }
161 
162 // ref is the inline, this is the database ref
163 
merge(reference & ref)164 void reference::merge(reference &ref)
165 {
166   int i;
167   for (i = 0; i < 256; i++)
168     if (field_index[i] != NULL_FIELD_INDEX)
169       temp_fields[i].move(field[field_index[i]]);
170   for (i = 0; i < 256; i++)
171     if (ref.field_index[i] != NULL_FIELD_INDEX)
172       temp_fields[i].move(ref.field[ref.field_index[i]]);
173   for (i = 0; i < 256; i++)
174     field_index[i] = NULL_FIELD_INDEX;
175   int old_nfields = nfields;
176   nfields = 0;
177   for (i = 0; i < 256; i++)
178     if (temp_fields[i].length() > 0)
179       nfields++;
180   if (nfields != old_nfields) {
181     if (old_nfields > 0)
182       ad_delete(old_nfields) field;
183     field = new string[nfields];
184   }
185   int j = 0;
186   for (i = 0; i < 256; i++)
187     if (temp_fields[i].length() > 0) {
188       field[j].move(temp_fields[i]);
189       field_index[i] = j;
190       j++;
191     }
192   merged = 1;
193 }
194 
insert_field(unsigned char c,string & s)195 void reference::insert_field(unsigned char c, string &s)
196 {
197   assert(s.length() > 0);
198   if (field_index[c] != NULL_FIELD_INDEX) {
199     field[field_index[c]].move(s);
200     return;
201   }
202   assert(field_index[c] == NULL_FIELD_INDEX);
203   string *old_field = field;
204   field = new string[nfields + 1];
205   int pos = 0;
206   int i;
207   for (i = 0; i < int(c); i++)
208     if (field_index[i] != NULL_FIELD_INDEX)
209       pos++;
210   for (i = 0; i < pos; i++)
211     field[i].move(old_field[i]);
212   field[pos].move(s);
213   for (i = pos; i < nfields; i++)
214     field[i + 1].move(old_field[i]);
215   if (nfields > 0)
216     ad_delete(nfields) old_field;
217   nfields++;
218   field_index[c] = pos;
219   for (i = c + 1; i < 256; i++)
220     if (field_index[i] != NULL_FIELD_INDEX)
221       field_index[i] += 1;
222 }
223 
delete_field(unsigned char c)224 void reference::delete_field(unsigned char c)
225 {
226   if (field_index[c] == NULL_FIELD_INDEX)
227     return;
228   string *old_field = field;
229   field = new string[nfields - 1];
230   int i;
231   for (i = 0; i < int(field_index[c]); i++)
232     field[i].move(old_field[i]);
233   for (i = field_index[c]; i < nfields - 1; i++)
234     field[i].move(old_field[i + 1]);
235   if (nfields > 0)
236     ad_delete(nfields) old_field;
237   nfields--;
238   field_index[c] = NULL_FIELD_INDEX;
239   for (i = c + 1; i < 256; i++)
240     if (field_index[i] != NULL_FIELD_INDEX)
241       field_index[i] -= 1;
242 }
243 
compute_hash_code()244 void reference::compute_hash_code()
245 {
246   if (!rid.is_null())
247     h = rid.hash();
248   else {
249     h = 0;
250     for (int i = 0; i < nfields; i++)
251       if (field[i].length() > 0) {
252 	h <<= 4;
253 	h ^= hash_string(field[i].contents(), field[i].length());
254       }
255   }
256 }
257 
set_number(int n)258 void reference::set_number(int n)
259 {
260   no = n;
261 }
262 
// Separator bytes embedded in sort keys.
// SORT_SEP precedes the key of each sort field (see compute_sort_key).
const char SORT_SEP = '\001';
// SORT_SUB_SEP separates multiple occurrences of one field (see
// sortify_field).
const char SORT_SUB_SEP = '\002';
// SORT_SUB_SUB_SEP separates the parts of a name (see sortify_name).
const char SORT_SUB_SUB_SEP = '\003';
266 
267 // sep specifies additional word separators
268 
sortify_words(const char * s,const char * end,const char * sep,string & result)269 void sortify_words(const char *s, const char *end, const char *sep,
270 		   string &result)
271 {
272   int non_empty = 0;
273   int need_separator = 0;
274   for (;;) {
275     const char *token_start = s;
276     if (!get_token(&s, end))
277       break;
278     if ((s - token_start == 1
279 	 && (*token_start == ' '
280 	     || *token_start == '\n'
281 	     || (sep && *token_start != '\0'
282 		 && strchr(sep, *token_start) != 0)))
283 	|| (s - token_start == 2
284 	    && token_start[0] == '\\' && token_start[1] == ' ')) {
285       if (non_empty)
286 	need_separator = 1;
287     }
288     else {
289       const token_info *ti = lookup_token(token_start, s);
290       if (ti->sortify_non_empty(token_start, s)) {
291 	if (need_separator) {
292 	  result += ' ';
293 	  need_separator = 0;
294 	}
295 	ti->sortify(token_start, s, result);
296 	non_empty = 1;
297       }
298     }
299   }
300 }
301 
sortify_word(const char * s,const char * end,string & result)302 void sortify_word(const char *s, const char *end, string &result)
303 {
304   for (;;) {
305     const char *token_start = s;
306     if (!get_token(&s, end))
307       break;
308     const token_info *ti = lookup_token(token_start, s);
309     ti->sortify(token_start, s, result);
310   }
311 }
312 
sortify_other(const char * s,int len,string & key)313 void sortify_other(const char *s, int len, string &key)
314 {
315   sortify_words(s, s + len, 0, key);
316 }
317 
sortify_title(const char * s,int len,string & key)318 void sortify_title(const char *s, int len, string &key)
319 {
320   const char *end = s + len;
321   for (; s < end && (*s == ' ' || *s == '\n'); s++)
322     ;
323   const char *ptr = s;
324   for (;;) {
325     const char *token_start = ptr;
326     if (!get_token(&ptr, end))
327       break;
328     if (ptr - token_start == 1
329 	&& (*token_start == ' ' || *token_start == '\n'))
330       break;
331   }
332   if (ptr < end) {
333     unsigned int first_word_len = ptr - s - 1;
334     const char *ae = articles.contents() + articles.length();
335     for (const char *a = articles.contents();
336 	 a < ae;
337 	 a = strchr(a, '\0') + 1)
338       if (first_word_len == strlen(a)) {
339 	unsigned int j;
340 	for (j = 0; j < first_word_len; j++)
341 	  if (a[j] != cmlower(s[j]))
342 	    break;
343 	if (j >= first_word_len) {
344 	  s = ptr;
345 	  for (; s < end && (*s == ' ' || *s == '\n'); s++)
346 	    ;
347 	  break;
348 	}
349       }
350   }
351   sortify_words(s, end, 0, key);
352 }
353 
sortify_name(const char * s,int len,string & key)354 void sortify_name(const char *s, int len, string &key)
355 {
356   const char *last_name_end;
357   const char *last_name = find_last_name(s, s + len, &last_name_end);
358   sortify_word(last_name, last_name_end, key);
359   key += SORT_SUB_SUB_SEP;
360   if (last_name > s)
361     sortify_words(s, last_name, ".", key);
362   key += SORT_SUB_SUB_SEP;
363   if (last_name_end < s + len)
364     sortify_words(last_name_end, s + len, ".,", key);
365 }
366 
sortify_date(const char * s,int len,string & key)367 void sortify_date(const char *s, int len, string &key)
368 {
369   const char *year_end;
370   const char *year_start = find_year(s, s + len, &year_end);
371   if (!year_start) {
372     // Things without years are often `forthcoming', so it makes sense
373     // that they sort after things with explicit years.
374     key += 'A';
375     sortify_words(s, s + len, 0, key);
376     return;
377   }
378   int n = year_end - year_start;
379   while (n < 4) {
380     key += '0';
381     n++;
382   }
383   while (year_start < year_end)
384     key += *year_start++;
385   int m = find_month(s, s + len);
386   if (m < 0)
387     return;
388   key += 'A' + m;
389   const char *day_end;
390   const char *day_start = find_day(s, s + len, &day_end);
391   if (!day_start)
392     return;
393   if (day_end - day_start == 1)
394     key += '0';
395   while (day_start < day_end)
396     key += *day_start++;
397 }
398 
399 // SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
400 
sortify_label(const char * s,int len,string & key)401 void sortify_label(const char *s, int len, string &key)
402 {
403   const char *end = s + len;
404   for (;;) {
405     const char *ptr;
406     for (ptr = s;
407 	 ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
408 	 ptr++)
409       ;
410     if (ptr > s)
411       sortify_words(s, ptr, 0, key);
412     s = ptr;
413     if (s >= end)
414       break;
415     key += *s++;
416   }
417 }
418 
compute_sort_key()419 void reference::compute_sort_key()
420 {
421   if (sort_fields.length() == 0)
422     return;
423   sort_fields += '\0';
424   const char *sf = sort_fields.contents();
425   while (*sf != '\0') {
426     sort_key += SORT_SEP;
427     char f = *sf++;
428     int n = 1;
429     if (*sf == '+') {
430       n = INT_MAX;
431       sf++;
432     }
433     else if (csdigit(*sf)) {
434       char *ptr;
435       long l = strtol(sf, &ptr, 10);
436       if (l == 0 && ptr == sf)
437 	;
438       else {
439 	sf = ptr;
440 	if (l < 0) {
441 	  n = 1;
442 	}
443 	else {
444 	  n = int(l);
445 	}
446       }
447     }
448     if (f == '.')
449       sortify_label(label.contents(), label.length(), sort_key);
450     else if (f == AUTHOR_FIELDS[0])
451       sortify_authors(n, sort_key);
452     else
453       sortify_field(f, n, sort_key);
454   }
455   sort_fields.set_length(sort_fields.length() - 1);
456 }
457 
sortify_authors(int n,string & result) const458 void reference::sortify_authors(int n, string &result) const
459 {
460   for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
461     if (contains_field(*p)) {
462       sortify_field(*p, n, result);
463       return;
464     }
465   sortify_field(AUTHOR_FIELDS[0], n, result);
466 }
467 
canonicalize_authors(string & result) const468 void reference::canonicalize_authors(string &result) const
469 {
470   int len = result.length();
471   sortify_authors(INT_MAX, result);
472   if (result.length() > len)
473     result += SORT_SUB_SEP;
474 }
475 
sortify_field(unsigned char f,int n,string & result) const476 void reference::sortify_field(unsigned char f, int n, string &result) const
477 {
478   typedef void (*sortify_t)(const char *, int, string &);
479   sortify_t sortifier = sortify_other;
480   switch (f) {
481   case 'A':
482   case 'E':
483     sortifier = sortify_name;
484     break;
485   case 'D':
486     sortifier = sortify_date;
487     break;
488   case 'B':
489   case 'J':
490   case 'T':
491     sortifier = sortify_title;
492     break;
493   }
494   int fi = field_index[(unsigned char)f];
495   if (fi != NULL_FIELD_INDEX) {
496     string &str = field[fi];
497     const char *start = str.contents();
498     const char *end = start + str.length();
499     for (int i = 0; i < n && start < end; i++) {
500       const char *p = start;
501       while (start < end && *start != FIELD_SEPARATOR)
502 	start++;
503       if (i > 0)
504 	result += SORT_SUB_SEP;
505       (*sortifier)(p, start - p, result);
506       if (start < end)
507 	start++;
508     }
509   }
510 }
511 
compare_reference(const reference & r1,const reference & r2)512 int compare_reference(const reference &r1, const reference &r2)
513 {
514   assert(r1.no >= 0);
515   assert(r2.no >= 0);
516   const char *s1 = r1.sort_key.contents();
517   int n1 = r1.sort_key.length();
518   const char *s2 = r2.sort_key.contents();
519   int n2 = r2.sort_key.length();
520   for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
521     if (*s1 != *s2)
522       return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
523   if (n2 > 0)
524     return -1;
525   if (n1 > 0)
526     return 1;
527   return r1.no - r2.no;
528 }
529 
same_reference(const reference & r1,const reference & r2)530 int same_reference(const reference &r1, const reference &r2)
531 {
532   if (!r1.rid.is_null() && r1.rid == r2.rid)
533     return 1;
534   if (r1.h != r2.h)
535     return 0;
536   if (r1.nfields != r2.nfields)
537     return 0;
538   int i = 0;
539   for (i = 0; i < 256; i++)
540     if (r1.field_index != r2.field_index)
541       return 0;
542   for (i = 0; i < r1.nfields; i++)
543     if (r1.field[i] != r2.field[i])
544       return 0;
545   return 1;
546 }
547 
find_last_name(const char * start,const char * end,const char ** endp)548 const char *find_last_name(const char *start, const char *end,
549 			   const char **endp)
550 {
551   const char *ptr = start;
552   const char *last_word = start;
553   for (;;) {
554     const char *token_start = ptr;
555     if (!get_token(&ptr, end))
556       break;
557     if (ptr - token_start == 1) {
558       if (*token_start == ',') {
559 	*endp = token_start;
560 	return last_word;
561       }
562       else if (*token_start == ' ' || *token_start == '\n') {
563 	if (ptr < end && *ptr != ' ' && *ptr != '\n')
564 	  last_word = ptr;
565       }
566     }
567   }
568   *endp = end;
569   return last_word;
570 }
571 
// Append to RESULT an abbreviated form of the name [PTR, END).
//
// Only the part before the last name (as located by find_last_name) is
// abbreviated; everything from the start of the last name to END is
// appended unchanged.  In the leading part each upper-case token is
// kept and the tokens following it, up to the next space, are dropped,
// except as described in the inner loop below.  A period string
// (period_before_initial, period_before_other, period_before_hyphen or
// period_before_last_name) is inserted after each abbreviated word
// depending on what follows it.
void abbreviate_name(const char *ptr, const char *end, string &result)
{
  const char *last_name_end;
  const char *last_name_start = find_last_name(ptr, end, &last_name_end);
  // Set after a word has been abbreviated; the period string is chosen
  // once the next non-space token is known.
  int need_period = 0;
  for (;;) {
    const char *token_start = ptr;
    if (!get_token(&ptr, last_name_start))
      break;
    const token_info *ti = lookup_token(token_start, ptr);
    if (need_period) {
      // Spaces (plain or `\ ') after an abbreviated word are dropped.
      if ((ptr - token_start == 1 && *token_start == ' ')
	  || (ptr - token_start == 2 && token_start[0] == '\\'
	      && token_start[1] == ' '))
	continue;
      if (ti->is_upper())
	result += period_before_initial;
      else
	result += period_before_other;
      need_period = 0;
    }
    result.append(token_start, ptr - token_start);
    if (ti->is_upper()) {
      // An upper-case token starts a word to abbreviate.  Consume the
      // rest of the word (up to a space or the start of the last name).
      // lower_ptr marks the start of the text skipped so far.
      const char *lower_ptr = ptr;
      int first_token = 1;
      for (;;) {
	token_start = ptr;
	if (!get_token(&ptr, last_name_start))
	  break;
	if ((ptr - token_start == 1 && *token_start == ' ')
	    || (ptr - token_start == 2 && token_start[0] == '\\'
		&& token_start[1] == ' '))
	  break;
	ti = lookup_token(token_start, ptr);
	if (ti->is_hyphen()) {
	  // A hyphen followed by an upper-case token: keep the hyphen
	  // and that token, preceded by period_before_hyphen.
	  const char *ptr1 = ptr;
	  if (get_token(&ptr1, last_name_start)) {
	    ti = lookup_token(ptr, ptr1);
	    if (ti->is_upper()) {
	      result += period_before_hyphen;
	      result.append(token_start, ptr1 - token_start);
	      ptr = ptr1;
	    }
	  }
	}
	else if (ti->is_upper()) {
	  // MacDougal -> MacD.
	  // Keep the skipped text together with this upper-case token.
	  result.append(lower_ptr, ptr - lower_ptr);
	  lower_ptr = ptr;
	  first_token = 1;
	}
	else if (first_token && ti->is_accent()) {
	  // An accent directly after the initial is kept with it.
	  result.append(token_start, ptr - token_start);
	  lower_ptr = ptr;
	}
	// NOTE(review): this also overrides the `first_token = 1' set in
	// the upper-case branch above, so first_token is only ever true
	// on the first iteration -- confirm whether that is intended.
	first_token = 0;
      }
      need_period = 1;
    }
  }
  if (need_period)
    result += period_before_last_name;
  result.append(last_name_start, end - last_name_start);
}
636 
abbreviate_names(string & result)637 static void abbreviate_names(string &result)
638 {
639   string str;
640   str.move(result);
641   const char *ptr = str.contents();
642   const char *end = ptr + str.length();
643   while (ptr < end) {
644     const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
645     if (name_end == 0)
646       name_end = end;
647     abbreviate_name(ptr, name_end, result);
648     if (name_end >= end)
649       break;
650     ptr = name_end + 1;
651     result += FIELD_SEPARATOR;
652   }
653 }
654 
reverse_name(const char * ptr,const char * name_end,string & result)655 void reverse_name(const char *ptr, const char *name_end, string &result)
656 {
657   const char *last_name_end;
658   const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
659   result.append(last_name_start, last_name_end - last_name_start);
660   while (last_name_start > ptr
661 	 && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
662     last_name_start--;
663   if (last_name_start > ptr) {
664     result += ", ";
665     result.append(ptr, last_name_start - ptr);
666   }
667   if (last_name_end < name_end)
668     result.append(last_name_end, name_end - last_name_end);
669 }
670 
reverse_names(string & result,int n)671 void reverse_names(string &result, int n)
672 {
673   if (n <= 0)
674     return;
675   string str;
676   str.move(result);
677   const char *ptr = str.contents();
678   const char *end = ptr + str.length();
679   while (ptr < end) {
680     if (--n < 0) {
681       result.append(ptr, end - ptr);
682       break;
683     }
684     const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
685     if (name_end == 0)
686       name_end = end;
687     reverse_name(ptr, name_end, result);
688     if (name_end >= end)
689       break;
690     ptr = name_end + 1;
691     result += FIELD_SEPARATOR;
692   }
693 }
694 
695 // Return number of field separators.
696 
join_fields(string & f)697 int join_fields(string &f)
698 {
699   const char *ptr = f.contents();
700   int len = f.length();
701   int nfield_seps = 0;
702   int j;
703   for (j = 0; j < len; j++)
704     if (ptr[j] == FIELD_SEPARATOR)
705       nfield_seps++;
706   if (nfield_seps == 0)
707     return 0;
708   string temp;
709   int field_seps_left = nfield_seps;
710   for (j = 0; j < len; j++) {
711     if (ptr[j] == FIELD_SEPARATOR) {
712       if (nfield_seps == 1)
713 	temp += join_authors_exactly_two;
714       else if (--field_seps_left == 0)
715 	temp += join_authors_last_two;
716       else
717 	temp += join_authors_default;
718     }
719     else
720       temp += ptr[j];
721   }
722   f = temp;
723   return nfield_seps;
724 }
725 
uppercase(const char * start,const char * end,string & result)726 void uppercase(const char *start, const char *end, string &result)
727 {
728   for (;;) {
729     const char *token_start = start;
730     if (!get_token(&start, end))
731       break;
732     const token_info *ti = lookup_token(token_start, start);
733     ti->upper_case(token_start, start, result);
734   }
735 }
736 
lowercase(const char * start,const char * end,string & result)737 void lowercase(const char *start, const char *end, string &result)
738 {
739   for (;;) {
740     const char *token_start = start;
741     if (!get_token(&start, end))
742       break;
743     const token_info *ti = lookup_token(token_start, start);
744     ti->lower_case(token_start, start, result);
745   }
746 }
747 
capitalize(const char * ptr,const char * end,string & result)748 void capitalize(const char *ptr, const char *end, string &result)
749 {
750   int in_small_point_size = 0;
751   for (;;) {
752     const char *start = ptr;
753     if (!get_token(&ptr, end))
754       break;
755     const token_info *ti = lookup_token(start, ptr);
756     const char *char_end = ptr;
757     int is_lower = ti->is_lower();
758     if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
759       const token_info *ti2 = lookup_token(char_end, ptr);
760       if (!ti2->is_accent())
761 	ptr = char_end;
762     }
763     if (is_lower) {
764       if (!in_small_point_size) {
765 	result += "\\s-2";
766 	in_small_point_size = 1;
767       }
768       ti->upper_case(start, char_end, result);
769       result.append(char_end, ptr - char_end);
770     }
771     else {
772       if (in_small_point_size) {
773 	result += "\\s+2";
774 	in_small_point_size = 0;
775       }
776       result.append(start, ptr - start);
777     }
778   }
779   if (in_small_point_size)
780     result += "\\s+2";
781 }
782 
capitalize_field(string & str)783 void capitalize_field(string &str)
784 {
785   string temp;
786   capitalize(str.contents(), str.contents() + str.length(), temp);
787   str.move(temp);
788 }
789 
is_terminated(const char * ptr,const char * end)790 int is_terminated(const char *ptr, const char *end)
791 {
792   const char *last_token = end;
793   for (;;) {
794     const char *p = ptr;
795     if (!get_token(&ptr, end))
796       break;
797     last_token = p;
798   }
799   return end - last_token == 1
800     && (*last_token == '.' || *last_token == '!' || *last_token == '?');
801 }
802 
output(FILE * fp)803 void reference::output(FILE *fp)
804 {
805   fputs(".]-\n", fp);
806   for (int i = 0; i < 256; i++)
807     if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
808       string &f = field[field_index[i]];
809       if (!csdigit(i)) {
810 	int j = reverse_fields.search(i);
811 	if (j >= 0) {
812 	  int n;
813 	  int len = reverse_fields.length();
814 	  if (++j < len && csdigit(reverse_fields[j])) {
815 	    n = reverse_fields[j] - '0';
816 	    for (++j; j < len && csdigit(reverse_fields[j]); j++)
817 	      // should check for overflow
818 	      n = n*10 + reverse_fields[j] - '0';
819 	  }
820 	  else
821 	    n = INT_MAX;
822 	  reverse_names(f, n);
823 	}
824       }
825       int is_multiple = join_fields(f) > 0;
826       if (capitalize_fields.search(i) >= 0)
827 	capitalize_field(f);
828       if (memchr(f.contents(), '\n', f.length()) == 0) {
829 	fprintf(fp, ".ds [%c ", i);
830 	if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
831 	  putc('"', fp);
832 	put_string(f, fp);
833 	putc('\n', fp);
834       }
835       else {
836 	fprintf(fp, ".de [%c\n", i);
837 	put_string(f, fp);
838 	fputs("..\n", fp);
839       }
840       if (i == 'P') {
841 	int multiple_pages = 0;
842 	const char *s = f.contents();
843 	const char *end = f.contents() + f.length();
844 	for (;;) {
845 	  const char *token_start = s;
846 	  if (!get_token(&s, end))
847 	    break;
848 	  const token_info *ti = lookup_token(token_start, s);
849 	  if (ti->is_hyphen() || ti->is_range_sep()) {
850 	    multiple_pages = 1;
851 	    break;
852 	  }
853 	}
854 	fprintf(fp, ".nr [P %d\n", multiple_pages);
855       }
856       else if (i == 'E')
857 	fprintf(fp, ".nr [E %d\n", is_multiple);
858     }
859   for (const char *p = "TAO"; *p; p++) {
860     int fi = field_index[(unsigned char)*p];
861     if (fi != NULL_FIELD_INDEX) {
862       string &f = field[fi];
863       fprintf(fp, ".nr [%c %d\n", *p,
864 	      is_terminated(f.contents(), f.contents() + f.length()));
865     }
866   }
867   int t = classify();
868   fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
869   if (annotation_macro.length() > 0 && annotation_field >= 0
870       && field_index[annotation_field] != NULL_FIELD_INDEX) {
871     putc('.', fp);
872     put_string(annotation_macro, fp);
873     putc('\n', fp);
874     put_string(field[field_index[annotation_field]], fp);
875   }
876 }
877 
print_sort_key_comment(FILE * fp)878 void reference::print_sort_key_comment(FILE *fp)
879 {
880   fputs(".\\\"", fp);
881   put_string(sort_key, fp);
882   putc('\n', fp);
883 }
884 
find_year(const char * start,const char * end,const char ** endp)885 const char *find_year(const char *start, const char *end, const char **endp)
886 {
887   for (;;) {
888     while (start < end && !csdigit(*start))
889       start++;
890     const char *ptr = start;
891     if (start == end)
892       break;
893     while (ptr < end && csdigit(*ptr))
894       ptr++;
895     if (ptr - start == 4 || ptr - start == 3
896 	|| (ptr - start == 2
897 	    && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
898       *endp = ptr;
899       return start;
900     }
901     start = ptr;
902   }
903   return 0;
904 }
905 
find_day(const char * start,const char * end,const char ** endp)906 static const char *find_day(const char *start, const char *end,
907 			    const char **endp)
908 {
909   for (;;) {
910     while (start < end && !csdigit(*start))
911       start++;
912     const char *ptr = start;
913     if (start == end)
914       break;
915     while (ptr < end && csdigit(*ptr))
916       ptr++;
917     if ((ptr - start == 1 && start[0] != '0')
918 	|| (ptr - start == 2 &&
919 	    (start[0] == '1'
920 	     || start[0] == '2'
921 	     || (start[0] == '3' && start[1] <= '1')
922 	     || (start[0] == '0' && start[1] != '0')))) {
923       *endp = ptr;
924       return start;
925     }
926     start = ptr;
927   }
928   return 0;
929 }
930 
find_month(const char * start,const char * end)931 static int find_month(const char *start, const char *end)
932 {
933   static const char *months[] = {
934     "january",
935     "february",
936     "march",
937     "april",
938     "may",
939     "june",
940     "july",
941     "august",
942     "september",
943     "october",
944     "november",
945     "december",
946   };
947   for (;;) {
948     while (start < end && !csalpha(*start))
949       start++;
950     const char *ptr = start;
951     if (start == end)
952       break;
953     while (ptr < end && csalpha(*ptr))
954       ptr++;
955     if (ptr - start >= 3) {
956       for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
957 	const char *q = months[i];
958 	const char *p = start;
959 	for (; p < ptr; p++, q++)
960 	  if (cmlower(*p) != *q)
961 	    break;
962 	if (p >= ptr)
963 	  return i;
964       }
965     }
966     start = ptr;
967   }
968   return -1;
969 }
970 
contains_field(char c) const971 int reference::contains_field(char c) const
972 {
973   return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
974 }
975 
classify()976 int reference::classify()
977 {
978   if (contains_field('J'))
979     return JOURNAL_ARTICLE;
980   if (contains_field('B'))
981     return ARTICLE_IN_BOOK;
982   if (contains_field('G'))
983     return TECH_REPORT;
984   if (contains_field('R'))
985     return TECH_REPORT;
986   if (contains_field('I'))
987     return BOOK;
988   if (contains_field('M'))
989     return BELL_TM;
990   return OTHER;
991 }
992 
get_year(const char ** endp) const993 const char *reference::get_year(const char **endp) const
994 {
995   if (field_index['D'] != NULL_FIELD_INDEX) {
996     string &date = field[field_index['D']];
997     const char *start = date.contents();
998     const char *end = start + date.length();
999     return find_year(start, end, endp);
1000   }
1001   else
1002     return 0;
1003 }
1004 
get_field(unsigned char c,const char ** endp) const1005 const char *reference::get_field(unsigned char c, const char **endp) const
1006 {
1007   if (field_index[c] != NULL_FIELD_INDEX) {
1008     string &f = field[field_index[c]];
1009     const char *start = f.contents();
1010     *endp = start + f.length();
1011     return start;
1012   }
1013   else
1014     return 0;
1015 }
1016 
get_date(const char ** endp) const1017 const char *reference::get_date(const char **endp) const
1018 {
1019   return get_field('D', endp);
1020 }
1021 
nth_field(int i,const char * start,const char ** endp)1022 const char *nth_field(int i, const char *start, const char **endp)
1023 {
1024   while (--i >= 0) {
1025     start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1026     if (!start)
1027       return 0;
1028     start++;
1029   }
1030   const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1031   if (e)
1032     *endp = e;
1033   return start;
1034 }
1035 
get_author(int i,const char ** endp) const1036 const char *reference::get_author(int i, const char **endp) const
1037 {
1038   for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1039     const char *start = get_field(*f, endp);
1040     if (start) {
1041       if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1042 	return nth_field(i, start, endp);
1043       else if (i == 0)
1044 	return start;
1045       else
1046 	return 0;
1047     }
1048   }
1049   return 0;
1050 }
1051 
get_author_last_name(int i,const char ** endp) const1052 const char *reference::get_author_last_name(int i, const char **endp) const
1053 {
1054   for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1055     const char *start = get_field(*f, endp);
1056     if (start) {
1057       if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1058 	start = nth_field(i, start, endp);
1059 	if (!start)
1060 	  return 0;
1061       }
1062       if (*f == 'A')
1063 	return find_last_name(start, *endp, endp);
1064       else
1065 	return start;
1066     }
1067   }
1068   return 0;
1069 }
1070 
set_date(string & d)1071 void reference::set_date(string &d)
1072 {
1073   if (d.length() == 0)
1074     delete_field('D');
1075   else
1076     insert_field('D', d);
1077 }
1078 
same_year(const reference & r1,const reference & r2)1079 int same_year(const reference &r1, const reference &r2)
1080 {
1081   const char *ye1;
1082   const char *ys1 = r1.get_year(&ye1);
1083   const char *ye2;
1084   const char *ys2 = r2.get_year(&ye2);
1085   if (ys1 == 0) {
1086     if (ys2 == 0)
1087       return same_date(r1, r2);
1088     else
1089       return 0;
1090   }
1091   else if (ys2 == 0)
1092     return 0;
1093   else if (ye1 - ys1 != ye2 - ys2)
1094     return 0;
1095   else
1096     return memcmp(ys1, ys2, ye1 - ys1) == 0;
1097 }
1098 
same_date(const reference & r1,const reference & r2)1099 int same_date(const reference &r1, const reference &r2)
1100 {
1101   const char *e1;
1102   const char *s1 = r1.get_date(&e1);
1103   const char *e2;
1104   const char *s2 = r2.get_date(&e2);
1105   if (s1 == 0)
1106     return s2 == 0;
1107   else if (s2 == 0)
1108     return 0;
1109   else if (e1 - s1 != e2 - s2)
1110     return 0;
1111   else
1112     return memcmp(s1, s2, e1 - s1) == 0;
1113 }
1114 
get_sort_field(int i,int si,int ssi,const char ** endp) const1115 const char *reference::get_sort_field(int i, int si, int ssi,
1116 				      const char **endp) const
1117 {
1118   const char *start = sort_key.contents();
1119   const char *end = start + sort_key.length();
1120   if (i < 0) {
1121     *endp = end;
1122     return start;
1123   }
1124   while (--i >= 0) {
1125     start = (char *)memchr(start, SORT_SEP, end - start);
1126     if (!start)
1127       return 0;
1128     start++;
1129   }
1130   const char *e = (char *)memchr(start, SORT_SEP, end - start);
1131   if (e)
1132     end = e;
1133   if (si < 0) {
1134     *endp = end;
1135     return start;
1136   }
1137   while (--si >= 0) {
1138     start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1139     if (!start)
1140       return 0;
1141     start++;
1142   }
1143   e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1144   if (e)
1145     end = e;
1146   if (ssi < 0) {
1147     *endp = end;
1148     return start;
1149   }
1150   while (--ssi >= 0) {
1151     start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1152     if (!start)
1153       return 0;
1154     start++;
1155   }
1156   e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1157   if (e)
1158     end = e;
1159   *endp = end;
1160   return start;
1161 }
1162 
1163