ref.cpp revision 114402
1// -*- C++ -*-
2/* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
3Written by James Clark (jjc@jclark.com)
4
5This file is part of groff.
6
7groff is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free
9Software Foundation; either version 2, or (at your option) any later
10version.
11
12groff is distributed in the hope that it will be useful, but WITHOUT ANY
13WARRANTY; without even the implied warranty of MERCHANTABILITY or
14FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15for more details.
16
17You should have received a copy of the GNU General Public License along
18with groff; see the file COPYING.  If not, write to the Free Software
19Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20
21#include "refer.h"
22#include "refid.h"
23#include "ref.h"
24#include "token.h"
25
26static const char *find_day(const char *, const char *, const char **);
27static int find_month(const char *start, const char *end);
28static void abbreviate_names(string &);
29
30#define DEFAULT_ARTICLES "the\000a\000an"
31
32string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));
33
34// Multiple occurrences of fields are separated by FIELD_SEPARATOR.
35const char FIELD_SEPARATOR = '\0';
36
37const char MULTI_FIELD_NAMES[] = "AE";
38const char *AUTHOR_FIELDS = "AQ";
39
40enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };
41
42const char *reference_types[] = {
43  "other",
44  "journal-article",
45  "book",
46  "article-in-book",
47  "tech-report",
48  "bell-tm",
49};
50
51static string temp_fields[256];
52
53reference::reference(const char *start, int len, reference_id *ridp)
54: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
55  computed_authors(0), last_needed_author(-1), nauthors(-1)
56{
57  int i;
58  for (i = 0; i < 256; i++)
59    field_index[i] = NULL_FIELD_INDEX;
60  if (ridp)
61    rid = *ridp;
62  if (start == 0)
63    return;
64  if (len <= 0)
65    return;
66  const char *end = start + len;
67  const char *ptr = start;
68  assert(*ptr == '%');
69  while (ptr < end) {
70    if (ptr + 1 < end && ptr[1] != '\0'
71	&& ((ptr[1] != '%' && ptr[1] == annotation_field)
72	    || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
73		&& discard_fields.search(ptr[2]) < 0))) {
74      if (ptr[1] == '%')
75	ptr++;
76      string &f = temp_fields[(unsigned char)ptr[1]];
77      ptr += 2;
78      while (ptr < end && csspace(*ptr))
79	ptr++;
80      for (;;) {
81	for (;;) {
82	  if (ptr >= end) {
83	    f += '\n';
84	    break;
85	  }
86	  f += *ptr;
87	  if (*ptr++ == '\n')
88	    break;
89	}
90	if (ptr >= end || *ptr == '%')
91	  break;
92      }
93    }
94    else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
95	     && discard_fields.search(ptr[1]) < 0) {
96      string &f = temp_fields[(unsigned char)ptr[1]];
97      if (f.length() > 0) {
98	if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
99	  f += FIELD_SEPARATOR;
100	else
101	  f.clear();
102      }
103      ptr += 2;
104      if (ptr < end) {
105	if (*ptr == ' ')
106	  ptr++;
107	for (;;) {
108	  const char *p = ptr;
109	  while (ptr < end && *ptr != '\n')
110	    ptr++;
111	  // strip trailing white space
112	  const char *q = ptr;
113	  while (q > p && q[-1] != '\n' && csspace(q[-1]))
114	    q--;
115	  while (p < q)
116	    f += *p++;
117	  if (ptr >= end)
118	    break;
119	  ptr++;
120	  if (ptr >= end)
121	    break;
122	  if (*ptr == '%')
123	    break;
124	  f += ' ';
125	}
126      }
127    }
128    else {
129      // skip this field
130      for (;;) {
131	while (ptr < end && *ptr++ != '\n')
132	  ;
133	if (ptr >= end || *ptr == '%')
134	  break;
135      }
136    }
137  }
138  for (i = 0; i < 256; i++)
139    if (temp_fields[i].length() > 0)
140      nfields++;
141  field = new string[nfields];
142  int j = 0;
143  for (i = 0; i < 256; i++)
144    if (temp_fields[i].length() > 0) {
145      field[j].move(temp_fields[i]);
146      if (abbreviate_fields.search(i) >= 0)
147	abbreviate_names(field[j]);
148      field_index[i] = j;
149      j++;
150    }
151}
152
153reference::~reference()
154{
155  if (nfields > 0)
156    ad_delete(nfields) field;
157}
158
159// ref is the inline, this is the database ref
160
161void reference::merge(reference &ref)
162{
163  int i;
164  for (i = 0; i < 256; i++)
165    if (field_index[i] != NULL_FIELD_INDEX)
166      temp_fields[i].move(field[field_index[i]]);
167  for (i = 0; i < 256; i++)
168    if (ref.field_index[i] != NULL_FIELD_INDEX)
169      temp_fields[i].move(ref.field[ref.field_index[i]]);
170  for (i = 0; i < 256; i++)
171    field_index[i] = NULL_FIELD_INDEX;
172  int old_nfields = nfields;
173  nfields = 0;
174  for (i = 0; i < 256; i++)
175    if (temp_fields[i].length() > 0)
176      nfields++;
177  if (nfields != old_nfields) {
178    if (old_nfields > 0)
179      ad_delete(old_nfields) field;
180    field = new string[nfields];
181  }
182  int j = 0;
183  for (i = 0; i < 256; i++)
184    if (temp_fields[i].length() > 0) {
185      field[j].move(temp_fields[i]);
186      field_index[i] = j;
187      j++;
188    }
189  merged = 1;
190}
191
192void reference::insert_field(unsigned char c, string &s)
193{
194  assert(s.length() > 0);
195  if (field_index[c] != NULL_FIELD_INDEX) {
196    field[field_index[c]].move(s);
197    return;
198  }
199  assert(field_index[c] == NULL_FIELD_INDEX);
200  string *old_field = field;
201  field = new string[nfields + 1];
202  int pos = 0;
203  int i;
204  for (i = 0; i < int(c); i++)
205    if (field_index[i] != NULL_FIELD_INDEX)
206      pos++;
207  for (i = 0; i < pos; i++)
208    field[i].move(old_field[i]);
209  field[pos].move(s);
210  for (i = pos; i < nfields; i++)
211    field[i + 1].move(old_field[i]);
212  if (nfields > 0)
213    ad_delete(nfields) old_field;
214  nfields++;
215  field_index[c] = pos;
216  for (i = c + 1; i < 256; i++)
217    if (field_index[i] != NULL_FIELD_INDEX)
218      field_index[i] += 1;
219}
220
221void reference::delete_field(unsigned char c)
222{
223  if (field_index[c] == NULL_FIELD_INDEX)
224    return;
225  string *old_field = field;
226  field = new string[nfields - 1];
227  int i;
228  for (i = 0; i < int(field_index[c]); i++)
229    field[i].move(old_field[i]);
230  for (i = field_index[c]; i < nfields - 1; i++)
231    field[i].move(old_field[i + 1]);
232  if (nfields > 0)
233    ad_delete(nfields) old_field;
234  nfields--;
235  field_index[c] = NULL_FIELD_INDEX;
236  for (i = c + 1; i < 256; i++)
237    if (field_index[i] != NULL_FIELD_INDEX)
238      field_index[i] -= 1;
239}
240
241void reference::compute_hash_code()
242{
243  if (!rid.is_null())
244    h = rid.hash();
245  else {
246    h = 0;
247    for (int i = 0; i < nfields; i++)
248      if (field[i].length() > 0) {
249	h <<= 4;
250	h ^= hash_string(field[i].contents(), field[i].length());
251      }
252  }
253}
254
255void reference::set_number(int n)
256{
257  no = n;
258}
259
// Separators used inside a sort key, from the outermost level inwards.
// Between the keys of successive sort fields:
const char SORT_SEP = '\001';
// Between successive occurrences of one field:
const char SORT_SUB_SEP = '\002';
// Between the parts of a name:
const char SORT_SUB_SUB_SEP = '\003';
263
264// sep specifies additional word separators
265
266void sortify_words(const char *s, const char *end, const char *sep,
267		   string &result)
268{
269  int non_empty = 0;
270  int need_separator = 0;
271  for (;;) {
272    const char *token_start = s;
273    if (!get_token(&s, end))
274      break;
275    if ((s - token_start == 1
276	 && (*token_start == ' '
277	     || *token_start == '\n'
278	     || (sep && *token_start != '\0'
279		 && strchr(sep, *token_start) != 0)))
280	|| (s - token_start == 2
281	    && token_start[0] == '\\' && token_start[1] == ' ')) {
282      if (non_empty)
283	need_separator = 1;
284    }
285    else {
286      const token_info *ti = lookup_token(token_start, s);
287      if (ti->sortify_non_empty(token_start, s)) {
288	if (need_separator) {
289	  result += ' ';
290	  need_separator = 0;
291	}
292	ti->sortify(token_start, s, result);
293	non_empty = 1;
294      }
295    }
296  }
297}
298
299void sortify_word(const char *s, const char *end, string &result)
300{
301  for (;;) {
302    const char *token_start = s;
303    if (!get_token(&s, end))
304      break;
305    const token_info *ti = lookup_token(token_start, s);
306    ti->sortify(token_start, s, result);
307  }
308}
309
310void sortify_other(const char *s, int len, string &key)
311{
312  sortify_words(s, s + len, 0, key);
313}
314
315void sortify_title(const char *s, int len, string &key)
316{
317  const char *end = s + len;
318  for (; s < end && (*s == ' ' || *s == '\n'); s++)
319    ;
320  const char *ptr = s;
321  for (;;) {
322    const char *token_start = ptr;
323    if (!get_token(&ptr, end))
324      break;
325    if (ptr - token_start == 1
326	&& (*token_start == ' ' || *token_start == '\n'))
327      break;
328  }
329  if (ptr < end) {
330    unsigned int first_word_len = ptr - s - 1;
331    const char *ae = articles.contents() + articles.length();
332    for (const char *a = articles.contents();
333	 a < ae;
334	 a = strchr(a, '\0') + 1)
335      if (first_word_len == strlen(a)) {
336	unsigned int j;
337	for (j = 0; j < first_word_len; j++)
338	  if (a[j] != cmlower(s[j]))
339	    break;
340	if (j >= first_word_len) {
341	  s = ptr;
342	  for (; s < end && (*s == ' ' || *s == '\n'); s++)
343	    ;
344	  break;
345	}
346      }
347  }
348  sortify_words(s, end, 0, key);
349}
350
351void sortify_name(const char *s, int len, string &key)
352{
353  const char *last_name_end;
354  const char *last_name = find_last_name(s, s + len, &last_name_end);
355  sortify_word(last_name, last_name_end, key);
356  key += SORT_SUB_SUB_SEP;
357  if (last_name > s)
358    sortify_words(s, last_name, ".", key);
359  key += SORT_SUB_SUB_SEP;
360  if (last_name_end < s + len)
361    sortify_words(last_name_end, s + len, ".,", key);
362}
363
364void sortify_date(const char *s, int len, string &key)
365{
366  const char *year_end;
367  const char *year_start = find_year(s, s + len, &year_end);
368  if (!year_start) {
369    // Things without years are often `forthcoming', so it makes sense
370    // that they sort after things with explicit years.
371    key += 'A';
372    sortify_words(s, s + len, 0, key);
373    return;
374  }
375  int n = year_end - year_start;
376  while (n < 4) {
377    key += '0';
378    n++;
379  }
380  while (year_start < year_end)
381    key += *year_start++;
382  int m = find_month(s, s + len);
383  if (m < 0)
384    return;
385  key += 'A' + m;
386  const char *day_end;
387  const char *day_start = find_day(s, s + len, &day_end);
388  if (!day_start)
389    return;
390  if (day_end - day_start == 1)
391    key += '0';
392  while (day_start < day_end)
393    key += *day_start++;
394}
395
396// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
397
398void sortify_label(const char *s, int len, string &key)
399{
400  const char *end = s + len;
401  for (;;) {
402    const char *ptr;
403    for (ptr = s;
404	 ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
405	 ptr++)
406      ;
407    if (ptr > s)
408      sortify_words(s, ptr, 0, key);
409    s = ptr;
410    if (s >= end)
411      break;
412    key += *s++;
413  }
414}
415
416void reference::compute_sort_key()
417{
418  if (sort_fields.length() == 0)
419    return;
420  sort_fields += '\0';
421  const char *sf = sort_fields.contents();
422  while (*sf != '\0') {
423    if (sf > sort_fields)
424      sort_key += SORT_SEP;
425    char f = *sf++;
426    int n = 1;
427    if (*sf == '+') {
428      n = INT_MAX;
429      sf++;
430    }
431    else if (csdigit(*sf)) {
432      char *ptr;
433      long l = strtol(sf, &ptr, 10);
434      if (l == 0 && ptr == sf)
435	;
436      else {
437	sf = ptr;
438	if (l < 0) {
439	  n = 1;
440	}
441	else {
442	  n = int(l);
443	}
444      }
445    }
446    if (f == '.')
447      sortify_label(label.contents(), label.length(), sort_key);
448    else if (f == AUTHOR_FIELDS[0])
449      sortify_authors(n, sort_key);
450    else
451      sortify_field(f, n, sort_key);
452  }
453  sort_fields.set_length(sort_fields.length() - 1);
454}
455
456void reference::sortify_authors(int n, string &result) const
457{
458  for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
459    if (contains_field(*p)) {
460      sortify_field(*p, n, result);
461      return;
462    }
463  sortify_field(AUTHOR_FIELDS[0], n, result);
464}
465
466void reference::canonicalize_authors(string &result) const
467{
468  int len = result.length();
469  sortify_authors(INT_MAX, result);
470  if (result.length() > len)
471    result += SORT_SUB_SEP;
472}
473
474void reference::sortify_field(unsigned char f, int n, string &result) const
475{
476  typedef void (*sortify_t)(const char *, int, string &);
477  sortify_t sortifier = sortify_other;
478  switch (f) {
479  case 'A':
480  case 'E':
481    sortifier = sortify_name;
482    break;
483  case 'D':
484    sortifier = sortify_date;
485    break;
486  case 'B':
487  case 'J':
488  case 'T':
489    sortifier = sortify_title;
490    break;
491  }
492  int fi = field_index[(unsigned char)f];
493  if (fi != NULL_FIELD_INDEX) {
494    string &str = field[fi];
495    const char *start = str.contents();
496    const char *end = start + str.length();
497    for (int i = 0; i < n && start < end; i++) {
498      const char *p = start;
499      while (start < end && *start != FIELD_SEPARATOR)
500	start++;
501      if (i > 0)
502	result += SORT_SUB_SEP;
503      (*sortifier)(p, start - p, result);
504      if (start < end)
505	start++;
506    }
507  }
508}
509
510int compare_reference(const reference &r1, const reference &r2)
511{
512  assert(r1.no >= 0);
513  assert(r2.no >= 0);
514  const char *s1 = r1.sort_key.contents();
515  int n1 = r1.sort_key.length();
516  const char *s2 = r2.sort_key.contents();
517  int n2 = r2.sort_key.length();
518  for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
519    if (*s1 != *s2)
520      return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
521  if (n2 > 0)
522    return -1;
523  if (n1 > 0)
524    return 1;
525  return r1.no - r2.no;
526}
527
528int same_reference(const reference &r1, const reference &r2)
529{
530  if (!r1.rid.is_null() && r1.rid == r2.rid)
531    return 1;
532  if (r1.h != r2.h)
533    return 0;
534  if (r1.nfields != r2.nfields)
535    return 0;
536  int i = 0;
537  for (i = 0; i < 256; i++)
538    if (r1.field_index != r2.field_index)
539      return 0;
540  for (i = 0; i < r1.nfields; i++)
541    if (r1.field[i] != r2.field[i])
542      return 0;
543  return 1;
544}
545
546const char *find_last_name(const char *start, const char *end,
547			   const char **endp)
548{
549  const char *ptr = start;
550  const char *last_word = start;
551  for (;;) {
552    const char *token_start = ptr;
553    if (!get_token(&ptr, end))
554      break;
555    if (ptr - token_start == 1) {
556      if (*token_start == ',') {
557	*endp = token_start;
558	return last_word;
559      }
560      else if (*token_start == ' ' || *token_start == '\n') {
561	if (ptr < end && *ptr != ' ' && *ptr != '\n')
562	  last_word = ptr;
563      }
564    }
565  }
566  *endp = end;
567  return last_word;
568}
569
570void abbreviate_name(const char *ptr, const char *end, string &result)
571{
572  const char *last_name_end;
573  const char *last_name_start = find_last_name(ptr, end, &last_name_end);
574  int need_period = 0;
575  for (;;) {
576    const char *token_start = ptr;
577    if (!get_token(&ptr, last_name_start))
578      break;
579    const token_info *ti = lookup_token(token_start, ptr);
580    if (need_period) {
581      if ((ptr - token_start == 1 && *token_start == ' ')
582	  || (ptr - token_start == 2 && token_start[0] == '\\'
583	      && token_start[1] == ' '))
584	continue;
585      if (ti->is_upper())
586	result += period_before_initial;
587      else
588	result += period_before_other;
589      need_period = 0;
590    }
591    result.append(token_start, ptr - token_start);
592    if (ti->is_upper()) {
593      const char *lower_ptr = ptr;
594      int first_token = 1;
595      for (;;) {
596	token_start = ptr;
597	if (!get_token(&ptr, last_name_start))
598	  break;
599	if ((ptr - token_start == 1 && *token_start == ' ')
600	    || (ptr - token_start == 2 && token_start[0] == '\\'
601		&& token_start[1] == ' '))
602	  break;
603	ti = lookup_token(token_start, ptr);
604	if (ti->is_hyphen()) {
605	  const char *ptr1 = ptr;
606	  if (get_token(&ptr1, last_name_start)) {
607	    ti = lookup_token(ptr, ptr1);
608	    if (ti->is_upper()) {
609	      result += period_before_hyphen;
610	      result.append(token_start, ptr1 - token_start);
611	      ptr = ptr1;
612	    }
613	  }
614	}
615	else if (ti->is_upper()) {
616	  // MacDougal -> MacD.
617	  result.append(lower_ptr, ptr - lower_ptr);
618	  lower_ptr = ptr;
619	  first_token = 1;
620	}
621	else if (first_token && ti->is_accent()) {
622	  result.append(token_start, ptr - token_start);
623	  lower_ptr = ptr;
624	}
625	first_token = 0;
626      }
627      need_period = 1;
628    }
629  }
630  if (need_period)
631    result += period_before_last_name;
632  result.append(last_name_start, end - last_name_start);
633}
634
635static void abbreviate_names(string &result)
636{
637  string str;
638  str.move(result);
639  const char *ptr = str.contents();
640  const char *end = ptr + str.length();
641  while (ptr < end) {
642    const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
643    if (name_end == 0)
644      name_end = end;
645    abbreviate_name(ptr, name_end, result);
646    if (name_end >= end)
647      break;
648    ptr = name_end + 1;
649    result += FIELD_SEPARATOR;
650  }
651}
652
653void reverse_name(const char *ptr, const char *name_end, string &result)
654{
655  const char *last_name_end;
656  const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
657  result.append(last_name_start, last_name_end - last_name_start);
658  while (last_name_start > ptr
659	 && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
660    last_name_start--;
661  if (last_name_start > ptr) {
662    result += ", ";
663    result.append(ptr, last_name_start - ptr);
664  }
665  if (last_name_end < name_end)
666    result.append(last_name_end, name_end - last_name_end);
667}
668
669void reverse_names(string &result, int n)
670{
671  if (n <= 0)
672    return;
673  string str;
674  str.move(result);
675  const char *ptr = str.contents();
676  const char *end = ptr + str.length();
677  while (ptr < end) {
678    if (--n < 0) {
679      result.append(ptr, end - ptr);
680      break;
681    }
682    const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
683    if (name_end == 0)
684      name_end = end;
685    reverse_name(ptr, name_end, result);
686    if (name_end >= end)
687      break;
688    ptr = name_end + 1;
689    result += FIELD_SEPARATOR;
690  }
691}
692
693// Return number of field separators.
694
695int join_fields(string &f)
696{
697  const char *ptr = f.contents();
698  int len = f.length();
699  int nfield_seps = 0;
700  int j;
701  for (j = 0; j < len; j++)
702    if (ptr[j] == FIELD_SEPARATOR)
703      nfield_seps++;
704  if (nfield_seps == 0)
705    return 0;
706  string temp;
707  int field_seps_left = nfield_seps;
708  for (j = 0; j < len; j++) {
709    if (ptr[j] == FIELD_SEPARATOR) {
710      if (nfield_seps == 1)
711	temp += join_authors_exactly_two;
712      else if (--field_seps_left == 0)
713	temp += join_authors_last_two;
714      else
715	temp += join_authors_default;
716    }
717    else
718      temp += ptr[j];
719  }
720  f = temp;
721  return nfield_seps;
722}
723
724void uppercase(const char *start, const char *end, string &result)
725{
726  for (;;) {
727    const char *token_start = start;
728    if (!get_token(&start, end))
729      break;
730    const token_info *ti = lookup_token(token_start, start);
731    ti->upper_case(token_start, start, result);
732  }
733}
734
735void lowercase(const char *start, const char *end, string &result)
736{
737  for (;;) {
738    const char *token_start = start;
739    if (!get_token(&start, end))
740      break;
741    const token_info *ti = lookup_token(token_start, start);
742    ti->lower_case(token_start, start, result);
743  }
744}
745
746void capitalize(const char *ptr, const char *end, string &result)
747{
748  int in_small_point_size = 0;
749  for (;;) {
750    const char *start = ptr;
751    if (!get_token(&ptr, end))
752      break;
753    const token_info *ti = lookup_token(start, ptr);
754    const char *char_end = ptr;
755    int is_lower = ti->is_lower();
756    if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
757      const token_info *ti2 = lookup_token(char_end, ptr);
758      if (!ti2->is_accent())
759	ptr = char_end;
760    }
761    if (is_lower) {
762      if (!in_small_point_size) {
763	result += "\\s-2";
764	in_small_point_size = 1;
765      }
766      ti->upper_case(start, char_end, result);
767      result.append(char_end, ptr - char_end);
768    }
769    else {
770      if (in_small_point_size) {
771	result += "\\s+2";
772	in_small_point_size = 0;
773      }
774      result.append(start, ptr - start);
775    }
776  }
777  if (in_small_point_size)
778    result += "\\s+2";
779}
780
781void capitalize_field(string &str)
782{
783  string temp;
784  capitalize(str.contents(), str.contents() + str.length(), temp);
785  str.move(temp);
786}
787
788int is_terminated(const char *ptr, const char *end)
789{
790  const char *last_token = end;
791  for (;;) {
792    const char *p = ptr;
793    if (!get_token(&ptr, end))
794      break;
795    last_token = p;
796  }
797  return end - last_token == 1
798    && (*last_token == '.' || *last_token == '!' || *last_token == '?');
799}
800
801void reference::output(FILE *fp)
802{
803  fputs(".]-\n", fp);
804  for (int i = 0; i < 256; i++)
805    if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
806      string &f = field[field_index[i]];
807      if (!csdigit(i)) {
808	int j = reverse_fields.search(i);
809	if (j >= 0) {
810	  int n;
811	  int len = reverse_fields.length();
812	  if (++j < len && csdigit(reverse_fields[j])) {
813	    n = reverse_fields[j] - '0';
814	    for (++j; j < len && csdigit(reverse_fields[j]); j++)
815	      // should check for overflow
816	      n = n*10 + reverse_fields[j] - '0';
817	  }
818	  else
819	    n = INT_MAX;
820	  reverse_names(f, n);
821	}
822      }
823      int is_multiple = join_fields(f) > 0;
824      if (capitalize_fields.search(i) >= 0)
825	capitalize_field(f);
826      if (memchr(f.contents(), '\n', f.length()) == 0) {
827	fprintf(fp, ".ds [%c ", i);
828	if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
829	  putc('"', fp);
830	put_string(f, fp);
831	putc('\n', fp);
832      }
833      else {
834	fprintf(fp, ".de [%c\n", i);
835	put_string(f, fp);
836	fputs("..\n", fp);
837      }
838      if (i == 'P') {
839	int multiple_pages = 0;
840	const char *s = f.contents();
841	const char *end = f.contents() + f.length();
842	for (;;) {
843	  const char *token_start = s;
844	  if (!get_token(&s, end))
845	    break;
846	  const token_info *ti = lookup_token(token_start, s);
847	  if (ti->is_hyphen() || ti->is_range_sep()) {
848	    multiple_pages = 1;
849	    break;
850	  }
851	}
852	fprintf(fp, ".nr [P %d\n", multiple_pages);
853      }
854      else if (i == 'E')
855	fprintf(fp, ".nr [E %d\n", is_multiple);
856    }
857  for (const char *p = "TAO"; *p; p++) {
858    int fi = field_index[(unsigned char)*p];
859    if (fi != NULL_FIELD_INDEX) {
860      string &f = field[fi];
861      fprintf(fp, ".nr [%c %d\n", *p,
862	      is_terminated(f.contents(), f.contents() + f.length()));
863    }
864  }
865  int t = classify();
866  fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
867  if (annotation_macro.length() > 0 && annotation_field >= 0
868      && field_index[annotation_field] != NULL_FIELD_INDEX) {
869    putc('.', fp);
870    put_string(annotation_macro, fp);
871    putc('\n', fp);
872    put_string(field[field_index[annotation_field]], fp);
873  }
874}
875
876void reference::print_sort_key_comment(FILE *fp)
877{
878  fputs(".\\\"", fp);
879  put_string(sort_key, fp);
880  putc('\n', fp);
881}
882
883const char *find_year(const char *start, const char *end, const char **endp)
884{
885  for (;;) {
886    while (start < end && !csdigit(*start))
887      start++;
888    const char *ptr = start;
889    if (start == end)
890      break;
891    while (ptr < end && csdigit(*ptr))
892      ptr++;
893    if (ptr - start == 4 || ptr - start == 3
894	|| (ptr - start == 2
895	    && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
896      *endp = ptr;
897      return start;
898    }
899    start = ptr;
900  }
901  return 0;
902}
903
904static const char *find_day(const char *start, const char *end,
905			    const char **endp)
906{
907  for (;;) {
908    while (start < end && !csdigit(*start))
909      start++;
910    const char *ptr = start;
911    if (start == end)
912      break;
913    while (ptr < end && csdigit(*ptr))
914      ptr++;
915    if ((ptr - start == 1 && start[0] != '0')
916	|| (ptr - start == 2 &&
917	    (start[0] == '1'
918	     || start[0] == '2'
919	     || (start[0] == '3' && start[1] <= '1')
920	     || (start[0] == '0' && start[1] != '0')))) {
921      *endp = ptr;
922      return start;
923    }
924    start = ptr;
925  }
926  return 0;
927}
928
929static int find_month(const char *start, const char *end)
930{
931  static const char *months[] = {
932    "january",
933    "february",
934    "march",
935    "april",
936    "may",
937    "june",
938    "july",
939    "august",
940    "september",
941    "october",
942    "november",
943    "december",
944  };
945  for (;;) {
946    while (start < end && !csalpha(*start))
947      start++;
948    const char *ptr = start;
949    if (start == end)
950      break;
951    while (ptr < end && csalpha(*ptr))
952      ptr++;
953    if (ptr - start >= 3) {
954      for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
955	const char *q = months[i];
956	const char *p = start;
957	for (; p < ptr; p++, q++)
958	  if (cmlower(*p) != *q)
959	    break;
960	if (p >= ptr)
961	  return i;
962      }
963    }
964    start = ptr;
965  }
966  return -1;
967}
968
969int reference::contains_field(char c) const
970{
971  return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
972}
973
974int reference::classify()
975{
976  if (contains_field('J'))
977    return JOURNAL_ARTICLE;
978  if (contains_field('B'))
979    return ARTICLE_IN_BOOK;
980  if (contains_field('G'))
981    return TECH_REPORT;
982  if (contains_field('R'))
983    return TECH_REPORT;
984  if (contains_field('I'))
985    return BOOK;
986  if (contains_field('M'))
987    return BELL_TM;
988  return OTHER;
989}
990
991const char *reference::get_year(const char **endp) const
992{
993  if (field_index['D'] != NULL_FIELD_INDEX) {
994    string &date = field[field_index['D']];
995    const char *start = date.contents();
996    const char *end = start + date.length();
997    return find_year(start, end, endp);
998  }
999  else
1000    return 0;
1001}
1002
1003const char *reference::get_field(unsigned char c, const char **endp) const
1004{
1005  if (field_index[c] != NULL_FIELD_INDEX) {
1006    string &f = field[field_index[c]];
1007    const char *start = f.contents();
1008    *endp = start + f.length();
1009    return start;
1010  }
1011  else
1012    return 0;
1013}
1014
1015const char *reference::get_date(const char **endp) const
1016{
1017  return get_field('D', endp);
1018}
1019
1020const char *nth_field(int i, const char *start, const char **endp)
1021{
1022  while (--i >= 0) {
1023    start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1024    if (!start)
1025      return 0;
1026    start++;
1027  }
1028  const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1029  if (e)
1030    *endp = e;
1031  return start;
1032}
1033
1034const char *reference::get_author(int i, const char **endp) const
1035{
1036  for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1037    const char *start = get_field(*f, endp);
1038    if (start) {
1039      if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1040	return nth_field(i, start, endp);
1041      else if (i == 0)
1042	return start;
1043      else
1044	return 0;
1045    }
1046  }
1047  return 0;
1048}
1049
1050const char *reference::get_author_last_name(int i, const char **endp) const
1051{
1052  for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1053    const char *start = get_field(*f, endp);
1054    if (start) {
1055      if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1056	start = nth_field(i, start, endp);
1057	if (!start)
1058	  return 0;
1059      }
1060      if (*f == 'A')
1061	return find_last_name(start, *endp, endp);
1062      else
1063	return start;
1064    }
1065  }
1066  return 0;
1067}
1068
1069void reference::set_date(string &d)
1070{
1071  if (d.length() == 0)
1072    delete_field('D');
1073  else
1074    insert_field('D', d);
1075}
1076
1077int same_year(const reference &r1, const reference &r2)
1078{
1079  const char *ye1;
1080  const char *ys1 = r1.get_year(&ye1);
1081  const char *ye2;
1082  const char *ys2 = r2.get_year(&ye2);
1083  if (ys1 == 0) {
1084    if (ys2 == 0)
1085      return same_date(r1, r2);
1086    else
1087      return 0;
1088  }
1089  else if (ys2 == 0)
1090    return 0;
1091  else if (ye1 - ys1 != ye2 - ys2)
1092    return 0;
1093  else
1094    return memcmp(ys1, ys2, ye1 - ys1) == 0;
1095}
1096
1097int same_date(const reference &r1, const reference &r2)
1098{
1099  const char *e1;
1100  const char *s1 = r1.get_date(&e1);
1101  const char *e2;
1102  const char *s2 = r2.get_date(&e2);
1103  if (s1 == 0)
1104    return s2 == 0;
1105  else if (s2 == 0)
1106    return 0;
1107  else if (e1 - s1 != e2 - s2)
1108    return 0;
1109  else
1110    return memcmp(s1, s2, e1 - s1) == 0;
1111}
1112
1113const char *reference::get_sort_field(int i, int si, int ssi,
1114				      const char **endp) const
1115{
1116  const char *start = sort_key.contents();
1117  const char *end = start + sort_key.length();
1118  if (i < 0) {
1119    *endp = end;
1120    return start;
1121  }
1122  while (--i >= 0) {
1123    start = (char *)memchr(start, SORT_SEP, end - start);
1124    if (!start)
1125      return 0;
1126    start++;
1127  }
1128  const char *e = (char *)memchr(start, SORT_SEP, end - start);
1129  if (e)
1130    end = e;
1131  if (si < 0) {
1132    *endp = end;
1133    return start;
1134  }
1135  while (--si >= 0) {
1136    start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1137    if (!start)
1138      return 0;
1139    start++;
1140  }
1141  e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1142  if (e)
1143    end = e;
1144  if (ssi < 0) {
1145    *endp = end;
1146    return start;
1147  }
1148  while (--ssi >= 0) {
1149    start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1150    if (!start)
1151      return 0;
1152    start++;
1153  }
1154  e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1155  if (e)
1156    end = e;
1157  *endp = end;
1158  return start;
1159}
1160
1161