1// -*- C++ -*-
2/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2003
3   Free Software Foundation, Inc.
4Written by James Clark (jjc@jclark.com)
5
6This file is part of groff.
7
8groff is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 2, or (at your option) any later
11version.
12
13groff is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License along
19with groff; see the file COPYING.  If not, write to the Free Software
20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21
22#include "refer.h"
23#include "refid.h"
24#include "ref.h"
25#include "token.h"
26
// Forward declarations of file-local helpers that are defined further
// down in this file.
static const char *find_day(const char *, const char *, const char **);
static int find_month(const char *start, const char *end);
static void abbreviate_names(string &);

// Initial contents of `articles': a list of words, each terminated by
// a NUL character.
#define DEFAULT_ARTICLES "the\000a\000an"

// The length is given by sizeof, so the terminating NUL of the string
// literal is part of the contents; sortify_title() relies on every
// entry being NUL-terminated when it walks the list.
string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));

// Multiple occurrences of fields are separated by FIELD_SEPARATOR.
const char FIELD_SEPARATOR = '\0';

// Field letters whose repeated occurrences are accumulated (joined with
// FIELD_SEPARATOR) by the reference constructor; for any other letter a
// later occurrence replaces the earlier one.
const char MULTI_FIELD_NAMES[] = "AE";
// Field letters that are tried, in this order, when author information
// is looked up (see sortify_authors, get_author).
const char *AUTHOR_FIELDS = "AQ";

// Classification codes returned by reference::classify().  The values
// are used as indices into reference_types[].
enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };

// Names written by reference::output() next to the numeric type; the
// order matches the enum above.
const char *reference_types[] = {
  "other",
  "journal-article",
  "book",
  "article-in-book",
  "tech-report",
  "bell-tm",
};

// Scratch table indexed by field letter.  The constructor and merge()
// collect field text here before moving it into a reference's own
// field array.
static string temp_fields[256];
53
// Build a reference from the text START[0..LEN), which consists of
// fields that each begin with '%' followed by a field letter.  If RIDP
// is non-null its value is copied into rid.  When START is null or LEN
// is not positive the reference is left without any fields.
//
// Field text is first collected in temp_fields[] (indexed by field
// letter) and then moved into a freshly allocated field array ordered
// by letter; field_index[] maps each letter to its slot.
reference::reference(const char *start, int len, reference_id *ridp)
: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
  computed_authors(0), last_needed_author(-1), nauthors(-1)
{
  int i;
  for (i = 0; i < 256; i++)
    field_index[i] = NULL_FIELD_INDEX;
  if (ridp)
    rid = *ridp;
  if (start == 0)
    return;
  if (len <= 0)
    return;
  const char *end = start + len;
  const char *ptr = start;
  assert(*ptr == '%');
  while (ptr < end) {
    // Case 1: the annotation field ("%X" with X == annotation_field),
    // or a "%%X" field whose letter X is not being discarded.  The text
    // is kept verbatim, newlines included.
    if (ptr + 1 < end && ptr[1] != '\0'
	&& ((ptr[1] != '%' && ptr[1] == annotation_field)
	    || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
		&& discard_fields.search(ptr[2]) < 0))) {
      // For "%%X" step over the first '%' so that ptr[1] is the letter.
      if (ptr[1] == '%')
	ptr++;
      string &f = temp_fields[(unsigned char)ptr[1]];
      ptr += 2;
      while (ptr < end && csspace(*ptr))
	ptr++;
      // Copy whole lines until the input ends or a line starts with '%'.
      for (;;) {
	for (;;) {
	  if (ptr >= end) {
	    // Input ended in the middle of a line: supply the newline.
	    f += '\n';
	    break;
	  }
	  f += *ptr;
	  if (*ptr++ == '\n')
	    break;
	}
	if (ptr >= end || *ptr == '%')
	  break;
      }
    }
    // Case 2: an ordinary "%X" field whose letter is not discarded.
    else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
	     && discard_fields.search(ptr[1]) < 0) {
      string &f = temp_fields[(unsigned char)ptr[1]];
      if (f.length() > 0) {
	// The letter was seen before: accumulate for multi-valued
	// fields, otherwise let the new occurrence replace the old one.
	if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
	  f += FIELD_SEPARATOR;
	else
	  f.clear();
      }
      ptr += 2;
      if (ptr < end) {
	// Skip a single space after the field letter.
	if (*ptr == ' ')
	  ptr++;
	// Continuation lines are joined with a single space.
	for (;;) {
	  const char *p = ptr;
	  while (ptr < end && *ptr != '\n')
	    ptr++;
	  // strip trailing white space
	  const char *q = ptr;
	  while (q > p && q[-1] != '\n' && csspace(q[-1]))
	    q--;
	  while (p < q)
	    f += *p++;
	  if (ptr >= end)
	    break;
	  // Step over the newline.
	  ptr++;
	  if (ptr >= end)
	    break;
	  // A line starting with '%' begins the next field.
	  if (*ptr == '%')
	    break;
	  f += ' ';
	}
      }
    }
    else {
      // skip this field
      for (;;) {
	while (ptr < end && *ptr++ != '\n')
	  ;
	if (ptr >= end || *ptr == '%')
	  break;
      }
    }
  }
  // Count the letters that received text and allocate one slot each.
  for (i = 0; i < 256; i++)
    if (temp_fields[i].length() > 0)
      nfields++;
  field = new string[nfields];
  int j = 0;
  // Move the collected text into the field array in letter order,
  // abbreviating names for letters listed in abbreviate_fields.
  for (i = 0; i < 256; i++)
    if (temp_fields[i].length() > 0) {
      field[j].move(temp_fields[i]);
      if (abbreviate_fields.search(i) >= 0)
	abbreviate_names(field[j]);
      field_index[i] = j;
      j++;
    }
}
153
154reference::~reference()
155{
156  if (nfields > 0)
157    ad_delete(nfields) field;
158}
159
160// ref is the inline, this is the database ref
161
162void reference::merge(reference &ref)
163{
164  int i;
165  for (i = 0; i < 256; i++)
166    if (field_index[i] != NULL_FIELD_INDEX)
167      temp_fields[i].move(field[field_index[i]]);
168  for (i = 0; i < 256; i++)
169    if (ref.field_index[i] != NULL_FIELD_INDEX)
170      temp_fields[i].move(ref.field[ref.field_index[i]]);
171  for (i = 0; i < 256; i++)
172    field_index[i] = NULL_FIELD_INDEX;
173  int old_nfields = nfields;
174  nfields = 0;
175  for (i = 0; i < 256; i++)
176    if (temp_fields[i].length() > 0)
177      nfields++;
178  if (nfields != old_nfields) {
179    if (old_nfields > 0)
180      ad_delete(old_nfields) field;
181    field = new string[nfields];
182  }
183  int j = 0;
184  for (i = 0; i < 256; i++)
185    if (temp_fields[i].length() > 0) {
186      field[j].move(temp_fields[i]);
187      field_index[i] = j;
188      j++;
189    }
190  merged = 1;
191}
192
193void reference::insert_field(unsigned char c, string &s)
194{
195  assert(s.length() > 0);
196  if (field_index[c] != NULL_FIELD_INDEX) {
197    field[field_index[c]].move(s);
198    return;
199  }
200  assert(field_index[c] == NULL_FIELD_INDEX);
201  string *old_field = field;
202  field = new string[nfields + 1];
203  int pos = 0;
204  int i;
205  for (i = 0; i < int(c); i++)
206    if (field_index[i] != NULL_FIELD_INDEX)
207      pos++;
208  for (i = 0; i < pos; i++)
209    field[i].move(old_field[i]);
210  field[pos].move(s);
211  for (i = pos; i < nfields; i++)
212    field[i + 1].move(old_field[i]);
213  if (nfields > 0)
214    ad_delete(nfields) old_field;
215  nfields++;
216  field_index[c] = pos;
217  for (i = c + 1; i < 256; i++)
218    if (field_index[i] != NULL_FIELD_INDEX)
219      field_index[i] += 1;
220}
221
222void reference::delete_field(unsigned char c)
223{
224  if (field_index[c] == NULL_FIELD_INDEX)
225    return;
226  string *old_field = field;
227  field = new string[nfields - 1];
228  int i;
229  for (i = 0; i < int(field_index[c]); i++)
230    field[i].move(old_field[i]);
231  for (i = field_index[c]; i < nfields - 1; i++)
232    field[i].move(old_field[i + 1]);
233  if (nfields > 0)
234    ad_delete(nfields) old_field;
235  nfields--;
236  field_index[c] = NULL_FIELD_INDEX;
237  for (i = c + 1; i < 256; i++)
238    if (field_index[i] != NULL_FIELD_INDEX)
239      field_index[i] -= 1;
240}
241
242void reference::compute_hash_code()
243{
244  if (!rid.is_null())
245    h = rid.hash();
246  else {
247    h = 0;
248    for (int i = 0; i < nfields; i++)
249      if (field[i].length() > 0) {
250	h <<= 4;
251	h ^= hash_string(field[i].contents(), field[i].length());
252      }
253  }
254}
255
256void reference::set_number(int n)
257{
258  no = n;
259}
260
// Separator characters used inside a sort key.
// SORT_SEP is put in front of the part contributed by each sort field
// (see reference::compute_sort_key).
const char SORT_SEP = '\001';
// SORT_SUB_SEP separates the occurrences of a multi-valued field (see
// reference::sortify_field).
const char SORT_SUB_SEP = '\002';
// SORT_SUB_SUB_SEP separates the parts of a name (see sortify_name).
const char SORT_SUB_SUB_SEP = '\003';
264
265// sep specifies additional word separators
266
267void sortify_words(const char *s, const char *end, const char *sep,
268		   string &result)
269{
270  int non_empty = 0;
271  int need_separator = 0;
272  for (;;) {
273    const char *token_start = s;
274    if (!get_token(&s, end))
275      break;
276    if ((s - token_start == 1
277	 && (*token_start == ' '
278	     || *token_start == '\n'
279	     || (sep && *token_start != '\0'
280		 && strchr(sep, *token_start) != 0)))
281	|| (s - token_start == 2
282	    && token_start[0] == '\\' && token_start[1] == ' ')) {
283      if (non_empty)
284	need_separator = 1;
285    }
286    else {
287      const token_info *ti = lookup_token(token_start, s);
288      if (ti->sortify_non_empty(token_start, s)) {
289	if (need_separator) {
290	  result += ' ';
291	  need_separator = 0;
292	}
293	ti->sortify(token_start, s, result);
294	non_empty = 1;
295      }
296    }
297  }
298}
299
300void sortify_word(const char *s, const char *end, string &result)
301{
302  for (;;) {
303    const char *token_start = s;
304    if (!get_token(&s, end))
305      break;
306    const token_info *ti = lookup_token(token_start, s);
307    ti->sortify(token_start, s, result);
308  }
309}
310
311void sortify_other(const char *s, int len, string &key)
312{
313  sortify_words(s, s + len, 0, key);
314}
315
316void sortify_title(const char *s, int len, string &key)
317{
318  const char *end = s + len;
319  for (; s < end && (*s == ' ' || *s == '\n'); s++)
320    ;
321  const char *ptr = s;
322  for (;;) {
323    const char *token_start = ptr;
324    if (!get_token(&ptr, end))
325      break;
326    if (ptr - token_start == 1
327	&& (*token_start == ' ' || *token_start == '\n'))
328      break;
329  }
330  if (ptr < end) {
331    unsigned int first_word_len = ptr - s - 1;
332    const char *ae = articles.contents() + articles.length();
333    for (const char *a = articles.contents();
334	 a < ae;
335	 a = strchr(a, '\0') + 1)
336      if (first_word_len == strlen(a)) {
337	unsigned int j;
338	for (j = 0; j < first_word_len; j++)
339	  if (a[j] != cmlower(s[j]))
340	    break;
341	if (j >= first_word_len) {
342	  s = ptr;
343	  for (; s < end && (*s == ' ' || *s == '\n'); s++)
344	    ;
345	  break;
346	}
347      }
348  }
349  sortify_words(s, end, 0, key);
350}
351
352void sortify_name(const char *s, int len, string &key)
353{
354  const char *last_name_end;
355  const char *last_name = find_last_name(s, s + len, &last_name_end);
356  sortify_word(last_name, last_name_end, key);
357  key += SORT_SUB_SUB_SEP;
358  if (last_name > s)
359    sortify_words(s, last_name, ".", key);
360  key += SORT_SUB_SUB_SEP;
361  if (last_name_end < s + len)
362    sortify_words(last_name_end, s + len, ".,", key);
363}
364
365void sortify_date(const char *s, int len, string &key)
366{
367  const char *year_end;
368  const char *year_start = find_year(s, s + len, &year_end);
369  if (!year_start) {
370    // Things without years are often `forthcoming', so it makes sense
371    // that they sort after things with explicit years.
372    key += 'A';
373    sortify_words(s, s + len, 0, key);
374    return;
375  }
376  int n = year_end - year_start;
377  while (n < 4) {
378    key += '0';
379    n++;
380  }
381  while (year_start < year_end)
382    key += *year_start++;
383  int m = find_month(s, s + len);
384  if (m < 0)
385    return;
386  key += 'A' + m;
387  const char *day_end;
388  const char *day_start = find_day(s, s + len, &day_end);
389  if (!day_start)
390    return;
391  if (day_end - day_start == 1)
392    key += '0';
393  while (day_start < day_end)
394    key += *day_start++;
395}
396
397// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
398
399void sortify_label(const char *s, int len, string &key)
400{
401  const char *end = s + len;
402  for (;;) {
403    const char *ptr;
404    for (ptr = s;
405	 ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
406	 ptr++)
407      ;
408    if (ptr > s)
409      sortify_words(s, ptr, 0, key);
410    s = ptr;
411    if (s >= end)
412      break;
413    key += *s++;
414  }
415}
416
417void reference::compute_sort_key()
418{
419  if (sort_fields.length() == 0)
420    return;
421  sort_fields += '\0';
422  const char *sf = sort_fields.contents();
423  while (*sf != '\0') {
424    sort_key += SORT_SEP;
425    char f = *sf++;
426    int n = 1;
427    if (*sf == '+') {
428      n = INT_MAX;
429      sf++;
430    }
431    else if (csdigit(*sf)) {
432      char *ptr;
433      long l = strtol(sf, &ptr, 10);
434      if (l == 0 && ptr == sf)
435	;
436      else {
437	sf = ptr;
438	if (l < 0) {
439	  n = 1;
440	}
441	else {
442	  n = int(l);
443	}
444      }
445    }
446    if (f == '.')
447      sortify_label(label.contents(), label.length(), sort_key);
448    else if (f == AUTHOR_FIELDS[0])
449      sortify_authors(n, sort_key);
450    else
451      sortify_field(f, n, sort_key);
452  }
453  sort_fields.set_length(sort_fields.length() - 1);
454}
455
456void reference::sortify_authors(int n, string &result) const
457{
458  for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
459    if (contains_field(*p)) {
460      sortify_field(*p, n, result);
461      return;
462    }
463  sortify_field(AUTHOR_FIELDS[0], n, result);
464}
465
466void reference::canonicalize_authors(string &result) const
467{
468  int len = result.length();
469  sortify_authors(INT_MAX, result);
470  if (result.length() > len)
471    result += SORT_SUB_SEP;
472}
473
474void reference::sortify_field(unsigned char f, int n, string &result) const
475{
476  typedef void (*sortify_t)(const char *, int, string &);
477  sortify_t sortifier = sortify_other;
478  switch (f) {
479  case 'A':
480  case 'E':
481    sortifier = sortify_name;
482    break;
483  case 'D':
484    sortifier = sortify_date;
485    break;
486  case 'B':
487  case 'J':
488  case 'T':
489    sortifier = sortify_title;
490    break;
491  }
492  int fi = field_index[(unsigned char)f];
493  if (fi != NULL_FIELD_INDEX) {
494    string &str = field[fi];
495    const char *start = str.contents();
496    const char *end = start + str.length();
497    for (int i = 0; i < n && start < end; i++) {
498      const char *p = start;
499      while (start < end && *start != FIELD_SEPARATOR)
500	start++;
501      if (i > 0)
502	result += SORT_SUB_SEP;
503      (*sortifier)(p, start - p, result);
504      if (start < end)
505	start++;
506    }
507  }
508}
509
510int compare_reference(const reference &r1, const reference &r2)
511{
512  assert(r1.no >= 0);
513  assert(r2.no >= 0);
514  const char *s1 = r1.sort_key.contents();
515  int n1 = r1.sort_key.length();
516  const char *s2 = r2.sort_key.contents();
517  int n2 = r2.sort_key.length();
518  for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
519    if (*s1 != *s2)
520      return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
521  if (n2 > 0)
522    return -1;
523  if (n1 > 0)
524    return 1;
525  return r1.no - r2.no;
526}
527
528int same_reference(const reference &r1, const reference &r2)
529{
530  if (!r1.rid.is_null() && r1.rid == r2.rid)
531    return 1;
532  if (r1.h != r2.h)
533    return 0;
534  if (r1.nfields != r2.nfields)
535    return 0;
536  int i = 0;
537  for (i = 0; i < 256; i++)
538    if (r1.field_index != r2.field_index)
539      return 0;
540  for (i = 0; i < r1.nfields; i++)
541    if (r1.field[i] != r2.field[i])
542      return 0;
543  return 1;
544}
545
546const char *find_last_name(const char *start, const char *end,
547			   const char **endp)
548{
549  const char *ptr = start;
550  const char *last_word = start;
551  for (;;) {
552    const char *token_start = ptr;
553    if (!get_token(&ptr, end))
554      break;
555    if (ptr - token_start == 1) {
556      if (*token_start == ',') {
557	*endp = token_start;
558	return last_word;
559      }
560      else if (*token_start == ' ' || *token_start == '\n') {
561	if (ptr < end && *ptr != ' ' && *ptr != '\n')
562	  last_word = ptr;
563      }
564    }
565  }
566  *endp = end;
567  return last_word;
568}
569
// Append an abbreviated form of the name [ptr, end) to result.
//
// The part in front of the last name (as located by find_last_name) is
// scanned token by token.  A token that is not upper case is copied as
// is.  An upper-case token is copied and the tokens after it, up to the
// next space, are dropped, with the exceptions handled in the inner
// loop.  After such an abbreviated word one of the period_before_*
// strings is inserted in front of whatever comes next.  Everything from
// the start of the last name to END is copied unchanged.
void abbreviate_name(const char *ptr, const char *end, string &result)
{
  const char *last_name_end;
  const char *last_name_start = find_last_name(ptr, end, &last_name_end);
  // Set after a word has been reduced to its initial; the period is
  // emitted lazily so that its form can depend on what follows.
  int need_period = 0;
  for (;;) {
    const char *token_start = ptr;
    if (!get_token(&ptr, last_name_start))
      break;
    const token_info *ti = lookup_token(token_start, ptr);
    if (need_period) {
      // Spaces (plain or `\ ') right after an abbreviated word are
      // dropped.
      if ((ptr - token_start == 1 && *token_start == ' ')
	  || (ptr - token_start == 2 && token_start[0] == '\\'
	      && token_start[1] == ' '))
	continue;
      if (ti->is_upper())
	result += period_before_initial;
      else
	result += period_before_other;
      need_period = 0;
    }
    result.append(token_start, ptr - token_start);
    if (ti->is_upper()) {
      // Start of the text that has been skipped but not yet decided on.
      const char *lower_ptr = ptr;
      // True only for the token directly after the upper-case letter.
      int first_token = 1;
      // Consume the rest of the word.
      for (;;) {
	token_start = ptr;
	if (!get_token(&ptr, last_name_start))
	  break;
	if ((ptr - token_start == 1 && *token_start == ' ')
	    || (ptr - token_start == 2 && token_start[0] == '\\'
		&& token_start[1] == ' '))
	  break;
	ti = lookup_token(token_start, ptr);
	if (ti->is_hyphen()) {
	  // A hyphen followed by an upper-case token is kept, preceded
	  // by period_before_hyphen.
	  const char *ptr1 = ptr;
	  if (get_token(&ptr1, last_name_start)) {
	    ti = lookup_token(ptr, ptr1);
	    if (ti->is_upper()) {
	      result += period_before_hyphen;
	      result.append(token_start, ptr1 - token_start);
	      ptr = ptr1;
	    }
	  }
	}
	else if (ti->is_upper()) {
	  // MacDougal -> MacD.
	  result.append(lower_ptr, ptr - lower_ptr);
	  lower_ptr = ptr;
	  first_token = 1;
	}
	else if (first_token && ti->is_accent()) {
	  // An accent directly after the initial is kept with it.
	  result.append(token_start, ptr - token_start);
	  lower_ptr = ptr;
	}
	first_token = 0;
      }
      need_period = 1;
    }
  }
  if (need_period)
    result += period_before_last_name;
  result.append(last_name_start, end - last_name_start);
}
634
635static void abbreviate_names(string &result)
636{
637  string str;
638  str.move(result);
639  const char *ptr = str.contents();
640  const char *end = ptr + str.length();
641  while (ptr < end) {
642    const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
643    if (name_end == 0)
644      name_end = end;
645    abbreviate_name(ptr, name_end, result);
646    if (name_end >= end)
647      break;
648    ptr = name_end + 1;
649    result += FIELD_SEPARATOR;
650  }
651}
652
653void reverse_name(const char *ptr, const char *name_end, string &result)
654{
655  const char *last_name_end;
656  const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
657  result.append(last_name_start, last_name_end - last_name_start);
658  while (last_name_start > ptr
659	 && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
660    last_name_start--;
661  if (last_name_start > ptr) {
662    result += ", ";
663    result.append(ptr, last_name_start - ptr);
664  }
665  if (last_name_end < name_end)
666    result.append(last_name_end, name_end - last_name_end);
667}
668
669void reverse_names(string &result, int n)
670{
671  if (n <= 0)
672    return;
673  string str;
674  str.move(result);
675  const char *ptr = str.contents();
676  const char *end = ptr + str.length();
677  while (ptr < end) {
678    if (--n < 0) {
679      result.append(ptr, end - ptr);
680      break;
681    }
682    const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
683    if (name_end == 0)
684      name_end = end;
685    reverse_name(ptr, name_end, result);
686    if (name_end >= end)
687      break;
688    ptr = name_end + 1;
689    result += FIELD_SEPARATOR;
690  }
691}
692
693// Return number of field separators.
694
695int join_fields(string &f)
696{
697  const char *ptr = f.contents();
698  int len = f.length();
699  int nfield_seps = 0;
700  int j;
701  for (j = 0; j < len; j++)
702    if (ptr[j] == FIELD_SEPARATOR)
703      nfield_seps++;
704  if (nfield_seps == 0)
705    return 0;
706  string temp;
707  int field_seps_left = nfield_seps;
708  for (j = 0; j < len; j++) {
709    if (ptr[j] == FIELD_SEPARATOR) {
710      if (nfield_seps == 1)
711	temp += join_authors_exactly_two;
712      else if (--field_seps_left == 0)
713	temp += join_authors_last_two;
714      else
715	temp += join_authors_default;
716    }
717    else
718      temp += ptr[j];
719  }
720  f = temp;
721  return nfield_seps;
722}
723
724void uppercase(const char *start, const char *end, string &result)
725{
726  for (;;) {
727    const char *token_start = start;
728    if (!get_token(&start, end))
729      break;
730    const token_info *ti = lookup_token(token_start, start);
731    ti->upper_case(token_start, start, result);
732  }
733}
734
735void lowercase(const char *start, const char *end, string &result)
736{
737  for (;;) {
738    const char *token_start = start;
739    if (!get_token(&start, end))
740      break;
741    const token_info *ti = lookup_token(token_start, start);
742    ti->lower_case(token_start, start, result);
743  }
744}
745
746void capitalize(const char *ptr, const char *end, string &result)
747{
748  int in_small_point_size = 0;
749  for (;;) {
750    const char *start = ptr;
751    if (!get_token(&ptr, end))
752      break;
753    const token_info *ti = lookup_token(start, ptr);
754    const char *char_end = ptr;
755    int is_lower = ti->is_lower();
756    if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
757      const token_info *ti2 = lookup_token(char_end, ptr);
758      if (!ti2->is_accent())
759	ptr = char_end;
760    }
761    if (is_lower) {
762      if (!in_small_point_size) {
763	result += "\\s-2";
764	in_small_point_size = 1;
765      }
766      ti->upper_case(start, char_end, result);
767      result.append(char_end, ptr - char_end);
768    }
769    else {
770      if (in_small_point_size) {
771	result += "\\s+2";
772	in_small_point_size = 0;
773      }
774      result.append(start, ptr - start);
775    }
776  }
777  if (in_small_point_size)
778    result += "\\s+2";
779}
780
781void capitalize_field(string &str)
782{
783  string temp;
784  capitalize(str.contents(), str.contents() + str.length(), temp);
785  str.move(temp);
786}
787
788int is_terminated(const char *ptr, const char *end)
789{
790  const char *last_token = end;
791  for (;;) {
792    const char *p = ptr;
793    if (!get_token(&ptr, end))
794      break;
795    last_token = p;
796  }
797  return end - last_token == 1
798    && (*last_token == '.' || *last_token == '!' || *last_token == '?');
799}
800
801void reference::output(FILE *fp)
802{
803  fputs(".]-\n", fp);
804  for (int i = 0; i < 256; i++)
805    if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
806      string &f = field[field_index[i]];
807      if (!csdigit(i)) {
808	int j = reverse_fields.search(i);
809	if (j >= 0) {
810	  int n;
811	  int len = reverse_fields.length();
812	  if (++j < len && csdigit(reverse_fields[j])) {
813	    n = reverse_fields[j] - '0';
814	    for (++j; j < len && csdigit(reverse_fields[j]); j++)
815	      // should check for overflow
816	      n = n*10 + reverse_fields[j] - '0';
817	  }
818	  else
819	    n = INT_MAX;
820	  reverse_names(f, n);
821	}
822      }
823      int is_multiple = join_fields(f) > 0;
824      if (capitalize_fields.search(i) >= 0)
825	capitalize_field(f);
826      if (memchr(f.contents(), '\n', f.length()) == 0) {
827	fprintf(fp, ".ds [%c ", i);
828	if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
829	  putc('"', fp);
830	put_string(f, fp);
831	putc('\n', fp);
832      }
833      else {
834	fprintf(fp, ".de [%c\n", i);
835	put_string(f, fp);
836	fputs("..\n", fp);
837      }
838      if (i == 'P') {
839	int multiple_pages = 0;
840	const char *s = f.contents();
841	const char *end = f.contents() + f.length();
842	for (;;) {
843	  const char *token_start = s;
844	  if (!get_token(&s, end))
845	    break;
846	  const token_info *ti = lookup_token(token_start, s);
847	  if (ti->is_hyphen() || ti->is_range_sep()) {
848	    multiple_pages = 1;
849	    break;
850	  }
851	}
852	fprintf(fp, ".nr [P %d\n", multiple_pages);
853      }
854      else if (i == 'E')
855	fprintf(fp, ".nr [E %d\n", is_multiple);
856    }
857  for (const char *p = "TAO"; *p; p++) {
858    int fi = field_index[(unsigned char)*p];
859    if (fi != NULL_FIELD_INDEX) {
860      string &f = field[fi];
861      fprintf(fp, ".nr [%c %d\n", *p,
862	      is_terminated(f.contents(), f.contents() + f.length()));
863    }
864  }
865  int t = classify();
866  fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
867  if (annotation_macro.length() > 0 && annotation_field >= 0
868      && field_index[annotation_field] != NULL_FIELD_INDEX) {
869    putc('.', fp);
870    put_string(annotation_macro, fp);
871    putc('\n', fp);
872    put_string(field[field_index[annotation_field]], fp);
873  }
874}
875
876void reference::print_sort_key_comment(FILE *fp)
877{
878  fputs(".\\\"", fp);
879  put_string(sort_key, fp);
880  putc('\n', fp);
881}
882
883const char *find_year(const char *start, const char *end, const char **endp)
884{
885  for (;;) {
886    while (start < end && !csdigit(*start))
887      start++;
888    const char *ptr = start;
889    if (start == end)
890      break;
891    while (ptr < end && csdigit(*ptr))
892      ptr++;
893    if (ptr - start == 4 || ptr - start == 3
894	|| (ptr - start == 2
895	    && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
896      *endp = ptr;
897      return start;
898    }
899    start = ptr;
900  }
901  return 0;
902}
903
904static const char *find_day(const char *start, const char *end,
905			    const char **endp)
906{
907  for (;;) {
908    while (start < end && !csdigit(*start))
909      start++;
910    const char *ptr = start;
911    if (start == end)
912      break;
913    while (ptr < end && csdigit(*ptr))
914      ptr++;
915    if ((ptr - start == 1 && start[0] != '0')
916	|| (ptr - start == 2 &&
917	    (start[0] == '1'
918	     || start[0] == '2'
919	     || (start[0] == '3' && start[1] <= '1')
920	     || (start[0] == '0' && start[1] != '0')))) {
921      *endp = ptr;
922      return start;
923    }
924    start = ptr;
925  }
926  return 0;
927}
928
929static int find_month(const char *start, const char *end)
930{
931  static const char *months[] = {
932    "january",
933    "february",
934    "march",
935    "april",
936    "may",
937    "june",
938    "july",
939    "august",
940    "september",
941    "october",
942    "november",
943    "december",
944  };
945  for (;;) {
946    while (start < end && !csalpha(*start))
947      start++;
948    const char *ptr = start;
949    if (start == end)
950      break;
951    while (ptr < end && csalpha(*ptr))
952      ptr++;
953    if (ptr - start >= 3) {
954      for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
955	const char *q = months[i];
956	const char *p = start;
957	for (; p < ptr; p++, q++)
958	  if (cmlower(*p) != *q)
959	    break;
960	if (p >= ptr)
961	  return i;
962      }
963    }
964    start = ptr;
965  }
966  return -1;
967}
968
969int reference::contains_field(char c) const
970{
971  return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
972}
973
974int reference::classify()
975{
976  if (contains_field('J'))
977    return JOURNAL_ARTICLE;
978  if (contains_field('B'))
979    return ARTICLE_IN_BOOK;
980  if (contains_field('G'))
981    return TECH_REPORT;
982  if (contains_field('R'))
983    return TECH_REPORT;
984  if (contains_field('I'))
985    return BOOK;
986  if (contains_field('M'))
987    return BELL_TM;
988  return OTHER;
989}
990
991const char *reference::get_year(const char **endp) const
992{
993  if (field_index['D'] != NULL_FIELD_INDEX) {
994    string &date = field[field_index['D']];
995    const char *start = date.contents();
996    const char *end = start + date.length();
997    return find_year(start, end, endp);
998  }
999  else
1000    return 0;
1001}
1002
1003const char *reference::get_field(unsigned char c, const char **endp) const
1004{
1005  if (field_index[c] != NULL_FIELD_INDEX) {
1006    string &f = field[field_index[c]];
1007    const char *start = f.contents();
1008    *endp = start + f.length();
1009    return start;
1010  }
1011  else
1012    return 0;
1013}
1014
1015const char *reference::get_date(const char **endp) const
1016{
1017  return get_field('D', endp);
1018}
1019
1020const char *nth_field(int i, const char *start, const char **endp)
1021{
1022  while (--i >= 0) {
1023    start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1024    if (!start)
1025      return 0;
1026    start++;
1027  }
1028  const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1029  if (e)
1030    *endp = e;
1031  return start;
1032}
1033
1034const char *reference::get_author(int i, const char **endp) const
1035{
1036  for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1037    const char *start = get_field(*f, endp);
1038    if (start) {
1039      if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1040	return nth_field(i, start, endp);
1041      else if (i == 0)
1042	return start;
1043      else
1044	return 0;
1045    }
1046  }
1047  return 0;
1048}
1049
1050const char *reference::get_author_last_name(int i, const char **endp) const
1051{
1052  for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1053    const char *start = get_field(*f, endp);
1054    if (start) {
1055      if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1056	start = nth_field(i, start, endp);
1057	if (!start)
1058	  return 0;
1059      }
1060      if (*f == 'A')
1061	return find_last_name(start, *endp, endp);
1062      else
1063	return start;
1064    }
1065  }
1066  return 0;
1067}
1068
1069void reference::set_date(string &d)
1070{
1071  if (d.length() == 0)
1072    delete_field('D');
1073  else
1074    insert_field('D', d);
1075}
1076
1077int same_year(const reference &r1, const reference &r2)
1078{
1079  const char *ye1;
1080  const char *ys1 = r1.get_year(&ye1);
1081  const char *ye2;
1082  const char *ys2 = r2.get_year(&ye2);
1083  if (ys1 == 0) {
1084    if (ys2 == 0)
1085      return same_date(r1, r2);
1086    else
1087      return 0;
1088  }
1089  else if (ys2 == 0)
1090    return 0;
1091  else if (ye1 - ys1 != ye2 - ys2)
1092    return 0;
1093  else
1094    return memcmp(ys1, ys2, ye1 - ys1) == 0;
1095}
1096
1097int same_date(const reference &r1, const reference &r2)
1098{
1099  const char *e1;
1100  const char *s1 = r1.get_date(&e1);
1101  const char *e2;
1102  const char *s2 = r2.get_date(&e2);
1103  if (s1 == 0)
1104    return s2 == 0;
1105  else if (s2 == 0)
1106    return 0;
1107  else if (e1 - s1 != e2 - s2)
1108    return 0;
1109  else
1110    return memcmp(s1, s2, e1 - s1) == 0;
1111}
1112
1113const char *reference::get_sort_field(int i, int si, int ssi,
1114				      const char **endp) const
1115{
1116  const char *start = sort_key.contents();
1117  const char *end = start + sort_key.length();
1118  if (i < 0) {
1119    *endp = end;
1120    return start;
1121  }
1122  while (--i >= 0) {
1123    start = (char *)memchr(start, SORT_SEP, end - start);
1124    if (!start)
1125      return 0;
1126    start++;
1127  }
1128  const char *e = (char *)memchr(start, SORT_SEP, end - start);
1129  if (e)
1130    end = e;
1131  if (si < 0) {
1132    *endp = end;
1133    return start;
1134  }
1135  while (--si >= 0) {
1136    start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1137    if (!start)
1138      return 0;
1139    start++;
1140  }
1141  e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1142  if (e)
1143    end = e;
1144  if (ssi < 0) {
1145    *endp = end;
1146    return start;
1147  }
1148  while (--ssi >= 0) {
1149    start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1150    if (!start)
1151      return 0;
1152    start++;
1153  }
1154  e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1155  if (e)
1156    end = e;
1157  *endp = end;
1158  return start;
1159}
1160
1161