1114402Sru// -*- C++ -*-
2151497Sru/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2003
3151497Sru   Free Software Foundation, Inc.
4114402SruWritten by James Clark (jjc@jclark.com)
5114402Sru
6114402SruThis file is part of groff.
7114402Sru
8114402Srugroff is free software; you can redistribute it and/or modify it under
9114402Sruthe terms of the GNU General Public License as published by the Free
10114402SruSoftware Foundation; either version 2, or (at your option) any later
11114402Sruversion.
12114402Sru
13114402Srugroff is distributed in the hope that it will be useful, but WITHOUT ANY
14114402SruWARRANTY; without even the implied warranty of MERCHANTABILITY or
15114402SruFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16114402Srufor more details.
17114402Sru
18114402SruYou should have received a copy of the GNU General Public License along
19114402Sruwith groff; see the file COPYING.  If not, write to the Free Software
20151497SruFoundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21114402Sru
22114402Sru#include "refer.h"
23114402Sru#include "refid.h"
24114402Sru#include "ref.h"
25114402Sru#include "token.h"
26114402Sru
27114402Srustatic const char *find_day(const char *, const char *, const char **);
28114402Srustatic int find_month(const char *start, const char *end);
29114402Srustatic void abbreviate_names(string &);
30114402Sru
31114402Sru#define DEFAULT_ARTICLES "the\000a\000an"
32114402Sru
33114402Srustring articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));
34114402Sru
35114402Sru// Multiple occurrences of fields are separated by FIELD_SEPARATOR.
36114402Sruconst char FIELD_SEPARATOR = '\0';
37114402Sru
38114402Sruconst char MULTI_FIELD_NAMES[] = "AE";
39114402Sruconst char *AUTHOR_FIELDS = "AQ";
40114402Sru
41114402Sruenum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };
42114402Sru
43114402Sruconst char *reference_types[] = {
44114402Sru  "other",
45114402Sru  "journal-article",
46114402Sru  "book",
47114402Sru  "article-in-book",
48114402Sru  "tech-report",
49114402Sru  "bell-tm",
50114402Sru};
51114402Sru
52114402Srustatic string temp_fields[256];
53114402Sru
54114402Srureference::reference(const char *start, int len, reference_id *ridp)
55114402Sru: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
56114402Sru  computed_authors(0), last_needed_author(-1), nauthors(-1)
57114402Sru{
58114402Sru  int i;
59114402Sru  for (i = 0; i < 256; i++)
60114402Sru    field_index[i] = NULL_FIELD_INDEX;
61114402Sru  if (ridp)
62114402Sru    rid = *ridp;
63114402Sru  if (start == 0)
64114402Sru    return;
65114402Sru  if (len <= 0)
66114402Sru    return;
67114402Sru  const char *end = start + len;
68114402Sru  const char *ptr = start;
69114402Sru  assert(*ptr == '%');
70114402Sru  while (ptr < end) {
71114402Sru    if (ptr + 1 < end && ptr[1] != '\0'
72114402Sru	&& ((ptr[1] != '%' && ptr[1] == annotation_field)
73114402Sru	    || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
74114402Sru		&& discard_fields.search(ptr[2]) < 0))) {
75114402Sru      if (ptr[1] == '%')
76114402Sru	ptr++;
77114402Sru      string &f = temp_fields[(unsigned char)ptr[1]];
78114402Sru      ptr += 2;
79114402Sru      while (ptr < end && csspace(*ptr))
80114402Sru	ptr++;
81114402Sru      for (;;) {
82114402Sru	for (;;) {
83114402Sru	  if (ptr >= end) {
84114402Sru	    f += '\n';
85114402Sru	    break;
86114402Sru	  }
87114402Sru	  f += *ptr;
88114402Sru	  if (*ptr++ == '\n')
89114402Sru	    break;
90114402Sru	}
91114402Sru	if (ptr >= end || *ptr == '%')
92114402Sru	  break;
93114402Sru      }
94114402Sru    }
95114402Sru    else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
96114402Sru	     && discard_fields.search(ptr[1]) < 0) {
97114402Sru      string &f = temp_fields[(unsigned char)ptr[1]];
98114402Sru      if (f.length() > 0) {
99114402Sru	if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
100114402Sru	  f += FIELD_SEPARATOR;
101114402Sru	else
102114402Sru	  f.clear();
103114402Sru      }
104114402Sru      ptr += 2;
105114402Sru      if (ptr < end) {
106114402Sru	if (*ptr == ' ')
107114402Sru	  ptr++;
108114402Sru	for (;;) {
109114402Sru	  const char *p = ptr;
110114402Sru	  while (ptr < end && *ptr != '\n')
111114402Sru	    ptr++;
112114402Sru	  // strip trailing white space
113114402Sru	  const char *q = ptr;
114114402Sru	  while (q > p && q[-1] != '\n' && csspace(q[-1]))
115114402Sru	    q--;
116114402Sru	  while (p < q)
117114402Sru	    f += *p++;
118114402Sru	  if (ptr >= end)
119114402Sru	    break;
120114402Sru	  ptr++;
121114402Sru	  if (ptr >= end)
122114402Sru	    break;
123114402Sru	  if (*ptr == '%')
124114402Sru	    break;
125114402Sru	  f += ' ';
126114402Sru	}
127114402Sru      }
128114402Sru    }
129114402Sru    else {
130114402Sru      // skip this field
131114402Sru      for (;;) {
132114402Sru	while (ptr < end && *ptr++ != '\n')
133114402Sru	  ;
134114402Sru	if (ptr >= end || *ptr == '%')
135114402Sru	  break;
136114402Sru      }
137114402Sru    }
138114402Sru  }
139114402Sru  for (i = 0; i < 256; i++)
140114402Sru    if (temp_fields[i].length() > 0)
141114402Sru      nfields++;
142114402Sru  field = new string[nfields];
143114402Sru  int j = 0;
144114402Sru  for (i = 0; i < 256; i++)
145114402Sru    if (temp_fields[i].length() > 0) {
146114402Sru      field[j].move(temp_fields[i]);
147114402Sru      if (abbreviate_fields.search(i) >= 0)
148114402Sru	abbreviate_names(field[j]);
149114402Sru      field_index[i] = j;
150114402Sru      j++;
151114402Sru    }
152114402Sru}
153114402Sru
154114402Srureference::~reference()
155114402Sru{
156114402Sru  if (nfields > 0)
157114402Sru    ad_delete(nfields) field;
158114402Sru}
159114402Sru
160114402Sru// ref is the inline, this is the database ref
161114402Sru
162114402Sruvoid reference::merge(reference &ref)
163114402Sru{
164114402Sru  int i;
165114402Sru  for (i = 0; i < 256; i++)
166114402Sru    if (field_index[i] != NULL_FIELD_INDEX)
167114402Sru      temp_fields[i].move(field[field_index[i]]);
168114402Sru  for (i = 0; i < 256; i++)
169114402Sru    if (ref.field_index[i] != NULL_FIELD_INDEX)
170114402Sru      temp_fields[i].move(ref.field[ref.field_index[i]]);
171114402Sru  for (i = 0; i < 256; i++)
172114402Sru    field_index[i] = NULL_FIELD_INDEX;
173114402Sru  int old_nfields = nfields;
174114402Sru  nfields = 0;
175114402Sru  for (i = 0; i < 256; i++)
176114402Sru    if (temp_fields[i].length() > 0)
177114402Sru      nfields++;
178114402Sru  if (nfields != old_nfields) {
179114402Sru    if (old_nfields > 0)
180114402Sru      ad_delete(old_nfields) field;
181114402Sru    field = new string[nfields];
182114402Sru  }
183114402Sru  int j = 0;
184114402Sru  for (i = 0; i < 256; i++)
185114402Sru    if (temp_fields[i].length() > 0) {
186114402Sru      field[j].move(temp_fields[i]);
187114402Sru      field_index[i] = j;
188114402Sru      j++;
189114402Sru    }
190114402Sru  merged = 1;
191114402Sru}
192114402Sru
193114402Sruvoid reference::insert_field(unsigned char c, string &s)
194114402Sru{
195114402Sru  assert(s.length() > 0);
196114402Sru  if (field_index[c] != NULL_FIELD_INDEX) {
197114402Sru    field[field_index[c]].move(s);
198114402Sru    return;
199114402Sru  }
200114402Sru  assert(field_index[c] == NULL_FIELD_INDEX);
201114402Sru  string *old_field = field;
202114402Sru  field = new string[nfields + 1];
203114402Sru  int pos = 0;
204114402Sru  int i;
205114402Sru  for (i = 0; i < int(c); i++)
206114402Sru    if (field_index[i] != NULL_FIELD_INDEX)
207114402Sru      pos++;
208114402Sru  for (i = 0; i < pos; i++)
209114402Sru    field[i].move(old_field[i]);
210114402Sru  field[pos].move(s);
211114402Sru  for (i = pos; i < nfields; i++)
212114402Sru    field[i + 1].move(old_field[i]);
213114402Sru  if (nfields > 0)
214114402Sru    ad_delete(nfields) old_field;
215114402Sru  nfields++;
216114402Sru  field_index[c] = pos;
217114402Sru  for (i = c + 1; i < 256; i++)
218114402Sru    if (field_index[i] != NULL_FIELD_INDEX)
219114402Sru      field_index[i] += 1;
220114402Sru}
221114402Sru
222114402Sruvoid reference::delete_field(unsigned char c)
223114402Sru{
224114402Sru  if (field_index[c] == NULL_FIELD_INDEX)
225114402Sru    return;
226114402Sru  string *old_field = field;
227114402Sru  field = new string[nfields - 1];
228114402Sru  int i;
229114402Sru  for (i = 0; i < int(field_index[c]); i++)
230114402Sru    field[i].move(old_field[i]);
231114402Sru  for (i = field_index[c]; i < nfields - 1; i++)
232114402Sru    field[i].move(old_field[i + 1]);
233114402Sru  if (nfields > 0)
234114402Sru    ad_delete(nfields) old_field;
235114402Sru  nfields--;
236114402Sru  field_index[c] = NULL_FIELD_INDEX;
237114402Sru  for (i = c + 1; i < 256; i++)
238114402Sru    if (field_index[i] != NULL_FIELD_INDEX)
239114402Sru      field_index[i] -= 1;
240114402Sru}
241114402Sru
242114402Sruvoid reference::compute_hash_code()
243114402Sru{
244114402Sru  if (!rid.is_null())
245114402Sru    h = rid.hash();
246114402Sru  else {
247114402Sru    h = 0;
248114402Sru    for (int i = 0; i < nfields; i++)
249114402Sru      if (field[i].length() > 0) {
250114402Sru	h <<= 4;
251114402Sru	h ^= hash_string(field[i].contents(), field[i].length());
252114402Sru      }
253114402Sru  }
254114402Sru}
255114402Sru
256114402Sruvoid reference::set_number(int n)
257114402Sru{
258114402Sru  no = n;
259114402Sru}
260114402Sru
// Separators used inside sort keys.
// SORT_SEP introduces the part of the key contributed by each sort field.
const char SORT_SEP = '\001';
// SORT_SUB_SEP separates the values of a field with several occurrences.
const char SORT_SUB_SEP = '\002';
// SORT_SUB_SUB_SEP separates the parts of a single name.
const char SORT_SUB_SUB_SEP = '\003';
264114402Sru
265114402Sru// sep specifies additional word separators
266114402Sru
267114402Sruvoid sortify_words(const char *s, const char *end, const char *sep,
268114402Sru		   string &result)
269114402Sru{
270114402Sru  int non_empty = 0;
271114402Sru  int need_separator = 0;
272114402Sru  for (;;) {
273114402Sru    const char *token_start = s;
274114402Sru    if (!get_token(&s, end))
275114402Sru      break;
276114402Sru    if ((s - token_start == 1
277114402Sru	 && (*token_start == ' '
278114402Sru	     || *token_start == '\n'
279114402Sru	     || (sep && *token_start != '\0'
280114402Sru		 && strchr(sep, *token_start) != 0)))
281114402Sru	|| (s - token_start == 2
282114402Sru	    && token_start[0] == '\\' && token_start[1] == ' ')) {
283114402Sru      if (non_empty)
284114402Sru	need_separator = 1;
285114402Sru    }
286114402Sru    else {
287114402Sru      const token_info *ti = lookup_token(token_start, s);
288114402Sru      if (ti->sortify_non_empty(token_start, s)) {
289114402Sru	if (need_separator) {
290114402Sru	  result += ' ';
291114402Sru	  need_separator = 0;
292114402Sru	}
293114402Sru	ti->sortify(token_start, s, result);
294114402Sru	non_empty = 1;
295114402Sru      }
296114402Sru    }
297114402Sru  }
298114402Sru}
299114402Sru
300114402Sruvoid sortify_word(const char *s, const char *end, string &result)
301114402Sru{
302114402Sru  for (;;) {
303114402Sru    const char *token_start = s;
304114402Sru    if (!get_token(&s, end))
305114402Sru      break;
306114402Sru    const token_info *ti = lookup_token(token_start, s);
307114402Sru    ti->sortify(token_start, s, result);
308114402Sru  }
309114402Sru}
310114402Sru
311114402Sruvoid sortify_other(const char *s, int len, string &key)
312114402Sru{
313114402Sru  sortify_words(s, s + len, 0, key);
314114402Sru}
315114402Sru
316114402Sruvoid sortify_title(const char *s, int len, string &key)
317114402Sru{
318114402Sru  const char *end = s + len;
319114402Sru  for (; s < end && (*s == ' ' || *s == '\n'); s++)
320114402Sru    ;
321114402Sru  const char *ptr = s;
322114402Sru  for (;;) {
323114402Sru    const char *token_start = ptr;
324114402Sru    if (!get_token(&ptr, end))
325114402Sru      break;
326114402Sru    if (ptr - token_start == 1
327114402Sru	&& (*token_start == ' ' || *token_start == '\n'))
328114402Sru      break;
329114402Sru  }
330114402Sru  if (ptr < end) {
331114402Sru    unsigned int first_word_len = ptr - s - 1;
332114402Sru    const char *ae = articles.contents() + articles.length();
333114402Sru    for (const char *a = articles.contents();
334114402Sru	 a < ae;
335114402Sru	 a = strchr(a, '\0') + 1)
336114402Sru      if (first_word_len == strlen(a)) {
337114402Sru	unsigned int j;
338114402Sru	for (j = 0; j < first_word_len; j++)
339114402Sru	  if (a[j] != cmlower(s[j]))
340114402Sru	    break;
341114402Sru	if (j >= first_word_len) {
342114402Sru	  s = ptr;
343114402Sru	  for (; s < end && (*s == ' ' || *s == '\n'); s++)
344114402Sru	    ;
345114402Sru	  break;
346114402Sru	}
347114402Sru      }
348114402Sru  }
349114402Sru  sortify_words(s, end, 0, key);
350114402Sru}
351114402Sru
352114402Sruvoid sortify_name(const char *s, int len, string &key)
353114402Sru{
354114402Sru  const char *last_name_end;
355114402Sru  const char *last_name = find_last_name(s, s + len, &last_name_end);
356114402Sru  sortify_word(last_name, last_name_end, key);
357114402Sru  key += SORT_SUB_SUB_SEP;
358114402Sru  if (last_name > s)
359114402Sru    sortify_words(s, last_name, ".", key);
360114402Sru  key += SORT_SUB_SUB_SEP;
361114402Sru  if (last_name_end < s + len)
362114402Sru    sortify_words(last_name_end, s + len, ".,", key);
363114402Sru}
364114402Sru
365114402Sruvoid sortify_date(const char *s, int len, string &key)
366114402Sru{
367114402Sru  const char *year_end;
368114402Sru  const char *year_start = find_year(s, s + len, &year_end);
369114402Sru  if (!year_start) {
370114402Sru    // Things without years are often `forthcoming', so it makes sense
371114402Sru    // that they sort after things with explicit years.
372114402Sru    key += 'A';
373114402Sru    sortify_words(s, s + len, 0, key);
374114402Sru    return;
375114402Sru  }
376114402Sru  int n = year_end - year_start;
377114402Sru  while (n < 4) {
378114402Sru    key += '0';
379114402Sru    n++;
380114402Sru  }
381114402Sru  while (year_start < year_end)
382114402Sru    key += *year_start++;
383114402Sru  int m = find_month(s, s + len);
384114402Sru  if (m < 0)
385114402Sru    return;
386114402Sru  key += 'A' + m;
387114402Sru  const char *day_end;
388114402Sru  const char *day_start = find_day(s, s + len, &day_end);
389114402Sru  if (!day_start)
390114402Sru    return;
391114402Sru  if (day_end - day_start == 1)
392114402Sru    key += '0';
393114402Sru  while (day_start < day_end)
394114402Sru    key += *day_start++;
395114402Sru}
396114402Sru
397114402Sru// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
398114402Sru
399114402Sruvoid sortify_label(const char *s, int len, string &key)
400114402Sru{
401114402Sru  const char *end = s + len;
402114402Sru  for (;;) {
403114402Sru    const char *ptr;
404114402Sru    for (ptr = s;
405114402Sru	 ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
406114402Sru	 ptr++)
407114402Sru      ;
408114402Sru    if (ptr > s)
409114402Sru      sortify_words(s, ptr, 0, key);
410114402Sru    s = ptr;
411114402Sru    if (s >= end)
412114402Sru      break;
413114402Sru    key += *s++;
414114402Sru  }
415114402Sru}
416114402Sru
417114402Sruvoid reference::compute_sort_key()
418114402Sru{
419114402Sru  if (sort_fields.length() == 0)
420114402Sru    return;
421114402Sru  sort_fields += '\0';
422114402Sru  const char *sf = sort_fields.contents();
423114402Sru  while (*sf != '\0') {
424151497Sru    sort_key += SORT_SEP;
425114402Sru    char f = *sf++;
426114402Sru    int n = 1;
427114402Sru    if (*sf == '+') {
428114402Sru      n = INT_MAX;
429114402Sru      sf++;
430114402Sru    }
431114402Sru    else if (csdigit(*sf)) {
432114402Sru      char *ptr;
433114402Sru      long l = strtol(sf, &ptr, 10);
434114402Sru      if (l == 0 && ptr == sf)
435114402Sru	;
436114402Sru      else {
437114402Sru	sf = ptr;
438114402Sru	if (l < 0) {
439114402Sru	  n = 1;
440114402Sru	}
441114402Sru	else {
442114402Sru	  n = int(l);
443114402Sru	}
444114402Sru      }
445114402Sru    }
446114402Sru    if (f == '.')
447114402Sru      sortify_label(label.contents(), label.length(), sort_key);
448114402Sru    else if (f == AUTHOR_FIELDS[0])
449114402Sru      sortify_authors(n, sort_key);
450114402Sru    else
451114402Sru      sortify_field(f, n, sort_key);
452114402Sru  }
453114402Sru  sort_fields.set_length(sort_fields.length() - 1);
454114402Sru}
455114402Sru
456114402Sruvoid reference::sortify_authors(int n, string &result) const
457114402Sru{
458114402Sru  for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
459114402Sru    if (contains_field(*p)) {
460114402Sru      sortify_field(*p, n, result);
461114402Sru      return;
462114402Sru    }
463114402Sru  sortify_field(AUTHOR_FIELDS[0], n, result);
464114402Sru}
465114402Sru
466114402Sruvoid reference::canonicalize_authors(string &result) const
467114402Sru{
468114402Sru  int len = result.length();
469114402Sru  sortify_authors(INT_MAX, result);
470114402Sru  if (result.length() > len)
471114402Sru    result += SORT_SUB_SEP;
472114402Sru}
473114402Sru
474114402Sruvoid reference::sortify_field(unsigned char f, int n, string &result) const
475114402Sru{
476114402Sru  typedef void (*sortify_t)(const char *, int, string &);
477114402Sru  sortify_t sortifier = sortify_other;
478114402Sru  switch (f) {
479114402Sru  case 'A':
480114402Sru  case 'E':
481114402Sru    sortifier = sortify_name;
482114402Sru    break;
483114402Sru  case 'D':
484114402Sru    sortifier = sortify_date;
485114402Sru    break;
486114402Sru  case 'B':
487114402Sru  case 'J':
488114402Sru  case 'T':
489114402Sru    sortifier = sortify_title;
490114402Sru    break;
491114402Sru  }
492114402Sru  int fi = field_index[(unsigned char)f];
493114402Sru  if (fi != NULL_FIELD_INDEX) {
494114402Sru    string &str = field[fi];
495114402Sru    const char *start = str.contents();
496114402Sru    const char *end = start + str.length();
497114402Sru    for (int i = 0; i < n && start < end; i++) {
498114402Sru      const char *p = start;
499114402Sru      while (start < end && *start != FIELD_SEPARATOR)
500114402Sru	start++;
501114402Sru      if (i > 0)
502114402Sru	result += SORT_SUB_SEP;
503114402Sru      (*sortifier)(p, start - p, result);
504114402Sru      if (start < end)
505114402Sru	start++;
506114402Sru    }
507114402Sru  }
508114402Sru}
509114402Sru
510114402Sruint compare_reference(const reference &r1, const reference &r2)
511114402Sru{
512114402Sru  assert(r1.no >= 0);
513114402Sru  assert(r2.no >= 0);
514114402Sru  const char *s1 = r1.sort_key.contents();
515114402Sru  int n1 = r1.sort_key.length();
516114402Sru  const char *s2 = r2.sort_key.contents();
517114402Sru  int n2 = r2.sort_key.length();
518114402Sru  for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
519114402Sru    if (*s1 != *s2)
520114402Sru      return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
521114402Sru  if (n2 > 0)
522114402Sru    return -1;
523114402Sru  if (n1 > 0)
524114402Sru    return 1;
525114402Sru  return r1.no - r2.no;
526114402Sru}
527114402Sru
528114402Sruint same_reference(const reference &r1, const reference &r2)
529114402Sru{
530114402Sru  if (!r1.rid.is_null() && r1.rid == r2.rid)
531114402Sru    return 1;
532114402Sru  if (r1.h != r2.h)
533114402Sru    return 0;
534114402Sru  if (r1.nfields != r2.nfields)
535114402Sru    return 0;
536114402Sru  int i = 0;
537114402Sru  for (i = 0; i < 256; i++)
538114402Sru    if (r1.field_index != r2.field_index)
539114402Sru      return 0;
540114402Sru  for (i = 0; i < r1.nfields; i++)
541114402Sru    if (r1.field[i] != r2.field[i])
542114402Sru      return 0;
543114402Sru  return 1;
544114402Sru}
545114402Sru
546114402Sruconst char *find_last_name(const char *start, const char *end,
547114402Sru			   const char **endp)
548114402Sru{
549114402Sru  const char *ptr = start;
550114402Sru  const char *last_word = start;
551114402Sru  for (;;) {
552114402Sru    const char *token_start = ptr;
553114402Sru    if (!get_token(&ptr, end))
554114402Sru      break;
555114402Sru    if (ptr - token_start == 1) {
556114402Sru      if (*token_start == ',') {
557114402Sru	*endp = token_start;
558114402Sru	return last_word;
559114402Sru      }
560114402Sru      else if (*token_start == ' ' || *token_start == '\n') {
561114402Sru	if (ptr < end && *ptr != ' ' && *ptr != '\n')
562114402Sru	  last_word = ptr;
563114402Sru      }
564114402Sru    }
565114402Sru  }
566114402Sru  *endp = end;
567114402Sru  return last_word;
568114402Sru}
569114402Sru
570114402Sruvoid abbreviate_name(const char *ptr, const char *end, string &result)
571114402Sru{
572114402Sru  const char *last_name_end;
573114402Sru  const char *last_name_start = find_last_name(ptr, end, &last_name_end);
574114402Sru  int need_period = 0;
575114402Sru  for (;;) {
576114402Sru    const char *token_start = ptr;
577114402Sru    if (!get_token(&ptr, last_name_start))
578114402Sru      break;
579114402Sru    const token_info *ti = lookup_token(token_start, ptr);
580114402Sru    if (need_period) {
581114402Sru      if ((ptr - token_start == 1 && *token_start == ' ')
582114402Sru	  || (ptr - token_start == 2 && token_start[0] == '\\'
583114402Sru	      && token_start[1] == ' '))
584114402Sru	continue;
585114402Sru      if (ti->is_upper())
586114402Sru	result += period_before_initial;
587114402Sru      else
588114402Sru	result += period_before_other;
589114402Sru      need_period = 0;
590114402Sru    }
591114402Sru    result.append(token_start, ptr - token_start);
592114402Sru    if (ti->is_upper()) {
593114402Sru      const char *lower_ptr = ptr;
594114402Sru      int first_token = 1;
595114402Sru      for (;;) {
596114402Sru	token_start = ptr;
597114402Sru	if (!get_token(&ptr, last_name_start))
598114402Sru	  break;
599114402Sru	if ((ptr - token_start == 1 && *token_start == ' ')
600114402Sru	    || (ptr - token_start == 2 && token_start[0] == '\\'
601114402Sru		&& token_start[1] == ' '))
602114402Sru	  break;
603114402Sru	ti = lookup_token(token_start, ptr);
604114402Sru	if (ti->is_hyphen()) {
605114402Sru	  const char *ptr1 = ptr;
606114402Sru	  if (get_token(&ptr1, last_name_start)) {
607114402Sru	    ti = lookup_token(ptr, ptr1);
608114402Sru	    if (ti->is_upper()) {
609114402Sru	      result += period_before_hyphen;
610114402Sru	      result.append(token_start, ptr1 - token_start);
611114402Sru	      ptr = ptr1;
612114402Sru	    }
613114402Sru	  }
614114402Sru	}
615114402Sru	else if (ti->is_upper()) {
616114402Sru	  // MacDougal -> MacD.
617114402Sru	  result.append(lower_ptr, ptr - lower_ptr);
618114402Sru	  lower_ptr = ptr;
619114402Sru	  first_token = 1;
620114402Sru	}
621114402Sru	else if (first_token && ti->is_accent()) {
622114402Sru	  result.append(token_start, ptr - token_start);
623114402Sru	  lower_ptr = ptr;
624114402Sru	}
625114402Sru	first_token = 0;
626114402Sru      }
627114402Sru      need_period = 1;
628114402Sru    }
629114402Sru  }
630114402Sru  if (need_period)
631114402Sru    result += period_before_last_name;
632114402Sru  result.append(last_name_start, end - last_name_start);
633114402Sru}
634114402Sru
635114402Srustatic void abbreviate_names(string &result)
636114402Sru{
637114402Sru  string str;
638114402Sru  str.move(result);
639114402Sru  const char *ptr = str.contents();
640114402Sru  const char *end = ptr + str.length();
641114402Sru  while (ptr < end) {
642114402Sru    const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
643114402Sru    if (name_end == 0)
644114402Sru      name_end = end;
645114402Sru    abbreviate_name(ptr, name_end, result);
646114402Sru    if (name_end >= end)
647114402Sru      break;
648114402Sru    ptr = name_end + 1;
649114402Sru    result += FIELD_SEPARATOR;
650114402Sru  }
651114402Sru}
652114402Sru
653114402Sruvoid reverse_name(const char *ptr, const char *name_end, string &result)
654114402Sru{
655114402Sru  const char *last_name_end;
656114402Sru  const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
657114402Sru  result.append(last_name_start, last_name_end - last_name_start);
658114402Sru  while (last_name_start > ptr
659114402Sru	 && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
660114402Sru    last_name_start--;
661114402Sru  if (last_name_start > ptr) {
662114402Sru    result += ", ";
663114402Sru    result.append(ptr, last_name_start - ptr);
664114402Sru  }
665114402Sru  if (last_name_end < name_end)
666114402Sru    result.append(last_name_end, name_end - last_name_end);
667114402Sru}
668114402Sru
669114402Sruvoid reverse_names(string &result, int n)
670114402Sru{
671114402Sru  if (n <= 0)
672114402Sru    return;
673114402Sru  string str;
674114402Sru  str.move(result);
675114402Sru  const char *ptr = str.contents();
676114402Sru  const char *end = ptr + str.length();
677114402Sru  while (ptr < end) {
678114402Sru    if (--n < 0) {
679114402Sru      result.append(ptr, end - ptr);
680114402Sru      break;
681114402Sru    }
682114402Sru    const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
683114402Sru    if (name_end == 0)
684114402Sru      name_end = end;
685114402Sru    reverse_name(ptr, name_end, result);
686114402Sru    if (name_end >= end)
687114402Sru      break;
688114402Sru    ptr = name_end + 1;
689114402Sru    result += FIELD_SEPARATOR;
690114402Sru  }
691114402Sru}
692114402Sru
693114402Sru// Return number of field separators.
694114402Sru
695114402Sruint join_fields(string &f)
696114402Sru{
697114402Sru  const char *ptr = f.contents();
698114402Sru  int len = f.length();
699114402Sru  int nfield_seps = 0;
700114402Sru  int j;
701114402Sru  for (j = 0; j < len; j++)
702114402Sru    if (ptr[j] == FIELD_SEPARATOR)
703114402Sru      nfield_seps++;
704114402Sru  if (nfield_seps == 0)
705114402Sru    return 0;
706114402Sru  string temp;
707114402Sru  int field_seps_left = nfield_seps;
708114402Sru  for (j = 0; j < len; j++) {
709114402Sru    if (ptr[j] == FIELD_SEPARATOR) {
710114402Sru      if (nfield_seps == 1)
711114402Sru	temp += join_authors_exactly_two;
712114402Sru      else if (--field_seps_left == 0)
713114402Sru	temp += join_authors_last_two;
714114402Sru      else
715114402Sru	temp += join_authors_default;
716114402Sru    }
717114402Sru    else
718114402Sru      temp += ptr[j];
719114402Sru  }
720114402Sru  f = temp;
721114402Sru  return nfield_seps;
722114402Sru}
723114402Sru
724114402Sruvoid uppercase(const char *start, const char *end, string &result)
725114402Sru{
726114402Sru  for (;;) {
727114402Sru    const char *token_start = start;
728114402Sru    if (!get_token(&start, end))
729114402Sru      break;
730114402Sru    const token_info *ti = lookup_token(token_start, start);
731114402Sru    ti->upper_case(token_start, start, result);
732114402Sru  }
733114402Sru}
734114402Sru
735114402Sruvoid lowercase(const char *start, const char *end, string &result)
736114402Sru{
737114402Sru  for (;;) {
738114402Sru    const char *token_start = start;
739114402Sru    if (!get_token(&start, end))
740114402Sru      break;
741114402Sru    const token_info *ti = lookup_token(token_start, start);
742114402Sru    ti->lower_case(token_start, start, result);
743114402Sru  }
744114402Sru}
745114402Sru
746114402Sruvoid capitalize(const char *ptr, const char *end, string &result)
747114402Sru{
748114402Sru  int in_small_point_size = 0;
749114402Sru  for (;;) {
750114402Sru    const char *start = ptr;
751114402Sru    if (!get_token(&ptr, end))
752114402Sru      break;
753114402Sru    const token_info *ti = lookup_token(start, ptr);
754114402Sru    const char *char_end = ptr;
755114402Sru    int is_lower = ti->is_lower();
756114402Sru    if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
757114402Sru      const token_info *ti2 = lookup_token(char_end, ptr);
758114402Sru      if (!ti2->is_accent())
759114402Sru	ptr = char_end;
760114402Sru    }
761114402Sru    if (is_lower) {
762114402Sru      if (!in_small_point_size) {
763114402Sru	result += "\\s-2";
764114402Sru	in_small_point_size = 1;
765114402Sru      }
766114402Sru      ti->upper_case(start, char_end, result);
767114402Sru      result.append(char_end, ptr - char_end);
768114402Sru    }
769114402Sru    else {
770114402Sru      if (in_small_point_size) {
771114402Sru	result += "\\s+2";
772114402Sru	in_small_point_size = 0;
773114402Sru      }
774114402Sru      result.append(start, ptr - start);
775114402Sru    }
776114402Sru  }
777114402Sru  if (in_small_point_size)
778114402Sru    result += "\\s+2";
779114402Sru}
780114402Sru
781114402Sruvoid capitalize_field(string &str)
782114402Sru{
783114402Sru  string temp;
784114402Sru  capitalize(str.contents(), str.contents() + str.length(), temp);
785114402Sru  str.move(temp);
786114402Sru}
787114402Sru
788114402Sruint is_terminated(const char *ptr, const char *end)
789114402Sru{
790114402Sru  const char *last_token = end;
791114402Sru  for (;;) {
792114402Sru    const char *p = ptr;
793114402Sru    if (!get_token(&ptr, end))
794114402Sru      break;
795114402Sru    last_token = p;
796114402Sru  }
797114402Sru  return end - last_token == 1
798114402Sru    && (*last_token == '.' || *last_token == '!' || *last_token == '?');
799114402Sru}
800114402Sru
801114402Sruvoid reference::output(FILE *fp)
802114402Sru{
803114402Sru  fputs(".]-\n", fp);
804114402Sru  for (int i = 0; i < 256; i++)
805114402Sru    if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
806114402Sru      string &f = field[field_index[i]];
807114402Sru      if (!csdigit(i)) {
808114402Sru	int j = reverse_fields.search(i);
809114402Sru	if (j >= 0) {
810114402Sru	  int n;
811114402Sru	  int len = reverse_fields.length();
812114402Sru	  if (++j < len && csdigit(reverse_fields[j])) {
813114402Sru	    n = reverse_fields[j] - '0';
814114402Sru	    for (++j; j < len && csdigit(reverse_fields[j]); j++)
815114402Sru	      // should check for overflow
816114402Sru	      n = n*10 + reverse_fields[j] - '0';
817114402Sru	  }
818114402Sru	  else
819114402Sru	    n = INT_MAX;
820114402Sru	  reverse_names(f, n);
821114402Sru	}
822114402Sru      }
823114402Sru      int is_multiple = join_fields(f) > 0;
824114402Sru      if (capitalize_fields.search(i) >= 0)
825114402Sru	capitalize_field(f);
826114402Sru      if (memchr(f.contents(), '\n', f.length()) == 0) {
827114402Sru	fprintf(fp, ".ds [%c ", i);
828114402Sru	if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
829114402Sru	  putc('"', fp);
830114402Sru	put_string(f, fp);
831114402Sru	putc('\n', fp);
832114402Sru      }
833114402Sru      else {
834114402Sru	fprintf(fp, ".de [%c\n", i);
835114402Sru	put_string(f, fp);
836114402Sru	fputs("..\n", fp);
837114402Sru      }
838114402Sru      if (i == 'P') {
839114402Sru	int multiple_pages = 0;
840114402Sru	const char *s = f.contents();
841114402Sru	const char *end = f.contents() + f.length();
842114402Sru	for (;;) {
843114402Sru	  const char *token_start = s;
844114402Sru	  if (!get_token(&s, end))
845114402Sru	    break;
846114402Sru	  const token_info *ti = lookup_token(token_start, s);
847114402Sru	  if (ti->is_hyphen() || ti->is_range_sep()) {
848114402Sru	    multiple_pages = 1;
849114402Sru	    break;
850114402Sru	  }
851114402Sru	}
852114402Sru	fprintf(fp, ".nr [P %d\n", multiple_pages);
853114402Sru      }
854114402Sru      else if (i == 'E')
855114402Sru	fprintf(fp, ".nr [E %d\n", is_multiple);
856114402Sru    }
857114402Sru  for (const char *p = "TAO"; *p; p++) {
858114402Sru    int fi = field_index[(unsigned char)*p];
859114402Sru    if (fi != NULL_FIELD_INDEX) {
860114402Sru      string &f = field[fi];
861114402Sru      fprintf(fp, ".nr [%c %d\n", *p,
862114402Sru	      is_terminated(f.contents(), f.contents() + f.length()));
863114402Sru    }
864114402Sru  }
865114402Sru  int t = classify();
866114402Sru  fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
867114402Sru  if (annotation_macro.length() > 0 && annotation_field >= 0
868114402Sru      && field_index[annotation_field] != NULL_FIELD_INDEX) {
869114402Sru    putc('.', fp);
870114402Sru    put_string(annotation_macro, fp);
871114402Sru    putc('\n', fp);
872114402Sru    put_string(field[field_index[annotation_field]], fp);
873114402Sru  }
874114402Sru}
875114402Sru
876114402Sruvoid reference::print_sort_key_comment(FILE *fp)
877114402Sru{
878114402Sru  fputs(".\\\"", fp);
879114402Sru  put_string(sort_key, fp);
880114402Sru  putc('\n', fp);
881114402Sru}
882114402Sru
883114402Sruconst char *find_year(const char *start, const char *end, const char **endp)
884114402Sru{
885114402Sru  for (;;) {
886114402Sru    while (start < end && !csdigit(*start))
887114402Sru      start++;
888114402Sru    const char *ptr = start;
889114402Sru    if (start == end)
890114402Sru      break;
891114402Sru    while (ptr < end && csdigit(*ptr))
892114402Sru      ptr++;
893114402Sru    if (ptr - start == 4 || ptr - start == 3
894114402Sru	|| (ptr - start == 2
895114402Sru	    && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
896114402Sru      *endp = ptr;
897114402Sru      return start;
898114402Sru    }
899114402Sru    start = ptr;
900114402Sru  }
901114402Sru  return 0;
902114402Sru}
903114402Sru
904114402Srustatic const char *find_day(const char *start, const char *end,
905114402Sru			    const char **endp)
906114402Sru{
907114402Sru  for (;;) {
908114402Sru    while (start < end && !csdigit(*start))
909114402Sru      start++;
910114402Sru    const char *ptr = start;
911114402Sru    if (start == end)
912114402Sru      break;
913114402Sru    while (ptr < end && csdigit(*ptr))
914114402Sru      ptr++;
915114402Sru    if ((ptr - start == 1 && start[0] != '0')
916114402Sru	|| (ptr - start == 2 &&
917114402Sru	    (start[0] == '1'
918114402Sru	     || start[0] == '2'
919114402Sru	     || (start[0] == '3' && start[1] <= '1')
920114402Sru	     || (start[0] == '0' && start[1] != '0')))) {
921114402Sru      *endp = ptr;
922114402Sru      return start;
923114402Sru    }
924114402Sru    start = ptr;
925114402Sru  }
926114402Sru  return 0;
927114402Sru}
928114402Sru
929114402Srustatic int find_month(const char *start, const char *end)
930114402Sru{
931114402Sru  static const char *months[] = {
932114402Sru    "january",
933114402Sru    "february",
934114402Sru    "march",
935114402Sru    "april",
936114402Sru    "may",
937114402Sru    "june",
938114402Sru    "july",
939114402Sru    "august",
940114402Sru    "september",
941114402Sru    "october",
942114402Sru    "november",
943114402Sru    "december",
944114402Sru  };
945114402Sru  for (;;) {
946114402Sru    while (start < end && !csalpha(*start))
947114402Sru      start++;
948114402Sru    const char *ptr = start;
949114402Sru    if (start == end)
950114402Sru      break;
951114402Sru    while (ptr < end && csalpha(*ptr))
952114402Sru      ptr++;
953114402Sru    if (ptr - start >= 3) {
954114402Sru      for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
955114402Sru	const char *q = months[i];
956114402Sru	const char *p = start;
957114402Sru	for (; p < ptr; p++, q++)
958114402Sru	  if (cmlower(*p) != *q)
959114402Sru	    break;
960114402Sru	if (p >= ptr)
961114402Sru	  return i;
962114402Sru      }
963114402Sru    }
964114402Sru    start = ptr;
965114402Sru  }
966114402Sru  return -1;
967114402Sru}
968114402Sru
969114402Sruint reference::contains_field(char c) const
970114402Sru{
971114402Sru  return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
972114402Sru}
973114402Sru
974114402Sruint reference::classify()
975114402Sru{
976114402Sru  if (contains_field('J'))
977114402Sru    return JOURNAL_ARTICLE;
978114402Sru  if (contains_field('B'))
979114402Sru    return ARTICLE_IN_BOOK;
980114402Sru  if (contains_field('G'))
981114402Sru    return TECH_REPORT;
982114402Sru  if (contains_field('R'))
983114402Sru    return TECH_REPORT;
984114402Sru  if (contains_field('I'))
985114402Sru    return BOOK;
986114402Sru  if (contains_field('M'))
987114402Sru    return BELL_TM;
988114402Sru  return OTHER;
989114402Sru}
990114402Sru
991114402Sruconst char *reference::get_year(const char **endp) const
992114402Sru{
993114402Sru  if (field_index['D'] != NULL_FIELD_INDEX) {
994114402Sru    string &date = field[field_index['D']];
995114402Sru    const char *start = date.contents();
996114402Sru    const char *end = start + date.length();
997114402Sru    return find_year(start, end, endp);
998114402Sru  }
999114402Sru  else
1000114402Sru    return 0;
1001114402Sru}
1002114402Sru
1003114402Sruconst char *reference::get_field(unsigned char c, const char **endp) const
1004114402Sru{
1005114402Sru  if (field_index[c] != NULL_FIELD_INDEX) {
1006114402Sru    string &f = field[field_index[c]];
1007114402Sru    const char *start = f.contents();
1008114402Sru    *endp = start + f.length();
1009114402Sru    return start;
1010114402Sru  }
1011114402Sru  else
1012114402Sru    return 0;
1013114402Sru}
1014114402Sru
1015114402Sruconst char *reference::get_date(const char **endp) const
1016114402Sru{
1017114402Sru  return get_field('D', endp);
1018114402Sru}
1019114402Sru
1020114402Sruconst char *nth_field(int i, const char *start, const char **endp)
1021114402Sru{
1022114402Sru  while (--i >= 0) {
1023114402Sru    start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1024114402Sru    if (!start)
1025114402Sru      return 0;
1026114402Sru    start++;
1027114402Sru  }
1028114402Sru  const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1029114402Sru  if (e)
1030114402Sru    *endp = e;
1031114402Sru  return start;
1032114402Sru}
1033114402Sru
1034114402Sruconst char *reference::get_author(int i, const char **endp) const
1035114402Sru{
1036114402Sru  for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1037114402Sru    const char *start = get_field(*f, endp);
1038114402Sru    if (start) {
1039114402Sru      if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1040114402Sru	return nth_field(i, start, endp);
1041114402Sru      else if (i == 0)
1042114402Sru	return start;
1043114402Sru      else
1044114402Sru	return 0;
1045114402Sru    }
1046114402Sru  }
1047114402Sru  return 0;
1048114402Sru}
1049114402Sru
1050114402Sruconst char *reference::get_author_last_name(int i, const char **endp) const
1051114402Sru{
1052114402Sru  for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1053114402Sru    const char *start = get_field(*f, endp);
1054114402Sru    if (start) {
1055114402Sru      if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1056114402Sru	start = nth_field(i, start, endp);
1057114402Sru	if (!start)
1058114402Sru	  return 0;
1059114402Sru      }
1060114402Sru      if (*f == 'A')
1061114402Sru	return find_last_name(start, *endp, endp);
1062114402Sru      else
1063114402Sru	return start;
1064114402Sru    }
1065114402Sru  }
1066114402Sru  return 0;
1067114402Sru}
1068114402Sru
1069114402Sruvoid reference::set_date(string &d)
1070114402Sru{
1071114402Sru  if (d.length() == 0)
1072114402Sru    delete_field('D');
1073114402Sru  else
1074114402Sru    insert_field('D', d);
1075114402Sru}
1076114402Sru
1077114402Sruint same_year(const reference &r1, const reference &r2)
1078114402Sru{
1079114402Sru  const char *ye1;
1080114402Sru  const char *ys1 = r1.get_year(&ye1);
1081114402Sru  const char *ye2;
1082114402Sru  const char *ys2 = r2.get_year(&ye2);
1083114402Sru  if (ys1 == 0) {
1084114402Sru    if (ys2 == 0)
1085114402Sru      return same_date(r1, r2);
1086114402Sru    else
1087114402Sru      return 0;
1088114402Sru  }
1089114402Sru  else if (ys2 == 0)
1090114402Sru    return 0;
1091114402Sru  else if (ye1 - ys1 != ye2 - ys2)
1092114402Sru    return 0;
1093114402Sru  else
1094114402Sru    return memcmp(ys1, ys2, ye1 - ys1) == 0;
1095114402Sru}
1096114402Sru
1097114402Sruint same_date(const reference &r1, const reference &r2)
1098114402Sru{
1099114402Sru  const char *e1;
1100114402Sru  const char *s1 = r1.get_date(&e1);
1101114402Sru  const char *e2;
1102114402Sru  const char *s2 = r2.get_date(&e2);
1103114402Sru  if (s1 == 0)
1104114402Sru    return s2 == 0;
1105114402Sru  else if (s2 == 0)
1106114402Sru    return 0;
1107114402Sru  else if (e1 - s1 != e2 - s2)
1108114402Sru    return 0;
1109114402Sru  else
1110114402Sru    return memcmp(s1, s2, e1 - s1) == 0;
1111114402Sru}
1112114402Sru
1113114402Sruconst char *reference::get_sort_field(int i, int si, int ssi,
1114114402Sru				      const char **endp) const
1115114402Sru{
1116114402Sru  const char *start = sort_key.contents();
1117114402Sru  const char *end = start + sort_key.length();
1118114402Sru  if (i < 0) {
1119114402Sru    *endp = end;
1120114402Sru    return start;
1121114402Sru  }
1122114402Sru  while (--i >= 0) {
1123114402Sru    start = (char *)memchr(start, SORT_SEP, end - start);
1124114402Sru    if (!start)
1125114402Sru      return 0;
1126114402Sru    start++;
1127114402Sru  }
1128114402Sru  const char *e = (char *)memchr(start, SORT_SEP, end - start);
1129114402Sru  if (e)
1130114402Sru    end = e;
1131114402Sru  if (si < 0) {
1132114402Sru    *endp = end;
1133114402Sru    return start;
1134114402Sru  }
1135114402Sru  while (--si >= 0) {
1136114402Sru    start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1137114402Sru    if (!start)
1138114402Sru      return 0;
1139114402Sru    start++;
1140114402Sru  }
1141114402Sru  e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1142114402Sru  if (e)
1143114402Sru    end = e;
1144114402Sru  if (ssi < 0) {
1145114402Sru    *endp = end;
1146114402Sru    return start;
1147114402Sru  }
1148114402Sru  while (--ssi >= 0) {
1149114402Sru    start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1150114402Sru    if (!start)
1151114402Sru      return 0;
1152114402Sru    start++;
1153114402Sru  }
1154114402Sru  e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1155114402Sru  if (e)
1156114402Sru    end = e;
1157114402Sru  *endp = end;
1158114402Sru  return start;
1159114402Sru}
1160114402Sru
1161