1/* -*- C++ -*-
2   Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004
3   Free Software Foundation, Inc.
4     Written by James Clark (jjc@jclark.com)
5
6This file is part of groff.
7
8groff is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 2, or (at your option) any later
11version.
12
13groff is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License along
19with groff; see the file COPYING.  If not, write to the Free Software
20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21
22%{
23
24#include "refer.h"
25#include "refid.h"
26#include "ref.h"
27#include "token.h"
28
29int yylex();
30void yyerror(const char *);
31int yyparse();
32
33static const char *format_serial(char c, int n);
34
35struct label_info {
36  int start;
37  int length;
38  int count;
39  int total;
40  label_info(const string &);
41};
42
43label_info *lookup_label(const string &label);
44
45struct expression {
46  enum {
47    // Does the tentative label depend on the reference?
48    CONTAINS_VARIABLE = 01,
49    CONTAINS_STAR = 02,
50    CONTAINS_FORMAT = 04,
51    CONTAINS_AT = 010
52  };
53  virtual ~expression() { }
54  virtual void evaluate(int, const reference &, string &,
55			substring_position &) = 0;
56  virtual unsigned analyze() { return 0; }
57};
58
59class at_expr : public expression {
60public:
61  at_expr() { }
62  void evaluate(int, const reference &, string &, substring_position &);
63  unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
64};
65
66class format_expr : public expression {
67  char type;
68  int width;
69  int first_number;
70public:
71  format_expr(char c, int w = 0, int f = 1)
72    : type(c), width(w), first_number(f) { }
73  void evaluate(int, const reference &, string &, substring_position &);
74  unsigned analyze() { return CONTAINS_FORMAT; }
75};
76
77class field_expr : public expression {
78  int number;
79  char name;
80public:
81  field_expr(char nm, int num) : number(num), name(nm) { }
82  void evaluate(int, const reference &, string &, substring_position &);
83  unsigned analyze() { return CONTAINS_VARIABLE; }
84};
85
86class literal_expr : public expression {
87  string s;
88public:
89  literal_expr(const char *ptr, int len) : s(ptr, len) { }
90  void evaluate(int, const reference &, string &, substring_position &);
91};
92
93class unary_expr : public expression {
94protected:
95  expression *expr;
96public:
97  unary_expr(expression *e) : expr(e) { }
98  ~unary_expr() { delete expr; }
99  void evaluate(int, const reference &, string &, substring_position &) = 0;
100  unsigned analyze() { return expr ? expr->analyze() : 0; }
101};
102
103// This caches the analysis of an expression.
104
105class analyzed_expr : public unary_expr {
106  unsigned flags;
107public:
108  analyzed_expr(expression *);
109  void evaluate(int, const reference &, string &, substring_position &);
110  unsigned analyze() { return flags; }
111};
112
113class star_expr : public unary_expr {
114public:
115  star_expr(expression *e) : unary_expr(e) { }
116  void evaluate(int, const reference &, string &, substring_position &);
117  unsigned analyze() {
118    return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
119	    | CONTAINS_STAR);
120  }
121};
122
123typedef void map_func(const char *, const char *, string &);
124
125class map_expr : public unary_expr {
126  map_func *func;
127public:
128  map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
129  void evaluate(int, const reference &, string &, substring_position &);
130};
131
132typedef const char *extractor_func(const char *, const char *, const char **);
133
134class extractor_expr : public unary_expr {
135  int part;
136  extractor_func *func;
137public:
138  enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
139  extractor_expr(expression *e, extractor_func *f, int pt)
140    : unary_expr(e), part(pt), func(f) { }
141  void evaluate(int, const reference &, string &, substring_position &);
142};
143
144class truncate_expr : public unary_expr {
145  int n;
146public:
147  truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
148  void evaluate(int, const reference &, string &, substring_position &);
149};
150
151class separator_expr : public unary_expr {
152public:
153  separator_expr(expression *e) : unary_expr(e) { }
154  void evaluate(int, const reference &, string &, substring_position &);
155};
156
157class binary_expr : public expression {
158protected:
159  expression *expr1;
160  expression *expr2;
161public:
162  binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
163  ~binary_expr() { delete expr1; delete expr2; }
164  void evaluate(int, const reference &, string &, substring_position &) = 0;
165  unsigned analyze() {
166    return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
167  }
168};
169
170class alternative_expr : public binary_expr {
171public:
172  alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
173  void evaluate(int, const reference &, string &, substring_position &);
174};
175
176class list_expr : public binary_expr {
177public:
178  list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
179  void evaluate(int, const reference &, string &, substring_position &);
180};
181
182class substitute_expr : public binary_expr {
183public:
184  substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
185  void evaluate(int, const reference &, string &, substring_position &);
186};
187
188class ternary_expr : public expression {
189protected:
190  expression *expr1;
191  expression *expr2;
192  expression *expr3;
193public:
194  ternary_expr(expression *e1, expression *e2, expression *e3)
195    : expr1(e1), expr2(e2), expr3(e3) { }
196  ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
197  void evaluate(int, const reference &, string &, substring_position &) = 0;
198  unsigned analyze() {
199    return ((expr1 ? expr1->analyze() : 0)
200	    | (expr2 ? expr2->analyze() : 0)
201	    | (expr3 ? expr3->analyze() : 0));
202  }
203};
204
205class conditional_expr : public ternary_expr {
206public:
207  conditional_expr(expression *e1, expression *e2, expression *e3)
208    : ternary_expr(e1, e2, e3) { }
209  void evaluate(int, const reference &, string &, substring_position &);
210};
211
212static expression *parsed_label = 0;
213static expression *parsed_date_label = 0;
214static expression *parsed_short_label = 0;
215
216static expression *parse_result;
217
218string literals;
219
220%}
221
222%union {
223  int num;
224  expression *expr;
225  struct { int ndigits; int val; } dig;
226  struct { int start; int len; } str;
227}
228
229/* uppercase or lowercase letter */
230%token <num> TOKEN_LETTER
231/* literal characters */
232%token <str> TOKEN_LITERAL
233/* digit */
234%token <num> TOKEN_DIGIT
235
236%type <expr> conditional
237%type <expr> alternative
238%type <expr> list
239%type <expr> string
240%type <expr> substitute
241%type <expr> optional_conditional
242%type <num> number
243%type <dig> digits
244%type <num> optional_number
245%type <num> flag
246
247%%
248
249expr:
250	optional_conditional
251		{ parse_result = ($1 ? new analyzed_expr($1) : 0); }
252	;
253
254conditional:
255	alternative
256		{ $$ = $1; }
257	| alternative '?' optional_conditional ':' conditional
258		{ $$ = new conditional_expr($1, $3, $5); }
259	;
260
261optional_conditional:
262	/* empty */
263		{ $$ = 0; }
264	| conditional
265		{ $$ = $1; }
266	;
267
268alternative:
269	list
270		{ $$ = $1; }
271	| alternative '|' list
272		{ $$ = new alternative_expr($1, $3); }
273	| alternative '&' list
274		{ $$ = new conditional_expr($1, $3, 0); }
275	;
276
277list:
278	substitute
279		{ $$ = $1; }
280	| list substitute
281		{ $$ = new list_expr($1, $2); }
282	;
283
284substitute:
285	string
286		{ $$ = $1; }
287	| substitute '~' string
288		{ $$ = new substitute_expr($1, $3); }
289	;
290
291string:
292	'@'
293		{ $$ = new at_expr; }
294	| TOKEN_LITERAL
295		{
296		  $$ = new literal_expr(literals.contents() + $1.start,
297					$1.len);
298		}
299	| TOKEN_LETTER
300		{ $$ = new field_expr($1, 0); }
301	| TOKEN_LETTER number
302		{ $$ = new field_expr($1, $2 - 1); }
303	| '%' TOKEN_LETTER
304		{
305		  switch ($2) {
306		  case 'I':
307		  case 'i':
308		  case 'A':
309		  case 'a':
310		    $$ = new format_expr($2);
311		    break;
312		  default:
313		    command_error("unrecognized format `%1'", char($2));
314		    $$ = new format_expr('a');
315		    break;
316		  }
317		}
318
319	| '%' digits
320		{
321		  $$ = new format_expr('0', $2.ndigits, $2.val);
322		}
323	| string '.' flag TOKEN_LETTER optional_number
324		{
325		  switch ($4) {
326		  case 'l':
327		    $$ = new map_expr($1, lowercase);
328		    break;
329		  case 'u':
330		    $$ = new map_expr($1, uppercase);
331		    break;
332		  case 'c':
333		    $$ = new map_expr($1, capitalize);
334		    break;
335		  case 'r':
336		    $$ = new map_expr($1, reverse_name);
337		    break;
338		  case 'a':
339		    $$ = new map_expr($1, abbreviate_name);
340		    break;
341		  case 'y':
342		    $$ = new extractor_expr($1, find_year, $3);
343		    break;
344		  case 'n':
345		    $$ = new extractor_expr($1, find_last_name, $3);
346		    break;
347		  default:
348		    $$ = $1;
349		    command_error("unknown function `%1'", char($4));
350		    break;
351		  }
352		}
353
354	| string '+' number
355		{ $$ = new truncate_expr($1, $3); }
356	| string '-' number
357		{ $$ = new truncate_expr($1, -$3); }
358	| string '*'
359		{ $$ = new star_expr($1); }
360	| '(' optional_conditional ')'
361		{ $$ = $2; }
362	| '<' optional_conditional '>'
363		{ $$ = new separator_expr($2); }
364	;
365
366optional_number:
367	/* empty */
368		{ $$ = -1; }
369	| number
370		{ $$ = $1; }
371	;
372
373number:
374	TOKEN_DIGIT
375		{ $$ = $1; }
376	| number TOKEN_DIGIT
377		{ $$ = $1*10 + $2; }
378	;
379
380digits:
381	TOKEN_DIGIT
382		{ $$.ndigits = 1; $$.val = $1; }
383	| digits TOKEN_DIGIT
384		{ $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
385	;
386
387
388flag:
389	/* empty */
390		{ $$ = 0; }
391	| '+'
392		{ $$ = 1; }
393	| '-'
394		{ $$ = -1; }
395	;
396
397%%
398
399/* bison defines const to be empty unless __STDC__ is defined, which it
400isn't under cfront */
401
402#ifdef const
403#undef const
404#endif
405
406const char *spec_ptr;
407const char *spec_end;
408const char *spec_cur;
409
410static char uppercase_array[] = {
411  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
412  'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
413  'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
414  'Y', 'Z',
415};
416
417static char lowercase_array[] = {
418  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
419  'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
420  'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
421  'y', 'z',
422};
423
424int yylex()
425{
426  while (spec_ptr < spec_end && csspace(*spec_ptr))
427    spec_ptr++;
428  spec_cur = spec_ptr;
429  if (spec_ptr >= spec_end)
430    return 0;
431  unsigned char c = *spec_ptr++;
432  if (csalpha(c)) {
433    yylval.num = c;
434    return TOKEN_LETTER;
435  }
436  if (csdigit(c)) {
437    yylval.num = c - '0';
438    return TOKEN_DIGIT;
439  }
440  if (c == '\'') {
441    yylval.str.start = literals.length();
442    for (; spec_ptr < spec_end; spec_ptr++) {
443      if (*spec_ptr == '\'') {
444	if (++spec_ptr < spec_end && *spec_ptr == '\'')
445	  literals += '\'';
446	else {
447	  yylval.str.len = literals.length() - yylval.str.start;
448	  return TOKEN_LITERAL;
449	}
450      }
451      else
452	literals += *spec_ptr;
453    }
454    yylval.str.len = literals.length() - yylval.str.start;
455    return TOKEN_LITERAL;
456  }
457  return c;
458}
459
460int set_label_spec(const char *label_spec)
461{
462  spec_cur = spec_ptr = label_spec;
463  spec_end = strchr(label_spec, '\0');
464  literals.clear();
465  if (yyparse())
466    return 0;
467  delete parsed_label;
468  parsed_label = parse_result;
469  return 1;
470}
471
472int set_date_label_spec(const char *label_spec)
473{
474  spec_cur = spec_ptr = label_spec;
475  spec_end = strchr(label_spec, '\0');
476  literals.clear();
477  if (yyparse())
478    return 0;
479  delete parsed_date_label;
480  parsed_date_label = parse_result;
481  return 1;
482}
483
484int set_short_label_spec(const char *label_spec)
485{
486  spec_cur = spec_ptr = label_spec;
487  spec_end = strchr(label_spec, '\0');
488  literals.clear();
489  if (yyparse())
490    return 0;
491  delete parsed_short_label;
492  parsed_short_label = parse_result;
493  return 1;
494}
495
496void yyerror(const char *message)
497{
498  if (spec_cur < spec_end)
499    command_error("label specification %1 before `%2'", message, spec_cur);
500  else
501    command_error("label specification %1 at end of string",
502		  message, spec_cur);
503}
504
505void at_expr::evaluate(int tentative, const reference &ref,
506		       string &result, substring_position &)
507{
508  if (tentative)
509    ref.canonicalize_authors(result);
510  else {
511    const char *end, *start = ref.get_authors(&end);
512    if (start)
513      result.append(start, end - start);
514  }
515}
516
517void format_expr::evaluate(int tentative, const reference &ref,
518			   string &result, substring_position &)
519{
520  if (tentative)
521    return;
522  const label_info *lp = ref.get_label_ptr();
523  int num = lp == 0 ? ref.get_number() : lp->count;
524  if (type != '0')
525    result += format_serial(type, num + 1);
526  else {
527    const char *ptr = i_to_a(num + first_number);
528    int pad = width - strlen(ptr);
529    while (--pad >= 0)
530      result += '0';
531    result += ptr;
532  }
533}
534
535static const char *format_serial(char c, int n)
536{
537  assert(n > 0);
538  static char buf[128]; // more than enough.
539  switch (c) {
540  case 'i':
541  case 'I':
542    {
543      char *p = buf;
544      // troff uses z and w to represent 10000 and 5000 in Roman
545      // numerals; I can find no historical basis for this usage
546      const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
547      if (n >= 40000)
548	return i_to_a(n);
549      while (n >= 10000) {
550	*p++ = s[0];
551	n -= 10000;
552      }
553      for (int i = 1000; i > 0; i /= 10, s += 2) {
554	int m = n/i;
555	n -= m*i;
556	switch (m) {
557	case 3:
558	  *p++ = s[2];
559	  /* falls through */
560	case 2:
561	  *p++ = s[2];
562	  /* falls through */
563	case 1:
564	  *p++ = s[2];
565	  break;
566	case 4:
567	  *p++ = s[2];
568	  *p++ = s[1];
569	  break;
570	case 8:
571	  *p++ = s[1];
572	  *p++ = s[2];
573	  *p++ = s[2];
574	  *p++ = s[2];
575	  break;
576	case 7:
577	  *p++ = s[1];
578	  *p++ = s[2];
579	  *p++ = s[2];
580	  break;
581	case 6:
582	  *p++ = s[1];
583	  *p++ = s[2];
584	  break;
585	case 5:
586	  *p++ = s[1];
587	  break;
588	case 9:
589	  *p++ = s[2];
590	  *p++ = s[0];
591	}
592      }
593      *p = 0;
594      break;
595    }
596  case 'a':
597  case 'A':
598    {
599      char *p = buf;
600      // this is derived from troff/reg.c
601      while (n > 0) {
602	int d = n % 26;
603	if (d == 0)
604	  d = 26;
605	n -= d;
606	n /= 26;
607	*p++ = c == 'a' ? lowercase_array[d - 1] :
608			       uppercase_array[d - 1];
609      }
610      *p-- = 0;
611      // Reverse it.
612      char *q = buf;
613      while (q < p) {
614	char temp = *q;
615	*q = *p;
616	*p = temp;
617	--p;
618	++q;
619      }
620      break;
621    }
622  default:
623    assert(0);
624  }
625  return buf;
626}
627
628void field_expr::evaluate(int, const reference &ref,
629			  string &result, substring_position &)
630{
631  const char *end;
632  const char *start = ref.get_field(name, &end);
633  if (start) {
634    start = nth_field(number, start, &end);
635    if (start)
636      result.append(start, end - start);
637  }
638}
639
640void literal_expr::evaluate(int, const reference &,
641			    string &result, substring_position &)
642{
643  result += s;
644}
645
646analyzed_expr::analyzed_expr(expression *e)
647: unary_expr(e), flags(e ? e->analyze() : 0)
648{
649}
650
651void analyzed_expr::evaluate(int tentative, const reference &ref,
652			     string &result, substring_position &pos)
653{
654  if (expr)
655    expr->evaluate(tentative, ref, result, pos);
656}
657
658void star_expr::evaluate(int tentative, const reference &ref,
659			 string &result, substring_position &pos)
660{
661  const label_info *lp = ref.get_label_ptr();
662  if (!tentative
663      && (lp == 0 || lp->total > 1)
664      && expr)
665    expr->evaluate(tentative, ref, result, pos);
666}
667
668void separator_expr::evaluate(int tentative, const reference &ref,
669			      string &result, substring_position &pos)
670{
671  int start_length = result.length();
672  int is_first = pos.start < 0;
673  if (expr)
674    expr->evaluate(tentative, ref, result, pos);
675  if (is_first) {
676    pos.start = start_length;
677    pos.length = result.length() - start_length;
678  }
679}
680
681void map_expr::evaluate(int tentative, const reference &ref,
682			string &result, substring_position &)
683{
684  if (expr) {
685    string temp;
686    substring_position temp_pos;
687    expr->evaluate(tentative, ref, temp, temp_pos);
688    (*func)(temp.contents(), temp.contents() + temp.length(), result);
689  }
690}
691
692void extractor_expr::evaluate(int tentative, const reference &ref,
693			      string &result, substring_position &)
694{
695  if (expr) {
696    string temp;
697    substring_position temp_pos;
698    expr->evaluate(tentative, ref, temp, temp_pos);
699    const char *end, *start = (*func)(temp.contents(),
700				      temp.contents() + temp.length(),
701				      &end);
702    switch (part) {
703    case BEFORE:
704      if (start)
705	result.append(temp.contents(), start - temp.contents());
706      else
707	result += temp;
708      break;
709    case MATCH:
710      if (start)
711	result.append(start, end - start);
712      break;
713    case AFTER:
714      if (start)
715	result.append(end, temp.contents() + temp.length() - end);
716      break;
717    default:
718      assert(0);
719    }
720  }
721}
722
723static void first_part(int len, const char *ptr, const char *end,
724			  string &result)
725{
726  for (;;) {
727    const char *token_start = ptr;
728    if (!get_token(&ptr, end))
729      break;
730    const token_info *ti = lookup_token(token_start, ptr);
731    int counts = ti->sortify_non_empty(token_start, ptr);
732    if (counts && --len < 0)
733      break;
734    if (counts || ti->is_accent())
735      result.append(token_start, ptr - token_start);
736  }
737}
738
739static void last_part(int len, const char *ptr, const char *end,
740		      string &result)
741{
742  const char *start = ptr;
743  int count = 0;
744  for (;;) {
745    const char *token_start = ptr;
746    if (!get_token(&ptr, end))
747      break;
748    const token_info *ti = lookup_token(token_start, ptr);
749    if (ti->sortify_non_empty(token_start, ptr))
750      count++;
751  }
752  ptr = start;
753  int skip = count - len;
754  if (skip > 0) {
755    for (;;) {
756      const char *token_start = ptr;
757      if (!get_token(&ptr, end))
758	assert(0);
759      const token_info *ti = lookup_token(token_start, ptr);
760      if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
761	ptr = token_start;
762	break;
763      }
764    }
765  }
766  first_part(len, ptr, end, result);
767}
768
769void truncate_expr::evaluate(int tentative, const reference &ref,
770			     string &result, substring_position &)
771{
772  if (expr) {
773    string temp;
774    substring_position temp_pos;
775    expr->evaluate(tentative, ref, temp, temp_pos);
776    const char *start = temp.contents();
777    const char *end = start + temp.length();
778    if (n > 0)
779      first_part(n, start, end, result);
780    else if (n < 0)
781      last_part(-n, start, end, result);
782  }
783}
784
785void alternative_expr::evaluate(int tentative, const reference &ref,
786				string &result, substring_position &pos)
787{
788  int start_length = result.length();
789  if (expr1)
790    expr1->evaluate(tentative, ref, result, pos);
791  if (result.length() == start_length && expr2)
792    expr2->evaluate(tentative, ref, result, pos);
793}
794
795void list_expr::evaluate(int tentative, const reference &ref,
796			 string &result, substring_position &pos)
797{
798  if (expr1)
799    expr1->evaluate(tentative, ref, result, pos);
800  if (expr2)
801    expr2->evaluate(tentative, ref, result, pos);
802}
803
804void substitute_expr::evaluate(int tentative, const reference &ref,
805			       string &result, substring_position &pos)
806{
807  int start_length = result.length();
808  if (expr1)
809    expr1->evaluate(tentative, ref, result, pos);
810  if (result.length() > start_length && result[result.length() - 1] == '-') {
811    // ought to see if pos covers the -
812    result.set_length(result.length() - 1);
813    if (expr2)
814      expr2->evaluate(tentative, ref, result, pos);
815  }
816}
817
818void conditional_expr::evaluate(int tentative, const reference &ref,
819				string &result, substring_position &pos)
820{
821  string temp;
822  substring_position temp_pos;
823  if (expr1)
824    expr1->evaluate(tentative, ref, temp, temp_pos);
825  if (temp.length() > 0) {
826    if (expr2)
827      expr2->evaluate(tentative, ref, result, pos);
828  }
829  else {
830    if (expr3)
831      expr3->evaluate(tentative, ref, result, pos);
832  }
833}
834
835void reference::pre_compute_label()
836{
837  if (parsed_label != 0
838      && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
839    label.clear();
840    substring_position temp_pos;
841    parsed_label->evaluate(1, *this, label, temp_pos);
842    label_ptr = lookup_label(label);
843  }
844}
845
846void reference::compute_label()
847{
848  label.clear();
849  if (parsed_label)
850    parsed_label->evaluate(0, *this, label, separator_pos);
851  if (short_label_flag && parsed_short_label)
852    parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
853  if (date_as_label) {
854    string new_date;
855    if (parsed_date_label) {
856      substring_position temp_pos;
857      parsed_date_label->evaluate(0, *this, new_date, temp_pos);
858    }
859    set_date(new_date);
860  }
861  if (label_ptr)
862    label_ptr->count += 1;
863}
864
865void reference::immediate_compute_label()
866{
867  if (label_ptr)
868    label_ptr->total = 2;	// force use of disambiguator
869  compute_label();
870}
871
872int reference::merge_labels(reference **v, int n, label_type type,
873			    string &result)
874{
875  if (abbreviate_label_ranges)
876    return merge_labels_by_number(v, n, type, result);
877  else
878    return merge_labels_by_parts(v, n, type, result);
879}
880
881int reference::merge_labels_by_number(reference **v, int n, label_type type,
882				      string &result)
883{
884  if (n <= 1)
885    return 0;
886  int num = get_number();
887  // Only merge three or more labels.
888  if (v[0]->get_number() != num + 1
889      || v[1]->get_number() != num + 2)
890    return 0;
891  int i;
892  for (i = 2; i < n; i++)
893    if (v[i]->get_number() != num + i + 1)
894      break;
895  result = get_label(type);
896  result += label_range_indicator;
897  result += v[i - 1]->get_label(type);
898  return i;
899}
900
901const substring_position &reference::get_separator_pos(label_type type) const
902{
903  if (type == SHORT_LABEL && short_label_flag)
904    return short_separator_pos;
905  else
906    return separator_pos;
907}
908
909const string &reference::get_label(label_type type) const
910{
911  if (type == SHORT_LABEL && short_label_flag)
912    return short_label;
913  else
914    return label;
915}
916
917int reference::merge_labels_by_parts(reference **v, int n, label_type type,
918				     string &result)
919{
920  if (n <= 0)
921    return 0;
922  const string &lb = get_label(type);
923  const substring_position &sp = get_separator_pos(type);
924  if (sp.start < 0
925      || sp.start != v[0]->get_separator_pos(type).start
926      || memcmp(lb.contents(), v[0]->get_label(type).contents(),
927		sp.start) != 0)
928    return 0;
929  result = lb;
930  int i = 0;
931  do {
932    result += separate_label_second_parts;
933    const substring_position &s = v[i]->get_separator_pos(type);
934    int sep_end_pos = s.start + s.length;
935    result.append(v[i]->get_label(type).contents() + sep_end_pos,
936		  v[i]->get_label(type).length() - sep_end_pos);
937  } while (++i < n
938	   && sp.start == v[i]->get_separator_pos(type).start
939	   && memcmp(lb.contents(), v[i]->get_label(type).contents(),
940		     sp.start) == 0);
941  return i;
942}
943
944string label_pool;
945
946label_info::label_info(const string &s)
947: start(label_pool.length()), length(s.length()), count(0), total(1)
948{
949  label_pool += s;
950}
951
952static label_info **label_table = 0;
953static int label_table_size = 0;
954static int label_table_used = 0;
955
956label_info *lookup_label(const string &label)
957{
958  if (label_table == 0) {
959    label_table = new label_info *[17];
960    label_table_size = 17;
961    for (int i = 0; i < 17; i++)
962      label_table[i] = 0;
963  }
964  unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
965  label_info **ptr;
966  for (ptr = label_table + h;
967       *ptr != 0;
968       (ptr == label_table)
969       ? (ptr = label_table + label_table_size - 1)
970       : ptr--)
971    if ((*ptr)->length == label.length()
972	&& memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
973		  label.length()) == 0) {
974      (*ptr)->total += 1;
975      return *ptr;
976    }
977  label_info *result = *ptr = new label_info(label);
978  if (++label_table_used * 2 > label_table_size) {
979    // Rehash the table.
980    label_info **old_table = label_table;
981    int old_size = label_table_size;
982    label_table_size = next_size(label_table_size);
983    label_table = new label_info *[label_table_size];
984    int i;
985    for (i = 0; i < label_table_size; i++)
986      label_table[i] = 0;
987    for (i = 0; i < old_size; i++)
988      if (old_table[i]) {
989	h = hash_string(label_pool.contents() + old_table[i]->start,
990			old_table[i]->length);
991	label_info **p;
992	for (p = label_table + (h % label_table_size);
993	     *p != 0;
994	     (p == label_table)
995	     ? (p = label_table + label_table_size - 1)
996	     : --p)
997	    ;
998	*p = old_table[i];
999	}
1000    a_delete old_table;
1001  }
1002  return result;
1003}
1004
1005void clear_labels()
1006{
1007  for (int i = 0; i < label_table_size; i++) {
1008    delete label_table[i];
1009    label_table[i] = 0;
1010  }
1011  label_table_used = 0;
1012  label_pool.clear();
1013}
1014
1015static void consider_authors(reference **start, reference **end, int i);
1016
1017void compute_labels(reference **v, int n)
1018{
1019  if (parsed_label
1020      && (parsed_label->analyze() & expression::CONTAINS_AT)
1021      && sort_fields.length() >= 2
1022      && sort_fields[0] == 'A'
1023      && sort_fields[1] == '+')
1024    consider_authors(v, v + n, 0);
1025  for (int i = 0; i < n; i++)
1026    v[i]->compute_label();
1027}
1028
1029
1030/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1031where 0 <= i <= N if there exists a reference with a list of authors
1032<B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1033and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1034A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1035<B0,B1,...,BM>.  If a reference needs author i we only have to call
1036need_author(j) for some j >= i such that the reference also needs
1037author j. */
1038
1039/* This function handles 2 tasks:
1040determine which authors are needed (cannot be elided with et al.);
1041determine which authors can have only last names in the labels.
1042
1043References >= start and < end have the same first i author names.
1044Also they're sorted by A+. */
1045
1046static void consider_authors(reference **start, reference **end, int i)
1047{
1048  if (start >= end)
1049    return;
1050  reference **p = start;
1051  if (i >= (*p)->get_nauthors()) {
1052    for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1053      ;
1054    if (p < end && i > 0) {
1055      // If we have an author list <A B C> and an author list <A B C D>,
1056      // then both lists need C.
1057      for (reference **q = start; q < end; q++)
1058	(*q)->need_author(i - 1);
1059    }
1060    start = p;
1061  }
1062  while (p < end) {
1063    reference **last_name_start = p;
1064    reference **name_start = p;
1065    for (++p;
1066	 p < end && i < (*p)->get_nauthors()
1067	 && same_author_last_name(**last_name_start, **p, i);
1068	 p++) {
1069      if (!same_author_name(**name_start, **p, i)) {
1070	consider_authors(name_start, p, i + 1);
1071	name_start = p;
1072      }
1073    }
1074    consider_authors(name_start, p, i + 1);
1075    if (last_name_start == name_start) {
1076      for (reference **q = last_name_start; q < p; q++)
1077	(*q)->set_last_name_unambiguous(i);
1078    }
1079    // If we have an author list <A B C D> and <A B C E>, then the lists
1080    // need author D and E respectively.
1081    if (name_start > start || p < end) {
1082      for (reference **q = last_name_start; q < p; q++)
1083	(*q)->need_author(i);
1084    }
1085  }
1086}
1087
1088int same_author_last_name(const reference &r1, const reference &r2, int n)
1089{
1090  const char *ae1;
1091  const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1092  const char *ae2;
1093  const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1094  if (!as1 && !as2) return 1;	// they are the same
1095  if (!as1 || !as2) return 0;
1096  return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1097}
1098
1099int same_author_name(const reference &r1, const reference &r2, int n)
1100{
1101  const char *ae1;
1102  const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1103  const char *ae2;
1104  const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1105  if (!as1 && !as2) return 1;	// they are the same
1106  if (!as1 || !as2) return 0;
1107  return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1108}
1109
1110
1111void int_set::set(int i)
1112{
1113  assert(i >= 0);
1114  int bytei = i >> 3;
1115  if (bytei >= v.length()) {
1116    int old_length = v.length();
1117    v.set_length(bytei + 1);
1118    for (int j = old_length; j <= bytei; j++)
1119      v[j] = 0;
1120  }
1121  v[bytei] |= 1 << (i & 7);
1122}
1123
1124int int_set::get(int i) const
1125{
1126  assert(i >= 0);
1127  int bytei = i >> 3;
1128  return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1129}
1130
1131void reference::set_last_name_unambiguous(int i)
1132{
1133  last_name_unambiguous.set(i);
1134}
1135
1136void reference::need_author(int n)
1137{
1138  if (n > last_needed_author)
1139    last_needed_author = n;
1140}
1141
1142const char *reference::get_authors(const char **end) const
1143{
1144  if (!computed_authors) {
1145    ((reference *)this)->computed_authors = 1;
1146    string &result = ((reference *)this)->authors;
1147    int na = get_nauthors();
1148    result.clear();
1149    for (int i = 0; i < na; i++) {
1150      if (last_name_unambiguous.get(i)) {
1151	const char *e, *start = get_author_last_name(i, &e);
1152	assert(start != 0);
1153	result.append(start, e - start);
1154      }
1155      else {
1156	const char *e, *start = get_author(i, &e);
1157	assert(start != 0);
1158	result.append(start, e - start);
1159      }
1160      if (i == last_needed_author
1161	  && et_al.length() > 0
1162	  && et_al_min_elide > 0
1163	  && last_needed_author + et_al_min_elide < na
1164	  && na >= et_al_min_total) {
1165	result += et_al;
1166	break;
1167      }
1168      if (i < na - 1) {
1169	if (na == 2)
1170	  result += join_authors_exactly_two;
1171	else if (i < na - 2)
1172	  result += join_authors_default;
1173	else
1174	  result += join_authors_last_two;
1175      }
1176    }
1177  }
1178  const char *start = authors.contents();
1179  *end = start + authors.length();
1180  return start;
1181}
1182
1183int reference::get_nauthors() const
1184{
1185  if (nauthors < 0) {
1186    const char *dummy;
1187    int na;
1188    for (na = 0; get_author(na, &dummy) != 0; na++)
1189      ;
1190    ((reference *)this)->nauthors = na;
1191  }
1192  return nauthors;
1193}
1194