1/*	$NetBSD: label.y,v 1.1.1.4 2006/02/06 18:14:37 wiz Exp $	*/
2
3/* -*- C++ -*-
4   Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004
5   Free Software Foundation, Inc.
6     Written by James Clark (jjc@jclark.com)
7
8This file is part of groff.
9
10groff is free software; you can redistribute it and/or modify it under
11the terms of the GNU General Public License as published by the Free
12Software Foundation; either version 2, or (at your option) any later
13version.
14
15groff is distributed in the hope that it will be useful, but WITHOUT ANY
16WARRANTY; without even the implied warranty of MERCHANTABILITY or
17FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
18for more details.
19
20You should have received a copy of the GNU General Public License along
21with groff; see the file COPYING.  If not, write to the Free Software
22Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
23
24%{
25
26#include "refer.h"
27#include "refid.h"
28#include "ref.h"
29#include "token.h"
30
31int yylex();
32void yyerror(const char *);
33int yyparse();
34
35static const char *format_serial(char c, int n);
36
37struct label_info {
38  int start;
39  int length;
40  int count;
41  int total;
42  label_info(const string &);
43};
44
45label_info *lookup_label(const string &label);
46
47struct expression {
48  enum {
49    // Does the tentative label depend on the reference?
50    CONTAINS_VARIABLE = 01,
51    CONTAINS_STAR = 02,
52    CONTAINS_FORMAT = 04,
53    CONTAINS_AT = 010
54  };
55  virtual ~expression() { }
56  virtual void evaluate(int, const reference &, string &,
57			substring_position &) = 0;
58  virtual unsigned analyze() { return 0; }
59};
60
61class at_expr : public expression {
62public:
63  at_expr() { }
64  void evaluate(int, const reference &, string &, substring_position &);
65  unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
66};
67
68class format_expr : public expression {
69  char type;
70  int width;
71  int first_number;
72public:
73  format_expr(char c, int w = 0, int f = 1)
74    : type(c), width(w), first_number(f) { }
75  void evaluate(int, const reference &, string &, substring_position &);
76  unsigned analyze() { return CONTAINS_FORMAT; }
77};
78
79class field_expr : public expression {
80  int number;
81  char name;
82public:
83  field_expr(char nm, int num) : number(num), name(nm) { }
84  void evaluate(int, const reference &, string &, substring_position &);
85  unsigned analyze() { return CONTAINS_VARIABLE; }
86};
87
88class literal_expr : public expression {
89  string s;
90public:
91  literal_expr(const char *ptr, int len) : s(ptr, len) { }
92  void evaluate(int, const reference &, string &, substring_position &);
93};
94
95class unary_expr : public expression {
96protected:
97  expression *expr;
98public:
99  unary_expr(expression *e) : expr(e) { }
100  ~unary_expr() { delete expr; }
101  void evaluate(int, const reference &, string &, substring_position &) = 0;
102  unsigned analyze() { return expr ? expr->analyze() : 0; }
103};
104
105// This caches the analysis of an expression.
106
107class analyzed_expr : public unary_expr {
108  unsigned flags;
109public:
110  analyzed_expr(expression *);
111  void evaluate(int, const reference &, string &, substring_position &);
112  unsigned analyze() { return flags; }
113};
114
115class star_expr : public unary_expr {
116public:
117  star_expr(expression *e) : unary_expr(e) { }
118  void evaluate(int, const reference &, string &, substring_position &);
119  unsigned analyze() {
120    return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
121	    | CONTAINS_STAR);
122  }
123};
124
125typedef void map_func(const char *, const char *, string &);
126
127class map_expr : public unary_expr {
128  map_func *func;
129public:
130  map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
131  void evaluate(int, const reference &, string &, substring_position &);
132};
133
134typedef const char *extractor_func(const char *, const char *, const char **);
135
136class extractor_expr : public unary_expr {
137  int part;
138  extractor_func *func;
139public:
140  enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
141  extractor_expr(expression *e, extractor_func *f, int pt)
142    : unary_expr(e), part(pt), func(f) { }
143  void evaluate(int, const reference &, string &, substring_position &);
144};
145
146class truncate_expr : public unary_expr {
147  int n;
148public:
149  truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
150  void evaluate(int, const reference &, string &, substring_position &);
151};
152
153class separator_expr : public unary_expr {
154public:
155  separator_expr(expression *e) : unary_expr(e) { }
156  void evaluate(int, const reference &, string &, substring_position &);
157};
158
159class binary_expr : public expression {
160protected:
161  expression *expr1;
162  expression *expr2;
163public:
164  binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
165  ~binary_expr() { delete expr1; delete expr2; }
166  void evaluate(int, const reference &, string &, substring_position &) = 0;
167  unsigned analyze() {
168    return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
169  }
170};
171
172class alternative_expr : public binary_expr {
173public:
174  alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
175  void evaluate(int, const reference &, string &, substring_position &);
176};
177
178class list_expr : public binary_expr {
179public:
180  list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
181  void evaluate(int, const reference &, string &, substring_position &);
182};
183
184class substitute_expr : public binary_expr {
185public:
186  substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
187  void evaluate(int, const reference &, string &, substring_position &);
188};
189
190class ternary_expr : public expression {
191protected:
192  expression *expr1;
193  expression *expr2;
194  expression *expr3;
195public:
196  ternary_expr(expression *e1, expression *e2, expression *e3)
197    : expr1(e1), expr2(e2), expr3(e3) { }
198  ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
199  void evaluate(int, const reference &, string &, substring_position &) = 0;
200  unsigned analyze() {
201    return ((expr1 ? expr1->analyze() : 0)
202	    | (expr2 ? expr2->analyze() : 0)
203	    | (expr3 ? expr3->analyze() : 0));
204  }
205};
206
207class conditional_expr : public ternary_expr {
208public:
209  conditional_expr(expression *e1, expression *e2, expression *e3)
210    : ternary_expr(e1, e2, e3) { }
211  void evaluate(int, const reference &, string &, substring_position &);
212};
213
214static expression *parsed_label = 0;
215static expression *parsed_date_label = 0;
216static expression *parsed_short_label = 0;
217
218static expression *parse_result;
219
220string literals;
221
222%}
223
224%union {
225  int num;
226  expression *expr;
227  struct { int ndigits; int val; } dig;
228  struct { int start; int len; } str;
229}
230
231/* uppercase or lowercase letter */
232%token <num> TOKEN_LETTER
233/* literal characters */
234%token <str> TOKEN_LITERAL
235/* digit */
236%token <num> TOKEN_DIGIT
237
238%type <expr> conditional
239%type <expr> alternative
240%type <expr> list
241%type <expr> string
242%type <expr> substitute
243%type <expr> optional_conditional
244%type <num> number
245%type <dig> digits
246%type <num> optional_number
247%type <num> flag
248
249%%
250
251expr:
252	optional_conditional
253		{ parse_result = ($1 ? new analyzed_expr($1) : 0); }
254	;
255
256conditional:
257	alternative
258		{ $$ = $1; }
259	| alternative '?' optional_conditional ':' conditional
260		{ $$ = new conditional_expr($1, $3, $5); }
261	;
262
263optional_conditional:
264	/* empty */
265		{ $$ = 0; }
266	| conditional
267		{ $$ = $1; }
268	;
269
270alternative:
271	list
272		{ $$ = $1; }
273	| alternative '|' list
274		{ $$ = new alternative_expr($1, $3); }
275	| alternative '&' list
276		{ $$ = new conditional_expr($1, $3, 0); }
277	;
278
279list:
280	substitute
281		{ $$ = $1; }
282	| list substitute
283		{ $$ = new list_expr($1, $2); }
284	;
285
286substitute:
287	string
288		{ $$ = $1; }
289	| substitute '~' string
290		{ $$ = new substitute_expr($1, $3); }
291	;
292
293string:
294	'@'
295		{ $$ = new at_expr; }
296	| TOKEN_LITERAL
297		{
298		  $$ = new literal_expr(literals.contents() + $1.start,
299					$1.len);
300		}
301	| TOKEN_LETTER
302		{ $$ = new field_expr($1, 0); }
303	| TOKEN_LETTER number
304		{ $$ = new field_expr($1, $2 - 1); }
305	| '%' TOKEN_LETTER
306		{
307		  switch ($2) {
308		  case 'I':
309		  case 'i':
310		  case 'A':
311		  case 'a':
312		    $$ = new format_expr($2);
313		    break;
314		  default:
315		    command_error("unrecognized format `%1'", char($2));
316		    $$ = new format_expr('a');
317		    break;
318		  }
319		}
320
321	| '%' digits
322		{
323		  $$ = new format_expr('0', $2.ndigits, $2.val);
324		}
325	| string '.' flag TOKEN_LETTER optional_number
326		{
327		  switch ($4) {
328		  case 'l':
329		    $$ = new map_expr($1, lowercase);
330		    break;
331		  case 'u':
332		    $$ = new map_expr($1, uppercase);
333		    break;
334		  case 'c':
335		    $$ = new map_expr($1, capitalize);
336		    break;
337		  case 'r':
338		    $$ = new map_expr($1, reverse_name);
339		    break;
340		  case 'a':
341		    $$ = new map_expr($1, abbreviate_name);
342		    break;
343		  case 'y':
344		    $$ = new extractor_expr($1, find_year, $3);
345		    break;
346		  case 'n':
347		    $$ = new extractor_expr($1, find_last_name, $3);
348		    break;
349		  default:
350		    $$ = $1;
351		    command_error("unknown function `%1'", char($4));
352		    break;
353		  }
354		}
355
356	| string '+' number
357		{ $$ = new truncate_expr($1, $3); }
358	| string '-' number
359		{ $$ = new truncate_expr($1, -$3); }
360	| string '*'
361		{ $$ = new star_expr($1); }
362	| '(' optional_conditional ')'
363		{ $$ = $2; }
364	| '<' optional_conditional '>'
365		{ $$ = new separator_expr($2); }
366	;
367
368optional_number:
369	/* empty */
370		{ $$ = -1; }
371	| number
372		{ $$ = $1; }
373	;
374
375number:
376	TOKEN_DIGIT
377		{ $$ = $1; }
378	| number TOKEN_DIGIT
379		{ $$ = $1*10 + $2; }
380	;
381
382digits:
383	TOKEN_DIGIT
384		{ $$.ndigits = 1; $$.val = $1; }
385	| digits TOKEN_DIGIT
386		{ $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
387	;
388
389
390flag:
391	/* empty */
392		{ $$ = 0; }
393	| '+'
394		{ $$ = 1; }
395	| '-'
396		{ $$ = -1; }
397	;
398
399%%
400
401/* bison defines const to be empty unless __STDC__ is defined, which it
402isn't under cfront */
403
404#ifdef const
405#undef const
406#endif
407
408const char *spec_ptr;
409const char *spec_end;
410const char *spec_cur;
411
412static char uppercase_array[] = {
413  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
414  'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
415  'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
416  'Y', 'Z',
417};
418
419static char lowercase_array[] = {
420  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
421  'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
422  'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
423  'y', 'z',
424};
425
426int yylex()
427{
428  while (spec_ptr < spec_end && csspace(*spec_ptr))
429    spec_ptr++;
430  spec_cur = spec_ptr;
431  if (spec_ptr >= spec_end)
432    return 0;
433  unsigned char c = *spec_ptr++;
434  if (csalpha(c)) {
435    yylval.num = c;
436    return TOKEN_LETTER;
437  }
438  if (csdigit(c)) {
439    yylval.num = c - '0';
440    return TOKEN_DIGIT;
441  }
442  if (c == '\'') {
443    yylval.str.start = literals.length();
444    for (; spec_ptr < spec_end; spec_ptr++) {
445      if (*spec_ptr == '\'') {
446	if (++spec_ptr < spec_end && *spec_ptr == '\'')
447	  literals += '\'';
448	else {
449	  yylval.str.len = literals.length() - yylval.str.start;
450	  return TOKEN_LITERAL;
451	}
452      }
453      else
454	literals += *spec_ptr;
455    }
456    yylval.str.len = literals.length() - yylval.str.start;
457    return TOKEN_LITERAL;
458  }
459  return c;
460}
461
462int set_label_spec(const char *label_spec)
463{
464  spec_cur = spec_ptr = label_spec;
465  spec_end = strchr(label_spec, '\0');
466  literals.clear();
467  if (yyparse())
468    return 0;
469  delete parsed_label;
470  parsed_label = parse_result;
471  return 1;
472}
473
474int set_date_label_spec(const char *label_spec)
475{
476  spec_cur = spec_ptr = label_spec;
477  spec_end = strchr(label_spec, '\0');
478  literals.clear();
479  if (yyparse())
480    return 0;
481  delete parsed_date_label;
482  parsed_date_label = parse_result;
483  return 1;
484}
485
486int set_short_label_spec(const char *label_spec)
487{
488  spec_cur = spec_ptr = label_spec;
489  spec_end = strchr(label_spec, '\0');
490  literals.clear();
491  if (yyparse())
492    return 0;
493  delete parsed_short_label;
494  parsed_short_label = parse_result;
495  return 1;
496}
497
498void yyerror(const char *message)
499{
500  if (spec_cur < spec_end)
501    command_error("label specification %1 before `%2'", message, spec_cur);
502  else
503    command_error("label specification %1 at end of string",
504		  message, spec_cur);
505}
506
507void at_expr::evaluate(int tentative, const reference &ref,
508		       string &result, substring_position &)
509{
510  if (tentative)
511    ref.canonicalize_authors(result);
512  else {
513    const char *end, *start = ref.get_authors(&end);
514    if (start)
515      result.append(start, end - start);
516  }
517}
518
519void format_expr::evaluate(int tentative, const reference &ref,
520			   string &result, substring_position &)
521{
522  if (tentative)
523    return;
524  const label_info *lp = ref.get_label_ptr();
525  int num = lp == 0 ? ref.get_number() : lp->count;
526  if (type != '0')
527    result += format_serial(type, num + 1);
528  else {
529    const char *ptr = i_to_a(num + first_number);
530    int pad = width - strlen(ptr);
531    while (--pad >= 0)
532      result += '0';
533    result += ptr;
534  }
535}
536
537static const char *format_serial(char c, int n)
538{
539  assert(n > 0);
540  static char buf[128]; // more than enough.
541  switch (c) {
542  case 'i':
543  case 'I':
544    {
545      char *p = buf;
546      // troff uses z and w to represent 10000 and 5000 in Roman
547      // numerals; I can find no historical basis for this usage
548      const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
549      if (n >= 40000)
550	return i_to_a(n);
551      while (n >= 10000) {
552	*p++ = s[0];
553	n -= 10000;
554      }
555      for (int i = 1000; i > 0; i /= 10, s += 2) {
556	int m = n/i;
557	n -= m*i;
558	switch (m) {
559	case 3:
560	  *p++ = s[2];
561	  /* falls through */
562	case 2:
563	  *p++ = s[2];
564	  /* falls through */
565	case 1:
566	  *p++ = s[2];
567	  break;
568	case 4:
569	  *p++ = s[2];
570	  *p++ = s[1];
571	  break;
572	case 8:
573	  *p++ = s[1];
574	  *p++ = s[2];
575	  *p++ = s[2];
576	  *p++ = s[2];
577	  break;
578	case 7:
579	  *p++ = s[1];
580	  *p++ = s[2];
581	  *p++ = s[2];
582	  break;
583	case 6:
584	  *p++ = s[1];
585	  *p++ = s[2];
586	  break;
587	case 5:
588	  *p++ = s[1];
589	  break;
590	case 9:
591	  *p++ = s[2];
592	  *p++ = s[0];
593	}
594      }
595      *p = 0;
596      break;
597    }
598  case 'a':
599  case 'A':
600    {
601      char *p = buf;
602      // this is derived from troff/reg.c
603      while (n > 0) {
604	int d = n % 26;
605	if (d == 0)
606	  d = 26;
607	n -= d;
608	n /= 26;
609	*p++ = c == 'a' ? lowercase_array[d - 1] :
610			       uppercase_array[d - 1];
611      }
612      *p-- = 0;
613      // Reverse it.
614      char *q = buf;
615      while (q < p) {
616	char temp = *q;
617	*q = *p;
618	*p = temp;
619	--p;
620	++q;
621      }
622      break;
623    }
624  default:
625    assert(0);
626  }
627  return buf;
628}
629
630void field_expr::evaluate(int, const reference &ref,
631			  string &result, substring_position &)
632{
633  const char *end;
634  const char *start = ref.get_field(name, &end);
635  if (start) {
636    start = nth_field(number, start, &end);
637    if (start)
638      result.append(start, end - start);
639  }
640}
641
642void literal_expr::evaluate(int, const reference &,
643			    string &result, substring_position &)
644{
645  result += s;
646}
647
648analyzed_expr::analyzed_expr(expression *e)
649: unary_expr(e), flags(e ? e->analyze() : 0)
650{
651}
652
653void analyzed_expr::evaluate(int tentative, const reference &ref,
654			     string &result, substring_position &pos)
655{
656  if (expr)
657    expr->evaluate(tentative, ref, result, pos);
658}
659
660void star_expr::evaluate(int tentative, const reference &ref,
661			 string &result, substring_position &pos)
662{
663  const label_info *lp = ref.get_label_ptr();
664  if (!tentative
665      && (lp == 0 || lp->total > 1)
666      && expr)
667    expr->evaluate(tentative, ref, result, pos);
668}
669
670void separator_expr::evaluate(int tentative, const reference &ref,
671			      string &result, substring_position &pos)
672{
673  int start_length = result.length();
674  int is_first = pos.start < 0;
675  if (expr)
676    expr->evaluate(tentative, ref, result, pos);
677  if (is_first) {
678    pos.start = start_length;
679    pos.length = result.length() - start_length;
680  }
681}
682
683void map_expr::evaluate(int tentative, const reference &ref,
684			string &result, substring_position &)
685{
686  if (expr) {
687    string temp;
688    substring_position temp_pos;
689    expr->evaluate(tentative, ref, temp, temp_pos);
690    (*func)(temp.contents(), temp.contents() + temp.length(), result);
691  }
692}
693
694void extractor_expr::evaluate(int tentative, const reference &ref,
695			      string &result, substring_position &)
696{
697  if (expr) {
698    string temp;
699    substring_position temp_pos;
700    expr->evaluate(tentative, ref, temp, temp_pos);
701    const char *end, *start = (*func)(temp.contents(),
702				      temp.contents() + temp.length(),
703				      &end);
704    switch (part) {
705    case BEFORE:
706      if (start)
707	result.append(temp.contents(), start - temp.contents());
708      else
709	result += temp;
710      break;
711    case MATCH:
712      if (start)
713	result.append(start, end - start);
714      break;
715    case AFTER:
716      if (start)
717	result.append(end, temp.contents() + temp.length() - end);
718      break;
719    default:
720      assert(0);
721    }
722  }
723}
724
725static void first_part(int len, const char *ptr, const char *end,
726			  string &result)
727{
728  for (;;) {
729    const char *token_start = ptr;
730    if (!get_token(&ptr, end))
731      break;
732    const token_info *ti = lookup_token(token_start, ptr);
733    int counts = ti->sortify_non_empty(token_start, ptr);
734    if (counts && --len < 0)
735      break;
736    if (counts || ti->is_accent())
737      result.append(token_start, ptr - token_start);
738  }
739}
740
741static void last_part(int len, const char *ptr, const char *end,
742		      string &result)
743{
744  const char *start = ptr;
745  int count = 0;
746  for (;;) {
747    const char *token_start = ptr;
748    if (!get_token(&ptr, end))
749      break;
750    const token_info *ti = lookup_token(token_start, ptr);
751    if (ti->sortify_non_empty(token_start, ptr))
752      count++;
753  }
754  ptr = start;
755  int skip = count - len;
756  if (skip > 0) {
757    for (;;) {
758      const char *token_start = ptr;
759      if (!get_token(&ptr, end))
760	assert(0);
761      const token_info *ti = lookup_token(token_start, ptr);
762      if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
763	ptr = token_start;
764	break;
765      }
766    }
767  }
768  first_part(len, ptr, end, result);
769}
770
771void truncate_expr::evaluate(int tentative, const reference &ref,
772			     string &result, substring_position &)
773{
774  if (expr) {
775    string temp;
776    substring_position temp_pos;
777    expr->evaluate(tentative, ref, temp, temp_pos);
778    const char *start = temp.contents();
779    const char *end = start + temp.length();
780    if (n > 0)
781      first_part(n, start, end, result);
782    else if (n < 0)
783      last_part(-n, start, end, result);
784  }
785}
786
787void alternative_expr::evaluate(int tentative, const reference &ref,
788				string &result, substring_position &pos)
789{
790  int start_length = result.length();
791  if (expr1)
792    expr1->evaluate(tentative, ref, result, pos);
793  if (result.length() == start_length && expr2)
794    expr2->evaluate(tentative, ref, result, pos);
795}
796
797void list_expr::evaluate(int tentative, const reference &ref,
798			 string &result, substring_position &pos)
799{
800  if (expr1)
801    expr1->evaluate(tentative, ref, result, pos);
802  if (expr2)
803    expr2->evaluate(tentative, ref, result, pos);
804}
805
806void substitute_expr::evaluate(int tentative, const reference &ref,
807			       string &result, substring_position &pos)
808{
809  int start_length = result.length();
810  if (expr1)
811    expr1->evaluate(tentative, ref, result, pos);
812  if (result.length() > start_length && result[result.length() - 1] == '-') {
813    // ought to see if pos covers the -
814    result.set_length(result.length() - 1);
815    if (expr2)
816      expr2->evaluate(tentative, ref, result, pos);
817  }
818}
819
820void conditional_expr::evaluate(int tentative, const reference &ref,
821				string &result, substring_position &pos)
822{
823  string temp;
824  substring_position temp_pos;
825  if (expr1)
826    expr1->evaluate(tentative, ref, temp, temp_pos);
827  if (temp.length() > 0) {
828    if (expr2)
829      expr2->evaluate(tentative, ref, result, pos);
830  }
831  else {
832    if (expr3)
833      expr3->evaluate(tentative, ref, result, pos);
834  }
835}
836
837void reference::pre_compute_label()
838{
839  if (parsed_label != 0
840      && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
841    label.clear();
842    substring_position temp_pos;
843    parsed_label->evaluate(1, *this, label, temp_pos);
844    label_ptr = lookup_label(label);
845  }
846}
847
848void reference::compute_label()
849{
850  label.clear();
851  if (parsed_label)
852    parsed_label->evaluate(0, *this, label, separator_pos);
853  if (short_label_flag && parsed_short_label)
854    parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
855  if (date_as_label) {
856    string new_date;
857    if (parsed_date_label) {
858      substring_position temp_pos;
859      parsed_date_label->evaluate(0, *this, new_date, temp_pos);
860    }
861    set_date(new_date);
862  }
863  if (label_ptr)
864    label_ptr->count += 1;
865}
866
867void reference::immediate_compute_label()
868{
869  if (label_ptr)
870    label_ptr->total = 2;	// force use of disambiguator
871  compute_label();
872}
873
874int reference::merge_labels(reference **v, int n, label_type type,
875			    string &result)
876{
877  if (abbreviate_label_ranges)
878    return merge_labels_by_number(v, n, type, result);
879  else
880    return merge_labels_by_parts(v, n, type, result);
881}
882
883int reference::merge_labels_by_number(reference **v, int n, label_type type,
884				      string &result)
885{
886  if (n <= 1)
887    return 0;
888  int num = get_number();
889  // Only merge three or more labels.
890  if (v[0]->get_number() != num + 1
891      || v[1]->get_number() != num + 2)
892    return 0;
893  int i;
894  for (i = 2; i < n; i++)
895    if (v[i]->get_number() != num + i + 1)
896      break;
897  result = get_label(type);
898  result += label_range_indicator;
899  result += v[i - 1]->get_label(type);
900  return i;
901}
902
903const substring_position &reference::get_separator_pos(label_type type) const
904{
905  if (type == SHORT_LABEL && short_label_flag)
906    return short_separator_pos;
907  else
908    return separator_pos;
909}
910
911const string &reference::get_label(label_type type) const
912{
913  if (type == SHORT_LABEL && short_label_flag)
914    return short_label;
915  else
916    return label;
917}
918
919int reference::merge_labels_by_parts(reference **v, int n, label_type type,
920				     string &result)
921{
922  if (n <= 0)
923    return 0;
924  const string &lb = get_label(type);
925  const substring_position &sp = get_separator_pos(type);
926  if (sp.start < 0
927      || sp.start != v[0]->get_separator_pos(type).start
928      || memcmp(lb.contents(), v[0]->get_label(type).contents(),
929		sp.start) != 0)
930    return 0;
931  result = lb;
932  int i = 0;
933  do {
934    result += separate_label_second_parts;
935    const substring_position &s = v[i]->get_separator_pos(type);
936    int sep_end_pos = s.start + s.length;
937    result.append(v[i]->get_label(type).contents() + sep_end_pos,
938		  v[i]->get_label(type).length() - sep_end_pos);
939  } while (++i < n
940	   && sp.start == v[i]->get_separator_pos(type).start
941	   && memcmp(lb.contents(), v[i]->get_label(type).contents(),
942		     sp.start) == 0);
943  return i;
944}
945
946string label_pool;
947
948label_info::label_info(const string &s)
949: start(label_pool.length()), length(s.length()), count(0), total(1)
950{
951  label_pool += s;
952}
953
954static label_info **label_table = 0;
955static int label_table_size = 0;
956static int label_table_used = 0;
957
958label_info *lookup_label(const string &label)
959{
960  if (label_table == 0) {
961    label_table = new label_info *[17];
962    label_table_size = 17;
963    for (int i = 0; i < 17; i++)
964      label_table[i] = 0;
965  }
966  unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
967  label_info **ptr;
968  for (ptr = label_table + h;
969       *ptr != 0;
970       (ptr == label_table)
971       ? (ptr = label_table + label_table_size - 1)
972       : ptr--)
973    if ((*ptr)->length == label.length()
974	&& memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
975		  label.length()) == 0) {
976      (*ptr)->total += 1;
977      return *ptr;
978    }
979  label_info *result = *ptr = new label_info(label);
980  if (++label_table_used * 2 > label_table_size) {
981    // Rehash the table.
982    label_info **old_table = label_table;
983    int old_size = label_table_size;
984    label_table_size = next_size(label_table_size);
985    label_table = new label_info *[label_table_size];
986    int i;
987    for (i = 0; i < label_table_size; i++)
988      label_table[i] = 0;
989    for (i = 0; i < old_size; i++)
990      if (old_table[i]) {
991	h = hash_string(label_pool.contents() + old_table[i]->start,
992			old_table[i]->length);
993	label_info **p;
994	for (p = label_table + (h % label_table_size);
995	     *p != 0;
996	     (p == label_table)
997	     ? (p = label_table + label_table_size - 1)
998	     : --p)
999	    ;
1000	*p = old_table[i];
1001	}
1002    a_delete old_table;
1003  }
1004  return result;
1005}
1006
1007void clear_labels()
1008{
1009  for (int i = 0; i < label_table_size; i++) {
1010    delete label_table[i];
1011    label_table[i] = 0;
1012  }
1013  label_table_used = 0;
1014  label_pool.clear();
1015}
1016
1017static void consider_authors(reference **start, reference **end, int i);
1018
1019void compute_labels(reference **v, int n)
1020{
1021  if (parsed_label
1022      && (parsed_label->analyze() & expression::CONTAINS_AT)
1023      && sort_fields.length() >= 2
1024      && sort_fields[0] == 'A'
1025      && sort_fields[1] == '+')
1026    consider_authors(v, v + n, 0);
1027  for (int i = 0; i < n; i++)
1028    v[i]->compute_label();
1029}
1030
1031
1032/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1033where 0 <= i <= N if there exists a reference with a list of authors
1034<B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1035and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1036A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1037<B0,B1,...,BM>.  If a reference needs author i we only have to call
1038need_author(j) for some j >= i such that the reference also needs
1039author j. */
1040
1041/* This function handles 2 tasks:
1042determine which authors are needed (cannot be elided with et al.);
1043determine which authors can have only last names in the labels.
1044
1045References >= start and < end have the same first i author names.
1046Also they're sorted by A+. */
1047
1048static void consider_authors(reference **start, reference **end, int i)
1049{
1050  if (start >= end)
1051    return;
1052  reference **p = start;
1053  if (i >= (*p)->get_nauthors()) {
1054    for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1055      ;
1056    if (p < end && i > 0) {
1057      // If we have an author list <A B C> and an author list <A B C D>,
1058      // then both lists need C.
1059      for (reference **q = start; q < end; q++)
1060	(*q)->need_author(i - 1);
1061    }
1062    start = p;
1063  }
1064  while (p < end) {
1065    reference **last_name_start = p;
1066    reference **name_start = p;
1067    for (++p;
1068	 p < end && i < (*p)->get_nauthors()
1069	 && same_author_last_name(**last_name_start, **p, i);
1070	 p++) {
1071      if (!same_author_name(**name_start, **p, i)) {
1072	consider_authors(name_start, p, i + 1);
1073	name_start = p;
1074      }
1075    }
1076    consider_authors(name_start, p, i + 1);
1077    if (last_name_start == name_start) {
1078      for (reference **q = last_name_start; q < p; q++)
1079	(*q)->set_last_name_unambiguous(i);
1080    }
1081    // If we have an author list <A B C D> and <A B C E>, then the lists
1082    // need author D and E respectively.
1083    if (name_start > start || p < end) {
1084      for (reference **q = last_name_start; q < p; q++)
1085	(*q)->need_author(i);
1086    }
1087  }
1088}
1089
1090int same_author_last_name(const reference &r1, const reference &r2, int n)
1091{
1092  const char *ae1;
1093  const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1094  const char *ae2;
1095  const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1096  if (!as1 && !as2) return 1;	// they are the same
1097  if (!as1 || !as2) return 0;
1098  return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1099}
1100
1101int same_author_name(const reference &r1, const reference &r2, int n)
1102{
1103  const char *ae1;
1104  const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1105  const char *ae2;
1106  const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1107  if (!as1 && !as2) return 1;	// they are the same
1108  if (!as1 || !as2) return 0;
1109  return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1110}
1111
1112
1113void int_set::set(int i)
1114{
1115  assert(i >= 0);
1116  int bytei = i >> 3;
1117  if (bytei >= v.length()) {
1118    int old_length = v.length();
1119    v.set_length(bytei + 1);
1120    for (int j = old_length; j <= bytei; j++)
1121      v[j] = 0;
1122  }
1123  v[bytei] |= 1 << (i & 7);
1124}
1125
1126int int_set::get(int i) const
1127{
1128  assert(i >= 0);
1129  int bytei = i >> 3;
1130  return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1131}
1132
1133void reference::set_last_name_unambiguous(int i)
1134{
1135  last_name_unambiguous.set(i);
1136}
1137
1138void reference::need_author(int n)
1139{
1140  if (n > last_needed_author)
1141    last_needed_author = n;
1142}
1143
1144const char *reference::get_authors(const char **end) const
1145{
1146  if (!computed_authors) {
1147    ((reference *)this)->computed_authors = 1;
1148    string &result = ((reference *)this)->authors;
1149    int na = get_nauthors();
1150    result.clear();
1151    for (int i = 0; i < na; i++) {
1152      if (last_name_unambiguous.get(i)) {
1153	const char *e, *start = get_author_last_name(i, &e);
1154	assert(start != 0);
1155	result.append(start, e - start);
1156      }
1157      else {
1158	const char *e, *start = get_author(i, &e);
1159	assert(start != 0);
1160	result.append(start, e - start);
1161      }
1162      if (i == last_needed_author
1163	  && et_al.length() > 0
1164	  && et_al_min_elide > 0
1165	  && last_needed_author + et_al_min_elide < na
1166	  && na >= et_al_min_total) {
1167	result += et_al;
1168	break;
1169      }
1170      if (i < na - 1) {
1171	if (na == 2)
1172	  result += join_authors_exactly_two;
1173	else if (i < na - 2)
1174	  result += join_authors_default;
1175	else
1176	  result += join_authors_last_two;
1177      }
1178    }
1179  }
1180  const char *start = authors.contents();
1181  *end = start + authors.length();
1182  return start;
1183}
1184
1185int reference::get_nauthors() const
1186{
1187  if (nauthors < 0) {
1188    const char *dummy;
1189    int na;
1190    for (na = 0; get_author(na, &dummy) != 0; na++)
1191      ;
1192    ((reference *)this)->nauthors = na;
1193  }
1194  return nauthors;
1195}
1196