label.y revision 75584
1/* -*- C++ -*-
2   Copyright (C) 1989, 1990, 1991, 1992, 2000 Free Software Foundation, Inc.
3     Written by James Clark (jjc@jclark.com)
4
5This file is part of groff.
6
7groff is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free
9Software Foundation; either version 2, or (at your option) any later
10version.
11
12groff is distributed in the hope that it will be useful, but WITHOUT ANY
13WARRANTY; without even the implied warranty of MERCHANTABILITY or
14FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15for more details.
16
17You should have received a copy of the GNU General Public License along
18with groff; see the file COPYING.  If not, write to the Free Software
19Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20
21%{
22
23#include "refer.h"
24#include "refid.h"
25#include "ref.h"
26#include "token.h"
27
28int yylex();
29void yyerror(const char *);
30int yyparse();
31
32static const char *format_serial(char c, int n);
33
34struct label_info {
35  int start;
36  int length;
37  int count;
38  int total;
39  label_info(const string &);
40};
41
42label_info *lookup_label(const string &label);
43
44struct expression {
45  enum {
46    // Does the tentative label depend on the reference?
47    CONTAINS_VARIABLE = 01,
48    CONTAINS_STAR = 02,
49    CONTAINS_FORMAT = 04,
50    CONTAINS_AT = 010
51  };
52  virtual ~expression() { }
53  virtual void evaluate(int, const reference &, string &,
54			substring_position &) = 0;
55  virtual unsigned analyze() { return 0; }
56};
57
58class at_expr : public expression {
59public:
60  at_expr() { }
61  void evaluate(int, const reference &, string &, substring_position &);
62  unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
63};
64
65class format_expr : public expression {
66  char type;
67  int width;
68  int first_number;
69public:
70  format_expr(char c, int w = 0, int f = 1)
71    : type(c), width(w), first_number(f) { }
72  void evaluate(int, const reference &, string &, substring_position &);
73  unsigned analyze() { return CONTAINS_FORMAT; }
74};
75
76class field_expr : public expression {
77  int number;
78  char name;
79public:
80  field_expr(char nm, int num) : number(num), name(nm) { }
81  void evaluate(int, const reference &, string &, substring_position &);
82  unsigned analyze() { return CONTAINS_VARIABLE; }
83};
84
85class literal_expr : public expression {
86  string s;
87public:
88  literal_expr(const char *ptr, int len) : s(ptr, len) { }
89  void evaluate(int, const reference &, string &, substring_position &);
90};
91
92class unary_expr : public expression {
93protected:
94  expression *expr;
95public:
96  unary_expr(expression *e) : expr(e) { }
97  ~unary_expr() { delete expr; }
98  void evaluate(int, const reference &, string &, substring_position &) = 0;
99  unsigned analyze() { return expr ? expr->analyze() : 0; }
100};
101
102// This caches the analysis of an expression.
103
104class analyzed_expr : public unary_expr {
105  unsigned flags;
106public:
107  analyzed_expr(expression *);
108  void evaluate(int, const reference &, string &, substring_position &);
109  unsigned analyze() { return flags; }
110};
111
112class star_expr : public unary_expr {
113public:
114  star_expr(expression *e) : unary_expr(e) { }
115  void evaluate(int, const reference &, string &, substring_position &);
116  unsigned analyze() {
117    return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
118	    | CONTAINS_STAR);
119  }
120};
121
122typedef void map_func(const char *, const char *, string &);
123
124class map_expr : public unary_expr {
125  map_func *func;
126public:
127  map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
128  void evaluate(int, const reference &, string &, substring_position &);
129};
130
131typedef const char *extractor_func(const char *, const char *, const char **);
132
133class extractor_expr : public unary_expr {
134  int part;
135  extractor_func *func;
136public:
137  enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
138  extractor_expr(expression *e, extractor_func *f, int pt)
139    : unary_expr(e), part(pt), func(f) { }
140  void evaluate(int, const reference &, string &, substring_position &);
141};
142
143class truncate_expr : public unary_expr {
144  int n;
145public:
146  truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
147  void evaluate(int, const reference &, string &, substring_position &);
148};
149
150class separator_expr : public unary_expr {
151public:
152  separator_expr(expression *e) : unary_expr(e) { }
153  void evaluate(int, const reference &, string &, substring_position &);
154};
155
156class binary_expr : public expression {
157protected:
158  expression *expr1;
159  expression *expr2;
160public:
161  binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
162  ~binary_expr() { delete expr1; delete expr2; }
163  void evaluate(int, const reference &, string &, substring_position &) = 0;
164  unsigned analyze() {
165    return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
166  }
167};
168
169class alternative_expr : public binary_expr {
170public:
171  alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
172  void evaluate(int, const reference &, string &, substring_position &);
173};
174
175class list_expr : public binary_expr {
176public:
177  list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
178  void evaluate(int, const reference &, string &, substring_position &);
179};
180
181class substitute_expr : public binary_expr {
182public:
183  substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
184  void evaluate(int, const reference &, string &, substring_position &);
185};
186
187class ternary_expr : public expression {
188protected:
189  expression *expr1;
190  expression *expr2;
191  expression *expr3;
192public:
193  ternary_expr(expression *e1, expression *e2, expression *e3)
194    : expr1(e1), expr2(e2), expr3(e3) { }
195  ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
196  void evaluate(int, const reference &, string &, substring_position &) = 0;
197  unsigned analyze() {
198    return ((expr1 ? expr1->analyze() : 0)
199	    | (expr2 ? expr2->analyze() : 0)
200	    | (expr3 ? expr3->analyze() : 0));
201  }
202};
203
204class conditional_expr : public ternary_expr {
205public:
206  conditional_expr(expression *e1, expression *e2, expression *e3)
207    : ternary_expr(e1, e2, e3) { }
208  void evaluate(int, const reference &, string &, substring_position &);
209};
210
211static expression *parsed_label = 0;
212static expression *parsed_date_label = 0;
213static expression *parsed_short_label = 0;
214
215static expression *parse_result;
216
217string literals;
218
219%}
220
221%union {
222  int num;
223  expression *expr;
224  struct { int ndigits; int val; } dig;
225  struct { int start; int len; } str;
226}
227
228/* uppercase or lowercase letter */
229%token <num> TOKEN_LETTER
230/* literal characters */
231%token <str> TOKEN_LITERAL
232/* digit */
233%token <num> TOKEN_DIGIT
234
235%type <expr> conditional
236%type <expr> alternative
237%type <expr> list
238%type <expr> string
239%type <expr> substitute
240%type <expr> optional_conditional
241%type <num> number
242%type <dig> digits
243%type <num> optional_number
244%type <num> flag
245
246%%
247
248expr:
249	optional_conditional
250		{ parse_result = ($1 ? new analyzed_expr($1) : 0); }
251	;
252
253conditional:
254	alternative
255		{ $$ = $1; }
256	| alternative '?' optional_conditional ':' conditional
257		{ $$ = new conditional_expr($1, $3, $5); }
258	;
259
260optional_conditional:
261	/* empty */
262		{ $$ = 0; }
263	| conditional
264		{ $$ = $1; }
265	;
266
267alternative:
268	list
269		{ $$ = $1; }
270	| alternative '|' list
271		{ $$ = new alternative_expr($1, $3); }
272	| alternative '&' list
273		{ $$ = new conditional_expr($1, $3, 0); }
274	;
275
276list:
277	substitute
278		{ $$ = $1; }
279	| list substitute
280		{ $$ = new list_expr($1, $2); }
281	;
282
283substitute:
284	string
285		{ $$ = $1; }
286	| substitute '~' string
287		{ $$ = new substitute_expr($1, $3); }
288	;
289
290string:
291	'@'
292		{ $$ = new at_expr; }
293	| TOKEN_LITERAL
294		{
295		  $$ = new literal_expr(literals.contents() + $1.start,
296					$1.len);
297		}
298	| TOKEN_LETTER
299		{ $$ = new field_expr($1, 0); }
300	| TOKEN_LETTER number
301		{ $$ = new field_expr($1, $2 - 1); }
302	| '%' TOKEN_LETTER
303		{
304		  switch ($2) {
305		  case 'I':
306		  case 'i':
307		  case 'A':
308		  case 'a':
309		    $$ = new format_expr($2);
310		    break;
311		  default:
312		    command_error("unrecognized format `%1'", char($2));
313		    $$ = new format_expr('a');
314		    break;
315		  }
316		}
317
318	| '%' digits
319		{
320		  $$ = new format_expr('0', $2.ndigits, $2.val);
321		}
322	| string '.' flag TOKEN_LETTER optional_number
323		{
324		  switch ($4) {
325		  case 'l':
326		    $$ = new map_expr($1, lowercase);
327		    break;
328		  case 'u':
329		    $$ = new map_expr($1, uppercase);
330		    break;
331		  case 'c':
332		    $$ = new map_expr($1, capitalize);
333		    break;
334		  case 'r':
335		    $$ = new map_expr($1, reverse_name);
336		    break;
337		  case 'a':
338		    $$ = new map_expr($1, abbreviate_name);
339		    break;
340		  case 'y':
341		    $$ = new extractor_expr($1, find_year, $3);
342		    break;
343		  case 'n':
344		    $$ = new extractor_expr($1, find_last_name, $3);
345		    break;
346		  default:
347		    $$ = $1;
348		    command_error("unknown function `%1'", char($4));
349		    break;
350		  }
351		}
352
353	| string '+' number
354		{ $$ = new truncate_expr($1, $3); }
355	| string '-' number
356		{ $$ = new truncate_expr($1, -$3); }
357	| string '*'
358		{ $$ = new star_expr($1); }
359	| '(' optional_conditional ')'
360		{ $$ = $2; }
361	| '<' optional_conditional '>'
362		{ $$ = new separator_expr($2); }
363	;
364
365optional_number:
366	/* empty */
367		{ $$ = -1; }
368	| number
369		{ $$ = $1; }
370	;
371
372number:
373	TOKEN_DIGIT
374		{ $$ = $1; }
375	| number TOKEN_DIGIT
376		{ $$ = $1*10 + $2; }
377	;
378
379digits:
380	TOKEN_DIGIT
381		{ $$.ndigits = 1; $$.val = $1; }
382	| digits TOKEN_DIGIT
383		{ $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
384	;
385
386
387flag:
388	/* empty */
389		{ $$ = 0; }
390	| '+'
391		{ $$ = 1; }
392	| '-'
393		{ $$ = -1; }
394	;
395
396%%
397
398/* bison defines const to be empty unless __STDC__ is defined, which it
399isn't under cfront */
400
401#ifdef const
402#undef const
403#endif
404
405const char *spec_ptr;
406const char *spec_end;
407const char *spec_cur;
408
409int yylex()
410{
411  while (spec_ptr < spec_end && csspace(*spec_ptr))
412    spec_ptr++;
413  spec_cur = spec_ptr;
414  if (spec_ptr >= spec_end)
415    return 0;
416  unsigned char c = *spec_ptr++;
417  if (csalpha(c)) {
418    yylval.num = c;
419    return TOKEN_LETTER;
420  }
421  if (csdigit(c)) {
422    yylval.num = c - '0';
423    return TOKEN_DIGIT;
424  }
425  if (c == '\'') {
426    yylval.str.start = literals.length();
427    for (; spec_ptr < spec_end; spec_ptr++) {
428      if (*spec_ptr == '\'') {
429	if (++spec_ptr < spec_end && *spec_ptr == '\'')
430	  literals += '\'';
431	else {
432	  yylval.str.len = literals.length() - yylval.str.start;
433	  return TOKEN_LITERAL;
434	}
435      }
436      else
437	literals += *spec_ptr;
438    }
439    yylval.str.len = literals.length() - yylval.str.start;
440    return TOKEN_LITERAL;
441  }
442  return c;
443}
444
445int set_label_spec(const char *label_spec)
446{
447  spec_cur = spec_ptr = label_spec;
448  spec_end = strchr(label_spec, '\0');
449  literals.clear();
450  if (yyparse())
451    return 0;
452  delete parsed_label;
453  parsed_label = parse_result;
454  return 1;
455}
456
457int set_date_label_spec(const char *label_spec)
458{
459  spec_cur = spec_ptr = label_spec;
460  spec_end = strchr(label_spec, '\0');
461  literals.clear();
462  if (yyparse())
463    return 0;
464  delete parsed_date_label;
465  parsed_date_label = parse_result;
466  return 1;
467}
468
469int set_short_label_spec(const char *label_spec)
470{
471  spec_cur = spec_ptr = label_spec;
472  spec_end = strchr(label_spec, '\0');
473  literals.clear();
474  if (yyparse())
475    return 0;
476  delete parsed_short_label;
477  parsed_short_label = parse_result;
478  return 1;
479}
480
481void yyerror(const char *message)
482{
483  if (spec_cur < spec_end)
484    command_error("label specification %1 before `%2'", message, spec_cur);
485  else
486    command_error("label specification %1 at end of string",
487		  message, spec_cur);
488}
489
490void at_expr::evaluate(int tentative, const reference &ref,
491		       string &result, substring_position &)
492{
493  if (tentative)
494    ref.canonicalize_authors(result);
495  else {
496    const char *end, *start = ref.get_authors(&end);
497    if (start)
498      result.append(start, end - start);
499  }
500}
501
502void format_expr::evaluate(int tentative, const reference &ref,
503			   string &result, substring_position &)
504{
505  if (tentative)
506    return;
507  const label_info *lp = ref.get_label_ptr();
508  int num = lp == 0 ? ref.get_number() : lp->count;
509  if (type != '0')
510    result += format_serial(type, num + 1);
511  else {
512    const char *ptr = i_to_a(num + first_number);
513    int pad = width - strlen(ptr);
514    while (--pad >= 0)
515      result += '0';
516    result += ptr;
517  }
518}
519
520static const char *format_serial(char c, int n)
521{
522  assert(n > 0);
523  static char buf[128]; // more than enough.
524  switch (c) {
525  case 'i':
526  case 'I':
527    {
528      char *p = buf;
529      // troff uses z and w to represent 10000 and 5000 in Roman
530      // numerals; I can find no historical basis for this usage
531      const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
532      if (n >= 40000)
533	return i_to_a(n);
534      while (n >= 10000) {
535	*p++ = s[0];
536	n -= 10000;
537      }
538      for (int i = 1000; i > 0; i /= 10, s += 2) {
539	int m = n/i;
540	n -= m*i;
541	switch (m) {
542	case 3:
543	  *p++ = s[2];
544	  /* falls through */
545	case 2:
546	  *p++ = s[2];
547	  /* falls through */
548	case 1:
549	  *p++ = s[2];
550	  break;
551	case 4:
552	  *p++ = s[2];
553	  *p++ = s[1];
554	  break;
555	case 8:
556	  *p++ = s[1];
557	  *p++ = s[2];
558	  *p++ = s[2];
559	  *p++ = s[2];
560	  break;
561	case 7:
562	  *p++ = s[1];
563	  *p++ = s[2];
564	  *p++ = s[2];
565	  break;
566	case 6:
567	  *p++ = s[1];
568	  *p++ = s[2];
569	  break;
570	case 5:
571	  *p++ = s[1];
572	  break;
573	case 9:
574	  *p++ = s[2];
575	  *p++ = s[0];
576	}
577      }
578      *p = 0;
579      break;
580    }
581  case 'a':
582  case 'A':
583    {
584      char *p = buf;
585      // this is derived from troff/reg.c
586      while (n > 0) {
587	int d = n % 26;
588	if (d == 0)
589	  d = 26;
590	n -= d;
591	n /= 26;
592	*p++ = c + d - 1;	// ASCII dependent
593      }
594      *p-- = 0;
595      // Reverse it.
596      char *q = buf;
597      while (q < p) {
598	char temp = *q;
599	*q = *p;
600	*p = temp;
601	--p;
602	++q;
603      }
604      break;
605    }
606  default:
607    assert(0);
608  }
609  return buf;
610}
611
612void field_expr::evaluate(int, const reference &ref,
613			  string &result, substring_position &)
614{
615  const char *end;
616  const char *start = ref.get_field(name, &end);
617  if (start) {
618    start = nth_field(number, start, &end);
619    if (start)
620      result.append(start, end - start);
621  }
622}
623
624void literal_expr::evaluate(int, const reference &,
625			    string &result, substring_position &)
626{
627  result += s;
628}
629
630analyzed_expr::analyzed_expr(expression *e)
631: unary_expr(e), flags(e ? e->analyze() : 0)
632{
633}
634
635void analyzed_expr::evaluate(int tentative, const reference &ref,
636			     string &result, substring_position &pos)
637{
638  if (expr)
639    expr->evaluate(tentative, ref, result, pos);
640}
641
642void star_expr::evaluate(int tentative, const reference &ref,
643			 string &result, substring_position &pos)
644{
645  const label_info *lp = ref.get_label_ptr();
646  if (!tentative
647      && (lp == 0 || lp->total > 1)
648      && expr)
649    expr->evaluate(tentative, ref, result, pos);
650}
651
652void separator_expr::evaluate(int tentative, const reference &ref,
653			      string &result, substring_position &pos)
654{
655  int start_length = result.length();
656  int is_first = pos.start < 0;
657  if (expr)
658    expr->evaluate(tentative, ref, result, pos);
659  if (is_first) {
660    pos.start = start_length;
661    pos.length = result.length() - start_length;
662  }
663}
664
665void map_expr::evaluate(int tentative, const reference &ref,
666			string &result, substring_position &)
667{
668  if (expr) {
669    string temp;
670    substring_position temp_pos;
671    expr->evaluate(tentative, ref, temp, temp_pos);
672    (*func)(temp.contents(), temp.contents() + temp.length(), result);
673  }
674}
675
676void extractor_expr::evaluate(int tentative, const reference &ref,
677			      string &result, substring_position &)
678{
679  if (expr) {
680    string temp;
681    substring_position temp_pos;
682    expr->evaluate(tentative, ref, temp, temp_pos);
683    const char *end, *start = (*func)(temp.contents(),
684				      temp.contents() + temp.length(),
685				      &end);
686    switch (part) {
687    case BEFORE:
688      if (start)
689	result.append(temp.contents(), start - temp.contents());
690      else
691	result += temp;
692      break;
693    case MATCH:
694      if (start)
695	result.append(start, end - start);
696      break;
697    case AFTER:
698      if (start)
699	result.append(end, temp.contents() + temp.length() - end);
700      break;
701    default:
702      assert(0);
703    }
704  }
705}
706
707static void first_part(int len, const char *ptr, const char *end,
708			  string &result)
709{
710  for (;;) {
711    const char *token_start = ptr;
712    if (!get_token(&ptr, end))
713      break;
714    const token_info *ti = lookup_token(token_start, ptr);
715    int counts = ti->sortify_non_empty(token_start, ptr);
716    if (counts && --len < 0)
717      break;
718    if (counts || ti->is_accent())
719      result.append(token_start, ptr - token_start);
720  }
721}
722
723static void last_part(int len, const char *ptr, const char *end,
724		      string &result)
725{
726  const char *start = ptr;
727  int count = 0;
728  for (;;) {
729    const char *token_start = ptr;
730    if (!get_token(&ptr, end))
731      break;
732    const token_info *ti = lookup_token(token_start, ptr);
733    if (ti->sortify_non_empty(token_start, ptr))
734      count++;
735  }
736  ptr = start;
737  int skip = count - len;
738  if (skip > 0) {
739    for (;;) {
740      const char *token_start = ptr;
741      if (!get_token(&ptr, end))
742	assert(0);
743      const token_info *ti = lookup_token(token_start, ptr);
744      if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
745	ptr = token_start;
746	break;
747      }
748    }
749  }
750  first_part(len, ptr, end, result);
751}
752
753void truncate_expr::evaluate(int tentative, const reference &ref,
754			     string &result, substring_position &)
755{
756  if (expr) {
757    string temp;
758    substring_position temp_pos;
759    expr->evaluate(tentative, ref, temp, temp_pos);
760    const char *start = temp.contents();
761    const char *end = start + temp.length();
762    if (n > 0)
763      first_part(n, start, end, result);
764    else if (n < 0)
765      last_part(-n, start, end, result);
766  }
767}
768
769void alternative_expr::evaluate(int tentative, const reference &ref,
770				string &result, substring_position &pos)
771{
772  int start_length = result.length();
773  if (expr1)
774    expr1->evaluate(tentative, ref, result, pos);
775  if (result.length() == start_length && expr2)
776    expr2->evaluate(tentative, ref, result, pos);
777}
778
779void list_expr::evaluate(int tentative, const reference &ref,
780			 string &result, substring_position &pos)
781{
782  if (expr1)
783    expr1->evaluate(tentative, ref, result, pos);
784  if (expr2)
785    expr2->evaluate(tentative, ref, result, pos);
786}
787
788void substitute_expr::evaluate(int tentative, const reference &ref,
789			       string &result, substring_position &pos)
790{
791  int start_length = result.length();
792  if (expr1)
793    expr1->evaluate(tentative, ref, result, pos);
794  if (result.length() > start_length && result[result.length() - 1] == '-') {
795    // ought to see if pos covers the -
796    result.set_length(result.length() - 1);
797    if (expr2)
798      expr2->evaluate(tentative, ref, result, pos);
799  }
800}
801
802void conditional_expr::evaluate(int tentative, const reference &ref,
803				string &result, substring_position &pos)
804{
805  string temp;
806  substring_position temp_pos;
807  if (expr1)
808    expr1->evaluate(tentative, ref, temp, temp_pos);
809  if (temp.length() > 0) {
810    if (expr2)
811      expr2->evaluate(tentative, ref, result, pos);
812  }
813  else {
814    if (expr3)
815      expr3->evaluate(tentative, ref, result, pos);
816  }
817}
818
819void reference::pre_compute_label()
820{
821  if (parsed_label != 0
822      && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
823    label.clear();
824    substring_position temp_pos;
825    parsed_label->evaluate(1, *this, label, temp_pos);
826    label_ptr = lookup_label(label);
827  }
828}
829
830void reference::compute_label()
831{
832  label.clear();
833  if (parsed_label)
834    parsed_label->evaluate(0, *this, label, separator_pos);
835  if (short_label_flag && parsed_short_label)
836    parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
837  if (date_as_label) {
838    string new_date;
839    if (parsed_date_label) {
840      substring_position temp_pos;
841      parsed_date_label->evaluate(0, *this, new_date, temp_pos);
842    }
843    set_date(new_date);
844  }
845  if (label_ptr)
846    label_ptr->count += 1;
847}
848
849void reference::immediate_compute_label()
850{
851  if (label_ptr)
852    label_ptr->total = 2;	// force use of disambiguator
853  compute_label();
854}
855
856int reference::merge_labels(reference **v, int n, label_type type,
857			    string &result)
858{
859  if (abbreviate_label_ranges)
860    return merge_labels_by_number(v, n, type, result);
861  else
862    return merge_labels_by_parts(v, n, type, result);
863}
864
865int reference::merge_labels_by_number(reference **v, int n, label_type type,
866				      string &result)
867{
868  if (n <= 1)
869    return 0;
870  int num = get_number();
871  // Only merge three or more labels.
872  if (v[0]->get_number() != num + 1
873      || v[1]->get_number() != num + 2)
874    return 0;
875  int i;
876  for (i = 2; i < n; i++)
877    if (v[i]->get_number() != num + i + 1)
878      break;
879  result = get_label(type);
880  result += label_range_indicator;
881  result += v[i - 1]->get_label(type);
882  return i;
883}
884
885const substring_position &reference::get_separator_pos(label_type type) const
886{
887  if (type == SHORT_LABEL && short_label_flag)
888    return short_separator_pos;
889  else
890    return separator_pos;
891}
892
893const string &reference::get_label(label_type type) const
894{
895  if (type == SHORT_LABEL && short_label_flag)
896    return short_label;
897  else
898    return label;
899}
900
901int reference::merge_labels_by_parts(reference **v, int n, label_type type,
902				     string &result)
903{
904  if (n <= 0)
905    return 0;
906  const string &lb = get_label(type);
907  const substring_position &sp = get_separator_pos(type);
908  if (sp.start < 0
909      || sp.start != v[0]->get_separator_pos(type).start
910      || memcmp(lb.contents(), v[0]->get_label(type).contents(),
911		sp.start) != 0)
912    return 0;
913  result = lb;
914  int i = 0;
915  do {
916    result += separate_label_second_parts;
917    const substring_position &s = v[i]->get_separator_pos(type);
918    int sep_end_pos = s.start + s.length;
919    result.append(v[i]->get_label(type).contents() + sep_end_pos,
920		  v[i]->get_label(type).length() - sep_end_pos);
921  } while (++i < n
922	   && sp.start == v[i]->get_separator_pos(type).start
923	   && memcmp(lb.contents(), v[i]->get_label(type).contents(),
924		     sp.start) == 0);
925  return i;
926}
927
928string label_pool;
929
930label_info::label_info(const string &s)
931: start(label_pool.length()), length(s.length()), count(0), total(1)
932{
933  label_pool += s;
934}
935
936static label_info **label_table = 0;
937static int label_table_size = 0;
938static int label_table_used = 0;
939
940label_info *lookup_label(const string &label)
941{
942  if (label_table == 0) {
943    label_table = new label_info *[17];
944    label_table_size = 17;
945    for (int i = 0; i < 17; i++)
946      label_table[i] = 0;
947  }
948  unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
949  label_info **ptr;
950  for (ptr = label_table + h;
951       *ptr != 0;
952       (ptr == label_table)
953       ? (ptr = label_table + label_table_size - 1)
954       : ptr--)
955    if ((*ptr)->length == label.length()
956	&& memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
957		  label.length()) == 0) {
958      (*ptr)->total += 1;
959      return *ptr;
960    }
961  label_info *result = *ptr = new label_info(label);
962  if (++label_table_used * 2 > label_table_size) {
963    // Rehash the table.
964    label_info **old_table = label_table;
965    int old_size = label_table_size;
966    label_table_size = next_size(label_table_size);
967    label_table = new label_info *[label_table_size];
968    int i;
969    for (i = 0; i < label_table_size; i++)
970      label_table[i] = 0;
971    for (i = 0; i < old_size; i++)
972      if (old_table[i]) {
973	unsigned h = hash_string(label_pool.contents() + old_table[i]->start,
974				 old_table[i]->length);
975	label_info **p;
976	for (p = label_table + (h % label_table_size);
977	     *p != 0;
978	     (p == label_table)
979	     ? (p = label_table + label_table_size - 1)
980	     : --p)
981	    ;
982	*p = old_table[i];
983	}
984    a_delete old_table;
985  }
986  return result;
987}
988
989void clear_labels()
990{
991  for (int i = 0; i < label_table_size; i++) {
992    delete label_table[i];
993    label_table[i] = 0;
994  }
995  label_table_used = 0;
996  label_pool.clear();
997}
998
999static void consider_authors(reference **start, reference **end, int i);
1000
1001void compute_labels(reference **v, int n)
1002{
1003  if (parsed_label
1004      && (parsed_label->analyze() & expression::CONTAINS_AT)
1005      && sort_fields.length() >= 2
1006      && sort_fields[0] == 'A'
1007      && sort_fields[1] == '+')
1008    consider_authors(v, v + n, 0);
1009  for (int i = 0; i < n; i++)
1010    v[i]->compute_label();
1011}
1012
1013
1014/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1015where 0 <= i <= N if there exists a reference with a list of authors
1016<B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1017and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1018A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1019<B0,B1,...,BM>.  If a reference needs author i we only have to call
1020need_author(j) for some j >= i such that the reference also needs
1021author j. */
1022
1023/* This function handles 2 tasks:
1024determine which authors are needed (cannot be elided with et al.);
1025determine which authors can have only last names in the labels.
1026
1027References >= start and < end have the same first i author names.
1028Also they're sorted by A+. */
1029
1030static void consider_authors(reference **start, reference **end, int i)
1031{
1032  if (start >= end)
1033    return;
1034  reference **p = start;
1035  if (i >= (*p)->get_nauthors()) {
1036    for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1037      ;
1038    if (p < end && i > 0) {
1039      // If we have an author list <A B C> and an author list <A B C D>,
1040      // then both lists need C.
1041      for (reference **q = start; q < end; q++)
1042	(*q)->need_author(i - 1);
1043    }
1044    start = p;
1045  }
1046  while (p < end) {
1047    reference **last_name_start = p;
1048    reference **name_start = p;
1049    for (++p;
1050	 p < end && i < (*p)->get_nauthors()
1051	 && same_author_last_name(**last_name_start, **p, i);
1052	 p++) {
1053      if (!same_author_name(**name_start, **p, i)) {
1054	consider_authors(name_start, p, i + 1);
1055	name_start = p;
1056      }
1057    }
1058    consider_authors(name_start, p, i + 1);
1059    if (last_name_start == name_start) {
1060      for (reference **q = last_name_start; q < p; q++)
1061	(*q)->set_last_name_unambiguous(i);
1062    }
1063    // If we have an author list <A B C D> and <A B C E>, then the lists
1064    // need author D and E respectively.
1065    if (name_start > start || p < end) {
1066      for (reference **q = last_name_start; q < p; q++)
1067	(*q)->need_author(i);
1068    }
1069  }
1070}
1071
1072int same_author_last_name(const reference &r1, const reference &r2, int n)
1073{
1074  const char *ae1;
1075  const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1076  assert(as1 != 0);
1077  const char *ae2;
1078  const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1079  assert(as2 != 0);
1080  return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1081}
1082
1083int same_author_name(const reference &r1, const reference &r2, int n)
1084{
1085  const char *ae1;
1086  const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1087  assert(as1 != 0);
1088  const char *ae2;
1089  const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1090  assert(as2 != 0);
1091  return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1092}
1093
1094
1095void int_set::set(int i)
1096{
1097  assert(i >= 0);
1098  int bytei = i >> 3;
1099  if (bytei >= v.length()) {
1100    int old_length = v.length();
1101    v.set_length(bytei + 1);
1102    for (int j = old_length; j <= bytei; j++)
1103      v[j] = 0;
1104  }
1105  v[bytei] |= 1 << (i & 7);
1106}
1107
1108int int_set::get(int i) const
1109{
1110  assert(i >= 0);
1111  int bytei = i >> 3;
1112  return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1113}
1114
1115void reference::set_last_name_unambiguous(int i)
1116{
1117  last_name_unambiguous.set(i);
1118}
1119
1120void reference::need_author(int n)
1121{
1122  if (n > last_needed_author)
1123    last_needed_author = n;
1124}
1125
1126const char *reference::get_authors(const char **end) const
1127{
1128  if (!computed_authors) {
1129    ((reference *)this)->computed_authors = 1;
1130    string &result = ((reference *)this)->authors;
1131    int na = get_nauthors();
1132    result.clear();
1133    for (int i = 0; i < na; i++) {
1134      if (last_name_unambiguous.get(i)) {
1135	const char *e, *start = get_author_last_name(i, &e);
1136	assert(start != 0);
1137	result.append(start, e - start);
1138      }
1139      else {
1140	const char *e, *start = get_author(i, &e);
1141	assert(start != 0);
1142	result.append(start, e - start);
1143      }
1144      if (i == last_needed_author
1145	  && et_al.length() > 0
1146	  && et_al_min_elide > 0
1147	  && last_needed_author + et_al_min_elide < na
1148	  && na >= et_al_min_total) {
1149	result += et_al;
1150	break;
1151      }
1152      if (i < na - 1) {
1153	if (na == 2)
1154	  result += join_authors_exactly_two;
1155	else if (i < na - 2)
1156	  result += join_authors_default;
1157	else
1158	  result += join_authors_last_two;
1159      }
1160    }
1161  }
1162  const char *start = authors.contents();
1163  *end = start + authors.length();
1164  return start;
1165}
1166
1167int reference::get_nauthors() const
1168{
1169  if (nauthors < 0) {
1170    const char *dummy;
1171    int na;
1172    for (na = 0; get_author(na, &dummy) != 0; na++)
1173      ;
1174    ((reference *)this)->nauthors = na;
1175  }
1176  return nauthors;
1177}
1178