1/* Extended regular expression matching and search library.
2   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, write to the Free
18   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19   02111-1307 USA.  */
20
21static void re_string_construct_common (const char *str, int len,
22					re_string_t *pstr,
23					RE_TRANSLATE_TYPE trans, int icase,
24					const re_dfa_t *dfa) internal_function;
25#ifdef RE_ENABLE_I18N
26static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27				 wint_t *last_wc) internal_function;
28#endif /* RE_ENABLE_I18N */
29static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
30				     unsigned int hash) internal_function;
31static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
32					  const re_node_set *nodes,
33					  unsigned int hash) internal_function;
34static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
35					  const re_node_set *nodes,
36					  unsigned int context,
37					  unsigned int hash) internal_function;
38static unsigned int inline calc_state_hash (const re_node_set *nodes,
39					    unsigned int context) internal_function;
40
41/* Functions for string operation.  */
42
43/* This function allocate the buffers.  It is necessary to call
44   re_string_reconstruct before using the object.  */
45
46static reg_errcode_t
47re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
48     re_string_t *pstr;
49     const char *str;
50     int len, init_len, icase;
51     RE_TRANSLATE_TYPE trans;
52     const re_dfa_t *dfa;
53{
54  reg_errcode_t ret;
55  int init_buf_len;
56
57  /* Ensure at least one character fits into the buffers.  */
58  if (init_len < dfa->mb_cur_max)
59    init_len = dfa->mb_cur_max;
60  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
61  re_string_construct_common (str, len, pstr, trans, icase, dfa);
62
63  ret = re_string_realloc_buffers (pstr, init_buf_len);
64  if (BE (ret != REG_NOERROR, 0))
65    return ret;
66
67  pstr->word_char = dfa->word_char;
68  pstr->word_ops_used = dfa->word_ops_used;
69  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
70  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
71  pstr->valid_raw_len = pstr->valid_len;
72  return REG_NOERROR;
73}
74
75/* This function allocate the buffers, and initialize them.  */
76
77static reg_errcode_t
78re_string_construct (pstr, str, len, trans, icase, dfa)
79     re_string_t *pstr;
80     const char *str;
81     int len, icase;
82     RE_TRANSLATE_TYPE trans;
83     const re_dfa_t *dfa;
84{
85  reg_errcode_t ret;
86  memset (pstr, '\0', sizeof (re_string_t));
87  re_string_construct_common (str, len, pstr, trans, icase, dfa);
88
89  if (len > 0)
90    {
91      ret = re_string_realloc_buffers (pstr, len + 1);
92      if (BE (ret != REG_NOERROR, 0))
93	return ret;
94    }
95  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
96
97  if (icase)
98    {
99#ifdef RE_ENABLE_I18N
100      if (dfa->mb_cur_max > 1)
101	{
102	  while (1)
103	    {
104	      ret = build_wcs_upper_buffer (pstr);
105	      if (BE (ret != REG_NOERROR, 0))
106		return ret;
107	      if (pstr->valid_raw_len >= len)
108		break;
109	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
110		break;
111	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
112	      if (BE (ret != REG_NOERROR, 0))
113		return ret;
114	    }
115	}
116      else
117#endif /* RE_ENABLE_I18N  */
118	build_upper_buffer (pstr);
119    }
120  else
121    {
122#ifdef RE_ENABLE_I18N
123      if (dfa->mb_cur_max > 1)
124	build_wcs_buffer (pstr);
125      else
126#endif /* RE_ENABLE_I18N  */
127	{
128	  if (trans != NULL)
129	    re_string_translate_buffer (pstr);
130	  else
131	    {
132	      pstr->valid_len = pstr->bufs_len;
133	      pstr->valid_raw_len = pstr->bufs_len;
134	    }
135	}
136    }
137
138  return REG_NOERROR;
139}
140
141/* Helper functions for re_string_allocate, and re_string_construct.  */
142
143static reg_errcode_t
144re_string_realloc_buffers (pstr, new_buf_len)
145     re_string_t *pstr;
146     int new_buf_len;
147{
148#ifdef RE_ENABLE_I18N
149  if (pstr->mb_cur_max > 1)
150    {
151      wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
152      if (BE (new_array == NULL, 0))
153	return REG_ESPACE;
154      pstr->wcs = new_array;
155      if (pstr->offsets != NULL)
156	{
157	  int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
158	  if (BE (new_array == NULL, 0))
159	    return REG_ESPACE;
160	  pstr->offsets = new_array;
161	}
162    }
163#endif /* RE_ENABLE_I18N  */
164  if (pstr->mbs_allocated)
165    {
166      unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
167					     new_buf_len);
168      if (BE (new_array == NULL, 0))
169	return REG_ESPACE;
170      pstr->mbs = new_array;
171    }
172  pstr->bufs_len = new_buf_len;
173  return REG_NOERROR;
174}
175
176
177static void
178re_string_construct_common (str, len, pstr, trans, icase, dfa)
179     const char *str;
180     int len;
181     re_string_t *pstr;
182     RE_TRANSLATE_TYPE trans;
183     int icase;
184     const re_dfa_t *dfa;
185{
186  pstr->raw_mbs = (const unsigned char *) str;
187  pstr->len = len;
188  pstr->raw_len = len;
189  pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
190  pstr->icase = icase ? 1 : 0;
191  pstr->mbs_allocated = (trans != NULL || icase);
192  pstr->mb_cur_max = dfa->mb_cur_max;
193  pstr->is_utf8 = dfa->is_utf8;
194  pstr->map_notascii = dfa->map_notascii;
195  pstr->stop = pstr->len;
196  pstr->raw_stop = pstr->stop;
197}
198
199#ifdef RE_ENABLE_I18N
200
201/* Build wide character buffer PSTR->WCS.
202   If the byte sequence of the string are:
203     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
204   Then wide character buffer will be:
205     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
206   We use WEOF for padding, they indicate that the position isn't
207   a first byte of a multibyte character.
208
209   Note that this function assumes PSTR->VALID_LEN elements are already
210   built and starts from PSTR->VALID_LEN.  */
211
212static void
213build_wcs_buffer (pstr)
214     re_string_t *pstr;
215{
216#ifdef _LIBC
217  unsigned char buf[MB_CUR_MAX];
218  assert (MB_CUR_MAX >= pstr->mb_cur_max);
219#else
220  unsigned char buf[64];
221#endif
222  mbstate_t prev_st;
223  int byte_idx, end_idx, remain_len;
224  size_t mbclen;
225
226  /* Build the buffers from pstr->valid_len to either pstr->len or
227     pstr->bufs_len.  */
228  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
229  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
230    {
231      wchar_t wc;
232      const char *p;
233
234      remain_len = end_idx - byte_idx;
235      prev_st = pstr->cur_state;
236      /* Apply the translation if we need.  */
237      if (BE (pstr->trans != NULL, 0))
238	{
239	  int i, ch;
240
241	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
242	    {
243	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
244	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
245	    }
246	  p = (const char *) buf;
247	}
248      else
249	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
250      mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
251      if (BE (mbclen == (size_t) -2, 0))
252	{
253	  /* The buffer doesn't have enough space, finish to build.  */
254	  pstr->cur_state = prev_st;
255	  break;
256	}
257      else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
258	{
259	  /* We treat these cases as a singlebyte character.  */
260	  mbclen = 1;
261	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
262	  if (BE (pstr->trans != NULL, 0))
263	    wc = pstr->trans[wc];
264	  pstr->cur_state = prev_st;
265	}
266
267      /* Write wide character and padding.  */
268      pstr->wcs[byte_idx++] = wc;
269      /* Write paddings.  */
270      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
271	pstr->wcs[byte_idx++] = WEOF;
272    }
273  pstr->valid_len = byte_idx;
274  pstr->valid_raw_len = byte_idx;
275}
276
277/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
278   but for REG_ICASE.  */
279
280static int
281build_wcs_upper_buffer (pstr)
282     re_string_t *pstr;
283{
284  mbstate_t prev_st;
285  int src_idx, byte_idx, end_idx, remain_len;
286  size_t mbclen;
287#ifdef _LIBC
288  char buf[MB_CUR_MAX];
289  assert (MB_CUR_MAX >= pstr->mb_cur_max);
290#else
291  char buf[64];
292#endif
293
294  byte_idx = pstr->valid_len;
295  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
296
297  /* The following optimization assumes that ASCII characters can be
298     mapped to wide characters with a simple cast.  */
299  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
300    {
301      while (byte_idx < end_idx)
302	{
303	  wchar_t wc;
304
305	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
306	      && mbsinit (&pstr->cur_state))
307	    {
308	      /* In case of a singlebyte character.  */
309	      pstr->mbs[byte_idx]
310		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
311	      /* The next step uses the assumption that wchar_t is encoded
312		 ASCII-safe: all ASCII values can be converted like this.  */
313	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
314	      ++byte_idx;
315	      continue;
316	    }
317
318	  remain_len = end_idx - byte_idx;
319	  prev_st = pstr->cur_state;
320	  mbclen = mbrtowc (&wc,
321			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
322			     + byte_idx), remain_len, &pstr->cur_state);
323	  if (BE (mbclen + 2 > 2, 1))
324	    {
325	      wchar_t wcu = wc;
326	      if (iswlower (wc))
327		{
328		  size_t mbcdlen;
329
330		  wcu = towupper (wc);
331		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
332		  if (BE (mbclen == mbcdlen, 1))
333		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
334		  else
335		    {
336		      src_idx = byte_idx;
337		      goto offsets_needed;
338		    }
339		}
340	      else
341		memcpy (pstr->mbs + byte_idx,
342			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
343	      pstr->wcs[byte_idx++] = wcu;
344	      /* Write paddings.  */
345	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346		pstr->wcs[byte_idx++] = WEOF;
347	    }
348	  else if (mbclen == (size_t) -1 || mbclen == 0)
349	    {
350	      /* It is an invalid character or '\0'.  Just use the byte.  */
351	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
352	      pstr->mbs[byte_idx] = ch;
353	      /* And also cast it to wide char.  */
354	      pstr->wcs[byte_idx++] = (wchar_t) ch;
355	      if (BE (mbclen == (size_t) -1, 0))
356		pstr->cur_state = prev_st;
357	    }
358	  else
359	    {
360	      /* The buffer doesn't have enough space, finish to build.  */
361	      pstr->cur_state = prev_st;
362	      break;
363	    }
364	}
365      pstr->valid_len = byte_idx;
366      pstr->valid_raw_len = byte_idx;
367      return REG_NOERROR;
368    }
369  else
370    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
371      {
372	wchar_t wc;
373	const char *p;
374      offsets_needed:
375	remain_len = end_idx - byte_idx;
376	prev_st = pstr->cur_state;
377	if (BE (pstr->trans != NULL, 0))
378	  {
379	    int i, ch;
380
381	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
382	      {
383		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
384		buf[i] = pstr->trans[ch];
385	      }
386	    p = (const char *) buf;
387	  }
388	else
389	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
390	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391	if (BE (mbclen + 2 > 2, 1))
392	  {
393	    wchar_t wcu = wc;
394	    if (iswlower (wc))
395	      {
396		size_t mbcdlen;
397
398		wcu = towupper (wc);
399		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
400		if (BE (mbclen == mbcdlen, 1))
401		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
402		else if (mbcdlen != (size_t) -1)
403		  {
404		    size_t i;
405
406		    if (byte_idx + mbcdlen > pstr->bufs_len)
407		      {
408			pstr->cur_state = prev_st;
409			break;
410		      }
411
412		    if (pstr->offsets == NULL)
413		      {
414			pstr->offsets = re_malloc (int, pstr->bufs_len);
415
416			if (pstr->offsets == NULL)
417			  return REG_ESPACE;
418		      }
419		    if (!pstr->offsets_needed)
420		      {
421			for (i = 0; i < (size_t) byte_idx; ++i)
422			  pstr->offsets[i] = i;
423			pstr->offsets_needed = 1;
424		      }
425
426		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427		    pstr->wcs[byte_idx] = wcu;
428		    pstr->offsets[byte_idx] = src_idx;
429		    for (i = 1; i < mbcdlen; ++i)
430		      {
431			pstr->offsets[byte_idx + i]
432			  = src_idx + (i < mbclen ? i : mbclen - 1);
433			pstr->wcs[byte_idx + i] = WEOF;
434		      }
435		    pstr->len += mbcdlen - mbclen;
436		    if (pstr->raw_stop > src_idx)
437		      pstr->stop += mbcdlen - mbclen;
438		    end_idx = (pstr->bufs_len > pstr->len)
439			      ? pstr->len : pstr->bufs_len;
440		    byte_idx += mbcdlen;
441		    src_idx += mbclen;
442		    continue;
443		  }
444                else
445                  memcpy (pstr->mbs + byte_idx, p, mbclen);
446	      }
447	    else
448	      memcpy (pstr->mbs + byte_idx, p, mbclen);
449
450	    if (BE (pstr->offsets_needed != 0, 0))
451	      {
452		size_t i;
453		for (i = 0; i < mbclen; ++i)
454		  pstr->offsets[byte_idx + i] = src_idx + i;
455	      }
456	    src_idx += mbclen;
457
458	    pstr->wcs[byte_idx++] = wcu;
459	    /* Write paddings.  */
460	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
461	      pstr->wcs[byte_idx++] = WEOF;
462	  }
463	else if (mbclen == (size_t) -1 || mbclen == 0)
464	  {
465	    /* It is an invalid character or '\0'.  Just use the byte.  */
466	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
467
468	    if (BE (pstr->trans != NULL, 0))
469	      ch = pstr->trans [ch];
470	    pstr->mbs[byte_idx] = ch;
471
472	    if (BE (pstr->offsets_needed != 0, 0))
473	      pstr->offsets[byte_idx] = src_idx;
474	    ++src_idx;
475
476	    /* And also cast it to wide char.  */
477	    pstr->wcs[byte_idx++] = (wchar_t) ch;
478	    if (BE (mbclen == (size_t) -1, 0))
479	      pstr->cur_state = prev_st;
480	  }
481	else
482	  {
483	    /* The buffer doesn't have enough space, finish to build.  */
484	    pstr->cur_state = prev_st;
485	    break;
486	  }
487      }
488  pstr->valid_len = byte_idx;
489  pstr->valid_raw_len = src_idx;
490  return REG_NOERROR;
491}
492
493/* Skip characters until the index becomes greater than NEW_RAW_IDX.
494   Return the index.  */
495
496static int
497re_string_skip_chars (pstr, new_raw_idx, last_wc)
498     re_string_t *pstr;
499     int new_raw_idx;
500     wint_t *last_wc;
501{
502  mbstate_t prev_st;
503  int rawbuf_idx;
504  size_t mbclen;
505  wchar_t wc = 0;
506
507  /* Skip the characters which are not necessary to check.  */
508  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
509       rawbuf_idx < new_raw_idx;)
510    {
511      int remain_len;
512      remain_len = pstr->len - rawbuf_idx;
513      prev_st = pstr->cur_state;
514      mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
515			remain_len, &pstr->cur_state);
516      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
517	{
518	  /* We treat these cases as a singlebyte character.  */
519	  mbclen = 1;
520	  pstr->cur_state = prev_st;
521	}
522      /* Then proceed the next character.  */
523      rawbuf_idx += mbclen;
524    }
525  *last_wc = (wint_t) wc;
526  return rawbuf_idx;
527}
528#endif /* RE_ENABLE_I18N  */
529
530/* Build the buffer PSTR->MBS, and apply the translation if we need.
531   This function is used in case of REG_ICASE.  */
532
533static void
534build_upper_buffer (pstr)
535     re_string_t *pstr;
536{
537  int char_idx, end_idx;
538  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
539
540  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
541    {
542      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
543      if (BE (pstr->trans != NULL, 0))
544	ch = pstr->trans[ch];
545      if (islower (ch))
546	pstr->mbs[char_idx] = toupper (ch);
547      else
548	pstr->mbs[char_idx] = ch;
549    }
550  pstr->valid_len = char_idx;
551  pstr->valid_raw_len = char_idx;
552}
553
554/* Apply TRANS to the buffer in PSTR.  */
555
556static void
557re_string_translate_buffer (pstr)
558     re_string_t *pstr;
559{
560  int buf_idx, end_idx;
561  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562
563  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
564    {
565      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
566      pstr->mbs[buf_idx] = pstr->trans[ch];
567    }
568
569  pstr->valid_len = buf_idx;
570  pstr->valid_raw_len = buf_idx;
571}
572
573/* This function re-construct the buffers.
574   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
575   convert to upper case in case of REG_ICASE, apply translation.  */
576
577static reg_errcode_t
578re_string_reconstruct (pstr, idx, eflags)
579     re_string_t *pstr;
580     int idx, eflags;
581{
582  int offset = idx - pstr->raw_mbs_idx;
583  if (BE (offset < 0, 0))
584    {
585      /* Reset buffer.  */
586#ifdef RE_ENABLE_I18N
587      if (pstr->mb_cur_max > 1)
588	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589#endif /* RE_ENABLE_I18N */
590      pstr->len = pstr->raw_len;
591      pstr->stop = pstr->raw_stop;
592      pstr->valid_len = 0;
593      pstr->raw_mbs_idx = 0;
594      pstr->valid_raw_len = 0;
595      pstr->offsets_needed = 0;
596      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598      if (!pstr->mbs_allocated)
599	pstr->mbs = (unsigned char *) pstr->raw_mbs;
600      offset = idx;
601    }
602
603  if (BE (offset != 0, 1))
604    {
605      /* Are the characters which are already checked remain?  */
606      if (BE (offset < pstr->valid_raw_len, 1)
607#ifdef RE_ENABLE_I18N
608	  /* Handling this would enlarge the code too much.
609	     Accept a slowdown in that case.  */
610	  && pstr->offsets_needed == 0
611#endif
612	 )
613	{
614	  /* Yes, move them to the front of the buffer.  */
615	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
616#ifdef RE_ENABLE_I18N
617	  if (pstr->mb_cur_max > 1)
618	    memmove (pstr->wcs, pstr->wcs + offset,
619		     (pstr->valid_len - offset) * sizeof (wint_t));
620#endif /* RE_ENABLE_I18N */
621	  if (BE (pstr->mbs_allocated, 0))
622	    memmove (pstr->mbs, pstr->mbs + offset,
623		     pstr->valid_len - offset);
624	  pstr->valid_len -= offset;
625	  pstr->valid_raw_len -= offset;
626#if DEBUG
627	  assert (pstr->valid_len > 0);
628#endif
629	}
630      else
631	{
632	  /* No, skip all characters until IDX.  */
633#ifdef RE_ENABLE_I18N
634	  if (BE (pstr->offsets_needed, 0))
635	    {
636	      pstr->len = pstr->raw_len - idx + offset;
637	      pstr->stop = pstr->raw_stop - idx + offset;
638	      pstr->offsets_needed = 0;
639	    }
640#endif
641	  pstr->valid_len = 0;
642	  pstr->valid_raw_len = 0;
643#ifdef RE_ENABLE_I18N
644	  if (pstr->mb_cur_max > 1)
645	    {
646	      int wcs_idx;
647	      wint_t wc = WEOF;
648
649	      if (pstr->is_utf8)
650		{
651		  const unsigned char *raw, *p, *q, *end;
652
653		  /* Special case UTF-8.  Multi-byte chars start with any
654		     byte other than 0x80 - 0xbf.  */
655		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
656		  end = raw + (offset - pstr->mb_cur_max);
657		  for (p = raw + offset - 1; p >= end; --p)
658		    if ((*p & 0xc0) != 0x80)
659		      {
660			mbstate_t cur_state;
661			wchar_t wc2;
662			int mlen = raw + pstr->len - p;
663			unsigned char buf[6];
664
665			q = p;
666			if (BE (pstr->trans != NULL, 0))
667			  {
668			    int i = mlen < 6 ? mlen : 6;
669			    while (--i >= 0)
670			      buf[i] = pstr->trans[p[i]];
671			    q = buf;
672			  }
673			/* XXX Don't use mbrtowc, we know which conversion
674			   to use (UTF-8 -> UCS4).  */
675			memset (&cur_state, 0, sizeof (cur_state));
676			mlen = (mbrtowc (&wc2, (const char *) p, mlen,
677					 &cur_state)
678				- (raw + offset - p));
679			if (mlen >= 0)
680			  {
681			    memset (&pstr->cur_state, '\0',
682				    sizeof (mbstate_t));
683			    pstr->valid_len = mlen;
684			    wc = wc2;
685			  }
686			break;
687		      }
688		}
689
690	      if (wc == WEOF)
691		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
692	      if (BE (pstr->valid_len, 0))
693		{
694		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
695		    pstr->wcs[wcs_idx] = WEOF;
696		  if (pstr->mbs_allocated)
697		    memset (pstr->mbs, 255, pstr->valid_len);
698		}
699	      pstr->valid_raw_len = pstr->valid_len;
700	      pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
701				    && IS_WIDE_WORD_CHAR (wc))
702				   ? CONTEXT_WORD
703				   : ((IS_WIDE_NEWLINE (wc)
704				       && pstr->newline_anchor)
705				      ? CONTEXT_NEWLINE : 0));
706	    }
707	  else
708#endif /* RE_ENABLE_I18N */
709	    {
710	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
711	      if (pstr->trans)
712		c = pstr->trans[c];
713	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
714				   ? CONTEXT_WORD
715				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
716				      ? CONTEXT_NEWLINE : 0));
717	    }
718	}
719      if (!BE (pstr->mbs_allocated, 0))
720	pstr->mbs += offset;
721    }
722  pstr->raw_mbs_idx = idx;
723  pstr->len -= offset;
724  pstr->stop -= offset;
725
726  /* Then build the buffers.  */
727#ifdef RE_ENABLE_I18N
728  if (pstr->mb_cur_max > 1)
729    {
730      if (pstr->icase)
731	{
732	  int ret = build_wcs_upper_buffer (pstr);
733	  if (BE (ret != REG_NOERROR, 0))
734	    return ret;
735	}
736      else
737	build_wcs_buffer (pstr);
738    }
739  else
740#endif /* RE_ENABLE_I18N */
741  if (BE (pstr->mbs_allocated, 0))
742    {
743      if (pstr->icase)
744	build_upper_buffer (pstr);
745      else if (pstr->trans != NULL)
746	re_string_translate_buffer (pstr);
747    }
748  else
749    pstr->valid_len = pstr->len;
750
751  pstr->cur_idx = 0;
752  return REG_NOERROR;
753}
754
755static unsigned char
756re_string_peek_byte_case (pstr, idx)
757     const re_string_t *pstr;
758     int idx;
759{
760  int ch, off;
761
762  /* Handle the common (easiest) cases first.  */
763  if (BE (!pstr->mbs_allocated, 1))
764    return re_string_peek_byte (pstr, idx);
765
766#ifdef RE_ENABLE_I18N
767  if (pstr->mb_cur_max > 1
768      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
769    return re_string_peek_byte (pstr, idx);
770#endif
771
772  off = pstr->cur_idx + idx;
773#ifdef RE_ENABLE_I18N
774  if (pstr->offsets_needed)
775    off = pstr->offsets[off];
776#endif
777
778  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
779
780#ifdef RE_ENABLE_I18N
781  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
782     this function returns CAPITAL LETTER I instead of first byte of
783     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
784     since peek_byte_case doesn't advance cur_idx in any way.  */
785  if (pstr->offsets_needed && !isascii (ch))
786    return re_string_peek_byte (pstr, idx);
787#endif
788
789  return ch;
790}
791
792static unsigned char
793re_string_fetch_byte_case (pstr)
794     re_string_t *pstr;
795{
796  if (BE (!pstr->mbs_allocated, 1))
797    return re_string_fetch_byte (pstr);
798
799#ifdef RE_ENABLE_I18N
800  if (pstr->offsets_needed)
801    {
802      int off, ch;
803
804      /* For tr_TR.UTF-8 [[:islower:]] there is
805	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
806	 in that case the whole multi-byte character and return
807	 the original letter.  On the other side, with
808	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
809	 anything else would complicate things too much.  */
810
811      if (!re_string_first_byte (pstr, pstr->cur_idx))
812	return re_string_fetch_byte (pstr);
813
814      off = pstr->offsets[pstr->cur_idx];
815      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
816
817      if (! isascii (ch))
818	return re_string_fetch_byte (pstr);
819
820      re_string_skip_bytes (pstr,
821			    re_string_char_size_at (pstr, pstr->cur_idx));
822      return ch;
823    }
824#endif
825
826  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
827}
828
829static void
830re_string_destruct (pstr)
831     re_string_t *pstr;
832{
833#ifdef RE_ENABLE_I18N
834  re_free (pstr->wcs);
835  re_free (pstr->offsets);
836#endif /* RE_ENABLE_I18N  */
837  if (pstr->mbs_allocated)
838    re_free (pstr->mbs);
839}
840
841/* Return the context at IDX in INPUT.  */
842
843static unsigned int
844re_string_context_at (input, idx, eflags)
845     const re_string_t *input;
846     int idx, eflags;
847{
848  int c;
849  if (BE (idx < 0, 0))
850    /* In this case, we use the value stored in input->tip_context,
851       since we can't know the character in input->mbs[-1] here.  */
852    return input->tip_context;
853  if (BE (idx == input->len, 0))
854    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
855	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
856#ifdef RE_ENABLE_I18N
857  if (input->mb_cur_max > 1)
858    {
859      wint_t wc;
860      int wc_idx = idx;
861      while(input->wcs[wc_idx] == WEOF)
862	{
863#ifdef DEBUG
864	  /* It must not happen.  */
865	  assert (wc_idx >= 0);
866#endif
867	  --wc_idx;
868	  if (wc_idx < 0)
869	    return input->tip_context;
870	}
871      wc = input->wcs[wc_idx];
872      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
873	return CONTEXT_WORD;
874      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
875	      ? CONTEXT_NEWLINE : 0);
876    }
877  else
878#endif
879    {
880      c = re_string_byte_at (input, idx);
881      if (bitset_contain (input->word_char, c))
882	return CONTEXT_WORD;
883      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
884    }
885}
886
887/* Functions for set operation.  */
888
889static reg_errcode_t
890re_node_set_alloc (set, size)
891     re_node_set *set;
892     int size;
893{
894  set->alloc = size;
895  set->nelem = 0;
896  set->elems = re_malloc (int, size);
897  if (BE (set->elems == NULL, 0))
898    return REG_ESPACE;
899  return REG_NOERROR;
900}
901
902static reg_errcode_t
903re_node_set_init_1 (set, elem)
904     re_node_set *set;
905     int elem;
906{
907  set->alloc = 1;
908  set->nelem = 1;
909  set->elems = re_malloc (int, 1);
910  if (BE (set->elems == NULL, 0))
911    {
912      set->alloc = set->nelem = 0;
913      return REG_ESPACE;
914    }
915  set->elems[0] = elem;
916  return REG_NOERROR;
917}
918
919static reg_errcode_t
920re_node_set_init_2 (set, elem1, elem2)
921     re_node_set *set;
922     int elem1, elem2;
923{
924  set->alloc = 2;
925  set->elems = re_malloc (int, 2);
926  if (BE (set->elems == NULL, 0))
927    return REG_ESPACE;
928  if (elem1 == elem2)
929    {
930      set->nelem = 1;
931      set->elems[0] = elem1;
932    }
933  else
934    {
935      set->nelem = 2;
936      if (elem1 < elem2)
937	{
938	  set->elems[0] = elem1;
939	  set->elems[1] = elem2;
940	}
941      else
942	{
943	  set->elems[0] = elem2;
944	  set->elems[1] = elem1;
945	}
946    }
947  return REG_NOERROR;
948}
949
950static reg_errcode_t
951re_node_set_init_copy (dest, src)
952     re_node_set *dest;
953     const re_node_set *src;
954{
955  dest->nelem = src->nelem;
956  if (src->nelem > 0)
957    {
958      dest->alloc = dest->nelem;
959      dest->elems = re_malloc (int, dest->alloc);
960      if (BE (dest->elems == NULL, 0))
961	{
962	  dest->alloc = dest->nelem = 0;
963	  return REG_ESPACE;
964	}
965      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
966    }
967  else
968    re_node_set_init_empty (dest);
969  return REG_NOERROR;
970}
971
972/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
973   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
974   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
975
976static reg_errcode_t
977re_node_set_add_intersect (dest, src1, src2)
978     re_node_set *dest;
979     const re_node_set *src1, *src2;
980{
981  int i1, i2, is, id, delta, sbase;
982  if (src1->nelem == 0 || src2->nelem == 0)
983    return REG_NOERROR;
984
985  /* We need dest->nelem + 2 * elems_in_intersection; this is a
986     conservative estimate.  */
987  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
988    {
989      int new_alloc = src1->nelem + src2->nelem + dest->alloc;
990      int *new_elems = re_realloc (dest->elems, int, new_alloc);
991      if (BE (new_elems == NULL, 0))
992        return REG_ESPACE;
993      dest->elems = new_elems;
994      dest->alloc = new_alloc;
995    }
996
997  /* Find the items in the intersection of SRC1 and SRC2, and copy
998     into the top of DEST those that are not already in DEST itself.  */
999  sbase = dest->nelem + src1->nelem + src2->nelem;
1000  i1 = src1->nelem - 1;
1001  i2 = src2->nelem - 1;
1002  id = dest->nelem - 1;
1003  for (;;)
1004    {
1005      if (src1->elems[i1] == src2->elems[i2])
1006	{
1007	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1008	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1009	    --id;
1010
1011          if (id < 0 || dest->elems[id] != src1->elems[i1])
1012            dest->elems[--sbase] = src1->elems[i1];
1013
1014	  if (--i1 < 0 || --i2 < 0)
1015	    break;
1016	}
1017
1018      /* Lower the highest of the two items.  */
1019      else if (src1->elems[i1] < src2->elems[i2])
1020	{
1021	  if (--i2 < 0)
1022	    break;
1023	}
1024      else
1025	{
1026	  if (--i1 < 0)
1027	    break;
1028	}
1029    }
1030
1031  id = dest->nelem - 1;
1032  is = dest->nelem + src1->nelem + src2->nelem - 1;
1033  delta = is - sbase + 1;
1034
1035  /* Now copy.  When DELTA becomes zero, the remaining
1036     DEST elements are already in place; this is more or
1037     less the same loop that is in re_node_set_merge.  */
1038  dest->nelem += delta;
1039  if (delta > 0 && id >= 0)
1040    for (;;)
1041      {
1042        if (dest->elems[is] > dest->elems[id])
1043          {
1044            /* Copy from the top.  */
1045            dest->elems[id + delta--] = dest->elems[is--];
1046            if (delta == 0)
1047              break;
1048          }
1049        else
1050          {
1051            /* Slide from the bottom.  */
1052            dest->elems[id + delta] = dest->elems[id];
1053            if (--id < 0)
1054              break;
1055          }
1056      }
1057
1058  /* Copy remaining SRC elements.  */
1059  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1060
1061  return REG_NOERROR;
1062}
1063
1064/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1065   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1066
1067static reg_errcode_t
1068re_node_set_init_union (dest, src1, src2)
1069     re_node_set *dest;
1070     const re_node_set *src1, *src2;
1071{
1072  int i1, i2, id;
1073  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1074    {
1075      dest->alloc = src1->nelem + src2->nelem;
1076      dest->elems = re_malloc (int, dest->alloc);
1077      if (BE (dest->elems == NULL, 0))
1078	return REG_ESPACE;
1079    }
1080  else
1081    {
1082      if (src1 != NULL && src1->nelem > 0)
1083	return re_node_set_init_copy (dest, src1);
1084      else if (src2 != NULL && src2->nelem > 0)
1085	return re_node_set_init_copy (dest, src2);
1086      else
1087	re_node_set_init_empty (dest);
1088      return REG_NOERROR;
1089    }
1090  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1091    {
1092      if (src1->elems[i1] > src2->elems[i2])
1093	{
1094	  dest->elems[id++] = src2->elems[i2++];
1095	  continue;
1096	}
1097      if (src1->elems[i1] == src2->elems[i2])
1098	++i2;
1099      dest->elems[id++] = src1->elems[i1++];
1100    }
1101  if (i1 < src1->nelem)
1102    {
1103      memcpy (dest->elems + id, src1->elems + i1,
1104	     (src1->nelem - i1) * sizeof (int));
1105      id += src1->nelem - i1;
1106    }
1107  else if (i2 < src2->nelem)
1108    {
1109      memcpy (dest->elems + id, src2->elems + i2,
1110	     (src2->nelem - i2) * sizeof (int));
1111      id += src2->nelem - i2;
1112    }
1113  dest->nelem = id;
1114  return REG_NOERROR;
1115}
1116
1117/* Calculate the union set of the sets DEST and SRC. And store it to
1118   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1119
1120static reg_errcode_t
1121re_node_set_merge (dest, src)
1122     re_node_set *dest;
1123     const re_node_set *src;
1124{
1125  int is, id, sbase, delta;
1126  if (src == NULL || src->nelem == 0)
1127    return REG_NOERROR;
1128  if (dest->alloc < 2 * src->nelem + dest->nelem)
1129    {
1130      int new_alloc = 2 * (src->nelem + dest->alloc);
1131      int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1132      if (BE (new_buffer == NULL, 0))
1133	return REG_ESPACE;
1134      dest->elems = new_buffer;
1135      dest->alloc = new_alloc;
1136    }
1137
1138  if (BE (dest->nelem == 0, 0))
1139    {
1140      dest->nelem = src->nelem;
1141      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1142      return REG_NOERROR;
1143    }
1144
1145  /* Copy into the top of DEST the items of SRC that are not
1146     found in DEST.  Maybe we could binary search in DEST?  */
1147  for (sbase = dest->nelem + 2 * src->nelem,
1148       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1149    {
1150      if (dest->elems[id] == src->elems[is])
1151        is--, id--;
1152      else if (dest->elems[id] < src->elems[is])
1153        dest->elems[--sbase] = src->elems[is--];
1154      else /* if (dest->elems[id] > src->elems[is]) */
1155        --id;
1156    }
1157
1158  if (is >= 0)
1159    {
1160      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1161      sbase -= is + 1;
1162      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1163    }
1164
1165  id = dest->nelem - 1;
1166  is = dest->nelem + 2 * src->nelem - 1;
1167  delta = is - sbase + 1;
1168  if (delta == 0)
1169    return REG_NOERROR;
1170
1171  /* Now copy.  When DELTA becomes zero, the remaining
1172     DEST elements are already in place.  */
1173  dest->nelem += delta;
1174  for (;;)
1175    {
1176      if (dest->elems[is] > dest->elems[id])
1177        {
1178	  /* Copy from the top.  */
1179          dest->elems[id + delta--] = dest->elems[is--];
1180	  if (delta == 0)
1181	    break;
1182	}
1183      else
1184        {
1185          /* Slide from the bottom.  */
1186          dest->elems[id + delta] = dest->elems[id];
1187	  if (--id < 0)
1188	    {
1189	      /* Copy remaining SRC elements.  */
1190	      memcpy (dest->elems, dest->elems + sbase,
1191	              delta * sizeof (int));
1192	      break;
1193	    }
1194	}
1195    }
1196
1197  return REG_NOERROR;
1198}
1199
1200/* Insert the new element ELEM to the re_node_set* SET.
1201   SET should not already have ELEM.
1202   return -1 if an error is occured, return 1 otherwise.  */
1203
1204static int
1205re_node_set_insert (set, elem)
1206     re_node_set *set;
1207     int elem;
1208{
1209  int idx;
1210  /* In case the set is empty.  */
1211  if (set->alloc == 0)
1212    {
1213      if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1214	return 1;
1215      else
1216	return -1;
1217    }
1218
1219  if (BE (set->nelem, 0) == 0)
1220    {
1221      /* We already guaranteed above that set->alloc != 0.  */
1222      set->elems[0] = elem;
1223      ++set->nelem;
1224      return 1;
1225    }
1226
1227  /* Realloc if we need.  */
1228  if (set->alloc == set->nelem)
1229    {
1230      int *new_array;
1231      set->alloc = set->alloc * 2;
1232      new_array = re_realloc (set->elems, int, set->alloc);
1233      if (BE (new_array == NULL, 0))
1234	return -1;
1235      set->elems = new_array;
1236    }
1237
1238  /* Move the elements which follows the new element.  Test the
1239     first element separately to skip a check in the inner loop.  */
1240  if (elem < set->elems[0])
1241    {
1242      idx = 0;
1243      for (idx = set->nelem; idx > 0; idx--)
1244        set->elems[idx] = set->elems[idx - 1];
1245    }
1246  else
1247    {
1248      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1249        set->elems[idx] = set->elems[idx - 1];
1250    }
1251
1252  /* Insert the new element.  */
1253  set->elems[idx] = elem;
1254  ++set->nelem;
1255  return 1;
1256}
1257
1258/* Insert the new element ELEM to the re_node_set* SET.
1259   SET should not already have any element greater than or equal to ELEM.
1260   Return -1 if an error is occured, return 1 otherwise.  */
1261
1262static int
1263re_node_set_insert_last (set, elem)
1264     re_node_set *set;
1265     int elem;
1266{
1267  /* Realloc if we need.  */
1268  if (set->alloc == set->nelem)
1269    {
1270      int *new_array;
1271      set->alloc = (set->alloc + 1) * 2;
1272      new_array = re_realloc (set->elems, int, set->alloc);
1273      if (BE (new_array == NULL, 0))
1274	return -1;
1275      set->elems = new_array;
1276    }
1277
1278  /* Insert the new element.  */
1279  set->elems[set->nelem++] = elem;
1280  return 1;
1281}
1282
1283/* Compare two node sets SET1 and SET2.
1284   return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1285
1286static int
1287re_node_set_compare (set1, set2)
1288     const re_node_set *set1, *set2;
1289{
1290  int i;
1291  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1292    return 0;
1293  for (i = set1->nelem ; --i >= 0 ; )
1294    if (set1->elems[i] != set2->elems[i])
1295      return 0;
1296  return 1;
1297}
1298
1299/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1300
1301static int
1302re_node_set_contains (set, elem)
1303     const re_node_set *set;
1304     int elem;
1305{
1306  unsigned int idx, right, mid;
1307  if (set->nelem <= 0)
1308    return 0;
1309
1310  /* Binary search the element.  */
1311  idx = 0;
1312  right = set->nelem - 1;
1313  while (idx < right)
1314    {
1315      mid = (idx + right) / 2;
1316      if (set->elems[mid] < elem)
1317	idx = mid + 1;
1318      else
1319	right = mid;
1320    }
1321  return set->elems[idx] == elem ? idx + 1 : 0;
1322}
1323
1324static void
1325re_node_set_remove_at (set, idx)
1326     re_node_set *set;
1327     int idx;
1328{
1329  if (idx < 0 || idx >= set->nelem)
1330    return;
1331  --set->nelem;
1332  for (; idx < set->nelem; idx++)
1333    set->elems[idx] = set->elems[idx + 1];
1334}
1335
1336
1337/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1338   Or return -1, if an error will be occured.  */
1339
1340static int
1341re_dfa_add_node (dfa, token)
1342     re_dfa_t *dfa;
1343     re_token_t token;
1344{
1345  int type = token.type;
1346  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1347    {
1348      int new_nodes_alloc = dfa->nodes_alloc * 2;
1349      int *new_nexts, *new_indices;
1350      re_node_set *new_edests, *new_eclosures;
1351
1352      re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1353					  new_nodes_alloc);
1354      if (BE (new_array == NULL, 0))
1355	return -1;
1356      dfa->nodes = new_array;
1357      new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1358      new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1359      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1360      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1361      if (BE (new_nexts == NULL || new_indices == NULL
1362	      || new_edests == NULL || new_eclosures == NULL, 0))
1363	return -1;
1364      dfa->nexts = new_nexts;
1365      dfa->org_indices = new_indices;
1366      dfa->edests = new_edests;
1367      dfa->eclosures = new_eclosures;
1368      dfa->nodes_alloc = new_nodes_alloc;
1369    }
1370  dfa->nodes[dfa->nodes_len] = token;
1371  dfa->nodes[dfa->nodes_len].constraint = 0;
1372#ifdef RE_ENABLE_I18N
1373  dfa->nodes[dfa->nodes_len].accept_mb =
1374    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1375#endif
1376  dfa->nexts[dfa->nodes_len] = -1;
1377  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1378  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1379  return dfa->nodes_len++;
1380}
1381
1382static unsigned int inline
1383calc_state_hash (nodes, context)
1384     const re_node_set *nodes;
1385     unsigned int context;
1386{
1387  unsigned int hash = nodes->nelem + context;
1388  int i;
1389  for (i = 0 ; i < nodes->nelem ; i++)
1390    hash += nodes->elems[i];
1391  return hash;
1392}
1393
1394/* Search for the state whose node_set is equivalent to NODES.
1395   Return the pointer to the state, if we found it in the DFA.
1396   Otherwise create the new one and return it.  In case of an error
1397   return NULL and set the error code in ERR.
1398   Note: - We assume NULL as the invalid state, then it is possible that
1399	   return value is NULL and ERR is REG_NOERROR.
1400	 - We never return non-NULL value in case of any errors, it is for
1401	   optimization.  */
1402
1403static re_dfastate_t*
1404re_acquire_state (err, dfa, nodes)
1405     reg_errcode_t *err;
1406     re_dfa_t *dfa;
1407     const re_node_set *nodes;
1408{
1409  unsigned int hash;
1410  re_dfastate_t *new_state;
1411  struct re_state_table_entry *spot;
1412  int i;
1413  if (BE (nodes->nelem == 0, 0))
1414    {
1415      *err = REG_NOERROR;
1416      return NULL;
1417    }
1418  hash = calc_state_hash (nodes, 0);
1419  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1420
1421  for (i = 0 ; i < spot->num ; i++)
1422    {
1423      re_dfastate_t *state = spot->array[i];
1424      if (hash != state->hash)
1425	continue;
1426      if (re_node_set_compare (&state->nodes, nodes))
1427	return state;
1428    }
1429
1430  /* There are no appropriate state in the dfa, create the new one.  */
1431  new_state = create_ci_newstate (dfa, nodes, hash);
1432  if (BE (new_state != NULL, 1))
1433    return new_state;
1434  else
1435    {
1436      *err = REG_ESPACE;
1437      return NULL;
1438    }
1439}
1440
1441/* Search for the state whose node_set is equivalent to NODES and
1442   whose context is equivalent to CONTEXT.
1443   Return the pointer to the state, if we found it in the DFA.
1444   Otherwise create the new one and return it.  In case of an error
1445   return NULL and set the error code in ERR.
1446   Note: - We assume NULL as the invalid state, then it is possible that
1447	   return value is NULL and ERR is REG_NOERROR.
1448	 - We never return non-NULL value in case of any errors, it is for
1449	   optimization.  */
1450
1451static re_dfastate_t*
1452re_acquire_state_context (err, dfa, nodes, context)
1453     reg_errcode_t *err;
1454     re_dfa_t *dfa;
1455     const re_node_set *nodes;
1456     unsigned int context;
1457{
1458  unsigned int hash;
1459  re_dfastate_t *new_state;
1460  struct re_state_table_entry *spot;
1461  int i;
1462  if (nodes->nelem == 0)
1463    {
1464      *err = REG_NOERROR;
1465      return NULL;
1466    }
1467  hash = calc_state_hash (nodes, context);
1468  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1469
1470  for (i = 0 ; i < spot->num ; i++)
1471    {
1472      re_dfastate_t *state = spot->array[i];
1473      if (state->hash == hash
1474	  && state->context == context
1475	  && re_node_set_compare (state->entrance_nodes, nodes))
1476	return state;
1477    }
1478  /* There are no appropriate state in `dfa', create the new one.  */
1479  new_state = create_cd_newstate (dfa, nodes, context, hash);
1480  if (BE (new_state != NULL, 1))
1481    return new_state;
1482  else
1483    {
1484      *err = REG_ESPACE;
1485      return NULL;
1486    }
1487}
1488
1489/* Finish initialization of the new state NEWSTATE, and using its hash value
1490   HASH put in the appropriate bucket of DFA's state table.  Return value
1491   indicates the error code if failed.  */
1492
1493static reg_errcode_t
1494register_state (dfa, newstate, hash)
1495     re_dfa_t *dfa;
1496     re_dfastate_t *newstate;
1497     unsigned int hash;
1498{
1499  struct re_state_table_entry *spot;
1500  reg_errcode_t err;
1501  int i;
1502
1503  newstate->hash = hash;
1504  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1505  if (BE (err != REG_NOERROR, 0))
1506    return REG_ESPACE;
1507  for (i = 0; i < newstate->nodes.nelem; i++)
1508    {
1509      int elem = newstate->nodes.elems[i];
1510      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1511        re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1512    }
1513
1514  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1515  if (BE (spot->alloc <= spot->num, 0))
1516    {
1517      int new_alloc = 2 * spot->num + 2;
1518      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1519					      new_alloc);
1520      if (BE (new_array == NULL, 0))
1521	return REG_ESPACE;
1522      spot->array = new_array;
1523      spot->alloc = new_alloc;
1524    }
1525  spot->array[spot->num++] = newstate;
1526  return REG_NOERROR;
1527}
1528
1529/* Create the new state which is independ of contexts.
1530   Return the new state if succeeded, otherwise return NULL.  */
1531
1532static re_dfastate_t *
1533create_ci_newstate (dfa, nodes, hash)
1534     re_dfa_t *dfa;
1535     const re_node_set *nodes;
1536     unsigned int hash;
1537{
1538  int i;
1539  reg_errcode_t err;
1540  re_dfastate_t *newstate;
1541
1542  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1543  if (BE (newstate == NULL, 0))
1544    return NULL;
1545  err = re_node_set_init_copy (&newstate->nodes, nodes);
1546  if (BE (err != REG_NOERROR, 0))
1547    {
1548      re_free (newstate);
1549      return NULL;
1550    }
1551
1552  newstate->entrance_nodes = &newstate->nodes;
1553  for (i = 0 ; i < nodes->nelem ; i++)
1554    {
1555      re_token_t *node = dfa->nodes + nodes->elems[i];
1556      re_token_type_t type = node->type;
1557      if (type == CHARACTER && !node->constraint)
1558	continue;
1559#ifdef RE_ENABLE_I18N
1560      newstate->accept_mb |= node->accept_mb;
1561#endif /* RE_ENABLE_I18N */
1562
1563      /* If the state has the halt node, the state is a halt state.  */
1564      if (type == END_OF_RE)
1565	newstate->halt = 1;
1566      else if (type == OP_BACK_REF)
1567	newstate->has_backref = 1;
1568      else if (type == ANCHOR || node->constraint)
1569	newstate->has_constraint = 1;
1570    }
1571  err = register_state (dfa, newstate, hash);
1572  if (BE (err != REG_NOERROR, 0))
1573    {
1574      free_state (newstate);
1575      newstate = NULL;
1576    }
1577  return newstate;
1578}
1579
1580/* Create the new state which is depend on the context CONTEXT.
1581   Return the new state if succeeded, otherwise return NULL.  */
1582
1583static re_dfastate_t *
1584create_cd_newstate (dfa, nodes, context, hash)
1585     re_dfa_t *dfa;
1586     const re_node_set *nodes;
1587     unsigned int context, hash;
1588{
1589  int i, nctx_nodes = 0;
1590  reg_errcode_t err;
1591  re_dfastate_t *newstate;
1592
1593  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1594  if (BE (newstate == NULL, 0))
1595    return NULL;
1596  err = re_node_set_init_copy (&newstate->nodes, nodes);
1597  if (BE (err != REG_NOERROR, 0))
1598    {
1599      re_free (newstate);
1600      return NULL;
1601    }
1602
1603  newstate->context = context;
1604  newstate->entrance_nodes = &newstate->nodes;
1605
1606  for (i = 0 ; i < nodes->nelem ; i++)
1607    {
1608      unsigned int constraint = 0;
1609      re_token_t *node = dfa->nodes + nodes->elems[i];
1610      re_token_type_t type = node->type;
1611      if (node->constraint)
1612	constraint = node->constraint;
1613
1614      if (type == CHARACTER && !constraint)
1615	continue;
1616#ifdef RE_ENABLE_I18N
1617      newstate->accept_mb |= node->accept_mb;
1618#endif /* RE_ENABLE_I18N */
1619
1620      /* If the state has the halt node, the state is a halt state.  */
1621      if (type == END_OF_RE)
1622	newstate->halt = 1;
1623      else if (type == OP_BACK_REF)
1624	newstate->has_backref = 1;
1625      else if (type == ANCHOR)
1626	constraint = node->opr.ctx_type;
1627
1628      if (constraint)
1629	{
1630	  if (newstate->entrance_nodes == &newstate->nodes)
1631	    {
1632	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1633	      if (BE (newstate->entrance_nodes == NULL, 0))
1634		{
1635		  free_state (newstate);
1636		  return NULL;
1637		}
1638	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1639	      nctx_nodes = 0;
1640	      newstate->has_constraint = 1;
1641	    }
1642
1643	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1644	    {
1645	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1646	      ++nctx_nodes;
1647	    }
1648	}
1649    }
1650  err = register_state (dfa, newstate, hash);
1651  if (BE (err != REG_NOERROR, 0))
1652    {
1653      free_state (newstate);
1654      newstate = NULL;
1655    }
1656  return  newstate;
1657}
1658
1659static void
1660free_state (state)
1661     re_dfastate_t *state;
1662{
1663  re_node_set_free (&state->non_eps_nodes);
1664  re_node_set_free (&state->inveclosure);
1665  if (state->entrance_nodes != &state->nodes)
1666    {
1667      re_node_set_free (state->entrance_nodes);
1668      re_free (state->entrance_nodes);
1669    }
1670  re_node_set_free (&state->nodes);
1671  re_free (state->word_trtable);
1672  re_free (state->trtable);
1673  re_free (state);
1674}
1675