1/* Extended regular expression matching and search library.
2   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License along
17   with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static void re_string_construct_common (const char *str, Idx len,
21					re_string_t *pstr,
22					REG_TRANSLATE_TYPE trans, bool icase,
23					const re_dfa_t *dfa) internal_function;
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25					  const re_node_set *nodes,
26					  re_hashval_t hash) internal_function;
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28					  const re_node_set *nodes,
29					  unsigned int context,
30					  re_hashval_t hash) internal_function;
31
32/* Functions for string operation.  */
33
34/* This function allocate the buffers.  It is necessary to call
35   re_string_reconstruct before using the object.  */
36
37static reg_errcode_t
38internal_function
39re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40		    REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41{
42  reg_errcode_t ret;
43  Idx init_buf_len;
44
45  /* Ensure at least one character fits into the buffers.  */
46  if (init_len < dfa->mb_cur_max)
47    init_len = dfa->mb_cur_max;
48  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49  re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51  ret = re_string_realloc_buffers (pstr, init_buf_len);
52  if (BE (ret != REG_NOERROR, 0))
53    return ret;
54
55  pstr->word_char = dfa->word_char;
56  pstr->word_ops_used = dfa->word_ops_used;
57  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59  pstr->valid_raw_len = pstr->valid_len;
60  return REG_NOERROR;
61}
62
63/* This function allocate the buffers, and initialize them.  */
64
65static reg_errcode_t
66internal_function
67re_string_construct (re_string_t *pstr, const char *str, Idx len,
68		     REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69{
70  reg_errcode_t ret;
71  memset (pstr, '\0', sizeof (re_string_t));
72  re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74  if (len > 0)
75    {
76      ret = re_string_realloc_buffers (pstr, len + 1);
77      if (BE (ret != REG_NOERROR, 0))
78	return ret;
79    }
80  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82  if (icase)
83    {
84#ifdef RE_ENABLE_I18N
85      if (dfa->mb_cur_max > 1)
86	{
87	  while (1)
88	    {
89	      ret = build_wcs_upper_buffer (pstr);
90	      if (BE (ret != REG_NOERROR, 0))
91		return ret;
92	      if (pstr->valid_raw_len >= len)
93		break;
94	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95		break;
96	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97	      if (BE (ret != REG_NOERROR, 0))
98		return ret;
99	    }
100	}
101      else
102#endif /* RE_ENABLE_I18N  */
103	build_upper_buffer (pstr);
104    }
105  else
106    {
107#ifdef RE_ENABLE_I18N
108      if (dfa->mb_cur_max > 1)
109	build_wcs_buffer (pstr);
110      else
111#endif /* RE_ENABLE_I18N  */
112	{
113	  if (trans != NULL)
114	    re_string_translate_buffer (pstr);
115	  else
116	    {
117	      pstr->valid_len = pstr->bufs_len;
118	      pstr->valid_raw_len = pstr->bufs_len;
119	    }
120	}
121    }
122
123  return REG_NOERROR;
124}
125
126/* Helper functions for re_string_allocate, and re_string_construct.  */
127
128static reg_errcode_t
129internal_function
130re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131{
132#ifdef RE_ENABLE_I18N
133  if (pstr->mb_cur_max > 1)
134    {
135      wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
136      if (BE (new_wcs == NULL, 0))
137	return REG_ESPACE;
138      pstr->wcs = new_wcs;
139      if (pstr->offsets != NULL)
140	{
141	  Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
142	  if (BE (new_offsets == NULL, 0))
143	    return REG_ESPACE;
144	  pstr->offsets = new_offsets;
145	}
146    }
147#endif /* RE_ENABLE_I18N  */
148  if (pstr->mbs_allocated)
149    {
150      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
151					   new_buf_len);
152      if (BE (new_mbs == NULL, 0))
153	return REG_ESPACE;
154      pstr->mbs = new_mbs;
155    }
156  pstr->bufs_len = new_buf_len;
157  return REG_NOERROR;
158}
159
160
161static void
162internal_function
163re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
164			    REG_TRANSLATE_TYPE trans, bool icase,
165			    const re_dfa_t *dfa)
166{
167  pstr->raw_mbs = (const unsigned char *) str;
168  pstr->len = len;
169  pstr->raw_len = len;
170  pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
171  pstr->icase = icase;
172  pstr->mbs_allocated = (trans != NULL || icase);
173  pstr->mb_cur_max = dfa->mb_cur_max;
174  pstr->is_utf8 = dfa->is_utf8;
175  pstr->map_notascii = dfa->map_notascii;
176  pstr->stop = pstr->len;
177  pstr->raw_stop = pstr->stop;
178}
179
180#ifdef RE_ENABLE_I18N
181
182/* Build wide character buffer PSTR->WCS.
183   If the byte sequence of the string are:
184     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185   Then wide character buffer will be:
186     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
187   We use WEOF for padding, they indicate that the position isn't
188   a first byte of a multibyte character.
189
190   Note that this function assumes PSTR->VALID_LEN elements are already
191   built and starts from PSTR->VALID_LEN.  */
192
193static void
194internal_function
195build_wcs_buffer (re_string_t *pstr)
196{
197#ifdef _LIBC
198  unsigned char buf[MB_LEN_MAX];
199  assert (MB_LEN_MAX >= pstr->mb_cur_max);
200#else
201  unsigned char buf[64];
202#endif
203  mbstate_t prev_st;
204  Idx byte_idx, end_idx, remain_len;
205  size_t mbclen;
206
207  /* Build the buffers from pstr->valid_len to either pstr->len or
208     pstr->bufs_len.  */
209  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
210  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
211    {
212      wchar_t wc;
213      const char *p;
214
215      remain_len = end_idx - byte_idx;
216      prev_st = pstr->cur_state;
217      /* Apply the translation if we need.  */
218      if (BE (pstr->trans != NULL, 0))
219	{
220	  int i, ch;
221
222	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
223	    {
224	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
225	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
226	    }
227	  p = (const char *) buf;
228	}
229      else
230	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
231      mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
232      if (BE (mbclen == (size_t) -2, 0))
233	{
234	  /* The buffer doesn't have enough space, finish to build.  */
235	  pstr->cur_state = prev_st;
236	  break;
237	}
238      else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
239	{
240	  /* We treat these cases as a singlebyte character.  */
241	  mbclen = 1;
242	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243	  if (BE (pstr->trans != NULL, 0))
244	    wc = pstr->trans[wc];
245	  pstr->cur_state = prev_st;
246	}
247
248      /* Write wide character and padding.  */
249      pstr->wcs[byte_idx++] = wc;
250      /* Write paddings.  */
251      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
252	pstr->wcs[byte_idx++] = WEOF;
253    }
254  pstr->valid_len = byte_idx;
255  pstr->valid_raw_len = byte_idx;
256}
257
258/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
259   but for REG_ICASE.  */
260
261static reg_errcode_t
262internal_function
263build_wcs_upper_buffer (re_string_t *pstr)
264{
265  mbstate_t prev_st;
266  Idx src_idx, byte_idx, end_idx, remain_len;
267  size_t mbclen;
268#ifdef _LIBC
269  char buf[MB_LEN_MAX];
270  assert (MB_LEN_MAX >= pstr->mb_cur_max);
271#else
272  char buf[64];
273#endif
274
275  byte_idx = pstr->valid_len;
276  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
277
278  /* The following optimization assumes that ASCII characters can be
279     mapped to wide characters with a simple cast.  */
280  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
281    {
282      while (byte_idx < end_idx)
283	{
284	  wchar_t wc;
285
286	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
287	      && mbsinit (&pstr->cur_state))
288	    {
289	      /* In case of a singlebyte character.  */
290	      pstr->mbs[byte_idx]
291		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
292	      /* The next step uses the assumption that wchar_t is encoded
293		 ASCII-safe: all ASCII values can be converted like this.  */
294	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
295	      ++byte_idx;
296	      continue;
297	    }
298
299	  remain_len = end_idx - byte_idx;
300	  prev_st = pstr->cur_state;
301	  mbclen = mbrtowc (&wc,
302			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
303			     + byte_idx), remain_len, &pstr->cur_state);
304	  if (BE ((size_t) (mbclen + 2) > 2, 1))
305	    {
306	      wchar_t wcu = wc;
307	      if (iswlower (wc))
308		{
309		  size_t mbcdlen;
310
311		  wcu = towupper (wc);
312		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
313		  if (BE (mbclen == mbcdlen, 1))
314		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
315		  else
316		    {
317		      src_idx = byte_idx;
318		      goto offsets_needed;
319		    }
320		}
321	      else
322		memcpy (pstr->mbs + byte_idx,
323			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
324	      pstr->wcs[byte_idx++] = wcu;
325	      /* Write paddings.  */
326	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
327		pstr->wcs[byte_idx++] = WEOF;
328	    }
329	  else if (mbclen == (size_t) -1 || mbclen == 0)
330	    {
331	      /* It is an invalid character or '\0'.  Just use the byte.  */
332	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
333	      pstr->mbs[byte_idx] = ch;
334	      /* And also cast it to wide char.  */
335	      pstr->wcs[byte_idx++] = (wchar_t) ch;
336	      if (BE (mbclen == (size_t) -1, 0))
337		pstr->cur_state = prev_st;
338	    }
339	  else
340	    {
341	      /* The buffer doesn't have enough space, finish to build.  */
342	      pstr->cur_state = prev_st;
343	      break;
344	    }
345	}
346      pstr->valid_len = byte_idx;
347      pstr->valid_raw_len = byte_idx;
348      return REG_NOERROR;
349    }
350  else
351    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
352      {
353	wchar_t wc;
354	const char *p;
355      offsets_needed:
356	remain_len = end_idx - byte_idx;
357	prev_st = pstr->cur_state;
358	if (BE (pstr->trans != NULL, 0))
359	  {
360	    int i, ch;
361
362	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
363	      {
364		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
365		buf[i] = pstr->trans[ch];
366	      }
367	    p = (const char *) buf;
368	  }
369	else
370	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
371	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
372	if (BE ((size_t) (mbclen + 2) > 2, 1))
373	  {
374	    wchar_t wcu = wc;
375	    if (iswlower (wc))
376	      {
377		size_t mbcdlen;
378
379		wcu = towupper (wc);
380		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
381		if (BE (mbclen == mbcdlen, 1))
382		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
383		else if (mbcdlen != (size_t) -1)
384		  {
385		    size_t i;
386
387		    if (byte_idx + mbcdlen > pstr->bufs_len)
388		      {
389			pstr->cur_state = prev_st;
390			break;
391		      }
392
393		    if (pstr->offsets == NULL)
394		      {
395			pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
396
397			if (pstr->offsets == NULL)
398			  return REG_ESPACE;
399		      }
400		    if (!pstr->offsets_needed)
401		      {
402			for (i = 0; i < (size_t) byte_idx; ++i)
403			  pstr->offsets[i] = i;
404			pstr->offsets_needed = 1;
405		      }
406
407		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
408		    pstr->wcs[byte_idx] = wcu;
409		    pstr->offsets[byte_idx] = src_idx;
410		    for (i = 1; i < mbcdlen; ++i)
411		      {
412			pstr->offsets[byte_idx + i]
413			  = src_idx + (i < mbclen ? i : mbclen - 1);
414			pstr->wcs[byte_idx + i] = WEOF;
415		      }
416		    pstr->len += mbcdlen - mbclen;
417		    if (pstr->raw_stop > src_idx)
418		      pstr->stop += mbcdlen - mbclen;
419		    end_idx = (pstr->bufs_len > pstr->len)
420			      ? pstr->len : pstr->bufs_len;
421		    byte_idx += mbcdlen;
422		    src_idx += mbclen;
423		    continue;
424		  }
425                else
426                  memcpy (pstr->mbs + byte_idx, p, mbclen);
427	      }
428	    else
429	      memcpy (pstr->mbs + byte_idx, p, mbclen);
430
431	    if (BE (pstr->offsets_needed != 0, 0))
432	      {
433		size_t i;
434		for (i = 0; i < mbclen; ++i)
435		  pstr->offsets[byte_idx + i] = src_idx + i;
436	      }
437	    src_idx += mbclen;
438
439	    pstr->wcs[byte_idx++] = wcu;
440	    /* Write paddings.  */
441	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
442	      pstr->wcs[byte_idx++] = WEOF;
443	  }
444	else if (mbclen == (size_t) -1 || mbclen == 0)
445	  {
446	    /* It is an invalid character or '\0'.  Just use the byte.  */
447	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
448
449	    if (BE (pstr->trans != NULL, 0))
450	      ch = pstr->trans [ch];
451	    pstr->mbs[byte_idx] = ch;
452
453	    if (BE (pstr->offsets_needed != 0, 0))
454	      pstr->offsets[byte_idx] = src_idx;
455	    ++src_idx;
456
457	    /* And also cast it to wide char.  */
458	    pstr->wcs[byte_idx++] = (wchar_t) ch;
459	    if (BE (mbclen == (size_t) -1, 0))
460	      pstr->cur_state = prev_st;
461	  }
462	else
463	  {
464	    /* The buffer doesn't have enough space, finish to build.  */
465	    pstr->cur_state = prev_st;
466	    break;
467	  }
468      }
469  pstr->valid_len = byte_idx;
470  pstr->valid_raw_len = src_idx;
471  return REG_NOERROR;
472}
473
474/* Skip characters until the index becomes greater than NEW_RAW_IDX.
475   Return the index.  */
476
477static Idx
478internal_function
479re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
480{
481  mbstate_t prev_st;
482  Idx rawbuf_idx;
483  size_t mbclen;
484  wchar_t wc = 0;
485
486  /* Skip the characters which are not necessary to check.  */
487  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
488       rawbuf_idx < new_raw_idx;)
489    {
490      Idx remain_len;
491      remain_len = pstr->len - rawbuf_idx;
492      prev_st = pstr->cur_state;
493      mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
494			remain_len, &pstr->cur_state);
495      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
496	{
497	  /* We treat these cases as a singlebyte character.  */
498	  mbclen = 1;
499	  pstr->cur_state = prev_st;
500	}
501      /* Then proceed the next character.  */
502      rawbuf_idx += mbclen;
503    }
504  *last_wc = (wint_t) wc;
505  return rawbuf_idx;
506}
507#endif /* RE_ENABLE_I18N  */
508
509/* Build the buffer PSTR->MBS, and apply the translation if we need.
510   This function is used in case of REG_ICASE.  */
511
512static void
513internal_function
514build_upper_buffer (re_string_t *pstr)
515{
516  Idx char_idx, end_idx;
517  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
518
519  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
520    {
521      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
522      if (BE (pstr->trans != NULL, 0))
523	ch = pstr->trans[ch];
524      if (islower (ch))
525	pstr->mbs[char_idx] = toupper (ch);
526      else
527	pstr->mbs[char_idx] = ch;
528    }
529  pstr->valid_len = char_idx;
530  pstr->valid_raw_len = char_idx;
531}
532
533/* Apply TRANS to the buffer in PSTR.  */
534
535static void
536internal_function
537re_string_translate_buffer (re_string_t *pstr)
538{
539  Idx buf_idx, end_idx;
540  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
541
542  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
543    {
544      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
545      pstr->mbs[buf_idx] = pstr->trans[ch];
546    }
547
548  pstr->valid_len = buf_idx;
549  pstr->valid_raw_len = buf_idx;
550}
551
552/* This function re-construct the buffers.
553   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
554   convert to upper case in case of REG_ICASE, apply translation.  */
555
556static reg_errcode_t
557internal_function
558re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
559{
560  Idx offset;
561
562  if (BE (pstr->raw_mbs_idx <= idx, 0))
563    offset = idx - pstr->raw_mbs_idx;
564  else
565    {
566      /* Reset buffer.  */
567#ifdef RE_ENABLE_I18N
568      if (pstr->mb_cur_max > 1)
569	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
570#endif /* RE_ENABLE_I18N */
571      pstr->len = pstr->raw_len;
572      pstr->stop = pstr->raw_stop;
573      pstr->valid_len = 0;
574      pstr->raw_mbs_idx = 0;
575      pstr->valid_raw_len = 0;
576      pstr->offsets_needed = 0;
577      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
578			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
579      if (!pstr->mbs_allocated)
580	pstr->mbs = (unsigned char *) pstr->raw_mbs;
581      offset = idx;
582    }
583
584  if (BE (offset != 0, 1))
585    {
586      /* Are the characters which are already checked remain?  */
587      if (BE (offset < pstr->valid_raw_len, 1)
588#ifdef RE_ENABLE_I18N
589	  /* Handling this would enlarge the code too much.
590	     Accept a slowdown in that case.  */
591	  && pstr->offsets_needed == 0
592#endif
593	 )
594	{
595	  /* Yes, move them to the front of the buffer.  */
596	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
597#ifdef RE_ENABLE_I18N
598	  if (pstr->mb_cur_max > 1)
599	    memmove (pstr->wcs, pstr->wcs + offset,
600		     (pstr->valid_len - offset) * sizeof (wint_t));
601#endif /* RE_ENABLE_I18N */
602	  if (BE (pstr->mbs_allocated, 0))
603	    memmove (pstr->mbs, pstr->mbs + offset,
604		     pstr->valid_len - offset);
605	  pstr->valid_len -= offset;
606	  pstr->valid_raw_len -= offset;
607#if DEBUG
608	  assert (pstr->valid_len > 0);
609#endif
610	}
611      else
612	{
613	  /* No, skip all characters until IDX.  */
614#ifdef RE_ENABLE_I18N
615	  if (BE (pstr->offsets_needed, 0))
616	    {
617	      pstr->len = pstr->raw_len - idx + offset;
618	      pstr->stop = pstr->raw_stop - idx + offset;
619	      pstr->offsets_needed = 0;
620	    }
621#endif
622	  pstr->valid_len = 0;
623	  pstr->valid_raw_len = 0;
624#ifdef RE_ENABLE_I18N
625	  if (pstr->mb_cur_max > 1)
626	    {
627	      Idx wcs_idx;
628	      wint_t wc = WEOF;
629
630	      if (pstr->is_utf8)
631		{
632		  const unsigned char *raw, *p, *q, *end;
633
634		  /* Special case UTF-8.  Multi-byte chars start with any
635		     byte other than 0x80 - 0xbf.  */
636		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
637		  end = raw + (offset - pstr->mb_cur_max);
638		  for (p = raw + offset - 1; p >= end; --p)
639		    if ((*p & 0xc0) != 0x80)
640		      {
641			mbstate_t cur_state;
642			wchar_t wc2;
643			Idx mlen = raw + pstr->len - p;
644			unsigned char buf[6];
645			size_t mbclen;
646
647			q = p;
648			if (BE (pstr->trans != NULL, 0))
649			  {
650			    int i = mlen < 6 ? mlen : 6;
651			    while (--i >= 0)
652			      buf[i] = pstr->trans[p[i]];
653			    q = buf;
654			  }
655			/* XXX Don't use mbrtowc, we know which conversion
656			   to use (UTF-8 -> UCS4).  */
657			memset (&cur_state, 0, sizeof (cur_state));
658			mbclen = mbrtowc (&wc2, (const char *) p, mlen,
659					  &cur_state);
660			if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
661			  {
662			    memset (&pstr->cur_state, '\0',
663				    sizeof (mbstate_t));
664			    pstr->valid_len = mbclen - (raw + offset - p);
665			    wc = wc2;
666			  }
667			break;
668		      }
669		}
670
671	      if (wc == WEOF)
672		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
673	      if (BE (pstr->valid_len, 0))
674		{
675		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
676		    pstr->wcs[wcs_idx] = WEOF;
677		  if (pstr->mbs_allocated)
678		    memset (pstr->mbs, -1, pstr->valid_len);
679		}
680	      pstr->valid_raw_len = pstr->valid_len;
681	      pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
682				    && IS_WIDE_WORD_CHAR (wc))
683				   ? CONTEXT_WORD
684				   : ((IS_WIDE_NEWLINE (wc)
685				       && pstr->newline_anchor)
686				      ? CONTEXT_NEWLINE : 0));
687	    }
688	  else
689#endif /* RE_ENABLE_I18N */
690	    {
691	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
692	      if (pstr->trans)
693		c = pstr->trans[c];
694	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
695				   ? CONTEXT_WORD
696				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
697				      ? CONTEXT_NEWLINE : 0));
698	    }
699	}
700      if (!BE (pstr->mbs_allocated, 0))
701	pstr->mbs += offset;
702    }
703  pstr->raw_mbs_idx = idx;
704  pstr->len -= offset;
705  pstr->stop -= offset;
706
707  /* Then build the buffers.  */
708#ifdef RE_ENABLE_I18N
709  if (pstr->mb_cur_max > 1)
710    {
711      if (pstr->icase)
712	{
713	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
714	  if (BE (ret != REG_NOERROR, 0))
715	    return ret;
716	}
717      else
718	build_wcs_buffer (pstr);
719    }
720  else
721#endif /* RE_ENABLE_I18N */
722  if (BE (pstr->mbs_allocated, 0))
723    {
724      if (pstr->icase)
725	build_upper_buffer (pstr);
726      else if (pstr->trans != NULL)
727	re_string_translate_buffer (pstr);
728    }
729  else
730    pstr->valid_len = pstr->len;
731
732  pstr->cur_idx = 0;
733  return REG_NOERROR;
734}
735
736static unsigned char
737internal_function __attribute ((pure))
738re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
739{
740  int ch;
741  Idx off;
742
743  /* Handle the common (easiest) cases first.  */
744  if (BE (!pstr->mbs_allocated, 1))
745    return re_string_peek_byte (pstr, idx);
746
747#ifdef RE_ENABLE_I18N
748  if (pstr->mb_cur_max > 1
749      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
750    return re_string_peek_byte (pstr, idx);
751#endif
752
753  off = pstr->cur_idx + idx;
754#ifdef RE_ENABLE_I18N
755  if (pstr->offsets_needed)
756    off = pstr->offsets[off];
757#endif
758
759  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
760
761#ifdef RE_ENABLE_I18N
762  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
763     this function returns CAPITAL LETTER I instead of first byte of
764     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
765     since peek_byte_case doesn't advance cur_idx in any way.  */
766  if (pstr->offsets_needed && !isascii (ch))
767    return re_string_peek_byte (pstr, idx);
768#endif
769
770  return ch;
771}
772
773static unsigned char
774internal_function __attribute ((pure))
775re_string_fetch_byte_case (re_string_t *pstr)
776{
777  if (BE (!pstr->mbs_allocated, 1))
778    return re_string_fetch_byte (pstr);
779
780#ifdef RE_ENABLE_I18N
781  if (pstr->offsets_needed)
782    {
783      Idx off;
784      int ch;
785
786      /* For tr_TR.UTF-8 [[:islower:]] there is
787	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
788	 in that case the whole multi-byte character and return
789	 the original letter.  On the other side, with
790	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
791	 anything else would complicate things too much.  */
792
793      if (!re_string_first_byte (pstr, pstr->cur_idx))
794	return re_string_fetch_byte (pstr);
795
796      off = pstr->offsets[pstr->cur_idx];
797      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
798
799      if (! isascii (ch))
800	return re_string_fetch_byte (pstr);
801
802      re_string_skip_bytes (pstr,
803			    re_string_char_size_at (pstr, pstr->cur_idx));
804      return ch;
805    }
806#endif
807
808  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
809}
810
811static void
812internal_function
813re_string_destruct (re_string_t *pstr)
814{
815#ifdef RE_ENABLE_I18N
816  re_free (pstr->wcs);
817  re_free (pstr->offsets);
818#endif /* RE_ENABLE_I18N  */
819  if (pstr->mbs_allocated)
820    re_free (pstr->mbs);
821}
822
823/* Return the context at IDX in INPUT.  */
824
825static unsigned int
826internal_function
827re_string_context_at (const re_string_t *input, Idx idx, int eflags)
828{
829  int c;
830  if (BE (! REG_VALID_INDEX (idx), 0))
831    /* In this case, we use the value stored in input->tip_context,
832       since we can't know the character in input->mbs[-1] here.  */
833    return input->tip_context;
834  if (BE (idx == input->len, 0))
835    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
836	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
837#ifdef RE_ENABLE_I18N
838  if (input->mb_cur_max > 1)
839    {
840      wint_t wc;
841      Idx wc_idx = idx;
842      while(input->wcs[wc_idx] == WEOF)
843	{
844#ifdef DEBUG
845	  /* It must not happen.  */
846	  assert (REG_VALID_INDEX (wc_idx));
847#endif
848	  --wc_idx;
849	  if (! REG_VALID_INDEX (wc_idx))
850	    return input->tip_context;
851	}
852      wc = input->wcs[wc_idx];
853      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
854	return CONTEXT_WORD;
855      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
856	      ? CONTEXT_NEWLINE : 0);
857    }
858  else
859#endif
860    {
861      c = re_string_byte_at (input, idx);
862      if (bitset_contain (input->word_char, c))
863	return CONTEXT_WORD;
864      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
865    }
866}
867
868/* Functions for set operation.  */
869
870static reg_errcode_t
871internal_function
872re_node_set_alloc (re_node_set *set, Idx size)
873{
874  set->alloc = size;
875  set->nelem = 0;
876  set->elems = re_xmalloc (Idx, size);
877  if (BE (set->elems == NULL, 0))
878    return REG_ESPACE;
879  return REG_NOERROR;
880}
881
882static reg_errcode_t
883internal_function
884re_node_set_init_1 (re_node_set *set, Idx elem)
885{
886  set->alloc = 1;
887  set->nelem = 1;
888  set->elems = re_malloc (Idx, 1);
889  if (BE (set->elems == NULL, 0))
890    {
891      set->alloc = set->nelem = 0;
892      return REG_ESPACE;
893    }
894  set->elems[0] = elem;
895  return REG_NOERROR;
896}
897
898static reg_errcode_t
899internal_function
900re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
901{
902  set->alloc = 2;
903  set->elems = re_malloc (Idx, 2);
904  if (BE (set->elems == NULL, 0))
905    return REG_ESPACE;
906  if (elem1 == elem2)
907    {
908      set->nelem = 1;
909      set->elems[0] = elem1;
910    }
911  else
912    {
913      set->nelem = 2;
914      if (elem1 < elem2)
915	{
916	  set->elems[0] = elem1;
917	  set->elems[1] = elem2;
918	}
919      else
920	{
921	  set->elems[0] = elem2;
922	  set->elems[1] = elem1;
923	}
924    }
925  return REG_NOERROR;
926}
927
928static reg_errcode_t
929internal_function
930re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
931{
932  dest->nelem = src->nelem;
933  if (src->nelem > 0)
934    {
935      dest->alloc = dest->nelem;
936      dest->elems = re_malloc (Idx, dest->alloc);
937      if (BE (dest->elems == NULL, 0))
938	{
939	  dest->alloc = dest->nelem = 0;
940	  return REG_ESPACE;
941	}
942      memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
943    }
944  else
945    re_node_set_init_empty (dest);
946  return REG_NOERROR;
947}
948
949/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
950   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
951   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
952
953static reg_errcode_t
954internal_function
955re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
956			   const re_node_set *src2)
957{
958  Idx i1, i2, is, id, delta, sbase;
959  if (src1->nelem == 0 || src2->nelem == 0)
960    return REG_NOERROR;
961
962  /* We need dest->nelem + 2 * elems_in_intersection; this is a
963     conservative estimate.  */
964  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
965    {
966      Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
967      Idx *new_elems;
968      if (sizeof (Idx) < 3
969	  && (new_alloc < dest->alloc
970	      || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
971	return REG_ESPACE;
972      new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
973      if (BE (new_elems == NULL, 0))
974        return REG_ESPACE;
975      dest->elems = new_elems;
976      dest->alloc = new_alloc;
977    }
978
979  /* Find the items in the intersection of SRC1 and SRC2, and copy
980     into the top of DEST those that are not already in DEST itself.  */
981  sbase = dest->nelem + src1->nelem + src2->nelem;
982  i1 = src1->nelem - 1;
983  i2 = src2->nelem - 1;
984  id = dest->nelem - 1;
985  for (;;)
986    {
987      if (src1->elems[i1] == src2->elems[i2])
988	{
989	  /* Try to find the item in DEST.  Maybe we could binary search?  */
990	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
991	    --id;
992
993          if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
994            dest->elems[--sbase] = src1->elems[i1];
995
996	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
997	    break;
998	}
999
1000      /* Lower the highest of the two items.  */
1001      else if (src1->elems[i1] < src2->elems[i2])
1002	{
1003	  if (! REG_VALID_INDEX (--i2))
1004	    break;
1005	}
1006      else
1007	{
1008	  if (! REG_VALID_INDEX (--i1))
1009	    break;
1010	}
1011    }
1012
1013  id = dest->nelem - 1;
1014  is = dest->nelem + src1->nelem + src2->nelem - 1;
1015  delta = is - sbase + 1;
1016
1017  /* Now copy.  When DELTA becomes zero, the remaining
1018     DEST elements are already in place; this is more or
1019     less the same loop that is in re_node_set_merge.  */
1020  dest->nelem += delta;
1021  if (delta > 0 && REG_VALID_INDEX (id))
1022    for (;;)
1023      {
1024        if (dest->elems[is] > dest->elems[id])
1025          {
1026            /* Copy from the top.  */
1027            dest->elems[id + delta--] = dest->elems[is--];
1028            if (delta == 0)
1029              break;
1030          }
1031        else
1032          {
1033            /* Slide from the bottom.  */
1034            dest->elems[id + delta] = dest->elems[id];
1035            if (! REG_VALID_INDEX (--id))
1036              break;
1037          }
1038      }
1039
1040  /* Copy remaining SRC elements.  */
1041  memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1042
1043  return REG_NOERROR;
1044}
1045
1046/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1047   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1048
1049static reg_errcode_t
1050internal_function
1051re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1052			const re_node_set *src2)
1053{
1054  Idx i1, i2, id;
1055  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1056    {
1057      dest->alloc = src1->nelem + src2->nelem;
1058      if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1059	return REG_ESPACE;
1060      dest->elems = re_xmalloc (Idx, dest->alloc);
1061      if (BE (dest->elems == NULL, 0))
1062	return REG_ESPACE;
1063    }
1064  else
1065    {
1066      if (src1 != NULL && src1->nelem > 0)
1067	return re_node_set_init_copy (dest, src1);
1068      else if (src2 != NULL && src2->nelem > 0)
1069	return re_node_set_init_copy (dest, src2);
1070      else
1071	re_node_set_init_empty (dest);
1072      return REG_NOERROR;
1073    }
1074  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1075    {
1076      if (src1->elems[i1] > src2->elems[i2])
1077	{
1078	  dest->elems[id++] = src2->elems[i2++];
1079	  continue;
1080	}
1081      if (src1->elems[i1] == src2->elems[i2])
1082	++i2;
1083      dest->elems[id++] = src1->elems[i1++];
1084    }
1085  if (i1 < src1->nelem)
1086    {
1087      memcpy (dest->elems + id, src1->elems + i1,
1088	     (src1->nelem - i1) * sizeof dest->elems[0]);
1089      id += src1->nelem - i1;
1090    }
1091  else if (i2 < src2->nelem)
1092    {
1093      memcpy (dest->elems + id, src2->elems + i2,
1094	     (src2->nelem - i2) * sizeof dest->elems[0]);
1095      id += src2->nelem - i2;
1096    }
1097  dest->nelem = id;
1098  return REG_NOERROR;
1099}
1100
1101/* Calculate the union set of the sets DEST and SRC. And store it to
1102   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1103
1104static reg_errcode_t
1105internal_function
1106re_node_set_merge (re_node_set *dest, const re_node_set *src)
1107{
1108  Idx is, id, sbase, delta;
1109  if (src == NULL || src->nelem == 0)
1110    return REG_NOERROR;
1111  if (sizeof (Idx) < 3
1112      && ((Idx) (2 * src->nelem) < src->nelem
1113	  || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1114    return REG_ESPACE;
1115  if (dest->alloc < 2 * src->nelem + dest->nelem)
1116    {
1117      Idx new_alloc = src->nelem + dest->alloc;
1118      Idx *new_buffer;
1119      if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1120	return REG_ESPACE;
1121      new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1122      if (BE (new_buffer == NULL, 0))
1123	return REG_ESPACE;
1124      dest->elems = new_buffer;
1125      dest->alloc = new_alloc;
1126    }
1127
1128  if (BE (dest->nelem == 0, 0))
1129    {
1130      dest->nelem = src->nelem;
1131      memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1132      return REG_NOERROR;
1133    }
1134
1135  /* Copy into the top of DEST the items of SRC that are not
1136     found in DEST.  Maybe we could binary search in DEST?  */
1137  for (sbase = dest->nelem + 2 * src->nelem,
1138       is = src->nelem - 1, id = dest->nelem - 1;
1139       REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1140    {
1141      if (dest->elems[id] == src->elems[is])
1142        is--, id--;
1143      else if (dest->elems[id] < src->elems[is])
1144        dest->elems[--sbase] = src->elems[is--];
1145      else /* if (dest->elems[id] > src->elems[is]) */
1146        --id;
1147    }
1148
1149  if (REG_VALID_INDEX (is))
1150    {
1151      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1152      sbase -= is + 1;
1153      memcpy (dest->elems + sbase, src->elems,
1154	      (is + 1) * sizeof dest->elems[0]);
1155    }
1156
1157  id = dest->nelem - 1;
1158  is = dest->nelem + 2 * src->nelem - 1;
1159  delta = is - sbase + 1;
1160  if (delta == 0)
1161    return REG_NOERROR;
1162
1163  /* Now copy.  When DELTA becomes zero, the remaining
1164     DEST elements are already in place.  */
1165  dest->nelem += delta;
1166  for (;;)
1167    {
1168      if (dest->elems[is] > dest->elems[id])
1169        {
1170	  /* Copy from the top.  */
1171          dest->elems[id + delta--] = dest->elems[is--];
1172	  if (delta == 0)
1173	    break;
1174	}
1175      else
1176        {
1177          /* Slide from the bottom.  */
1178          dest->elems[id + delta] = dest->elems[id];
1179	  if (! REG_VALID_INDEX (--id))
1180	    {
1181	      /* Copy remaining SRC elements.  */
1182	      memcpy (dest->elems, dest->elems + sbase,
1183	              delta * sizeof dest->elems[0]);
1184	      break;
1185	    }
1186	}
1187    }
1188
1189  return REG_NOERROR;
1190}
1191
1192/* Insert the new element ELEM to the re_node_set* SET.
1193   SET should not already have ELEM.
1194   Return true if successful.  */
1195
1196static bool
1197internal_function
1198re_node_set_insert (re_node_set *set, Idx elem)
1199{
1200  Idx idx;
1201  /* In case the set is empty.  */
1202  if (set->alloc == 0)
1203    return re_node_set_init_1 (set, elem) == REG_NOERROR;
1204
1205  if (BE (set->nelem, 0) == 0)
1206    {
1207      /* We already guaranteed above that set->alloc != 0.  */
1208      set->elems[0] = elem;
1209      ++set->nelem;
1210      return true;
1211    }
1212
1213  /* Realloc if we need.  */
1214  if (set->alloc == set->nelem)
1215    {
1216      Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1217      if (BE (new_elems == NULL, 0))
1218	return false;
1219      set->elems = new_elems;
1220    }
1221
1222  /* Move the elements which follows the new element.  Test the
1223     first element separately to skip a check in the inner loop.  */
1224  if (elem < set->elems[0])
1225    {
1226      idx = 0;
1227      for (idx = set->nelem; idx > 0; idx--)
1228        set->elems[idx] = set->elems[idx - 1];
1229    }
1230  else
1231    {
1232      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1233        set->elems[idx] = set->elems[idx - 1];
1234    }
1235
1236  /* Insert the new element.  */
1237  set->elems[idx] = elem;
1238  ++set->nelem;
1239  return true;
1240}
1241
1242/* Insert the new element ELEM to the re_node_set* SET.
1243   SET should not already have any element greater than or equal to ELEM.
1244   Return true if successful.  */
1245
1246static bool
1247internal_function
1248re_node_set_insert_last (re_node_set *set, Idx elem)
1249{
1250  /* Realloc if we need.  */
1251  if (set->alloc == set->nelem)
1252    {
1253      Idx *new_elems;
1254      new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1255      if (BE (new_elems == NULL, 0))
1256	return false;
1257      set->elems = new_elems;
1258    }
1259
1260  /* Insert the new element.  */
1261  set->elems[set->nelem++] = elem;
1262  return true;
1263}
1264
1265/* Compare two node sets SET1 and SET2.
1266   Return true if SET1 and SET2 are equivalent.  */
1267
1268static bool
1269internal_function __attribute ((pure))
1270re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1271{
1272  Idx i;
1273  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1274    return false;
1275  for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1276    if (set1->elems[i] != set2->elems[i])
1277      return false;
1278  return true;
1279}
1280
1281/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1282
1283static Idx
1284internal_function __attribute ((pure))
1285re_node_set_contains (const re_node_set *set, Idx elem)
1286{
1287  __re_size_t idx, right, mid;
1288  if (! REG_VALID_NONZERO_INDEX (set->nelem))
1289    return 0;
1290
1291  /* Binary search the element.  */
1292  idx = 0;
1293  right = set->nelem - 1;
1294  while (idx < right)
1295    {
1296      mid = (idx + right) / 2;
1297      if (set->elems[mid] < elem)
1298	idx = mid + 1;
1299      else
1300	right = mid;
1301    }
1302  return set->elems[idx] == elem ? idx + 1 : 0;
1303}
1304
1305static void
1306internal_function
1307re_node_set_remove_at (re_node_set *set, Idx idx)
1308{
1309  if (idx < 0 || idx >= set->nelem)
1310    return;
1311  --set->nelem;
1312  for (; idx < set->nelem; idx++)
1313    set->elems[idx] = set->elems[idx + 1];
1314}
1315
1316
1317/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1318   Or return REG_MISSING if an error occurred.  */
1319
1320static Idx
1321internal_function
1322re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1323{
1324  int type = token.type;
1325  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1326    {
1327      Idx new_nodes_alloc = dfa->nodes_alloc;
1328      Idx *new_nexts, *new_indices;
1329      re_node_set *new_edests, *new_eclosures;
1330
1331      re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1332					    &new_nodes_alloc);
1333      if (BE (new_nodes == NULL, 0))
1334	return REG_MISSING;
1335      dfa->nodes = new_nodes;
1336      new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1337      new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1338      new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1339      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1340      if (BE (new_nexts == NULL || new_indices == NULL
1341	      || new_edests == NULL || new_eclosures == NULL, 0))
1342	return REG_MISSING;
1343      dfa->nexts = new_nexts;
1344      dfa->org_indices = new_indices;
1345      dfa->edests = new_edests;
1346      dfa->eclosures = new_eclosures;
1347      dfa->nodes_alloc = new_nodes_alloc;
1348    }
1349  dfa->nodes[dfa->nodes_len] = token;
1350  dfa->nodes[dfa->nodes_len].constraint = 0;
1351#ifdef RE_ENABLE_I18N
1352  dfa->nodes[dfa->nodes_len].accept_mb =
1353    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1354#endif
1355  dfa->nexts[dfa->nodes_len] = REG_MISSING;
1356  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1357  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1358  return dfa->nodes_len++;
1359}
1360
1361static inline re_hashval_t
1362internal_function
1363calc_state_hash (const re_node_set *nodes, unsigned int context)
1364{
1365  re_hashval_t hash = nodes->nelem + context;
1366  Idx i;
1367  for (i = 0 ; i < nodes->nelem ; i++)
1368    hash += nodes->elems[i];
1369  return hash;
1370}
1371
1372/* Search for the state whose node_set is equivalent to NODES.
1373   Return the pointer to the state, if we found it in the DFA.
1374   Otherwise create the new one and return it.  In case of an error
1375   return NULL and set the error code in ERR.
1376   Note: - We assume NULL as the invalid state, then it is possible that
1377	   return value is NULL and ERR is REG_NOERROR.
1378	 - We never return non-NULL value in case of any errors, it is for
1379	   optimization.  */
1380
1381static re_dfastate_t*
1382internal_function
1383re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1384{
1385  re_hashval_t hash;
1386  re_dfastate_t *new_state;
1387  struct re_state_table_entry *spot;
1388  Idx i;
1389#ifdef lint
1390  /* Suppress bogus uninitialized-variable warnings.  */
1391  *err = REG_NOERROR;
1392#endif
1393  if (BE (nodes->nelem == 0, 0))
1394    {
1395      *err = REG_NOERROR;
1396      return NULL;
1397    }
1398  hash = calc_state_hash (nodes, 0);
1399  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1400
1401  for (i = 0 ; i < spot->num ; i++)
1402    {
1403      re_dfastate_t *state = spot->array[i];
1404      if (hash != state->hash)
1405	continue;
1406      if (re_node_set_compare (&state->nodes, nodes))
1407	return state;
1408    }
1409
1410  /* There are no appropriate state in the dfa, create the new one.  */
1411  new_state = create_ci_newstate (dfa, nodes, hash);
1412  if (BE (new_state != NULL, 1))
1413    return new_state;
1414  else
1415    {
1416      *err = REG_ESPACE;
1417      return NULL;
1418    }
1419}
1420
1421/* Search for the state whose node_set is equivalent to NODES and
1422   whose context is equivalent to CONTEXT.
1423   Return the pointer to the state, if we found it in the DFA.
1424   Otherwise create the new one and return it.  In case of an error
1425   return NULL and set the error code in ERR.
1426   Note: - We assume NULL as the invalid state, then it is possible that
1427	   return value is NULL and ERR is REG_NOERROR.
1428	 - We never return non-NULL value in case of any errors, it is for
1429	   optimization.  */
1430
1431static re_dfastate_t*
1432internal_function
1433re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1434			  const re_node_set *nodes, unsigned int context)
1435{
1436  re_hashval_t hash;
1437  re_dfastate_t *new_state;
1438  struct re_state_table_entry *spot;
1439  Idx i;
1440#ifdef lint
1441  /* Suppress bogus uninitialized-variable warnings.  */
1442  *err = REG_NOERROR;
1443#endif
1444  if (nodes->nelem == 0)
1445    {
1446      *err = REG_NOERROR;
1447      return NULL;
1448    }
1449  hash = calc_state_hash (nodes, context);
1450  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1451
1452  for (i = 0 ; i < spot->num ; i++)
1453    {
1454      re_dfastate_t *state = spot->array[i];
1455      if (state->hash == hash
1456	  && state->context == context
1457	  && re_node_set_compare (state->entrance_nodes, nodes))
1458	return state;
1459    }
1460  /* There are no appropriate state in `dfa', create the new one.  */
1461  new_state = create_cd_newstate (dfa, nodes, context, hash);
1462  if (BE (new_state != NULL, 1))
1463    return new_state;
1464  else
1465    {
1466      *err = REG_ESPACE;
1467      return NULL;
1468    }
1469}
1470
1471/* Finish initialization of the new state NEWSTATE, and using its hash value
1472   HASH put in the appropriate bucket of DFA's state table.  Return value
1473   indicates the error code if failed.  */
1474
1475static reg_errcode_t
1476internal_function
1477register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1478{
1479  struct re_state_table_entry *spot;
1480  reg_errcode_t err;
1481  Idx i;
1482
1483  newstate->hash = hash;
1484  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1485  if (BE (err != REG_NOERROR, 0))
1486    return REG_ESPACE;
1487  for (i = 0; i < newstate->nodes.nelem; i++)
1488    {
1489      Idx elem = newstate->nodes.elems[i];
1490      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1491	{
1492	  bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1493	  if (BE (! ok, 0))
1494	    return REG_ESPACE;
1495	}
1496    }
1497
1498  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1499  if (BE (spot->alloc <= spot->num, 0))
1500    {
1501      Idx new_alloc = spot->num;
1502      re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1503						&new_alloc);
1504      if (BE (new_array == NULL, 0))
1505	return REG_ESPACE;
1506      spot->array = new_array;
1507      spot->alloc = new_alloc;
1508    }
1509  spot->array[spot->num++] = newstate;
1510  return REG_NOERROR;
1511}
1512
1513/* Create the new state which is independ of contexts.
1514   Return the new state if succeeded, otherwise return NULL.  */
1515
1516static re_dfastate_t *
1517internal_function
1518create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1519		    re_hashval_t hash)
1520{
1521  Idx i;
1522  reg_errcode_t err;
1523  re_dfastate_t *newstate;
1524
1525  newstate = re_calloc (re_dfastate_t, 1);
1526  if (BE (newstate == NULL, 0))
1527    return NULL;
1528  err = re_node_set_init_copy (&newstate->nodes, nodes);
1529  if (BE (err != REG_NOERROR, 0))
1530    {
1531      re_free (newstate);
1532      return NULL;
1533    }
1534
1535  newstate->entrance_nodes = &newstate->nodes;
1536  for (i = 0 ; i < nodes->nelem ; i++)
1537    {
1538      re_token_t *node = dfa->nodes + nodes->elems[i];
1539      re_token_type_t type = node->type;
1540      if (type == CHARACTER && !node->constraint)
1541	continue;
1542#ifdef RE_ENABLE_I18N
1543      newstate->accept_mb |= node->accept_mb;
1544#endif /* RE_ENABLE_I18N */
1545
1546      /* If the state has the halt node, the state is a halt state.  */
1547      if (type == END_OF_RE)
1548	newstate->halt = 1;
1549      else if (type == OP_BACK_REF)
1550	newstate->has_backref = 1;
1551      else if (type == ANCHOR || node->constraint)
1552	newstate->has_constraint = 1;
1553    }
1554  err = register_state (dfa, newstate, hash);
1555  if (BE (err != REG_NOERROR, 0))
1556    {
1557      free_state (newstate);
1558      newstate = NULL;
1559    }
1560  return newstate;
1561}
1562
1563/* Create the new state which is depend on the context CONTEXT.
1564   Return the new state if succeeded, otherwise return NULL.  */
1565
1566static re_dfastate_t *
1567internal_function
1568create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1569		    unsigned int context, re_hashval_t hash)
1570{
1571  Idx i, nctx_nodes = 0;
1572  reg_errcode_t err;
1573  re_dfastate_t *newstate;
1574
1575  newstate = re_calloc (re_dfastate_t, 1);
1576  if (BE (newstate == NULL, 0))
1577    return NULL;
1578  err = re_node_set_init_copy (&newstate->nodes, nodes);
1579  if (BE (err != REG_NOERROR, 0))
1580    {
1581      re_free (newstate);
1582      return NULL;
1583    }
1584
1585  newstate->context = context;
1586  newstate->entrance_nodes = &newstate->nodes;
1587
1588  for (i = 0 ; i < nodes->nelem ; i++)
1589    {
1590      unsigned int constraint = 0;
1591      re_token_t *node = dfa->nodes + nodes->elems[i];
1592      re_token_type_t type = node->type;
1593      if (node->constraint)
1594	constraint = node->constraint;
1595
1596      if (type == CHARACTER && !constraint)
1597	continue;
1598#ifdef RE_ENABLE_I18N
1599      newstate->accept_mb |= node->accept_mb;
1600#endif /* RE_ENABLE_I18N */
1601
1602      /* If the state has the halt node, the state is a halt state.  */
1603      if (type == END_OF_RE)
1604	newstate->halt = 1;
1605      else if (type == OP_BACK_REF)
1606	newstate->has_backref = 1;
1607      else if (type == ANCHOR)
1608	constraint = node->opr.ctx_type;
1609
1610      if (constraint)
1611	{
1612	  if (newstate->entrance_nodes == &newstate->nodes)
1613	    {
1614	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1615	      if (BE (newstate->entrance_nodes == NULL, 0))
1616		{
1617		  free_state (newstate);
1618		  return NULL;
1619		}
1620	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1621	      nctx_nodes = 0;
1622	      newstate->has_constraint = 1;
1623	    }
1624
1625	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1626	    {
1627	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1628	      ++nctx_nodes;
1629	    }
1630	}
1631    }
1632  err = register_state (dfa, newstate, hash);
1633  if (BE (err != REG_NOERROR, 0))
1634    {
1635      free_state (newstate);
1636      newstate = NULL;
1637    }
1638  return  newstate;
1639}
1640
1641static void
1642internal_function
1643free_state (re_dfastate_t *state)
1644{
1645  re_node_set_free (&state->non_eps_nodes);
1646  re_node_set_free (&state->inveclosure);
1647  if (state->entrance_nodes != &state->nodes)
1648    {
1649      re_node_set_free (state->entrance_nodes);
1650      re_free (state->entrance_nodes);
1651    }
1652  re_node_set_free (&state->nodes);
1653  re_free (state->word_trtable);
1654  re_free (state->trtable);
1655  re_free (state);
1656}
1657