1/* Extended regular expression matching and search library.
2   Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 Free Software
3   Foundation, Inc.
4   This file is part of the GNU C Library.
5   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6
7   This program is free software; you can redistribute it and/or modify
8   it under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 3, or (at your option)
10   any later version.
11
12   This program is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   GNU General Public License for more details.
16
17   You should have received a copy of the GNU General Public License along
18   with this program; if not, write to the Free Software Foundation,
19   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20
21static void re_string_construct_common (const char *str, Idx len,
22					re_string_t *pstr,
23					RE_TRANSLATE_TYPE trans, bool icase,
24					const re_dfa_t *dfa) internal_function;
25static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26					  const re_node_set *nodes,
27					  re_hashval_t hash) internal_function;
28static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29					  const re_node_set *nodes,
30					  unsigned int context,
31					  re_hashval_t hash) internal_function;
32
33/* Functions for string operation.  */
34
35/* This function allocate the buffers.  It is necessary to call
36   re_string_reconstruct before using the object.  */
37
38static reg_errcode_t
39internal_function
40re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
41		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
42{
43  reg_errcode_t ret;
44  Idx init_buf_len;
45
46  /* Ensure at least one character fits into the buffers.  */
47  if (init_len < dfa->mb_cur_max)
48    init_len = dfa->mb_cur_max;
49  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50  re_string_construct_common (str, len, pstr, trans, icase, dfa);
51
52  ret = re_string_realloc_buffers (pstr, init_buf_len);
53  if (BE (ret != REG_NOERROR, 0))
54    return ret;
55
56  pstr->word_char = dfa->word_char;
57  pstr->word_ops_used = dfa->word_ops_used;
58  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60  pstr->valid_raw_len = pstr->valid_len;
61  return REG_NOERROR;
62}
63
64/* This function allocate the buffers, and initialize them.  */
65
66static reg_errcode_t
67internal_function
68re_string_construct (re_string_t *pstr, const char *str, Idx len,
69		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
70{
71  reg_errcode_t ret;
72  memset (pstr, '\0', sizeof (re_string_t));
73  re_string_construct_common (str, len, pstr, trans, icase, dfa);
74
75  if (len > 0)
76    {
77      ret = re_string_realloc_buffers (pstr, len + 1);
78      if (BE (ret != REG_NOERROR, 0))
79	return ret;
80    }
81  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82
83  if (icase)
84    {
85#ifdef RE_ENABLE_I18N
86      if (dfa->mb_cur_max > 1)
87	{
88	  while (1)
89	    {
90	      ret = build_wcs_upper_buffer (pstr);
91	      if (BE (ret != REG_NOERROR, 0))
92		return ret;
93	      if (pstr->valid_raw_len >= len)
94		break;
95	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96		break;
97	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98	      if (BE (ret != REG_NOERROR, 0))
99		return ret;
100	    }
101	}
102      else
103#endif /* RE_ENABLE_I18N  */
104	build_upper_buffer (pstr);
105    }
106  else
107    {
108#ifdef RE_ENABLE_I18N
109      if (dfa->mb_cur_max > 1)
110	build_wcs_buffer (pstr);
111      else
112#endif /* RE_ENABLE_I18N  */
113	{
114	  if (trans != NULL)
115	    re_string_translate_buffer (pstr);
116	  else
117	    {
118	      pstr->valid_len = pstr->bufs_len;
119	      pstr->valid_raw_len = pstr->bufs_len;
120	    }
121	}
122    }
123
124  return REG_NOERROR;
125}
126
127/* Helper functions for re_string_allocate, and re_string_construct.  */
128
129static reg_errcode_t
130internal_function
131re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132{
133#ifdef RE_ENABLE_I18N
134  if (pstr->mb_cur_max > 1)
135    {
136      wint_t *new_wcs;
137
138      /* Avoid overflow.  */
139      size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
140      if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
141	return REG_ESPACE;
142
143      new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144      if (BE (new_wcs == NULL, 0))
145	return REG_ESPACE;
146      pstr->wcs = new_wcs;
147      if (pstr->offsets != NULL)
148	{
149	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
150	  if (BE (new_offsets == NULL, 0))
151	    return REG_ESPACE;
152	  pstr->offsets = new_offsets;
153	}
154    }
155#endif /* RE_ENABLE_I18N  */
156  if (pstr->mbs_allocated)
157    {
158      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159					   new_buf_len);
160      if (BE (new_mbs == NULL, 0))
161	return REG_ESPACE;
162      pstr->mbs = new_mbs;
163    }
164  pstr->bufs_len = new_buf_len;
165  return REG_NOERROR;
166}
167
168
169static void
170internal_function
171re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
172			    RE_TRANSLATE_TYPE trans, bool icase,
173			    const re_dfa_t *dfa)
174{
175  pstr->raw_mbs = (const unsigned char *) str;
176  pstr->len = len;
177  pstr->raw_len = len;
178  pstr->trans = trans;
179  pstr->icase = icase;
180  pstr->mbs_allocated = (trans != NULL || icase);
181  pstr->mb_cur_max = dfa->mb_cur_max;
182  pstr->is_utf8 = dfa->is_utf8;
183  pstr->map_notascii = dfa->map_notascii;
184  pstr->stop = pstr->len;
185  pstr->raw_stop = pstr->stop;
186}
187
188#ifdef RE_ENABLE_I18N
189
190/* Build wide character buffer PSTR->WCS.
191   If the byte sequence of the string are:
192     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193   Then wide character buffer will be:
194     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
195   We use WEOF for padding, they indicate that the position isn't
196   a first byte of a multibyte character.
197
198   Note that this function assumes PSTR->VALID_LEN elements are already
199   built and starts from PSTR->VALID_LEN.  */
200
201static void
202internal_function
203build_wcs_buffer (re_string_t *pstr)
204{
205#ifdef _LIBC
206  unsigned char buf[MB_LEN_MAX];
207  assert (MB_LEN_MAX >= pstr->mb_cur_max);
208#else
209  unsigned char buf[64];
210#endif
211  mbstate_t prev_st;
212  Idx byte_idx, end_idx, remain_len;
213  size_t mbclen;
214
215  /* Build the buffers from pstr->valid_len to either pstr->len or
216     pstr->bufs_len.  */
217  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
219    {
220      wchar_t wc;
221      const char *p;
222
223      remain_len = end_idx - byte_idx;
224      prev_st = pstr->cur_state;
225      /* Apply the translation if we need.  */
226      if (BE (pstr->trans != NULL, 0))
227	{
228	  int i, ch;
229
230	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231	    {
232	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234	    }
235	  p = (const char *) buf;
236	}
237      else
238	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239      mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240      if (BE (mbclen == (size_t) -2, 0))
241	{
242	  /* The buffer doesn't have enough space, finish to build.  */
243	  pstr->cur_state = prev_st;
244	  break;
245	}
246      else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
247	{
248	  /* We treat these cases as a singlebyte character.  */
249	  mbclen = 1;
250	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
251	  if (BE (pstr->trans != NULL, 0))
252	    wc = pstr->trans[wc];
253	  pstr->cur_state = prev_st;
254	}
255
256      /* Write wide character and padding.  */
257      pstr->wcs[byte_idx++] = wc;
258      /* Write paddings.  */
259      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260	pstr->wcs[byte_idx++] = WEOF;
261    }
262  pstr->valid_len = byte_idx;
263  pstr->valid_raw_len = byte_idx;
264}
265
266/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267   but for REG_ICASE.  */
268
269static reg_errcode_t
270internal_function
271build_wcs_upper_buffer (re_string_t *pstr)
272{
273  mbstate_t prev_st;
274  Idx src_idx, byte_idx, end_idx, remain_len;
275  size_t mbclen;
276#ifdef _LIBC
277  char buf[MB_LEN_MAX];
278  assert (MB_LEN_MAX >= pstr->mb_cur_max);
279#else
280  char buf[64];
281#endif
282
283  byte_idx = pstr->valid_len;
284  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285
286  /* The following optimization assumes that ASCII characters can be
287     mapped to wide characters with a simple cast.  */
288  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289    {
290      while (byte_idx < end_idx)
291	{
292	  wchar_t wc;
293
294	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295	      && mbsinit (&pstr->cur_state))
296	    {
297	      /* In case of a singlebyte character.  */
298	      pstr->mbs[byte_idx]
299		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300	      /* The next step uses the assumption that wchar_t is encoded
301		 ASCII-safe: all ASCII values can be converted like this.  */
302	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303	      ++byte_idx;
304	      continue;
305	    }
306
307	  remain_len = end_idx - byte_idx;
308	  prev_st = pstr->cur_state;
309	  mbclen = mbrtowc (&wc,
310			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311			     + byte_idx), remain_len, &pstr->cur_state);
312	  if (BE (mbclen < (size_t) -2, 1))
313	    {
314	      wchar_t wcu = wc;
315	      if (iswlower (wc))
316		{
317		  size_t mbcdlen;
318
319		  wcu = towupper (wc);
320		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
321		  if (BE (mbclen == mbcdlen, 1))
322		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
323		  else
324		    {
325		      src_idx = byte_idx;
326		      goto offsets_needed;
327		    }
328		}
329	      else
330		memcpy (pstr->mbs + byte_idx,
331			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332	      pstr->wcs[byte_idx++] = wcu;
333	      /* Write paddings.  */
334	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335		pstr->wcs[byte_idx++] = WEOF;
336	    }
337	  else if (mbclen == (size_t) -1 || mbclen == 0)
338	    {
339	      /* It is an invalid character or '\0'.  Just use the byte.  */
340	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341	      pstr->mbs[byte_idx] = ch;
342	      /* And also cast it to wide char.  */
343	      pstr->wcs[byte_idx++] = (wchar_t) ch;
344	      if (BE (mbclen == (size_t) -1, 0))
345		pstr->cur_state = prev_st;
346	    }
347	  else
348	    {
349	      /* The buffer doesn't have enough space, finish to build.  */
350	      pstr->cur_state = prev_st;
351	      break;
352	    }
353	}
354      pstr->valid_len = byte_idx;
355      pstr->valid_raw_len = byte_idx;
356      return REG_NOERROR;
357    }
358  else
359    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
360      {
361	wchar_t wc;
362	const char *p;
363      offsets_needed:
364	remain_len = end_idx - byte_idx;
365	prev_st = pstr->cur_state;
366	if (BE (pstr->trans != NULL, 0))
367	  {
368	    int i, ch;
369
370	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
371	      {
372		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373		buf[i] = pstr->trans[ch];
374	      }
375	    p = (const char *) buf;
376	  }
377	else
378	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380	if (BE (mbclen < (size_t) -2, 1))
381	  {
382	    wchar_t wcu = wc;
383	    if (iswlower (wc))
384	      {
385		size_t mbcdlen;
386
387		wcu = towupper (wc);
388		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389		if (BE (mbclen == mbcdlen, 1))
390		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
391		else if (mbcdlen != (size_t) -1)
392		  {
393		    size_t i;
394
395		    if (byte_idx + mbcdlen > pstr->bufs_len)
396		      {
397			pstr->cur_state = prev_st;
398			break;
399		      }
400
401		    if (pstr->offsets == NULL)
402		      {
403			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404
405			if (pstr->offsets == NULL)
406			  return REG_ESPACE;
407		      }
408		    if (!pstr->offsets_needed)
409		      {
410			for (i = 0; i < (size_t) byte_idx; ++i)
411			  pstr->offsets[i] = i;
412			pstr->offsets_needed = 1;
413		      }
414
415		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416		    pstr->wcs[byte_idx] = wcu;
417		    pstr->offsets[byte_idx] = src_idx;
418		    for (i = 1; i < mbcdlen; ++i)
419		      {
420			pstr->offsets[byte_idx + i]
421			  = src_idx + (i < mbclen ? i : mbclen - 1);
422			pstr->wcs[byte_idx + i] = WEOF;
423		      }
424		    pstr->len += mbcdlen - mbclen;
425		    if (pstr->raw_stop > src_idx)
426		      pstr->stop += mbcdlen - mbclen;
427		    end_idx = (pstr->bufs_len > pstr->len)
428			      ? pstr->len : pstr->bufs_len;
429		    byte_idx += mbcdlen;
430		    src_idx += mbclen;
431		    continue;
432		  }
433                else
434                  memcpy (pstr->mbs + byte_idx, p, mbclen);
435	      }
436	    else
437	      memcpy (pstr->mbs + byte_idx, p, mbclen);
438
439	    if (BE (pstr->offsets_needed != 0, 0))
440	      {
441		size_t i;
442		for (i = 0; i < mbclen; ++i)
443		  pstr->offsets[byte_idx + i] = src_idx + i;
444	      }
445	    src_idx += mbclen;
446
447	    pstr->wcs[byte_idx++] = wcu;
448	    /* Write paddings.  */
449	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450	      pstr->wcs[byte_idx++] = WEOF;
451	  }
452	else if (mbclen == (size_t) -1 || mbclen == 0)
453	  {
454	    /* It is an invalid character or '\0'.  Just use the byte.  */
455	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
456
457	    if (BE (pstr->trans != NULL, 0))
458	      ch = pstr->trans [ch];
459	    pstr->mbs[byte_idx] = ch;
460
461	    if (BE (pstr->offsets_needed != 0, 0))
462	      pstr->offsets[byte_idx] = src_idx;
463	    ++src_idx;
464
465	    /* And also cast it to wide char.  */
466	    pstr->wcs[byte_idx++] = (wchar_t) ch;
467	    if (BE (mbclen == (size_t) -1, 0))
468	      pstr->cur_state = prev_st;
469	  }
470	else
471	  {
472	    /* The buffer doesn't have enough space, finish to build.  */
473	    pstr->cur_state = prev_st;
474	    break;
475	  }
476      }
477  pstr->valid_len = byte_idx;
478  pstr->valid_raw_len = src_idx;
479  return REG_NOERROR;
480}
481
482/* Skip characters until the index becomes greater than NEW_RAW_IDX.
483   Return the index.  */
484
485static Idx
486internal_function
487re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
488{
489  mbstate_t prev_st;
490  Idx rawbuf_idx;
491  size_t mbclen;
492  wint_t wc = WEOF;
493
494  /* Skip the characters which are not necessary to check.  */
495  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496       rawbuf_idx < new_raw_idx;)
497    {
498      wchar_t wc2;
499      Idx remain_len;
500      remain_len = pstr->len - rawbuf_idx;
501      prev_st = pstr->cur_state;
502      mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
503			remain_len, &pstr->cur_state);
504      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505	{
506	  /* We treat these cases as a single byte character.  */
507	  if (mbclen == 0 || remain_len == 0)
508	    wc = L'\0';
509	  else
510	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
511	  mbclen = 1;
512	  pstr->cur_state = prev_st;
513	}
514      else
515	wc = wc2;
516      /* Then proceed the next character.  */
517      rawbuf_idx += mbclen;
518    }
519  *last_wc = wc;
520  return rawbuf_idx;
521}
522#endif /* RE_ENABLE_I18N  */
523
524/* Build the buffer PSTR->MBS, and apply the translation if we need.
525   This function is used in case of REG_ICASE.  */
526
527static void
528internal_function
529build_upper_buffer (re_string_t *pstr)
530{
531  Idx char_idx, end_idx;
532  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533
534  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535    {
536      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537      if (BE (pstr->trans != NULL, 0))
538	ch = pstr->trans[ch];
539      if (islower (ch))
540	pstr->mbs[char_idx] = toupper (ch);
541      else
542	pstr->mbs[char_idx] = ch;
543    }
544  pstr->valid_len = char_idx;
545  pstr->valid_raw_len = char_idx;
546}
547
548/* Apply TRANS to the buffer in PSTR.  */
549
550static void
551internal_function
552re_string_translate_buffer (re_string_t *pstr)
553{
554  Idx buf_idx, end_idx;
555  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
556
557  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558    {
559      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
560      pstr->mbs[buf_idx] = pstr->trans[ch];
561    }
562
563  pstr->valid_len = buf_idx;
564  pstr->valid_raw_len = buf_idx;
565}
566
567/* This function re-construct the buffers.
568   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
569   convert to upper case in case of REG_ICASE, apply translation.  */
570
571static reg_errcode_t
572internal_function
573re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
574{
575  Idx offset;
576
577  if (BE (pstr->raw_mbs_idx <= idx, 0))
578    offset = idx - pstr->raw_mbs_idx;
579  else
580    {
581      /* Reset buffer.  */
582#ifdef RE_ENABLE_I18N
583      if (pstr->mb_cur_max > 1)
584	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585#endif /* RE_ENABLE_I18N */
586      pstr->len = pstr->raw_len;
587      pstr->stop = pstr->raw_stop;
588      pstr->valid_len = 0;
589      pstr->raw_mbs_idx = 0;
590      pstr->valid_raw_len = 0;
591      pstr->offsets_needed = 0;
592      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
594      if (!pstr->mbs_allocated)
595	pstr->mbs = (unsigned char *) pstr->raw_mbs;
596      offset = idx;
597    }
598
599  if (BE (offset != 0, 1))
600    {
601      /* Should the already checked characters be kept?  */
602      if (BE (offset < pstr->valid_raw_len, 1))
603	{
604	  /* Yes, move them to the front of the buffer.  */
605#ifdef RE_ENABLE_I18N
606	  if (BE (pstr->offsets_needed, 0))
607	    {
608	      Idx low = 0, high = pstr->valid_len, mid;
609	      do
610		{
611		  mid = (high + low) / 2;
612		  if (pstr->offsets[mid] > offset)
613		    high = mid;
614		  else if (pstr->offsets[mid] < offset)
615		    low = mid + 1;
616		  else
617		    break;
618		}
619	      while (low < high);
620	      if (pstr->offsets[mid] < offset)
621		++mid;
622	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
623							eflags);
624	      /* This can be quite complicated, so handle specially
625		 only the common and easy case where the character with
626		 different length representation of lower and upper
627		 case is present at or after offset.  */
628	      if (pstr->valid_len > offset
629		  && mid == offset && pstr->offsets[mid] == offset)
630		{
631		  memmove (pstr->wcs, pstr->wcs + offset,
632			   (pstr->valid_len - offset) * sizeof (wint_t));
633		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
634		  pstr->valid_len -= offset;
635		  pstr->valid_raw_len -= offset;
636		  for (low = 0; low < pstr->valid_len; low++)
637		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
638		}
639	      else
640		{
641		  /* Otherwise, just find out how long the partial multibyte
642		     character at offset is and fill it with WEOF/255.  */
643		  pstr->len = pstr->raw_len - idx + offset;
644		  pstr->stop = pstr->raw_stop - idx + offset;
645		  pstr->offsets_needed = 0;
646		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
647		    --mid;
648		  while (mid < pstr->valid_len)
649		    if (pstr->wcs[mid] != WEOF)
650		      break;
651		    else
652		      ++mid;
653		  if (mid == pstr->valid_len)
654		    pstr->valid_len = 0;
655		  else
656		    {
657		      pstr->valid_len = pstr->offsets[mid] - offset;
658		      if (pstr->valid_len)
659			{
660			  for (low = 0; low < pstr->valid_len; ++low)
661			    pstr->wcs[low] = WEOF;
662			  memset (pstr->mbs, 255, pstr->valid_len);
663			}
664		    }
665		  pstr->valid_raw_len = pstr->valid_len;
666		}
667	    }
668	  else
669#endif
670	    {
671	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
672							eflags);
673#ifdef RE_ENABLE_I18N
674	      if (pstr->mb_cur_max > 1)
675		memmove (pstr->wcs, pstr->wcs + offset,
676			 (pstr->valid_len - offset) * sizeof (wint_t));
677#endif /* RE_ENABLE_I18N */
678	      if (BE (pstr->mbs_allocated, 0))
679		memmove (pstr->mbs, pstr->mbs + offset,
680			 pstr->valid_len - offset);
681	      pstr->valid_len -= offset;
682	      pstr->valid_raw_len -= offset;
683#if DEBUG
684	      assert (pstr->valid_len > 0);
685#endif
686	    }
687	}
688      else
689	{
690	  /* No, skip all characters until IDX.  */
691	  Idx prev_valid_len = pstr->valid_len;
692
693#ifdef RE_ENABLE_I18N
694	  if (BE (pstr->offsets_needed, 0))
695	    {
696	      pstr->len = pstr->raw_len - idx + offset;
697	      pstr->stop = pstr->raw_stop - idx + offset;
698	      pstr->offsets_needed = 0;
699	    }
700#endif
701	  pstr->valid_len = 0;
702#ifdef RE_ENABLE_I18N
703	  if (pstr->mb_cur_max > 1)
704	    {
705	      Idx wcs_idx;
706	      wint_t wc = WEOF;
707
708	      if (pstr->is_utf8)
709		{
710		  const unsigned char *raw, *p, *end;
711
712		  /* Special case UTF-8.  Multi-byte chars start with any
713		     byte other than 0x80 - 0xbf.  */
714		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
715		  end = raw + (offset - pstr->mb_cur_max);
716		  if (end < pstr->raw_mbs)
717		    end = pstr->raw_mbs;
718		  p = raw + offset - 1;
719#ifdef _LIBC
720		  /* We know the wchar_t encoding is UCS4, so for the simple
721		     case, ASCII characters, skip the conversion step.  */
722		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
723		    {
724		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
725		      /* pstr->valid_len = 0; */
726		      wc = (wchar_t) *p;
727		    }
728		  else
729#endif
730		    for (; p >= end; --p)
731		      if ((*p & 0xc0) != 0x80)
732			{
733			  mbstate_t cur_state;
734			  wchar_t wc2;
735			  Idx mlen = raw + pstr->len - p;
736			  unsigned char buf[6];
737			  size_t mbclen;
738
739			  if (BE (pstr->trans != NULL, 0))
740			    {
741			      int i = mlen < 6 ? mlen : 6;
742			      while (--i >= 0)
743				buf[i] = pstr->trans[p[i]];
744			    }
745			  /* XXX Don't use mbrtowc, we know which conversion
746			     to use (UTF-8 -> UCS4).  */
747			  memset (&cur_state, 0, sizeof (cur_state));
748			  mbclen = mbrtowc (&wc2, (const char *) p, mlen,
749					    &cur_state);
750			  if (raw + offset - p <= mbclen
751			      && mbclen < (size_t) -2)
752			    {
753			      memset (&pstr->cur_state, '\0',
754				      sizeof (mbstate_t));
755			      pstr->valid_len = mbclen - (raw + offset - p);
756			      wc = wc2;
757			    }
758			  break;
759			}
760		}
761
762	      if (wc == WEOF)
763		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
764	      if (wc == WEOF)
765		pstr->tip_context
766		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
767	      else
768		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
769				      && IS_WIDE_WORD_CHAR (wc))
770				     ? CONTEXT_WORD
771				     : ((IS_WIDE_NEWLINE (wc)
772					 && pstr->newline_anchor)
773					? CONTEXT_NEWLINE : 0));
774	      if (BE (pstr->valid_len, 0))
775		{
776		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
777		    pstr->wcs[wcs_idx] = WEOF;
778		  if (pstr->mbs_allocated)
779		    memset (pstr->mbs, 255, pstr->valid_len);
780		}
781	      pstr->valid_raw_len = pstr->valid_len;
782	    }
783	  else
784#endif /* RE_ENABLE_I18N */
785	    {
786	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
787	      pstr->valid_raw_len = 0;
788	      if (pstr->trans)
789		c = pstr->trans[c];
790	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
791				   ? CONTEXT_WORD
792				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
793				      ? CONTEXT_NEWLINE : 0));
794	    }
795	}
796      if (!BE (pstr->mbs_allocated, 0))
797	pstr->mbs += offset;
798    }
799  pstr->raw_mbs_idx = idx;
800  pstr->len -= offset;
801  pstr->stop -= offset;
802
803  /* Then build the buffers.  */
804#ifdef RE_ENABLE_I18N
805  if (pstr->mb_cur_max > 1)
806    {
807      if (pstr->icase)
808	{
809	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
810	  if (BE (ret != REG_NOERROR, 0))
811	    return ret;
812	}
813      else
814	build_wcs_buffer (pstr);
815    }
816  else
817#endif /* RE_ENABLE_I18N */
818    if (BE (pstr->mbs_allocated, 0))
819      {
820	if (pstr->icase)
821	  build_upper_buffer (pstr);
822	else if (pstr->trans != NULL)
823	  re_string_translate_buffer (pstr);
824      }
825    else
826      pstr->valid_len = pstr->len;
827
828  pstr->cur_idx = 0;
829  return REG_NOERROR;
830}
831
832static unsigned char
833internal_function __attribute ((pure))
834re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
835{
836  int ch;
837  Idx off;
838
839  /* Handle the common (easiest) cases first.  */
840  if (BE (!pstr->mbs_allocated, 1))
841    return re_string_peek_byte (pstr, idx);
842
843#ifdef RE_ENABLE_I18N
844  if (pstr->mb_cur_max > 1
845      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
846    return re_string_peek_byte (pstr, idx);
847#endif
848
849  off = pstr->cur_idx + idx;
850#ifdef RE_ENABLE_I18N
851  if (pstr->offsets_needed)
852    off = pstr->offsets[off];
853#endif
854
855  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
856
857#ifdef RE_ENABLE_I18N
858  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
859     this function returns CAPITAL LETTER I instead of first byte of
860     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
861     since peek_byte_case doesn't advance cur_idx in any way.  */
862  if (pstr->offsets_needed && !isascii (ch))
863    return re_string_peek_byte (pstr, idx);
864#endif
865
866  return ch;
867}
868
869static unsigned char
870internal_function __attribute ((pure))
871re_string_fetch_byte_case (re_string_t *pstr)
872{
873  if (BE (!pstr->mbs_allocated, 1))
874    return re_string_fetch_byte (pstr);
875
876#ifdef RE_ENABLE_I18N
877  if (pstr->offsets_needed)
878    {
879      Idx off;
880      int ch;
881
882      /* For tr_TR.UTF-8 [[:islower:]] there is
883	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
884	 in that case the whole multi-byte character and return
885	 the original letter.  On the other side, with
886	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
887	 anything else would complicate things too much.  */
888
889      if (!re_string_first_byte (pstr, pstr->cur_idx))
890	return re_string_fetch_byte (pstr);
891
892      off = pstr->offsets[pstr->cur_idx];
893      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
894
895      if (! isascii (ch))
896	return re_string_fetch_byte (pstr);
897
898      re_string_skip_bytes (pstr,
899			    re_string_char_size_at (pstr, pstr->cur_idx));
900      return ch;
901    }
902#endif
903
904  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
905}
906
907static void
908internal_function
909re_string_destruct (re_string_t *pstr)
910{
911#ifdef RE_ENABLE_I18N
912  re_free (pstr->wcs);
913  re_free (pstr->offsets);
914#endif /* RE_ENABLE_I18N  */
915  if (pstr->mbs_allocated)
916    re_free (pstr->mbs);
917}
918
919/* Return the context at IDX in INPUT.  */
920
921static unsigned int
922internal_function
923re_string_context_at (const re_string_t *input, Idx idx, int eflags)
924{
925  int c;
926  if (BE (! REG_VALID_INDEX (idx), 0))
927    /* In this case, we use the value stored in input->tip_context,
928       since we can't know the character in input->mbs[-1] here.  */
929    return input->tip_context;
930  if (BE (idx == input->len, 0))
931    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
932	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
933#ifdef RE_ENABLE_I18N
934  if (input->mb_cur_max > 1)
935    {
936      wint_t wc;
937      Idx wc_idx = idx;
938      while(input->wcs[wc_idx] == WEOF)
939	{
940#ifdef DEBUG
941	  /* It must not happen.  */
942	  assert (REG_VALID_INDEX (wc_idx));
943#endif
944	  --wc_idx;
945	  if (! REG_VALID_INDEX (wc_idx))
946	    return input->tip_context;
947	}
948      wc = input->wcs[wc_idx];
949      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
950	return CONTEXT_WORD;
951      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
952	      ? CONTEXT_NEWLINE : 0);
953    }
954  else
955#endif
956    {
957      c = re_string_byte_at (input, idx);
958      if (bitset_contain (input->word_char, c))
959	return CONTEXT_WORD;
960      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
961    }
962}
963
964/* Functions for set operation.  */
965
966static reg_errcode_t
967internal_function
968re_node_set_alloc (re_node_set *set, Idx size)
969{
970  set->alloc = size;
971  set->nelem = 0;
972  set->elems = re_malloc (Idx, size);
973  if (BE (set->elems == NULL, 0))
974    return REG_ESPACE;
975  return REG_NOERROR;
976}
977
978static reg_errcode_t
979internal_function
980re_node_set_init_1 (re_node_set *set, Idx elem)
981{
982  set->alloc = 1;
983  set->nelem = 1;
984  set->elems = re_malloc (Idx, 1);
985  if (BE (set->elems == NULL, 0))
986    {
987      set->alloc = set->nelem = 0;
988      return REG_ESPACE;
989    }
990  set->elems[0] = elem;
991  return REG_NOERROR;
992}
993
994static reg_errcode_t
995internal_function
996re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
997{
998  set->alloc = 2;
999  set->elems = re_malloc (Idx, 2);
1000  if (BE (set->elems == NULL, 0))
1001    return REG_ESPACE;
1002  if (elem1 == elem2)
1003    {
1004      set->nelem = 1;
1005      set->elems[0] = elem1;
1006    }
1007  else
1008    {
1009      set->nelem = 2;
1010      if (elem1 < elem2)
1011	{
1012	  set->elems[0] = elem1;
1013	  set->elems[1] = elem2;
1014	}
1015      else
1016	{
1017	  set->elems[0] = elem2;
1018	  set->elems[1] = elem1;
1019	}
1020    }
1021  return REG_NOERROR;
1022}
1023
1024static reg_errcode_t
1025internal_function
1026re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1027{
1028  dest->nelem = src->nelem;
1029  if (src->nelem > 0)
1030    {
1031      dest->alloc = dest->nelem;
1032      dest->elems = re_malloc (Idx, dest->alloc);
1033      if (BE (dest->elems == NULL, 0))
1034	{
1035	  dest->alloc = dest->nelem = 0;
1036	  return REG_ESPACE;
1037	}
1038      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1039    }
1040  else
1041    re_node_set_init_empty (dest);
1042  return REG_NOERROR;
1043}
1044
1045/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1046   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1047   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1048
1049static reg_errcode_t
1050internal_function
1051re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1052			   const re_node_set *src2)
1053{
1054  Idx i1, i2, is, id, delta, sbase;
1055  if (src1->nelem == 0 || src2->nelem == 0)
1056    return REG_NOERROR;
1057
1058  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1059     conservative estimate.  */
1060  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1061    {
1062      Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1063      Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1064      if (BE (new_elems == NULL, 0))
1065        return REG_ESPACE;
1066      dest->elems = new_elems;
1067      dest->alloc = new_alloc;
1068    }
1069
1070  /* Find the items in the intersection of SRC1 and SRC2, and copy
1071     into the top of DEST those that are not already in DEST itself.  */
1072  sbase = dest->nelem + src1->nelem + src2->nelem;
1073  i1 = src1->nelem - 1;
1074  i2 = src2->nelem - 1;
1075  id = dest->nelem - 1;
1076  for (;;)
1077    {
1078      if (src1->elems[i1] == src2->elems[i2])
1079	{
1080	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1081	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1082	    --id;
1083
1084          if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1085            dest->elems[--sbase] = src1->elems[i1];
1086
1087	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1088	    break;
1089	}
1090
1091      /* Lower the highest of the two items.  */
1092      else if (src1->elems[i1] < src2->elems[i2])
1093	{
1094	  if (! REG_VALID_INDEX (--i2))
1095	    break;
1096	}
1097      else
1098	{
1099	  if (! REG_VALID_INDEX (--i1))
1100	    break;
1101	}
1102    }
1103
1104  id = dest->nelem - 1;
1105  is = dest->nelem + src1->nelem + src2->nelem - 1;
1106  delta = is - sbase + 1;
1107
1108  /* Now copy.  When DELTA becomes zero, the remaining
1109     DEST elements are already in place; this is more or
1110     less the same loop that is in re_node_set_merge.  */
1111  dest->nelem += delta;
1112  if (delta > 0 && REG_VALID_INDEX (id))
1113    for (;;)
1114      {
1115        if (dest->elems[is] > dest->elems[id])
1116          {
1117            /* Copy from the top.  */
1118            dest->elems[id + delta--] = dest->elems[is--];
1119            if (delta == 0)
1120              break;
1121          }
1122        else
1123          {
1124            /* Slide from the bottom.  */
1125            dest->elems[id + delta] = dest->elems[id];
1126            if (! REG_VALID_INDEX (--id))
1127              break;
1128          }
1129      }
1130
1131  /* Copy remaining SRC elements.  */
1132  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1133
1134  return REG_NOERROR;
1135}
1136
1137/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1138   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1139
1140static reg_errcode_t
1141internal_function
1142re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1143			const re_node_set *src2)
1144{
1145  Idx i1, i2, id;
1146  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1147    {
1148      dest->alloc = src1->nelem + src2->nelem;
1149      dest->elems = re_malloc (Idx, dest->alloc);
1150      if (BE (dest->elems == NULL, 0))
1151	return REG_ESPACE;
1152    }
1153  else
1154    {
1155      if (src1 != NULL && src1->nelem > 0)
1156	return re_node_set_init_copy (dest, src1);
1157      else if (src2 != NULL && src2->nelem > 0)
1158	return re_node_set_init_copy (dest, src2);
1159      else
1160	re_node_set_init_empty (dest);
1161      return REG_NOERROR;
1162    }
1163  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1164    {
1165      if (src1->elems[i1] > src2->elems[i2])
1166	{
1167	  dest->elems[id++] = src2->elems[i2++];
1168	  continue;
1169	}
1170      if (src1->elems[i1] == src2->elems[i2])
1171	++i2;
1172      dest->elems[id++] = src1->elems[i1++];
1173    }
1174  if (i1 < src1->nelem)
1175    {
1176      memcpy (dest->elems + id, src1->elems + i1,
1177	     (src1->nelem - i1) * sizeof (Idx));
1178      id += src1->nelem - i1;
1179    }
1180  else if (i2 < src2->nelem)
1181    {
1182      memcpy (dest->elems + id, src2->elems + i2,
1183	     (src2->nelem - i2) * sizeof (Idx));
1184      id += src2->nelem - i2;
1185    }
1186  dest->nelem = id;
1187  return REG_NOERROR;
1188}
1189
1190/* Calculate the union set of the sets DEST and SRC. And store it to
1191   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1192
1193static reg_errcode_t
1194internal_function
1195re_node_set_merge (re_node_set *dest, const re_node_set *src)
1196{
1197  Idx is, id, sbase, delta;
1198  if (src == NULL || src->nelem == 0)
1199    return REG_NOERROR;
1200  if (dest->alloc < 2 * src->nelem + dest->nelem)
1201    {
1202      Idx new_alloc = 2 * (src->nelem + dest->alloc);
1203      Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1204      if (BE (new_buffer == NULL, 0))
1205	return REG_ESPACE;
1206      dest->elems = new_buffer;
1207      dest->alloc = new_alloc;
1208    }
1209
1210  if (BE (dest->nelem == 0, 0))
1211    {
1212      dest->nelem = src->nelem;
1213      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1214      return REG_NOERROR;
1215    }
1216
1217  /* Copy into the top of DEST the items of SRC that are not
1218     found in DEST.  Maybe we could binary search in DEST?  */
1219  for (sbase = dest->nelem + 2 * src->nelem,
1220       is = src->nelem - 1, id = dest->nelem - 1;
1221       REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1222    {
1223      if (dest->elems[id] == src->elems[is])
1224        is--, id--;
1225      else if (dest->elems[id] < src->elems[is])
1226        dest->elems[--sbase] = src->elems[is--];
1227      else /* if (dest->elems[id] > src->elems[is]) */
1228        --id;
1229    }
1230
1231  if (REG_VALID_INDEX (is))
1232    {
1233      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1234      sbase -= is + 1;
1235      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1236    }
1237
1238  id = dest->nelem - 1;
1239  is = dest->nelem + 2 * src->nelem - 1;
1240  delta = is - sbase + 1;
1241  if (delta == 0)
1242    return REG_NOERROR;
1243
1244  /* Now copy.  When DELTA becomes zero, the remaining
1245     DEST elements are already in place.  */
1246  dest->nelem += delta;
1247  for (;;)
1248    {
1249      if (dest->elems[is] > dest->elems[id])
1250        {
1251	  /* Copy from the top.  */
1252          dest->elems[id + delta--] = dest->elems[is--];
1253	  if (delta == 0)
1254	    break;
1255	}
1256      else
1257        {
1258          /* Slide from the bottom.  */
1259          dest->elems[id + delta] = dest->elems[id];
1260	  if (! REG_VALID_INDEX (--id))
1261	    {
1262	      /* Copy remaining SRC elements.  */
1263	      memcpy (dest->elems, dest->elems + sbase,
1264	              delta * sizeof (Idx));
1265	      break;
1266	    }
1267	}
1268    }
1269
1270  return REG_NOERROR;
1271}
1272
1273/* Insert the new element ELEM to the re_node_set* SET.
1274   SET should not already have ELEM.
1275   Return true if successful.  */
1276
1277static bool
1278internal_function
1279re_node_set_insert (re_node_set *set, Idx elem)
1280{
1281  Idx idx;
1282  /* In case the set is empty.  */
1283  if (set->alloc == 0)
1284    return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1285
1286  if (BE (set->nelem, 0) == 0)
1287    {
1288      /* We already guaranteed above that set->alloc != 0.  */
1289      set->elems[0] = elem;
1290      ++set->nelem;
1291      return true;
1292    }
1293
1294  /* Realloc if we need.  */
1295  if (set->alloc == set->nelem)
1296    {
1297      Idx *new_elems;
1298      set->alloc = set->alloc * 2;
1299      new_elems = re_realloc (set->elems, Idx, set->alloc);
1300      if (BE (new_elems == NULL, 0))
1301	return false;
1302      set->elems = new_elems;
1303    }
1304
1305  /* Move the elements which follows the new element.  Test the
1306     first element separately to skip a check in the inner loop.  */
1307  if (elem < set->elems[0])
1308    {
1309      idx = 0;
1310      for (idx = set->nelem; idx > 0; idx--)
1311        set->elems[idx] = set->elems[idx - 1];
1312    }
1313  else
1314    {
1315      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1316        set->elems[idx] = set->elems[idx - 1];
1317    }
1318
1319  /* Insert the new element.  */
1320  set->elems[idx] = elem;
1321  ++set->nelem;
1322  return true;
1323}
1324
1325/* Insert the new element ELEM to the re_node_set* SET.
1326   SET should not already have any element greater than or equal to ELEM.
1327   Return true if successful.  */
1328
1329static bool
1330internal_function
1331re_node_set_insert_last (re_node_set *set, Idx elem)
1332{
1333  /* Realloc if we need.  */
1334  if (set->alloc == set->nelem)
1335    {
1336      Idx *new_elems;
1337      set->alloc = (set->alloc + 1) * 2;
1338      new_elems = re_realloc (set->elems, Idx, set->alloc);
1339      if (BE (new_elems == NULL, 0))
1340	return false;
1341      set->elems = new_elems;
1342    }
1343
1344  /* Insert the new element.  */
1345  set->elems[set->nelem++] = elem;
1346  return true;
1347}
1348
1349/* Compare two node sets SET1 and SET2.
1350   Return true if SET1 and SET2 are equivalent.  */
1351
1352static bool
1353internal_function __attribute ((pure))
1354re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1355{
1356  Idx i;
1357  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1358    return false;
1359  for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1360    if (set1->elems[i] != set2->elems[i])
1361      return false;
1362  return true;
1363}
1364
1365/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1366
1367static Idx
1368internal_function __attribute ((pure))
1369re_node_set_contains (const re_node_set *set, Idx elem)
1370{
1371  __re_size_t idx, right, mid;
1372  if (! REG_VALID_NONZERO_INDEX (set->nelem))
1373    return 0;
1374
1375  /* Binary search the element.  */
1376  idx = 0;
1377  right = set->nelem - 1;
1378  while (idx < right)
1379    {
1380      mid = (idx + right) / 2;
1381      if (set->elems[mid] < elem)
1382	idx = mid + 1;
1383      else
1384	right = mid;
1385    }
1386  return set->elems[idx] == elem ? idx + 1 : 0;
1387}
1388
1389static void
1390internal_function
1391re_node_set_remove_at (re_node_set *set, Idx idx)
1392{
1393  if (idx < 0 || idx >= set->nelem)
1394    return;
1395  --set->nelem;
1396  for (; idx < set->nelem; idx++)
1397    set->elems[idx] = set->elems[idx + 1];
1398}
1399
1400
1401/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1402   Or return REG_MISSING if an error occurred.  */
1403
1404static Idx
1405internal_function
1406re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1407{
1408  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1409    {
1410      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1411      Idx *new_nexts, *new_indices;
1412      re_node_set *new_edests, *new_eclosures;
1413      re_token_t *new_nodes;
1414      size_t max_object_size =
1415	MAX (sizeof (re_token_t),
1416	     MAX (sizeof (re_node_set),
1417		  sizeof (Idx)));
1418
1419      /* Avoid overflows.  */
1420      if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1421	return REG_MISSING;
1422
1423      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1424      if (BE (new_nodes == NULL, 0))
1425	return REG_MISSING;
1426      dfa->nodes = new_nodes;
1427      new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1428      new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1429      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1430      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1431      if (BE (new_nexts == NULL || new_indices == NULL
1432	      || new_edests == NULL || new_eclosures == NULL, 0))
1433	return REG_MISSING;
1434      dfa->nexts = new_nexts;
1435      dfa->org_indices = new_indices;
1436      dfa->edests = new_edests;
1437      dfa->eclosures = new_eclosures;
1438      dfa->nodes_alloc = new_nodes_alloc;
1439    }
1440  dfa->nodes[dfa->nodes_len] = token;
1441  dfa->nodes[dfa->nodes_len].constraint = 0;
1442#ifdef RE_ENABLE_I18N
1443  {
1444  int type = token.type;
1445  dfa->nodes[dfa->nodes_len].accept_mb =
1446    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1447  }
1448#endif
1449  dfa->nexts[dfa->nodes_len] = REG_MISSING;
1450  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1451  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1452  return dfa->nodes_len++;
1453}
1454
1455static inline re_hashval_t
1456internal_function
1457calc_state_hash (const re_node_set *nodes, unsigned int context)
1458{
1459  re_hashval_t hash = nodes->nelem + context;
1460  Idx i;
1461  for (i = 0 ; i < nodes->nelem ; i++)
1462    hash += nodes->elems[i];
1463  return hash;
1464}
1465
1466/* Search for the state whose node_set is equivalent to NODES.
1467   Return the pointer to the state, if we found it in the DFA.
1468   Otherwise create the new one and return it.  In case of an error
1469   return NULL and set the error code in ERR.
1470   Note: - We assume NULL as the invalid state, then it is possible that
1471	   return value is NULL and ERR is REG_NOERROR.
1472	 - We never return non-NULL value in case of any errors, it is for
1473	   optimization.  */
1474
1475static re_dfastate_t *
1476internal_function
1477re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478		  const re_node_set *nodes)
1479{
1480  re_hashval_t hash;
1481  re_dfastate_t *new_state;
1482  struct re_state_table_entry *spot;
1483  Idx i;
1484#ifdef lint
1485  /* Suppress bogus uninitialized-variable warnings.  */
1486  *err = REG_NOERROR;
1487#endif
1488  if (BE (nodes->nelem == 0, 0))
1489    {
1490      *err = REG_NOERROR;
1491      return NULL;
1492    }
1493  hash = calc_state_hash (nodes, 0);
1494  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
1496  for (i = 0 ; i < spot->num ; i++)
1497    {
1498      re_dfastate_t *state = spot->array[i];
1499      if (hash != state->hash)
1500	continue;
1501      if (re_node_set_compare (&state->nodes, nodes))
1502	return state;
1503    }
1504
1505  /* There are no appropriate state in the dfa, create the new one.  */
1506  new_state = create_ci_newstate (dfa, nodes, hash);
1507  if (BE (new_state == NULL, 0))
1508    *err = REG_ESPACE;
1509
1510  return new_state;
1511}
1512
1513/* Search for the state whose node_set is equivalent to NODES and
1514   whose context is equivalent to CONTEXT.
1515   Return the pointer to the state, if we found it in the DFA.
1516   Otherwise create the new one and return it.  In case of an error
1517   return NULL and set the error code in ERR.
1518   Note: - We assume NULL as the invalid state, then it is possible that
1519	   return value is NULL and ERR is REG_NOERROR.
1520	 - We never return non-NULL value in case of any errors, it is for
1521	   optimization.  */
1522
1523static re_dfastate_t *
1524internal_function
1525re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526			  const re_node_set *nodes, unsigned int context)
1527{
1528  re_hashval_t hash;
1529  re_dfastate_t *new_state;
1530  struct re_state_table_entry *spot;
1531  Idx i;
1532#ifdef lint
1533  /* Suppress bogus uninitialized-variable warnings.  */
1534  *err = REG_NOERROR;
1535#endif
1536  if (nodes->nelem == 0)
1537    {
1538      *err = REG_NOERROR;
1539      return NULL;
1540    }
1541  hash = calc_state_hash (nodes, context);
1542  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
1544  for (i = 0 ; i < spot->num ; i++)
1545    {
1546      re_dfastate_t *state = spot->array[i];
1547      if (state->hash == hash
1548	  && state->context == context
1549	  && re_node_set_compare (state->entrance_nodes, nodes))
1550	return state;
1551    }
1552  /* There are no appropriate state in `dfa', create the new one.  */
1553  new_state = create_cd_newstate (dfa, nodes, context, hash);
1554  if (BE (new_state == NULL, 0))
1555    *err = REG_ESPACE;
1556
1557  return new_state;
1558}
1559
1560/* Finish initialization of the new state NEWSTATE, and using its hash value
1561   HASH put in the appropriate bucket of DFA's state table.  Return value
1562   indicates the error code if failed.  */
1563
1564static reg_errcode_t
1565register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1566		re_hashval_t hash)
1567{
1568  struct re_state_table_entry *spot;
1569  reg_errcode_t err;
1570  Idx i;
1571
1572  newstate->hash = hash;
1573  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1574  if (BE (err != REG_NOERROR, 0))
1575    return REG_ESPACE;
1576  for (i = 0; i < newstate->nodes.nelem; i++)
1577    {
1578      Idx elem = newstate->nodes.elems[i];
1579      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1580	if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1581	  return REG_ESPACE;
1582    }
1583
1584  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1585  if (BE (spot->alloc <= spot->num, 0))
1586    {
1587      Idx new_alloc = 2 * spot->num + 2;
1588      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1589					      new_alloc);
1590      if (BE (new_array == NULL, 0))
1591	return REG_ESPACE;
1592      spot->array = new_array;
1593      spot->alloc = new_alloc;
1594    }
1595  spot->array[spot->num++] = newstate;
1596  return REG_NOERROR;
1597}
1598
1599static void
1600free_state (re_dfastate_t *state)
1601{
1602  re_node_set_free (&state->non_eps_nodes);
1603  re_node_set_free (&state->inveclosure);
1604  if (state->entrance_nodes != &state->nodes)
1605    {
1606      re_node_set_free (state->entrance_nodes);
1607      re_free (state->entrance_nodes);
1608    }
1609  re_node_set_free (&state->nodes);
1610  re_free (state->word_trtable);
1611  re_free (state->trtable);
1612  re_free (state);
1613}
1614
1615/* Create the new state which is independ of contexts.
1616   Return the new state if succeeded, otherwise return NULL.  */
1617
1618static re_dfastate_t *
1619internal_function
1620create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1621		    re_hashval_t hash)
1622{
1623  Idx i;
1624  reg_errcode_t err;
1625  re_dfastate_t *newstate;
1626
1627  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1628  if (BE (newstate == NULL, 0))
1629    return NULL;
1630  err = re_node_set_init_copy (&newstate->nodes, nodes);
1631  if (BE (err != REG_NOERROR, 0))
1632    {
1633      re_free (newstate);
1634      return NULL;
1635    }
1636
1637  newstate->entrance_nodes = &newstate->nodes;
1638  for (i = 0 ; i < nodes->nelem ; i++)
1639    {
1640      re_token_t *node = dfa->nodes + nodes->elems[i];
1641      re_token_type_t type = node->type;
1642      if (type == CHARACTER && !node->constraint)
1643	continue;
1644#ifdef RE_ENABLE_I18N
1645      newstate->accept_mb |= node->accept_mb;
1646#endif /* RE_ENABLE_I18N */
1647
1648      /* If the state has the halt node, the state is a halt state.  */
1649      if (type == END_OF_RE)
1650	newstate->halt = 1;
1651      else if (type == OP_BACK_REF)
1652	newstate->has_backref = 1;
1653      else if (type == ANCHOR || node->constraint)
1654	newstate->has_constraint = 1;
1655    }
1656  err = register_state (dfa, newstate, hash);
1657  if (BE (err != REG_NOERROR, 0))
1658    {
1659      free_state (newstate);
1660      newstate = NULL;
1661    }
1662  return newstate;
1663}
1664
1665/* Create the new state which is depend on the context CONTEXT.
1666   Return the new state if succeeded, otherwise return NULL.  */
1667
1668static re_dfastate_t *
1669internal_function
1670create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1671		    unsigned int context, re_hashval_t hash)
1672{
1673  Idx i, nctx_nodes = 0;
1674  reg_errcode_t err;
1675  re_dfastate_t *newstate;
1676
1677  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1678  if (BE (newstate == NULL, 0))
1679    return NULL;
1680  err = re_node_set_init_copy (&newstate->nodes, nodes);
1681  if (BE (err != REG_NOERROR, 0))
1682    {
1683      re_free (newstate);
1684      return NULL;
1685    }
1686
1687  newstate->context = context;
1688  newstate->entrance_nodes = &newstate->nodes;
1689
1690  for (i = 0 ; i < nodes->nelem ; i++)
1691    {
1692      unsigned int constraint = 0;
1693      re_token_t *node = dfa->nodes + nodes->elems[i];
1694      re_token_type_t type = node->type;
1695      if (node->constraint)
1696	constraint = node->constraint;
1697
1698      if (type == CHARACTER && !constraint)
1699	continue;
1700#ifdef RE_ENABLE_I18N
1701      newstate->accept_mb |= node->accept_mb;
1702#endif /* RE_ENABLE_I18N */
1703
1704      /* If the state has the halt node, the state is a halt state.  */
1705      if (type == END_OF_RE)
1706	newstate->halt = 1;
1707      else if (type == OP_BACK_REF)
1708	newstate->has_backref = 1;
1709      else if (type == ANCHOR)
1710	constraint = node->opr.ctx_type;
1711
1712      if (constraint)
1713	{
1714	  if (newstate->entrance_nodes == &newstate->nodes)
1715	    {
1716	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1717	      if (BE (newstate->entrance_nodes == NULL, 0))
1718		{
1719		  free_state (newstate);
1720		  return NULL;
1721		}
1722	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1723	      nctx_nodes = 0;
1724	      newstate->has_constraint = 1;
1725	    }
1726
1727	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1728	    {
1729	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1730	      ++nctx_nodes;
1731	    }
1732	}
1733    }
1734  err = register_state (dfa, newstate, hash);
1735  if (BE (err != REG_NOERROR, 0))
1736    {
1737      free_state (newstate);
1738      newstate = NULL;
1739    }
1740  return  newstate;
1741}
1742