1/* Extended regular expression matching and search library.
2   Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, write to the Free
18   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19   02111-1307 USA.  */
20
21static void re_string_construct_common (const char *str, int len,
22					re_string_t *pstr,
23					RE_TRANSLATE_TYPE trans, int icase,
24					const re_dfa_t *dfa) internal_function;
25static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26					  const re_node_set *nodes,
27					  unsigned int hash) internal_function;
28static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29					  const re_node_set *nodes,
30					  unsigned int context,
31					  unsigned int hash) internal_function;
32
33/* Functions for string operation.  */
34
35/* This function allocate the buffers.  It is necessary to call
36   re_string_reconstruct before using the object.  */
37
38static reg_errcode_t
39internal_function
40re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41		    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
42{
43  reg_errcode_t ret;
44  int init_buf_len;
45
46  /* Ensure at least one character fits into the buffers.  */
47  if (init_len < dfa->mb_cur_max)
48    init_len = dfa->mb_cur_max;
49  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50  re_string_construct_common (str, len, pstr, trans, icase, dfa);
51
52  ret = re_string_realloc_buffers (pstr, init_buf_len);
53  if (BE (ret != REG_NOERROR, 0))
54    return ret;
55
56  pstr->word_char = dfa->word_char;
57  pstr->word_ops_used = dfa->word_ops_used;
58  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60  pstr->valid_raw_len = pstr->valid_len;
61  return REG_NOERROR;
62}
63
64/* This function allocate the buffers, and initialize them.  */
65
66static reg_errcode_t
67internal_function
68re_string_construct (re_string_t *pstr, const char *str, int len,
69		     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
70{
71  reg_errcode_t ret;
72  memset (pstr, '\0', sizeof (re_string_t));
73  re_string_construct_common (str, len, pstr, trans, icase, dfa);
74
75  if (len > 0)
76    {
77      ret = re_string_realloc_buffers (pstr, len + 1);
78      if (BE (ret != REG_NOERROR, 0))
79	return ret;
80    }
81  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82
83  if (icase)
84    {
85#ifdef RE_ENABLE_I18N
86      if (dfa->mb_cur_max > 1)
87	{
88	  while (1)
89	    {
90	      ret = build_wcs_upper_buffer (pstr);
91	      if (BE (ret != REG_NOERROR, 0))
92		return ret;
93	      if (pstr->valid_raw_len >= len)
94		break;
95	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96		break;
97	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98	      if (BE (ret != REG_NOERROR, 0))
99		return ret;
100	    }
101	}
102      else
103#endif /* RE_ENABLE_I18N  */
104	build_upper_buffer (pstr);
105    }
106  else
107    {
108#ifdef RE_ENABLE_I18N
109      if (dfa->mb_cur_max > 1)
110	build_wcs_buffer (pstr);
111      else
112#endif /* RE_ENABLE_I18N  */
113	{
114	  if (trans != NULL)
115	    re_string_translate_buffer (pstr);
116	  else
117	    {
118	      pstr->valid_len = pstr->bufs_len;
119	      pstr->valid_raw_len = pstr->bufs_len;
120	    }
121	}
122    }
123
124  return REG_NOERROR;
125}
126
127/* Helper functions for re_string_allocate, and re_string_construct.  */
128
129static reg_errcode_t
130internal_function
131re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
132{
133#ifdef RE_ENABLE_I18N
134  if (pstr->mb_cur_max > 1)
135    {
136      wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
137      if (BE (new_wcs == NULL, 0))
138	return REG_ESPACE;
139      pstr->wcs = new_wcs;
140      if (pstr->offsets != NULL)
141	{
142	  int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
143	  if (BE (new_offsets == NULL, 0))
144	    return REG_ESPACE;
145	  pstr->offsets = new_offsets;
146	}
147    }
148#endif /* RE_ENABLE_I18N  */
149  if (pstr->mbs_allocated)
150    {
151      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152					   new_buf_len);
153      if (BE (new_mbs == NULL, 0))
154	return REG_ESPACE;
155      pstr->mbs = new_mbs;
156    }
157  pstr->bufs_len = new_buf_len;
158  return REG_NOERROR;
159}
160
161
162static void
163internal_function
164re_string_construct_common (const char *str, int len, re_string_t *pstr,
165			    RE_TRANSLATE_TYPE trans, int icase,
166			    const re_dfa_t *dfa)
167{
168  pstr->raw_mbs = (const unsigned char *) str;
169  pstr->len = len;
170  pstr->raw_len = len;
171  pstr->trans = trans;
172  pstr->icase = icase ? 1 : 0;
173  pstr->mbs_allocated = (trans != NULL || icase);
174  pstr->mb_cur_max = dfa->mb_cur_max;
175  pstr->is_utf8 = dfa->is_utf8;
176  pstr->map_notascii = dfa->map_notascii;
177  pstr->stop = pstr->len;
178  pstr->raw_stop = pstr->stop;
179}
180
181#ifdef RE_ENABLE_I18N
182
183/* Build wide character buffer PSTR->WCS.
184   If the byte sequence of the string are:
185     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
186   Then wide character buffer will be:
187     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
188   We use WEOF for padding, they indicate that the position isn't
189   a first byte of a multibyte character.
190
191   Note that this function assumes PSTR->VALID_LEN elements are already
192   built and starts from PSTR->VALID_LEN.  */
193
194static void
195internal_function
196build_wcs_buffer (re_string_t *pstr)
197{
198#ifdef _LIBC
199  unsigned char buf[MB_LEN_MAX];
200  assert (MB_LEN_MAX >= pstr->mb_cur_max);
201#else
202  unsigned char buf[64];
203#endif
204  mbstate_t prev_st;
205  int byte_idx, end_idx, remain_len;
206  size_t mbclen;
207
208  /* Build the buffers from pstr->valid_len to either pstr->len or
209     pstr->bufs_len.  */
210  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
211  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
212    {
213      wchar_t wc;
214      const char *p;
215
216      remain_len = end_idx - byte_idx;
217      prev_st = pstr->cur_state;
218      /* Apply the translation if we need.  */
219      if (BE (pstr->trans != NULL, 0))
220	{
221	  int i, ch;
222
223	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
224	    {
225	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
226	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
227	    }
228	  p = (const char *) buf;
229	}
230      else
231	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
232      mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
233      if (BE (mbclen == (size_t) -2, 0))
234	{
235	  /* The buffer doesn't have enough space, finish to build.  */
236	  pstr->cur_state = prev_st;
237	  break;
238	}
239      else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
240	{
241	  /* We treat these cases as a singlebyte character.  */
242	  mbclen = 1;
243	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
244	  if (BE (pstr->trans != NULL, 0))
245	    wc = pstr->trans[wc];
246	  pstr->cur_state = prev_st;
247	}
248
249      /* Write wide character and padding.  */
250      pstr->wcs[byte_idx++] = wc;
251      /* Write paddings.  */
252      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
253	pstr->wcs[byte_idx++] = WEOF;
254    }
255  pstr->valid_len = byte_idx;
256  pstr->valid_raw_len = byte_idx;
257}
258
259/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
260   but for REG_ICASE.  */
261
262static reg_errcode_t
263internal_function
264build_wcs_upper_buffer (re_string_t *pstr)
265{
266  mbstate_t prev_st;
267  int src_idx, byte_idx, end_idx, remain_len;
268  size_t mbclen;
269#ifdef _LIBC
270  char buf[MB_LEN_MAX];
271  assert (MB_LEN_MAX >= pstr->mb_cur_max);
272#else
273  char buf[64];
274#endif
275
276  byte_idx = pstr->valid_len;
277  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
278
279  /* The following optimization assumes that ASCII characters can be
280     mapped to wide characters with a simple cast.  */
281  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
282    {
283      while (byte_idx < end_idx)
284	{
285	  wchar_t wc;
286
287	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
288	      && mbsinit (&pstr->cur_state))
289	    {
290	      /* In case of a singlebyte character.  */
291	      pstr->mbs[byte_idx]
292		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
293	      /* The next step uses the assumption that wchar_t is encoded
294		 ASCII-safe: all ASCII values can be converted like this.  */
295	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
296	      ++byte_idx;
297	      continue;
298	    }
299
300	  remain_len = end_idx - byte_idx;
301	  prev_st = pstr->cur_state;
302	  mbclen = __mbrtowc (&wc,
303			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
304			       + byte_idx), remain_len, &pstr->cur_state);
305	  if (BE (mbclen + 2 > 2, 1))
306	    {
307	      wchar_t wcu = wc;
308	      if (iswlower (wc))
309		{
310		  size_t mbcdlen;
311
312		  wcu = towupper (wc);
313		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
314		  if (BE (mbclen == mbcdlen, 1))
315		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
316		  else
317		    {
318		      src_idx = byte_idx;
319		      goto offsets_needed;
320		    }
321		}
322	      else
323		memcpy (pstr->mbs + byte_idx,
324			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
325	      pstr->wcs[byte_idx++] = wcu;
326	      /* Write paddings.  */
327	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
328		pstr->wcs[byte_idx++] = WEOF;
329	    }
330	  else if (mbclen == (size_t) -1 || mbclen == 0)
331	    {
332	      /* It is an invalid character or '\0'.  Just use the byte.  */
333	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
334	      pstr->mbs[byte_idx] = ch;
335	      /* And also cast it to wide char.  */
336	      pstr->wcs[byte_idx++] = (wchar_t) ch;
337	      if (BE (mbclen == (size_t) -1, 0))
338		pstr->cur_state = prev_st;
339	    }
340	  else
341	    {
342	      /* The buffer doesn't have enough space, finish to build.  */
343	      pstr->cur_state = prev_st;
344	      break;
345	    }
346	}
347      pstr->valid_len = byte_idx;
348      pstr->valid_raw_len = byte_idx;
349      return REG_NOERROR;
350    }
351  else
352    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
353      {
354	wchar_t wc;
355	const char *p;
356      offsets_needed:
357	remain_len = end_idx - byte_idx;
358	prev_st = pstr->cur_state;
359	if (BE (pstr->trans != NULL, 0))
360	  {
361	    int i, ch;
362
363	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
364	      {
365		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
366		buf[i] = pstr->trans[ch];
367	      }
368	    p = (const char *) buf;
369	  }
370	else
371	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
372	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
373	if (BE (mbclen + 2 > 2, 1))
374	  {
375	    wchar_t wcu = wc;
376	    if (iswlower (wc))
377	      {
378		size_t mbcdlen;
379
380		wcu = towupper (wc);
381		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
382		if (BE (mbclen == mbcdlen, 1))
383		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
384		else if (mbcdlen != (size_t) -1)
385		  {
386		    size_t i;
387
388		    if (byte_idx + mbcdlen > pstr->bufs_len)
389		      {
390			pstr->cur_state = prev_st;
391			break;
392		      }
393
394		    if (pstr->offsets == NULL)
395		      {
396			pstr->offsets = re_malloc (int, pstr->bufs_len);
397
398			if (pstr->offsets == NULL)
399			  return REG_ESPACE;
400		      }
401		    if (!pstr->offsets_needed)
402		      {
403			for (i = 0; i < (size_t) byte_idx; ++i)
404			  pstr->offsets[i] = i;
405			pstr->offsets_needed = 1;
406		      }
407
408		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
409		    pstr->wcs[byte_idx] = wcu;
410		    pstr->offsets[byte_idx] = src_idx;
411		    for (i = 1; i < mbcdlen; ++i)
412		      {
413			pstr->offsets[byte_idx + i]
414			  = src_idx + (i < mbclen ? i : mbclen - 1);
415			pstr->wcs[byte_idx + i] = WEOF;
416		      }
417		    pstr->len += mbcdlen - mbclen;
418		    if (pstr->raw_stop > src_idx)
419		      pstr->stop += mbcdlen - mbclen;
420		    end_idx = (pstr->bufs_len > pstr->len)
421			      ? pstr->len : pstr->bufs_len;
422		    byte_idx += mbcdlen;
423		    src_idx += mbclen;
424		    continue;
425		  }
426                else
427                  memcpy (pstr->mbs + byte_idx, p, mbclen);
428	      }
429	    else
430	      memcpy (pstr->mbs + byte_idx, p, mbclen);
431
432	    if (BE (pstr->offsets_needed != 0, 0))
433	      {
434		size_t i;
435		for (i = 0; i < mbclen; ++i)
436		  pstr->offsets[byte_idx + i] = src_idx + i;
437	      }
438	    src_idx += mbclen;
439
440	    pstr->wcs[byte_idx++] = wcu;
441	    /* Write paddings.  */
442	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
443	      pstr->wcs[byte_idx++] = WEOF;
444	  }
445	else if (mbclen == (size_t) -1 || mbclen == 0)
446	  {
447	    /* It is an invalid character or '\0'.  Just use the byte.  */
448	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
449
450	    if (BE (pstr->trans != NULL, 0))
451	      ch = pstr->trans [ch];
452	    pstr->mbs[byte_idx] = ch;
453
454	    if (BE (pstr->offsets_needed != 0, 0))
455	      pstr->offsets[byte_idx] = src_idx;
456	    ++src_idx;
457
458	    /* And also cast it to wide char.  */
459	    pstr->wcs[byte_idx++] = (wchar_t) ch;
460	    if (BE (mbclen == (size_t) -1, 0))
461	      pstr->cur_state = prev_st;
462	  }
463	else
464	  {
465	    /* The buffer doesn't have enough space, finish to build.  */
466	    pstr->cur_state = prev_st;
467	    break;
468	  }
469      }
470  pstr->valid_len = byte_idx;
471  pstr->valid_raw_len = src_idx;
472  return REG_NOERROR;
473}
474
475/* Skip characters until the index becomes greater than NEW_RAW_IDX.
476   Return the index.  */
477
478static int
479internal_function
480re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
481{
482  mbstate_t prev_st;
483  int rawbuf_idx;
484  size_t mbclen;
485  wchar_t wc = WEOF;
486
487  /* Skip the characters which are not necessary to check.  */
488  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
489       rawbuf_idx < new_raw_idx;)
490    {
491      int remain_len;
492      remain_len = pstr->len - rawbuf_idx;
493      prev_st = pstr->cur_state;
494      mbclen = __mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
495			  remain_len, &pstr->cur_state);
496      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
497	{
498	  /* We treat these cases as a single byte character.  */
499	  if (mbclen == 0 || remain_len == 0)
500	    wc = L'\0';
501	  else
502	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
503	  mbclen = 1;
504	  pstr->cur_state = prev_st;
505	}
506      /* Then proceed the next character.  */
507      rawbuf_idx += mbclen;
508    }
509  *last_wc = (wint_t) wc;
510  return rawbuf_idx;
511}
512#endif /* RE_ENABLE_I18N  */
513
514/* Build the buffer PSTR->MBS, and apply the translation if we need.
515   This function is used in case of REG_ICASE.  */
516
517static void
518internal_function
519build_upper_buffer (re_string_t *pstr)
520{
521  int char_idx, end_idx;
522  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
523
524  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
525    {
526      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
527      if (BE (pstr->trans != NULL, 0))
528	ch = pstr->trans[ch];
529      if (islower (ch))
530	pstr->mbs[char_idx] = toupper (ch);
531      else
532	pstr->mbs[char_idx] = ch;
533    }
534  pstr->valid_len = char_idx;
535  pstr->valid_raw_len = char_idx;
536}
537
538/* Apply TRANS to the buffer in PSTR.  */
539
540static void
541internal_function
542re_string_translate_buffer (re_string_t *pstr)
543{
544  int buf_idx, end_idx;
545  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
546
547  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
548    {
549      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
550      pstr->mbs[buf_idx] = pstr->trans[ch];
551    }
552
553  pstr->valid_len = buf_idx;
554  pstr->valid_raw_len = buf_idx;
555}
556
557/* This function re-construct the buffers.
558   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
559   convert to upper case in case of REG_ICASE, apply translation.  */
560
561static reg_errcode_t
562internal_function
563re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
564{
565  int offset = idx - pstr->raw_mbs_idx;
566  if (BE (offset < 0, 0))
567    {
568      /* Reset buffer.  */
569#ifdef RE_ENABLE_I18N
570      if (pstr->mb_cur_max > 1)
571	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
572#endif /* RE_ENABLE_I18N */
573      pstr->len = pstr->raw_len;
574      pstr->stop = pstr->raw_stop;
575      pstr->valid_len = 0;
576      pstr->raw_mbs_idx = 0;
577      pstr->valid_raw_len = 0;
578      pstr->offsets_needed = 0;
579      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
580			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
581      if (!pstr->mbs_allocated)
582	pstr->mbs = (unsigned char *) pstr->raw_mbs;
583      offset = idx;
584    }
585
586  if (BE (offset != 0, 1))
587    {
588      /* Should the already checked characters be kept?  */
589      if (BE (offset < pstr->valid_raw_len, 1))
590	{
591	  /* Yes, move them to the front of the buffer.  */
592#ifdef RE_ENABLE_I18N
593	  if (BE (pstr->offsets_needed, 0))
594	    {
595	      int low = 0, high = pstr->valid_len, mid;
596	      do
597		{
598		  mid = (high + low) / 2;
599		  if (pstr->offsets[mid] > offset)
600		    high = mid;
601		  else if (pstr->offsets[mid] < offset)
602		    low = mid + 1;
603		  else
604		    break;
605		}
606	      while (low < high);
607	      if (pstr->offsets[mid] < offset)
608		++mid;
609	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
610							eflags);
611	      /* This can be quite complicated, so handle specially
612		 only the common and easy case where the character with
613		 different length representation of lower and upper
614		 case is present at or after offset.  */
615	      if (pstr->valid_len > offset
616		  && mid == offset && pstr->offsets[mid] == offset)
617		{
618		  memmove (pstr->wcs, pstr->wcs + offset,
619			   (pstr->valid_len - offset) * sizeof (wint_t));
620		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
621		  pstr->valid_len -= offset;
622		  pstr->valid_raw_len -= offset;
623		  for (low = 0; low < pstr->valid_len; low++)
624		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
625		}
626	      else
627		{
628		  /* Otherwise, just find out how long the partial multibyte
629		     character at offset is and fill it with WEOF/255.  */
630		  pstr->len = pstr->raw_len - idx + offset;
631		  pstr->stop = pstr->raw_stop - idx + offset;
632		  pstr->offsets_needed = 0;
633		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
634		    --mid;
635		  while (mid < pstr->valid_len)
636		    if (pstr->wcs[mid] != WEOF)
637		      break;
638		    else
639		      ++mid;
640		  if (mid == pstr->valid_len)
641		    pstr->valid_len = 0;
642		  else
643		    {
644		      pstr->valid_len = pstr->offsets[mid] - offset;
645		      if (pstr->valid_len)
646			{
647			  for (low = 0; low < pstr->valid_len; ++low)
648			    pstr->wcs[low] = WEOF;
649			  memset (pstr->mbs, 255, pstr->valid_len);
650			}
651		    }
652		  pstr->valid_raw_len = pstr->valid_len;
653		}
654	    }
655	  else
656#endif
657	    {
658	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
659							eflags);
660#ifdef RE_ENABLE_I18N
661	      if (pstr->mb_cur_max > 1)
662		memmove (pstr->wcs, pstr->wcs + offset,
663			 (pstr->valid_len - offset) * sizeof (wint_t));
664#endif /* RE_ENABLE_I18N */
665	      if (BE (pstr->mbs_allocated, 0))
666		memmove (pstr->mbs, pstr->mbs + offset,
667			 pstr->valid_len - offset);
668	      pstr->valid_len -= offset;
669	      pstr->valid_raw_len -= offset;
670#if DEBUG
671	      assert (pstr->valid_len > 0);
672#endif
673	    }
674	}
675      else
676	{
677	  /* No, skip all characters until IDX.  */
678	  int prev_valid_len = pstr->valid_len;
679
680#ifdef RE_ENABLE_I18N
681	  if (BE (pstr->offsets_needed, 0))
682	    {
683	      pstr->len = pstr->raw_len - idx + offset;
684	      pstr->stop = pstr->raw_stop - idx + offset;
685	      pstr->offsets_needed = 0;
686	    }
687#endif
688	  pstr->valid_len = 0;
689#ifdef RE_ENABLE_I18N
690	  if (pstr->mb_cur_max > 1)
691	    {
692	      int wcs_idx;
693	      wint_t wc = WEOF;
694
695	      if (pstr->is_utf8)
696		{
697		  const unsigned char *raw, *p, *q, *end;
698
699		  /* Special case UTF-8.  Multi-byte chars start with any
700		     byte other than 0x80 - 0xbf.  */
701		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
702		  end = raw + (offset - pstr->mb_cur_max);
703		  if (end < pstr->raw_mbs)
704		    end = pstr->raw_mbs;
705		  p = raw + offset - 1;
706#ifdef _LIBC
707		  /* We know the wchar_t encoding is UCS4, so for the simple
708		     case, ASCII characters, skip the conversion step.  */
709		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
710		    {
711		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
712		      /* pstr->valid_len = 0; */
713		      wc = (wchar_t) *p;
714		    }
715		  else
716#endif
717		    for (; p >= end; --p)
718		      if ((*p & 0xc0) != 0x80)
719			{
720			  mbstate_t cur_state;
721			  wchar_t wc2;
722			  int mlen = raw + pstr->len - p;
723			  unsigned char buf[6];
724			  size_t mbclen;
725
726			  q = p;
727			  if (BE (pstr->trans != NULL, 0))
728			    {
729			      int i = mlen < 6 ? mlen : 6;
730			      while (--i >= 0)
731				buf[i] = pstr->trans[p[i]];
732			      q = buf;
733			    }
734			  /* XXX Don't use mbrtowc, we know which conversion
735			     to use (UTF-8 -> UCS4).  */
736			  memset (&cur_state, 0, sizeof (cur_state));
737			  mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
738					      &cur_state);
739			  if (raw + offset - p <= mbclen
740			      && mbclen < (size_t) -2)
741			    {
742			      memset (&pstr->cur_state, '\0',
743				      sizeof (mbstate_t));
744			      pstr->valid_len = mbclen - (raw + offset - p);
745			      wc = wc2;
746			    }
747			  break;
748			}
749		}
750
751	      if (wc == WEOF)
752		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
753	      if (wc == WEOF)
754		pstr->tip_context
755		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
756	      else
757		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
758				      && IS_WIDE_WORD_CHAR (wc))
759				     ? CONTEXT_WORD
760				     : ((IS_WIDE_NEWLINE (wc)
761					 && pstr->newline_anchor)
762					? CONTEXT_NEWLINE : 0));
763	      if (BE (pstr->valid_len, 0))
764		{
765		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
766		    pstr->wcs[wcs_idx] = WEOF;
767		  if (pstr->mbs_allocated)
768		    memset (pstr->mbs, 255, pstr->valid_len);
769		}
770	      pstr->valid_raw_len = pstr->valid_len;
771	    }
772	  else
773#endif /* RE_ENABLE_I18N */
774	    {
775	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
776	      pstr->valid_raw_len = 0;
777	      if (pstr->trans)
778		c = pstr->trans[c];
779	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
780				   ? CONTEXT_WORD
781				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
782				      ? CONTEXT_NEWLINE : 0));
783	    }
784	}
785      if (!BE (pstr->mbs_allocated, 0))
786	pstr->mbs += offset;
787    }
788  pstr->raw_mbs_idx = idx;
789  pstr->len -= offset;
790  pstr->stop -= offset;
791
792  /* Then build the buffers.  */
793#ifdef RE_ENABLE_I18N
794  if (pstr->mb_cur_max > 1)
795    {
796      if (pstr->icase)
797	{
798	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
799	  if (BE (ret != REG_NOERROR, 0))
800	    return ret;
801	}
802      else
803	build_wcs_buffer (pstr);
804    }
805  else
806#endif /* RE_ENABLE_I18N */
807    if (BE (pstr->mbs_allocated, 0))
808      {
809	if (pstr->icase)
810	  build_upper_buffer (pstr);
811	else if (pstr->trans != NULL)
812	  re_string_translate_buffer (pstr);
813      }
814    else
815      pstr->valid_len = pstr->len;
816
817  pstr->cur_idx = 0;
818  return REG_NOERROR;
819}
820
821static unsigned char
822internal_function __attribute ((pure))
823re_string_peek_byte_case (const re_string_t *pstr, int idx)
824{
825  int ch, off;
826
827  /* Handle the common (easiest) cases first.  */
828  if (BE (!pstr->mbs_allocated, 1))
829    return re_string_peek_byte (pstr, idx);
830
831#ifdef RE_ENABLE_I18N
832  if (pstr->mb_cur_max > 1
833      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
834    return re_string_peek_byte (pstr, idx);
835#endif
836
837  off = pstr->cur_idx + idx;
838#ifdef RE_ENABLE_I18N
839  if (pstr->offsets_needed)
840    off = pstr->offsets[off];
841#endif
842
843  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
844
845#ifdef RE_ENABLE_I18N
846  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
847     this function returns CAPITAL LETTER I instead of first byte of
848     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
849     since peek_byte_case doesn't advance cur_idx in any way.  */
850  if (pstr->offsets_needed && !isascii (ch))
851    return re_string_peek_byte (pstr, idx);
852#endif
853
854  return ch;
855}
856
857static unsigned char
858internal_function __attribute ((pure))
859re_string_fetch_byte_case (re_string_t *pstr)
860{
861  if (BE (!pstr->mbs_allocated, 1))
862    return re_string_fetch_byte (pstr);
863
864#ifdef RE_ENABLE_I18N
865  if (pstr->offsets_needed)
866    {
867      int off, ch;
868
869      /* For tr_TR.UTF-8 [[:islower:]] there is
870	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
871	 in that case the whole multi-byte character and return
872	 the original letter.  On the other side, with
873	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
874	 anything else would complicate things too much.  */
875
876      if (!re_string_first_byte (pstr, pstr->cur_idx))
877	return re_string_fetch_byte (pstr);
878
879      off = pstr->offsets[pstr->cur_idx];
880      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
881
882      if (! isascii (ch))
883	return re_string_fetch_byte (pstr);
884
885      re_string_skip_bytes (pstr,
886			    re_string_char_size_at (pstr, pstr->cur_idx));
887      return ch;
888    }
889#endif
890
891  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
892}
893
894static void
895internal_function
896re_string_destruct (re_string_t *pstr)
897{
898#ifdef RE_ENABLE_I18N
899  re_free (pstr->wcs);
900  re_free (pstr->offsets);
901#endif /* RE_ENABLE_I18N  */
902  if (pstr->mbs_allocated)
903    re_free (pstr->mbs);
904}
905
906/* Return the context at IDX in INPUT.  */
907
908static unsigned int
909internal_function
910re_string_context_at (const re_string_t *input, int idx, int eflags)
911{
912  int c;
913  if (BE (idx < 0, 0))
914    /* In this case, we use the value stored in input->tip_context,
915       since we can't know the character in input->mbs[-1] here.  */
916    return input->tip_context;
917  if (BE (idx == input->len, 0))
918    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
919	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
920#ifdef RE_ENABLE_I18N
921  if (input->mb_cur_max > 1)
922    {
923      wint_t wc;
924      int wc_idx = idx;
925      while(input->wcs[wc_idx] == WEOF)
926	{
927#ifdef DEBUG
928	  /* It must not happen.  */
929	  assert (wc_idx >= 0);
930#endif
931	  --wc_idx;
932	  if (wc_idx < 0)
933	    return input->tip_context;
934	}
935      wc = input->wcs[wc_idx];
936      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
937	return CONTEXT_WORD;
938      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
939	      ? CONTEXT_NEWLINE : 0);
940    }
941  else
942#endif
943    {
944      c = re_string_byte_at (input, idx);
945      if (bitset_contain (input->word_char, c))
946	return CONTEXT_WORD;
947      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
948    }
949}
950
951/* Functions for set operation.  */
952
953static reg_errcode_t
954internal_function
955re_node_set_alloc (re_node_set *set, int size)
956{
957  set->alloc = size;
958  set->nelem = 0;
959  set->elems = re_malloc (int, size);
960  if (BE (set->elems == NULL, 0))
961    return REG_ESPACE;
962  return REG_NOERROR;
963}
964
965static reg_errcode_t
966internal_function
967re_node_set_init_1 (re_node_set *set, int elem)
968{
969  set->alloc = 1;
970  set->nelem = 1;
971  set->elems = re_malloc (int, 1);
972  if (BE (set->elems == NULL, 0))
973    {
974      set->alloc = set->nelem = 0;
975      return REG_ESPACE;
976    }
977  set->elems[0] = elem;
978  return REG_NOERROR;
979}
980
981static reg_errcode_t
982internal_function
983re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
984{
985  set->alloc = 2;
986  set->elems = re_malloc (int, 2);
987  if (BE (set->elems == NULL, 0))
988    return REG_ESPACE;
989  if (elem1 == elem2)
990    {
991      set->nelem = 1;
992      set->elems[0] = elem1;
993    }
994  else
995    {
996      set->nelem = 2;
997      if (elem1 < elem2)
998	{
999	  set->elems[0] = elem1;
1000	  set->elems[1] = elem2;
1001	}
1002      else
1003	{
1004	  set->elems[0] = elem2;
1005	  set->elems[1] = elem1;
1006	}
1007    }
1008  return REG_NOERROR;
1009}
1010
1011static reg_errcode_t
1012internal_function
1013re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1014{
1015  dest->nelem = src->nelem;
1016  if (src->nelem > 0)
1017    {
1018      dest->alloc = dest->nelem;
1019      dest->elems = re_malloc (int, dest->alloc);
1020      if (BE (dest->elems == NULL, 0))
1021	{
1022	  dest->alloc = dest->nelem = 0;
1023	  return REG_ESPACE;
1024	}
1025      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1026    }
1027  else
1028    re_node_set_init_empty (dest);
1029  return REG_NOERROR;
1030}
1031
1032/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1033   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1034   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1035
1036static reg_errcode_t
1037internal_function
1038re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1039			   const re_node_set *src2)
1040{
1041  int i1, i2, is, id, delta, sbase;
1042  if (src1->nelem == 0 || src2->nelem == 0)
1043    return REG_NOERROR;
1044
1045  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1046     conservative estimate.  */
1047  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1048    {
1049      int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1050      int *new_elems = re_realloc (dest->elems, int, new_alloc);
1051      if (BE (new_elems == NULL, 0))
1052        return REG_ESPACE;
1053      dest->elems = new_elems;
1054      dest->alloc = new_alloc;
1055    }
1056
1057  /* Find the items in the intersection of SRC1 and SRC2, and copy
1058     into the top of DEST those that are not already in DEST itself.  */
1059  sbase = dest->nelem + src1->nelem + src2->nelem;
1060  i1 = src1->nelem - 1;
1061  i2 = src2->nelem - 1;
1062  id = dest->nelem - 1;
1063  for (;;)
1064    {
1065      if (src1->elems[i1] == src2->elems[i2])
1066	{
1067	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1068	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1069	    --id;
1070
1071          if (id < 0 || dest->elems[id] != src1->elems[i1])
1072            dest->elems[--sbase] = src1->elems[i1];
1073
1074	  if (--i1 < 0 || --i2 < 0)
1075	    break;
1076	}
1077
1078      /* Lower the highest of the two items.  */
1079      else if (src1->elems[i1] < src2->elems[i2])
1080	{
1081	  if (--i2 < 0)
1082	    break;
1083	}
1084      else
1085	{
1086	  if (--i1 < 0)
1087	    break;
1088	}
1089    }
1090
1091  id = dest->nelem - 1;
1092  is = dest->nelem + src1->nelem + src2->nelem - 1;
1093  delta = is - sbase + 1;
1094
1095  /* Now copy.  When DELTA becomes zero, the remaining
1096     DEST elements are already in place; this is more or
1097     less the same loop that is in re_node_set_merge.  */
1098  dest->nelem += delta;
1099  if (delta > 0 && id >= 0)
1100    for (;;)
1101      {
1102        if (dest->elems[is] > dest->elems[id])
1103          {
1104            /* Copy from the top.  */
1105            dest->elems[id + delta--] = dest->elems[is--];
1106            if (delta == 0)
1107              break;
1108          }
1109        else
1110          {
1111            /* Slide from the bottom.  */
1112            dest->elems[id + delta] = dest->elems[id];
1113            if (--id < 0)
1114              break;
1115          }
1116      }
1117
1118  /* Copy remaining SRC elements.  */
1119  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1120
1121  return REG_NOERROR;
1122}
1123
1124/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1125   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1126
1127static reg_errcode_t
1128internal_function
1129re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1130			const re_node_set *src2)
1131{
1132  int i1, i2, id;
1133  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1134    {
1135      dest->alloc = src1->nelem + src2->nelem;
1136      dest->elems = re_malloc (int, dest->alloc);
1137      if (BE (dest->elems == NULL, 0))
1138	return REG_ESPACE;
1139    }
1140  else
1141    {
1142      if (src1 != NULL && src1->nelem > 0)
1143	return re_node_set_init_copy (dest, src1);
1144      else if (src2 != NULL && src2->nelem > 0)
1145	return re_node_set_init_copy (dest, src2);
1146      else
1147	re_node_set_init_empty (dest);
1148      return REG_NOERROR;
1149    }
1150  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1151    {
1152      if (src1->elems[i1] > src2->elems[i2])
1153	{
1154	  dest->elems[id++] = src2->elems[i2++];
1155	  continue;
1156	}
1157      if (src1->elems[i1] == src2->elems[i2])
1158	++i2;
1159      dest->elems[id++] = src1->elems[i1++];
1160    }
1161  if (i1 < src1->nelem)
1162    {
1163      memcpy (dest->elems + id, src1->elems + i1,
1164	     (src1->nelem - i1) * sizeof (int));
1165      id += src1->nelem - i1;
1166    }
1167  else if (i2 < src2->nelem)
1168    {
1169      memcpy (dest->elems + id, src2->elems + i2,
1170	     (src2->nelem - i2) * sizeof (int));
1171      id += src2->nelem - i2;
1172    }
1173  dest->nelem = id;
1174  return REG_NOERROR;
1175}
1176
1177/* Calculate the union set of the sets DEST and SRC. And store it to
1178   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1179
1180static reg_errcode_t
1181internal_function
1182re_node_set_merge (re_node_set *dest, const re_node_set *src)
1183{
1184  int is, id, sbase, delta;
1185  if (src == NULL || src->nelem == 0)
1186    return REG_NOERROR;
1187  if (dest->alloc < 2 * src->nelem + dest->nelem)
1188    {
1189      int new_alloc = 2 * (src->nelem + dest->alloc);
1190      int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1191      if (BE (new_buffer == NULL, 0))
1192	return REG_ESPACE;
1193      dest->elems = new_buffer;
1194      dest->alloc = new_alloc;
1195    }
1196
1197  if (BE (dest->nelem == 0, 0))
1198    {
1199      dest->nelem = src->nelem;
1200      memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1201      return REG_NOERROR;
1202    }
1203
1204  /* Copy into the top of DEST the items of SRC that are not
1205     found in DEST.  Maybe we could binary search in DEST?  */
1206  for (sbase = dest->nelem + 2 * src->nelem,
1207       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1208    {
1209      if (dest->elems[id] == src->elems[is])
1210        is--, id--;
1211      else if (dest->elems[id] < src->elems[is])
1212        dest->elems[--sbase] = src->elems[is--];
1213      else /* if (dest->elems[id] > src->elems[is]) */
1214        --id;
1215    }
1216
1217  if (is >= 0)
1218    {
1219      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1220      sbase -= is + 1;
1221      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1222    }
1223
1224  id = dest->nelem - 1;
1225  is = dest->nelem + 2 * src->nelem - 1;
1226  delta = is - sbase + 1;
1227  if (delta == 0)
1228    return REG_NOERROR;
1229
1230  /* Now copy.  When DELTA becomes zero, the remaining
1231     DEST elements are already in place.  */
1232  dest->nelem += delta;
1233  for (;;)
1234    {
1235      if (dest->elems[is] > dest->elems[id])
1236        {
1237	  /* Copy from the top.  */
1238          dest->elems[id + delta--] = dest->elems[is--];
1239	  if (delta == 0)
1240	    break;
1241	}
1242      else
1243        {
1244          /* Slide from the bottom.  */
1245          dest->elems[id + delta] = dest->elems[id];
1246	  if (--id < 0)
1247	    {
1248	      /* Copy remaining SRC elements.  */
1249	      memcpy (dest->elems, dest->elems + sbase,
1250	              delta * sizeof (int));
1251	      break;
1252	    }
1253	}
1254    }
1255
1256  return REG_NOERROR;
1257}
1258
1259/* Insert the new element ELEM to the re_node_set* SET.
1260   SET should not already have ELEM.
1261   return -1 if an error is occured, return 1 otherwise.  */
1262
1263static int
1264internal_function
1265re_node_set_insert (re_node_set *set, int elem)
1266{
1267  int idx;
1268  /* In case the set is empty.  */
1269  if (set->alloc == 0)
1270    {
1271      if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1272	return 1;
1273      else
1274	return -1;
1275    }
1276
1277  if (BE (set->nelem, 0) == 0)
1278    {
1279      /* We already guaranteed above that set->alloc != 0.  */
1280      set->elems[0] = elem;
1281      ++set->nelem;
1282      return 1;
1283    }
1284
1285  /* Realloc if we need.  */
1286  if (set->alloc == set->nelem)
1287    {
1288      int *new_elems;
1289      set->alloc = set->alloc * 2;
1290      new_elems = re_realloc (set->elems, int, set->alloc);
1291      if (BE (new_elems == NULL, 0))
1292	return -1;
1293      set->elems = new_elems;
1294    }
1295
1296  /* Move the elements which follows the new element.  Test the
1297     first element separately to skip a check in the inner loop.  */
1298  if (elem < set->elems[0])
1299    {
1300      idx = 0;
1301      for (idx = set->nelem; idx > 0; idx--)
1302        set->elems[idx] = set->elems[idx - 1];
1303    }
1304  else
1305    {
1306      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1307        set->elems[idx] = set->elems[idx - 1];
1308    }
1309
1310  /* Insert the new element.  */
1311  set->elems[idx] = elem;
1312  ++set->nelem;
1313  return 1;
1314}
1315
1316/* Insert the new element ELEM to the re_node_set* SET.
1317   SET should not already have any element greater than or equal to ELEM.
1318   Return -1 if an error is occured, return 1 otherwise.  */
1319
1320static int
1321internal_function
1322re_node_set_insert_last (re_node_set *set, int elem)
1323{
1324  /* Realloc if we need.  */
1325  if (set->alloc == set->nelem)
1326    {
1327      int *new_elems;
1328      set->alloc = (set->alloc + 1) * 2;
1329      new_elems = re_realloc (set->elems, int, set->alloc);
1330      if (BE (new_elems == NULL, 0))
1331	return -1;
1332      set->elems = new_elems;
1333    }
1334
1335  /* Insert the new element.  */
1336  set->elems[set->nelem++] = elem;
1337  return 1;
1338}
1339
1340/* Compare two node sets SET1 and SET2.
1341   return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1342
1343static int
1344internal_function __attribute ((pure))
1345re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1346{
1347  int i;
1348  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1349    return 0;
1350  for (i = set1->nelem ; --i >= 0 ; )
1351    if (set1->elems[i] != set2->elems[i])
1352      return 0;
1353  return 1;
1354}
1355
1356/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1357
1358static int
1359internal_function __attribute ((pure))
1360re_node_set_contains (const re_node_set *set, int elem)
1361{
1362  unsigned int idx, right, mid;
1363  if (set->nelem <= 0)
1364    return 0;
1365
1366  /* Binary search the element.  */
1367  idx = 0;
1368  right = set->nelem - 1;
1369  while (idx < right)
1370    {
1371      mid = (idx + right) / 2;
1372      if (set->elems[mid] < elem)
1373	idx = mid + 1;
1374      else
1375	right = mid;
1376    }
1377  return set->elems[idx] == elem ? idx + 1 : 0;
1378}
1379
1380static void
1381internal_function
1382re_node_set_remove_at (re_node_set *set, int idx)
1383{
1384  if (idx < 0 || idx >= set->nelem)
1385    return;
1386  --set->nelem;
1387  for (; idx < set->nelem; idx++)
1388    set->elems[idx] = set->elems[idx + 1];
1389}
1390
1391
1392/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1393   Or return -1, if an error will be occured.  */
1394
1395static int
1396internal_function
1397re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1398{
1399  int type = token.type;
1400  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1401    {
1402      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1403      int *new_nexts, *new_indices;
1404      re_node_set *new_edests, *new_eclosures;
1405      re_token_t *new_nodes;
1406
1407      /* Avoid overflows.  */
1408      if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1409	return -1;
1410
1411      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1412      if (BE (new_nodes == NULL, 0))
1413	return -1;
1414      dfa->nodes = new_nodes;
1415      new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1416      new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1417      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1418      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1419      if (BE (new_nexts == NULL || new_indices == NULL
1420	      || new_edests == NULL || new_eclosures == NULL, 0))
1421	return -1;
1422      dfa->nexts = new_nexts;
1423      dfa->org_indices = new_indices;
1424      dfa->edests = new_edests;
1425      dfa->eclosures = new_eclosures;
1426      dfa->nodes_alloc = new_nodes_alloc;
1427    }
1428  dfa->nodes[dfa->nodes_len] = token;
1429  dfa->nodes[dfa->nodes_len].constraint = 0;
1430#ifdef RE_ENABLE_I18N
1431  dfa->nodes[dfa->nodes_len].accept_mb =
1432    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1433#endif
1434  dfa->nexts[dfa->nodes_len] = -1;
1435  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1436  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1437  return dfa->nodes_len++;
1438}
1439
1440static inline unsigned int
1441internal_function
1442calc_state_hash (const re_node_set *nodes, unsigned int context)
1443{
1444  unsigned int hash = nodes->nelem + context;
1445  int i;
1446  for (i = 0 ; i < nodes->nelem ; i++)
1447    hash += nodes->elems[i];
1448  return hash;
1449}
1450
1451/* Search for the state whose node_set is equivalent to NODES.
1452   Return the pointer to the state, if we found it in the DFA.
1453   Otherwise create the new one and return it.  In case of an error
1454   return NULL and set the error code in ERR.
1455   Note: - We assume NULL as the invalid state, then it is possible that
1456	   return value is NULL and ERR is REG_NOERROR.
1457	 - We never return non-NULL value in case of any errors, it is for
1458	   optimization.  */
1459
1460static re_dfastate_t *
1461internal_function
1462re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1463		  const re_node_set *nodes)
1464{
1465  unsigned int hash;
1466  re_dfastate_t *new_state;
1467  struct re_state_table_entry *spot;
1468  int i;
1469  if (BE (nodes->nelem == 0, 0))
1470    {
1471      *err = REG_NOERROR;
1472      return NULL;
1473    }
1474  hash = calc_state_hash (nodes, 0);
1475  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1476
1477  for (i = 0 ; i < spot->num ; i++)
1478    {
1479      re_dfastate_t *state = spot->array[i];
1480      if (hash != state->hash)
1481	continue;
1482      if (re_node_set_compare (&state->nodes, nodes))
1483	return state;
1484    }
1485
1486  /* There are no appropriate state in the dfa, create the new one.  */
1487  new_state = create_ci_newstate (dfa, nodes, hash);
1488  if (BE (new_state == NULL, 0))
1489    *err = REG_ESPACE;
1490
1491  return new_state;
1492}
1493
1494/* Search for the state whose node_set is equivalent to NODES and
1495   whose context is equivalent to CONTEXT.
1496   Return the pointer to the state, if we found it in the DFA.
1497   Otherwise create the new one and return it.  In case of an error
1498   return NULL and set the error code in ERR.
1499   Note: - We assume NULL as the invalid state, then it is possible that
1500	   return value is NULL and ERR is REG_NOERROR.
1501	 - We never return non-NULL value in case of any errors, it is for
1502	   optimization.  */
1503
1504static re_dfastate_t *
1505internal_function
1506re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1507			  const re_node_set *nodes, unsigned int context)
1508{
1509  unsigned int hash;
1510  re_dfastate_t *new_state;
1511  struct re_state_table_entry *spot;
1512  int i;
1513  if (nodes->nelem == 0)
1514    {
1515      *err = REG_NOERROR;
1516      return NULL;
1517    }
1518  hash = calc_state_hash (nodes, context);
1519  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1520
1521  for (i = 0 ; i < spot->num ; i++)
1522    {
1523      re_dfastate_t *state = spot->array[i];
1524      if (state->hash == hash
1525	  && state->context == context
1526	  && re_node_set_compare (state->entrance_nodes, nodes))
1527	return state;
1528    }
1529  /* There are no appropriate state in `dfa', create the new one.  */
1530  new_state = create_cd_newstate (dfa, nodes, context, hash);
1531  if (BE (new_state == NULL, 0))
1532    *err = REG_ESPACE;
1533
1534  return new_state;
1535}
1536
1537/* Finish initialization of the new state NEWSTATE, and using its hash value
1538   HASH put in the appropriate bucket of DFA's state table.  Return value
1539   indicates the error code if failed.  */
1540
1541static reg_errcode_t
1542register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1543		unsigned int hash)
1544{
1545  struct re_state_table_entry *spot;
1546  reg_errcode_t err;
1547  int i;
1548
1549  newstate->hash = hash;
1550  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1551  if (BE (err != REG_NOERROR, 0))
1552    return REG_ESPACE;
1553  for (i = 0; i < newstate->nodes.nelem; i++)
1554    {
1555      int elem = newstate->nodes.elems[i];
1556      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1557        re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1558    }
1559
1560  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1561  if (BE (spot->alloc <= spot->num, 0))
1562    {
1563      int new_alloc = 2 * spot->num + 2;
1564      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1565					      new_alloc);
1566      if (BE (new_array == NULL, 0))
1567	return REG_ESPACE;
1568      spot->array = new_array;
1569      spot->alloc = new_alloc;
1570    }
1571  spot->array[spot->num++] = newstate;
1572  return REG_NOERROR;
1573}
1574
1575static void
1576free_state (re_dfastate_t *state)
1577{
1578  re_node_set_free (&state->non_eps_nodes);
1579  re_node_set_free (&state->inveclosure);
1580  if (state->entrance_nodes != &state->nodes)
1581    {
1582      re_node_set_free (state->entrance_nodes);
1583      re_free (state->entrance_nodes);
1584    }
1585  re_node_set_free (&state->nodes);
1586  re_free (state->word_trtable);
1587  re_free (state->trtable);
1588  re_free (state);
1589}
1590
1591/* Create the new state which is independ of contexts.
1592   Return the new state if succeeded, otherwise return NULL.  */
1593
1594static re_dfastate_t *
1595internal_function
1596create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1597		    unsigned int hash)
1598{
1599  int i;
1600  reg_errcode_t err;
1601  re_dfastate_t *newstate;
1602
1603  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1604  if (BE (newstate == NULL, 0))
1605    return NULL;
1606  err = re_node_set_init_copy (&newstate->nodes, nodes);
1607  if (BE (err != REG_NOERROR, 0))
1608    {
1609      re_free (newstate);
1610      return NULL;
1611    }
1612
1613  newstate->entrance_nodes = &newstate->nodes;
1614  for (i = 0 ; i < nodes->nelem ; i++)
1615    {
1616      re_token_t *node = dfa->nodes + nodes->elems[i];
1617      re_token_type_t type = node->type;
1618      if (type == CHARACTER && !node->constraint)
1619	continue;
1620#ifdef RE_ENABLE_I18N
1621      newstate->accept_mb |= node->accept_mb;
1622#endif /* RE_ENABLE_I18N */
1623
1624      /* If the state has the halt node, the state is a halt state.  */
1625      if (type == END_OF_RE)
1626	newstate->halt = 1;
1627      else if (type == OP_BACK_REF)
1628	newstate->has_backref = 1;
1629      else if (type == ANCHOR || node->constraint)
1630	newstate->has_constraint = 1;
1631    }
1632  err = register_state (dfa, newstate, hash);
1633  if (BE (err != REG_NOERROR, 0))
1634    {
1635      free_state (newstate);
1636      newstate = NULL;
1637    }
1638  return newstate;
1639}
1640
1641/* Create the new state which is depend on the context CONTEXT.
1642   Return the new state if succeeded, otherwise return NULL.  */
1643
1644static re_dfastate_t *
1645internal_function
1646create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1647		    unsigned int context, unsigned int hash)
1648{
1649  int i, nctx_nodes = 0;
1650  reg_errcode_t err;
1651  re_dfastate_t *newstate;
1652
1653  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1654  if (BE (newstate == NULL, 0))
1655    return NULL;
1656  err = re_node_set_init_copy (&newstate->nodes, nodes);
1657  if (BE (err != REG_NOERROR, 0))
1658    {
1659      re_free (newstate);
1660      return NULL;
1661    }
1662
1663  newstate->context = context;
1664  newstate->entrance_nodes = &newstate->nodes;
1665
1666  for (i = 0 ; i < nodes->nelem ; i++)
1667    {
1668      re_token_t *node = dfa->nodes + nodes->elems[i];
1669      re_token_type_t type = node->type;
1670      unsigned int constraint = node->constraint;
1671
1672      if (type == CHARACTER && !constraint)
1673	continue;
1674#ifdef RE_ENABLE_I18N
1675      newstate->accept_mb |= node->accept_mb;
1676#endif /* RE_ENABLE_I18N */
1677
1678      /* If the state has the halt node, the state is a halt state.  */
1679      if (type == END_OF_RE)
1680	newstate->halt = 1;
1681      else if (type == OP_BACK_REF)
1682	newstate->has_backref = 1;
1683
1684      if (constraint)
1685	{
1686	  if (newstate->entrance_nodes == &newstate->nodes)
1687	    {
1688	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1689	      if (BE (newstate->entrance_nodes == NULL, 0))
1690		{
1691		  free_state (newstate);
1692		  return NULL;
1693		}
1694	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1695	      nctx_nodes = 0;
1696	      newstate->has_constraint = 1;
1697	    }
1698
1699	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1700	    {
1701	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1702	      ++nctx_nodes;
1703	    }
1704	}
1705    }
1706  err = register_state (dfa, newstate, hash);
1707  if (BE (err != REG_NOERROR, 0))
1708    {
1709      free_state (newstate);
1710      newstate = NULL;
1711    }
1712  return  newstate;
1713}
1714