1/* Extended regular expression matching and search library.
2   Copyright (C) 2002-2014 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU General Public
8   License as published by the Free Software Foundation; either
9   version 3 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   General Public License for more details.
15
16   You should have received a copy of the GNU General Public
17   License along with the GNU C Library; if not, see
18   <http://www.gnu.org/licenses/>.  */
19
20static void re_string_construct_common (const char *str, Idx len,
21					re_string_t *pstr,
22					RE_TRANSLATE_TYPE trans, bool icase,
23					const re_dfa_t *dfa) internal_function;
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25					  const re_node_set *nodes,
26					  re_hashval_t hash) internal_function;
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28					  const re_node_set *nodes,
29					  unsigned int context,
30					  re_hashval_t hash) internal_function;
31
32/* Functions for string operation.  */
33
34/* This function allocate the buffers.  It is necessary to call
35   re_string_reconstruct before using the object.  */
36
37static reg_errcode_t
38internal_function __attribute_warn_unused_result__
39re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41{
42  reg_errcode_t ret;
43  Idx init_buf_len;
44
45  /* Ensure at least one character fits into the buffers.  */
46  if (init_len < dfa->mb_cur_max)
47    init_len = dfa->mb_cur_max;
48  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49  re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51  ret = re_string_realloc_buffers (pstr, init_buf_len);
52  if (BE (ret != REG_NOERROR, 0))
53    return ret;
54
55  pstr->word_char = dfa->word_char;
56  pstr->word_ops_used = dfa->word_ops_used;
57  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59  pstr->valid_raw_len = pstr->valid_len;
60  return REG_NOERROR;
61}
62
63/* This function allocate the buffers, and initialize them.  */
64
65static reg_errcode_t
66internal_function __attribute_warn_unused_result__
67re_string_construct (re_string_t *pstr, const char *str, Idx len,
68		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69{
70  reg_errcode_t ret;
71  memset (pstr, '\0', sizeof (re_string_t));
72  re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74  if (len > 0)
75    {
76      ret = re_string_realloc_buffers (pstr, len + 1);
77      if (BE (ret != REG_NOERROR, 0))
78	return ret;
79    }
80  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82  if (icase)
83    {
84#ifdef RE_ENABLE_I18N
85      if (dfa->mb_cur_max > 1)
86	{
87	  while (1)
88	    {
89	      ret = build_wcs_upper_buffer (pstr);
90	      if (BE (ret != REG_NOERROR, 0))
91		return ret;
92	      if (pstr->valid_raw_len >= len)
93		break;
94	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95		break;
96	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97	      if (BE (ret != REG_NOERROR, 0))
98		return ret;
99	    }
100	}
101      else
102#endif /* RE_ENABLE_I18N  */
103	build_upper_buffer (pstr);
104    }
105  else
106    {
107#ifdef RE_ENABLE_I18N
108      if (dfa->mb_cur_max > 1)
109	build_wcs_buffer (pstr);
110      else
111#endif /* RE_ENABLE_I18N  */
112	{
113	  if (trans != NULL)
114	    re_string_translate_buffer (pstr);
115	  else
116	    {
117	      pstr->valid_len = pstr->bufs_len;
118	      pstr->valid_raw_len = pstr->bufs_len;
119	    }
120	}
121    }
122
123  return REG_NOERROR;
124}
125
126/* Helper functions for re_string_allocate, and re_string_construct.  */
127
128static reg_errcode_t
129internal_function __attribute_warn_unused_result__
130re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131{
132#ifdef RE_ENABLE_I18N
133  if (pstr->mb_cur_max > 1)
134    {
135      wint_t *new_wcs;
136
137      /* Avoid overflow in realloc.  */
138      const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139      if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
140	return REG_ESPACE;
141
142      new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143      if (BE (new_wcs == NULL, 0))
144	return REG_ESPACE;
145      pstr->wcs = new_wcs;
146      if (pstr->offsets != NULL)
147	{
148	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149	  if (BE (new_offsets == NULL, 0))
150	    return REG_ESPACE;
151	  pstr->offsets = new_offsets;
152	}
153    }
154#endif /* RE_ENABLE_I18N  */
155  if (pstr->mbs_allocated)
156    {
157      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158					   new_buf_len);
159      if (BE (new_mbs == NULL, 0))
160	return REG_ESPACE;
161      pstr->mbs = new_mbs;
162    }
163  pstr->bufs_len = new_buf_len;
164  return REG_NOERROR;
165}
166
167
168static void
169internal_function
170re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171			    RE_TRANSLATE_TYPE trans, bool icase,
172			    const re_dfa_t *dfa)
173{
174  pstr->raw_mbs = (const unsigned char *) str;
175  pstr->len = len;
176  pstr->raw_len = len;
177  pstr->trans = trans;
178  pstr->icase = icase;
179  pstr->mbs_allocated = (trans != NULL || icase);
180  pstr->mb_cur_max = dfa->mb_cur_max;
181  pstr->is_utf8 = dfa->is_utf8;
182  pstr->map_notascii = dfa->map_notascii;
183  pstr->stop = pstr->len;
184  pstr->raw_stop = pstr->stop;
185}
186
187#ifdef RE_ENABLE_I18N
188
189/* Build wide character buffer PSTR->WCS.
190   If the byte sequence of the string are:
191     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192   Then wide character buffer will be:
193     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
194   We use WEOF for padding, they indicate that the position isn't
195   a first byte of a multibyte character.
196
197   Note that this function assumes PSTR->VALID_LEN elements are already
198   built and starts from PSTR->VALID_LEN.  */
199
200static void
201internal_function
202build_wcs_buffer (re_string_t *pstr)
203{
204#ifdef _LIBC
205  unsigned char buf[MB_LEN_MAX];
206  assert (MB_LEN_MAX >= pstr->mb_cur_max);
207#else
208  unsigned char buf[64];
209#endif
210  mbstate_t prev_st;
211  Idx byte_idx, end_idx, remain_len;
212  size_t mbclen;
213
214  /* Build the buffers from pstr->valid_len to either pstr->len or
215     pstr->bufs_len.  */
216  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218    {
219      wchar_t wc;
220      const char *p;
221
222      remain_len = end_idx - byte_idx;
223      prev_st = pstr->cur_state;
224      /* Apply the translation if we need.  */
225      if (BE (pstr->trans != NULL, 0))
226	{
227	  int i, ch;
228
229	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230	    {
231	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233	    }
234	  p = (const char *) buf;
235	}
236      else
237	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238      mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239      if (BE (mbclen == (size_t) -1 || mbclen == 0
240	      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
241	{
242	  /* We treat these cases as a singlebyte character.  */
243	  mbclen = 1;
244	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245	  if (BE (pstr->trans != NULL, 0))
246	    wc = pstr->trans[wc];
247	  pstr->cur_state = prev_st;
248	}
249      else if (BE (mbclen == (size_t) -2, 0))
250	{
251	  /* The buffer doesn't have enough space, finish to build.  */
252	  pstr->cur_state = prev_st;
253	  break;
254	}
255
256      /* Write wide character and padding.  */
257      pstr->wcs[byte_idx++] = wc;
258      /* Write paddings.  */
259      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260	pstr->wcs[byte_idx++] = WEOF;
261    }
262  pstr->valid_len = byte_idx;
263  pstr->valid_raw_len = byte_idx;
264}
265
266/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267   but for REG_ICASE.  */
268
269static reg_errcode_t
270internal_function __attribute_warn_unused_result__
271build_wcs_upper_buffer (re_string_t *pstr)
272{
273  mbstate_t prev_st;
274  Idx src_idx, byte_idx, end_idx, remain_len;
275  size_t mbclen;
276#ifdef _LIBC
277  char buf[MB_LEN_MAX];
278  assert (MB_LEN_MAX >= pstr->mb_cur_max);
279#else
280  char buf[64];
281#endif
282
283  byte_idx = pstr->valid_len;
284  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285
286  /* The following optimization assumes that ASCII characters can be
287     mapped to wide characters with a simple cast.  */
288  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289    {
290      while (byte_idx < end_idx)
291	{
292	  wchar_t wc;
293
294	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295	      && mbsinit (&pstr->cur_state))
296	    {
297	      /* In case of a singlebyte character.  */
298	      pstr->mbs[byte_idx]
299		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300	      /* The next step uses the assumption that wchar_t is encoded
301		 ASCII-safe: all ASCII values can be converted like this.  */
302	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303	      ++byte_idx;
304	      continue;
305	    }
306
307	  remain_len = end_idx - byte_idx;
308	  prev_st = pstr->cur_state;
309	  mbclen = __mbrtowc (&wc,
310			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311			       + byte_idx), remain_len, &pstr->cur_state);
312	  if (BE (mbclen < (size_t) -2, 1))
313	    {
314	      wchar_t wcu = towupper (wc);
315	      if (wcu != wc)
316		{
317		  size_t mbcdlen;
318
319		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
320		  if (BE (mbclen == mbcdlen, 1))
321		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
322		  else
323		    {
324		      src_idx = byte_idx;
325		      goto offsets_needed;
326		    }
327		}
328	      else
329		memcpy (pstr->mbs + byte_idx,
330			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331	      pstr->wcs[byte_idx++] = wcu;
332	      /* Write paddings.  */
333	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334		pstr->wcs[byte_idx++] = WEOF;
335	    }
336	  else if (mbclen == (size_t) -1 || mbclen == 0
337		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
338	    {
339	      /* It is an invalid character, an incomplete character
340		 at the end of the string, or '\0'.  Just use the byte.  */
341	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
342	      pstr->mbs[byte_idx] = ch;
343	      /* And also cast it to wide char.  */
344	      pstr->wcs[byte_idx++] = (wchar_t) ch;
345	      if (BE (mbclen == (size_t) -1, 0))
346		pstr->cur_state = prev_st;
347	    }
348	  else
349	    {
350	      /* The buffer doesn't have enough space, finish to build.  */
351	      pstr->cur_state = prev_st;
352	      break;
353	    }
354	}
355      pstr->valid_len = byte_idx;
356      pstr->valid_raw_len = byte_idx;
357      return REG_NOERROR;
358    }
359  else
360    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
361      {
362	wchar_t wc;
363	const char *p;
364      offsets_needed:
365	remain_len = end_idx - byte_idx;
366	prev_st = pstr->cur_state;
367	if (BE (pstr->trans != NULL, 0))
368	  {
369	    int i, ch;
370
371	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372	      {
373		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
374		buf[i] = pstr->trans[ch];
375	      }
376	    p = (const char *) buf;
377	  }
378	else
379	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
380	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
381	if (BE (mbclen < (size_t) -2, 1))
382	  {
383	    wchar_t wcu = towupper (wc);
384	    if (wcu != wc)
385	      {
386		size_t mbcdlen;
387
388		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389		if (BE (mbclen == mbcdlen, 1))
390		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
391		else if (mbcdlen != (size_t) -1)
392		  {
393		    size_t i;
394
395		    if (byte_idx + mbcdlen > pstr->bufs_len)
396		      {
397			pstr->cur_state = prev_st;
398			break;
399		      }
400
401		    if (pstr->offsets == NULL)
402		      {
403			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404
405			if (pstr->offsets == NULL)
406			  return REG_ESPACE;
407		      }
408		    if (!pstr->offsets_needed)
409		      {
410			for (i = 0; i < (size_t) byte_idx; ++i)
411			  pstr->offsets[i] = i;
412			pstr->offsets_needed = 1;
413		      }
414
415		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416		    pstr->wcs[byte_idx] = wcu;
417		    pstr->offsets[byte_idx] = src_idx;
418		    for (i = 1; i < mbcdlen; ++i)
419		      {
420			pstr->offsets[byte_idx + i]
421			  = src_idx + (i < mbclen ? i : mbclen - 1);
422			pstr->wcs[byte_idx + i] = WEOF;
423		      }
424		    pstr->len += mbcdlen - mbclen;
425		    if (pstr->raw_stop > src_idx)
426		      pstr->stop += mbcdlen - mbclen;
427		    end_idx = (pstr->bufs_len > pstr->len)
428			      ? pstr->len : pstr->bufs_len;
429		    byte_idx += mbcdlen;
430		    src_idx += mbclen;
431		    continue;
432		  }
433		else
434		  memcpy (pstr->mbs + byte_idx, p, mbclen);
435	      }
436	    else
437	      memcpy (pstr->mbs + byte_idx, p, mbclen);
438
439	    if (BE (pstr->offsets_needed != 0, 0))
440	      {
441		size_t i;
442		for (i = 0; i < mbclen; ++i)
443		  pstr->offsets[byte_idx + i] = src_idx + i;
444	      }
445	    src_idx += mbclen;
446
447	    pstr->wcs[byte_idx++] = wcu;
448	    /* Write paddings.  */
449	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450	      pstr->wcs[byte_idx++] = WEOF;
451	  }
452	else if (mbclen == (size_t) -1 || mbclen == 0
453		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
454	  {
455	    /* It is an invalid character or '\0'.  Just use the byte.  */
456	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457
458	    if (BE (pstr->trans != NULL, 0))
459	      ch = pstr->trans [ch];
460	    pstr->mbs[byte_idx] = ch;
461
462	    if (BE (pstr->offsets_needed != 0, 0))
463	      pstr->offsets[byte_idx] = src_idx;
464	    ++src_idx;
465
466	    /* And also cast it to wide char.  */
467	    pstr->wcs[byte_idx++] = (wchar_t) ch;
468	    if (BE (mbclen == (size_t) -1, 0))
469	      pstr->cur_state = prev_st;
470	  }
471	else
472	  {
473	    /* The buffer doesn't have enough space, finish to build.  */
474	    pstr->cur_state = prev_st;
475	    break;
476	  }
477      }
478  pstr->valid_len = byte_idx;
479  pstr->valid_raw_len = src_idx;
480  return REG_NOERROR;
481}
482
483/* Skip characters until the index becomes greater than NEW_RAW_IDX.
484   Return the index.  */
485
486static Idx
487internal_function
488re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
489{
490  mbstate_t prev_st;
491  Idx rawbuf_idx;
492  size_t mbclen;
493  wint_t wc = WEOF;
494
495  /* Skip the characters which are not necessary to check.  */
496  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
497       rawbuf_idx < new_raw_idx;)
498    {
499      wchar_t wc2;
500      Idx remain_len = pstr->raw_len - rawbuf_idx;
501      prev_st = pstr->cur_state;
502      mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
503			  remain_len, &pstr->cur_state);
504      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505	{
506	  /* We treat these cases as a single byte character.  */
507	  if (mbclen == 0 || remain_len == 0)
508	    wc = L'\0';
509	  else
510	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
511	  mbclen = 1;
512	  pstr->cur_state = prev_st;
513	}
514      else
515	wc = wc2;
516      /* Then proceed the next character.  */
517      rawbuf_idx += mbclen;
518    }
519  *last_wc = wc;
520  return rawbuf_idx;
521}
522#endif /* RE_ENABLE_I18N  */
523
524/* Build the buffer PSTR->MBS, and apply the translation if we need.
525   This function is used in case of REG_ICASE.  */
526
527static void
528internal_function
529build_upper_buffer (re_string_t *pstr)
530{
531  Idx char_idx, end_idx;
532  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533
534  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535    {
536      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537      if (BE (pstr->trans != NULL, 0))
538	ch = pstr->trans[ch];
539      pstr->mbs[char_idx] = toupper (ch);
540    }
541  pstr->valid_len = char_idx;
542  pstr->valid_raw_len = char_idx;
543}
544
545/* Apply TRANS to the buffer in PSTR.  */
546
547static void
548internal_function
549re_string_translate_buffer (re_string_t *pstr)
550{
551  Idx buf_idx, end_idx;
552  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
553
554  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
555    {
556      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557      pstr->mbs[buf_idx] = pstr->trans[ch];
558    }
559
560  pstr->valid_len = buf_idx;
561  pstr->valid_raw_len = buf_idx;
562}
563
564/* This function re-construct the buffers.
565   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566   convert to upper case in case of REG_ICASE, apply translation.  */
567
568static reg_errcode_t
569internal_function __attribute_warn_unused_result__
570re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
571{
572  Idx offset;
573
574  if (BE (pstr->raw_mbs_idx <= idx, 0))
575    offset = idx - pstr->raw_mbs_idx;
576  else
577    {
578      /* Reset buffer.  */
579#ifdef RE_ENABLE_I18N
580      if (pstr->mb_cur_max > 1)
581	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
582#endif /* RE_ENABLE_I18N */
583      pstr->len = pstr->raw_len;
584      pstr->stop = pstr->raw_stop;
585      pstr->valid_len = 0;
586      pstr->raw_mbs_idx = 0;
587      pstr->valid_raw_len = 0;
588      pstr->offsets_needed = 0;
589      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
590			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
591      if (!pstr->mbs_allocated)
592	pstr->mbs = (unsigned char *) pstr->raw_mbs;
593      offset = idx;
594    }
595
596  if (BE (offset != 0, 1))
597    {
598      /* Should the already checked characters be kept?  */
599      if (BE (offset < pstr->valid_raw_len, 1))
600	{
601	  /* Yes, move them to the front of the buffer.  */
602#ifdef RE_ENABLE_I18N
603	  if (BE (pstr->offsets_needed, 0))
604	    {
605	      Idx low = 0, high = pstr->valid_len, mid;
606	      do
607		{
608		  mid = (high + low) / 2;
609		  if (pstr->offsets[mid] > offset)
610		    high = mid;
611		  else if (pstr->offsets[mid] < offset)
612		    low = mid + 1;
613		  else
614		    break;
615		}
616	      while (low < high);
617	      if (pstr->offsets[mid] < offset)
618		++mid;
619	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
620							eflags);
621	      /* This can be quite complicated, so handle specially
622		 only the common and easy case where the character with
623		 different length representation of lower and upper
624		 case is present at or after offset.  */
625	      if (pstr->valid_len > offset
626		  && mid == offset && pstr->offsets[mid] == offset)
627		{
628		  memmove (pstr->wcs, pstr->wcs + offset,
629			   (pstr->valid_len - offset) * sizeof (wint_t));
630		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
631		  pstr->valid_len -= offset;
632		  pstr->valid_raw_len -= offset;
633		  for (low = 0; low < pstr->valid_len; low++)
634		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
635		}
636	      else
637		{
638		  /* Otherwise, just find out how long the partial multibyte
639		     character at offset is and fill it with WEOF/255.  */
640		  pstr->len = pstr->raw_len - idx + offset;
641		  pstr->stop = pstr->raw_stop - idx + offset;
642		  pstr->offsets_needed = 0;
643		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
644		    --mid;
645		  while (mid < pstr->valid_len)
646		    if (pstr->wcs[mid] != WEOF)
647		      break;
648		    else
649		      ++mid;
650		  if (mid == pstr->valid_len)
651		    pstr->valid_len = 0;
652		  else
653		    {
654		      pstr->valid_len = pstr->offsets[mid] - offset;
655		      if (pstr->valid_len)
656			{
657			  for (low = 0; low < pstr->valid_len; ++low)
658			    pstr->wcs[low] = WEOF;
659			  memset (pstr->mbs, 255, pstr->valid_len);
660			}
661		    }
662		  pstr->valid_raw_len = pstr->valid_len;
663		}
664	    }
665	  else
666#endif
667	    {
668	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
669							eflags);
670#ifdef RE_ENABLE_I18N
671	      if (pstr->mb_cur_max > 1)
672		memmove (pstr->wcs, pstr->wcs + offset,
673			 (pstr->valid_len - offset) * sizeof (wint_t));
674#endif /* RE_ENABLE_I18N */
675	      if (BE (pstr->mbs_allocated, 0))
676		memmove (pstr->mbs, pstr->mbs + offset,
677			 pstr->valid_len - offset);
678	      pstr->valid_len -= offset;
679	      pstr->valid_raw_len -= offset;
680#if DEBUG
681	      assert (pstr->valid_len > 0);
682#endif
683	    }
684	}
685      else
686	{
687#ifdef RE_ENABLE_I18N
688	  /* No, skip all characters until IDX.  */
689	  Idx prev_valid_len = pstr->valid_len;
690
691	  if (BE (pstr->offsets_needed, 0))
692	    {
693	      pstr->len = pstr->raw_len - idx + offset;
694	      pstr->stop = pstr->raw_stop - idx + offset;
695	      pstr->offsets_needed = 0;
696	    }
697#endif
698	  pstr->valid_len = 0;
699#ifdef RE_ENABLE_I18N
700	  if (pstr->mb_cur_max > 1)
701	    {
702	      Idx wcs_idx;
703	      wint_t wc = WEOF;
704
705	      if (pstr->is_utf8)
706		{
707		  const unsigned char *raw, *p, *end;
708
709		  /* Special case UTF-8.  Multi-byte chars start with any
710		     byte other than 0x80 - 0xbf.  */
711		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
712		  end = raw + (offset - pstr->mb_cur_max);
713		  if (end < pstr->raw_mbs)
714		    end = pstr->raw_mbs;
715		  p = raw + offset - 1;
716#ifdef _LIBC
717		  /* We know the wchar_t encoding is UCS4, so for the simple
718		     case, ASCII characters, skip the conversion step.  */
719		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
720		    {
721		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
722		      /* pstr->valid_len = 0; */
723		      wc = (wchar_t) *p;
724		    }
725		  else
726#endif
727		    for (; p >= end; --p)
728		      if ((*p & 0xc0) != 0x80)
729			{
730			  mbstate_t cur_state;
731			  wchar_t wc2;
732			  Idx mlen = raw + pstr->len - p;
733			  unsigned char buf[6];
734			  size_t mbclen;
735
736			  const unsigned char *pp = p;
737			  if (BE (pstr->trans != NULL, 0))
738			    {
739			      int i = mlen < 6 ? mlen : 6;
740			      while (--i >= 0)
741				buf[i] = pstr->trans[p[i]];
742			      pp = buf;
743			    }
744			  /* XXX Don't use mbrtowc, we know which conversion
745			     to use (UTF-8 -> UCS4).  */
746			  memset (&cur_state, 0, sizeof (cur_state));
747			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
748					      &cur_state);
749			  if (raw + offset - p <= mbclen
750			      && mbclen < (size_t) -2)
751			    {
752			      memset (&pstr->cur_state, '\0',
753				      sizeof (mbstate_t));
754			      pstr->valid_len = mbclen - (raw + offset - p);
755			      wc = wc2;
756			    }
757			  break;
758			}
759		}
760
761	      if (wc == WEOF)
762		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
763	      if (wc == WEOF)
764		pstr->tip_context
765		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
766	      else
767		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
768				      && IS_WIDE_WORD_CHAR (wc))
769				     ? CONTEXT_WORD
770				     : ((IS_WIDE_NEWLINE (wc)
771					 && pstr->newline_anchor)
772					? CONTEXT_NEWLINE : 0));
773	      if (BE (pstr->valid_len, 0))
774		{
775		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
776		    pstr->wcs[wcs_idx] = WEOF;
777		  if (pstr->mbs_allocated)
778		    memset (pstr->mbs, 255, pstr->valid_len);
779		}
780	      pstr->valid_raw_len = pstr->valid_len;
781	    }
782	  else
783#endif /* RE_ENABLE_I18N */
784	    {
785	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
786	      pstr->valid_raw_len = 0;
787	      if (pstr->trans)
788		c = pstr->trans[c];
789	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
790				   ? CONTEXT_WORD
791				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
792				      ? CONTEXT_NEWLINE : 0));
793	    }
794	}
795      if (!BE (pstr->mbs_allocated, 0))
796	pstr->mbs += offset;
797    }
798  pstr->raw_mbs_idx = idx;
799  pstr->len -= offset;
800  pstr->stop -= offset;
801
802  /* Then build the buffers.  */
803#ifdef RE_ENABLE_I18N
804  if (pstr->mb_cur_max > 1)
805    {
806      if (pstr->icase)
807	{
808	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
809	  if (BE (ret != REG_NOERROR, 0))
810	    return ret;
811	}
812      else
813	build_wcs_buffer (pstr);
814    }
815  else
816#endif /* RE_ENABLE_I18N */
817    if (BE (pstr->mbs_allocated, 0))
818      {
819	if (pstr->icase)
820	  build_upper_buffer (pstr);
821	else if (pstr->trans != NULL)
822	  re_string_translate_buffer (pstr);
823      }
824    else
825      pstr->valid_len = pstr->len;
826
827  pstr->cur_idx = 0;
828  return REG_NOERROR;
829}
830
831static unsigned char
832internal_function __attribute__ ((pure))
833re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
834{
835  int ch;
836  Idx off;
837
838  /* Handle the common (easiest) cases first.  */
839  if (BE (!pstr->mbs_allocated, 1))
840    return re_string_peek_byte (pstr, idx);
841
842#ifdef RE_ENABLE_I18N
843  if (pstr->mb_cur_max > 1
844      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
845    return re_string_peek_byte (pstr, idx);
846#endif
847
848  off = pstr->cur_idx + idx;
849#ifdef RE_ENABLE_I18N
850  if (pstr->offsets_needed)
851    off = pstr->offsets[off];
852#endif
853
854  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
855
856#ifdef RE_ENABLE_I18N
857  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
858     this function returns CAPITAL LETTER I instead of first byte of
859     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
860     since peek_byte_case doesn't advance cur_idx in any way.  */
861  if (pstr->offsets_needed && !isascii (ch))
862    return re_string_peek_byte (pstr, idx);
863#endif
864
865  return ch;
866}
867
868static unsigned char
869internal_function
870re_string_fetch_byte_case (re_string_t *pstr)
871{
872  if (BE (!pstr->mbs_allocated, 1))
873    return re_string_fetch_byte (pstr);
874
875#ifdef RE_ENABLE_I18N
876  if (pstr->offsets_needed)
877    {
878      Idx off;
879      int ch;
880
881      /* For tr_TR.UTF-8 [[:islower:]] there is
882	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
883	 in that case the whole multi-byte character and return
884	 the original letter.  On the other side, with
885	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
886	 anything else would complicate things too much.  */
887
888      if (!re_string_first_byte (pstr, pstr->cur_idx))
889	return re_string_fetch_byte (pstr);
890
891      off = pstr->offsets[pstr->cur_idx];
892      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
893
894      if (! isascii (ch))
895	return re_string_fetch_byte (pstr);
896
897      re_string_skip_bytes (pstr,
898			    re_string_char_size_at (pstr, pstr->cur_idx));
899      return ch;
900    }
901#endif
902
903  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
904}
905
906static void
907internal_function
908re_string_destruct (re_string_t *pstr)
909{
910#ifdef RE_ENABLE_I18N
911  re_free (pstr->wcs);
912  re_free (pstr->offsets);
913#endif /* RE_ENABLE_I18N  */
914  if (pstr->mbs_allocated)
915    re_free (pstr->mbs);
916}
917
918/* Return the context at IDX in INPUT.  */
919
920static unsigned int
921internal_function
922re_string_context_at (const re_string_t *input, Idx idx, int eflags)
923{
924  int c;
925  if (BE (! REG_VALID_INDEX (idx), 0))
926    /* In this case, we use the value stored in input->tip_context,
927       since we can't know the character in input->mbs[-1] here.  */
928    return input->tip_context;
929  if (BE (idx == input->len, 0))
930    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
931	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
932#ifdef RE_ENABLE_I18N
933  if (input->mb_cur_max > 1)
934    {
935      wint_t wc;
936      Idx wc_idx = idx;
937      while(input->wcs[wc_idx] == WEOF)
938	{
939#ifdef DEBUG
940	  /* It must not happen.  */
941	  assert (REG_VALID_INDEX (wc_idx));
942#endif
943	  --wc_idx;
944	  if (! REG_VALID_INDEX (wc_idx))
945	    return input->tip_context;
946	}
947      wc = input->wcs[wc_idx];
948      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
949	return CONTEXT_WORD;
950      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
951	      ? CONTEXT_NEWLINE : 0);
952    }
953  else
954#endif
955    {
956      c = re_string_byte_at (input, idx);
957      if (bitset_contain (input->word_char, c))
958	return CONTEXT_WORD;
959      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
960    }
961}
962
963/* Functions for set operation.  */
964
965static reg_errcode_t
966internal_function __attribute_warn_unused_result__
967re_node_set_alloc (re_node_set *set, Idx size)
968{
969  set->alloc = size;
970  set->nelem = 0;
971  set->elems = re_malloc (Idx, size);
972  if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
973    return REG_ESPACE;
974  return REG_NOERROR;
975}
976
977static reg_errcode_t
978internal_function __attribute_warn_unused_result__
979re_node_set_init_1 (re_node_set *set, Idx elem)
980{
981  set->alloc = 1;
982  set->nelem = 1;
983  set->elems = re_malloc (Idx, 1);
984  if (BE (set->elems == NULL, 0))
985    {
986      set->alloc = set->nelem = 0;
987      return REG_ESPACE;
988    }
989  set->elems[0] = elem;
990  return REG_NOERROR;
991}
992
993static reg_errcode_t
994internal_function __attribute_warn_unused_result__
995re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
996{
997  set->alloc = 2;
998  set->elems = re_malloc (Idx, 2);
999  if (BE (set->elems == NULL, 0))
1000    return REG_ESPACE;
1001  if (elem1 == elem2)
1002    {
1003      set->nelem = 1;
1004      set->elems[0] = elem1;
1005    }
1006  else
1007    {
1008      set->nelem = 2;
1009      if (elem1 < elem2)
1010	{
1011	  set->elems[0] = elem1;
1012	  set->elems[1] = elem2;
1013	}
1014      else
1015	{
1016	  set->elems[0] = elem2;
1017	  set->elems[1] = elem1;
1018	}
1019    }
1020  return REG_NOERROR;
1021}
1022
1023static reg_errcode_t
1024internal_function __attribute_warn_unused_result__
1025re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1026{
1027  dest->nelem = src->nelem;
1028  if (src->nelem > 0)
1029    {
1030      dest->alloc = dest->nelem;
1031      dest->elems = re_malloc (Idx, dest->alloc);
1032      if (BE (dest->elems == NULL, 0))
1033	{
1034	  dest->alloc = dest->nelem = 0;
1035	  return REG_ESPACE;
1036	}
1037      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1038    }
1039  else
1040    re_node_set_init_empty (dest);
1041  return REG_NOERROR;
1042}
1043
1044/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1045   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1046   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1047
1048static reg_errcode_t
1049internal_function __attribute_warn_unused_result__
1050re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1051			   const re_node_set *src2)
1052{
1053  Idx i1, i2, is, id, delta, sbase;
1054  if (src1->nelem == 0 || src2->nelem == 0)
1055    return REG_NOERROR;
1056
1057  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1058     conservative estimate.  */
1059  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060    {
1061      Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1062      Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1063      if (BE (new_elems == NULL, 0))
1064	return REG_ESPACE;
1065      dest->elems = new_elems;
1066      dest->alloc = new_alloc;
1067    }
1068
1069  /* Find the items in the intersection of SRC1 and SRC2, and copy
1070     into the top of DEST those that are not already in DEST itself.  */
1071  sbase = dest->nelem + src1->nelem + src2->nelem;
1072  i1 = src1->nelem - 1;
1073  i2 = src2->nelem - 1;
1074  id = dest->nelem - 1;
1075  for (;;)
1076    {
1077      if (src1->elems[i1] == src2->elems[i2])
1078	{
1079	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1080	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1081	    --id;
1082
1083          if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1084            dest->elems[--sbase] = src1->elems[i1];
1085
1086	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1087	    break;
1088	}
1089
1090      /* Lower the highest of the two items.  */
1091      else if (src1->elems[i1] < src2->elems[i2])
1092	{
1093	  if (! REG_VALID_INDEX (--i2))
1094	    break;
1095	}
1096      else
1097	{
1098	  if (! REG_VALID_INDEX (--i1))
1099	    break;
1100	}
1101    }
1102
1103  id = dest->nelem - 1;
1104  is = dest->nelem + src1->nelem + src2->nelem - 1;
1105  delta = is - sbase + 1;
1106
1107  /* Now copy.  When DELTA becomes zero, the remaining
1108     DEST elements are already in place; this is more or
1109     less the same loop that is in re_node_set_merge.  */
1110  dest->nelem += delta;
1111  if (delta > 0 && REG_VALID_INDEX (id))
1112    for (;;)
1113      {
1114	if (dest->elems[is] > dest->elems[id])
1115	  {
1116	    /* Copy from the top.  */
1117	    dest->elems[id + delta--] = dest->elems[is--];
1118	    if (delta == 0)
1119	      break;
1120	  }
1121	else
1122	  {
1123	    /* Slide from the bottom.  */
1124	    dest->elems[id + delta] = dest->elems[id];
1125	    if (! REG_VALID_INDEX (--id))
1126	      break;
1127	  }
1128      }
1129
1130  /* Copy remaining SRC elements.  */
1131  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1132
1133  return REG_NOERROR;
1134}
1135
1136/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1137   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1138
1139static reg_errcode_t
1140internal_function __attribute_warn_unused_result__
1141re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1142			const re_node_set *src2)
1143{
1144  Idx i1, i2, id;
1145  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1146    {
1147      dest->alloc = src1->nelem + src2->nelem;
1148      dest->elems = re_malloc (Idx, dest->alloc);
1149      if (BE (dest->elems == NULL, 0))
1150	return REG_ESPACE;
1151    }
1152  else
1153    {
1154      if (src1 != NULL && src1->nelem > 0)
1155	return re_node_set_init_copy (dest, src1);
1156      else if (src2 != NULL && src2->nelem > 0)
1157	return re_node_set_init_copy (dest, src2);
1158      else
1159	re_node_set_init_empty (dest);
1160      return REG_NOERROR;
1161    }
1162  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1163    {
1164      if (src1->elems[i1] > src2->elems[i2])
1165	{
1166	  dest->elems[id++] = src2->elems[i2++];
1167	  continue;
1168	}
1169      if (src1->elems[i1] == src2->elems[i2])
1170	++i2;
1171      dest->elems[id++] = src1->elems[i1++];
1172    }
1173  if (i1 < src1->nelem)
1174    {
1175      memcpy (dest->elems + id, src1->elems + i1,
1176	     (src1->nelem - i1) * sizeof (Idx));
1177      id += src1->nelem - i1;
1178    }
1179  else if (i2 < src2->nelem)
1180    {
1181      memcpy (dest->elems + id, src2->elems + i2,
1182	     (src2->nelem - i2) * sizeof (Idx));
1183      id += src2->nelem - i2;
1184    }
1185  dest->nelem = id;
1186  return REG_NOERROR;
1187}
1188
1189/* Calculate the union set of the sets DEST and SRC. And store it to
1190   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1191
1192static reg_errcode_t
1193internal_function __attribute_warn_unused_result__
1194re_node_set_merge (re_node_set *dest, const re_node_set *src)
1195{
1196  Idx is, id, sbase, delta;
1197  if (src == NULL || src->nelem == 0)
1198    return REG_NOERROR;
1199  if (dest->alloc < 2 * src->nelem + dest->nelem)
1200    {
1201      Idx new_alloc = 2 * (src->nelem + dest->alloc);
1202      Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1203      if (BE (new_buffer == NULL, 0))
1204	return REG_ESPACE;
1205      dest->elems = new_buffer;
1206      dest->alloc = new_alloc;
1207    }
1208
1209  if (BE (dest->nelem == 0, 0))
1210    {
1211      dest->nelem = src->nelem;
1212      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1213      return REG_NOERROR;
1214    }
1215
1216  /* Copy into the top of DEST the items of SRC that are not
1217     found in DEST.  Maybe we could binary search in DEST?  */
1218  for (sbase = dest->nelem + 2 * src->nelem,
1219       is = src->nelem - 1, id = dest->nelem - 1;
1220       REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1221    {
1222      if (dest->elems[id] == src->elems[is])
1223	is--, id--;
1224      else if (dest->elems[id] < src->elems[is])
1225	dest->elems[--sbase] = src->elems[is--];
1226      else /* if (dest->elems[id] > src->elems[is]) */
1227	--id;
1228    }
1229
1230  if (REG_VALID_INDEX (is))
1231    {
1232      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1233      sbase -= is + 1;
1234      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1235    }
1236
1237  id = dest->nelem - 1;
1238  is = dest->nelem + 2 * src->nelem - 1;
1239  delta = is - sbase + 1;
1240  if (delta == 0)
1241    return REG_NOERROR;
1242
1243  /* Now copy.  When DELTA becomes zero, the remaining
1244     DEST elements are already in place.  */
1245  dest->nelem += delta;
1246  for (;;)
1247    {
1248      if (dest->elems[is] > dest->elems[id])
1249	{
1250	  /* Copy from the top.  */
1251	  dest->elems[id + delta--] = dest->elems[is--];
1252	  if (delta == 0)
1253	    break;
1254	}
1255      else
1256	{
1257	  /* Slide from the bottom.  */
1258	  dest->elems[id + delta] = dest->elems[id];
1259	  if (! REG_VALID_INDEX (--id))
1260	    {
1261	      /* Copy remaining SRC elements.  */
1262	      memcpy (dest->elems, dest->elems + sbase,
1263		      delta * sizeof (Idx));
1264	      break;
1265	    }
1266	}
1267    }
1268
1269  return REG_NOERROR;
1270}
1271
1272/* Insert the new element ELEM to the re_node_set* SET.
1273   SET should not already have ELEM.
1274   Return true if successful.  */
1275
1276static bool
1277internal_function __attribute_warn_unused_result__
1278re_node_set_insert (re_node_set *set, Idx elem)
1279{
1280  Idx idx;
1281  /* In case the set is empty.  */
1282  if (set->alloc == 0)
1283    return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1284
1285  if (BE (set->nelem, 0) == 0)
1286    {
1287      /* We already guaranteed above that set->alloc != 0.  */
1288      set->elems[0] = elem;
1289      ++set->nelem;
1290      return true;
1291    }
1292
1293  /* Realloc if we need.  */
1294  if (set->alloc == set->nelem)
1295    {
1296      Idx *new_elems;
1297      set->alloc = set->alloc * 2;
1298      new_elems = re_realloc (set->elems, Idx, set->alloc);
1299      if (BE (new_elems == NULL, 0))
1300	return false;
1301      set->elems = new_elems;
1302    }
1303
1304  /* Move the elements which follows the new element.  Test the
1305     first element separately to skip a check in the inner loop.  */
1306  if (elem < set->elems[0])
1307    {
1308      idx = 0;
1309      for (idx = set->nelem; idx > 0; idx--)
1310	set->elems[idx] = set->elems[idx - 1];
1311    }
1312  else
1313    {
1314      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1315	set->elems[idx] = set->elems[idx - 1];
1316    }
1317
1318  /* Insert the new element.  */
1319  set->elems[idx] = elem;
1320  ++set->nelem;
1321  return true;
1322}
1323
1324/* Insert the new element ELEM to the re_node_set* SET.
1325   SET should not already have any element greater than or equal to ELEM.
1326   Return true if successful.  */
1327
1328static bool
1329internal_function __attribute_warn_unused_result__
1330re_node_set_insert_last (re_node_set *set, Idx elem)
1331{
1332  /* Realloc if we need.  */
1333  if (set->alloc == set->nelem)
1334    {
1335      Idx *new_elems;
1336      set->alloc = (set->alloc + 1) * 2;
1337      new_elems = re_realloc (set->elems, Idx, set->alloc);
1338      if (BE (new_elems == NULL, 0))
1339	return false;
1340      set->elems = new_elems;
1341    }
1342
1343  /* Insert the new element.  */
1344  set->elems[set->nelem++] = elem;
1345  return true;
1346}
1347
1348/* Compare two node sets SET1 and SET2.
1349   Return true if SET1 and SET2 are equivalent.  */
1350
1351static bool
1352internal_function __attribute__ ((pure))
1353re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1354{
1355  Idx i;
1356  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1357    return false;
1358  for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1359    if (set1->elems[i] != set2->elems[i])
1360      return false;
1361  return true;
1362}
1363
1364/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1365
1366static Idx
1367internal_function __attribute__ ((pure))
1368re_node_set_contains (const re_node_set *set, Idx elem)
1369{
1370  __re_size_t idx, right, mid;
1371  if (! REG_VALID_NONZERO_INDEX (set->nelem))
1372    return 0;
1373
1374  /* Binary search the element.  */
1375  idx = 0;
1376  right = set->nelem - 1;
1377  while (idx < right)
1378    {
1379      mid = (idx + right) / 2;
1380      if (set->elems[mid] < elem)
1381	idx = mid + 1;
1382      else
1383	right = mid;
1384    }
1385  return set->elems[idx] == elem ? idx + 1 : 0;
1386}
1387
1388static void
1389internal_function
1390re_node_set_remove_at (re_node_set *set, Idx idx)
1391{
1392  if (idx < 0 || idx >= set->nelem)
1393    return;
1394  --set->nelem;
1395  for (; idx < set->nelem; idx++)
1396    set->elems[idx] = set->elems[idx + 1];
1397}
1398
1399
1400/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1401   Or return REG_MISSING if an error occurred.  */
1402
1403static Idx
1404internal_function
1405re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1406{
1407  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1408    {
1409      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1410      Idx *new_nexts, *new_indices;
1411      re_node_set *new_edests, *new_eclosures;
1412      re_token_t *new_nodes;
1413
1414      /* Avoid overflows in realloc.  */
1415      const size_t max_object_size = MAX (sizeof (re_token_t),
1416					  MAX (sizeof (re_node_set),
1417					       sizeof (Idx)));
1418      if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1419	return REG_MISSING;
1420
1421      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1422      if (BE (new_nodes == NULL, 0))
1423	return REG_MISSING;
1424      dfa->nodes = new_nodes;
1425      new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1426      new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1427      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1428      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1429      if (BE (new_nexts == NULL || new_indices == NULL
1430	      || new_edests == NULL || new_eclosures == NULL, 0))
1431	return REG_MISSING;
1432      dfa->nexts = new_nexts;
1433      dfa->org_indices = new_indices;
1434      dfa->edests = new_edests;
1435      dfa->eclosures = new_eclosures;
1436      dfa->nodes_alloc = new_nodes_alloc;
1437    }
1438  dfa->nodes[dfa->nodes_len] = token;
1439  dfa->nodes[dfa->nodes_len].constraint = 0;
1440#ifdef RE_ENABLE_I18N
1441  dfa->nodes[dfa->nodes_len].accept_mb =
1442    ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1443     || token.type == COMPLEX_BRACKET);
1444#endif
1445  dfa->nexts[dfa->nodes_len] = REG_MISSING;
1446  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1447  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1448  return dfa->nodes_len++;
1449}
1450
1451static re_hashval_t
1452internal_function
1453calc_state_hash (const re_node_set *nodes, unsigned int context)
1454{
1455  re_hashval_t hash = nodes->nelem + context;
1456  Idx i;
1457  for (i = 0 ; i < nodes->nelem ; i++)
1458    hash += nodes->elems[i];
1459  return hash;
1460}
1461
1462/* Search for the state whose node_set is equivalent to NODES.
1463   Return the pointer to the state, if we found it in the DFA.
1464   Otherwise create the new one and return it.  In case of an error
1465   return NULL and set the error code in ERR.
1466   Note: - We assume NULL as the invalid state, then it is possible that
1467	   return value is NULL and ERR is REG_NOERROR.
1468	 - We never return non-NULL value in case of any errors, it is for
1469	   optimization.  */
1470
1471static re_dfastate_t *
1472internal_function __attribute_warn_unused_result__
1473re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1474		  const re_node_set *nodes)
1475{
1476  re_hashval_t hash;
1477  re_dfastate_t *new_state;
1478  struct re_state_table_entry *spot;
1479  Idx i;
1480#ifdef lint
1481  /* Suppress bogus uninitialized-variable warnings.  */
1482  *err = REG_NOERROR;
1483#endif
1484  if (BE (nodes->nelem == 0, 0))
1485    {
1486      *err = REG_NOERROR;
1487      return NULL;
1488    }
1489  hash = calc_state_hash (nodes, 0);
1490  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1491
1492  for (i = 0 ; i < spot->num ; i++)
1493    {
1494      re_dfastate_t *state = spot->array[i];
1495      if (hash != state->hash)
1496	continue;
1497      if (re_node_set_compare (&state->nodes, nodes))
1498	return state;
1499    }
1500
1501  /* There are no appropriate state in the dfa, create the new one.  */
1502  new_state = create_ci_newstate (dfa, nodes, hash);
1503  if (BE (new_state == NULL, 0))
1504    *err = REG_ESPACE;
1505
1506  return new_state;
1507}
1508
1509/* Search for the state whose node_set is equivalent to NODES and
1510   whose context is equivalent to CONTEXT.
1511   Return the pointer to the state, if we found it in the DFA.
1512   Otherwise create the new one and return it.  In case of an error
1513   return NULL and set the error code in ERR.
1514   Note: - We assume NULL as the invalid state, then it is possible that
1515	   return value is NULL and ERR is REG_NOERROR.
1516	 - We never return non-NULL value in case of any errors, it is for
1517	   optimization.  */
1518
1519static re_dfastate_t *
1520internal_function __attribute_warn_unused_result__
1521re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1522			  const re_node_set *nodes, unsigned int context)
1523{
1524  re_hashval_t hash;
1525  re_dfastate_t *new_state;
1526  struct re_state_table_entry *spot;
1527  Idx i;
1528#ifdef lint
1529  /* Suppress bogus uninitialized-variable warnings.  */
1530  *err = REG_NOERROR;
1531#endif
1532  if (nodes->nelem == 0)
1533    {
1534      *err = REG_NOERROR;
1535      return NULL;
1536    }
1537  hash = calc_state_hash (nodes, context);
1538  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1539
1540  for (i = 0 ; i < spot->num ; i++)
1541    {
1542      re_dfastate_t *state = spot->array[i];
1543      if (state->hash == hash
1544	  && state->context == context
1545	  && re_node_set_compare (state->entrance_nodes, nodes))
1546	return state;
1547    }
1548  /* There are no appropriate state in 'dfa', create the new one.  */
1549  new_state = create_cd_newstate (dfa, nodes, context, hash);
1550  if (BE (new_state == NULL, 0))
1551    *err = REG_ESPACE;
1552
1553  return new_state;
1554}
1555
1556/* Finish initialization of the new state NEWSTATE, and using its hash value
1557   HASH put in the appropriate bucket of DFA's state table.  Return value
1558   indicates the error code if failed.  */
1559
1560static reg_errcode_t
1561__attribute_warn_unused_result__
1562register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1563		re_hashval_t hash)
1564{
1565  struct re_state_table_entry *spot;
1566  reg_errcode_t err;
1567  Idx i;
1568
1569  newstate->hash = hash;
1570  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1571  if (BE (err != REG_NOERROR, 0))
1572    return REG_ESPACE;
1573  for (i = 0; i < newstate->nodes.nelem; i++)
1574    {
1575      Idx elem = newstate->nodes.elems[i];
1576      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1577	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1578	  return REG_ESPACE;
1579    }
1580
1581  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1582  if (BE (spot->alloc <= spot->num, 0))
1583    {
1584      Idx new_alloc = 2 * spot->num + 2;
1585      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1586					      new_alloc);
1587      if (BE (new_array == NULL, 0))
1588	return REG_ESPACE;
1589      spot->array = new_array;
1590      spot->alloc = new_alloc;
1591    }
1592  spot->array[spot->num++] = newstate;
1593  return REG_NOERROR;
1594}
1595
1596static void
1597free_state (re_dfastate_t *state)
1598{
1599  re_node_set_free (&state->non_eps_nodes);
1600  re_node_set_free (&state->inveclosure);
1601  if (state->entrance_nodes != &state->nodes)
1602    {
1603      re_node_set_free (state->entrance_nodes);
1604      re_free (state->entrance_nodes);
1605    }
1606  re_node_set_free (&state->nodes);
1607  re_free (state->word_trtable);
1608  re_free (state->trtable);
1609  re_free (state);
1610}
1611
1612/* Create the new state which is independent of contexts.
1613   Return the new state if succeeded, otherwise return NULL.  */
1614
1615static re_dfastate_t *
1616internal_function __attribute_warn_unused_result__
1617create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1618		    re_hashval_t hash)
1619{
1620  Idx i;
1621  reg_errcode_t err;
1622  re_dfastate_t *newstate;
1623
1624  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1625  if (BE (newstate == NULL, 0))
1626    return NULL;
1627  err = re_node_set_init_copy (&newstate->nodes, nodes);
1628  if (BE (err != REG_NOERROR, 0))
1629    {
1630      re_free (newstate);
1631      return NULL;
1632    }
1633
1634  newstate->entrance_nodes = &newstate->nodes;
1635  for (i = 0 ; i < nodes->nelem ; i++)
1636    {
1637      re_token_t *node = dfa->nodes + nodes->elems[i];
1638      re_token_type_t type = node->type;
1639      if (type == CHARACTER && !node->constraint)
1640	continue;
1641#ifdef RE_ENABLE_I18N
1642      newstate->accept_mb |= node->accept_mb;
1643#endif /* RE_ENABLE_I18N */
1644
1645      /* If the state has the halt node, the state is a halt state.  */
1646      if (type == END_OF_RE)
1647	newstate->halt = 1;
1648      else if (type == OP_BACK_REF)
1649	newstate->has_backref = 1;
1650      else if (type == ANCHOR || node->constraint)
1651	newstate->has_constraint = 1;
1652    }
1653  err = register_state (dfa, newstate, hash);
1654  if (BE (err != REG_NOERROR, 0))
1655    {
1656      free_state (newstate);
1657      newstate = NULL;
1658    }
1659  return newstate;
1660}
1661
1662/* Create the new state which is depend on the context CONTEXT.
1663   Return the new state if succeeded, otherwise return NULL.  */
1664
1665static re_dfastate_t *
1666internal_function __attribute_warn_unused_result__
1667create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1668		    unsigned int context, re_hashval_t hash)
1669{
1670  Idx i, nctx_nodes = 0;
1671  reg_errcode_t err;
1672  re_dfastate_t *newstate;
1673
1674  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1675  if (BE (newstate == NULL, 0))
1676    return NULL;
1677  err = re_node_set_init_copy (&newstate->nodes, nodes);
1678  if (BE (err != REG_NOERROR, 0))
1679    {
1680      re_free (newstate);
1681      return NULL;
1682    }
1683
1684  newstate->context = context;
1685  newstate->entrance_nodes = &newstate->nodes;
1686
1687  for (i = 0 ; i < nodes->nelem ; i++)
1688    {
1689      re_token_t *node = dfa->nodes + nodes->elems[i];
1690      re_token_type_t type = node->type;
1691      unsigned int constraint = node->constraint;
1692
1693      if (type == CHARACTER && !constraint)
1694	continue;
1695#ifdef RE_ENABLE_I18N
1696      newstate->accept_mb |= node->accept_mb;
1697#endif /* RE_ENABLE_I18N */
1698
1699      /* If the state has the halt node, the state is a halt state.  */
1700      if (type == END_OF_RE)
1701	newstate->halt = 1;
1702      else if (type == OP_BACK_REF)
1703	newstate->has_backref = 1;
1704
1705      if (constraint)
1706	{
1707	  if (newstate->entrance_nodes == &newstate->nodes)
1708	    {
1709	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1710	      if (BE (newstate->entrance_nodes == NULL, 0))
1711		{
1712		  free_state (newstate);
1713		  return NULL;
1714		}
1715	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1716		  != REG_NOERROR)
1717		return NULL;
1718	      nctx_nodes = 0;
1719	      newstate->has_constraint = 1;
1720	    }
1721
1722	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1723	    {
1724	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1725	      ++nctx_nodes;
1726	    }
1727	}
1728    }
1729  err = register_state (dfa, newstate, hash);
1730  if (BE (err != REG_NOERROR, 0))
1731    {
1732      free_state (newstate);
1733      newstate = NULL;
1734    }
1735  return  newstate;
1736}
1737