1/* strmatch.c -- ksh-like extended pattern matching for the shell and filename
2		globbing. */
3
4/* Copyright (C) 1991-2005 Free Software Foundation, Inc.
5
6   This file is part of GNU Bash, the Bourne Again SHell.
7
8   Bash is free software; you can redistribute it and/or modify it under
9   the terms of the GNU General Public License as published by the Free
10   Software Foundation; either version 2, or (at your option) any later
11   version.
12
13   Bash is distributed in the hope that it will be useful, but WITHOUT ANY
14   WARRANTY; without even the implied warranty of MERCHANTABILITY or
15   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16   for more details.
17
18   You should have received a copy of the GNU General Public License along
19   with Bash; see the file COPYING.  If not, write to the Free Software
20   Foundation, 59 Temple Place, Suite 330, Boston, MA 02111 USA. */
21
22#include <config.h>
23
24#include <stdio.h>	/* for debugging */
25
26#include "strmatch.h"
27#include <chartypes.h>
28
29#include "bashansi.h"
30#include "shmbutil.h"
31#include "xmalloc.h"
32
/* First, compile `sm_loop.c' for single-byte characters. */
/* Type parameters for the single-byte instantiation: pattern and string
   characters are `unsigned char', plain strings are `char', and
   character-valued integers are `int'. */
#define CHAR	unsigned char
#define U_CHAR	unsigned char
#define XCHAR	char
#define INT	int
/* L(CS) leaves a character or string literal exactly as written. */
#define L(CS)	CS
/* Value collsym() returns when it finds no matching collating symbol. */
#define INVALID	-1

/* Discard any earlier definitions so the ones below are in effect. */
#undef STREQ
#undef STREQN
/* String equality tests: compare the first characters before paying for
   the strcmp/strncmp call. */
#define STREQ(a, b) ((a)[0] == (b)[0] && strcmp(a, b) == 0)
#define STREQN(a, b, n) ((a)[0] == (b)[0] && strncmp(a, b, n) == 0)
45
46/* We use strcoll(3) for range comparisons in bracket expressions,
47   even though it can have unwanted side effects in locales
48   other than POSIX or US.  For instance, in the de locale, [A-Z] matches
49   all characters. */
50
51#if defined (HAVE_STRCOLL)
/* Compare two single-byte characters for range and equivalence tests.
   Both arguments are reduced to their low eight bits.  Identical
   characters compare equal without consulting the locale; otherwise the
   ordering is whatever strcoll(3) reports for the two one-character
   strings, and if strcoll calls them equal the difference of the
   character codes breaks the tie. */
static int rangecmp (c1, c2)
     int c1, c2;
{
  /* One-character scratch strings handed to strcoll. */
  static char lhs[2] = { ' ', '\0' };
  static char rhs[2] = { ' ', '\0' };
  int order;

  /* Eight bits only.  Period. */
  c1 &= 0xFF;
  c2 &= 0xFF;

  if (c1 != c2)
    {
      lhs[0] = c1;
      rhs[0] = c2;

      order = strcoll (lhs, rhs);
      return (order != 0) ? order : (c1 - c2);
    }

  return (0);
}
74#else /* !HAVE_STRCOLL */
75#  define rangecmp(c1, c2)	((int)(c1) - (int)(c2))
76#endif /* !HAVE_STRCOLL */
77
78#if defined (HAVE_STRCOLL)
/* Return 1 if C1 and C2 compare equal under rangecmp, 0 otherwise. */
static int
collequiv (c1, c2)
     int c1, c2;
{
  if (rangecmp (c1, c2) != 0)
    return 0;
  return 1;
}
85#else
86#  define collequiv(c1, c2)	((c1) == (c2))
87#endif
88
/* Names supplied to `collsyms.h' for this (single-byte) inclusion.
   collsym() below walks a table called `posix_collsyms' whose elements
   are `struct _collsym'.
   NOTE(review): presumably collsyms.h declares the structure and table
   under these macro names -- confirm against that header. */
#define _COLLSYM	_collsym
#define __COLLSYM	__collsym
#define POSIXCOLL	posix_collsyms
#include "collsyms.h"
93
94static int
95collsym (s, len)
96     CHAR *s;
97     int len;
98{
99  register struct _collsym *csp;
100  char *x;
101
102  x = (char *)s;
103  for (csp = posix_collsyms; csp->name; csp++)
104    {
105      if (STREQN(csp->name, x, len) && csp->name[len] == '\0')
106	return (csp->code);
107    }
108  if (len == 1)
109    return s[0];
110  return INVALID;
111}
112
/* unibyte character classification */
/* Fallback isascii, used only when no `isascii' macro is already defined
   and HAVE_ISASCII is not defined: true for values from 0 through octal
   0177 after conversion to unsigned int. */
#if !defined (isascii) && !defined (HAVE_ISASCII)
#  define isascii(c)	((unsigned int)(c) <= 0177)
#endif
117
/* Identifiers for the bracket-expression character classes.  The values
   double as indices into the cclass_name table, so the two lists must be
   kept in the same order.  CC_NO_CLASS (0) means "no such class". */
enum char_class
  {
    CC_NO_CLASS = 0,
    CC_ASCII    = 1,
    CC_ALNUM    = 2,
    CC_ALPHA    = 3,
    CC_BLANK    = 4,
    CC_CNTRL    = 5,
    CC_DIGIT    = 6,
    CC_GRAPH    = 7,
    CC_LOWER    = 8,
    CC_PRINT    = 9,
    CC_PUNCT    = 10,
    CC_SPACE    = 11,
    CC_UPPER    = 12,
    CC_WORD     = 13,
    CC_XDIGIT   = 14
  };
124
/* Character class names, indexed by enum char_class value.  Slot 0 is an
   empty placeholder for CC_NO_CLASS. */
static const char *const cclass_name[] =
  {
    "",		/* CC_NO_CLASS */
    "ascii",
    "alnum",
    "alpha",
    "blank",
    "cntrl",
    "digit",
    "graph",
    "lower",
    "print",
    "punct",
    "space",
    "upper",
    "word",
    "xdigit"
  };

/* Number of entries in cclass_name, including the placeholder slot. */
#define N_CHAR_CLASS (sizeof (cclass_name) / sizeof (cclass_name[0]))
133
134static int
135is_cclass (c, name)
136     int c;
137     const char *name;
138{
139  enum char_class char_class = CC_NO_CLASS;
140  int i, result;
141
142  for (i = 1; i < N_CHAR_CLASS; i++)
143    {
144      if (STREQ (name, cclass_name[i]))
145	{
146	  char_class = (enum char_class)i;
147	  break;
148	}
149    }
150
151  if (char_class == 0)
152    return -1;
153
154  switch (char_class)
155    {
156      case CC_ASCII:
157	result = isascii (c);
158	break;
159      case CC_ALNUM:
160	result = ISALNUM (c);
161	break;
162      case CC_ALPHA:
163	result = ISALPHA (c);
164	break;
165      case CC_BLANK:
166	result = ISBLANK (c);
167	break;
168      case CC_CNTRL:
169	result = ISCNTRL (c);
170	break;
171      case CC_DIGIT:
172	result = ISDIGIT (c);
173	break;
174      case CC_GRAPH:
175	result = ISGRAPH (c);
176	break;
177      case CC_LOWER:
178	result = ISLOWER (c);
179	break;
180      case CC_PRINT:
181	result = ISPRINT (c);
182	break;
183      case CC_PUNCT:
184	result = ISPUNCT (c);
185	break;
186      case CC_SPACE:
187	result = ISSPACE (c);
188	break;
189      case CC_UPPER:
190	result = ISUPPER (c);
191	break;
192      case CC_WORD:
193        result = (ISALNUM (c) || c == '_');
194	break;
195      case CC_XDIGIT:
196	result = ISXDIGIT (c);
197	break;
198      default:
199	result = -1;
200	break;
201    }
202
203  return result;
204}
205
/* Now include `sm_loop.c' for single-byte characters. */
/* The result of FOLD is an `unsigned char' */
/* FOLD lower-cases its argument when FNM_CASEFOLD is set in `flags' and
   otherwise only converts it to unsigned char.  It refers to a variable
   named `flags', so it is usable only where one is in scope. */
# define FOLD(c) ((flags & FNM_CASEFOLD) \
	? TOLOWER ((unsigned char)c) \
	: ((unsigned char)c))

/* Single-byte names substituted into `sm_loop.c'.  xstrmatch() calls
   internal_strmatch directly; the string macros map to the char-based C
   library routines, and RANGECMP, COLLEQUIV, COLLSYM and IS_CCLASS map
   to the static helpers defined earlier in this file.
   NOTE(review): presumably sm_loop.c defines the functions named by FCT,
   GMATCH, PARSE_COLLSYM, BRACKMATCH, PATSCAN, STRCOMPARE and EXTMATCH,
   and #undefs all of these macros before returning -- confirm there. */
#define FCT			internal_strmatch
#define GMATCH			gmatch
#define COLLSYM			collsym
#define PARSE_COLLSYM		parse_collsym
#define BRACKMATCH		brackmatch
#define PATSCAN			patscan
#define STRCOMPARE		strcompare
#define EXTMATCH		extmatch
#define STRCHR(S, C)		strchr((S), (C))
#define STRCOLL(S1, S2)		strcoll((S1), (S2))
#define STRLEN(S)		strlen(S)
#define STRCMP(S1, S2)		strcmp((S1), (S2))
#define RANGECMP(C1, C2)	rangecmp((C1), (C2))
#define COLLEQUIV(C1, C2)	collequiv((C1), (C2))
#define CTYPE_T			enum char_class
#define IS_CCLASS(C, S)		is_cclass((C), (S))
#include "sm_loop.c"
229
230#if HANDLE_MULTIBYTE
231
/* Type parameters for the wide-character instantiation: characters and
   strings are `wchar_t', character-valued integers are `wint_t'.
   NOTE(review): these names were defined for the single-byte pass and
   are not #undef'd in this file -- presumably sm_loop.c does that;
   confirm. */
#  define CHAR		wchar_t
#  define U_CHAR	wint_t
#  define XCHAR		wchar_t
#  define INT		wint_t
/* L(CS) pastes the wide-literal prefix onto a character or string
   literal. */
#  define L(CS)		L##CS
/* Value collwcsym() returns when it finds no matching collating symbol. */
#  define INVALID	WEOF

/* Replace the single-byte string equality tests with wide-character
   versions built on wcscmp/wcsncmp. */
#  undef STREQ
#  undef STREQN
#  define STREQ(s1, s2) ((wcscmp (s1, s2) == 0))
#  define STREQN(a, b, n) ((a)[0] == (b)[0] && wcsncmp(a, b, n) == 0)
243
/* Compare two wide characters for range tests.  Identical characters
   compare equal without consulting the locale; otherwise the result is
   whatever wcscoll(3) reports for the two one-character wide strings. */
static int
rangecmp_wc (c1, c2)
     wint_t c1, c2;
{
  /* One-character scratch strings handed to wcscoll. */
  static wchar_t lhs[2] = { L' ', L'\0' };
  static wchar_t rhs[2] = { L' ', L'\0' };

  if (c1 != c2)
    {
      lhs[0] = c1;
      rhs[0] = c2;
      return (wcscoll (lhs, rhs));
    }

  return 0;
}
259
/* Return 1 if the wide character C is the same as EQUIV, 0 otherwise. */
static int
collequiv_wc (c, equiv)
     wint_t c, equiv;
{
  return (c == equiv);
}
266
/* Helper function for collating symbol. */
/* Names supplied to `collsyms.h' for this (wide-character) inclusion.
   collwcsym() below walks a table called `posix_collwcsyms' whose
   elements are `struct _collwcsym'.
   NOTE(review): presumably collsyms.h declares the structure and table
   under these macro names and #undefs them afterwards -- confirm. */
#  define _COLLSYM	_collwcsym
#  define __COLLSYM	__collwcsym
#  define POSIXCOLL	posix_collwcsyms
#  include "collsyms.h"
272
273static wint_t
274collwcsym (s, len)
275     wchar_t *s;
276     int len;
277{
278  register struct _collwcsym *csp;
279
280  for (csp = posix_collwcsyms; csp->name; csp++)
281    {
282      if (STREQN(csp->name, s, len) && csp->name[len] == L'\0')
283	return (csp->code);
284    }
285  if (len == 1)
286    return s[0];
287  return INVALID;
288}
289
/* Test whether the wide character WC belongs to the character class
   called NAME.

   Returns nonzero if WC is in the class and zero if it is not.  Returns
   -1 if NAME cannot be converted to a multibyte string, if the C library
   does not recognize the class, or if memory for the converted name
   cannot be allocated.

   Two classes get special treatment:
     "ascii" -- when wctype() does not know it, WC is in the class if it
		has a single-byte representation no greater than 0x7F;
     "word"  -- the "alnum" class plus underscore.  */
static int
is_wcclass (wc, name)
     wint_t wc;
     wchar_t *name;
{
  char *mbs;
  const wchar_t *src;
  mbstate_t state;
  size_t mbssize, mbslength;
  wctype_t desc;
  int want_word;

  if ((wctype ("ascii") == (wctype_t)0) && (wcscmp (name, L"ascii") == 0))
    {
      int c;

      if ((c = wctob (wc)) == EOF)
	return 0;
      else
        return (c <= 0x7F);
    }

  want_word = (wcscmp (name, L"word") == 0);
  if (want_word)
    name = L"alnum";

  /* wctype() wants a multibyte name; size the buffer for the worst case
     of MB_CUR_MAX bytes per wide character plus the terminator. */
  mbssize = wcslen (name) * MB_CUR_MAX + 1;
  mbs = (char *) malloc (mbssize);
  if (mbs == 0)
    return -1;		/* out of memory: treat NAME as unrecognized */

  memset (&state, '\0', sizeof (mbstate_t));
  /* wcsrtombs advances (and finally clears) the source pointer, so give
     it a cursor of its own instead of NAME. */
  src = name;
  mbslength = wcsrtombs (mbs, &src, mbssize, &state);

  if (mbslength == (size_t)-1 || mbslength == (size_t)-2)
    {
      free (mbs);
      return -1;
    }
  desc = wctype (mbs);
  free (mbs);

  if (desc == (wctype_t)0)
    return -1;

  if (want_word)
    return (iswctype (wc, desc) || wc == L'_');
  else
    return (iswctype (wc, desc));
}
335
/* Now include `sm_loop.c' for multibyte characters. */
/* FOLD lower-cases an upper-case wide character when FNM_CASEFOLD is set
   in `flags' and leaves anything else untouched.  It refers to a
   variable named `flags', so it is usable only where one is in scope. */
#define FOLD(c) ((flags & FNM_CASEFOLD) && iswupper (c) ? towlower (c) : (c))
/* Wide-character names substituted into `sm_loop.c'.  xstrmatch() calls
   internal_wstrmatch directly; the string macros map to the wcs* C
   library routines, and RANGECMP, COLLEQUIV, COLLSYM and IS_CCLASS map
   to the wide-character helpers defined earlier in this file.
   NOTE(review): presumably sm_loop.c defines the functions named by FCT,
   GMATCH, PARSE_COLLSYM, BRACKMATCH, PATSCAN, STRCOMPARE and EXTMATCH --
   confirm there. */
#define FCT			internal_wstrmatch
#define GMATCH			gmatch_wc
#define COLLSYM			collwcsym
#define PARSE_COLLSYM		parse_collwcsym
#define BRACKMATCH		brackmatch_wc
#define PATSCAN			patscan_wc
#define STRCOMPARE		wscompare
#define EXTMATCH		extmatch_wc
#define STRCHR(S, C)		wcschr((S), (C))
#define STRCOLL(S1, S2)		wcscoll((S1), (S2))
#define STRLEN(S)		wcslen(S)
#define STRCMP(S1, S2)		wcscmp((S1), (S2))
#define RANGECMP(C1, C2)	rangecmp_wc((C1), (C2))
#define COLLEQUIV(C1, C2)	collequiv_wc((C1), (C2))
#define CTYPE_T			enum char_class
#define IS_CCLASS(C, S)		is_wcclass((C), (S))
#include "sm_loop.c"
355
#endif /* HANDLE_MULTIBYTE */
357
/* Match STRING against PATTERN according to FLAGS and return the
   matcher's result.

   When HANDLE_MULTIBYTE is in effect and MB_CUR_MAX is not 1, both
   arguments are converted to wide-character strings with xdupmbstowcs
   and handed to internal_wstrmatch.  If either conversion reports
   (size_t)-1 or (size_t)-2, or in every other configuration, the
   original byte strings are matched with internal_strmatch instead. */
int
xstrmatch (pattern, string, flags)
     char *pattern;
     char *string;
     int flags;
{
#if HANDLE_MULTIBYTE
  wchar_t *wpat, *wstr;
  size_t nconv;
  int result;

  if (MB_CUR_MAX != 1)
    {
      nconv = xdupmbstowcs (&wpat, NULL, pattern);
      if (nconv != (size_t)-1 && nconv != (size_t)-2)
	{
	  nconv = xdupmbstowcs (&wstr, NULL, string);
	  if (nconv != (size_t)-1 && nconv != (size_t)-2)
	    {
	      result = internal_wstrmatch (wpat, wstr, flags);

	      free (wpat);
	      free (wstr);

	      return result;
	    }

	  /* The string could not be converted; release the pattern copy
	     before falling back to the single-byte matcher. */
	  free (wpat);
	}
    }
#endif /* HANDLE_MULTIBYTE */

  return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
}
393