1#
2# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $
3#
4# Copyright 1997, 1998, 1999 Computing Research Labs,
5# New Mexico State University
6#
7# Permission is hereby granted, free of charge, to any person obtaining a
8# copy of this software and associated documentation files (the "Software"),
9# to deal in the Software without restriction, including without limitation
10# the rights to use, copy, modify, merge, publish, distribute, sublicense,
11# and/or sell copies of the Software, and to permit persons to whom the
12# Software is furnished to do so, subject to the following conditions:
13#
14# The above copyright notice and this permission notice shall be included in
15# all copies or substantial portions of the Software.
16#
17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
21# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
22# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
23# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24#
25
26
27                       Unicode and Regular Expressions
28                                 Version 0.5
29
30This is a simple regular expression package for matching against Unicode text
31in UCS2 form.  The implementation of this URE package is a variation on the
32RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu).  Mark
33Hopkins' algorithm had the virtue of being very simple, so it was used as a
34model.
35
36---------------------------------------------------------------------------
37
38Assumptions:
39
40  o  Regular expression and text already normalized.
41
42  o  Conversion to lower case assumes a 1-1 mapping.
43
44Definitions:
45
46  Separator - any one of U+2028, U+2029, '\n', '\r'.
47
48Operators:
49  .   - match any character.
50  *   - match zero or more of the last subexpression.
51  +   - match one or more of the last subexpression.
52  ?   - match zero or one of the last subexpression.
53  ()  - subexpression grouping.
54
55  Notes:
56
57    o  The "." operator normally does not match separators, but a flag is
58       available for the ure_exec() function that will allow this operator to
59       match a separator.
60
61Literals and Constants:
62
63  c       - literal UCS2 character.
64  \x....  - hexadecimal number of up to 4 digits.
65  \X....  - hexadecimal number of up to 4 digits.
66  \u....  - hexadecimal number of up to 4 digits.
67  \U....  - hexadecimal number of up to 4 digits.
68
69Character classes:
70
71  [...]           - Character class.
72  [^...]          - Negated character class.
73  \pN1,N2,...,Nn  - Character properties class.
74  \PN1,N2,...,Nn  - Negated character properties class.
75
76  POSIX character classes recognized:
77
78    :alnum:
79    :alpha:
80    :cntrl:
81    :digit:
82    :graph:
83    :lower:
84    :print:
85    :punct:
86    :space:
87    :upper:
88    :xdigit:
89
90  Notes:
91
92    o  Character property classes are \p or \P followed by a comma-separated
93       list of integers between 1 and 32.  These integers are references to
94       the following character properties:
95
96        N	Character Property
97        --------------------------
98        1	_URE_NONSPACING
99        2	_URE_COMBINING
100        3	_URE_NUMDIGIT
101        4	_URE_NUMOTHER
102        5	_URE_SPACESEP
103        6	_URE_LINESEP
104        7	_URE_PARASEP
105        8	_URE_CNTRL
106        9	_URE_PUA
107        10	_URE_UPPER
108        11	_URE_LOWER
109        12	_URE_TITLE
110        13	_URE_MODIFIER
111        14	_URE_OTHERLETTER
112        15	_URE_DASHPUNCT
113        16	_URE_OPENPUNCT
114        17	_URE_CLOSEPUNCT
115        18	_URE_OTHERPUNCT
116        19	_URE_MATHSYM
117        20	_URE_CURRENCYSYM
118        21	_URE_OTHERSYM
119        22	_URE_LTR
120        23	_URE_RTL
121        24	_URE_EURONUM
122        25	_URE_EURONUMSEP
123        26	_URE_EURONUMTERM
124        27	_URE_ARABNUM
125        28	_URE_COMMONSEP
126        29	_URE_BLOCKSEP
127        30	_URE_SEGMENTSEP
128        31	_URE_WHITESPACE
129        32	_URE_OTHERNEUT
130
131    o  Character classes can contain literals, constants, and character
132       property classes. Example:
133
134       [abc\U10A\p1,3,4]
135
136---------------------------------------------------------------------------
137
138Before using URE
139----------------
140Before URE is used, two functions need to be created: one to check if a
141character matches a set of URE character properties, and one to convert a
142character to lower case.
143
144Stubs for these functions are located in the urestubs.c file.
145
146Using URE
147---------
148
149Sample pseudo-code fragment.
150
151  ure_buffer_t rebuf;
152  ure_dfa_t dfa;
153  ucs2_t *re, *text;
154  unsigned long relen, textlen;
155  unsigned long match_start, match_end;
156
157  /*
158   * Allocate the dynamic storage needed to compile regular expressions.
159   */
160  rebuf = ure_buffer_create();
161
162  for each regular expression in a list {
163      re = next regular expression;
164      relen = length(re);
165
166      /*
167       * Compile the regular expression with the case insensitive flag
168       * turned on.
169       */
170      dfa = ure_compile(re, relen, 1, rebuf);
171
172      /*
173       * Look for the first match in some text.  The matching will be done
174       * in a case insensitive manner because the expression was compiled
175       * with the case insensitive flag on.
176       */
177      if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
178        printf("MATCH: %ld %ld\n", match_start, match_end);
179
180      /*
181       * Look for the first match in some text, ignoring non-spacing
182       * characters.
183       */
184      if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
185                   &match_start, &match_end))
186        printf("MATCH: %ld %ld\n", match_start, match_end);
187
188      /*
189       * Free the DFA.
190       */
191      ure_free_dfa(dfa);
192  }
193
194  /*
195   * Free the dynamic storage used for compiling the expressions.
196   */
197  ure_free_buffer(rebuf);
198
199---------------------------------------------------------------------------
200
201Mark Leisher <mleisher@crl.nmsu.edu>
20229 March 1997
203
204===========================================================================
205
206CHANGES
207-------
208
209Version: 0.5
210Date   : 21 September 1999
211==========================
212  1. Added copyright stuff and put in CVS.
213