1#
2# Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp
3#
4# Copyright 1997, 1998, 1999 Computing Research Labs,
5# New Mexico State University
6#
7# Permission is hereby granted, free of charge, to any person obtaining a
8# copy of this software and associated documentation files (the "Software"),
9# to deal in the Software without restriction, including without limitation
10# the rights to use, copy, modify, merge, publish, distribute, sublicense,
11# and/or sell copies of the Software, and to permit persons to whom the
12# Software is furnished to do so, subject to the following conditions:
13#
14# The above copyright notice and this permission notice shall be included in
15# all copies or substantial portions of the Software.
16#
17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
21# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
22# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
23# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24#
25
26
27 Unicode and Regular Expressions
28 Version 0.5
29
30This is a simple regular expression package for matching against Unicode text
31in UCS2 form. The implementation of this URE package is a variation on the
32RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark
33Hopkins' algorithm had the virtue of being very simple, so it was used as a
34model.
35
36---------------------------------------------------------------------------
37
38Assumptions:
39
40 o Regular expression and text already normalized.
41
42 o Conversion to lower case assumes a 1-1 mapping.
43
44Definitions:
45
46 Separator - any one of U+2028, U+2029, '\n', '\r'.
47
48Operators:
49 . - match any character.
50 * - match zero or more of the last subexpression.
51 + - match one or more of the last subexpression.
52 ? - match zero or one of the last subexpression.
53 () - subexpression grouping.
54
55 Notes:
56
57 o The "." operator normally does not match separators, but a flag is
58 available for the ure_exec() function that will allow this operator to
59 match a separator.
60
61Literals and Constants:
62
63 c - literal UCS2 character.
64 \x.... - hexadecimal number of up to 4 digits.
65 \X.... - hexadecimal number of up to 4 digits.
66 \u.... - hexadecimal number of up to 4 digits.
67 \U.... - hexadecimal number of up to 4 digits.
68
69Character classes:
70
71 [...] - Character class.
72 [^...] - Negated character class.
73 \pN1,N2,...,Nn - Character properties class.
74 \PN1,N2,...,Nn - Negated character properties class.
75
76 POSIX character classes recognized:
77
78 :alnum:
79 :alpha:
80 :cntrl:
81 :digit:
82 :graph:
83 :lower:
84 :print:
85 :punct:
86 :space:
87 :upper:
88 :xdigit:
89
90 Notes:
91
92 o Character property classes are \p or \P followed by a comma separated
93 list of integers between 1 and 32. These integers are references to
94 the following character properties:
95
96 N Character Property
97 --------------------------
98 1 _URE_NONSPACING
99 2 _URE_COMBINING
100 3 _URE_NUMDIGIT
101 4 _URE_NUMOTHER
102 5 _URE_SPACESEP
103 6 _URE_LINESEP
104 7 _URE_PARASEP
105 8 _URE_CNTRL
106 9 _URE_PUA
107 10 _URE_UPPER
108 11 _URE_LOWER
109 12 _URE_TITLE
110 13 _URE_MODIFIER
111 14 _URE_OTHERLETTER
112 15 _URE_DASHPUNCT
113 16 _URE_OPENPUNCT
114 17 _URE_CLOSEPUNCT
115 18 _URE_OTHERPUNCT
116 19 _URE_MATHSYM
117 20 _URE_CURRENCYSYM
118 21 _URE_OTHERSYM
119 22 _URE_LTR
120 23 _URE_RTL
121 24 _URE_EURONUM
122 25 _URE_EURONUMSEP
123 26 _URE_EURONUMTERM
124 27 _URE_ARABNUM
125 28 _URE_COMMONSEP
126 29 _URE_BLOCKSEP
127 30 _URE_SEGMENTSEP
128 31 _URE_WHITESPACE
129 32 _URE_OTHERNEUT
130
131 o Character classes can contain literals, constants, and character
132 property classes. Example:
133
134 [abc\U10A\p1,3,4]
135
136---------------------------------------------------------------------------
137
138Before using URE
139----------------
140Before URE is used, two functions need to be created: one to check if a
141character matches a set of URE character properties, and one to convert a
142character to lower case.
143
144Stubs for these functions are located in the urestubs.c file.
145
146Using URE
147---------
148
149Sample pseudo-code fragment.
150
151 ure_buffer_t rebuf;
152 ure_dfa_t dfa;
153 ucs2_t *re, *text;
154 unsigned long relen, textlen;
155 unsigned long match_start, match_end;
156
157 /*
158 * Allocate the dynamic storage needed to compile regular expressions.
159 */
160 rebuf = ure_buffer_create();
161
162 for each regular expression in a list {
163 re = next regular expression;
164 relen = length(re);
165
166 /*
167 * Compile the regular expression with the case insensitive flag
168 * turned on.
169 */
170 dfa = ure_compile(re, relen, 1, rebuf);
171
172 /*
173 * Look for the first match in some text. The matching will be done
174 * in a case insensitive manner because the expression was compiled
175 * with the case insensitive flag on.
176 */
177 if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
178 printf("MATCH: %ld %ld\n", match_start, match_end);
179
180 /*
181 * Look for the first match in some text, ignoring non-spacing
182 * characters.
183 */
184 if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
185 &match_start, &match_end))
186 printf("MATCH: %ld %ld\n", match_start, match_end);
187
188 /*
189 * Free the DFA.
190 */
191 ure_free_dfa(dfa);
192 }
193
194 /*
195 * Free the dynamic storage used for compiling the expressions.
196 */
197 ure_free_buffer(rebuf);
198
199---------------------------------------------------------------------------
200
201Mark Leisher <mleisher@crl.nmsu.edu>
20229 March 1997
203
204===========================================================================
205
206CHANGES
207-------
208
209Version: 0.5
210Date : 21 September 1999
211==========================
212 1. Added copyright stuff and put in CVS.
213