1# 2# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $ 3# 4# Copyright 1997, 1998, 1999 Computing Research Labs, 5# New Mexico State University 6# 7# Permission is hereby granted, free of charge, to any person obtaining a 8# copy of this software and associated documentation files (the "Software"), 9# to deal in the Software without restriction, including without limitation 10# the rights to use, copy, modify, merge, publish, distribute, sublicense, 11# and/or sell copies of the Software, and to permit persons to whom the 12# Software is furnished to do so, subject to the following conditions: 13# 14# The above copyright notice and this permission notice shall be included in 15# all copies or substantial portions of the Software. 16# 17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 21# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 22# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 23# THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24# 25 26 27 Unicode and Regular Expressions 28 Version 0.5 29 30This is a simple regular expression package for matching against Unicode text 31in UCS2 form. The implementation of this URE package is a variation on the 32RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark 33Hopkins' algorithm had the virtue of being very simple, so it was used as a 34model. 35 36--------------------------------------------------------------------------- 37 38Assumptions: 39 40 o Regular expression and text already normalized. 41 42 o Conversion to lower case assumes a 1-1 mapping. 43 44Definitions: 45 46 Separator - any one of U+2028, U+2029, '\n', '\r'. 47 48Operators: 49 . - match any character. 
50 * - match zero or more of the last subexpression. 51 + - match one or more of the last subexpression. 52 ? - match zero or one of the last subexpression. 53 () - subexpression grouping. 54 55 Notes: 56 57 o The "." operator normally does not match separators, but a flag is 58 available for the ure_exec() function that will allow this operator to 59 match a separator. 60 61Literals and Constants: 62 63 c - literal UCS2 character. 64 \x.... - hexadecimal number of up to 4 digits. 65 \X.... - hexadecimal number of up to 4 digits. 66 \u.... - hexadecimal number of up to 4 digits. 67 \U.... - hexadecimal number of up to 4 digits. 68 69Character classes: 70 71 [...] - Character class. 72 [^...] - Negated character class. 73 \pN1,N2,...,Nn - Character properties class. 74 \PN1,N2,...,Nn - Negated character properties class. 75 76 POSIX character classes recognized: 77 78 :alnum: 79 :alpha: 80 :cntrl: 81 :digit: 82 :graph: 83 :lower: 84 :print: 85 :punct: 86 :space: 87 :upper: 88 :xdigit: 89 90 Notes: 91 92 o Character property classes are \p or \P followed by a comma separated 93 list of integers between 1 and 32. 
These integers are references to 94 the following character properties: 95 96 N Character Property 97 -------------------------- 98 1 _URE_NONSPACING 99 2 _URE_COMBINING 100 3 _URE_NUMDIGIT 101 4 _URE_NUMOTHER 102 5 _URE_SPACESEP 103 6 _URE_LINESEP 104 7 _URE_PARASEP 105 8 _URE_CNTRL 106 9 _URE_PUA 107 10 _URE_UPPER 108 11 _URE_LOWER 109 12 _URE_TITLE 110 13 _URE_MODIFIER 111 14 _URE_OTHERLETTER 112 15 _URE_DASHPUNCT 113 16 _URE_OPENPUNCT 114 17 _URE_CLOSEPUNCT 115 18 _URE_OTHERPUNCT 116 19 _URE_MATHSYM 117 20 _URE_CURRENCYSYM 118 21 _URE_OTHERSYM 119 22 _URE_LTR 120 23 _URE_RTL 121 24 _URE_EURONUM 122 25 _URE_EURONUMSEP 123 26 _URE_EURONUMTERM 124 27 _URE_ARABNUM 125 28 _URE_COMMONSEP 126 29 _URE_BLOCKSEP 127 30 _URE_SEGMENTSEP 128 31 _URE_WHITESPACE 129 32 _URE_OTHERNEUT 130 131 o Character classes can contain literals, constants, and character 132 property classes. Example: 133 134 [abc\U10A\p1,3,4] 135 136--------------------------------------------------------------------------- 137 138Before using URE 139---------------- 140Before URE is used, two functions need to be created: one to check if a 141character matches a set of URE character properties, and one to convert a 142character to lower case. 143 144Stubs for these functions are located in the urestubs.c file. 145 146Using URE 147--------- 148 149Sample pseudo-code fragment. 150 151 ure_buffer_t rebuf; 152 ure_dfa_t dfa; 153 ucs2_t *re, *text; 154 unsigned long relen, textlen; 155 unsigned long match_start, match_end; 156 157 /* 158 * Allocate the dynamic storage needed to compile regular expressions. 159 */ 160 rebuf = ure_buffer_create(); 161 162 for each regular expression in a list { 163 re = next regular expression; 164 relen = length(re); 165 166 /* 167 * Compile the regular expression with the case insensitive flag 168 * turned on. 169 */ 170 dfa = ure_compile(re, relen, 1, rebuf); 171 172 /* 173 * Look for the first match in some text. 
The matching will be done 174 * in a case insensitive manner because the expression was compiled 175 * with the case insensitive flag on. 176 */ 177 if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end)) 178 printf("MATCH: %ld %ld\n", match_start, match_end); 179 180 /* 181 * Look for the first match in some text, ignoring non-spacing 182 * characters. 183 */ 184 if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen, 185 &match_start, &match_end)) 186 printf("MATCH: %ld %ld\n", match_start, match_end); 187 188 /* 189 * Free the DFA. 190 */ 191 ure_free_dfa(dfa); 192 } 193 194 /* 195 * Free the dynamic storage used for compiling the expressions. 196 */ 197 ure_free_buffer(rebuf); 198 199--------------------------------------------------------------------------- 200 201Mark Leisher <mleisher@crl.nmsu.edu> 20229 March 1997 203 204=========================================================================== 205 206CHANGES 207------- 208 209Version: 0.5 210Date : 21 September 1999 211========================== 212 1. Added copyright stuff and put in CVS. 213