#!/usr/bin/env ruby

# Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes
#
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
# DerivedAge.txt and Blocks.txt from unicode.org.
# (http://unicode.org/Public/UNIDATA/) And run following command.
# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
# You can get source file for gperf. After this, simply make ruby.

unless ARGV.size == 1
  $stderr.puts "Usage: #{$0} data_directory"
  exit(1)
end

# Names of the POSIX bracket classes. Their order fixes the first entries of
# the generated CodeRanges table and of the keyword index.
POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII]

# pair_codepoints(codepoints): Collapses an Array of codepoints into an Array
# of inclusive [first, last] ranges. Duplicate codepoints are ignored.
#
# NOTE: the argument is sorted in place. An empty Array yields an empty list
# of ranges.
def pair_codepoints(codepoints)
  # Without this guard, codepoints[1..-1] below would be nil for an empty
  # Array and the method would raise NoMethodError.
  return [] if codepoints.empty?

  # We have a sorted Array of codepoints that we wish to partition into
  # ranges such that the start- and endpoints form an inclusive set of
  # codepoints with property _property_. Note: It is intended that some ranges
  # will begin with the value with which they end, e.g. 0x0020 -> 0x0020

  codepoints.sort!
  last_cp = codepoints.first
  pairs = [[last_cp, nil]]
  codepoints[1..-1].each do |codepoint|
    next if last_cp == codepoint

    # If the current codepoint does not follow directly on from the last
    # codepoint, the last codepoint represents the end of the current range,
    # and the current codepoint represents the start of the next range.
    if last_cp.next != codepoint
      pairs[-1][-1] = last_cp
      pairs << [codepoint, nil]
    end
    last_cp = codepoint
  end

  # The final pair has as its endpoint the last codepoint for this property
  pairs[-1][-1] = codepoints.last
  pairs
end

# parse_unicode_data(file): Reads a UnicodeData.txt-style file (';'-separated
# fields: codepoint, name, general category, ...) and builds a Hash mapping
# property names to Arrays of codepoints.
#
# Returns [gcps, data], where gcps is the sorted list of keys of data minus
# POSIX_NAMES, and data is the Hash itself.
def parse_unicode_data(file)
  last_cp = 0
  data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
          'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
  beg_cp = nil
  IO.foreach(file) do |line|
    fields = line.split(';')
    cp = fields[0].to_i(16)

    case fields[1]
    when /\A<(.*),\s*First>\z/
      # Start of a "<Name, First>" .. "<Name, Last>" range: remember it and
      # wait for the matching Last line.
      beg_cp = cp
      next
    when /\A<(.*),\s*Last>\z/
      cps = (beg_cp..cp).to_a
    else
      beg_cp = cp
      cps = [cp]
    end

    # The Cn category represents unassigned characters. These are not listed in
    # UnicodeData.txt so we must derive them by looking for 'holes' in the range
    # of listed codepoints. We increment the last codepoint seen and compare it
    # with the current codepoint. If the current codepoint is greater than
    # last_cp.next we have found a hole, so we add the missing codepoints to
    # the Cn category.
    data['Cn'].concat((last_cp.next...beg_cp).to_a)

    # Assigned - Defined in unicode.c; interpreted as every character in the
    # Unicode range minus the unassigned characters
    data['Assigned'].concat(cps)

    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0,1]] ||= []).concat(cps)
    last_cp = cp
  end

  # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
  # codepoints to Cn and C
  cn_remainder = (last_cp.next..0x10ffff).to_a
  data['Cn'] += cn_remainder
  data['C'] += data['Cn']

  # Special case for LC (Cased_Letter). LC = Ll + Lt + Lu
  data['LC'] = data['Ll'] + data['Lt'] + data['Lu']

  # Define General Category properties
  gcps = data.keys.sort - POSIX_NAMES

  # Returns General Category Property names and the data
  [gcps, data]
end

# define_posix_props(data): Adds the POSIX bracket classes to data, derived
# from properties that must already be present in it (e.g. 'Alphabetic',
# 'White_Space', 'Punctuation').
def define_posix_props(data)
  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #

  data['Alpha'] = data['Alphabetic']
  data['Upper'] = data['Uppercase']
  data['Lower'] = data['Lowercase']
  data['Punct'] = data['Punctuation']
  data['Digit'] = data['Decimal_Number']
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a
  data['Alnum'] = data['Alpha'] + data['Digit']
  data['Space'] = data['White_Space']
  data['Blank'] = data['Space_Separator'] + [0x0009]
  data['Cntrl'] = data['Cc']
  data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
  data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
                  data['Surrogate'] - data['Unassigned']
  data['Print'] = data['Graph'] + data['Space_Separator']
end

# parse_scripts(data, categories): Reads DerivedCoreProperties.txt,
# Scripts.txt and PropList.txt. For every property found it stores the
# codepoints in data and a human-readable title in categories.
#
# Returns the list of property names that were read, plus 'Unknown'.
def parse_scripts(data, categories)
  files = [
    {:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'},
    {:fn => 'Scripts.txt', :title => 'Script'},
    {:fn => 'PropList.txt', :title => 'Binary Property'}
  ]
  current = nil
  cps = []
  names = {}
  files.each do |file|
    IO.foreach(get_file(file[:fn])) do |line|
      if /^# Total code points: / =~ line
        # A "Total code points" line closes the section of the property
        # currently being collected.
        data[current] = cps
        categories[current] = file[:title]
        (names[file[:title]] ||= []) << current
        cps = []
      elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
        # "XXXX ; Name" or "XXXX..YYYY ; Name"
        current = $3
        $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
      end
    end
  end
  # All code points not explicitly listed for Script
  # have the value Unknown (Zzzz).
  data['Unknown'] = (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten
  categories['Unknown'] = 'Script'
  names.values.flatten << 'Unknown'
end

# parse_aliases(data): Reads PropertyAliases.txt and the 'gc' and 'sc' lines
# of PropertyValueAliases.txt. Each alias is made to point at the same
# codepoint Array in data as the name it stands for.
#
# Returns a Hash mapping normalized alias names to normalized target names.
def parse_aliases(data)
  kv = {}
  IO.foreach(get_file('PropertyAliases.txt')) do |line|
    next unless /^(\w+)\s*; (\w+)/ =~ line
    data[$1] = data[$2]
    kv[normalize_propname($1)] = normalize_propname($2)
  end
  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
    # The fourth field is optional; only register it when present so that no
    # entry is stored under a nil key.
    if $1 == 'gc'
      # gc lines: the third (and fourth) field alias the second one.
      data[$3] = data[$2]
      data[$4] = data[$2] if $4
      kv[normalize_propname($3)] = normalize_propname($2)
      kv[normalize_propname($4)] = normalize_propname($2) if $4
    else
      # sc lines: the second (and fourth) field alias the third one.
      data[$2] = data[$3]
      data[$4] = data[$3] if $4
      kv[normalize_propname($2)] = normalize_propname($3)
      kv[normalize_propname($4)] = normalize_propname($3) if $4
    end
  end
  kv
end

# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version
# never involves any additions to the character repertoire." Versions
# in DerivedAge.txt should always be /\d+\.\d+/
#
# parse_age(data): Reads DerivedAge.txt, stores one cumulative codepoint
# Array per version in data (under its Age_X_Y constant name) and prints the
# matching constant via make_const.
#
# Returns the list of version strings in file order.
def parse_age(data)
  current = nil
  last_constname = nil
  cps = []
  ages = []
  IO.foreach(get_file('DerivedAge.txt')) do |line|
    if /^# Total code points: / =~ line
      constname = constantize_agename(current)
      # each version matches all previous versions
      cps.concat(data[last_constname]) if last_constname
      data[constname] = cps
      make_const(constname, cps, "Derived Age #{current}")
      ages << current
      last_constname = constname
      cps = []
    elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
      # "XXXX ; N.M" or "XXXX..YYYY ; N.M"
      current = $3
      $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
    end
  end
  ages
end

# parse_block(data): Reads Blocks.txt, stores each block's codepoints in data
# under its In_* constant name and prints the matching constant via
# make_const. A final In_No_Block constant covers everything else.
#
# Returns the list of block constant names, ending with In_No_Block.
def parse_block(data)
  blocks = []
  IO.foreach(get_file('Blocks.txt')) do |line|
    if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
      cps = ($1.to_i(16)..$2.to_i(16)).to_a
      constname = constantize_blockname($3)
      data[constname] = cps
      make_const(constname, cps, "Block")
      blocks << constname
    end
  end

  # All code points not belonging to any of the named blocks
  # have the value No_Block.
  no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten
  constname = constantize_blockname("No_Block")
  make_const(constname, no_block, "Block")
  blocks << constname
end

# shim for Ruby 1.8
unless {}.respond_to?(:key)
  class Hash
    alias key index
  end
end

# Maps the name of every property already emitted as a full table to its
# codepoint Array, so that identical data is emitted as a #define alias.
$const_cache = {}

# make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for
# the group
def make_const(prop, data, name)
  puts "\n/* '#{prop}': #{name} */"
  if (origprop = $const_cache.key(data))
    # Same codepoints as an earlier property: just alias its table.
    puts "#define CR_#{prop} CR_#{origprop}"
  else
    $const_cache[prop] = data
    pairs = pair_codepoints(data)
    puts "static const OnigCodePoint CR_#{prop}[] = {"
    # The first element of the constant is the number of pairs of codepoints
    puts "\t#{pairs.size},"
    pairs.each do |pair|
      # Zero is spelled out because the '#' flag adds no 0x prefix to 0.
      pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
      puts "\t#{pair.first}, #{pair.last},"
    end
    puts "}; /* CR_#{prop} */"
  end
end

# Returns a lowercased copy of name with '-', ' ' and '_' removed.
def normalize_propname(name)
  name = name.downcase
  name.delete!('- _')
  name
end

# Turns a version string such as "1.1" into the constant name "Age_1_1".
def constantize_agename(name)
  "Age_#{name.sub(/\./, '_')}"
end

# Turns a block name into "In_<name>", with every non-word character
# replaced by '_'.
def constantize_blockname(name)
  "In_#{name.gsub(/\W/, '_')}"
end

# Returns the path of a data file inside the directory given on the command
# line.
def get_file(name)
  File.join(ARGV[0], name)
end


# Write Data
puts '%{'
puts '#define long size_t'
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
categories = {}
props.concat parse_scripts(data, categories)
aliases = parse_aliases(data)
define_posix_props(data)
POSIX_NAMES.each do |name|
  make_const(name, data[name], "[[:#{name}:]]")
end
print "\n#ifdef USE_UNICODE_PROPERTIES"
props.each do |name|
  category = categories[name] ||
    case name.size
    when 1 then 'Major Category'
    when 2 then 'General Category'
    else '-'
    end
  make_const(name, data[name], category)
end
ages = parse_age(data)
blocks = parse_block(data)
puts '#endif /* USE_UNICODE_PROPERTIES */'
puts(<<'__HEREDOC')

static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC
POSIX_NAMES.each { |name| puts " CR_#{name}," }
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each { |name| puts " CR_#{name}," }
ages.each { |name| puts " CR_#{constantize_agename(name)}," }
blocks.each { |name| puts " CR_#{name}," }

puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
};
struct uniname2ctype_struct {
  int name, ctype;
};

static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__HEREDOC

# Keyword section: one "name, index" line per property. The index is the
# position of the property's table in CodeRanges, so the iteration order here
# must match the order used above.
i = -1
name_to_index = {}
POSIX_NAMES.each do |name|
  i += 1
  # NEWLINE occupies a slot in CodeRanges but gets no keyword entry.
  next if name == 'NEWLINE'
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each do |name|
  i += 1
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
# Aliases reuse the index of their target; aliases that collide with an
# existing name, or whose target is not indexed, are skipped.
aliases.each_pair do |k, v|
  next if name_to_index[k]
  index = name_to_index[v]
  next unless index
  puts "%-40s %3d" % [k + ',', index]
end
ages.each do |name|
  i += 1
  name = "age=#{name}"
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
blocks.each do |name|
  i += 1
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
  const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
  if (p) return p->ctype;
  return -1;
}
__HEREDOC