1#!/usr/bin/env ruby
2
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes.
#
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
# DerivedAge.txt and Blocks.txt from unicode.org
# (http://unicode.org/Public/UNIDATA/), then run the following command:
#   ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
# This gives you the source file for gperf.  After this, simply make ruby.
12
# The script takes exactly one argument: the directory that holds the Unicode
# data files. Anything else prints a usage line to stderr and exits with
# status 1.
if ARGV.size != 1
  $stderr.puts "Usage: #{$0} data_directory"
  exit(1)
end
17
# Names of the POSIX bracket classes. The order matters: the script iterates
# this list both when emitting the CodeRanges table and when numbering the
# name -> index entries. Frozen so the shared list cannot be modified by
# accident.
POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII].freeze
19
# Partitions a list of codepoints into inclusive [first, last] ranges.
#
# codepoints:: Array of Integer codepoints. It is sorted IN PLACE (make_const
#              stores the very same array in its cache before calling this
#              method, so the mutation is kept). Duplicates are tolerated.
#
# Returns an Array of two-element Arrays, one per run of consecutive
# codepoints. It is intended that a run consisting of a single codepoint
# yields a pair that begins with the value with which it ends,
# e.g. [0x0020, 0x0020]. An empty input yields an empty Array.
def pair_codepoints(codepoints)
  codepoints.sort!

  # An empty list contains no ranges at all.
  return [] if codepoints.empty?

  range_start = last_cp = codepoints.first
  pairs = []
  codepoints.each do |codepoint|
    # Skips duplicates, and the first element, which seeded last_cp.
    next if codepoint == last_cp

    # If the current codepoint does not follow directly on from the last
    # codepoint, the last codepoint closes the current range and the current
    # codepoint opens the next one.
    unless codepoint == last_cp.next
      pairs << [range_start, last_cp]
      range_start = codepoint
    end
    last_cp = codepoint
  end

  # The final range ends at the last codepoint seen.
  pairs << [range_start, last_cp]
end
47
# Parses UnicodeData.txt and collects the codepoints of every General
# Category and Major Category.
#
# file:: path of UnicodeData.txt
#
# Returns a two-element Array [gcps, data]:
# gcps:: sorted names of all keys of +data+ except those in POSIX_NAMES
# data:: Hash mapping a name ('Any', 'Assigned', 'ASCII', 'NEWLINE', 'Cn',
#        'LC', each General Category such as 'Lu' and each Major Category
#        such as 'L') to an Array of codepoints
#
# Raises if the file contains no codepoint of Major Category 'C' or lacks one
# of 'Ll', 'Lt', 'Lu', because those arrays are combined unconditionally.
def parse_unicode_data(file)
  # Start just below U+0000 so that codepoint 0 is itself reported as a hole
  # when the file does not list it.
  last_cp = -1
  data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
    'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
  beg_cp = nil
  # File.foreach (not IO.foreach) so that a path starting with '|' is always
  # treated as a file name and never as a command to run.
  File.foreach(file) do |line|
    fields = line.split(';')
    cp = fields[0].to_i(16)

    case fields[1]
    when /\A<(.*),\s*First>\z/
      # First half of a "<Name, First>" / "<Name, Last>" pair: only remember
      # where the range starts.
      beg_cp = cp
      next
    when /\A<(.*),\s*Last>\z/
      cps = (beg_cp..cp).to_a
    else
      beg_cp = cp
      cps = [cp]
    end

    # The Cn category represents unassigned characters. These are not listed in
    # UnicodeData.txt so we must derive them by looking for 'holes' in the range
    # of listed codepoints. If the first codepoint of the current entry is
    # greater than last_cp.next we have found a hole, so we add the missing
    # codepoints to the Cn category.
    data['Cn'].concat((last_cp.next...beg_cp).to_a)

    # Assigned - Defined in unicode.c; interpreted as every character in the
    # Unicode range minus the unassigned characters
    data['Assigned'].concat(cps)

    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0,1]] ||= []).concat(cps)
    last_cp = cp
  end

  # Everything after the last listed codepoint up to 0x10ffff is unassigned
  # too. Cn was not added to the Major Category C inside the loop, so the
  # whole of Cn is appended to C here.
  cn_remainder = (last_cp.next..0x10ffff).to_a
  data['Cn'] += cn_remainder
  data['C'] += data['Cn']

  # Special case for LC (Cased_Letter). LC = Ll + Lt + Lu
  data['LC'] = data['Ll'] + data['Lt'] + data['Lu']

  # Define General Category properties
  gcps = data.keys.sort - POSIX_NAMES

  # Returns General Category Property names and the data
  [gcps, data]
end
104
# Derives the POSIX bracket classes (e.g. [[:alpha:]]) from property arrays
# that are already present in +data+ and stores them under their POSIX names.
#
# data:: Hash of name => Array of codepoints; it is modified in place. It must
#        already contain the keys read below ('Alphabetic', 'Uppercase',
#        'Lowercase', 'Punctuation', 'Decimal_Number', 'White_Space',
#        'Space_Separator', 'Cc', 'Mark', 'Connector_Punctuation', 'Any',
#        'Surrogate', 'Unassigned').
#
# Returns the Array stored as data['Print'] (the value of the last
# assignment).
def define_posix_props(data)
  # Classes that simply share the array of an existing property.
  [
    %w[Alpha Alphabetic],
    %w[Upper Uppercase],
    %w[Lower Lowercase],
    %w[Punct Punctuation],
    %w[Digit Decimal_Number]
  ].each { |posix, property| data[posix] = data[property] }

  # 0-9, A-F and a-f
  hex_digit_ranges = [0x0030..0x0039, 0x0041..0x0046, 0x0061..0x0066]
  data['XDigit'] = hex_digit_ranges.map { |range| range.to_a }.flatten

  data['Alnum'] = data['Alpha'] + data['Digit']
  data['Space'] = data['White_Space']
  # Space separators plus the horizontal tab
  data['Blank'] = data['Space_Separator'] + [0x0009]
  data['Cntrl'] = data['Cc']

  word_parts = %w[Alpha Mark Digit Connector_Punctuation]
  data['Word'] = word_parts.inject([]) { |word, key| word + data[key] }

  # Graph is everything except spaces, controls, surrogates and unassigned
  # codepoints.
  non_graph = %w[Space Cntrl Surrogate Unassigned]
  data['Graph'] = non_graph.inject(data['Any']) { |rest, key| rest - data[key] }

  data['Print'] = data['Graph'] + data['Space_Separator']
end
125
# Reads DerivedCoreProperties.txt, Scripts.txt and PropList.txt and stores the
# codepoints of every property / script found in them.
#
# data::       Hash of name => Array of codepoints; one entry is added per
#              property, plus 'Unknown' for all codepoints without a script
# categories:: Hash that receives name => human-readable category title
#              ('Derived Property', 'Script' or 'Binary Property')
#
# A property's codepoints are collected line by line and committed when the
# "# Total code points:" trailer line of that property is reached.
#
# Returns an Array of all property names that were committed, followed by
# 'Unknown'.
def parse_scripts(data, categories)
  files = [
    {:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'},
    {:fn => 'Scripts.txt', :title => 'Script'},
    {:fn => 'PropList.txt', :title => 'Binary Property'}
  ]
  current = nil
  cps = []
  names = {}
  files.each do |file|
    # File.foreach (not IO.foreach) so that a path starting with '|' is always
    # treated as a file name and never as a command to run.
    File.foreach(get_file(file[:fn])) do |line|
      if /^# Total code points: / =~ line
        data[current] = cps
        categories[current] = file[:title]
        (names[file[:title]] ||= []) << current
        cps = []
      # "XXXX ; Name" or "XXXX..YYYY ; Name". The range separator is matched
      # as two literal dots; unescaped dots would accept any two characters
      # and could misread "XXXX ;Hex; ..." as a range.
      elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
        current = $3
        first_cp = $1.to_i(16)
        last_cp = $2 ? $2.to_i(16) : first_cp
        cps.concat((first_cp..last_cp).to_a)
      end
    end
  end
  #  All code points not explicitly listed for Script
  #  have the value Unknown (Zzzz).
  data['Unknown'] =  (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten
  categories['Unknown'] = 'Script'
  names.values.flatten << 'Unknown'
end
154
# Reads PropertyAliases.txt and PropertyValueAliases.txt and registers the
# aliases found there.
#
# data:: Hash of name => Array of codepoints. For every alias an additional
#        key is added that refers to the same value as the aliased name.
#
# In PropertyAliases.txt the first field becomes an alias of the second. In
# PropertyValueAliases.txt only 'gc' and 'sc' lines are used: for 'gc' the
# third (and optional fourth) field become aliases of the second field, for
# 'sc' the second (and optional fourth) field become aliases of the third.
#
# Returns a Hash mapping each normalized alias name to the normalized name it
# stands for.
def parse_aliases(data)
  kv = {}
  # File.foreach (not IO.foreach) so that a path starting with '|' is always
  # treated as a file name and never as a command to run.
  File.foreach(get_file('PropertyAliases.txt')) do |line|
    next unless /^(\w+)\s*; (\w+)/ =~ line
    data[$1] = data[$2]
    kv[normalize_propname($1)] = normalize_propname($2)
  end
  File.foreach(get_file('PropertyValueAliases.txt')) do |line|
    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
    # Copy the captures before calling anything else.
    type, second, third, extra = $1, $2, $3, $4
    target, alias_name = type == 'gc' ? [second, third] : [third, second]

    # The fourth field is optional; compact drops it when absent so that no
    # nil key is ever written into data or kv.
    [alias_name, extra].compact.each do |name|
      data[name] = data[target]
      kv[normalize_propname(name)] = normalize_propname(target)
    end
  end
  kv
end
178
# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version
# never involves any additions to the character repertoire." Versions
# in DerivedAge.txt should always be /\d+\.\d+/
#
# Reads DerivedAge.txt and, for every version section, stores and prints the
# codepoints that exist in that version.
#
# data:: Hash of name => Array of codepoints; one entry per version is added
#        under the name produced by constantize_agename
#
# A section is committed when its "# Total code points:" trailer line is
# reached. Each version also includes the codepoints of the previously
# committed version, so the sets are cumulative. make_const is called once per
# version, so this method prints as a side effect.
#
# Returns the Array of version strings (e.g. "1.1") in file order.
def parse_age(data)
  current = nil
  last_constname = nil
  cps = []
  ages = []
  # File.foreach (not IO.foreach) so that a path starting with '|' is always
  # treated as a file name and never as a command to run.
  File.foreach(get_file('DerivedAge.txt')) do |line|
    if /^# Total code points: / =~ line
      constname = constantize_agename(current)
      # each version matches all previous versions
      cps.concat(data[last_constname]) if last_constname
      data[constname] = cps
      make_const(constname, cps, "Derived Age #{current}")
      ages << current
      last_constname = constname
      cps = []
    # "XXXX ; N.N" or "XXXX..YYYY ; N.N". The range separator is matched as
    # two literal dots; unescaped dots would accept any two characters.
    elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
      current = $3
      first_cp = $1.to_i(16)
      last_cp = $2 ? $2.to_i(16) : first_cp
      cps.concat((first_cp..last_cp).to_a)
    end
  end
  ages
end
204
# Reads Blocks.txt and, for every block, stores and prints its codepoints.
#
# data:: Hash of name => Array of codepoints; one entry per named block is
#        added under the name produced by constantize_blockname
#
# make_const is called once per block and once more for the synthetic
# "No_Block" (all codepoints outside every named block), so this method prints
# as a side effect. The No_Block codepoints are not stored in +data+.
#
# Returns the Array of block constant names in file order, followed by the
# constant name of No_Block.
def parse_block(data)
  blocks = []
  # File.foreach (not IO.foreach) so that a path starting with '|' is always
  # treated as a file name and never as a command to run.
  File.foreach(get_file('Blocks.txt')) do |line|
    next unless /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
    first_cp, last_cp, block_name = $1.to_i(16), $2.to_i(16), $3

    cps = (first_cp..last_cp).to_a
    # The name capture runs to the end of the line; strip it so a trailing
    # carriage return or blank does not end up as an extra '_' in the
    # constant name.
    constname = constantize_blockname(block_name.strip)
    data[constname] = cps
    make_const(constname, cps, "Block")
    blocks << constname
  end

  # All code points not belonging to any of the named blocks
  # have the value No_Block.
  no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten
  constname = constantize_blockname("No_Block")
  make_const(constname, no_block, "Block")
  blocks << constname
end
227
# Shim for Ruby 1.8: when Hash has no #key method, make it available as an
# alias of Hash#index.
unless Hash.method_defined?(:key)
  Hash.send(:alias_method, :key, :index)
end
234
# Codepoint arrays that have already been printed, keyed by property name.
$const_cache = {}

# make_const(prop, data, name): Prints a 'static const' structure for a
# given property.
#
# prop:: property name; used as the suffix of the C identifier CR_<prop>
# data:: Array of codepoints belonging to the property (handed on to
#        pair_codepoints, which turns it into ranges)
# name:: human-friendly description, printed in the leading comment
#
# If an equal codepoint array was printed before, only a #define that points
# at the earlier constant is emitted. Returns nil.
def make_const(prop, data, name)
  puts "\n/* '#{prop}': #{name} */"

  known_prop = $const_cache.key(data)
  if known_prop
    puts "#define CR_#{prop} CR_#{known_prop}"
    return
  end

  $const_cache[prop] = data
  ranges = pair_codepoints(data)
  # Zero is special-cased because the '#' flag adds no 0x prefix to it.
  as_hex = lambda { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }

  puts "static const OnigCodePoint CR_#{prop}[] = {"
  # The first element of the constant is the number of pairs of codepoints
  puts "\t#{ranges.size},"
  ranges.each do |range|
    puts "\t#{as_hex.call(range.first)}, #{as_hex.call(range.last)},"
  end
  puts "}; /* CR_#{prop} */"
end
256
# Returns a new String: +name+ in lower case with every hyphen, space and
# underscore removed. The argument itself is left untouched.
def normalize_propname(name)
  name.downcase.delete('- _')
end
262
# Builds the constant name for a version string: "Age_" followed by +name+
# with its first dot replaced by an underscore, e.g. "6.0" -> "Age_6_0".
def constantize_agename(name)
  'Age_' + name.sub('.', '_')
end
266
# Builds the constant name for a block: "In_" followed by +name+ with every
# non-word character (anything /\W/ matches) replaced by an underscore,
# e.g. "Latin-1 Supplement" -> "In_Latin_1_Supplement".
def constantize_blockname(name)
  identifier = name.gsub(/\W/, '_')
  "In_#{identifier}"
end
270
# Returns the path of the data file +name+ inside the data directory given as
# the first command-line argument.
def get_file(name)
  data_dir = ARGV.first
  File.join(data_dir, name)
end
274
275
# Write Data
#
# Everything below prints the generated file to stdout (the usage notes at the
# top of this file redirect it into a .kwd file for gperf). The output has
# three parts, separated by the '%{' ... '%}' and '%%' marker lines printed
# below:
#   1. C declarations: one CR_<name> codepoint table per property plus the
#      CodeRanges array that lists them all,
#   2. the keyword list: "normalized name, index into CodeRanges",
#   3. the C function uniname2ctype.
puts '%{'
# NOTE(review): purpose of this define is not visible from this file --
# presumably it adjusts the types used in the generated code; confirm.
puts '#define long size_t'

# Collect all codepoint data first; make_const prints as it goes, so the order
# of the calls below is the order of the tables in the output.
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
categories = {}
props.concat parse_scripts(data, categories)
aliases = parse_aliases(data)
define_posix_props(data)

# Tables for the POSIX bracket classes are printed unconditionally ...
POSIX_NAMES.each do |name|
  make_const(name, data[name], "[[:#{name}:]]")
end
# ... all remaining tables only when USE_UNICODE_PROPERTIES is defined.
# No trailing newline here: make_const starts its output with "\n".
print "\n#ifdef USE_UNICODE_PROPERTIES"
props.each do |name|
  # Names that parse_scripts did not categorize are described by their
  # length: one letter is a Major Category, two letters a General Category.
  category = categories[name] ||
    case name.size
    when 1 then 'Major Category'
    when 2 then 'General Category'
    else        '-'
    end
  make_const(name, data[name], category)
end
# parse_age and parse_block print their tables themselves (via make_const).
ages = parse_age(data)
blocks = parse_block(data)
puts '#endif /* USE_UNICODE_PROPERTIES */'
puts(<<'__HEREDOC')

static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC
# The CodeRanges entries are listed in the order POSIX names, props, ages,
# blocks. The indices printed in the keyword list further down count through
# the same sequence, so the two orders must stay in sync.
POSIX_NAMES.each{|name|puts"  CR_#{name},"}
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each{|name| puts"  CR_#{name},"}
ages.each{|name|  puts"  CR_#{constantize_agename(name)},"}
blocks.each{|name|puts"  CR_#{name},"}

puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
};
struct uniname2ctype_struct {
  int name, ctype;
};

static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__HEREDOC
# Keyword list: one "name, index" line per lookup name. i is the running
# position in CodeRanges; name_to_index remembers which normalized names have
# been printed so that aliases can be resolved and duplicates skipped.
i = -1
name_to_index = {}
POSIX_NAMES.each do |name|
  # NEWLINE still occupies a slot in CodeRanges, so i is advanced before the
  # name is skipped; it just gets no keyword entry.
  i += 1
  next if name == 'NEWLINE'
  name = normalize_propname(name)
  name_to_index[name] = i
  puts"%-40s %3d" % [name + ',', i]
end
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each do |name|
  i += 1
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
# Aliases get no slot of their own: each is printed with the index of the name
# it stands for. Aliases whose name was already printed, or whose target has
# no index, are skipped.
aliases.each_pair do |k, v|
  next if name_to_index[k]
  next unless v = name_to_index[v]
  puts "%-40s %3d" % [k + ',', v]
end
# Ages are looked up under "age=<version>".
ages.each do |name|
  i += 1
  name = "age=#{name}"
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
blocks.each do |name|
  i += 1
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
# Third part: the C wrapper that looks a name up via uniname2ctype_p and
# returns its ctype, or -1 when the name is not found.
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
  const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
  if (p) return p->ctype;
  return -1;
}
__HEREDOC
366