1require 'optparse'
2require 'erb'
3require 'fileutils'
4require 'pp'
5
class Array
  # Backfill for interpreters whose Array does not provide #product yet.
  unless [].respond_to? :product
    # Returns the cartesian product of the receiver and every array in
    # +args+, as an array of combinations (each combination is an array).
    # With no arguments every element is simply wrapped in a one-element
    # array.
    def product(*args)
      return map {|elem| [elem] } if args.empty?

      # Combinations of the remaining arrays; each one is prefixed below.
      tails = args.first.product(*args[1..-1])
      combos = []
      each {|head|
        tails.each {|tail| combos << [head, *tail] }
      }
      combos
    end
  end
end
21
class String
  # Backfill for interpreters whose String does not provide #start_with? yet.
  unless "".respond_to? :start_with?
    # Returns true when the receiver begins with at least one of +prefixes+,
    # false otherwise.
    def start_with?(*prefixes)
      prefixes.any? {|prefix|
        prefix.length <= length && self[0, prefix.length] == prefix
      }
    end
  end
end
32
# Number of word_array elements reserved for one lookup node
# (generate_lookup_node emits the offsets reference and the infos reference).
NUM_ELEM_BYTELOOKUP = 2

# Maps a single raw byte (as a one-character string) to its escaped spelling
# inside a C string literal.  The three entries below take precedence over
# the generic octal escapes added afterwards.
C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}

# Control characters become three-digit octal escapes unless a nicer escape
# was registered above (hence ||=); 0x7f..0xff are always octal-escaped.
0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
# Regexp matching any single byte that has an entry in C_ESC.
C_ESC_PAT = Regexp.union(*C_ESC.keys)
44
# Renders +str+ as a double-quoted C string literal, replacing every byte
# listed in C_ESC with its escaped form.
def c_esc(str)
  escaped = str.gsub(C_ESC_PAT) {|byte| C_ESC[byte] }
  "\"#{escaped}\""
end
48
# Matches exactly two hexadecimal digits, i.e. the textual form of one byte.
HEX2 = /(?:[0-9A-Fa-f]{2})/
50
# Accumulates the initializer of one C array ("static const TYPE NAME[LEN]")
# while keeping track of how many elements have been appended so far.
class ArrayCode
  # type - C element type of the array (e.g. "unsigned char").
  # name - C identifier of the array.
  def initialize(type, name)
    @type = type
    @name = name
    @len = 0
    @content = ''
  end

  # Number of array elements emitted so far.
  def length
    @len
  end

  # Appends +str+ (C source text) to the initializer and records that it
  # contributes +num+ elements.  Returns the new element count.
  def insert_at_last(num, str)
    @content << str
    @len += num
  end

  # Returns the complete C definition of the array.
  def to_s
    <<"End"
static const #{@type}
#{@name}[#{@len}] = {
#{@content}};
End
  end
end
77
# Leaf of a lookup tree: wraps the action value selected for a byte sequence.
# Two Actions are equal (and hash alike) when they are of the same class and
# wrap equal values, so they can be used as Hash keys.
class Action
  attr_reader :value

  def initialize(value)
    @value = value
  end

  def hash
    @value.hash
  end

  def eql?(other)
    return false unless self.class == other.class
    @value == other.value
  end
  alias == eql?
end
94
# Inner edge of a lookup tree: the byte range byte_min..byte_max leads to
# child_tree.  The hash value is computed once at construction time and is
# also used as a cheap first check in eql?.
class Branch
  attr_reader :byte_min, :byte_max, :child_tree, :hash

  def initialize(byte_min, byte_max, child_tree)
    @byte_min, @byte_max, @child_tree = byte_min, byte_max, child_tree
    @hash = byte_min.hash ^ byte_max.hash ^ child_tree.hash
  end

  def eql?(other)
    return false unless self.class == other.class
    return false unless @hash == other.hash
    @byte_min == other.byte_min &&
      @byte_max == other.byte_max &&
      @child_tree == other.child_tree
  end
  alias == eql?
end
113
114class ActionMap
115  def self.parse_to_rects(mapping)
116    rects = []
117    n = 0
118    mapping.each {|pat, action|
119      pat = pat.to_s
120      if /\A\s*\(empset\)\s*\z/ =~ pat
121        next
122      elsif /\A\s*\(empstr\)\s*\z/ =~ pat
123        rects << ['', '', action]
124        n += 1
125      elsif /\A\s*(#{HEX2}+)\s*\z/o =~ pat
126        hex = $1.upcase
127        rects << [hex, hex, action]
128      elsif /\A\s*((#{HEX2}|\{#{HEX2}(?:-#{HEX2})?(,#{HEX2}(?:-#{HEX2})?)*\})+(\s+|\z))*\z/o =~ pat
129        pat = pat.upcase
130        pat.scan(/\S+/) {
131          pat1 = $&
132          ranges_list = []
133          pat1.scan(/#{HEX2}|\{([^\}]*)\}/o) {
134            ranges_list << []
135            if !$1
136              ranges_list.last << [$&,$&]
137            else
138              set = {}
139              $1.scan(/(#{HEX2})(?:-(#{HEX2}))?/o) {
140                if !$2
141                  c = $1.to_i(16)
142                  set[c] = true
143                else
144                  b = $1.to_i(16)
145                  e = $2.to_i(16)
146                  b.upto(e) {|c| set[c] = true }
147                end
148              }
149              i = nil
150              0.upto(256) {|j|
151                if set[j]
152                  if !i
153                    i = j
154                  end
155                  if !set[j+1]
156                    ranges_list.last << ["%02X" % i, "%02X" % j]
157                    i = nil
158                  end
159                end
160              }
161            end
162          }
163          first_ranges = ranges_list.shift
164          first_ranges.product(*ranges_list).each {|range_list|
165            min = range_list.map {|x, y| x }.join
166            max = range_list.map {|x, y| y }.join
167            rects << [min, max, action]
168          }
169        }
170      else
171        raise ArgumentError, "invalid pattern: #{pat.inspect}"
172      end
173    }
174    rects
175  end
176
177  def self.unambiguous_action(actions0)
178    actions = actions0.uniq
179    if actions.length == 1
180      actions[0]
181    else
182      actions.delete(:nomap0)
183      if actions.length == 1
184        actions[0]
185      else
186        raise ArgumentError, "ambiguous actions: #{actions0.inspect}"
187      end
188    end
189  end
190
191  def self.build_tree(rects)
192    expand(rects) {|prefix, actions|
193      unambiguous_action(actions)
194    }
195  end
196
197  def self.parse(mapping)
198    rects = parse_to_rects(mapping)
199    tree = build_tree(rects)
200    self.new(tree)
201  end
202
203  def self.merge_rects(*rects_list)
204    if rects_list.length < 2
205      raise ArgumentError, "not enough arguments"
206    end
207
208    all_rects = []
209    rects_list.each_with_index {|rects, i|
210      all_rects.concat rects.map {|min, max, action| [min, max, [i, action]] }
211    }
212
213    tree = expand(all_rects) {|prefix, actions|
214      args = Array.new(rects_list.length) { [] }
215      actions.each {|i, action|
216        args[i] << action
217      }
218      yield(prefix, *args)
219    }
220
221    self.new(tree)
222  end
223
224  def self.merge(*mappings, &block)
225    merge_rects(*mappings.map {|m| parse_to_rects(m) }, &block)
226  end
227
228  def self.merge2(map1, map2, &block)
229    rects1 = parse_to_rects(map1)
230    rects2 = parse_to_rects(map2)
231
232    actions = []
233    all_rects = []
234
235    rects1.each {|rect|
236      min, max, action = rect
237      rect[2] = actions.length
238      actions << action
239      all_rects << rect
240    }
241
242    boundary = actions.length
243
244    rects2.each {|rect|
245      min, max, action = rect
246      rect[2] = actions.length
247      actions << action
248      all_rects << rect
249    }
250
251    tree = expand(all_rects) {|prefix, as0|
252      as1 = []
253      as2 = []
254      as0.each {|i|
255        if i < boundary
256          as1 << actions[i]
257        else
258          as2 << actions[i]
259        end
260      }
261      yield(prefix, as1, as2)
262    }
263
264    self.new(tree)
265  end
266
267  def self.expand(rects, &block)
268    #numsing = numreg = 0
269    #rects.each {|min, max, action| if min == max then numsing += 1 else numreg += 1 end }
270    #puts "#{numsing} singleton mappings and #{numreg} region mappings."
271    singleton_rects = []
272    region_rects = []
273    rects.each {|rect|
274      min, max, action = rect
275      if min == max
276        singleton_rects << rect
277      else
278        region_rects << rect
279      end
280    }
281    @singleton_rects = singleton_rects.sort_by {|min, max, action| min }
282    @singleton_rects.reverse!
283    ret = expand_rec("", region_rects, &block)
284    @singleton_rects = nil
285    ret
286  end
287
  # Scratch Hash shared by expand_rec and each_firstbyte_range; each user
  # clears it again once it is done with it.
  TMPHASH = {}
  # Recursively builds the lookup tree for all byte sequences that start
  # with +prefix+.
  #
  # prefix       - hex text of the bytes consumed so far; a byte range shows
  #                up as "{XX-YY}".
  # region_rects - [min_rest, max_rest, action] rects that cover this prefix,
  #                with the prefix bytes already stripped.
  # block        - called as block.call(prefix, actions) to choose the action
  #                of a leaf.
  #
  # Pending singleton rects are taken from @singleton_rects (set up by
  # expand) and popped as they are consumed.
  #
  # Returns the (empty) region_rects array when nothing matches this prefix,
  # an Action for a leaf, or an Array of Branch otherwise.
  # Raises ArgumentError when one pattern ends at this prefix while another
  # one continues past it.
  def self.expand_rec(prefix, region_rects, &block)
    # Nothing left here: no region rects, and the next pending singleton
    # (if any) does not start with this prefix.
    return region_rects if region_rects.empty? && !((s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix))
    # Leaf test: with only singletons left, the next singleton must be exactly
    # as long as the prefix; otherwise the first region rect must have no
    # bytes remaining.
    if region_rects.empty? ? s_rect[0].length == prefix.length : region_rects[0][0].empty?
      h = TMPHASH
      # Collect (and consume) every singleton that starts with this prefix.
      while (s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix)
        min, max, action = @singleton_rects.pop
        raise ArgumentError, "ambiguous pattern: #{prefix}" if min.length != prefix.length
        h[action] = true
      end
      region_rects.each {|min, max, action|
        raise ArgumentError, "ambiguous pattern: #{prefix}" if !min.empty?
        h[action] = true
      }
      # The hash keys are the distinct actions found for this leaf.
      tree = Action.new(block.call(prefix, h.keys))
      h.clear
    else
      tree = []
      each_firstbyte_range(prefix, region_rects) {|byte_min, byte_max, r_rects2|
        if byte_min == byte_max
          prefix2 = prefix + "%02X" % byte_min
        else
          prefix2 = prefix + "{%02X-%02X}" % [byte_min, byte_max]
        end
        child_tree = expand_rec(prefix2, r_rects2, &block)
        tree << Branch.new(byte_min, byte_max, child_tree)
      }
    end
    return tree
  end
318
  # Splits the next byte position into ranges and yields each of them as
  # (byte_min, byte_max, rects), in ascending byte order, where +rects+ are
  # the region rects covering that range with their first byte stripped
  # ([min_rest, max_rest, action]).
  #
  # Pending singletons from @singleton_rects that start with +prefix+ are
  # interleaved as one-byte ranges.  This method only peeks at
  # @singleton_rects and never pops it: the loops below advance only because
  # the block is expected to consume the matching singletons (expand_rec does
  # so through its recursion).
  #
  # Raises ArgumentError when a region rect has no bytes left.
  def self.each_firstbyte_range(prefix, region_rects)
    index_from = TMPHASH

    # Strip the first byte off every rect and record the boundaries at which
    # the set of covering rects can change: the first byte of min, and one
    # past the first byte of max.
    region_ary = []
    region_rects.each {|min, max, action|
      raise ArgumentError, "ambiguous pattern: #{prefix}" if min.empty?
      min_firstbyte = min[0,2].to_i(16)
      min_rest = min[2..-1]
      max_firstbyte = max[0,2].to_i(16)
      max_rest = max[2..-1]
      region_ary << [min_firstbyte, max_firstbyte, [min_rest, max_rest, action]]
      index_from[min_firstbyte] = true
      index_from[max_firstbyte+1] = true
    }

    # Number the boundaries in descending byte order so that popping from the
    # end of the arrays below visits them in ascending order.
    byte_from = Array.new(index_from.size)
    bytes = index_from.keys
    bytes.sort!
    bytes.reverse!
    bytes.each_with_index {|byte, i|
      index_from[byte] = i
      byte_from[i] = byte
    }

    # region_rects_ary[i] lists the stripped rects covering the interval that
    # starts at boundary byte_from[i].
    region_rects_ary = Array.new(index_from.size) { [] }
    region_ary.each {|min_firstbyte, max_firstbyte, rest_elt|
      index_from[min_firstbyte].downto(index_from[max_firstbyte+1]+1) {|i|
        region_rects_ary[i] << rest_elt
      }
    }

    index_from.clear

    r_rects = region_rects_ary.pop
    region_byte = byte_from.pop
    prev_r_start = region_byte
    prev_r_rects = []
    # Merge region boundaries with the pending singletons under this prefix.
    while r_rects && (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      min_byte = singleton_byte < region_byte ? singleton_byte : region_byte
      # Flush the part of the current interval that lies below the next event.
      if prev_r_start < min_byte && !prev_r_rects.empty?
        yield prev_r_start, min_byte-1, prev_r_rects
      end
      if region_byte < singleton_byte
        # Next event is a region boundary: move on to the next interval.
        prev_r_start = region_byte
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
      elsif region_byte > singleton_byte
        # Singleton inside the current interval: yield it on its own together
        # with the rects covering that interval.
        yield singleton_byte, singleton_byte, prev_r_rects
        prev_r_start = singleton_byte+1
      else # region_byte == singleton_byte
        prev_r_start = region_byte+1
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
        yield singleton_byte, singleton_byte, prev_r_rects
      end
    end

    # No singleton left under this prefix: flush the remaining intervals.
    while r_rects
      if prev_r_start < region_byte && !prev_r_rects.empty?
        yield prev_r_start, region_byte-1, prev_r_rects
      end
      prev_r_start = region_byte
      prev_r_rects = r_rects
      r_rects = region_rects_ary.pop
      region_byte = byte_from.pop
    end

    # Region boundaries exhausted: the remaining singletons under this prefix
    # are yielded with no covering rects.
    while (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      yield singleton_byte, singleton_byte, []
    end
  end
394
  # tree - lookup tree as produced by expand: an Action, or an Array of
  #        Branch objects.
  def initialize(tree)
    @tree = tree
  end
398
399  def inspect
400    "\#<#{self.class}:" +
401    @tree.inspect +
402    ">"
403  end
404
405  def max_input_length_rec(tree)
406    case tree
407    when Action
408      0
409    else
410      tree.map {|branch|
411        max_input_length_rec(branch.child_tree)
412      }.max + 1
413    end
414  end
415
  # Length in bytes of the longest input sequence this map's tree accepts
  # (the depth of the tree, see max_input_length_rec).
  def max_input_length
    max_input_length_rec(@tree)
  end
419
420  def empty_action
421    if @tree.kind_of? Action
422      @tree.value
423    else
424      nil
425    end
426  end
427
  # Memo tables used by generate_lookup_node so that identical offset tables
  # and identical info tables are emitted only once; the values are the C
  # names of the tables already generated.
  OffsetsMemo = {}
  InfosMemo = {}
430
431  def format_offsets(min, max, offsets)
432    offsets = offsets[min..max]
433    code = "%d, %d,\n" % [min, max]
434    0.step(offsets.length-1,16) {|i|
435      code << "    "
436      code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
437      if i+8 < offsets.length
438        code << "  "
439        code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
440      end
441      code << "\n"
442    }
443    code
444  end
445
  # C identifiers already handed out by str_name (name => true).
  UsedName = {}

  # Hex byte string => C identifier of the string constant generated for it.
  StrMemo = {}
449
450  def str_name(bytes)
451    size = @bytes_code.length
452    rawbytes = [bytes].pack("H*")
453
454    n = nil
455    if !n && !(suf = rawbytes.gsub(/[^A-Za-z0-9_]/, '')).empty? && !UsedName[nn = "str1_" + suf] then n = nn end
456    if !n && !UsedName[nn = "str1_" + bytes] then n = nn end
457    n ||= "str1s_#{size}"
458
459    StrMemo[bytes] = n
460    UsedName[n] = true
461    n
462  end
463
464  def gen_str(bytes)
465    if n = StrMemo[bytes]
466      n
467    else
468      len = bytes.length/2
469      size = @bytes_code.length
470      n = str_name(bytes)
471      @bytes_code.insert_at_last(1 + len,
472        "\#define #{n} makeSTR1(#{size})\n" +
473        "    makeSTR1LEN(#{len})," + bytes.gsub(/../, ' 0x\&,') + "\n\n")
474      n
475    end
476  end
477
  # Translates one action into the C expression stored in the word array.
  #
  # info - a Symbol naming a special action, or a String: hex text of the
  #        output bytes, "funsio(N)", or a "/*BYTE_LOOKUP*/"-prefixed name of
  #        another lookup node.
  #
  # The order of the String patterns matters: several of them can match the
  # same text and the first matching branch wins.
  #
  # Raises RuntimeError for anything else.
  def generate_info(info)
    case info
    when :nomap, :nomap0
      # :nomap0 is low priority.  it never collides.
      "NOMAP"
    when :undef
      "UNDEF"
    when :invalid
      "INVALID"
    when :func_ii
      "FUNii"
    when :func_si
      "FUNsi"
    when :func_io
      "FUNio"
    when :func_so
      "FUNso"
    when /\A(#{HEX2})\z/o
      # One, two or three output bytes are packed with o1/o2/o3.
      "o1(0x#$1)"
    when /\A(#{HEX2})(#{HEX2})\z/o
      "o2(0x#$1,0x#$2)"
    when /\A(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o3(0x#$1,0x#$2,0x#$3)"
    when /funsio\((\d+)\)/
      "funsio(#{$1})"
    when /\A(#{HEX2})(3[0-9])(#{HEX2})(3[0-9])\z/o
      # Four bytes whose second and fourth byte are in 30..39.
      "g4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(f[0-7])(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      # Four bytes starting with f0..f7 (lower-case "f" only).
      "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(#{HEX2}){4,259}\z/o
      # Any other sequence of 4..259 bytes goes through a string constant.
      gen_str(info.upcase)
    when /\A\/\*BYTE_LOOKUP\*\// # pointer to BYTE_LOOKUP structure
      # Everything after the marker is the name of the referenced node.
      $'.to_s
    else
      raise "unexpected action: #{info.inspect}"
    end
  end
515
516  def format_infos(infos)
517    infos = infos.map {|info| generate_info(info) }
518    maxlen = infos.map {|info| info.length }.max
519    columns = maxlen <= 16 ? 4 : 2
520    code = ""
521    0.step(infos.length-1, columns) {|i|
522      code << "    "
523      is = infos[i,columns]
524      is.each {|info|
525        code << sprintf(" %#{maxlen}s,", info)
526      }
527      code << "\n"
528    }
529    code
530  end
531
  # Emits the C tables for one lookup node.
  #
  # name  - C identifier to define for the node.
  # table - Array indexed by byte value holding the action for that byte
  #         (nil is treated as :invalid).
  #
  # Three pieces are written: an offsets table into the byte array (one index
  # per byte in min..max, preceded by min and max), an infos table into the
  # word array (the distinct actions), and the node itself, which refers to
  # both.  Offsets and infos tables identical to ones generated earlier are
  # reused through OffsetsMemo / InfosMemo.
  def generate_lookup_node(name, table)
    bytes_code = @bytes_code
    words_code = @words_code
    offsets = []
    infos = []
    infomap = {}
    min = max = nil
    table.each_with_index {|action, byte|
      action ||= :invalid
      # min..max is the span of bytes that have a non-:invalid action.
      if action != :invalid
        min = byte if !min
        max = byte
      end
      # Give every distinct action one slot in infos; offsets[byte] is the
      # slot used by this byte.
      unless o = infomap[action]
        infomap[action] = o = infos.length
        infos[o] = action
      end
      offsets[byte] = o
    }
    infomap.clear
    if !min
      # Every byte is :invalid.
      min = max = 0
    end

    offsets_key = [min, max, offsets[min..max]]
    if n = OffsetsMemo[offsets_key]
      offsets_name = n
    else
      offsets_name = "#{name}_offsets"
      OffsetsMemo[offsets_key] = offsets_name
      size = bytes_code.length
      # Two elements for min and max plus one per byte in min..max.
      bytes_code.insert_at_last(2+max-min+1,
        "\#define #{offsets_name} #{size}\n" +
        format_offsets(min,max,offsets) + "\n")
    end

    if n = InfosMemo[infos]
      infos_name = n
    else
      infos_name = "#{name}_infos"
      InfosMemo[infos] = infos_name

      size = words_code.length
      words_code.insert_at_last(infos.length,
        "\#define #{infos_name} WORDINDEX2INFO(#{size})\n" +
        format_infos(infos) + "\n")
    end

    # The node itself: the offsets reference followed by the infos reference.
    size = words_code.length
    words_code.insert_at_last(NUM_ELEM_BYTELOOKUP,
      "\#define #{name} WORDINDEX2INFO(#{size})\n" +
      <<"End" + "\n")
    #{offsets_name},
    #{infos_name},
End
  end
588
  # Tree => C name of the node already generated for it (see generate_node).
  PreMemo = {}
  # Suffix for the next automatically chosen node name ("fun_a", "fun_b", ...);
  # the string is advanced in place with succ!.
  NextName = "a"
591
592  def generate_node(name_hint=nil)
593    if n = PreMemo[@tree]
594      return n
595    end
596
597    table = Array.new(0x100, :invalid)
598    @tree.each {|branch|
599      byte_min, byte_max, child_tree = branch.byte_min, branch.byte_max, branch.child_tree
600      rest = ActionMap.new(child_tree)
601      if a = rest.empty_action
602        table.fill(a, byte_min..byte_max)
603      else
604        name_hint2 = nil
605        if name_hint
606          name_hint2 = "#{name_hint}_#{byte_min == byte_max ? '%02X' % byte_min : '%02Xto%02X' % [byte_min, byte_max]}"
607        end
608        v = "/*BYTE_LOOKUP*/" + rest.gennode(@bytes_code, @words_code, name_hint2)
609        table.fill(v, byte_min..byte_max)
610      end
611    }
612
613    if !name_hint
614      name_hint = "fun_" + NextName
615      NextName.succ!
616    end
617
618    PreMemo[@tree] = name_hint
619
620    generate_lookup_node(name_hint, table)
621    name_hint
622  end
623
624  def gennode(bytes_code, words_code, name_hint=nil)
625    @bytes_code = bytes_code
626    @words_code = words_code
627    name = generate_node(name_hint)
628    @bytes_code = nil
629    @words_code = nil
630    return name
631  end
632end
633
# Converts a Citrus (csid, index) pair into the corresponding "mskanji"
# byte sequence, returned as lower-case hex text.
#
# csid 0 passes the index through, csid 1 adds 0x80, and csid 2 and 3 treat
# the index as a row/column pair (high byte / low byte) that is folded into
# a lead byte and a trail byte.
#
# Raises RuntimeError ("invalid byte sequence") when the row or column is
# outside the accepted ranges.
def citrus_mskanji_cstomb(csid, index)
  code =
    case csid
    when 0
      index
    when 1
      index + 0x80
    when 2, 3
      row = index >> 8
      raise "invalid byte sequence" if row < 0x21
      # Lead-byte base depends on the character set and the row.
      offset =
        if csid == 3
          if row <= 0x2F
            (row == 0x22 || row >= 0x26) ? 0xED : 0xF0
          elsif row >= 0x4D && row <= 0x7E
            0xCE
          else
            raise "invalid byte sequence"
          end
        else
          raise "invalid byte sequence" if row > 0x97
          (row < 0x5F) ? 0x81 : 0xC1
        end
      col = index & 0xFF
      raise "invalid byte sequence" if (col < 0x21 || col > 0x7E)

      row_index = row - 0x21
      col_index = col - 0x21
      if (row_index & 1) == 0
        trail = col_index + 0x40
        # 0x7F is skipped as a trail byte.
        trail += 1 if trail >= 0x7F
      else
        trail = col_index + 0x9F
      end
      lead = row_index / 2 + offset
      (lead << 8) | trail
    end
  code.to_s(16)
end
670
# Converts a Citrus (csid, index) pair into the corresponding "euc" byte
# sequence, returned as lower-case hex text.  The csid selects which bits
# are OR-ed onto the index.
def citrus_euc_cstomb(csid, index)
  code =
    case csid
    when 0x0000 then index
    when 0x8080 then index | 0x8080
    when 0x0080 then index | 0x8E80
    when 0x8000 then index | 0x8F8080
    end
  code.to_s(16)
end
683
# Converts a Citrus (csid, index) pair into the "stateless_iso" byte
# sequence as lower-case hex text: the index with 0x8080 OR-ed in, combined
# with the csid shifted left by 16 bits.
def citrus_stateless_iso_cstomb(csid, index)
  code = index | 0x8080 | (csid << 16)
  code.to_s(16)
end
687
# Dispatches to the converter for the character encoding scheme +ces+
# ('mskanji', 'euc' or 'stateless_iso').  Returns nil for any other scheme.
def citrus_cstomb(ces, csid, index)
  converter = {
    'mskanji'       => :citrus_mskanji_cstomb,
    'euc'           => :citrus_euc_cstomb,
    'stateless_iso' => :citrus_stateless_iso_cstomb,
  }[ces]
  converter && send(converter, csid, index)
end
698
# Subdirectories of $srcdir searched by citrus_decode_mapsrc; the first entry
# that is a prefix of the character set name is used.
SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC EMOJI GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/
700
701
# Loads one or more Citrus mapping sources and returns the combined table.
#
# ces     - encoding scheme name passed on to citrus_cstomb.
# csid    - character set id passed on to citrus_cstomb.
# mapsrcs - comma-separated mapping source names.  A name starting with
#           "UCS" (optionally "UCS@PLANE") maps from Unicode to the character
#           set; any other name maps from the character set to Unicode.
#
# Each source is read from
#   $srcdir/<SUBDIR entry>/<name with ":" replaced by "@">.src
# with the last "/" of that path replaced by "%".  Only the lines between
# BEGIN_MAP and END_MAP are used; blank lines and "#" comments are skipped.
#
# Returns an Array of [ucs, bytes_hex] pairs (from-UCS sources) or
# [bytes_hex, ucs] pairs (to-UCS sources), where ucs is an Integer with the
# plane number in bits 16 and up.
#
# Raises RuntimeError for a mapping line that cannot be parsed.
def citrus_decode_mapsrc(ces, csid, mapsrcs)
  table = []
  mapsrcs.split(',').each do |mapsrc|
    path = [$srcdir]
    mode = nil
    # rindex with position 0 only succeeds when the name starts with "UCS".
    if mapsrc.rindex(/UCS(?:@[A-Z]+)?/, 0)
      mode = :from_ucs
      from = mapsrc[$&.size+1..-1]
      path << SUBDIR.find {|x| from.rindex(x, 0) }
    else
      mode = :to_ucs
      path << SUBDIR.find {|x| mapsrc.rindex(x, 0) }
    end
    if /\bUCS@(BMP|SMP|SIP|TIP|SSP)\b/ =~ mapsrc
      plane = {"BMP"=>0, "SMP"=>1, "SIP"=>2, "TIP"=>3, "SSP"=>14}[$1]
    else
      plane = 0
    end
    plane <<= 16
    path << mapsrc.gsub(':', '@')
    path = File.join(*path)
    path << ".src"
    path[path.rindex('/')] = '%'
    STDERR.puts 'load mapsrc %s' % path if VERBOSE_MODE
    # File.open rather than Kernel#open: the path is always a plain file and
    # must never be interpreted as a command.
    File.open(path) do |f|
      # Skip the header up to and including the BEGIN_MAP line.
      f.each_line do |l|
        break if /^BEGIN_MAP/ =~ l
      end
      f.each_line do |l|
        next if /^\s*(?:#|$)/ =~ l
        break if /^END_MAP/ =~ l
        case mode
        when :from_ucs
          case l
          when /0x(\w+)\s*-\s*0x(\w+)\s*=\s*INVALID/
            # Citrus OOB_MODE
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [plane | $1.hex, citrus_cstomb(ces, csid, $2.hex)]
          else
            raise "unknown notation '%s'"% l
          end
        when :to_ucs
          case l
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [citrus_cstomb(ces, csid, $1.hex), plane | $2.hex]
          else
            raise "unknown notation '%s'"% l
          end
        end
      end
    end
  end
  return table
end
756
# Reads the UCM file $srcdir/ucm/<path> and returns [to_ucs, from_ucs].
#
# Each mapping line has the form "<UXXXX> \xHH... |F", or several code
# points joined with "+" ("<UXXXX>+<UYYYY> \xHH... |F").  F is the fallback
# indicator: 0 contributes to both directions, 1 only to from_ucs, 3 only to
# to_ucs.  Lines of any other shape are ignored.
#
# to_ucs   - Array of [bytes_hex, ucs] pairs.
# from_ucs - Array of [ucs, bytes_hex] pairs.
#
# ucs is an Integer for a single code point and the hex text of the UTF-8
# encoding for a sequence of code points.  Single code points below 128
# that map to the same byte value are skipped.
def import_ucm(path)
  to_ucs = []
  from_ucs = []
  File.foreach(File.join($srcdir, "ucm", path)) do |line|
    uc, bs, fb = nil
    if /^<U([0-9a-fA-F]+)>\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      uc = $1.hex
      bs = $2.delete('x\\')
      fb = $3.to_i
      next if uc < 128 && uc == bs.hex
    elsif /^([<U0-9a-fA-F>+]+)\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      # Save all captures first: String#scan below replaces the last-match
      # data, after which $2 and $3 would no longer refer to this line.
      codepoints, bytes, fallback = $1, $2, $3
      uc = codepoints.scan(/[0-9a-fA-F]+>/).map(&:hex).pack("U*").unpack("H*")[0]
      bs = bytes.delete('x\\')
      fb = fallback.to_i
    end
    to_ucs << [bs, uc] if fb == 0 || fb == 3
    from_ucs << [uc, bs] if fb == 0 || fb == 1
  end
  [to_ucs, from_ucs]
end
777
# Returns +map+ as an Array of [key, value] pairs in which every Integer
# (a Unicode code point) has been replaced by the upper-case hex text of its
# UTF-8 encoding.  Anything that is not an Integer is left untouched.
def encode_utf8(map)
  as_hex = lambda {|item|
    # integer means UTF-8 encoded sequence.
    Integer === item ? [item].pack("U").unpack("H*")[0].upcase : item
  }
  map.map {|k, v| [as_hex.call(k), as_hex.call(v)] }
end
788
# Sentinel default for the valid_encoding parameters below; when it is seen,
# transcode_compile_tree looks the pattern up in ValidEncoding by the source
# encoding name.  A caller can still pass nil or false explicitly to skip
# validation.
UnspecifiedValidEncoding = Object.new
790
# Compiles +map+ into lookup tables and returns [node_name, max_input].
#
# name           - preferred C name of the top-level node.
# from           - source encoding name, used to look up the valid byte
#                  pattern when valid_encoding is UnspecifiedValidEncoding.
# map            - source => destination pairs; Integers stand for Unicode
#                  code points.  When a source occurs more than once, the
#                  first pair wins.
# valid_encoding - byte pattern describing every valid source sequence, or
#                  nil/false to compile the map as it is.  Valid sequences
#                  without a mapping get the action :undef.
#
# Raises RuntimeError ("invalid mapping: ...") when a mapped sequence is not
# covered by the valid byte pattern.
def transcode_compile_tree(name, from, map, valid_encoding)
  first_mappings = {}
  encode_utf8(map).each {|src, dst|
    first_mappings[src] ||= dst # use first mapping
  }

  if valid_encoding.equal? UnspecifiedValidEncoding
    valid_encoding = ValidEncoding.fetch(from)
  end

  am =
    if valid_encoding
      ActionMap.merge2(first_mappings, {valid_encoding => :undef}) {|prefix, as1, as2|
        mapped = as1.empty? ? nil : ActionMap.unambiguous_action(as1)
        valid = as2.empty? ? nil : ActionMap.unambiguous_action(as2)
        raise "invalid mapping: #{prefix}" unless valid
        mapped || valid
      }
    else
      ActionMap.parse(first_mappings)
    end
  first_mappings.clear

  max_input = am.max_input_length
  defined_name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name)
  [defined_name, max_input]
end
818
# C names of all transcoders defined so far (filled by transcode_tblgen, read
# by transcode_register_code).
TRANSCODERS = []
# Accumulated C source of the rb_transcoder definitions.
TRANSCODE_GENERATED_TRANSCODER_CODE = ''
821
# Generates only the lookup tables for a conversion, without defining a
# transcoder.
#
# The tree name is derived from the encoding names with every character
# other than ASCII letters and digits replaced by "_": "to_X" when
# converting from UTF-8, "from_X" when converting to UTF-8, and
# "from_X_to_Y" otherwise.
#
# Returns [map, tree_name, real_tree_name, max_input], where real_tree_name
# is the node name reported by transcode_compile_tree.
def transcode_tbl_only(from, to, map, valid_encoding=UnspecifiedValidEncoding)
  if VERBOSE_MODE
    progress =
      if from.empty? || to.empty?
        "converter for #{from.empty? ? to : from}"
      else
        "converter from #{from} to #{to}"
      end
    STDERR.puts progress
  end

  id_from = from.tr('^0-9A-Za-z', '_')
  id_to = to.tr('^0-9A-Za-z', '_')
  tree_name =
    if from == "UTF-8"
      "to_#{id_to}"
    elsif to == "UTF-8"
      "from_#{id_from}"
    else
      "from_#{id_from}_to_#{id_to}"
    end

  real_tree_name, max_input = transcode_compile_tree(tree_name, from, map, valid_encoding)
  [map, tree_name, real_tree_name, max_input]
end
842
# Generates the lookup tables for a conversion and appends the matching
# rb_transcoder definition to TRANSCODE_GENERATED_TRANSCODER_CODE.
#
# The transcoder is named "rb_<tree name>" and recorded in TRANSCODERS.
# max_output is half the length of the longest String destination in the
# map (hex text, two digits per byte); every other destination counts as 1.
#
# Returns an empty string.
def transcode_tblgen(from, to, map, valid_encoding=UnspecifiedValidEncoding)
  map, tree_name, real_tree_name, max_input = transcode_tbl_only(from, to, map, valid_encoding)
  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name
  input_unit_length = UnitLength[from]
  output_lengths = map.map {|_src, dst| String === dst ? dst.length/2 : 1 }
  max_output = output_lengths.max
  TRANSCODE_GENERATED_TRANSCODER_CODE << <<"End"
static const rb_transcoder
#{transcoder_name} = {
    #{c_esc from}, #{c_esc to}, #{real_tree_name},
    TRANSCODE_TABLE_INFO,
    #{input_unit_length}, /* input_unit_length */
    #{max_input}, /* max_input */
    #{max_output}, /* max_output */
    asciicompat_converter, /* asciicompat_type */
    0, NULL, NULL, /* state_size, state_init, state_fini */
    NULL, NULL, NULL, NULL,
    NULL, NULL, NULL
};
End
  ''
end
866
# Generates the lookup tables for the ActionMap +am+ into the shared code
# buffers, using +name_hint+ as the preferred node name.
#
# Returns an empty string; the name of the generated node is not reported
# back to the caller.
def transcode_generate_node(am, name_hint=nil)
  STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
  am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name_hint)
  ''
end
872
# Returns all C source generated so far: the byte array, the word array,
# the TRANSCODE_TABLE_INFO macro that describes both arrays, and the
# transcoder definitions.
def transcode_generated_code
  bytes = TRANSCODE_GENERATED_BYTES_CODE
  words = TRANSCODE_GENERATED_WORDS_CODE
  table_info = "\#define TRANSCODE_TABLE_INFO " +
    "#{OUTPUT_PREFIX}byte_array, #{bytes.length}, " +
    "#{OUTPUT_PREFIX}word_array, #{words.length}, " +
    "((int)sizeof(unsigned int))\n"
  [bytes.to_s, words.to_s, table_info, TRANSCODE_GENERATED_TRANSCODER_CODE].join
end
882
# Returns the C statements that register every transcoder recorded in
# TRANSCODERS, one rb_register_transcoder call per line.
def transcode_register_code
  TRANSCODERS.map {|transcoder_name|
    "    rb_register_transcoder(&#{transcoder_name});\n"
  }.join
end
890
# input_unit_length emitted by transcode_tblgen for a source encoding; any
# encoding not listed here gets the default of 1.
UnitLength = {
  'UTF-16BE'    => 2,
  'UTF-16LE'    => 2,
  'UTF-32BE'    => 4,
  'UTF-32LE'    => 4,
}
UnitLength.default = 1
898
# Valid byte patterns by encoding name (or by a generic label such as
# '1byte').  Each pattern uses the notation understood by
# ActionMap.parse_to_rects: whitespace-separated alternatives made of hex
# bytes and byte sets such as {81-9f,e0-fc}.  More entries can be added with
# set_valid_byte_pattern.
ValidEncoding = {
  '1byte'       => '{00-ff}',
  '2byte'       => '{00-ff}{00-ff}',
  '4byte'       => '{00-ff}{00-ff}{00-ff}{00-ff}',
  'US-ASCII'    => '{00-7f}',
  'UTF-8'       => '{00-7f}
                    {c2-df}{80-bf}
                         e0{a0-bf}{80-bf}
                    {e1-ec}{80-bf}{80-bf}
                         ed{80-9f}{80-bf}
                    {ee-ef}{80-bf}{80-bf}
                         f0{90-bf}{80-bf}{80-bf}
                    {f1-f3}{80-bf}{80-bf}{80-bf}
                         f4{80-8f}{80-bf}{80-bf}',
  'UTF-16BE'    => '{00-d7,e0-ff}{00-ff}
                    {d8-db}{00-ff}{dc-df}{00-ff}',
  'UTF-16LE'    => '{00-ff}{00-d7,e0-ff}
                    {00-ff}{d8-db}{00-ff}{dc-df}',
  'UTF-32BE'    => '0000{00-d7,e0-ff}{00-ff}
                    00{01-10}{00-ff}{00-ff}',
  'UTF-32LE'    => '{00-ff}{00-d7,e0-ff}0000
                    {00-ff}{00-ff}{01-10}00',
  'EUC-JP'      => '{00-7f}
                    {a1-fe}{a1-fe}
                    8e{a1-fe}
                    8f{a1-fe}{a1-fe}',
  'CP51932'     => '{00-7f}
                    {a1-fe}{a1-fe}
                    8e{a1-fe}',
  'EUC-JP-2004' => '{00-7f}
                    {a1-fe}{a1-fe}
                    8e{a1-fe}
                    8f{a1-fe}{a1-fe}',
  'Shift_JIS'   => '{00-7f}
                    {81-9f,e0-fc}{40-7e,80-fc}
                    {a1-df}',
  'EUC-KR'      => '{00-7f}
                    {a1-fe}{a1-fe}',
  'CP949'       => '{00-7f}
                    {81-fe}{41-5a,61-7a,81-fe}',
  'Big5'        => '{00-7f}
                    {81-fe}{40-7e,a1-fe}',
  'EUC-TW'      => '{00-7f}
                    {a1-fe}{a1-fe}
                    8e{a1-b0}{a1-fe}{a1-fe}',
  'GBK'         => '{00-80}
                    {81-fe}{40-7e,80-fe}',
  'GB18030'     => '{00-7f}
                    {81-fe}{40-7e,80-fe}
                    {81-fe}{30-39}{81-fe}{30-39}',
}
950
# Returns the valid byte pattern registered for the encoding +enc+.
# Hash#fetch is used, so an unregistered name raises instead of returning nil.
def ValidEncoding(enc)
  ValidEncoding.fetch(enc)
end
954
# Registers the valid byte pattern for +encoding+.
#
# pattern_or_label - either a name already present in ValidEncoding, whose
#                    pattern is then reused, or a literal byte pattern.
#
# Returns the registered pattern.
# Raises ArgumentError when +encoding+ already has a different pattern.
def set_valid_byte_pattern(encoding, pattern_or_label)
  pattern = ValidEncoding[pattern_or_label] || pattern_or_label
  current = ValidEncoding[encoding]
  if current && current != pattern
    raise ArgumentError, "trying to change valid byte pattern for encoding #{encoding} from #{current} to #{pattern}"
  end
  ValidEncoding[encoding] = pattern
end
967
# The following encodings may be referred to from several places, so their
# valid byte patterns are registered here for the moment.  Each one reuses
# the pattern of the label given as the second argument.
set_valid_byte_pattern 'ASCII-8BIT', '1byte'
set_valid_byte_pattern 'Windows-31J', 'Shift_JIS'
set_valid_byte_pattern 'eucJP-ms', 'EUC-JP'
972
# Returns the signature text recorded in generated files for one source
# file: its (dumped) name, the length of its content and a checksum of it.
def make_signature(filename, src)
  format("src=%s, len=%s, checksum=%s", filename.dump, src.length, src.sum)
end
976
# Command line entry point:
#
#   transcode-tblgen.rb [--verbose] [--force] [--output=FILE] TEMPLATE
#
# TEMPLATE is evaluated as an ERB template (lines starting with "%" are Ruby
# code).  The result is written to FILE, or to standard output when no
# output file is given, preceded by signature comments for this script, the
# template and every library the template loaded from its own directory.
# If an existing output file carries signatures that still match, it is only
# touched and nothing is regenerated (unless --force is given).
if __FILE__ == $0
  start_time = Time.now

  output_filename = nil
  verbose_mode = false
  force_mode = false

  op = OptionParser.new
  op.def_option("--help", "show help message") { puts op; exit 0 }
  op.def_option("--verbose", "verbose mode") { verbose_mode = true }
  op.def_option("--force", "force table generation") { force_mode = true }
  op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
  op.parse!

  VERBOSE_MODE = verbose_mode

  OUTPUT_FILENAME = output_filename
  # C identifier prefix derived from the output file name: its leading run of
  # identifier characters, without leading underscores and with exactly one
  # trailing underscore.
  OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : ""
  OUTPUT_PREFIX.sub!(/\A_+/, '')
  OUTPUT_PREFIX.sub!(/_*\z/, '_')

  TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array")
  TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array")

  arg = ARGV.shift
  # Fail with a readable message instead of a TypeError from File.dirname(nil).
  abort "#{File.basename($0)}: no input template given\n#{op}" unless arg
  $srcdir = File.dirname(arg)
  $:.unshift $srcdir unless $:.include? $srcdir
  src = File.read(arg)
  src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
  this_script = File.read(__FILE__)
  this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding

  base_signature = "/* autogenerated. */\n"
  base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
  base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"

  if !force_mode && output_filename && File.readable?(output_filename)
    # The signature block is the first paragraph of the output file; an empty
    # file yields nil from gets, which is treated as an empty signature.
    old_signature = File.open(output_filename) {|f| f.gets("").to_s.chomp }
    chk_signature = base_signature.dup
    # Recompute the signatures of the libraries recorded in the old output.
    old_signature.each_line {|line|
      if %r{/\* src="([0-9a-z_.-]+)",} =~ line
        name = $1
        next if name == File.basename(arg) || name == File.basename(__FILE__)
        path = File.join($srcdir, name)
        if File.readable? path
          chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
        end
      end
    }
    if old_signature == chk_signature
      now = Time.now
      File.utime(now, now, output_filename)
      STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
      exit
    end
  end

  if VERBOSE_MODE
    if output_filename
      STDERR.puts "generating #{output_filename} ..."
    end
  end

  # Libraries loaded while the template runs are found by comparing $"
  # before and after.
  libs1 = $".dup
  # ERB.new's positional trim-mode argument is deprecated in favour of the
  # trim_mode keyword; use the keyword whenever this ERB accepts keywords and
  # fall back to the positional form on older versions.
  erb_initialize = ERB.instance_method(:initialize)
  if erb_initialize.respond_to?(:parameters) && erb_initialize.parameters.assoc(:key)
    erb = ERB.new(src, :trim_mode => '%')
  else
    erb = ERB.new(src, nil, '%')
  end
  erb.filename = arg
  erb_result = erb.result(binding)
  libs2 = $".dup

  libs = libs2 - libs1
  lib_sigs = ''
  libs.each {|lib|
    lib = File.basename(lib)
    path = File.join($srcdir, lib)
    if File.readable? path
      lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
    end
  }

  result = ''
  result << base_signature
  result << lib_sigs
  result << "\n"
  result << erb_result
  result << "\n"

  if output_filename
    # Write to a temporary name first and rename, so the output file is
    # replaced in a single step.
    new_filename = output_filename + ".new"
    FileUtils.mkdir_p(File.dirname(output_filename))
    File.open(new_filename, "wb") {|f| f << result }
    File.rename(new_filename, output_filename)
    tms = Process.times
    elapsed = Time.now - start_time
    STDERR.puts "done.  (#{'%.2f' % tms.utime}user #{'%.2f' % tms.stime}system #{'%.2f' % elapsed}elapsed)" if VERBOSE_MODE
  else
    print result
  end
end
1075