# Transcoding table generator.
#
# Run as a script it takes one positional argument, an ERB template (with
# '%' line mode enabled), evaluates it with the helpers defined below in
# scope, and writes the result prefixed by signature comments.
#
# Options: --help, --verbose, --force, --output=FILE

require 'optparse'
require 'erb'
require 'fileutils'
require 'pp'

# Fallback for Ruby versions whose Array lacks #product.
class Array
  unless [].respond_to? :product
    # Cartesian product of self and the given arrays.
    def product(*args)
      if args.empty?
        self.map {|e| [e] }
      else
        result = []
        self.each {|e0|
          result.concat args.first.product(*args[1..-1]).map {|es| [e0, *es] }
        }
        result
      end
    end
  end
end

# Fallback for Ruby versions whose String lacks #start_with?.
class String
  unless "".respond_to? :start_with?
    # True if self begins with any of the given prefixes.
    def start_with?(*prefixes)
      prefixes.each {|prefix|
        return true if prefix.length <= self.length && prefix == self[0, prefix.length]
      }
      false
    end
  end
end

# Number of words appended to the word array for each lookup node
# (its offsets reference and its infos reference).
NUM_ELEM_BYTELOOKUP = 2

# Character => C string-literal escape sequence.
C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}

# Control characters (unless already listed above) and bytes 0x7f..0xff are
# emitted as three-digit octal escapes.
0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
C_ESC_PAT = Regexp.union(*C_ESC.keys)

# Returns +str+ as a double-quoted C string literal.
def c_esc(str)
  '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
end

# Matches one byte written as two hexadecimal digits.
HEX2 = /(?:[0-9A-Fa-f]{2})/

# Accumulates the text and the element count of one C array definition.
class ArrayCode
  # type:: C element type of the array
  # name:: C identifier of the array
  def initialize(type, name)
    @type = type
    @name = name
    @len = 0
    @content = ''.dup # appended to in place
  end

  # Number of elements appended so far.
  def length
    @len
  end

  # Appends +str+ (C source text) that contributes +num+ elements.
  def insert_at_last(num, str)
    @content << str
    @len += num
  end

  # The complete C definition of the array.
  def to_s
    <<"End"
static const #{@type}
#{@name}[#{@len}] = {
#{@content}};
End
  end
end

# Leaf of a lookup tree. Compares and hashes by its wrapped value.
class Action
  def initialize(value)
    @value = value
  end
  attr_reader :value

  def hash
    @value.hash
  end

  def eql?(other)
    self.class == other.class &&
    @value == other.value
  end
  alias == eql?
end

# Edge of a lookup tree: an inclusive byte range leading to a child tree.
# The hash is computed once at construction.
class Branch
  def initialize(byte_min, byte_max, child_tree)
    @byte_min = byte_min
    @byte_max = byte_max
    @child_tree = child_tree
    @hash = byte_min.hash ^ byte_max.hash ^ child_tree.hash
  end
  attr_reader :byte_min, :byte_max, :child_tree, :hash

  def eql?(other)
    self.class == other.class &&
    @hash == other.hash &&
    @byte_min == other.byte_min &&
    @byte_max == other.byte_max &&
    @child_tree == other.child_tree
  end
  alias == eql?
end

# A tree mapping byte sequences to actions. The tree is either an Action
# (the empty sequence maps to it) or an array of Branch objects.
class ActionMap
  # Converts a pattern => action mapping into a list of "rects":
  # [min_hex, max_hex, action] triples of equal-length upper-case hex strings.
  #
  # Accepted patterns:
  # * "(empset)" - contributes nothing
  # * "(empstr)" - the empty byte sequence
  # * a plain hex string
  # * whitespace-separated sequences whose elements are a hex byte or a
  #   "{..}" set of bytes and byte ranges, e.g. "{81-9f,e0-fc}{40-7e}"
  #
  # Raises ArgumentError for anything else.
  def self.parse_to_rects(mapping)
    rects = []
    mapping.each {|pat, action|
      pat = pat.to_s
      if /\A\s*\(empset\)\s*\z/ =~ pat
        next
      elsif /\A\s*\(empstr\)\s*\z/ =~ pat
        rects << ['', '', action]
      elsif /\A\s*(#{HEX2}+)\s*\z/o =~ pat
        hex = $1.upcase
        rects << [hex, hex, action]
      elsif /\A\s*((#{HEX2}|\{#{HEX2}(?:-#{HEX2})?(,#{HEX2}(?:-#{HEX2})?)*\})+(\s+|\z))*\z/o =~ pat
        pat = pat.upcase
        pat.scan(/\S+/) {
          pat1 = $&
          # One list of [min, max] byte ranges per position of the sequence.
          ranges_list = []
          pat1.scan(/#{HEX2}|\{([^\}]*)\}/o) {
            ranges_list << []
            if !$1
              ranges_list.last << [$&,$&]
            else
              set = {}
              $1.scan(/(#{HEX2})(?:-(#{HEX2}))?/o) {
                if !$2
                  c = $1.to_i(16)
                  set[c] = true
                else
                  b = $1.to_i(16)
                  e = $2.to_i(16)
                  b.upto(e) {|c| set[c] = true }
                end
              }
              # Collapse the byte set into maximal contiguous ranges.
              i = nil
              0.upto(256) {|j|
                if set[j]
                  if !i
                    i = j
                  end
                  if !set[j+1]
                    ranges_list.last << ["%02X" % i, "%02X" % j]
                    i = nil
                  end
                end
              }
            end
          }
          first_ranges = ranges_list.shift
          first_ranges.product(*ranges_list).each {|range_list|
            min = range_list.map {|x, y| x }.join
            max = range_list.map {|x, y| y }.join
            rects << [min, max, action]
          }
        }
      else
        raise ArgumentError, "invalid pattern: #{pat.inspect}"
      end
    }
    rects
  end

  # Returns the single action in +actions0+. :nomap0 is dropped when it is
  # what makes the list ambiguous. Raises ArgumentError if no single action
  # remains.
  def self.unambiguous_action(actions0)
    actions = actions0.uniq
    if actions.length == 1
      actions[0]
    else
      actions.delete(:nomap0)
      if actions.length == 1
        actions[0]
      else
        raise ArgumentError, "ambiguous actions: #{actions0.inspect}"
      end
    end
  end

  # Builds a tree from rects, requiring an unambiguous action at every leaf.
  def self.build_tree(rects)
    expand(rects) {|prefix, actions|
      unambiguous_action(actions)
    }
  end

  # Builds an ActionMap from a pattern => action mapping.
  def self.parse(mapping)
    rects = parse_to_rects(mapping)
    tree = build_tree(rects)
    self.new(tree)
  end

  # Merges two or more rect lists. The block is called for every leaf as
  # (prefix, actions_from_list_0, actions_from_list_1, ...) and returns the
  # action to store. Raises ArgumentError when given fewer than two lists.
  def self.merge_rects(*rects_list)
    if rects_list.length < 2
      raise ArgumentError, "not enough arguments"
    end

    all_rects = []
    rects_list.each_with_index {|rects, i|
      all_rects.concat rects.map {|min, max, action| [min, max, [i, action]] }
    }

    tree = expand(all_rects) {|prefix, actions|
      args = Array.new(rects_list.length) { [] }
      actions.each {|i, action|
        args[i] << action
      }
      yield(prefix, *args)
    }

    self.new(tree)
  end

  # Like merge_rects, but takes pattern => action mappings.
  def self.merge(*mappings, &block)
    merge_rects(*mappings.map {|m| parse_to_rects(m) }, &block)
  end

  # Merges exactly two mappings. The block is called for every leaf as
  # (prefix, actions_from_map1, actions_from_map2).
  def self.merge2(map1, map2, &block)
    rects1 = parse_to_rects(map1)
    rects2 = parse_to_rects(map2)

    # Each rect's action is replaced by its index into +actions+; indexes
    # below +boundary+ come from map1, the others from map2.
    actions = []
    all_rects = []

    rects1.each {|rect|
      action = rect[2]
      rect[2] = actions.length
      actions << action
      all_rects << rect
    }

    boundary = actions.length

    rects2.each {|rect|
      action = rect[2]
      rect[2] = actions.length
      actions << action
      all_rects << rect
    }

    tree = expand(all_rects) {|prefix, as0|
      as1 = []
      as2 = []
      as0.each {|i|
        if i < boundary
          as1 << actions[i]
        else
          as2 << actions[i]
        end
      }
      yield(prefix, as1, as2)
    }

    self.new(tree)
  end

  # Builds a tree from rects. Rects with min == max ("singletons") are kept
  # in @singleton_rects, sorted so that the smallest is last and can be
  # popped; the remaining "region" rects are passed down the recursion.
  # The block receives (prefix, actions) for each leaf.
  def self.expand(rects, &block)
    #numsing = numreg = 0
    #rects.each {|min, max, action| if min == max then numsing += 1 else numreg += 1 end }
    #puts "#{numsing} singleton mappings and #{numreg} region mappings."
    singleton_rects = []
    region_rects = []
    rects.each {|rect|
      min, max, action = rect
      if min == max
        singleton_rects << rect
      else
        region_rects << rect
      end
    }
    @singleton_rects = singleton_rects.sort_by {|min, max, action| min }
    @singleton_rects.reverse!
    ret = expand_rec("", region_rects, &block)
    @singleton_rects = nil
    ret
  end

  # Scratch hash shared by expand_rec and each_firstbyte_range; each user
  # clears it again before control leaves to other code.
  TMPHASH = {}

  # Recursive worker of expand. Returns an Action when +prefix+ is a complete
  # sequence, otherwise an array of Branch objects for the next byte.
  def self.expand_rec(prefix, region_rects, &block)
    # Nothing to do: no regions and no pending singleton under this prefix.
    return region_rects if region_rects.empty? && !((s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix))
    # s_rect is only read when region_rects is empty, in which case the
    # guard above has assigned it.
    if region_rects.empty? ? s_rect[0].length == prefix.length : region_rects[0][0].empty?
      h = TMPHASH
      while (s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix)
        min, max, action = @singleton_rects.pop
        raise ArgumentError, "ambiguous pattern: #{prefix}" if min.length != prefix.length
        h[action] = true
      end
      region_rects.each {|min, max, action|
        raise ArgumentError, "ambiguous pattern: #{prefix}" if !min.empty?
        h[action] = true
      }
      tree = Action.new(block.call(prefix, h.keys))
      h.clear
    else
      tree = []
      each_firstbyte_range(prefix, region_rects) {|byte_min, byte_max, r_rects2|
        if byte_min == byte_max
          prefix2 = prefix + "%02X" % byte_min
        else
          prefix2 = prefix + "{%02X-%02X}" % [byte_min, byte_max]
        end
        child_tree = expand_rec(prefix2, r_rects2, &block)
        tree << Branch.new(byte_min, byte_max, child_tree)
      }
    end
    return tree
  end

  # Splits the first byte of the region rects and of the pending singletons
  # under +prefix+ into ranges, and yields (byte_min, byte_max, rest_rects)
  # for each range, where rest_rects are the region rects covering that
  # range with their first byte removed.
  def self.each_firstbyte_range(prefix, region_rects)
    index_from = TMPHASH

    # Collect the boundaries where the set of covering regions may change.
    region_ary = []
    region_rects.each {|min, max, action|
      raise ArgumentError, "ambiguous pattern: #{prefix}" if min.empty?
      min_firstbyte = min[0,2].to_i(16)
      min_rest = min[2..-1]
      max_firstbyte = max[0,2].to_i(16)
      max_rest = max[2..-1]
      region_ary << [min_firstbyte, max_firstbyte, [min_rest, max_rest, action]]
      index_from[min_firstbyte] = true
      index_from[max_firstbyte+1] = true
    }

    # Boundaries in descending order, so that pop returns the smallest.
    byte_from = Array.new(index_from.size)
    bytes = index_from.keys
    bytes.sort!
    bytes.reverse!
    bytes.each_with_index {|byte, i|
      index_from[byte] = i
      byte_from[i] = byte
    }

    region_rects_ary = Array.new(index_from.size) { [] }
    region_ary.each {|min_firstbyte, max_firstbyte, rest_elt|
      index_from[min_firstbyte].downto(index_from[max_firstbyte+1]+1) {|i|
        region_rects_ary[i] << rest_elt
      }
    }

    # Release the shared scratch hash before yielding; the block re-enters
    # expand_rec, which uses it too.
    index_from.clear

    r_rects = region_rects_ary.pop
    region_byte = byte_from.pop
    prev_r_start = region_byte
    prev_r_rects = []
    # Walk region boundaries and pending singletons together, in byte order.
    while r_rects && (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      min_byte = singleton_byte < region_byte ? singleton_byte : region_byte
      if prev_r_start < min_byte && !prev_r_rects.empty?
        yield prev_r_start, min_byte-1, prev_r_rects
      end
      if region_byte < singleton_byte
        prev_r_start = region_byte
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
      elsif region_byte > singleton_byte
        yield singleton_byte, singleton_byte, prev_r_rects
        prev_r_start = singleton_byte+1
      else # region_byte == singleton_byte
        prev_r_start = region_byte+1
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
        yield singleton_byte, singleton_byte, prev_r_rects
      end
    end

    # Remaining region boundaries.
    while r_rects
      if prev_r_start < region_byte && !prev_r_rects.empty?
        yield prev_r_start, region_byte-1, prev_r_rects
      end
      prev_r_start = region_byte
      prev_r_rects = r_rects
      r_rects = region_rects_ary.pop
      region_byte = byte_from.pop
    end

    # Remaining singletons; the yielded block (expand_rec) pops them.
    while (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      yield singleton_byte, singleton_byte, []
    end
  end

  def initialize(tree)
    @tree = tree
  end

  def inspect
    "\#<#{self.class}:" +
    @tree.inspect +
    ">"
  end

  # Depth of +tree+: 0 for an Action, otherwise one more than the deepest
  # child.
  def max_input_length_rec(tree)
    case tree
    when Action
      0
    else
      tree.map {|branch|
        max_input_length_rec(branch.child_tree)
      }.max + 1
    end
  end

  # Length in bytes of the longest input sequence in this map.
  def max_input_length
    max_input_length_rec(@tree)
  end

  # The action for the empty sequence, or nil when the tree is not a leaf.
  def empty_action
    if @tree.kind_of? Action
      @tree.value
    else
      nil
    end
  end

  # Already emitted offset tables / info tables => their C names.
  OffsetsMemo = {}
  InfosMemo = {}

  # C source for an offsets table: "min, max," followed by the offsets for
  # min..max, sixteen per line in two groups of eight.
  def format_offsets(min, max, offsets)
    offsets = offsets[min..max]
    code = "%d, %d,\n" % [min, max]
    0.step(offsets.length-1,16) {|i|
      code << " "
      code << offsets[i,8].map {|off| "%3d," % off }.join('')
      if i+8 < offsets.length
        code << " "
        code << offsets[i+8,8].map {|off| "%3d," % off }.join('')
      end
      code << "\n"
    }
    code
  end

  # C names already handed out by str_name.
  UsedName = {}

  # Hex string => C name of its emitted byte string.
  StrMemo = {}

  # Picks an unused C name for the byte string +bytes+ (hex digits) and
  # records it. Candidates, in order: "str1_" plus the identifier characters
  # of the raw bytes, "str1_" plus the hex digits, "str1s_" plus the current
  # byte array length.
  def str_name(bytes)
    size = @bytes_code.length
    rawbytes = [bytes].pack("H*")

    n = nil
    if !n && !(suf = rawbytes.gsub(/[^A-Za-z0-9_]/, '')).empty? && !UsedName[nn = "str1_" + suf] then n = nn end
    if !n && !UsedName[nn = "str1_" + bytes] then n = nn end
    n ||= "str1s_#{size}"

    StrMemo[bytes] = n
    UsedName[n] = true
    n
  end

  # Emits the byte string +bytes+ (hex digits) into the byte array unless
  # that was already done, and returns its C name.
  def gen_str(bytes)
    if n = StrMemo[bytes]
      n
    else
      len = bytes.length/2
      size = @bytes_code.length
      n = str_name(bytes)
      @bytes_code.insert_at_last(1 + len,
        "\#define #{n} makeSTR1(#{size})\n" +
        " makeSTR1LEN(#{len})," + bytes.gsub(/../, ' 0x\&,') + "\n\n")
      n
    end
  end

  # Translates an action into the C expression stored in an infos table.
  # Raises RuntimeError for an action of no known form.
  def generate_info(info)
    case info
    when :nomap, :nomap0
      # :nomap0 is low priority.  it never collides.
      "NOMAP"
    when :undef
      "UNDEF"
    when :invalid
      "INVALID"
    when :func_ii
      "FUNii"
    when :func_si
      "FUNsi"
    when :func_io
      "FUNio"
    when :func_so
      "FUNso"
    when /\A(#{HEX2})\z/o
      "o1(0x#$1)"
    when /\A(#{HEX2})(#{HEX2})\z/o
      "o2(0x#$1,0x#$2)"
    when /\A(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o3(0x#$1,0x#$2,0x#$3)"
    when /funsio\((\d+)\)/
      "funsio(#{$1})"
    when /\A(#{HEX2})(3[0-9])(#{HEX2})(3[0-9])\z/o
      "g4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(f[0-7])(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(#{HEX2}){4,259}\z/o
      gen_str(info.upcase)
    when /\A\/\*BYTE_LOOKUP\*\// # pointer to BYTE_LOOKUP structure
      $'.to_s
    else
      raise "unexpected action: #{info.inspect}"
    end
  end

  # C source for an infos table: right-aligned entries, four per line when
  # the longest entry has at most 16 characters, otherwise two.
  def format_infos(infos)
    infos = infos.map {|info| generate_info(info) }
    maxlen = infos.map {|info| info.length }.max
    columns = maxlen <= 16 ? 4 : 2
    code = "".dup
    0.step(infos.length-1, columns) {|i|
      code << " "
      is = infos[i,columns]
      is.each {|info|
        code << sprintf(" %#{maxlen}s,", info)
      }
      code << "\n"
    }
    code
  end

  # Emits the lookup node +name+ for +table+ (one action per byte value):
  # an offsets table in the byte array and an infos table in the word array,
  # each reused if an identical one was emitted before, plus the node itself.
  def generate_lookup_node(name, table)
    bytes_code = @bytes_code
    words_code = @words_code
    offsets = []
    infos = []
    infomap = {}
    min = max = nil
    table.each_with_index {|action, byte|
      action ||= :invalid
      if action != :invalid
        min = byte if !min
        max = byte
      end
      unless o = infomap[action]
        infomap[action] = o = infos.length
        infos[o] = action
      end
      offsets[byte] = o
    }
    infomap.clear
    if !min
      min = max = 0
    end

    offsets_key = [min, max, offsets[min..max]]
    if n = OffsetsMemo[offsets_key]
      offsets_name = n
    else
      offsets_name = "#{name}_offsets"
      OffsetsMemo[offsets_key] = offsets_name
      size = bytes_code.length
      bytes_code.insert_at_last(2+max-min+1,
        "\#define #{offsets_name} #{size}\n" +
        format_offsets(min,max,offsets) + "\n")
    end

    if n = InfosMemo[infos]
      infos_name = n
    else
      infos_name = "#{name}_infos"
      InfosMemo[infos] = infos_name

      size = words_code.length
      words_code.insert_at_last(infos.length,
        "\#define #{infos_name} WORDINDEX2INFO(#{size})\n" +
        format_infos(infos) + "\n")
    end

    size = words_code.length
    words_code.insert_at_last(NUM_ELEM_BYTELOOKUP,
      "\#define #{name} WORDINDEX2INFO(#{size})\n" +
      <<"End" + "\n")
    #{offsets_name},
    #{infos_name},
End
  end

  # Tree => C name of the node already generated for it.
  PreMemo = {}
  # Suffix for the next automatically named node; advanced in place.
  NextName = "a".dup

  # Generates the node for @tree, and recursively for its subtrees, and
  # returns its C name. An equal tree that was generated before is reused.
  def generate_node(name_hint=nil)
    if n = PreMemo[@tree]
      return n
    end

    table = Array.new(0x100, :invalid)
    @tree.each {|branch|
      byte_min, byte_max, child_tree = branch.byte_min, branch.byte_max, branch.child_tree
      rest = ActionMap.new(child_tree)
      if a = rest.empty_action
        table.fill(a, byte_min..byte_max)
      else
        name_hint2 = nil
        if name_hint
          name_hint2 = "#{name_hint}_#{byte_min == byte_max ? '%02X' % byte_min : '%02Xto%02X' % [byte_min, byte_max]}"
        end
        v = "/*BYTE_LOOKUP*/" + rest.gennode(@bytes_code, @words_code, name_hint2)
        table.fill(v, byte_min..byte_max)
      end
    }

    if !name_hint
      name_hint = "fun_" + NextName
      NextName.succ!
    end

    PreMemo[@tree] = name_hint

    generate_lookup_node(name_hint, table)
    name_hint
  end

  # Generates C code for this map into the given ArrayCode objects and
  # returns the C name of the root node.
  def gennode(bytes_code, words_code, name_hint=nil)
    @bytes_code = bytes_code
    @words_code = words_code
    name = generate_node(name_hint)
    @bytes_code = nil
    @words_code = nil
    return name
  end
end

# Converts (csid, index) to a byte sequence in lower-case hex for the
# 'mskanji' scheme. Raises RuntimeError for a row or column out of range.
def citrus_mskanji_cstomb(csid, index)
  case csid
  when 0
    index
  when 1
    index + 0x80
  when 2, 3
    row = index >> 8
    raise "invalid byte sequence" if row < 0x21
    if csid == 3
      if row <= 0x2F
        offset = (row == 0x22 || row >= 0x26) ? 0xED : 0xF0
      elsif row >= 0x4D && row <= 0x7E
        offset = 0xCE
      else
        raise "invalid byte sequence"
      end
    else
      raise "invalid byte sequence" if row > 0x97
      offset = (row < 0x5F) ? 0x81 : 0xC1
    end
    col = index & 0xFF
    raise "invalid byte sequence" if (col < 0x21 || col > 0x7E)

    row -= 0x21
    col -= 0x21
    if (row & 1) == 0
      col += 0x40
      col += 1 if (col >= 0x7F)
    else
      col += 0x9F
    end
    row = row / 2 + offset
    (row << 8) | col
  end.to_s(16)
end

# Converts (csid, index) to a byte sequence in lower-case hex for the
# 'euc' scheme.
def citrus_euc_cstomb(csid, index)
  case csid
  when 0x0000
    index
  when 0x8080
    index | 0x8080
  when 0x0080
    index | 0x8E80
  when 0x8000
    index | 0x8F8080
  end.to_s(16)
end

# Converts (csid, index) to a byte sequence in lower-case hex for the
# 'stateless_iso' scheme.
def citrus_stateless_iso_cstomb(csid, index)
  (index | 0x8080 | (csid << 16)).to_s(16)
end

# Dispatches to the converter for the scheme +ces+; returns nil for any
# other scheme name.
def citrus_cstomb(ces, csid, index)
  case ces
  when 'mskanji'
    citrus_mskanji_cstomb(csid, index)
  when 'euc'
    citrus_euc_cstomb(csid, index)
  when 'stateless_iso'
    citrus_stateless_iso_cstomb(csid, index)
  end
end

# Subdirectory names that a mapsrc name may start with.
SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC EMOJI GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/

# Reads the comma-separated map sources +mapsrcs+ from below $srcdir and
# returns the table of [from, to] pairs found between their BEGIN_MAP and
# END_MAP lines. A source whose name starts with "UCS" maps from UCS,
# any other maps to UCS.
def citrus_decode_mapsrc(ces, csid, mapsrcs)
  table = []
  mapsrcs.split(',').each do |mapsrc|
    path = [$srcdir]
    mode = nil
    # rindex(x, 0) only matches at offset 0, i.e. it is a prefix test.
    if mapsrc.rindex(/UCS(?:@[A-Z]+)?/, 0)
      mode = :from_ucs
      from = mapsrc[$&.size+1..-1]
      path << SUBDIR.find{|x| from.rindex(x, 0) }
    else
      mode = :to_ucs
      path << SUBDIR.find{|x| mapsrc.rindex(x, 0) }
    end
    if /\bUCS@(BMP|SMP|SIP|TIP|SSP)\b/ =~ mapsrc
      plane = {"BMP"=>0, "SMP"=>1, "SIP"=>2, "TIP"=>3, "SSP"=>14}[$1]
    else
      plane = 0
    end
    plane <<= 16
    path << mapsrc.gsub(':', '@')
    path = File.join(*path)
    path << ".src"
    # The last path separator is replaced by '%' in the file name.
    path[path.rindex('/')] = '%'
    STDERR.puts 'load mapsrc %s' % path if VERBOSE_MODE
    # File.open rather than Kernel#open: the path must never be taken as a
    # command.
    File.open(path) do |f|
      f.each_line do |l|
        break if /^BEGIN_MAP/ =~ l
      end
      f.each_line do |l|
        next if /^\s*(?:#|$)/ =~ l
        break if /^END_MAP/ =~ l
        case mode
        when :from_ucs
          case l
          when /0x(\w+)\s*-\s*0x(\w+)\s*=\s*INVALID/
            # Citrus OOB_MODE
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [plane | $1.hex, citrus_cstomb(ces, csid, $2.hex)]
          else
            raise "unknown notation '%s'"% l
          end
        when :to_ucs
          case l
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [citrus_cstomb(ces, csid, $1.hex), plane | $2.hex]
          else
            raise "unknown notation '%s'"% l
          end
        end
      end
    end
  end
  return table
end

# Reads $srcdir/ucm/+path+ and returns [to_ucs, from_ucs]. Lines flagged
# |0 go to both tables, |3 only to to_ucs and |1 only to from_ucs.
def import_ucm(path)
  to_ucs = []
  from_ucs = []
  File.foreach(File.join($srcdir, "ucm", path)) do |line|
    uc, bs, fb = nil
    if /^<U([0-9a-fA-F]+)>\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      uc = $1.hex
      bs = $2.delete('x\\')
      fb = $3.to_i
      # Skip code points below 128 that map to themselves.
      next if uc < 128 && uc == bs.hex
    elsif /^([<U0-9a-fA-F>+]+)\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      # Several code points: store them as the hex of their UTF-8 bytes.
      uc = $1.scan(/[0-9a-fA-F]+>/).map(&:hex).pack("U*").unpack("H*")[0]
      bs = $2.delete('x\\')
      fb = $3.to_i
    end
    to_ucs << [bs, uc] if fb == 0 || fb == 3
    from_ucs << [uc, bs] if fb == 0 || fb == 1
  end
  [to_ucs, from_ucs]
end

# Returns a copy of +map+ in which every Integer key or value is replaced
# by the upper-case hex of its UTF-8 encoding.
def encode_utf8(map)
  r = []
  map.each {|k, v|
    # integer means UTF-8 encoded sequence.
    k = [k].pack("U").unpack("H*")[0].upcase if Integer === k
    v = [v].pack("U").unpack("H*")[0].upcase if Integer === v
    r << [k,v]
  }
  r
end

# Default for valid_encoding parameters: look the pattern up by the source
# encoding name.
UnspecifiedValidEncoding = Object.new

# Compiles +map+ into lookup nodes named after +name+ and returns
# [C name of the root node, maximum input length in bytes].
# When a valid byte pattern is known, every valid sequence that +map+ does
# not cover maps to :undef, and a key outside the pattern raises.
def transcode_compile_tree(name, from, map, valid_encoding)
  map = encode_utf8(map)
  h = {}
  map.each {|k, v|
    h[k] = v unless h[k] # use first mapping
  }
  if valid_encoding.equal? UnspecifiedValidEncoding
    valid_encoding = ValidEncoding.fetch(from)
  end
  if valid_encoding
    am = ActionMap.merge2(h, {valid_encoding => :undef}) {|prefix, as1, as2|
      a1 = as1.empty? ? nil : ActionMap.unambiguous_action(as1)
      a2 = as2.empty? ? nil : ActionMap.unambiguous_action(as2)
      if !a2
        raise "invalid mapping: #{prefix}"
      end
      a1 || a2
    }
  else
    am = ActionMap.parse(h)
  end
  h.clear

  max_input = am.max_input_length
  defined_name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name)
  return defined_name, max_input
end

# C names of the transcoders generated so far.
TRANSCODERS = []
# C definitions of the transcoders generated so far; appended to in place.
TRANSCODE_GENERATED_TRANSCODER_CODE = ''.dup

# Generates only the lookup tables for converting +from+ to +to+.
# Returns [map, tree_name, real_tree_name, max_input].
def transcode_tbl_only(from, to, map, valid_encoding=UnspecifiedValidEncoding)
  if VERBOSE_MODE
    if from.empty? || to.empty?
      STDERR.puts "converter for #{from.empty? ? to : from}"
    else
      STDERR.puts "converter from #{from} to #{to}"
    end
  end
  id_from = from.tr('^0-9A-Za-z', '_')
  id_to = to.tr('^0-9A-Za-z', '_')
  if from == "UTF-8"
    tree_name = "to_#{id_to}"
  elsif to == "UTF-8"
    tree_name = "from_#{id_from}"
  else
    tree_name = "from_#{id_from}_to_#{id_to}"
  end
  real_tree_name, max_input = transcode_compile_tree(tree_name, from, map, valid_encoding)
  return map, tree_name, real_tree_name, max_input
end

# Generates the lookup tables and the rb_transcoder definition for
# converting +from+ to +to+. Returns '' so that a template can interpolate
# the call without emitting anything at that point.
def transcode_tblgen(from, to, map, valid_encoding=UnspecifiedValidEncoding)
  map, tree_name, real_tree_name, max_input = transcode_tbl_only(from, to, map, valid_encoding)
  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name
  input_unit_length = UnitLength[from]
  max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
  transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
    #{c_esc from}, #{c_esc to}, #{real_tree_name},
    TRANSCODE_TABLE_INFO,
    #{input_unit_length}, /* input_unit_length */
    #{max_input}, /* max_input */
    #{max_output}, /* max_output */
    asciicompat_converter, /* asciicompat_type */
    0, NULL, NULL, /* state_size, state_init, state_fini */
    NULL, NULL, NULL, NULL,
    NULL, NULL, NULL
};
End
  TRANSCODE_GENERATED_TRANSCODER_CODE << transcoder_code
  ''
end

# Generates the lookup nodes of the ActionMap +am+. Returns ''.
def transcode_generate_node(am, name_hint=nil)
  STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
  am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name_hint)
  ''
end

# All C code generated so far: the byte array, the word array, the
# TRANSCODE_TABLE_INFO macro and the transcoder definitions.
def transcode_generated_code
  TRANSCODE_GENERATED_BYTES_CODE.to_s +
    TRANSCODE_GENERATED_WORDS_CODE.to_s +
    "\#define TRANSCODE_TABLE_INFO " +
    "#{OUTPUT_PREFIX}byte_array, #{TRANSCODE_GENERATED_BYTES_CODE.length}, " +
    "#{OUTPUT_PREFIX}word_array, #{TRANSCODE_GENERATED_WORDS_CODE.length}, " +
    "((int)sizeof(unsigned int))\n" +
    TRANSCODE_GENERATED_TRANSCODER_CODE
end

# C statements registering every generated transcoder.
def transcode_register_code
  code = ''.dup
  TRANSCODERS.each {|transcoder_name|
    code << "  rb_register_transcoder(&#{transcoder_name});\n"
  }
  code
end

# Input unit length in bytes per encoding; 1 unless listed.
UnitLength = {
  'UTF-16BE' => 2,
  'UTF-16LE' => 2,
  'UTF-32BE' => 4,
  'UTF-32LE' => 4,
}
UnitLength.default = 1

# Encoding name (or generic label) => pattern of its valid byte sequences,
# in the syntax accepted by ActionMap.parse_to_rects.
ValidEncoding = {
  '1byte' => '{00-ff}',
  '2byte' => '{00-ff}{00-ff}',
  '4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}',
  'US-ASCII' => '{00-7f}',
  'UTF-8' => '{00-7f}
              {c2-df}{80-bf}
              e0{a0-bf}{80-bf}
              {e1-ec}{80-bf}{80-bf}
              ed{80-9f}{80-bf}
              {ee-ef}{80-bf}{80-bf}
              f0{90-bf}{80-bf}{80-bf}
              {f1-f3}{80-bf}{80-bf}{80-bf}
              f4{80-8f}{80-bf}{80-bf}',
  'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
                 {d8-db}{00-ff}{dc-df}{00-ff}',
  'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
                 {00-ff}{d8-db}{00-ff}{dc-df}',
  'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
                 00{01-10}{00-ff}{00-ff}',
  'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
                 {00-ff}{00-ff}{01-10}00',
  'EUC-JP' => '{00-7f}
               {a1-fe}{a1-fe}
               8e{a1-fe}
               8f{a1-fe}{a1-fe}',
  'CP51932' => '{00-7f}
                {a1-fe}{a1-fe}
                8e{a1-fe}',
  'EUC-JP-2004' => '{00-7f}
                    {a1-fe}{a1-fe}
                    8e{a1-fe}
                    8f{a1-fe}{a1-fe}',
  'Shift_JIS' => '{00-7f}
                  {81-9f,e0-fc}{40-7e,80-fc}
                  {a1-df}',
  'EUC-KR' => '{00-7f}
               {a1-fe}{a1-fe}',
  'CP949' => '{00-7f}
              {81-fe}{41-5a,61-7a,81-fe}',
  'Big5' => '{00-7f}
             {81-fe}{40-7e,a1-fe}',
  'EUC-TW' => '{00-7f}
               {a1-fe}{a1-fe}
               8e{a1-b0}{a1-fe}{a1-fe}',
  'GBK' => '{00-80}
            {81-fe}{40-7e,80-fe}',
  'GB18030' => '{00-7f}
                {81-fe}{40-7e,80-fe}
                {81-fe}{30-39}{81-fe}{30-39}',
}

# Returns the valid byte pattern of +enc+; raises if none is registered.
def ValidEncoding(enc)
  ValidEncoding.fetch(enc)
end

# Registers the valid byte pattern of +encoding+. +pattern_or_label+ is
# either a pattern or the name of an already registered entry to copy.
# Raises ArgumentError when +encoding+ already has a different pattern.
def set_valid_byte_pattern(encoding, pattern_or_label)
  pattern =
    if ValidEncoding[pattern_or_label]
      ValidEncoding[pattern_or_label]
    else
      pattern_or_label
    end
  if ValidEncoding[encoding] and ValidEncoding[encoding]!=pattern
    raise ArgumentError, "trying to change valid byte pattern for encoding #{encoding} from #{ValidEncoding[encoding]} to #{pattern}"
  end
  ValidEncoding[encoding] = pattern
end

# the following may be used in different places, so keep them here for the moment
set_valid_byte_pattern 'ASCII-8BIT', '1byte'
set_valid_byte_pattern 'Windows-31J', 'Shift_JIS'
set_valid_byte_pattern 'eucJP-ms', 'EUC-JP'

# Signature text for a source file: its name, length and checksum.
def make_signature(filename, src)
  "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}"
end

if __FILE__ == $0
  start_time = Time.now

  output_filename = nil
  verbose_mode = false
  force_mode = false

  op = OptionParser.new
  op.def_option("--help", "show help message") { puts op; exit 0 }
  op.def_option("--verbose", "verbose mode") { verbose_mode = true }
  op.def_option("--force", "force table generation") { force_mode = true }
  op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
  op.parse!

  VERBOSE_MODE = verbose_mode

  OUTPUT_FILENAME = output_filename
  # Leading identifier characters of the output file name, without leading
  # underscores and with exactly one trailing underscore. The .dup keeps the
  # "" fallback mutable for the sub! calls.
  OUTPUT_PREFIX = (output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : "").dup
  OUTPUT_PREFIX.sub!(/\A_+/, '')
  OUTPUT_PREFIX.sub!(/_*\z/, '_')

  TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array")
  TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array")

  arg = ARGV.shift
  # Without this check File.dirname(nil) below fails with a bare TypeError.
  abort "#{File.basename($0)}: no input file given\n#{op.help}" unless arg
  $srcdir = File.dirname(arg)
  $:.unshift $srcdir unless $:.include? $srcdir
  src = File.read(arg)
  src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
  this_script = File.read(__FILE__)
  this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding

  base_signature = "/* autogenerated. */\n".dup
  base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
  base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"

  # Skip generation when the signatures at the top of the existing output
  # still match the current sources.
  if !force_mode && output_filename && File.readable?(output_filename)
    # gets("") reads the first paragraph; it is nil for an empty file.
    old_signature = File.open(output_filename) {|f| f.gets("").to_s.chomp }
    chk_signature = base_signature.dup
    old_signature.each_line {|line|
      if %r{/\* src="([0-9a-z_.-]+)",} =~ line
        name = $1
        next if name == File.basename(arg) || name == File.basename(__FILE__)
        path = File.join($srcdir, name)
        if File.readable? path
          chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
        end
      end
    }
    if old_signature == chk_signature
      now = Time.now
      File.utime(now, now, output_filename)
      STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
      exit
    end
  end

  if VERBOSE_MODE
    if output_filename
      STDERR.puts "generating #{output_filename} ..."
    end
  end

  libs1 = $".dup
  # The positional safe_level/trim_mode arguments of ERB.new are deprecated
  # since Ruby 2.6; use the keyword form whenever ERB accepts keywords.
  erb_init = ERB.instance_method(:initialize)
  if erb_init.respond_to?(:parameters) && erb_init.parameters.assoc(:key)
    erb = ERB.new(src, :trim_mode => '%')
  else
    erb = ERB.new(src, nil, '%')
  end
  erb.filename = arg
  erb_result = erb.result(binding)
  libs2 = $".dup

  # Files required while evaluating the template contribute signatures too,
  # provided they are readable in $srcdir.
  libs = libs2 - libs1
  lib_sigs = ''.dup
  libs.each {|lib|
    lib = File.basename(lib)
    path = File.join($srcdir, lib)
    if File.readable? path
      lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
    end
  }

  result = ''.dup
  result << base_signature
  result << lib_sigs
  result << "\n"
  result << erb_result
  result << "\n"

  if output_filename
    # Write to a temporary name and rename, so that the output file is
    # replaced in one step.
    new_filename = output_filename + ".new"
    FileUtils.mkdir_p(File.dirname(output_filename))
    File.open(new_filename, "wb") {|f| f << result }
    File.rename(new_filename, output_filename)
    tms = Process.times
    elapsed = Time.now - start_time
    STDERR.puts "done.  (#{'%.2f' % tms.utime}user #{'%.2f' % tms.stime}system #{'%.2f' % elapsed}elapsed)" if VERBOSE_MODE
  else
    print result
  end
end