# coding: US-ASCII

#--
#   irb/ruby-lex.rb - ruby lexical analyzer
#       $Release Version: 0.9.5$
#       $Revision: 17979 $
#       $Date: 2008-07-09 10:17:05 -0700 (Wed, 09 Jul 2008) $
#       by Keiju ISHITSUKA(keiju@ruby-lang.org)
#
#++

require "e2mmap"
require "irb/slex"
require "stringio"

##
# Ruby lexer adapted from irb.
#
# The internals are not documented because they are scary.

class RDoc::RubyLex

  ##
  # Raised upon invalid input

  class Error < RDoc::Error
  end

  # :stopdoc:

  extend Exception2MessageMapper

  def_exception(:AlreadyDefinedToken, "Already defined token(%s)")
  def_exception(:TkReading2TokenNoKey, "key nothing(key='%s')")
  def_exception(:TkSymbol2TokenNoKey, "key nothing(key='%s')")
  def_exception(:TkReading2TokenDuplicateError,
                "key duplicate(token_n='%s', key='%s')")
  def_exception(:SyntaxError, "%s")

  def_exception(:TerminateLineInput, "Terminate Line Input")

  include RDoc::RubyToken
  include IRB

  attr_reader :continue
  attr_reader :lex_state
  attr_reader :reader

  class << self
    attr_accessor :debug_level
  end

  # True when the class-level debug level is greater than zero.
  def self.debug?
    @debug_level > 0
  end

  self.debug_level = 0

  # :startdoc:

  ##
  # Returns an Array of +ruby+ tokens.  See ::new for a description of
  # +options+.

  def self.tokenize ruby, options
    tokens = []

    scanner = RDoc::RubyLex.new ruby, options
    scanner.exception_on_syntax_error = true

    while token = scanner.token do
      tokens << token
    end

    tokens
  end

  ##
  # Creates a new lexer for +content+.  +options+ is an RDoc::Options, only
  # +tab_width+ is used.

  def initialize(content, options)
    lex_init

    # Expand tabs to spaces, line by line, honoring the configured tab width.
    if /\t/ =~ content then
      tab_width = options.tab_width
      content = content.split(/\n/).map do |line|
        1 while line.gsub!(/\t+/) {
          ' ' * (tab_width*$&.length - $`.length % tab_width)
        } && $~
        line
      end.join("\n")
    end

    # Guarantee a trailing newline.  A new string is built instead of
    # appending in place so the caller's string is never mutated (and frozen
    # strings are accepted).
    content += "\n" unless content[-1, 1] == "\n"

    set_input StringIO.new content

    @base_char_no = 0
    @char_no = 0
    @exp_line_no = @line_no = 1
    @here_readed = []
    @readed = []
    @rests = []
    @seek = 0

    @here_header = false
    @indent = 0
    @indent_stack = []
    @lex_state = EXPR_BEG
    @space_seen = false

    @continue = false
    @line = ""

    @skip_space = false
    @readed_auto_clean_up = false
    @exception_on_syntax_error = true

    @prompt = nil
    @prev_seek = nil
    @ltype = nil
  end

  # :stopdoc:

  def inspect # :nodoc:
    "#<%s:0x%x pos %d lex_state %p space_seen %p>" % [
      self.class, object_id,
      @io.pos, @lex_state, @space_seen,
    ]
  end

  attr_accessor :skip_space
  attr_accessor :readed_auto_clean_up
  attr_accessor :exception_on_syntax_error

  attr_reader :seek
  attr_reader :char_no
  attr_reader :line_no
  attr_reader :indent

  # io functions

  # Sets the input source.  Lines are fetched by calling +p+ when it responds
  # to #call, otherwise the given block, otherwise <tt>io.gets</tt>.
  def set_input(io, p = nil, &block)
    @io = io
    if p.respond_to?(:call)
      @input = p
    elsif block_given?
      @input = block
    else
      @input = Proc.new{@io.gets}
    end
  end

  # Returns the characters consumed since the previous call as one String and
  # clears the buffer, updating @base_char_no along the way.
  def get_readed
    if idx = @readed.rindex("\n")
      @base_char_no = @readed.size - (idx + 1)
    else
      @base_char_no += @readed.size
    end

    readed = @readed.join("")
    @readed = []
    readed
  end

  # Consumes and returns the next character, or nil when the input is
  # exhausted.  Updates @seek, @line_no and @char_no and records the
  # character in @here_readed (while @here_header is set) or @readed.
  def getc
    while @rests.empty?
      # return nil unless buf_input
      @rests.push nil unless buf_input
    end
    c = @rests.shift
    if @here_header
      @here_readed.push c
    else
      @readed.push c
    end
    @seek += 1
    if c == "\n"
      @line_no += 1
      @char_no = 0
    else
      @char_no += 1
    end

    c
  end

  # Reads characters up to and including the next newline.  Returns nil when
  # nothing could be read.
  def gets
    l = ""
    while c = getc
      l.concat(c)
      break if c == "\n"
    end
    return nil if l == "" and c.nil?
    l
  end

  def eof?
    @io.eof?
  end

  # Like #getc, but returns nil instead of fetching more input when the
  # look-ahead buffer is empty.
  def getc_of_rests
    if @rests.empty?
      nil
    else
      getc
    end
  end

  # Pushes a character back onto the front of the look-ahead buffer.  The
  # most recently recorded character is always popped; when +c+ is given it
  # is pushed back instead of the popped one.
  def ungetc(c = nil)
    if @here_readed.empty?
      c2 = @readed.pop
    else
      c2 = @here_readed.pop
    end
    c = c2 unless c
    @rests.unshift c #c =
    @seek -= 1
    if c == "\n"
      @line_no -= 1
      if idx = @readed.rindex("\n")
        @char_no = idx + 1
      else
        @char_no = @base_char_no + @readed.size
      end
    else
      @char_no -= 1
    end
  end

  # True when the upcoming characters are exactly +str+.
  def peek_equal?(str)
    chrs = str.split(//)
    until @rests.size >= chrs.size
      return false unless buf_input
    end
    @rests[0, chrs.size] == chrs
  end

  # Matches +regexp+ against the buffered look-ahead (at least one buffered
  # line).  Returns the match position, nil on no match, or false when no
  # input is left.
  def peek_match?(regexp)
    while @rests.empty?
      return false unless buf_input
    end
    regexp =~ @rests.join("")
  end

  # Returns the character +i+ positions ahead without consuming it, or nil
  # when the input ends first.
  def peek(i = 0)
    while @rests.size <= i
      return nil unless buf_input
    end
    @rests[i]
  end

  # Fetches one more chunk from the input callable into the look-ahead
  # buffer.  Returns true on success, nil when the input is exhausted.
  def buf_input
    prompt
    line = @input.call
    return nil unless line
    @rests.concat line.split(//)
    true
  end
  private :buf_input

  # Installs the prompt callback; a non-callable +p+ is simply printed.
  def set_prompt(p = nil, &block)
    p = block if block_given?
    if p.respond_to?(:call)
      @prompt = p
    else
      @prompt = Proc.new{print p}
    end
  end

  # Invokes the prompt callback, if one was installed.
  def prompt
    if @prompt
      @prompt.call(@ltype, @indent, @continue, @line_no)
    end
  end

  # Resets the per-statement lexer state.
  def initialize_input
    @ltype = nil
    @quoted = nil
    @indent = 0
    @indent_stack = []
    @lex_state = EXPR_BEG
    @space_seen = false
    @here_header = false

    @continue = false
    prompt

    @line = ""
    @exp_line_no = @line_no
  end

  # Yields each complete top-level statement together with the line number
  # it started on.  Lines are accumulated while a literal is open, a
  # continuation is pending or the indent level is positive.
  def each_top_level_statement
    initialize_input
    catch(:TERM_INPUT) do
      loop do
        begin
          @continue = false
          prompt
          unless l = lex
            throw :TERM_INPUT if @line == ''
          else
            #p l
            @line.concat l
            if @ltype or @continue or @indent > 0
              next
            end
          end
          if @line != "\n"
            yield @line, @exp_line_no
          end
          break unless l
          @line = ''
          @exp_line_no = @line_no

          @indent = 0
          @indent_stack = []
          prompt
        rescue TerminateLineInput
          initialize_input
          prompt
          get_readed
        end
      end
    end
  end

  # Consumes tokens up to the end of a logical line and returns the text
  # that was read, or nil when nothing was read at end of input.
  def lex
    until (((tk = token).kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) &&
           !@continue or
           tk.nil?)
      #p tk
      #p @lex_state
      #p self
    end
    line = get_readed
    # print self.inspect
    if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
      nil
    else
      line
    end
  end

  # Returns the next token.  A SyntaxError from the rule table is re-raised
  # as Error when exception_on_syntax_error is set, otherwise a TkError token
  # is produced.  Whitespace tokens are skipped when skip_space is set.
  def token
    # require "tracer"
    # Tracer.on
    @prev_seek = @seek
    @prev_line_no = @line_no
    @prev_char_no = @char_no
    begin
      begin
        tk = @OP.match(self)
        @space_seen = tk.kind_of?(TkSPACE)
      rescue SyntaxError => e
        raise Error, "syntax error: #{e.message}" if
          @exception_on_syntax_error

        tk = TkError.new(@seek, @line_no, @char_no)
      end
    end while @skip_space and tk.kind_of?(TkSPACE)

    if @readed_auto_clean_up
      get_readed
    end
    # Tracer.off
    tk
  end

  # Keywords that open an indented clause.
  ENINDENT_CLAUSE = [
    "case", "class", "def", "do", "for", "if",
    "module", "unless", "until", "while", "begin" #, "when"
  ]

  # Keywords that close an indented clause.
  DEINDENT_CLAUSE = ["end" #, "when"
  ]

  # Maps the type letter of a %-literal to the literal type it stands for.
  PERCENT_LTYPE = {
    "q" => "\'",
    "Q" => "\"",
    "x" => "\`",
    "r" => "/",
    "w" => "]",
    "W" => "]",
    "s" => ":"
  }

  # Maps an opening %-literal delimiter to its closing counterpart.
  PERCENT_PAREN = {
    "{" => "}",
    "[" => "]",
    "<" => ">",
    "(" => ")"
  }

  PERCENT_PAREN_REV = PERCENT_PAREN.invert

  # Token class for each literal type without interpolation.
  Ltype2Token = {
    "\'" => TkSTRING,
    "\"" => TkSTRING,
    "\`" => TkXSTRING,
    "/"  => TkREGEXP,
    "]"  => TkDSTRING,
    ":"  => TkSYMBOL
  }

  # Token class for each literal type that contained "#" interpolation.
  DLtype2Token = {
    "\"" => TkDSTRING,
    "\`" => TkDXSTRING,
    "/"  => TkDREGEXP,
  }

  # Builds the first half of the rule table (@OP); #lex_int2 adds the rest.
  def lex_init()
    @OP = IRB::SLex.new
    @OP.def_rules("\0", "\004", "\032") do |op, io|
      Token(TkEND_OF_SCRIPT, '')
    end

    @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |op, io|
      @space_seen = true
      str = op.dup # do not append to the string handed in by the rule table
      while (ch = getc) =~ /[ \t\f\r\13]/ do
        str << ch
      end
      ungetc
      Token TkSPACE, str
    end

    @OP.def_rule("#") do |op, io|
      identify_comment
    end

    @OP.def_rule("=begin",
                 proc{|op, io| @prev_char_no == 0 && peek(0) =~ /\s/}) do |op, io|
      @ltype = "="
      res = ''

      # Skip the remainder of the "=begin" line.
      loop do
        ch = getc
        break if ch.nil? || ch == "\n"
      end

      until ( peek_equal?("=end") && peek(4) =~ /\s/ ) do
        ch = getc
        break unless ch # unterminated =begin block: stop at end of input
        res << ch
      end

      gets # consume =end

      @ltype = nil
      Token(TkRD_COMMENT, res)
    end

    @OP.def_rule("\n") do |op, io|
      print "\\n\n" if RDoc::RubyLex.debug?
      case @lex_state
      when EXPR_BEG, EXPR_FNAME, EXPR_DOT
        @continue = true
      else
        @continue = false
        @lex_state = EXPR_BEG
        until (@indent_stack.empty? ||
               [TkLPAREN, TkLBRACK, TkLBRACE,
                TkfLPAREN, TkfLBRACK, TkfLBRACE].include?(@indent_stack.last))
          @indent_stack.pop
        end
      end
      @here_header = false
      @here_readed = []
      Token(TkNL)
    end

    @OP.def_rules("*", "**",
                  "=", "==", "===",
                  "=~", "<=>",
                  "<", "<=",
                  ">", ">=", ">>") do |op, io|
      case @lex_state
      when EXPR_FNAME, EXPR_DOT
        @lex_state = EXPR_ARG
      else
        @lex_state = EXPR_BEG
      end
      Token(op)
    end

    @OP.def_rules("!", "!=", "!~") do |op, io|
      @lex_state = EXPR_BEG
      Token(op)
    end

    @OP.def_rules("<<") do |op, io|
      tk = nil
      if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
         (@lex_state != EXPR_ARG || @space_seen)
        c = peek(0)
        if /\S/ =~ c && (/["'`]/ =~ c || /\w/ =~ c || c == "-")
          tk = identify_here_document
        end
      end
      unless tk
        tk = Token(op)
        case @lex_state
        when EXPR_FNAME, EXPR_DOT
          @lex_state = EXPR_ARG
        else
          @lex_state = EXPR_BEG
        end
      end
      tk
    end

    @OP.def_rules("'", '"') do |op, io|
      identify_string(op)
    end

    @OP.def_rules("`") do |op, io|
      if @lex_state == EXPR_FNAME
        @lex_state = EXPR_END
        Token(op)
      else
        identify_string(op)
      end
    end

    @OP.def_rules('?') do |op, io|
      if @lex_state == EXPR_END
        @lex_state = EXPR_BEG
        Token(TkQUESTION)
      else
        ch = getc
        if @lex_state == EXPR_ARG && ch =~ /\s/
          ungetc
          @lex_state = EXPR_BEG;
          Token(TkQUESTION)
        else
          @lex_state = EXPR_END
          Token(TkSTRING, ch)
        end
      end
    end

    @OP.def_rules("&", "&&", "|", "||") do |op, io|
      @lex_state = EXPR_BEG
      Token(op)
    end

    @OP.def_rules("+=", "-=", "*=", "**=",
                  "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do |op, io|
      @lex_state = EXPR_BEG
      op =~ /^(.*)=$/
      Token(TkOPASGN, $1)
    end

    @OP.def_rule("+@", proc{|op, io| @lex_state == EXPR_FNAME}) do |op, io|
      @lex_state = EXPR_ARG
      Token(op)
    end

    @OP.def_rule("-@", proc{|op, io| @lex_state == EXPR_FNAME}) do |op, io|
      @lex_state = EXPR_ARG
      Token(op)
    end

    @OP.def_rules("+", "-") do |op, io|
      catch(:RET) do
        if @lex_state == EXPR_ARG
          if @space_seen and peek(0) =~ /[0-9]/
            throw :RET, identify_number(op)
          else
            @lex_state = EXPR_BEG
          end
        elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
          throw :RET, identify_number(op)
        else
          @lex_state = EXPR_BEG
        end
        Token(op)
      end
    end

    @OP.def_rule(".") do |op, io|
      @lex_state = EXPR_BEG
      if peek(0) =~ /[0-9]/
        ungetc
        identify_number
      else
        # for "obj.if" etc.
        @lex_state = EXPR_DOT
        Token(TkDOT)
      end
    end

    @OP.def_rules("..", "...") do |op, io|
      @lex_state = EXPR_BEG
      Token(op)
    end

    lex_int2
  end

  # Second half of the rule table: brackets, punctuation, %-literals,
  # variables and the identifier/number fallback.
  def lex_int2
    @OP.def_rules("]", "}", ")") do |op, io|
      @lex_state = EXPR_END
      @indent -= 1
      @indent_stack.pop
      Token(op)
    end

    @OP.def_rule(":") do |op, io|
      if @lex_state == EXPR_END || peek(0) =~ /\s/
        @lex_state = EXPR_BEG
        Token(TkCOLON)
      else
        @lex_state = EXPR_FNAME;
        Token(TkSYMBEG)
      end
    end

    @OP.def_rule("::") do |op, io|
      # p @lex_state.id2name, @space_seen
      if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
        @lex_state = EXPR_BEG
        Token(TkCOLON3)
      else
        @lex_state = EXPR_DOT
        Token(TkCOLON2)
      end
    end

    @OP.def_rule("/") do |op, io|
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        identify_string(op)
      elsif peek(0) == '='
        getc
        @lex_state = EXPR_BEG
        Token(TkOPASGN, "/") #/)
      elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
        identify_string(op)
      else
        @lex_state = EXPR_BEG
        Token("/") #/)
      end
    end

    @OP.def_rules("^") do |op, io|
      @lex_state = EXPR_BEG
      Token("^")
    end

    #       @OP.def_rules("^=") do
    #         @lex_state = EXPR_BEG
    #         Token(OP_ASGN, :^)
    #       end

    @OP.def_rules(",") do |op, io|
      @lex_state = EXPR_BEG
      Token(op)
    end

    @OP.def_rules(";") do |op, io|
      @lex_state = EXPR_BEG
      until (@indent_stack.empty? ||
             [TkLPAREN, TkLBRACK, TkLBRACE,
              TkfLPAREN, TkfLBRACK, TkfLBRACE].include?(@indent_stack.last))
        @indent_stack.pop
      end
      Token(op)
    end

    @OP.def_rule("~") do |op, io|
      @lex_state = EXPR_BEG
      Token("~")
    end

    @OP.def_rule("~@", proc{|op, io| @lex_state == EXPR_FNAME}) do |op, io|
      @lex_state = EXPR_BEG
      Token("~")
    end

    @OP.def_rule("(") do |op, io|
      @indent += 1
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        @lex_state = EXPR_BEG
        tk_c = TkfLPAREN
      else
        @lex_state = EXPR_BEG
        tk_c = TkLPAREN
      end
      @indent_stack.push tk_c
      Token tk_c
    end

    @OP.def_rule("[]", proc{|op, io| @lex_state == EXPR_FNAME}) do |op, io|
      @lex_state = EXPR_ARG
      Token("[]")
    end

    @OP.def_rule("[]=", proc{|op, io| @lex_state == EXPR_FNAME}) do |op, io|
      @lex_state = EXPR_ARG
      Token("[]=")
    end

    @OP.def_rule("[") do |op, io|
      @indent += 1
      if @lex_state == EXPR_FNAME
        tk_c = TkfLBRACK
      else
        if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
          tk_c = TkLBRACK
        elsif @lex_state == EXPR_ARG && @space_seen
          tk_c = TkLBRACK
        else
          tk_c = TkfLBRACK
        end
        @lex_state = EXPR_BEG
      end
      @indent_stack.push tk_c
      Token(tk_c)
    end

    @OP.def_rule("{") do |op, io|
      @indent += 1
      if @lex_state != EXPR_END && @lex_state != EXPR_ARG
        tk_c = TkLBRACE
      else
        tk_c = TkfLBRACE
      end
      @lex_state = EXPR_BEG
      @indent_stack.push tk_c
      Token(tk_c)
    end

    @OP.def_rule('\\') do |op, io|
      if getc == "\n"
        @space_seen = true
        @continue = true
        Token(TkSPACE)
      else
        ungetc
        Token("\\")
      end
    end

    @OP.def_rule('%') do |op, io|
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        identify_quotation
      elsif peek(0) == '='
        getc
        Token(TkOPASGN, :%)
      elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
        identify_quotation
      else
        @lex_state = EXPR_BEG
        Token("%") #))
      end
    end

    @OP.def_rule('$') do |op, io|
      identify_gvar
    end

    @OP.def_rule('@') do |op, io|
      if peek(0) =~ /[\w@]/
        ungetc
        identify_identifier
      else
        Token("@")
      end
    end

    #       @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
    #         |op, io|
    #         @indent += 1
    #         @lex_state = EXPR_FNAME
    #       # @lex_state = EXPR_END
    #       # until @rests[0] == "\n" or @rests[0] == ";"
    #       #   rests.shift
    #       # end
    #       end

    @OP.def_rule("_") do
      # Only a line consisting of exactly "__END__" ends the script.  The
      # pattern is anchored so an identifier such as "_x" is not mistaken
      # for the marker just because "_END__" occurs later in the buffer.
      if peek_match?(/\A_END__\r?$/) and @lex_state == EXPR_BEG then
        6.times { getc }
        Token(TkEND_OF_SCRIPT, '__END__')
      else
        ungetc
        identify_identifier
      end
    end

    @OP.def_rule("") do |op, io|
      printf "MATCH: start %s: %s\n", op, io.inspect if RDoc::RubyLex.debug?
      if peek(0) =~ /[0-9]/
        t = identify_number
      else
        t = identify_identifier
      end
      printf "MATCH: end %s: %s\n", op, io.inspect if RDoc::RubyLex.debug?
      t
    end

    p @OP if RDoc::RubyLex.debug?
  end

  # Lexes what follows a "$": a global variable, back reference or nth
  # reference.  Falls back to a bare "$" token.
  def identify_gvar
    @lex_state = EXPR_END

    case ch = getc
    when /[~_*$?!@\/\\;,=:<>".]/ #"
      Token(TkGVAR, "$" + ch)
    when "-"
      # to_s keeps a "$-" at end of input from raising TypeError
      Token(TkGVAR, "$-" + getc.to_s)
    when "&", "`", "'", "+"
      Token(TkBACK_REF, "$"+ch)
    when /[1-9]/
      ref = ch
      while (ch = getc) =~ /[0-9]/ do ref << ch end
      ungetc
      Token(TkNTH_REF, "$#{ref}")
    when /\w/
      ungetc
      ungetc
      identify_identifier
    else
      ungetc
      Token("$")
    end
  end

  # Characters allowed inside an identifier.
  IDENT_RE = if defined? Encoding then
               eval '/[\w\u{0080}-\u{FFFFF}]/u' # 1.8 can't parse \u{}
             else
               /[\w\x80-\xFF]/
             end

  # Lexes an identifier, keyword, constant or $/@/@@ variable and updates
  # the lexer state and the indent bookkeeping accordingly.
  def identify_identifier
    token = ""
    if peek(0) =~ /[$@]/
      token.concat(c = getc)
      if c == "@" and peek(0) == "@"
        token.concat getc
      end
    end

    while (ch = getc) =~ IDENT_RE do
      print " :#{ch}: " if RDoc::RubyLex.debug?
      token.concat ch
    end

    ungetc

    if (ch == "!" || ch == "?") && token[0,1] =~ /\w/ && peek(0) != "="
      token.concat getc
    end

    # almost fix token

    case token
    when /^\$/
      return Token(TkGVAR, token)
    when /^\@\@/
      @lex_state = EXPR_END
      # p Token(TkCVAR, token)
      return Token(TkCVAR, token)
    when /^\@/
      @lex_state = EXPR_END
      return Token(TkIVAR, token)
    end

    if @lex_state != EXPR_DOT
      print token, "\n" if RDoc::RubyLex.debug?

      token_c, *trans = TkReading2Token[token]
      if token_c
        # reserved word?

        if (@lex_state != EXPR_BEG &&
            @lex_state != EXPR_FNAME &&
            trans[1])
          # modifiers
          token_c = TkSymbol2Token[trans[1]]
          @lex_state = trans[0]
        else
          if @lex_state != EXPR_FNAME
            if ENINDENT_CLAUSE.include?(token)
              # check for ``class = val'' etc.
              valid = true
              case token
              when "class"
                valid = false unless peek_match?(/^\s*(<<|\w|::)/)
              when "def"
                valid = false if peek_match?(/^\s*(([+-\/*&\|^]|<<|>>|\|\||\&\&)=|\&\&|\|\|)/)
              when "do"
                valid = false if peek_match?(/^\s*([+-\/*]?=|\*|<|>|\&)/)
              when *ENINDENT_CLAUSE
                valid = false if peek_match?(/^\s*([+-\/*]?=|\*|<|>|\&|\|)/)
              else
                # no nothing
              end
              if valid
                if token == "do"
                  if ![TkFOR, TkWHILE, TkUNTIL].include?(@indent_stack.last)
                    @indent += 1
                    @indent_stack.push token_c
                  end
                else
                  @indent += 1
                  @indent_stack.push token_c
                end
              else
                token_c = TkIDENTIFIER
              end

            elsif DEINDENT_CLAUSE.include?(token)
              @indent -= 1
              @indent_stack.pop
            end
            @lex_state = trans[0]
          else
            @lex_state = EXPR_END
          end
        end
        return Token(token_c, token)
      end
    end

    if @lex_state == EXPR_FNAME
      @lex_state = EXPR_END
      if peek(0) == '='
        token.concat getc
      end
    elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT ||
          @lex_state == EXPR_ARG
      @lex_state = EXPR_ARG
    else
      @lex_state = EXPR_END
    end

    if token[0, 1] =~ /[A-Z]/
      return Token(TkCONSTANT, token)
    elsif token[token.size - 1, 1] =~ /[!?]/
      return Token(TkFID, token)
    else
      return Token(TkIDENTIFIER, token)
    end
  end

  # Lexes a here document whose "<<" has already been consumed.  The rest of
  # the header line is set aside and pushed back afterwards so it is lexed
  # after the document body.
  def identify_here_document
    ch = getc
    # if lt = PERCENT_LTYPE[ch]
    if ch == "-"
      ch = getc
      indent = true
    end
    if /['"`]/ =~ ch
      user_quote = lt = ch
      quoted = ""
      while (c = getc) && c != lt
        quoted.concat c
      end
    else
      user_quote = nil
      lt = '"'
      quoted = ch.dup
      while (c = getc) && c =~ /\w/
        quoted.concat c
      end
      ungetc
    end

    ltback, @ltype = @ltype, lt
    reserve = []
    while ch = getc
      reserve.push ch
      if ch == "\\"
        reserve.push ch = getc
      elsif ch == "\n"
        break
      end
    end

    # The document is emitted in heredoc form only when nothing else follows
    # the identifier on the header line.
    output_heredoc = reserve.join =~ /\A\r?\n\z/

    if output_heredoc then
      doc = '<<'
      doc << '-' if indent
      doc << "#{user_quote}#{quoted}#{user_quote}\n"
    else
      doc = '"'
    end

    @here_header = false
    while l = gets
      # Normalize CRLF line endings to LF ((?:...) is a non-capturing group).
      l = l.sub(/(?:\r)?\n\z/, "\n")
      if (indent ? l.strip : l.chomp) == quoted
        break
      end
      doc << l
    end

    if output_heredoc then
      # l is nil when the input ended before the terminator was seen
      doc << l.chomp if l
    else
      doc << '"'
    end

    @here_header = true
    @here_readed.concat reserve
    while ch = reserve.pop
      ungetc ch
    end

    token_class = output_heredoc ? RDoc::RubyLex::TkHEREDOC : Ltype2Token[lt]
    @ltype = ltback
    @lex_state = EXPR_END
    Token(token_class, doc)
  end

  # Lexes a %-literal whose "%" has already been consumed.  Returns a TkMOD
  # token when the next character is a word character that is not a known
  # type letter.
  def identify_quotation
    type = ch = getc
    if lt = PERCENT_LTYPE[type]
      ch = getc
    elsif type =~ /\W/
      type = nil
      lt = "\""
    else
      return Token(TkMOD, '%')
    end
    #     if ch !~ /\W/
    #       ungetc
    #       next
    #     end
    #@ltype = lt
    @quoted = ch unless @quoted = PERCENT_PAREN[ch]
    identify_string(lt, @quoted, type)
  end

  # Lexes an integer or float literal.  +op+ is an already-consumed prefix
  # (such as a sign) that becomes the start of the token text.
  def identify_number(op = "")
    @lex_state = EXPR_END

    num = op.dup # never append to the caller's string

    if peek(0) == "0" && peek(1) !~ /[.eE]/
      num << getc

      case peek(0)
      when /[xX]/
        ch = getc
        match = /[0-9a-fA-F_]/
      when /[bB]/
        ch = getc
        match = /[01_]/
      when /[oO]/
        ch = getc
        match = /[0-7_]/
      when /[dD]/
        ch = getc
        match = /[0-9_]/
      when /[0-7]/
        match = /[0-7_]/
      when /[89]/
        raise Error, "Illegal octal digit"
      else
        return Token(TkINTEGER, num)
      end

      num << ch if ch

      len0 = true
      non_digit = false
      while ch = getc
        num << ch
        if match =~ ch
          if ch == "_"
            if non_digit
              raise Error, "trailing `#{ch}' in number"
            else
              non_digit = ch
            end
          else
            non_digit = false
            len0 = false
          end
        else
          ungetc
          num[-1, 1] = ''
          if len0
            raise Error, "numeric literal without digits"
          end
          if non_digit
            raise Error, "trailing `#{non_digit}' in number"
          end
          break
        end
      end
      return Token(TkINTEGER, num)
    end

    type = TkINTEGER
    allow_point = true
    allow_e = true
    non_digit = false
    while ch = getc
      num << ch
      case ch
      when /[0-9]/
        non_digit = false
      when "_"
        non_digit = ch
      when allow_point && "."
        if non_digit
          raise Error, "trailing `#{non_digit}' in number"
        end
        type = TkFLOAT
        if peek(0) !~ /[0-9]/
          type = TkINTEGER
          ungetc
          num[-1, 1] = ''
          break
        end
        allow_point = false
      when allow_e && "e", allow_e && "E"
        if non_digit
          raise Error, "trailing `#{non_digit}' in number"
        end
        type = TkFLOAT
        if peek(0) =~ /[+-]/
          num << getc
        end
        allow_e = false
        allow_point = false
        non_digit = ch
      else
        if non_digit
          raise Error, "trailing `#{non_digit}' in number"
        end
        ungetc
        num[-1, 1] = ''
        break
      end
    end

    Token(type, num)
  end

  # Lexes a string-like literal of type +ltype+ that is closed by +quoted+.
  # +type+ is the %-literal type letter, if any.  The token class depends on
  # whether a "#" interpolation marker was seen.
  def identify_string(ltype, quoted = ltype, type = nil)
    close = PERCENT_PAREN.values.include?(quoted)
    @ltype = ltype
    @quoted = quoted

    str = if ltype == quoted and %w[" ' /].include? ltype then
            ltype.dup
          elsif RUBY_VERSION > '1.9' then
            "%#{type or PERCENT_LTYPE.key ltype}#{PERCENT_PAREN_REV[quoted]}"
          else
            "%#{type or PERCENT_LTYPE.index ltype}#{PERCENT_PAREN_REV[quoted]}"
          end

    subtype = nil
    begin
      nest = 0

      while ch = getc
        str << ch

        if @quoted == ch and nest <= 0
          break
        elsif @ltype != "'" && @ltype != "]" && @ltype != ":" and ch == "#"
          ch = getc
          subtype = true
          if ch == "{" then
            str << ch << skip_inner_expression
            next
          else
            ungetc
          end
        elsif ch == '\\'
          if %w[' /].include? @ltype then
            case ch = getc
            when "\\", "\n", "'"
            when @ltype
              str << ch
            else
              ungetc
            end
          else
            str << read_escape
          end
        end

        if close then
          if PERCENT_PAREN[ch] == @quoted
            nest += 1
          elsif ch == @quoted
            nest -= 1
          end
        end
      end

      if @ltype == "/"
        if peek(0) =~ /i|m|x|o|e|s|u|n/
          getc
        end
      end

      if subtype
        Token(DLtype2Token[ltype], str)
      else
        Token(Ltype2Token[ltype], str)
      end
    ensure
      @ltype = nil
      @quoted = nil
      @lex_state = EXPR_END
    end
  end

  # Consumes the body of a "#{...}" interpolation up to and including the
  # matching "}" and returns the consumed text.
  def skip_inner_expression
    res = ""
    nest = 0
    while ch = getc
      res << ch
      if ch == '}'
        break if nest.zero?
        nest -= 1
      elsif ch == '{'
        nest += 1
      end
    end
    res
  end

  # Lexes a "#" comment up to, but not including, the end of the line.
  def identify_comment
    @ltype = "#"

    comment = '#'

    while ch = getc
      # if ch == "\\" #"
      #   read_escape
      # end
      if ch == "\n"
        @ltype = nil
        ungetc
        break
      end

      comment << ch
    end

    return Token(TkCOMMENT, comment)
  end

  # Reads the remainder of a backslash escape (the backslash itself has
  # already been consumed) and returns the text that belongs to it.
  #
  # A look-ahead character that turns out not to belong to the escape is
  # pushed back and is not part of the returned text.  End of input yields
  # whatever was collected so far.
  def read_escape
    escape = ''
    ch = getc
    return escape unless ch # end of input right after the backslash

    case ch
    when /[0-7]/
      # up to three octal digits, re-reading the first one in the loop
      ungetc ch
      3.times do
        ch = getc
        case ch
        when /[0-7]/
          escape << ch
        when nil
          break
        else
          ungetc
          break
        end
      end

    when "x"
      escape << ch
      # up to two hexadecimal digits
      2.times do
        ch = getc
        case ch
        when /[0-9a-fA-F]/
          escape << ch
        when nil
          break
        else
          ungetc
          break
        end
      end

    when "M"
      escape << ch
      ch = getc
      if ch == '-'
        escape << ch
        ch = getc
        escape << ch if ch
        escape << read_escape if ch == "\\" #"
      else
        ungetc
      end

    when "C", "c" #, "^"
      escape << ch
      if ch == "C"
        ch = getc
        if ch == "-"
          escape << ch
        else
          # not a "\C-x" sequence; leave the character for the caller
          ungetc
          return escape
        end
      end

      ch = getc
      if ch == "\\" #"
        escape << ch << read_escape
      elsif ch
        escape << ch
      end

    else
      # "\n", "\\", "n", "t", ... and any other single character
      escape << ch
    end

    escape
  end

  # :startdoc:

end

#RDoc::RubyLex.debug_level = 1