1# coding: US-ASCII
2
3#--
#   irb/ruby-lex.rb - ruby lexical analyzer
5#   	$Release Version: 0.9.5$
6#   	$Revision: 17979 $
7#   	$Date: 2008-07-09 10:17:05 -0700 (Wed, 09 Jul 2008) $
8#   	by Keiju ISHITSUKA(keiju@ruby-lang.org)
9#
10#++
11
12require "e2mmap"
13require "irb/slex"
14require "stringio"
15
16##
17# Ruby lexer adapted from irb.
18#
19# The internals are not documented because they are scary.
20
21class RDoc::RubyLex
22
23  ##
24  # Raised upon invalid input
25
  class Error < RDoc::Error
    # Adds nothing to RDoc::Error; it only provides a lexer-specific class
    # that the methods below raise (e.g. for malformed numeric literals).
  end
28
29  # :stopdoc:
30
  # def_exception comes from e2mmap (required at the top of this file); each
  # call below declares a named exception together with its message format.
  extend Exception2MessageMapper

  def_exception(:AlreadyDefinedToken, "Already defined token(%s)")
  def_exception(:TkReading2TokenNoKey, "key nothing(key='%s')")
  def_exception(:TkSymbol2TokenNoKey, "key nothing(key='%s')")
  def_exception(:TkReading2TokenDuplicateError,
                "key duplicate(token_n='%s', key='%s')")
  # Rescued in #token and turned into either an Error or a TkError token.
  def_exception(:SyntaxError, "%s")

  # Rescued in #each_top_level_statement to reset the input state.
  def_exception(:TerminateLineInput, "Terminate Line Input")

  # Token classes (Tk*), the EXPR_* lexer states and the Token() factory used
  # throughout this class are not defined in this file.
  # NOTE(review): presumably they come from RDoc::RubyToken - confirm.
  include RDoc::RubyToken
  include IRB

  # True when the current logical line continues onto the next physical line.
  attr_reader :continue
  # Current lexer state (one of the EXPR_* constants).
  attr_reader :lex_state
  # NOTE(review): @reader is never assigned anywhere in this file, so this
  # reader appears to always return nil - confirm before relying on it.
  attr_reader :reader
48
49  class << self
50    attr_accessor :debug_level
51  end
52
53  def self.debug?
54    @debug_level > 0
55  end
56
57  self.debug_level = 0
58
59  # :startdoc:
60
61  ##
62  # Returns an Array of +ruby+ tokens.  See ::new for a description of
63  # +options+.
64
65  def self.tokenize ruby, options
66    tokens = []
67
68    scanner = RDoc::RubyLex.new ruby, options
69    scanner.exception_on_syntax_error = true
70
71    while token = scanner.token do
72      tokens << token
73    end
74
75    tokens
76  end
77
78  ##
  # Creates a new lexer for +content+.  +options+ is an RDoc::Options; only
  # its +tab_width+ is used.
81
82  def initialize(content, options)
83    lex_init
84
85    if /\t/ =~ content then
86      tab_width = options.tab_width
87      content = content.split(/\n/).map do |line|
88        1 while line.gsub!(/\t+/) {
89          ' ' * (tab_width*$&.length - $`.length % tab_width)
90        }  && $~
91        line
92      end.join("\n")
93    end
94
95    content << "\n" unless content[-1, 1] == "\n"
96
97    set_input StringIO.new content
98
99    @base_char_no = 0
100    @char_no = 0
101    @exp_line_no = @line_no = 1
102    @here_readed = []
103    @readed = []
104    @rests = []
105    @seek = 0
106
107    @here_header = false
108    @indent = 0
109    @indent_stack = []
110    @lex_state = EXPR_BEG
111    @space_seen = false
112
113    @continue = false
114    @line = ""
115
116    @skip_space = false
117    @readed_auto_clean_up = false
118    @exception_on_syntax_error = true
119
120    @prompt = nil
121    @prev_seek = nil
122    @ltype = nil
123  end
124
125  # :stopdoc:
126
127  def inspect # :nodoc:
128    "#<%s:0x%x pos %d lex_state %p space_seen %p>" % [
129      self.class, object_id,
130      @io.pos, @lex_state, @space_seen,
131    ]
132  end
133
  # When true, #token discards TkSPACE tokens instead of returning them.
  attr_accessor :skip_space
  # When true, #token empties the read buffer (via #get_readed) after every
  # token.
  attr_accessor :readed_auto_clean_up
  # When true, #token raises Error on a syntax error; otherwise it returns a
  # TkError token.
  attr_accessor :exception_on_syntax_error

  # Number of characters consumed so far (incremented by #getc, decremented
  # by #ungetc).
  attr_reader :seek
  # Column counter; reset to 0 by #getc after each newline.
  attr_reader :char_no
  # Current line number, starting at 1.
  attr_reader :line_no
  # Current nesting depth of brackets and block keywords.
  attr_reader :indent
142
143  # io functions
144  def set_input(io, p = nil, &block)
145    @io = io
146    if p.respond_to?(:call)
147      @input = p
148    elsif block_given?
149      @input = block
150    else
151      @input = Proc.new{@io.gets}
152    end
153  end
154
155  def get_readed
156    if idx = @readed.rindex("\n")
157      @base_char_no = @readed.size - (idx + 1)
158    else
159      @base_char_no += @readed.size
160    end
161
162    readed = @readed.join("")
163    @readed = []
164    readed
165  end
166
167  def getc
168    while @rests.empty?
169      #      return nil unless buf_input
170      @rests.push nil unless buf_input
171    end
172    c = @rests.shift
173    if @here_header
174      @here_readed.push c
175    else
176      @readed.push c
177    end
178    @seek += 1
179    if c == "\n"
180      @line_no += 1
181      @char_no = 0
182    else
183      @char_no += 1
184    end
185
186    c
187  end
188
189  def gets
190    l = ""
191    while c = getc
192      l.concat(c)
193      break if c == "\n"
194    end
195    return nil if l == "" and c.nil?
196    l
197  end
198
  # True when the underlying IO reports end of file.  Characters already
  # buffered in @rests are not taken into account.
  def eof?
    @io.eof?
  end
202
203  def getc_of_rests
204    if @rests.empty?
205      nil
206    else
207      getc
208    end
209  end
210
211  def ungetc(c = nil)
212    if @here_readed.empty?
213      c2 = @readed.pop
214    else
215      c2 = @here_readed.pop
216    end
217    c = c2 unless c
218    @rests.unshift c #c =
219    @seek -= 1
220    if c == "\n"
221      @line_no -= 1
222      if idx = @readed.rindex("\n")
223        @char_no = idx + 1
224      else
225        @char_no = @base_char_no + @readed.size
226      end
227    else
228      @char_no -= 1
229    end
230  end
231
232  def peek_equal?(str)
233    chrs = str.split(//)
234    until @rests.size >= chrs.size
235      return false unless buf_input
236    end
237    @rests[0, chrs.size] == chrs
238  end
239
240  def peek_match?(regexp)
241    while @rests.empty?
242      return false unless buf_input
243    end
244    regexp =~ @rests.join("")
245  end
246
247  def peek(i = 0)
248    while @rests.size <= i
249      return nil unless buf_input
250    end
251    @rests[i]
252  end
253
254  def buf_input
255    prompt
256    line = @input.call
257    return nil unless line
258    @rests.concat line.split(//)
259    true
260  end
261  private :buf_input
262
263  def set_prompt(p = nil, &block)
264    p = block if block_given?
265    if p.respond_to?(:call)
266      @prompt = p
267    else
268      @prompt = Proc.new{print p}
269    end
270  end
271
272  def prompt
273    if @prompt
274      @prompt.call(@ltype, @indent, @continue, @line_no)
275    end
276  end
277
278  def initialize_input
279    @ltype = nil
280    @quoted = nil
281    @indent = 0
282    @indent_stack = []
283    @lex_state = EXPR_BEG
284    @space_seen = false
285    @here_header = false
286
287    @continue = false
288    prompt
289
290    @line = ""
291    @exp_line_no = @line_no
292  end
293
  # Reads the input statement by statement.  Yields each complete top-level
  # statement's source text together with the line number it started on.
  # A statement is complete once a lexed line leaves no open literal, no
  # pending continuation and no open nesting.
  def each_top_level_statement
    initialize_input
    catch(:TERM_INPUT) do
      loop do
        begin
          @continue = false
          prompt
          unless l = lex
            # End of input: stop immediately if nothing is pending, otherwise
            # fall through so the partial statement is still yielded.
            throw :TERM_INPUT if @line == ''
          else
            #p l
            @line.concat l
            # Statement still open (inside a literal, continued line, or
            # nested construct): keep reading lines.
            if @ltype or @continue or @indent > 0
              next
            end
          end
          # A lone newline is not reported as a statement.
          if @line != "\n"
            yield @line, @exp_line_no
          end
          break unless l
          # Start collecting the next statement.
          @line = ''
          @exp_line_no = @line_no

          @indent = 0
          @indent_stack = []
          prompt
        rescue TerminateLineInput
          # Discard the current statement and whatever was read for it.
          initialize_input
          prompt
          get_readed
        end
      end
    end
  end
328
329  def lex
330    until (((tk = token).kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) &&
331           !@continue or
332      tk.nil?)
333      #p tk
334      #p @lex_state
335      #p self
336    end
337    line = get_readed
338    #      print self.inspect
339    if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
340      nil
341    else
342      line
343    end
344  end
345
  # Returns the next token, as produced by the rule table built in #lex_init.
  # Whitespace tokens are skipped when +skip_space+ is set.
  #
  # On a syntax error this raises Error when +exception_on_syntax_error+ is
  # set, and otherwise returns a TkError token.
  def token
    #      require "tracer"
    #      Tracer.on
    # Remember where this token starts.
    @prev_seek = @seek
    @prev_line_no = @line_no
    @prev_char_no = @char_no
    begin
      begin
        tk = @OP.match(self)
        @space_seen = tk.kind_of?(TkSPACE)
      # NOTE(review): presumably the SyntaxError declared with def_exception
      # on this class rather than ::SyntaxError - confirm.
      rescue SyntaxError => e
        raise Error, "syntax error: #{e.message}" if
          @exception_on_syntax_error

        tk = TkError.new(@seek, @line_no, @char_no)
      end
    end while @skip_space and tk.kind_of?(TkSPACE)

    if @readed_auto_clean_up
      get_readed
    end
    #      Tracer.off
    tk
  end
370
371  ENINDENT_CLAUSE = [
372    "case", "class", "def", "do", "for", "if",
373    "module", "unless", "until", "while", "begin" #, "when"
374  ]
375
376  DEINDENT_CLAUSE = ["end" #, "when"
377  ]
378
379  PERCENT_LTYPE = {
380    "q" => "\'",
381    "Q" => "\"",
382    "x" => "\`",
383    "r" => "/",
384    "w" => "]",
385    "W" => "]",
386    "s" => ":"
387  }
388
389  PERCENT_PAREN = {
390    "{" => "}",
391    "[" => "]",
392    "<" => ">",
393    "(" => ")"
394  }
395
396  PERCENT_PAREN_REV = PERCENT_PAREN.invert
397
  # Literal type => token class for literals without interpolation.
  Ltype2Token = {
    "\'" => TkSTRING,
    "\"" => TkSTRING,
    "\`" => TkXSTRING,
    "/" => TkREGEXP,
    "]" => TkDSTRING,
    ":" => TkSYMBOL
  }
  # Literal type => token class used by #identify_string once a "#" has been
  # seen inside the literal.
  DLtype2Token = {
    "\"" => TkDSTRING,
    "\`" => TkDXSTRING,
    "/" => TkDREGEXP,
  }
411
  # Builds the rule table (@OP) that #token matches input against.  Each rule
  # maps one or more leading character sequences to a block that finishes
  # reading the token, updates the lexer state and returns the token.  The
  # remaining rules are registered by #lex_int2, called at the end.
  def lex_init()
    @OP = IRB::SLex.new
    # NUL, ^D and ^Z end the script.
    @OP.def_rules("\0", "\004", "\032") do |op, io|
      Token(TkEND_OF_SCRIPT, '')
    end

    # A run of blanks becomes a single TkSPACE token.
    @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |op, io|
      @space_seen = true
      str = op
      while (ch = getc) =~ /[ \t\f\r\13]/ do
        str << ch
      end
      ungetc
      Token TkSPACE, str
    end

    @OP.def_rule("#") do |op, io|
      identify_comment
    end

    # "=begin" at the start of a line and followed by whitespace opens an
    # embedded document, which runs until a line starting with "=end".
    @OP.def_rule("=begin",
                 proc{|op, io| @prev_char_no == 0 && peek(0) =~ /\s/}) do
      |op, io|
      @ltype = "="
      res = ''
      # Skip the rest of the "=begin" line.
      nil until (ch = getc) == "\n"

      until ( peek_equal?("=end") && peek(4) =~ /\s/ ) do
        (ch = getc)
        res << ch
      end

      gets # consume =end

      @ltype = nil
      Token(TkRD_COMMENT, res)
    end

    @OP.def_rule("\n") do |op, io|
      print "\\n\n" if RDoc::RubyLex.debug?
      case @lex_state
      when EXPR_BEG, EXPR_FNAME, EXPR_DOT
        # The expression cannot end here, so the statement continues.
        @continue = true
      else
        @continue = false
        @lex_state = EXPR_BEG
        # Drop pending entries down to the innermost open bracket.
        until (@indent_stack.empty? ||
               [TkLPAREN, TkLBRACK, TkLBRACE,
                 TkfLPAREN, TkfLBRACK, TkfLBRACE].include?(@indent_stack.last))
          @indent_stack.pop
        end
      end
      # A newline ends any heredoc header line.
      @here_header = false
      @here_readed = []
      Token(TkNL)
    end

    @OP.def_rules("*", "**",
                  "=", "==", "===",
                  "=~", "<=>",
                  "<", "<=",
                  ">", ">=", ">>") do
      |op, io|
      case @lex_state
      when EXPR_FNAME, EXPR_DOT
        # Operator used as a method name.
        @lex_state = EXPR_ARG
      else
        @lex_state = EXPR_BEG
      end
      Token(op)
    end

    @OP.def_rules("!", "!=", "!~") do
      |op, io|
      @lex_state = EXPR_BEG
      Token(op)
    end

    # "<<" is either the start of a heredoc or the shift/append operator.
    @OP.def_rules("<<") do
      |op, io|
      tk = nil
      if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
         (@lex_state != EXPR_ARG || @space_seen)
        c = peek(0)
        if /\S/ =~ c && (/["'`]/ =~ c || /\w/ =~ c || c == "-")
          tk = identify_here_document
        end
      end
      unless tk
        tk = Token(op)
        case @lex_state
        when EXPR_FNAME, EXPR_DOT
          @lex_state = EXPR_ARG
        else
          @lex_state = EXPR_BEG
        end
      end
      tk
    end

    @OP.def_rules("'", '"') do
      |op, io|
      identify_string(op)
    end

    @OP.def_rules("`") do
      |op, io|
      if @lex_state == EXPR_FNAME
        # Backtick used as a method name.
        @lex_state = EXPR_END
        Token(op)
      else
        identify_string(op)
      end
    end

    # "?" is either the ternary operator or a character literal.
    @OP.def_rules('?') do
      |op, io|
      if @lex_state == EXPR_END
        @lex_state = EXPR_BEG
        Token(TkQUESTION)
      else
        ch = getc
        if @lex_state == EXPR_ARG && ch =~ /\s/
          ungetc
          @lex_state = EXPR_BEG;
          Token(TkQUESTION)
        else
          @lex_state = EXPR_END
          Token(TkSTRING, ch)
        end
      end
    end

    @OP.def_rules("&", "&&", "|", "||") do
      |op, io|
      @lex_state = EXPR_BEG
      Token(op)
    end

    # Operator assignments: the token carries the operator without the "=".
    @OP.def_rules("+=", "-=", "*=", "**=",
                  "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
      |op, io|
      @lex_state = EXPR_BEG
      op =~ /^(.*)=$/
      Token(TkOPASGN, $1)
    end

    # Unary operator method names, only recognized where a method name is
    # expected.
    @OP.def_rule("+@", proc{|op, io| @lex_state == EXPR_FNAME}) do
      |op, io|
      @lex_state = EXPR_ARG
      Token(op)
    end

    @OP.def_rule("-@", proc{|op, io| @lex_state == EXPR_FNAME}) do
      |op, io|
      @lex_state = EXPR_ARG
      Token(op)
    end

    # "+" / "-" is either a binary operator or the sign of a numeric literal.
    @OP.def_rules("+", "-") do
      |op, io|
      catch(:RET) do
        if @lex_state == EXPR_ARG
          if @space_seen and peek(0) =~ /[0-9]/
            throw :RET, identify_number(op)
          else
            @lex_state = EXPR_BEG
          end
        elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
          throw :RET, identify_number(op)
        else
          @lex_state = EXPR_BEG
        end
        Token(op)
      end
    end

    @OP.def_rule(".") do
      |op, io|
      @lex_state = EXPR_BEG
      if peek(0) =~ /[0-9]/
        # A dot followed by a digit: push the dot back and lex as a number.
        ungetc
        identify_number
      else
        # for "obj.if" etc.
        @lex_state = EXPR_DOT
        Token(TkDOT)
      end
    end

    @OP.def_rules("..", "...") do
      |op, io|
      @lex_state = EXPR_BEG
      Token(op)
    end

    lex_int2
  end
610
611  def lex_int2
612    @OP.def_rules("]", "}", ")") do
613      |op, io|
614      @lex_state = EXPR_END
615      @indent -= 1
616      @indent_stack.pop
617      Token(op)
618    end
619
620    @OP.def_rule(":") do
621      |op, io|
622      if @lex_state == EXPR_END || peek(0) =~ /\s/
623        @lex_state = EXPR_BEG
624        Token(TkCOLON)
625      else
626        @lex_state = EXPR_FNAME;
627        Token(TkSYMBEG)
628      end
629    end
630
631    @OP.def_rule("::") do
632      |op, io|
633      #      p @lex_state.id2name, @space_seen
634      if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
635        @lex_state = EXPR_BEG
636        Token(TkCOLON3)
637      else
638        @lex_state = EXPR_DOT
639        Token(TkCOLON2)
640      end
641    end
642
643    @OP.def_rule("/") do
644      |op, io|
645      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
646        identify_string(op)
647      elsif peek(0) == '='
648        getc
649        @lex_state = EXPR_BEG
650        Token(TkOPASGN, "/") #/)
651      elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
652        identify_string(op)
653      else
654        @lex_state = EXPR_BEG
655        Token("/") #/)
656      end
657    end
658
659    @OP.def_rules("^") do
660      |op, io|
661      @lex_state = EXPR_BEG
662      Token("^")
663    end
664
665    #       @OP.def_rules("^=") do
666    # 	@lex_state = EXPR_BEG
667    # 	Token(OP_ASGN, :^)
668    #       end
669
670    @OP.def_rules(",") do
671      |op, io|
672      @lex_state = EXPR_BEG
673      Token(op)
674    end
675
676    @OP.def_rules(";") do
677      |op, io|
678      @lex_state = EXPR_BEG
679      until (@indent_stack.empty? ||
680             [TkLPAREN, TkLBRACK, TkLBRACE,
681               TkfLPAREN, TkfLBRACK, TkfLBRACE].include?(@indent_stack.last))
682        @indent_stack.pop
683      end
684      Token(op)
685    end
686
687    @OP.def_rule("~") do
688      |op, io|
689      @lex_state = EXPR_BEG
690      Token("~")
691    end
692
693    @OP.def_rule("~@", proc{|op, io| @lex_state == EXPR_FNAME}) do
694      |op, io|
695      @lex_state = EXPR_BEG
696      Token("~")
697    end
698
699    @OP.def_rule("(") do
700      |op, io|
701      @indent += 1
702      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
703        @lex_state = EXPR_BEG
704        tk_c = TkfLPAREN
705      else
706        @lex_state = EXPR_BEG
707        tk_c = TkLPAREN
708      end
709      @indent_stack.push tk_c
710      Token tk_c
711    end
712
713    @OP.def_rule("[]", proc{|op, io| @lex_state == EXPR_FNAME}) do
714      |op, io|
715      @lex_state = EXPR_ARG
716      Token("[]")
717    end
718
719    @OP.def_rule("[]=", proc{|op, io| @lex_state == EXPR_FNAME}) do
720      |op, io|
721      @lex_state = EXPR_ARG
722      Token("[]=")
723    end
724
725    @OP.def_rule("[") do
726      |op, io|
727      @indent += 1
728      if @lex_state == EXPR_FNAME
729        tk_c = TkfLBRACK
730      else
731        if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
732          tk_c = TkLBRACK
733        elsif @lex_state == EXPR_ARG && @space_seen
734          tk_c = TkLBRACK
735        else
736          tk_c = TkfLBRACK
737        end
738        @lex_state = EXPR_BEG
739      end
740      @indent_stack.push tk_c
741      Token(tk_c)
742    end
743
744    @OP.def_rule("{") do
745      |op, io|
746      @indent += 1
747      if @lex_state != EXPR_END && @lex_state != EXPR_ARG
748        tk_c = TkLBRACE
749      else
750        tk_c = TkfLBRACE
751      end
752      @lex_state = EXPR_BEG
753      @indent_stack.push tk_c
754      Token(tk_c)
755    end
756
757    @OP.def_rule('\\') do
758      |op, io|
759      if getc == "\n"
760        @space_seen = true
761        @continue = true
762        Token(TkSPACE)
763      else
764        ungetc
765        Token("\\")
766      end
767    end
768
769    @OP.def_rule('%') do
770      |op, io|
771      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
772        identify_quotation
773      elsif peek(0) == '='
774        getc
775        Token(TkOPASGN, :%)
776      elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
777        identify_quotation
778      else
779        @lex_state = EXPR_BEG
780        Token("%") #))
781      end
782    end
783
784    @OP.def_rule('$') do
785      |op, io|
786      identify_gvar
787    end
788
789    @OP.def_rule('@') do
790      |op, io|
791      if peek(0) =~ /[\w@]/
792        ungetc
793        identify_identifier
794      else
795        Token("@")
796      end
797    end
798
799    #       @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
800    # 	|op, io|
801    # 	@indent += 1
802    # 	@lex_state = EXPR_FNAME
803    # #	@lex_state = EXPR_END
804    # #	until @rests[0] == "\n" or @rests[0] == ";"
805    # #	  rests.shift
806    # #	end
807    #       end
808
809    @OP.def_rule("_") do
810      if peek_match?(/_END__/) and @lex_state == EXPR_BEG then
811        6.times { getc }
812        Token(TkEND_OF_SCRIPT, '__END__')
813      else
814        ungetc
815        identify_identifier
816      end
817    end
818
819    @OP.def_rule("") do
820      |op, io|
821      printf "MATCH: start %s: %s\n", op, io.inspect if RDoc::RubyLex.debug?
822      if peek(0) =~ /[0-9]/
823        t = identify_number
824      else
825        t = identify_identifier
826      end
827      printf "MATCH: end %s: %s\n", op, io.inspect if RDoc::RubyLex.debug?
828      t
829    end
830
831    p @OP if RDoc::RubyLex.debug?
832  end
833
834  def identify_gvar
835    @lex_state = EXPR_END
836
837    case ch = getc
838    when /[~_*$?!@\/\\;,=:<>".]/   #"
839      Token(TkGVAR, "$" + ch)
840    when "-"
841      Token(TkGVAR, "$-" + getc)
842    when "&", "`", "'", "+"
843      Token(TkBACK_REF, "$"+ch)
844    when /[1-9]/
845      ref = ch
846      while (ch = getc) =~ /[0-9]/ do ref << ch end
847      ungetc
848      Token(TkNTH_REF, "$#{ref}")
849    when /\w/
850      ungetc
851      ungetc
852      identify_identifier
853    else
854      ungetc
855      Token("$")
856    end
857  end
858
  # Matches one character that may appear in an identifier: a word character
  # or a non-ASCII character.  The Unicode form is built with eval so that the
  # file still parses on interpreters that cannot parse \u{} (see the inline
  # note).
  IDENT_RE = if defined? Encoding then
               eval '/[\w\u{0080}-\u{FFFFF}]/u' # 1.8 can't parse \u{}
             else
               /[\w\x80-\xFF]/
             end
864
  # Lexes an identifier-like token starting at the current position: a
  # global, class or instance variable, a reserved word, a constant, a method
  # name ending in "!" / "?", or a plain identifier.  Updates @lex_state and,
  # for block keywords, @indent and @indent_stack.  Returns the token.
  def identify_identifier
    token = ""
    # Optional sigil: "$", "@" or "@@".
    if peek(0) =~ /[$@]/
      token.concat(c = getc)
      if c == "@" and peek(0) == "@"
        token.concat getc
      end
    end

    while (ch = getc) =~ IDENT_RE do
      print " :#{ch}: " if RDoc::RubyLex.debug?
      token.concat ch
    end

    ungetc

    # A trailing "!" or "?" belongs to the name unless "=" follows it
    # (so "a != b" is not read as the method "a!").
    if (ch == "!" || ch == "?") && token[0,1] =~ /\w/ && peek(0) != "="
      token.concat getc
    end

    # almost fix token

    case token
    when /^\$/
      return Token(TkGVAR, token)
    when /^\@\@/
      @lex_state = EXPR_END
      # p Token(TkCVAR, token)
      return Token(TkCVAR, token)
    when /^\@/
      @lex_state = EXPR_END
      return Token(TkIVAR, token)
    end

    # After a "." every name is a method name, never a reserved word.
    if @lex_state != EXPR_DOT
      print token, "\n" if RDoc::RubyLex.debug?

      token_c, *trans = TkReading2Token[token]
      if token_c
        # reserved word?

        if (@lex_state != EXPR_BEG &&
            @lex_state != EXPR_FNAME &&
            trans[1])
          # modifiers
          token_c = TkSymbol2Token[trans[1]]
          @lex_state = trans[0]
        else
          if @lex_state != EXPR_FNAME
            if ENINDENT_CLAUSE.include?(token)
              # check for ``class = val'' etc.
              valid = true
              case token
              when "class"
                valid = false unless peek_match?(/^\s*(<<|\w|::)/)
              when "def"
                valid = false if peek_match?(/^\s*(([+-\/*&\|^]|<<|>>|\|\||\&\&)=|\&\&|\|\|)/)
              when "do"
                valid = false if peek_match?(/^\s*([+-\/*]?=|\*|<|>|\&)/)
              when *ENINDENT_CLAUSE
                valid = false if peek_match?(/^\s*([+-\/*]?=|\*|<|>|\&|\|)/)
              else
                # no nothing
              end
              if valid
                if token == "do"
                  # The "do" of for/while/until does not open another level.
                  if ![TkFOR, TkWHILE, TkUNTIL].include?(@indent_stack.last)
                    @indent += 1
                    @indent_stack.push token_c
                  end
                else
                  @indent += 1
                  @indent_stack.push token_c
                end
              else
                # Keyword used as an ordinary name.
                token_c = TkIDENTIFIER
              end

            elsif DEINDENT_CLAUSE.include?(token)
              @indent -= 1
              @indent_stack.pop
            end
            @lex_state = trans[0]
          else
            # Reserved word used as a method name (after "def" or ":").
            @lex_state = EXPR_END
          end
        end
        return Token(token_c, token)
      end
    end

    if @lex_state == EXPR_FNAME
      @lex_state = EXPR_END
      # Setter method name: "name=".
      if peek(0) == '='
        token.concat getc
      end
    elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT ||
          @lex_state == EXPR_ARG
      @lex_state = EXPR_ARG
    else
      @lex_state = EXPR_END
    end

    if token[0, 1] =~ /[A-Z]/
      return Token(TkCONSTANT, token)
    elsif token[token.size - 1, 1] =~ /[!?]/
      return Token(TkFID, token)
    else
      return Token(TkIDENTIFIER, token)
    end
  end
976
977  def identify_here_document
978    ch = getc
979    #    if lt = PERCENT_LTYPE[ch]
980    if ch == "-"
981      ch = getc
982      indent = true
983    end
984    if /['"`]/ =~ ch
985      user_quote = lt = ch
986      quoted = ""
987      while (c = getc) && c != lt
988        quoted.concat c
989      end
990    else
991      user_quote = nil
992      lt = '"'
993      quoted = ch.dup
994      while (c = getc) && c =~ /\w/
995        quoted.concat c
996      end
997      ungetc
998    end
999
1000    ltback, @ltype = @ltype, lt
1001    reserve = []
1002    while ch = getc
1003      reserve.push ch
1004      if ch == "\\"
1005        reserve.push ch = getc
1006      elsif ch == "\n"
1007        break
1008      end
1009    end
1010
1011    output_heredoc = reserve.join =~ /\A\r?\n\z/
1012
1013    if output_heredoc then
1014      doc = '<<'
1015      doc << '-' if indent
1016      doc << "#{user_quote}#{quoted}#{user_quote}\n"
1017    else
1018      doc = '"'
1019    end
1020
1021    @here_header = false
1022    while l = gets
1023      l = l.sub(/(:?\r)?\n\z/, "\n")
1024      if (indent ? l.strip : l.chomp) == quoted
1025        break
1026      end
1027      doc << l
1028    end
1029
1030    if output_heredoc then
1031      doc << l.chomp
1032    else
1033      doc << '"'
1034    end
1035
1036    @here_header = true
1037    @here_readed.concat reserve
1038    while ch = reserve.pop
1039      ungetc ch
1040    end
1041
1042    token_class = output_heredoc ? RDoc::RubyLex::TkHEREDOC : Ltype2Token[lt]
1043    @ltype = ltback
1044    @lex_state = EXPR_END
1045    Token(token_class, doc)
1046  end
1047
1048  def identify_quotation
1049    type = ch = getc
1050    if lt = PERCENT_LTYPE[type]
1051      ch = getc
1052    elsif type =~ /\W/
1053      type = nil
1054      lt = "\""
1055    else
1056      return Token(TkMOD, '%')
1057    end
1058    #     if ch !~ /\W/
1059    #       ungetc
1060    #       next
1061    #     end
1062    #@ltype = lt
1063    @quoted = ch unless @quoted = PERCENT_PAREN[ch]
1064    identify_string(lt, @quoted, type)
1065  end
1066
  # Lexes a numeric literal starting at the current position.
  #
  # op:: text already consumed for this literal (a sign), prepended to the
  #      token text.
  #
  # Returns a TkINTEGER or TkFLOAT token.  Raises Error for malformed
  # literals (bad octal digit, missing digits, misplaced "_").
  def identify_number(op = "")
    @lex_state = EXPR_END

    num = op

    # A leading "0" not followed by ".", "e" or "E" introduces a prefixed
    # (0x, 0b, 0o, 0d) or plain octal integer.
    if peek(0) == "0" && peek(1) !~ /[.eE]/
      num << getc

      case peek(0)
      when /[xX]/
        ch = getc
        match = /[0-9a-fA-F_]/
      when /[bB]/
        ch = getc
        match = /[01_]/
      when /[oO]/
        ch = getc
        match = /[0-7_]/
      when /[dD]/
        ch = getc
        match = /[0-9_]/
      when /[0-7]/
        match = /[0-7_]/
      when /[89]/
        raise Error, "Illegal octal digit"
      else
        # Just "0".
        return Token(TkINTEGER, num)
      end

      num << ch if ch

      # len0: no digit seen yet; non_digit: the previous character was "_".
      len0 = true
      non_digit = false
      while ch = getc
        num << ch
        if match =~ ch
          if ch == "_"
            if non_digit
              raise Error, "trailing `#{ch}' in number"
            else
              non_digit = ch
            end
          else
            non_digit = false
            len0 = false
          end
        else
          # First character that is not part of the literal: push it back.
          ungetc
          num[-1, 1] = ''
          if len0
            raise Error, "numeric literal without digits"
          end
          if non_digit
            raise Error, "trailing `#{non_digit}' in number"
          end
          break
        end
      end
      return Token(TkINTEGER, num)
    end

    # Decimal integer or float.
    type = TkINTEGER
    allow_point = true
    allow_e = true
    non_digit = false
    while ch = getc
      num << ch
      case ch
      when /[0-9]/
        non_digit = false
      when "_"
        non_digit = ch
      # These branches can only match while the flag is still true; once it
      # is false the when-value is false and never equals +ch+.
      when allow_point && "."
        if non_digit
          raise Error, "trailing `#{non_digit}' in number"
        end
        type = TkFLOAT
        if peek(0) !~ /[0-9]/
          # The "." is not a decimal point (e.g. a method call): push it back.
          type = TkINTEGER
          ungetc
          num[-1, 1] = ''
          break
        end
        allow_point = false
      when allow_e && "e", allow_e && "E"
        if non_digit
          raise Error, "trailing `#{non_digit}' in number"
        end
        type = TkFLOAT
        if peek(0) =~ /[+-]/
          num << getc
        end
        allow_e = false
        allow_point = false
        non_digit = ch
      else
        if non_digit
          raise Error, "trailing `#{non_digit}' in number"
        end
        ungetc
        num[-1, 1] = ''
        break
      end
    end

    Token(type, num)
  end
1174
1175  def identify_string(ltype, quoted = ltype, type = nil)
1176    close = PERCENT_PAREN.values.include?(quoted)
1177    @ltype = ltype
1178    @quoted = quoted
1179
1180    str = if ltype == quoted and %w[" ' /].include? ltype then
1181            ltype.dup
1182          elsif RUBY_VERSION > '1.9' then
1183            "%#{type or PERCENT_LTYPE.key ltype}#{PERCENT_PAREN_REV[quoted]}"
1184          else
1185            "%#{type or PERCENT_LTYPE.index ltype}#{PERCENT_PAREN_REV[quoted]}"
1186          end
1187
1188    subtype = nil
1189    begin
1190      nest = 0
1191
1192      while ch = getc
1193        str << ch
1194
1195        if @quoted == ch and nest <= 0
1196          break
1197        elsif @ltype != "'" && @ltype != "]" && @ltype != ":" and ch == "#"
1198          ch = getc
1199          subtype = true
1200          if ch == "{" then
1201            str << ch << skip_inner_expression
1202            next
1203          else
1204            ungetc
1205          end
1206        elsif ch == '\\'
1207          if %w[' /].include? @ltype then
1208            case ch = getc
1209            when "\\", "\n", "'"
1210            when @ltype
1211              str << ch
1212            else
1213              ungetc
1214            end
1215          else
1216            str << read_escape
1217          end
1218        end
1219
1220        if close then
1221          if PERCENT_PAREN[ch] == @quoted
1222            nest += 1
1223          elsif ch == @quoted
1224            nest -= 1
1225          end
1226        end
1227      end
1228
1229      if @ltype == "/"
1230        if peek(0) =~ /i|m|x|o|e|s|u|n/
1231          getc
1232        end
1233      end
1234
1235      if subtype
1236        Token(DLtype2Token[ltype], str)
1237      else
1238        Token(Ltype2Token[ltype], str)
1239      end
1240    ensure
1241      @ltype = nil
1242      @quoted = nil
1243      @lex_state = EXPR_END
1244    end
1245  end
1246
1247  def skip_inner_expression
1248    res = ""
1249    nest = 0
1250    while ch = getc
1251      res << ch
1252      if ch == '}'
1253        break if nest.zero?
1254        nest -= 1
1255      elsif ch == '{'
1256        nest += 1
1257      end
1258    end
1259    res
1260  end
1261
1262  def identify_comment
1263    @ltype = "#"
1264
1265    comment = '#'
1266
1267    while ch = getc
1268      # if ch == "\\" #"
1269      #   read_escape
1270      # end
1271      if ch == "\n"
1272        @ltype = nil
1273        ungetc
1274        break
1275      end
1276
1277      comment << ch
1278    end
1279
1280    return Token(TkCOMMENT, comment)
1281  end
1282
1283  def read_escape
1284    escape = ''
1285    ch = getc
1286    escape << ch
1287
1288    case ch
1289    when "\n", "\r", "\f"
1290    when "\\", "n", "t", "r", "f", "v", "a", "e", "b", "s" #"
1291    when /[0-7]/
1292      ungetc ch
1293      3.times do
1294        ch = getc
1295        escape << ch
1296        case ch
1297        when /[0-7]/
1298        when nil
1299          break
1300        else
1301          ungetc
1302          break
1303        end
1304      end
1305
1306    when "x"
1307      2.times do
1308        ch = getc
1309        escape << ch
1310        case ch
1311        when /[0-9a-fA-F]/
1312        when nil
1313          break
1314        else
1315          ungetc
1316          break
1317        end
1318      end
1319
1320    when "M"
1321      ch = getc
1322      escape << ch
1323      if ch != '-'
1324        ungetc
1325      else
1326        ch = getc
1327        escape << ch
1328        if ch == "\\" #"
1329          escape << read_escape
1330        end
1331      end
1332
1333    when "C", "c" #, "^"
1334      if ch == "C" and (ch = getc) != "-"
1335        escape << ch
1336        ungetc
1337      elsif (ch = getc) == "\\" #"
1338        escape << ch << read_escape
1339      end
1340    else
1341      # other characters
1342    end
1343
1344    escape
1345  end
1346
1347  # :startdoc:
1348
1349end
1350
1351#RDoc::RubyLex.debug_level = 1
1352
1353