1require 'strscan'
2
3##
4# A recursive-descent parser for RDoc markup.
5#
6# The parser tokenizes an input string then parses the tokens into a Document.
7# Documents can be converted into output formats by writing a visitor like
8# RDoc::Markup::ToHTML.
9#
10# The parser only handles the block-level constructs Paragraph, List,
11# ListItem, Heading, Verbatim, BlankLine and Rule.  Inline markup such as
12# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
13#
14# To see what markup the Parser implements read RDoc.  To see how to use
15# RDoc markup to format text in your program read RDoc::Markup.
16
class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ]

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document.
  #
  # Use RDoc::Markup#parse instead of this method.

  def self.parse str
    parser = new
    parser.tokenize str
    doc = RDoc::Markup::Document.new
    parser.parse doc
  end

  ##
  # Returns a token stream for +str+, for testing

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser.  See also ::parse
  #
  # The parser starts with an empty token stream; call #tokenize to fill it
  # before calling #parse.

  def initialize
    @binary_input   = nil
    @current_token  = nil
    @debug          = false
    @have_encoding  = Object.const_defined? :Encoding
    @have_byteslice = ''.respond_to? :byteslice
    @input          = nil
    @input_encoding = nil
    @line           = 0
    @line_pos       = 0
    @s              = nil
    @tokens         = []
  end

  ##
  # Builds a Heading of +level+
  #
  # If the next token is a TEXT token it becomes the heading text and the
  # NEWLINE that follows it is consumed.  Any other token is returned to the
  # stream and the heading gets an empty text.

  def build_heading level
    type, text, = get

    text = case type
           when :TEXT then
             skip :NEWLINE
             text
           else
             unget
             ''
           end

    RDoc::Markup::Heading.new level, text
  end

  ##
  # Builds a List flush to +margin+
  #
  # Consumes list tokens of a single type until a token of another kind, a
  # list token of a different type, or a list token left of +margin+ is found.
  #
  # Returns the List.  Returns +nil+ when no item was built, unless labels
  # without a description were collected for a LABEL or NOTE list, in which
  # case a single item holding those labels and a BlankLine is returned.

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new
    label = nil

    until @tokens.empty? do
      type, data, column, = get

      case type
      when *LIST_TOKENS then
        if column < margin || (list.type && list.type != type) then
          unget
          break
        end

        list.type = type
        # from here on, column is the column of the token following the marker
        peek_type, _, column, = peek_token

        case type
        when :NOTE, :LABEL then
          label = [] unless label

          if peek_type == :NEWLINE then
            # description not on the same line as LABEL/NOTE
            # skip the trailing newline & any blank lines below
            while peek_type == :NEWLINE
              get
              peek_type, _, column, = peek_token
            end

            # we may be:
            #   - at end of stream
            #   - at a column < margin:
            #         [text]
            #       blah blah blah
            #   - at the same column, but with a different type of list item
            #       [text]
            #       * blah blah
            #   - at the same column, with the same type of list item
            #       [one]
            #       [two]
            # In all cases, we have an empty description.
            # In the last case only, we continue.
            if peek_type.nil? || column < margin then
              empty = true
            elsif column == margin then
              case peek_type
              when type
                empty = :continue
              when *LIST_TOKENS
                empty = true
              else
                empty = false
              end
            else
              empty = false
            end

            if empty then
              label << data
              next if empty == :continue
              break
            end
          end
        else
          data = nil
        end

        if label then
          # attach any labels gathered from description-less items above
          data = label << data
          label = nil
        end

        list_item = RDoc::Markup::ListItem.new data
        parse list_item, column
        list << list_item

      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    if list.empty? then
      return nil unless label
      return nil unless [:LABEL, :NOTE].include? list.type

      list_item = RDoc::Markup::ListItem.new label, RDoc::Markup::BlankLine.new
      list << list_item
    end

    list
  end

  ##
  # Builds a Paragraph that is flush to +margin+
  #
  # Consecutive TEXT tokens at column +margin+ are collected.  A space is
  # appended to a line when the NEWLINE following it is consumed, and the
  # space at the end of the last collected line is removed.  The paragraph
  # ends before a BREAK token or any token that is not TEXT at +margin+.
  #
  # Returns the Paragraph, which has no parts when no TEXT token at +margin+
  # was available.

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      if type == :TEXT and column == margin then
        paragraph << data

        break if peek_token.first == :BREAK

        data << ' ' if skip :NEWLINE
      else
        unget
        break
      end
    end

    # cleanup; the paragraph has no parts when nothing was collected
    last_part = paragraph.parts.last
    last_part.sub!(/ \z/, '') if last_part

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is indented from +margin+.
  #
  # The verbatim block is shifted left (the least indented lines start in
  # column 0).  Each part of the verbatim is one line of text, always
  # terminated by a newline.  Blank lines always consist of a single newline
  # character, and there is never a single newline at the end of the verbatim.
  #
  # Tokens are turned back into the text they were scanned from.  The block
  # ends at the first non-NEWLINE token whose column is not greater than
  # +margin+.

  def build_verbatim margin
    p :verbatim_begin => margin if @debug
    verbatim = RDoc::Markup::Verbatim.new

    min_indent = nil
    generate_leading_spaces = true
    line = ''

    until @tokens.empty? do
      type, data, column, = get

      if type == :NEWLINE then
        line << data
        verbatim << line
        line = ''
        generate_leading_spaces = true
        next
      end

      if column <= margin
        unget
        break
      end

      if generate_leading_spaces then
        # first token of a line: restore its indentation relative to margin
        indent = column - margin
        line << ' ' * indent
        min_indent = indent if min_indent.nil? || indent < min_indent
        generate_leading_spaces = false
      end

      case type
      when :HEADER then
        line << '=' * data
        # restore the spaces between the marker and the following token
        _, _, peek_column, = peek_token
        peek_column ||= column + data
        indent = peek_column - column - data
        line << ' ' * indent
      when :RULE then
        # the tokenizer stores the rule length minus 2
        width = 2 + data
        line << '-' * width
        _, _, peek_column, = peek_token
        peek_column ||= column + width
        indent = peek_column - column - width
        line << ' ' * indent
      when :BREAK, :TEXT then
        line << data
      else # *LIST_TOKENS
        list_marker = case type
                      when :BULLET then data
                      when :LABEL  then "[#{data}]"
                      when :NOTE   then "#{data}::"
                      else # :LALPHA, :NUMBER, :UALPHA
                        "#{data}."
                      end
        line << list_marker
        peek_type, _, peek_column = peek_token
        unless peek_type == :NEWLINE then
          peek_column ||= column + list_marker.length
          indent = peek_column - column - list_marker.length
          line << ' ' * indent
        end
      end

    end

    verbatim << line << "\n" unless line.empty?

    # min_indent is nil when no token right of the margin was consumed
    if min_indent && min_indent > 0 then
      verbatim.parts.each do |part|
        part.slice!(0, min_indent) unless part == "\n"
      end
    end

    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # The character offset for the input string at the given +byte_offset+
  #
  # Uses String#byteslice when available, otherwise re-tags the leading bytes
  # of the binary copy of the input with the input's encoding.  When neither
  # is possible +byte_offset+ is returned unchanged.

  def char_pos byte_offset
    if @have_byteslice then
      @input.byteslice(0, byte_offset).length
    elsif @have_encoding then
      matched = @binary_input[0, byte_offset]
      matched.force_encoding @input_encoding
      matched.length
    else
      byte_offset
    end
  end

  ##
  # Pulls the next token from the stream.
  #
  # The token is remembered as the current token so it can be returned with
  # #unget.  Returns +nil+ when the stream is empty.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the tokens into an array of RDoc::Markup::XXX objects,
  # and appends them to the passed +parent+ RDoc::Markup::YYY object.
  #
  # Exits at the end of the token stream, or when it encounters a token
  # in a column less than +indent+ (unless it is a NEWLINE).
  #
  # Raises ParseError for a token type it does not know how to handle.
  #
  # Returns +parent+.

  def parse parent, indent = 0
    p :parse_start => indent if @debug

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :BREAK then
        parent << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
        next
      when :NEWLINE then
        # trailing newlines are skipped below, so this is a blank line
        parent << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
        next
      end

      # indentation change: break or verbatim
      if column < indent then
        unget
        break
      elsif column > indent then
        unget
        parent << build_verbatim(indent)
        next
      end

      # indentation is the same
      case type
      when :HEADER then
        parent << build_heading(data)
      when :RULE then
        parent << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        parent << build_paragraph(indent)
      when *LIST_TOKENS then
        unget
        parent << build_list(indent)
      else
        type, data, column, line = @current_token
        raise ParseError, "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    parent

  end

  ##
  # Returns the next token on the stream without modifying the stream
  #
  # Returns an empty Array when the stream is empty so the result can always
  # be destructured.

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Creates the StringScanner
  #
  # Resets the line counters and keeps a copy of +input+ for converting the
  # scanner's byte offsets into character offsets (see #char_pos).

  def setup_scanner input
    @line     = 0
    @line_pos = 0
    @input    = input.dup

    if @have_encoding and not @have_byteslice then
      # force_encoding changes the copy in place, not the caller's string
      @input_encoding = @input.encoding
      @binary_input   = @input.force_encoding Encoding::BINARY
    end

    @s = StringScanner.new input
  end

  ##
  # Skips the next token if its type is +token_type+.
  #
  # Optionally raises an error if the next token is not of the expected type.
  #
  # Returns the skipped token, or +nil+ at the end of the stream or when the
  # next token has another type and +error+ is false.  A token of another
  # type is returned to the stream.

  def skip token_type, error = true
    type, = get
    return unless type # end of stream
    return @current_token if token_type == type
    unget
    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if error
  end

  ##
  # Turns text +input+ into a stream of tokens
  #
  # Each token is an Array of type, data, column and line.  The tokens are
  # appended to #tokens.
  #
  # Returns the parser itself.

  def tokenize input
    setup_scanner input

    until @s.eos? do
      # StringScanner positions are byte offsets
      pos = @s.pos

      # leading spaces will be reflected by the column of the next token
      # the only thing we lose are trailing spaces at the end of the file
      next if @s.scan(/ +/)

      # note: after BULLET, LABEL, etc.,
      # indent will be the column of the next non-newline token

      @tokens << case
                 # [CR]LF => :NEWLINE
                 when @s.scan(/\r?\n/) then
                   token = [:NEWLINE, @s.matched, *token_pos(pos)]
                   @line_pos = char_pos @s.pos
                   @line += 1
                   token
                 # === text => :HEADER then :TEXT
                 when @s.scan(/(=+)(\s*)/) then
                   level = @s[1].length
                   header = [:HEADER, level, *token_pos(pos)]

                   if @s[2] =~ /^\r?\n/ then
                     # give the whitespace back so the NEWLINE gets tokenized
                     @s.pos -= @s[2].length
                     header
                   else
                     pos = @s.pos
                     @s.scan(/.*/)
                     @tokens << header
                     [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
                   end
                 # --- (at least 3) and nothing else on the line => :RULE
                 when @s.scan(/(-{3,}) *\r?$/) then
                   [:RULE, @s[1].length - 2, *token_pos(pos)]
                 # * or - followed by white space and text => :BULLET
                 when @s.scan(/([*-]) +(\S)/) then
                   @s.pos -= @s[2].bytesize # unget \S
                   [:BULLET, @s[1], *token_pos(pos)]
                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
                 when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
                   # FIXME if tab(s), the column will be wrong
                   # either support tabs everywhere by first expanding them to
                   # spaces, or assume that they will have been replaced
                   # before (and provide a check for that at least in debug
                   # mode)
                   list_label = @s[1]
                   @s.pos -= @s[2].bytesize # unget \S
                   list_type =
                     case list_label
                     when /[a-z]/ then :LALPHA
                     when /[A-Z]/ then :UALPHA
                     when /\d/    then :NUMBER
                     else
                       raise ParseError, "BUG token #{list_label}"
                     end
                   [list_type, list_label, *token_pos(pos)]
                 # [text] followed by spaces or end of line => :LABEL
                 when @s.scan(/\[(.*?)\]( +|\r?$)/) then
                   [:LABEL, @s[1], *token_pos(pos)]
                 # text:: followed by spaces or end of line => :NOTE
                 when @s.scan(/(.*?)::( +|\r?$)/) then
                   [:NOTE, @s[1], *token_pos(pos)]
                 # anything else: :TEXT
                 else @s.scan(/(.*?)(  )?\r?$/)
                   token = [:TEXT, @s[1], *token_pos(pos)]

                   if @s[2] then
                     @tokens << token
                     # pos is a byte offset, so advance it by the byte size of
                     # the text, not its character count
                     [:BREAK, @s[2], *token_pos(pos + @s[1].bytesize)]
                   else
                     token
                   end
                 end
    end

    self
  end

  ##
  # Calculates the column (by character) and line of the current token from
  # +scanner+ based on +byte_offset+.
  #
  # Returns a two-element Array of column and line.

  def token_pos byte_offset
    offset = char_pos byte_offset

    [offset - @line_pos, @line]
  end

  ##
  # Returns the current token to the token stream
  #
  # Raises Error when the current token is equal to the token already at the
  # head of the stream.  Does nothing when there is no current token.

  def unget
    token = @current_token
    p :unget => token if @debug
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end
551
552