require 'strscan'

##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule. Inline markup such as
# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc. To see how to use
# RDoc markup to format text in your program read RDoc::Markup.

class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ]

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document.
  #
  # Use RDoc::Markup#parse instead of this method.

  def self.parse str
    parser = new
    parser.tokenize str
    doc = RDoc::Markup::Document.new
    parser.parse doc
  end

  ##
  # Returns a token stream for +str+, for testing

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser. See also ::parse

  def initialize
    @binary_input = nil
    @current_token = nil
    @debug = false
    # Feature probes: on Rubies with String#byteslice we can convert byte
    # offsets to character offsets cheaply; otherwise we fall back to
    # re-encoding a binary copy of the input (see #char_pos).
    @have_encoding = Object.const_defined? :Encoding
    @have_byteslice = ''.respond_to? :byteslice
    @input = nil
    @input_encoding = nil
    @line = 0
    @line_pos = 0
    @s = nil
    @tokens = []
  end

  ##
  # Builds a Heading of +level+.
  #
  # Consumes the heading's trailing :TEXT token (and its :NEWLINE); any other
  # token is returned to the stream and the heading text is empty.

  def build_heading level
    type, text, = get

    text = case type
           when :TEXT then
             skip :NEWLINE
             text
           else
             unget
             ''
           end

    RDoc::Markup::Heading.new level, text
  end

  ##
  # Builds a List flush to +margin+.
  #
  # Consumes list-item tokens of a single list type until the stream ends,
  # the column drops below +margin+, or a different list type is seen.
  # Returns nil for a degenerate list that produced no items.

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new
    label = nil

    until @tokens.empty? do
      type, data, column, = get

      case type
      when *LIST_TOKENS then
        # A shallower item or a different list type ends this list.
        if column < margin || (list.type && list.type != type) then
          unget
          break
        end

        list.type = type
        peek_type, _, column, = peek_token

        case type
        when :NOTE, :LABEL then
          label = [] unless label

          if peek_type == :NEWLINE then
            # description not on the same line as LABEL/NOTE
            # skip the trailing newline & any blank lines below
            while peek_type == :NEWLINE
              get
              peek_type, _, column, = peek_token
            end

            # we may be:
            # - at end of stream
            # - at a column < margin:
            #     [text]
            #   blah blah blah
            # - at the same column, but with a different type of list item
            #     [text]
            #     * blah blah
            # - at the same column, with the same type of list item
            #     [one]
            #     [two]
            # In all cases, we have an empty description.
            # In the last case only, we continue.
            if peek_type.nil? || column < margin then
              empty = true
            elsif column == margin then
              case peek_type
              when type
                empty = :continue
              when *LIST_TOKENS
                empty = true
              else
                empty = false
              end
            else
              empty = false
            end

            if empty then
              label << data
              next if empty == :continue
              break
            end
          end
        else
          # Non-labelled list types carry no data into the ListItem.
          data = nil
        end

        if label then
          data = label << data
          label = nil
        end

        list_item = RDoc::Markup::ListItem.new data
        # Recursively parse the item body, indented to the description column.
        parse list_item, column
        list << list_item

      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    if list.empty? then
      return nil unless label
      return nil unless [:LABEL, :NOTE].include? list.type

      # A trailing LABEL/NOTE with no description becomes an item whose body
      # is a single blank line.
      list_item = RDoc::Markup::ListItem.new label, RDoc::Markup::BlankLine.new
      list << list_item
    end

    list
  end

  ##
  # Builds a Paragraph that is flush to +margin+.
  #
  # Consecutive :TEXT tokens at the same column are joined; a :BREAK token
  # (hard line break) ends the paragraph.

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      if type == :TEXT and column == margin then
        paragraph << data

        break if peek_token.first == :BREAK

        # Join wrapped lines with a single space (trimmed again below if it
        # ends up trailing).
        data << ' ' if skip :NEWLINE
      else
        unget
        break
      end
    end

    paragraph.parts.last.sub!(/ \z/, '') # cleanup

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is indented from +margin+.
  #
  # The verbatim block is shifted left (the least indented lines start in
  # column 0). Each part of the verbatim is one line of text, always
  # terminated by a newline. Blank lines always consist of a single newline
  # character, and there is never a single newline at the end of the verbatim.

  def build_verbatim margin
    p :verbatim_begin => margin if @debug
    verbatim = RDoc::Markup::Verbatim.new

    min_indent = nil
    generate_leading_spaces = true
    line = ''

    until @tokens.empty? do
      type, data, column, = get

      if type == :NEWLINE then
        line << data
        verbatim << line
        line = ''
        generate_leading_spaces = true
        next
      end

      # Dedenting to the margin (or shallower) ends the verbatim block.
      if column <= margin
        unget
        break
      end

      if generate_leading_spaces then
        # Reconstruct the leading indentation the tokenizer swallowed; track
        # the minimum so the whole block can be shifted left afterwards.
        indent = column - margin
        line << ' ' * indent
        min_indent = indent if min_indent.nil? || indent < min_indent
        generate_leading_spaces = false
      end

      # Markup tokens inside a verbatim block are rendered back to their
      # literal source text; the column of the next token tells us how much
      # whitespace originally followed each marker.
      case type
      when :HEADER then
        line << '=' * data
        _, _, peek_column, = peek_token
        peek_column ||= column + data
        indent = peek_column - column - data
        line << ' ' * indent
      when :RULE then
        width = 2 + data
        line << '-' * width
        _, _, peek_column, = peek_token
        peek_column ||= column + width
        indent = peek_column - column - width
        line << ' ' * indent
      when :BREAK, :TEXT then
        line << data
      else # *LIST_TOKENS
        list_marker = case type
                      when :BULLET then data
                      when :LABEL  then "[#{data}]"
                      when :NOTE   then "#{data}::"
                      else # :LALPHA, :NUMBER, :UALPHA
                        "#{data}."
                      end
        line << list_marker
        peek_type, _, peek_column = peek_token
        unless peek_type == :NEWLINE then
          peek_column ||= column + list_marker.length
          indent = peek_column - column - list_marker.length
          line << ' ' * indent
        end
      end

    end

    verbatim << line << "\n" unless line.empty?
    # min_indent is set on the first non-NEWLINE token; the caller (#parse)
    # only enters build_verbatim after ungetting such a token, so it is
    # non-nil here whenever any line was collected.
    verbatim.parts.each { |p| p.slice!(0, min_indent) unless p == "\n" } if min_indent > 0
    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # The character offset for the input string at the given +byte_offset+

  def char_pos byte_offset
    if @have_byteslice then
      # Fast path: slice by bytes, count by characters.
      @input.byteslice(0, byte_offset).length
    elsif @have_encoding then
      # Slow path: take the byte prefix from the binary copy and re-tag it
      # with the original encoding to count characters.
      matched = @binary_input[0, byte_offset]
      matched.force_encoding @input_encoding
      matched.length
    else
      # Pre-Encoding Rubies: bytes and characters coincide.
      byte_offset
    end
  end

  ##
  # Pulls the next token from the stream.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the tokens into an array of RDoc::Markup::XXX objects,
  # and appends them to the passed +parent+ RDoc::Markup::YYY object.
  #
  # Exits at the end of the token stream, or when it encounters a token
  # in a column less than +indent+ (unless it is a NEWLINE).
  #
  # Returns +parent+.

  def parse parent, indent = 0
    p :parse_start => indent if @debug

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :BREAK then
        parent << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
        next
      when :NEWLINE then
        # trailing newlines are skipped below, so this is a blank line
        parent << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
        next
      end

      # indentation change: break or verbatim
      if column < indent then
        unget
        break
      elsif column > indent then
        unget
        parent << build_verbatim(indent)
        next
      end

      # indentation is the same
      case type
      when :HEADER then
        parent << build_heading(data)
      when :RULE then
        parent << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        parent << build_paragraph(indent)
      when *LIST_TOKENS then
        unget
        parent << build_list(indent)
      else
        type, data, column, line = @current_token
        raise ParseError, "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    parent

  end

  ##
  # Returns the next token on the stream without modifying the stream

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Creates the StringScanner

  def setup_scanner input
    @line = 0
    @line_pos = 0
    @input = input.dup

    if @have_encoding and not @have_byteslice then
      # Keep a binary copy so #char_pos can convert byte offsets to
      # character offsets on Rubies without String#byteslice.
      @input_encoding = @input.encoding
      @binary_input = @input.force_encoding Encoding::BINARY
    end

    @s = StringScanner.new input
  end

  ##
  # Skips the next token if its type is +token_type+.
  #
  # Optionally raises an error if the next token is not of the expected type.

  def skip token_type, error = true
    type, = get
    return unless type # end of stream
    return @current_token if token_type == type
    unget
    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if error
  end

  ##
  # Turns text +input+ into a stream of tokens
  #
  # Each token is <tt>[type, data, column, line]</tt>; column and line come
  # from #token_pos. The +when+ clauses below are order-sensitive: more
  # specific patterns must be tried before the catch-all :TEXT rule.

  def tokenize input
    setup_scanner input

    until @s.eos? do
      pos = @s.pos

      # leading spaces will be reflected by the column of the next token
      # the only thing we lose are trailing spaces at the end of the file
      next if @s.scan(/ +/)

      # note: after BULLET, LABEL, etc.,
      # indent will be the column of the next non-newline token

      @tokens << case
                 # [CR]LF => :NEWLINE
                 when @s.scan(/\r?\n/) then
                   token = [:NEWLINE, @s.matched, *token_pos(pos)]
                   @line_pos = char_pos @s.pos
                   @line += 1
                   token
                 # === text => :HEADER then :TEXT
                 when @s.scan(/(=+)(\s*)/) then
                   level = @s[1].length
                   header = [:HEADER, level, *token_pos(pos)]

                   if @s[2] =~ /^\r?\n/ then
                     # Bare header line: push the newline back so it is
                     # tokenized normally on the next pass.
                     @s.pos -= @s[2].length
                     header
                   else
                     pos = @s.pos
                     @s.scan(/.*/)
                     @tokens << header
                     [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
                   end
                 # --- (at least 3) and nothing else on the line => :RULE
                 when @s.scan(/(-{3,}) *\r?$/) then
                   [:RULE, @s[1].length - 2, *token_pos(pos)]
                 # * or - followed by white space and text => :BULLET
                 when @s.scan(/([*-]) +(\S)/) then
                   @s.pos -= @s[2].bytesize # unget \S
                   [:BULLET, @s[1], *token_pos(pos)]
                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
                 when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
                   # FIXME if tab(s), the column will be wrong
                   # either support tabs everywhere by first expanding them to
                   # spaces, or assume that they will have been replaced
                   # before (and provide a check for that at least in debug
                   # mode)
                   list_label = @s[1]
                   @s.pos -= @s[2].bytesize # unget \S
                   list_type =
                     case list_label
                     when /[a-z]/ then :LALPHA
                     when /[A-Z]/ then :UALPHA
                     when /\d/ then :NUMBER
                     else
                       raise ParseError, "BUG token #{list_label}"
                     end
                   [list_type, list_label, *token_pos(pos)]
                 # [text] followed by spaces or end of line => :LABEL
                 when @s.scan(/\[(.*?)\]( +|\r?$)/) then
                   [:LABEL, @s[1], *token_pos(pos)]
                 # text:: followed by spaces or end of line => :NOTE
                 when @s.scan(/(.*?)::( +|\r?$)/) then
                   [:NOTE, @s[1], *token_pos(pos)]
                 # anything else: :TEXT
                 else @s.scan(/(.*?)( )?\r?$/)
                   token = [:TEXT, @s[1], *token_pos(pos)]

                   if @s[2] then
                     # A single trailing space marks a hard line break.
                     @tokens << token
                     [:BREAK, @s[2], *token_pos(pos + @s[1].length)]
                   else
                     token
                   end
                 end
    end

    self
  end

  ##
  # Calculates the column (by character) and line of the current token from
  # +scanner+ based on +byte_offset+.

  def token_pos byte_offset
    offset = char_pos byte_offset

    [offset - @line_pos, @line]
  end

  ##
  # Returns the current token to the token stream

  def unget
    token = @current_token
    p :unget => token if @debug
    # Guard against returning the same token twice in a row.
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end