1# -*- tcl -*- 2# Copyright (c) 2009 Andreas Kupries <andreas_kupries@sourceforge.net> 3 4# Recursive descent parser for Tcl commands embedded in a string. (=> 5# subst -novariables, without actual evaluation of the embedded 6# commands). Useful for processing templates, etc. The result is an 7# abstract syntax tree of strings and commands, which in turn have 8# strings and commands as arguments. 9 10# The tree can be processed further. The nodes of the tree are 11# annotated with line/column/offset information to allow later stages 12# the reporting of higher-level syntax and semantic errors with exact 13# locations in the input. 14 15# TODO :: Add ability to report progress through the 16# TODO :: input. Callback. Invoked in 'Initialize', 'Step', and 17# TODO :: 'Finalize'. 18 19# TODO :: Investigate possibility of using tclparser package 20# TODO :: ('parser') to handle the command pieces embedded in the 21# TODO :: text. 22 23# # ## ### ##### ######## ############# ##################### 24## Requirements 25 26package require Tcl 8.4 ; # Required runtime. 27package require snit ; # OO system. 28package require fileutil ; # File utilities. 29package require logger ; # User feedback. 30package require struct::list ; # Higher-order list operations. 31package require struct::stack ; # Stacks 32package require struct::set ; # Finite sets 33package require treeql ; # Tree queries and transformation. 34 35# # ## ### ##### ######## ############# ##################### 36## 37 38logger::initNamespace ::doctools::tcl::parse 39snit::type ::doctools::tcl::parse { 40 # # ## ### ##### ######## ############# 41 ## Public API 42 43 typemethod file {t path {root {}}} { 44 $type text $t [fileutil::cat -translation binary -encoding binary $path] $root 45 } 46 47 typemethod text {t text {root {}}} { 48 # --- --- --- --------- --------- --------- 49 # Phase 1. Lexical processing. 50 # The resulting tree contains the raw tokens. See 51 # below for the specification of the resulting tree 52 # structure. 53 # 54 # This part is a recursive descent parser using Tcl's 12 rules 55 # for processing the input. Note: Variable references are not 56 # recognized, they are processed like regular text. 57 58 Initialize $t $text $root 59 String 60 Finalize 61 62 # Tree structure 63 # - All nodes but the root have the attributes 'type', 'range', 'line', and 'col'. 64 # 65 # * 'type' in { Command, Text, Backslash, Word, Quote, Continuation, QBrace } 66 # * 'range' is 2-element list (offset start, offset end) 67 # * 'line' is integer number > 0 68 # * 'col' is integer number >= 0 69 # 70 # 'type' specifies what sort of token the node contains. 71 # 72 # 'range' is the location of the token as offsets in 73 # characters from the beginning of the string, for 74 # first and last character in the token. EOL markers 75 # count as one character. This can be empty. 76 # 77 # 'line', 'col' are the location of the first character 78 # AFTER the token, as the line and column the character is 79 # on and at. 80 # 81 # Meaning of the various node types 82 # 83 # Command .... : A command begins here, the text in the range 84 # .............. is the opening bracket. 85 # Text ....... : A text segment in a word, anything up to the 86 # .............. beginning of a backslash sequence or of an 87 # .............. embedded command. 88 # Backslash .. : A backslash sequence. The text under the 89 # .............. range is the whole sequence. 90 # Word ....... : The beginning of an unquoted, quoted or 91 # .............. braced word. The text under the range is the 92 # .............. opening quote or brace, if any. The range is 93 # .............. empty for an unquoted word. 94 # Quote ...... : An embedded double-quote character which is 95 # .............. not the end of a quoted string (a special 96 # .............. type of backslash sequence). The range is the 97 # .............. whole sequence. 98 # Continuation : A continuation line in an unquoted, quoted, 99 # .............. or braced string. The range covers the whole 100 # .............. sequence, including the whitespace trailing 101 # .............. it. 102 # QBrace ..... : A quoted brace in a braced string. A special 103 # .............. kind of backslash sequence. The range covers 104 # .............. the whole sequence. 105 106 # --- --- --- --------- --------- --------- 107 # Phase 2. Convert the token tree into a syntax tree. 108 # This phase simplifies the tree by converting and 109 # eliminating special tokens, and further decouples 110 # it from the input by storing the relevant string 111 # ranges of the input in the tree. For the the 112 # specification of the resulting structure see method 113 # 'Verify'. 114 # 115 # The sub-phases are and do 116 # 117 # (a) Extract the string information from the input and store 118 # them in their Text tokens. 119 # (b) Convert the special tokens (QBrace, Backslash, Quote, 120 # Continuation) into equivalent 'Text' tokens, with proper 121 # string information. 122 # (c) Merge adjacent 'Text' tokens. 123 # (d) Remove irrelevant 'Word' tokens. These are tokens with a 124 # single Text token as child. Word tokens without children 125 # however represent empty strings. They are converted into 126 # an equivalent Text node instead. 127 # (e) Pull the first word of commands into the command token, 128 # and ensure that it is not dynamic, i.e not an embedded 129 # command. 130 131 ShowTree $t "Raw tree" 132 133 set q [treeql %AUTO% -tree $t] 134 135 # (a) 136 foreach n [$q query tree withatt type Text] { 137 struct::list assign [$t get $n range] a e 138 #$t unset $n range 139 $t set $n text [string range $mydata $a $e] 140 } 141 ShowTree $t "Text annotation" 142 143 # (b1) 144 foreach n [$q query tree withatt type QBrace] { 145 struct::list assign [$t get $n range] a e 146 incr a ; # Skip backslash 147 #$t unset $n range 148 $t set $n text [string range $mydata $a $e] 149 $t set $n type Text 150 } 151 ShowTree $t "Special conversion 1, quoted braces" 152 153 # (b2) 154 foreach n [$q query tree withatt type Backslash] { 155 struct::list assign [$t get $n range] a e 156 #$t unset $n range 157 $t set $n text [subst -nocommands -novariables [string range $mydata $a $e]] 158 159 #puts <'[string range $mydata $a $e]'> 160 #puts _'[subst -nocommands -novariables [string range $mydata $a $e]]'_ 161 162 $t set $n type Text 163 } 164 ShowTree $t "Special conversion 2, backslash sequences" 165 166 # (b3) 167 foreach n [$q query tree withatt type Quote] { 168 #$t unset $n range 169 $t set $n text "\"" 170 $t set $n type Text 171 } 172 ShowTree $t "Special conversion 3, quoted double quotes" 173 174 # (b4) 175 foreach n [$q query tree withatt type Continuation] { 176 #$t unset $n range 177 $t set $n text { } 178 $t set $n type Text 179 } 180 ShowTree $t "Special conversion 4, continuation lines" 181 182 # (c) 183 foreach n [$q query tree withatt type Text right withatt type Text] { 184 set left [$t previous $n] 185 $t append $left text [$t get $n text] 186 187 # Extend covered range. Copy location. 188 struct::list assign [$t get $left range] a _ 189 struct::list assign [$t get $n range] _ e 190 $t set $left range [list $a $e] 191 $t set $left line [$t get $n line] 192 $t set $left col [$t get $n col] 193 194 $t delete $n 195 } 196 ShowTree $t "Merged adjacent texts" 197 198 # (d) 199 foreach n [$q query tree withatt type Word] { 200 if {![$t numchildren $n]} { 201 $t set $n type Text 202 $t set $n text {} 203 } elseif {[$t numchildren $n] == 1} { 204 $t cut $n 205 } 206 } 207 ShowTree $t "Dropped simple words" 208 209 # (e) 210 foreach n [$q query tree withatt type Command] { 211 set first [lindex [$t children $n] 0] 212 if {[$t get $first type] eq "Word"} { 213 error {Dynamic command name} 214 } 215 $t set $n text [$t get $first text] 216 $t set $n range [$t get $first range] 217 $t set $n line [$t get $first line] 218 $t set $n col [$t get $first col] 219 $t delete $first 220 } 221 ShowTree $t "Command lifting" 222 223 $q destroy 224 225 Verify $t 226 return 227 } 228 229 proc Verify {t} { 230 # Tree structure ... 231 # Attributes Values 232 # - type string in {'Command','Text','Word'} (phase 2) 233 # - range 2-tuple (integer, integer), can be empty. start and end offset of the word in the input string. 234 # - line integer, line the node starts on. First line is 1 235 # - col integer, column the node starts on (#char since start of line, first char is 0) 236 # Constraints 237 # .(i) The root node has no attributes at all. 238 # .(ii) The children of the root are Command and Text nodes in semi-alternation. 239 # I.e.: After a Text node a Command has to follow. 240 # After a Command node either Text or Command can follow. 241 # .(iii) The children of a Command node are Text, Word, and Command nodes, the command arguments. If any. 242 # .(iv) The children of a Word node are Command and Text nodes in semi-alternation. 243 # .(v) All Text nodes are leafs. 244 # .(vi) Any Command node can be a leaf. 245 # .(vii) Word nodes cannot be leafs. 246 # .(viii) All non-root nodes have the attributes 'type', 'range', 'col', and 'line'. 247 248 foreach n [$t nodes] { 249 if {[$t parent $n] eq ""} { 250 # (ii) 251 set last {} 252 foreach c [$t children $n] { 253 set type [$t get $c type] 254 if {![struct::set contains {Command Text} $type]} { 255 return -code error "$c :: Bad node type $type in child of root node" 256 } elseif {($type eq $last) && ($last eq "Text")} { 257 return -code error "$c :: Bad node $type, not semi-alternating" 258 } 259 set last $type 260 } 261 # (i) 262 if {[llength [$t getall $n]]} { 263 return -code error "$n :: Bad root node, has attributes, should not" 264 } 265 continue 266 } else { 267 # (viii) 268 foreach k {range line col} { 269 if {![$t keyexists $n $k]} { 270 return -code error "$n :: Bad node, attribute '$k' missing" 271 } 272 } 273 } 274 set type [$t get $n type] 275 switch -exact -- $type { 276 Command { 277 # (vi) 278 # No need to check children. May have some or not, 279 # and no specific sequence is required. 280 } 281 Word { 282 # (vii) 283 if {![llength [$t children $n]]} { 284 return -code error "$n :: Bad word node is leaf" 285 } 286 # (iv) 287 set last {} 288 foreach c [$t children $n] { 289 set type [$t get $c type] 290 if {![struct::set contains {Command Text} $type]} { 291 return -code error "$n :: Bad node type $type in word node" 292 } elseif {($type eq $last) && ($last eq "Text")} { 293 return -code error "$c :: Bad node $type, not semi-alternating" 294 } 295 set last $type 296 } 297 } 298 Text { 299 # (v) 300 if {[llength [$t children $n]]} { 301 return -code error "$n :: Bad text node is not leaf" 302 } 303 } 304 default { 305 # (iii) 306 return -code error "$n :: Bad node type $type" 307 } 308 } 309 } 310 return 311 } 312 313 # # ## ### ##### ######## ############# 314 ## Internal methods, lexical processing 315 316 proc String {} { 317 while 1 { 318 Note @String 319 if {[EOF]} break 320 if {[Command]} continue 321 if {[TextSegment]} continue 322 if {[Backslash]} continue 323 324 Stop ;# Unexpected character 325 } 326 Note @EOF 327 return 328 } 329 330 proc Command {} { 331 # A command starts with an opening bracket. 332 Note ?Command 333 if {![Match "\\A(\\\[)" range]} { 334 Note \t%No-Command 335 return 0 336 } 337 Note !Command 338 339 PushRoot [Node Command $range] 340 while {[Word]} { 341 # Step over any whitespace after the last word 342 Whitespace 343 # Command ends at the closing bracket 344 if {[Match "\\A(\\])" range]} break 345 if {![EOF]} continue 346 347 Stop ;# Unexpected end of input 348 } 349 350 Note !CommandStop 351 PopRoot 352 return 1 353 } 354 355 proc TextSegment {} { 356 # A text segment is anything up to a command start or start of 357 # a back slash sequence. 358 Note ?TextSegment 359 if {![Match "\\A(\[^\\\[\]+)" range]} { 360 Note \t%No-TextSegment 361 return 0 362 } 363 Note !TextSegment 364 Node Text $range 365 return 1 366 } 367 368 proc TextSegmentWithoutQuote {} { 369 Note ?TextSegmentWithoutQuote 370 # A text segment without quote is anything up to a command 371 # start or start of a back slash sequence, or a double-quote 372 # character. 373 if {![Match "\\A(\[^\"\\\\\[\]+)" range]} { 374 Note \t%No-TextSegmentWithoutQuote 375 return 0 376 } 377 Note !TextSegment 378 Node Text $range 379 return 1 380 } 381 382 proc Backslash {} { 383 Note ?Backslash 384 if { 385 ![Match "\\A(\\\\x\[a-fA-F0-9\]+)" range] && 386 ![Match "\\A(\\\\u\[a-fA-F0-9\]{1,4})" range] && 387 ![Match "\\A(\\\\\[0-2\]\[0-7\]{2})" range] && 388 ![Match "\\A(\\\\\[0-7\]{1,2})" range] && 389 ![Match {\A(\\[abfnrtv])} range] 390 } { 391 Note \t%No-Backslash 392 return 0 393 } 394 Note !Backslash 395 Node Backslash $range 396 return 1 397 } 398 399 proc Word {} { 400 Note ?Word 401 if {[QuotedWord]} {return 1} 402 if {[BracedWord 0]} {return 1} 403 return [UnquotedWord] 404 } 405 406 proc Whitespace {} { 407 Note ?Whitespace 408 if {![Match {\A([ \t]|(\\\n[ \t]*))+} range]} { 409 Note \t%No-Whitespace 410 return 0 411 } 412 Note !Whitespace 413 return 1 414 } 415 416 proc QuotedWord {} { 417 # A quoted word starts with a double quote. 418 Note ?QuotedWord 419 if {![Match "\\A(\")" range]} { 420 Note \t%No-QuotedWord 421 return 0 422 } 423 Note !QuotedWord 424 PushRoot [Node Word $range] 425 QuotedString 426 PopRoot 427 return 1 428 } 429 430 proc BracedWord {keepclose} { 431 # A braced word starts with an opening brace. 432 Note ?BracedWord/$keepclose 433 if {![Match "\\A(\{)" range]} { 434 Note \t%No-BracedWord/$keepclose 435 return 0 436 } 437 Note !BracedWord/$keepclose 438 PushRoot [Node Word $range] 439 BracedString $keepclose 440 PopRoot 441 return 1 442 } 443 444 proc UnquotedWord {} { 445 Note !UnquotedWord 446 PushRoot [Node Word {}] 447 UnquotedString 448 PopRoot 449 return 1 450 } 451 452 proc QuotedString {} { 453 Note !QuotedString 454 while 1 { 455 Note !QuotedStringPart 456 # A quoted word (and thus the embedded string) ends with 457 # double quote. 458 if {[Match "\\A(\")" range]} { 459 return 460 } 461 # Now try to match possible pieces of the string. This is 462 # a repetition of the code in 'String', except for the 463 # different end condition above, and the possible embedded 464 # double quotes and continuation lines the outer string 465 # can ignore. 466 if {[Command]} continue 467 if {[Quote]} continue 468 if {[QuotedBraces]} continue 469 if {[Continuation]} continue 470 if {[Backslash]} continue 471 # Check after backslash recognition and processing 472 if {[TextSegmentWithoutQuote]} continue 473 474 Stop ;# Unexpected character or end of input 475 } 476 return 477 } 478 479 proc BracedString {keepclose} { 480 while 1 { 481 Note !BracedStringPart 482 # Closing brace encountered. keepclose is set if we are in 483 # a nested braced string. Only then do we have to put the 484 # brace as a regular text piece into the string 485 if {[Match "\\A(\})" range]} { 486 if {$keepclose} { 487 Node Text $range 488 } 489 return 490 } 491 # Special sequences. 492 if {[QuotedBraces]} continue 493 if {[Continuation]} continue 494 if {[BracedWord 1]} continue 495 # A backslash without a brace coming after is regular a 496 # character. 497 if {[Match {\A(\\)} range]} { 498 Node Text $range 499 continue 500 } 501 # Gooble sequence of regular characters. Stops at 502 # backslash and braces. Backslash stop is needed to handle 503 # the case of them starting a quoted brace. 504 if {[Match {\A([^\\\{\}]*)} range]} { 505 Node Text $range 506 continue 507 } 508 Stop ;# Unexpected character or end of input. 509 } 510 } 511 512 proc UnquotedString {} { 513 while 1 { 514 Note !UnquotedStringPart 515 # Stop conditions 516 # - end of string 517 # - whitespace 518 # - Closing bracket (end of command the word is in) 519 if {[EOF]} return 520 if {[Whitespace]} return 521 if {[Peek "\\A(\\\])" range]} return 522 523 # Match each possible type of part 524 if {[Command]} continue 525 if {[Quote]} continue 526 if {[Continuation]} continue 527 if {[Backslash]} continue 528 # Last, capture backslash sequences first. 529 if {[UnquotedTextSegment]} continue 530 531 Stop ;# Unexpected character or end of input. 532 } 533 return 534 } 535 536 proc UnquotedTextSegment {} { 537 # All chars but whitespace and brackets (start or end of 538 # command). 539 Note ?UnquotedTextSegment 540 if {![Match {\A([^\]\[\t\n ]+)} range]} { 541 Note \t%No-UnquotedTextSegment 542 return 0 543 } 544 Note !UnquotedTextSegment 545 Node Text $range 546 return 1 547 } 548 549 proc Quote {} { 550 Note ?EmdeddedQuote 551 if {![Match "\\A(\\\")" range]} { 552 Note \t%No-EmdeddedQuote 553 return 0 554 } 555 # Embedded double quote, not the end of the quoted string. 556 Note !EmdeddedQuote 557 Node Quote $range 558 return 1 559 } 560 561 proc Continuation {} { 562 Note ?ContinuationLine 563 if {![Match "\\A(\\\\\n\[ \t\]*)" range]} { 564 Note \t%No-ContinuationLine 565 return 0 566 } 567 Note !ContinuationLine 568 Node Continuation $range 569 return 1 570 } 571 572 proc QuotedBraces {} { 573 Note ?QuotedBrace 574 if { 575 ![Match "\\A(\\\\\{)" range] && 576 ![Match "\\A(\\\\\})" range] 577 } { 578 Note \t%No-QuotedBrace 579 return 0 580 } 581 Note !QuotedBrace 582 Node QBrace $range 583 return 1 584 } 585 586 # # ## ### ##### ######## ############# 587 ## Tree construction helper commands. 588 589 proc Node {what range} { 590 set n [lindex [$mytree insert $myroot end] 0] 591 592 Note "+\tNode $n @ $myroot $what" 593 594 $mytree set $n type $what 595 $mytree set $n range $range 596 $mytree set $n line $myline 597 $mytree set $n col $mycol 598 599 return $n 600 } 601 602 proc PushRoot {x} { 603 Note "Push Root = $x" 604 $myrootstack push $myroot 605 set myroot $x 606 return 607 } 608 609 proc PopRoot {} { 610 set myroot [$myrootstack pop] 611 Note "Pop Root = $myroot" 612 return 613 } 614 615 # # ## ### ##### ######## ############# 616 ## Error reporting 617 618 proc Stop {} { 619 ::variable myerr 620 set ahead [string range $mydata $mypos [expr {$mypos + 30}]] 621 set err [expr {![string length $ahead] ? "eof" : "char"}] 622 set ahead [string map [list \n \\n \t \\t \r \\r] [string range $ahead 0 0]] 623 set caller [lindex [info level -1] 0] 624 set msg "[format $myerr($err) $ahead $caller] at line ${myline}.$mycol" 625 set err [list doctools::tcl::parse $err $mypos $myline $mycol] 626 627 return -code error -errorcode $err $msg 628 } 629 630 # # ## ### ##### ######## ############# 631 ## Input processing. Match/peek lexemes, update location after 632 ## stepping over a range. Match = Peek + Step. 633 634 proc EOF {} { 635 Note "?EOF($mypos >= $mysize) = [expr {$mypos >= $mysize}]" 636 return [expr {$mypos >= $mysize}] 637 } 638 639 proc Match {pattern rv} { 640 upvar 1 $rv range 641 set ok [Peek $pattern range] 642 if {$ok} {Step $range} 643 return $ok 644 } 645 646 proc Peek {pattern rv} { 647 upvar 1 $rv range 648 649 Note Peek($pattern)----|[string map [list "\n" "\\n" "\t" "\\t"] [string range $mydata $mypos [expr {$mypos + 30}]]]| 650 651 if {[regexp -start $mypos -indices -- $pattern $mydata -> range]} { 652 Note \tOK 653 return 1 654 } else { 655 Note \tFAIL 656 return 0 657 } 658 } 659 660 proc Step {range} { 661 struct::list assign $range a e 662 663 set mylastpos $mypos 664 665 set mypos $e 666 incr mypos 667 668 set pieces [split [string range $mydata $a $e] \n] 669 set delta [string length [lindex $pieces end]] 670 set nlines [expr {[llength $pieces] - 1}] 671 672 if {$nlines} { 673 incr myline $nlines 674 set mycol $delta 675 } else { 676 incr mycol $delta 677 } 678 return 679 } 680 681 # # ## ### ##### ######## ############# 682 ## Setup / Shutdown of parser/lexer 683 684 proc Initialize {t text root} { 685 set mytree $t 686 if {$root eq {}} { 687 set myroot [$t rootname] 688 } else { 689 set myroot $root 690 } 691 692 if {$myrootstack ne {}} Finalize 693 set myrootstack [struct::stack %AUTO%] 694 $myrootstack clear 695 696 set mydata $text 697 set mysize [string length $mydata] 698 699 set mypos 0 700 set myline 1 701 set mycol 0 702 return 703 } 704 705 proc Finalize {} { 706 $myrootstack destroy 707 set myrootstack {} 708 return 709 } 710 711 # # ## ### ##### ######## ############# 712 ## Debugging helper commands 713 ## Add ability to disable these. 714 ## For the tree maybe add ability to dump through a callback ? 715 716 proc Note {text} { 717 upvar 1 range range 718 set m {} 719 append m "$text " 720 if {[info exists range]} { 721 append m "($range) " 722 if {$range != {}} { 723 foreach {a e} $range break 724 append m " = \"[string map [list "\n" "\\n" "\t" "\\t"] \ 725 [string range $mydata $a $e]]\"" 726 } 727 } else { 728 append m "@$mypos ($myline/$mycol)" 729 } 730 #log::debug $m 731 puts $m 732 return 733 } 734 735 #proc ShowTreeX {args} {} 736 proc ShowTreeX {t x} { 737 puts "=== \[ $x \] [string repeat = [expr {72 - [string length $x] - 9}]]" 738 $t walk root -order pre -type dfs n { 739 set prefix [string repeat .... [$t depth $n]] 740 puts "$prefix$n <[DictSort [$t getall $n]]>" 741 } 742 return 743 } 744 745 proc Note {args} {} 746 proc ShowTree {args} {} 747 748 # # ## ### ##### ######## ############# 749 750 proc DictSort {dict} { 751 array set tmp $dict 752 set res {} 753 foreach k [lsort -dict [array names tmp]] { 754 lappend res $k $tmp($k) 755 } 756 return $res 757 } 758 759 # # ## ### ##### ######## ############# 760 ## Parser state 761 762 typevariable mytree {} ; # Tree we are working on 763 typevariable myroot {} ; # Current root to add nodes to. 764 typevariable myrootstack {} 765 766 typevariable mydata {} ; # String to parse. 767 typevariable mysize 0 ; # Length of string to parse, cache 768 769 typevariable mylastpos ; # Last current position. 770 typevariable mypos 0 ; # Current parse location, offset from 771 typevariable myline 1 ; # the beginning of the string, line 772 typevariable mycol 0 ; # we are on, and the column within the 773 # line. 774 775 typevariable myerr -array { 776 char {Unexpected character '%1$s' in %2$s} 777 eof {Unexpected end of input in %2$s} 778 } 779 780 781 # # ## ### ##### ######## ############# 782 ## Configuration 783 784 pragma -hasinstances no ; # singleton 785 pragma -hastypeinfo no ; # no introspection 786 pragma -hastypedestroy no ; # immortal 787 788 ## 789 # # ## ### ##### ######## ############# 790} 791 792namespace eval ::doctools::tcl { 793 namespace export parse 794} 795 796# # ## ### ##### ######## ############# ##################### 797## Ready 798 799package provide doctools::tcl::parse 0.1 800return 801