1# peg_to_peg.tcl -- 2# 3# Conversion from PEG to PEG (Human readable text). 4# 5# Copyright (c) 2009 Andreas Kupries <andreas_kupries@sourceforge.net> 6# 7# See the file "license.terms" for information on usage and redistribution 8# of this file, and for a DISCLAIMER OF ALL WARRANTIES. 9# 10# RCS: @(#) $Id: pt_peg_to_peg.tcl,v 1.1 2010/03/26 05:07:24 andreas_kupries Exp $ 11 12# This package takes the canonical serialization of a parsing 13# expression grammar and produces text in PEG format, a form of text 14# which specifies a PEG in a human readable, yet formal manner, 15# similar too, but not identical to EBNF. 16 17# ### ### ### ######### ######### ######### 18## Requisites 19 20package require Tcl 8.5 21package require pt::peg ; # Verification that the input 22 # is proper. 23package require pt::pe ; # Walking an expression. 24package require pt::pe::op ; # Flatten & fuse. 25package require text::write ; # Text generation support 26package require textutil::adjust 27package require struct::list 28 29# ### ### ### ######### ######### ######### 30## 31 32namespace eval ::pt::peg::to::peg { 33 namespace export \ 34 reset configure convert 35 36 namespace ensemble create 37} 38 39# ### ### ### ######### ######### ######### 40## API. 41 42proc ::pt::peg::to::peg::reset {} { 43 variable template @code@ 44 variable name a_pe_grammar 45 variable file unknown 46 variable user unknown 47 variable fused 1 48 return 49} 50 51proc ::pt::peg::to::peg::configure {args} { 52 variable template 53 variable name 54 variable file 55 variable user 56 variable fused 57 58 if {[llength $args] == 0} { 59 return [list \ 60 -file $file \ 61 -fused $fused \ 62 -name $name \ 63 -template $template \ 64 -user $user] 65 } elseif {[llength $args] == 1} { 66 lassign $args option 67 set variable [string range $option 1 end] 68 if {[info exists $variable]} { 69 return [set $variable] 70 } else { 71 return -code error "Expected one of -file, -fused, -name, -template, or -user, got \"$option\"" 72 } 73 } elseif {[llength $args] % 2 == 0} { 74 foreach {option value} $args { 75 set variable [string range $option 1 end] 76 if {![info exists $variable]} { 77 return -code error "Expected one of -file, -fused, -name, -template, or -user, got \"$option\"" 78 } 79 } 80 foreach {option value} $args { 81 set variable [string range $option 1 end] 82 switch -exact -- $variable { 83 template { 84 if {$value eq {}} { 85 return -code error "Expected template, got the empty string" 86 } 87 } 88 fused { 89 if {![::string is boolean -strict $value]} { 90 return -code error "Expected boolean, got \"$value\"" 91 } 92 } 93 name - 94 file - 95 user { } 96 } 97 set $variable $value 98 } 99 } else { 100 return -code error {wrong#args, expected option value ...} 101 } 102} 103 104proc ::pt::peg::to::peg::convert {serial} { 105 variable template 106 variable name 107 variable file 108 variable user 109 110 ::pt::peg verify-as-canonical $serial 111 112 # Unpack the serialization, known as canonical 113 array set peg $serial 114 array set peg $peg(pt::grammar::peg) 115 unset peg(pt::grammar::peg) 116 117 # Determine the field sizes for nonterminal symbol names and 118 # semantic modes. 119 120 set smax [text::write maxlen [dict keys $peg(rules)]] 121 set mmax [ModeSize $peg(rules)] 122 123 # Assemble the output, various pieces 124 text::write reset 125 Header $peg(start) 126 Rules $peg(rules) $mmax $smax 127 Trailer 128 129 # At last retrieve the fully assembled result and integrate with 130 # the chosen template. 131 return [string map \ 132 [list \ 133 @user@ $user \ 134 @format@ PEG \ 135 @file@ $file \ 136 @name@ $name \ 137 @code@ [text::write get]] $template] 138 139 # ### ### ### ######### ######### ######### 140} 141 142# ### ### ### ######### ######### ######### 143## Internals 144 145proc ::pt::peg::to::peg::Header {startexpression} { 146 variable name 147 148 text::write field PEG 149 text::write field $name 150 text::write field ([Expression $startexpression]) 151 text::write /line 152 return 153} 154 155proc ::pt::peg::to::peg::Rules {rules mmax smax} { 156 if {[llength $rules]} { text::write /line } 157 158 foreach {symbol def} $rules { 159 lassign $def _ is _ mode 160 set mode [expr {($mode eq "value") 161 ? "" 162 : "${mode}:"}] 163 164 text::write fieldl $mmax $mode 165 text::write fieldl $smax $symbol 166 text::write field "<-" 167 text::write field [Expression $is] 168 text::write field ";" 169 text::write /line 170 } 171 172 if {[llength $rules]} { text::write /line } 173 return 174} 175 176proc ::pt::peg::to::peg::Trailer {} { 177 text::write field {END;} 178 text::write /line 179 return 180} 181 182# ### ### ### ######### ######### ######### 183 184proc ::pt::peg::to::peg::Expression {pe} { 185 variable fused 186 187 if {$fused} { 188 # First flatten for a maximum amount of adjacent terminals and 189 # ranges, then fuse these into strings and classes, then 190 # flatten again, eliminating all sequences and choices fully 191 # subsumed by the new elements. 192 193 set pe [pt::pe::op flatten \ 194 [pt::pe::op fusechars \ 195 [pt::pe::op flatten \ 196 $pe]]] 197 } 198 199 return [lindex [pt::pe bottomup \ 200 [namespace current]::Convert \ 201 $pe] 0] 202} 203 204proc ::pt::peg::to::peg::Convert {pe operator arguments} { 205 # For the inner nodes the each of arguments are a pair of 206 # generated text, and the sub-expression it came from, in this 207 # order. 208 209 switch -exact -- $operator { 210 alpha - alnum - ascii - digit - graph - lower - print - 211 punct - space - upper - wordchar - xdigit - ddigit { 212 # Special forms ... 213 return [list <$operator> $pe] 214 } 215 dot { 216 # Special form ... 217 return [list "." $pe] 218 } 219 epsilon { 220 # Special form ... 221 return [list "" $pe] 222 } 223 t { 224 # Character ... 225 lassign $arguments char 226 return [list "'[Char ${char}]'" $pe] 227 } 228 .. { 229 # Range of characters ... Show as character class. 230 # Note: Canonical input means that an expression like 231 # {.. X X} cannot occur, and can be ignored. 232 233 lassign $arguments chstart chend 234 return [list "\[[Char ${chstart}]-[Char $chend]\]" $pe] 235 } 236 n { 237 # Nonterminal symbol 238 lassign $arguments symbol 239 return [list $symbol $pe] 240 } 241 ? - * - + { 242 # Suffix operators (Option, Kleene Closure, Positive KC) ... 243 lassign $arguments child 244 lassign $child text def 245 lassign $def coperator 246 return [list [MayParens $operator $coperator $text]$operator $pe] 247 } 248 & - 249 ! { 250 # Prefix operators (And/Not Lookahead) ... 251 lassign $arguments child 252 lassign $child text def 253 lassign $def coperator 254 return [list $operator[MayParens $operator $coperator $text] $pe] 255 } 256 x { 257 # Sequences ... 258 # TODO :: merge adjacent chars into strings ... also, cut 259 # x out if only one child 260 261 set t {} 262 set x {} 263 foreach a $arguments { 264 lassign $a text def 265 lassign $def coperator 266 lappend t [MayParens $operator $coperator $text] 267 lappend x $def 268 } 269 return [list [join $t { }] [list x {*}$x]] 270 } 271 / { 272 # Choices ... 273 # TODO :: merge adjacent chars and ranges into classes ... 274 # also, cut / out if only one child 275 276 set t {} 277 set x {} 278 foreach a $arguments { 279 lassign $a text def 280 lassign $def coperator 281 lappend t [MayParens $operator $coperator $text] 282 lappend x $def 283 } 284 return [list [join $t { / }] [list / {*}$x]] 285 } 286 str { 287 return [list \ 288 '[join [struct::list map $arguments \ 289 [namespace current]::Char] {}]' \ 290 $pe] 291 } 292 cl { 293 return [list \ 294 \[[join [struct::list map $arguments \ 295 [namespace current]::Range] {}]\] \ 296 $pe] 297 } 298 } 299} 300 301proc ::pt::peg::to::peg::Range {range} { 302 # See also pt::peg::to::tclparam 303 304 # Use string ops here to distinguish terminals and ranges. The 305 # input can be a single char, not a list, and further the char may 306 # not be a proper list. Example: double-apostroph. 307 if {[string length $range] > 1} { 308 lassign $range s e 309 return [Char $s]-[Char $e] 310 } else { 311 return [Char $range] 312 } 313} 314 315proc ::pt::peg::to::peg::Char {ch} { 316 # Encode a character, handle special cases. We cannot use package 317 # char, as that is geared towards character encoding for Tcl code. 318 319 switch -exact -- $ch { 320 "\n" { return "\\n" } 321 "\r" { return "\\r" } 322 "\t" { return "\\t" } 323 "\\" { return "\\\\" } 324 "\"" { return "\\\"" } 325 "'" { return "\\'" } 326 "\]" { return "\\\]" } 327 "\[" { return "\\\[" } 328 } 329 330 scan $ch %c chcode 331 332 # Control characters: Octal 333 if {[::string is control -strict $ch]} { 334 return \\[format %o $chcode] 335 } 336 337 # Beyond 7-bit ASCII: Unicode 338 339 if {$chcode > 127} { 340 return \\u[format %04x $chcode] 341 } 342 343 # Regular character: Is its own representation. 344 345 return $ch 346 347} 348 349proc ::pt::peg::to::peg::MayParens {op cop text} { 350 if {![NeedParens $op $cop]} { return $text } 351 return "([::textutil::adjust::indent $text " " 1])" 352} 353 354proc ::pt::peg::to::peg::NeedParens {op cop} { 355 variable priority 356 # c(hild)op is nested under op. 357 # Parens are required if cop has a lower priority than op. 358 359 return [expr {$priority($cop) < $priority($op)}] 360} 361 362# ### ### ### ######### ######### ######### 363 364proc ::pt::peg::to::peg::ModeSize {rules} { 365 set modes {} 366 foreach {symbol def} $rules { 367 lassign $def _ is _ mode 368 if {$mode eq "value"} continue ; # These are not shown in the 369 # text representation, as 370 # they are the implicit 371 # default for it. 372 lappend modes ${mode}: 373 } 374 return [text::write maxlen [lsort -uniq $modes]] 375} 376 377# ### ### ### ######### ######### ######### 378## Configuration 379 380namespace eval ::pt::peg::to::peg { 381 382 variable template @code@ ; # A string. Specifies how to 383 # embed the generated code into a 384 # larger frame- work (the 385 # template). 386 variable name a_pe_grammar ; # String. Name of the grammar. 387 variable file unknown ; # String. Name of the file or 388 # other entity the grammar came 389 # from. 390 variable user unknown ; # String. Name of the user on 391 # which behalf the conversion has 392 # been invoked. 393 variable fused 1 ; # Boolean flag. If true character 394 # sequences and choices are fused 395 # into strings and classes. 396 397 variable priority 398 array set priority { 399 / 0 t 4 ascii 4 upper 4 400 x 1 n 4 digit 4 wordchar 4 401 & 2 .. 4 graph 4 xdigit 4 402 ! 2 dot 4 lower 4 ddigit 4 403 + 3 epsilon 4 print 4 str 4 404 * 3 alnum 4 punct 4 cl 4 405 ? 3 alpha 4 space 4 406 } 407} 408 409# ### ### ### ######### ######### ######### 410## Ready 411 412package provide pt::peg::to::peg 1 413return 414