1;;; ebnf-abn.el --- parser for ABNF (Augmented BNF)
2
3;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
4;;   Free Software Foundation, Inc.
5
6;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
8;; Keywords: wp, ebnf, PostScript
9;; Version: 1.1
10
11;; This file is part of GNU Emacs.
12
13;; GNU Emacs is free software; you can redistribute it and/or modify
14;; it under the terms of the GNU General Public License as published by
15;; the Free Software Foundation; either version 2, or (at your option)
16;; any later version.
17
18;; GNU Emacs is distributed in the hope that it will be useful,
19;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21;; GNU General Public License for more details.
22
23;; You should have received a copy of the GNU General Public License
24;; along with GNU Emacs; see the file COPYING.  If not, write to the
25;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26;; Boston, MA 02110-1301, USA.
27
28;;; Commentary:
29
30;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31;;
32;;
33;; This is part of ebnf2ps package.
34;;
35;; This package defines a parser for ABNF (Augmented BNF).
36;;
37;; See ebnf2ps.el for documentation.
38;;
39;;
40;; ABNF Syntax
41;; -----------
42;;
43;;	See the URL:
44;;	`http://www.ietf.org/rfc/rfc2234.txt'
45;;	or
46;;	`http://www.faqs.org/rfcs/rfc2234.html'
47;;	or
48;;	`http://www.rnp.br/ietf/rfc/rfc2234.txt'
49;;	("Augmented BNF for Syntax Specifications: ABNF").
50;;
51;;
52;;    rulelist       =  1*( rule / (*c-wsp c-nl) )
53;;
54;;    rule           =  rulename defined-as elements c-nl
55;;			     ; continues if next line starts with white space
56;;
57;;    rulename       =  ALPHA *(ALPHA / DIGIT / "-")
58;;
59;;    defined-as     =  *c-wsp ("=" / "=/") *c-wsp
60;;			     ; basic rules definition and incremental
61;;			     ; alternatives
62;;
63;;    elements       =  alternation *c-wsp
64;;
65;;    c-wsp          =  WSP / (c-nl WSP)
66;;
67;;    c-nl           =  comment / CRLF
68;;			     ; comment or newline
69;;
70;;    comment        =  ";" *(WSP / VCHAR) CRLF
71;;
72;;    alternation    =  concatenation
73;;			*(*c-wsp "/" *c-wsp concatenation)
74;;
75;;    concatenation  =  repetition *(1*c-wsp repetition)
76;;
77;;    repetition     =  [repeat] element
78;;
79;;    repeat         =  1*DIGIT / (*DIGIT "*" *DIGIT)
80;;
81;;    element        =  rulename / group / option /
82;;			char-val / num-val / prose-val
83;;
84;;    group          =  "(" *c-wsp alternation *c-wsp ")"
85;;
86;;    option         =  "[" *c-wsp alternation *c-wsp "]"
87;;
88;;    char-val       =  DQUOTE *(%x20-21 / %x23-7E) DQUOTE
89;;			     ; quoted string of SP and VCHAR without DQUOTE
90;;
91;;    num-val        =  "%" (bin-val / dec-val / hex-val)
92;;
93;;    bin-val        =  "b" 1*BIT
94;;			[ 1*("." 1*BIT) / ("-" 1*BIT) ]
95;;			     ; series of concatenated bit values
96;;			     ; or single ONEOF range
97;;
98;;    dec-val        =  "d" 1*DIGIT
99;;			[ 1*("." 1*DIGIT) / ("-" 1*DIGIT) ]
100;;
101;;    hex-val        =  "x" 1*HEXDIG
102;;			[ 1*("." 1*HEXDIG) / ("-" 1*HEXDIG) ]
103;;
104;;    prose-val      =  "<" *(%x20-3D / %x3F-7E) ">"
105;;				; bracketed string of SP and VCHAR without
106;;				; angles
107;;				; prose description, to be used as last resort
108;;
109;;    ; Core rules -- the coding depends on the system, here is used 7-bit ASCII
110;;
111;;    ALPHA          =  %x41-5A / %x61-7A
112;;				; A-Z / a-z
113;;
114;;    BIT            =  "0" / "1"
115;;
116;;    CHAR           =  %x01-7F
117;;				; any 7-bit US-ASCII character, excluding NUL
118;;
119;;    CR             =  %x0D
120;;				; carriage return
121;;
122;;    CRLF           =  CR LF
123;;				; Internet standard newline
124;;
125;;    CTL            =  %x00-1F / %x7F
126;;				; controls
127;;
128;;    DIGIT          =  %x30-39
129;;				; 0-9
130;;
131;;    DQUOTE         =  %x22
132;;				; " (Double Quote)
133;;
134;;    HEXDIG         =  DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
135;;
136;;    HTAB           =  %x09
137;;				; horizontal tab
138;;
139;;    LF             =  %x0A
140;;				; linefeed
141;;
142;;    LWSP           =  *(WSP / CRLF WSP)
143;;				; linear white space (past newline)
144;;
145;;    OCTET          =  %x00-FF
146;;				; 8 bits of data
147;;
148;;    SP             =  %x20
149;;				; space
150;;
151;;    VCHAR          =  %x21-7E
152;;				; visible (printing) characters
153;;
154;;    WSP            =  SP / HTAB
155;;				; white space
156;;
157;;
158;; NOTES:
159;;
160;; 1. Rules name and terminal strings are case INSENSITIVE.
161;;    So, the following rule names are all equals:
162;;	 Rule-name,  rule-Name, rule-name, RULE-NAME
163;;    Also, the following strings are equals:
164;;	 "abc", "ABC", "aBc", "Abc", "aBC", etc.
165;;
166;; 2. To have a case SENSITIVE string, use the character notation.
167;;    For example, to specify the lowercase string "abc", use:
168;;	 %d97.98.99
169;;
170;; 3. There are no implicit spaces between elements, for example, the
171;;    following rules:
172;;
173;;	 foo = %x61  ; a
174;;
175;;	 bar = %x62  ; b
176;;
177;;	 mumble = foo bar foo
178;;
179;;    Are equivalent to the following rule:
180;;
181;;	 mumble = %x61.62.61
182;;
183;;    If spaces are needed, it should be explicit specified, like:
184;;
185;;	spaces = 1*(%x20 / %x09)  ; one or more spaces or tabs
186;;
187;;	mumble = foo spaces bar spaces foo
188;;
189;; 4. Lines starting with space or tab are considered a continuation line.
190;;    For example, the rule:
191;;
192;;	 rule = foo
193;;	        bar
194;;
195;;    Is equivalent to:
196;;
197;;	 rule = foo bar
198;;
199;;
200;; Differences Between ABNF And ebnf2ps ABNF
201;; -----------------------------------------
202;;
203;; Besides the characters that ABNF accepts, ebnf2ps ABNF accepts also the
204;; underscore (_) for rule name and european 8-bit accentuated characters (from
205;; \240 to \377) for rule name, string and comment.
206;;
207;;
208;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
209
210;;; Code:
211
212
213(require 'ebnf-otz)
214
215
216(defvar ebnf-abn-lex nil
217  "Value returned by `ebnf-abn-lex' function.")
218
219
220;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
221;; Syntactic analyzer
222
223
224;;;    rulelist       =  1*( rule / (*c-wsp c-nl) )
225
226(defun ebnf-abn-parser (start)
227  "ABNF parser."
228  (let ((total (+ (- ebnf-limit start) 1))
229	(bias (1- start))
230	(origin (point))
231	rule-list token rule)
232    (goto-char start)
233    (setq token (ebnf-abn-lex))
234    (and (eq token 'end-of-input)
235	 (error "Invalid ABNF file format"))
236    (and (eq token 'end-of-rule)
237	 (setq token (ebnf-abn-lex)))
238    (while (not (eq token 'end-of-input))
239      (ebnf-message-float
240       "Parsing...%s%%"
241       (/ (* (- (point) bias) 100.0) total))
242      (setq token (ebnf-abn-rule token)
243	    rule  (cdr token)
244	    token (car token))
245      (or (ebnf-add-empty-rule-list rule)
246	  (setq rule-list (cons rule rule-list))))
247    (goto-char origin)
248    rule-list))
249
250
251;;;    rule           =  rulename defined-as elements c-nl
252;;;			     ; continues if next line starts with white space
253;;;
254;;;    rulename       =  ALPHA *(ALPHA / DIGIT / "-")
255;;;
256;;;    defined-as     =  *c-wsp ("=" / "=/") *c-wsp
257;;;			     ; basic rules definition and incremental
258;;;			     ; alternatives
259;;;
260;;;    elements       =  alternation *c-wsp
261;;;
262;;;    c-wsp          =  WSP / (c-nl WSP)
263;;;
264;;;    c-nl           =  comment / CRLF
265;;;			     ; comment or newline
266;;;
267;;;    comment        =  ";" *(WSP / VCHAR) CRLF
268
269
270(defun ebnf-abn-rule (token)
271  (let ((name ebnf-abn-lex)
272	(action ebnf-action)
273	elements)
274    (setq ebnf-action nil)
275    (or (eq token 'non-terminal)
276	(error "Invalid rule name"))
277    (setq token (ebnf-abn-lex))
278    (or (memq token '(equal incremental-alternative))
279	(error "Invalid rule: missing `=' or `=/'"))
280    (and (eq token 'incremental-alternative)
281	 (setq name (concat name " =/")))
282    (setq elements (ebnf-abn-alternation))
283    (or (memq (car elements) '(end-of-rule end-of-input))
284	(error "Invalid rule: there is no end of rule"))
285    (setq elements (cdr elements))
286    (ebnf-eps-add-production name)
287    (cons (ebnf-abn-lex)
288	  (ebnf-make-production name elements action))))
289
290
291;;;    alternation    =  concatenation
292;;;			 *(*c-wsp "/" *c-wsp concatenation)
293
294
295(defun ebnf-abn-alternation ()
296  (let (body concatenation)
297    (while (eq (car (setq concatenation
298			  (ebnf-abn-concatenation (ebnf-abn-lex))))
299	       'alternative)
300      (setq body (cons (cdr concatenation) body)))
301    (ebnf-token-alternative body concatenation)))
302
303
304;;;    concatenation  =  repetition *(1*c-wsp repetition)
305
306
307(defun ebnf-abn-concatenation (token)
308  (let ((term (ebnf-abn-repetition token))
309	seq)
310    (or (setq token (car term)
311	      term  (cdr term))
312	(error "Empty element"))
313    (setq seq (cons term seq))
314    (while (setq term  (ebnf-abn-repetition token)
315		 token (car term)
316		 term  (cdr term))
317      (setq seq (cons term seq)))
318    (cons token
319	  (ebnf-token-sequence seq))))
320
321
322;;;    repetition     =  [repeat] element
323;;;
324;;;    repeat         =  1*DIGIT / (*DIGIT "*" *DIGIT)
325
326
327(defun ebnf-abn-repetition (token)
328  (let (lower upper)
329    ;; INTEGER [ "*" [ INTEGER ] ]
330    (when (eq token 'integer)
331      (setq lower ebnf-abn-lex
332	    token (ebnf-abn-lex))
333      (or (eq token 'repeat)
334	  (setq upper lower)))
335    ;; "*" [ INTEGER ]
336    (when (eq token 'repeat)
337      ;; only * ==> lower & upper are empty string
338      (or lower
339	  (setq lower ""
340		upper ""))
341      (when (eq (setq token (ebnf-abn-lex)) 'integer)
342	(setq upper ebnf-abn-lex
343	      token (ebnf-abn-lex))))
344    (let ((element (ebnf-abn-element token)))
345      (cond
346       ;; there is a repetition
347       (lower
348	(or element
349	    (error "Missing element repetition"))
350	(setq token (ebnf-abn-lex))
351	(cond
352	 ;; one or more
353	 ((and (string= lower "1") (null upper))
354	  (cons token (ebnf-make-one-or-more element)))
355	 ;; zero or more
356	 ((or (and (string= lower "0") (null upper))
357	      (and (string= lower "") (string= upper "")))
358	  (cons token (ebnf-make-zero-or-more element)))
359	 ;; real repetition
360	 (t
361	  (ebnf-token-repeat lower (cons token element) upper))))
362       ;; there is an element
363       (element
364	(cons (ebnf-abn-lex) element))
365       ;; something that caller has to deal
366       (t
367	(cons token nil))))))
368
369
370;;;    element        =  rulename / group / option /
371;;;			char-val / num-val / prose-val
372;;;
373;;;    group          =  "(" *c-wsp alternation *c-wsp ")"
374;;;
375;;;    option         =  "[" *c-wsp alternation *c-wsp "]"
376;;;
377;;;    char-val       =  DQUOTE *(%x20-21 / %x23-7E) DQUOTE
378;;;			     ; quoted string of SP and VCHAR without DQUOTE
379;;;
380;;;    num-val        =  "%" (bin-val / dec-val / hex-val)
381;;;
382;;;    bin-val        =  "b" 1*BIT
383;;;			[ 1*("." 1*BIT) / ("-" 1*BIT) ]
384;;;			     ; series of concatenated bit values
385;;;			     ; or single ONEOF range
386;;;
387;;;    dec-val        =  "d" 1*DIGIT
388;;;			[ 1*("." 1*DIGIT) / ("-" 1*DIGIT) ]
389;;;
390;;;    hex-val        =  "x" 1*HEXDIG
391;;;			[ 1*("." 1*HEXDIG) / ("-" 1*HEXDIG) ]
392;;;
393;;;    prose-val      =  "<" *(%x20-3D / %x3F-7E) ">"
394;;;				; bracketed string of SP and VCHAR without
395;;;				; angles
396;;;				; prose description, to be used as last resort
397
398
399(defun ebnf-abn-element (token)
400  (cond
401   ;; terminal
402   ((eq token 'terminal)
403    (ebnf-make-terminal ebnf-abn-lex))
404   ;; non-terminal
405   ((eq token 'non-terminal)
406    (ebnf-make-non-terminal ebnf-abn-lex))
407   ;; group
408   ((eq token 'begin-group)
409    (let ((body (ebnf-abn-alternation)))
410      (or (eq (car body) 'end-group)
411	  (error "Missing `)'"))
412      (cdr body)))
413   ;; optional
414   ((eq token 'begin-optional)
415    (let ((body (ebnf-abn-alternation)))
416      (or (eq (car body) 'end-optional)
417	  (error "Missing `]'"))
418      (ebnf-token-optional (cdr body))))
419   ;; no element
420   (t
421    nil)
422   ))
423
424
425;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
426;; Lexical analyzer
427
428
429(defconst ebnf-abn-token-table (make-vector 256 'error)
430  "Vector used to map characters to a lexical token.")
431
432
433(defun ebnf-abn-initialize ()
434  "Initialize EBNF token table."
435  ;; control character & control 8-bit character are set to `error'
436  (let ((char ?\060))
437    ;; digits: 0-9
438    (while (< char ?\072)
439      (aset ebnf-abn-token-table char 'integer)
440      (setq char (1+ char)))
441    ;; printable character: A-Z
442    (setq char ?\101)
443    (while (< char ?\133)
444      (aset ebnf-abn-token-table char 'non-terminal)
445      (setq char (1+ char)))
446    ;; printable character: a-z
447    (setq char ?\141)
448    (while (< char ?\173)
449      (aset ebnf-abn-token-table char 'non-terminal)
450      (setq char (1+ char)))
451    ;; European 8-bit accentuated characters:
452    (setq char ?\240)
453    (while (< char ?\400)
454      (aset ebnf-abn-token-table char 'non-terminal)
455      (setq char (1+ char)))
456    ;; Override end of line characters:
457    (aset ebnf-abn-token-table ?\n 'end-of-rule) ; [NL] linefeed
458    (aset ebnf-abn-token-table ?\r 'end-of-rule) ; [CR] carriage return
459    ;; Override space characters:
460    (aset ebnf-abn-token-table ?\013 'space) ; [VT] vertical tab
461    (aset ebnf-abn-token-table ?\t   'space) ; [HT] horizontal tab
462    (aset ebnf-abn-token-table ?\    'space) ; [SP] space
463    ;; Override form feed character:
464    (aset ebnf-abn-token-table ?\f 'form-feed) ; [FF] form feed
465    ;; Override other lexical characters:
466    (aset ebnf-abn-token-table ?<  'non-terminal)
467    (aset ebnf-abn-token-table ?%  'terminal)
468    (aset ebnf-abn-token-table ?\" 'terminal)
469    (aset ebnf-abn-token-table ?\( 'begin-group)
470    (aset ebnf-abn-token-table ?\) 'end-group)
471    (aset ebnf-abn-token-table ?*  'repeat)
472    (aset ebnf-abn-token-table ?=  'equal)
473    (aset ebnf-abn-token-table ?\[ 'begin-optional)
474    (aset ebnf-abn-token-table ?\] 'end-optional)
475    (aset ebnf-abn-token-table ?/  'alternative)
476    ;; Override comment character:
477    (aset ebnf-abn-token-table ?\; 'comment)))
478
479
480;; replace the range "\240-\377" (see `ebnf-range-regexp').
481(defconst ebnf-abn-non-terminal-chars
482  (ebnf-range-regexp "-_0-9A-Za-z" ?\240 ?\377))
483(defconst ebnf-abn-non-terminal-letter-chars
484  (ebnf-range-regexp "A-Za-z" ?\240 ?\377))
485
486
487(defun ebnf-abn-lex ()
488  "Lexical analyzer for ABNF.
489
490Return a lexical token.
491
492See documentation for variable `ebnf-abn-lex'."
493  (if (>= (point) ebnf-limit)
494      'end-of-input
495    (let (token)
496      ;; skip spaces and comments
497      (while (if (> (following-char) 255)
498		 (progn
499		   (setq token 'error)
500		   nil)
501	       (setq token (aref ebnf-abn-token-table (following-char)))
502	       (cond
503		((eq token 'space)
504		 (skip-chars-forward " \013\t" ebnf-limit)
505		 (< (point) ebnf-limit))
506		((eq token 'comment)
507		 (ebnf-abn-skip-comment))
508		((eq token 'form-feed)
509		 (forward-char)
510		 (setq ebnf-action 'form-feed))
511		((eq token 'end-of-rule)
512		 (ebnf-abn-skip-end-of-rule))
513		(t nil)
514		)))
515      (cond
516       ;; end of input
517       ((>= (point) ebnf-limit)
518	'end-of-input)
519       ;; error
520       ((eq token 'error)
521	(error "Invalid character"))
522       ;; end of rule
523       ((eq token 'end-of-rule)
524	'end-of-rule)
525       ;; integer
526       ((eq token 'integer)
527	(setq ebnf-abn-lex (ebnf-buffer-substring "0-9"))
528	'integer)
529       ;; terminal: "string" or %[bdx]NNN((.NNN)+|-NNN)?
530       ((eq token 'terminal)
531	(setq ebnf-abn-lex
532	      (if (= (following-char) ?\")
533		  (ebnf-abn-string)
534		(ebnf-abn-character)))
535	'terminal)
536       ;; non-terminal: NAME or <NAME>
537       ((eq token 'non-terminal)
538	(let ((prose-p (= (following-char) ?<)))
539	  (when prose-p
540	    (forward-char)
541	    (or (looking-at ebnf-abn-non-terminal-letter-chars)
542		(error "Invalid prose value")))
543	  (setq ebnf-abn-lex
544		(ebnf-buffer-substring ebnf-abn-non-terminal-chars))
545	  (when prose-p
546	    (or (= (following-char) ?>)
547		(error "Invalid prose value"))
548	    (setq ebnf-abn-lex (concat "<" ebnf-abn-lex ">"))))
549	'non-terminal)
550       ;; equal: =, =/
551       ((eq token 'equal)
552	(forward-char)
553	(if (/= (following-char) ?/)
554	    'equal
555	  (forward-char)
556	  'incremental-alternative))
557       ;; miscellaneous: (, ), [, ], /, *
558       (t
559	(forward-char)
560	token)
561       ))))
562
563
564(defun ebnf-abn-skip-end-of-rule ()
565  (let (eor-p)
566    (while (progn
567	     ;; end of rule ==> 2 or more consecutive end of lines
568	     (setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1)
569			     eor-p))
570	     ;; skip spaces
571	     (skip-chars-forward " \013\t" ebnf-limit)
572	     ;; skip comments
573	     (and (= (following-char) ?\;)
574		  (ebnf-abn-skip-comment))))
575    (not eor-p)))
576
577
578;; replace the range "\177-\237" (see `ebnf-range-regexp').
579(defconst ebnf-abn-comment-chars
580  (ebnf-range-regexp "^\n\000-\010\016-\037" ?\177 ?\237))
581
582
583(defun ebnf-abn-skip-comment ()
584  (forward-char)
585  (cond
586   ;; open EPS file
587   ((and ebnf-eps-executing (= (following-char) ?\[))
588    (ebnf-eps-add-context (ebnf-abn-eps-filename)))
589   ;; close EPS file
590   ((and ebnf-eps-executing (= (following-char) ?\]))
591    (ebnf-eps-remove-context (ebnf-abn-eps-filename)))
592   ;; any other action in comment
593   (t
594    (setq ebnf-action (aref ebnf-comment-table (following-char)))
595    (skip-chars-forward ebnf-abn-comment-chars ebnf-limit))
596   )
597  ;; check for a valid end of comment
598  (cond ((>= (point) ebnf-limit)
599	 nil)
600	((= (following-char) ?\n)
601	 t)
602	(t
603	 (error "Invalid character"))
604	))
605
606
607(defun ebnf-abn-eps-filename ()
608  (forward-char)
609  (ebnf-buffer-substring ebnf-abn-comment-chars))
610
611
612;; replace the range "\240-\377" (see `ebnf-range-regexp').
613(defconst ebnf-abn-string-chars
614  (ebnf-range-regexp " -!#-~" ?\240 ?\377))
615
616
617(defun ebnf-abn-string ()
618  (buffer-substring-no-properties
619   (progn
620     (forward-char)
621     (point))
622   (progn
623     (skip-chars-forward ebnf-abn-string-chars ebnf-limit)
624     (or (= (following-char) ?\")
625	 (error "Missing `\"'"))
626     (prog1
627	 (point)
628       (forward-char)))))
629
630
631(defun ebnf-abn-character ()
632  ;; %[bdx]NNN((-NNN)|(.NNN)+)?
633  (buffer-substring-no-properties
634   (point)
635   (progn
636     (forward-char)
637     (let* ((char  (following-char))
638	    (chars (cond ((or (= char ?B) (= char ?b)) "01")
639			 ((or (= char ?D) (= char ?d)) "0-9")
640			 ((or (= char ?X) (= char ?x)) "0-9A-Fa-f")
641			 (t (error "Invalid terminal value")))))
642       (forward-char)
643       (or (> (skip-chars-forward chars ebnf-limit) 0)
644	   (error "Invalid terminal value"))
645       (if (= (following-char) ?-)
646	   (progn
647	     (forward-char)
648	     (or (> (skip-chars-forward chars ebnf-limit) 0)
649		 (error "Invalid terminal value range")))
650	 (while (= (following-char) ?.)
651	   (forward-char)
652	   (or (> (skip-chars-forward chars ebnf-limit) 0)
653	       (error "Invalid terminal value")))))
654     (point))))
655
656
657;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
658
659
660(provide 'ebnf-abn)
661
662;;; arch-tag: 8d1b3c4d-4226-4393-b9ae-b7ccf07cf779
663;;; ebnf-abn.el ends here
664