klacks parser

This commit is contained in:
dlichteblau
2007-02-11 18:21:20 +00:00
parent aac4cb719c
commit 0596a0e63d
7 changed files with 649 additions and 82 deletions

View File

@ -83,4 +83,6 @@
#:make-namespace-normalizer
#:make-whitespace-normalizer
#:rod-to-utf8-string
#:utf8-string-to-rod))
#:utf8-string-to-rod
#:make-source))

View File

@ -68,11 +68,11 @@
;; :stag (<name> . <atts>) ;start tag
;; :etag (<name> . <atts>) ;end tag
;; :ztag (<name> . <atts>) ;empty tag
;; :<!element
;; :<!entity
;; :<!attlist
;; :<!notation
;; :<!doctype
;; :<!ELEMENT
;; :<!ENTITY
;; :<!ATTLIST
;; :<!NOTATION
;; :<!DOCTYPE
;; :<![
;; :comment <content>
@ -194,11 +194,13 @@
(defvar *expand-pe-p* nil)
(defparameter *namespace-bindings*
(defparameter *initial-namespace-bindings*
'((#"" . nil)
(#"xmlns" . #"http://www.w3.org/2000/xmlns/")
(#"xml" . #"http://www.w3.org/XML/1998/namespace")))
(defparameter *namespace-bindings* *initial-namespace-bindings*)
;;;; ---------------------------------------------------------------------------
;;;; xstreams
;;;;
@ -2571,22 +2573,16 @@
:main-zstream input
:entity-resolver entity-resolver
:disallow-internal-subset disallow-internal-subset))
(*validate* validate))
(*validate* validate)
(*namespace-bindings* *initial-namespace-bindings*))
(sax:start-document handler)
;; document ::= XMLDecl? Misc* (doctypedecl Misc*)? element Misc*
;; Misc ::= Comment | PI | S
;; xmldecl::='<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
;; sddecl::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
;;
;; we will use the attribute-value parser for the xml decl.
(let ((*data-behaviour* :DTD))
;; optional XMLDecl?
(cond ((eq (peek-token input) :xml-decl)
(let ((hd (parse-xml-decl (cdr (nth-value 1 (peek-token input))))))
(setf (standalone-p *ctx*) (eq (xml-header-standalone-p hd) :yes))
(setup-encoding input hd))
(read-token input)))
(set-full-speed input)
(p/xmldecl input)
;; Misc*
(p/misc*-2 input)
;; (doctypedecl Misc*)?
@ -2595,13 +2591,7 @@
(p/doctype-decl input dtd)
(p/misc*-2 input))
(dtd
(let ((dummy (string->xstream "<!DOCTYPE dummy>")))
(setf (xstream-name dummy)
(make-stream-name
:entity-name "dummy doctype"
:entity-kind :main
:uri (zstream-base-sysid input)))
(p/doctype-decl (make-zstream :input-stack (list dummy)) dtd)))
(synthesize-doctype dtd input))
((and validate (not dtd))
(validity-error "invalid document: no doctype")))
(ensure-dtd)
@ -2610,28 +2600,65 @@
(setf (model-stack *ctx*) (list (make-root-model root))))
;; element
(let ((*data-behaviour* :DOC))
(when (eq (peek-token input) :seen-<)
(multiple-value-bind (c s)
(read-token-after-|<| input (car (zstream-input-stack input)))
(setf (zstream-token-category input) c
(zstream-token-semantic input) s)))
(fix-seen-< input)
(p/element input))
;; optional Misc*
(p/misc*-2 input)
(unless (eq (peek-token input) :eof)
(wf-error input "Garbage at end of document."))
(when *validate*
(maphash (lambda (k v)
(unless v
(validity-error "(11) IDREF: ~S not defined" (rod-string k))))
(id-table *ctx*))
(dolist (name (referenced-notations *ctx*))
(unless (find-notation name (dtd *ctx*))
(validity-error "(23) Notation Declared: ~S" (rod-string name)))))
(p/eof input)
(sax:end-document handler))))
(defun synthesize-doctype (dtd input)
(let ((dummy (string->xstream "<!DOCTYPE dummy>")))
(setf (xstream-name dummy)
(make-stream-name
:entity-name "dummy doctype"
:entity-kind :main
:uri (zstream-base-sysid input)))
(p/doctype-decl (make-zstream :input-stack (list dummy)) dtd)))
(defun fix-seen-< (input)
(when (eq (peek-token input) :seen-<)
(multiple-value-bind (c s)
(read-token-after-|<| input (car (zstream-input-stack input)))
(setf (zstream-token-category input) c
(zstream-token-semantic input) s))))
(defun p/xmldecl (input)
;; we will use the attribute-value parser for the xml decl.
(prog1
(when (eq (peek-token input) :xml-decl)
(let ((hd (parse-xml-decl (cdr (nth-value 1 (peek-token input))))))
(setf (standalone-p *ctx*) (eq (xml-header-standalone-p hd) :yes))
(setup-encoding input hd)
(read-token input)
hd))
(set-full-speed input)))
(defun p/eof (input)
(unless (eq (peek-token input) :eof)
(wf-error input "Garbage at end of document."))
(when *validate*
(maphash (lambda (k v)
(unless v
(validity-error "(11) IDREF: ~S not defined" (rod-string k))))
(id-table *ctx*))
(dolist (name (referenced-notations *ctx*))
(unless (find-notation name (dtd *ctx*))
(validity-error "(23) Notation Declared: ~S" (rod-string name))))))
(defun p/element (input)
(multiple-value-bind (cat n-b new-b uri lname qname attrs) (p/sztag input)
(sax:start-element (handler *ctx*) uri lname qname attrs)
(when (eq cat :stag)
(let ((*namespace-bindings* n-b))
(p/content input))
(p/etag input qname))
(sax:end-element (handler *ctx*) uri lname qname)
(undeclare-namespaces new-b)
(validate-end-element *ctx* qname)))
(defun p/sztag (input)
(multiple-value-bind (cat sem) (read-token input)
(case cat
((:stag :ztag))
@ -2657,28 +2684,39 @@
(setf attrs
(remove-if (compose #'xmlns-attr-p #'sax:attribute-qname)
attrs)))
(cond
((eq cat :ztag)
(sax:start-element (handler *ctx*) uri local-name name attrs)
(sax:end-element (handler *ctx*) uri local-name name))
((eq cat :stag)
(sax:start-element (handler *ctx*) uri local-name name attrs)
(p/content input)
(multiple-value-bind (cat2 sem2) (read-token input)
(unless (and (eq cat2 :etag)
(eq (car sem2) name))
(wf-error input "Bad nesting. ~S / ~S"
(mu name)
(mu (cons cat2 sem2))))
(when (cdr sem2)
(wf-error input "no attributes allowed in end tag")))
(sax:end-element (handler *ctx*) uri local-name name))
(t
(wf-error input "Expecting element, got ~S." cat))))
(undeclare-namespaces new-namespaces))
(validate-end-element *ctx* name))))
(values cat
*namespace-bindings*
new-namespaces
uri local-name name attrs))))))
(defun p/etag (input qname)
(multiple-value-bind (cat2 sem2) (read-token input)
(unless (and (eq cat2 :etag)
(eq (car sem2) qname))
(wf-error input "Bad nesting. ~S / ~S"
(mu qname)
(mu (cons cat2 sem2))))
(when (cdr sem2)
(wf-error input "no attributes allowed in end tag"))))
(defun process-characters (input sem)
(consume-token input)
(when (search #"]]>" sem)
(wf-error input "']]>' not allowed in CharData"))
(validate-characters *ctx* sem))
(defun process-cdata-section (input)
(consume-token input)
(let ((input (car (zstream-input-stack input))))
(unless (and (rune= #/C (read-rune input))
(rune= #/D (read-rune input))
(rune= #/A (read-rune input))
(rune= #/T (read-rune input))
(rune= #/A (read-rune input))
(rune= #/\[ (read-rune input)))
(wf-error input "After '<![', 'CDATA[' is expected."))
(validate-characters *ctx* #"hack") ;anything other than whitespace
(read-cdata-sect input)))
(defun p/content (input)
;; [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)*
@ -2688,10 +2726,7 @@
(p/element input)
(p/content input))
((:CDATA)
(consume-token input)
(when (search #"]]>" sem)
(wf-error input "']]>' not allowed in CharData"))
(validate-characters *ctx* sem)
(process-characters input sem)
(sax:characters (handler *ctx*) sem)
(p/content input))
((:ENTITY-REF)
@ -2709,21 +2744,11 @@
(peek-token input))))))
(p/content input))))
((:<!\[)
(consume-token input)
(cons
(let ((input (car (zstream-input-stack input))))
(unless (and (rune= #/C (read-rune input))
(rune= #/D (read-rune input))
(rune= #/A (read-rune input))
(rune= #/T (read-rune input))
(rune= #/A (read-rune input))
(rune= #/\[ (read-rune input)))
(wf-error input "After '<![', 'CDATA[' is expected."))
(validate-characters *ctx* #"hack") ;anything other than whitespace
(sax:start-cdata (handler *ctx*))
(sax:characters (handler *ctx*) (read-cdata-sect input))
(sax:end-cdata (handler *ctx*)))
(p/content input)))
(let ((data (process-cdata-section input)))
(sax:start-cdata (handler *ctx*))
(sax:characters (handler *ctx*) data)
(sax:end-cdata (handler *ctx*)))
(p/content input))
((:PI)
(consume-token input)
(sax:processing-instruction (handler *ctx*) (car sem) (cdr sem))