Files
CXML/xml/xml-parse.lisp
dlichteblau df50f2547d Fixed (parse xstream ...).
* xml/xml-parse.lisp (PARSE): Call parse-xstream, not
        make-xstream.
2007-10-14 18:40:26 +00:00

3733 lines
134 KiB
Common Lisp

;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CXML; readtable: runes; -*-
;;; ---------------------------------------------------------------------------
;;; Title: XML parser
;;; Created: 1999-07-17
;;; Author: Gilbert Baumann <unk6@rz.uni-karlsruhe.de>
;;; Author: Henrik Motakef <hmot@henrik-motakef.de>
;;; Author: David Lichteblau <david@lichteblau.com>
;;; License: Lisp-LGPL (See file COPYING for details).
;;; ---------------------------------------------------------------------------
;;; (c) copyright 1999 by Gilbert Baumann
;;; (c) copyright 2003 by Henrik Motakef
;;; (c) copyright 2004 knowledgeTools Int. GmbH
;;; (c) copyright 2004 David Lichteblau
;;; (c) copyright 2005 David Lichteblau
;;; This library is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Library General Public
;;; License as published by the Free Software Foundation; either
;;; version 2 of the License, or (at your option) any later version.
;;;
;;; This library is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; Library General Public License for more details.
;;;
;;; You should have received a copy of the GNU Library General Public
;;; License along with this library; if not, write to the
;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;;; Boston, MA 02111-1307 USA.
;;; Streams
;;; xstreams
;; For reading runes, I defined my own streams, called xstreams,
;; because we want to be fast. A function call or even a method call
;; per character is not acceptable, instead of that we define a
;; buffered stream with and advertised buffer layout, so that we
;; could use the trick stdio uses: READ-RUNE and PEEK-RUNE are macros,
;; directly accessing the buffer and only calling some underflow
;; handler in case of stream underflows. This will yield to quite a
;; performance boost vs calling READ-BYTE per character.
;; Also we need to do encoding t conversion on ; this better done at large chunks of data rather than on a character
;; by character basis. This way we need a dispatch on the active
;; encoding only once in a while, instead of for each character. This
;; allows us to use a CLOS interface to do the underflow handling.
;;; zstreams
;; Now, for reading tokens, we define another kind of streams, called
;; zstreams. These zstreams also maintain an input stack to implement
;; inclusion of external entities. This input stack contains xstreams
;; or the special marker :STOP. Such a :STOP marker indicates, that
;; input should not continue there, but well stop; that is simulate an
;; EOF. The user is then responsible to pop this marker off the input
;; stack.
;;
;; This input stack is also used to detect circular entity inclusion.
;; The zstream tokenizer recognizes the following types of tokens and
;; is controlled by the *DATA-BEHAVIOUR* flag. (Which should become a
;; slot of zstreams instead).
;; Common
;; :xml-decl (<target> . <content>) ;processing-instruction starting with "<?xml"
;; :pi (<target> . <content>) ;processing-instruction
;; :stag (<name> . <atts>) ;start tag
;; :etag (<name> . <atts>) ;end tag
;; :ztag (<name> . <atts>) ;empty tag
;; :<!ELEMENT
;; :<!ENTITY
;; :<!ATTLIST
;; :<!NOTATION
;; :<!DOCTYPE
;; :<![
;; :comment <content>
;; *data-behaviour* = :DTD
;;
;; :nmtoken <interned-rod>
;; :#required
;; :#implied
;; :#fixed
;; :#pcdata
;; :s
;; :\[ :\] :\( :\) :|\ :\> :\" :\' :\, :\? :\* :\+
;; *data-behaviour* = :DOC
;;
;; :entity-ref <interned-rod>
;; :cdata <rod>
;;; TODO
;;
;; o provide for a faster DOM
;;
;; o morph zstream into a context object and thus also get rid of
;; special variables. Put the current DTD there too.
;; [partly done]
;; o the *scratch-pad* hack should become something much more
;; reentrant, we could either define a system-wide resource
;; or allocate some scratch-pads per context.
;; [for thread-safety reasons the array are allocated per context now,
;; reentrancy is still open]
;; o CR handling in utf-16 deocders
;;
;; o UCS-4 reader
;;
;; o max depth together with circle detection
;; (or proof, that our circle detection is enough).
;; [gemeint ist zstream-push--david]
;;
;; o better extensibility wrt character representation, one may want to
;; have
;; - UCS-4 in vectoren
;;
;; o xstreams auslagern, documententieren und dann auch in SGML und
;; CSS parser verwenden. (halt alles was zeichen liest).
;; [ausgelagert sind sie; dokumentiert "so la la"; die Reintegration
;; in Closure ist ein ganz anderes Thema]
;;
;; o recording of source locations for nodes.
;;
;; o based on the DTD and xml:space attribute implement HTML white
;; space rules.
;;
;; o on a parser option, do not expand external entities.
;;;; Validity constraints:
;;;; (00) Root Element Type like (03), c.f. MAKE-ROOT-MODEL
;;;; (01) Proper Declaration/PE Nesting P/MARKUP-DECL
;;;; (02) Standalone Document Declaration all over the place [*]
;;;; (03) Element Valid VALIDATE-*-ELEMENT, -CHARACTERS
;;;; (04) Attribute Value Type VALIDATE-ATTRIBUTE
;;;; (05) Unique Element Type Declaration DEFINE-ELEMENT
;;;; (06) Proper Group/PE Nesting P/CSPEC
;;;; (07) No Duplicate Types LEGAL-CONTENT-MODEL-P
;;;; (08) ID VALIDATE-ATTRIBUTE
;;;; (09) One ID per Element Type DEFINE-ATTRIBUTE
;;;; (10) ID Attribute Default DEFINE-ATTRIBUTE
;;;; (11) IDREF VALIDATE-ATTRIBUTE, P/DOCUMENT
;;;; (12) Entity Name VALIDATE-ATTRIBUTE
;;;; (13) Name Token VALIDATE-ATTRIBUTE
;;;; (14) Notation Attributes VALIDATE-ATTRIBUTE, P/ATT-TYPE
;;;; (15) One Notation Per Element Type DEFINE-ATTRIBUTE
;;;; (16) No Notation on Empty Element DEFINE-ELEMENT, -ATTRIBUTE
;;;; (17) Enumeration VALIDATE-ATTRIBUTE
;;;; (18) Required Attribute PROCESS-ATTRIBUTES
;;;; (19) Attribute Default Legal DEFINE-ATTRIBUTE
;;;; (20) Fixed Attribute Default VALIDATE-ATTRIBUTE
;;;; (21) Proper Conditional Section/PE Nesting P/CONDITIONAL-SECT, ...
;;;; (22) Entity Declared [**]
;;;; (23) Notation Declared P/ENTITY-DEF, P/DOCUMENT
;;;; (24) Unique Notation Name DEFINE-NOTATION
;;;;
;;;; [*] Perhaps we could revert the explicit checks of (02), if we did
;;;; _not_ read external subsets of standalone documents when parsing in
;;;; validating mode. Violations of VC (02) constraints would then appear as
;;;; wellformedness violations, right?
;;;;
;;;; [**] Although I haven't investigated this properly yet, I believe that
;;;; we check this VC together with the WFC even in non-validating mode.
(in-package :cxml)
#+allegro
(setf (excl:named-readtable :runes) *readtable*)
(eval-when (:compile-toplevel :load-toplevel :execute)
(defparameter *fast* '(optimize (speed 3) (safety 0)))
;;(defparameter *fast* '(optimize (speed 2) (safety 3)))
)
;;; parser context
(defvar *ctx* nil)
(defstruct (context (:conc-name nil))
handler
(dtd nil)
model-stack
;; xml:base machen wir fuer klacks mal gleich als expliziten stack:
base-stack
(referenced-notations '())
(id-table (%make-rod-hash-table))
;; FIXME: Wofuer ist name-hashtable da? Will man das wissen?
(name-hashtable (make-rod-hashtable :size 2000))
(standalone-p nil)
(entity-resolver nil)
(disallow-internal-subset nil)
main-zstream)
(defvar *expand-pe-p* nil)
(defparameter *initial-namespace-bindings*
'((#"" . nil)
(#"xmlns" . #"http://www.w3.org/2000/xmlns/")
(#"xml" . #"http://www.w3.org/XML/1998/namespace")))
(defparameter *namespace-bindings* *initial-namespace-bindings*)
;;;; ---------------------------------------------------------------------------
;;;; xstreams
;;;;
(defstruct (stream-name
(:print-function print-stream-name))
entity-name
entity-kind
uri)
(defun print-stream-name (object stream depth)
(declare (ignore depth))
(format stream "[~A ~S ~A]"
(rod-string (stream-name-entity-name object))
(stream-name-entity-kind object)
(stream-name-uri object)))
(deftype read-element () 'rune)
(defun call-with-open-xstream (fn stream)
(unwind-protect
(funcall fn stream)
(close-xstream stream)))
(defmacro with-open-xstream ((var value) &body body)
`(call-with-open-xstream (lambda (,var) ,@body) ,value))
(defun call-with-open-xfile (continuation &rest open-args)
(let ((input (apply #'open (car open-args) :element-type '(unsigned-byte 8) (cdr open-args))))
(unwind-protect
(progn
(funcall continuation (make-xstream input)))
(close input))))
(defmacro with-open-xfile ((stream &rest open-args) &body body)
`(call-with-open-xfile (lambda (,stream) .,body) .,open-args))
;;;; -------------------------------------------------------------------
;;;; Rechnen mit Runen
;;;;
;; Let us first define fast fixnum arithmetric get rid of type
;; checks. (After all we know what we do here).
(defmacro fx-op (op &rest xs)
`(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))))
(defmacro fx-pred (op &rest xs)
`(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))
(defmacro %+ (&rest xs) `(fx-op + ,@xs))
(defmacro %- (&rest xs) `(fx-op - ,@xs))
(defmacro %* (&rest xs) `(fx-op * ,@xs))
(defmacro %/ (&rest xs) `(fx-op floor ,@xs))
(defmacro %and (&rest xs) `(fx-op logand ,@xs))
(defmacro %ior (&rest xs) `(fx-op logior ,@xs))
(defmacro %xor (&rest xs) `(fx-op logxor ,@xs))
(defmacro %ash (&rest xs) `(fx-op ash ,@xs))
(defmacro %mod (&rest xs) `(fx-op mod ,@xs))
(defmacro %= (&rest xs) `(fx-pred = ,@xs))
(defmacro %<= (&rest xs) `(fx-pred <= ,@xs))
(defmacro %>= (&rest xs) `(fx-pred >= ,@xs))
(defmacro %< (&rest xs) `(fx-pred < ,@xs))
(defmacro %> (&rest xs) `(fx-pred > ,@xs))
;;; XXX Geschwindigkeit dieser Definitionen untersuchen!
(defmacro rune-op (op &rest xs)
`(code-rune (,op ,@(mapcar (lambda (x) `(rune-code ,x)) xs))))
(defmacro rune-pred (op &rest xs)
`(,op ,@(mapcar (lambda (x) `(rune-code ,x)) xs)))
(defmacro %rune+ (&rest xs) `(rune-op + ,@xs))
(defmacro %rune- (&rest xs) `(rune-op - ,@xs))
(defmacro %rune* (&rest xs) `(rune-op * ,@xs))
(defmacro %rune/ (&rest xs) `(rune-op floor ,@xs))
(defmacro %rune-and (&rest xs) `(rune-op logand ,@xs))
(defmacro %rune-ior (&rest xs) `(rune-op logior ,@xs))
(defmacro %rune-xor (&rest xs) `(rune-op logxor ,@xs))
(defmacro %rune-ash (a b) `(code-rune (ash (rune-code ,a) ,b)))
(defmacro %rune-mod (&rest xs) `(rune-op mod ,@xs))
(defmacro %rune= (&rest xs) `(rune-pred = ,@xs))
(defmacro %rune<= (&rest xs) `(rune-pred <= ,@xs))
(defmacro %rune>= (&rest xs) `(rune-pred >= ,@xs))
(defmacro %rune< (&rest xs) `(rune-pred < ,@xs))
(defmacro %rune> (&rest xs) `(rune-pred > ,@xs))
;;;; ---------------------------------------------------------------------------
;;;; rod hashtable
;;;;
;;; make-rod-hashtable
;;; rod-hash-get hashtable rod &optional start end -> value ; successp
;;; (setf (rod-hash-get hashtable rod &optional start end) new-value
;;;
(defstruct (rod-hashtable (:constructor make-rod-hashtable/low))
size ;size of table
table ;
)
(defun make-rod-hashtable (&key (size 200))
(setf size (nearest-greater-prime size))
(make-rod-hashtable/low
:size size
:table (make-array size :initial-element nil)))
(eval-when (:compile-toplevel :load-toplevel :execute)
(defconstant +fixnum-bits+
(1- (integer-length most-positive-fixnum))
"Pessimistic approximation of the number of bits of fixnums.")
(defconstant +fixnum-mask+
(1- (expt 2 +fixnum-bits+))
"Pessimistic approximation of the largest bit-mask, still being a fixnum."))
(definline stir (a b)
(%and +fixnum-mask+
(%xor (%ior (%ash (%and a #.(ash +fixnum-mask+ -5)) 5)
(%ash a #.(- 5 +fixnum-bits+)))
b)))
(definline rod-hash (rod start end)
"Compute a hash code out of a rod."
(let ((res (%- end start)))
(do ((i start (%+ i 1)))
((%= i end))
(declare (type fixnum i))
(setf res (stir res (rune-code (%rune rod i)))))
res))
(definline rod=* (x y &key (start1 0) (end1 (length x))
(start2 0) (end2 (length y)))
(and (%= (%- end1 start1) (%- end2 start2))
(do ((i start1 (%+ i 1))
(j start2 (%+ j 1)))
((%= i end1)
t)
(unless (rune= (%rune x i) (%rune y j))
(return nil)))))
(definline rod=** (x y start1 end1 start2 end2)
(and (%= (%- end1 start1) (%- end2 start2))
(do ((i start1 (%+ i 1))
(j start2 (%+ j 1)))
((%= i end1)
t)
(unless (rune= (%rune x i) (%rune y j))
(return nil)))))
(defun rod-hash-get (hashtable rod &optional (start 0) (end (length rod)))
(declare (type (simple-array rune (*)) rod))
(let ((j (%mod (rod-hash rod start end)
(rod-hashtable-size hashtable))))
(dolist (q (svref (rod-hashtable-table hashtable) j)
(values nil nil nil))
(declare (type cons q))
(when (rod=** (car q) rod 0 (length (the (simple-array rune (*)) (car q))) start end)
(return (values (cdr q) t (car q)))))))
(defun rod-hash-set (new-value hashtable rod &optional (start 0) (end (length rod)))
(let ((j (%mod (rod-hash rod start end)
(rod-hashtable-size hashtable)))
(key nil))
(dolist (q (svref (rod-hashtable-table hashtable) j)
(progn
(setf key (rod-subseq* rod start end))
(push (cons key new-value)
(aref (rod-hashtable-table hashtable) j))))
(when (rod=* (car q) rod :start2 start :end2 end)
(setf key (car q))
(setf (cdr q) new-value)
(return)))
(values new-value key)))
#-rune-is-character
(defun rod-subseq* (source start &optional (end (length source)))
(unless (and (typep start 'fixnum) (>= start 0))
(error "~S is not a non-negative fixnum." start))
(unless (and (typep end 'fixnum) (>= end start))
(error "END argument, ~S, is not a fixnum no less than START, ~S." end start))
(when (> start (length source))
(error "START argument, ~S, should be no greater than length of rod." start))
(when (> end (length source))
(error "END argument, ~S, should be no greater than length of rod." end))
(locally
(declare (type fixnum start end))
(let ((res (make-rod (- end start))))
(declare (type rod res))
(do ((i (- (- end start) 1) (the fixnum (- i 1))))
((< i 0) res)
(declare (type fixnum i))
(setf (%rune res i) (aref source (the fixnum (+ i start))))))))
#+rune-is-character
(defun rod-subseq* (source start &optional (end (length source)))
(subseq source start end))
(deftype ufixnum () `(unsigned-byte ,(integer-length most-positive-fixnum)))
#-rune-is-character
(defun rod-subseq** (source start &optional (end (length source)))
(declare (type (simple-array rune (*)) source)
(type ufixnum start)
(type ufixnum end)
(optimize (speed 3) (safety 0)))
(let ((res (make-array (%- end start) :element-type 'rune)))
(declare (type (simple-array rune (*)) res))
(let ((i (%- end start)))
(declare (type ufixnum i))
(loop
(setf i (- i 1))
(when (= i 0)
(return))
(setf (%rune res i) (%rune source (the ufixnum (+ i start))))))
res))
#+rune-is-character
(defun rod-subseq** (source start &optional (end (length source)))
(subseq source start end))
(defun (setf rod-hash-get) (new-value hashtable rod &optional (start 0) (end (length rod)))
(rod-hash-set new-value hashtable rod start end))
(defun intern-name (rod &optional (start 0) (end (length rod)))
(multiple-value-bind (value successp key) (rod-hash-get (name-hashtable *ctx*) rod start end)
(declare (ignore value))
(if successp
key
(nth-value 1 (rod-hash-set t (name-hashtable *ctx*) rod start end)))))
;;;; ---------------------------------------------------------------------------
;;;;
;;;; rod collector
;;;;
(defvar *scratch-pad*)
(defvar *scratch-pad-2*)
(defvar *scratch-pad-3*)
(defvar *scratch-pad-4*)
(declaim (type (simple-array rune (*))
*scratch-pad* *scratch-pad-2* *scratch-pad-3* *scratch-pad-4*))
(defmacro with-scratch-pads ((&optional) &body body)
`(let ((*scratch-pad* (make-array 1024 :element-type 'rune))
(*scratch-pad-2* (make-array 1024 :element-type 'rune))
(*scratch-pad-3* (make-array 1024 :element-type 'rune))
(*scratch-pad-4* (make-array 1024 :element-type 'rune)))
,@body))
(defmacro %put-unicode-char (code-var put)
`(progn
(cond ((%> ,code-var #xFFFF)
(,put (the rune (code-rune (%+ #xD7C0 (%ash ,code-var -10)))))
(,put (the rune (code-rune (%ior #xDC00 (%and ,code-var #x03FF))))))
(t
(,put (code-rune ,code-var))))))
(defun adjust-array-by-copying (old-array new-size)
"Adjust an array by copying and thus ensures, that result is a SIMPLE-ARRAY."
(let ((res (make-array new-size :element-type (array-element-type old-array))))
(replace res old-array
:start1 0 :end1 (length old-array)
:start2 0 :end2 (length old-array))
res))
(defmacro with-rune-collector-aux (scratch collect body mode)
(let ((rod (gensym))
(n (gensym))
(i (gensym))
(b (gensym)))
`(let ((,n (length ,scratch))
(,i 0)
(,b ,scratch))
(declare (type fixnum ,n ,i))
(macrolet
((,collect (x)
`((lambda (x)
(locally
(declare #.*fast*)
(when (%>= ,',i ,',n)
(setf ,',n (* 2 ,',n))
(setf ,',b
(setf ,',scratch
(adjust-array-by-copying ,',scratch ,',n))))
(setf (aref (the (simple-array rune (*)) ,',b) ,',i) x)
(incf ,',i)))
,x)))
,@body
,(ecase mode
(:intern
`(intern-name ,b 0 ,i))
(:copy
`(let ((,rod (make-rod ,i)))
(while (not (%= ,i 0))
(setf ,i (%- ,i 1))
(setf (%rune ,rod ,i)
(aref (the (simple-array rune (*)) ,b) ,i)))
,rod))
(:raw
`(values ,b 0 ,i))
)))))
'(defmacro with-rune-collector-aux (scratch collect body mode)
(let ((rod (gensym))
(n (gensym))
(i (gensym))
(b (gensym)))
`(let ((,n (length ,scratch))
(,i 0))
(declare (type fixnum ,n ,i))
(macrolet
((,collect (x)
`((lambda (x)
(locally
(declare #.*fast*)
(when (%>= ,',i ,',n)
(setf ,',n (* 2 ,',n))
(setf ,',scratch
(setf ,',scratch
(adjust-array-by-copying ,',scratch ,',n))))
(setf (aref (the (simple-array rune (*)) ,',scratch) ,',i) x)
(incf ,',i)))
,x)))
,@body
,(ecase mode
(:intern
`(intern-name ,scratch 0 ,i))
(:copy
`(let ((,rod (make-rod ,i)))
(while (%> ,i 0)
(setf ,i (%- ,i 1))
(setf (%rune ,rod ,i)
(aref (the (simple-array rune (*)) ,scratch) ,i)))
,rod))
(:raw
`(values ,scratch 0 ,i))
)))))
(defmacro with-rune-collector ((collect) &body body)
`(with-rune-collector-aux *scratch-pad* ,collect ,body :copy))
(defmacro with-rune-collector-2 ((collect) &body body)
`(with-rune-collector-aux *scratch-pad-2* ,collect ,body :copy))
(defmacro with-rune-collector-3 ((collect) &body body)
`(with-rune-collector-aux *scratch-pad-3* ,collect ,body :copy))
(defmacro with-rune-collector-4 ((collect) &body body)
`(with-rune-collector-aux *scratch-pad-4* ,collect ,body :copy))
(defmacro with-rune-collector/intern ((collect) &body body)
`(with-rune-collector-aux *scratch-pad* ,collect ,body :intern))
(defmacro with-rune-collector/raw ((collect) &body body)
`(with-rune-collector-aux *scratch-pad* ,collect ,body :raw))
#|
(defmacro while-reading-runes ((reader stream-in) &rest body)
;; Thou shalt not leave body via a non local exit
(let ((stream (make-symbol "STREAM"))
(rptr (make-symbol "RPTR"))
(fptr (make-symbol "FPTR"))
(buf (make-symbol "BUF")) )
`(let* ((,stream ,stream-in)
(,rptr (xstream-read-ptr ,stream))
(,fptr (xstream-fill-ptr ,stream))
(,buf (xstream-buffer ,stream)))
(declare (type fixnum ,rptr ,fptr)
(type xstream ,stream))
(macrolet ((,reader (res-var)
`(cond ((%= ,',rptr ,',fptr)
(setf (xstream-read-ptr ,',stream) ,',rptr)
(setf ,res-var (xstream-underflow ,',stream))
(setf ,',rptr (xstream-read-ptr ,',stream))
(setf ,',fptr (xstream-fill-ptr ,',stream))
(setf ,',buf (xstream-buffer ,',stream)))
(t
(setf ,res-var
(aref (the (simple-array read-element (*)) ,',buf)
(the fixnum ,',rptr)))
(setf ,',rptr (%+ ,',rptr 1))))))
(prog1
(let () .,body)
(setf (xstream-read-ptr ,stream) ,rptr) )))))
|#
;;;; ---------------------------------------------------------------------------
;;;; DTD
;;;;
(define-condition xml-parse-error (simple-error) ())
(define-condition well-formedness-violation (xml-parse-error) ())
(define-condition validity-error (xml-parse-error) ())
;; We make some effort to signal end of file as a special condition, but we
;; don't actually try very hard. Not sure whether we should. Right now I
;; would prefer not to document this class.
(define-condition end-of-xstream (well-formedness-violation) ())
(defun describe-xstream (x s)
(format s " Line ~D, column ~D in ~A~%"
(xstream-line-number x)
(xstream-column-number x)
(let ((name (xstream-name x)))
(cond
((null name)
"<anonymous stream>")
((eq :main (stream-name-entity-kind name))
(stream-name-uri name))
(t
name)))))
(defun %error (class stream message)
(let* ((zmain (if *ctx* (main-zstream *ctx*) nil))
(zstream (if (zstream-p stream) stream zmain))
(xstream (if (xstream-p stream) stream nil))
(s (make-string-output-stream)))
(write-line message s)
(when xstream
(write-line "Location:" s)
(describe-xstream xstream s))
(when zstream
(let ((stack
(remove xstream (remove :stop (zstream-input-stack zstream)))))
(when stack
(write-line "Context:" s)
(dolist (x stack)
(describe-xstream x s)))))
(when (and zmain (not (eq zstream zmain)))
(let ((stack
(remove xstream (remove :stop (zstream-input-stack zmain)))))
(when stack
(write-line "Context in main document:" s)
(dolist (x stack)
(describe-xstream x s)))))
(error class
:format-control "~A"
:format-arguments (list (get-output-stream-string s)))))
(defun validity-error (fmt &rest args)
(%error 'validity-error
nil
(format nil "Document not valid: ~?" fmt args)))
(defun wf-error (stream fmt &rest args)
(%error 'well-formedness-violation
stream
(format nil "Document not well-formed: ~?" fmt args)))
(defun eox (stream &optional x &rest args)
(%error 'end-of-xstream
stream
(format nil "End of file~@[: ~?~]" x args)))
(defclass cxml-parser (sax:sax-parser) ((ctx :initarg :ctx)))
(defun parser-xstream (parser)
(car (zstream-input-stack (main-zstream (slot-value parser 'ctx)))))
(defun parser-stream-name (parser)
(let ((xstream (parser-xstream parser)))
(if xstream
(xstream-name xstream)
nil)))
(defmethod sax:line-number ((parser cxml-parser))
(let ((x (parser-xstream parser)))
(if x
(xstream-line-number x)
nil)))
(defmethod sax:column-number ((parser cxml-parser))
(let ((x (parser-xstream parser)))
(if x
(xstream-column-number x)
nil)))
(defmethod sax:system-id ((parser cxml-parser))
(let ((name (parser-stream-name parser)))
(if name
(stream-name-uri name)
nil)))
(defmethod sax:xml-base ((parser cxml-parser))
(car (base-stack (slot-value parser 'ctx))))
(defvar *validate* t)
(defvar *external-subset-p* nil)
(defun validate-start-element (ctx name)
(when *validate*
(let* ((pair (car (model-stack ctx)))
(newval (funcall (car pair) name)))
(unless newval
(validity-error "(03) Element Valid: ~A" (rod-string name)))
(setf (car pair) newval)
(let ((e (find-element name (dtd ctx))))
(unless e
(validity-error "(03) Element Valid: no definition for ~A"
(rod-string name)))
(maybe-compile-cspec e)
(push (copy-cons (elmdef-compiled-cspec e)) (model-stack ctx))))))
(defun copy-cons (x)
(cons (car x) (cdr x)))
(defun validate-end-element (ctx name)
(when *validate*
(let ((pair (car (model-stack ctx))))
(unless (eq (funcall (car pair) nil) t)
(validity-error "(03) Element Valid: ~A" (rod-string name)))
(pop (model-stack ctx)))))
(defun validate-characters (ctx rod)
(when *validate*
(let ((pair (car (model-stack ctx))))
(unless (funcall (cdr pair) rod)
(validity-error "(03) Element Valid: unexpected PCDATA")))))
(defun standalone-check-necessary-p (def)
(and *validate*
(standalone-p *ctx*)
(etypecase def
(elmdef (elmdef-external-p def))
(attdef (attdef-external-p def)))))
;; attribute validation, defaulting, and normalization -- except for for
;; uniqueness checks, which are done after namespaces have been declared
(defun process-attributes (ctx name attlist)
(let ((e (find-element name (dtd ctx))))
(cond
(e
(dolist (ad (elmdef-attributes e)) ;handle default values
(unless (get-attribute (attdef-name ad) attlist)
(case (attdef-default ad)
(:IMPLIED)
(:REQUIRED
(when *validate*
(validity-error "(18) Required Attribute: ~S not specified"
(rod-string (attdef-name ad)))))
(t
(when (standalone-check-necessary-p ad)
(validity-error "(02) Standalone Document Declaration: missing attribute value"))
(push (sax:make-attribute :qname (attdef-name ad)
:value (cadr (attdef-default ad))
:specified-p nil)
attlist)))))
(dolist (a attlist) ;normalize non-CDATA values
(let* ((qname (sax:attribute-qname a))
(adef (find-attribute e qname)))
(when adef
(when (and *validate*
sax:*namespace-processing*
(eq (attdef-type adef) :ID)
(find #/: (sax:attribute-value a)))
(validity-error "colon in ID attribute"))
(unless (eq (attdef-type adef) :CDATA)
(let ((canon (canon-not-cdata-attval (sax:attribute-value a))))
(when (and (standalone-check-necessary-p adef)
(not (rod= (sax:attribute-value a) canon)))
(validity-error "(02) Standalone Document Declaration: attribute value not normalized"))
(setf (sax:attribute-value a) canon))))))
(when *validate* ;maybe validate attribute values
(dolist (a attlist)
(validate-attribute ctx e a))))
((and *validate* attlist)
(validity-error "(04) Attribute Value Type: no definition for element ~A"
(rod-string name)))))
attlist)
(defun get-attribute (name attributes)
(member name attributes :key #'sax:attribute-qname :test #'rod=))
(defun validate-attribute (ctx e a)
(when (sax:attribute-specified-p a) ;defaults checked by DEFINE-ATTRIBUTE
(let* ((qname (sax:attribute-qname a))
(adef
(or (find-attribute e qname)
(validity-error "(04) Attribute Value Type: not declared: ~A"
(rod-string qname)))))
(validate-attribute* ctx adef (sax:attribute-value a)))))
(defun validate-attribute* (ctx adef value)
(let ((type (attdef-type adef))
(default (attdef-default adef)))
(when (and (listp default)
(eq (car default) :FIXED)
(not (rod= value (cadr default))))
(validity-error "(20) Fixed Attribute Default: expected ~S but got ~S"
(rod-string (cadr default))
(rod-string value)))
(ecase (if (listp type) (car type) type)
(:ID
(unless (valid-name-p value)
(validity-error "(08) ID: not a name: ~S" (rod-string value)))
(when (eq (gethash value (id-table ctx)) t)
(validity-error "(08) ID: ~S not unique" (rod-string value)))
(setf (gethash value (id-table ctx)) t))
(:IDREF
(validate-idref ctx value))
(:IDREFS
(let ((names (split-names value)))
(unless names
(validity-error "(11) IDREF: malformed names"))
(mapc (curry #'validate-idref ctx) names)))
(:NMTOKEN
(validate-nmtoken value))
(:NMTOKENS
(let ((tokens (split-names value)))
(unless tokens
(validity-error "(13) Name Token: malformed NMTOKENS"))
(mapc #'validate-nmtoken tokens)))
(:ENUMERATION
(unless (member value (cdr type) :test #'rod=)
(validity-error "(17) Enumeration: value not declared: ~S"
(rod-string value))))
(:NOTATION
(unless (member value (cdr type) :test #'rod=)
(validity-error "(14) Notation Attributes: ~S" (rod-string value))))
(:ENTITY
(validate-entity value))
(:ENTITIES
(let ((names (split-names value)))
(unless names
(validity-error "(13) Name Token: malformed NMTOKENS"))
(mapc #'validate-entity names)))
(:CDATA))))
(defun validate-idref (ctx value)
(unless (valid-name-p value)
(validity-error "(11) IDREF: not a name: ~S" (rod-string value)))
(unless (gethash value (id-table ctx))
(setf (gethash value (id-table ctx)) nil)))
(defun validate-nmtoken (value)
(unless (valid-nmtoken-p value)
(validity-error "(13) Name Token: not a NMTOKEN: ~S"
(rod-string value))))
(defstruct (entdef (:constructor)))
(defstruct (internal-entdef
(:include entdef)
(:constructor make-internal-entdef (value))
(:conc-name #:entdef-))
(value (error "missing argument") :type rod)
(expansion nil)
(external-subset-p *external-subset-p*))
(defstruct (external-entdef
(:include entdef)
(:constructor make-external-entdef (extid ndata))
(:conc-name #:entdef-))
(extid (error "missing argument") :type extid)
(ndata nil :type (or rod null)))
(defun validate-entity (value)
(unless (valid-name-p value)
(validity-error "(12) Entity Name: not a name: ~S" (rod-string value)))
(let ((def (let ((*validate*
;; Similarly the entity refs are internal and
;; don't need normalization ... the unparsed
;; entities (and entities) aren't "references"
;; -- sun/valid/sa03.xml
nil))
(get-entity-definition value :general (dtd *ctx*)))))
(unless (and (typep def 'external-entdef) (entdef-ndata def))
;; unparsed entity
(validity-error "(12) Entity Name: ~S" (rod-string value)))))
(defun split-names (rod)
(flet ((whitespacep (x)
(or (rune= x #/U+0009)
(rune= x #/U+000A)
(rune= x #/U+000D)
(rune= x #/U+0020))))
(if (let ((n (length rod)))
(and (not (zerop n))
(or (whitespacep (rune rod 0))
(whitespacep (rune rod (1- n))))))
nil
(split-sequence-if #'whitespacep rod :remove-empty-subseqs t))))
(defun zstream-base-sysid (zstream)
(let ((base-sysid
(dolist (k (zstream-input-stack zstream))
(let ((base-sysid (stream-name-uri (xstream-name k))))
(when base-sysid (return base-sysid))))))
base-sysid))
(defun absolute-uri (sysid source-stream)
(let ((base-sysid (zstream-base-sysid source-stream)))
;; XXX is the IF correct?
(if base-sysid
(puri:merge-uris sysid base-sysid)
sysid)))
(defstruct (extid (:constructor make-extid (public system)))
(public nil :type (or rod null))
(system (error "missing argument") :type (or puri:uri null)))
(defun absolute-extid (source-stream extid)
(let ((sysid (extid-system extid))
(result (copy-extid extid)))
(setf (extid-system result) (absolute-uri sysid source-stream))
result))
(defun define-entity (source-stream name kind def)
(setf name (intern-name name))
(when (and sax:*namespace-processing* (find #/: name))
(wf-error source-stream "colon in entity name"))
(let ((table
(ecase kind
(:general (dtd-gentities (dtd *ctx*)))
(:parameter (dtd-pentities (dtd *ctx*))))))
(unless (gethash name table)
(when (and source-stream (handler *ctx*))
(report-entity (handler *ctx*) kind name def))
(when (typep def 'external-entdef)
(setf (entdef-extid def)
(absolute-extid source-stream (entdef-extid def))))
(setf (gethash name table)
(cons *external-subset-p* def)))))
(defun get-entity-definition (entity-name kind dtd)
(unless dtd
(wf-error nil "entity not defined: ~A" (rod-string entity-name)))
(destructuring-bind (extp &rest def)
(gethash entity-name
(ecase kind
(:general (dtd-gentities dtd))
(:parameter (dtd-pentities dtd)))
'(nil))
(when (and *validate* (standalone-p *ctx*) extp)
(validity-error "(02) Standalone Document Declaration: entity reference: ~S"
(rod-string entity-name)))
def))
(defun entity->xstream (zstream entity-name kind &optional internalp)
;; `zstream' is for error messages
(let ((def (get-entity-definition entity-name kind (dtd *ctx*))))
(unless def
(wf-error zstream "Entity '~A' is not defined." (rod-string entity-name)))
(let (r)
(etypecase def
(internal-entdef
(when (and (standalone-p *ctx*)
(entdef-external-subset-p def))
(wf-error
zstream
"entity declared in external subset, but document is standalone"))
(setf r (make-rod-xstream (entdef-value def)))
(setf (xstream-name r)
(make-stream-name :entity-name entity-name
:entity-kind kind
:uri nil)))
(external-entdef
(when internalp
(wf-error zstream
"entity not internal: ~A" (rod-string entity-name)))
(when (entdef-ndata def)
(wf-error zstream
"reference to unparsed entity: ~A"
(rod-string entity-name)))
(setf r (xstream-open-extid (extid-using-catalog (entdef-extid def))))
(setf (stream-name-entity-name (xstream-name r)) entity-name
(stream-name-entity-kind (xstream-name r)) kind)))
r)))
(defun checked-get-entdef (name type)
(let ((def (get-entity-definition name type (dtd *ctx*))))
(unless def
(wf-error nil "Entity '~A' is not defined." (rod-string name)))
def))
(defun xstream-open-extid* (entity-resolver pubid sysid)
(let* ((stream
(or (funcall (or entity-resolver (constantly nil)) pubid sysid)
(open (uri-to-pathname sysid)
:element-type '(unsigned-byte 8)
:direction :input))))
(make-xstream stream
:name (make-stream-name :uri sysid)
:initial-speed 1)))
(defun xstream-open-extid (extid)
(xstream-open-extid* (entity-resolver *ctx*)
(extid-public extid)
(extid-system extid)))
(defun call-with-entity-expansion-as-stream (zstream cont name kind internalp)
;; `zstream' is for error messages
(let ((in (entity->xstream zstream name kind internalp)))
(push (stream-name-uri (xstream-name in)) (base-stack *ctx*))
(unwind-protect
(funcall cont in)
(pop (base-stack *ctx*))
(close-xstream in))))
(defun ensure-dtd ()
(unless (dtd *ctx*)
(setf (dtd *ctx*) (make-dtd))
(define-default-entities)))
(defun define-default-entities ()
(define-entity nil #"lt" :general (make-internal-entdef #"&#60;"))
(define-entity nil #"gt" :general (make-internal-entdef #">"))
(define-entity nil #"amp" :general (make-internal-entdef #"&#38;"))
(define-entity nil #"apos" :general (make-internal-entdef #"'"))
(define-entity nil #"quot" :general (make-internal-entdef #"\"")))
(defstruct attdef
;; an attribute definition
element ;name of element this attribute belongs to
name ;name of attribute
type ;type of attribute; either one of :CDATA, :ID, :IDREF, :IDREFS,
; :ENTITY, :ENTITIES, :NMTOKEN, :NMTOKENS, or
; (:NOTATION <name>*)
; (:ENUMERATION <name>*)
default ;default value of attribute:
; :REQUIRED, :IMPLIED, (:FIXED content) or (:DEFAULT content)
(external-p *external-subset-p*)
)
(defstruct elmdef
;; an element definition
name ;name of the element
content ;content model [*]
attributes ;list of defined attributes
compiled-cspec ;cons of validation function for contentspec
(external-p *external-subset-p*)
)
;; [*] in XML it is possible to define attributes before the element
;; itself is defined and since we hang attribute definitions into the
;; relevant element definitions, the `content' slot indicates whether an
;; element was actually defined. It is NIL until set to a content model
;; when the element type declaration is processed.
(defun %make-rod-hash-table ()
;; XXX with portable hash tables, this is the only way to case-sensitively
;; use rods. However, EQUALP often has horrible performance! Most Lisps
;; provide extensions for user-defined equality, we should use them! There
;; is also a home-made hash table for rods defined below, written by
;; Gilbert (I think). We could also use that one, but I would prefer the
;; first method, even if it's unportable.
(make-hash-table :test
#+rune-is-character 'equal
#-rune-is-character 'equalp))
(defstruct dtd
(elements (%make-rod-hash-table)) ;elmdefs
(gentities (%make-rod-hash-table)) ;general entities
(pentities (%make-rod-hash-table)) ;parameter entities
(notations (%make-rod-hash-table)))
(defun make-dtd-cache ()
(puri:make-uri-space))
(defvar *cache-all-dtds* nil)
(defvar *dtd-cache* (make-dtd-cache))
(defun remdtd (uri dtd-cache)
(setf uri (puri:intern-uri uri dtd-cache))
(prog1
(and (getf (puri:uri-plist uri) 'dtd) t)
(puri:unintern-uri uri dtd-cache)))
(defun clear-dtd-cache (dtd-cache)
(puri:unintern-uri t dtd-cache))
(defun getdtd (uri dtd-cache)
(getf (puri:uri-plist (puri:intern-uri uri dtd-cache)) 'dtd))
(defun (setf getdtd) (newval uri dtd-cache)
(setf (getf (puri:uri-plist (puri:intern-uri uri dtd-cache)) 'dtd) newval)
newval)
;;;;
(defun find-element (name dtd)
(gethash name (dtd-elements dtd)))
(defun define-element (dtd element-name &optional content-model)
(let ((e (find-element element-name dtd)))
(cond
((null e)
(prog1
(setf (gethash element-name (dtd-elements dtd))
(make-elmdef :name element-name :content content-model))
(sax:element-declaration (handler *ctx*) element-name content-model)))
((null content-model)
e)
(t
(when *validate*
(when (elmdef-content e)
(validity-error "(05) Unique Element Type Declaration"))
(when (eq content-model :EMPTY)
(dolist (ad (elmdef-attributes e))
(let ((type (attdef-type ad)))
(when (and (listp type) (eq (car type) :NOTATION))
(validity-error "(16) No Notation on Empty Element: ~S"
(rod-string element-name)))))))
(sax:element-declaration (handler *ctx*) element-name content-model)
(setf (elmdef-content e) content-model)
(setf (elmdef-external-p e) *external-subset-p*)
e))))
(defvar *redefinition-warning* nil)
(defun define-attribute (dtd element name type default)
(let ((adef (make-attdef :element element
:name name
:type type
:default default))
(e (or (find-element element dtd)
(define-element dtd element))))
(when (and *validate* (listp default))
(unless (eq (attdef-type adef) :CDATA)
(setf (second default) (canon-not-cdata-attval (second default))))
(validate-attribute* *ctx* adef (second default)))
(cond ((find-attribute e name)
(when *redefinition-warning*
(warn "Attribute \"~A\" of \"~A\" not redefined."
(rod-string name)
(rod-string element))))
(t
(when *validate*
(when (eq type :ID)
(when (find :ID (elmdef-attributes e) :key #'attdef-type)
(validity-error "(09) One ID per Element Type: element ~A"
(rod-string element)))
(unless (member default '(:REQUIRED :IMPLIED))
(validity-error "(10) ID Attribute Default: ~A"
(rod-string element))))
(flet ((notationp (type)
(and (listp type) (eq (car type) :NOTATION))))
(when (notationp type)
(when (find-if #'notationp (elmdef-attributes e)
:key #'attdef-type)
(validity-error "(15) One Notation Per Element Type: ~S"
(rod-string element)))
(when (eq (elmdef-content e) :EMPTY)
(validity-error "(16) No Notation on Empty Element: ~S"
(rod-string element))))))
(sax:attribute-declaration (handler *ctx*) element name type default)
(push adef (elmdef-attributes e))))))
(defun find-attribute (elmdef name)
(find name (elmdef-attributes elmdef) :key #'attdef-name :test #'rod=))
(defun define-notation (dtd name id)
(let ((ns (dtd-notations dtd)))
(when (gethash name ns)
(validity-error "(24) Unique Notation Name: ~S" (rod-string name)))
(setf (gethash name ns) id)))
(defun find-notation (name dtd)
(gethash name (dtd-notations dtd)))
;;;; ---------------------------------------------------------------------------
;;;; z streams and lexer
;;;;
(defstruct zstream
token-category
token-semantic
input-stack)
(defun call-with-zstream (fn zstream)
(unwind-protect
(funcall fn zstream)
(dolist (input (zstream-input-stack zstream))
(cond #-x&y-streams-are-stream
((xstream-p input)
(close-xstream input))
#+x&y-streams-are-stream
((streamp input)
(close input))))))
(defmacro with-zstream ((zstream &rest args) &body body)
`(call-with-zstream (lambda (,zstream) ,@body)
(make-zstream ,@args)))
(defun read-token (input)
(cond ((zstream-token-category input)
(multiple-value-prog1
(values (zstream-token-category input)
(zstream-token-semantic input))
(setf (zstream-token-category input) nil
(zstream-token-semantic input) nil)))
(t
(read-token-2 input))))
(defun peek-token (input)
(cond ((zstream-token-category input)
(values
(zstream-token-category input)
(zstream-token-semantic input)))
(t
(multiple-value-bind (c s) (read-token input)
(setf (zstream-token-category input) c
(zstream-token-semantic input) s))
(values (zstream-token-category input)
(zstream-token-semantic input)))))
(defun read-token-2 (input)
(cond ((null (zstream-input-stack input))
(values :eof nil))
(t
(let ((c (peek-rune (car (zstream-input-stack input)))))
(cond ((eq c :eof)
(cond ((eq (cadr (zstream-input-stack input)) :stop)
(values :eof nil))
(t
(close-xstream (pop (zstream-input-stack input)))
(if (null (zstream-input-stack input))
(values :eof nil)
(values :S nil) ;fake #x20 after PE expansion
))))
(t
(read-token-3 input)))))))
(defvar *data-behaviour*
) ;either :DTD or :DOC
(defun read-token-3 (zinput)
(let ((input (car (zstream-input-stack zinput))))
;; PI Comment
(let ((c (read-rune input)))
(cond
;; first the common tokens
((rune= #/< c)
(read-token-after-|<| zinput input))
;; now dispatch
(t
(ecase *data-behaviour*
(:DTD
(cond ((rune= #/\[ c) :\[)
((rune= #/\] c) :\])
((rune= #/\( c) :\()
((rune= #/\) c) :\))
((rune= #/\| c) :\|)
((rune= #/\> c) :\>)
((rune= #/\" c) :\")
((rune= #/\' c) :\')
((rune= #/\, c) :\,)
((rune= #/\? c) :\?)
((rune= #/\* c) :\*)
((rune= #/\+ c) :\+)
((name-rune-p c)
(unread-rune c input)
(values :nmtoken (read-name-token input)))
((rune= #/# c)
(let ((q (read-name-token input)))
(cond ((rod= q '#.(string-rod "REQUIRED")) :|#REQUIRED|)
((rod= q '#.(string-rod "IMPLIED")) :|#IMPLIED|)
((rod= q '#.(string-rod "FIXED")) :|#FIXED|)
((rod= q '#.(string-rod "PCDATA")) :|#PCDATA|)
(t
(wf-error zinput "Unknown token: ~S." q)))))
((or (rune= c #/U+0020)
(rune= c #/U+0009)
(rune= c #/U+000D)
(rune= c #/U+000A))
(values :S nil))
((rune= #/% c)
(cond ((name-start-rune-p (peek-rune input))
;; an entity reference
(read-pe-reference zinput))
(t
(values :%))))
(t
(wf-error zinput "Unexpected character ~S." c))))
(:DOC
(cond
((rune= c #/&)
(multiple-value-bind (kind data) (read-entity-like input)
(cond ((eq kind :ENTITY-REFERENCE)
(values :ENTITY-REF data))
((eq kind :CHARACTER-REFERENCE)
(values :CDATA
(with-rune-collector (collect)
(%put-unicode-char data collect)))))))
(t
(unread-rune c input)
(values :CDATA (read-cdata input)))))))))))
(definline check-rune (input actual expected)
(unless (eql actual expected)
(wf-error input "expected #/~A but found #/~A"
(rune-char expected)
(rune-char actual))))
(defun read-pe-reference (zinput)
(let* ((input (car (zstream-input-stack zinput)))
(nam (read-name-token input)))
(check-rune input #/\; (read-rune input))
(cond (*expand-pe-p*
;; no external entities here!
(let ((i2 (entity->xstream zinput nam :parameter)))
(zstream-push i2 zinput))
(values :S nil) ;space before inserted PE expansion.
)
(t
(values :PE-REFERENCE nam)) )))
(defun read-token-after-|<| (zinput input)
(let ((d (read-rune input)))
(cond ((eq d :eof)
(eox input "EOF after '<'"))
((rune= #/! d)
(read-token-after-|<!| input))
((rune= #/? d)
(multiple-value-bind (target content) (read-pi input)
(cond ((rod= target '#.(string-rod "xml"))
(values :xml-decl (cons target content)))
((rod-equal target '#.(string-rod "XML"))
(wf-error zinput
"You lost -- no XML processing instructions."))
((and sax:*namespace-processing* (position #/: target))
(wf-error zinput
"Processing instruction target ~S is not a ~
valid NcName."
(mu target)))
(t
(values :PI (cons target content))))))
((eq *data-behaviour* :DTD)
(unread-rune d input)
(unless (or (rune= #// d) (name-start-rune-p d))
(wf-error zinput "Expected '!' or '?' after '<' in DTD."))
(values :seen-< nil))
((rune= #// d)
(let ((c (peek-rune input)))
(cond ((name-start-rune-p c)
(read-tag-2 zinput input :etag))
(t
(wf-error zinput
"Expecting name start rune after \"</\".")))))
((name-start-rune-p d)
(unread-rune d input)
(read-tag-2 zinput input :stag))
(t
(wf-error zinput "Expected '!' or '?' after '<' in DTD.")))))
(defun read-token-after-|<!| (input)
(let ((d (read-rune input)))
(cond ((eq d :eof)
(eox input "EOF after \"<!\"."))
((name-start-rune-p d)
(unread-rune d input)
(let ((name (read-name-token input)))
(cond ((rod= name '#.(string-rod "ELEMENT")) :|<!ELEMENT|)
((rod= name '#.(string-rod "ENTITY")) :|<!ENTITY|)
((rod= name '#.(string-rod "ATTLIST")) :|<!ATTLIST|)
((rod= name '#.(string-rod "NOTATION")) :|<!NOTATION|)
((rod= name '#.(string-rod "DOCTYPE")) :|<!DOCTYPE|)
(t
(wf-error input"`<!~A' unknown." (rod-string name))))))
((rune= #/\[ d)
(values :|<![| nil))
((rune= #/- d)
(setf d (read-rune input))
(cond ((rune= #/- d)
(values
:COMMENT
(read-comment-content input)))
(t
(wf-error input"Bad character ~S after \"<!-\"" d))))
(t
(wf-error input "Bad character ~S after \"<!\"" d)))))
(definline read-S? (input)
(while (member (peek-rune input) '(#/U+0020 #/U+0009 #/U+000A #/U+000D)
:test #'eql)
(consume-rune input)))
(defun read-attribute-list (zinput input imagine-space-p)
(cond ((or imagine-space-p
(let ((c (peek-rune input)))
(and (not (eq c :eof))
(space-rune-p c))))
(read-S? input)
(cond ((eq (peek-rune input) :eof)
nil)
((name-start-rune-p (peek-rune input))
(cons (read-attribute zinput input)
(read-attribute-list zinput input nil)))
(t
nil)))
(t
nil)))
(defun read-entity-like (input)
"Read an entity reference off the xstream `input'. Returns two values:
either :ENTITY-REFERENCE <interned-rod> in case of a named entity
or :CHARACTER-REFERENCE <integer> in case of character references.
The initial #\\& is considered to be consumed already."
(let ((c (peek-rune input)))
(cond ((eq c :eof)
(eox input "EOF after '&'"))
((rune= c #/#)
(values :CHARACTER-REFERENCE (read-character-reference input)))
(t
(unless (name-start-rune-p (peek-rune input))
(wf-error input "Expecting name after &."))
(let ((name (read-name-token input)))
(setf c (read-rune input))
(unless (rune= c #/\;)
(wf-error input "Expected \";\"."))
(values :ENTITY-REFERENCE name))))))
(defun read-tag-2 (zinput input kind)
(let ((name (read-name-token input))
(atts nil))
(setf atts (read-attribute-list zinput input nil))
;; check for double attributes
(do ((q atts (cdr q)))
((null q))
(cond ((find (caar q) (cdr q) :key #'car)
(wf-error zinput "Attribute ~S has two definitions in element ~S."
(rod-string (caar q))
(rod-string name)))))
(cond ((eq (peek-rune input) #/>)
(consume-rune input)
(values kind (cons name atts)))
((eq (peek-rune input) #//)
(consume-rune input)
(check-rune input #/> (read-rune input))
(values :ztag (cons name atts)))
(t
(wf-error zinput "syntax error in read-tag-2.")) )))
(defun read-attribute (zinput input)
(unless (name-start-rune-p (peek-rune input))
(wf-error zinput "Expected name."))
;; arg thanks to the post mortem nature of name space declarations,
;; we could only process the attribute values post mortem.
(let ((name (read-name-token input)))
(while (let ((c (peek-rune input)))
(and (not (eq c :eof))
(or (rune= c #/U+0020)
(rune= c #/U+0009)
(rune= c #/U+000A)
(rune= c #/U+000D))))
(consume-rune input))
(unless (eq (read-rune input) #/=)
(wf-error zinput "Expected \"=\"."))
(while (let ((c (peek-rune input)))
(and (not (eq c :eof))
(or (rune= c #/U+0020)
(rune= c #/U+0009)
(rune= c #/U+000A)
(rune= c #/U+000D))))
(consume-rune input))
(cons name (read-att-value-2 input))))
(defun canon-not-cdata-attval (value)
;; | If the declared value is not CDATA, then the XML processor must
;; | further process the normalized attribute value by discarding any
;; | leading and trailing space (#x20) characters, and by replacing
;; | sequences of space (#x20) characters by a single space (#x20)
;; | character.
(with-rune-collector (collect)
(let ((gimme-20 nil)
(anything-seen-p nil))
(map nil (lambda (c)
(cond ((rune= c #/u+0020)
(setf gimme-20 t))
(t
(when (and anything-seen-p gimme-20)
(collect #/u+0020))
(setf gimme-20 nil)
(setf anything-seen-p t)
(collect c))))
value))))
(definline data-rune-p (rune)
;; any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
;;
;; FIXME: das halte ich fuer verkehrt. Surrogates als Unicode-Zeichen
;; sind verboten. Das liegt hier aber nicht vor, denn wir arbeiten
;; ja tatsaechlich mit UTF-16. Verboten ist es nur, wenn wir ein
;; solches Zeichen beim Dekodieren finden, das wird aber eben
;; in encodings.lisp bereits geprueft. --david
(let ((c (rune-code rune)))
(or (= c #x9) (= c #xA) (= c #xD)
(<= #x20 c #xD7FF)
(<= #xE000 c #xFFFD)
(<= #xD800 c #xDBFF)
(<= #xDC00 c #xDFFF))))
(defun read-att-value (zinput input mode &optional canon-space-p (delim nil))
(with-rune-collector-2 (collect)
(labels ((muffle (input delim)
(let (c)
(loop
(setf c (read-rune input))
(cond ((eql delim c)
(return))
((eq c :eof)
(eox input "EOF"))
((rune= c #/&)
(setf c (peek-rune input))
(cond ((eql c :eof)
(eox input))
((rune= c #/#)
(let ((c (read-character-reference input)))
(%put-unicode-char c collect)))
(t
(unless (name-start-rune-p (peek-rune input))
(wf-error zinput "Expecting name after &."))
(let ((name (read-name-token input)))
(setf c (read-rune input))
(check-rune input c #/\;)
(ecase mode
(:ATT
(recurse-on-entity
zinput name :general
(lambda (zinput)
(muffle (car (zstream-input-stack zinput))
:eof))
t))
(:ENT
;; bypass, but never the less we
;; need to check for legal
;; syntax.
;; Must it be defined?
;; allerdings: unparsed sind verboten
(collect #/&)
(map nil (lambda (x) (collect x)) name)
(collect #/\; )))))))
((and (eq mode :ENT) (rune= c #/%))
(let ((d (peek-rune input)))
(when (eq d :eof)
(eox input))
(unless (name-start-rune-p d)
(wf-error zinput "Expecting name after %.")))
(let ((name (read-name-token input)))
(setf c (read-rune input))
(check-rune input c #/\;)
(cond (*expand-pe-p*
(recurse-on-entity
zinput name :parameter
(lambda (zinput)
(muffle (car (zstream-input-stack zinput))
:eof))))
(t
(wf-error zinput "No PE here.")))))
((and (eq mode :ATT) (rune= c #/<))
(wf-error zinput "unexpected #\/<"))
((and canon-space-p (space-rune-p c))
(collect #/space))
((not (data-rune-p c))
(wf-error zinput "illegal char: ~S." c))
(t
(collect c)))))))
(declare (dynamic-extent #'muffle))
(muffle input (or delim
(let ((delim (read-rune input)))
(unless (member delim '(#/\" #/\') :test #'eql)
(wf-error zinput "invalid attribute delimiter"))
delim))))))
(defun read-character-reference (input)
;; The #/& is already read
(let ((res
(let ((c (read-rune input)))
(check-rune input c #/#)
(setq c (read-rune input))
(cond ((eql c :eof)
(eox input))
((eql c #/x)
;; hexadecimal
(setq c (read-rune input))
(when (eql c :eof)
(eox input))
(unless (digit-rune-p c 16)
(wf-error input "garbage in character reference"))
(prog1
(parse-integer
(with-output-to-string (sink)
(write-char (rune-char c) sink)
(while (progn
(setq c (read-rune input))
(when (eql c :eof)
(eox input))
(digit-rune-p c 16))
(write-char (rune-char c) sink)))
:radix 16)
(check-rune input c #/\;)))
((rune<= #/0 c #/9)
;; decimal
(prog1
(parse-integer
(with-output-to-string (sink)
(write-char (rune-char c) sink)
(while (progn
(setq c (read-rune input))
(when (eql c :eof)
(eox input))
(rune<= #/0 c #/9))
(write-char (rune-char c) sink)))
:radix 10)
(check-rune input c #/\;)))
(t
(wf-error input "Bad char in numeric character entity."))))))
(unless (code-data-char-p res)
(wf-error
input
"expansion of numeric character reference (#x~X) is no data char."
res))
res))
(defun read-pi (input)
;; "<?" is already read
(let (name)
(let ((c (peek-rune input)))
(unless (name-start-rune-p c)
(wf-error input "Expecting name after '<?'"))
(setf name (read-name-token input)))
(cond
((member (peek-rune input) '(#/U+0020 #/U+0009 #/U+000A #/U+000D)
:test #'eql)
(values name (read-pi-content input)))
(t
(unless (and (eql (read-rune input) #/?)
(eql (read-rune input) #/>))
(wf-error input "malformed processing instruction"))
(values name "")))))
(defun read-pi-content (input)
(read-S? input)
(let (d)
(with-rune-collector (collect)
(block nil
(tagbody
state-1
(setf d (read-rune input))
(when (eq d :eof)
(eox input))
(unless (data-rune-p d)
(wf-error input "Illegal char: ~S." d))
(when (rune= d #/?) (go state-2))
(collect d)
(go state-1)
state-2 ;; #/? seen
(setf d (read-rune input))
(when (eq d :eof)
(eox input))
(unless (data-rune-p d)
(wf-error input "Illegal char: ~S." d))
(when (rune= d #/>) (return))
(when (rune= d #/?)
(collect #/?)
(go state-2))
(collect #/?)
(collect d)
(go state-1))))))
(defun read-comment-content (input &aux d)
(with-rune-collector (collect)
(block nil
(tagbody
state-1
(setf d (read-rune input))
(when (eq d :eof)
(eox input))
(unless (data-rune-p d)
(wf-error input "Illegal char: ~S." d))
(when (rune= d #/-) (go state-2))
(collect d)
(go state-1)
state-2 ;; #/- seen
(setf d (read-rune input))
(when (eq d :eof)
(eox input))
(unless (data-rune-p d)
(wf-error input "Illegal char: ~S." d))
(when (rune= d #/-) (go state-3))
(collect #/-)
(collect d)
(go state-1)
state-3 ;; #/- #/- seen
(setf d (read-rune input))
(when (eq d :eof)
(eox input))
(unless (data-rune-p d)
(wf-error input "Illegal char: ~S." d))
(when (rune= d #/>) (return))
(wf-error input "'--' not allowed in a comment")
(when (rune= d #/-)
(collect #/-)
(go state-3))
(collect #/-)
(collect #/-)
(collect d)
(go state-1)))))
(defun read-cdata-sect (input &aux d)
;; <![CDATA[ is already read
;; read anything up to ]]>
(with-rune-collector (collect)
(block nil
(tagbody
state-1
(setf d (read-rune input))
(when (eq d :eof)
(eox input))
(unless (data-rune-p d)
(wf-error input "Illegal char: ~S." d))
(when (rune= d #/\]) (go state-2))
(collect d)
(go state-1)
state-2 ;; #/] seen
(setf d (read-rune input))
(when (eq d :eof)
(eox input))
(unless (data-rune-p d)
(wf-error input "Illegal char: ~S." d))
(when (rune= d #/\]) (go state-3))
(collect #/\])
(collect d)
(go state-1)
state-3 ;; #/\] #/\] seen
(setf d (read-rune input))
(when (eq d :eof)
(eox input))
(unless (data-rune-p d)
(wf-error input "Illegal char: ~S." d))
(when (rune= d #/>)
(return))
(when (rune= d #/\])
(collect #/\])
(go state-3))
(collect #/\])
(collect #/\])
(collect d)
(go state-1)))))
;; some character categories
(defun space-rune-p (rune)
(declare (type rune rune))
(or (rune= rune #/U+0020)
(rune= rune #/U+0009)
(rune= rune #/U+000A)
(rune= rune #/U+000D)))
(defun code-data-char-p (c)
;; any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
(or (= c #x9) (= c #xA) (= c #xD)
(<= #x20 c #xD7FF)
(<= #xE000 c #xFFFD)
(<= #x10000 c #x10FFFF)))
(defun pubid-char-p (c)
(or (rune= c #/u+0020) (rune= c #/u+000D) (rune= c #/u+000A)
(rune<= #/a c #/z)
(rune<= #/A c #/Z)
(rune<= #/0 c #/9)
(member c '(#/- #/' #/\( #/\) #/+ #/, #/. #//
#/: #/= #/? #/\; #/! #/* #/#
#/@ #/$ #/_ #/%))))
(defun expect (input category)
(multiple-value-bind (cat sem) (read-token input)
(unless (eq cat category)
(wf-error input "Expected ~S saw ~S [~S]" category cat sem))
(values cat sem)))
(defun consume-token (input)
(read-token input))
;;;; ---------------------------------------------------------------------------
;;;; Parser
;;;;
(defun p/S (input)
;; S ::= (#x20 | #x9 | #xD | #xA)+
(expect input :S)
(while (eq (peek-token input) :S)
(consume-token input)))
(defun p/S? (input)
;; S ::= (#x20 | #x9 | #xD | #xA)+
(while (eq (peek-token input) :S)
(consume-token input)))
(defun p/nmtoken (input)
(nth-value 1 (expect input :nmtoken)))
(defun p/name (input)
(let ((result (p/nmtoken input)))
(unless (name-start-rune-p (elt result 0))
(wf-error input "Expected name."))
result))
(defun p/attlist-decl (input)
;; [52] AttlistDecl ::= '<!ATTLIST' S Name (S AttDef)* S? '>'
(let (elm-name)
(expect input :|<!ATTLIST|)
(p/S input)
(setf elm-name (p/nmtoken input))
(loop
(let ((tok (read-token input)))
(case tok
(:S
(p/S? input)
(cond ((eq (peek-token input) :>)
(consume-token input)
(return))
(t
(multiple-value-bind (name type default) (p/attdef input)
(define-attribute (dtd *ctx*) elm-name name type default)) )))
(:>
(return))
(otherwise
(wf-error input
"Expected either another AttDef or end of \"<!ATTLIST\". -- saw ~S."
tok)))))))
(defun p/attdef (input)
;; [53] AttDef ::= Name S AttType S DefaultDecl
(let (name type default)
(setf name (p/nmtoken input))
(p/S input)
(setf type (p/att-type input))
(p/S input)
(setf default (p/default-decl input))
(values name type default)))
(defun p/list (input item-parser delimiter)
;; Parse something like S? <item> (S? <delimiter> <item>)* S?
;;
(declare (type function item-parser))
(let (res)
(p/S? input)
(setf res (list (funcall item-parser input)))
(loop
(p/S? input)
(cond ((eq (peek-token input) delimiter)
(consume-token input)
(p/S? input)
(push (funcall item-parser input) res))
(t
(return))))
(p/S? input)
(reverse res)))
(defun p/att-type (input)
;; [54] AttType ::= StringType | TokenizedType | EnumeratedType
;; [55] StringType ::= 'CDATA'
;; [56] TokenizedType ::= 'ID' /*VC: ID */
;; /*VC: One ID per Element Type */
;; /*VC: ID Attribute Default */
;; | 'IDREF' /*VC: IDREF */
;; | 'IDREFS' /*VC: IDREF */
;; | 'ENTITY' /*VC: Entity Name */
;; | 'ENTITIES' /*VC: Entity Name */
;; | 'NMTOKEN' /*VC: Name Token */
;; | 'NMTOKENS' /*VC: Name Token */
;; [57] EnumeratedType ::= NotationType | Enumeration
;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
;; /* VC: Notation Attributes */
;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' /* VC: Enumeration */
(multiple-value-bind (cat sem) (read-token input)
(cond ((eq cat :nmtoken)
(cond ((rod= sem '#.(string-rod "CDATA")) :CDATA)
((rod= sem '#.(string-rod "ID")) :ID)
((rod= sem '#.(string-rod "IDREF")) :IDREFS)
((rod= sem '#.(string-rod "IDREFS")) :IDREFS)
((rod= sem '#.(string-rod "ENTITY")) :ENTITY)
((rod= sem '#.(string-rod "ENTITIES")) :ENTITIES)
((rod= sem '#.(string-rod "NMTOKEN")) :NMTOKEN)
((rod= sem '#.(string-rod "NMTOKENS")) :NMTOKENS)
((rod= sem '#.(string-rod "NOTATION"))
(let (names)
(p/S input)
(expect input :\()
(setf names (p/list input #'p/nmtoken :\| ))
(expect input :\))
(when *validate*
(setf (referenced-notations *ctx*)
(append names (referenced-notations *ctx*))))
(cons :NOTATION names)))
(t
(wf-error input "In p/att-type: ~S ~S." cat sem))))
((eq cat :\()
;; XXX Die Nmtoken-Syntax pruefen wir derzeit nur beim Validieren.
(let (names)
;;(expect input :\()
(setf names (p/list input #'p/nmtoken :\| ))
(expect input :\))
(cons :ENUMERATION names)))
(t
(wf-error input "In p/att-type: ~S ~S." cat sem)) )))
(defun p/default-decl (input)
;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
;; | (('#FIXED' S)? AttValue) /* VC: Required Attribute */
;;
;; /* VC: Attribute Default Legal */
;; /* WFC: No < in Attribute Values */
;; /* VC: Fixed Attribute Default */
(multiple-value-bind (cat sem) (peek-token input)
(cond ((eq cat :|#REQUIRED|)
(consume-token input) :REQUIRED)
((eq cat :|#IMPLIED|)
(consume-token input) :IMPLIED)
((eq cat :|#FIXED|)
(consume-token input)
(p/S input)
(list :FIXED (p/att-value input)))
((or (eq cat :\') (eq cat :\"))
(list :DEFAULT (p/att-value input)))
(t
(wf-error input "p/default-decl: ~S ~S." cat sem)) )))
;;;;
;; [70] EntityDecl ::= GEDecl | PEDecl
;; [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
;; [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
;; [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
;; [74] PEDef ::= EntityValue | ExternalID
;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
;; | 'PUBLIC' S PubidLiteral S SystemLiteral
;; [76] NDataDecl ::= S 'NDATA' S Name /* VC: Notation Declared */
(defun p/entity-decl (input)
(let (name def kind)
(expect input :|<!ENTITY|)
(p/S input)
(cond ((eq (peek-token input) :%)
(setf kind :parameter)
(consume-token input)
(p/S input))
(t
(setf kind :general)))
(setf name (p/name input))
(p/S input)
(setf def (p/entity-def input kind))
(define-entity input name kind def)
(p/S? input)
(expect input :\>)))
(defun report-entity (h kind name def)
(etypecase def
(external-entdef
(let ((extid (entdef-extid def))
(ndata (entdef-ndata def)))
(if ndata
(sax:unparsed-entity-declaration h
name
(extid-public extid)
(uri-rod (extid-system extid))
ndata)
(sax:external-entity-declaration h
kind
name
(extid-public extid)
(uri-rod (extid-system extid))))))
(internal-entdef
(sax:internal-entity-declaration h kind name (entdef-value def)))))
(defun p/entity-def (input kind)
(multiple-value-bind (cat sem) (peek-token input)
(cond ((member cat '(:\" :\'))
(make-internal-entdef (p/entity-value input)))
((and (eq cat :nmtoken)
(or (rod= sem '#.(string-rod "SYSTEM"))
(rod= sem '#.(string-rod "PUBLIC"))))
(let (extid ndata)
(setf extid (p/external-id input nil))
(when (eq kind :general) ;NDATA allowed at all?
(cond ((eq (peek-token input) :S)
(p/S? input)
(when (and (eq (peek-token input) :nmtoken)
(rod= (nth-value 1 (peek-token input))
'#.(string-rod "NDATA")))
(consume-token input)
(p/S input)
(setf ndata (p/nmtoken input))
(when *validate*
(push ndata (referenced-notations *ctx*)))))))
(make-external-entdef extid ndata)))
(t
(wf-error input "p/entity-def: ~S / ~S." cat sem)) )))
(defun p/entity-value (input)
(let ((delim (if (eq (read-token input) :\") #/\" #/\')))
(read-att-value input
(car (zstream-input-stack input))
:ENT
nil
delim)))
(defun p/att-value (input)
(let ((delim (if (eq (read-token input) :\") #/\" #/\')))
(read-att-value input
(car (zstream-input-stack input))
:ATT
t
delim)))
(defun p/external-id (input &optional (public-only-ok-p nil))
;; xxx public-only-ok-p
(multiple-value-bind (cat sem) (read-token input)
(cond ((and (eq cat :nmtoken) (rod= sem '#.(string-rod "SYSTEM")))
(p/S input)
(make-extid nil (p/system-literal input)))
((and (eq cat :nmtoken) (rod= sem '#.(string-rod "PUBLIC")))
(let (pub sys)
(p/S input)
(setf pub (p/pubid-literal input))
(when (eq (peek-token input) :S)
(p/S input)
(when (member (peek-token input) '(:\" :\'))
(setf sys (p/system-literal input))))
(when (and (not public-only-ok-p)
(null sys))
(wf-error input "System identifier needed for this PUBLIC external identifier."))
(make-extid pub sys)))
(t
(wf-error input "Expected external-id: ~S / ~S." cat sem)))))
;; [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
;; [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
;; [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9]
;; | [-'()+,./:=?;!*#@$_%]
(defun p/id (input)
(multiple-value-bind (cat) (read-token input)
(cond ((member cat '(:\" :\'))
(let ((delim (if (eq cat :\") #/\" #/\')))
(with-rune-collector (collect)
(loop
(let ((c (read-rune (car (zstream-input-stack input)))))
(cond ((eq c :eof)
(eox input "EOF in system literal."))
((rune= c delim)
(return))
(t
(collect c))))))))
(t
(wf-error input "Expect either \" or \'.")))))
;; it is important to cache the orginal URI rod, since the re-serialized
;; uri-string can be different from the one parsed originally.
(defun uri-rod (uri)
(if uri
(or (getf (puri:uri-plist uri) 'original-rod)
(rod (puri:render-uri uri nil)))
nil))
(defun safe-parse-uri (str)
;; puri doesn't like strings starting with file:///, although that is a very
;; common is practise. Cut it away, we don't distinguish between scheme
;; :FILE and NIL anway.
(when (eql (search "file://" str) 0)
(setf str (subseq str (length "file://"))))
(puri:parse-uri (coerce str 'simple-string)))
(defun p/system-literal (input)
(let* ((rod (p/id input))
(result (safe-parse-uri (rod-string rod))))
(setf (getf (puri:uri-plist result) 'original-rod) rod)
result))
(defun p/pubid-literal (input)
(let ((result (p/id input)))
(unless (every #'pubid-char-p result)
(wf-error input "Illegal pubid: ~S." (rod-string result)))
result))
;;;;
(defun p/element-decl (input)
(let (name content)
(expect input :|<!ELEMENT|)
(p/S input)
(setf name (p/nmtoken input))
(p/S input)
(setf content (normalize-mixed-cspec (p/cspec input)))
(unless (legal-content-model-p content *validate*)
(wf-error input "Malformed or invalid content model: ~S." (mu content)))
(p/S? input)
(expect input :\>)
(when *validate*
(define-element (dtd *ctx*) name content))
(list :element name content)))
(defun maybe-compile-cspec (e)
(or (elmdef-compiled-cspec e)
(setf (elmdef-compiled-cspec e)
(let ((cspec (elmdef-content e)))
(unless cspec
(validity-error "(03) Element Valid: no definition for ~A"
(rod-string (elmdef-name e))))
(multiple-value-call #'cons
(compile-cspec cspec (standalone-check-necessary-p e)))))))
(defun make-root-model (name)
(cons (lambda (actual-name)
(if (rod= actual-name name)
(constantly :dummy)
nil))
(constantly t)))
;;; content spec validation:
;;;
;;; Given a `contentspec', COMPILE-CSPEC returns as multiple values two
;;; functions A and B of one argument to be called for every
;;; A. child element
;;; B. text child node
;;;
;;; Function A will be called with
;;; - the element name rod as its argument. If that element may appear
;;; at the current position, a new function to be called for the next
;;; child is returned. Otherwise NIL is returned.
;;; - argument NIL at the end of the element, it must then return T or NIL
;;; to indicate whether the end tag is valid.
;;;
;;; Function B will be called with the character data rod as its argument, it
;;; returns a boolean indicating whether this text node is allowed.
;;;
;;; That is, if one of the functions ever returns NIL, the node is
;;; rejected as invalid.
(defun cmodel-done (actual-value)
(null actual-value))
(defun compile-cspec (cspec &optional standalone-check)
(cond
((atom cspec)
(ecase cspec
(:EMPTY (values #'cmodel-done (constantly nil)))
(:PCDATA (values #'cmodel-done (constantly t)))
(:ANY
(values (labels ((doit (name) (if name #'doit t))) #'doit)
(constantly t)))))
((and (eq (car cspec) '*)
(let ((subspec (second cspec)))
(and (eq (car subspec) 'or) (eq (cadr subspec) :PCDATA))))
(values (compile-mixed (second cspec))
(constantly t)))
(t
(values (compile-content-model cspec)
(lambda (rod)
(when standalone-check
(validity-error "(02) Standalone Document Declaration: whitespace"))
(every #'white-space-rune-p rod))))))
(defun compile-mixed (cspec)
;; das koennten wir theoretisch auch COMPILE-CONTENT-MODEL erledigen lassen
(let ((allowed-names (cddr cspec)))
(labels ((doit (actual-name)
(cond
((null actual-name) t)
((member actual-name allowed-names :test #'rod=) #'doit)
(t nil))))
#'doit)))
(defun compile-content-model (cspec &optional (continuation #'cmodel-done))
(if (vectorp cspec)
(lambda (actual-name)
(if (and actual-name (rod= cspec actual-name))
continuation
nil))
(ecase (car cspec)
(and
(labels ((traverse (seq)
(compile-content-model (car seq)
(if (cdr seq)
(traverse (cdr seq))
continuation))))
(traverse (cdr cspec))))
(or
(let ((options (mapcar (rcurry #'compile-content-model continuation)
(cdr cspec))))
(lambda (actual-name)
(some (rcurry #'funcall actual-name) options))))
(?
(let ((maybe (compile-content-model (second cspec) continuation)))
(lambda (actual-name)
(or (funcall maybe actual-name)
(funcall continuation actual-name)))))
(*
(let (maybe-continuation)
(labels ((recurse (actual-name)
(if (null actual-name)
(funcall continuation actual-name)
(or (funcall maybe-continuation actual-name)
(funcall continuation actual-name)))))
(setf maybe-continuation
(compile-content-model (second cspec) #'recurse))
#'recurse)))
(+
(let ((it (cadr cspec)))
(compile-content-model `(and ,it (* ,it)) continuation))))))
(defun setp (list &key (test 'eql))
(equal list (remove-duplicates list :test test)))
(defun legal-content-model-p (cspec &optional validate)
(or (eq cspec :PCDATA)
(eq cspec :ANY)
(eq cspec :EMPTY)
(and (consp cspec)
(eq (car cspec) '*)
(consp (cadr cspec))
(eq (car (cadr cspec)) 'or)
(eq (cadr (cadr cspec)) :PCDATA)
(every #'vectorp (cddr (cadr cspec)))
(if (and validate (not (setp (cddr (cadr cspec)) :test #'rod=)))
(validity-error "VC: No Duplicate Types (07)")
t))
(labels ((walk (x)
(cond ((member x '(:PCDATA :ANY :EMPTY))
nil)
((atom x) t)
((and (walk (car x))
(walk (cdr x)))))))
(walk cspec))))
;; wir fahren besser, wenn wir machen:
;; cspec ::= 'EMPTY' | 'ANY' | '#PCDATA'
;; | Name
;; | cs
;; cs ::= '(' S? cspec ( S? '|' S? cspec)* S? ')' ('?' | '*' | '+')?
;; und eine post factum analyse
(defun p/cspec (input &optional recursivep)
(let ((term
(let ((names nil) op-cat op res stream)
(multiple-value-bind (cat sem) (peek-token input)
(cond ((eq cat :nmtoken)
(consume-token input)
(cond ((rod= sem '#.(string-rod "EMPTY"))
:EMPTY)
((rod= sem '#.(string-rod "ANY"))
:ANY)
((not recursivep)
(wf-error input "invalid content spec"))
(t
sem)))
((eq cat :\#PCDATA)
(consume-token input)
:PCDATA)
((eq cat :\()
(setf stream (car (zstream-input-stack input)))
(consume-token input)
(p/S? input)
(setq names (list (p/cspec input t)))
(p/S? input)
(cond ((member (peek-token input) '(:\| :\,))
(setf op-cat (peek-token input))
(setf op (if (eq op-cat :\,) 'and 'or))
(while (eq (peek-token input) op-cat)
(consume-token input)
(p/S? input)
(push (p/cspec input t) names)
(p/S? input))
(setf res (cons op (reverse names))))
(t
(setf res (cons 'and names))))
(p/S? input)
(expect input :\))
(when *validate*
(unless (eq stream (car (zstream-input-stack input)))
(validity-error "(06) Proper Group/PE Nesting")))
res)
(t
(wf-error input "p/cspec - ~s / ~s" cat sem)))))))
(cond ((eq (peek-token input) :?) (consume-token input) (list '? term))
((eq (peek-token input) :+) (consume-token input) (list '+ term))
((eq (peek-token input) :*) (consume-token input) (list '* term))
(t
term))))
(defun normalize-mixed-cspec (cspec)
;; der Parser oben funktioniert huebsch fuer die children-Regel, aber
;; fuer Mixed ist das Ergebnis nicht praktisch, denn dort wollen wir
;; eigentlich auf eine Liste von Namen in einheitlichem Format hinaus.
;; Dazu normalisieren wir einfach in eine der beiden folgenden Formen:
;; (* (or :PCDATA ...rods...)) -- und zwar exakt so!
;; :PCDATA -- sonst ganz trivial
(flet ((trivialp (c)
(and (consp c)
(and (eq (car c) 'and)
(eq (cadr c) :PCDATA)
(null (cddr c))))))
(if (or (trivialp cspec) ;(and PCDATA)
(and (consp cspec) ;(* (and PCDATA))
(and (eq (car cspec) '*)
(null (cddr cspec))
(trivialp (cadr cspec)))))
:PCDATA
cspec)))
;; [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
;; [52] AttlistDecl ::= '<!ATTLIST' S Name AttDefs S? '>'
;; [52] AttlistDecl ::= '<!ATTLIST' S Name S? '>'
;; [53] AttDefs ::= S Name S AttType S DefaultDecl AttDefs
;; [53] AttDefs ::=
(defun p/notation-decl (input)
(let (name id)
(expect input :|<!NOTATION|)
(p/S input)
(setf name (p/name input))
(p/S input)
(setf id (p/external-id input t))
(p/S? input)
(expect input :\>)
(sax:notation-declaration (handler *ctx*)
name
(if (extid-public id)
(normalize-public-id (extid-public id))
nil)
(uri-rod (extid-system id)))
(when (and sax:*namespace-processing* (find #/: name))
(wf-error input "colon in notation name"))
(when *validate*
(define-notation (dtd *ctx*) name id))
(list :notation-decl name id)))
(defun normalize-public-id (rod)
(with-rune-collector (collect)
(let ((gimme-20 nil)
(anything-seen-p nil))
(map nil (lambda (c)
(cond
((or (rune= c #/u+0009)
(rune= c #/u+000A)
(rune= c #/u+000D)
(rune= c #/u+0020))
(setf gimme-20 t))
(t
(when (and anything-seen-p gimme-20)
(collect #/u+0020))
(setf gimme-20 nil)
(setf anything-seen-p t)
(collect c))))
rod))))
;;;
(defun p/conditional-sect (input)
(expect input :<!\[ )
(let ((stream (car (zstream-input-stack input))))
(p/S? input)
(multiple-value-bind (cat sem) (read-token input)
(cond ((and (eq cat :nmtoken)
(rod= sem '#.(string-rod "INCLUDE")))
(p/include-sect input stream))
((and (eq cat :nmtoken)
(rod= sem '#.(string-rod "IGNORE")))
(p/ignore-sect input stream))
(t
(wf-error input "Expected INCLUDE or IGNORE after \"<![\"."))))))
(defun p/cond-expect (input cat initial-stream)
(expect input cat)
(when *validate*
(unless (eq (car (zstream-input-stack input)) initial-stream)
(validity-error "(21) Proper Conditional Section/PE Nesting"))))
(defun p/include-sect (input initial-stream)
;; <![INCLUDE is already read.
(p/S? input)
(p/cond-expect input :\[ initial-stream)
(p/ext-subset-decl input)
(p/cond-expect input :\] initial-stream)
(p/cond-expect input :\] initial-stream)
(p/cond-expect input :\> initial-stream))
(defun p/ignore-sect (input initial-stream)
;; <![IGNORE is already read.
;; XXX Is VC 21 being checked for nested sections?
(p/S? input)
(p/cond-expect input :\[ initial-stream)
(let ((input (car (zstream-input-stack input))))
(let ((level 0))
(do ((c1 (read-rune input) (read-rune input))
(c2 #/U+0000 c1)
(c3 #/U+0000 c2))
((= level -1))
(declare (type fixnum level))
(cond ((eq c1 :eof)
(eox input "EOF in <![IGNORE ... >")))
(cond ((and (rune= c3 #/<) (rune= c2 #/!) (rune= c1 #/\[))
(incf level)))
(cond ((and (rune= c3 #/\]) (rune= c2 #/\]) (rune= c1 #/>))
(decf level))) )))
(unless (eq (car (zstream-input-stack input)) initial-stream)
(validity-error "(21) Proper Conditional Section/PE Nesting")))
(defun p/ext-subset-decl (input)
;; ( markupdecl | conditionalSect | S )*
(loop
(case (let ((*expand-pe-p* nil)) (peek-token input))
(:|<![| (let ((*expand-pe-p* t)) (p/conditional-sect input)))
(:S (consume-token input))
(:eof (return))
((:|<!ELEMENT| :|<!ATTLIST| :|<!ENTITY| :|<!NOTATION| :PI :COMMENT)
(let ((*expand-pe-p* t)
(*external-subset-p* t))
(p/markup-decl input)))
((:PE-REFERENCE)
(let ((name (nth-value 1 (read-token input))))
(recurse-on-entity input name :parameter
(lambda (input)
(etypecase (checked-get-entdef name :parameter)
(external-entdef
(p/ext-subset input))
(internal-entdef
(p/ext-subset-decl input)))
(unless (eq :eof (peek-token input))
(wf-error input "Trailing garbage."))))))
(otherwise (return)))) )
(defun p/markup-decl (input)
(peek-token input)
(let ((stream (car (zstream-input-stack input))))
(multiple-value-prog1
(p/markup-decl-unsafe input)
(when *validate*
(unless (eq stream (car (zstream-input-stack input)))
(validity-error "(01) Proper Declaration/PE Nesting"))))))
(defun p/markup-decl-unsafe (input)
;; markupdecl ::= elementdecl | AttlistDecl /* VC: Proper Declaration/PE Nesting */
;; | EntityDecl | NotationDecl
;; | PI | Comment /* WFC: PEs in Internal Subset */
(let ((token (peek-token input))
(*expand-pe-p* (and *expand-pe-p* *external-subset-p*)))
(case token
(:|<!ELEMENT| (p/element-decl input))
(:|<!ATTLIST| (p/attlist-decl input))
(:|<!ENTITY| (p/entity-decl input))
(:|<!NOTATION| (p/notation-decl input))
(:PI
(let ((sem (nth-value 1 (read-token input))))
(sax:processing-instruction (handler *ctx*) (car sem) (cdr sem))))
(:COMMENT (consume-token input))
(otherwise
(wf-error input "p/markup-decl ~S" (peek-token input))))))
(defun setup-encoding (input xml-header)
(when (xml-header-encoding xml-header)
(let ((enc (find-encoding (xml-header-encoding xml-header))))
(cond (enc
(setf (xstream-encoding (car (zstream-input-stack input)))
enc))
(t
(warn "There is no such encoding: ~S." (xml-header-encoding xml-header)))))))
(defun set-full-speed (input)
(let ((xstream (car (zstream-input-stack input))))
(when xstream
(set-to-full-speed xstream))))
(defun p/ext-subset (input)
(cond ((eq (peek-token input) :xml-decl)
(let ((hd (parse-text-decl (cdr (nth-value 1 (peek-token input))))))
(setup-encoding input hd))
(consume-token input)))
(set-full-speed input)
(p/ext-subset-decl input)
(unless (eq (peek-token input) :eof)
(wf-error input "Trailing garbage - ~S." (peek-token input))))
(defvar *catalog* nil)
(defun extid-using-catalog (extid)
(if *catalog*
(let ((sysid
(resolve-extid (extid-public extid)
(extid-system extid)
*catalog*)))
(if sysid
(make-extid nil sysid)
extid))
extid))
(defun p/doctype-decl (input &optional dtd-extid)
(let ()
(let ((*expand-pe-p* nil)
name extid)
(expect input :|<!DOCTYPE|)
(p/S input)
(setq name (p/nmtoken input))
(when *validate*
(setf (model-stack *ctx*) (list (make-root-model name))))
(when (eq (peek-token input) :S)
(p/S input)
(unless (or (eq (peek-token input) :\[ )
(eq (peek-token input) :\> ))
(setf extid (p/external-id input t))))
(when dtd-extid
(setf extid dtd-extid))
(p/S? input)
(sax:start-dtd (handler *ctx*)
name
(and extid (extid-public extid))
(and extid (uri-rod (extid-system extid))))
(when (eq (peek-token input) :\[ )
(when (disallow-internal-subset *ctx*)
(wf-error input "document includes an internal subset"))
(ensure-dtd)
(consume-token input)
(sax:start-internal-subset (handler *ctx*))
(while (progn (p/S? input)
(not (eq (peek-token input) :\] )))
(if (eq (peek-token input) :PE-REFERENCE)
(let ((name (nth-value 1 (read-token input))))
(recurse-on-entity input name :parameter
(lambda (input)
(etypecase (checked-get-entdef name :parameter)
(external-entdef
(p/ext-subset input))
(internal-entdef
(p/ext-subset-decl input)))
(unless (eq :eof (peek-token input))
(wf-error input "Trailing garbage.")))))
(let ((*expand-pe-p* t))
(p/markup-decl input))))
(consume-token input)
(sax:end-internal-subset (handler *ctx*))
(p/S? input))
(expect input :>)
(when extid
(let* ((effective-extid
(extid-using-catalog (absolute-extid input extid)))
(sysid (extid-system effective-extid))
(fresh-dtd-p (null (dtd *ctx*)))
(cached-dtd
(and fresh-dtd-p
(not (standalone-p *ctx*))
(getdtd sysid *dtd-cache*))))
(cond
(cached-dtd
(setf (dtd *ctx*) cached-dtd)
(report-cached-dtd cached-dtd))
(t
(let ((xi2 (xstream-open-extid effective-extid)))
(with-zstream (zi2 :input-stack (list xi2))
(ensure-dtd)
(p/ext-subset zi2)
(when (and fresh-dtd-p
*cache-all-dtds*
*validate*
(not (standalone-p *ctx*)))
(setf (getdtd sysid *dtd-cache*) (dtd *ctx*)))))))))
(sax:end-dtd (handler *ctx*))
(let ((dtd (dtd *ctx*)))
(sax:entity-resolver
(handler *ctx*)
(lambda (name handler) (resolve-entity name handler dtd)))
(sax::dtd (handler *ctx*) dtd))
(list :DOCTYPE name extid))))
(defun report-cached-dtd (dtd)
(maphash (lambda (k v)
(report-entity (handler *ctx*) :general k (cdr v)))
(dtd-gentities dtd))
(maphash (lambda (k v)
(report-entity (handler *ctx*) :parameter k (cdr v)))
(dtd-pentities dtd))
(maphash (lambda (k v)
(sax:notation-declaration
(handler *ctx*)
k
(if (extid-public v)
(normalize-public-id (extid-public v))
nil)
(uri-rod (extid-system v))))
(dtd-notations dtd)))
(defun p/misc*-2 (input)
;; Misc*
(while (member (peek-token input) '(:COMMENT :PI :S))
(case (peek-token input)
(:COMMENT
(sax:comment (handler *ctx*) (nth-value 1 (peek-token input))))
(:PI
(sax:processing-instruction
(handler *ctx*)
(car (nth-value 1 (peek-token input)))
(cdr (nth-value 1 (peek-token input))))))
(consume-token input)))
(defun p/document
(input handler
&key validate dtd root entity-resolver disallow-internal-subset
(recode t))
;; check types of user-supplied arguments for better error messages:
(check-type validate boolean)
(check-type recode boolean)
(check-type dtd (or null extid))
(check-type root (or null rod))
(check-type entity-resolver (or null function symbol))
(check-type disallow-internal-subset boolean)
#+rune-is-integer
(when recode
(setf handler (make-recoder handler #'rod-to-utf8-string)))
(let* ((xstream (car (zstream-input-stack input)))
(name (xstream-name xstream))
(base (when name (stream-name-uri name)))
(*ctx*
(make-context :handler handler
:main-zstream input
:entity-resolver entity-resolver
:base-stack (list (or base ""))
:disallow-internal-subset disallow-internal-subset))
(*validate* validate)
(*namespace-bindings* *initial-namespace-bindings*))
(sax:register-sax-parser handler (make-instance 'cxml-parser :ctx *ctx*))
(sax:start-document handler)
;; document ::= XMLDecl? Misc* (doctypedecl Misc*)? element Misc*
;; Misc ::= Comment | PI | S
;; xmldecl::='<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
;; sddecl::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
(let ((*data-behaviour* :DTD))
;; optional XMLDecl?
(p/xmldecl input)
;; Misc*
(p/misc*-2 input)
;; (doctypedecl Misc*)?
(cond
((eq (peek-token input) :<!DOCTYPE)
(p/doctype-decl input dtd)
(p/misc*-2 input))
(dtd
(synthesize-doctype dtd input))
((and validate (not dtd))
(validity-error "invalid document: no doctype")))
(ensure-dtd)
;; Override expected root element if asked to
(when root
(setf (model-stack *ctx*) (list (make-root-model root))))
;; element
(let ((*data-behaviour* :DOC))
(fix-seen-< input)
(p/element input))
;; optional Misc*
(p/misc*-2 input)
(p/eof input)
(sax:end-document handler))))
(defun synthesize-doctype (dtd input)
(let ((dummy (string->xstream "<!DOCTYPE dummy>")))
(setf (xstream-name dummy)
(make-stream-name
:entity-name "dummy doctype"
:entity-kind :main
:uri (zstream-base-sysid input)))
(with-zstream (zstream :input-stack (list dummy))
(p/doctype-decl zstream dtd))))
(defun fix-seen-< (input)
(when (eq (peek-token input) :seen-<)
(multiple-value-bind (c s)
(read-token-after-|<| input (car (zstream-input-stack input)))
(setf (zstream-token-category input) c
(zstream-token-semantic input) s))))
(defun p/xmldecl (input)
;; we will use the attribute-value parser for the xml decl.
(prog1
(when (eq (peek-token input) :xml-decl)
(let ((hd (parse-xml-decl (cdr (nth-value 1 (peek-token input))))))
(setf (standalone-p *ctx*) (eq (xml-header-standalone-p hd) :yes))
(setup-encoding input hd)
(read-token input)
hd))
(set-full-speed input)))
(defun p/eof (input)
(unless (eq (peek-token input) :eof)
(wf-error input "Garbage at end of document."))
(when *validate*
(maphash (lambda (k v)
(unless v
(validity-error "(11) IDREF: ~S not defined" (rod-string k))))
(id-table *ctx*))
(dolist (name (referenced-notations *ctx*))
(unless (find-notation name (dtd *ctx*))
(validity-error "(23) Notation Declared: ~S" (rod-string name))))))
(defun p/element (input)
(multiple-value-bind (cat n-b new-b uri lname qname attrs) (p/sztag input)
(sax:start-element (handler *ctx*) uri lname qname attrs)
(when (eq cat :stag)
(let ((*namespace-bindings* n-b))
(p/content input))
(p/etag input qname))
(sax:end-element (handler *ctx*) uri lname qname)
(undeclare-namespaces new-b)
(pop (base-stack *ctx*))
(validate-end-element *ctx* qname)))
(defun p/sztag (input)
(multiple-value-bind (cat sem) (read-token input)
(case cat
((:stag :ztag))
(:eof (eox input))
(t (wf-error input "element expected")))
(destructuring-bind (&optional name &rest raw-attrs) sem
(validate-start-element *ctx* name)
(let* ((attrs
(process-attributes *ctx* name (build-attribute-list raw-attrs)))
(*namespace-bindings* *namespace-bindings*)
new-namespaces)
(when sax:*namespace-processing*
(setf new-namespaces (declare-namespaces attrs))
(mapc #'set-attribute-namespace attrs))
(push (compute-base attrs) (base-stack *ctx*))
(multiple-value-bind (uri prefix local-name)
(if sax:*namespace-processing*
(decode-qname name)
(values nil nil nil))
(declare (ignore prefix))
(check-attribute-uniqueness attrs)
(unless (or sax:*include-xmlns-attributes*
(null sax:*namespace-processing*))
(setf attrs
(remove-if (compose #'xmlns-attr-p #'sax:attribute-qname)
attrs)))
(values cat
*namespace-bindings*
new-namespaces
uri local-name name attrs))))))
(defun p/etag (input qname)
(multiple-value-bind (cat2 sem2) (read-token input)
(unless (and (eq cat2 :etag)
(eq (car sem2) qname))
(wf-error input "Bad nesting. ~S / ~S"
(mu qname)
(mu (cons cat2 sem2))))
(when (cdr sem2)
(wf-error input "no attributes allowed in end tag"))))
;; copy&paste from cxml-rng
(defun escape-uri (string)
(with-output-to-string (out)
(loop for c across (cxml::rod-to-utf8-string string) do
(let ((code (char-code c)))
;; http://www.w3.org/TR/xlink/#link-locators
(if (or (>= code 127) (<= code 32) (find c "<>\"{}|\\^`"))
(format out "%~2,'0X" code)
(write-char c out))))))
(defun compute-base (attrs)
(let ((new (sax:find-attribute #"xml:base" attrs))
(current (car (base-stack *ctx*))))
(if new
(puri:merge-uris (escape-uri (sax:attribute-value new)) current)
current)))
(defun process-characters (input sem)
(consume-token input)
(when (search #"]]>" sem)
(wf-error input "']]>' not allowed in CharData"))
(validate-characters *ctx* sem))
(defun process-cdata-section (input)
(consume-token input)
(let ((input (car (zstream-input-stack input))))
(unless (and (rune= #/C (read-rune input))
(rune= #/D (read-rune input))
(rune= #/A (read-rune input))
(rune= #/T (read-rune input))
(rune= #/A (read-rune input))
(rune= #/\[ (read-rune input)))
(wf-error input "After '<![', 'CDATA[' is expected."))
(validate-characters *ctx* #"hack") ;anything other than whitespace
(read-cdata-sect input)))
(defun p/content (input)
;; [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)*
(loop
(multiple-value-bind (cat sem) (peek-token input)
(case cat
((:stag :ztag)
(p/element input))
((:CDATA)
(process-characters input sem)
(sax:characters (handler *ctx*) sem))
((:ENTITY-REF)
(let ((name sem))
(consume-token input)
(recurse-on-entity input name :general
(lambda (input)
(prog1
(etypecase (checked-get-entdef name :general)
(internal-entdef (p/content input))
(external-entdef (p/ext-parsed-ent input)))
(unless (eq (peek-token input) :eof)
(wf-error input "Trailing garbage. - ~S"
(peek-token input))))))))
((:<!\[)
(let ((data (process-cdata-section input)))
(sax:start-cdata (handler *ctx*))
(sax:characters (handler *ctx*) data)
(sax:end-cdata (handler *ctx*))))
((:PI)
(consume-token input)
(sax:processing-instruction (handler *ctx*) (car sem) (cdr sem)))
((:COMMENT)
(consume-token input)
(sax:comment (handler *ctx*) sem))
(otherwise
(return))))))
;; [78] extParsedEnt ::= TextDecl? contentw
;; [79] extPE ::= TextDecl? extSubsetDecl
(defstruct xml-header
version
encoding
(standalone-p nil))
(defun p/ext-parsed-ent (input)
;; [78] extParsedEnt ::= '<?xml' VersionInfo? EncodingDecl S? '?>' content
(when (eq (peek-token input) :xml-decl)
(let ((hd (parse-text-decl (cdr (nth-value 1 (peek-token input))))))
(setup-encoding input hd))
(consume-token input))
(set-full-speed input)
(p/content input))
(defun parse-xml-decl (content)
(let* ((res (make-xml-header))
(i (make-rod-xstream content)))
(with-zstream (z :input-stack (list i))
(let ((atts (read-attribute-list z i t)))
(unless (eq (peek-rune i) :eof)
(wf-error i "Garbage at end of XMLDecl."))
;; versioninfo muss da sein
;; dann ? encodingdecl
;; dann ? sddecl
;; dann ende
(unless (eq (caar atts) (intern-name '#.(string-rod "version")))
(wf-error i "XMLDecl needs version."))
(unless (and (>= (length (cdar atts)) 1)
(every (lambda (x)
(or (rune<= #/a x #/z)
(rune<= #/A x #/Z)
(rune<= #/0 x #/9)
(rune= x #/_)
(rune= x #/.)
(rune= x #/:)
(rune= x #/-)))
(cdar atts)))
(wf-error i"Bad XML version number: ~S." (rod-string (cdar atts))))
(setf (xml-header-version res) (rod-string (cdar atts)))
(pop atts)
(when (eq (caar atts) (intern-name '#.(string-rod "encoding")))
(unless (and (>= (length (cdar atts)) 1)
(every (lambda (x)
(or (rune<= #/a x #/z)
(rune<= #/A x #/Z)
(rune<= #/0 x #/9)
(rune= x #/_)
(rune= x #/.)
(rune= x #/-)))
(cdar atts))
((lambda (x)
(or (rune<= #/a x #/z)
(rune<= #/A x #/Z)))
(aref (cdar atts) 0)))
(wf-error i "Bad XML encoding name: ~S." (rod-string (cdar atts))))
(setf (xml-header-encoding res) (rod-string (cdar atts)))
(pop atts))
(when (eq (caar atts) (intern-name '#.(string-rod "standalone")))
(unless (or (rod= (cdar atts) '#.(string-rod "yes"))
(rod= (cdar atts) '#.(string-rod "no")))
(wf-error i "XMLDecl's 'standalone' attribute must be exactly \"yes\" or \"no\" and not ~S."
(rod-string (cdar atts))))
(setf (xml-header-standalone-p res)
(if (rod-equal '#.(string-rod "yes") (cdar atts))
:yes
:no))
(pop atts))
(when atts
(wf-error i "Garbage in XMLDecl: ~A" (rod-string content)))
res))))
(defun parse-text-decl (content)
(let* ((res (make-xml-header))
(i (make-rod-xstream content)))
(with-zstream (z :input-stack (list i))
(let ((atts (read-attribute-list z i t)))
(unless (eq (peek-rune i) :eof)
(wf-error i "Garbage at end of TextDecl"))
;; versioninfo optional
;; encodingdecl muss da sein
;; dann ende
(when (eq (caar atts) (intern-name '#.(string-rod "version")))
(unless (and (>= (length (cdar atts)) 1)
(every (lambda (x)
(or (rune<= #/a x #/z)
(rune<= #/A x #/Z)
(rune<= #/0 x #/9)
(rune= x #/_)
(rune= x #/.)
(rune= x #/:)
(rune= x #/-)))
(cdar atts)))
(wf-error i "Bad XML version number: ~S." (rod-string (cdar atts))))
(setf (xml-header-version res) (rod-string (cdar atts)))
(pop atts))
(unless (eq (caar atts) (intern-name '#.(string-rod "encoding")))
(wf-error i "TextDecl needs encoding."))
(unless (and (>= (length (cdar atts)) 1)
(every (lambda (x)
(or (rune<= #/a x #/z)
(rune<= #/A x #/Z)
(rune<= #/0 x #/9)
(rune= x #/_)
(rune= x #/.)
(rune= x #/-)))
(cdar atts))
((lambda (x)
(or (rune<= #/a x #/z)
(rune<= #/A x #/Z)
(rune<= #/0 x #/9)))
(aref (cdar atts) 0)))
(wf-error i "Bad XML encoding name: ~S." (rod-string (cdar atts))))
(setf (xml-header-encoding res) (rod-string (cdar atts)))
(pop atts)
(when atts
(wf-error i "Garbage in TextDecl: ~A" (rod-string content)))))
res))
;;;; ---------------------------------------------------------------------------
;;;; mu
;;;;
(defun mu (x)
(cond ((stringp x) x)
((vectorp x) (rod-string x))
((consp x)
(cons (mu (car x)) (mu (cdr x))))
(x)))
;;;; ---------------------------------------------------------------------------
;;;; User interface ;;;;
#-cxml-system::uri-is-namestring
(defun specific-or (component &optional (alternative nil))
(if (eq component :unspecific)
alternative
component))
(defun string-or (str &optional (alternative nil))
(if (zerop (length str))
alternative
str))
#-cxml-system::uri-is-namestring
(defun make-uri (&rest initargs &key path query &allow-other-keys)
(apply #'make-instance
'puri:uri
:path (and path (escape-path path))
:query (and query (escape-query query))
initargs))
#-cxml-system::uri-is-namestring
(defun escape-path (list)
(puri::render-parsed-path list t))
#-cxml-system::uri-is-namestring
(defun escape-query (pairs)
(flet ((escape (str)
(puri::encode-escaped-encoding str puri::*reserved-characters* t)))
(let ((first t))
(with-output-to-string (s)
(dolist (pair pairs)
(if first
(setf first nil)
(write-char #\& s))
(write-string (escape (car pair)) s)
(write-char #\= s)
(write-string (escape (cdr pair)) s))))))
#-cxml-system::uri-is-namestring
(defun uri-parsed-query (uri)
(flet ((unescape (str)
(puri::decode-escaped-encoding str t puri::*reserved-characters*)))
(let ((str (puri:uri-query uri)))
(cond
(str
(let ((pairs '()))
(dolist (s (split-sequence-if (lambda (x) (eql x #\&)) str))
(destructuring-bind (name value)
(split-sequence-if (lambda (x) (eql x #\=)) s)
(push (cons (unescape name) (unescape value)) pairs)))
(reverse pairs)))
(t
nil)))))
#-cxml-system::uri-is-namestring
(defun query-value (name alist)
(cdr (assoc name alist :test #'equal)))
#-cxml-system::uri-is-namestring
(defun pathname-to-uri (pathname)
(let ((path
(append (pathname-directory pathname)
(list
(if (specific-or (pathname-type pathname))
(concatenate 'string
(pathname-name pathname)
"."
(pathname-type pathname))
(pathname-name pathname))))))
(if (eq (car path) :relative)
(make-uri :path path)
(make-uri :scheme :file
:host (concatenate 'string
(string-or (host-namestring pathname))
"+"
(specific-or (pathname-device pathname)))
:path path))))
#+cxml-system::uri-is-namestring
(defun pathname-to-uri (pathname)
(puri:parse-uri (namestring pathname)))
#-cxml-system::uri-is-namestring
(defun parse-name.type (str)
(if str
(let ((i (position #\. str :from-end t)))
(if i
(values (subseq str 0 i) (subseq str (1+ i)))
(values str nil)))
(values nil nil)))
#-cxml-system::uri-is-namestring
(defun uri-to-pathname (uri)
(let ((scheme (puri:uri-scheme uri))
(path (puri:uri-parsed-path uri)))
(unless (member scheme '(nil :file))
(error 'xml-parse-error
:format-control "URI scheme ~S not supported"
:format-arguments (list scheme)))
(if (eq (car path) :relative)
(multiple-value-bind (name type)
(parse-name.type (car (last path)))
(make-pathname :directory (butlast path)
:name name
:type type))
(multiple-value-bind (name type)
(parse-name.type (car (last (cdr path))))
(destructuring-bind (host device)
(split-sequence-if (lambda (x) (eql x #\+))
(or (puri:uri-host uri) "+"))
(make-pathname :host (string-or host)
:device (string-or device)
:directory (cons :absolute (butlast (cdr path)))
:name name
:type type))))))
#+cxml-system::uri-is-namestring
(defun uri-to-pathname (uri)
(let ((pathname (puri:render-uri uri nil)))
(when (equalp (pathname-host pathname) "+")
(setf (slot-value pathname 'lisp::host) "localhost"))
pathname))
(defun parse
(input handler &rest args
&key validate dtd root entity-resolver disallow-internal-subset
recode pathname)
(declare (ignore validate dtd root entity-resolver disallow-internal-subset
recode))
(let ((args
(loop
for (name value) on args by #'cddr
unless (eq name :pathname)
append (list name value))))
(etypecase input
(xstream (apply #'parse-xstream input handler args))
(pathname (apply #'parse-file input handler args))
(rod (apply #'parse-rod input handler args))
(array (apply #'parse-octets input handler args))
(stream
(let ((xstream (make-xstream input :speed 8192)))
(setf (xstream-name xstream)
(make-stream-name
:entity-name "main document"
:entity-kind :main
:uri (pathname-to-uri
(merge-pathnames (or pathname (pathname input))))))
(apply #'parse-xstream xstream handler args))))))
(defun parse-xstream (xstream handler &rest args)
(let ((*ctx* nil))
(handler-case
(with-zstream (zstream :input-stack (list xstream))
(peek-rune xstream)
(with-scratch-pads ()
(apply #'p/document zstream handler args)))
(runes-encoding:encoding-error (c)
(wf-error xstream "~A" c)))))
(defun parse-file (filename handler &rest args)
(with-open-xfile (input filename)
(setf (xstream-name input)
(make-stream-name
:entity-name "main document"
:entity-kind :main
:uri (pathname-to-uri (merge-pathnames filename))))
(apply #'parse-xstream input handler args)))
(defun resolve-synonym-stream (stream)
(while (typep stream 'synonym-stream)
(setf stream (symbol-value (synonym-stream-symbol stream))))
stream)
(defun safe-stream-sysid (stream)
(if (and (typep (resolve-synonym-stream stream) 'file-stream)
;; ignore-errors, because sb-bsd-sockets creates instances of
;; FILE-STREAMs that aren't
(ignore-errors (pathname stream)))
(pathname-to-uri (merge-pathnames (pathname stream)))
nil))
(defun parse-stream (stream handler &rest args)
(let ((xstream
(make-xstream
stream
:name (make-stream-name
:entity-name "main document"
:entity-kind :main
:uri (safe-stream-sysid stream))
:initial-speed 1)))
(apply #'parse-xstream xstream handler args)))
(defun parse-empty-document
(uri qname handler &key public-id system-id entity-resolver (recode t))
(check-type uri (or null rod))
(check-type qname (or null rod))
(check-type public-id (or null rod))
(check-type system-id (or null puri:uri))
(check-type entity-resolver (or null function symbol))
(check-type recode boolean)
#+rune-is-integer
(when recode
(setf handler (make-recoder handler #'rod-to-utf8-string)))
(let ((*ctx*
(make-context :handler handler :entity-resolver entity-resolver))
(*validate* nil)
(extid
(when (or public-id system-id)
(extid-using-catalog (make-extid public-id system-id)))))
(sax:start-document handler)
(when extid
(sax:start-dtd handler
qname
(and public-id)
(and system-id (uri-rod system-id)))
(setf (dtd *ctx*) (getdtd (extid-system extid) *dtd-cache*))
(unless (dtd *ctx*)
(with-scratch-pads ()
(let ((*data-behaviour* :DTD))
(let ((xi2 (xstream-open-extid extid)))
(with-zstream (zi2 :input-stack (list xi2))
(ensure-dtd)
(p/ext-subset zi2))))))
(sax:end-dtd handler)
(let ((dtd (dtd *ctx*)))
(sax:entity-resolver handler (lambda (n h) (resolve-entity n h dtd)))
(sax::dtd handler dtd)))
(ensure-dtd)
(when (or uri qname)
(let* ((attrs
(when uri
(list (sax:make-attribute :qname #"xmlns"
:value (rod uri)
:specified-p t))))
(*namespace-bindings* *namespace-bindings*)
new-namespaces)
(when sax:*namespace-processing*
(setf new-namespaces (declare-namespaces attrs))
(mapc #'set-attribute-namespace attrs))
(multiple-value-bind (uri prefix local-name)
(if sax:*namespace-processing* (decode-qname qname) nil)
(declare (ignore prefix))
(unless (or sax:*include-xmlns-attributes*
(null sax:*namespace-processing*))
(setf attrs nil))
(sax:start-element (handler *ctx*) uri local-name qname attrs)
(sax:end-element (handler *ctx*) uri local-name qname))
(undeclare-namespaces new-namespaces)))
(sax:end-document handler)))
(defun parse-dtd-file (filename &optional handler)
(with-open-file (s filename :element-type '(unsigned-byte 8))
(parse-dtd-stream s handler)))
(defun parse-dtd-stream (stream &optional handler)
(let ((input (make-xstream stream)))
(setf (xstream-name input)
(make-stream-name
:entity-name "dtd"
:entity-kind :main
:uri (safe-stream-sysid stream)))
(let ((*ctx* (make-context :handler handler))
(*validate* t)
(*data-behaviour* :DTD))
(with-zstream (zstream :input-stack (list input))
(with-scratch-pads ()
(ensure-dtd)
(peek-rune input)
(p/ext-subset zstream)
(dtd *ctx*))))))
(defun parse-rod (string handler &rest args)
(let ((xstream (string->xstream string)))
(setf (xstream-name xstream)
(make-stream-name
:entity-name "main document"
:entity-kind :main
:uri nil))
(apply #'parse-xstream xstream handler args)))
(defun string->xstream (string)
(make-rod-xstream (string-rod string)))
(defun parse-octets (octets handler &rest args)
(apply #'parse-stream (make-octet-input-stream octets) handler args))
;;;;
(defun zstream-push (new-xstream zstream)
(cond ((find-if (lambda (x)
(and (xstream-p x)
(eql (stream-name-entity-name (xstream-name x))
(stream-name-entity-name (xstream-name new-xstream)))
(eql (stream-name-entity-kind (xstream-name x))
(stream-name-entity-kind (xstream-name new-xstream)))))
(zstream-input-stack zstream))
(wf-error zstream "Infinite recursion.")))
(push new-xstream (zstream-input-stack zstream))
zstream)
(defun recurse-on-entity (zstream name kind continuation &optional internalp)
(assert (not (zstream-token-category zstream)))
(call-with-entity-expansion-as-stream
zstream
(lambda (new-xstream)
(push :stop (zstream-input-stack zstream))
(zstream-push new-xstream zstream)
(prog1
(funcall continuation zstream)
(assert (eq (peek-token zstream) :eof))
(assert (eq (pop (zstream-input-stack zstream)) new-xstream))
(close-xstream new-xstream)
(assert (eq (pop (zstream-input-stack zstream)) :stop))
(setf (zstream-token-category zstream) nil)
'(consume-token zstream)) )
name
kind
internalp))
#||
(defmacro read-data-until* ((predicate input res res-start res-end) &body body)
;; fast variant -- for now disabled for no apparent reason
;; -> res, res-start, res-end
`(let* ((rptr (xstream-read-ptr ,input))
(p0 rptr)
(fptr (xstream-fill-ptr ,input))
(buf (xstream-buffer ,input))
,res ,res-start ,res-end)
(declare (type fixnum rptr fptr p0)
(type (simple-array read-element (*)) buf))
(loop
(cond ((%= rptr fptr)
;; underflow -- hmm inject the scratch-pad with what we
;; read and continue, while using read-rune and collecting
;; d.h. besser waere hier auch while-reading zu benutzen.
(setf (xstream-read-ptr ,input) rptr)
(multiple-value-setq (,res ,res-start ,res-end)
(with-rune-collector/raw (collect)
(do ((i p0 (%+ i 1)))
((%= i rptr))
(collect (%rune buf i)))
(let (c)
(loop
(cond ((%= rptr fptr)
(setf (xstream-read-ptr ,input) rptr)
(setf c (peek-rune input))
(cond ((eq c :eof)
(return)))
(setf rptr (xstream-read-ptr ,input)
fptr (xstream-fill-ptr ,input)
buf (xstream-buffer ,input)))
(t
(setf c (%rune buf rptr))))
(cond ((,predicate c)
;; we stop
(setf (xstream-read-ptr ,input) rptr)
(return))
(t
;; we continue
(collect c)
(setf rptr (%+ rptr 1))) )))))
(return))
((,predicate (%rune buf rptr))
;; we stop
(setf (xstream-read-ptr ,input) rptr)
(setf ,res buf ,res-start p0 ,res-end rptr)
(return) )
(t
we continue
(sf rptr (%+ rptr 1))) ))
,@body ))
||#
(defmacro read-data-until* ((predicate input res res-start res-end) &body body)
"Read data from `input' until `predicate' applied to the read char
turns true. Then execute `body' with `res', `res-start', `res-end'
bound to denote a subsequence (of RUNEs) containing the read portion.
The rune upon which `predicate' turned true is neither consumed from
the stream, nor included in `res'.
Keep the predicate short, this it may be included more than once into
the macro's expansion."
;;
(let ((input-var (gensym))
(collect (gensym))
(c (gensym)))
`(LET ((,input-var ,input))
(MULTIPLE-VALUE-BIND (,res ,res-start ,res-end)
(WITH-RUNE-COLLECTOR/RAW (,collect)
(LOOP
(LET ((,c (PEEK-RUNE ,input-var)))
(COND ((EQ ,c :EOF)
;; xxx error message
(RETURN))
((FUNCALL ,predicate ,c)
(RETURN))
(t
(,collect ,c)
(CONSUME-RUNE ,input-var))))))
(LOCALLY
,@body)))))
(defun read-name-token (input)
(read-data-until* ((lambda (rune)
(declare (type rune rune))
(not (name-rune-p rune)))
input
r rs re)
(intern-name r rs re)))
(defun read-cdata (input)
(read-data-until* ((lambda (rune)
(declare (type rune rune))
(when (and (%rune< rune #/U+0020)
(not (or (%rune= rune #/U+0009)
(%rune= rune #/U+000a)
(%rune= rune #/U+000d))))
(wf-error input "code point invalid: ~A" rune))
(or (%rune= rune #/<) (%rune= rune #/&)))
input
source start end)
(locally
(declare (type (simple-array rune (*)) source)
(type ufixnum start)
(type ufixnum end)
(optimize (speed 3) (safety 0)))
(let ((res (make-array (%- end start) :element-type 'rune)))
(declare (type (simple-array rune (*)) res))
(let ((i (%- end start)))
(declare (type ufixnum i))
(loop
(setf i (- i 1))
(setf (%rune res i) (%rune source (the ufixnum (+ i start))))
(when (= i 0)
(return))))
res))))
;; used only by read-att-value-2
(defun internal-entity-expansion (name)
(let ((def (get-entity-definition name :general (dtd *ctx*))))
(unless def
(wf-error nil "Entity '~A' is not defined." (rod-string name)))
(unless (typep def 'internal-entdef)
(wf-error nil "Entity '~A' is not an internal entity." name))
(or (entdef-expansion def)
(setf (entdef-expansion def) (find-internal-entity-expansion name)))))
;; used only by read-att-value-2
(defun find-internal-entity-expansion (name)
(with-zstream (zinput)
(with-rune-collector-3 (collect)
(labels ((muffle (input)
(let (c)
(loop
(setf c (read-rune input))
(cond ((eq c :eof)
(return))
((rune= c #/&)
(setf c (peek-rune input))
(cond ((eql c :eof)
(eox input))
((rune= c #/#)
(let ((c (read-character-reference input)))
(%put-unicode-char c collect)))
(t
(unless (name-start-rune-p c)
(wf-error zinput "Expecting name after &."))
(let ((name (read-name-token input)))
(setf c (read-rune input))
(check-rune input c #/\;)
(recurse-on-entity
zinput name :general
(lambda (zinput)
(muffle (car (zstream-input-stack zinput)))))))))
((rune= c #/<)
(wf-error zinput "unexpected #\/<"))
((space-rune-p c)
(collect #/space))
((not (data-rune-p c))
(wf-error zinput "illegal char: ~S." c))
(t
(collect c)))))))
(declare (dynamic-extent #'muffle))
(recurse-on-entity
zinput name :general
(lambda (zinput)
(muffle (car (zstream-input-stack zinput)))))))))
;; callback for DOM
(defun resolve-entity (name handler dtd)
(let ((*validate* nil))
(if (get-entity-definition name :general dtd)
(let* ((*ctx* (make-context :handler handler :dtd dtd))
(*data-behaviour* :DOC))
(with-zstream (input)
(with-scratch-pads ()
(recurse-on-entity
input name :general
(lambda (input)
(prog1
(etypecase (checked-get-entdef name :general)
(internal-entdef (p/content input))
(external-entdef (p/ext-parsed-ent input)))
(unless (eq (peek-token input) :eof)
(wf-error input "Trailing garbage. - ~S"
(peek-token input)))))))))
nil)))
(defun read-att-value-2 (input)
(let ((delim (read-rune input)))
(when (eql delim :eof)
(eox input))
(unless (member delim '(#/\" #/\') :test #'eql)
(wf-error input
"Bad attribute value delimiter ~S, must be either #\\\" or #\\\'."
(rune-char delim)))
(with-rune-collector-4 (collect)
(loop
(let ((c (read-rune input)))
(cond ((eq c :eof)
(eox input "EOF"))
((rune= c delim)
(return))
((rune= c #/<)
(wf-error input "'<' not allowed in attribute values"))
((rune= #/& c)
(multiple-value-bind (kind sem) (read-entity-like input)
(ecase kind
(:CHARACTER-REFERENCE
(%put-unicode-char sem collect))
(:ENTITY-REFERENCE
(let* ((exp (internal-entity-expansion sem))
(n (length exp)))
(declare (type (simple-array rune (*)) exp))
(do ((i 0 (%+ i 1)))
((%= i n))
(collect (%rune exp i))))))))
((space-rune-p c)
(collect #/u+0020))
(t
(collect c))))))))
;;;;;;;;;;;;;;;;;
;;; Namespace stuff
;; We already know that name is part of a valid XML name, so all we
;; have to check is that the first rune is a name-start-rune and that
;; there is not colon in it.
(defun nc-name-p (name)
(and (plusp (length name))
(name-start-rune-p (rune name 0))
(notany #'(lambda (rune) (rune= #/: rune)) name)))
(defun split-qname (qname)
(declare (type runes:simple-rod qname))
(let ((pos (position #/: qname)))
(if pos
(let ((prefix (subseq qname 0 pos))
(local-name (subseq qname (1+ pos))))
(when (zerop pos)
(wf-error nil "empty namespace prefix"))
(if (nc-name-p local-name)
(values prefix local-name)
(wf-error nil "~S is not a valid NcName."
(rod-string local-name))))
(values () qname))))
(defun decode-qname (qname)
"decode-qname name => namespace-uri, prefix, local-name"
(declare (type runes:simple-rod qname))
(multiple-value-bind (prefix local-name) (split-qname qname)
(let ((uri (find-namespace-binding prefix)))
(if uri
(values uri prefix local-name)
(values nil nil qname)))))
(defun find-namespace-binding (prefix)
(cdr (or (assoc (or prefix #"") *namespace-bindings* :test #'rod=)
(wf-error nil "Undeclared namespace prefix: ~A" (rod-string prefix)))))
;; FIXME: Should probably be refactored by adding :start and :end to rod=/rod-equal
(defun rod-starts-with (prefix rod)
(and (<= (length prefix) (length rod))
(dotimes (i (length prefix) t)
(unless (rune= (rune prefix i) (rune rod i))
(return nil)))))
(defun xmlns-attr-p (attr-name)
(rod-starts-with #.(string-rod "xmlns") attr-name))
(defun attrname->prefix (attrname)
(if (< 5 (length attrname))
(subseq attrname 6)
nil))
(defun find-namespace-declarations (attributes)
(loop
for attribute in attributes
for qname = (sax:attribute-qname attribute)
when (xmlns-attr-p qname)
collect (cons (attrname->prefix qname) (sax:attribute-value attribute))))
(defun declare-namespaces (attributes)
(let ((ns-decls (find-namespace-declarations attributes)))
(dolist (ns-decl ns-decls)
;; check some namespace validity constraints
(let ((prefix (car ns-decl))
(uri (cdr ns-decl)))
(cond
((and (rod= prefix #"xml")
(not (rod= uri #"http://www.w3.org/XML/1998/namespace")))
(wf-error nil
"Attempt to rebind the prefix \"xml\" to ~S." (mu uri)))
((and (rod= uri #"http://www.w3.org/XML/1998/namespace")
(not (rod= prefix #"xml")))
(wf-error nil
"The namespace ~
URI \"http://www.w3.org/XML/1998/namespace\" may not ~
be bound to the prefix ~S, only \"xml\" is legal."
(mu prefix)))
((and (rod= prefix #"xmlns")
(rod= uri #"http://www.w3.org/2000/xmlns/"))
(wf-error nil
"Attempt to bind the prefix \"xmlns\" to its predefined ~
URI \"http://www.w3.org/2000/xmlns/\", which is ~
forbidden for no good reason."))
((rod= prefix #"xmlns")
(wf-error nil
"Attempt to bind the prefix \"xmlns\" to the URI ~S, ~
but it may not be declared." (mu uri)))
((rod= uri #"http://www.w3.org/2000/xmlns/")
(wf-error nil
"The namespace URI \"http://www.w3.org/2000/xmlns/\" may ~
not be bound to prefix ~S (or any other)." (mu prefix)))
((and (rod= uri #"") prefix)
(wf-error nil
"Only the default namespace (the one without a prefix) ~
may be bound to an empty namespace URI, thus ~
undeclaring it."))
(t
(push (cons prefix (if (rod= #"" uri) nil uri))
*namespace-bindings*)
(sax:start-prefix-mapping (handler *ctx*)
(car ns-decl)
(cdr ns-decl))))))
ns-decls))
(defun undeclare-namespaces (ns-decls)
(dolist (ns-decl ns-decls)
(sax:end-prefix-mapping (handler *ctx*) (car ns-decl))))
(defun build-attribute-list (attr-alist)
;; fixme: if there is a reason this function reverses attribute order,
;; it should be documented.
(let (attributes)
(dolist (pair attr-alist)
(push (sax:make-attribute :qname (car pair)
:value (cdr pair)
:specified-p t)
attributes))
attributes))
(defun check-attribute-uniqueness (attributes)
;; 5.3 Uniqueness of Attributes
;; In XML documents conforming to [the xmlns] specification, no
;; tag may contain two attributes which:
;; 1. have identical names, or
;; 2. have qualified names with the same local part and with
;; prefixes which have been bound to namespace names that are
;; identical.
;;
;; 1. is checked by read-tag-2, so we only deal with 2 here
(loop for (attr-1 . rest) on attributes do
(when (and (sax:attribute-namespace-uri attr-1)
(find-if (lambda (attr-2)
(and (rod= (sax:attribute-namespace-uri attr-1)
(sax:attribute-namespace-uri attr-2))
(rod= (sax:attribute-local-name attr-1)
(sax:attribute-local-name attr-2))))
rest))
(wf-error nil
"Multiple definitions of attribute ~S in namespace ~S."
(mu (sax:attribute-local-name attr-1))
(mu (sax:attribute-namespace-uri attr-1))))))
(defun set-attribute-namespace (attribute)
(let ((qname (sax:attribute-qname attribute)))
(if (and sax:*use-xmlns-namespace* (rod= qname #"xmlns"))
(setf (sax:attribute-namespace-uri attribute)
#"http://www.w3.org/2000/xmlns/")
(multiple-value-bind (prefix local-name) (split-qname qname)
(when (and prefix ;; default namespace doesn't apply to attributes
(or (not (rod= #"xmlns" prefix))
sax:*use-xmlns-namespace*))
(setf (sax:attribute-namespace-uri attribute)
(decode-qname qname)))
(setf (sax:attribute-local-name attribute) local-name)))))
;;;;;;;;;;;;;;;;;
;; System Identifier Protocol
;; A system identifier is an object obeying to the system identifier
;; protocol. Often something like an URL or a pathname.
;; OPEN-SYS-ID sys-id [generic function]
;;
;; Opens the resource associated with the system identifier `sys-id'
;; for reading and returns a stream. For now it is expected, that the
;; stream is an octet stream (one of element type (unsigned-byte 8)).
;;
;; More precisely: The returned object only has to obey to the xstream
;; controller protocol. (That is it has to provide implementations for
;; READ-OCTETS and XSTREAM-CONTROLLER-CLOSE).
;; MERGE-SYS-ID sys-id base [generic function]
;;
;; Merges two system identifiers. That is resolve `sys-id' relative to
;; `base' yielding an absolute system identifier suitable for
;; OPEN-SYS-ID.
;;;;;;;;;;;;;;;;;
;;; SAX validation handler
(defclass validator ()
((context :initarg :context :accessor context)
(cdatap :initform nil :accessor cdatap)))
(defun make-validator (dtd root)
(make-instance 'validator
:context (make-context
:handler nil
:dtd dtd
:model-stack (list (make-root-model root)))))
(macrolet ((with-context ((validator) &body body)
`(let ((*ctx* (context ,validator))
(*validate* t))
(with-scratch-pads () ;nicht schoen
,@body))))
(defmethod sax:start-element ((handler validator) uri lname qname attributes)
uri lname
(with-context (handler)
(validate-start-element *ctx* qname)
(process-attributes *ctx* qname attributes)))
(defmethod sax:start-cdata ((handler validator))
(setf (cdatap handler) t))
(defmethod sax:characters ((handler validator) data)
(with-context (handler)
(validate-characters *ctx* (if (cdatap handler) #"hack" data))))
(defmethod sax:end-cdata ((handler validator))
(setf (cdatap handler) nil))
(defmethod sax:end-element ((handler validator) uri lname qname)
uri lname
(with-context (handler)
(validate-end-element *ctx* qname))))