;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CXML; readtable: runes; -*- ;;; --------------------------------------------------------------------------- ;;; Title: XML parser ;;; Created: 1999-07-17 ;;; Author: Gilbert Baumann ;;; Author: Henrik Motakef ;;; Author: David Lichteblau ;;; License: LGPL (See file COPYING for details). ;;; --------------------------------------------------------------------------- ;;; © copyright 1999 by Gilbert Baumann ;;; © copyright 2003 by Henrik Motakef ;;; © copyright 2004 knowledgeTools Int. GmbH ;;; © copyright 2004 David Lichteblau ;;; This library is free software; you can redistribute it and/or ;;; modify it under the terms of the GNU Library General Public ;;; License as published by the Free Software Foundation; either ;;; version 2 of the License, or (at your option) any later version. ;;; ;;; This library is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;; Library General Public License for more details. ;;; ;;; You should have received a copy of the GNU Library General Public ;;; License along with this library; if not, write to the ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;;; Boston, MA 02111-1307 USA. ;;; Streams ;;; xstreams ;; For reading runes, I defined my own streams, called xstreams, ;; because we want to be fast. A function call or even a method call ;; per character is not acceptable, instead of that we define a ;; buffered stream with and advertised buffer layout, so that we ;; could use the trick stdio uses: READ-RUNE and PEEK-RUNE are macros, ;; directly accessing the buffer and only calling some underflow ;; handler in case of stream underflows. This will yield to quite a ;; performance boost vs calling READ-BYTE per character. ;; Also we need to do encoding t conversion on ; this better done at large chunks of data rather than on a character ;; by character basis. This way we need a dispatch on the active ;; encoding only once in a while, instead of for each character. This ;; allows us to use a CLOS interface to do the underflow handling. ;;; zstreams ;; Now, for reading tokens, we define another kind of streams, called ;; zstreams. These zstreams also maintain an input stack to implement ;; inclusion of external entities. This input stack contains xstreams ;; or the special marker :STOP. Such a :STOP marker indicates, that ;; input should not continue there, but well stop; that is simulate an ;; EOF. The user is then responsible to pop this marker off the input ;; stack. ;; ;; This input stack is also used to detect circular entity inclusion. ;; The zstream tokenizer recognizes the following types of tokens and ;; is controlled by the *DATA-BEHAVIOUR* flag. (Which should become a ;; slot of zstreams instead). ;; Common ;; :xml-pi ( . ) ;processing-instruction starting with " . ) ;processing-instruction ;; :stag ( . ) ;start tag ;; :etag ( . ) ;end tag ;; :ztag ( . ) ;empty tag ;; : ;; *data-behaviour* = :DTD ;; ;; :name ;; :#required ;; :#implied ;; :#fixed ;; :#pcdata ;; :s ;; :\[ :\] :\( :\) :|\ :\> :\" :\' :\, :\? :\* :\+ ;; *data-behaviour* = :DOC ;; ;; :entity-ref ;; :cdata ;;; NOTES ;; ;; Stream buffers as well as RODs are supposed to be encoded in ;; UTF-16. ;; where does the time go? ;; DATA-RUNE-P ;; CANON-NOT-CDATA-ATTVAL ;; READ-ATTVAL (MUFFLE) ;; CLOSy DOM ;; UTF-8 decoder (13%) ;; READ-ATTVAL (10%) ;; ;;; TODO ;; ;; o Improve error messages: ;; - line and column number counters ;; - better texts ;; - better handling of errors (no crash'n burn behaviour) ;; ;; o provide for a faster DOM ;; ;; o morph zstream into a context object and thus also get rid of ;; special variables. Put the current DTD there too. ;; [partly done] ;; o the *scratch-pad* hack should become something much more ;; reentrant, we could either define a system-wide resource ;; or allocate some scratch-pads per context. ;; [for thread-safety reasons the array are allocated per context now, ;; reentrancy is still open] ;; o CR handling in utf-16 deocders ;; ;; o UCS-4 reader ;; ;; o max depth together with circle detection ;; (or proof, that our circle detection is enough). ;; ;; o element definitions (with att definitions in the elements) ;; [das haben wir doch, oder?] ;; ;; o store entities in the DTD ;; ;; o better extensibility wrt character representation, one may want to ;; have ;; - UTF-8 in standard CL strings ;; - UCS-2 in RODs ;; - UTF-16 in RODs ;; - UCS-4 in vectoren ;; [habe ich eigentlich nicht vor--david] ;; ;; o xstreams auslagern, documententieren und dann auch in SGML und ;; CSS parser verwenden. (halt alles was zeichen liest). ;; [ausgelagert sind sie; dokumentiert "so la la"; die Reintegration ;; in Closure ist ein ganz anderes Thema] ;; ;; o merge node representation with SGML module ;; [???] ;; ;; o line/column number recording ;; ;; o better error messages ;; ;; o recording of source locations for nodes. ;; ;; o make the *scratch-pad* hack safe ;; ;; o based on the DTD and xml:space attribute implement HTML white ;; space rules. ;; ;; o on a parser option, do not expand external entities. ;; ;; o does the user need the distinction between "" and " " ? ;; That is literal and 'quoted' white space. ;; [verstehe ich nicht --david] ;; ;; o on an option merge CDATA section; ;; ;; o data in parse tree? extra nodes like in SGML?! ;; ;; o what to store in the node-gi field? Some name object or the ;; string used? ;; ;; Test that fail: ;; ;; not-wf/sa/128 is false a alarm ;; ;;;; Validity constraints: ;;;; (00) Root Element Type like (03), c.f. MAKE-ROOT-MODEL ;;;; (01) Proper Declaration/PE Nesting P/MARKUP-DECL ;;;; (02) Standalone Document Declaration all over the place [*] ;;;; (03) Element Valid VALIDATE-*-ELEMENT, -CHARACTERS ;;;; (04) Attribute Value Type VALIDATE-ATTRIBUTE ;;;; (05) Unique Element Type Declaration DEFINE-ELEMENT ;;;; (06) Proper Group/PE Nesting P/CSPEC ;;;; (07) No Duplicate Types LEGAL-CONTENT-MODEL-P ;;;; (08) ID VALIDATE-ATTRIBUTE ;;;; (09) One ID per Element Type DEFINE-ATTRIBUTE ;;;; (10) ID Attribute Default DEFINE-ATTRIBUTE ;;;; (11) IDREF VALIDATE-ATTRIBUTE, P/DOCUMENT ;;;; (12) Entity Name VALIDATE-ATTRIBUTE ;;;; (13) Name Token VALIDATE-ATTRIBUTE ;;;; (14) Notation Attributes VALIDATE-ATTRIBUTE, P/ATT-TYPE ;;;; (15) One Notation Per Element Type DEFINE-ATTRIBUTE ;;;; (16) No Notation on Empty Element DEFINE-ELEMENT, -ATTRIBUTE ;;;; (17) Enumeration VALIDATE-ATTRIBUTE ;;;; (18) Required Attribute PROCESS-ATTRIBUTES ;;;; (19) Attribute Default Legal DEFINE-ATTRIBUTE ;;;; (20) Fixed Attribute Default VALIDATE-ATTRIBUTE ;;;; (21) Proper Conditional Section/PE Nesting P/CONDITIONAL-SECT, ... ;;;; (22) Entity Declared [**] ;;;; (23) Notation Declared P/ENTITY-DEF, P/DOCUMENT ;;;; (24) Unique Notation Name DEFINE-NOTATION ;;;; ;;;; [*] Perhaps we could revert the explicit checks of (02), if we did ;;;; _not_ read external subsets of standalone documents when parsing in ;;;; validating mode. Violations of VC (02) constraints would then appear as ;;;; wellformedness violations, right? ;;;; ;;;; [**] Although I haven't investigated this properly yet, I believe that ;;;; we check this VC together with the WFC even in non-validating mode. (in-package :cxml) #+allegro (setf (excl:named-readtable :runes) *readtable*) (eval-when (:compile-toplevel :load-toplevel :execute) (defparameter *fast* '(optimize (speed 3) (safety 0))) ;;(defparameter *fast* '(optimize (speed 2) (safety 3))) ) ;;; parser context (defvar *ctx*) ;; forward declaration for DEFVAR (declaim (special *default-namespace-bindings*)) (defstruct (context (:conc-name nil)) handler (namespace-bindings *default-namespace-bindings*) (dtd nil) model-stack (referenced-notations '()) (id-table (%make-rod-hash-table)) (standalone-p nil) (entity-resolver nil) (disallow-internal-subset nil)) (defvar *expand-pe-p*) ;;;; --------------------------------------------------------------------------- ;;;; xstreams ;;;; (defstruct (stream-name (:type list)) entity-name entity-kind uri) (deftype read-element () 'rune) (defun call-with-open-xstream (fn stream) (unwind-protect (funcall fn stream) (close-xstream stream))) (defmacro with-open-xstream ((var value) &body body) `(call-with-open-xstream (lambda (,var) ,@body) ,value)) (defun call-with-open-xfile (continuation &rest open-args) (let ((input (apply #'open (car open-args) :element-type '(unsigned-byte 8) (cdr open-args)))) (unwind-protect (progn (funcall continuation (make-xstream input))) (close input)))) (defmacro with-open-xfile ((stream &rest open-args) &body body) `(call-with-open-xfile (lambda (,stream) .,body) .,open-args)) ;;; Decoders ;; The decoders share a common signature: ;; ;; DECODE input input-start input-end ;; output output-start output-end ;; eof-p ;; -> first-not-written ; first-not-read ;; ;; These decode functions should decode as much characters off `input' ;; into the `output' as possible and return the indexes to the first ;; not read and first not written element of `input' and `output' ;; respectively. If there are not enough bytes in `input' to decode a ;; full character, decoding shold be abandomed; the caller has to ;; ensure that the remaining bytes of `input' are passed to the ;; decoder again with more bytes appended. ;; ;; `eof-p' now in turn indicates, if the given input sequence, is all ;; the producer does have and might be used to produce error messages ;; in case of incomplete codes or decided what to do. ;; ;; Decoders are expected to handle the various CR/NL conventions and ;; canonicalize each end of line into a single NL rune (#xA) in good ;; old Lisp tradition. ;; ;; TODO: change this to an encoding class, which then might carry ;; additional state. Stateless encodings could been represented by ;; keywords. e.g. ;; ;; defmethod DECODE-SEQUENCE ((encoding (eql :utf-8)) ...) ;; ;;;; ------------------------------------------------------------------- ;;;; Rechnen mit Runen ;;;; ;; Let us first define fast fixnum arithmetric get rid of type ;; checks. (After all we know what we do here). (defmacro fx-op (op &rest xs) `(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))) (defmacro fx-pred (op &rest xs) `(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))) (defmacro %+ (&rest xs) `(fx-op + ,@xs)) (defmacro %- (&rest xs) `(fx-op - ,@xs)) (defmacro %* (&rest xs) `(fx-op * ,@xs)) (defmacro %/ (&rest xs) `(fx-op floor ,@xs)) (defmacro %and (&rest xs) `(fx-op logand ,@xs)) (defmacro %ior (&rest xs) `(fx-op logior ,@xs)) (defmacro %xor (&rest xs) `(fx-op logxor ,@xs)) (defmacro %ash (&rest xs) `(fx-op ash ,@xs)) (defmacro %mod (&rest xs) `(fx-op mod ,@xs)) (defmacro %= (&rest xs) `(fx-pred = ,@xs)) (defmacro %<= (&rest xs) `(fx-pred <= ,@xs)) (defmacro %>= (&rest xs) `(fx-pred >= ,@xs)) (defmacro %< (&rest xs) `(fx-pred < ,@xs)) (defmacro %> (&rest xs) `(fx-pred > ,@xs)) ;;; XXX Geschwindigkeit dieser Definitionen untersuchen! (defmacro rune-op (op &rest xs) `(code-rune (,op ,@(mapcar (lambda (x) `(rune-code ,x)) xs)))) (defmacro rune-pred (op &rest xs) `(,op ,@(mapcar (lambda (x) `(rune-code ,x)) xs))) (defmacro %rune+ (&rest xs) `(rune-op + ,@xs)) (defmacro %rune- (&rest xs) `(rune-op - ,@xs)) (defmacro %rune* (&rest xs) `(rune-op * ,@xs)) (defmacro %rune/ (&rest xs) `(rune-op floor ,@xs)) (defmacro %rune-and (&rest xs) `(rune-op logand ,@xs)) (defmacro %rune-ior (&rest xs) `(rune-op logior ,@xs)) (defmacro %rune-xor (&rest xs) `(rune-op logxor ,@xs)) (defmacro %rune-ash (a b) `(code-rune (ash (rune-code ,a) ,b))) (defmacro %rune-mod (&rest xs) `(rune-op mod ,@xs)) (defmacro %rune= (&rest xs) `(rune-pred = ,@xs)) (defmacro %rune<= (&rest xs) `(rune-pred <= ,@xs)) (defmacro %rune>= (&rest xs) `(rune-pred >= ,@xs)) (defmacro %rune< (&rest xs) `(rune-pred < ,@xs)) (defmacro %rune> (&rest xs) `(rune-pred > ,@xs)) ;;;; --------------------------------------------------------------------------- ;;;; rod hashtable ;;;; ;;; make-rod-hashtable ;;; rod-hash-get hashtable rod &optional start end -> value ; successp ;;; (setf (rod-hash-get hashtable rod &optional start end) new-value ;;; (defstruct (rod-hashtable (:constructor make-rod-hashtable/low)) size ;size of table table ; ) (defun make-rod-hashtable (&key (size 200)) (setf size (nearest-greater-prime size)) (make-rod-hashtable/low :size size :table (make-array size :initial-element nil))) (eval-when (:compile-toplevel :load-toplevel :execute) (defconstant +fixnum-bits+ (1- (integer-length most-positive-fixnum)) "Pessimistic approximation of the number of bits of fixnums.") (defconstant +fixnum-mask+ (1- (expt 2 +fixnum-bits+)) "Pessimistic approximation of the largest bit-mask, still being a fixnum.")) (definline stir (a b) (%and +fixnum-mask+ (%xor (%ior (%ash (%and a #.(ash +fixnum-mask+ -5)) 5) (%ash a #.(- 5 +fixnum-bits+))) b))) (definline rod-hash (rod start end) "Compute a hash code out of a rod." (let ((res (%- end start))) (do ((i start (%+ i 1))) ((%= i end)) (declare (type fixnum i)) (setf res (stir res (rune-code (%rune rod i))))) res)) (definline rod=* (x y &key (start1 0) (end1 (length x)) (start2 0) (end2 (length y))) (and (%= (%- end1 start1) (%- end2 start2)) (do ((i start1 (%+ i 1)) (j start2 (%+ j 1))) ((%= i end1) t) (unless (rune= (%rune x i) (%rune y j)) (return nil))))) (definline rod=** (x y start1 end1 start2 end2) (and (%= (%- end1 start1) (%- end2 start2)) (do ((i start1 (%+ i 1)) (j start2 (%+ j 1))) ((%= i end1) t) (unless (rune= (%rune x i) (%rune y j)) (return nil))))) (defun rod-hash-get (hashtable rod &optional (start 0) (end (length rod))) (declare (type (simple-array rune (*)) rod)) (let ((j (%mod (rod-hash rod start end) (rod-hashtable-size hashtable)))) (dolist (q (svref (rod-hashtable-table hashtable) j) (values nil nil nil)) (declare (type cons q)) (when (rod=** (car q) rod 0 (length (the (simple-array rune (*)) (car q))) start end) (return (values (cdr q) t (car q))))))) (defun rod-hash-set (new-value hashtable rod &optional (start 0) (end (length rod))) (let ((j (%mod (rod-hash rod start end) (rod-hashtable-size hashtable))) (key nil)) (dolist (q (svref (rod-hashtable-table hashtable) j) (progn (setf key (rod-subseq* rod start end)) (push (cons key new-value) (aref (rod-hashtable-table hashtable) j)))) (when (rod=* (car q) rod :start2 start :end2 end) (setf key (car q)) (setf (cdr q) new-value) (return))) (values new-value key))) #-rune-is-character (defun rod-subseq* (source start &optional (end (length source))) (unless (and (typep start 'fixnum) (>= start 0)) (error "~S is not a non-negative fixnum." start)) (unless (and (typep end 'fixnum) (>= end start)) (error "END argument, ~S, is not a fixnum no less than START, ~S." end start)) (when (> start (length source)) (error "START argument, ~S, should be no greater than length of rod." start)) (when (> end (length source)) (error "END argument, ~S, should be no greater than length of rod." end)) (locally (declare (type fixnum start end)) (let ((res (make-rod (- end start)))) (declare (type rod res)) (do ((i (- (- end start) 1) (the fixnum (- i 1)))) ((< i 0) res) (declare (type fixnum i)) (setf (%rune res i) (aref source (the fixnum (+ i start)))))))) #+rune-is-character (defun rod-subseq* (source start &optional (end (length source))) (subseq source start end)) (deftype ufixnum () `(unsigned-byte ,(integer-length most-positive-fixnum))) #-rune-is-character (defun rod-subseq** (source start &optional (end (length source))) (declare (type (simple-array rune (*)) source) (type ufixnum start) (type ufixnum end) (optimize (speed 3) (safety 0))) (let ((res (make-array (%- end start) :element-type 'rune))) (declare (type (simple-array rune (*)) res)) (let ((i (%- end start))) (declare (type ufixnum i)) (loop (setf i (- i 1)) (when (= i 0) (return)) (setf (%rune res i) (%rune source (the ufixnum (+ i start)))))) res)) #+rune-is-character (defun rod-subseq** (source start &optional (end (length source))) (subseq source start end)) (defun (setf rod-hash-get) (new-value hashtable rod &optional (start 0) (end (length rod))) (rod-hash-set new-value hashtable rod start end)) (defparameter *name-hashtable* (make-rod-hashtable :size 2000)) (defun intern-name (rod &optional (start 0) (end (length rod))) (multiple-value-bind (value successp key) (rod-hash-get *name-hashtable* rod start end) (declare (ignore value)) (if successp key (nth-value 1 (rod-hash-set t *name-hashtable* rod start end))))) ;;;; --------------------------------------------------------------------------- ;;;; ;;;; rod collector ;;;; (defvar *scratch-pad*) (defvar *scratch-pad-2*) (defvar *scratch-pad-3*) (defvar *scratch-pad-4*) (declaim (type (simple-array rune (*)) *scratch-pad* *scratch-pad-2* *scratch-pad-3* *scratch-pad-4*)) (defmacro with-scratch-pads ((&optional) &body body) `(let ((*scratch-pad* (make-array 1024 :element-type 'rune)) (*scratch-pad-2* (make-array 1024 :element-type 'rune)) (*scratch-pad-3* (make-array 1024 :element-type 'rune)) (*scratch-pad-4* (make-array 1024 :element-type 'rune))) ,@body)) (defmacro %put-unicode-char (code-var put) `(progn (cond ((%> ,code-var #xFFFF) (,put (the rune (code-rune (%+ #xD7C0 (%ash ,code-var -10))))) (,put (the rune (code-rune (%ior #xDC00 (%and ,code-var #x03FF)))))) (t (,put (code-rune ,code-var)))))) (defun adjust-array-by-copying (old-array new-size) "Adjust an array by copying and thus ensures, that result is a SIMPLE-ARRAY." (let ((res (make-array new-size :element-type (array-element-type old-array)))) (replace res old-array :start1 0 :end1 (length old-array) :start2 0 :end2 (length old-array)) res)) (defmacro with-rune-collector-aux (scratch collect body mode) (let ((rod (gensym)) (n (gensym)) (i (gensym)) (b (gensym))) `(let ((,n (length ,scratch)) (,i 0) (,b ,scratch)) (declare (type fixnum ,n ,i)) (macrolet ((,collect (x) `((lambda (x) (locally (declare #.*fast*) (when (%>= ,',i ,',n) (setf ,',n (* 2 ,',n)) (setf ,',b (setf ,',scratch (adjust-array-by-copying ,',scratch ,',n)))) (setf (aref (the (simple-array rune (*)) ,',b) ,',i) x) (incf ,',i))) ,x))) ,@body ,(ecase mode (:intern `(intern-name ,b 0 ,i)) (:copy `(let ((,rod (make-rod ,i))) (while (not (%= ,i 0)) (setf ,i (%- ,i 1)) (setf (%rune ,rod ,i) (aref (the (simple-array rune (*)) ,b) ,i))) ,rod)) (:raw `(values ,b 0 ,i)) ))))) '(defmacro with-rune-collector-aux (scratch collect body mode) (let ((rod (gensym)) (n (gensym)) (i (gensym)) (b (gensym))) `(let ((,n (length ,scratch)) (,i 0)) (declare (type fixnum ,n ,i)) (macrolet ((,collect (x) `((lambda (x) (locally (declare #.*fast*) (when (%>= ,',i ,',n) (setf ,',n (* 2 ,',n)) (setf ,',scratch (setf ,',scratch (adjust-array-by-copying ,',scratch ,',n)))) (setf (aref (the (simple-array rune (*)) ,',scratch) ,',i) x) (incf ,',i))) ,x))) ,@body ,(ecase mode (:intern `(intern-name ,scratch 0 ,i)) (:copy `(let ((,rod (make-rod ,i))) (while (%> ,i 0) (setf ,i (%- ,i 1)) (setf (%rune ,rod ,i) (aref (the (simple-array rune (*)) ,scratch) ,i))) ,rod)) (:raw `(values ,scratch 0 ,i)) ))))) (defmacro with-rune-collector ((collect) &body body) `(with-rune-collector-aux *scratch-pad* ,collect ,body :copy)) (defmacro with-rune-collector-2 ((collect) &body body) `(with-rune-collector-aux *scratch-pad-2* ,collect ,body :copy)) (defmacro with-rune-collector-3 ((collect) &body body) `(with-rune-collector-aux *scratch-pad-3* ,collect ,body :copy)) (defmacro with-rune-collector-4 ((collect) &body body) `(with-rune-collector-aux *scratch-pad-4* ,collect ,body :copy)) (defmacro with-rune-collector/intern ((collect) &body body) `(with-rune-collector-aux *scratch-pad* ,collect ,body :intern)) (defmacro with-rune-collector/raw ((collect) &body body) `(with-rune-collector-aux *scratch-pad* ,collect ,body :raw)) #| (defmacro while-reading-runes ((reader stream-in) &rest body) ;; Thou shalt not leave body via a non local exit (let ((stream (make-symbol "STREAM")) (rptr (make-symbol "RPTR")) (fptr (make-symbol "FPTR")) (buf (make-symbol "BUF")) ) `(let* ((,stream ,stream-in) (,rptr (xstream-read-ptr ,stream)) (,fptr (xstream-fill-ptr ,stream)) (,buf (xstream-buffer ,stream))) (declare (type fixnum ,rptr ,fptr) (type xstream ,stream)) (macrolet ((,reader (res-var) `(cond ((%= ,',rptr ,',fptr) (setf (xstream-read-ptr ,',stream) ,',rptr) (setf ,res-var (xstream-underflow ,',stream)) (setf ,',rptr (xstream-read-ptr ,',stream)) (setf ,',fptr (xstream-fill-ptr ,',stream)) (setf ,',buf (xstream-buffer ,',stream))) (t (setf ,res-var (aref (the (simple-array read-element (*)) ,',buf) (the fixnum ,',rptr))) (setf ,',rptr (%+ ,',rptr 1)))))) (prog1 (let () .,body) (setf (xstream-read-ptr ,stream) ,rptr) ))))) |# ;;;; --------------------------------------------------------------------------- ;;;; DTD ;;;; (define-condition parser-error (simple-error) ()) (define-condition validity-error (parser-error) ()) (defun validity-error (x &rest args) (error 'validity-error :format-control "Validity constraint violated: ~@?" :format-arguments (list x args))) (defvar *validate* t) (defvar *markup-declaration-external-p* nil) (defun validate-start-element (ctx name) (when *validate* (let* ((pair (car (model-stack ctx))) (newval (funcall (car pair) name))) (unless newval (validity-error "(03) Element Valid: ~A" (rod-string name))) (setf (car pair) newval) (let ((e (find-element name (dtd ctx)))) (unless e (validity-error "(03) Element Valid: no definition for ~A" (rod-string name))) (maybe-compile-cspec e) (push (copy-cons (elmdef-compiled-cspec e)) (model-stack ctx)))))) (defun copy-cons (x) (cons (car x) (cdr x))) (defun validate-end-element (ctx name) (when *validate* (let ((pair (car (model-stack ctx)))) (unless (eq (funcall (car pair) nil) t) (validity-error "(03) Element Valid: ~A" (rod-string name))) (pop (model-stack ctx))))) (defun validate-characters (ctx rod) (when *validate* (let ((pair (car (model-stack ctx)))) (unless (funcall (cdr pair) rod) (validity-error "(03) Element Valid: unexpected PCDATA"))))) (defun standalone-check-necessary-p (def) (and *validate* (standalone-p *ctx*) (etypecase def (elmdef (elmdef-external-p def)) (attdef (attdef-external-p def))))) (defun process-attributes (ctx name attlist) (let ((e (find-element name (dtd ctx)))) (cond (e (dolist (ad (elmdef-attributes e)) ;handle default values (unless (get-attribute (attdef-name ad) attlist) (case (attdef-default ad) (:IMPLIED) (:REQUIRED (when *validate* (validity-error "(18) Required Attribute: ~S not specified" (rod-string (attdef-name ad))))) (t (when (standalone-check-necessary-p ad) (validity-error "(02) Standalone Document Declaration: missing attribute value")) (push (build-attribute (attdef-name ad) (cadr (attdef-default ad)) nil) attlist))))) (dolist (a attlist) ;normalize non-CDATA values (let* ((qname (sax:attribute-qname a)) (adef (find-attribute e qname))) (when (and adef (not (eq (attdef-type adef) :CDATA))) (let ((canon (canon-not-cdata-attval (sax:attribute-value a)))) (when (and (standalone-check-necessary-p adef) (not (rod= (sax:attribute-value a) canon))) (validity-error "(02) Standalone Document Declaration: attribute value not normalized")) (setf (sax:attribute-value a) canon))))) (when *validate* ;maybe validate attribute values (dolist (a attlist) (validate-attribute ctx e a)))) ((and *validate* attlist) (validity-error "(04) Attribute Value Type: no definition for element ~A" (rod-string name))))) attlist) (defun get-attribute (name attributes) (member name attributes :key #'sax:attribute-qname :test #'rod=)) (defun validate-attribute (ctx e a) (when (sax:attribute-specified-p a) ;defaults checked by DEFINE-ATTRIBUTE (let* ((qname (sax:attribute-qname a)) (adef (or (find-attribute e qname) (validity-error "(04) Attribute Value Type: not declared: ~A" (rod-string qname))))) (validate-attribute* ctx adef (sax:attribute-value a))))) (defun validate-attribute* (ctx adef value) (let ((type (attdef-type adef)) (default (attdef-default adef))) (when (and (listp default) (eq (car default) :FIXED) (not (rod= value (cadr default)))) (validity-error "(20) Fixed Attribute Default: expected ~S but got ~S" (rod-string (cadr default)) (rod-string value))) (ecase (if (listp type) (car type) type) (:ID (unless (valid-name-p value) (validity-error "(08) ID: not a name: ~S" (rod-string value))) (when (eq (gethash value (id-table ctx)) t) (validity-error "(08) ID: ~S not unique" (rod-string value))) (setf (gethash value (id-table ctx)) t)) (:IDREF (validate-idref ctx value)) (:IDREFS (let ((names (split-names value))) (unless names (validity-error "(11) IDREF: malformed names")) (mapc (curry #'validate-idref ctx) names))) (:NMTOKEN (validate-nmtoken value)) (:NMTOKENS (let ((tokens (split-names value))) (unless tokens (validity-error "(13) Name Token: malformed NMTOKENS")) (mapc #'validate-nmtoken tokens))) (:ENUMERATION (unless (member value (cdr type) :test #'rod=) (validity-error "(17) Enumeration: value not declared: ~S" (rod-string value)))) (:NOTATION (unless (member value (cdr type) :test #'rod=) (validity-error "(14) Notation Attributes: ~S" (rod-string value)))) (:ENTITY (validate-entity value)) (:ENTITIES (let ((names (split-names value))) (unless names (validity-error "(13) Name Token: malformed NMTOKENS")) (mapc #'validate-entity names))) (:CDATA)))) (defun validate-idref (ctx value) (unless (valid-name-p value) (validity-error "(11) IDREF: not a name: ~S" (rod-string value))) (unless (gethash value (id-table ctx)) (setf (gethash value (id-table ctx)) nil))) (defun validate-nmtoken (value) (unless (valid-nmtoken-p value) (validity-error "(13) Name Token: not a NMTOKEN: ~S" (rod-string value)))) (defstruct (entdef (:constructor))) (defstruct (internal-entdef (:include entdef) (:constructor make-internal-entdef (value)) (:conc-name #:entdef-)) (value (error "missing argument") :type rod) (expansion nil)) (defstruct (external-entdef (:include entdef) (:constructor make-external-entdef (extid ndata)) (:conc-name #:entdef-)) (extid (error "missing argument") :type extid) (ndata nil :type (or rod null))) (defun validate-entity (value) (unless (valid-name-p value) (validity-error "(12) Entity Name: not a name: ~S" (rod-string value))) (let ((def (let ((*validate* ;; Similarly the entity refs are internal and ;; don't need normalization ... the unparsed ;; entities (and entities) aren't "references" ;; -- sun/valid/sa03.xml nil)) (get-entity-definition value :general (dtd *ctx*))))) (unless (and (typep def 'external-entdef) (entdef-ndata def)) ;; unparsed entity (validity-error "(12) Entity Name: ~S" (rod-string value))))) (defun split-names (rod) (flet ((whitespacep (x) (or (rune= x #/U+0009) (rune= x #/U+000A) (rune= x #/U+000D) (rune= x #/U+0020)))) (if (let ((n (length rod))) (and (not (zerop n)) (or (whitespacep (rune rod 0)) (whitespacep (rune rod (1- n)))))) nil (split-sequence-if #'whitespacep rod :remove-empty-subseqs t)))) (defun zstream-base-sysid (zstream) (let ((base-sysid (dolist (k (zstream-input-stack zstream)) (let ((base-sysid (stream-name-uri (xstream-name k)))) (when base-sysid (return base-sysid)))))) base-sysid)) (defun absolute-uri (sysid source-stream) (let ((base-sysid (zstream-base-sysid source-stream))) ;; XXX is the IF correct? (if base-sysid (puri:merge-uris sysid base-sysid) sysid))) (defstruct (extid (:constructor make-extid (public system))) (public nil :type (or rod null)) (system (error "missing argument") :type (or puri:uri null))) (defun absolute-extid (source-stream extid) (let ((sysid (extid-system extid)) (result (copy-extid extid))) (setf (extid-system result) (absolute-uri sysid source-stream)) result)) (defun define-entity (source-stream name kind def) (setf name (intern-name name)) (let ((table (ecase kind (:general (dtd-gentities (dtd *ctx*))) (:parameter (dtd-pentities (dtd *ctx*)))))) (unless (gethash name table) (when (and source-stream (handler *ctx*)) (report-entity (handler *ctx*) kind name def)) (when (typep def 'external-entdef) (setf (entdef-extid def) (absolute-extid source-stream (entdef-extid def)))) (setf (gethash name table) (cons *markup-declaration-external-p* def))))) (defun get-entity-definition (entity-name kind dtd) (destructuring-bind (extp &rest def) (gethash entity-name (ecase kind (:general (dtd-gentities dtd)) (:parameter (dtd-pentities dtd))) '(nil)) (when (and *validate* (standalone-p *ctx*) extp) (validity-error "(02) Standalone Document Declaration: entity reference: ~S" (rod-string entity-name))) def)) (defun entity->xstream (entity-name kind &optional zstream) ;; `zstream' is for error messages (let ((def (get-entity-definition entity-name kind (dtd *ctx*)))) (unless def (if zstream (perror zstream "Entity '~A' is not defined." (rod-string entity-name)) (error "Entity '~A' is not defined." (rod-string entity-name)))) (let (r) (etypecase def (internal-entdef (setf r (make-rod-xstream (entdef-value def))) (setf (xstream-name r) (make-stream-name :entity-name entity-name :entity-kind kind :uri nil))) (external-entdef (setf r (xstream-open-extid (extid-using-catalog (entdef-extid def)))) (setf (stream-name-entity-name (xstream-name r)) entity-name (stream-name-entity-kind (xstream-name r)) kind))) r))) (defun checked-get-entdef (name type) (let ((def (get-entity-definition name type (dtd *ctx*)))) (unless def (error "Entity '~A' is not defined." (rod-string name))) def)) (defun xstream-open-extid (extid) (let* ((sysid (extid-system extid)) (stream (or (funcall (or (entity-resolver *ctx*) (constantly nil)) (extid-public extid) (extid-system extid)) (open (uri-to-pathname sysid) :element-type '(unsigned-byte 8) :direction :input)))) (make-xstream stream :name (make-stream-name :uri sysid) :initial-speed 1))) (defun call-with-entity-expansion-as-stream (zstream cont name kind) ;; `zstream' is for error messages -- we need something better! (let ((in (entity->xstream name kind zstream))) (unwind-protect (funcall cont in) (close-xstream in)))) (defun ensure-dtd () (unless (dtd *ctx*) (setf (dtd *ctx*) (make-dtd)) (define-default-entities))) (defun define-default-entities () (define-entity nil #"lt" :general (make-internal-entdef #"<")) (define-entity nil #"gt" :general (make-internal-entdef #">")) (define-entity nil #"amp" :general (make-internal-entdef #"&")) (define-entity nil #"apos" :general (make-internal-entdef #"'")) (define-entity nil #"quot" :general (make-internal-entdef #"\""))) (defstruct attdef ;; an attribute definition element ;name of element this attribute belongs to name ;name of attribute type ;type of attribute; either one of :CDATA, :ID, :IDREF, :IDREFS, ; :ENTITY, :ENTITIES, :NMTOKEN, :NMTOKENS, or ; (:NOTATION *) ; (:ENUMERATION *) default ;default value of attribute: ; :REQUIRED, :IMPLIED, (:FIXED content) or (:DEFAULT content) (external-p *markup-declaration-external-p*) ) (defstruct elmdef ;; an element definition name ;name of the element content ;content model [*] attributes ;list of defined attributes compiled-cspec ;cons of validation function for contentspec (external-p *markup-declaration-external-p*) ) ;; [*] in XML it is possible to define attributes before the element ;; itself is defined and since we hang attribute definitions into the ;; relevant element definitions, the `content' slot indicates whether an ;; element was actually defined. It is NIL until set to a content model ;; when the element type declaration is processed. (defun %make-rod-hash-table () ;; XXX with portable hash tables, this is the only way to case-sensitively ;; use rods. However, EQUALP often has horrible performance! Most Lisps ;; provide extensions for user-defined equality, we should use them! There ;; is also a home-made hash table for rods defined below, written by ;; Gilbert (I think). We could also use that one, but I would prefer the ;; first method, even if it's unportable. (make-hash-table :test #+rune-is-character 'equal #-rune-is-character 'equalp)) (defstruct dtd (elements (%make-rod-hash-table)) ;elmdefs (gentities (%make-rod-hash-table)) ;general entities (pentities (%make-rod-hash-table)) ;parameter entities (notations (%make-rod-hash-table))) (defun make-dtd-cache () (puri:make-uri-space)) (defvar *cache-all-dtds* nil) (defvar *dtd-cache* (make-dtd-cache)) (defun remdtd (uri dtd-cache) (setf uri (puri:intern-uri uri dtd-cache)) (prog1 (and (getf (puri:uri-plist uri) 'dtd) t) (puri:unintern-uri uri dtd-cache))) (defun clear-dtd-cache (dtd-cache) (puri:unintern-uri t dtd-cache)) (defun getdtd (uri dtd-cache) (getf (puri:uri-plist (puri:intern-uri uri dtd-cache)) 'dtd)) (defun (setf getdtd) (newval uri dtd-cache) (setf (getf (puri:uri-plist (puri:intern-uri uri dtd-cache)) 'dtd) newval) newval) ;;;; (defun find-element (name dtd) (gethash name (dtd-elements dtd))) (defun define-element (dtd element-name &optional content-model) (let ((e (find-element element-name dtd))) (cond ((null e) (setf (gethash element-name (dtd-elements dtd)) (make-elmdef :name element-name :content content-model))) ((null content-model) e) (t (when *validate* (when (elmdef-content e) (validity-error "(05) Unique Element Type Declaration")) (when (eq content-model :EMPTY) (dolist (ad (elmdef-attributes e)) (let ((type (attdef-type ad))) (when (and (listp type) (eq (car type) :NOTATION)) (validity-error "(16) No Notation on Empty Element: ~S" (rod-string element-name))))))) (sax:element-declaration (handler *ctx*) element-name content-model) (setf (elmdef-content e) content-model) (setf (elmdef-external-p e) *markup-declaration-external-p*) e)))) (defvar *redefinition-warning* t) (defun define-attribute (dtd element name type default) (let ((adef (make-attdef :element element :name name :type type :default default)) (e (or (find-element element dtd) (define-element dtd element)))) (when (and *validate* (listp default)) (unless (eq (attdef-type adef) :CDATA) (setf (second default) (canon-not-cdata-attval (second default)))) (validate-attribute* *ctx* adef (second default))) (cond ((find-attribute e name) (when *redefinition-warning* (warn "Attribute \"~A\" of \"~A\" not redefined." (rod-string name) (rod-string element)))) (t (when *validate* (when (eq type :ID) (when (find :ID (elmdef-attributes e) :key #'attdef-type) (validity-error "(09) One ID per Element Type: element ~A" (rod-string element))) (unless (member default '(:REQUIRED :IMPLIED)) (validity-error "(10) ID Attribute Default: ~A" (rod-string element)))) (flet ((notationp (type) (and (listp type) (eq (car type) :NOTATION)))) (when (notationp type) (when (find-if #'notationp (elmdef-attributes e) :key #'attdef-type) (validity-error "(15) One Notation Per Element Type: ~S" (rod-string element))) (when (eq (elmdef-content e) :EMPTY) (validity-error "(16) No Notation on Empty Element: ~S" (rod-string element)))))) (sax:attribute-declaration (handler *ctx*) element name type default) (push adef (elmdef-attributes e)))))) (defun find-attribute (elmdef name) (find name (elmdef-attributes elmdef) :key #'attdef-name :test #'rod=)) (defun define-notation (dtd name id) (let ((ns (dtd-notations dtd))) (when (gethash name ns) (validity-error "(24) Unique Notation Name: ~S" (rod-string name))) (setf (gethash name ns) id))) (defun find-notation (name dtd) (gethash name (dtd-notations dtd))) ;;;; --------------------------------------------------------------------------- ;;;; z streams and lexer ;;;; (defstruct zstream token-category token-semantic input-stack) (defun read-token (input) (cond ((zstream-token-category input) (multiple-value-prog1 (values (zstream-token-category input) (zstream-token-semantic input)) (setf (zstream-token-category input) nil (zstream-token-semantic input) nil))) (t (read-token-2 input)))) (defun peek-token (input) (cond ((zstream-token-category input) (values (zstream-token-category input) (zstream-token-semantic input))) (t (multiple-value-bind (c s) (read-token input) (setf (zstream-token-category input) c (zstream-token-semantic input) s)) (values (zstream-token-category input) (zstream-token-semantic input))))) (defun read-token-2 (input) (cond ((null (zstream-input-stack input)) (values :eof nil)) (t (let ((c (peek-rune (car (zstream-input-stack input))))) (cond ((eq c :eof) (cond ((eq (cadr (zstream-input-stack input)) :stop) (values :eof nil)) (t (close-xstream (pop (zstream-input-stack input))) (if (null (zstream-input-stack input)) (values :eof nil) (values :S nil) ;fake #x20 after PE expansion )))) (t (read-token-3 input))))))) (defvar *data-behaviour* ) ;either :DTD or :DOC (defun read-token-3 (zinput) (let ((input (car (zstream-input-stack zinput)))) ;; PI Comment (let ((c (read-rune input))) (cond ;; first the common tokens ((rune= #/< c) (read-token-after-|<| zinput input)) ;; now dispatch (t (ecase *data-behaviour* (:DTD (cond ((rune= #/\[ c) :\[) ((rune= #/\] c) :\]) ((rune= #/\( c) :\() ((rune= #/\) c) :\)) ((rune= #/\| c) :\|) ((rune= #/\> c) :\>) ((rune= #/\" c) :\") ((rune= #/\' c) :\') ((rune= #/\, c) :\,) ((rune= #/\? c) :\?) ((rune= #/\* c) :\*) ((rune= #/\+ c) :\+) ((name-rune-p c) (unread-rune c input) (values :name (read-name-token input))) ((rune= #/# c) (let ((q (read-name-token input))) (cond ((equalp q '#.(string-rod "REQUIRED")) :|#REQUIRED|) ((equalp q '#.(string-rod "IMPLIED")) :|#IMPLIED|) ((equalp q '#.(string-rod "FIXED")) :|#FIXED|) ((equalp q '#.(string-rod "PCDATA")) :|#PCDATA|) (t (error "Unknown token: ~S." q))))) ((or (rune= c #/U+0020) (rune= c #/U+0009) (rune= c #/U+000D) (rune= c #/U+000A)) (values :S nil)) ((rune= #/% c) (cond ((name-start-rune-p (peek-rune input)) ;; an entity reference (read-pe-reference zinput)) (t (values :%)))) (t (error "Unexpected character ~S." c)))) (:DOC (cond ((rune= c #/&) (multiple-value-bind (kind data) (read-entity-ref input) (cond ((eq kind :NAMED) (values :ENTITY-REF data) ) ((eq kind :NUMERIC) (values :CDATA (with-rune-collector (collect) (%put-unicode-char data collect))))))) (t (unread-rune c input) (values :CDATA (read-cdata input))) )))))))) (defun read-pe-reference (zinput) (let* ((input (car (zstream-input-stack zinput))) (nam (read-name-token input))) (assert (rune= #/\; (read-rune input))) (cond (*expand-pe-p* ;; no external entities here! (let ((i2 (entity->xstream nam :parameter))) (zstream-push i2 zinput)) (values :S nil) ;space before inserted PE expansion. ) (t (values :PE-REFERENCE nam)) ))) (defun read-token-after-|<| (zinput input) (let ((d (read-rune input))) (cond ((eq d :eof) (error "EOF after '<'")) ((rune= #/! d) (read-token-after-| in case of a named entity or :NUMERIC in case of numeric entities. The initial #\\& is considered to be consumed already." (let ((c (peek-rune input))) (cond ((eq c :eof) (error "EOF after '&'")) ((rune= c #/#) (values :NUMERIC (read-numeric-entity input))) (t (unless (name-start-rune-p (peek-rune input)) (error "Expecting name after &.")) (let ((name (read-name-token input))) (setf c (read-rune input)) (unless (rune= c #/\;) (perror input "Expected \";\".")) (values :NAMED name)))))) (definline read-S? (input) (while (member (peek-rune input) '(#/U+0020 #/U+0009 #/U+000A #/U+000D) :test #'eq) (consume-rune input))) (defun read-tag-2 (zinput input kind) (let ((name (read-name-token input)) (atts nil)) (setf atts (read-attribute-list zinput input nil)) ;; check for double attributes (do ((q atts (cdr q))) ((null q)) (cond ((find (caar q) (cdr q) :key #'car) (error "Attribute ~S has two definitions in element ~S." (rod-string (caar q)) (rod-string name))))) (cond ((eq (peek-rune input) #/>) (consume-rune input) (values kind (cons name atts))) ((eq (peek-rune input) #//) (consume-rune input) (assert (rune= #/> (read-rune input))) (values :ztag (cons name atts))) (t (error "syntax error in read-tag-2.")) ))) (defun read-attribute (zinput input) (unless (name-start-rune-p (peek-rune input)) (error "Expected name.")) ;; arg thanks to the post mortem nature of name space declarations, ;; we could only process the attribute values post mortem. (let ((name (read-name-token input))) (while (let ((c (peek-rune input))) (and (not (eq c :eof)) (or (rune= c #/U+0020) (rune= c #/U+0009) (rune= c #/U+000A) (rune= c #/U+000D)))) (consume-rune input)) (unless (eq (read-rune input) #/=) (perror zinput "Expected \"=\".")) (while (let ((c (peek-rune input))) (and (not (eq c :eof)) (or (rune= c #/U+0020) (rune= c #/U+0009) (rune= c #/U+000A) (rune= c #/U+000D)))) (consume-rune input)) (cons name (read-att-value-2 input)) ;;(cons name (read-att-value zinput input :ATT t)) )) (defun canon-not-cdata-attval (value) ;; | If the declared value is not CDATA, then the XML processor must ;; | further process the normalized attribute value by discarding any ;; | leading and trailing space (#x20) characters, and by replacing ;; | sequences of space (#x20) characters by a single space (#x20) ;; | character. (with-rune-collector (collect) (let ((gimme-20 nil) (anything-seen-p nil)) (map nil (lambda (c) (cond ((rune= c #/u+0020) (setf gimme-20 t)) (t (when (and anything-seen-p gimme-20) (collect #/u+0020)) (setf gimme-20 nil) (setf anything-seen-p t) (collect c)))) value)))) (definline data-rune-p (rune) ;; any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. (let ((c (rune-code rune))) (or (= c #x9) (= c #xA) (= c #xD) (<= #x20 c #xD7FF) (<= #xE000 c #xFFFD) ;; (<= #xD800 c #xDBFF) (<= #xDC00 c #xDFFF) ;; ))) (defun read-att-value (zinput input mode &optional canon-space-p (delim nil)) (with-rune-collector-2 (collect) (labels ((muffle (input delim) (let (c) (loop (setf c (read-rune input)) (cond ((eql delim c) (return)) ((eq c :eof) (error "EOF")) ((rune= c #/&) (setf c (peek-rune input)) (cond ((rune= c #/#) (let ((c (read-numeric-entity input))) (%put-unicode-char c collect))) (t (unless (name-start-rune-p (peek-rune input)) (error "Expecting name after &.")) (let ((name (read-name-token input))) (setf c (read-rune input)) (assert (rune= c #/\;)) (ecase mode (:ATT (recurse-on-entity zinput name :general (lambda (zinput) (muffle (car (zstream-input-stack zinput)) :eof)))) (:ENT ;; bypass, but never the less we ;; need to check for legal ;; syntax. ;; Must it be defined? ;; allerdings: unparsed sind verboten (collect #/&) (map nil (lambda (x) (collect x)) name) (collect #/\; ))))))) ((and (eq mode :ENT) (rune= c #/%)) (unless (name-start-rune-p (peek-rune input)) (error "Expecting name after %.")) (let ((name (read-name-token input))) (setf c (read-rune input)) (assert (rune= c #/\;)) (cond (*expand-pe-p* (recurse-on-entity zinput name :parameter (lambda (zinput) (muffle (car (zstream-input-stack zinput)) :eof)))) (t (error "No PE here."))))) ((and (eq mode :ATT) (rune= c #/<)) ;; xxx fix error message (cerror "Eat them in spite of this." "For no apparent reason #\/< is forbidden in attribute values. ~ You lost -- next time choose SEXPR syntax.") (collect c)) ((and canon-space-p (space-rune-p c)) (collect #/space)) ((not (data-rune-p c)) (error "illegal char: ~S." c)) (t (collect c))))))) (declare (dynamic-extent #'muffle)) (muffle input (or delim (let ((delim (read-rune input))) (assert (member delim '(#/\" #/\'))) delim)))))) (defun read-numeric-entity (input) ;; xxx eof handling ;; The #/& is already read (let ((res (let ((c (read-rune input))) (assert (rune= c #/#)) (setq c (read-rune input)) (cond ((rune= c #/x) ;; hexadecimal (setq c (read-rune input)) (assert (digit-rune-p c 16)) (prog1 (parse-integer (with-output-to-string (sink) (write-char (rune-char c) sink) (while (digit-rune-p (setq c (read-rune input)) 16) (write-char (rune-char c) sink))) :radix 16) (assert (rune= c #/\;))) ) ((rune<= #/0 c #/9) ;; decimal (prog1 (parse-integer (with-output-to-string (sink) (write-char (rune-char c) sink) (while (rune<= #/0 (setq c (read-rune input)) #/9) (write-char (rune-char c) sink))) :radix 10) (assert (rune= c #/\;))) ) (t (error "Bad char in numeric character entity.") ))))) (unless (code-data-char-p res) (error "expansion of numeric character reference (#x~X) is no data char." res)) res)) (defun read-pi (input) ;; ") (return)) (when (rune= d #/?) (collect #/?) (go state-2)) (collect #/?) (collect d) (go state-1))))) (defun read-comment-content (input &aux d) (let ((warnedp nil)) (with-rune-collector (collect) (block nil (tagbody state-1 (setf d (read-rune input)) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/-) (go state-2)) (collect d) (go state-1) state-2 ;; #/- seen (setf d (read-rune input)) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/-) (go state-3)) (collect #/-) (collect d) (go state-1) state-3 ;; #/- #/- seen (setf d (read-rune input)) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/>) (return)) (unless warnedp (warn "WFC: no '--' in comments please.") (setf warnedp t)) (when (rune= d #/-) (collect #/-) (go state-3)) (collect #/-) (collect #/-) (collect d) (go state-1)))))) (defun read-cdata-sect (input &aux d) ;; (with-rune-collector (collect) (block nil (tagbody state-1 (setf d (read-rune input)) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/\]) (go state-2)) (collect d) (go state-1) state-2 ;; #/] seen (setf d (read-rune input)) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/\]) (go state-3)) (collect #/\]) (collect d) (go state-1) state-3 ;; #/\] #/\] seen (setf d (read-rune input)) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/>) (return)) (when (rune= d #/\]) (collect #/\]) (go state-3)) (collect #/\]) (collect #/\]) (collect d) (go state-1))))) #+(or) ;; FIXME: There is another definition below that looks more reasonable. (defun read-cdata (input initial-char &aux d) (cond ((not (data-rune-p initial-char)) (error "Illegal char: ~S." initial-char))) (with-rune-collector (collect) (block nil (tagbody (cond ((rune= initial-char #/\]) (go state-2)) (t (collect initial-char))) state-1 (setf d (peek-rune input)) (when (or (eq d :eof) (rune= d #/<) (rune= d #/&)) (return)) (read-rune input) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/\]) (go state-2)) (collect d) (go state-1) state-2 ;; #/\] seen (setf d (peek-rune input)) (when (or (eq d :eof) (rune= d #/<) (rune= d #/&)) (collect #/\]) (return)) (read-rune input) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/\]) (go state-3)) (collect #/\]) (collect d) (go state-1) state-3 ;; #/\] #/\] seen (setf d (peek-rune input)) (when (or (eq d :eof) (rune= d #/<) (rune= d #/&)) (collect #/\]) (collect #/\]) (return)) (read-rune input) (unless (data-rune-p d) (error "Illegal char: ~S." d)) (when (rune= d #/>) (error "For no apparent reason ']]>' in not allowed in a CharData token -- you lost.")) (when (rune= d #/\]) (collect #/\]) (go state-3)) (collect #/\]) (collect #/\]) (collect d) (go state-1))))) ;; some character categories (defun space-rune-p (rune) (declare (type rune rune)) (or (rune= rune #/U+0020) (rune= rune #/U+0009) (rune= rune #/U+000A) (rune= rune #/U+000D))) (defun code-data-char-p (c) ;; any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. (or (= c #x9) (= c #xA) (= c #xD) (<= #x20 c #xD7FF) (<= #xE000 c #xFFFD) (<= #x10000 c #x10FFFF))) (defun pubid-char-p (c) (or (rune= c #/u+0020) (rune= c #/u+000D) (rune= c #/u+000A) (rune<= #/a c #/z) (rune<= #/A c #/Z) (rune<= #/0 c #/9) (member c '(#/- #/' #/\( #/\) #/+ #/, #/. #// #/: #/= #/? #/\; #/! #/* #/# #/@ #/$ #/_ #/%)))) (defun expect (input category) (multiple-value-bind (cat sem) (read-token input) (unless (eq cat category) (error "Expected ~S saw ~S [~S]" category cat sem)) (values cat sem))) (defun consume-token (input) (read-token input)) ;;;; --------------------------------------------------------------------------- ;;;; Parser ;;;; (defun p/S (input) ;; S ::= (#x20 | #x9 | #xD | #xA)+ (expect input :S) (while (eq (peek-token input) :S) (consume-token input))) (defun p/S? (input) ;; S ::= (#x20 | #x9 | #xD | #xA)+ (while (eq (peek-token input) :S) (consume-token input))) (defun p/name (input) (nth-value 1 (expect input :name))) (defun p/attlist-decl (input) ;; [52] AttlistDecl ::= '' (let (elm-name) (expect input :|) (consume-token input) (return)) (t (multiple-value-bind (name type default) (p/attdef input) (define-attribute (dtd *ctx*) elm-name name type default)) ))) (:> (return)) (otherwise (error "Expected either another AttDef or end of \" (S? )* S? ;; (declare (type function item-parser)) (let (res) (p/S? input) (setf res (list (funcall item-parser input))) (loop (p/S? input) (cond ((eq (peek-token input) delimiter) (consume-token input) (p/S? input) (push (funcall item-parser input) res)) (t (return)))) (p/S? input) (reverse res))) (defun p/att-type (input) ;; [54] AttType ::= StringType | TokenizedType | EnumeratedType ;; [55] StringType ::= 'CDATA' ;; [56] TokenizedType ::= 'ID' /*VC: ID */ ;; /*VC: One ID per Element Type */ ;; /*VC: ID Attribute Default */ ;; | 'IDREF' /*VC: IDREF */ ;; | 'IDREFS' /*VC: IDREF */ ;; | 'ENTITY' /*VC: Entity Name */ ;; | 'ENTITIES' /*VC: Entity Name */ ;; | 'NMTOKEN' /*VC: Name Token */ ;; | 'NMTOKENS' /*VC: Name Token */ ;; [57] EnumeratedType ::= NotationType | Enumeration ;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' ;; /* VC: Notation Attributes */ ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' /* VC: Enumeration */ (multiple-value-bind (cat sem) (read-token input) (cond ((eq cat :name) (cond ((equalp sem '#.(string-rod "CDATA")) :CDATA) ((equalp sem '#.(string-rod "ID")) :ID) ((equalp sem '#.(string-rod "IDREF")) :IDREFS) ((equalp sem '#.(string-rod "IDREFS")) :IDREFS) ((equalp sem '#.(string-rod "ENTITY")) :ENTITY) ((equalp sem '#.(string-rod "ENTITIES")) :ENTITIES) ((equalp sem '#.(string-rod "NMTOKEN")) :NMTOKEN) ((equalp sem '#.(string-rod "NMTOKENS")) :NMTOKENS) ((equalp sem '#.(string-rod "NOTATION")) (let (names) (p/S input) (expect input :\() (setf names (p/list input #'p/name :\| )) (expect input :\)) (when *validate* (setf (referenced-notations *ctx*) (append names (referenced-notations *ctx*)))) (cons :NOTATION names))) (t (error "In p/att-type: ~S ~S." cat sem)))) ((eq cat :\() ;; XXX Die Nmtoken-Syntax pruefen wir derzeit nur beim Validieren. (let (names) ;;(expect input :\() (setf names (p/list input #'p/name :\| )) (expect input :\)) (cons :ENUMERATION names))) (t (error "In p/att-type: ~S ~S." cat sem)) ))) (defun p/default-decl (input) ;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' ;; | (('#FIXED' S)? AttValue) /* VC: Required Attribute */ ;; ;; /* VC: Attribute Default Legal */ ;; /* WFC: No < in Attribute Values */ ;; /* VC: Fixed Attribute Default */ (multiple-value-bind (cat sem) (peek-token input) (cond ((eq cat :|#REQUIRED|) (consume-token input) :REQUIRED) ((eq cat :|#IMPLIED|) (consume-token input) :IMPLIED) ((eq cat :|#FIXED|) (consume-token input) (p/S input) (list :FIXED (p/att-value input))) ((or (eq cat :\') (eq cat :\")) (list :DEFAULT (p/att-value input))) (t (error "p/default-decl: ~S ~S." cat sem)) ))) ;;;; ;; [70] EntityDecl ::= GEDecl | PEDecl ;; [71] GEDecl ::= '' ;; [72] PEDecl ::= '' ;; [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) ;; [74] PEDef ::= EntityValue | ExternalID ;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral ;; | 'PUBLIC' S PubidLiteral S SystemLiteral ;; [76] NDataDecl ::= S 'NDATA' S Name /* VC: Notation Declared */ (defun p/entity-decl (input) (let (name def kind) (expect input :|))) (defun report-entity (h kind name def) (etypecase def (external-entdef (let ((extid (entdef-extid def)) (ndata (entdef-ndata def))) (if ndata (sax:unparsed-entity-declaration h name (extid-public extid) (uri-rod (extid-system extid)) ndata) (sax:external-entity-declaration h kind name (extid-public extid) (uri-rod (extid-system extid)))))) (internal-entdef (sax:internal-entity-declaration h kind name (entdef-value def))))) (defun p/entity-def (input kind) (multiple-value-bind (cat sem) (peek-token input) (cond ((member cat '(:\" :\')) (make-internal-entdef (p/entity-value input))) ((and (eq cat :name) (or (equalp sem '#.(string-rod "SYSTEM")) (equalp sem '#.(string-rod "PUBLIC")))) (let (extid ndata) (setf extid (p/external-id input nil)) (when (eq kind :general) ;NDATA allowed at all? (cond ((eq (peek-token input) :S) (p/S? input) (when (and (eq (peek-token input) :name) (equalp (nth-value 1 (peek-token input)) '#.(string-rod "NDATA"))) (consume-token input) (p/S input) (setf ndata (p/name input)) (when *validate* (push ndata (referenced-notations *ctx*))))))) (make-external-entdef extid ndata))) (t (error "p/entity-def: ~S / ~S." cat sem)) ))) (defun p/entity-value (input) (let ((delim (if (eq (read-token input) :\") #/\" #/\'))) (read-att-value input (car (zstream-input-stack input)) :ENT nil delim))) (defun p/att-value (input) (let ((delim (if (eq (read-token input) :\") #/\" #/\'))) (read-att-value input (car (zstream-input-stack input)) :ATT t delim))) (defun p/external-id (input &optional (public-only-ok-p nil)) ;; xxx public-only-ok-p (multiple-value-bind (cat sem) (read-token input) (cond ((and (eq cat :name) (equalp sem '#.(string-rod "SYSTEM"))) (p/S input) (make-extid nil (p/system-literal input))) ((and (eq cat :name) (equalp sem '#.(string-rod "PUBLIC"))) (let (pub sys) (p/S input) (setf pub (p/pubid-literal input)) (when (eq (peek-token input) :S) (p/S input) (when (member (peek-token input) '(:\" :\')) (setf sys (p/system-literal input)))) (when (and (not public-only-ok-p) (null sys)) (error "System identifier needed for this PUBLIC external identifier.")) (make-extid pub sys))) (t (error "Expected external-id: ~S / ~S." cat sem))))) ;; [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") ;; [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" ;; [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] ;; | [-'()+,./:=?;!*#@$_%] (defun p/id (input) (multiple-value-bind (cat) (read-token input) (cond ((member cat '(:\" :\')) (let ((delim (if (eq cat :\") #/\" #/\'))) (with-rune-collector (collect) (loop (let ((c (read-rune (car (zstream-input-stack input))))) (cond ((eq c :eof) (error "EOF in system literal.")) ((rune= c delim) (return)) (t (collect c)))))))) (t (error "Expect either \" or \'."))))) ;; it is important to cache the orginal URI rod, since the re-serialized ;; uri-string can be different from the one parsed originally. (defun uri-rod (uri) (if uri (or (getf (puri:uri-plist uri) 'original-rod) (rod (puri:render-uri uri nil))) nil)) (defun safe-parse-uri (str) ;; puri doesn't like strings starting with file:///, although that is a very ;; common is practise. Cut it away, we don't distinguish between scheme ;; :FILE and NIL anway. (when (eql (search "file://" str) 0) (setf str (subseq str (length "file://")))) (puri:parse-uri str)) (defun p/system-literal (input) (let* ((rod (p/id input)) (result (safe-parse-uri (rod-string rod)))) (setf (getf (puri:uri-plist result) 'original-rod) rod) result)) (defun p/pubid-literal (input) (let ((result (p/id input))) (unless (every #'pubid-char-p result) (error "Illegal pubid: ~S." (rod-string result))) result)) ;;;; (defun p/element-decl (input) (let (name content) (expect input :|) (when *validate* (define-element (dtd *ctx*) name content)) (list :element name content))) (defun maybe-compile-cspec (e) (or (elmdef-compiled-cspec e) (setf (elmdef-compiled-cspec e) (let ((cspec (elmdef-content e))) (unless cspec (validity-error "(03) Element Valid: no definition for ~A" (rod-string (elmdef-name e)))) (multiple-value-call #'cons (compile-cspec cspec (standalone-check-necessary-p e))))))) (defun make-root-model (name) (cons (lambda (actual-name) (if (rod= actual-name name) (constantly :dummy) nil)) (constantly t))) ;;; content spec validation: ;;; ;;; Given a `contentspec', COMPILE-CSPEC returns as multiple values two ;;; functions A and B of one argument to be called for every ;;; A. child element ;;; B. text child node ;;; ;;; Function A will be called with ;;; - the element name rod as its argument. If that element may appear ;;; at the current position, a new function to be called for the next ;;; child is returned. Otherwise NIL is returned. ;;; - argument NIL at the end of the element, it must then return T or NIL ;;; to indicate whether the end tag is valid. ;;; ;;; Function B will be called with the character data rod as its argument, it ;;; returns a boolean indicating whether this text element is allowed. ;;; ;;; That is, if one of the functions ever returns NIL, the element is ;;; rejected as invalid. (defun cmodel-done (actual-value) (null actual-value)) (defun compile-cspec (cspec &optional standalone-check) (cond ((atom cspec) (ecase cspec (:EMPTY (values #'cmodel-done (constantly nil))) (:PCDATA (values #'cmodel-done (constantly t))) (:ANY (values (labels ((doit (name) (if name #'doit t))) #'doit) (constantly t))))) ((and (eq (car cspec) '*) (let ((subspec (second cspec))) (and (eq (car subspec) 'or) (eq (cadr subspec) :PCDATA)))) (values (compile-mixed (second cspec)) (constantly t))) (t (values (compile-content-model cspec) (lambda (rod) (when standalone-check (validity-error "(02) Standalone Document Declaration: whitespace")) (every #'white-space-rune-p rod)))))) (defun compile-mixed (cspec) ;; das koennten wir theoretisch auch COMPILE-CONTENT-MODEL erledigen lassen (let ((allowed-names (cddr cspec))) (labels ((doit (actual-name) (cond ((null actual-name) t) ((member actual-name allowed-names :test #'rod=) #'doit) (t nil)))) #'doit))) (defun compile-content-model (cspec &optional (continuation #'cmodel-done)) (if (vectorp cspec) (lambda (actual-name) (if (and actual-name (rod= cspec actual-name)) continuation nil)) (ecase (car cspec) (and (labels ((traverse (seq) (compile-content-model (car seq) (if (cdr seq) (traverse (cdr seq)) continuation)))) (traverse (cdr cspec)))) (or (let ((options (mapcar (rcurry #'compile-content-model continuation) (cdr cspec)))) (lambda (actual-name) (some (rcurry #'funcall actual-name) options)))) (? (let ((maybe (compile-content-model (second cspec) continuation))) (lambda (actual-name) (or (funcall maybe actual-name) (funcall continuation actual-name))))) (* (let (maybe-continuation) (labels ((recurse (actual-name) (if (null actual-name) (funcall continuation actual-name) (or (funcall maybe-continuation actual-name) (funcall continuation actual-name))))) (setf maybe-continuation (compile-content-model (second cspec) #'recurse)) #'recurse))) (+ (let ((it (cadr cspec))) (compile-content-model `(and ,it (* ,it)) continuation)))))) (defun setp (list &key (test 'eql)) (equal list (remove-duplicates list :test test))) (defun legal-content-model-p (cspec &optional validate) (or (eq cspec :PCDATA) (eq cspec :ANY) (eq cspec :EMPTY) (and (consp cspec) (eq (car cspec) '*) (consp (cadr cspec)) (eq (car (cadr cspec)) 'or) (eq (cadr (cadr cspec)) :PCDATA) (every #'vectorp (cddr (cadr cspec))) (if (and validate (not (setp (cddr (cadr cspec)) :test #'rod=))) (validity-error "VC: No Duplicate Types (07)") t)) (labels ((walk (x) (cond ((member x '(:PCDATA :ANY :EMPTY)) nil) ((atom x) t) ((and (walk (car x)) (walk (cdr x))))))) (walk cspec)))) ;; wir fahren besser, wenn wir machen: ;; cspec ::= 'EMPTY' | 'ANY' | '#PCDATA' ;; | Name ;; | cs ;; cs ::= '(' S? cspec ( S? '|' S? cspec)* S? ')' ('?' | '*' | '+')? ;; und eine post mortem analyse (defun p/cspec (input) (let ((term (let ((names nil) op-cat op res stream) (multiple-value-bind (cat sem) (peek-token input) (cond ((eq cat :name) (consume-token input) (cond ((rod= sem '#.(string-rod "EMPTY")) :EMPTY) ((rod= sem '#.(string-rod "ANY")) :ANY) (t sem))) ((eq cat :\#PCDATA) (consume-token input) :PCDATA) ((eq cat :\() (setf stream (car (zstream-input-stack input))) (consume-token input) (p/S? input) (setq names (list (p/cspec input))) (p/S? input) (cond ((member (peek-token input) '(:\| :\,)) (setf op-cat (peek-token input)) (setf op (if (eq op-cat :\,) 'and 'or)) (while (eq (peek-token input) op-cat) (consume-token input) (p/S? input) (push (p/cspec input) names) (p/S? input)) (setf res (cons op (reverse names)))) (t (setf res (cons 'and names)))) (p/S? input) (expect input :\)) (when *validate* (unless (eq stream (car (zstream-input-stack input))) (validity-error "(06) Proper Group/PE Nesting"))) res) (t (error "p/cspec - ~s / ~s" cat sem))))))) (cond ((eq (peek-token input) :?) (consume-token input) (list '? term)) ((eq (peek-token input) :+) (consume-token input) (list '+ term)) ((eq (peek-token input) :*) (consume-token input) (list '* term)) (t term)))) (defun normalize-mixed-cspec (cspec) ;; der Parser oben funktioniert huebsch fuer die children-Regel, aber ;; fuer Mixed ist das Ergebnis nicht praktisch, denn dort wollen wir ;; eigentlich auf eine Liste von Namen in einheitlichem Format hinaus. ;; Dazu normalisieren wir einfach in eine der beiden folgenden Formen: ;; (* (or :PCDATA ...rods...)) -- und zwar exakt so! ;; :PCDATA -- sonst ganz trivial (flet ((trivialp (c) (and (consp c) (and (eq (car c) 'and) (eq (cadr c) :PCDATA) (null (cddr c)))))) (if (or (trivialp cspec) ;(and PCDATA) (and (consp cspec) ;(* (and PCDATA)) (and (eq (car cspec) '*) (null (cddr cspec)) (trivialp (cadr cspec))))) :PCDATA cspec))) ;; [52] AttlistDecl ::= '' ;; [52] AttlistDecl ::= '' ;; [52] AttlistDecl ::= '' ;; [53] AttDefs ::= S Name S AttType S DefaultDecl AttDefs ;; [53] AttDefs ::= (defun p/notation-decl (input) (let (name id) (expect input :|) (sax:notation-declaration (handler *ctx*) name (if (extid-public id) (normalize-public-id (extid-public id)) nil) (uri-rod (extid-system id))) (when *validate* (define-notation (dtd *ctx*) name id)) (list :notation-decl name id))) (defun normalize-public-id (rod) (with-rune-collector (collect) (let ((gimme-20 nil) (anything-seen-p nil)) (map nil (lambda (c) (cond ((or (rune= c #/u+0009) (rune= c #/u+000A) (rune= c #/u+000D) (rune= c #/u+0020)) (setf gimme-20 t)) (t (when (and anything-seen-p gimme-20) (collect #/u+0020)) (setf gimme-20 nil) (setf anything-seen-p t) (collect c)))) rod)))) ;;; (defun p/conditional-sect (input) (expect input : initial-stream)) (defun p/ignore-sect (input initial-stream) ;; "))) (cond ((and (rune= c3 #/<) (rune= c2 #/!) (rune= c1 #/\[)) (incf level))) (cond ((and (rune= c3 #/\]) (rune= c2 #/\]) (rune= c1 #/>)) (decf level))) ))) (unless (eq (car (zstream-input-stack input)) initial-stream) (validity-error "(21) Proper Conditional Section/PE Nesting"))) (defun p/ext-subset-decl (input) ;; ( markupdecl | conditionalSect | S )* (loop (case (let ((*expand-pe-p* nil)) (peek-token input)) (:| )) (setf extid (p/external-id input t)))) (when dtd-extid (setf extid dtd-extid)) (p/S? input) (sax:start-dtd (handler *ctx*) name (and extid (extid-public extid)) (and extid (uri-rod (extid-system extid)))) (when (eq (peek-token input) :\[ ) (when (disallow-internal-subset *ctx*) (error "document includes an internal subset")) (ensure-dtd) (consume-token input) (while (progn (p/S? input) (not (eq (peek-token input) :\] ))) (if (eq (peek-token input) :PE-REFERENCE) (let ((name (nth-value 1 (read-token input)))) (recurse-on-entity input name :parameter (lambda (input) (etypecase (checked-get-entdef name :parameter) (external-entdef (p/ext-subset input)) (internal-entdef (p/ext-subset-decl input))) (unless (eq :eof (peek-token input)) (error "Trailing garbage."))))) (let ((*expand-pe-p* t)) (p/markup-decl input)))) (consume-token input) (p/S? input)) (expect input :>) (when extid (let* ((effective-extid (extid-using-catalog (absolute-extid input extid))) (sysid (extid-system effective-extid)) (fresh-dtd-p (null (dtd *ctx*))) (cached-dtd (and fresh-dtd-p (not (standalone-p *ctx*)) (getdtd sysid *dtd-cache*)))) (cond (cached-dtd (setf (dtd *ctx*) cached-dtd) (report-cached-dtd cached-dtd)) (t (let* ((xi2 (xstream-open-extid effective-extid)) (zi2 (make-zstream :input-stack (list xi2)))) (ensure-dtd) (p/ext-subset zi2) (when (and fresh-dtd-p *cache-all-dtds* *validate* (not (standalone-p *ctx*))) (setf (getdtd sysid *dtd-cache*) (dtd *ctx*)))))))) (sax:end-dtd (handler *ctx*)) (let ((dtd (dtd *ctx*))) (sax:entity-resolver (handler *ctx*) (lambda (name handler) (resolve-entity name handler dtd)))) (list :DOCTYPE name extid)))) (defun report-cached-dtd (dtd) (maphash (lambda (k v) (report-entity (handler *ctx*) :general k (cdr v))) (dtd-gentities dtd)) (maphash (lambda (k v) (report-entity (handler *ctx*) :parameter k (cdr v))) (dtd-pentities dtd)) (maphash (lambda (k v) (sax:notation-declaration (handler *ctx*) k (if (extid-public v) (normalize-public-id (extid-public v)) nil) (uri-rod (extid-system v)))) (dtd-notations dtd))) (defun p/misc*-2 (input) ;; Misc* (while (member (peek-token input) '(:COMMENT :PI :S)) (case (peek-token input) (:COMMENT (sax:comment (handler *ctx*) (nth-value 1 (peek-token input)))) (:PI (sax:processing-instruction (handler *ctx*) (car (nth-value 1 (peek-token input))) (cdr (nth-value 1 (peek-token input)))))) (consume-token input))) (defun p/document (input handler &key validate dtd root entity-resolver disallow-internal-subset) ;; check types of user-supplied arguments for better error messages: (check-type validate boolean) (check-type dtd (or null extid)) (check-type root (or null rod)) (check-type entity-resolver (or null function symbol)) (check-type disallow-internal-subset boolean) (let ((*ctx* (make-context :handler handler :entity-resolver entity-resolver :disallow-internal-subset disallow-internal-subset)) (*validate* validate)) (sax:start-document handler) ;; document ::= XMLDecl? Misc* (doctypedecl Misc*)? element Misc* ;; Misc ::= Comment | PI | S ;; xmldecl::='' ;; sddecl::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) ;; ;; we will use the attribute-value parser for the xml decl. (let ((*data-behaviour* :DTD)) ;; optional XMLDecl? (cond ((eq (peek-token input) :xml-pi) (let ((hd (parse-xml-pi (cdr (nth-value 1 (peek-token input))) t))) (setf (standalone-p *ctx*) (eq (xml-header-standalone-p hd) :yes)) (setup-encoding input hd)) (read-token input))) (set-full-speed input) ;; Misc* (p/misc*-2 input) ;; (doctypedecl Misc*)? (cond ((eq (peek-token input) :xstream ""))) (setf (xstream-name dummy) (make-stream-name :entity-name "dummy doctype" :entity-kind :main :uri (zstream-base-sysid input))) (p/doctype-decl (make-zstream :input-stack (list dummy)) dtd))) ((and validate (not dtd)) (validity-error "invalid document: no doctype"))) (ensure-dtd) ;; Override expected root element if asked to (when root (setf (model-stack *ctx*) (list (make-root-model root)))) ;; element (let ((*data-behaviour* :DOC)) (p/element input)) ;; optional Misc* (p/misc*-2 input) (unless (eq (peek-token input) :eof) (error "Garbage at end of document.")) (when *validate* (maphash (lambda (k v) (unless v (validity-error "(11) IDREF: ~S not defined" (rod-string k)))) (id-table *ctx*)) (dolist (name (referenced-notations *ctx*)) (unless (find-notation name (dtd *ctx*)) (validity-error "(23) Notation Declared: ~S" (rod-string name))))) (sax:end-document handler)))) (defun p/element (input) (if sax:*namespace-processing* (p/element-ns input) (p/element-no-ns input))) (defun p/element-no-ns (input) ;; [39] element ::= EmptyElemTag | STag content ETag (error "sorry, bitrot") (multiple-value-bind (cat sem) (read-token input) (cond ((eq cat :ztag) (sax:start-element (handler *ctx*) nil nil (car sem) (build-attribute-list-no-ns (cdr sem))) (sax:end-element (handler *ctx*) nil nil (car sem))) ((eq cat :stag) (sax:start-element (handler *ctx*) nil nil (car sem) (build-attribute-list-no-ns (cdr sem))) (p/content input) (multiple-value-bind (cat2 sem2) (read-token input) (unless (and (eq cat2 :etag) (eq (car sem2) (car sem))) (perror input "Bad nesting. ~S / ~S" (mu sem) (mu (cons cat2 sem2))))) (sax:end-element (handler *ctx*) nil nil (car sem))) (t (error "Expecting element."))))) (defun p/element-ns (input) (destructuring-bind (cat (name &rest attrs)) (multiple-value-list (read-token input)) (validate-start-element *ctx* name) (let ((ns-decls (declare-namespaces name attrs))) (multiple-value-bind (ns-uri prefix local-name) (decode-qname name) (declare (ignore prefix)) (let* ((raw-attlist (build-attribute-list-ns attrs)) (attlist (remove-if-not (lambda (a) (or sax:*include-xmlns-attributes* (not (xmlns-attr-p (sax:attribute-qname a))))) (process-attributes *ctx* name raw-attlist)))) (cond ((eq cat :ztag) (sax:start-element (handler *ctx*) ns-uri local-name name attlist) (sax:end-element (handler *ctx*) ns-uri local-name name)) ((eq cat :stag) (sax:start-element (handler *ctx*) ns-uri local-name name attlist) (p/content input) (multiple-value-bind (cat2 sem2) (read-token input) (unless (and (eq cat2 :etag) (eq (car sem2) name)) (perror input "Bad nesting. ~S / ~S" (mu name) (mu (cons cat2 sem2))))) (sax:end-element (handler *ctx*) ns-uri local-name name)) (t (error "Expecting element, got ~S." cat))))) (undeclare-namespaces ns-decls)) (validate-end-element *ctx* name))) (defun perror (stream format-string &rest format-args) (when (zstream-p stream) (setf stream (car (zstream-input-stack stream)))) (error "Parse error at line ~D column ~D: ~A" (xstream-line-number stream) (xstream-column-number stream) (apply #'format nil format-string format-args))) (defun p/content (input) ;; [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)* (multiple-value-bind (cat sem) (peek-token input) (case cat ((:stag :ztag) (p/element input) (p/content input)) ((:CDATA) (consume-token input) (validate-characters *ctx* sem) (sax:characters (handler *ctx*) sem) (p/content input)) ((:ENTITY-REF) (let ((name sem)) (consume-token input) (append ;; nil #+(OR) (recurse-on-entity input name :general (lambda (input) (prog1 (etypecase (checked-get-entdef name :general) (internal-entdef (p/content input)) (external-entdef (p/ext-parsed-ent input))) (unless (eq (peek-token input) :eof) (error "Trailing garbage. - ~S" (peek-token input)))))) (p/content input)))) ((:' content (when (eq (peek-token input) :xml-pi) (let ((hd (parse-xml-pi (cdr (nth-value 1 (peek-token input))) nil))) (setup-encoding input hd)) (consume-token input) ) (set-full-speed input) (p/content input)) (defun parse-xml-pi (content sd-ok-p) ;; --> xml-header ;;(make-xml-header)) (let* ((res (make-xml-header)) (i (make-rod-xstream content)) (atts (read-attribute-list 'foo i t))) ;xxx on 'foo (unless (eq (peek-rune i) :eof) (error "Garbage at end of XML PI.")) ;; versioninfo muss da sein ;; dann ? encodingdecl ;; dann ? sddecl ;; dann ende (when (and (not (eq (caar atts) (intern-name '#.(string-rod "version")))) sd-ok-p) (error "XML PI needs version.")) (when (eq (caar atts) (intern-name '#.(string-rod "version"))) (unless (and (>= (length (cdar atts)) 1) (every (lambda (x) (or (rune<= #/a x #/z) (rune<= #/A x #/Z) (rune<= #/0 x #/9) (rune= x #/_) (rune= x #/.) (rune= x #/:) (rune= x #/-))) (cdar atts))) (error "Bad XML version number: ~S." (rod-string (cdar atts)))) (setf (xml-header-version res) (rod-string (cdar atts))) (pop atts)) (when (eq (caar atts) (intern-name '#.(string-rod "encoding"))) (unless (and (>= (length (cdar atts)) 1) (every (lambda (x) (or (rune<= #/a x #/z) (rune<= #/A x #/Z) (rune<= #/0 x #/9) (rune= x #/_) (rune= x #/.) (rune= x #/-))) (cdar atts)) ((lambda (x) (or (rune<= #/a x #/z) (rune<= #/A x #/Z) (rune<= #/0 x #/9))) (aref (cdar atts) 0))) (error "Bad XML encoding name: ~S." (rod-string (cdar atts)))) (setf (xml-header-encoding res) (rod-string (cdar atts))) (pop atts)) (when (and sd-ok-p (eq (caar atts) (intern-name '#.(string-rod "standalone")))) (unless (or (rod= (cdar atts) '#.(string-rod "yes")) (rod= (cdar atts) '#.(string-rod "no"))) (error "Hypersensitivity pitfall: ~ XML PI's 'standalone' attribute must be exactly \"yes\" or \"no\" and not ~S." (rod-string (cdar atts)))) (setf (xml-header-standalone-p res) (if (rod-equal '#.(string-rod "yes") (cdar atts)) :yes :no)) (pop atts)) (when atts (error "XML designers decided to disallow future extensions to the set ~ of allowed XML PI's attributes -- you might have lost big on ~S (~S)" (rod-string content) sd-ok-p )) res)) ;;;; --------------------------------------------------------------------------- ;;;; mu ;;;; (defun mu (x) (cond ((stringp x) x) ((vectorp x) (rod-string x)) ((consp x) (cons (mu (car x)) (mu (cdr x)))) (x))) ;;;; --------------------------------------------------------------------------- ;;;; User interface ;;;; (defun specific-or (component &optional (alternative nil)) (if (eq component :unspecific) alternative component)) (defun string-or (str &optional (alternative nil)) (if (zerop (length str)) alternative str)) (defun make-uri (&rest initargs &key path query &allow-other-keys) (apply #'make-instance 'puri:uri :path (and path (escape-path path)) :query (and query (escape-query query)) initargs)) (defun escape-path (list) (puri::render-parsed-path list t)) (defun escape-query (pairs) (flet ((escape (str) (puri::encode-escaped-encoding str puri::*reserved-characters* t))) (let ((first t)) (with-output-to-string (s) (dolist (pair pairs) (if first (setf first nil) (write-char #\& s)) (write-string (escape (car pair)) s) (write-char #\= s) (write-string (escape (cdr pair)) s)))))) (defun uri-parsed-query (uri) (flet ((unescape (str) (puri::decode-escaped-encoding str t puri::*reserved-characters*))) (let ((str (puri:uri-query uri))) (cond (str (let ((pairs '())) (dolist (s (split-sequence-if (lambda (x) (eql x #\&)) str)) (destructuring-bind (name value) (split-sequence-if (lambda (x) (eql x #\=)) s) (push (cons (unescape name) (unescape value)) pairs))) (reverse pairs))) (t nil))))) (defun query-value (name alist) (cdr (assoc name alist :test #'equal))) (defun pathname-to-uri (pathname) (let ((path (append (pathname-directory pathname) (list (if (specific-or (pathname-type pathname)) (concatenate 'string (pathname-name pathname) "." (pathname-type pathname)) (pathname-name pathname)))))) (if (eq (car path) :relative) (make-uri :path path) (make-uri :scheme :file :host (concatenate 'string (string-or (host-namestring pathname)) "+" (specific-or (pathname-device pathname))) :path path)))) (defun parse-name.type (str) (if str (let ((i (position #\. str :from-end t))) (if i (values (subseq str 0 i) (subseq str (1+ i))) (values str nil))) (values nil nil))) (defun uri-to-pathname (uri) (let ((scheme (puri:uri-scheme uri)) (path (puri:uri-parsed-path uri))) (unless (member scheme '(nil :file)) (error 'parser-error :format-control "URI scheme ~S not supported" :format-arguments (list scheme))) (if (eq (car path) :relative) (multiple-value-bind (name type) (parse-name.type (car (last path))) (make-pathname :directory (butlast path) :name name :type type)) (multiple-value-bind (name type) (parse-name.type (car (last (cdr path)))) (destructuring-bind (host device) (split-sequence-if (lambda (x) (eql x #\+)) (or (puri:uri-host uri) "+")) (make-pathname :host (string-or host) :device (string-or device) :directory (cons :absolute (butlast (cdr path))) :name name :type type)))))) (defun parse-xstream (xstream handler &rest args) (let ((zstream (make-zstream :input-stack (list xstream)))) (peek-rune xstream) (with-scratch-pads () (apply #'p/document zstream handler args)))) (defun parse-file (filename handler &rest args) (with-open-xfile (input filename) (setf (xstream-name input) (make-stream-name :entity-name "main document" :entity-kind :main :uri (pathname-to-uri filename))) (apply #'parse-xstream input handler args))) (defun resolve-synonym-stream (stream) (while (typep stream 'synonym-stream) (setf stream (symbol-value (synonym-stream-symbol stream)))) stream) (defun safe-stream-sysid (stream) (if (and (typep (resolve-synonym-stream stream) 'file-stream) (pathname stream)) (pathname-to-uri (pathname stream)) nil)) (defun parse-stream (stream handler &rest args) (let ((xstream (make-xstream stream :name (make-stream-name :entity-name "main document" :entity-kind :main :uri (safe-stream-sysid stream)) :initial-speed 1))) (apply #'parse-xstream xstream handler args))) (defun parse-dtd-file (filename &optional handler) (with-open-file (s filename :element-type '(unsigned-byte 8)) (parse-dtd-stream s handler))) (defun parse-dtd-stream (stream &optional handler) (let ((input (make-xstream stream))) (setf (xstream-name input) (make-stream-name :entity-name "dtd" :entity-kind :main :uri (safe-stream-sysid stream))) (let ((zstream (make-zstream :input-stack (list input))) (*ctx* (make-context :handler handler)) (*validate* t) (*data-behaviour* :DTD)) (with-scratch-pads () (ensure-dtd) (peek-rune input) (p/ext-subset zstream) (dtd *ctx*))))) (defun parse-string (string handler) ;; XXX this function mis-handles encoding (with-scratch-pads () (let* ((x (string->xstream string)) (z (make-zstream :input-stack (list x)))) (p/document z handler)))) (defun string->xstream (string) ;; XXX encoding is mis-handled by this kind of stream (make-rod-xstream (string-rod string))) (defclass octet-input-stream (fundamental-binary-input-stream) ((octets :initarg :octets) (pos :initform 0))) (defmethod close ((stream octet-input-stream) &key abort) (declare (ignore abort)) (open-stream-p stream)) (defmethod stream-read-byte ((stream octet-input-stream)) (with-slots (octets pos) stream (if (>= pos (length octets)) :eof (prog1 (elt octets pos) (incf pos))))) (defmethod stream-read-sequence ((stream octet-input-stream) sequence &optional (start 0) (end (length sequence))) (with-slots (octets pos) stream (let* ((length (min (- end start) (- (length octets) pos))) (end1 (+ start length)) (end2 (+ pos length))) (replace sequence octets :start1 start :end1 end1 :start2 pos :end2 end2) (setf pos end2) end1))) (defun make-octet-input-stream (octets) (make-instance 'octet-input-stream :octets octets)) (defun parse-octets (octets handler &rest args) (apply #'parse-stream (make-octet-input-stream octets) handler args)) ;;;; #+allegro (defmacro sp (&body body) `(progn (prof:with-profiling (:type :space) .,body) (prof:show-flat-profile))) #+allegro (defmacro tm (&body body) `(progn (prof:with-profiling (:type :time) .,body) (prof:show-flat-profile))) ;;;; (defun zstream-push (new-xstream zstream) (cond ((find-if (lambda (x) (and (xstream-p x) (eql (stream-name-entity-name (xstream-name x)) (stream-name-entity-name (xstream-name new-xstream))) (eql (stream-name-entity-kind (xstream-name x)) (stream-name-entity-kind (xstream-name new-xstream))))) (zstream-input-stack zstream)) (error "Infinite recursion."))) (push new-xstream (zstream-input-stack zstream)) zstream) (defun recurse-on-entity (zstream name kind continuation) (assert (not (zstream-token-category zstream))) ;;(sleep .2) ;;(warn "~S / ~S[~S]." (zstream-input-stack zstream) (mu name) kind) (call-with-entity-expansion-as-stream zstream (lambda (new-xstream) (push :stop (zstream-input-stack zstream)) (zstream-push new-xstream zstream) (prog1 (funcall continuation zstream) (assert (eq (peek-token zstream) :eof)) (assert (eq (pop (zstream-input-stack zstream)) new-xstream)) (close-xstream new-xstream) (assert (eq (pop (zstream-input-stack zstream)) :stop)) (setf (zstream-token-category zstream) nil) '(consume-token zstream)) ) name kind)) ;;;; #| (defparameter *test-files* '(;;"jclark:xmltest;not-wf;*;*.xml" "jclark:xmltest;valid;*;*.xml" ;;"jclark:xmltest;invalid;*.xml" )) (defun run-all-tests (&optional (test-files *test-files*)) (let ((failed nil)) (dolist (k test-files) (dolist (j (sort (directory k) #'string< :key #'pathname-name)) (unless (test-file j) (push j failed)))) (fresh-line) (cond (failed (write-string "**** Test failed on") (dolist (k failed) (format t "~%**** ~S." k)) nil) (t (write-string "**** Test passed!") t)))) (defun test-file (filename) (let ((out-filename (merge-pathnames "out/" filename))) (if (probe-file out-filename) (positive-test-file filename out-filename) (negative-test-file filename)))) (defun positive-test-file (filename out-filename) (multiple-value-bind (nodes condition) (ignore-errors (parse-file filename)) (cond (condition (warn "**** Error in ~S: ~A." filename condition) nil) (t (let (res equal?) (setf res (with-output-to-string (sink) (unparse-document nodes sink))) (setf equal? (with-open-file (in out-filename :direction :input :element-type 'character) (do ((i 0 (+ i 1)) (c (read-char in nil nil) (read-char in nil nil))) ((or (eq c nil) (= i (length res))) (and (eq c nil) (= i (length res)))) (unless (eql c (char res i)) (return nil))))) (cond ((not equal?) (format t "~&**** Test failed on ~S." filename) (fresh-line) (format t "** me: ~A" res) (fresh-line) (format t "** he: " res) (finish-output) (with-open-file (in out-filename :direction :input :element-type 'character) (do ((c (read-char in nil nil) (read-char in nil nil))) ((eq c nil)) (write-char c))) nil) (t t))))))) (defun negative-test-file (filename) (multiple-value-bind (nodes condition) (ignore-errors (parse-file filename)) (declare (ignore nodes)) (cond (condition t) (t (warn "**** negative test failed on ~S." filename))))) |# ;;;; #+(or) ;was ist das? (progn (defmethod dom:create-processing-instruction ((document null) target data) (declare (ignorable document target data)) nil) (defmethod dom:append-child ((node null) child) (declare (ignorable node child)) nil) (defmethod dom:create-element ((document null) name) (declare (ignorable document name)) nil) (defmethod dom:set-attribute ((document null) name value) (declare (ignorable document name value)) nil) (defmethod dom:create-text-node ((document null) data) (declare (ignorable document data)) nil) (defmethod dom:create-cdata-section ((document null) data) (declare (ignorable document data)) nil) ) #|| (defmacro read-data-until* ((predicate input res res-start res-end) &body body) ;; fast variant -- for now disabled for no apparent reason ;; -> res, res-start, res-end `(let* ((rptr (xstream-read-ptr ,input)) (p0 rptr) (fptr (xstream-fill-ptr ,input)) (buf (xstream-buffer ,input)) ,res ,res-start ,res-end) (declare (type fixnum rptr fptr p0) (type (simple-array read-element (*)) buf)) (loop (cond ((%= rptr fptr) ;; underflow -- hmm inject the scratch-pad with what we ;; read and continue, while using read-rune and collecting ;; d.h. besser wäre hier auch while-reading zu benutzen. (setf (xstream-read-ptr ,input) rptr) (multiple-value-setq (,res ,res-start ,res-end) (with-rune-collector/raw (collect) (do ((i p0 (%+ i 1))) ((%= i rptr)) (collect (%rune buf i))) (let (c) (loop (cond ((%= rptr fptr) (setf (xstream-read-ptr ,input) rptr) (setf c (peek-rune input)) (cond ((eq c :eof) (return))) (setf rptr (xstream-read-ptr ,input) fptr (xstream-fill-ptr ,input) buf (xstream-buffer ,input))) (t (setf c (%rune buf rptr)))) (cond ((,predicate c) ;; we stop (setf (xstream-read-ptr ,input) rptr) (return)) (t ;; we continue (collect c) (setf rptr (%+ rptr 1))) ))))) (return)) ((,predicate (%rune buf rptr)) ;; we stop (setf (xstream-read-ptr ,input) rptr) (setf ,res buf ,res-start p0 ,res-end rptr) (return) ) (t we continue (sf rptr (%+ rptr 1))) )) ,@body )) ||# ;(defun read-data-until (predicate input continuation) ; ) (defmacro read-data-until* ((predicate input res res-start res-end) &body body) "Read data from `input' until `predicate' applied to the read char turns true. Then execute `body' with `res', `res-start', `res-end' bound to denote a subsequence (of RUNEs) containing the read portion. The rune upon which `predicate' turned true is neither consumed from the stream, nor included in `res'. Keep the predicate short, this it may be included more than once into the macro's expansion." ;; (let ((input-var (gensym)) (collect (gensym)) (c (gensym))) `(LET ((,input-var ,input)) (MULTIPLE-VALUE-BIND (,res ,res-start ,res-end) (WITH-RUNE-COLLECTOR/RAW (,collect) (LOOP (LET ((,c (PEEK-RUNE ,input-var))) (COND ((EQ ,c :EOF) ;; xxx error message (RETURN)) ((FUNCALL ,predicate ,c) (RETURN)) (t (,collect ,c) (CONSUME-RUNE ,input-var)))))) (LOCALLY ,@body))))) (defun read-name-token (input) (read-data-until* ((lambda (rune) (declare (type rune rune)) (not (name-rune-p rune))) input r rs re) (intern-name r rs re))) (defun read-cdata (input) (read-data-until* ((lambda (rune) (declare (type rune rune)) (or (%rune= rune #/<) (%rune= rune #/&))) input source start end) (locally (declare (type (simple-array rune (*)) source) (type ufixnum start) (type ufixnum end) (optimize (speed 3) (safety 0))) (let ((res (make-array (%- end start) :element-type 'rune))) (declare (type (simple-array rune (*)) res)) (let ((i (%- end start))) (declare (type ufixnum i)) (loop (setf i (- i 1)) (setf (%rune res i) (%rune source (the ufixnum (+ i start)))) (when (= i 0) (return)))) res)))) (defun internal-entity-expansion (name) (let ((def (get-entity-definition name :general (dtd *ctx*)))) (unless def (error "Entity '~A' is not defined." (rod-string name))) (unless (typep def 'internal-entdef) (error "Entity '~A' is not an internal entity." name)) (or (entdef-expansion def) (setf (entdef-expansion def) (find-internal-entity-expansion name))))) (defun find-internal-entity-expansion (name) (let ((zinput (make-zstream))) (with-rune-collector-3 (collect) (labels ((muffle (input) (let (c) (loop (setf c (read-rune input)) (cond ((eq c :eof) (return)) ((rune= c #/&) (setf c (peek-rune input)) (cond ((rune= c #/#) (let ((c (read-numeric-entity input))) (%put-unicode-char c collect))) (t (unless (name-start-rune-p (peek-rune input)) (error "Expecting name after &.")) (let ((name (read-name-token input))) (setf c (read-rune input)) (assert (rune= c #/\;)) (recurse-on-entity zinput name :general (lambda (zinput) (muffle (car (zstream-input-stack zinput))))))))) ((and (rune= c #/<)) ;; xxx fix error message (cerror "Eat them in spite of this." "For no apparent reason #\/< is forbidden in attribute values. ~ You lost -- next time choose SEXPR syntax.") (collect c)) ((space-rune-p c) (collect #/space)) ((not (data-rune-p c)) (error "illegal char: ~S." c)) (t (collect c))))))) (declare (dynamic-extent #'muffle)) (recurse-on-entity zinput name :general (lambda (zinput) (muffle (car (zstream-input-stack zinput))))) )))) (defun resolve-entity (name handler dtd) (let ((*validate* nil)) (if (get-entity-definition name :general dtd) (let* ((*ctx* (make-context :handler handler :dtd dtd)) (input (make-zstream)) (*data-behaviour* :DOC)) (with-scratch-pads () (recurse-on-entity input name :general (lambda (input) (prog1 (etypecase (checked-get-entdef name :general) (internal-entdef (p/content input)) (external-entdef (p/ext-parsed-ent input))) (unless (eq (peek-token input) :eof) (error "Trailing garbage. - ~S" (peek-token input)))))))) nil))) (defun read-att-value-2 (input) (let ((delim (read-rune input))) (unless (member delim '(#/\" #/\') :test #'eql) (error "Bad attribute value delimiter ~S, must be either #\\\" or #\\\'." (rune-char delim delim))) (with-rune-collector-4 (collect) (loop (let ((c (read-rune input))) (cond ((eq c :eof) (error "EOF")) ((rune= c delim) (return)) ((rune= #/& c) (multiple-value-bind (kind sem) (read-entity-ref input) (ecase kind (:NUMERIC (%put-unicode-char sem collect)) (:NAMED (let* ((exp (internal-entity-expansion sem)) (n (length exp))) (declare (type (simple-array rune (*)) exp)) (do ((i 0 (%+ i 1))) ((%= i n)) (collect (%rune exp i)))))))) ((space-rune-p c) (collect #/u+0020)) (t (collect c)))))))) ;;;;;;;;;;;;;;;;; ;;; Namespace stuff (defvar *default-namespace-bindings* '((#"" . nil) (#"xmlns" . #"http://www.w3.org/2000/xmlns/") (#"xml" . #"http://www.w3.org/XML/1998/namespace"))) ;; We already know that name is part of a valid XML name, so all we ;; have to check is that the first rune is a name-start-rune and that ;; there is not colon in it. (defun nc-name-p (name) (and (name-start-rune-p (rune name 0)) (notany #'(lambda (rune) (rune= #/: rune)) name))) (defun split-qname (qname) (declare (type runes:simple-rod qname)) (let ((pos (position #/: qname))) (if pos (let ((prefix (subseq qname 0 pos)) (local-name (subseq qname (1+ pos)))) (if (nc-name-p local-name) (values prefix local-name) (error "~S is not a valid NcName." local-name))) (values () qname)))) (defun decode-qname (qname) "decode-qname name => namespace-uri, prefix, local-name" (declare (type runes:simple-rod qname)) (multiple-value-bind (prefix local-name) (split-qname qname) (let ((uri (find-namespace-binding prefix))) (if uri (values uri prefix local-name) (values nil nil nil))))) (defun find-namespace-binding (prefix) (cdr (or (assoc (or prefix #"") (namespace-bindings *ctx*) :test #'rod=) (error "Undeclared namespace prefix: ~A" (rod-string prefix))))) ;; FIXME: Should probably be refactored by adding :start and :end to rod=/rod-equal (defun rod-starts-with (prefix rod) (and (<= (length prefix) (length rod)) (dotimes (i (length prefix) t) (unless (rune= (rune prefix i) (rune rod i)) (return nil))))) (defun xmlns-attr-p (attr-name) (rod-starts-with #.(string-rod "xmlns") attr-name)) (defun attrname->prefix (attrname) (if (< 5 (length attrname)) (subseq attrname 6) nil)) (defun find-namespace-declarations (element attr-alist) (let ((result (mapcar #'(lambda (attr) (cons (attrname->prefix (car attr)) (cdr attr))) (remove-if-not #'xmlns-attr-p attr-alist :key #'car)))) ;; Argh! PROCESS-ATTRIBUTES needs to know the attributes' namespaces ;; already. But namespace declarations can be done using default values ;; in the DTD. So we need to handle defaulting of attribute values twice, ;; once for xmlns attributes, then for all others. (I really hope I'm ;; wrong on this one, but I don't see how.) (let ((e (find-element element (dtd *ctx*)))) (when e (dolist (ad (elmdef-attributes e)) ;handle default values (let* ((name (attdef-name ad)) (prefix (attrname->prefix name))) (when (and (xmlns-attr-p name) (not (member prefix result :key #'car :test #'rod=)) (listp (attdef-default ad)) ;:DEFAULT or :FIXED ) (push (cons prefix (cadr (attdef-default ad))) result)))))) result)) (defun declare-namespaces (element attr-alist) (let ((ns-decls (find-namespace-declarations element attr-alist))) (dolist (ns-decl ns-decls ) ;; check some namespace validity constraints ;; FIXME: Would be nice to add "this is insane, go ahead" restarts (let ((prefix (car ns-decl)) (uri (if (rod= #"" (cdr ns-decl)) nil (cdr ns-decl)))) (cond ((and (rod= prefix #"xml") (not (rod= uri #"http://www.w3.org/XML/1998/namespace"))) (error "Attempt to rebind the prefix \"xml\" to ~S." (mu uri))) ((and (rod= uri #"http://www.w3.org/XML/1998/namespace") (not (rod= prefix #"xml"))) (error "The namespace URI \"http://www.w3.org/XML/1998/namespace\" ~ may not be bound to the prefix ~S, only \"xml\" is legal." (mu prefix))) ((and (rod= prefix #"xmlns") (rod= uri #"http://www.w3.org/2000/xmlns/")) (error "Attempt to bind the prefix \"xmlns\" to its predefined ~ URI \"http://www.w3.org/2000/xmlns/\", which is ~ forbidden for no good reason.")) ((rod= prefix #"xmlns") (error "Attempt to bind the prefix \"xmlns\" to the URI ~S, ~ but it may not be declared." (mu uri))) ((rod= uri #"http://www.w3.org/2000/xmlns/") (error "The namespace URI \"http://www.w3.org/2000/xmlns/\" may ~ not be bound to prefix ~S (or any other)." (mu prefix))) ((and (rod= uri #"") prefix) (error "Only the default namespace (the one without a prefix) may ~ be bound to an empty namespace URI, thus undeclaring it.")) (t (push (cons prefix uri) (namespace-bindings *ctx*)) (sax:start-prefix-mapping (handler *ctx*) (car ns-decl) (cdr ns-decl)))))) ns-decls)) (defun undeclare-namespaces (ns-decls) (dolist (ns-decl ns-decls) (setf (namespace-bindings *ctx*) (delete ns-decl (namespace-bindings *ctx*))) (sax:end-prefix-mapping (handler *ctx*) (car ns-decl)))) (defun build-attribute-list-no-ns (attr-alist) (mapcar #'(lambda (pair) (sax:make-attribute :qname (car pair) :value (cdr pair) :specified-p t)) attr-alist)) ;; FIXME: Use a non-braindead way to enforce attribute uniqueness (defun build-attribute-list-ns (attr-alist) (let (attributes) (dolist (pair attr-alist) (push (build-attribute (car pair) (cdr pair) t) attributes)) ;; 5.3 Uniqueness of Attributes ;; In XML documents conforming to [the xmlns] specification, no ;; tag may contain two attributes which: ;; 1. have identical names, or ;; 2. have qualified names with the same local part and with ;; prefixes which have been bound to namespace names that are ;; identical. ;; ;; 1. is checked by read-tag-2, so we only deal with 2 here (do ((sublist attributes (cdr sublist))) ((null sublist) attributes) (let ((attr-1 (car sublist))) (when (and (sax:attribute-namespace-uri attr-1) (find-if #'(lambda (attr-2) (and (rod= (sax:attribute-namespace-uri attr-1) (sax:attribute-namespace-uri attr-2)) (rod= (sax:attribute-local-name attr-1) (sax:attribute-local-name attr-2)))) (cdr sublist))) (error "Multiple definitions of attribute ~S in namespace ~S." (mu (sax:attribute-local-name attr-1)) (mu (sax:attribute-namespace-uri attr-1)))))))) (defun build-attribute (name value specified-p) (multiple-value-bind (prefix local-name) (split-qname name) (declare (ignorable local-name)) (if (or (not prefix) ;; default namespace doesn't apply to attributes (and (rod= #"xmlns" prefix) (not sax:*use-xmlns-namespace*))) (sax:make-attribute :qname name :value value :specified-p specified-p) (multiple-value-bind (uri prefix local-name) (decode-qname name) (declare (ignore prefix)) (sax:make-attribute :qname name :value value :namespace-uri uri :local-name local-name :specified-p specified-p))))) ;;; Faster constructors ;; Since using the general DOM interface to construct the parsed trees ;; may turn out to be quite expensive (That depends on the underlying ;; DOM implementation). A particular DOM implementation may choose to ;; implement an XML:FAST-CONSTRUCTORS method: ;; XML:FAST-CONSTRUCTORS document [method] ;; ;; Return an alist of constructors suitable for the document `document'. ;; ;; (:MAKE-TEXT document parent data) ;; (:MAKE-PROCESSING-INSTRUCTION document parent target content) ;; (:MAKE-NODE document parent attributes content) ;; [`attributes' now in turn is an alist] ;; (:MAKE-CDATA document parent data) ;; (:MAKE-COMMENT document parent data) ;; ;;;;;;;;;;;;;;;;; ;; System Identifier Protocol ;; A system identifier is an object obeying to the system identifier ;; protocol. Often something like an URL or a pathname. ;; OPEN-SYS-ID sys-id [generic function] ;; ;; Opens the resource associated with the system identifier `sys-id' ;; for reading and returns a stream. For now it is expected, that the ;; stream is an octet stream (one of element type (unsigned-byte 8)). ;; ;; More precisely: The returned object only has to obey to the xstream ;; controller protocol. (That is it has to provide implementations for ;; READ-OCTETS and XSTREAM-CONTROLLER-CLOSE). ;; MERGE-SYS-ID sys-id base [generic function] ;; ;; Merges two system identifiers. That is resolve `sys-id' relative to ;; `base' yielding an absolute system identifier suitable for ;; OPEN-SYS-ID. ;; xstream Controller Protocol ;; ;; #|| (defun xml-parse (system-id &key document standalone-p) ) ||# ;;;;;;;;;;;;;;;;; ;;; SAX validation handler (defclass validator () ((context :initarg :context :accessor context) (cdatap :initform nil :accessor cdatap))) (defun make-validator (dtd root) (make-instance 'validator :context (make-context :handler nil :dtd dtd :model-stack (list (make-root-model root))))) (macrolet ((with-context ((validator) &body body) `(let ((*ctx* (context ,validator)) (*validate* t)) (with-scratch-pads () ;nicht schoen ,@body)))) (defmethod sax:start-element ((handler validator) uri lname qname attributes) uri lname (with-context (handler) (validate-start-element *ctx* qname) (process-attributes *ctx* qname attributes))) (defmethod sax:start-cdata ((handler validator)) (setf (cdatap handler) t)) (defmethod sax:characters ((handler validator) data) (with-context (handler) (validate-characters *ctx* (if (cdatap handler) #"hack" data)))) (defmethod sax:end-cdata ((handler validator)) (setf (cdatap handler) nil)) (defmethod sax:end-element ((handler validator) uri lname qname) uri lname (with-context (handler) (validate-end-element *ctx* qname))))