diff --git a/cxml.asd b/cxml.asd index b3b8102..11a585f 100644 --- a/cxml.asd +++ b/cxml.asd @@ -1,58 +1,112 @@ +;;; XXX Die vielen verschiedenen Systeme hier sollten vielleicht +;;; Module eines grossen Systems CXML werden? + (defpackage :cxml-system (:use :asdf :cl)) (in-package :cxml-system) +;; XXX das sollte natuerlich erst beim laden stattfinden +#+cmu +(require :gray-streams) + (defclass closure-source-file (cl-source-file) ()) #+sbcl (defmethod perform :around ((o compile-op) (s closure-source-file)) ;; shut up already. Correctness first. (handler-bind ((sb-ext:compiler-note #'muffle-warning)) - (call-next-method))) + (let (#+sbcl (*compile-print* nil)) + (call-next-method)))) -(unless (find-package :glisp) - (defpackage :glisp)) +#-(or rune-is-character rune-is-octet) +(progn + (format t "~&;;; Checking for wide character support...") + (force-output) + (pushnew (dotimes (x 65536 + (progn + (format t " ok, characters have at least 16 bits.~%") + :rune-is-character)) + (unless (and (< x char-code-limit) (code-char x)) + (format t " no, reverting to octet strings.~%") + (return :rune-is-octet))) + *features*)) -(defsystem glisp +#-rune-is-character +(format t "~&;;; Building cxml with (UNSIGNED-BYTE 16) RUNES~%") + +#+rune-is-character +(format t "~&;;; Building cxml with CHARACTER RUNES~%") + +(defsystem runes :default-component-class closure-source-file :pathname (merge-pathnames - "glisp/" + "runes/" (make-pathname :name nil :type nil :defaults *load-truename*)) :components - ((:file dependent + ((:file "package") + (:file dependent :pathname #+CLISP "dep-clisp" #+(AND :CMU (NOT :PTHREAD)) "dep-cmucl" #+sbcl "dep-sbcl" #+(AND :CMU :PTHREAD) "dep-cmucl-dtc" - #+(and allegro allegro-v5.0) "dep-acl5" - #+(and allegro (not allegro-v5.0)) "dep-acl" - #+GCL "dep-gcl" - #-(or sbcl CLISP CMU allegro GCL) #.(error "Configure!")) - (:file "package" - :depends-on (dependent)) - (:file "runes" + #+(and allegro-version>= (version>= 5.0)) "dep-acl5" + #+(and allegro-version>= (not (version>= 5.0))) "dep-acl" + #+openmcl "dep-openmcl" + #-(or sbcl CLISP CMU allegro openmcl) #.(error "Configure!") + :depends-on ("package")) + (:file runes + :pathname + #-rune-is-character "runes" + #+rune-is-character "characters" :depends-on ("package" dependent)) - (:file "util" - :depends-on ("package" dependent "runes")) - (:file "match" - :depends-on ("package" dependent "runes" "util")))) + (:file "syntax" :depends-on ("package" dependent runes)) + (:file "encodings" :depends-on ("package")) + (:file "encodings-data" :depends-on ("package" "encodings")) + (:file "xstream" + :depends-on ("package" dependent "syntax" "encodings-data")))) -(asdf:defsystem :cxml +(asdf:defsystem :xml :default-component-class closure-source-file :pathname (merge-pathnames - "cxml/" + "xml/" (make-pathname :name nil :type nil :defaults *load-truename*)) :components ((:file "package") - (:file "encodings" :depends-on ("package")) - (:file "encodings-data" :depends-on ("package" "encodings")) + (:file "util" :depends-on ("package")) (:file "sax-handler") - (:file "dompack") - (:file "dom-impl" :depends-on ("dompack")) - (:file "dom-builder" :depends-on ("dom-impl" "sax-handler")) - (:file "xml-stream" :depends-on ("package")) + (:file "characters" :depends-on ("package")) (:file "xml-name-rune-p" :depends-on ("package")) - (:file "xml-parse" :depends-on ("package" "dompack" "sax-handler")) - (:file "xml-canonic" :depends-on ("package" "dompack" "xml-parse"))) - :depends-on (:glisp)) + (:file "split-sequence" :depends-on ("package")) + (:file "xml-parse" :depends-on ("package" "util" "sax-handler" "split-sequence" "xml-name-rune-p" "characters")) + (:file "characters" :depends-on ("package")) + (:file "unparse" :depends-on ("xml-parse")) + (:file "xmls-compat" :depends-on ("xml-parse")) + (:file "recoder" :depends-on ("xml-parse")) + (:file "catalog" :depends-on ("xml-parse")) + (:file "sax-proxy" :depends-on ("xml-parse"))) + :depends-on (:runes :puri)) + +(asdf:defsystem :dom + :default-component-class closure-source-file + :pathname (merge-pathnames + "dom/" + (make-pathname :name nil :type nil :defaults *load-truename*)) + :components + ((:file "package") + (:file "dom-impl" :depends-on ("package")) + (:file "dom-builder" :depends-on ("dom-impl")) + (:file "unparse" :depends-on ("package")) + (:file "simple-dom" :depends-on ("package")) + (:file "dom-sax" :depends-on ("package"))) + :depends-on (:xml)) + +(asdf:defsystem :cxml-test + :default-component-class closure-source-file + :pathname (merge-pathnames + "test/" + (make-pathname :name nil :type nil :defaults *load-truename*)) + :components ((:file "domtest") (:file "xmlconf")) + :depends-on (:xml :dom)) + +(asdf:defsystem :cxml :components () :depends-on (:dom :cxml-test)) diff --git a/dom/dom-impl.lisp b/dom/dom-impl.lisp index 823e642..08edd83 100644 --- a/dom/dom-impl.lisp +++ b/dom/dom-impl.lisp @@ -1,3 +1,12 @@ +;;;; dom-impl.lisp -- Implementation of DOM 1 Core +;;;; +;;;; This file is part of the CXML parser, released under (L)LGPL. +;;;; See file COPYING for details. +;;;; +;;;; Author: Gilbert Baumann +;;;; Author: David Lichteblau +;;;; Author: knowledgeTools Int. GmbH + (defpackage :dom-impl (:use :cl :runes)) @@ -26,7 +35,8 @@ (defclass document (node) ((doc-type :initarg :doc-type :reader dom:doctype) - (entities :initform nil :reader entities))) + (dtd :initform nil :reader dtd) + (entity-resolver :initform nil))) (defclass document-fragment (node) ()) @@ -91,6 +101,9 @@ (read-only-p :initform nil :reader read-only-p) (element-type :initarg :element-type))) +(defclass attribute-node-map (named-node-map) + ((element :initarg :element))) + ;;; Implementation @@ -115,18 +128,19 @@ (defun move (from to from-start to-start length) ;; like (setf (subseq to to-start (+ to-start length)) ;; (subseq from from-start (+ from-start length))) - ;; but without creating the garbage + ;; but without creating the garbage. + ;; Also, this is using AREF not ELT so that fill-pointers are ignored. (if (< to-start from-start) (loop repeat length for i from from-start for j from to-start - do (setf (elt to j) (elt from i))) + do (setf (aref to j) (aref from i))) (loop repeat length - for i from (+ from-start length -1) by -1 - for j from (+ to-start length -1) by -1 - do (setf (elt to j) (elt from i))))) + for i downfrom (+ from-start length -1) + for j downfrom (+ to-start length -1) + do (setf (aref to j) (aref from i))))) (defun adjust-vector-exponentially (vector new-dimension set-fill-pointer-p) (let ((d (array-dimension vector 0))) @@ -175,14 +189,18 @@ (defmethod dom:create-element ((document document) tag-name) (setf tag-name (rod tag-name)) - (unless (xml::valid-name-p tag-name) + (unless (cxml::valid-name-p tag-name) (dom-error :INVALID_CHARACTER_ERR "not a name: ~A" (rod-string tag-name))) - (make-instance 'element - :tag-name tag-name - :owner document - :attributes (make-instance 'named-node-map - :element-type :attribute + (let ((result (make-instance 'element + :tag-name tag-name :owner document))) + (setf (slot-value result 'attributes) + (make-instance 'attribute-node-map + :element-type :attribute + :owner document + :element result)) + (add-default-attributes result) + result)) (defmethod dom:create-document-fragment ((document document)) (make-instance 'document-fragment @@ -209,7 +227,7 @@ (defmethod dom:create-processing-instruction ((document document) target data) (setf target (rod target)) (setf data (rod data)) - (unless (xml::valid-name-p target) + (unless (cxml::valid-name-p target) (dom-error :INVALID_CHARACTER_ERR "not a name: ~A" (rod-string target))) (make-instance 'processing-instruction :owner document @@ -218,7 +236,7 @@ (defmethod dom:create-attribute ((document document) name) (setf name (rod name)) - (unless (xml::valid-name-p name) + (unless (cxml::valid-name-p name) (dom-error :INVALID_CHARACTER_ERR "not a name: ~A" (rod-string name))) (make-instance 'attribute :name name @@ -227,7 +245,7 @@ (defmethod dom:create-entity-reference ((document document) name) (setf name (rod name)) - (unless (xml::valid-name-p name) + (unless (cxml::valid-name-p name) (dom-error :INVALID_CHARACTER_ERR "not a name: ~A" (rod-string name))) (make-instance 'entity-reference :name name @@ -590,7 +608,7 @@ (assert-writeable node) (setq arg (rod arg)) (with-slots (value) node - (setf value (concatenate (type-of value) value arg))) + (setf value (concatenate 'rod value arg))) (values)) (defmethod dom:delete-data ((node character-data) offset count) @@ -680,7 +698,7 @@ (with-slots (children owner) node ;; remove children, add new TEXT-NODE child ;; (alas, we must not reuse an old TEXT-NODE) - (while (plusp (length children)) + (cxml::while (plusp (length children)) (dom:remove-child node (dom:last-child node))) (dom:append-child node (dom:create-text-node owner rod)))) new-value) @@ -756,8 +774,38 @@ (unless (find old-attr items) (dom-error :NOT_FOUND_ERR "Attribute not found.")) (setf items (remove old-attr items)) + (maybe-add-default-attribute element (dom:name old-attr)) old-attr)) +;; eek, defaulting: + +(defun maybe-add-default-attribute (element name) + (let* ((dtd (dtd (slot-value element 'owner))) + (e (cxml::find-element (dom:tag-name element) dtd)) + (a (when e (cxml::find-attribute e name)))) + (when (and a (listp (cxml::attdef-default a))) + (add-default-attribute element a)))) + +(defun add-default-attributes (element) + (let* ((dtd (dtd (slot-value element 'owner))) + (e (cxml::find-element (dom:tag-name element) dtd))) + (when e + (dolist (a (cxml::elmdef-attributes e)) + (when (and a (listp (cxml::attdef-default a))) + (add-default-attribute element a)))))) + +(defun add-default-attribute (element adef) + (let* ((value (second (cxml::attdef-default adef))) + (owner (slot-value element 'owner)) + (anode (dom:create-attribute owner (cxml::attdef-name adef))) + (text (dom:create-text-node owner value))) + (setf (slot-value anode 'dom-impl::specified-p) nil) + (dom:append-child anode text) + (push anode (slot-value (dom:attributes element) 'items)))) + +(defmethod dom:remove-named-item :after ((self attribute-node-map) name) + (maybe-add-default-attribute (slot-value self 'element) name)) + (defmethod dom:get-elements-by-tag-name ((element element) name) (assert-writeable element) (get-elements-by-tag-name-internal element name)) @@ -771,7 +819,7 @@ (i 0) (previous nil)) ;; careful here, we're modifying the array we are iterating over - (while (< i (length children)) + (cxml::while (< i (length children)) (let ((child (elt children i))) (cond ((not (eq (dom:node-type child) :text)) @@ -779,7 +827,7 @@ (incf i)) ((and previous (eq (dom:node-type previous) :text)) (setf (slot-value previous 'value) - (concatenate 'vector + (concatenate 'rod (dom:data previous) (dom:data child))) (dom:remove-child n child) @@ -816,13 +864,13 @@ (defmethod initialize-instance :after ((instance entity-reference) &key) (let* ((owner (dom:owner-document instance)) - (entities (or (entities owner) xml::*entities*)) - (children (xml::resolve-entity (dom:name instance) entities))) - (setf (slot-value instance 'children) - (make-node-list - (map 'vector - (lambda (node) (dom:import-node owner node t)) - children)))) + (handler (dom:make-dom-builder)) + (resolver (slot-value owner 'entity-resolver))) + (unless resolver + (dom-error :NOT_SUPPORTED_ERR "No entity resolver registered.")) + (setf (document handler) owner) + (push instance (element-stack handler)) + (funcall resolver (dom:name instance) handler)) (labels ((walk (n) (setf (slot-value n 'read-only-p) t) (when (dom:element-p n) @@ -925,12 +973,13 @@ (import-node-internal 'document-fragment document node deep)) (defmethod dom:import-node ((document document) (node element) deep) - (let* ((attributes (make-instance 'named-node-map + (let* ((attributes (make-instance 'attribute-node-map :element-type :attribute :owner document)) (result (import-node-internal 'element document node deep :attributes attributes :tag-name (dom:tag-name node)))) + (setf (slot-value attributes 'element) result) (dolist (attribute (dom:items (dom:attributes node))) (when (or (dom:specified attribute) *clone-not-import*) (dom:set-attribute result (dom:name attribute) (dom:value attribute)))) @@ -981,3 +1030,19 @@ (defmethod dom:clone-node ((node node) deep) (let ((*clone-not-import* t)) (dom:import-node (dom:owner-document node) node deep))) + + +;;; Erweiterung + +(defun dom:create-document (&optional document-element) + ;; Um ein neues Dokumentenobject zu erzeugen, parsen wir einfach ein + ;; Dummydokument. + (let* ((handler (dom:make-dom-builder)) + (cxml::*ctx* (cxml::make-context :handler handler)) + (result + (progn + (sax:start-document handler) + (sax:end-document handler)))) + (when document-element + (dom:append-child result (dom:import-node result document-element t))) + result)) diff --git a/runes/characters.lisp b/runes/characters.lisp index a8fa7e9..828a40c 100644 --- a/runes/characters.lisp +++ b/runes/characters.lisp @@ -24,35 +24,35 @@ (in-package :runes) -(deftype rune () 'base-char) -(deftype rod () 'base-string) -(deftype simple-rod () 'simple-string) +(deftype rune () 'character) +(deftype rod () '(vector character)) +(deftype simple-rod () '(simple-array character)) -(defsubst rune (rod index) +(definline rune (rod index) (char rod index)) (defun (setf rune) (new rod index) (setf (char rod index) new)) -(defsubst %rune (rod index) +(definline %rune (rod index) (aref (the simple-string rod) (the fixnum index))) -(defsubst (setf %rune) (new rod index) +(definline (setf %rune) (new rod index) (setf (aref (the simple-string rod) (the fixnum index)) new)) (defun rod-capitalize (rod) (string-upcase rod)) -(defsubst code-rune (x) (code-char x)) -(defsubst rune-code (x) (char-code x)) +(definline code-rune (x) (code-char x)) +(definline rune-code (x) (char-code x)) -(defsubst rune= (x y) +(definline rune= (x y) (char= x y)) (defun rune-downcase (rune) (char-downcase rune)) -(defsubst rune-upcase (rune) +(definline rune-upcase (rune) (char-upcase rune)) (defun rune-upper-case-letter-p (rune) @@ -70,13 +70,13 @@ (defun rod-upcase (rod) (string-upcase rod)) -(defsubst white-space-rune-p (char) +(definline white-space-rune-p (char) (or (char= char #\tab) (char= char #.(code-char 10)) ;Linefeed (char= char #.(code-char 13)) ;Carriage Return (char= char #\space))) -(defsubst digit-rune-p (char &optional (radix 10)) +(definline digit-rune-p (char &optional (radix 10)) (digit-char-p char radix)) (defun rod (x) @@ -100,7 +100,7 @@ (defun rod-equal (x y) (string-equal x y)) -(defsubst make-rod (size) +(definline make-rod (size) (make-string size)) (defun char-rune (char) @@ -134,9 +134,6 @@ (defun rodp (object) (stringp object)) -(defun really-rod-p (object) - (stringp object)) - (defun rod-subseq (source start &optional (end (length source))) (unless (stringp source) (error "~S is not of type ~S." source 'rod)) diff --git a/runes/dep-acl.lisp b/runes/dep-acl.lisp index 5bbda45..efd67b0 100644 --- a/runes/dep-acl.lisp +++ b/runes/dep-acl.lisp @@ -28,13 +28,13 @@ ;; Unfortunately it is also incapable to declaim such functions inline. ;; So we revoke the DEFUN hack from dep-gcl here. -(defmacro runes::defsubst (fun args &body body) +(defmacro runes::definline (fun args &body body) (if (and (consp fun) (eq (car fun) 'setf)) (let ((fnam (intern (concatenate 'string "(SETF " (symbol-name (cadr fun)) ")") (symbol-package (cadr fun))))) `(progn (defsetf ,(cadr fun) (&rest ap) (new-value) (list* ',fnam new-value ap)) - (runes::defsubst ,fnam ,args .,body))) + (runes::definline ,fnam ,args .,body))) `(progn (defun ,fun ,args .,body) (define-compiler-macro ,fun (&rest .args.) diff --git a/runes/dep-acl5.lisp b/runes/dep-acl5.lisp index 64534d9..a597064 100644 --- a/runes/dep-acl5.lisp +++ b/runes/dep-acl5.lisp @@ -40,13 +40,13 @@ ;; Unfortunately it is also incapable to declaim such functions inline. ;; So we revoke the DEFUN hack from dep-gcl here. -(defmacro runes::defsubst (fun args &body body) +(defmacro runes::definline (fun args &body body) (if (and (consp fun) (eq (car fun) 'setf)) (let ((fnam (intern (concatenate 'string "(SETF " (symbol-name (cadr fun)) ")") (symbol-package (cadr fun))))) `(progn (defsetf ,(cadr fun) (&rest ap) (new-value) (list* ',fnam new-value ap)) - (runes::defsubst ,fnam ,args .,body))) + (runes::definline ,fnam ,args .,body))) (labels ((declp (x) (and (consp x) (eq (car x) 'declare)))) `(progn diff --git a/runes/dep-clisp.lisp b/runes/dep-clisp.lisp index 2d9216b..e8fa296 100644 --- a/runes/dep-clisp.lisp +++ b/runes/dep-clisp.lisp @@ -30,30 +30,30 @@ (if (fboundp 'cl::define-compiler-macro) (pushnew 'define-compiler-macro *features*))) -(setq lisp:*load-paths* '(#P"./")) +;;;(setq lisp:*load-paths* '(#P"./")) +;;; +;;;#+DEFINE-COMPILER-MACRO +;;;(cl:define-compiler-macro ldb (bytespec value &whole whole) +;;; (let (pos size) +;;; (cond ((and (consp bytespec) +;;; (= (length bytespec) 3) +;;; (eq (car bytespec) 'byte) +;;; (constantp (setq size (second bytespec))) +;;; (constantp (setq pos (third bytespec)))) +;;; `(logand ,(if (eql pos 0) value `(ash ,value (- ,pos))) +;;; (1- (ash 1 ,size)))) +;;; (t +;;; whole)))) +;;; +;;;#-DEFINE-COMPILER-MACRO +;;;(progn +;;; (export 'runes::define-compiler-macro :runes) +;;; (defmacro runes::define-compiler-macro (name args &body body) +;;; (declare (ignore args body)) +;;; `(progn +;;; ',name))) -#+DEFINE-COMPILER-MACRO -(cl:define-compiler-macro ldb (bytespec value &whole whole) - (let (pos size) - (cond ((and (consp bytespec) - (= (length bytespec) 3) - (eq (car bytespec) 'byte) - (constantp (setq size (second bytespec))) - (constantp (setq pos (third bytespec)))) - `(logand ,(if (eql pos 0) value `(ash ,value (- ,pos))) - (1- (ash 1 ,size)))) - (t - whole)))) - -#-DEFINE-COMPILER-MACRO -(progn - (export 'runes::define-compiler-macro :runes) - (defmacro runes::define-compiler-macro (name args &body body) - (declare (ignore args body)) - `(progn - ',name))) - -(defmacro runes::defsubst (name args &body body) +(defmacro runes::definline (name args &body body) `(progn (declaim (inline ,name)) (defun ,name ,args .,body))) diff --git a/runes/dep-cmucl-dtc.lisp b/runes/dep-cmucl-dtc.lisp index 2e080c3..2f6cb29 100644 --- a/runes/dep-cmucl-dtc.lisp +++ b/runes/dep-cmucl-dtc.lisp @@ -24,7 +24,7 @@ ;;; superseded by a newer version) or write to the Free Software ;;; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -(defmacro runes::defsubst (name args &body body) +(defmacro runes::definline (name args &body body) `(progn (declaim (inline ,name)) (defun ,name ,args .,body))) diff --git a/runes/dep-cmucl.lisp b/runes/dep-cmucl.lisp index 2e080c3..2f6cb29 100644 --- a/runes/dep-cmucl.lisp +++ b/runes/dep-cmucl.lisp @@ -24,7 +24,7 @@ ;;; superseded by a newer version) or write to the Free Software ;;; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -(defmacro runes::defsubst (name args &body body) +(defmacro runes::definline (name args &body body) `(progn (declaim (inline ,name)) (defun ,name ,args .,body))) diff --git a/runes/dep-openmcl.lisp b/runes/dep-openmcl.lisp index 3ff2c6f..f5bb8a9 100644 --- a/runes/dep-openmcl.lisp +++ b/runes/dep-openmcl.lisp @@ -5,7 +5,7 @@ ;;;; ;;;; (c) copyright 1999 by Gilbert Baumann -(defmacro runes::defsubst (fun args &body body) +(defmacro runes::definline (fun args &body body) (if (consp fun) `(defun ,fun ,args ,@body) diff --git a/runes/dep-sbcl.lisp b/runes/dep-sbcl.lisp index 9431fb3..c111a17 100644 --- a/runes/dep-sbcl.lisp +++ b/runes/dep-sbcl.lisp @@ -24,7 +24,7 @@ ;;; superseded by a newer version) or write to the Free Software ;;; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -(defmacro runes::defsubst (name args &body body) +(defmacro runes::definline (name args &body body) `(progn (declaim (inline ,name)) (defun ,name ,args .,body))) diff --git a/runes/package.lisp b/runes/package.lisp index 921c457..d92ed60 100644 --- a/runes/package.lisp +++ b/runes/package.lisp @@ -10,15 +10,8 @@ (defpackage :runes (:use :cl) - (:export #:defsubst + (:export #:definline - ;; util.lisp : - #:compose - #:curry - #:rcurry - #:until - #:while - ;; runes.lisp #:rune #:rod @@ -47,4 +40,29 @@ #:rod-string #:string-rod #:rod-subseq - #:rod<)) + #:rod< + + ;; xstream.lisp + #:make-xstream + #:make-rod-xstream + #:close-xstream + #:xstream-p + #:read-rune + #:peek-rune + #:fread-rune + #:fpeek-rune + #:consume-rune + #:unread-rune + #:xstream-position + #:xstream-line-number + #:xstream-column-number + #:xstream-plist + #:xstream-encoding + #:set-to-full-speed + #:xstream-name)) + +(defpackage :encoding + (:use :cl :runes) + (:export + #:find-encoding + #:decode-sequence)) diff --git a/runes/runes.lisp b/runes/runes.lisp index 7aed6d0..620bb79 100644 --- a/runes/runes.lisp +++ b/runes/runes.lisp @@ -42,26 +42,26 @@ (deftype rod () '(array rune (*))) (deftype simple-rod () '(simple-array rune (*))) -(defsubst rune (rod index) +(definline rune (rod index) (aref rod index)) (defun (setf rune) (new rod index) (setf (aref rod index) new)) -(defsubst %rune (rod index) +(definline %rune (rod index) (aref (the (simple-array (unsigned-byte 16) (*)) rod) (the fixnum index))) -(defsubst (setf %rune) (new rod index) +(definline (setf %rune) (new rod index) (setf (aref (the (simple-array (unsigned-byte 16) (*)) rod) (the fixnum index)) new)) (defun rod-capitalize (rod) (warn "~S is not implemented." 'rod-capitalize) rod) -(defsubst code-rune (x) x) -(defsubst rune-code (x) x) +(definline code-rune (x) x) +(definline rune-code (x) x) -(defsubst rune= (x y) +(definline rune= (x y) (= x y)) (defun rune-downcase (rune) @@ -70,7 +70,7 @@ ((<= #x00c0 rune #x00de) (+ rune #x20)) (t rune))) -(defsubst rune-upcase (rune) +(definline rune-upcase (rune) (cond ((<= #x0061 rune #x007a) (- rune #x20)) ((= rune #x00f7) rune) ((<= #x00e0 rune #x00fe) (- rune #x20)) @@ -89,21 +89,19 @@ (defun rod-downcase (rod) ;; FIXME - (register-rod - (map '(simple-array (unsigned-byte 16) (*)) #'rune-downcase rod))) + (map '(simple-array (unsigned-byte 16) (*)) #'rune-downcase rod)) (defun rod-upcase (rod) ;; FIXME - (register-rod - (map '(simple-array (unsigned-byte 16) (*)) #'rune-upcase rod))) + (map '(simple-array (unsigned-byte 16) (*)) #'rune-upcase rod)) -(defsubst white-space-rune-p (char) +(definline white-space-rune-p (char) (or (= char 9) ;TAB (= char 10) ;Linefeed (= char 13) ;Carriage Return (= char 32))) ;Space -(defsubst digit-rune-p (char &optional (radix 10)) +(definline digit-rune-p (char &optional (radix 10)) (cond ((<= #.(char-code #\0) char #.(char-code #\9)) (and (< (- char #.(char-code #\0)) radix) (- char #.(char-code #\0)))) @@ -115,11 +113,11 @@ (- char #.(char-code #\a) -10))) )) (defun rod (x) - (cond ((stringp x) (register-rod (map 'rod #'char-code x))) + (cond ((stringp x) (map 'rod #'char-code x)) ((symbolp x) (rod (string x))) ((characterp x) (rod (string x))) - ((vectorp x) (register-rod (coerce x 'rod))) - ((integerp x) (register-rod (map 'rod #'identity (list x)))) + ((vectorp x) (coerce x 'rod)) + ((integerp x) (map 'rod #'identity (list x))) (t (error "Cannot convert ~S to a ~S" x 'rod)))) (defun runep (x) @@ -143,19 +141,16 @@ (unless (rune-equal (rune x i) (rune y i)) (return nil))))) -(defsubst make-rod (size) - (let ((res (make-array size :element-type 'rune))) - (register-rod res) - res)) +(definline make-rod (size) + (make-array size :element-type 'rune)) (defun char-rune (char) (code-rune (char-code char))) (defun rune-char (rune &optional (default #\?)) - #+CMU - (if (< rune 256) (code-char rune) default) - #-CMU - (or (code-char rune) default)) + (if (>= rune char-code-limit) + default + (or (code-char rune) default))) (defun rod-string (rod &optional (default-char #\?)) (map 'string (lambda (x) (rune-char x default-char)) rod)) @@ -178,10 +173,6 @@ (defun rodp (object) (typep object 'rod)) -(defun really-rod-p (object) - (and (typep object 'rod) - (really-really-rod-p object))) - (defun rod-subseq (source start &optional (end (length source))) (unless (rodp source) (error "~S is not of type ~S." source 'rod)) @@ -221,45 +212,6 @@ (declare (type fixnum i)) (setf (%rune res i) (aref source (the fixnum (+ i start)))))))) -;;; Support for telling ROD and arrays apart: - -#+CMU -(progn - (defvar *rod-hash-table* - (make-array 5003 :initial-element nil))) - -(defun register-rod (rod) - #+CMU - (unless (really-really-rod-p rod) - (push (ext:make-weak-pointer rod) - (aref *rod-hash-table* (mod (cl::pointer-hash rod) - (length *rod-hash-table*))))) - rod) - -(defun really-really-rod-p (rod) - #+CMU - (find rod (aref *rod-hash-table* (mod (cl::pointer-hash rod) - (length *rod-hash-table*))) - :key #'ext:weak-pointer-value)) - -#+CMU -(progn - (defun rod-hash-table-rehash () - (let* ((n 5003) - (new (make-array n :initial-element nil))) - (loop for bucket across *rod-hash-table* do - (loop for item in bucket do - (let ((v (ext:weak-pointer-value item))) - (when v - (push item (aref new (mod (cl::pointer-hash v) n))))))) - (setf *rod-hash-table* new))) - - (defun rod-hash-after-gc-hook () - ;; hmm interesting question: should we rehash? - (rod-hash-table-rehash)) - - (pushnew 'rod-hash-after-gc-hook extensions:*after-gc-hooks*) ) - (defun rod< (rod1 rod2) (do ((i 0 (+ i 1))) (nil) diff --git a/runes/xstream.lisp b/runes/xstream.lisp index 9032a7b..fe818ca 100644 --- a/runes/xstream.lisp +++ b/runes/xstream.lisp @@ -65,7 +65,7 @@ ;; XSTREAM/CLOSE os-stream ;; -(eval-when (eval compile load) +(eval-when (:compile-toplevel :load-toplevel :execute) (defparameter *fast* '(optimize (speed 3) (safety 0))) ;;(defparameter *fast* '(optimize (speed 2) (safety 3))) ) @@ -154,6 +154,10 @@ ;; `buffer-start'. ) +(defun print-xstream (self sink depth) + (declare (ignore depth)) + (format sink "#<~S ~S>" (type-of self) (xstream-name self))) + (defmacro read-rune (input) "Read a single rune off the xstream `input'. In case of end of file :EOF is returned." @@ -213,7 +217,7 @@ nil) ,input)) -(defsubst unread-rune (rune input) +(definline unread-rune (rune input) "Unread the last recently read rune; if there wasn't such a rune, you deserve to lose." (declare (ignore rune)) @@ -376,10 +380,20 @@ ;;; controller implementations (defmethod read-octets (sequence (stream stream) start end) - (#+CLISP lisp:read-byte-sequence + (#+CLISP ext:read-byte-sequence #-CLISP read-sequence sequence stream :start start :end end)) +#+cmu +(defmethod read-octets :around (sequence (stream stream) start end) + ;; CMUCL <= 19a on non-SunOS accidentally triggers EFAULT in read(2) + ;; if SEQUENCE has been write protected by GC. Workaround: Touch all pages + ;; in SEQUENCE and make sure no GC happens between that and the read(2). + (ext::without-gcing + (loop for i from start below end + do (setf (elt sequence i) (elt sequence i))) + (call-next-method))) + (defmethod read-octets (sequence (stream null) start end) (declare (ignore sequence start end)) 0) diff --git a/runes/util.lisp b/xml/util.lisp similarity index 99% rename from runes/util.lisp rename to xml/util.lisp index 60cd74c..abd5305 100644 --- a/runes/util.lisp +++ b/xml/util.lisp @@ -32,7 +32,7 @@ ;; subforms ;; -(in-package :runes) +(in-package :cxml) ;;; -------------------------------------------------------------------------------- ;;; Meta functions diff --git a/xml/xml-name-rune-p.lisp b/xml/xml-name-rune-p.lisp index 9d2efd2..2f5a8a0 100644 --- a/xml/xml-name-rune-p.lisp +++ b/xml/xml-name-rune-p.lisp @@ -1,6 +1,11 @@ -;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: XML; readtable: glisp; Encoding: utf-8; -*- +;;;; xml-name-rune-p -- character class definitions +;;;; +;;;; This file is part of the CXML parser, released under (L)LGPL. +;;;; See file COPYING for details. +;;;; +;;;; Author: Gilbert Baumann -(in-package :xml) +(in-package :cxml) #.(funcall (compile @@ -206,13 +211,15 @@ (setf (aref r i) 1))))) ) `(progn - (DEFSUBST NAME-RUNE-P (RUNE) + (DEFINLINE NAME-RUNE-P (RUNE) + (SETF RUNE (RUNE-CODE RUNE)) (AND (<= 0 RUNE ,*max*) (LOCALLY (DECLARE (OPTIMIZE (SAFETY 0) (SPEED 3))) (= 1 (SBIT ',(predicate-to-bv #'name-rune-p) (THE FIXNUM RUNE)))))) - (DEFSUBST NAME-START-RUNE-P (RUNE) + (DEFINLINE NAME-START-RUNE-P (RUNE) + (SETF RUNE (RUNE-CODE RUNE)) (AND (<= 0 RUNE ,*MAX*) (LOCALLY (DECLARE (OPTIMIZE (SAFETY 0) (SPEED 3))) (= 1 (SBIT ',(predicate-to-bv #'name-start-rune-p) - (THE FIXNUM RUNE)))))))) )))) \ No newline at end of file + (THE FIXNUM RUNE)))))))) )))) diff --git a/xml/xml-parse.lisp b/xml/xml-parse.lisp index 3d4c4c2..0f912ef 100644 --- a/xml/xml-parse.lisp +++ b/xml/xml-parse.lisp @@ -1,11 +1,16 @@ -;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: XML; readtable: glisp; Encoding: utf-8; -*- +;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CXML; readtable: runes; -*- ;;; --------------------------------------------------------------------------- -;;; Title: A prototype XML parser +;;; Title: XML parser ;;; Created: 1999-07-17 ;;; Author: Gilbert Baumann +;;; Author: Henrik Motakef +;;; Author: David Lichteblau ;;; License: LGPL (See file COPYING for details). ;;; --------------------------------------------------------------------------- ;;; © copyright 1999 by Gilbert Baumann +;;; © copyright 2003 by Henrik Motakef +;;; © copyright 2004 knowledgeTools Int. GmbH +;;; © copyright 2004 David Lichteblau ;;; This library is free software; you can redistribute it and/or ;;; modify it under the terms of the GNU Library General Public @@ -35,8 +40,7 @@ ;; handler in case of stream underflows. This will yield to quite a ;; performance boost vs calling READ-BYTE per character. -;; Also we need to do encoding and character set conversion on input, -;; this better done at large chunks of data rather than on a character +;; Also we need to do encoding t conversion on ; this better done at large chunks of data rather than on a character ;; by character basis. This way we need a dispatch on the active ;; encoding only once in a while, instead of for each character. This ;; allows us to use a CLOS interface to do the underflow handling. @@ -112,20 +116,16 @@ ;; ;; o provide for a faster DOM ;; -;; o parse document should get passed a document instance, so that a user -;; could pass his own DOM implementation -;; ;; o morph zstream into a context object and thus also get rid of ;; special variables. Put the current DTD there too. +;; [partly done] ;; o the *scratch-pad* hack should become something much more ;; reentrant, we could either define a system-wide resource ;; or allocate some scratch-pads per context. +;; [for thread-safety reasons the array are allocated per context now, +;; reentrancy is still open] -;; o only parse the DTD on an option - -;; o make the invalid tests pass. -;; ;; o CR handling in utf-16 deocders ;; ;; o UCS-4 reader @@ -134,6 +134,7 @@ ;; (or proof, that our circle detection is enough). ;; ;; o element definitions (with att definitions in the elements) +;; [das haben wir doch, oder?] ;; ;; o store entities in the DTD ;; @@ -143,16 +144,16 @@ ;; - UCS-2 in RODs ;; - UTF-16 in RODs ;; - UCS-4 in vectoren +;; [habe ich eigentlich nicht vor--david] ;; ;; o xstreams auslagern, documententieren und dann auch in SGML und ;; CSS parser verwenden. (halt alles was zeichen liest). +;; [ausgelagert sind sie; dokumentiert "so la la"; die Reintegration +;; in Closure ist ein ganz anderes Thema] ;; ;; o merge node representation with SGML module +;; [???] ;; -;; o namespaces (this will get ugly). -;; -;; o validation -;; ;; o line/column number recording ;; ;; o better error messages @@ -166,19 +167,9 @@ ;; ;; o on a parser option, do not expand external entities. ;; -;; o on a parser option, do not parse the DTD. -;; -;; o caching DTDs? -;; -;; That is, if we parse a lot of documents all having the same DTD, -;; we do not need to re-read it every time. -;; But watch the file write date, since not doing so would be -;; a good way to confuse a hell lot of users. -;; But: What to do with declarations in the " (type-of self) (mu (xstream-name self)))) + uri) (deftype read-element () 'rune) -;; (unsigned-byte 16)) ;;t) +(defun call-with-open-xstream (fn stream) + (unwind-protect + (funcall fn stream) + (close-xstream stream))) -(defmethod figure-encoding ((stream null)) - (values :utf-8 nil)) +(defmacro with-open-xstream ((var value) &body body) + `(call-with-open-xstream (lambda (,var) ,@body) ,value)) -(defmethod figure-encoding ((stream stream)) - (let ((c0 (read-byte stream nil :eof))) - (cond ((eq c0 :eof) - (values :utf-8 nil)) - (t - (let ((c1 (read-byte stream nil :eof))) - (cond ((eq c1 :eof) - (values :utf-8 (list c0))) - (t - (cond ((and (= c0 #xFE) (= c1 #xFF)) (values :utf-16-big-endian nil)) - ((and (= c0 #xFF) (= c1 #xFE)) (values :utf-16-little-endian nil)) - (t - (values :utf-8 (list c0 c1))))))))))) - -(defun call-with-open-xstream (continuation &rest open-args) +(defun call-with-open-xfile (continuation &rest open-args) (let ((input (apply #'open (car open-args) :element-type '(unsigned-byte 8) (cdr open-args)))) (unwind-protect (progn (funcall continuation (make-xstream input))) (close input)))) -(defmacro with-open-xstream ((stream &rest open-args) &body body) - `(call-with-open-xstream (lambda (,stream) .,body) .,open-args)) +(defmacro with-open-xfile ((stream &rest open-args) &body body) + `(call-with-open-xfile (lambda (,stream) .,body) .,open-args)) ;;; Decoders @@ -283,6 +312,57 @@ ;; defmethod DECODE-SEQUENCE ((encoding (eql :utf-8)) ...) ;; +;;;; ------------------------------------------------------------------- +;;;; Rechnen mit Runen +;;;; + +;; Let us first define fast fixnum arithmetric get rid of type +;; checks. (After all we know what we do here). + +(defmacro fx-op (op &rest xs) + `(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))) +(defmacro fx-pred (op &rest xs) + `(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))) + +(defmacro %+ (&rest xs) `(fx-op + ,@xs)) +(defmacro %- (&rest xs) `(fx-op - ,@xs)) +(defmacro %* (&rest xs) `(fx-op * ,@xs)) +(defmacro %/ (&rest xs) `(fx-op floor ,@xs)) +(defmacro %and (&rest xs) `(fx-op logand ,@xs)) +(defmacro %ior (&rest xs) `(fx-op logior ,@xs)) +(defmacro %xor (&rest xs) `(fx-op logxor ,@xs)) +(defmacro %ash (&rest xs) `(fx-op ash ,@xs)) +(defmacro %mod (&rest xs) `(fx-op mod ,@xs)) + +(defmacro %= (&rest xs) `(fx-pred = ,@xs)) +(defmacro %<= (&rest xs) `(fx-pred <= ,@xs)) +(defmacro %>= (&rest xs) `(fx-pred >= ,@xs)) +(defmacro %< (&rest xs) `(fx-pred < ,@xs)) +(defmacro %> (&rest xs) `(fx-pred > ,@xs)) + +;;; XXX Geschwindigkeit dieser Definitionen untersuchen! + +(defmacro rune-op (op &rest xs) + `(code-rune (,op ,@(mapcar (lambda (x) `(rune-code ,x)) xs)))) +(defmacro rune-pred (op &rest xs) + `(,op ,@(mapcar (lambda (x) `(rune-code ,x)) xs))) + +(defmacro %rune+ (&rest xs) `(rune-op + ,@xs)) +(defmacro %rune- (&rest xs) `(rune-op - ,@xs)) +(defmacro %rune* (&rest xs) `(rune-op * ,@xs)) +(defmacro %rune/ (&rest xs) `(rune-op floor ,@xs)) +(defmacro %rune-and (&rest xs) `(rune-op logand ,@xs)) +(defmacro %rune-ior (&rest xs) `(rune-op logior ,@xs)) +(defmacro %rune-xor (&rest xs) `(rune-op logxor ,@xs)) +(defmacro %rune-ash (a b) `(code-rune (ash (rune-code ,a) ,b))) +(defmacro %rune-mod (&rest xs) `(rune-op mod ,@xs)) + +(defmacro %rune= (&rest xs) `(rune-pred = ,@xs)) +(defmacro %rune<= (&rest xs) `(rune-pred <= ,@xs)) +(defmacro %rune>= (&rest xs) `(rune-pred >= ,@xs)) +(defmacro %rune< (&rest xs) `(rune-pred < ,@xs)) +(defmacro %rune> (&rest xs) `(rune-pred > ,@xs)) + ;;;; --------------------------------------------------------------------------- ;;;; rod hashtable ;;;; @@ -298,12 +378,12 @@ ) (defun make-rod-hashtable (&key (size 200)) - (setf size (glisp::nearest-greater-prime size)) + (setf size (nearest-greater-prime size)) (make-rod-hashtable/low :size size :table (make-array size :initial-element nil))) -(eval-when (compile eval load) +(eval-when (:compile-toplevel :load-toplevel :execute) (defconstant +fixnum-bits+ (1- (integer-length most-positive-fixnum)) "Pessimistic approximation of the number of bits of fixnums.") @@ -312,38 +392,38 @@ (1- (expt 2 +fixnum-bits+)) "Pessimistic approximation of the largest bit-mask, still being a fixnum.")) -(defsubst stir (a b) +(definline stir (a b) (%and +fixnum-mask+ (%xor (%ior (%ash (%and a #.(ash +fixnum-mask+ -5)) 5) (%ash a #.(- 5 +fixnum-bits+))) b))) -(defsubst rod-hash (rod start end) +(definline rod-hash (rod start end) "Compute a hash code out of a rod." (let ((res (%- end start))) (do ((i start (%+ i 1))) ((%= i end)) (declare (type fixnum i)) - (setf res (stir res (%rune rod i)))) + (setf res (stir res (rune-code (%rune rod i))))) res)) -(defsubst rod=* (x y &key (start1 0) (end1 (length x)) +(definline rod=* (x y &key (start1 0) (end1 (length x)) (start2 0) (end2 (length y))) (and (%= (%- end1 start1) (%- end2 start2)) (do ((i start1 (%+ i 1)) (j start2 (%+ j 1))) ((%= i end1) t) - (unless (= (%rune x i) (%rune y j)) + (unless (rune= (%rune x i) (%rune y j)) (return nil))))) -(defsubst rod=** (x y start1 end1 start2 end2) +(definline rod=** (x y start1 end1 start2 end2) (and (%= (%- end1 start1) (%- end2 start2)) (do ((i start1 (%+ i 1)) (j start2 (%+ j 1))) ((%= i end1) t) - (unless (= (%rune x i) (%rune y j)) + (unless (rune= (%rune x i) (%rune y j)) (return nil))))) (defun rod-hash-get (hashtable rod &optional (start 0) (end (length rod))) @@ -371,6 +451,7 @@ (return))) (values new-value key))) +#-rune-is-character (defun rod-subseq* (source start &optional (end (length source))) (unless (and (typep start 'fixnum) (>= start 0)) (error "~S is not a non-negative fixnum." start)) @@ -389,8 +470,13 @@ (declare (type fixnum i)) (setf (%rune res i) (aref source (the fixnum (+ i start)))))))) +#+rune-is-character +(defun rod-subseq* (source start &optional (end (length source))) + (subseq source start end)) + (deftype ufixnum () `(unsigned-byte ,(integer-length most-positive-fixnum))) +#-rune-is-character (defun rod-subseq** (source start &optional (end (length source))) (declare (type (simple-array rune (*)) source) (type ufixnum start) @@ -407,6 +493,10 @@ (setf (%rune res i) (%rune source (the ufixnum (+ i start)))))) res)) +#+rune-is-character +(defun rod-subseq** (source start &optional (end (length source))) + (subseq source start end)) + (defun (setf rod-hash-get) (new-value hashtable rod &optional (start 0) (end (length rod))) (rod-hash-set new-value hashtable rod start end)) @@ -424,28 +514,28 @@ ;;;; rod collector ;;;; -(defparameter *scratch-pad* - (make-array 1024 :element-type 'rune)) - -(defparameter *scratch-pad-2* - (make-array 1024 :element-type 'rune)) - -(defparameter *scratch-pad-3* - (make-array 1024 :element-type 'rune)) - -(defparameter *scratch-pad-4* - (make-array 1024 :element-type 'rune)) +(defvar *scratch-pad*) +(defvar *scratch-pad-2*) +(defvar *scratch-pad-3*) +(defvar *scratch-pad-4*) (declaim (type (simple-array rune (*)) *scratch-pad* *scratch-pad-2* *scratch-pad-3* *scratch-pad-4*)) -(defmacro %put-rune (rune-var put) +(defmacro with-scratch-pads ((&optional) &body body) + `(let ((*scratch-pad* (make-array 1024 :element-type 'rune)) + (*scratch-pad-2* (make-array 1024 :element-type 'rune)) + (*scratch-pad-3* (make-array 1024 :element-type 'rune)) + (*scratch-pad-4* (make-array 1024 :element-type 'rune))) + ,@body)) + +(defmacro %put-unicode-char (code-var put) `(progn - (cond ((%> ,rune-var #xFFFF) - (,put (the (unsigned-byte 16) (%+ #xD7C0 (ash ,rune-var -10)))) - (,put (the (unsigned-byte 16) (%ior #xDC00 (%and ,rune-var #x3FF))))) + (cond ((%> ,code-var #xFFFF) + (,put (the rune (code-rune (%+ #xD7C0 (%ash ,code-var -10))))) + (,put (the rune (code-rune (%ior #xDC00 (%and ,code-var #x03FF)))))) (t - (,put ,rune-var))))) + (,put (code-rune ,code-var)))))) (defun adjust-array-by-copying (old-array new-size) "Adjust an array by copying and thus ensures, that result is a SIMPLE-ARRAY." @@ -580,80 +670,289 @@ ;;;; DTD ;;;; -(defparameter *entities* nil) -(defvar *dtd*) +(define-condition parser-error (simple-error) ()) +(define-condition validity-error (parser-error) ()) + +(defun validity-error (x &rest args) + (error 'validity-error + :format-control "Validity constraint violated: ~@?" + :format-arguments (list x args))) + +(defvar *validate* t) +(defvar *markup-declaration-external-p* nil) + +(defun validate-start-element (ctx name) + (when *validate* + (let* ((pair (car (model-stack ctx))) + (newval (funcall (car pair) name))) + (unless newval + (validity-error "(03) Element Valid: ~A" (rod-string name))) + (setf (car pair) newval) + (let ((e (find-element name (dtd ctx)))) + (unless e + (validity-error "(03) Element Valid: no definition for ~A" + (rod-string name))) + (maybe-compile-cspec e) + (push (copy-cons (elmdef-compiled-cspec e)) (model-stack ctx)))))) + +(defun copy-cons (x) + (cons (car x) (cdr x))) + +(defun validate-end-element (ctx name) + (when *validate* + (let ((pair (car (model-stack ctx)))) + (unless (eq (funcall (car pair) nil) t) + (validity-error "(03) Element Valid: ~A" (rod-string name))) + (pop (model-stack ctx))))) + +(defun validate-characters (ctx rod) + (when *validate* + (let ((pair (car (model-stack ctx)))) + (unless (funcall (cdr pair) rod) + (validity-error "(03) Element Valid: unexpected PCDATA"))))) + +(defun standalone-check-necessary-p (def) + (and *validate* + (standalone-p *ctx*) + (etypecase def + (elmdef (elmdef-external-p def)) + (attdef (attdef-external-p def))))) + +(defun process-attributes (ctx name attlist) + (let ((e (find-element name (dtd ctx)))) + (cond + (e + (dolist (ad (elmdef-attributes e)) ;handle default values + (unless (get-attribute (attdef-name ad) attlist) + (case (attdef-default ad) + (:IMPLIED) + (:REQUIRED + (when *validate* + (validity-error "(18) Required Attribute: ~S not specified" + (rod-string (attdef-name ad))))) + (t + (when (standalone-check-necessary-p ad) + (validity-error "(02) Standalone Document Declaration: missing attribute value")) + (push (build-attribute (attdef-name ad) + (cadr (attdef-default ad)) + nil) + attlist))))) + (dolist (a attlist) ;normalize non-CDATA values + (let* ((qname (sax:attribute-qname a)) + (adef (find-attribute e qname))) + (when (and adef (not (eq (attdef-type adef) :CDATA))) + (let ((canon (canon-not-cdata-attval (sax:attribute-value a)))) + (when (and (standalone-check-necessary-p adef) + (not (rod= (sax:attribute-value a) canon))) + (validity-error "(02) Standalone Document Declaration: attribute value not normalized")) + (setf (sax:attribute-value a) canon))))) + (when *validate* ;maybe validate attribute values + (dolist (a attlist) + (validate-attribute ctx e a)))) + ((and *validate* attlist) + (validity-error "(04) Attribute Value Type: no definition for element ~A" + (rod-string name))))) + attlist) + +(defun get-attribute (name attributes) + (member name attributes :key #'sax:attribute-qname :test #'rod=)) + +(defun validate-attribute (ctx e a) + (when (sax:attribute-specified-p a) ;defaults checked by DEFINE-ATTRIBUTE + (let* ((qname (sax:attribute-qname a)) + (adef + (or (find-attribute e qname) + (validity-error "(04) Attribute Value Type: not declared: ~A" + (rod-string qname))))) + (validate-attribute* ctx adef (sax:attribute-value a))))) + +(defun validate-attribute* (ctx adef value) + (let ((type (attdef-type adef)) + (default (attdef-default adef))) + (when (and (listp default) + (eq (car default) :FIXED) + (not (rod= value (cadr default)))) + (validity-error "(20) Fixed Attribute Default: expected ~S but got ~S" + (rod-string (cadr default)) + (rod-string value))) + (ecase (if (listp type) (car type) type) + (:ID + (unless (valid-name-p value) + (validity-error "(08) ID: not a name: ~S" (rod-string value))) + (when (eq (gethash value (id-table ctx)) t) + (validity-error "(08) ID: ~S not unique" (rod-string value))) + (setf (gethash value (id-table ctx)) t)) + (:IDREF + (validate-idref ctx value)) + (:IDREFS + (let ((names (split-names value))) + (unless names + (validity-error "(11) IDREF: malformed names")) + (mapc (curry #'validate-idref ctx) names))) + (:NMTOKEN + (validate-nmtoken value)) + (:NMTOKENS + (let ((tokens (split-names value))) + (unless tokens + (validity-error "(13) Name Token: malformed NMTOKENS")) + (mapc #'validate-nmtoken tokens))) + (:ENUMERATION + (unless (member value (cdr type) :test #'rod=) + (validity-error "(17) Enumeration: value not declared: ~S" + (rod-string value)))) + (:NOTATION + (unless (member value (cdr type) :test #'rod=) + (validity-error "(14) Notation Attributes: ~S" (rod-string value)))) + (:ENTITY + (validate-entity value)) + (:ENTITIES + (let ((names (split-names value))) + (unless names + (validity-error "(13) Name Token: malformed NMTOKENS")) + (mapc #'validate-entity names))) + (:CDATA)))) + +(defun validate-idref (ctx value) + (unless (valid-name-p value) + (validity-error "(11) IDREF: not a name: ~S" (rod-string value))) + (unless (gethash value (id-table ctx)) + (setf (gethash value (id-table ctx)) nil))) + +(defun validate-nmtoken (value) + (unless (valid-nmtoken-p value) + (validity-error "(13) Name Token: not a NMTOKEN: ~S" + (rod-string value)))) + +(defstruct (entdef (:constructor))) + +(defstruct (internal-entdef + (:include entdef) + (:constructor make-internal-entdef (value)) + (:conc-name #:ENTDEF-)) + (value (error "missing argument") :type rod) + (expansion nil)) + +(defstruct (external-entdef + (:include entdef) + (:constructor make-external-entdef (extid ndata)) + (:conc-name #:ENTDEF-)) + (extid (error "missing argument") :type extid) + (ndata nil :type (or rod null))) + +(defun validate-entity (value) + (unless (valid-name-p value) + (validity-error "(12) Entity Name: not a name: ~S" (rod-string value))) + (let ((def (let ((*validate* + ;; Similarly the entity refs are internal and + ;; don't need normalization ... the unparsed + ;; entities (and entities) aren't "references" + ;; -- sun/valid/sa03.xml + nil)) + (get-entity-definition value :general (dtd *ctx*))))) + (unless (and (typep def 'external-entdef) (entdef-ndata def)) + ;; unparsed entity + (validity-error "(12) Entity Name: ~S" (rod-string value))))) + +(defun split-names (rod) + (flet ((whitespacep (x) + (or (rune= x #/U+0009) + (rune= x #/U+000A) + (rune= x #/U+000D) + (rune= x #/U+0020)))) + (if (let ((n (length rod))) + (and (not (zerop n)) + (or (whitespacep (rune rod 0)) + (whitespacep (rune rod (1- n)))))) + nil + (split-sequence-if #'whitespacep rod :remove-empty-subseqs t)))) + +(defun zstream-base-sysid (zstream) + (let ((base-sysid + (dolist (k (zstream-input-stack zstream)) + (let ((base-sysid (stream-name-uri (xstream-name k)))) + (when base-sysid (return base-sysid)))))) + base-sysid)) (defun absolute-uri (sysid source-stream) - (setq sysid (rod-string sysid)) - (let ((base-sysid - (dolist (k (zstream-input-stack source-stream)) - (let ((base-sysid (stream-name-file-name (xstream-name k)))) - (when base-sysid (return base-sysid)))))) + (let ((base-sysid (zstream-base-sysid source-stream))) (assert (not (null base-sysid))) - (merge-sysid sysid base-sysid))) + (puri:merge-uris sysid base-sysid))) + +(defstruct (extid (:constructor make-extid (public system))) + (public nil :type (or rod null)) + (system (error "missing argument") :type (or puri:uri null))) (defun absolute-extid (source-stream extid) - (case (car extid) - (:system - (list (car extid) - (absolute-uri (cadr extid) source-stream))) - (:public - (list (car extid) - (cadr extid) - (absolute-uri (caddr extid) source-stream))))) + (let ((sysid (extid-system extid)) + (result (copy-extid extid))) + (setf (extid-system result) (absolute-uri sysid source-stream)) + result)) (defun define-entity (source-stream name kind def) - (when (eq (car def) :external) - (setf def - (list (car def) (absolute-extid source-stream (cadr def))))) (setf name (intern-name name)) - (setf *entities* - (append *entities* - (list (cons (list kind name) - def))))) + (let ((table + (ecase kind + (:general (dtd-gentities (dtd *ctx*))) + (:parameter (dtd-pentities (dtd *ctx*)))))) + (unless (gethash name table) + (when (and source-stream (handler *ctx*)) + (report-entity (handler *ctx*) kind name def)) + (when (typep def 'external-entdef) + (setf (entdef-extid def) + (absolute-extid source-stream (entdef-extid def)))) + (setf (gethash name table) + (cons *markup-declaration-external-p* def))))) -#|| -(defun define-element (zinput dtd element-name content-model) - ;; zinput is for source code location recoding - (let ((elmdef (make-elmdef :name element-name - :content content-model - ))) - ())) -||# +(defun get-entity-definition (entity-name kind dtd) + (destructuring-bind (extp &rest def) + (gethash entity-name + (ecase kind + (:general (dtd-gentities dtd)) + (:parameter (dtd-pentities dtd))) + '(nil)) + (when (and *validate* (standalone-p *ctx*) extp) + (validity-error "(02) Standalone Document Declaration: entity reference: ~S" + (rod-string entity-name))) + def)) (defun entity->xstream (entity-name kind &optional zstream) ;; `zstream' is for error messages - (let ((looked (assoc (list kind entity-name) *entities* :test #'equal))) - (unless looked + (let ((def (get-entity-definition entity-name kind (dtd *ctx*)))) + (unless def (if zstream (perror zstream "Entity '~A' is not defined." (rod-string entity-name)) (error "Entity '~A' is not defined." (rod-string entity-name)))) (let (r) - (ecase (cadr looked) - (:internal - (setf r (make-rod-xstream (caddr looked))) + (etypecase def + (internal-entdef + (setf r (make-rod-xstream (entdef-value def))) (setf (xstream-name r) (make-stream-name :entity-name entity-name :entity-kind kind - :file-name nil))) - (:external - (setf r (open-extid (caddr looked))) + :uri nil))) + (external-entdef + (setf r (xstream-open-extid (extid-using-catalog (entdef-extid def)))) (setf (stream-name-entity-name (xstream-name r)) entity-name (stream-name-entity-kind (xstream-name r)) kind))) r))) -(defun entity-source-kind (name type) - (let ((looked (assoc (list type name) *entities* :test #'equal))) - (unless looked +(defun checked-get-entdef (name type) + (let ((def (get-entity-definition name type (dtd *ctx*)))) + (unless def (error "Entity '~A' is not defined." (rod-string name))) - (cadr looked))) + def)) -(defun open-extid (extid) - (let ((nam (ecase (car extid) - (:SYSTEM (cadr extid)) - (:PUBLIC (caddr extid))))) - (make-xstream (open-sysid nam) - :name (make-stream-name :file-name nam) +(defun xstream-open-extid (extid) + (let* ((sysid (extid-system extid)) + (stream + (or (funcall (or (entity-resolver *ctx*) (constantly nil)) + (extid-public extid) + (extid-system extid)) + (open (uri-to-pathname sysid) + :element-type '(unsigned-byte 8) + :direction :input)))) + (make-xstream stream + :name (make-stream-name :uri sysid) :initial-speed 1))) (defun call-with-entity-expansion-as-stream (zstream cont name kind) @@ -663,30 +962,17 @@ (funcall cont in) (close-xstream in)))) +(defun ensure-dtd () + (unless (dtd *ctx*) + (setf (dtd *ctx*) (make-dtd)) + (define-default-entities))) + (defun define-default-entities () - (define-entity nil '#.(string-rod "lt") :general `(:internal #.(string-rod "<"))) - (define-entity nil '#.(string-rod "gt") :general `(:internal #.(string-rod ">"))) - (define-entity nil '#.(string-rod "amp") :general `(:internal #.(string-rod "&"))) - (define-entity nil '#.(string-rod "apos") :general `(:internal #.(string-rod "'"))) - (define-entity nil '#.(string-rod "quot") :general `(:internal #.(string-rod "\""))) - ;; - #|| - (define-entity nil '#.(string-rod "ouml") :general `(:internal #.(string-rod "ö"))) - (define-entity nil '#.(string-rod "uuml") :general `(:internal #.(string-rod "ü"))) - (define-entity nil '#.(string-rod "auml") :general `(:internal #.(string-rod "ä"))) - (define-entity nil '#.(string-rod "Ouml") :general `(:internal #.(string-rod "Ö"))) - (define-entity nil '#.(string-rod "Auml") :general `(:internal #.(string-rod "Ä"))) - (define-entity nil '#.(string-rod "Uuml") :general `(:internal #.(string-rod "Ü"))) - (define-entity nil '#.(string-rod "szlig") :general `(:internal #.(string-rod "ß"))) - ||# - ;; - #|| - (define-entity nil '#.(string-rod "nbsp") - :general `(:internal ,(let ((r (make-rod 1))) - (setf (aref r 0) #o240) - r))) - ||# - ) + (define-entity nil #"lt" :general (make-internal-entdef #"<")) + (define-entity nil #"gt" :general (make-internal-entdef #">")) + (define-entity nil #"amp" :general (make-internal-entdef #"&")) + (define-entity nil #"apos" :general (make-internal-entdef #"'")) + (define-entity nil #"quot" :general (make-internal-entdef #"\""))) (defstruct attdef ;; an attribute definition @@ -696,53 +982,145 @@ ; :ENTITY, :ENTITIES, :NMTOKEN, :NMTOKENS, or ; (:NOTATION *) ; (:ENUMERATION *) - default) ;default value of attribute: + default ;default value of attribute: ; :REQUIRED, :IMPLIED, (:FIXED content) or (:DEFAULT content) + (external-p *markup-declaration-external-p*) + ) (defstruct elmdef ;; an element definition name ;name of the element - content ;content model - attributes ;list of defined attribtes - defined-p) ;is this element defined? [*] - -;; [*] in XML it is possible to define attributes, before the element -;; itself is defined and since we hang attribute definitions into the -;; relevant element definitions, this flag indicates, whether an -;; element was actually defined. - -(defstruct dtd - elements ;hashtable or whatnot of all elements - attdefs ; - gentities ;general entities - pentities ;parameter entities + content ;content model [*] + attributes ;list of defined attributes + compiled-cspec ;cons of validation function for contentspec + (external-p *markup-declaration-external-p*) ) +;; [*] in XML it is possible to define attributes before the element +;; itself is defined and since we hang attribute definitions into the +;; relevant element definitions, the `content' slot indicates whether an +;; element was actually defined. It is NIL until set to a content model +;; when the element type declaration is processed. + +(defun %make-rod-hash-table () + ;; XXX with portable hash tables, this is the only way to case-sensitively + ;; use rods. However, EQUALP often has horrible performance! Most Lisps + ;; provide extensions for user-defined equality, we should use them! There + ;; is also a home-made hash table for rods defined below, written by + ;; Gilbert (I think). We could also use that one, but I would prefer the + ;; first method, even if it's unportable. + (make-hash-table :test + #+rune-is-character 'equal + #-rune-is-character 'equalp)) + +(defstruct dtd + (elements (%make-rod-hash-table)) ;elmdefs + (gentities (%make-rod-hash-table)) ;general entities + (pentities (%make-rod-hash-table)) ;parameter entities + (notations (%make-rod-hash-table))) + +(defun make-dtd-cache () + (puri:make-uri-space)) + +(defvar *cache-all-dtds* nil) +(defvar *dtd-cache* (make-dtd-cache)) + +(defun remdtd (uri dtd-cache) + (setf uri (puri:intern-uri uri dtd-cache)) + (prog1 + (and (getf (puri:uri-plist uri) 'dtd) t) + (puri:unintern-uri uri dtd-cache))) + +(defun clear-dtd-cache (dtd-cache) + (puri:unintern-uri t dtd-cache)) + +(defun getdtd (uri dtd-cache) + (getf (puri:uri-plist (puri:intern-uri uri dtd-cache)) 'dtd)) + +(defun (setf getdtd) (newval uri dtd-cache) + (setf (getf (puri:uri-plist (puri:intern-uri uri dtd-cache)) 'dtd) newval) + newval) + + ;;;; +(defun find-element (name dtd) + (gethash name (dtd-elements dtd))) + +(defun define-element (dtd element-name &optional content-model) + (let ((e (find-element element-name dtd))) + (cond + ((null e) + (setf (gethash element-name (dtd-elements dtd)) + (make-elmdef :name element-name :content content-model))) + ((null content-model) + e) + (t + (when *validate* + (when (elmdef-content e) + (validity-error "(05) Unique Element Type Declaration")) + (when (eq content-model :EMPTY) + (dolist (ad (elmdef-attributes e)) + (let ((type (attdef-type ad))) + (when (and (listp type) (eq (car type) :NOTATION)) + (validity-error "(16) No Notation on Empty Element: ~S" + (rod-string element-name))))))) + (sax:element-declaration (handler *ctx*) element-name content-model) + (setf (elmdef-content e) content-model) + (setf (elmdef-external-p e) *markup-declaration-external-p*) + e)))) + +(defvar *redefinition-warning* t) + (defun define-attribute (dtd element name type default) (let ((adef (make-attdef :element element :name name :type type - :default default))) - (cond ((find-attribute dtd element name) - (warn "Attribute \"~A\" of \"~A\" not redefined." - (rod-string name) - (rod-string element))) + :default default)) + (e (or (find-element element dtd) + (define-element dtd element)))) + (when (and *validate* (listp default)) + (unless (eq (attdef-type adef) :CDATA) + (setf (second default) (canon-not-cdata-attval (second default)))) + (validate-attribute* *ctx* adef (second default))) + (cond ((find-attribute e name) + (when *redefinition-warning* + (warn "Attribute \"~A\" of \"~A\" not redefined." + (rod-string name) + (rod-string element)))) (t - (push adef (dtd-attdefs dtd)))))) + (when *validate* + (when (eq type :ID) + (when (find :ID (elmdef-attributes e) :key #'attdef-type) + (validity-error "(09) One ID per Element Type: element ~A" + (rod-string element))) + (unless (member default '(:REQUIRED :IMPLIED)) + (validity-error "(10) ID Attribute Default: ~A" + (rod-string element)))) + (flet ((notationp (type) + (and (listp type) (eq (car type) :NOTATION)))) + (when (notationp type) + (when (find-if #'notationp (elmdef-attributes e) + :key #'attdef-type) + (validity-error "(15) One Notation Per Element Type: ~S" + (rod-string element))) + (when (eq (elmdef-content e) :EMPTY) + (validity-error "(16) No Notation on Empty Element: ~S" + (rod-string element)))))) + (sax:attribute-declaration (handler *ctx*) element name type default) + (push adef (elmdef-attributes e)))))) -(defun find-attribute (dtd element name) - (dolist (k (dtd-attdefs dtd)) - (cond ((and (eq element (attdef-element k)) - (eq name (attdef-name k))) - (return k))))) +(defun find-attribute (elmdef name) + (find name (elmdef-attributes elmdef) :key #'attdef-name :test #'rod=)) -(defun map-all-attdefs-for-element (dtd element continuation) - (declare (dynamic-extent continuation));this does not help under ACL - (dolist (k (dtd-attdefs dtd)) - (cond ((eq element (attdef-element k)) - (funcall continuation k))))) +(defun define-notation (dtd name id) + (let ((ns (dtd-notations dtd))) + (when (gethash name ns) + (validity-error "(24) Unique Notation Name: ~S" (rod-string name))) + (setf (gethash name ns) id))) + +(defun find-notation (name dtd) + (gethash name (dtd-notations dtd))) ;;;; --------------------------------------------------------------------------- ;;;; z streams and lexer @@ -834,7 +1212,7 @@ (rune= c #/U+0009) (rune= c #/U+000D) (rune= c #/U+000A)) - (values :s nil)) + (values :S nil)) ((rune= #/% c) (cond ((name-start-rune-p (peek-rune input)) ;; an entity reference @@ -847,15 +1225,15 @@ (cond ((rune= c #/&) (multiple-value-bind (kind data) (read-entity-ref input) - (cond ((eq kind :named) - (values :entity-ref data) ) - ((eq kind :numeric) - (values :cdata + (cond ((eq kind :NAMED) + (values :ENTITY-REF data) ) + ((eq kind :NUMERIC) + (values :CDATA (with-rune-collector (collect) - (%put-rune data collect))))))) + (%put-unicode-char data collect))))))) (t (unread-rune c input) - (values :cdata (read-cdata input))) )))))))) + (values :CDATA (read-cdata input))) )))))))) (defun read-pe-reference (zinput) (let* ((input (car (zstream-input-stack zinput))) @@ -868,7 +1246,7 @@ (values :S nil) ;space before inserted PE expansion. ) (t - (values :pe-reference nam)) ))) + (values :PE-REFERENCE nam)) ))) (defun read-token-after-|<| (zinput input) (let ((d (read-rune input))) @@ -886,7 +1264,7 @@ (error "Processing instruction target ~S is not a valid NcName." (mu target))) (t - (values :pi (cons target content)))))) + (values :PI (cons target content)))))) ((rune= #// d) (let ((c (peek-rune input))) (cond ((name-start-rune-p c) @@ -951,7 +1329,7 @@ (cond ((eq c :eof) (error "EOF after '&'")) ((rune= c #/#) - (values :numeric (read-numeric-entity input))) + (values :NUMERIC (read-numeric-entity input))) (t (unless (name-start-rune-p (peek-rune input)) (error "Expecting name after &.")) @@ -959,9 +1337,9 @@ (setf c (read-rune input)) (unless (rune= c #/\;) (perror input "Expected \";\".")) - (values :named name)))))) + (values :NAMED name)))))) -(defsubst read-S? (input) +(definline read-S? (input) (while (member (peek-rune input) '(#/U+0020 #/U+0009 #/U+000A #/U+000D) :test #'eq) (consume-rune input))) @@ -970,32 +1348,6 @@ (let ((name (read-name-token input)) (atts nil)) (setf atts (read-attribute-list zinput input nil)) - ;;(setf atts (nreverse atts)) - ;; care for atts - ;; - ;;zzz - (let ((fn (lambda (adef &aux x) - (setf x (assoc (attdef-name adef) atts)) - - (when (and (consp (attdef-default adef)) - (eq (car (attdef-default adef)) :default) - (not x)) - (setf atts (cons (setf x (cons (attdef-name adef) (cadr (attdef-default adef)))) - atts))) - (when (and (consp (attdef-default adef)) - (eq (car (attdef-default adef)) :fixed) - (not x)) - (setf atts (cons (setf x (cons (attdef-name adef) (cadr (attdef-default adef)))) - atts))) - (unless (eq (attdef-type adef) :cdata) - (when x - (setf (cdr x) (canon-not-cdata-attval (cdr x))))) - - ;; xxx more tests - ))) - (declare (dynamic-extent fn)) - (map-all-attdefs-for-element - *dtd* name fn)) ;; check for double attributes (do ((q atts (cdr q))) @@ -1023,22 +1375,22 @@ (let ((name (read-name-token input))) (while (let ((c (peek-rune input))) (and (not (eq c :eof)) - (or (= c #/U+0020) - (= c #/U+0009) - (= c #/U+000A) - (= c #/U+000D)))) + (or (rune= c #/U+0020) + (rune= c #/U+0009) + (rune= c #/U+000A) + (rune= c #/U+000D)))) (consume-rune input)) (unless (eq (read-rune input) #/=) (perror zinput "Expected \"=\".")) (while (let ((c (peek-rune input))) (and (not (eq c :eof)) - (or (= c #/U+0020) - (= c #/U+0009) - (= c #/U+000A) - (= c #/U+000D)))) + (or (rune= c #/U+0020) + (rune= c #/U+0009) + (rune= c #/U+000A) + (rune= c #/U+000D)))) (consume-rune input)) (cons name (read-att-value-2 input)) - ;;(cons name (read-att-value zinput input :att t)) + ;;(cons name (read-att-value zinput input :ATT t)) )) (defun canon-not-cdata-attval (value) @@ -1051,41 +1403,27 @@ (let ((gimme-20 nil) (anything-seen-p nil)) (map nil (lambda (c) - (cond ((= c #x20) + (cond ((rune= c #/u+0020) (setf gimme-20 t)) (t (when (and anything-seen-p gimme-20) - (collect #x20)) + (collect #/u+0020)) (setf gimme-20 nil) (setf anything-seen-p t) (collect c)))) value)))) -#|| -(defun canon-not-cdata-attval (value) - ;; | If the declared value is not CDATA, then the XML processor must - ;; | further process the normalized attribute value by discarding any - ;; | leading and trailing space (#x20) characters, and by replacing - ;; | sequences of space (#x20) characters by a single space (#x20) - ;; | character. - value) -||# - -(defsubst data-rune-p (c) +(definline data-rune-p (rune) ;; any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. - (or (= c #x9) (= c #xA) (= c #xD) - (<= #x20 c #xD7FF) - (<= #xE000 c #xFFFD) - ;; - (<= #xD800 c #xDBFF) - (<= #xDC00 c #xDFFF) - ;; - )) - -#|| -(defsubst data-rune-p (c) - t) -||# + (let ((c (rune-code rune))) + (or (= c #x9) (= c #xA) (= c #xD) + (<= #x20 c #xD7FF) + (<= #xE000 c #xFFFD) + ;; + (<= #xD800 c #xDBFF) + (<= #xDC00 c #xDFFF) + ;; + ))) (defun read-att-value (zinput input mode &optional canon-space-p (delim nil)) (with-rune-collector-2 (collect) @@ -1101,7 +1439,7 @@ (setf c (peek-rune input)) (cond ((rune= c #/#) (let ((c (read-numeric-entity input))) - (%put-rune c collect))) + (%put-unicode-char c collect))) (t (unless (name-start-rune-p (peek-rune input)) (error "Expecting name after &.")) @@ -1109,13 +1447,13 @@ (setf c (read-rune input)) (assert (rune= c #/\;)) (ecase mode - (:att + (:ATT (recurse-on-entity zinput name :general (lambda (zinput) (muffle (car (zstream-input-stack zinput)) :eof)))) - (:ent + (:ENT ;; bypass, but never the less we ;; need to check for legal ;; syntax. @@ -1124,7 +1462,7 @@ (collect #/&) (map nil (lambda (x) (collect x)) name) (collect #/\; ))))))) - ((and (eq mode :ent) (rune= c #/%)) + ((and (eq mode :ENT) (rune= c #/%)) (unless (name-start-rune-p (peek-rune input)) (error "Expecting name after %.")) (let ((name (read-name-token input))) @@ -1138,7 +1476,7 @@ :eof)))) (t (error "No PE here."))))) - ((and (eq mode :att) (rune= c #/<)) + ((and (eq mode :ATT) (rune= c #/<)) ;; xxx fix error message (cerror "Eat them in spite of this." "For no apparent reason #\/< is forbidden in attribute values. ~ @@ -1170,25 +1508,25 @@ (prog1 (parse-integer (with-output-to-string (sink) - (write-char (code-char c) sink) + (write-char (rune-char c) sink) (while (digit-rune-p (setq c (read-rune input)) 16) - (write-char (code-char c) sink))) + (write-char (rune-char c) sink))) :radix 16) (assert (rune= c #/\;))) ) - ((<= #/0 c #/9) + ((rune<= #/0 c #/9) ;; decimal (prog1 (parse-integer (with-output-to-string (sink) - (write-char (code-char c) sink) - (while (<= #/0 (setq c (read-rune input)) #/9) - (write-char (code-char c) sink))) + (write-char (rune-char c) sink) + (while (rune<= #/0 (setq c (read-rune input)) #/9) + (write-char (rune-char c) sink))) :radix 10) (assert (rune= c #/\;))) ) (t (error "Bad char in numeric character entity.") ))))) - (unless (data-char-p res) + (unless (code-data-char-p res) (error "expansion of numeric character reference (#x~X) is no data char." res)) res)) @@ -1204,7 +1542,7 @@ (read-pi-content input)))) (defun read-pi-content (input &aux d) - (read-s? input) + (read-S? input) (with-rune-collector (collect) (block nil (tagbody @@ -1355,22 +1693,6 @@ ;; some character categories -#|| -(defun name-start-rune-p (rune) - (or (<= #x0041 rune #x005A) - (<= #x0061 rune #x007A) - ;; lots more - (>= rune #x0080) - (rune= rune #/_) - (rune= rune #/:))) - -(defun name-rune-p (rune) - (or (name-start-rune-p rune) - (rune= rune #/.) - (rune= rune #/-) - (rune<= #/0 rune #/9))) -||# - (defun space-rune-p (rune) (declare (type rune rune)) (or (rune= rune #/U+0020) @@ -1378,7 +1700,7 @@ (rune= rune #/U+000A) (rune= rune #/U+000D))) -(defun data-char-p (c) +(defun code-data-char-p (c) ;; any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. (or (= c #x9) (= c #xA) (= c #xD) (<= #x20 c #xD7FF) @@ -1386,10 +1708,10 @@ (<= #x10000 c #x10FFFF))) (defun pubid-char-p (c) - (or (= c #x20) (= c #xD) (= c #xA) - (<= #/a c #/z) - (<= #/A c #/Z) - (<= #/0 c #/9) + (or (rune= c #/u+0020) (rune= c #/u+000D) (rune= c #/u+000A) + (rune<= #/a c #/z) + (rune<= #/A c #/Z) + (rune<= #/0 c #/9) (member c '(#/- #/' #/\( #/\) #/+ #/, #/. #// #/: #/= #/? #/\; #/! #/* #/# #/@ #/$ #/_ #/%)))) @@ -1438,7 +1760,7 @@ (return)) (t (multiple-value-bind (name type default) (p/attdef input) - (define-attribute *dtd* elm-name name type default)) ))) + (define-attribute (dtd *ctx*) elm-name name type default)) ))) (:> (return)) (otherwise @@ -1491,31 +1813,33 @@ ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' /* VC: Enumeration */ (multiple-value-bind (cat sem) (read-token input) (cond ((eq cat :name) - (cond ((equalp sem '#.(string-rod "CDATA")) :cdata) - ((equalp sem '#.(string-rod "ID")) :id) - ((equalp sem '#.(string-rod "IDREF")) :idrefs) - ((equalp sem '#.(string-rod "IDREFS")) :idrefs) - ((equalp sem '#.(string-rod "ENTITY")) :entity) - ((equalp sem '#.(string-rod "ENTITIES")) :entities) - ((equalp sem '#.(string-rod "NMTOKEN")) :nmtoken) - ((equalp sem '#.(string-rod "NMTOKENS")) :nmtokens) + (cond ((equalp sem '#.(string-rod "CDATA")) :CDATA) + ((equalp sem '#.(string-rod "ID")) :ID) + ((equalp sem '#.(string-rod "IDREF")) :IDREFS) + ((equalp sem '#.(string-rod "IDREFS")) :IDREFS) + ((equalp sem '#.(string-rod "ENTITY")) :ENTITY) + ((equalp sem '#.(string-rod "ENTITIES")) :ENTITIES) + ((equalp sem '#.(string-rod "NMTOKEN")) :NMTOKEN) + ((equalp sem '#.(string-rod "NMTOKENS")) :NMTOKENS) ((equalp sem '#.(string-rod "NOTATION")) - ;; xxx nmtoken vs name (let (names) (p/S input) (expect input :\() (setf names (p/list input #'p/name :\| )) (expect input :\)) - (cons :notation names))) + (when *validate* + (setf (referenced-notations *ctx*) + (append names (referenced-notations *ctx*)))) + (cons :NOTATION names))) (t (error "In p/att-type: ~S ~S." cat sem)))) ((eq cat :\() - ;; xxx nmtoken vs name + ;; XXX Die Nmtoken-Syntax pruefen wir derzeit nur beim Validieren. (let (names) ;;(expect input :\() (setf names (p/list input #'p/name :\| )) (expect input :\)) - (cons :enumeration names))) + (cons :ENUMERATION names))) (t (error "In p/att-type: ~S ~S." cat sem)) ))) @@ -1528,15 +1852,15 @@ ;; /* VC: Fixed Attribute Default */ (multiple-value-bind (cat sem) (peek-token input) (cond ((eq cat :|#REQUIRED|) - (consume-token input) :required) + (consume-token input) :REQUIRED) ((eq cat :|#IMPLIED|) - (consume-token input) :implied) + (consume-token input) :IMPLIED) ((eq cat :|#FIXED|) (consume-token input) (p/S input) - (list :fixed (p/att-value input))) + (list :FIXED (p/att-value input))) ((or (eq cat :\') (eq cat :\")) - (list :default (p/att-value input))) + (list :DEFAULT (p/att-value input))) (t (error "p/default-decl: ~S ~S." cat sem)) ))) ;;;; @@ -1567,10 +1891,29 @@ (p/S? input) (expect input :\>))) +(defun report-entity (h kind name def) + (etypecase def + (external-entdef + (let ((extid (entdef-extid def)) + (ndata (entdef-ndata def))) + (if ndata + (sax:unparsed-entity-declaration h + name + (extid-public extid) + (uri-rod (extid-system extid)) + ndata) + (sax:external-entity-declaration h + kind + name + (extid-public extid) + (uri-rod (extid-system extid)))))) + (internal-entdef + (sax:internal-entity-declaration h kind name (entdef-value def))))) + (defun p/entity-def (input kind) (multiple-value-bind (cat sem) (peek-token input) (cond ((member cat '(:\" :\')) - (list :internal (p/entity-value input))) + (make-internal-entdef (p/entity-value input))) ((and (eq cat :name) (or (equalp sem '#.(string-rod "SYSTEM")) (equalp sem '#.(string-rod "PUBLIC")))) @@ -1584,8 +1927,10 @@ '#.(string-rod "NDATA"))) (consume-token input) (p/S input) - (setf ndata (p/name input)))))) - (list :external extid ndata))) + (setf ndata (p/name input)) + (when *validate* + (push ndata (referenced-notations *ctx*))))))) + (make-external-entdef extid ndata))) (t (error "p/entity-def: ~S / ~S." cat sem)) ))) @@ -1593,7 +1938,7 @@ (let ((delim (if (eq (read-token input) :\") #/\" #/\'))) (read-att-value input (car (zstream-input-stack input)) - :ent + :ENT nil delim))) @@ -1601,7 +1946,7 @@ (let ((delim (if (eq (read-token input) :\") #/\" #/\'))) (read-att-value input (car (zstream-input-stack input)) - :att + :ATT t delim))) @@ -1610,8 +1955,7 @@ (multiple-value-bind (cat sem) (read-token input) (cond ((and (eq cat :name) (equalp sem '#.(string-rod "SYSTEM"))) (p/S input) - (list :system (p/system-literal input)) - ) + (make-extid nil (p/system-literal input))) ((and (eq cat :name) (equalp sem '#.(string-rod "PUBLIC"))) (let (pub sys) (p/S input) @@ -1620,12 +1964,10 @@ (p/S input) (when (member (peek-token input) '(:\" :\')) (setf sys (p/system-literal input)))) - (unless (every #'pubid-char-p pub) - (error "Illegal pubid: ~S." (rod-string pub))) (when (and (not public-only-ok-p) (null sys)) (error "System identifier needed for this PUBLIC external identifier.")) - (list :public pub sys))) + (make-extid pub sys))) (t (error "Expected external-id: ~S / ~S." cat sem))))) @@ -1635,7 +1977,7 @@ ;; [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] ;; | [-'()+,./:=?;!*#@$_%] -(defun p/system-literal (input) +(defun p/id (input) (multiple-value-bind (cat) (read-token input) (cond ((member cat '(:\" :\')) (let ((delim (if (eq cat :\") #/\" #/\'))) @@ -1651,9 +1993,33 @@ (t (error "Expect either \" or \'."))))) +;; it is important to cache the orginal URI rod, since the re-serialized +;; uri-string can be different from the one parsed originally. +(defun uri-rod (uri) + (if uri + (or (getf (puri:uri-plist uri) 'original-rod) + (rod (puri:render-uri uri nil))) + nil)) + +(defun safe-parse-uri (str) + ;; puri doesn't like strings starting with file:///, although that is a very + ;; common is practise. Cut it away, we don't distinguish between scheme + ;; :FILE and NIL anway. + (when (eql (search "file://" str) 0) + (setf str (subseq str (length "file://")))) + (puri:parse-uri str)) + +(defun p/system-literal (input) + (let* ((rod (p/id input)) + (result (safe-parse-uri (rod-string rod)))) + (setf (getf (puri:uri-plist result) 'original-rod) rod) + result)) + (defun p/pubid-literal (input) - ;; xxx check for valid chars - (p/system-literal input)) + (let ((result (p/id input))) + (unless (every #'pubid-char-p result) + (error "Illegal pubid: ~S." (rod-string result))) + result)) ;;;; @@ -1664,15 +2030,128 @@ (p/S input) (setf name (p/name input)) (p/S input) - (setf content (p/cspec input)) - (unless (legal-content-model-p content) - '(error "Illegal content model: ~S." (mu content)) - (warn "Illegal content model: ~S." (mu content))) + (setf content (normalize-mixed-cspec (p/cspec input))) + (unless (legal-content-model-p content *validate*) + (error "Malformed or invalid content model: ~S." (mu content))) (p/S? input) (expect input :\>) + (when *validate* + (define-element (dtd *ctx*) name content)) (list :element name content))) -(defun legal-content-model-p (cspec) +(defun maybe-compile-cspec (e) + (or (elmdef-compiled-cspec e) + (setf (elmdef-compiled-cspec e) + (let ((cspec (elmdef-content e))) + (unless cspec + (validity-error "(03) Element Valid: no definition for ~A" + (rod-string (elmdef-name e)))) + (multiple-value-call #'cons + (compile-cspec cspec (standalone-check-necessary-p e))))))) + +(defun make-root-model (name) + (cons (lambda (actual-name) + (if (rod= actual-name name) + (constantly :dummy) + nil)) + (constantly t))) + +;;; content spec validation: +;;; +;;; Given a `contentspec', COMPILE-CSPEC returns as multiple values two +;;; functions A and B of one argument to be called for every +;;; A. child element +;;; B. text child node +;;; +;;; Function A will be called with +;;; - the element name rod as its argument. If that element may appear +;;; at the current position, a new function to be called for the next +;;; child is returned. Otherwise NIL is returned. +;;; - argument NIL at the end of the element, it must then return T or NIL +;;; to indicate whether the end tag is valid. +;;; +;;; Function B will be called with the character data rod as its argument, it +;;; returns a boolean indicating whether this text element is allowed. +;;; +;;; That is, if one of the functions ever returns NIL, the element is +;;; rejected as invalid. + +(defun cmodel-done (actual-value) + (null actual-value)) + +(defun compile-cspec (cspec &optional standalone-check) + (cond + ((atom cspec) + (ecase cspec + (:EMPTY (values #'cmodel-done (constantly nil))) + (:PCDATA (values #'cmodel-done (constantly t))) + (:ANY + (values (labels ((doit (name) (if name #'doit t))) #'doit) + (constantly t))))) + ((and (eq (car cspec) '*) + (let ((subspec (second cspec))) + (and (eq (car subspec) 'or) (eq (cadr subspec) :PCDATA)))) + (values (compile-mixed (second cspec)) + (constantly t))) + (t + (values (compile-content-model cspec) + (lambda (rod) + (when standalone-check + (validity-error "(02) Standalone Document Declaration: whitespace")) + (every #'white-space-rune-p rod)))))) + +(defun compile-mixed (cspec) + ;; das koennten wir theoretisch auch COMPILE-CONTENT-MODEL erledigen lassen + (let ((allowed-names (cddr cspec))) + (labels ((doit (actual-name) + (cond + ((null actual-name) t) + ((member actual-name allowed-names :test #'rod=) #'doit) + (t nil)))) + #'doit))) + +(defun compile-content-model (cspec &optional (continuation #'cmodel-done)) + (if (vectorp cspec) + (lambda (actual-name) + (if (and actual-name (rod= cspec actual-name)) + continuation + nil)) + (ecase (car cspec) + (and + (labels ((traverse (seq) + (compile-content-model (car seq) + (if (cdr seq) + (traverse (cdr seq)) + continuation)))) + (traverse (cdr cspec)))) + (or + (let ((options (mapcar (rcurry #'compile-content-model continuation) + (cdr cspec)))) + (lambda (actual-name) + (some (rcurry #'funcall actual-name) options)))) + (? + (let ((maybe (compile-content-model (second cspec) continuation))) + (lambda (actual-name) + (or (funcall maybe actual-name) + (funcall continuation actual-name))))) + (* + (let (maybe-continuation) + (labels ((recurse (actual-name) + (if (null actual-name) + (funcall continuation actual-name) + (or (funcall maybe-continuation actual-name) + (funcall continuation actual-name))))) + (setf maybe-continuation + (compile-content-model (second cspec) #'recurse)) + #'recurse))) + (+ + (let ((it (cadr cspec))) + (compile-content-model `(and ,it (* ,it)) continuation)))))) + +(defun setp (list &key (test 'eql)) + (equal list (remove-duplicates list :test test))) + +(defun legal-content-model-p (cspec &optional validate) (or (eq cspec :PCDATA) (eq cspec :ANY) (eq cspec :EMPTY) @@ -1680,8 +2159,11 @@ (eq (car cspec) '*) (consp (cadr cspec)) (eq (car (cadr cspec)) 'or) - (eq (cadr (cadr cspec)) :pcdata) - (every #'vectorp (cddr (cadr cspec)))) + (eq (cadr (cadr cspec)) :PCDATA) + (every #'vectorp (cddr (cadr cspec))) + (if (and validate (not (setp (cddr (cadr cspec)) :test #'rod=))) + (validity-error "VC: No Duplicate Types (07)") + t)) (labels ((walk (x) (cond ((member x '(:PCDATA :ANY :EMPTY)) nil) @@ -1695,45 +2177,46 @@ ;; cspec ::= 'EMPTY' | 'ANY' | '#PCDATA' ;; | Name ;; | cs -;; cs ::= '(' S? cspec ( S? '|' S? cs)* S? ')' ('?' | '*' | '+')? +;; cs ::= '(' S? cspec ( S? '|' S? cspec)* S? ')' ('?' | '*' | '+')? ;; und eine post mortem analyse -(defun p/cspec (input &optional (level 0) (only-names-p nil)) +(defun p/cspec (input) (let ((term - (let ((names nil) op-cat op res) + (let ((names nil) op-cat op res stream) (multiple-value-bind (cat sem) (peek-token input) (cond ((eq cat :name) (consume-token input) (cond ((rod= sem '#.(string-rod "EMPTY")) - :empty) + :EMPTY) ((rod= sem '#.(string-rod "ANY")) - :any) + :ANY) (t sem))) - ((and (eq cat :\#PCDATA) (not only-names-p)) - (unless (= level 1) - (error "#PCDATA only on top level in content modell.")) + ((eq cat :\#PCDATA) (consume-token input) - :pcdata) - ((and (eq cat :\() (not only-names-p)) + :PCDATA) + ((eq cat :\() + (setf stream (car (zstream-input-stack input))) (consume-token input) (p/S? input) - (setq names (list (p/cspec input (+ level 1)))) + (setq names (list (p/cspec input))) (p/S? input) - (let ((on? (eq (car names) :pcdata))) - (cond ((member (peek-token input) '(:\| :\,)) - (setf op-cat (peek-token input)) - (setf op (if (eq op-cat :\,) 'and 'or)) - (while (eq (peek-token input) op-cat) - (consume-token input) - (p/S? input) - (push (p/cspec input (+ level 1) on?) names) - (p/S? input)) - (setf res (cons op (reverse names)))) - (t - (setf res (car names))))) + (cond ((member (peek-token input) '(:\| :\,)) + (setf op-cat (peek-token input)) + (setf op (if (eq op-cat :\,) 'and 'or)) + (while (eq (peek-token input) op-cat) + (consume-token input) + (p/S? input) + (push (p/cspec input) names) + (p/S? input)) + (setf res (cons op (reverse names)))) + (t + (setf res (cons 'and names)))) (p/S? input) (expect input :\)) + (when *validate* + (unless (eq stream (car (zstream-input-stack input))) + (validity-error "(06) Proper Group/PE Nesting"))) res) (t (error "p/cspec - ~s / ~s" cat sem))))))) @@ -1742,6 +2225,26 @@ ((eq (peek-token input) :*) (consume-token input) (list '* term)) (t term)))) + +(defun normalize-mixed-cspec (cspec) + ;; der Parser oben funktioniert huebsch fuer die children-Regel, aber + ;; fuer Mixed ist das Ergebnis nicht praktisch, denn dort wollen wir + ;; eigentlich auf eine Liste von Namen in einheitlichem Format hinaus. + ;; Dazu normalisieren wir einfach in eine der beiden folgenden Formen: + ;; (* (or :PCDATA ...rods...)) -- und zwar exakt so! + ;; :PCDATA -- sonst ganz trivial + (flet ((trivialp (c) + (and (consp c) + (and (eq (car c) 'and) + (eq (cadr c) :PCDATA) + (null (cddr c)))))) + (if (or (trivialp cspec) ;(and PCDATA) + (and (consp cspec) ;(* (and PCDATA)) + (and (eq (car cspec) '*) + (null (cddr cspec)) + (trivialp (cadr cspec))))) + :PCDATA + cspec))) ;; [52] AttlistDecl ::= '' @@ -1760,41 +2263,76 @@ (setf id (p/external-id input t)) (p/S? input) (expect input :\>) + (sax:notation-declaration (handler *ctx*) + name + (if (extid-public id) + (normalize-public-id (extid-public id)) + nil) + (uri-rod (extid-system id))) + (when *validate* + (define-notation (dtd *ctx*) name id)) (list :notation-decl name id))) +(defun normalize-public-id (rod) + (with-rune-collector (collect) + (let ((gimme-20 nil) + (anything-seen-p nil)) + (map nil (lambda (c) + (cond + ((or (rune= c #/u+0009) + (rune= c #/u+000A) + (rune= c #/u+000D) + (rune= c #/u+0020)) + (setf gimme-20 t)) + (t + (when (and anything-seen-p gimme-20) + (collect #/u+0020)) + (setf gimme-20 nil) + (setf anything-seen-p t) + (collect c)))) + rod)))) + ;;; (defun p/conditional-sect (input) (expect input :) ) + (p/cond-expect input :\] initial-stream) + (p/cond-expect input :\] initial-stream) + (p/cond-expect input :\> initial-stream)) -(defun p/ignore-sect (input) +(defun p/ignore-sect (input initial-stream) ;; )) - (decf level))) )))) + (decf level))) ))) + (unless (eq (car (zstream-input-stack input)) initial-stream) + (validity-error "(21) Proper Conditional Section/PE Nesting"))) (defun p/ext-subset-decl (input) ;; ( markupdecl | conditionalSect | S )* @@ -1812,22 +2352,32 @@ (:S (consume-token input)) (:eof (return)) ((:| )) (setf extid (p/external-id input t)))) + (when dtd-extid + (setf extid dtd-extid)) (p/S? input) + (sax:start-dtd (handler *ctx*) + name + (and extid (extid-public extid)) + (and extid (uri-rod (extid-system extid)))) (when (eq (peek-token input) :\[ ) + (when (disallow-internal-subset *ctx*) + (error "document includes an internal subset")) + (ensure-dtd) (consume-token input) (while (progn (p/S? input) (not (eq (peek-token input) :\] ))) - (if (eq (peek-token input) :pe-reference) + (if (eq (peek-token input) :PE-REFERENCE) (let ((name (nth-value 1 (read-token input)))) (recurse-on-entity input name :parameter (lambda (input) - (ecase (entity-source-kind name :parameter) - (:external + (etypecase (checked-get-entdef name :parameter) + (external-entdef (p/ext-subset input)) - (:internal + (internal-entdef (p/ext-subset-decl input))) (unless (eq :eof (peek-token input)) (error "Trailing garbage."))))) - (p/markup-decl input))) + (let ((*expand-pe-p* t)) + (p/markup-decl input)))) (consume-token input) (p/S? input)) (expect input :>) (when extid - (let* ((xi2 (open-extid (absolute-extid input extid))) - (zi2 (make-zstream :input-stack (list xi2)))) - (let () - (p/ext-subset zi2)))) - (list :doctype name extid)))) + (let* ((effective-extid + (extid-using-catalog (absolute-extid input extid))) + (sysid (extid-system effective-extid)) + (fresh-dtd-p (null (dtd *ctx*))) + (cached-dtd + (and fresh-dtd-p + (not (standalone-p *ctx*)) + (getdtd sysid *dtd-cache*)))) + (cond + (cached-dtd + (setf (dtd *ctx*) cached-dtd) + (report-cached-dtd cached-dtd)) + (t + (let* ((xi2 (xstream-open-extid effective-extid)) + (zi2 (make-zstream :input-stack (list xi2)))) + (ensure-dtd) + (p/ext-subset zi2) + (when (and fresh-dtd-p + *cache-all-dtds* + *validate* + (not (standalone-p *ctx*))) + (setf (getdtd sysid *dtd-cache*) (dtd *ctx*)))))))) + (sax:end-dtd (handler *ctx*)) + (let ((dtd (dtd *ctx*))) + (sax:entity-resolver + (handler *ctx*) + (lambda (name handler) (resolve-entity name handler dtd)))) + (list :DOCTYPE name extid)))) + +(defun report-cached-dtd (dtd) + (maphash (lambda (k v) + (report-entity (handler *ctx*) :general k (cdr v))) + (dtd-gentities dtd)) + (maphash (lambda (k v) + (report-entity (handler *ctx*) :parameter k (cdr v))) + (dtd-pentities dtd)) + (maphash (lambda (k v) + (sax:notation-declaration + (handler *ctx*) + k + (if (extid-public v) + (normalize-public-id (extid-public v)) + nil) + (uri-rod (extid-system v)))) + (dtd-notations dtd))) (defun p/misc*-2 (input) ;; Misc* - (while (member (peek-token input) '(:comment :pi :s)) - (when (eq (peek-token input) :pi) - (sax:processing-instruction - *handler* - (car (nth-value 1 (peek-token input))) - (cdr (nth-value 1 (peek-token input))))) - (consume-token input))) + (while (member (peek-token input) '(:COMMENT :PI :S)) + (case (peek-token input) + (:COMMENT + (sax:comment (handler *ctx*) (nth-value 1 (peek-token input)))) + (:PI + (sax:processing-instruction + (handler *ctx*) + (car (nth-value 1 (peek-token input))) + (cdr (nth-value 1 (peek-token input)))))) + (consume-token input))) - -(defvar *handler*) - -(defun p/document (input handler) - (let ((*handler* handler) - (*namespace-bindings* *default-namespace-bindings*)) - (setf *entities* nil) - (setf *dtd* (make-dtd)) - (define-default-entities) - (sax:start-document *handler*) +(defun p/document + (input handler + &key validate dtd root entity-resolver disallow-internal-subset) + (let ((*ctx* + (make-context :handler handler + :entity-resolver entity-resolver + :disallow-internal-subset disallow-internal-subset)) + (*validate* validate)) + (sax:start-document handler) ;; document ::= XMLDecl? Misc* (doctypedecl Misc*)? element Misc* ;; Misc ::= Comment | PI | S ;; xmldecl::='' @@ -1936,29 +2553,48 @@ ;; optional XMLDecl? (cond ((eq (peek-token input) :xml-pi) (let ((hd (parse-xml-pi (cdr (nth-value 1 (peek-token input))) t))) + (setf (standalone-p *ctx*) (eq (xml-header-standalone-p hd) :yes)) (setup-encoding input hd)) - ;; FIXME: Ceci n'est pas un pi. Should probably go away. - ;; (hmot 30/06/03) - (sax:processing-instruction - *handler* - (car (nth-value 1 (peek-token input))) - (cdr (nth-value 1 (peek-token input)))) (read-token input))) (set-full-speed input) ;; Misc* (p/misc*-2 input) ;; (doctypedecl Misc*)? - (when (eq (peek-token input) :xstream ""))) + (setf (xstream-name dummy) + (make-stream-name + :entity-name "dummy doctype" + :entity-kind :main + :uri (zstream-base-sysid input))) + (p/doctype-decl (make-zstream :input-stack (list dummy)) dtd))) + ((and validate (not dtd)) + (validity-error "invalid document: no doctype"))) + (ensure-dtd) + ;; Override expected root element if asked to + (when root + (setf (model-stack *ctx*) (list (make-root-model root)))) ;; element - (let ((*data-behaviour* :doc)) + (let ((*data-behaviour* :DOC)) (p/element input)) ;; optional Misc* (p/misc*-2 input) (unless (eq (peek-token input) :eof) (error "Garbage at end of document.")) - (sax:end-document *handler*)))) + (when *validate* + (maphash (lambda (k v) + (unless v + (validity-error "(11) IDREF: ~S not defined" (rod-string k)))) + (id-table *ctx*)) + + (dolist (name (referenced-notations *ctx*)) + (unless (find-notation name (dtd *ctx*)) + (validity-error "(23) Notation Declared: ~S" (rod-string name))))) + (sax:end-document handler)))) (defun p/element (input) (if sax:*namespace-processing* @@ -1967,46 +2603,55 @@ (defun p/element-no-ns (input) ;; [39] element ::= EmptyElemTag | STag content ETag + (error "sorry, bitrot") (multiple-value-bind (cat sem) (read-token input) (cond ((eq cat :ztag) - (sax:start-element *handler* nil nil (car sem) (build-attribute-list-no-ns (cdr sem))) - (sax:end-element *handler* nil nil (car sem))) + (sax:start-element (handler *ctx*) nil nil (car sem) (build-attribute-list-no-ns (cdr sem))) + (sax:end-element (handler *ctx*) nil nil (car sem))) ((eq cat :stag) - (sax:start-element *handler* nil nil (car sem) (build-attribute-list-no-ns (cdr sem))) + (sax:start-element (handler *ctx*) nil nil (car sem) (build-attribute-list-no-ns (cdr sem))) (p/content input) (multiple-value-bind (cat2 sem2) (read-token input) (unless (and (eq cat2 :etag) (eq (car sem2) (car sem))) (perror input "Bad nesting. ~S / ~S" (mu sem) (mu (cons cat2 sem2))))) - (sax:end-element *handler* nil nil (car sem))) + (sax:end-element (handler *ctx*) nil nil (car sem))) (t (error "Expecting element."))))) + (defun p/element-ns (input) (destructuring-bind (cat (name &rest attrs)) (multiple-value-list (read-token input)) - (let ((ns-decls (declare-namespaces attrs))) + (validate-start-element *ctx* name) + (let ((ns-decls (declare-namespaces name attrs))) (multiple-value-bind (ns-uri prefix local-name) (decode-qname name) (declare (ignore prefix)) - (let ((attlist (build-attribute-list-ns attrs))) - (cond ((eq cat :ztag) - (sax:start-element *handler* ns-uri local-name name attlist) - (sax:end-element *handler* ns-uri local-name name)) + (let* ((raw-attlist (build-attribute-list-ns attrs)) + (attlist + (remove-if-not (lambda (a) + (or sax:*include-xmlns-attributes* + (not (xmlns-attr-p (sax:attribute-qname a))))) + (process-attributes *ctx* name raw-attlist)))) + (cond ((eq cat :ztag) + (sax:start-element (handler *ctx*) ns-uri local-name name attlist) + (sax:end-element (handler *ctx*) ns-uri local-name name)) ((eq cat :stag) - (sax:start-element *handler* ns-uri local-name name attlist) + (sax:start-element (handler *ctx*) ns-uri local-name name attlist) (p/content input) (multiple-value-bind (cat2 sem2) (read-token input) (unless (and (eq cat2 :etag) (eq (car sem2) name)) (perror input "Bad nesting. ~S / ~S" (mu name) (mu (cons cat2 sem2))))) - (sax:end-element *handler* ns-uri local-name name)) + (sax:end-element (handler *ctx*) ns-uri local-name name)) (t - (error "Expecting element."))))) - (undeclare-namespaces ns-decls)))) + (error "Expecting element, got ~S." cat))))) + (undeclare-namespaces ns-decls)) + (validate-end-element *ctx* name))) (defun perror (stream format-string &rest format-args) (when (zstream-p stream) @@ -2023,20 +2668,21 @@ ((:stag :ztag) (p/element input) (p/content input)) - ((:cdata) + ((:CDATA) (consume-token input) - (sax:characters *handler* sem) + (validate-characters *ctx* sem) + (sax:characters (handler *ctx*) sem) (p/content input)) - ((:entity-ref) + ((:ENTITY-REF) (let ((name sem)) (consume-token input) (append ;; nil #+(OR) (recurse-on-entity input name :general (lambda (input) (prog1 - (ecase (entity-source-kind name :general) - (:internal (p/content input)) - (:external (p/ext-parsed-ent input))) + (etypecase (checked-get-entdef name :general) + (internal-entdef (p/content input)) + (external-entdef (p/ext-parsed-ent input))) (unless (eq (peek-token input) :eof) (error "Trailing garbage. - ~S" (peek-token input)))))) (p/content input)))) @@ -2051,16 +2697,18 @@ (rune= #/A (read-rune input)) (rune= #/\[ (read-rune input))) (error "After '= (length (cdar atts)) 1) (every (lambda (x) - (or (<= #/a x #/z) - (<= #/A x #/Z) - (<= #/0 x #/9) + (or (rune<= #/a x #/z) + (rune<= #/A x #/Z) + (rune<= #/0 x #/9) (rune= x #/_) (rune= x #/.) (rune= x #/:) @@ -2114,17 +2762,17 @@ (when (eq (caar atts) (intern-name '#.(string-rod "encoding"))) (unless (and (>= (length (cdar atts)) 1) (every (lambda (x) - (or (<= #/a x #/z) - (<= #/A x #/Z) - (<= #/0 x #/9) + (or (rune<= #/a x #/z) + (rune<= #/A x #/Z) + (rune<= #/0 x #/9) (rune= x #/_) (rune= x #/.) (rune= x #/-))) (cdar atts)) ((lambda (x) - (or (<= #/a x #/z) - (<= #/A x #/Z) - (<= #/0 x #/9))) + (or (rune<= #/a x #/z) + (rune<= #/A x #/Z) + (rune<= #/0 x #/9))) (aref (cdar atts) 0))) (error "Bad XML encoding name: ~S." (rod-string (cdar atts)))) (setf (xml-header-encoding res) (rod-string (cdar atts))) @@ -2159,54 +2807,220 @@ (x))) ;;;; --------------------------------------------------------------------------- -;;;; -;;;; canonical XML according to James Clark -;;;; +;;;; User interface ;;;; -;;;; User inteface ;;;; +(defun specific-or (component &optional (alternative nil)) + (if (eq component :unspecific) + alternative + component)) -(defun parse-file (filename &optional (handler (make-instance 'dom-impl::dom-builder))) - (with-open-xstream (input filename) +(defun string-or (str &optional (alternative nil)) + (if (zerop (length str)) + alternative + str)) + +(defun make-uri (&rest initargs &key path query &allow-other-keys) + (apply #'make-instance + 'puri:uri + :path (and path (escape-path path)) + :query (and query (escape-query query)) + initargs)) + +(defun escape-path (list) + (puri::render-parsed-path list t)) + +(defun escape-query (pairs) + (flet ((escape (str) + (puri::encode-escaped-encoding str puri::*reserved-characters* t))) + (let ((first t)) + (with-output-to-string (s) + (dolist (pair pairs) + (if first + (setf first nil) + (write-char #\& s)) + (write-string (escape (car pair)) s) + (write-char #\= s) + (write-string (escape (cdr pair)) s)))))) + +(defun uri-parsed-query (uri) + (flet ((unescape (str) + (puri::decode-escaped-encoding str t puri::*reserved-characters*))) + (let ((str (puri:uri-query uri))) + (cond + (str + (let ((pairs '())) + (dolist (s (split-sequence-if (lambda (x) (eql x #\&)) str)) + (destructuring-bind (name value) + (split-sequence-if (lambda (x) (eql x #\=)) s) + (push (cons (unescape name) (unescape value)) pairs))) + (reverse pairs))) + (t + nil))))) + +(defun query-value (name alist) + (cdr (assoc name alist :test #'equal))) + +(defun pathname-to-uri (pathname) + (let ((path + (append (pathname-directory pathname) + (list + (if (specific-or (pathname-type pathname)) + (concatenate 'string + (pathname-name pathname) + "." + (pathname-type pathname)) + (pathname-name pathname)))))) + (if (eq (car path) :relative) + (make-uri :path path) + (make-uri :scheme :file + :host (concatenate 'string + (string-or (host-namestring pathname)) + "+" + (specific-or (pathname-device pathname))) + :path path)))) + +(defun parse-name.type (str) + (if str + (let ((i (position #\. str :from-end t))) + (if i + (values (subseq str 0 i) (subseq str (1+ i))) + (values str nil))) + (values nil nil))) + +(defun uri-to-pathname (uri) + (let ((scheme (puri:uri-scheme uri)) + (path (puri:uri-parsed-path uri))) + (unless (member scheme '(nil :file)) + (error 'parser-error + :format-control "URI scheme ~S not supported" + :format-arguments (list scheme))) + (if (eq (car path) :relative) + (multiple-value-bind (name type) + (parse-name.type (car (last path))) + (make-pathname :directory (butlast path) + :name name + :type type)) + (multiple-value-bind (name type) + (parse-name.type (car (last (cdr path)))) + (destructuring-bind (host device) + (split-sequence-if (lambda (x) (eql x #\+)) + (or (puri:uri-host uri) "+")) + (make-pathname :host (string-or host) + :device (string-or device) + :directory (cons :absolute (butlast (cdr path))) + :name name + :type type)))))) + +(defun parse-xstream (xstream handler &rest args) + (let ((zstream (make-zstream :input-stack (list xstream)))) + (peek-rune xstream) + (with-scratch-pads () + (apply #'p/document zstream handler args)))) + +(defun parse-file (filename handler &rest args) + (with-open-xfile (input filename) (setf (xstream-name input) (make-stream-name :entity-name "main document" :entity-kind :main - :file-name filename)) - (let ((zstream (make-zstream :input-stack (list input)))) - (peek-rune input) - (progn 'time - (p/document zstream handler))))) + :uri (pathname-to-uri filename))) + (apply #'parse-xstream input handler args))) -(defun parse-stream (stream &optional (handler (make-instance 'dom-impl::dom-builder))) - (let* ((xstream - (make-xstream - stream - :name (make-stream-name - :entity-name "main document" - :entity-kind :main - :file-name (or (ignore-errors (pathname *standard-output*)) - *default-pathname-defaults*)) - :initial-speed 1)) - (zstream (make-zstream :input-stack (list xstream)))) - (p/document zstream handler))) +(defun resolve-synonym-stream (stream) + (while (typep stream 'synonym-stream) + (setf stream (symbol-value (synonym-stream-symbol stream)))) + stream) -(defun parse-string (string &optional (handler (make-instance 'dom-impl::dom-builder))) - (let* ((x (string->xstream string)) - (z (make-zstream :input-stack (list x)))) - (p/document z handler))) +(defun safe-stream-sysid (stream) + (if (and (typep (resolve-synonym-stream stream) 'file-stream) + (pathname stream)) + (pathname-to-uri (pathname stream)) + nil)) + +(defun parse-stream (stream handler &rest args) + (let ((xstream + (make-xstream + stream + :name (make-stream-name + :entity-name "main document" + :entity-kind :main + :uri (safe-stream-sysid stream)) + :initial-speed 1))) + (apply #'parse-xstream xstream handler args))) + +(defun parse-dtd-file (filename &optional handler) + (with-open-file (s filename :element-type '(unsigned-byte 8)) + (parse-dtd-stream s handler))) + +(defun parse-dtd-stream (stream &optional handler) + (let ((input (make-xstream stream))) + (setf (xstream-name input) + (make-stream-name + :entity-name "dtd" + :entity-kind :main + :uri (safe-stream-sysid stream))) + (let ((zstream (make-zstream :input-stack (list input))) + (*ctx* (make-context :handler handler)) + (*validate* t) + (*data-behaviour* :DTD)) + (with-scratch-pads () + (ensure-dtd) + (peek-rune input) + (p/ext-subset zstream) + (dtd *ctx*))))) + +(defun parse-string (string handler) + ;; XXX this function mis-handles encoding + (with-scratch-pads () + (let* ((x (string->xstream string)) + (z (make-zstream :input-stack (list x)))) + (p/document z handler)))) (defun string->xstream (string) + ;; XXX encoding is mis-handled by this kind of stream (make-rod-xstream (string-rod string))) +(defclass octet-input-stream (fundamental-binary-input-stream) + ((octets :initarg :octets) + (pos :initform 0))) + +(defmethod close ((stream octet-input-stream) &key abort) + (declare (ignore abort)) + (open-stream-p stream)) + +(defmethod stream-read-byte ((stream octet-input-stream)) + (with-slots (octets pos) stream + (if (>= pos (length octets)) + :eof + (prog1 + (elt octets pos) + (incf pos))))) + +(defmethod stream-read-sequence ((stream octet-input-stream) sequence + &optional (start 0) (end (length sequence))) + (with-slots (octets pos) stream + (let* ((length (min (- end start) (- (length octets) pos))) + (end1 (+ start length)) + (end2 (+ pos length))) + (replace sequence octets :start1 start :end1 end1 :start2 pos :end2 end2) + (setf pos end2) + end1))) + +(defun make-octet-input-stream (octets) + (make-instance 'octet-input-stream :octets octets)) + +(defun parse-octets (octets handler &rest args) + (apply #'parse-stream (make-octet-input-stream octets) handler args)) + ;;;; -#+ALLEGRO +#+allegro (defmacro sp (&body body) `(progn (prof:with-profiling (:type :space) .,body) (prof:show-flat-profile))) -#+ALLEGRO +#+allegro (defmacro tm (&body body) `(progn (prof:with-profiling (:type :time) .,body) @@ -2245,15 +3059,10 @@ '(consume-token zstream)) ) name kind)) -(defun merge-sysid (sysid base) - (merge-pathnames sysid base)) - -(defun open-sysid (sysid) - (open sysid :element-type '(unsigned-byte 8) :direction :input)) - - ;;;; +#| + (defparameter *test-files* '(;;"jclark:xmltest;not-wf;*;*.xml" "jclark:xmltest;valid;*;*.xml" @@ -2324,8 +3133,11 @@ (t (warn "**** negative test failed on ~S." filename))))) +|# + ;;;; +#+(or) ;was ist das? (progn (defmethod dom:create-processing-instruction ((document null) target data) @@ -2354,51 +3166,6 @@ ) -;;; Implementation of a simple but faster DOM. - -(defclass simple-document () - ((children :initform nil :accessor simple-document-children))) - -(defstruct node - parent) - -(defstruct (processing-instruction (:include node)) - target - data) - -(defstruct (text (:include node) - (:constructor make-text-boa (parent data))) - data) - -(defstruct (element (:include node)) - gi - attributes - children) - -(defmethod dom:create-processing-instruction ((document simple-document) target data) - (make-processing-instruction :target target :data data)) - -(defmethod dom:append-child ((node element) child) - (setf (node-parent child) node) - (push child (element-children node))) - -(defmethod dom:append-child ((node simple-document) child) - (push child (simple-document-children node)) - nil) - -(defmethod dom:create-element ((document simple-document) name) - (make-element :gi name)) - -(defmethod dom:set-attribute ((node element) name value) - (push (cons name value) - (element-attributes node))) - -(defmethod dom:create-text-node ((document simple-document) data) - (make-text-boa nil data)) - -(defmethod dom:create-cdata-section ((document simple-document) data) - (make-text-boa nil data)) - #|| (defmacro read-data-until* ((predicate input res res-start res-end) &body body) ;; fast variant -- for now disabled for no apparent reason @@ -2448,8 +3215,8 @@ (setf ,res buf ,res-start p0 ,res-end rptr) (return) ) (t - ;; we continue - (setf rptr (%+ rptr 1))) )) + we continue + (sf rptr (%+ rptr 1))) )) ,@body )) ||# @@ -2496,7 +3263,7 @@ (defun read-cdata (input) (read-data-until* ((lambda (rune) (declare (type rune rune)) - (or (%= rune #/<) (%= rune #/&))) + (or (%rune= rune #/<) (%rune= rune #/&))) input source start end) (locally @@ -2516,15 +3283,13 @@ res)))) (defun internal-entity-expansion (name) - (let ((e (assoc (list :general name) *entities* :test #'equal))) - (unless e + (let ((def (get-entity-definition name :general (dtd *ctx*)))) + (unless def (error "Entity '~A' is not defined." (rod-string name))) - (unless (eq :internal (cadr e)) - (error "Entity '~A' is not an internal entity.")) - (or (cadddr e) - (car - (setf (cdddr e) - (cons (find-internal-entity-expansion name) nil)))))) + (unless (typep def 'internal-entdef) + (error "Entity '~A' is not an internal entity." name)) + (or (entdef-expansion def) + (setf (entdef-expansion def) (find-internal-entity-expansion name))))) (defun find-internal-entity-expansion (name) (let ((zinput (make-zstream))) @@ -2539,7 +3304,7 @@ (setf c (peek-rune input)) (cond ((rune= c #/#) (let ((c (read-numeric-entity input))) - (%put-rune c collect))) + (%put-unicode-char c collect))) (t (unless (name-start-rune-p (peek-rune input)) (error "Expecting name after &.")) @@ -2568,26 +3333,29 @@ (lambda (zinput) (muffle (car (zstream-input-stack zinput))))) )))) -#+(or) ;; Do we need this? Not called anywhere -(defun ff (name) - (let ((input (make-zstream))) - (let ((*data-behaviour* :doc) - (*document* (make-instance 'simple-document))) - (recurse-on-entity - input name :general - (lambda (input) - (prog1 - (ecase (entity-source-kind name :general) - (:internal (p/content input)) - (:external (p/ext-parsed-ent input))) - (unless (eq (peek-token input) :eof) - (error "Trailing garbage. - ~S" (peek-token input))))))))) +(defun resolve-entity (name handler dtd) + (let ((*validate* nil)) + (if (get-entity-definition name :general dtd) + (let* ((*ctx* (make-context :handler handler :dtd dtd)) + (input (make-zstream)) + (*data-behaviour* :DOC)) + (with-scratch-pads () + (recurse-on-entity + input name :general + (lambda (input) + (prog1 + (etypecase (checked-get-entdef name :general) + (internal-entdef (p/content input)) + (external-entdef (p/ext-parsed-ent input))) + (unless (eq (peek-token input) :eof) + (error "Trailing garbage. - ~S" (peek-token input)))))))) + nil))) (defun read-att-value-2 (input) (let ((delim (read-rune input))) (unless (member delim '(#/\" #/\') :test #'eql) (error "Bad attribute value delimiter ~S, must be either #\\\" or #\\\'." - (if (< delim char-code-limit) (code-char delim) delim))) + (rune-char delim delim))) (with-rune-collector-4 (collect) (loop (let ((c (read-rune input))) @@ -2598,9 +3366,9 @@ ((rune= #/& c) (multiple-value-bind (kind sem) (read-entity-ref input) (ecase kind - (:numeric - (%put-rune sem collect)) - (:named + (:NUMERIC + (%put-unicode-char sem collect)) + (:NAMED (let* ((exp (internal-entity-expansion sem)) (n (length exp))) (declare (type (simple-array rune (*)) exp)) @@ -2608,7 +3376,7 @@ ((%= i n)) (collect (%rune exp i)))))))) ((space-rune-p c) - (collect #x20)) + (collect #/u+0020)) (t (collect c)))))))) @@ -2616,7 +3384,6 @@ ;;; Namespace stuff -(defvar *namespace-bindings* ()) (defvar *default-namespace-bindings* '((#"" . nil) (#"xmlns" . #"http://www.w3.org/2000/xmlns/") @@ -2630,7 +3397,7 @@ (notany #'(lambda (rune) (rune= #/: rune)) name))) (defun split-qname (qname) - (declare (type glisp:simple-rod qname)) + (declare (type runes:simple-rod qname)) (let ((pos (position #/: qname))) (if pos (let ((prefix (subseq qname 0 pos)) @@ -2642,7 +3409,7 @@ (defun decode-qname (qname) "decode-qname name => namespace-uri, prefix, local-name" - (declare (type glisp:simple-rod qname)) + (declare (type runes:simple-rod qname)) (multiple-value-bind (prefix local-name) (split-qname qname) (let ((uri (find-namespace-binding prefix))) (if uri @@ -2651,7 +3418,7 @@ (defun find-namespace-binding (prefix) - (cdr (or (assoc prefix *namespace-bindings* :test #'rod=) + (cdr (or (assoc (or prefix #"") (namespace-bindings *ctx*) :test #'rod=) (error "Undeclared namespace prefix: ~A" (rod-string prefix))))) ;; FIXME: Should probably be refactored by adding :start and :end to rod=/rod-equal @@ -2669,13 +3436,30 @@ (subseq attrname 6) nil)) -(defun find-namespace-declarations (attr-alist) - (mapcar #'(lambda (attr) - (cons (attrname->prefix (car attr)) (cdr attr))) - (remove-if-not #'xmlns-attr-p attr-alist :key #'car))) +(defun find-namespace-declarations (element attr-alist) + (let ((result + (mapcar #'(lambda (attr) + (cons (attrname->prefix (car attr)) (cdr attr))) + (remove-if-not #'xmlns-attr-p attr-alist :key #'car)))) + ;; Argh! PROCESS-ATTRIBUTES needs to know the attributes' namespaces + ;; already. But namespace declarations can be done using default values + ;; in the DTD. So we need to handle defaulting of attribute values twice, + ;; once for xmlns attributes, then for all others. (I really hope I'm + ;; wrong on this one, but I don't see how.) + (let ((e (find-element element (dtd *ctx*)))) + (when e + (dolist (ad (elmdef-attributes e)) ;handle default values + (let* ((name (attdef-name ad)) + (prefix (attrname->prefix name))) + (when (and (xmlns-attr-p name) + (not (member prefix result :key #'car :test #'rod=)) + (listp (attdef-default ad)) ;:DEFAULT or :FIXED + ) + (push (cons prefix (cadr (attdef-default ad))) result)))))) + result)) -(defun declare-namespaces (attr-alist) - (let ((ns-decls (find-namespace-declarations attr-alist))) +(defun declare-namespaces (element attr-alist) + (let ((ns-decls (find-namespace-declarations element attr-alist))) (dolist (ns-decl ns-decls ) ;; check some namespace validity constraints ;; FIXME: Would be nice to add "this is insane, go ahead" restarts @@ -2707,32 +3491,27 @@ (error "Only the default namespace (the one without a prefix) may ~ be bound to an empty namespace URI, thus undeclaring it.")) (t - (push (cons prefix uri) *namespace-bindings*) - (sax:start-prefix-mapping *handler* (car ns-decl) (cdr ns-decl)))))) + (push (cons prefix uri) (namespace-bindings *ctx*)) + (sax:start-prefix-mapping (handler *ctx*) (car ns-decl) (cdr ns-decl)))))) ns-decls)) (defun undeclare-namespaces (ns-decls) (dolist (ns-decl ns-decls) - (setq *namespace-bindings* (delete ns-decl *namespace-bindings*)) - (sax:end-prefix-mapping *handler* (car ns-decl)))) - -(defstruct attribute - namespace-uri - local-name - qname - value) + (setf (namespace-bindings *ctx*) (delete ns-decl (namespace-bindings *ctx*))) + (sax:end-prefix-mapping (handler *ctx*) (car ns-decl)))) (defun build-attribute-list-no-ns (attr-alist) - (mapcar #'(lambda (pair) (make-attribute :qname (car pair) :value (cdr pair))) + (mapcar #'(lambda (pair) + (sax:make-attribute :qname (car pair) + :value (cdr pair) + :specified-p t)) attr-alist)) ;; FIXME: Use a non-braindead way to enforce attribute uniqueness (defun build-attribute-list-ns (attr-alist) (let (attributes) (dolist (pair attr-alist) - (when (or (not (xmlns-attr-p (car pair))) - sax:*include-xmlns-attributes*) - (push (build-attribute (car pair) (cdr pair)) attributes))) + (push (build-attribute (car pair) (cdr pair) t) attributes)) ;; 5.3 Uniqueness of Attributes ;; In XML documents conforming to [the xmlns] specification, no @@ -2746,30 +3525,33 @@ (do ((sublist attributes (cdr sublist))) ((null sublist) attributes) (let ((attr-1 (car sublist))) - (when (and (attribute-namespace-uri attr-1) + (when (and (sax:attribute-namespace-uri attr-1) (find-if #'(lambda (attr-2) - (and (rod= (attribute-namespace-uri attr-1) - (attribute-namespace-uri attr-2)) - (rod= (attribute-local-name attr-1) - (attribute-local-name attr-2)))) + (and (rod= (sax:attribute-namespace-uri attr-1) + (sax:attribute-namespace-uri attr-2)) + (rod= (sax:attribute-local-name attr-1) + (sax:attribute-local-name attr-2)))) (cdr sublist))) (error "Multiple definitions of attribute ~S in namespace ~S." - (mu (attribute-local-name attr-1)) - (mu (attribute-namespace-uri attr-1)))))))) + (mu (sax:attribute-local-name attr-1)) + (mu (sax:attribute-namespace-uri attr-1)))))))) -(defun build-attribute (name value) +(defun build-attribute (name value specified-p) (multiple-value-bind (prefix local-name) (split-qname name) (declare (ignorable local-name)) (if (or (not prefix) ;; default namespace doesn't apply to attributes (and (rod= #"xmlns" prefix) (not sax:*use-xmlns-namespace*))) - (make-attribute :qname name :value value) + (sax:make-attribute :qname name + :value value + :specified-p specified-p) (multiple-value-bind (uri prefix local-name) (decode-qname name) (declare (ignore prefix)) - (make-attribute :qname name - :value value - :namespace-uri uri - :local-name local-name))))) + (sax:make-attribute :qname name + :value value + :namespace-uri uri + :local-name local-name + :specified-p specified-p))))) ;;; Faster constructors @@ -2822,3 +3604,44 @@ (defun xml-parse (system-id &key document standalone-p) ) ||# + +;;;;;;;;;;;;;;;;; + +;;; SAX validation handler + +(defclass validator () + ((context :initarg :context :accessor context) + (cdatap :initform nil :accessor cdatap))) + +(defun make-validator (dtd root) + (make-instance 'validator + :context (make-context + :handler nil + :dtd dtd + :model-stack (list (make-root-model root))))) + +(macrolet ((with-context ((validator) &body body) + `(let ((*ctx* (context ,validator)) + (*validate* t)) + (with-scratch-pads () ;nicht schoen + ,@body)))) + (defmethod sax:start-element ((handler validator) uri lname qname attributes) + uri lname + (with-context (handler) + (validate-start-element *ctx* qname) + (process-attributes *ctx* qname attributes))) + + (defmethod sax:start-cdata ((handler validator)) + (setf (cdatap handler) t)) + + (defmethod sax:characters ((handler validator) data) + (with-context (handler) + (validate-characters *ctx* (if (cdatap handler) #"hack" data)))) + + (defmethod sax:end-cdata ((handler validator)) + (setf (cdatap handler) nil)) + + (defmethod sax:end-element ((handler validator) uri lname qname) + uri lname + (with-context (handler) + (validate-end-element *ctx* qname))))