diff --git a/doc/dom.html b/doc/dom.html
index 78fead7..2174f0e 100644
--- a/doc/dom.html
+++ b/doc/dom.html
@@ -22,10 +22,33 @@
To parse an XML document into a DOM tree, use the SAX parser with a
DOM builder as the SAX handler. Example:
- (cxml:parse-file "test.xml" (dom:make-dom-builder))
+ (cxml:parse-file "test.xml" (cxml-dom:make-dom-builder))
-
Function DOM:MAKE-DOM-BUILDER ()
+ Function CXML-DOM:MAKE-DOM-BUILDER ()
Create a SAX handler which builds a DOM document.
+
+
+ This functions returns a DOM builder that will work with the default
+ configuration of the SAX parser and is guaranteed to use
+ characters/strings instead of runes/rods, if that makes a
+ difference on the Lisp in question.
+
+
+ This is the same as rune-dom:make-dom-builder on Lisps
+ with Unicode support, and the same as
+ utf8-dom:make-dom-builder otherwise.
+
+
+
+
Function RUNE-DOM:MAKE-DOM-BUILDER ()
+ Create a SAX handler which builds a DOM document using runes and rods.
+
+
+
+
Function UTF8-DOM:MAKE-DOM-BUILDER ()
+ (Only on Lisps without Unicode support:)
+ Create a SAX handler which builds a DOM document using
+ UTF-8-encoded strings.
@@ -63,6 +86,12 @@
include-default-values -- include attribute nodes with nil
dom:specified.
+
+ recode -- (ignored on Lisps with Unicode support.) If
+ true, recode UTF-8 strings to rods. Defaults to true if used
+ with a UTF-8 DOM document. It can be set to false manually to
+ suppress recoding in this case.
+
diff --git a/doc/quickstart.html b/doc/quickstart.html
index 552675c..513076b 100644
--- a/doc/quickstart.html
+++ b/doc/quickstart.html
@@ -33,7 +33,7 @@
Parsing and Validating
Serialization
Miscellaneous SAX handlers
- Dealing with Rods
+ Recoders
Caching of DTD Objects
XML Catalogs
SAX Interface
@@ -67,7 +67,7 @@
Parse example.xml into a DOM tree (read
more):
- * (cxml:parse-file "example.xml" (dom:make-dom-builder))
+ * (cxml:parse-file "example.xml" (cxml-dom:make-dom-builder))
#<DOM-IMPL::DOCUMENT @ #x72206172>
;; save result for later:
* (defparameter *example* *)
diff --git a/doc/using.html b/doc/using.html
index 14c989b..67c033e 100644
--- a/doc/using.html
+++ b/doc/using.html
@@ -69,6 +69,13 @@
disallow-internal-subset -- a boolean. If true, signal
an error if the document contains an internal subset.
+
+ recode -- a boolean. (Ignored on Lisps with Unicode
+ support.) Recode rods to UTF-8 strings. Defaults to true.
+ Make sure to use utf8-dom:make-dom-builder if this
+ option is enabled and rune-dom:make-dom-builder
+ otherwise.
+
@@ -258,7 +265,7 @@
ignored.
Example:
- (let ((d (parse-file "~/test.xml" (dom:make-dom-builder)))
+ (let ((d (parse-file "~/test.xml" (cxml-dom:make-dom-builder)))
(x (parse-dtd-file "~/test.dtd")))
(dom:map-document (cxml:make-validator x #"foo") d))
@@ -287,40 +294,15 @@
- Dealing with Rods
+ Recoders
- As explained above, the XML parser handles character encoding and
- uses 16bit strings internally. Instead of using characters and strings
- it uses runes and rods. This is seen as a
- feature, but can be inconvenient.
+ Recoders are a mechanism used by CXML internally on Lisp implementations
+ without Unicode support to recode UTF-16 vectors (rods) of
+ integers (runes) into UTF-8 strings.
-
- -
- If your Lisp supports 16 bit unicode strings, use feature
- :rune-is-character and forget about runes and rods.
- CXML will use ordinary Lisp characters and strings both
- internally and externally.
-
- -
- If your Lisp does not support such strings and your application
- needs Unicode support, use functions defined in the
- runes package instead of ordinary string operators.
-
- -
- If your Lisp does not support such strings and your application
- does not need Unicode support anyway, it will probably be more
- convenient to let CXML convert rods into strings automatically.
- To do that, use cxml:make-recoder to chain a special
- sax handler between the parser and your application handler.
- The recoder translates all rods using an application defined
- function, which defaults to runes:rod-string. Although
- the actual XML parser still uses rods internally, you SAX
- handler will only see ordinary Lisp strings.
-
-
- Note that the recoder approach does not work with the DOM
- builder, since DOM is specified to use UTF-16.
+ User code does not usually need to deal with recoders in current
+ versions of CXML.
Function CXML:MAKE-RECODER (chained-handler recoder-fn)
@@ -328,16 +310,6 @@
chained-handler after converting all strings and rods
using recoder-fn, a function of one argument.
-
- Example. In a Lisp which ordinarily would use octet vector rods:
-
- CL-USER(14): (cxml:parse-string "<test/>" (cxml-xmls:make-xmls-builder))
-(#(116 101 115 116) NIL)
-
- Use a SAX recoder to get strings instead::
-
- CL-USER(17): (parse-string "<test/>" (cxml:make-recoder (cxml-xmls:make-xmls-builder) 'runes:rod-string))
-("test" NIL)
Caching of DTD Objects
diff --git a/dom/dom-builder.lisp b/dom/dom-builder.lisp
index 4d29618..30cec4f 100644
--- a/dom/dom-builder.lisp
+++ b/dom/dom-builder.lisp
@@ -83,7 +83,7 @@
(defmethod sax:start-element
((handler dom-builder) namespace-uri local-name qname attributes)
- (check-type qname rod)
+ (check-type qname rod) ;catch recoder/builder mismatch
(with-slots (document element-stack) handler
(let* ((nsp sax:*namespace-processing*)
(element (make-instance 'element
diff --git a/dom/dom-impl.lisp b/dom/dom-impl.lisp
index 506cbc4..0e0a512 100644
--- a/dom/dom-impl.lisp
+++ b/dom/dom-impl.lisp
@@ -1228,7 +1228,9 @@
(when resolver
(setf (document handler) owner)
(push instance (element-stack handler))
- (funcall resolver (dom:name instance) handler)))
+ #+cxml-system::utf8dom-file
+ (setf handler (cxml:make-recoder handler #'cxml:rod-to-utf8-string))
+ (funcall resolver (real-rod (dom:name instance)) handler)))
(labels ((walk (n)
(setf (slot-value n 'read-only-p) t)
(when (dom:element-p n)
diff --git a/dom/dom-sax.lisp b/dom/dom-sax.lisp
index c7d8dae..544d328 100644
--- a/dom/dom-sax.lisp
+++ b/dom/dom-sax.lisp
@@ -12,7 +12,11 @@
(handler document
&key (include-xmlns-attributes sax:*include-xmlns-attributes*)
include-doctype
- include-default-values)
+ include-default-values
+ (recode (typep document 'utf8-dom::node)))
+ #+rune-is-integer
+ (when recode
+ (setf handler (make-recoder handler #'utf8-string-to-rod)))
(sax:start-document handler)
(when include-doctype
(let ((doctype (dom:doctype document)))
diff --git a/test/domtest.lisp b/test/domtest.lisp
index 9047ecf..ba99ff8 100644
--- a/test/domtest.lisp
+++ b/test/domtest.lisp
@@ -175,7 +175,8 @@
(defun read-members (&optional (directory *directory*))
(let* ((pathname (merge-pathnames "build/dom2-interfaces.xml" directory))
(builder (rune-dom:make-dom-builder))
- (library (dom:document-element (cxml:parse-file pathname builder)))
+ (library (dom:document-element
+ (cxml:parse-file pathname builder :recode nil)))
(methods '())
(fields '()))
(do-child-elements (interface library :name "interface")
@@ -584,7 +585,8 @@
(catch 'give-up
(let* ((builder (rune-dom:make-dom-builder))
(cxml::*validate* nil) ;dom1.dtd is buggy
- (test (dom:document-element (cxml:parse-file pathname builder)))
+ (test (dom:document-element
+ (cxml:parse-file pathname builder :recode nil)))
title
(bindings '())
(code '()))
@@ -631,7 +633,8 @@
(setf name (runes:rod-string name))
(cxml:parse-file
(make-pathname :name name :type "xml" :defaults *files-directory*)
- (rune-dom:make-dom-builder)))
+ (rune-dom:make-dom-builder)
+ :recode nil))
(defparameter *bad-tests*
'("hc_elementnormalize2.xml"
@@ -656,7 +659,7 @@
(let* ((all-tests (merge-pathnames "alltests.xml" test-directory))
(builder (rune-dom:make-dom-builder))
(suite (dom:document-element
- (cxml:parse-file all-tests builder)))
+ (cxml:parse-file all-tests builder :recode nil)))
(*files-directory*
(merge-pathnames "files/" test-directory)))
(do-child-elements (member suite)
diff --git a/test/utf8domtest.diff b/test/utf8domtest.diff
new file mode 100644
index 0000000..cef62ae
--- /dev/null
+++ b/test/utf8domtest.diff
@@ -0,0 +1,102 @@
+Index: test/domtest.lisp
+===================================================================
+RCS file: /project/cxml/cvsroot/cxml/test/domtest.lisp,v
+retrieving revision 1.13
+diff -u -r1.13 domtest.lisp
+--- test/domtest.lisp 27 Dec 2005 00:21:37 -0000 1.13
++++ test/domtest.lisp 27 Dec 2005 00:46:00 -0000
+@@ -137,21 +137,22 @@
+ ((digit-char-p (runes:rune-char (elt str 0)))
+ (parse-integer (runes:rod-string str)))
+ ((runes:rune= (elt str 0) #.(runes:char-rune #\"))
+- (let ((v (make-array 1 :fill-pointer 0 :adjustable t)))
+- (for* ((i = 1 :then (1+ i))
+- (c = (elt str i))
+- :until (runes:rune= c #.(runes:char-rune #\")))
+- (if (runes:rune= c #.(runes:char-rune #\\))
+- (let ((frob
+- (progn
+- (incf i)
+- (elt str i))))
+- (ecase frob
+- ;; ...
+- (#/n (vector-push-extend #/newline v (length v)))
+- ((#/\\ #/\") (vector-push-extend #/\\ v (length v)))))
+- (vector-push-extend c v (length v))))
+- (coerce v 'runes::simple-rod)))
++ (utf8-dom::%rod
++ (let ((v (make-array 1 :fill-pointer 0 :adjustable t)))
++ (for* ((i = 1 :then (1+ i))
++ (c = (elt str i))
++ :until (runes:rune= c #.(runes:char-rune #\")))
++ (if (runes:rune= c #.(runes:char-rune #\\))
++ (let ((frob
++ (progn
++ (incf i)
++ (elt str i))))
++ (ecase frob
++ ;; ...
++ (#/n (vector-push-extend #/newline v (length v)))
++ ((#/\\ #/\") (vector-push-extend #/\\ v (length v)))))
++ (vector-push-extend c v (length v))))
++ (coerce v 'runes::simple-rod))))
+ (t
+ (%intern str))))
+
+@@ -368,7 +369,7 @@
+
+ (defun translate-implementation (elt)
+ (with-attributes (|var|) elt
+- (maybe-setf (%intern |var|) `'rune-dom:implementation)))
++ (maybe-setf (%intern |var|) `'utf8-dom:implementation)))
+
+ (defun translate-length (load)
+ ;; XXX Soweit ich sehe unterscheiden die Tests nicht zwischen
+@@ -406,7 +407,7 @@
+ (if (nullify |obj|)
+ (translate-member element)
+ (maybe-setf (%intern |var|)
+- `(dom:has-feature 'rune-dom:implementation
++ `(dom:has-feature 'utf8-dom:implementation
+ ,(parse-java-literal |feature|)
+ ,(parse-java-literal |version|))))))
+
+@@ -493,9 +494,9 @@
+ (return
+ `(block assert-domexception
+ (handler-bind
+- ((rune-dom::dom-exception
++ ((utf8-dom::dom-exception
+ (lambda (c)
+- (when (eq (rune-dom::dom-exception-key c)
++ (when (eq (utf8-dom::dom-exception-key c)
+ ,(intern (tag-name c) :keyword))
+ (return-from assert-domexception)))))
+ ,@(translate-body c)
+@@ -506,7 +507,7 @@
+ ,@(map-child-elements
+ 'list
+ (lambda (exception)
+- `(when (eq (rune-dom::dom-exception-key c)
++ `(when (eq (utf8-dom::dom-exception-key c)
+ ,(intern (runes:rod-string (dom:get-attribute exception "code"))
+ :keyword))
+ ,@(translate-body exception)
+@@ -516,7 +517,7 @@
+ (defun translate-try (element)
+ `(block try
+ (handler-bind
+- ((rune-dom::dom-exception
++ ((utf8-dom::dom-exception
+ ,(translate-catch
+ (do-child-elements (c element :name "catch") (return c))
+ '(return-from try))))
+@@ -631,7 +632,7 @@
+ (setf name (runes:rod-string name))
+ (cxml:parse-file
+ (make-pathname :name name :type "xml" :defaults *files-directory*)
+- (rune-dom:make-dom-builder)))
++ (cxml:make-recoder (utf8-dom:make-dom-builder) 'cxml:rod-to-utf8-string)))
+
+ (defparameter *bad-tests*
+ '("hc_elementnormalize2.xml"
diff --git a/test/xmlconf.lisp b/test/xmlconf.lisp
index 9ab9818..ed1bcf4 100644
--- a/test/xmlconf.lisp
+++ b/test/xmlconf.lisp
@@ -77,7 +77,7 @@
(defun run-all-tests (directory)
(let* ((pathname (merge-pathnames "xmlconf.xml" directory))
(builder (rune-dom:make-dom-builder))
- (xmlconf (cxml:parse-file pathname builder))
+ (xmlconf (cxml:parse-file pathname builder :recode nil))
(ntried 0)
(nfailed 0)
(nskipped 0)
@@ -125,6 +125,7 @@
(let ((document (apply #'cxml:parse-file
pathname
(rune-dom:make-dom-builder)
+ :recode nil
args)))
(cond
((null output)
@@ -161,7 +162,10 @@
(handler-case
(progn
(format t " [validating:]")
- (cxml:parse-file pathname (rune-dom:make-dom-builder) :validate t)
+ (cxml:parse-file pathname
+ (rune-dom:make-dom-builder)
+ :recode nil
+ :validate t)
(error "validity error not detected")
nil)
(cxml:validity-error ()
@@ -174,7 +178,10 @@
(handler-case
(progn
(format t " [not validating:]")
- (cxml:parse-file pathname (rune-dom:make-dom-builder) :validate nil)
+ (cxml:parse-file pathname
+ (rune-dom:make-dom-builder)
+ :recode nil
+ :validate nil)
(error "well-formedness violation not detected")
nil)
(cxml:well-formedness-violation ()
@@ -183,7 +190,10 @@
(handler-case
(progn
(format t " [validating:]")
- (cxml:parse-file pathname (rune-dom:make-dom-builder) :validate t)
+ (cxml:parse-file pathname
+ (rune-dom:make-dom-builder)
+ :recode nil
+ :validate t)
(error "well-formedness violation not detected")
nil)
(cxml:well-formedness-violation ()
diff --git a/xml/package.lisp b/xml/package.lisp
index cc4b265..8797981 100644
--- a/xml/package.lisp
+++ b/xml/package.lisp
@@ -75,4 +75,6 @@
#:make-recoder
#:sax-proxy
#:proxy-chained-handler
- #:make-namespace-normalizer))
+ #:make-namespace-normalizer
+ #:rod-to-utf8-string
+ #:utf8-string-to-rod))
diff --git a/xml/recoder.lisp b/xml/recoder.lisp
index 9f96792..0816377 100644
--- a/xml/recoder.lisp
+++ b/xml/recoder.lisp
@@ -74,6 +74,9 @@
(%string public-id)
(%string system-id))
+ (defwrapper sax:start-internal-subset ())
+ (defwrapper sax:end-internal-subset ())
+
(defwrapper sax:end-dtd ())
(defwrapper sax:unparsed-entity-declaration
diff --git a/xml/xml-parse.lisp b/xml/xml-parse.lisp
index a9e17eb..3d5da79 100644
--- a/xml/xml-parse.lisp
+++ b/xml/xml-parse.lisp
@@ -2552,13 +2552,18 @@
(defun p/document
(input handler
- &key validate dtd root entity-resolver disallow-internal-subset)
+ &key validate dtd root entity-resolver disallow-internal-subset
+ (recode t))
;; check types of user-supplied arguments for better error messages:
(check-type validate boolean)
+ (check-type recode boolean)
(check-type dtd (or null extid))
(check-type root (or null rod))
(check-type entity-resolver (or null function symbol))
(check-type disallow-internal-subset boolean)
+ #+rune-is-integer
+ (when recode
+ (setf handler (make-recoder handler #'rod-to-utf8-string)))
(let ((*ctx*
(make-context :handler handler
:main-zstream input