From 487338d20ec45cb87f5ad0c99aadd779b6cdab01 Mon Sep 17 00:00:00 2001 From: dlichteblau Date: Sun, 27 Nov 2005 18:20:08 +0000 Subject: [PATCH] Hmm. Auf U+ffff und dergleichen wurde durch data-rune-p geprueft, das aber eben nicht ueberall benutzt wurde. Ich habe die Pruefung jetzt mal direkt im Decoding eingebaut. -xmltest/not-wf/sa/171.xml [not validating:] FAILED: - well-formedness violation not detected -[ - Character FFFF is not legal anywhere in an XML document. ] --- XMLCONF | 7 ++----- runes/encodings.lisp | 14 +++++++++++--- xml/xml-parse.lisp | 28 ++++++++++++---------------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/XMLCONF b/XMLCONF index 3b4976d..aa72bf1 100644 --- a/XMLCONF +++ b/XMLCONF @@ -168,10 +168,7 @@ xmltest/not-wf/sa/167.xml [not validating:] not-wf [validating:] invalid xmltest/not-wf/sa/168.xml [not validating:] not-wf [validating:] invalid xmltest/not-wf/sa/169.xml [not validating:] not-wf [validating:] invalid xmltest/not-wf/sa/170.xml [not validating:] not-wf [validating:] invalid -xmltest/not-wf/sa/171.xml [not validating:] FAILED: - well-formedness violation not detected -[ - Character FFFF is not legal anywhere in an XML document. ] +xmltest/not-wf/sa/171.xml [not validating:] not-wf [validating:] not-wf xmltest/not-wf/sa/172.xml [not validating:] not-wf [validating:] not-wf xmltest/not-wf/sa/173.xml [not validating:] not-wf [validating:] not-wf xmltest/not-wf/sa/174.xml [not validating:] not-wf [validating:] invalid @@ -1856,4 +1853,4 @@ ibm/valid/P86/ibm86v01.xml [not validating:] input [validating:] input ibm/valid/P87/ibm87v01.xml [not validating:] input [validating:] input ibm/valid/P88/ibm88v01.xml [not validating:] input [validating:] input ibm/valid/P89/ibm89v01.xml [not validating:] input [validating:] input -14/1786 tests failed; 376 tests were skipped \ No newline at end of file +13/1786 tests failed; 376 tests were skipped \ No newline at end of file diff --git a/runes/encodings.lisp b/runes/encodings.lisp index bdd8e99..ce8197a 100644 --- a/runes/encodings.lisp +++ b/runes/encodings.lisp @@ -123,7 +123,10 @@ ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere ;; Haelfte fehlt! - (setf (aref out wptr) (logior (ash hi 8) lo)) + (let ((x (logior (ash hi 8) lo))) + (when (or (eql x #xFFFE) (eql x #/U+FFFF)) + (xerror "not a valid code point: #x~X" x)) + (setf (aref out wptr) x)) (setf wptr (%+ 1 wptr)))) (values wptr rptr))) @@ -143,7 +146,10 @@ ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere ;; Haelfte fehlt! - (setf (aref out wptr) (logior (ash hi 8) lo)) + (let ((x (logior (ash hi 8) lo))) + (when (or (eql x #xFFFE) (eql x #/U+FFFF)) + (xerror "not a valid code point: #x~X" x)) + (setf (aref out wptr) x)) (setf wptr (%+ 1 wptr)))) (values wptr rptr))) @@ -161,7 +167,9 @@ (when (or (<= #xD800 x #xDBFF) (<= #xDC00 x #xDFFF)) (xerror "surrogate encoded in UTF-8: #x~X." x)) - (cond ((%> x #x10FFFF) + (cond ((or (%> x #x10FFFF) + (eql x #xFFFE) + (eql x #/U+FFFF)) (xerror "not a valid code point: #x~X" x)) ((%> x #xFFFF) (setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10)) diff --git a/xml/xml-parse.lisp b/xml/xml-parse.lisp index 6c68400..01f9f8e 100644 --- a/xml/xml-parse.lisp +++ b/xml/xml-parse.lisp @@ -1413,15 +1413,18 @@ (definline data-rune-p (rune) ;; any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. + ;; + ;; FIXME: das halte ich fuer verkehrt. Surrogates als Unicode-Zeichen + ;; sind verboten. Das liegt hier aber nicht vor, denn wir arbeiten + ;; ja tatsaechlich mit UTF-16. Verboten ist es nur, wenn wir ein + ;; solches Zeichen beim Dekodieren finden, das wird aber eben + ;; in encodings.lisp bereits geprueft. --david (let ((c (rune-code rune))) (or (= c #x9) (= c #xA) (= c #xD) (<= #x20 c #xD7FF) (<= #xE000 c #xFFFD) - ;; (<= #xD800 c #xDBFF) - (<= #xDC00 c #xDFFF) - ;; - ))) + (<= #xDC00 c #xDFFF)))) (defun read-att-value (zinput input mode &optional canon-space-p (delim nil)) (with-rune-collector-2 (collect) @@ -2686,7 +2689,7 @@ ((:ENTITY-REF) (let ((name sem)) (consume-token input) - (append ;; nil #+(OR) + (append (recurse-on-entity input name :general (lambda (input) (prog1 @@ -3190,17 +3193,10 @@ (defun read-cdata (input) (read-data-until* ((lambda (rune) (declare (type rune rune)) - (when (or (and (%rune< rune #/U+0020) - (not (or (%rune= rune #/U+0009) - (%rune= rune #/U+000a) - (%rune= rune #/U+000d)))) - ;; Surrogates nicht ausschliessen, denn wir - ;; haben ja UTF-16 Runen. - #+(or) - (and (%rune<= #/U+D800 rune) - (%rune< rune #/U+E000)) - (%rune= rune #/U+FFFE) - (%rune= rune #/U+FFFF)) + (when (and (%rune< rune #/U+0020) + (not (or (%rune= rune #/U+0009) + (%rune= rune #/U+000a) + (%rune= rune #/U+000d)))) (wf-error "code point invalid: ~A" rune)) (or (%rune= rune #/<) (%rune= rune #/&))) input