Hmm. Auf U+ffff und dergleichen wurde durch data-rune-p geprueft, das aber eben nicht ueberall benutzt wurde. Ich habe die Pruefung jetzt mal direkt im Decoding eingebaut.

-xmltest/not-wf/sa/171.xml [not validating:] FAILED:
-  well-formedness violation not detected
-[
-    Character FFFF is not legal anywhere in an XML document. ]
2005-11-27 18:20:08 +00:00
parent b95f1ef093
commit 487338d20e
3 changed files with 25 additions and 24 deletions
--- a/7
+++ b/7
@@ -168,10 +168,7 @@ xmltest/not-wf/sa/167.xml [not validating:] not-wf [validating:] invalid
 xmltest/not-wf/sa/168.xml [not validating:] not-wf [validating:] invalid
 xmltest/not-wf/sa/169.xml [not validating:] not-wf [validating:] invalid
 xmltest/not-wf/sa/170.xml [not validating:] not-wf [validating:] invalid
-xmltest/not-wf/sa/171.xml [not validating:] FAILED:
-  well-formedness violation not detected
-[
-    Character FFFF is not legal anywhere in an XML document. ]
+xmltest/not-wf/sa/171.xml [not validating:] not-wf [validating:] not-wf
 xmltest/not-wf/sa/172.xml [not validating:] not-wf [validating:] not-wf
 xmltest/not-wf/sa/173.xml [not validating:] not-wf [validating:] not-wf
 xmltest/not-wf/sa/174.xml [not validating:] not-wf [validating:] invalid
@@ -1856,4 +1853,4 @@ ibm/valid/P86/ibm86v01.xml [not validating:] input [validating:] input
 ibm/valid/P87/ibm87v01.xml [not validating:] input [validating:] input
 ibm/valid/P88/ibm88v01.xml [not validating:] input [validating:] input
 ibm/valid/P89/ibm89v01.xml [not validating:] input [validating:] input
-14/1786 tests failed; 376 tests were skipped
+13/1786 tests failed; 376 tests were skipped
--- a/runes/encodings.lisp
+++ b/runes/encodings.lisp
@@ -123,7 +123,10 @@
 	;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
 	;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
 	;; Haelfte fehlt!
-        (setf (aref out wptr) (logior (ash hi 8) lo))
+        (let ((x (logior (ash hi 8) lo)))
+	  (when (or (eql x #xFFFE) (eql x #/U+FFFF))
+	    (xerror "not a valid code point: #x~X" x))
+	  (setf (aref out wptr) x))
        (setf wptr (%+ 1 wptr))))
    (values wptr rptr)))

@@ -143,7 +146,10 @@
 	;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
 	;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
 	;; Haelfte fehlt!
-        (setf (aref out wptr) (logior (ash hi 8) lo))
+        (let ((x (logior (ash hi 8) lo)))
+	  (when (or (eql x #xFFFE) (eql x #/U+FFFF))
+	    (xerror "not a valid code point: #x~X" x))
+	  (setf (aref out wptr) x))
        (setf wptr (%+ 1 wptr))))
    (values wptr rptr)))

@@ -161,7 +167,9 @@
                     (when (or (<= #xD800 x #xDBFF)
 			       (<= #xDC00 x #xDFFF))
 		       (xerror "surrogate encoded in UTF-8: #x~X." x))
-                     (cond ((%> x #x10FFFF)
+                     (cond ((or (%> x #x10FFFF)
+				(eql x #xFFFE)
+				(eql x #/U+FFFF))
                            (xerror "not a valid code point: #x~X" x))
 		           ((%> x #xFFFF)
                            (setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
--- a/xml/xml-parse.lisp
+++ b/xml/xml-parse.lisp
@@ -1413,15 +1413,18 @@

 (definline data-rune-p (rune)
  ;; any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
+  ;;
+  ;; FIXME: das halte ich fuer verkehrt.  Surrogates als Unicode-Zeichen
+  ;; sind verboten.  Das liegt hier aber nicht vor, denn wir arbeiten
+  ;; ja tatsaechlich mit UTF-16.  Verboten ist es nur, wenn wir ein
+  ;; solches Zeichen beim Dekodieren finden, das wird aber eben
+  ;; in encodings.lisp bereits geprueft.  --david
  (let ((c (rune-code rune)))
    (or (= c #x9) (= c #xA) (= c #xD)
        (<= #x20 c #xD7FF)
        (<= #xE000 c #xFFFD)
-        ;;
        (<= #xD800 c #xDBFF)
-        (<= #xDC00 c #xDFFF)
-        ;;
-        )))
+        (<= #xDC00 c #xDFFF))))

 (defun read-att-value (zinput input mode &optional canon-space-p (delim nil))
  (with-rune-collector-2 (collect)
@@ -2686,7 +2689,7 @@
      ((:ENTITY-REF)
       (let ((name sem))
         (consume-token input)
-         (append ;; nil  #+(OR)
+         (append
          (recurse-on-entity input name :general
                             (lambda (input)
                               (prog1
@@ -3190,17 +3193,10 @@
 (defun read-cdata (input)
  (read-data-until* ((lambda (rune)
                       (declare (type rune rune))
-		       (when (or (and (%rune< rune #/U+0020)
-				      (not (or (%rune= rune #/U+0009)
-					       (%rune= rune #/U+000a)
-					       (%rune= rune #/U+000d))))
-				 ;; Surrogates nicht ausschliessen, denn wir
-				 ;; haben ja UTF-16 Runen.
-				 #+(or)
-				 (and (%rune<= #/U+D800 rune)
-				      (%rune< rune #/U+E000))
-				 (%rune= rune #/U+FFFE)
-				 (%rune= rune #/U+FFFF))
+		       (when (and (%rune< rune #/U+0020)
+				  (not (or (%rune= rune #/U+0009)
+					   (%rune= rune #/U+000a)
+					   (%rune= rune #/U+000d))))
 			 (wf-error "code point invalid: ~A" rune))
                       (or (%rune= rune #/<) (%rune= rune #/&)))
                     input