diff --git a/XMLCONF b/XMLCONF index 6b2cf7b..84212c9 100644 --- a/XMLCONF +++ b/XMLCONF @@ -165,16 +165,8 @@ xmltest/not-wf/sa/164.xml [not validating:] not-wf [validating:] not-wf xmltest/not-wf/sa/165.xml [not validating:] not-wf [validating:] not-wf xmltest/not-wf/sa/166.xml [not validating:] not-wf [validating:] invalid xmltest/not-wf/sa/167.xml [not validating:] not-wf [validating:] invalid -xmltest/not-wf/sa/168.xml [not validating:] FAILED: - Encoding UTF-16 in UTF-8? : #xD800. -[ - An unpaired surrogate (D800) is not legal anywhere - in an XML document.] -xmltest/not-wf/sa/169.xml [not validating:] FAILED: - Encoding UTF-16 in UTF-8? : #xDC00. -[ - An unpaired surrogate (DC00) is not legal anywhere - in an XML document.] +xmltest/not-wf/sa/168.xml [not validating:] not-wf [validating:] invalid +xmltest/not-wf/sa/169.xml [not validating:] not-wf [validating:] invalid xmltest/not-wf/sa/170.xml [not validating:] FAILED: well-formedness violation not detected [ @@ -968,16 +960,8 @@ ibm/not-wf/P02/ibm02n26.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n27.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n28.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n29.xml [not validating:] not-wf [validating:] not-wf -ibm/not-wf/P02/ibm02n30.xml [not validating:] FAILED: - Encoding UTF-16 in UTF-8? : #xD800. -[ - Tests a comment which contains an illegal Char: #xD800 - ] -ibm/not-wf/P02/ibm02n31.xml [not validating:] FAILED: - Encoding UTF-16 in UTF-8? : #xDFFF. -[ - Tests a comment which contains an illegal Char: #xDFFF - ] +ibm/not-wf/P02/ibm02n30.xml [not validating:] not-wf [validating:] not-wf +ibm/not-wf/P02/ibm02n31.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n32.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n33.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P03/ibm03n01.xml [not validating:] not-wf [validating:] not-wf @@ -1878,4 +1862,4 @@ ibm/valid/P86/ibm86v01.xml [not validating:] input [validating:] input ibm/valid/P87/ibm87v01.xml [not validating:] input [validating:] input ibm/valid/P88/ibm88v01.xml [not validating:] input [validating:] input ibm/valid/P89/ibm89v01.xml [not validating:] input [validating:] input -19/1786 tests failed; 376 tests were skipped \ No newline at end of file +15/1786 tests failed; 376 tests were skipped \ No newline at end of file diff --git a/runes/encodings-data.lisp b/runes/encodings-data.lisp index e29a683..c10131a 100644 --- a/runes/encodings-data.lisp +++ b/runes/encodings-data.lisp @@ -1,4 +1,4 @@ -(in-package :encoding) +(in-package :runes-encoding) (progn (add-name :us-ascii "ANSI_X3.4-1968") diff --git a/runes/encodings.lisp b/runes/encodings.lisp index 0982caa..04ddd93 100644 --- a/runes/encodings.lisp +++ b/runes/encodings.lisp @@ -1,4 +1,9 @@ -(in-package :encoding) +(in-package :runes-encoding) + +(define-condition encoding-error (simple-error) ()) + +(defun xerror (fmt &rest args) + (error 'encoding-error :format-control fmt :format-arguments args)) ;;;; --------------------------------------------------------------------------- ;;;; Encoding names @@ -115,6 +120,9 @@ (let ((hi (aref in rptr)) (lo (aref in (%+ 1 rptr)))) (setf rptr (%+ 2 rptr)) + ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste + ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere + ;; Haelfte fehlt! (setf (aref out wptr) (logior (ash hi 8) lo)) (setf wptr (%+ 1 wptr)))) (values wptr rptr))) @@ -132,6 +140,9 @@ (let ((lo (aref in (%+ 0 rptr))) (hi (aref in (%+ 1 rptr)))) (setf rptr (%+ 2 rptr)) + ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste + ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere + ;; Haelfte fehlt! (setf (aref out wptr) (logior (ash hi 8) lo)) (setf wptr (%+ 1 wptr)))) (values wptr rptr))) @@ -147,13 +158,9 @@ byte0) (macrolet ((put (x) `((lambda (x) - (cond ((or (<= #xD800 x #xDBFF) - (<= #xDC00 x #xDFFF)) - (error "Encoding UTF-16 in UTF-8? : #x~x." x))) - '(unless (data-char-p x) - (error "#x~x is not a data character." x)) - ;;(fresh-line) - ;;(prin1 x) (princ "-> ") + (when (or (<= #xD800 x #xDBFF) + (<= #xDC00 x #xDFFF)) + (xerror "surrogate encoded in UTF-8: #x~x." x)) (cond ((%> x #xFFFF) (setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10)) (aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF))) @@ -196,7 +203,7 @@ (setf rptr (%+ rptr 1))) ((%<= #|#b10000000|# byte0 #b10111111) - (error "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0) + (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0) (setf rptr (%+ rptr 1))) ((%<= #|#b11000000|# byte0 #b11011111) @@ -260,7 +267,7 @@ (return)))) (t - (error "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) )) + (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) )) (values wptr rptr)) ) (defmethod encoding-p ((object (eql :utf-16-little-endian))) t) @@ -343,5 +350,4 @@ (defun find-charset (name) (or (gethash name *charsets*) - (error "There is no character set named ~S." name))) - + (xerror "There is no character set named ~S." name))) diff --git a/runes/package.lisp b/runes/package.lisp index d92ed60..cfb822d 100644 --- a/runes/package.lisp +++ b/runes/package.lisp @@ -61,8 +61,9 @@ #:set-to-full-speed #:xstream-name)) -(defpackage :encoding +(defpackage :runes-encoding (:use :cl :runes) (:export + #:encoding-error #:find-encoding #:decode-sequence)) diff --git a/runes/xstream.lisp b/runes/xstream.lisp index fe818ca..ea5049e 100644 --- a/runes/xstream.lisp +++ b/runes/xstream.lisp @@ -1,4 +1,4 @@ -;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: runes; readtable: runes; Encoding: utf-8; -*- +;;; -*- Mode: Lisp; Syntax: Common-Lisp; readtable: runes; Encoding: utf-8; -*- ;;; --------------------------------------------------------------------------- ;;; Title: Fast streams ;;; Created: 1999-07-17 @@ -66,9 +66,7 @@ ;; (eval-when (:compile-toplevel :load-toplevel :execute) - (defparameter *fast* '(optimize (speed 3) (safety 0))) - ;;(defparameter *fast* '(optimize (speed 2) (safety 3))) - ) + (defparameter *fast* '(optimize (speed 3) (safety 0)))) ;; Let us first define fast fixnum arithmetric get rid of type ;; checks. (After all we know what we do here). @@ -277,10 +275,14 @@ :end2 (xstream-os-left-end input)) ;; then we take care that the buffer is large enough to carry at ;; least 100 bytes (a random number) + ;; + ;; david: was heisst da random? ich nehme an, dass 100 einfach + ;; ausreichend sein soll, um die laengste utf-8 bytesequenz oder die + ;; beiden utf-16 surrogates zu halten? dann ist 100 ja wohl dicke + ;; ausreichend und koennte in make-xstream ordentlich geprueft werden. + ;; oder was geht hier vor? (unless (>= (length (xstream-os-buffer input)) 100) - (error "You lost") - ;; todo: enlarge buffer - )) + (error "You lost"))) (setf n (read-octets (xstream-os-buffer input) (xstream-os-stream input) m (min (1- (length (xstream-os-buffer input))) @@ -292,7 +294,7 @@ :eof) (t (multiple-value-bind (fnw fnr) - (encoding:decode-sequence + (runes-encoding:decode-sequence (xstream-encoding input) (xstream-os-buffer input) 0 n (xstream-buffer input) 0 (1- (length (xstream-buffer input))) diff --git a/xml/xml-parse.lisp b/xml/xml-parse.lisp index a5a7434..6c68400 100644 --- a/xml/xml-parse.lisp +++ b/xml/xml-parse.lisp @@ -2960,10 +2960,13 @@ :type type)))))) (defun parse-xstream (xstream handler &rest args) - (let ((zstream (make-zstream :input-stack (list xstream)))) - (peek-rune xstream) - (with-scratch-pads () - (apply #'p/document zstream handler args)))) + (handler-case + (let ((zstream (make-zstream :input-stack (list xstream)))) + (peek-rune xstream) + (with-scratch-pads () + (apply #'p/document zstream handler args))) + (runes-encoding:encoding-error (c) + (wf-error "~A" c)))) (defun parse-file (filename handler &rest args) (with-open-xfile (input filename) @@ -3021,10 +3024,7 @@ (defun parse-string (string handler) ;; XXX this function mis-handles encoding - (with-scratch-pads () - (let* ((x (string->xstream string)) - (z (make-zstream :input-stack (list x)))) - (p/document z handler)))) + (parse-xstream (string->xstream string) handler)) (defun string->xstream (string) ;; XXX encoding is mis-handled by this kind of stream