encoding-fehler resignalisieren

This commit is contained in:
dlichteblau
2005-11-27 17:19:10 +00:00
parent e4c9e057b1
commit 11f2514116
6 changed files with 44 additions and 51 deletions

26
XMLCONF
View File

@ -165,16 +165,8 @@ xmltest/not-wf/sa/164.xml [not validating:] not-wf [validating:] not-wf
xmltest/not-wf/sa/165.xml [not validating:] not-wf [validating:] not-wf xmltest/not-wf/sa/165.xml [not validating:] not-wf [validating:] not-wf
xmltest/not-wf/sa/166.xml [not validating:] not-wf [validating:] invalid xmltest/not-wf/sa/166.xml [not validating:] not-wf [validating:] invalid
xmltest/not-wf/sa/167.xml [not validating:] not-wf [validating:] invalid xmltest/not-wf/sa/167.xml [not validating:] not-wf [validating:] invalid
xmltest/not-wf/sa/168.xml [not validating:] FAILED: xmltest/not-wf/sa/168.xml [not validating:] not-wf [validating:] invalid
Encoding UTF-16 in UTF-8? : #xD800. xmltest/not-wf/sa/169.xml [not validating:] not-wf [validating:] invalid
[
An unpaired surrogate (D800) is not legal anywhere
in an XML document.]
xmltest/not-wf/sa/169.xml [not validating:] FAILED:
Encoding UTF-16 in UTF-8? : #xDC00.
[
An unpaired surrogate (DC00) is not legal anywhere
in an XML document.]
xmltest/not-wf/sa/170.xml [not validating:] FAILED: xmltest/not-wf/sa/170.xml [not validating:] FAILED:
well-formedness violation not detected well-formedness violation not detected
[ [
@ -968,16 +960,8 @@ ibm/not-wf/P02/ibm02n26.xml [not validating:] not-wf [validating:] not-wf
ibm/not-wf/P02/ibm02n27.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n27.xml [not validating:] not-wf [validating:] not-wf
ibm/not-wf/P02/ibm02n28.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n28.xml [not validating:] not-wf [validating:] not-wf
ibm/not-wf/P02/ibm02n29.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n29.xml [not validating:] not-wf [validating:] not-wf
ibm/not-wf/P02/ibm02n30.xml [not validating:] FAILED: ibm/not-wf/P02/ibm02n30.xml [not validating:] not-wf [validating:] not-wf
Encoding UTF-16 in UTF-8? : #xD800. ibm/not-wf/P02/ibm02n31.xml [not validating:] not-wf [validating:] not-wf
[
Tests a comment which contains an illegal Char: #xD800
]
ibm/not-wf/P02/ibm02n31.xml [not validating:] FAILED:
Encoding UTF-16 in UTF-8? : #xDFFF.
[
Tests a comment which contains an illegal Char: #xDFFF
]
ibm/not-wf/P02/ibm02n32.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n32.xml [not validating:] not-wf [validating:] not-wf
ibm/not-wf/P02/ibm02n33.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P02/ibm02n33.xml [not validating:] not-wf [validating:] not-wf
ibm/not-wf/P03/ibm03n01.xml [not validating:] not-wf [validating:] not-wf ibm/not-wf/P03/ibm03n01.xml [not validating:] not-wf [validating:] not-wf
@ -1878,4 +1862,4 @@ ibm/valid/P86/ibm86v01.xml [not validating:] input [validating:] input
ibm/valid/P87/ibm87v01.xml [not validating:] input [validating:] input ibm/valid/P87/ibm87v01.xml [not validating:] input [validating:] input
ibm/valid/P88/ibm88v01.xml [not validating:] input [validating:] input ibm/valid/P88/ibm88v01.xml [not validating:] input [validating:] input
ibm/valid/P89/ibm89v01.xml [not validating:] input [validating:] input ibm/valid/P89/ibm89v01.xml [not validating:] input [validating:] input
19/1786 tests failed; 376 tests were skipped 15/1786 tests failed; 376 tests were skipped

View File

@ -1,4 +1,4 @@
(in-package :encoding) (in-package :runes-encoding)
(progn (progn
(add-name :us-ascii "ANSI_X3.4-1968") (add-name :us-ascii "ANSI_X3.4-1968")

View File

@ -1,4 +1,9 @@
(in-package :encoding) (in-package :runes-encoding)
(define-condition encoding-error (simple-error) ())
(defun xerror (fmt &rest args)
(error 'encoding-error :format-control fmt :format-arguments args))
;;;; --------------------------------------------------------------------------- ;;;; ---------------------------------------------------------------------------
;;;; Encoding names ;;;; Encoding names
@ -115,6 +120,9 @@
(let ((hi (aref in rptr)) (let ((hi (aref in rptr))
(lo (aref in (%+ 1 rptr)))) (lo (aref in (%+ 1 rptr))))
(setf rptr (%+ 2 rptr)) (setf rptr (%+ 2 rptr))
;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
;; Haelfte fehlt!
(setf (aref out wptr) (logior (ash hi 8) lo)) (setf (aref out wptr) (logior (ash hi 8) lo))
(setf wptr (%+ 1 wptr)))) (setf wptr (%+ 1 wptr))))
(values wptr rptr))) (values wptr rptr)))
@ -132,6 +140,9 @@
(let ((lo (aref in (%+ 0 rptr))) (let ((lo (aref in (%+ 0 rptr)))
(hi (aref in (%+ 1 rptr)))) (hi (aref in (%+ 1 rptr))))
(setf rptr (%+ 2 rptr)) (setf rptr (%+ 2 rptr))
;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
;; Haelfte fehlt!
(setf (aref out wptr) (logior (ash hi 8) lo)) (setf (aref out wptr) (logior (ash hi 8) lo))
(setf wptr (%+ 1 wptr)))) (setf wptr (%+ 1 wptr))))
(values wptr rptr))) (values wptr rptr)))
@ -147,13 +158,9 @@
byte0) byte0)
(macrolet ((put (x) (macrolet ((put (x)
`((lambda (x) `((lambda (x)
(cond ((or (<= #xD800 x #xDBFF) (when (or (<= #xD800 x #xDBFF)
(<= #xDC00 x #xDFFF)) (<= #xDC00 x #xDFFF))
(error "Encoding UTF-16 in UTF-8? : #x~x." x))) (xerror "surrogate encoded in UTF-8: #x~x." x))
'(unless (data-char-p x)
(error "#x~x is not a data character." x))
;;(fresh-line)
;;(prin1 x) (princ "-> ")
(cond ((%> x #xFFFF) (cond ((%> x #xFFFF)
(setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10)) (setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
(aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF))) (aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
@ -196,7 +203,7 @@
(setf rptr (%+ rptr 1))) (setf rptr (%+ rptr 1)))
((%<= #|#b10000000|# byte0 #b10111111) ((%<= #|#b10000000|# byte0 #b10111111)
(error "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0) (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)
(setf rptr (%+ rptr 1))) (setf rptr (%+ rptr 1)))
((%<= #|#b11000000|# byte0 #b11011111) ((%<= #|#b11000000|# byte0 #b11011111)
@ -260,7 +267,7 @@
(return)))) (return))))
(t (t
(error "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) )) (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) ))
(values wptr rptr)) ) (values wptr rptr)) )
(defmethod encoding-p ((object (eql :utf-16-little-endian))) t) (defmethod encoding-p ((object (eql :utf-16-little-endian))) t)
@ -343,5 +350,4 @@
(defun find-charset (name) (defun find-charset (name)
(or (gethash name *charsets*) (or (gethash name *charsets*)
(error "There is no character set named ~S." name))) (xerror "There is no character set named ~S." name)))

View File

@ -61,8 +61,9 @@
#:set-to-full-speed #:set-to-full-speed
#:xstream-name)) #:xstream-name))
(defpackage :encoding (defpackage :runes-encoding
(:use :cl :runes) (:use :cl :runes)
(:export (:export
#:encoding-error
#:find-encoding #:find-encoding
#:decode-sequence)) #:decode-sequence))

View File

@ -1,4 +1,4 @@
;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: runes; readtable: runes; Encoding: utf-8; -*- ;;; -*- Mode: Lisp; Syntax: Common-Lisp; readtable: runes; Encoding: utf-8; -*-
;;; --------------------------------------------------------------------------- ;;; ---------------------------------------------------------------------------
;;; Title: Fast streams ;;; Title: Fast streams
;;; Created: 1999-07-17 ;;; Created: 1999-07-17
@ -66,9 +66,7 @@
;; ;;
(eval-when (:compile-toplevel :load-toplevel :execute) (eval-when (:compile-toplevel :load-toplevel :execute)
(defparameter *fast* '(optimize (speed 3) (safety 0))) (defparameter *fast* '(optimize (speed 3) (safety 0))))
;;(defparameter *fast* '(optimize (speed 2) (safety 3)))
)
;; Let us first define fast fixnum arithmetric get rid of type ;; Let us first define fast fixnum arithmetric get rid of type
;; checks. (After all we know what we do here). ;; checks. (After all we know what we do here).
@ -277,10 +275,14 @@
:end2 (xstream-os-left-end input)) :end2 (xstream-os-left-end input))
;; then we take care that the buffer is large enough to carry at ;; then we take care that the buffer is large enough to carry at
;; least 100 bytes (a random number) ;; least 100 bytes (a random number)
;;
;; david: was heisst da random? ich nehme an, dass 100 einfach
;; ausreichend sein soll, um die laengste utf-8 bytesequenz oder die
;; beiden utf-16 surrogates zu halten? dann ist 100 ja wohl dicke
;; ausreichend und koennte in make-xstream ordentlich geprueft werden.
;; oder was geht hier vor?
(unless (>= (length (xstream-os-buffer input)) 100) (unless (>= (length (xstream-os-buffer input)) 100)
(error "You lost") (error "You lost")))
;; todo: enlarge buffer
))
(setf n (setf n
(read-octets (xstream-os-buffer input) (xstream-os-stream input) (read-octets (xstream-os-buffer input) (xstream-os-stream input)
m (min (1- (length (xstream-os-buffer input))) m (min (1- (length (xstream-os-buffer input)))
@ -292,7 +294,7 @@
:eof) :eof)
(t (t
(multiple-value-bind (fnw fnr) (multiple-value-bind (fnw fnr)
(encoding:decode-sequence (runes-encoding:decode-sequence
(xstream-encoding input) (xstream-encoding input)
(xstream-os-buffer input) 0 n (xstream-os-buffer input) 0 n
(xstream-buffer input) 0 (1- (length (xstream-buffer input))) (xstream-buffer input) 0 (1- (length (xstream-buffer input)))

View File

@ -2960,10 +2960,13 @@
:type type)))))) :type type))))))
(defun parse-xstream (xstream handler &rest args) (defun parse-xstream (xstream handler &rest args)
(let ((zstream (make-zstream :input-stack (list xstream)))) (handler-case
(peek-rune xstream) (let ((zstream (make-zstream :input-stack (list xstream))))
(with-scratch-pads () (peek-rune xstream)
(apply #'p/document zstream handler args)))) (with-scratch-pads ()
(apply #'p/document zstream handler args)))
(runes-encoding:encoding-error (c)
(wf-error "~A" c))))
(defun parse-file (filename handler &rest args) (defun parse-file (filename handler &rest args)
(with-open-xfile (input filename) (with-open-xfile (input filename)
@ -3021,10 +3024,7 @@
(defun parse-string (string handler) (defun parse-string (string handler)
;; XXX this function mis-handles encoding ;; XXX this function mis-handles encoding
(with-scratch-pads () (parse-xstream (string->xstream string) handler))
(let* ((x (string->xstream string))
(z (make-zstream :input-stack (list x))))
(p/document z handler))))
(defun string->xstream (string) (defun string->xstream (string)
;; XXX encoding is mis-handled by this kind of stream ;; XXX encoding is mis-handled by this kind of stream