Files
closure-common/encodings.lisp
dlichteblau 055bbc5560 Hmm. Auf U+ffff und dergleichen wurde durch data-rune-p geprueft,
das aber eben nicht ueberall benutzt wurde.  Ich habe die Pruefung
jetzt mal direkt im Decoding eingebaut.

-xmltest/not-wf/sa/171.xml [not validating:] FAILED:
-  well-formedness violation not detected
-[
-    Character FFFF is not legal anywhere in an XML document. ]
2005-11-27 18:20:10 +00:00

364 lines
14 KiB
Common Lisp

(in-package :runes-encoding)
(define-condition encoding-error (simple-error) ())
(defun xerror (fmt &rest args)
(error 'encoding-error :format-control fmt :format-arguments args))
;;;; ---------------------------------------------------------------------------
;;;; Encoding names
;;;;
(defvar *names* (make-hash-table :test #'eq))
(defun canon-name (string)
(with-output-to-string (bag)
(map nil (lambda (ch)
(cond ((char= ch #\_) (write-char #\- bag))
(t (write-char (char-upcase ch) bag))))
string)))
(defun canon-name-2 (string)
(with-output-to-string (bag)
(map nil (lambda (ch)
(cond ((char= ch #\_))
((char= ch #\-))
(t (write-char (char-upcase ch) bag))))
string)))
(defmethod encoding-names ((encoding symbol))
(gethash encoding *names*))
(defmethod (setf encoding-names) (new-value (encoding symbol))
(setf (gethash encoding *names*) new-value))
(defun add-name (encoding name)
(pushnew (canon-name name) (encoding-names encoding) :test #'string=))
(defun resolve-name (string)
(cond ((symbolp string)
string)
(t
(setq string (canon-name string))
(or
(block nil
(maphash (lambda (x y)
(when (member string y :test #'string=)
(return x)))
*names*)
nil)
(block nil
(maphash (lambda (x y)
(when (member string y
:test #'(lambda (x y)
(string= (canon-name-2 x)
(canon-name-2 y))))
(return x)))
*names*)
nil)))))
;;;; ---------------------------------------------------------------------------
;;;; Encodings
;;;;
(defvar *encodings* (make-hash-table :test #'eq))
(defmacro define-encoding (name init-form)
`(progn
(setf (gethash ',name *encodings*)
(list nil (lambda () ,init-form)))
',name))
(defun find-encoding (name)
(let ((x (gethash (resolve-name name) *encodings*)))
(and x
(or (first x)
(setf (first x) (funcall (second x)))))))
(defclass encoding () ())
(defclass simple-8-bit-encoding (encoding)
((table :initarg :table)))
(defun make-simple-8-bit-encoding (&key charset)
(make-instance 'simple-8-bit-encoding
:table (coerce (to-unicode-table charset) '(simple-array (unsigned-byte 16) (256)))))
;;;;;;;
(defmacro fx-op (op &rest xs)
`(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))))
(defmacro fx-pred (op &rest xs)
`(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))
(defmacro %+ (&rest xs) `(fx-op + ,@xs))
(defmacro %- (&rest xs) `(fx-op - ,@xs))
(defmacro %* (&rest xs) `(fx-op * ,@xs))
(defmacro %/ (&rest xs) `(fx-op floor ,@xs))
(defmacro %and (&rest xs) `(fx-op logand ,@xs))
(defmacro %ior (&rest xs) `(fx-op logior ,@xs))
(defmacro %xor (&rest xs) `(fx-op logxor ,@xs))
(defmacro %ash (&rest xs) `(fx-op ash ,@xs))
(defmacro %mod (&rest xs) `(fx-op mod ,@xs))
(defmacro %= (&rest xs) `(fx-pred = ,@xs))
(defmacro %<= (&rest xs) `(fx-pred <= ,@xs))
(defmacro %>= (&rest xs) `(fx-pred >= ,@xs))
(defmacro %< (&rest xs) `(fx-pred < ,@xs))
(defmacro %> (&rest xs) `(fx-pred > ,@xs))
(defmethod decode-sequence ((encoding (eql :utf-16-big-endian))
in in-start in-end out out-start out-end eof?)
;; -> new wptr, new rptr
(let ((wptr out-start)
(rptr in-start))
(loop
(when (%= wptr out-end)
(return))
(when (>= (%+ rptr 1) in-end)
(return))
(let ((hi (aref in rptr))
(lo (aref in (%+ 1 rptr))))
(setf rptr (%+ 2 rptr))
;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
;; Haelfte fehlt!
(let ((x (logior (ash hi 8) lo)))
(when (or (eql x #xFFFE) (eql x #/U+FFFF))
(xerror "not a valid code point: #x~X" x))
(setf (aref out wptr) x))
(setf wptr (%+ 1 wptr))))
(values wptr rptr)))
(defmethod decode-sequence ((encoding (eql :utf-16-little-endian))
in in-start in-end out out-start out-end eof?)
;; -> new wptr, new rptr
(let ((wptr out-start)
(rptr in-start))
(loop
(when (%= wptr out-end)
(return))
(when (>= (%+ rptr 1) in-end)
(return))
(let ((lo (aref in (%+ 0 rptr)))
(hi (aref in (%+ 1 rptr))))
(setf rptr (%+ 2 rptr))
;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
;; Haelfte fehlt!
(let ((x (logior (ash hi 8) lo)))
(when (or (eql x #xFFFE) (eql x #/U+FFFF))
(xerror "not a valid code point: #x~X" x))
(setf (aref out wptr) x))
(setf wptr (%+ 1 wptr))))
(values wptr rptr)))
(defmethod decode-sequence ((encoding (eql :utf-8))
in in-start in-end out out-start out-end eof?)
(declare (optimize (speed 3) (safety 0))
(type (simple-array (unsigned-byte 8) (*)) in)
(type (simple-array (unsigned-byte 16) (*)) out)
(type fixnum in-start in-end out-start out-end))
(let ((wptr out-start)
(rptr in-start)
byte0)
(macrolet ((put (x)
`((lambda (x)
(when (or (<= #xD800 x #xDBFF)
(<= #xDC00 x #xDFFF))
(xerror "surrogate encoded in UTF-8: #x~X." x))
(cond ((or (%> x #x10FFFF)
(eql x #xFFFE)
(eql x #/U+FFFF))
(xerror "not a valid code point: #x~X" x))
((%> x #xFFFF)
(setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
(aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
(setf wptr (%+ wptr 2)))
(t
(setf (aref out wptr) x)
(setf wptr (%+ wptr 1)))))
,x))
(put1 (x)
`(progn
(setf (aref out wptr) ,x)
(setf wptr (%+ wptr 1)))))
(loop
(when (%= (+ wptr 1) out-end) (return))
(when (%>= rptr in-end) (return))
(setq byte0 (aref in rptr))
(cond ((= byte0 #x0D)
;; CR handling
;; we need to know the following character
(cond ((>= (%+ rptr 1) in-end)
;; no characters in buffer
(cond (eof?
;; at EOF, pass it as NL
(put #x0A)
(setf rptr (%+ rptr 1)))
(t
;; demand more characters
(return))))
((= (aref in (%+ rptr 1)) #x0A)
;; we see CR NL, so forget this CR and the next NL will be
;; inserted literally
(setf rptr (%+ rptr 1)))
(t
;; singleton CR, pass it as NL
(put #x0A)
(setf rptr (%+ rptr 1)))))
((%<= #|#b00000000|# byte0 #b01111111)
(put1 byte0)
(setf rptr (%+ rptr 1)))
((%<= #|#b10000000|# byte0 #b10111111)
(xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)
(setf rptr (%+ rptr 1)))
((%<= #|#b11000000|# byte0 #b11011111)
(cond ((< (%+ rptr 2) in-end)
(put
(dpb (ldb (byte 5 0) byte0) (byte 5 6)
(dpb (ldb (byte 6 0) (aref in (%+ rptr 1))) (byte 6 0)
0)))
(setf rptr (%+ rptr 2)))
(t
(return))))
((%<= #|#b11100000|# byte0 #b11101111)
(cond ((< (%+ rptr 3) in-end)
(put
(dpb (ldb (byte 4 0) byte0) (byte 4 12)
(dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 6)
(dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 0)
0))))
(setf rptr (%+ rptr 3)))
(t
(return))))
((%<= #|#b11110000|# byte0 #b11110111)
(cond ((< (%+ rptr 4) in-end)
(put
(dpb (ldb (byte 3 0) byte0) (byte 3 18)
(dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 12)
(dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 6)
(dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 0)
0)))))
(setf rptr (%+ rptr 4)))
(t
(return))))
((%<= #|#b11111000|# byte0 #b11111011)
(cond ((< (%+ rptr 5) in-end)
(put
(dpb (ldb (byte 2 0) byte0) (byte 2 24)
(dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 18)
(dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 12)
(dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 6)
(dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 0)
0))))))
(setf rptr (%+ rptr 5)))
(t
(return))))
((%<= #|#b11111100|# byte0 #b11111101)
(cond ((< (%+ rptr 6) in-end)
(put
(dpb (ldb (byte 1 0) byte0) (byte 1 30)
(dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 24)
(dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 18)
(dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 12)
(dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 6)
(dpb (ldb (byte 6 0) (aref in (%+ 5 rptr))) (byte 6 0)
0)))))))
(setf rptr (%+ rptr 6)))
(t
(return))))
(t
(xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) ))
(values wptr rptr)) )
(defmethod encoding-p ((object (eql :utf-16-little-endian))) t)
(defmethod encoding-p ((object (eql :utf-16-big-endian))) t)
(defmethod encoding-p ((object (eql :utf-8))) t)
(defmethod encoding-p ((object encoding)) t)
(defmethod decode-sequence ((encoding simple-8-bit-encoding)
in in-start in-end
out out-start out-end
eof?)
(declare (optimize (speed 3) (safety 0))
(type (simple-array (unsigned-byte 8) (*)) in)
(type (simple-array (unsigned-byte 16) (*)) out)
(type fixnum in-start in-end out-start out-end))
(let ((wptr out-start)
(rptr in-start)
(byte 0)
(table (slot-value encoding 'table)))
(declare (type fixnum wptr rptr)
(type (unsigned-byte 8) byte)
(type (simple-array (unsigned-byte 16) (*)) table))
(loop
(when (%= wptr out-end) (return))
(when (%>= rptr in-end) (return))
(setq byte (aref in rptr))
(cond ((= byte #x0D)
;; CR handling
;; we need to know the following character
(cond ((>= (%+ rptr 1) in-end)
;; no characters in buffer
(cond (eof?
;; at EOF, pass it as NL
(setf (aref out wptr) #x0A)
(setf wptr (%+ wptr 1))
(setf rptr (%+ rptr 1)))
(t
;; demand more characters
(return))))
((= (aref in (%+ rptr 1)) #x0A)
;; we see CR NL, so forget this CR and the next NL will be
;; inserted literally
(setf rptr (%+ rptr 1)))
(t
;; singleton CR, pass it as NL
(setf (aref out wptr) #x0A)
(setf wptr (%+ wptr 1))
(setf rptr (%+ rptr 1)))))
(t
(setf (aref out wptr) (aref table byte))
(setf wptr (%+ wptr 1))
(setf rptr (%+ rptr 1))) ))
(values wptr rptr)))
;;;; ---------------------------------------------------------------------------
;;;; Character sets
;;;;
(defvar *charsets* (make-hash-table :test #'eq))
(defclass 8-bit-charset ()
((name :initarg :name)
(to-unicode-table
:initarg :to-unicode-table
:reader to-unicode-table)))
(defmacro define-8-bit-charset (name &rest codes)
(assert (= 256 (length codes)))
`(progn
(setf (gethash ',name *charsets*)
(make-instance '8-bit-charset
:name ',name
:to-unicode-table
',(make-array 256
:element-type '(unsigned-byte 16)
:initial-contents codes)))
',name))
(defun find-charset (name)
(or (gethash name *charsets*)
(xerror "There is no character set named ~S." name)))