Initial revision
This commit is contained in:
347
encodings.lisp
Normal file
347
encodings.lisp
Normal file
@ -0,0 +1,347 @@
|
||||
(in-package :encoding)
|
||||
|
||||
;;;; ---------------------------------------------------------------------------
|
||||
;;;; Encoding names
|
||||
;;;;
|
||||
|
||||
(defvar *names* (make-hash-table :test #'eq))
|
||||
|
||||
(defun canon-name (string)
|
||||
(with-output-to-string (bag)
|
||||
(map nil (lambda (ch)
|
||||
(cond ((char= ch #\_) (write-char #\- bag))
|
||||
(t (write-char (char-upcase ch) bag))))
|
||||
string)))
|
||||
|
||||
(defun canon-name-2 (string)
|
||||
(with-output-to-string (bag)
|
||||
(map nil (lambda (ch)
|
||||
(cond ((char= ch #\_))
|
||||
((char= ch #\-))
|
||||
(t (write-char (char-upcase ch) bag))))
|
||||
string)))
|
||||
|
||||
(defmethod encoding-names ((encoding symbol))
|
||||
(gethash encoding *names*))
|
||||
|
||||
(defmethod (setf encoding-names) (new-value (encoding symbol))
|
||||
(setf (gethash encoding *names*) new-value))
|
||||
|
||||
(defun add-name (encoding name)
|
||||
(pushnew (canon-name name) (encoding-names encoding) :test #'string=))
|
||||
|
||||
(defun resolve-name (string)
|
||||
(cond ((symbolp string)
|
||||
string)
|
||||
(t
|
||||
(setq string (canon-name string))
|
||||
(or
|
||||
(block nil
|
||||
(maphash (lambda (x y)
|
||||
(when (member string y :test #'string=)
|
||||
(return x)))
|
||||
*names*)
|
||||
nil)
|
||||
(block nil
|
||||
(maphash (lambda (x y)
|
||||
(when (member string y
|
||||
:test #'(lambda (x y)
|
||||
(string= (canon-name-2 x)
|
||||
(canon-name-2 y))))
|
||||
(return x)))
|
||||
*names*)
|
||||
nil)))))
|
||||
|
||||
;;;; ---------------------------------------------------------------------------
|
||||
;;;; Encodings
|
||||
;;;;
|
||||
|
||||
(defvar *encodings* (make-hash-table :test #'eq))
|
||||
|
||||
(defmacro define-encoding (name init-form)
|
||||
`(progn
|
||||
(setf (gethash ',name *encodings*)
|
||||
(list nil (lambda () ,init-form)))
|
||||
',name))
|
||||
|
||||
(defun find-encoding (name)
|
||||
(let ((x (gethash (resolve-name name) *encodings*)))
|
||||
(and x
|
||||
(or (first x)
|
||||
(setf (first x) (funcall (second x)))))))
|
||||
|
||||
(defclass encoding () ())
|
||||
|
||||
(defclass simple-8-bit-encoding (encoding)
|
||||
((table :initarg :table)))
|
||||
|
||||
(defun make-simple-8-bit-encoding (&key charset)
|
||||
(make-instance 'simple-8-bit-encoding
|
||||
:table (coerce (to-unicode-table charset) '(simple-array (unsigned-byte 16) (256)))))
|
||||
|
||||
;;;;;;;
|
||||
|
||||
(defmacro fx-op (op &rest xs)
|
||||
`(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))))
|
||||
(defmacro fx-pred (op &rest xs)
|
||||
`(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))
|
||||
|
||||
(defmacro %+ (&rest xs) `(fx-op + ,@xs))
|
||||
(defmacro %- (&rest xs) `(fx-op - ,@xs))
|
||||
(defmacro %* (&rest xs) `(fx-op * ,@xs))
|
||||
(defmacro %/ (&rest xs) `(fx-op floor ,@xs))
|
||||
(defmacro %and (&rest xs) `(fx-op logand ,@xs))
|
||||
(defmacro %ior (&rest xs) `(fx-op logior ,@xs))
|
||||
(defmacro %xor (&rest xs) `(fx-op logxor ,@xs))
|
||||
(defmacro %ash (&rest xs) `(fx-op ash ,@xs))
|
||||
(defmacro %mod (&rest xs) `(fx-op mod ,@xs))
|
||||
|
||||
(defmacro %= (&rest xs) `(fx-pred = ,@xs))
|
||||
(defmacro %<= (&rest xs) `(fx-pred <= ,@xs))
|
||||
(defmacro %>= (&rest xs) `(fx-pred >= ,@xs))
|
||||
(defmacro %< (&rest xs) `(fx-pred < ,@xs))
|
||||
(defmacro %> (&rest xs) `(fx-pred > ,@xs))
|
||||
|
||||
(defmethod decode-sequence ((encoding (eql :utf-16-big-endian))
|
||||
in in-start in-end out out-start out-end eof?)
|
||||
;; -> new wptr, new rptr
|
||||
(let ((wptr out-start)
|
||||
(rptr in-start))
|
||||
(loop
|
||||
(when (%= wptr out-end)
|
||||
(return))
|
||||
(when (>= (%+ rptr 1) in-end)
|
||||
(return))
|
||||
(let ((hi (aref in rptr))
|
||||
(lo (aref in (%+ 1 rptr))))
|
||||
(setf rptr (%+ 2 rptr))
|
||||
(setf (aref out wptr) (logior (ash hi 8) lo))
|
||||
(setf wptr (%+ 1 wptr))))
|
||||
(values wptr rptr)))
|
||||
|
||||
(defmethod decode-sequence ((encoding (eql :utf-16-little-endian))
|
||||
in in-start in-end out out-start out-end eof?)
|
||||
;; -> new wptr, new rptr
|
||||
(let ((wptr out-start)
|
||||
(rptr in-start))
|
||||
(loop
|
||||
(when (%= wptr out-end)
|
||||
(return))
|
||||
(when (>= (%+ rptr 1) in-end)
|
||||
(return))
|
||||
(let ((lo (aref in (%+ 0 rptr)))
|
||||
(hi (aref in (%+ 1 rptr))))
|
||||
(setf rptr (%+ 2 rptr))
|
||||
(setf (aref out wptr) (logior (ash hi 8) lo))
|
||||
(setf wptr (%+ 1 wptr))))
|
||||
(values wptr rptr)))
|
||||
|
||||
(defmethod decode-sequence ((encoding (eql :utf-8))
|
||||
in in-start in-end out out-start out-end eof?)
|
||||
(declare (optimize (speed 3) (safety 0))
|
||||
(type (simple-array (unsigned-byte 8) (*)) in)
|
||||
(type (simple-array (unsigned-byte 16) (*)) out)
|
||||
(type fixnum in-start in-end out-start out-end))
|
||||
(let ((wptr out-start)
|
||||
(rptr in-start)
|
||||
byte0)
|
||||
(macrolet ((put (x)
|
||||
`((lambda (x)
|
||||
(cond ((or (<= #xD800 x #xDBFF)
|
||||
(<= #xDC00 x #xDFFF))
|
||||
(error "Encoding UTF-16 in UTF-8? : #x~x." x)))
|
||||
'(unless (data-char-p x)
|
||||
(error "#x~x is not a data character." x))
|
||||
;;(fresh-line)
|
||||
;;(prin1 x) (princ "-> ")
|
||||
(cond ((%> x #xFFFF)
|
||||
(setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
|
||||
(aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
|
||||
(setf wptr (%+ wptr 2)))
|
||||
(t
|
||||
(setf (aref out wptr) x)
|
||||
(setf wptr (%+ wptr 1)))))
|
||||
,x))
|
||||
(put1 (x)
|
||||
`(progn
|
||||
(setf (aref out wptr) ,x)
|
||||
(setf wptr (%+ wptr 1)))))
|
||||
(loop
|
||||
(when (%= (+ wptr 1) out-end) (return))
|
||||
(when (%>= rptr in-end) (return))
|
||||
(setq byte0 (aref in rptr))
|
||||
(cond ((= byte0 #x0D)
|
||||
;; CR handling
|
||||
;; we need to know the following character
|
||||
(cond ((>= (%+ rptr 1) in-end)
|
||||
;; no characters in buffer
|
||||
(cond (eof?
|
||||
;; at EOF, pass it as NL
|
||||
(put #x0A)
|
||||
(setf rptr (%+ rptr 1)))
|
||||
(t
|
||||
;; demand more characters
|
||||
(return))))
|
||||
((= (aref in (%+ rptr 1)) #x0A)
|
||||
;; we see CR NL, so forget this CR and the next NL will be
|
||||
;; inserted literally
|
||||
(setf rptr (%+ rptr 1)))
|
||||
(t
|
||||
;; singleton CR, pass it as NL
|
||||
(put #x0A)
|
||||
(setf rptr (%+ rptr 1)))))
|
||||
|
||||
((%<= #|#b00000000|# byte0 #b01111111)
|
||||
(put1 byte0)
|
||||
(setf rptr (%+ rptr 1)))
|
||||
|
||||
((%<= #|#b10000000|# byte0 #b10111111)
|
||||
(error "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)
|
||||
(setf rptr (%+ rptr 1)))
|
||||
|
||||
((%<= #|#b11000000|# byte0 #b11011111)
|
||||
(cond ((< (%+ rptr 2) in-end)
|
||||
(put
|
||||
(dpb (ldb (byte 5 0) byte0) (byte 5 6)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ rptr 1))) (byte 6 0)
|
||||
0)))
|
||||
(setf rptr (%+ rptr 2)))
|
||||
(t
|
||||
(return))))
|
||||
|
||||
((%<= #|#b11100000|# byte0 #b11101111)
|
||||
(cond ((< (%+ rptr 3) in-end)
|
||||
(put
|
||||
(dpb (ldb (byte 4 0) byte0) (byte 4 12)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 6)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 0)
|
||||
0))))
|
||||
(setf rptr (%+ rptr 3)))
|
||||
(t
|
||||
(return))))
|
||||
|
||||
((%<= #|#b11110000|# byte0 #b11110111)
|
||||
(cond ((< (%+ rptr 4) in-end)
|
||||
(put
|
||||
(dpb (ldb (byte 3 0) byte0) (byte 3 18)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 12)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 6)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 0)
|
||||
0)))))
|
||||
(setf rptr (%+ rptr 4)))
|
||||
(t
|
||||
(return))))
|
||||
|
||||
((%<= #|#b11111000|# byte0 #b11111011)
|
||||
(cond ((< (%+ rptr 5) in-end)
|
||||
(put
|
||||
(dpb (ldb (byte 2 0) byte0) (byte 2 24)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 18)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 12)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 6)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 0)
|
||||
0))))))
|
||||
(setf rptr (%+ rptr 5)))
|
||||
(t
|
||||
(return))))
|
||||
|
||||
((%<= #|#b11111100|# byte0 #b11111101)
|
||||
(cond ((< (%+ rptr 6) in-end)
|
||||
(put
|
||||
(dpb (ldb (byte 1 0) byte0) (byte 1 30)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 24)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 18)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 12)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 6)
|
||||
(dpb (ldb (byte 6 0) (aref in (%+ 5 rptr))) (byte 6 0)
|
||||
0)))))))
|
||||
(setf rptr (%+ rptr 6)))
|
||||
(t
|
||||
(return))))
|
||||
|
||||
(t
|
||||
(error "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) ))
|
||||
(values wptr rptr)) )
|
||||
|
||||
(defmethod encoding-p ((object (eql :utf-16-little-endian))) t)
|
||||
(defmethod encoding-p ((object (eql :utf-16-big-endian))) t)
|
||||
(defmethod encoding-p ((object (eql :utf-8))) t)
|
||||
|
||||
(defmethod encoding-p ((object encoding)) t)
|
||||
|
||||
(defmethod decode-sequence ((encoding simple-8-bit-encoding)
|
||||
in in-start in-end
|
||||
out out-start out-end
|
||||
eof?)
|
||||
(declare (optimize (speed 3) (safety 0))
|
||||
(type (simple-array (unsigned-byte 8) (*)) in)
|
||||
(type (simple-array (unsigned-byte 16) (*)) out)
|
||||
(type fixnum in-start in-end out-start out-end))
|
||||
(let ((wptr out-start)
|
||||
(rptr in-start)
|
||||
(byte 0)
|
||||
(table (slot-value encoding 'table)))
|
||||
(declare (type fixnum wptr rptr)
|
||||
(type (unsigned-byte 8) byte)
|
||||
(type (simple-array (unsigned-byte 16) (*)) table))
|
||||
(loop
|
||||
(when (%= wptr out-end) (return))
|
||||
(when (%>= rptr in-end) (return))
|
||||
(setq byte (aref in rptr))
|
||||
(cond ((= byte #x0D)
|
||||
;; CR handling
|
||||
;; we need to know the following character
|
||||
(cond ((>= (%+ rptr 1) in-end)
|
||||
;; no characters in buffer
|
||||
(cond (eof?
|
||||
;; at EOF, pass it as NL
|
||||
(setf (aref out wptr) #x0A)
|
||||
(setf wptr (%+ wptr 1))
|
||||
(setf rptr (%+ rptr 1)))
|
||||
(t
|
||||
;; demand more characters
|
||||
(return))))
|
||||
((= (aref in (%+ rptr 1)) #x0A)
|
||||
;; we see CR NL, so forget this CR and the next NL will be
|
||||
;; inserted literally
|
||||
(setf rptr (%+ rptr 1)))
|
||||
(t
|
||||
;; singleton CR, pass it as NL
|
||||
(setf (aref out wptr) #x0A)
|
||||
(setf wptr (%+ wptr 1))
|
||||
(setf rptr (%+ rptr 1)))))
|
||||
|
||||
(t
|
||||
(setf (aref out wptr) (aref table byte))
|
||||
(setf wptr (%+ wptr 1))
|
||||
(setf rptr (%+ rptr 1))) ))
|
||||
(values wptr rptr)))
|
||||
|
||||
;;;; ---------------------------------------------------------------------------
|
||||
;;;; Character sets
|
||||
;;;;
|
||||
|
||||
(defvar *charsets* (make-hash-table :test #'eq))
|
||||
|
||||
(defclass 8-bit-charset ()
|
||||
((name :initarg :name)
|
||||
(to-unicode-table
|
||||
:initarg :to-unicode-table
|
||||
:reader to-unicode-table)))
|
||||
|
||||
(defmacro define-8-bit-charset (name &rest codes)
|
||||
(assert (= 256 (length codes)))
|
||||
`(progn
|
||||
(setf (gethash ',name *charsets*)
|
||||
(make-instance '8-bit-charset
|
||||
:name ',name
|
||||
:to-unicode-table
|
||||
',(make-array 256
|
||||
:element-type '(unsigned-byte 16)
|
||||
:initial-contents codes)))
|
||||
',name))
|
||||
|
||||
(defun find-charset (name)
|
||||
(or (gethash name *charsets*)
|
||||
(error "There is no character set named ~S." name)))
|
||||
|
||||
Reference in New Issue
Block a user