Use 21 bit characters on Lisp offering them.
This commit is contained in:
@ -15,19 +15,31 @@
|
|||||||
(let (#+sbcl (*compile-print* nil))
|
(let (#+sbcl (*compile-print* nil))
|
||||||
(call-next-method))))
|
(call-next-method))))
|
||||||
|
|
||||||
#-(or rune-is-character rune-is-integer)
|
|
||||||
(progn
|
(progn
|
||||||
(format t "~&;;; Checking for wide character support...")
|
(format t "~&;;; Checking for wide character support...")
|
||||||
(force-output)
|
(force-output)
|
||||||
(pushnew (dotimes (x 65536
|
(flet ((test (code)
|
||||||
(progn
|
(and (< code char-code-limit) (code-char code))))
|
||||||
(format t " ok, characters have at least 16 bits.~%")
|
(cond
|
||||||
:rune-is-character))
|
((not (test 50000))
|
||||||
(unless (or (<= #xD800 x #xDFFF)
|
|
||||||
(and (< x char-code-limit) (code-char x)))
|
|
||||||
(format t " no, reverting to octet strings.~%")
|
(format t " no, reverting to octet strings.~%")
|
||||||
(return :rune-is-integer)))
|
#+rune-is-character
|
||||||
*features*))
|
(error "conflicting unicode configuration. Please recompile.")
|
||||||
|
(pushnew :rune-is-integer *features*))
|
||||||
|
((code-char 70000)
|
||||||
|
(when (test #xD800)
|
||||||
|
(format t " WARNING: Lisp implementation doesn't use UTF-16, ~
|
||||||
|
but accepts surrogate code points.~%"))
|
||||||
|
(format t " yes, using code points.~%")
|
||||||
|
#+(or rune-is-integer rune-is-utf-16)
|
||||||
|
(error "conflicting unicode configuration. Please recompile.")
|
||||||
|
(pushnew :rune-is-character *features*))
|
||||||
|
(t
|
||||||
|
(format t " yes, using UTF-16.~%")
|
||||||
|
#+(or rune-is-integer (and rune-is-character (not rune-is-utf-16)))
|
||||||
|
(error "conflicting unicode configuration. Please recompile.")
|
||||||
|
(pushnew :rune-is-utf-16 *features*)
|
||||||
|
(pushnew :rune-is-character *features*)))))
|
||||||
|
|
||||||
#-rune-is-character
|
#-rune-is-character
|
||||||
(format t "~&;;; Building Closure with (UNSIGNED-BYTE 16) RUNES~%")
|
(format t "~&;;; Building Closure with (UNSIGNED-BYTE 16) RUNES~%")
|
||||||
|
|||||||
@ -1,5 +1,10 @@
|
|||||||
(in-package :runes-encoding)
|
(in-package :runes-encoding)
|
||||||
|
|
||||||
|
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||||
|
(defparameter +buffer-byte+
|
||||||
|
#+rune-is-utf-16 '(unsigned-byte 16)
|
||||||
|
#-rune-is-utf-16 '(unsigned-byte 32)))
|
||||||
|
|
||||||
(define-condition encoding-error (simple-error) ())
|
(define-condition encoding-error (simple-error) ())
|
||||||
|
|
||||||
(defun xerror (fmt &rest args)
|
(defun xerror (fmt &rest args)
|
||||||
@ -82,7 +87,7 @@
|
|||||||
|
|
||||||
(defun make-simple-8-bit-encoding (&key charset)
|
(defun make-simple-8-bit-encoding (&key charset)
|
||||||
(make-instance 'simple-8-bit-encoding
|
(make-instance 'simple-8-bit-encoding
|
||||||
:table (coerce (to-unicode-table charset) '(simple-array (unsigned-byte 16) (256)))))
|
:table (coerce (to-unicode-table charset) '(simple-array #.+buffer-byte+ (256)))))
|
||||||
|
|
||||||
;;;;;;;
|
;;;;;;;
|
||||||
|
|
||||||
@ -150,16 +155,30 @@
|
|||||||
(return))
|
(return))
|
||||||
(when (>= (%+ rptr 1) in-end)
|
(when (>= (%+ rptr 1) in-end)
|
||||||
(return))
|
(return))
|
||||||
(let ((hi (aref in rptr))
|
(let* ((hi (aref in rptr))
|
||||||
(lo (aref in (%+ 1 rptr))))
|
(lo (aref in (%+ 1 rptr)))
|
||||||
(setf rptr (%+ 2 rptr))
|
(x (logior (ash hi 8) lo)))
|
||||||
;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
|
|
||||||
;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
|
|
||||||
;; Haelfte fehlt!
|
|
||||||
(let ((x (logior (ash hi 8) lo)))
|
|
||||||
(when (or (eql x #xFFFE) (eql x #xFFFF))
|
(when (or (eql x #xFFFE) (eql x #xFFFF))
|
||||||
(xerror "not a valid code point: #x~X" x))
|
(xerror "not a valid code point: #x~X" x))
|
||||||
(setf (aref out wptr) x))
|
(when (<= #xDC00 x #xDFFF)
|
||||||
|
(xerror "unexpected high surrogate: #x~X" x))
|
||||||
|
(when (<= #xD800 x #xDBFF)
|
||||||
|
;; seen low surrogate, look for high surrogate now
|
||||||
|
(when (>= (%+ rptr 3) in-end)
|
||||||
|
(return))
|
||||||
|
(let* ((hi2 (aref in (%+ 2 rptr)))
|
||||||
|
(lo2 (aref in (%+ 3 rptr)))
|
||||||
|
(y (logior (ash hi2 8) lo2)))
|
||||||
|
(unless (<= #xDC00 x #xDFFF)
|
||||||
|
(xerror "expected a high surrogate but found: #x~X" x))
|
||||||
|
#-rune-is-utf-16
|
||||||
|
(progn
|
||||||
|
(setf x (logior (ash (%- x #xd7c0) 10) (%and y #x3FF)))
|
||||||
|
(setf rptr (%+ 2 rptr))))
|
||||||
|
;; end of surrogate handling
|
||||||
|
)
|
||||||
|
(setf (aref out wptr) x)
|
||||||
|
(setf rptr (%+ 2 rptr))
|
||||||
(setf wptr (%+ 1 wptr))))
|
(setf wptr (%+ 1 wptr))))
|
||||||
(values wptr rptr)))
|
(values wptr rptr)))
|
||||||
|
|
||||||
@ -173,16 +192,30 @@
|
|||||||
(return))
|
(return))
|
||||||
(when (>= (%+ rptr 1) in-end)
|
(when (>= (%+ rptr 1) in-end)
|
||||||
(return))
|
(return))
|
||||||
(let ((lo (aref in (%+ 0 rptr)))
|
(let* ((lo (aref in rptr))
|
||||||
(hi (aref in (%+ 1 rptr))))
|
(hi (aref in (%+ 1 rptr)))
|
||||||
(setf rptr (%+ 2 rptr))
|
(x (logior (ash hi 8) lo)))
|
||||||
;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
|
|
||||||
;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
|
|
||||||
;; Haelfte fehlt!
|
|
||||||
(let ((x (logior (ash hi 8) lo)))
|
|
||||||
(when (or (eql x #xFFFE) (eql x #xFFFF))
|
(when (or (eql x #xFFFE) (eql x #xFFFF))
|
||||||
(xerror "not a valid code point: #x~X" x))
|
(xerror "not a valid code point: #x~X" x))
|
||||||
(setf (aref out wptr) x))
|
(when (<= #xDC00 x #xDFFF)
|
||||||
|
(xerror "unexpected high surrogate: #x~X" x))
|
||||||
|
(when (<= #xD800 x #xDBFF)
|
||||||
|
;; seen low surrogate, look for high surrogate now
|
||||||
|
(when (>= (%+ rptr 3) in-end)
|
||||||
|
(return))
|
||||||
|
(let* ((lo2 (aref in (%+ 2 rptr)))
|
||||||
|
(hi2 (aref in (%+ 3 rptr)))
|
||||||
|
(y (logior (ash hi2 8) lo2)))
|
||||||
|
(unless (<= #xDC00 x #xDFFF)
|
||||||
|
(xerror "expected a high surrogate but found: #x~X" x))
|
||||||
|
#-rune-is-utf-16
|
||||||
|
(progn
|
||||||
|
(setf x (logior (ash (%- x #xd7c0) 10) (%and y #x3FF)))
|
||||||
|
(setf rptr (%+ 2 rptr))))
|
||||||
|
;; end of surrogate handling
|
||||||
|
)
|
||||||
|
(setf (aref out wptr) x)
|
||||||
|
(setf rptr (%+ 2 rptr))
|
||||||
(setf wptr (%+ 1 wptr))))
|
(setf wptr (%+ 1 wptr))))
|
||||||
(values wptr rptr)))
|
(values wptr rptr)))
|
||||||
|
|
||||||
@ -190,7 +223,8 @@
|
|||||||
in in-start in-end out out-start out-end eof?)
|
in in-start in-end out out-start out-end eof?)
|
||||||
(declare (optimize (speed 3) (safety 0))
|
(declare (optimize (speed 3) (safety 0))
|
||||||
(type (simple-array (unsigned-byte 8) (*)) in)
|
(type (simple-array (unsigned-byte 8) (*)) in)
|
||||||
(type (simple-array (unsigned-byte 16) (*)) out)
|
(type (simple-array #.+buffer-byte+ (*))
|
||||||
|
out)
|
||||||
(type fixnum in-start in-end out-start out-end))
|
(type fixnum in-start in-end out-start out-end))
|
||||||
(let ((wptr out-start)
|
(let ((wptr out-start)
|
||||||
(rptr in-start)
|
(rptr in-start)
|
||||||
@ -204,6 +238,7 @@
|
|||||||
(eql x #xFFFE)
|
(eql x #xFFFE)
|
||||||
(eql x #xFFFF))
|
(eql x #xFFFF))
|
||||||
(xerror "not a valid code point: #x~X" x))
|
(xerror "not a valid code point: #x~X" x))
|
||||||
|
#+rune-is-utf-16
|
||||||
((%> x #xFFFF)
|
((%> x #xFFFF)
|
||||||
(setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
|
(setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
|
||||||
(aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
|
(aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
|
||||||
@ -325,7 +360,7 @@
|
|||||||
eof?)
|
eof?)
|
||||||
(declare (optimize (speed 3) (safety 0))
|
(declare (optimize (speed 3) (safety 0))
|
||||||
(type (simple-array (unsigned-byte 8) (*)) in)
|
(type (simple-array (unsigned-byte 8) (*)) in)
|
||||||
(type (simple-array (unsigned-byte 16) (*)) out)
|
(type (simple-array #.+buffer-byte+ (*)) out)
|
||||||
(type fixnum in-start in-end out-start out-end))
|
(type fixnum in-start in-end out-start out-end))
|
||||||
(let ((wptr out-start)
|
(let ((wptr out-start)
|
||||||
(rptr in-start)
|
(rptr in-start)
|
||||||
@ -333,7 +368,7 @@
|
|||||||
(table (slot-value encoding 'table)))
|
(table (slot-value encoding 'table)))
|
||||||
(declare (type fixnum wptr rptr)
|
(declare (type fixnum wptr rptr)
|
||||||
(type (unsigned-byte 8) byte)
|
(type (unsigned-byte 8) byte)
|
||||||
(type (simple-array (unsigned-byte 16) (*)) table))
|
(type (simple-array #.+buffer-byte+ (*)) table))
|
||||||
(loop
|
(loop
|
||||||
(when (%= wptr out-end) (return))
|
(when (%= wptr out-end) (return))
|
||||||
(when (%>= rptr in-end) (return))
|
(when (%>= rptr in-end) (return))
|
||||||
@ -387,7 +422,7 @@
|
|||||||
:name ',name
|
:name ',name
|
||||||
:to-unicode-table
|
:to-unicode-table
|
||||||
',(make-array 256
|
',(make-array 256
|
||||||
:element-type '(unsigned-byte 16)
|
:element-type '#.+buffer-byte+
|
||||||
:initial-contents codes)))
|
:initial-contents codes)))
|
||||||
',name))
|
',name))
|
||||||
|
|
||||||
|
|||||||
@ -83,7 +83,10 @@
|
|||||||
`(unsigned-byte ,(integer-length array-total-size-limit)))
|
`(unsigned-byte ,(integer-length array-total-size-limit)))
|
||||||
|
|
||||||
(deftype buffer-byte ()
|
(deftype buffer-byte ()
|
||||||
`(unsigned-byte 16))
|
#+rune-is-utf-16
|
||||||
|
`(unsigned-byte 16)
|
||||||
|
#-rune-is-utf-16
|
||||||
|
`(unsigned-byte 32))
|
||||||
|
|
||||||
(deftype octet ()
|
(deftype octet ()
|
||||||
`(unsigned-byte 8))
|
`(unsigned-byte 8))
|
||||||
|
|||||||
13
ystream.lisp
13
ystream.lisp
@ -76,14 +76,17 @@
|
|||||||
(when (plusp ptr)
|
(when (plusp ptr)
|
||||||
(let* ((in (ystream-in-buffer ystream))
|
(let* ((in (ystream-in-buffer ystream))
|
||||||
(out (ystream-out-buffer ystream))
|
(out (ystream-out-buffer ystream))
|
||||||
|
#+rune-is-utf-16
|
||||||
(surrogatep (<= #xD800 (rune-code (elt in (1- ptr))) #xDBFF))
|
(surrogatep (<= #xD800 (rune-code (elt in (1- ptr))) #xDBFF))
|
||||||
n)
|
n)
|
||||||
|
#+rune-is-utf-16
|
||||||
(when surrogatep
|
(when surrogatep
|
||||||
(decf ptr))
|
(decf ptr))
|
||||||
(when (plusp ptr)
|
(when (plusp ptr)
|
||||||
(setf n (runes-to-utf8 out in ptr))
|
(setf n (runes-to-utf8 out in ptr))
|
||||||
(ystream-device-write ystream out n)
|
(ystream-device-write ystream out n)
|
||||||
(cond
|
(cond
|
||||||
|
#+rune-is-utf-16
|
||||||
(surrogatep
|
(surrogatep
|
||||||
(setf (elt in 0) (elt in (1- ptr)))
|
(setf (elt in 0) (elt in (1- ptr)))
|
||||||
(setf (ystream-in-ptr ystream) 1))
|
(setf (ystream-in-ptr ystream) 1))
|
||||||
@ -98,7 +101,7 @@
|
|||||||
|
|
||||||
(macrolet ((define-utf8-writer (name (byte &rest aux) result &body body)
|
(macrolet ((define-utf8-writer (name (byte &rest aux) result &body body)
|
||||||
`(defun ,name (out in n)
|
`(defun ,name (out in n)
|
||||||
(let ((high-surrogate nil)
|
(let (#+rune-is-utf-16 (high-surrogate nil)
|
||||||
,@aux)
|
,@aux)
|
||||||
(labels
|
(labels
|
||||||
((write0 (,byte)
|
((write0 (,byte)
|
||||||
@ -134,13 +137,19 @@
|
|||||||
(write0 (logior #b10000000 (ldb (byte 6 0) r))))))
|
(write0 (logior #b10000000 (ldb (byte 6 0) r))))))
|
||||||
(write2 (r)
|
(write2 (r)
|
||||||
(cond
|
(cond
|
||||||
|
#+rune-is-utf-16
|
||||||
((<= #xD800 r #xDBFF)
|
((<= #xD800 r #xDBFF)
|
||||||
(setf high-surrogate r))
|
(setf high-surrogate r))
|
||||||
|
#+rune-is-utf-16
|
||||||
((<= #xDC00 r #xDFFF)
|
((<= #xDC00 r #xDFFF)
|
||||||
(let ((q (logior (ash (- high-surrogate #xD7C0) 10)
|
(let ((q (logior (ash (- high-surrogate #xD7C0) 10)
|
||||||
(- r #xDC00))))
|
(- r #xDC00))))
|
||||||
(write1 q))
|
(write1 q))
|
||||||
(setf high-surrogate nil))
|
(setf high-surrogate nil))
|
||||||
|
#-rune-is-utf-16
|
||||||
|
((<= #xD800 r #xDFFF)
|
||||||
|
(error
|
||||||
|
"surrogates not allowed in this configuration"))
|
||||||
(t
|
(t
|
||||||
(write1 r)))))
|
(write1 r)))))
|
||||||
(dotimes (j n)
|
(dotimes (j n)
|
||||||
@ -259,7 +268,7 @@
|
|||||||
|
|
||||||
(defun utf8-string-to-rod (str)
|
(defun utf8-string-to-rod (str)
|
||||||
(let* ((bytes (map '(vector (unsigned-byte 8)) #'char-code str))
|
(let* ((bytes (map '(vector (unsigned-byte 8)) #'char-code str))
|
||||||
(buffer (make-array (length bytes) :element-type '(unsigned-byte 16)))
|
(buffer (make-array (length bytes) :element-type 'buffer-byte))
|
||||||
(n (runes-encoding:decode-sequence
|
(n (runes-encoding:decode-sequence
|
||||||
:utf-8 bytes 0 (length bytes) buffer 0 0 nil))
|
:utf-8 bytes 0 (length bytes) buffer 0 0 nil))
|
||||||
(result (make-array n :element-type 'rune)))
|
(result (make-array n :element-type 'rune)))
|
||||||
|
|||||||
Reference in New Issue
Block a user