git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@4453 4281704c-cde7-0310-8518-8e2dc76b1ff0
153 lines
6.8 KiB
Common Lisp
153 lines
6.8 KiB
Common Lisp
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/charmap.lisp,v 1.19 2009/09/17 19:17:30 edi Exp $
|
|
|
|
;;; An optimized representation of sets of characters.
|
|
|
|
;;; Copyright (c) 2008-2009, Dr. Edmund Weitz. All rights reserved.
|
|
|
|
;;; Redistribution and use in source and binary forms, with or without
|
|
;;; modification, are permitted provided that the following conditions
|
|
;;; are met:
|
|
|
|
;;; * Redistributions of source code must retain the above copyright
|
|
;;; notice, this list of conditions and the following disclaimer.
|
|
|
|
;;; * Redistributions in binary form must reproduce the above
|
|
;;; copyright notice, this list of conditions and the following
|
|
;;; disclaimer in the documentation and/or other materials
|
|
;;; provided with the distribution.
|
|
|
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
(in-package :cl-ppcre)
|
|
|
|
(defstruct (charmap (:constructor make-charmap%))
|
|
;; a bit vector mapping char codes to "booleans" (1 for set members,
|
|
;; 0 for others)
|
|
(vector #*0 :type simple-bit-vector)
|
|
;; the smallest character code of all characters in the set
|
|
(start 0 :type fixnum)
|
|
;; the upper (exclusive) bound of all character codes in the set
|
|
(end 0 :type fixnum)
|
|
;; the number of characters in the set, or NIL if this is unknown
|
|
(count nil :type (or fixnum null))
|
|
;; whether the charmap actually represents the complement of the set
|
|
(complementp nil :type boolean))
|
|
|
|
;; seems to be necessary for some Lisps like ClozureCL
|
|
(defmethod make-load-form ((map charmap) &optional environment)
|
|
(make-load-form-saving-slots map :environment environment))
|
|
|
|
(declaim (inline in-charmap-p))
|
|
(defun in-charmap-p (char charmap)
|
|
"Tests whether the character CHAR belongs to the set represented by CHARMAP."
|
|
(declare #.*standard-optimize-settings*)
|
|
(declare (character char) (charmap charmap))
|
|
(let* ((char-code (char-code char))
|
|
(char-in-vector-p
|
|
(let ((charmap-start (charmap-start charmap)))
|
|
(declare (fixnum charmap-start))
|
|
(and (<= charmap-start char-code)
|
|
(< char-code (the fixnum (charmap-end charmap)))
|
|
(= 1 (sbit (the simple-bit-vector (charmap-vector charmap))
|
|
(- char-code charmap-start)))))))
|
|
(cond ((charmap-complementp charmap) (not char-in-vector-p))
|
|
(t char-in-vector-p))))
|
|
|
|
(defun charmap-contents (charmap)
|
|
"Returns a list of all characters belonging to a character map.
|
|
Only works for non-complement charmaps."
|
|
(declare #.*standard-optimize-settings*)
|
|
(declare (charmap charmap))
|
|
(and (not (charmap-complementp charmap))
|
|
(loop for code of-type fixnum from (charmap-start charmap) to (charmap-end charmap)
|
|
for i across (the simple-bit-vector (charmap-vector charmap))
|
|
when (= i 1)
|
|
collect (code-char code))))
|
|
|
|
(defun make-charmap (start end test-function &optional complementp)
|
|
"Creates and returns a charmap representing all characters with
|
|
character codes in the interval [start end) that satisfy
|
|
TEST-FUNCTION. The COMPLEMENTP slot of the charmap is set to the
|
|
value of the optional argument, but this argument doesn't have an
|
|
effect on how TEST-FUNCTION is used."
|
|
(declare #.*standard-optimize-settings*)
|
|
(declare (fixnum start end))
|
|
(let ((vector (make-array (- end start) :element-type 'bit))
|
|
(count 0))
|
|
(declare (fixnum count))
|
|
(loop for code from start below end
|
|
for char = (code-char code)
|
|
for index from 0
|
|
when char do
|
|
(incf count)
|
|
(setf (sbit vector index) (if (funcall test-function char) 1 0)))
|
|
(make-charmap% :vector vector
|
|
:start start
|
|
:end end
|
|
;; we don't know for sure if COMPLEMENTP is true as
|
|
;; there isn't a necessary a character for each
|
|
;; integer below *REGEX-CHAR-CODE-LIMIT*
|
|
:count (and (not complementp) count)
|
|
;; make sure it's boolean
|
|
:complementp (not (not complementp)))))
|
|
|
|
(defun create-charmap-from-test-function (test-function start end)
|
|
"Creates and returns a charmap representing all characters with
|
|
character codes between START and END which satisfy TEST-FUNCTION.
|
|
Tries to find the smallest interval which is necessary to represent
|
|
the character set and uses the complement representation if that
|
|
helps."
|
|
(declare #.*standard-optimize-settings*)
|
|
(let (start-in end-in start-out end-out)
|
|
;; determine the smallest intervals containing the set and its
|
|
;; complement, [start-in, end-in) and [start-out, end-out) - first
|
|
;; the lower bound
|
|
(loop for code from start below end
|
|
for char = (code-char code)
|
|
until (and start-in start-out)
|
|
when (and char
|
|
(not start-in)
|
|
(funcall test-function char))
|
|
do (setq start-in code)
|
|
when (and char
|
|
(not start-out)
|
|
(not (funcall test-function char)))
|
|
do (setq start-out code))
|
|
(unless start-in
|
|
;; no character satisfied the test, so return a "pseudo" charmap
|
|
;; where IN-CHARMAP-P is always false
|
|
(return-from create-charmap-from-test-function
|
|
(make-charmap% :count 0)))
|
|
(unless start-out
|
|
;; no character failed the test, so return a "pseudo" charmap
|
|
;; where IN-CHARMAP-P is always true
|
|
(return-from create-charmap-from-test-function
|
|
(make-charmap% :complementp t)))
|
|
;; now determine upper bound
|
|
(loop for code from (1- end) downto start
|
|
for char = (code-char code)
|
|
until (and end-in end-out)
|
|
when (and char
|
|
(not end-in)
|
|
(funcall test-function char))
|
|
do (setq end-in (1+ code))
|
|
when (and char
|
|
(not end-out)
|
|
(not (funcall test-function char)))
|
|
do (setq end-out (1+ code)))
|
|
;; use the smaller interval
|
|
(cond ((<= (- end-in start-in) (- end-out start-out))
|
|
(make-charmap start-in end-in test-function))
|
|
(t (make-charmap start-out end-out (complement* test-function) t)))))
|