Import 1.4.1 version of CL-PPCRE
git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@3577 4281704c-cde7-0310-8518-8e2dc76b1ff0
This commit is contained in:
227
charset.lisp
Executable file
227
charset.lisp
Executable file
@ -0,0 +1,227 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.4 2008/07/03 08:39:10 edi Exp $
|
||||
|
||||
;;; A specialized set implementation for characters by Nikodemus Siivola.
|
||||
|
||||
;;; Copyright (c) 2008, Nikodemus Siivola. All rights reserved.
|
||||
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(defconstant +probe-depth+ 3
|
||||
"Maximum number of collisions \(for any element) we accept before we
|
||||
allocate more storage. This is now fixed, but could be made to vary
|
||||
depending on the size of the storage vector \(e.g. in the range of
|
||||
1-4). Larger probe-depths mean more collisions are tolerated before
|
||||
the table grows, but increase the constant factor.")
|
||||
|
||||
(defun make-char-vector (size)
|
||||
"Returns a vector of size SIZE to hold characters. All elements are
|
||||
initialized to #\Null except for the first one which is initialized to
|
||||
#\?."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type (integer 2 #.(1- array-total-size-limit)) size))
|
||||
;; Since #\Null always hashes to 0, store something else there
|
||||
;; initially, and #\Null everywhere else
|
||||
(let ((result (make-array size
|
||||
:element-type #-:lispworks 'character #+:lispworks 'lw:simple-char
|
||||
:initial-element (code-char 0))))
|
||||
(setf (char result 0) #\?)
|
||||
result))
|
||||
|
||||
(defstruct (charset (:constructor make-charset))
|
||||
;; this is set to 0 when we stop hashing and just use a CHAR-CODE
|
||||
;; indexed vector
|
||||
(depth +probe-depth+ :type fixnum)
|
||||
;; the number of characters in this set
|
||||
(count 0 :type fixnum)
|
||||
;; the storage vector
|
||||
(vector (make-char-vector 12) :type (simple-array character (*))))
|
||||
|
||||
;; seems to be necessary for some Lisps like ClozureCL
|
||||
(defmethod make-load-form ((set charset) &optional environment)
|
||||
(make-load-form-saving-slots set :environment environment))
|
||||
|
||||
(declaim (inline mix))
|
||||
(defun mix (code hash)
|
||||
"Given a character code CODE and a hash code HASH, computes and
|
||||
returns the \"next\" hash code. See comments below."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; mixing the CHAR-CODE back in at each step makes sure that if two
|
||||
;; characters collide (their hashes end up pointing in the same
|
||||
;; storage vector index) on one round, they should (hopefully!) not
|
||||
;; collide on the next
|
||||
(sxhash (logand most-positive-fixnum (+ code hash))))
|
||||
|
||||
(declaim (inline compute-index))
|
||||
(defun compute-index (hash vector)
|
||||
"Computes and returns the index into the vector VECTOR corresponding
|
||||
to the hash code HASH."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(1+ (mod hash (1- (length vector)))))
|
||||
|
||||
(defun in-charset-p (char set)
|
||||
"Checks whether the character CHAR is in the charset SET."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (character char) (charset set))
|
||||
(let ((vector (charset-vector set))
|
||||
(depth (charset-depth set))
|
||||
(code (char-code char)))
|
||||
(declare (fixnum depth))
|
||||
;; As long as the set remains reasonably small, we use non-linear
|
||||
;; hashing - the first hash of any character is its CHAR-CODE, and
|
||||
;; subsequent hashes are computed by MIX above
|
||||
(cond ((or
|
||||
;; depth 0 is special - each char maps only to its code,
|
||||
;; nothing else
|
||||
(zerop depth)
|
||||
;; index 0 is special - only #\Null maps to it, no matter
|
||||
;; what the depth is
|
||||
(zerop code))
|
||||
(eq char (char vector code)))
|
||||
(t
|
||||
;; otherwise hash starts out as the character code, but
|
||||
;; maps to indexes 1-N
|
||||
(let ((hash code))
|
||||
(tagbody
|
||||
:retry
|
||||
(let* ((index (compute-index hash vector))
|
||||
(x (char vector index)))
|
||||
(cond ((eq x (code-char 0))
|
||||
;; empty, no need to probe further
|
||||
(return-from in-charset-p nil))
|
||||
((eq x char)
|
||||
;; got it
|
||||
(return-from in-charset-p t))
|
||||
((zerop (decf depth))
|
||||
;; max probe depth reached, nothing found
|
||||
(return-from in-charset-p nil))
|
||||
(t
|
||||
;; nothing yet, try next place
|
||||
(setf hash (mix code hash))
|
||||
(go :retry))))))))))
|
||||
|
||||
(defun add-to-charset (char set)
|
||||
"Adds the character CHAR to the charset SET, extending SET if
|
||||
necessary. Returns CHAR."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(or (%add-to-charset char set)
|
||||
(%add-to-charset/expand char set)
|
||||
(error "Oops, this should not happen..."))
|
||||
char)
|
||||
|
||||
(defun %add-to-charset (char set)
|
||||
"Tries to add the character CHAR to the charset SET without
|
||||
extending it. Returns NIL if this fails."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (character char) (charset set))
|
||||
(let ((vector (charset-vector set))
|
||||
(depth (charset-depth set))
|
||||
(code (char-code char)))
|
||||
(declare (fixnum depth))
|
||||
;; see comments in IN-CHARSET-P for algorithm
|
||||
(cond ((or (zerop depth) (zerop code))
|
||||
(setf (char vector code) char))
|
||||
(t
|
||||
(let ((hash code))
|
||||
(tagbody
|
||||
:retry
|
||||
(let* ((index (compute-index hash vector))
|
||||
(x (char vector index)))
|
||||
(cond ((eq x (code-char 0))
|
||||
(setf (char vector index) char)
|
||||
(incf (charset-count set))
|
||||
(return-from %add-to-charset char))
|
||||
((eq x char)
|
||||
(return-from %add-to-charset char))
|
||||
((zerop (decf depth))
|
||||
;; need to expand the table
|
||||
(return-from %add-to-charset nil))
|
||||
(t
|
||||
(setf hash (mix code hash))
|
||||
(go :retry))))))))))
|
||||
|
||||
(defun %add-to-charset/expand (char set)
|
||||
"Extends the charset SET and then adds the character CHAR to it."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (character char) (charset set))
|
||||
(let* ((old-vector (charset-vector set))
|
||||
(new-size (* 2 (length old-vector))))
|
||||
(tagbody
|
||||
:retry
|
||||
;; when the table grows large (currently over 1/3 of
|
||||
;; CHAR-CODE-LIMIT), we dispense with hashing and just allocate a
|
||||
;; storage vector with space for all characters, so that each
|
||||
;; character always uses only the CHAR-CODE
|
||||
(multiple-value-bind (new-depth new-vector)
|
||||
(if (>= new-size #.(truncate char-code-limit 3))
|
||||
(values 0 (make-char-vector char-code-limit))
|
||||
(values +probe-depth+ (make-char-vector new-size)))
|
||||
(setf (charset-depth set) new-depth
|
||||
(charset-vector set) new-vector)
|
||||
(flet ((try-add (x)
|
||||
(unless (%add-to-charset x set)
|
||||
(assert (not (zerop new-depth)))
|
||||
(setf new-size (* 2 new-size))
|
||||
(go :retry))))
|
||||
(try-add char)
|
||||
(dotimes (i (length old-vector))
|
||||
(let ((x (char old-vector i)))
|
||||
(if (eq x (code-char 0))
|
||||
(when (zerop i)
|
||||
(try-add x))
|
||||
(unless (zerop i)
|
||||
(try-add x))))))))
|
||||
t))
|
||||
|
||||
(defun all-characters (set)
|
||||
"Returns a list of all characters in the charset SET."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(loop with count = (charset-count set)
|
||||
with counter = 0
|
||||
for code below char-code-limit
|
||||
for char = (code-char code)
|
||||
while (< counter count)
|
||||
when (and char (in-charset-p char set))
|
||||
do (incf counter)
|
||||
and collect char))
|
||||
|
||||
(defun merge-set (set1 set2 &optional invertedp)
|
||||
"Returns the \"sum\" of two charsets. This is a destructive
|
||||
operation on SET1. If INVERTEDP is true, merges the \"inverse\" of
|
||||
SET2 into SET1 instead."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; we only consider values with character codes below
|
||||
;; *REGEX-CHAR-CODE-LIMIT*
|
||||
(loop for code of-type fixnum from 0 below *regex-char-code-limit*
|
||||
for char = (code-char code)
|
||||
when (and char (if invertedp
|
||||
(not (in-charset-p char set2))
|
||||
(in-charset-p char set2)))
|
||||
do (add-to-charset char set1))
|
||||
set1)
|
||||
|
||||
Reference in New Issue
Block a user