Update to current dev version

git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@3581 4281704c-cde7-0310-8518-8e2dc76b1ff0
This commit is contained in:
Edi Weitz
2008-07-23 11:44:08 +00:00
parent 2974af4010
commit 25c3dedeeb
37 changed files with 5443 additions and 6794 deletions

View File

@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.4 2008/07/03 08:39:10 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.9 2008/07/23 00:47:58 edi Exp $
;;; A specialized set implementation for characters by Nikodemus Siivola.
@ -30,7 +30,7 @@
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(in-package #:cl-ppcre)
(in-package :cl-ppcre)
(defconstant +probe-depth+ 3
"Maximum number of collisions \(for any element) we accept before we
@ -45,7 +45,7 @@ initialized to #\Null except for the first one which is initialized to
#\?."
(declare #.*standard-optimize-settings*)
(declare (type (integer 2 #.(1- array-total-size-limit)) size))
;; Since #\Null always hashes to 0, store something else there
;; since #\Null always hashes to 0, store something else there
;; initially, and #\Null everywhere else
(let ((result (make-array size
:element-type #-:lispworks 'character #+:lispworks 'lw:simple-char
@ -53,7 +53,7 @@ initialized to #\Null except for the first one which is initialized to
(setf (char result 0) #\?)
result))
(defstruct (charset (:constructor make-charset))
(defstruct (charset (:constructor make-charset ()))
;; this is set to 0 when we stop hashing and just use a CHAR-CODE
;; indexed vector
(depth +probe-depth+ :type fixnum)
@ -92,7 +92,7 @@ to the hash code HASH."
(depth (charset-depth set))
(code (char-code char)))
(declare (fixnum depth))
;; As long as the set remains reasonably small, we use non-linear
;; as long as the set remains reasonably small, we use non-linear
;; hashing - the first hash of any character is its CHAR-CODE, and
;; subsequent hashes are computed by MIX above
(cond ((or
@ -129,14 +129,15 @@ to the hash code HASH."
"Adds the character CHAR to the charset SET, extending SET if
necessary. Returns CHAR."
(declare #.*standard-optimize-settings*)
(or (%add-to-charset char set)
(or (%add-to-charset char set t)
(%add-to-charset/expand char set)
(error "Oops, this should not happen..."))
char)
(defun %add-to-charset (char set)
(defun %add-to-charset (char set count)
"Tries to add the character CHAR to the charset SET without
extending it. Returns NIL if this fails."
extending it. Returns NIL if this fails. Counts CHAR as new
if COUNT is true and it is added to SET."
(declare #.*standard-optimize-settings*)
(declare (character char) (charset set))
(let ((vector (charset-vector set))
@ -144,8 +145,12 @@ extending it. Returns NIL if this fails."
(code (char-code char)))
(declare (fixnum depth))
;; see comments in IN-CHARSET-P for algorithm
(cond ((or (zerop depth) (zerop code))
(setf (char vector code) char))
(cond ((or (zerop depth) (zerop code))
(unless (eq char (char vector code))
(setf (char vector code) char)
(when count
(incf (charset-count set))))
char)
(t
(let ((hash code))
(tagbody
@ -154,7 +159,8 @@ extending it. Returns NIL if this fails."
(x (char vector index)))
(cond ((eq x (code-char 0))
(setf (char vector index) char)
(incf (charset-count set))
(when count
(incf (charset-count set)))
(return-from %add-to-charset char))
((eq x char)
(return-from %add-to-charset char))
@ -184,7 +190,10 @@ extending it. Returns NIL if this fails."
(setf (charset-depth set) new-depth
(charset-vector set) new-vector)
(flet ((try-add (x)
(unless (%add-to-charset x set)
;; don't count - old characters are already accounted
;; for, and might count the new one multiple times as
;; well
(unless (%add-to-charset x set nil)
(assert (not (zerop new-depth)))
(setf new-size (* 2 new-size))
(go :retry))))
@ -196,32 +205,38 @@ extending it. Returns NIL if this fails."
(try-add x))
(unless (zerop i)
(try-add x))))))))
;; added and expanded, /now/ count the new character.
(incf (charset-count set))
t))
(defun all-characters (set)
"Returns a list of all characters in the charset SET."
(defun map-charset (function charset)
"Calls FUNCTION with all characters in SET. Returns NIL."
(declare #.*standard-optimize-settings*)
(loop with count = (charset-count set)
with counter = 0
for code below char-code-limit
for char = (code-char code)
while (< counter count)
when (and char (in-charset-p char set))
do (incf counter)
and collect char))
(declare (function function))
(let* ((n (charset-count charset))
(vector (charset-vector charset))
(size (length vector)))
;; see comments in IN-CHARSET-P for algorithm
(when (eq (code-char 0) (char vector 0))
(funcall function (code-char 0))
(decf n))
(loop for i from 1 below size
for char = (char vector i)
unless (eq (code-char 0) char) do
(funcall function char)
;; this early termination test should be worth it when
;; mapping across depth 0 charsets.
(when (zerop (decf n))
(return-from map-charset nil))))
nil)
(defun merge-set (set1 set2 &optional invertedp)
"Returns the \"sum\" of two charsets. This is a destructive
operation on SET1. If INVERTEDP is true, merges the \"inverse\" of
SET2 into SET1 instead."
(defun create-charset-from-test-function (test-function start end)
"Creates and returns a charset representing all characters with
character codes between START and END which satisfy TEST-FUNCTION."
(declare #.*standard-optimize-settings*)
;; we only consider values with character codes below
;; *REGEX-CHAR-CODE-LIMIT*
(loop for code of-type fixnum from 0 below *regex-char-code-limit*
(loop with charset = (make-charset)
for code from start below end
for char = (code-char code)
when (and char (if invertedp
(not (in-charset-p char set2))
(in-charset-p char set2)))
do (add-to-charset char set1))
set1)
when (and char (funcall test-function char))
do (add-to-charset char charset)
finally (return charset)))