Import 1.4.1 version of CL-PPCRE

git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@3577 4281704c-cde7-0310-8518-8e2dc76b1ff0
This commit is contained in:
Edi Weitz
2008-07-23 11:29:40 +00:00
parent bf6913769f
commit 2974af4010
25 changed files with 1907 additions and 1223 deletions

View File

@ -1,3 +1,61 @@
Version 1.4.1
2008-07-03
Skip non-characters in CREATE-RANGES-FROM-SET
Version 1.4.0
2008-07-03
Replaced hash tables with charsets (by Nikodemus Siivola)
Get rid of duplicates in REGEX-APROPOS(-LIST)
Version 1.3.3
2008-06-25
Let the Lisp decide how it wants to enlarge its hash tables
Fixed anchors for special variables in docs
Fixed typo in docs (thanks to Jason S. Cornez)
Version 1.3.2
2007-09-13
Updated docs and ChangeLog to be really in sync with 1.3.1 changes (thanks to S<>bastien Saint-Sevin)
Version 1.3.1
2007-08-24
Second return value for REGEX-REPLACE and REGEX-REPLACE-ALL (patch by Matthew Sachs)
Version 1.3.0
2007-03-24
Optional support for named registers (patch by Ondrej Svitek)
Version 1.2.19
2007-01-16
Fixed behaviour of look-behind in repeated scans (caught by RegexCoach user Hans Jud)
Version 1.2.18
2006-10-12
Changed default element type for LispWorks
Fixed documentation for REGEX-REPLACE-ALL
Version 1.2.17
2006-10-11
Fixed bug in DO-SCANS which affected anchors (caught by RegexCoach user Laurent Taupiac)
Update link for 'man perlre' (thanks to Ricardo Boccato Alves)
Version 1.2.16
2006-07-16
Added :ELEMENT-TYPE to REGEX-REPLACE(-ALL)
Version 1.2.15
2006-07-03
Added :REGEX tag to parse tree syntax (thanks to Fr<46>d<EFBFBD>ric Jolliton)
Version 1.2.14
2006-05-24
Added missing </code> tag in docs (thanks to Wojciech Kaczmarek)
Fixed IMPORT statement for LW
Version 1.2.13
2005-12-06
Fixed bug involving *REAL-START-POS* (caught by "tichy")
Version 1.2.12
2005-11-01
REGEX-APROPOS-AUX now also uses :INHERITED

325
api.lisp
View File

@ -1,9 +1,9 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.60 2005/11/01 09:51:01 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.79 2008/07/03 08:39:10 edi Exp $
;;; The external API for creating and using scanners.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -38,10 +38,11 @@
destructive)
(:documentation "Accepts a regular expression - either as a
parse-tree or as a string - and returns a scan closure which will scan
strings for this regular expression. The \"mode\" keyboard arguments
are equivalent to the imsx modifiers in Perl. If DESTRUCTIVE is not
NIL the function is allowed to destructively modify its first argument
\(but only if it's a parse tree)."))
strings for this regular expression and a list mapping registers to
their names \(NIL stands for unnamed ones). The \"mode\" keyboard
arguments are equivalent to the imsx modifiers in Perl. If DESTRUCTIVE
is not NIL the function is allowed to destructively modify its first
argument \(but only if it's a parse tree)."))
#-:use-acl-regexp2-engine
(defmethod create-scanner ((regex-string string) &key case-insensitive-mode
@ -104,7 +105,7 @@ NIL the function is allowed to destructively modify its first argument
(when flags
(setq parse-tree (list :group (cons :flags flags) parse-tree))))
(let ((*syntax-error-string* nil))
(multiple-value-bind (regex reg-num starts-with)
(multiple-value-bind (regex reg-num starts-with reg-names)
(convert parse-tree)
;; simplify REGEX by flattening nested SEQ and ALTERNATION
;; constructs and gathering STR objects
@ -151,30 +152,31 @@ NIL the function is allowed to destructively modify its first argument
(case-insensitive-p starts-with))))))
(declare (special end-string-offset end-anchored-p end-string))
;; now create the scanner and return it
(create-scanner-aux match-fn
(regex-min-length regex)
(or (start-anchored-p regex)
;; a dot in single-line-mode also
;; implicitely anchors the regex at
;; the start, i.e. if we can't match
;; from the first position we won't
;; match at all
(and (typep starts-with 'everything)
(single-line-p starts-with)))
starts-with
start-string-test
;; only mark regex as end-anchored if we
;; found a non-zero-length string before
;; the anchor
(and end-string-test end-anchored-p)
end-string-test
(if end-string-test
(len end-string)
nil)
end-string-offset
*rep-num*
*zero-length-num*
reg-num))))))
(values (create-scanner-aux match-fn
(regex-min-length regex)
(or (start-anchored-p regex)
;; a dot in single-line-mode also
;; implicitly anchors the regex at
;; the start, i.e. if we can't match
;; from the first position we won't
;; match at all
(and (typep starts-with 'everything)
(single-line-p starts-with)))
starts-with
start-string-test
;; only mark regex as end-anchored if we
;; found a non-zero-length string before
;; the anchor
(and end-string-test end-anchored-p)
end-string-test
(if end-string-test
(len end-string)
nil)
end-string-offset
*rep-num*
*zero-length-num*
reg-num)
reg-names))))))
#+:use-acl-regexp2-engine
(declaim (inline create-scanner))
@ -185,6 +187,7 @@ NIL the function is allowed to destructively modify its first argument
single-line-mode
extended-mode
destructive)
(declare #.*standard-optimize-settings*)
(declare (ignore destructive))
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
(signal-ppcre-invocation-error
@ -197,6 +200,7 @@ NIL the function is allowed to destructively modify its first argument
single-line-mode
extended-mode
destructive)
(declare #.*standard-optimize-settings*)
(declare (ignore destructive))
(excl:compile-re parse-tree
:case-fold case-insensitive-mode
@ -205,19 +209,22 @@ NIL the function is allowed to destructively modify its first argument
:single-line single-line-mode
:return :index))
(defgeneric scan (regex target-string &key start end)
(defgeneric scan (regex target-string &key start end real-start-pos)
(:documentation "Searches TARGET-STRING from START to END and tries
to match REGEX. On success returns four values - the start of the
to match REGEX. On success returns four values - the start of the
match, the end of the match, and two arrays denoting the beginnings
and ends of register matches. On failure returns NIL. REGEX can be a
and ends of register matches. On failure returns NIL. REGEX can be a
string which will be parsed according to Perl syntax, a parse tree, or
a pre-compiled scanner created by CREATE-SCANNER. TARGET-STRING will
be coerced to a simple string if it isn't one already."))
a pre-compiled scanner created by CREATE-SCANNER. TARGET-STRING will
be coerced to a simple string if it isn't one already. The
REAL-START-POS parameter should be ignored - it exists only for
internal purposes."))
#-:use-acl-regexp2-engine
(defmethod scan ((regex-string string) target-string
&key (start 0)
(end (length target-string)))
(end (length target-string))
((:real-start-pos *real-start-pos*) nil))
(declare #.*standard-optimize-settings*)
;; note that the scanners are optimized for simple strings so we
;; have to coerce TARGET-STRING into one if it isn't already
@ -228,7 +235,8 @@ be coerced to a simple string if it isn't one already."))
#-:use-acl-regexp2-engine
(defmethod scan ((scanner function) target-string
&key (start 0)
(end (length target-string)))
(end (length target-string))
((:real-start-pos *real-start-pos*) nil))
(declare #.*standard-optimize-settings*)
(funcall scanner
(maybe-coerce-to-simple-string target-string)
@ -237,7 +245,8 @@ be coerced to a simple string if it isn't one already."))
#-:use-acl-regexp2-engine
(defmethod scan ((parse-tree t) target-string
&key (start 0)
(end (length target-string)))
(end (length target-string))
((:real-start-pos *real-start-pos*) nil))
(declare #.*standard-optimize-settings*)
(funcall (create-scanner parse-tree)
(maybe-coerce-to-simple-string target-string)
@ -249,7 +258,9 @@ be coerced to a simple string if it isn't one already."))
#+:use-acl-regexp2-engine
(defmethod scan ((parse-tree t) target-string
&key (start 0)
(end (length target-string)))
(end (length target-string))
((:real-start-pos *real-start-pos*) nil))
(declare #.*standard-optimize-settings*)
(when (< end start)
(return-from scan nil))
(let ((results (multiple-value-list (excl:match-re parse-tree target-string
@ -274,9 +285,8 @@ be coerced to a simple string if it isn't one already."))
(define-compiler-macro scan (&whole form &environment env regex target-string &rest rest)
"Make sure that constant forms are compiled into scanners at compile time."
(cond ((constantp regex env)
`(scan (load-time-value
(create-scanner ,regex))
,target-string ,@rest))
`(scan (load-time-value (create-scanner ,regex))
,target-string ,@rest))
(t form)))
(defun scan-to-strings (regex target-string &key (start 0)
@ -309,8 +319,7 @@ structure with TARGET-STRING."
(&whole form &environment env regex target-string &rest rest)
"Make sure that constant forms are compiled into scanners at compile time."
(cond ((constantp regex env)
`(scan-to-strings (load-time-value
(create-scanner ,regex))
`(scan-to-strings (load-time-value (create-scanner ,regex))
,target-string ,@rest))
(t form)))
@ -366,54 +375,50 @@ terminate the loop immediately. If REGEX matches an empty string the
scan is continued one position behind this match. BODY may start with
declarations."
(with-rebinding (target-string)
(with-unique-names (%start %end %regex scanner loop-tag block-name)
(with-unique-names (%start %end %regex scanner)
(declare (ignorable %regex scanner))
;; the NIL BLOCK to enable exits via (RETURN ...)
`(block nil
(let* ((,%start (or ,start 0))
(*real-start-pos* ,%start)
(,%end (or ,end (length ,target-string)))
,@(unless (constantp regex env)
;; leave constant regular expressions as they are -
;; SCAN's compiler macro will take care of them;
;; otherwise create a scanner unless the regex is
;; already a function (otherwise SCAN will do this
;; on each iteration)
`((,%regex ,regex)
(,scanner (typecase ,%regex
(function ,%regex)
(t (create-scanner ,%regex)))))))
;; coerce TARGET-STRING to a simple string unless it is one
;; already (otherwise SCAN will do this on each iteration)
(setq ,target-string
(maybe-coerce-to-simple-string ,target-string))
;; a named BLOCK so we can exit the TAGBODY
(block ,block-name
(tagbody
,loop-tag
;; invoke SCAN and bind the returned values to the
;; provided variables
(multiple-value-bind
(,match-start ,match-end ,reg-starts ,reg-ends)
(scan ,(cond ((constantp regex env) regex)
(t scanner))
,target-string :start ,%start :end ,%end)
;; declare the variables to be IGNORABLE to prevent the
;; compiler from issuing warnings
(declare
(ignorable ,match-start ,match-end ,reg-starts ,reg-ends))
(unless ,match-start
;; stop iteration on first failure
(return-from ,block-name ,result-form))
;; execute BODY (wrapped in LOCALLY so it can start with
;; declarations)
(locally
,@body)
;; advance by one position if we had a zero-length match
(setq ,%start (if (= ,match-start ,match-end)
(1+ ,match-end)
,match-end)))
(go ,loop-tag))))))))
(let* ((,%start (or ,start 0))
(,%end (or ,end (length ,target-string)))
,@(unless (constantp regex env)
;; leave constant regular expressions as they are -
;; SCAN's compiler macro will take care of them;
;; otherwise create a scanner unless the regex is
;; already a function (otherwise SCAN will do this
;; on each iteration)
`((,%regex ,regex)
(,scanner (typecase ,%regex
(function ,%regex)
(t (create-scanner ,%regex)))))))
;; coerce TARGET-STRING to a simple string unless it is one
;; already (otherwise SCAN will do this on each iteration)
(setq ,target-string
(maybe-coerce-to-simple-string ,target-string))
(loop
;; invoke SCAN and bind the returned values to the
;; provided variables
(multiple-value-bind
(,match-start ,match-end ,reg-starts ,reg-ends)
(scan ,(cond ((constantp regex env) regex)
(t scanner))
,target-string :start ,%start :end ,%end
:real-start-pos (or ,start 0))
;; declare the variables to be IGNORABLE to prevent the
;; compiler from issuing warnings
(declare
(ignorable ,match-start ,match-end ,reg-starts ,reg-ends))
(unless ,match-start
;; stop iteration on first failure
(return ,result-form))
;; execute BODY (wrapped in LOCALLY so it can start with
;; declarations)
(locally
,@body)
;; advance by one position if we had a zero-length match
(setq ,%start (if (= ,match-start ,match-end)
(1+ ,match-end)
,match-end)))))))))
(defmacro do-matches ((match-start match-end regex
target-string
@ -523,9 +528,8 @@ the scan is continued one position behind this match."
"Make sure that constant forms are compiled into scanners at
compile time."
(cond ((constantp regex env)
`(all-matches (load-time-value
(create-scanner ,regex))
,@rest))
`(all-matches (load-time-value (create-scanner ,regex))
,@rest))
(t form)))
(defun all-matches-as-strings (regex target-string
@ -548,9 +552,8 @@ share structure with TARGET-STRING."
compile time."
(cond ((constantp regex env)
`(all-matches-as-strings
(load-time-value
(create-scanner ,regex))
,@rest))
(load-time-value (create-scanner ,regex))
,@rest))
(t form)))
(defun split (regex target-string
@ -628,8 +631,7 @@ TARGET-STRING."
(define-compiler-macro split (&whole form &environment env regex target-string &rest rest)
"Make sure that constant forms are compiled into scanners at compile time."
(cond ((constantp regex env)
`(split (load-time-value
(create-scanner ,regex))
`(split (load-time-value (create-scanner ,regex))
,target-string ,@rest))
(t form)))
@ -751,14 +753,17 @@ S-expression."))
#-:cormanlisp
(defmethod build-replacement-template ((replacement-function function))
(declare #.*standard-optimize-settings*)
(list replacement-function))
#-:cormanlisp
(defmethod build-replacement-template ((replacement-function-symbol symbol))
(declare #.*standard-optimize-settings*)
(list replacement-function-symbol))
#-:cormanlisp
(defmethod build-replacement-template ((replacement-list list))
(declare #.*standard-optimize-settings*)
replacement-list)
;;; Corman Lisp's methods can't be closures... :(
@ -816,17 +821,18 @@ S-expression."))
start end
match-start match-end
reg-starts reg-ends
simple-calls)
simple-calls
element-type)
(declare #.*standard-optimize-settings*)
"Accepts a replacement template and the current values from the
matching process in REGEX-REPLACE or REGEX-REPLACE-ALL and returns the
corresponding template."
corresponding string."
;; the upper exclusive bound of the register numbers in the regular
;; expression
(let ((reg-bound (if reg-starts
(array-dimension reg-starts 0)
0)))
(with-output-to-string (s)
(with-output-to-string (s nil :element-type element-type)
(loop for token in replacement-template
do (typecase token
(string
@ -901,8 +907,8 @@ corresponding template."
reg-starts reg-ends)))
s)))))))))
(defun replace-aux (target-string replacement pos-list reg-list
start end preserve-case simple-calls)
(defun replace-aux (target-string replacement pos-list reg-list start end
preserve-case simple-calls element-type)
(declare #.*standard-optimize-settings*)
"Auxiliary function used by REGEX-REPLACE and
REGEX-REPLACE-ALL. POS-LIST contains a list with the start and end
@ -910,7 +916,7 @@ positions of all matches while REG-LIST contains a list of arrays
representing the corresponding register start and end positions."
;; build the template once before we start the loop
(let ((replacement-template (build-replacement-template replacement)))
(with-output-to-string (s)
(with-output-to-string (s nil :element-type element-type)
;; loop through all matches and take the start and end of the
;; whole string into account
(loop for (from to) on (append (list start) pos-list (list end))
@ -925,7 +931,8 @@ representing the corresponding register start and end positions."
start end
from to
reg-starts reg-ends
simple-calls)
simple-calls
element-type)
nil)
while to
if replace
@ -942,14 +949,16 @@ representing the corresponding register start and end positions."
;; no replacement
do (write-string target-string s :start from :end to)))))
(defun regex-replace (regex target-string replacement
&key (start 0)
(defun regex-replace (regex target-string replacement &key
(start 0)
(end (length target-string))
preserve-case
simple-calls)
simple-calls
(element-type #+:lispworks 'lw:simple-char #-:lispworks 'character))
(declare #.*standard-optimize-settings*)
"Try to match TARGET-STRING between START and END against REGEX and
replace the first match with REPLACEMENT.
replace the first match with REPLACEMENT. Two values are returned;
the modified string, and T if REGEX matched or NIL otherwise.
REPLACEMENT can be a string which may contain the special substrings
\"\\&\" for the whole match, \"\\`\" for the part of TARGET-STRING
@ -973,34 +982,40 @@ representing register (1+ N) -, or a function designator.
If PRESERVE-CASE is true, the replacement will try to preserve the
case (all upper case, all lower case, or capitalized) of the
match. The result will always be a fresh string, even if REGEX doesn't
match."
match.
ELEMENT-TYPE is the element type of the resulting string."
(multiple-value-bind (match-start match-end reg-starts reg-ends)
(scan regex target-string :start start :end end)
(if match-start
(replace-aux target-string replacement
(list match-start match-end)
(list reg-starts reg-ends)
start end preserve-case simple-calls)
(subseq target-string start end))))
(values (replace-aux target-string replacement
(list match-start match-end)
(list reg-starts reg-ends)
start end preserve-case
simple-calls element-type)
t)
(values (subseq target-string start end)
nil))))
#-:cormanlisp
(define-compiler-macro regex-replace
(&whole form &environment env regex target-string replacement &rest rest)
"Make sure that constant forms are compiled into scanners at compile time."
(cond ((constantp regex env)
`(regex-replace (load-time-value
(create-scanner ,regex))
`(regex-replace (load-time-value (create-scanner ,regex))
,target-string ,replacement ,@rest))
(t form)))
(defun regex-replace-all (regex target-string replacement
&key (start 0)
(end (length target-string))
preserve-case
simple-calls)
(defun regex-replace-all (regex target-string replacement &key
(start 0)
(end (length target-string))
preserve-case
simple-calls
(element-type #+:lispworks 'lw:simple-char #-:lispworks 'character))
(declare #.*standard-optimize-settings*)
"Try to match TARGET-STRING between START and END against REGEX and
replace all matches with REPLACEMENT.
replace all matches with REPLACEMENT. Two values are returned; the
modified string, and T if REGEX matched or NIL otherwise.
REPLACEMENT can be a string which may contain the special substrings
\"\\&\" for the whole match, \"\\`\" for the part of TARGET-STRING
@ -1024,30 +1039,34 @@ representing register (1+ N) -, or a function designator.
If PRESERVE-CASE is true, the replacement will try to preserve the
case (all upper case, all lower case, or capitalized) of the
match. The result will always be a fresh string, even if REGEX doesn't
match."
match.
ELEMENT-TYPE is the element type of the resulting string."
(let ((pos-list '())
(reg-list '()))
(do-scans (match-start match-end reg-starts reg-ends regex target-string
nil
:start start :end end)
nil
:start start :end end)
(push match-start pos-list)
(push match-end pos-list)
(push reg-starts reg-list)
(push reg-ends reg-list))
(if pos-list
(replace-aux target-string replacement
(nreverse pos-list)
(nreverse reg-list)
start end preserve-case simple-calls)
(subseq target-string start end))))
(values (replace-aux target-string replacement
(nreverse pos-list)
(nreverse reg-list)
start end preserve-case
simple-calls element-type)
t)
(values (subseq target-string start end)
nil))))
#-:cormanlisp
(define-compiler-macro regex-replace-all
(&whole form &environment env regex target-string replacement &rest rest)
"Make sure that constant forms are compiled into scanners at compile time."
(cond ((constantp regex env)
`(regex-replace-all (load-time-value
(create-scanner ,regex))
`(regex-replace-all (load-time-value (create-scanner ,regex))
,target-string ,replacement ,@rest))
(t form)))
@ -1060,32 +1079,35 @@ which matches REGEX. Optionally evaluates and returns RETURN-FORM at
the end. If CASE-INSENSITIVE is true and REGEX isn't already a
scanner, a case-insensitive scanner is used."
(with-rebinding (regex)
(with-unique-names (scanner %packages next morep)
(with-unique-names (scanner %packages next morep hash)
`(let* ((,scanner (create-scanner ,regex
:case-insensitive-mode
(and ,case-insensitive
(not (functionp ,regex)))))
(,%packages (or ,packages
(list-all-packages))))
(list-all-packages)))
(,hash (make-hash-table :test #'eq)))
(with-package-iterator (,next ,%packages :external :internal :inherited)
(loop
(multiple-value-bind (,morep symbol)
(,next)
(unless ,morep
(return ,return-form))
(when (scan ,scanner (symbol-name symbol))
,@body))))))))
(unless (gethash symbol ,hash)
(when (scan ,scanner (symbol-name symbol))
(setf (gethash symbol ,hash) t)
,@body)))))))))
;;; The following two functions were provided by Karsten Poeck
#+:cormanlisp
(defmacro do-with-all-symbols ((variable package-packagelist) &body body)
(with-unique-names (pack-var iter-sym)
`(if (listp ,package-packagelist)
(dolist (,pack-var ,package-packagelist)
(do-symbols (,iter-sym ,pack-var)
(defmacro do-with-all-symbols ((variable package-or-packagelist) &body body)
(with-unique-names (pack-var)
`(if (listp ,package-or-packagelist)
(dolist (,pack-var ,package-or-packagelist)
(do-symbols (,variable ,pack-var)
,@body))
(do-symbols (,iter-sym ,package-packagelist)
(do-symbols (,variable ,package-or-packagelist)
,@body))))
#+:cormanlisp
@ -1097,16 +1119,19 @@ which matches REGEX. Optionally evaluates and returns RETURN-FORM at
the end. If CASE-INSENSITIVE is true and REGEX isn't already a
scanner, a case-insensitive scanner is used."
(with-rebinding (regex)
(with-unique-names (scanner %packages)
(with-unique-names (scanner %packages hash)
`(let* ((,scanner (create-scanner ,regex
:case-insensitive-mode
(and ,case-insensitive
(not (functionp ,regex)))))
(,%packages (or ,packages
(list-all-packages))))
(list-all-packages)))
(,hash (make-hash-table :test #'eq)))
(do-with-all-symbols (symbol ,%packages)
(when (scan ,scanner (symbol-name symbol))
,@body))
(unless (gethash symbol ,hash)
(when (scan ,scanner (symbol-name symbol))
(setf (gethash symbol ,hash) t)
,@body)))
,return-form))))
(defun regex-apropos-list (regex &optional packages &key (case-insensitive t))
@ -1148,8 +1173,8 @@ meaningful information about a symbol."
(push (format nil "[constant]~:[~; value: ~S~]"
(boundp symbol) (symbol-value symbol)) output-list))
((boundp symbol)
(push #+(or LispWorks CLISP) "[variable]"
#-(or LispWorks CLISP) (format nil "[variable] value: ~S"
(push #+(or :lispworks :clisp) "[variable]"
#-(or :lispworks :clisp) (format nil "[variable] value: ~S"
(symbol-value symbol))
output-list)))
#-(or :cormanlisp :clisp)
@ -1160,7 +1185,7 @@ meaningful information about a symbol."
(condition ()
;; this seems to be necessary due to some errors I encountered
;; with LispWorks
(format t "~&~S [an error occured while trying to print more info]" symbol))))
(format t "~&~S [an error occurred while trying to print more info]" symbol))))
(defun regex-apropos (regex &optional packages &key (case-insensitive t))
"Similar to the standard function APROPOS but returns a list of all

227
charset.lisp Executable file
View File

@ -0,0 +1,227 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.4 2008/07/03 08:39:10 edi Exp $
;;; A specialized set implementation for characters by Nikodemus Siivola.
;;; Copyright (c) 2008, Nikodemus Siivola. All rights reserved.
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:
;;; * Redistributions of source code must retain the above copyright
;;; notice, this list of conditions and the following disclaimer.
;;; * Redistributions in binary form must reproduce the above
;;; copyright notice, this list of conditions and the following
;;; disclaimer in the documentation and/or other materials
;;; provided with the distribution.
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(in-package #:cl-ppcre)
(defconstant +probe-depth+ 3
"Maximum number of collisions \(for any element) we accept before we
allocate more storage. This is now fixed, but could be made to vary
depending on the size of the storage vector \(e.g. in the range of
1-4). Larger probe-depths mean more collisions are tolerated before
the table grows, but increase the constant factor.")
(defun make-char-vector (size)
"Returns a vector of size SIZE to hold characters. All elements are
initialized to #\Null except for the first one which is initialized to
#\?."
(declare #.*standard-optimize-settings*)
(declare (type (integer 2 #.(1- array-total-size-limit)) size))
;; Since #\Null always hashes to 0, store something else there
;; initially, and #\Null everywhere else
(let ((result (make-array size
:element-type #-:lispworks 'character #+:lispworks 'lw:simple-char
:initial-element (code-char 0))))
(setf (char result 0) #\?)
result))
(defstruct (charset (:constructor make-charset))
;; this is set to 0 when we stop hashing and just use a CHAR-CODE
;; indexed vector
(depth +probe-depth+ :type fixnum)
;; the number of characters in this set
(count 0 :type fixnum)
;; the storage vector
(vector (make-char-vector 12) :type (simple-array character (*))))
;; seems to be necessary for some Lisps like ClozureCL
(defmethod make-load-form ((set charset) &optional environment)
(make-load-form-saving-slots set :environment environment))
(declaim (inline mix))
(defun mix (code hash)
"Given a character code CODE and a hash code HASH, computes and
returns the \"next\" hash code. See comments below."
(declare #.*standard-optimize-settings*)
;; mixing the CHAR-CODE back in at each step makes sure that if two
;; characters collide (their hashes end up pointing in the same
;; storage vector index) on one round, they should (hopefully!) not
;; collide on the next
(sxhash (logand most-positive-fixnum (+ code hash))))
(declaim (inline compute-index))
(defun compute-index (hash vector)
"Computes and returns the index into the vector VECTOR corresponding
to the hash code HASH."
(declare #.*standard-optimize-settings*)
(1+ (mod hash (1- (length vector)))))
(defun in-charset-p (char set)
"Checks whether the character CHAR is in the charset SET."
(declare #.*standard-optimize-settings*)
(declare (character char) (charset set))
(let ((vector (charset-vector set))
(depth (charset-depth set))
(code (char-code char)))
(declare (fixnum depth))
;; As long as the set remains reasonably small, we use non-linear
;; hashing - the first hash of any character is its CHAR-CODE, and
;; subsequent hashes are computed by MIX above
(cond ((or
;; depth 0 is special - each char maps only to its code,
;; nothing else
(zerop depth)
;; index 0 is special - only #\Null maps to it, no matter
;; what the depth is
(zerop code))
(eq char (char vector code)))
(t
;; otherwise hash starts out as the character code, but
;; maps to indexes 1-N
(let ((hash code))
(tagbody
:retry
(let* ((index (compute-index hash vector))
(x (char vector index)))
(cond ((eq x (code-char 0))
;; empty, no need to probe further
(return-from in-charset-p nil))
((eq x char)
;; got it
(return-from in-charset-p t))
((zerop (decf depth))
;; max probe depth reached, nothing found
(return-from in-charset-p nil))
(t
;; nothing yet, try next place
(setf hash (mix code hash))
(go :retry))))))))))
(defun add-to-charset (char set)
"Adds the character CHAR to the charset SET, extending SET if
necessary. Returns CHAR."
(declare #.*standard-optimize-settings*)
(or (%add-to-charset char set)
(%add-to-charset/expand char set)
(error "Oops, this should not happen..."))
char)
(defun %add-to-charset (char set)
"Tries to add the character CHAR to the charset SET without
extending it. Returns NIL if this fails."
(declare #.*standard-optimize-settings*)
(declare (character char) (charset set))
(let ((vector (charset-vector set))
(depth (charset-depth set))
(code (char-code char)))
(declare (fixnum depth))
;; see comments in IN-CHARSET-P for algorithm
(cond ((or (zerop depth) (zerop code))
(setf (char vector code) char))
(t
(let ((hash code))
(tagbody
:retry
(let* ((index (compute-index hash vector))
(x (char vector index)))
(cond ((eq x (code-char 0))
(setf (char vector index) char)
(incf (charset-count set))
(return-from %add-to-charset char))
((eq x char)
(return-from %add-to-charset char))
((zerop (decf depth))
;; need to expand the table
(return-from %add-to-charset nil))
(t
(setf hash (mix code hash))
(go :retry))))))))))
(defun %add-to-charset/expand (char set)
"Extends the charset SET and then adds the character CHAR to it."
(declare #.*standard-optimize-settings*)
(declare (character char) (charset set))
(let* ((old-vector (charset-vector set))
(new-size (* 2 (length old-vector))))
(tagbody
:retry
;; when the table grows large (currently over 1/3 of
;; CHAR-CODE-LIMIT), we dispense with hashing and just allocate a
;; storage vector with space for all characters, so that each
;; character always uses only the CHAR-CODE
(multiple-value-bind (new-depth new-vector)
(if (>= new-size #.(truncate char-code-limit 3))
(values 0 (make-char-vector char-code-limit))
(values +probe-depth+ (make-char-vector new-size)))
(setf (charset-depth set) new-depth
(charset-vector set) new-vector)
(flet ((try-add (x)
(unless (%add-to-charset x set)
(assert (not (zerop new-depth)))
(setf new-size (* 2 new-size))
(go :retry))))
(try-add char)
(dotimes (i (length old-vector))
(let ((x (char old-vector i)))
(if (eq x (code-char 0))
(when (zerop i)
(try-add x))
(unless (zerop i)
(try-add x))))))))
t))
(defun all-characters (set)
"Returns a list of all characters in the charset SET."
(declare #.*standard-optimize-settings*)
(loop with count = (charset-count set)
with counter = 0
for code below char-code-limit
for char = (code-char code)
while (< counter count)
when (and char (in-charset-p char set))
do (incf counter)
and collect char))
(defun merge-set (set1 set2 &optional invertedp)
"Returns the \"sum\" of two charsets. This is a destructive
operation on SET1. If INVERTEDP is true, merges the \"inverse\" of
SET2 into SET1 instead."
(declare #.*standard-optimize-settings*)
;; we only consider values with character codes below
;; *REGEX-CHAR-CODE-LIMIT*
(loop for code of-type fixnum from 0 below *regex-char-code-limit*
for char = (code-char code)
when (and char (if invertedp
(not (in-charset-p char set2))
(in-charset-p char set2)))
do (add-to-charset char set1))
set1)

View File

@ -1,9 +1,9 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.asd,v 1.8 2005/11/01 09:51:01 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.asd,v 1.14 2008/06/25 14:04:27 edi Exp $
;;; This ASDF system definition was kindly provided by Marco Baringer.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -29,13 +29,6 @@
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(defpackage #:cl-ppcre-test.system
(:use #:cl
#:asdf))
(in-package #:cl-ppcre-test.system)
(defsystem #:cl-ppcre-test
:version "1.2.12"
:depends-on (#:cl-ppcre)
(asdf:defsystem :cl-ppcre-test
:depends-on (:cl-ppcre)
:components ((:file "ppcre-tests")))

View File

@ -1,7 +1,7 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.system,v 1.9 2005/04/01 21:29:09 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.system,v 1.11 2007/01/01 23:43:10 edi Exp $
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions

View File

@ -1,9 +1,9 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.12 2005/11/01 09:51:01 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.30 2008/07/03 10:06:15 edi Exp $
;;; This ASDF system definition was kindly provided by Marco Baringer.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -29,17 +29,12 @@
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(defpackage #:cl-ppcre.system
(:use #:cl
#:asdf))
(in-package #:cl-ppcre.system)
(defsystem #:cl-ppcre
:version "1.2.12"
(asdf:defsystem :cl-ppcre
:version "1.4.1"
:serial t
:components ((:file "packages")
(:file "specials")
(:file "charset")
(:file "util")
(:file "errors")
#-:use-acl-regexp2-engine

View File

@ -1,7 +1,7 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.system,v 1.11 2005/04/01 21:29:09 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.system,v 1.13 2007/01/01 23:43:10 edi Exp $
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions

View File

@ -1,10 +1,10 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.29 2005/05/16 16:29:23 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.36 2008/07/03 07:44:06 edi Exp $
;;; Here we create the closures which together build the final
;;; scanner.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -63,6 +63,7 @@ START-POS, and tests whether REGEX can match *STRING* at START-POS
such that the call to NEXT-FN after the match would succeed."))
(defmethod create-matcher-aux ((seq seq) next-fn)
(declare #.*standard-optimize-settings*)
;; the closure for a SEQ is a chain of closures for the elements of
;; this sequence which call each other in turn; the last closure
;; calls NEXT-FN
@ -72,6 +73,7 @@ such that the call to NEXT-FN after the match would succeed."))
finally (return next-matcher)))
(defmethod create-matcher-aux ((alternation alternation) next-fn)
(declare #.*standard-optimize-settings*)
;; first create closures for all alternations of ALTERNATION
(let ((all-matchers (mapcar #'(lambda (choice)
(create-matcher-aux choice next-fn))
@ -84,6 +86,7 @@ such that the call to NEXT-FN after the match would succeed."))
thereis (funcall (the function matcher) start-pos)))))
(defmethod create-matcher-aux ((register register) next-fn)
(declare #.*standard-optimize-settings*)
;; the position of this REGISTER within the whole regex; we start to
;; count at 0
(let ((num (num register)))
@ -122,6 +125,7 @@ such that the call to NEXT-FN after the match would succeed."))
next-pos)))))))
(defmethod create-matcher-aux ((lookahead lookahead) next-fn)
(declare #.*standard-optimize-settings*)
;; create a closure which just checks for the inner regex and
;; doesn't care about NEXT-FN
(let ((test-matcher (create-matcher-aux (regex lookahead) #'identity)))
@ -139,6 +143,7 @@ such that the call to NEXT-FN after the match would succeed."))
(funcall next-fn start-pos))))))
(defmethod create-matcher-aux ((lookbehind lookbehind) next-fn)
(declare #.*standard-optimize-settings*)
(let ((len (len lookbehind))
;; create a closure which just checks for the inner regex and
;; doesn't care about NEXT-FN
@ -150,14 +155,14 @@ such that the call to NEXT-FN after the match would succeed."))
;; far enough from the start of *STRING*), then call NEXT-FN
(lambda (start-pos)
(declare (type fixnum start-pos))
(and (>= (- start-pos *start-pos*) len)
(and (>= (- start-pos (or *real-start-pos* *start-pos*)) len)
(funcall test-matcher (- start-pos len))
(funcall next-fn start-pos)))
;; negative look-behind: check failure of inner regex (if we're
;; far enough from the start of *STRING*), then call NEXT-FN
(lambda (start-pos)
(declare (type fixnum start-pos))
(and (or (< start-pos len)
(and (or (< (- start-pos (or *real-start-pos* *start-pos*)) len)
(not (funcall test-matcher (- start-pos len))))
(funcall next-fn start-pos))))))
@ -172,55 +177,54 @@ against CHR-EXPR."
(subst new '(char-class-test) body
:test #'equalp)))
`(let* ((,%char-class ,char-class)
(hash (hash ,%char-class))
(count (if hash
(hash-table-count hash)
(set (charset ,%char-class))
(count (if set
(charset-count set)
most-positive-fixnum))
;; collect a list of "all" characters in the hash if
;; collect a list of "all" characters in the set if
;; there aren't more than two
(key-list (if (<= count 2)
(loop for chr being the hash-keys of hash
collect chr)
nil))
(all-chars (if (<= count 2)
(all-characters set)
nil))
downcasedp)
(declare (type fixnum count))
;; check if we can partition the hash into three ranges (or
;; check if we can partition the charset into three ranges (or
;; less)
(multiple-value-bind (min1 max1 min2 max2 min3 max3)
(create-ranges-from-hash hash)
(create-ranges-from-set set)
;; if that didn't work and CHAR-CLASS is case-insensitive we
;; try it again with every character downcased
(when (and (not min1)
(case-insensitive-p ,%char-class))
(multiple-value-setq (min1 max1 min2 max2 min3 max3)
(create-ranges-from-hash hash :downcasep t))
(create-ranges-from-set set :downcasep t))
(setq downcasedp t))
(cond ((= count 1)
;; hash contains exactly one character so we just
;; charset contains exactly one character so we just
;; check for this single character; (note that this
;; actually can't happen because this case is
;; optimized away in CONVERT already...)
(let ((chr1 (first key-list)))
(let ((chr1 (first all-chars)))
,@(substitute-char-class-tester
`(char= ,chr-expr chr1))))
((= count 2)
;; hash contains exactly two characters
(let ((chr1 (first key-list))
(chr2 (second key-list)))
;; set contains exactly two characters
(let ((chr1 (first all-chars))
(chr2 (second all-chars)))
,@(substitute-char-class-tester
`(let ((chr ,chr-expr))
(or (char= chr chr1)
(char= chr chr2))))))
((word-char-class-p ,%char-class)
;; special-case: hash is \w, \W, [\w], [\W] or
;; special-case: set is \w, \W, [\w], [\W] or
;; something equivalent
,@(substitute-char-class-tester
`(word-char-p ,chr-expr)))
((= count *regex-char-code-limit*)
;; according to the ANSI standard we might have all
;; possible characters in the hash even if it
;; doesn't contain CHAR-CODE-LIMIT characters but
;; this doesn't seem to be the case for current
;; possible characters in the set even if it doesn't
;; contain CHAR-CODE-LIMIT characters but this
;; doesn't seem to be the case for current
;; implementations (also note that this optimization
;; implies that you must not have characters with
;; character codes beyond *REGEX-CHAR-CODE-LIMIT* in
@ -264,17 +268,13 @@ against CHR-EXPR."
`(char<= min1 ,chr-expr max1)))
(t
;; the general case; note that most of the above
;; "optimizations" are based on experiences and
;; benchmarks with CMUCL - if you're really
;; concerned with speed you might find out that the
;; general case is almost always the best one for
;; other implementations (because the speed of their
;; hash-table access in relation to other operations
;; might be better than in CMUCL)
;; "optimizations" are based on early (2002)
;; experiences and benchmarks with CMUCL
,@(substitute-char-class-tester
`(gethash ,chr-expr hash)))))))))
`(in-charset-p ,chr-expr set)))))))))
(defmethod create-matcher-aux ((char-class char-class) next-fn)
(declare #.*standard-optimize-settings*)
(declare (type function next-fn))
;; insert a test against the current character within *STRING*
(insert-char-class-tester (char-class (schar *string* start-pos))
@ -291,6 +291,7 @@ against CHR-EXPR."
(funcall next-fn (1+ start-pos)))))))
(defmethod create-matcher-aux ((str str) next-fn)
(declare #.*standard-optimize-settings*)
(declare (type fixnum *end-string-pos*)
(type function next-fn)
;; this special value is set by CREATE-SCANNER when the
@ -405,6 +406,7 @@ against CHR-EXPR."
(word-char-p (schar *string* start-pos)))))))
(defmethod create-matcher-aux ((word-boundary word-boundary) next-fn)
(declare #.*standard-optimize-settings*)
(declare (type function next-fn))
(if (negatedp word-boundary)
(lambda (start-pos)
@ -415,6 +417,7 @@ against CHR-EXPR."
(funcall next-fn start-pos)))))
(defmethod create-matcher-aux ((everything everything) next-fn)
(declare #.*standard-optimize-settings*)
(declare (type function next-fn))
(if (single-line-p everything)
;; closure for single-line-mode: we really match everything, so we
@ -432,11 +435,12 @@ against CHR-EXPR."
(funcall next-fn (1+ start-pos))))))
(defmethod create-matcher-aux ((anchor anchor) next-fn)
(declare #.*standard-optimize-settings*)
(declare (type function next-fn))
(let ((startp (startp anchor))
(multi-line-p (multi-line-p anchor)))
(cond ((no-newline-p anchor)
;; this must be and end-anchor and it must be modeless, so
;; this must be an end-anchor and it must be modeless, so
;; we just have to check whether START-POS equals
;; *END-POS*
(lambda (start-pos)
@ -486,6 +490,7 @@ against CHR-EXPR."
(funcall next-fn start-pos)))))))
(defmethod create-matcher-aux ((back-reference back-reference) next-fn)
(declare #.*standard-optimize-settings*)
(declare (type function next-fn))
;; the position of the corresponding REGISTER within the whole
;; regex; we start to count at 0
@ -525,6 +530,7 @@ against CHR-EXPR."
(funcall next-fn next-pos)))))))))
(defmethod create-matcher-aux ((branch branch) next-fn)
(declare #.*standard-optimize-settings*)
(let* ((test (test branch))
(then-matcher (create-matcher-aux (then-regex branch) next-fn))
(else-matcher (create-matcher-aux (else-regex branch) next-fn)))
@ -545,6 +551,7 @@ against CHR-EXPR."
(funcall else-matcher start-pos))))))))
(defmethod create-matcher-aux ((standalone standalone) next-fn)
(declare #.*standard-optimize-settings*)
(let ((inner-matcher (create-matcher-aux (regex standalone) #'identity)))
(declare (type function next-fn inner-matcher))
(lambda (start-pos)
@ -553,6 +560,7 @@ against CHR-EXPR."
(funcall next-fn next-pos))))))
(defmethod create-matcher-aux ((filter filter) next-fn)
(declare #.*standard-optimize-settings*)
(let ((fn (fn filter)))
(lambda (start-pos)
(let ((next-pos (funcall fn start-pos)))
@ -560,5 +568,6 @@ against CHR-EXPR."
(funcall next-fn next-pos))))))
(defmethod create-matcher-aux ((void void) next-fn)
(declare #.*standard-optimize-settings*)
;; optimize away VOIDs: don't create a closure, just return NEXT-FN
next-fn)

File diff suppressed because it is too large Load Diff

View File

@ -3,21 +3,30 @@
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<title>CL-PPCRE - portable Perl-compatible regular expressions for Common Lisp</title>
<title>CL-PPCRE - Portable Perl-compatible regular expressions for Common Lisp</title>
<style type="text/css">
pre { padding:5px; background-color:#e0e0e0 }
a { text-decoration: none; }
a.none:hover { border:1px solid white; }
a { border:1px solid white; }
a:hover { border: 1px solid black; }
a.noborder { border:0px }
a.noborder:hover { border:0px }
h3, h4 { text-decoration: underline; }
a { text-decoration: none; padding: 1px 2px 1px 2px; }
a:visited { text-decoration: none; padding: 1px 2px 1px 2px; }
a:hover { text-decoration: none; padding: 1px 1px 1px 1px; border: 1px solid #000000; }
a:focus { text-decoration: none; padding: 1px 2px 1px 2px; border: none; }
a.none { text-decoration: none; padding: 0; }
a.none:visited { text-decoration: none; padding: 0; }
a.none:hover { text-decoration: none; border: none; padding: 0; }
a.none:focus { text-decoration: none; border: none; padding: 0; }
a.noborder { text-decoration: none; padding: 0; }
a.noborder:visited { text-decoration: none; padding: 0; }
a.noborder:hover { text-decoration: none; border: none; padding: 0; }
a.noborder:focus { text-decoration: none; border: none; padding: 0; }
pre.none { padding:5px; background-color:#ffffff }
</style>
<meta name="description" content="Fast and portable perl-compatible regular expressions for Common Lisp.">
</head>
<body bgcolor=white>
<h2>CL-PPCRE - portable Perl-compatible regular expressions for Common Lisp</h2>
<h2>CL-PPCRE - Portable Perl-compatible regular expressions for Common Lisp</h2>
<blockquote>
<br>&nbsp;<br><h3>Abstract</h3>
@ -29,24 +38,20 @@ which has the following features:
<li>It is <b>compatible with Perl</b>. (Well - as far as you can be
compatible with a language defined by its ever-changing
implementation. Currently, as of December 2002, CL-PPCRE is more
compatible with the regex semantics of Perl&nbsp;5.8.0 than, say,
Perl&nbsp;5.6.1 is...:) It even correctly parses and applies <a
href="http://www.oreilly.com/catalog/regex2/">Jeffrey Friedl's</a>
famous 6600-byte long RFC822 address pattern.
implementation. As of December 2002 CL-PPCRE was <a href="#perl">more
compatible</a> with the regex semantics of Perl&nbsp;5.8.0 than, say,
Perl&nbsp;5.6.1 was...) It even correctly parses and
applies <a href="http://www.oreilly.com/catalog/regex2/">Jeffrey
Friedl's</a> famous 6600-byte long RFC822 address pattern.
<li>It is <b>fast</b>. If compiled with <a
href="http://www.cons.org/cmucl/">CMUCL</a> it <a
href="#performance">outperforms</a> Perl's highly optimized regex engine (written
in C) which to my knowledge is faster than most other regex engines
around. If compiled with <a
href="http://clisp.sourceforge.net/">CLISP</a> it is still comparable
to CLISP's own regex implementation which is also written in
C.
<li>It is <b>fast</b>. Used with a Lisp compiler which compiles to
native code it is <a href="#performance">on par</a> with Perl's highly
optimized regex engine (written in C) which to my knowledge is faster
than most other regex engines around.
<li>It is <b>portable</b>, i.e. the code aims to be strictly <a
href="http://www.lispworks.com/documentation/HyperSpec/Front/index.htm">ANSI-compliant</a>. If
you encounter any deviations this is an error and should be
you encounter any deviations, this is an error and should be
reported to <a
href="#mail">the mailing list</a>. CL-PPCRE has been
successfully tested with the following Common Lisp implementations:
@ -68,7 +73,7 @@ successfully tested with the following Common Lisp implementations:
</ul>
If you succeed in using CL-PPCRE on other platforms please <a
If you succeed in using CL-PPCRE on other platforms, please <a
href="#mail">let us know</a>.
<br>
@ -117,7 +122,9 @@ license</b></a> so you can basically do with it whatever you want.
</ul>
CL-PPCRE has been used successfully in various applications like <a
href="http://nostoc.stanford.edu/Docs/">BioLingua</a>, <a
href="http://nostoc.stanford.edu/Docs/">BioBike</a>,
<a href="http://clutu.com/">clutu</a>,
<a
href="http://www.hpc.unm.edu/~download/LoGS/">LoGS</a>, <a href="http://cafespot.net/">CafeSpot</a>, <a href="http://www.eboy.com/">Eboy</a>, or <a
href="http://weitz.de/regex-coach/">The Regex Coach</a>.
@ -150,9 +157,10 @@ href="http://weitz.de/regex-coach/">The Regex Coach</a>.
<li><a href="#regex-replace-all"><code>regex-replace-all</code></a>
<li><a href="#regex-apropos"><code>regex-apropos</code></a>
<li><a href="#regex-apropos-list"><code>regex-apropos-list</code></a>
<li><a href="#regex-char-code-limit"><code>*regex-char-code-limit*</code></a>
<li><a href="#use-bmh-matchers"><code>*use-bmh-matchers*</code></a>
<li><a href="#*regex-char-code-limit*"><code>*regex-char-code-limit*</code></a>
<li><a href="#*use-bmh-matchers*"><code>*use-bmh-matchers*</code></a>
<li><a href="#*allow-quoting*"><code>*allow-quoting*</code></a>
<li><a href="#*allow-named-registers*"><code>*allow-named-registers*</code></a>
<li><a href="#quote-meta-chars"><code>quote-meta-chars</code></a>
<li><a href="#ppcre-error"><code>ppcre-error</code></a>
<li><a href="#ppcre-invocation-error"><code>ppcre-invocation-error</code></a>
@ -192,15 +200,14 @@ href="http://weitz.de/regex-coach/">The Regex Coach</a>.
CL-PPCRE together with this documentation can be downloaded from <a
href="http://weitz.de/files/cl-ppcre.tar.gz">http://weitz.de/files/cl-ppcre.tar.gz</a>. The
current version is 1.2.12. A <a
href="CHANGELOG">CHANGELOG</a> is available.
current version is 1.4.1.
<p>
If you're on <a href="http://www.debian.org/">Debian</a> you should
If you're on <a href="http://www.debian.org/">Debian</a>, you should
probably use the <a
href="http://packages.debian.org/cgi-bin/search_packages.pl?keywords=cl-ppcre&searchon=names&version=all&release=all">cl-ppcre
href="http://packages.debian.org/cgi-bin/search_packages.pl?keywords=cl-ppcre&amp;searchon=names&amp;version=all&amp;release=all">cl-ppcre
Debian package</a> which is available thanks to <a href="http://pvaneynd.mailworks.org/">Peter van Eynde</a> and <a href="http://b9.com/">Kevin
Rosenberg</a>. There's also a port
for <a href="http://www.cliki.net/gentoo">Gentoo Linux</a> thanks to Matthew Kennedy and a <a href="http://www.freebsd.org/cgi/url.cgi?ports/textproc/cl-ppcre/pkg-descr">FreeBSD port</a> thanks to Henrik Motakef.
for <a href="http://www.gentoo.org/proj/en/common-lisp/index.xml">Gentoo Linux</a> thanks to Matthew Kennedy and a <a href="http://www.freebsd.org/cgi/url.cgi?ports/textproc/cl-ppcre/pkg-descr">FreeBSD port</a> thanks to Henrik Motakef.
Installation via <a
href="http://www.cliki.net/asdf-install">asdf-install</a> should as well
be possible.
@ -214,7 +221,7 @@ directory start your Lisp image and evaluate the form
equivalent one for asdf) which should compile and load the whole
system.
<p>
If for some reason you don't want to use MK:DEFSYSTEM or asdf you
If for some reason you don't want to use MK:DEFSYSTEM or asdf, you
can just <code>LOAD</code> the file <code>load.lisp</code> or you
can also get away with something like this:
@ -239,35 +246,46 @@ cat {packages,specials,util,errors,lexer,parser,regex-class,convert,optimize,clo
your platform.)
<p>
Note that there is <em>no</em> public CVS repository for CL-PPCRE - the repository at <a href="http://common-lisp.net/">common-lisp.net</a> is out of date and not in sync with the (current) version distributed from <a href="http://weitz.de/">weitz.de</a>.
<p>
Lu&iacute;s Oliveira maintains a <a href="http://darcs.net/">darcs</a>
repository of CL-PPCRE
at <a
href="http://common-lisp.net/~loliveira/ediware/">http://common-lisp.net/~loliveira/ediware/</a>.
<br>&nbsp;<br><h3><a name="mail" class=none>Support and mailing lists</a></h3>
For questions, bug reports, feature requests, improvements, or patches
please use the <a
href="http://common-lisp.net/mailman/listinfo/cl-ppcre-devel">cl-ppcre-devel
mailing list</a>. If you want to be notified about future releases
subscribe to the <a
href="http://common-lisp.net/mailman/listinfo/cl-ppcre-announce">cl-ppcre-announce
please use
the <a href="http://common-lisp.net/mailman/listinfo/cl-ppcre-devel">cl-ppcre-devel
mailing list</a>. If you want to be notified about future releases,
subscribe to
the <a href="http://common-lisp.net/mailman/listinfo/cl-ppcre-announce">cl-ppcre-announce
mailing list</a>. These mailing lists were made available thanks to
the services of <a href="http://common-lisp.net/">common-lisp.net</a>.
Terrence Brannon has created a Google group for the list which is
at <a href="http://groups.google.com/group/cl-ppcre">http://groups.google.com/group/cl-ppcre</a>.
<p>
If you want to send patches, please <a href="http://weitz.de/patches.html">read this first</a>.
<br>&nbsp;<br><h3><a class=none name="dict">The CL-PPCRE dictionary</a></h3>
CL-PPCRE exports the following symbols:
<p><br>[Method]
<br><a class=none name="create-scanner"><b>create-scanner</b> <i>(string string)<tt>&amp;key</tt> case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive</i> =&gt; <i>scanner</i></a>
<br><a class=none name="create-scanner"><b>create-scanner</b> <i>(string string)<tt>&amp;key</tt> case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive</i> =&gt; <i>scanner, register-names</i></a>
<blockquote><br> Accepts a string which is a regular expression in
Perl syntax and returns a closure which will scan strings for this
regular expression. The mode keyword arguments are equivalent to the
regular expression. The second value is only returned if <a href="#*allow-named-registers*"><code>*ALLOW-NAMED-REGISTERS*</code></a> is <i>true</i>. It represents a list of strings mapping registers to their respective names - the first element stands for first register, the second element for second register, etc. You have to store this value if you want to map a register number to its name later as <i>scanner</i> doesn't capture any information about register names. If a register isn't named, it has NIL as its name.
<p>
The mode keyword arguments are equivalent to the
<code>&quot;imsx&quot;</code> modifiers in Perl. The
<code>destructive</code> keyword will be ignored.
<p>
The function accepts most of the regex syntax of Perl 5 as described
in <a
href="http://www.perldoc.com/perl5.8.0/pod/perlre.html"><code>man
href="http://perldoc.perl.org/perlre.html"><code>man
perlre</code></a> including extended features like non-greedy
repetitions, positive and negative look-ahead and look-behind
assertions, &quot;standalone&quot; subexpressions, and conditional
@ -308,6 +326,8 @@ codes), <code>\c[</code> (control characters), <code>\w</code>,
Since version 0.6.0 CL-PPCRE also supports Perl's <code>\Q</code> and <code>\E</code> - see <a
href="#*allow-quoting*"><code>*ALLOW-QUOTING*</code></a> below. Make sure you also read <a href="#quote">the relevant section</a> in &quot;<a href="#bugs">Bugs and problems</a>.&quot;
<p>
Since version 1.3.0 CL-PPCRE offers support for <a href="http://www.franz.com/support/documentation/7.0/doc/regexp.htm#regexp-new-capturing-2">AllegroCL's</a> <code>(?&lt;name&gt;"&lt;regex&gt;")</code> named registers and <code>\k&lt;name&gt;</code> back-references syntax, have a look at <a href="#*allow-named-registers*"><code>*ALLOW-NAMED-REGISTERS*</code></a> for details.
<p>
The keyword arguments are just for your
convenience. You can always use embedded modifiers like
<code>&quot;(?i-s)&quot;</code> instead.</blockquote>
@ -319,7 +339,7 @@ In this case <code><i>function</i></code> should be a scanner returned by anothe
</blockquote>
<p><br>[Method]
<br><a class=none name="create-scanner2"><b>create-scanner</b> <i>(parse-tree t)<tt>&amp;key</tt> case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive</i> =&gt; <i>scanner</i></a>
<br><a class=none name="create-scanner2"><b>create-scanner</b> <i>(parse-tree t)<tt>&amp;key</tt> case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive</i> =&gt; <i>scanner, register-names</i></a>
<blockquote><br>
This is similar to <a
href="#create-scanner"><code>CREATE-SCANNER</code></a> for regex strings above but
@ -405,7 +425,7 @@ stands for a register or a parse tree which is a look-ahead or
look-behind assertion. See the entry for
<code>(?(&lt;<i>condition</i>&gt;)&lt;<i>yes-pattern</i>&gt;|&lt;<i>no-pattern</i>&gt;)</code>
in <a
href="http://www.perldoc.com/perl5.8.0/pod/perlre.html#Extended-Patterns"><code>man
href="http://perldoc.perl.org/perlre.html#Extended-Patterns"><code>man
perlre</code></a> for the semantics of this construct. If
<code>&lt;<i>parse-tree</i>&gt;</code> is an alternation is
<em>must</em> enclose exactly one or two parse trees where the second
@ -439,9 +459,19 @@ to the Perl regex string <code>&quot;(?:ab)??&quot;</code>.
register group. As usual, registers are counted from left to right
beginning with 1.
<li><code>(:BACK-REFERENCE &lt;<i>number</i>&gt;)</code> where
<code>&lt;<i>number</i>&gt;</code> is a positive integer is a back-reference to a
register group.
<li><code>(:NAMED-REGISTER &lt;<i>name</i>&gt; &lt;<i>parse-tree</i>&gt;)</code> is a named capturing
register group. Acts as <code>:REGISTER</code>, but assigns <code>&lt;<i>name</i>&gt;</code> to a register too. This <code>&lt;<i>name</i>&gt;</code> can be later referred to via <code>:BACK-REFERENCE</code>. Names are case-sensitive and don't need to be unique. See <a href="#*allow-named-registers*"><code>*ALLOW-NAMED-REGISTERS*</code></a> for details.
<li><code>(:BACK-REFERENCE &lt;<i>ref</i>&gt;)</code> is a
back-reference to a register group. <code>&lt;<i>ref</i>&gt;</code> is
a positive integer or a string denoting a register name. If there are
several registers with the same name, the regex engine tries to
successfully match at least of them, starting with the most recently
seen register continuing to the least recently seen one, until a match
is found. See <a
href="#*allow-named-registers*"><code>*ALLOW-NAMED-REGISTERS*</code></a>
for more information.
<li><a class=none name="filterdef"><code>(:FILTER &lt;<i>function</i>&gt; <tt>&amp;optional</tt>
&lt;<i>length</i>&gt;)</code></a> where
@ -451,6 +481,11 @@ designator</a> and <code>&lt;<i>length</i>&gt;</code> is a
non-negative integer or <code>NIL</code> is a user-defined <a
href="#filters">filter</a>.
<li><code>(:REGEX &lt;<i>string</i>&gt;)</code> where
<code>&lt;<i>string</i>&gt;</code> is an
embedded <a href="#create-scanner">regular expression in Perl
syntax</a>.
<li><code>(:CHAR-CLASS|:INVERTED-CHAR-CLASS
{&lt;<i>item</i>&gt;}*)</code> where <code>&lt;<i>item</i>&gt;</code>
is either a character, a <em>character range</em>, or a symbol for a
@ -483,11 +518,11 @@ doesn't make sense if <code>CREATE-SCANNER</code> is applied to parse
trees and will signal an error.
<p>
If <code>destructive</code> is not <code>NIL</code> (the default is
<code>NIL</code>) the function is allowed to destructively modify
<code>NIL</code>), the function is allowed to destructively modify
<code><i>parse-tree</i></code> while creating the scanner.
<p>
If you want to find out how parse trees are related to Perl regex
strings you should play around with
strings, you should play around with
<code>CL-PPCRE::PARSE-STRING</code> - a function which converts Perl
regex strings to parse trees. Here are some examples:
@ -514,7 +549,7 @@ regex strings to parse trees. Here are some examples:
<p><br>[Accessor]
<br><a class="none" name="parse-tree-synonym"><b>parse-tree-synonym</b> <i>symbol</i> =&gt; <i>parse-tree</i>
<br><tt>(setf (</tt><b>parse-tree-synonym</b> <i>symbol</i>) <i>new-parse-tree</i><tt>)</tt></a>
<br><tt>(setf (</tt><b>parse-tree-synonym</b> <i>symbol</i><tt>)</tt> <i>new-parse-tree</i><tt>)</tt></a>
</p><blockquote><br>
Any symbol (unless it's a keyword with a special meaning in parse
@ -568,7 +603,7 @@ so you can write code like this:
<pre>
(define-parse-tree-synonym a-z
(:char-class (:range #\a #\z) (:range #\a #\z)))
(:char-class (:range #\a #\z) (:range #\A #\Z)))
(define-parse-tree-synonym a-z*
(:greedy-repetition 0 nil a-z))
@ -590,19 +625,21 @@ href="#scan"><code>SCAN</code></a><b>.</b>
<p><br>[Standard Generic Function]
<p><br>[Generic Function]
<br><a class=none name="scan"><b>scan</b> <i>regex target-string <tt>&amp;key</tt> start end</i> =&gt; <i>match-start, match-end, reg-starts, reg-ends</i></a>
<blockquote><br>
Searches the string <code><i>target-string</i></code> from
<code><i>start</i></code> (which defaults to 0) to
Searches the string <code><i>target-string</i></code>
from <code><i>start</i></code> (which defaults to 0) to
<code><i>end</i></code> (which default to the length of
<code><i>target-string</i></code>) and tries to match
<code><i>regex</i></code>. On success returns four values - the start
of the match, the end of the match, and two arrays denoting the
beginnings and ends of register matches. On failure returns
<code>NIL</code>. <code><i>target-string</i></code> will be coerced to a
simple string if it isn't one already.
<code>NIL</code>. <code><i>target-string</i></code> will be coerced
to a simple string if it isn't one already. (There's another keyword
parameter <code><i>real-start-pos</i></code>. This one should
<em>never</em> be set from user code - it is only used internally.)
<p>
<code>SCAN</code> acts as if the part of
<code><i>target-string</i></code> between <code><i>start</i></code>
@ -691,7 +728,7 @@ and <code>FN</code> is a <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#function_designator">function
designator</a> (which is evaluated) denoting a function which is to be applied to the string before the result is bound to <code>VAR</code>.
To make this even more convenient the form <code>(FN&nbsp;VAR1&nbsp;...VARn)</code> can be used as an abbreviation for
<code>(FN&nbsp;VAR1)&nbsp;...&nbsp;(FN&nbsp;VARn).
<code>(FN&nbsp;VAR1)&nbsp;...&nbsp;(FN&nbsp;VARn)</code>.
<p>
If there is no match, the <code><i>statement*</i></code> forms are <em>not</em>
executed. For each element of
@ -739,7 +776,7 @@ returns <code><i>result-form</i></code> if provided or
<code>NIL</code> otherwise. An implicit block named <code>NIL</code>
surrounds <code>DO-SCANS</code>; <code>RETURN</code> may be used to
terminate the loop immediately. If <code><i>regex</i></code> matches
an empty string the scan is continued one position behind this match.
an empty string, the scan is continued one position behind this match.
<p>
This is the most general macro to iterate over all matches in a target
string. See the source code of <a
@ -830,7 +867,7 @@ groups. For each element of
group. After the last match, returns <code><i>result-form</i></code> if provided or <code>NIL</code>
otherwise. An implicit block named <code>NIL</code> surrounds <code>DO-REGISTER-GROUPS</code>;
<code>RETURN</code> may be used to terminate the loop immediately. If <code><i>regex</i></code> matches
an empty string the scan is continued one position behind this
an empty string, the scan is continued one position behind this
match. If <code><i>sharedp</i></code> is true, the substrings may share structure with
<code><i>target-string</i></code>.
<p>Example:
@ -916,7 +953,7 @@ simply be left out, otherwise they will show up as
elements returned - registers aren't counted. If
<code><i>limit</i></code> is <code>NIL</code> (or 0 which is
equivalent), trailing empty strings are removed from the result list.
If <code><i>regex</i></code> matches an empty string the scan is
If <code><i>regex</i></code> matches an empty string, the scan is
continued one position behind this match. If <code><i>sharedp</i></code> is true, the substrings may share structure with
<code><i>target-string</i></code>.
<p>
@ -970,12 +1007,14 @@ frob")
<p><br>[Function]
<br><a class=none name="regex-replace"><b>regex-replace</b> <i>regex target-string replacement <tt>&amp;key</tt> start end preserve-case simple-calls</i> =&gt; <i>list</i></a>
<br><a class=none name="regex-replace"><b>regex-replace</b> <i>regex target-string replacement <tt>&amp;key</tt> start end preserve-case simple-calls element-type</i> =&gt; <i>string, matchp</i></a>
<blockquote><br> Try to match <code><i>target-string</i></code>
between <code><i>start</i></code> and <code><i>end</i></code> against
<code><i>regex</i></code> and replace the first match with
<code><i>replacement</i></code>.
<code><i>replacement</i></code>. Two values are returned; the modified
string, and <code>T</code> if <code><i>regex</i></code> matched or
<code>NIL</code> otherwise.
<p>
<code><i>replacement</i></code> can be a string which may contain the
special substrings <code>&quot;\&amp;&quot;</code> for the whole
@ -1024,35 +1063,61 @@ will always be a <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#fresh">fresh</a>
string, even if <code><i>regex</i></code> doesn't match.
<p>
<code><i>element-type</i></code> specifies
the <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_a.htm#array_element_type">array
element type</a> of the string which is returned, the default
is <a
href="http://www.lispworks.com/documentation/lw50/LWRM/html/lwref-346.htm"><code>LW:SIMPLE-CHAR</code></a>
for LispWorks
and <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/t_ch.htm"><code>CHARACTER</code></a>
for other Lisps.
<p>
Examples:
<pre>
* (cl-ppcre:regex-replace "fo+" "foo bar" "frob")
"frob bar"
T
* (cl-ppcre:regex-replace "fo+" "FOO bar" "frob")
"FOO bar"
NIL
* (cl-ppcre:regex-replace "(?i)fo+" "FOO bar" "frob")
"frob bar"
T
* (cl-ppcre:regex-replace "(?i)fo+" "FOO bar" "frob" :preserve-case t)
"FROB bar"
T
* (cl-ppcre:regex-replace "(?i)fo+" "Foo bar" "frob" :preserve-case t)
"Frob bar"
T
* (cl-ppcre:regex-replace "bar" "foo bar baz" "[frob (was '\\&' between '\\`' and '\\'')]")
"foo [frob (was 'bar' between 'foo ' and ' baz')] baz"
T
* (cl-ppcre:regex-replace "bar" "foo bar baz"
'("[frob (was '" :match "' between '" :before-match "' and '" :after-match "')]"))
"foo [frob (was 'bar' between 'foo ' and ' baz')] baz"
T
* (cl-ppcre:regex-replace "(be)(nev)(o)(lent)"
"benevolent: adj. generous, kind"
#'(lambda (match &amp;rest registers)
(format nil "~A [~{~A~^.~}]" match registers))
:simple-calls t)
"benevolent [be.nev.o.lent]: adj. generous, kind"
T
</pre></blockquote>
<p><br>[Function]
<br><a class=none name="regex-replace-all"><b>regex-replace-all</b> <i>regex target-string replacement <tt>&amp;key</tt> start end preserve-case simple-calls</i> =&gt; <i>list</i></a>
<br><a class=none name="regex-replace-all"><b>regex-replace-all</b> <i>regex target-string replacement <tt>&amp;key</tt> start end preserve-case simple-calls element-type</i> =&gt; <i>string, matchp</i></a>
<blockquote><br>
Like <a href="#regex-replace"><code>REGEX-REPLACE</code></a> but replaces all matches.
@ -1062,9 +1127,11 @@ Examples:
<pre>
* (cl-ppcre:regex-replace-all "(?i)fo+" "foo Fooo FOOOO bar" "frob" :preserve-case t)
"frob Frob FROB bar"
T
* (cl-ppcre:regex-replace-all "(?i)f(o+)" "foo Fooo FOOOO bar" "fr\\1b" :preserve-case t)
"froob Frooob FROOOOB bar"
T
* (let ((qp-regex (cl-ppcre:create-scanner "[\\x80-\\xff]")))
(defun encode-quoted-printable (string)
@ -1079,6 +1146,7 @@ ENCODE-QUOTED-PRINTABLE
* (encode-quoted-printable "F&ecirc;te S&oslash;rensen na&iuml;ve H&uuml;hner Stra&szlig;e")
"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe"
T
* (let ((url-regex (cl-ppcre:create-scanner "[^a-zA-Z0-9_\\-.]")))
(defun url-encode (string)
@ -1093,6 +1161,7 @@ URL-ENCODE
* (url-encode "F&ecirc;te S&oslash;rensen na&iuml;ve H&uuml;hner Stra&szlig;e")
"F%EAte%20S%F8rensen%20na%EFve%20H%FChner%20Stra%DFe"
T
* (defun how-many (target-string start end match-start match-end reg-starts reg-ends)
(declare (ignore start end match-start match-end))
@ -1104,6 +1173,7 @@ HOW-MANY
"foo{...}bar{.....}{..}baz{....}frob"
(list "[" 'how-many " dots]"))
"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"
T
* (let ((qp-regex (cl-ppcre:create-scanner "[\\x80-\\xff]")))
(defun encode-quoted-printable (string)
@ -1120,6 +1190,7 @@ ENCODE-QUOTED-PRINTABLE
* (encode-quoted-printable "F&ecirc;te S&oslash;rensen na&iuml;ve H&uuml;hner Stra&szlig;e")
"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe"
T
* (defun how-many (match first-register)
(declare (ignore match))
@ -1132,6 +1203,7 @@ HOW-MANY
:simple-calls t)
"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"
T
</pre></blockquote>
<p><br>[Function]
@ -1217,18 +1289,18 @@ Example (continued from above):
</pre></blockquote>
<p><br>[Special variable]
<br><a class=none name="regex-char-code-limit"><b>*regex-char-code-limit*</b></a>
<br><a class=none name="*regex-char-code-limit*"><b>*regex-char-code-limit*</b></a>
<blockquote><br>This variable controls whether scanners take into
account all characters of your CL implementation or only those the <a
account all characters of your CL implementation or only those
the <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/f_char_c.htm#char-code"><code>CHAR-CODE</code></a>
of which is not larger than its value. It is only relevant if the
regular expression contains certain character classes. The default is
of which is not larger than its value. The default is
<a
href="http://www.lispworks.com/documentation/HyperSpec/Body/v_char_c.htm"><code>CHAR-CODE-LIMIT</code></a>,
and you might see significant speed and space improvements during
scanner <em>creation</em> if, say, your target strings only contain <a
href="http://wwwwbs.cs.tu-berlin.de/user/czyborra/charsets/">ISO-8859-1</a>
href="http://czyborra.com/charsets/iso8859.html">ISO-8859-1</a>
characters and you're using an implementation like AllegroCL,
CLISP, LispWorks, or SBCL where <code>CHAR-CODE-LIMIT</code> has a value
much higher than 256. The <a href="#test">test suite</a> will
@ -1259,8 +1331,8 @@ Allocation = 3336 bytes standard / 8338 bytes fixlen
#&lt;closure 206569DA&gt;
</pre>
<p>
Note: Due to the nature of <code>LOAD-TIME-VALUE</code> and the <a
href="#compiler-macro">compiler macro for <code>SCAN</code></a> some
Note: Due to the nature of <a href="http://www.lispworks.com/documentation/HyperSpec/Body/s_ld_tim.htm"><code>LOAD-TIME-VALUE</code></a> and the <a
href="#compiler-macro">compiler macro for <code>SCAN</code> and other functions</a>, some
scanners might be created in a <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_n.htm#null_lexical_environment">null
lexical environment</a> at load time or at compile time so be careful
@ -1269,11 +1341,11 @@ time. The default value should always yield correct results unless you
play dirty tricks with implementation-dependent behaviour, though.</blockquote>
<p><br>[Special variable]
<br><a class=none name="use-bmh-matchers"><b>*use-bmh-matchers*</b></a>
<br><a class=none name="*use-bmh-matchers*"><b>*use-bmh-matchers*</b></a>
<blockquote><br>Usually, the scanners created by <a
href="#create-scanner"><code>CREATE-SCANNER</code></a> (or
implicitely by other functions and macros) will use fast <a
implicitly by other functions and macros) will use fast <a
href="http://www-igm.univ-mlv.fr/~lecroq/string/node18.html">Boyer-Moore-Horspool
matchers</a> to check for constant strings at the start or end of the
regular expression. If <code>*USE-BMH-MATCHERS*</code> is
@ -1286,8 +1358,8 @@ href="#test">test suite</a> will automatically set
<code>*USE-BMH-MATCHERS*</code> to <code>NIL</code> while you're running
the default test.
<p>
Note: Due to the nature of <code>LOAD-TIME-VALUE</code> and the <a
href="#compiler-macro">compiler macro for <code>SCAN</code></a> some
Note: Due to the nature of <a href="http://www.lispworks.com/documentation/HyperSpec/Body/s_ld_tim.htm"><code>LOAD-TIME-VALUE</code></a> and the <a
href="#compiler-macro">compiler macro for <code>SCAN</code> and other functions</a>, some
scanners might be created in a <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_n.htm#null_lexical_environment">null
lexical environment</a> at load time or at compile time so be careful
@ -1298,7 +1370,7 @@ time.</blockquote>
<br><a class=none name="*allow-quoting*"><b>*allow-quoting*</b></a>
<blockquote><br>
If this value is <em>true</em> (the default is <code>NIL</code>)
If this value is <em>true</em> (the default is <code>NIL</code>),
CL-PPCRE will support <code>\Q</code> and <code>\E</code> in regex
strings to quote (disable) metacharacters. Note that this entails a
slight performance penalty when creating scanners because (a copy of) the regex
@ -1312,14 +1384,15 @@ about the converted string and not about the original regex string.
NIL
* (let ((cl-ppcre:*allow-quoting* t))
(cl-ppcre:scan &quot;^\\Qa+\\E$&quot; &quot;a+&quot;))
<font color=orange>;;we use CREATE-SCANNER because of Lisps like SBCL that don't have an interpreter</font>
(cl-ppcre:scan (cl-ppcre:create-scanner &quot;^\\Qa+\\E$&quot;) &quot;a+&quot;))
0
2
#()
#()
* (let ((cl-ppcre:*allow-quoting* t))
(cl-ppcre:scan &quot;\\Qa()\\E(?#comment\\Q)a**b&quot; &quot;()ab&quot;))
(cl-ppcre:scan (cl-ppcre:create-scanner &quot;\\Qa()\\E(?#comment\\Q)a**b&quot;) &quot;()ab&quot;))
Quantifier '*' not allowed at position 19 in string &quot;a\\(\\)(?#commentQ)a**b&quot;
</pre>
@ -1341,9 +1414,135 @@ understand (and Lisp-ier) if you write it like this:
</pre>
Make sure you also read <a href="#quote">the relevant section</a> in &quot;<a href="#bugs">Bugs and problems</a>.&quot;
<p>
Note: Due to the nature of <a href="http://www.lispworks.com/documentation/HyperSpec/Body/s_ld_tim.htm"><code>LOAD-TIME-VALUE</code></a> and the <a
href="#compiler-macro">compiler macro for <code>SCAN</code> and other functions</a>, some
scanners might be created in a <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_n.htm#null_lexical_environment">null
lexical environment</a> at load time or at compile time so be careful
to which value <code>*ALLOW-QUOTING*</code> is bound at that
time.</blockquote>
</blockquote>
<p><br>[Special variable]
<br><a class=none name="*allow-named-registers*"><b>*allow-named-registers*</b></a>
<blockquote><br>
If this value is <em>true</em> (the default is <code>NIL</code>),
CL-PPCRE will support <code>(?<i>&lt;name&gt;"&lt;regex&gt;"</i>)</code> and <code>\k<i>&lt;name&gt;</i></code> in regex
strings to provide named registers and back-references as in <a href="http://www.franz.com/support/documentation/7.0/doc/regexp.htm#regexp-new-capturing-2">AllegroCL</a>. <code><i>name</i></code> is has to start with a letter and can contain only alphanumeric characters or minus sign. Names of registers are matched case-sensitively.
The <a href="#create-scanner2">parse tree syntax</a> is not affected by the <code>*ALLOW-NAMED-REGISTERS*</code> switch, <code>:NAMED-REGISTER</code> and <code>:BACK-REFERENCE</code> forms are always resolved as expected. There are also no restrictions on register names in this syntax except that they have to be strings.
<p>
Examples:
<pre>
<font color=orange>;; Perl compatible mode (*ALLOW-NAMED-REGISTERS* is NIL)</font>
* (cl-ppcre:create-scanner "(?&lt;reg&gt;.*)")
Character 'r' may not follow '(?&lt' at position 3 in string "(?&lt;reg&gt;)"
<font color=orange>;; just unescapes "\\k"</font>
* (cl-ppcre::parse-string "\\k&lt;reg&gt;")
"k&lt;reg&gt;"
</pre>
<pre>
* (setq cl-ppcre:*allow-named-registers* t)
T
* (cl-ppcre:create-scanner "((?&lt;small&gt;[a-z]*)(?&lt;big&gt;[A-Z]*))")
#&LT;CLOSURE (LAMBDA (STRING CL-PPCRE::START CL-PPCRE::END)) {AD75BFD}&gt;
(NIL "small" "big")
<font color=orange>;; the scanner doesn't capture any information about named groups -
;; you have to store the second value returned from CREATE-SCANNER yourself</font>
* (cl-ppcre:scan * "aaaBBB")
0
6
#(0 0 3)
#(6 3 6)
</pre>
<pre>
<font color=orange>;; parse tree syntax</font>
* (cl-ppcre::parse-string "((?&lt;small&gt;[a-z]*)(?&lt;big&gt;[A-Z]*))")
(:REGISTER
(:SEQUENCE
(:NAMED-REGISTER "small"
(:GREEDY-REPETITION 0 NIL (:CHAR-CLASS (:RANGE #\a #\z))))
(:NAMED-REGISTER "big"
(:GREEDY-REPETITION 0 NIL (:CHAR-CLASS (:RANGE #\A #\Z))))))
* (cl-ppcre:create-scanner *)
#&lt;CLOSURE (LAMBDA (STRING CL-PPCRE::START CL-PPCRE::END)) {B158E3D}&gt;
(NIL "small" "big")
</pre>
<pre>
<font color=orange>;; multiple-choice back-reference</font>
* (cl-ppcre:scan "^(?&lt;reg&gt;[ab])(?&lt;reg&gt;[12])\\k&lt;reg&gt;\\k&lt;reg&gt;$" "a1aa")
0
4
#(0 1)
#(1 2)
* (cl-ppcre:scan "^(?&lt;reg&gt;[ab])(?&lt;reg&gt;[12])\\k&lt;reg&gt;\\k&lt;reg&gt;$" "a22a")
0
4
#(0 1)
#(1 2)
</pre>
<pre>
<font color=orange>;; demonstrating most-recently-seen-register-first property of back-reference;
;; "greedy" regex (analogous to "aa?")</font>
* (cl-ppcre:scan "^(?&lt;reg&gt;)(?&lt;reg&gt;a)(\\k&lt;reg&gt;)" "a")
0
1
#(0 0 1)
#(0 1 1)
* (cl-ppcre:scan "^(?&lt;reg&gt;)(?&lt;reg&gt;a)(\\k&lt;reg&gt;)" "aa")
0
2
#(0 0 1)
#(0 1 2)
</pre>
<pre>
<font color=orange>;; switched groups
;; "lazy" regex (analogous to "aa??")</font>
* (cl-ppcre:scan "^(?&lt;reg&gt;a)(?&lt;reg&gt;)(\\k&lt;reg&gt;)" "a")
0
1
#(0 1 1)
#(1 1 1)
<font color=orange>;; scanner ignores the second "a"</font>
* (cl-ppcre:scan "^(?&lt;reg&gt;a)(?&lt;reg&gt;)(\\k&lt;reg&gt;)" "aa")
0
1
#(0 1 1)
#(1 1 1)
<font color=orange>;; "aa" will be matched only when forced by adding "$" at the end</font>
* (cl-ppcre:scan "^(?&lt;reg&gt;a)(?&lt;reg&gt;)(\\k&lt;reg&gt;)$" "aa")
0
2
#(0 1 1)
#(1 1 2)
</pre>
Note: Due to the nature of <a href="http://www.lispworks.com/documentation/HyperSpec/Body/s_ld_tim.htm"><code>LOAD-TIME-VALUE</code></a> and the <a
href="#compiler-macro">compiler macro for <code>SCAN</code> and other functions</a>, some
scanners might be created in a <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_n.htm#null_lexical_environment">null
lexical environment</a> at load time or at compile time so be careful
to which value <code>*ALLOW-NAMED-REGISTERS*</code> is bound at that
time.</blockquote>
</blockquote>
<p><br>[Function]
<br><a class=none name="quote-meta-chars"><b>quote-meta-chars</b> <i>string</i> =&gt; <i>string'</i></a>
@ -1392,8 +1591,8 @@ string or to convert a parse tree into its internal representation.
This is a direct subtype of <a
href="#ppcre-error"><code>PPCRE-ERROR</code></a> with two additional
slots. These denote the regex string which HTML-PPCRE was parsing and
the position within the string where the error occured. If the error
happens while CL-PPCRE is converting a parse tree both of these slots
the position within the string where the error occurred. If the error
happens while CL-PPCRE is converting a parse tree, both of these slots
contain <code>NIL</code>. (See the next two entries on how to access
these slots.)
<p>
@ -1420,11 +1619,11 @@ The last message we received was &quot;Quantifier '*' not allowed&quot;.
</blockquote>
<p><br>[Function]
<br><a class=none name="ppcre-syntax-error-string"><b>ppcre-syntax-error-string</b></a> <i>condition</i> =&gt; <i>string</i></a>
<br><a class=none name="ppcre-syntax-error-string"><b>ppcre-syntax-error-string</b></a> <i>condition</i> =&gt; <i>string</i>
<blockquote><br>
If <code><i>condition</i></code> is a condition of type <a
href="#ppcre-syntax-error"><code>PPCRE-SYNTAX-ERROR</code></a> this
href="#ppcre-syntax-error"><code>PPCRE-SYNTAX-ERROR</code></a>, this
function will return the string the parser was parsing when the error was
encountered (or <code>NIL</code> if the error happened while trying to
convert a parse tree). This might be particularly useful when <a
@ -1434,13 +1633,13 @@ href="#create-scanner"><code>CREATE-SCANNER</code></a> function.
</blockquote>
<p><br>[Function]
<br><a class=none name="ppcre-syntax-error-pos"><b>ppcre-syntax-error-pos</b></a> <i>condition</i> =&gt; <i>number</i></a>
<br><a class=none name="ppcre-syntax-error-pos"><b>ppcre-syntax-error-pos</b></a> <i>condition</i> =&gt; <i>number</i>
<blockquote><br>
If <code><i>condition</i></code> is a condition of type <a
href="#ppcre-syntax-error"><code>PPCRE-SYNTAX-ERROR</code></a> this
href="#ppcre-syntax-error"><code>PPCRE-SYNTAX-ERROR</code></a>, this
function will return the position within the string where the error
occured (or <code>NIL</code> if the error happened while trying to
occurred (or <code>NIL</code> if the error happened while trying to
convert a parse tree).
</blockquote>
@ -1454,13 +1653,10 @@ regex building blocks. Filters can only be used within <a
href="#create-scanner2">parse trees</a>, not within Perl regex
strings.
<p>
Note that filters are currently considered an experimental feature and
their API might change in the future.
<p>
A filter is defined by its <em>filter function</em> which must be a
function of one argument. During the parsing process this function
might be called once or several times or it might not be called at
all. If it's called its argument is an integer <code><i>pos</i></code>
all. If it's called, its argument is an integer <code><i>pos</i></code>
which is the current position within the target string. The filter can
either return <code>NIL</code> (which means that the subexpression
represented by this filter didn't match) or an integer not smaller
@ -1470,7 +1666,7 @@ wants to consume <code>N</code> characters should return
<code>(+&nbsp;POS&nbsp;N)</code>.
<p>
If you supply the optional value <code><i>length</i></code> and it is
not <code>NIL</code> then this is a promise to the regex engine that
not <code>NIL</code>, then this is a promise to the regex engine that
your filter will <em>always</em> consume <em>exactly</em>
<code><i>length</i></code> characters. The regex engine might use this
information for optimization purposes but it is otherwise irrelevant
@ -1499,7 +1695,7 @@ value of this variable is <code>NIL</code>.
<li><CODE>CL-PPCRE::*REG-STARTS*</CODE> and
<CODE>CL-PPCRE::*REG-ENDS*</CODE>: Two simple vectors which denote the
start and end indices of registers within the regular expression. The
first register is indexed by&nbsp;0. If a register hasn't matched yet
first register is indexed by&nbsp;0. If a register hasn't matched yet,
then its corresponding entry in <CODE>CL-PPCRE::*REG-STARTS*</CODE> is
<code>NIL</code>.
@ -1568,7 +1764,7 @@ NIL
* (defun my-weird-filter (pos)
&quot;Only match at this point if either pos is odd and the character
we're looking at is lowerrcase or if pos is even and the next two
we're looking at is lowercase or if pos is even and the next two
characters we're looking at are uppercase. Consume these characters if
there's a match.&quot;
(format t &quot;Trying at position ~A~%&quot; pos)
@ -1653,10 +1849,10 @@ NIL
* (cl-ppcre-test:test)
<font color=orange>;; ....
;; (a list of <a class=noborder href="#perl">incompatibilities with Perl</a>)</font color=orange>
;; (a list of <a class=noborder href="#perl">incompatibilities with Perl</a>)</font>
</pre>
(If you're not using MK:DEFSYSTEM or asdf it suffices to build
(If you're not using MK:DEFSYSTEM or asdf, it suffices to build
CL-PPCRE and then compile and load the file
<code>ppcre-tests.lisp</code>.)
<p>
@ -1746,7 +1942,7 @@ matching non-ASCII characters.
The <a href="">CL-PPCRE test suite</a> can also be used for
benchmarking purposes: If you call <code>perltest.pl</code> with a
command line argument it will be interpreted as the minimum number of seconds
command line argument, it will be interpreted as the minimum number of seconds
each test should run. Perl will time its tests accordingly and create
output which, when fed to <code>CL-PPCRE-TEST:TEST</code>, will result
in a benchmark. Here's an example:
@ -1789,7 +1985,7 @@ NIL
We gave two test cases to <code>perltest.pl</code> and asked it to repeat those tests often enough so that it takes at least 0.5 seconds to run each of them. In both cases, CMUCL was about twice as fast as Perl.
<p>
Here are some more benchmarks (done with Perl 5.6.1 and CMUCL 18d+):
Here are some more benchmarks (done with Perl 5.6.1 and CMUCL 18d+ in 2002):
<p>
<table border=1>
@ -1898,8 +2094,8 @@ to <a href="#scan"><code>SCAN</code></a>, <a href="#scan-to-strings"><code>SCAN-
<a href="#regex-replace"><code>REGEX-REPLACE</code></a>, or <a href="#regex-replace-all"><code>REGEX-REPLACE-ALL</code></a> is a <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_c.htm#constant_form">constant
form</a>. (But see the notes for <a
href="#regex-char-code-limit"><code>*REGEX-CHAR-CODE-LIMIT*</code></a> and
<a href="#use-bmh-matchers"><code>*USE-BMH-MATCHERS*</code></a>.)
href="#*regex-char-code-limit*"><code>*REGEX-CHAR-CODE-LIMIT*</code></a> and
<a href="#*use-bmh-matchers*"><code>*USE-BMH-MATCHERS*</code></a>.)
<p>
Here's an example of its effect
@ -2012,9 +2208,9 @@ Another thing to consider is that, for performance reasons, CL-PPCRE
assumes that most of the target strings you're trying to match are <a
href="http://www.lispworks.com/documentation/HyperSpec/Body/t_smp_st.htm">simple
strings</a> and coerces non-simple strings to simple strings before
scanning them. If you plan on working with non-simple strings mostly
scanning them. If you plan on working with non-simple strings mostly,
you might consider modifying the CL-PPCRE source code. This is easy:
Change all occurences of <code>SCHAR</code> to <code>CHAR</code> and
Change all occurrences of <code>SCHAR</code> to <code>CHAR</code> and
redefine the macro in <code>util.lisp</code> where the coercion takes
place - that's all.
@ -2157,7 +2353,7 @@ print 1
if '\E*\E*' =~ /(?:\Q$a\E){2}/;
</pre>
If you try to do something similar in CL-PPCRE you get an error:
If you try to do something similar in CL-PPCRE, you get an error:
<pre>
* (let ((cl-ppcre:*allow-quoting* t)
@ -2179,7 +2375,7 @@ the scary details. It <em>can</em> happen in CL-PPCRE, though.
Bummer!
<p>
What gives? <code>&quot;\\Q...\\E&quot;</code> in CL-PPCRE should only
be used in literal strings. If you want to quote arbitrary strings
be used in literal strings. If you want to quote arbitrary strings,
try <a href="http://weitz.de/cl-interpol/">CL-INTERPOL</a> or use <a
href="#quote-meta-chars"><code>QUOTE-META-CHARS</code></a>:
<pre>
@ -2270,7 +2466,6 @@ differences (most of which probably don't matter very often):
href="#parse-tree-synonym">parse tree synonyms</a> and <a href="#filters">filters</a>.
<li>The AllegroCL engine <a href="http://www.franz.com/support/documentation/7.0/doc/regexp.htm#regexp-new-compatibility-2">will choke on some regular expressions involving curly braces</a> that are accepted by Perl and CL-PPCRE's native engine.
<li>The AllegroCL engine's case-folding mode switch (which is used instead of CL-PPCRE's <a href="#create-scanner"><code>:CASE-INSENSITIVE</code> keyword parameter</a>) <a href="http://www.franz.com/support/documentation/7.0/doc/regexp.htm#regexp-new-matching-2">is currently only effective for ASCII characters</a>.
<li>CL-PPCRE's engine doesn't understand the <a href="http://www.franz.com/support/documentation/7.0/doc/regexp.htm#regexp-new-capturing-2">named register groups</a> provided by AllegroCL.
<li>The AllegroCL engine <a href="http://www.franz.com/support/documentation/7.0/doc/regexp.htm#regexp-new-compatibility-2">doesn't support</a> <a href="#*allow-quoting*">quoting of metacharacters</a>.
<li>In AllegroCL compatibility mode compiled regular expressions (as returned by <a href="#create-scanner"><code>CREATE-SCANNER</code></a>) aren't functions but structures.
</ul>
@ -2294,22 +2489,25 @@ href="http://www.cons.org/cmucl/support.html">mailing list</a> as well
as the output of Perl's <code>use re &quot;debug&quot;</code> pragma
have been very helpful in optimizing the scanners created by CL-PPCRE.
<p>
The asdf system definitions were kindly provided by Marco
<p> The asdf system definitions were kindly provided by Marco
Baringer. Hannu Koivisto provided patches to make the
<code>.system</code> files more usable. Thanks to Kevin Rosenberg and
Douglas Crosher for pointing out how to be friendly to case-sensitive
ACL images. Thanks to Karsten Poeck and JP Massar for their help in
making CL-PPCRE work with Corman Lisp. JP Massar and Kent M. Pitman
also helped to improve/fix the test suite and the compiler macro.
also helped to improve/fix the test suite and the compiler macro. <a
href="http://random-state.net/">Nikodemus Siivola</a> provided the
fast charset implementation in <code>charset.lisp</code>. See the <a
href="http://weitz.de/cl-ppcre/CHANGELOG">ChangeLog</a> for several
other people who helped with bug reports or patches.
<p>
Thanks to the guys at &quot;Caf&eacute; Ol&eacute;&quot; in Hamburg
Thanks to the guys at &quot;<a href="http://www.weinhandel-ottensen.de/">Caf&eacute; Ol&eacute;</a>&quot; in Hamburg
where I wrote most of the code and thanks to my wife for lending me
her PowerBook to test CL-PPCRE with MCL and OpenMCL.
<p>
$Header: /usr/local/cvsrep/cl-ppcre/doc/index.html,v 1.131 2005/11/01 09:51:02 edi Exp $
$Header: /usr/local/cvsrep/cl-ppcre/doc/index.html,v 1.171 2008/07/03 10:06:17 edi Exp $
<p><a href="http://weitz.de/index.html">BACK TO MY HOMEPAGE</a>
</body>

View File

@ -1,7 +1,7 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.14 2005/04/01 21:29:09 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.18 2008/06/25 14:04:27 edi Exp $
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -61,7 +61,7 @@ encountered \(or NIL if the error happened while trying to convert a
parse tree).")
(setf (documentation 'ppcre-syntax-error-pos 'function)
"Returns the position within the string where the error occured
"Returns the position within the string where the error occurred
\(or NIL if the error happened while trying to convert a parse tree")
(define-condition ppcre-invocation-error (ppcre-error)

View File

@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.28 2008/06/25 14:04:27 edi Exp $
;;; The lexer's responsibility is to convert the regex string into a
;;; sequence of tokens which are in turn consumed by the parser.
@ -9,7 +9,7 @@
;;; has opened so far. (The latter is necessary for interpreting
;;; strings like "\\10" correctly.)
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -462,203 +462,255 @@ resets the lexer to its old position."
(otherwise
(fail lexer)))))
(defun parse-register-name-aux (lexer)
"Reads and returns the name in a named register group. It is
assumed that the starting #\< character has already been read. The
closing #\> will also be consumed."
;; we have to look for an ending > character now
(let ((end-name (position #\>
(lexer-str lexer)
:start (lexer-pos lexer)
:test #'char=)))
(unless end-name
;; there has to be > somewhere, syntax error otherwise
(signal-ppcre-syntax-error*
(1- (lexer-pos lexer))
"Opening #\< in named group has no closing #\>"))
(let ((name (subseq (lexer-str lexer)
(lexer-pos lexer)
end-name)))
(unless (every #'(lambda (char)
(or (alphanumericp char)
(char= #\- char)))
name)
;; register name can contain only alphanumeric characters or #\-
(signal-ppcre-syntax-error*
(lexer-pos lexer)
"Invalid character in named register group"))
;; advance lexer beyond "<name>" part
(setf (lexer-pos lexer) (1+ end-name))
name)))
(defun get-token (lexer)
(declare #.*standard-optimize-settings*)
"Returns and consumes the next token from the regex string (or NIL)."
"Returns and consumes the next token from the regex string \(or NIL)."
;; remember starting position for UNGET-TOKEN function
(push (lexer-pos lexer)
(lexer-last-pos lexer))
(let ((next-char (next-char lexer)))
(cond (next-char
(case next-char
;; the easy cases first - the following six characters
;; always have a special meaning and get translated
;; into tokens immediately
((#\))
:close-paren)
((#\|)
:vertical-bar)
((#\?)
:question-mark)
((#\.)
:everything)
((#\^)
:start-anchor)
((#\$)
:end-anchor)
((#\+ #\*)
;; quantifiers will always be consumend by
;; GET-QUANTIFIER, they must not appear here
(signal-ppcre-syntax-error*
(1- (lexer-pos lexer))
"Quantifier '~A' not allowed"
next-char))
((#\{)
;; left brace isn't a special character in it's own
;; right but we must check if what follows might
;; look like a quantifier
(let ((this-pos (lexer-pos lexer))
(this-last-pos (lexer-last-pos lexer)))
(unget-token lexer)
(when (get-quantifier lexer)
(signal-ppcre-syntax-error*
(car this-last-pos)
"Quantifier '~A' not allowed"
(subseq (lexer-str lexer)
(car this-last-pos)
(lexer-pos lexer))))
(setf (lexer-pos lexer) this-pos
(lexer-last-pos lexer) this-last-pos)
next-char))
((#\[)
;; left bracket always starts a character class
(cons (cond ((looking-at-p lexer #\^)
(incf (lexer-pos lexer))
:inverted-char-class)
(t
:char-class))
(collect-char-class lexer)))
((#\\)
;; backslash might mean different things so we have
;; to peek one char ahead:
(let ((next-char (next-char-non-extended lexer)))
(case next-char
((#\A)
:modeless-start-anchor)
((#\Z)
:modeless-end-anchor)
((#\z)
:modeless-end-anchor-no-newline)
((#\b)
:word-boundary)
((#\B)
:non-word-boundary)
((#\d #\D #\w #\W #\s #\S)
;; these will be treated like character classes
(map-char-to-special-char-class next-char))
((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
;; uh, a digit...
(let* ((old-pos (decf (lexer-pos lexer)))
;; ...so let's get the whole number first
(backref-number (get-number lexer)))
(declare (type fixnum backref-number))
(cond ((and (> backref-number (lexer-reg lexer))
(<= 10 backref-number))
;; \10 and higher are treated as octal
;; character codes if we haven't
;; opened that much register groups
;; yet
(setf (lexer-pos lexer) old-pos)
;; re-read the number from the old
;; position and convert it to its
;; corresponding character
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
old-pos))
(t
;; otherwise this must refer to a
;; backreference
(list :back-reference backref-number)))))
((#\0)
;; this always means an octal character code
;; (at most three digits)
(let ((old-pos (decf (lexer-pos lexer))))
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
old-pos)))
(otherwise
;; in all other cases just unescape the
;; character
(decf (lexer-pos lexer))
(unescape-char lexer)))))
((#\()
;; an open parenthesis might mean different things
;; depending on what follows...
(cond ((looking-at-p lexer #\?)
;; this is the case '(?' (and probably more behind)
(incf (lexer-pos lexer))
;; we have to check for modifiers first
;; because a colon might follow
(let* ((flags (maybe-parse-flags lexer))
(next-char (next-char-non-extended lexer)))
;; modifiers are only allowed if a colon
;; or a closing parenthesis are following
(when (and flags
(not (find next-char ":)" :test #'char=)))
(signal-ppcre-syntax-error*
(car (lexer-last-pos lexer))
"Sequence '~A' not recognized"
(subseq (lexer-str lexer)
(car (lexer-last-pos lexer))
(lexer-pos lexer))))
(case next-char
((nil)
;; syntax error
(signal-ppcre-syntax-error
"End of string following '(?'"))
((#\))
;; an empty group except for the flags
;; (if there are any)
(or (and flags
(cons :flags flags))
:void))
((#\()
;; branch
:open-paren-paren)
((#\>)
;; standalone
:open-paren-greater)
((#\=)
;; positive look-ahead
:open-paren-equal)
((#\!)
;; negative look-ahead
:open-paren-exclamation)
((#\:)
;; non-capturing group - return flags as
;; second value
(values :open-paren-colon flags))
((#\<)
;; might be a look-behind assertion, so
;; check next character
(let ((next-char (next-char-non-extended lexer)))
(case next-char
((#\=)
;; positive look-behind
:open-paren-less-equal)
((#\!)
;; negative look-behind
:open-paren-less-exclamation)
((#\))
;; Perl allows "(?<)" and treats
;; it like a null string
:void)
((nil)
;; syntax error
(signal-ppcre-syntax-error
"End of string following '(?<'"))
(t
;; also syntax error
(case next-char
;; the easy cases first - the following six characters
;; always have a special meaning and get translated
;; into tokens immediately
((#\))
:close-paren)
((#\|)
:vertical-bar)
((#\?)
:question-mark)
((#\.)
:everything)
((#\^)
:start-anchor)
((#\$)
:end-anchor)
((#\+ #\*)
;; quantifiers will always be consumend by
;; GET-QUANTIFIER, they must not appear here
(signal-ppcre-syntax-error*
(1- (lexer-pos lexer))
"Quantifier '~A' not allowed"
next-char))
((#\{)
;; left brace isn't a special character in it's own
;; right but we must check if what follows might
;; look like a quantifier
(let ((this-pos (lexer-pos lexer))
(this-last-pos (lexer-last-pos lexer)))
(unget-token lexer)
(when (get-quantifier lexer)
(signal-ppcre-syntax-error*
(car this-last-pos)
"Quantifier '~A' not allowed"
(subseq (lexer-str lexer)
(car this-last-pos)
(lexer-pos lexer))))
(setf (lexer-pos lexer) this-pos
(lexer-last-pos lexer) this-last-pos)
next-char))
((#\[)
;; left bracket always starts a character class
(cons (cond ((looking-at-p lexer #\^)
(incf (lexer-pos lexer))
:inverted-char-class)
(t
:char-class))
(collect-char-class lexer)))
((#\\)
;; backslash might mean different things so we have
;; to peek one char ahead:
(let ((next-char (next-char-non-extended lexer)))
(case next-char
((#\A)
:modeless-start-anchor)
((#\Z)
:modeless-end-anchor)
((#\z)
:modeless-end-anchor-no-newline)
((#\b)
:word-boundary)
((#\B)
:non-word-boundary)
((#\k)
(cond ((and *allow-named-registers*
(looking-at-p lexer #\<))
;; back-referencing a named register
(incf (lexer-pos lexer))
(list :back-reference
(nreverse (parse-register-name-aux lexer))))
(t
;; false alarm, just unescape \k
#\k)))
((#\d #\D #\w #\W #\s #\S)
;; these will be treated like character classes
(map-char-to-special-char-class next-char))
((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
;; uh, a digit...
(let* ((old-pos (decf (lexer-pos lexer)))
;; ...so let's get the whole number first
(backref-number (get-number lexer)))
(declare (type fixnum backref-number))
(cond ((and (> backref-number (lexer-reg lexer))
(<= 10 backref-number))
;; \10 and higher are treated as octal
;; character codes if we haven't
;; opened that much register groups
;; yet
(setf (lexer-pos lexer) old-pos)
;; re-read the number from the old
;; position and convert it to its
;; corresponding character
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
old-pos))
(t
;; otherwise this must refer to a
;; backreference
(list :back-reference backref-number)))))
((#\0)
;; this always means an octal character code
;; (at most three digits)
(let ((old-pos (decf (lexer-pos lexer))))
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
old-pos)))
(otherwise
;; in all other cases just unescape the
;; character
(decf (lexer-pos lexer))
(unescape-char lexer)))))
((#\()
;; an open parenthesis might mean different things
;; depending on what follows...
(cond ((looking-at-p lexer #\?)
;; this is the case '(?' (and probably more behind)
(incf (lexer-pos lexer))
;; we have to check for modifiers first
;; because a colon might follow
(let* ((flags (maybe-parse-flags lexer))
(next-char (next-char-non-extended lexer)))
;; modifiers are only allowed if a colon
;; or a closing parenthesis are following
(when (and flags
(not (find next-char ":)" :test #'char=)))
(signal-ppcre-syntax-error*
(car (lexer-last-pos lexer))
"Sequence '~A' not recognized"
(subseq (lexer-str lexer)
(car (lexer-last-pos lexer))
(lexer-pos lexer))))
(case next-char
((nil)
;; syntax error
(signal-ppcre-syntax-error
"End of string following '(?'"))
((#\))
;; an empty group except for the flags
;; (if there are any)
(or (and flags
(cons :flags flags))
:void))
((#\()
;; branch
:open-paren-paren)
((#\>)
;; standalone
:open-paren-greater)
((#\=)
;; positive look-ahead
:open-paren-equal)
((#\!)
;; negative look-ahead
:open-paren-exclamation)
((#\:)
;; non-capturing group - return flags as
;; second value
(values :open-paren-colon flags))
((#\<)
;; might be a look-behind assertion or a named group, so
;; check next character
(let ((next-char (next-char-non-extended lexer)))
(if (alpha-char-p next-char)
(progn
;; we have encountered a named group
;; are we supporting register naming?
(unless *allow-named-registers*
(signal-ppcre-syntax-error*
(1- (lexer-pos lexer))
"Character '~A' may not follow '(?<'"
next-char )))))
(otherwise
(signal-ppcre-syntax-error*
(1- (lexer-pos lexer))
"Character '~A' may not follow '(?'"
next-char)))))
(t
;; if next-char was not #\? (this is within
;; the first COND), we've just seen an opening
;; parenthesis and leave it like that
:open-paren)))
(otherwise
;; all other characters are their own tokens
next-char)))
next-char))
;; put the letter back
(decf (lexer-pos lexer))
;; named group
:open-paren-less-letter)
(case next-char
((#\=)
;; positive look-behind
:open-paren-less-equal)
((#\!)
;; negative look-behind
:open-paren-less-exclamation)
((#\))
;; Perl allows "(?<)" and treats
;; it like a null string
:void)
((nil)
;; syntax error
(signal-ppcre-syntax-error
"End of string following '(?<'"))
(t
;; also syntax error
(signal-ppcre-syntax-error*
(1- (lexer-pos lexer))
"Character '~A' may not follow '(?<'"
next-char ))))))
(otherwise
(signal-ppcre-syntax-error*
(1- (lexer-pos lexer))
"Character '~A' may not follow '(?'"
next-char)))))
(t
;; if next-char was not #\? (this is within
;; the first COND), we've just seen an opening
;; parenthesis and leave it like that
:open-paren)))
(otherwise
;; all other characters are their own tokens
next-char)))
;; we didn't get a character (this if the "else" branch from
;; the first IF), so we don't return a token but NIL
(t
(pop (lexer-last-pos lexer))
nil))))
(pop (lexer-last-pos lexer))
nil))))
(declaim (inline unget-token))
(defun unget-token (lexer)

View File

@ -1,9 +1,9 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/lispworks-defsystem.lisp,v 1.1 2005/04/30 20:00:50 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/lispworks-defsystem.lisp,v 1.4 2008/06/25 14:04:27 edi Exp $
;;; This system definition for LispWorks was kindly provided by Wade Humeniuk
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions

4
load.lisp Executable file → Normal file
View File

@ -1,7 +1,7 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/load.lisp,v 1.13 2005/04/01 21:29:09 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/load.lisp,v 1.16 2008/06/25 14:04:27 edi Exp $
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions

View File

@ -1,10 +1,10 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.26 2005/04/13 15:35:57 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.31 2008/06/25 14:04:27 edi Exp $
;;; This file contains optimizations which can be applied to converted
;;; parse trees.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -40,6 +40,7 @@ transforms #<SEQ #<STR \"a\"> #<SEQ #<STR \"b\"> #<STR \"c\">>> to
operation on REGEX."))
(defmethod flatten ((seq seq))
(declare #.*standard-optimize-settings*)
;; this looks more complicated than it is because we modify SEQ in
;; place to avoid unnecessary consing
(let ((elements-rest (elements seq)))
@ -71,6 +72,7 @@ operation on REGEX."))
(t (make-instance 'void)))))
(defmethod flatten ((alternation alternation))
(declare #.*standard-optimize-settings*)
;; same algorithm as above
(let ((choices-rest (choices alternation)))
(loop
@ -98,9 +100,8 @@ operation on REGEX."))
"Encountered alternation without choices.")))))
(defmethod flatten ((branch branch))
(with-slots ((test test)
(then-regex then-regex)
(else-regex else-regex))
(declare #.*standard-optimize-settings*)
(with-slots (test then-regex else-regex)
branch
(setq test
(if (numberp test)
@ -111,6 +112,7 @@ operation on REGEX."))
branch))
(defmethod flatten ((regex regex))
(declare #.*standard-optimize-settings*)
(typecase regex
((or repetition register lookahead lookbehind standalone)
;; if REGEX contains exactly one inner REGEX object flatten it
@ -124,12 +126,13 @@ operation on REGEX."))
regex)))
(defgeneric gather-strings (regex)
(declare #.*standard-optimize-settings*)
(declare #.*standard-optimize-settings*)
(:documentation "Collects adjacent strings or characters into one
string provided they have the same case mode. This is a destructive
operation on REGEX."))
(defmethod gather-strings ((seq seq))
(declare #.*standard-optimize-settings*)
;; note that GATHER-STRINGS is to be applied after FLATTEN, i.e. it
;; expects SEQ to be flattened already; in particular, SEQ cannot be
;; empty and cannot contain embedded SEQ objects
@ -246,6 +249,7 @@ operation on REGEX."))
seq))
(defmethod gather-strings ((alternation alternation))
(declare #.*standard-optimize-settings*)
;; loop ON the choices of ALTERNATION so we can modify them directly
(loop for choices-rest on (choices alternation)
while choices-rest
@ -254,9 +258,8 @@ operation on REGEX."))
alternation)
(defmethod gather-strings ((branch branch))
(with-slots ((test test)
(then-regex then-regex)
(else-regex else-regex))
(declare #.*standard-optimize-settings*)
(with-slots (test then-regex else-regex)
branch
(setq test
(if (numberp test)
@ -267,6 +270,7 @@ operation on REGEX."))
branch))
(defmethod gather-strings ((regex regex))
(declare #.*standard-optimize-settings*)
(typecase regex
((or repetition register lookahead lookbehind standalone)
;; if REGEX contains exactly one inner REGEX object apply
@ -283,7 +287,7 @@ operation on REGEX."))
;; Note that START-ANCHORED-P will be called after FLATTEN and GATHER-STRINGS.
(defgeneric start-anchored-p (regex &optional in-seq-p)
(declare #.*standard-optimize-settings*)
(declare #.*standard-optimize-settings*)
(:documentation "Returns T if REGEX starts with a \"real\" start
anchor, i.e. one that's not in multi-line mode, NIL otherwise. If
IN-SEQ-P is true the function will return :ZERO-LENGTH if REGEX is a
@ -302,6 +306,7 @@ zero-length assertion."))
finally (return (and anchored-p (not (eq anchored-p :zero-length))))))
(defmethod start-anchored-p ((alternation alternation) &optional in-seq-p)
(declare #.*standard-optimize-settings*)
(declare (ignore in-seq-p))
;; clearly an alternation can only be start-anchored if all of its
;; choices are start-anchored
@ -309,30 +314,36 @@ zero-length assertion."))
always (start-anchored-p choice)))
(defmethod start-anchored-p ((branch branch) &optional in-seq-p)
(declare #.*standard-optimize-settings*)
(declare (ignore in-seq-p))
(and (start-anchored-p (then-regex branch))
(start-anchored-p (else-regex branch))))
(defmethod start-anchored-p ((repetition repetition) &optional in-seq-p)
(declare #.*standard-optimize-settings*)
(declare (ignore in-seq-p))
;; well, this wouldn't make much sense, but anyway...
(and (plusp (minimum repetition))
(start-anchored-p (regex repetition))))
(defmethod start-anchored-p ((register register) &optional in-seq-p)
(declare #.*standard-optimize-settings*)
(declare (ignore in-seq-p))
(start-anchored-p (regex register)))
(defmethod start-anchored-p ((standalone standalone) &optional in-seq-p)
(declare #.*standard-optimize-settings*)
(declare (ignore in-seq-p))
(start-anchored-p (regex standalone)))
(defmethod start-anchored-p ((anchor anchor) &optional in-seq-p)
(declare #.*standard-optimize-settings*)
(declare (ignore in-seq-p))
(and (startp anchor)
(not (multi-line-p anchor))))
(defmethod start-anchored-p ((regex regex) &optional in-seq-p)
(declare #.*standard-optimize-settings*)
(typecase regex
((or lookahead lookbehind word-boundary void)
;; zero-length assertions
@ -352,7 +363,7 @@ zero-length assertion."))
;; Note that END-STRING-AUX will be called after FLATTEN and GATHER-STRINGS.
(defgeneric end-string-aux (regex &optional old-case-insensitive-p)
(declare #.*standard-optimize-settings*)
(declare #.*standard-optimize-settings*)
(:documentation "Returns the constant string (if it exists) REGEX
ends with wrapped into a STR object, otherwise NIL.
OLD-CASE-INSENSITIVE-P is the CASE-INSENSITIVE-P slot of the last STR
@ -361,6 +372,7 @@ function called by END-STRIN.)"))
(defmethod end-string-aux ((str str)
&optional (old-case-insensitive-p :void))
(declare #.*standard-optimize-settings*)
(declare (special last-str))
(cond ((and (not (skip str)) ; avoid constituents of STARTS-WITH
;; only use STR if nothing has been collected yet or if
@ -376,6 +388,7 @@ function called by END-STRIN.)"))
(defmethod end-string-aux ((seq seq)
&optional (old-case-insensitive-p :void))
(declare #.*standard-optimize-settings*)
(declare (special continuep))
(let (case-insensitive-p
concatenated-string
@ -444,14 +457,17 @@ function called by END-STRIN.)"))
(defmethod end-string-aux ((register register)
&optional (old-case-insensitive-p :void))
(declare #.*standard-optimize-settings*)
(end-string-aux (regex register) old-case-insensitive-p))
(defmethod end-string-aux ((standalone standalone)
&optional (old-case-insensitive-p :void))
(declare #.*standard-optimize-settings*)
(end-string-aux (regex standalone) old-case-insensitive-p))
(defmethod end-string-aux ((regex regex)
&optional (old-case-insensitive-p :void))
(declare #.*standard-optimize-settings*)
(declare (special last-str end-anchored-p continuep))
(typecase regex
((or anchor lookahead lookbehind word-boundary void)
@ -474,14 +490,11 @@ function called by END-STRIN.)"))
;; REPETITION, FILTER)
nil)))
(defgeneric end-string (regex)
(declare #.*standard-optimize-settings*)
(:documentation "Returns the constant string (if it exists) REGEX ends with wrapped
into a STR object, otherwise NIL."))
(defmethod end-string ((regex regex))
(defun end-string (regex)
(declare (special end-string-offset))
(declare #.*standard-optimize-settings*)
(declare #.*standard-optimize-settings*)
"Returns the constant string (if it exists) REGEX ends with wrapped
into a STR object, otherwise NIL."
;; LAST-STR points to the last STR object (seen from the end) that's
;; part of END-STRING; CONTINUEP is set to T if we stop collecting
;; in the middle of a SEQ
@ -499,53 +512,64 @@ into a STR object, otherwise NIL."))
end-string-offset (offset last-str))))))
(defgeneric compute-min-rest (regex current-min-rest)
(declare #.*standard-optimize-settings*)
(declare #.*standard-optimize-settings*)
(:documentation "Returns the minimal length of REGEX plus
CURRENT-MIN-REST. This is similar to REGEX-MIN-LENGTH except that it
recurses down into REGEX and sets the MIN-REST slots of REPETITION
objects."))
(defmethod compute-min-rest ((seq seq) current-min-rest)
(declare #.*standard-optimize-settings*)
(loop for element in (reverse (elements seq))
for last-min-rest = current-min-rest then this-min-rest
for this-min-rest = (compute-min-rest element last-min-rest)
finally (return this-min-rest)))
(defmethod compute-min-rest ((alternation alternation) current-min-rest)
(declare #.*standard-optimize-settings*)
(loop for choice in (choices alternation)
minimize (compute-min-rest choice current-min-rest)))
(defmethod compute-min-rest ((branch branch) current-min-rest)
(declare #.*standard-optimize-settings*)
(min (compute-min-rest (then-regex branch) current-min-rest)
(compute-min-rest (else-regex branch) current-min-rest)))
(defmethod compute-min-rest ((str str) current-min-rest)
(declare #.*standard-optimize-settings*)
(+ current-min-rest (len str)))
(defmethod compute-min-rest ((filter filter) current-min-rest)
(declare #.*standard-optimize-settings*)
(+ current-min-rest (or (len filter) 0)))
(defmethod compute-min-rest ((repetition repetition) current-min-rest)
(declare #.*standard-optimize-settings*)
(setf (min-rest repetition) current-min-rest)
(compute-min-rest (regex repetition) current-min-rest)
(+ current-min-rest (* (minimum repetition) (min-len repetition))))
(defmethod compute-min-rest ((register register) current-min-rest)
(declare #.*standard-optimize-settings*)
(compute-min-rest (regex register) current-min-rest))
(defmethod compute-min-rest ((standalone standalone) current-min-rest)
(declare #.*standard-optimize-settings*)
(declare (ignore current-min-rest))
(compute-min-rest (regex standalone) 0))
(defmethod compute-min-rest ((lookahead lookahead) current-min-rest)
(declare #.*standard-optimize-settings*)
(compute-min-rest (regex lookahead) 0)
current-min-rest)
(defmethod compute-min-rest ((lookbehind lookbehind) current-min-rest)
(declare #.*standard-optimize-settings*)
(compute-min-rest (regex lookbehind) (+ current-min-rest (len lookbehind)))
current-min-rest)
(defmethod compute-min-rest ((regex regex) current-min-rest)
(declare #.*standard-optimize-settings*)
(typecase regex
((or char-class everything)
(1+ current-min-rest))

View File

@ -1,7 +1,7 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.19 2005/04/01 21:29:10 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.24 2008/06/25 14:04:27 edi Exp $
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -53,15 +53,14 @@
#:*regex-char-code-limit*
#:*use-bmh-matchers*
#:*allow-quoting*
#:*allow-named-registers*
#:ppcre-error
#:ppcre-invocation-error
#:ppcre-syntax-error
#:ppcre-syntax-error-string
#:ppcre-syntax-error-pos
#:register-groups-bind
#:do-register-groups
#:*standard-optimize-settings*
#:*special-optimize-settings*))
#:do-register-groups))
#+:cormanlisp
(defpackage "CL-PPCRE"
@ -86,15 +85,14 @@
"*REGEX-CHAR-CODE-LIMIT*"
"*USE-BMH-MATCHERS*"
"*ALLOW-QUOTING*"
"*ALLOW-NAMED-REGISTERS*"
"PPCRE-ERROR"
"PPCRE-INVOCATION-ERROR"
"PPCRE-SYNTAX-ERROR"
"PPCRE-SYNTAX-ERROR-STRING"
"PPCRE-SYNTAX-ERROR-POS"
"REGISTER-GROUPS-BIND"
"DO-REGISTER-GROUPS"
"*STANDARD-OPTIMIZE-SETTINGS*"
"*SPECIAL-OPTIMIZE-SETTINGS*"))
"DO-REGISTER-GROUPS"))
#-:cormanlisp
(defpackage #:cl-ppcre-test

View File

@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.25 2008/06/25 14:04:28 edi Exp $
;;; The parser will - with the help of the lexer - parse a regex
;;; string and convert it into a "parse tree" (see docs for details
@ -7,7 +7,7 @@
;;; illegal parse trees. It is assumed that the conversion process
;;; later on will track them down.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -38,19 +38,20 @@
(defun group (lexer)
(declare #.*standard-optimize-settings*)
"Parses and consumes a <group>.
The productions are: <group> -> \"(\"<regex>\")\"
\"(?:\"<regex>\")\"
\"(?>\"<regex>\")\"
\"(?<flags>:\"<regex>\")\"
\"(?=\"<regex>\")\"
\"(?!\"<regex>\")\"
\"(?<=\"<regex>\")\"
\"(?<!\"<regex>\")\"
\"(?(\"<num>\")\"<regex>\")\"
\"(?(\"<regex>\")\"<regex>\")\"
The productions are: <group> -> \"\(\"<regex>\")\"
\"\(?:\"<regex>\")\"
\"\(?>\"<regex>\")\"
\"\(?<flags>:\"<regex>\")\"
\"\(?=\"<regex>\")\"
\"\(?!\"<regex>\")\"
\"\(?<=\"<regex>\")\"
\"\(?<!\"<regex>\")\"
\"\(?\(\"<num>\")\"<regex>\")\"
\"\(?\(\"<regex>\")\"<regex>\")\"
\"\(?<name>\"<regex>\")\" \(when *ALLOW-NAMED-REGISTERS* is T)
<legal-token>
where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
Will return <parse-tree> or (<grouping-type> <parse-tree>) where
Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
<grouping-type> is one of six keywords - see source for details."
(multiple-value-bind (open-token flags)
(get-token lexer)
@ -103,17 +104,21 @@ Will return <parse-tree> or (<grouping-type> <parse-tree>) where
:open-paren-equal
:open-paren-exclamation
:open-paren-less-equal
:open-paren-less-exclamation)
:open-paren-less-exclamation
:open-paren-less-letter)
:test #'eq)
;; make changes to extended-mode-p local
(let ((*extended-mode-p* *extended-mode-p*))
;; we saw one of the six token representing opening
;; parentheses
(let* ((open-paren-pos (car (lexer-last-pos lexer)))
(register-name (when (eq open-token :open-paren-less-letter)
(parse-register-name-aux lexer)))
(reg-expr (reg-expr lexer))
(close-token (get-token lexer)))
(when (eq open-token :open-paren)
;; if this is the "("<regex>")" production we have to
(when (or (eq open-token :open-paren)
(eq open-token :open-paren-less-letter))
;; if this is the "("<regex>")" or "(?"<name>""<regex>")" production we have to
;; increment the register counter of the lexer
(incf (lexer-reg lexer)))
(unless (eq close-token :close-paren)
@ -126,27 +131,33 @@ Will return <parse-tree> or (<grouping-type> <parse-tree>) where
;; if the lexer has returned a list of flags this must
;; have been the "(?:"<regex>")" production
(cons :group (nconc flags (list reg-expr)))
(list (case open-token
((:open-paren)
:register)
((:open-paren-colon)
:group)
((:open-paren-greater)
:standalone)
((:open-paren-equal)
:positive-lookahead)
((:open-paren-exclamation)
:negative-lookahead)
((:open-paren-less-equal)
:positive-lookbehind)
((:open-paren-less-exclamation)
:negative-lookbehind))
reg-expr)))))
(if (eq open-token :open-paren-less-letter)
(list :named-register
;; every string was reversed, so we have to
;; reverse it back to get the name
(nreverse register-name)
reg-expr)
(list (case open-token
((:open-paren)
:register)
((:open-paren-colon)
:group)
((:open-paren-greater)
:standalone)
((:open-paren-equal)
:positive-lookahead)
((:open-paren-exclamation)
:negative-lookahead)
((:open-paren-less-equal)
:positive-lookbehind)
((:open-paren-less-exclamation)
:negative-lookbehind))
reg-expr))))))
(t
;; this is the <legal-token> production; <legal-token> is
;; any token which passes START-OF-SUBEXPR-P (otherwise
;; parsing had already stopped in the SEQ method)
open-token))))
;; this is the <legal-token> production; <legal-token> is
;; any token which passes START-OF-SUBEXPR-P (otherwise
;; parsing had already stopped in the SEQ method)
open-token))))
(defun greedy-quant (lexer)
(declare #.*standard-optimize-settings*)

View File

@ -1,7 +1,7 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/ppcre-tests.lisp,v 1.31 2005/08/23 12:23:13 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/ppcre-tests.lisp,v 1.36 2008/06/25 14:04:28 edi Exp $
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -41,7 +41,8 @@
#+:ecl (si:gc t)
#+:clisp (ext:gc)
#+:cormanlisp (loop for i from 0 to 3 do (cormanlisp:gc i))
#+:lispworks (hcl:mark-and-sweep 3)
#+:lispworks4 (hcl:mark-and-sweep 3)
#+:lispworks5 (hcl:gc-generation #+:lispworks-32bit 3 #+:lispworks-64bit :blocking-gen-num)
#+:sbcl (sb-ext:gc :full t))
;; warning: ugly code ahead!!
@ -52,7 +53,7 @@
multi-line-mode
single-line-mode
extended-mode)
(declare #.*standard-optimize-settings*)
(declare #.ppcre::*standard-optimize-settings*)
"Auxiliary function used by TEST to benchmark a regex scanner
against Perl timings."
(declare (type string string))
@ -73,7 +74,7 @@ against Perl timings."
lispworks
(and sbcl sb-thread))
(defun threaded-scan (scanner target-string &key (threads 10) (repetitions 5000))
(declare #.*standard-optimize-settings*)
(declare #.ppcre::*standard-optimize-settings*)
"Auxiliary function used by TEST to check whether SCANNER is thread-safe."
(full-gc)
(let ((collector (make-array threads))
@ -133,7 +134,7 @@ against Perl timings."
:defaults *cl-ppcre-test-base-directory*)
file-name-provided-p)
threaded)
(declare #.*standard-optimize-settings*)
(declare #.ppcre::*standard-optimize-settings*)
(declare (ignorable threaded))
"Loop through all test cases in FILE-NAME and print report. Only in
LispWorks and SCL: If THREADED is true, also test whether the scanners

View File

@ -1,11 +1,11 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.26 2005/06/10 10:23:42 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.34 2008/07/03 07:44:06 edi Exp $
;;; This file defines the REGEX class and some utility methods for
;;; this class. REGEX objects are used to represent the (transformed)
;;; parse trees internally
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -122,7 +122,10 @@ appear after this repetition.")
:reader num
:type fixnum
:documentation "The number of this register, starting from 0.
This is the index into *REGS-START* and *REGS-END*."))
This is the index into *REGS-START* and *REGS-END*.")
(name :initarg :name
:reader name
:documentation "Name of this register or NIL."))
(:documentation "REGISTER objects represent register groups."))
(defclass standalone (regex)
@ -137,18 +140,21 @@ This is the index into *REGS-START* and *REGS-END*."))
:type fixnum
:documentation "The number of the register this
reference refers to.")
(name :initarg :name
:accessor name
:documentation "The name of the register this
reference refers to or NIL.")
(case-insensitive-p :initarg :case-insensitive-p
:reader case-insensitive-p
:documentation "Whether we check
case-insensitively."))
(:documentation "BACK-REFERENCE objects represent backreferences."))
(defclass char-class (regex)
((hash :initarg :hash
:reader hash
:type (or hash-table null)
:documentation "A hash table the keys of which are the
characters; the values are always T.")
((charset :initarg :charset
:reader charset
:type (or charset null)
:documentation "A charset denoting the characters
in the character class.")
(case-insensitive-p :initarg :case-insensitive-p
:reader case-insensitive-p
:documentation "If the char class
@ -253,20 +259,16 @@ defined by the user."))
(defmethod initialize-instance :after ((char-class char-class) &rest init-args)
(declare #.*standard-optimize-settings*)
"Make large hash tables smaller, if possible."
(let ((hash (getf init-args :hash)))
(when (and hash
"Make large charsets smaller, if possible."
(let ((set (getf init-args :charset)))
(when (and set
(> *regex-char-code-limit* 256)
(> (hash-table-count hash)
(> (charset-count set)
(/ *regex-char-code-limit* 2)))
(setf (slot-value char-class 'hash)
(merge-inverted-hash (make-hash-table)
hash)
(setf (slot-value char-class 'set)
(merge-set (make-charset) set)
(slot-value char-class 'invertedp)
(not (slot-value char-class 'invertedp))))))
;;; The following four methods allow a VOID object to behave like a
;;; zero-length STR object (only readers needed)
(not (slot-value char-class 'invertedp))))))
(defmethod initialize-instance :after ((str str) &rest init-args)
(declare #.*standard-optimize-settings*)
@ -277,6 +279,9 @@ defined by the user."))
(setf (slot-value str 'str) (coerce str-slot 'simple-string))))
(setf (len str) (length (str str))))
;;; The following four methods allow a VOID object to behave like a
;;; zero-length STR object (only readers needed)
(defmethod len ((void void))
(declare #.*standard-optimize-settings*)
0)
@ -301,6 +306,7 @@ second argument if the STR has length 0. Returns NIL for REGEX objects
which are not of type STR."))
(defmethod case-mode ((str str) old-case-mode)
(declare #.*standard-optimize-settings*)
(cond ((zerop (len str))
old-case-mode)
((case-insensitive-p str)
@ -309,6 +315,7 @@ which are not of type STR."))
:case-sensitive)))
(defmethod case-mode ((regex regex) old-case-mode)
(declare #.*standard-optimize-settings*)
(declare (ignore old-case-mode))
nil)
@ -317,37 +324,45 @@ which are not of type STR."))
(:documentation "Implements a deep copy of a REGEX object."))
(defmethod copy-regex ((anchor anchor))
(declare #.*standard-optimize-settings*)
(make-instance 'anchor
:startp (startp anchor)
:multi-line-p (multi-line-p anchor)
:no-newline-p (no-newline-p anchor)))
(defmethod copy-regex ((everything everything))
(declare #.*standard-optimize-settings*)
(make-instance 'everything
:single-line-p (single-line-p everything)))
(defmethod copy-regex ((word-boundary word-boundary))
(declare #.*standard-optimize-settings*)
(make-instance 'word-boundary
:negatedp (negatedp word-boundary)))
(defmethod copy-regex ((void void))
(declare #.*standard-optimize-settings*)
(make-instance 'void))
(defmethod copy-regex ((lookahead lookahead))
(declare #.*standard-optimize-settings*)
(make-instance 'lookahead
:regex (copy-regex (regex lookahead))
:positivep (positivep lookahead)))
(defmethod copy-regex ((seq seq))
(declare #.*standard-optimize-settings*)
(make-instance 'seq
:elements (mapcar #'copy-regex (elements seq))))
(defmethod copy-regex ((alternation alternation))
(declare #.*standard-optimize-settings*)
(make-instance 'alternation
:choices (mapcar #'copy-regex (choices alternation))))
(defmethod copy-regex ((branch branch))
(with-slots ((test test))
(declare #.*standard-optimize-settings*)
(with-slots (test)
branch
(make-instance 'branch
:test (if (typep test 'regex)
@ -357,12 +372,14 @@ which are not of type STR."))
:else-regex (copy-regex (else-regex branch)))))
(defmethod copy-regex ((lookbehind lookbehind))
(declare #.*standard-optimize-settings*)
(make-instance 'lookbehind
:regex (copy-regex (regex lookbehind))
:positivep (positivep lookbehind)
:len (len lookbehind)))
(defmethod copy-regex ((repetition repetition))
(declare #.*standard-optimize-settings*)
(make-instance 'repetition
:regex (copy-regex (regex repetition))
:greedyp (greedyp repetition)
@ -373,32 +390,39 @@ which are not of type STR."))
:contains-register-p (contains-register-p repetition)))
(defmethod copy-regex ((register register))
(declare #.*standard-optimize-settings*)
(make-instance 'register
:regex (copy-regex (regex register))
:num (num register)))
:num (num register)
:name (name register)))
(defmethod copy-regex ((standalone standalone))
(declare #.*standard-optimize-settings*)
(make-instance 'standalone
:regex (copy-regex (regex standalone))))
(defmethod copy-regex ((back-reference back-reference))
(declare #.*standard-optimize-settings*)
(make-instance 'back-reference
:num (num back-reference)
:case-insensitive-p (case-insensitive-p back-reference)))
(defmethod copy-regex ((char-class char-class))
(declare #.*standard-optimize-settings*)
(make-instance 'char-class
:hash (hash char-class)
:charset (charset char-class)
:case-insensitive-p (case-insensitive-p char-class)
:invertedp (invertedp char-class)
:word-char-class-p (word-char-class-p char-class)))
(defmethod copy-regex ((str str))
(declare #.*standard-optimize-settings*)
(make-instance 'str
:str (str str)
:case-insensitive-p (case-insensitive-p str)))
(defmethod copy-regex ((filter filter))
(declare #.*standard-optimize-settings*)
(make-instance 'filter
:fn (fn filter)
:len (len filter)))
@ -420,6 +444,7 @@ optionally removes embedded REGISTER objects if possible and if the
special variable REMOVE-REGISTERS-P is true."))
(defmethod remove-registers ((register register))
(declare #.*standard-optimize-settings*)
(declare (special remove-registers-p reg-seen))
(cond (remove-registers-p
(remove-registers (regex register)))
@ -430,6 +455,7 @@ special variable REMOVE-REGISTERS-P is true."))
(copy-regex register))))
(defmethod remove-registers ((repetition repetition))
(declare #.*standard-optimize-settings*)
(let* (reg-seen
(inner-regex (remove-registers (regex repetition))))
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
@ -445,22 +471,26 @@ special variable REMOVE-REGISTERS-P is true."))
:contains-register-p reg-seen)))
(defmethod remove-registers ((standalone standalone))
(declare #.*standard-optimize-settings*)
(make-instance 'standalone
:regex (remove-registers (regex standalone))))
(defmethod remove-registers ((lookahead lookahead))
(declare #.*standard-optimize-settings*)
(make-instance 'lookahead
:regex (remove-registers (regex lookahead))
:positivep (positivep lookahead)))
(defmethod remove-registers ((lookbehind lookbehind))
(declare #.*standard-optimize-settings*)
(make-instance 'lookbehind
:regex (remove-registers (regex lookbehind))
:positivep (positivep lookbehind)
:len (len lookbehind)))
(defmethod remove-registers ((branch branch))
(with-slots ((test test))
(declare #.*standard-optimize-settings*)
(with-slots (test)
branch
(make-instance 'branch
:test (if (typep test 'regex)
@ -470,15 +500,18 @@ special variable REMOVE-REGISTERS-P is true."))
:else-regex (remove-registers (else-regex branch)))))
(defmethod remove-registers ((alternation alternation))
(declare #.*standard-optimize-settings*)
(declare (special remove-registers-p))
;; an ALTERNATION, so we can't remove REGISTER objects further down
(setq remove-registers-p nil)
(copy-regex alternation))
(defmethod remove-registers ((regex regex))
(declare #.*standard-optimize-settings*)
(copy-regex regex))
(defmethod remove-registers ((seq seq))
(declare #.*standard-optimize-settings*)
(make-instance 'seq
:elements (mapcar #'remove-registers (elements seq))))
@ -489,6 +522,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(i.e. the object corresponding to \".\", for example."))
(defmethod everythingp ((seq seq))
(declare #.*standard-optimize-settings*)
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
;; due to the parsing process
(let ((cleaned-elements (remove-if #'(lambda (element)
@ -498,7 +532,8 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(everythingp (first cleaned-elements)))))
(defmethod everythingp ((alternation alternation))
(with-slots ((choices choices))
(declare #.*standard-optimize-settings*)
(with-slots (choices)
alternation
(and (= 1 (length choices))
;; this is unlikely to happen for human-generated regexes,
@ -506,9 +541,8 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(everythingp (first choices)))))
(defmethod everythingp ((repetition repetition))
(with-slots ((maximum maximum)
(minimum minimum)
(regex regex))
(declare #.*standard-optimize-settings*)
(with-slots (maximum minimum regex)
repetition
(and maximum
(= 1 minimum maximum)
@ -516,15 +550,19 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(everythingp regex))))
(defmethod everythingp ((register register))
(declare #.*standard-optimize-settings*)
(everythingp (regex register)))
(defmethod everythingp ((standalone standalone))
(declare #.*standard-optimize-settings*)
(everythingp (regex standalone)))
(defmethod everythingp ((everything everything))
(declare #.*standard-optimize-settings*)
everything)
(defmethod everythingp ((regex regex))
(declare #.*standard-optimize-settings*)
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
nil)
@ -534,6 +572,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
(defmethod regex-length ((seq seq))
(declare #.*standard-optimize-settings*)
;; simply add all inner lengths unless one of them is NIL
(loop for sub-regex in (elements seq)
for len = (regex-length sub-regex)
@ -541,6 +580,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
sum len))
(defmethod regex-length ((alternation alternation))
(declare #.*standard-optimize-settings*)
;; only return a true value if all inner lengths are non-NIL and
;; mutually equal
(loop for sub-regex in (choices alternation)
@ -551,6 +591,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
finally (return len)))
(defmethod regex-length ((branch branch))
(declare #.*standard-optimize-settings*)
;; only return a true value if both alternations have a length and
;; if they're equal
(let ((then-length (regex-length (then-regex branch))))
@ -559,13 +600,12 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
then-length)))
(defmethod regex-length ((repetition repetition))
(declare #.*standard-optimize-settings*)
;; we can only compute the length of a REPETITION object if the
;; number of repetitions is fixed; note that we don't call
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
;; always set correctly
(with-slots ((len len)
(minimum minimum)
(maximum maximum))
(with-slots (len minimum maximum)
repetition
(if (and len
(eql minimum maximum))
@ -573,29 +613,37 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
nil)))
(defmethod regex-length ((register register))
(declare #.*standard-optimize-settings*)
(regex-length (regex register)))
(defmethod regex-length ((standalone standalone))
(declare #.*standard-optimize-settings*)
(regex-length (regex standalone)))
(defmethod regex-length ((back-reference back-reference))
(declare #.*standard-optimize-settings*)
;; with enough effort we could possibly do better here, but
;; currently we just give up and return NIL
nil)
(defmethod regex-length ((char-class char-class))
(declare #.*standard-optimize-settings*)
1)
(defmethod regex-length ((everything everything))
(declare #.*standard-optimize-settings*)
1)
(defmethod regex-length ((str str))
(declare #.*standard-optimize-settings*)
(len str))
(defmethod regex-length ((filter filter))
(declare #.*standard-optimize-settings*)
(len filter))
(defmethod regex-length ((regex regex))
(declare #.*standard-optimize-settings*)
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
;; WORD-BOUNDARY (which all have zero-length)
0)
@ -605,12 +653,14 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(:documentation "Returns the minimal length of REGEX."))
(defmethod regex-min-length ((seq seq))
(declare #.*standard-optimize-settings*)
;; simply add all inner minimal lengths
(loop for sub-regex in (elements seq)
for len = (regex-min-length sub-regex)
sum len))
(defmethod regex-min-length ((alternation alternation))
(declare #.*standard-optimize-settings*)
;; minimal length of an alternation is the minimal length of the
;; "shortest" element
(loop for sub-regex in (choices alternation)
@ -618,35 +668,44 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
minimize len))
(defmethod regex-min-length ((branch branch))
(declare #.*standard-optimize-settings*)
;; minimal length of both alternations
(min (regex-min-length (then-regex branch))
(regex-min-length (else-regex branch))))
(defmethod regex-min-length ((repetition repetition))
(declare #.*standard-optimize-settings*)
;; obviously the product of the inner minimal length and the minimal
;; number of repetitions
(* (minimum repetition) (min-len repetition)))
(defmethod regex-min-length ((register register))
(declare #.*standard-optimize-settings*)
(regex-min-length (regex register)))
(defmethod regex-min-length ((standalone standalone))
(declare #.*standard-optimize-settings*)
(regex-min-length (regex standalone)))
(defmethod regex-min-length ((char-class char-class))
(declare #.*standard-optimize-settings*)
1)
(defmethod regex-min-length ((everything everything))
(declare #.*standard-optimize-settings*)
1)
(defmethod regex-min-length ((str str))
(declare #.*standard-optimize-settings*)
(len str))
(defmethod regex-min-length ((filter filter))
(declare #.*standard-optimize-settings*)
(or (len filter)
0))
(defmethod regex-min-length ((regex regex))
(declare #.*standard-optimize-settings*)
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
0)
@ -664,6 +723,7 @@ slots of STR objects further down the tree."))
;; into repetitions
(defmethod compute-offsets ((seq seq) start-pos)
(declare #.*standard-optimize-settings*)
(loop for element in (elements seq)
;; advance offset argument for next call while looping through
;; the elements
@ -673,6 +733,7 @@ slots of STR objects further down the tree."))
finally (return curr-offset)))
(defmethod compute-offsets ((alternation alternation) start-pos)
(declare #.*standard-optimize-settings*)
(loop for choice in (choices alternation)
for old-offset = nil then curr-offset
for curr-offset = (compute-offsets choice start-pos)
@ -684,6 +745,7 @@ slots of STR objects further down the tree."))
finally (return curr-offset)))
(defmethod compute-offsets ((branch branch) start-pos)
(declare #.*standard-optimize-settings*)
;; only return offset if both alternations have equal value
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
(and then-offset
@ -691,10 +753,9 @@ slots of STR objects further down the tree."))
then-offset)))
(defmethod compute-offsets ((repetition repetition) start-pos)
(declare #.*standard-optimize-settings*)
;; no need to descend into the inner regex
(with-slots ((len len)
(minimum minimum)
(maximum maximum))
(with-slots (len minimum maximum)
repetition
(if (and len
(eq minimum maximum))
@ -704,34 +765,42 @@ slots of STR objects further down the tree."))
nil)))
(defmethod compute-offsets ((register register) start-pos)
(declare #.*standard-optimize-settings*)
(compute-offsets (regex register) start-pos))
(defmethod compute-offsets ((standalone standalone) start-pos)
(declare #.*standard-optimize-settings*)
(compute-offsets (regex standalone) start-pos))
(defmethod compute-offsets ((char-class char-class) start-pos)
(declare #.*standard-optimize-settings*)
(1+ start-pos))
(defmethod compute-offsets ((everything everything) start-pos)
(declare #.*standard-optimize-settings*)
(1+ start-pos))
(defmethod compute-offsets ((str str) start-pos)
(declare #.*standard-optimize-settings*)
(setf (offset str) start-pos)
(+ start-pos (len str)))
(defmethod compute-offsets ((back-reference back-reference) start-pos)
(declare #.*standard-optimize-settings*)
;; with enough effort we could possibly do better here, but
;; currently we just give up and return NIL
(declare (ignore start-pos))
nil)
(defmethod compute-offsets ((filter filter) start-pos)
(declare #.*standard-optimize-settings*)
(let ((len (len filter)))
(if len
(+ start-pos len)
nil)))
(defmethod compute-offsets ((regex regex) start-pos)
(declare #.*standard-optimize-settings*)
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
;; WORD-BOUNDARY (which all have zero-length)
start-pos)

View File

@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.24 2005/04/13 15:35:58 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.29 2008/06/25 14:04:28 edi Exp $
;;; This is actually a part of closures.lisp which we put into a
;;; separate file because it is rather complex. We only deal with
@ -7,7 +7,7 @@
;;; rather crazy micro-optimizations which were introduced to be as
;;; competitive with Perl as possible in tight loops.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -804,45 +804,41 @@ that REPETITION has a constant number of repetitions."))
;; utilizes all the functions and macros defined above
(defmethod create-matcher-aux ((repetition repetition) next-fn)
(with-slots ((minimum minimum)
(maximum maximum)
(len len)
(min-len min-len)
(greedyp greedyp)
(contains-register-p contains-register-p))
(declare #.*standard-optimize-settings*)
(with-slots (minimum maximum len min-len greedyp contains-register-p)
repetition
(cond ((and maximum
(zerop maximum))
;; this should have been optimized away by CONVERT but just
;; in case...
(error "Got REPETITION with MAXIMUM 0 \(should not happen)"))
;; this should have been optimized away by CONVERT but just
;; in case...
(error "Got REPETITION with MAXIMUM 0 \(should not happen)"))
((and maximum
(= minimum maximum 1))
;; this should have been optimized away by CONVERT but just
;; in case...
(error "Got REPETITION with MAXIMUM 1 and MINIMUM 1 \(should not happen)"))
;; this should have been optimized away by CONVERT but just
;; in case...
(error "Got REPETITION with MAXIMUM 1 and MINIMUM 1 \(should not happen)"))
((and (eql minimum maximum)
len
(not contains-register-p))
(create-constant-repetition-constant-length-matcher repetition next-fn))
(create-constant-repetition-constant-length-matcher repetition next-fn))
((eql minimum maximum)
(create-constant-repetition-matcher repetition next-fn))
(create-constant-repetition-matcher repetition next-fn))
((and greedyp
len
(not contains-register-p))
(create-greedy-constant-length-matcher repetition next-fn))
(create-greedy-constant-length-matcher repetition next-fn))
((and greedyp
(or (plusp min-len)
(eql maximum 1)))
(create-greedy-no-zero-matcher repetition next-fn))
(create-greedy-no-zero-matcher repetition next-fn))
(greedyp
(create-greedy-matcher repetition next-fn))
(create-greedy-matcher repetition next-fn))
((and len
(plusp len)
(not contains-register-p))
(create-non-greedy-constant-length-matcher repetition next-fn))
(create-non-greedy-constant-length-matcher repetition next-fn))
((or (plusp min-len)
(eql maximum 1))
(create-non-greedy-no-zero-matcher repetition next-fn))
(create-non-greedy-no-zero-matcher repetition next-fn))
(t
(create-non-greedy-matcher repetition next-fn)))))
(create-non-greedy-matcher repetition next-fn)))))

View File

@ -1,10 +1,10 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.26 2005/07/19 23:18:15 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.29 2008/06/25 14:04:28 edi Exp $
;;; Here the scanner for the actual regex as well as utility scanners
;;; for the constant start and end strings are created.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions

View File

@ -1,9 +1,9 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.21 2005/04/01 21:29:10 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.27 2008/07/03 08:13:28 edi Exp $
;;; globally declared special variables
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -45,7 +45,7 @@
(defvar *special-optimize-settings*
'(optimize speed space)
"Special optimize settings used only be a few declaration expressions.")
"Special optimize settings used only by a few declaration expressions.")
;;; special variables used by the lexer/parser combo
@ -55,6 +55,13 @@
;;; special variables used by the SCAN function and the matchers
(defvar *regex-char-code-limit* char-code-limit
"The upper exclusive bound on the char-codes of characters which can
occur in character classes. Change this value BEFORE creating
scanners if you don't need the \(full) Unicode support of
implementations like AllegroCL, CLISP, LispWorks, or SBCL.")
(declaim (type fixnum *regex-char-code-limit*))
(defvar *string* ""
"The string which is currently scanned by SCAN.
Will always be coerced to a SIMPLE-STRING.")
@ -120,11 +127,16 @@ but large) Boyer-Moore-Horspool matchers.")
(defvar *allow-quoting* nil
"Whether the parser should support Perl's \\Q and \\E.")
(defvar *allow-named-registers* nil
"Whether the parser should support AllegroCL's named registers
\(?<name>\"<regex>\") and back-reference \\k<name> syntax.")
(pushnew :cl-ppcre *features*)
;; stuff for Nikodemus Siivola's HYPERDOC
;; see <http://common-lisp.net/project/hyperdoc/>
;; and <http://www.cliki.net/hyperdoc>
;; also used by LW-ADD-ONS
(defvar *hyperdoc-base-uri* "http://weitz.de/cl-ppcre/")

View File

@ -14285,3 +14285,4 @@ b" nil 1 0 "b" (nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil)
./" "(?x)a#\\Q
." nil nil nil nil "aa" nil 1 0 nil nil)
(1623 "\"abcdxklqj\" =~ /ab(?=.*q)cd/" "ab(?=.*q)cd" nil nil nil nil "abcdxklqj" nil 1 0 "abcd" (nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil))
(1624 "\"ab\" =~ /a(?!.*$)b/" "a(?!.*$)b" nil nil nil nil "ab" nil 1 0 nil nil)

View File

@ -3943,3 +3943,6 @@
abcdxklqj
/a(?!.*$)b/
ab
/.{2}[a-z]/

180
util.lisp
View File

@ -1,13 +1,10 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.32 2005/08/23 10:32:30 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.40 2008/07/03 10:06:16 edi Exp $
;;; Utility functions and constants dealing with the hash-tables
;;; we use to encode character classes
;;; Utility functions and constants dealing with the character sets we
;;; use to encode character classes
;;; Hash-tables are treated like sets, i.e. a character C is a member of the
;;; hash-table H iff (GETHASH C H) is true.
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -36,7 +33,8 @@
(in-package #:cl-ppcre)
#+:lispworks
(import 'lw:with-unique-names)
(eval-when (:compile-toplevel :load-toplevel :execute)
(import 'lw:with-unique-names))
#-:lispworks
(defmacro with-unique-names ((&rest bindings) &body body)
@ -103,22 +101,15 @@ are discarded \(that is, the body is an implicit PROGN)."
,,@body))))))
(eval-when (:compile-toplevel :execute :load-toplevel)
(defvar *regex-char-code-limit* char-code-limit
"The upper exclusive bound on the char-codes of characters which
can occur in character classes. Change this value BEFORE creating
scanners if you don't need the Unicode support of implementations like
AllegroCL, CLISP, LispWorks, or SBCL.")
(declaim (type fixnum *regex-char-code-limit*))
(defun make-char-hash (test)
(defun make-char-set (test)
(declare #.*special-optimize-settings*)
"Returns a hash-table of all characters satisfying test."
(loop with hash = (make-hash-table)
for c of-type fixnum from 0 below char-code-limit
for chr = (code-char c)
if (and chr (funcall test chr))
do (setf (gethash chr hash) t)
finally (return hash)))
"Returns a CHARSET for all characters satisfying test."
(loop with set = (make-charset)
for code of-type fixnum from 0 below char-code-limit
for char = (code-char code)
if (and char (funcall test char))
do (add-to-charset char set)
finally (return set)))
(declaim (inline word-char-p))
@ -147,101 +138,72 @@ i.e. whether it would match [\\s] in Perl."
;; the following DEFCONSTANT statements are wrapped with
;; (UNLESS (BOUNDP ...) ...) to make SBCL happy
(unless (boundp '+digit-hash+)
(defconstant +digit-hash+
(make-char-hash (lambda (chr) (char<= #\0 chr #\9)))
"Hash-table containing the digits from 0 to 9."))
(unless (boundp '+digit-set+)
(defconstant +digit-set+
(make-char-set (lambda (chr) (char<= #\0 chr #\9)))
"Character set containing the digits from 0 to 9."))
(unless (boundp '+word-char-hash+)
(defconstant +word-char-hash+
(make-char-hash #'word-char-p)
"Hash-table containing all \"word\" characters."))
(unless (boundp '+word-char-set+)
(defconstant +word-char-set+
(make-char-set #'word-char-p)
"Character set containing all \"word\" characters."))
(unless (boundp '+whitespace-char-hash+)
(defconstant +whitespace-char-hash+
(make-char-hash #'whitespacep)
"Hash-table containing all whitespace characters."))
(unless (boundp '+whitespace-char-set+)
(defconstant +whitespace-char-set+
(make-char-set #'whitespacep)
"Character set containing all whitespace characters."))
(defun merge-hash (hash1 hash2)
(defun create-ranges-from-set (set &key downcasep)
(declare #.*standard-optimize-settings*)
"Returns the \"sum\" of two hashes. This is a destructive operation
on HASH1."
(cond ((> (hash-table-count hash2)
*regex-char-code-limit*)
;; don't walk through, e.g., the whole +WORD-CHAR-HASH+ if
;; the user has set *REGEX-CHAR-CODE-LIMIT* to a lower value
(loop for c of-type fixnum from 0 below *regex-char-code-limit*
for chr = (code-char c)
if (and chr (gethash chr hash2))
do (setf (gethash chr hash1) t)))
(t
(loop for chr being the hash-keys of hash2
do (setf (gethash chr hash1) t))))
hash1)
(defun merge-inverted-hash (hash1 hash2)
(declare #.*standard-optimize-settings*)
"Returns the \"sum\" of HASH1 and the \"inverse\" of HASH2. This is
a destructive operation on HASH1."
(loop for c of-type fixnum from 0 below *regex-char-code-limit*
for chr = (code-char c)
if (and chr (not (gethash chr hash2)))
do (setf (gethash chr hash1) t))
hash1)
(defun create-ranges-from-hash (hash &key downcasep)
(declare #.*standard-optimize-settings*)
"Tries to identify up to three intervals (with respect to CHAR<)
which together comprise HASH. Returns NIL if this is not possible.
If DOWNCASEP is true it will treat the hash-table as if it represents
both the lower-case and the upper-case variants of its members and
will only return the respective lower-case intervals."
;; discard empty hash-tables
(unless (and hash (plusp (hash-table-count hash)))
(return-from create-ranges-from-hash nil))
"Tries to identify up to three intervals \(with respect to CHAR<)
which together comprise the charset SET. Returns NIL if this is not
possible. If DOWNCASEP is true it will treat the charset as if it
represents both the lower-case and the upper-case variants of its
members and will only return the respective lower-case intervals."
;; discard empty charsets
(unless (and set (plusp (charset-count set)))
(return-from create-ranges-from-set nil))
(loop with min1 and min2 and min3
and max1 and max2 and max3
;; loop through all characters in HASH, sorted by CHAR<
for chr in (sort (the list
(loop for chr being the hash-keys of hash
collect (if downcasep
(char-downcase chr)
chr)))
#'char<)
for code = (char-code chr)
;; loop through all characters in SET, sorted by CHAR<
;; (actually by < on their character codes, see 13.1.6 in the
;; ANSI standard)
for code of-type fixnum below *regex-char-code-limit*
for char = (code-char code)
when (and char (in-charset-p (if downcasep (char-downcase char) char) set))
;; MIN1, MAX1, etc. are _exclusive_
;; bounds of the intervals identified so far
do (cond
((not min1)
;; this will only happen once, for the first character
(setq min1 (1- code)
max1 (1+ code)))
((<= (the fixnum min1) code (the fixnum max1))
;; we're here as long as CHR fits into the first interval
(setq min1 (min (the fixnum min1) (1- code))
max1 (max (the fixnum max1) (1+ code))))
((not min2)
;; we need to open a second interval
;; this'll also happen only once
(setq min2 (1- code)
max2 (1+ code)))
((<= (the fixnum min2) code (the fixnum max2))
;; CHR fits into the second interval
(setq min2 (min (the fixnum min2) (1- code))
max2 (max (the fixnum max2) (1+ code))))
((not min3)
;; we need to open the third interval
;; happens only once
(setq min3 (1- code)
max3 (1+ code)))
((<= (the fixnum min3) code (the fixnum max3))
;; CHR fits into the third interval
(setq min3 (min (the fixnum min3) (1- code))
max3 (max (the fixnum max3) (1+ code))))
(t
;; we're out of luck, CHR doesn't fit
;; into one of the three intervals
(return nil)))
((not min1)
;; this will only happen once, for the first character
(setq min1 (1- code)
max1 (1+ code)))
((<= (the fixnum min1) code (the fixnum max1))
;; we're here as long as CHAR fits into the first interval
(setq min1 (min (the fixnum min1) (1- code))
max1 (max (the fixnum max1) (1+ code))))
((not min2)
;; we need to open a second interval
;; this'll also happen only once
(setq min2 (1- code)
max2 (1+ code)))
((<= (the fixnum min2) code (the fixnum max2))
;; CHAR fits into the second interval
(setq min2 (min (the fixnum min2) (1- code))
max2 (max (the fixnum max2) (1+ code))))
((not min3)
;; we need to open the third interval
;; happens only once
(setq min3 (1- code)
max3 (1+ code)))
((<= (the fixnum min3) code (the fixnum max3))
;; CHAR fits into the third interval
(setq min3 (min (the fixnum min3) (1- code))
max3 (max (the fixnum max3) (1+ code))))
(t
;; we're out of luck, CHAR doesn't fit
;; into one of the three intervals
(return nil)))
;; on success return all bounds
;; make them inclusive bounds before returning
finally (return (values (code-char (1+ min1))