Update to current dev version
git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@3581 4281704c-cde7-0310-8518-8e2dc76b1ff0
This commit is contained in:
12
CHANGELOG
12
CHANGELOG
@ -1,3 +1,15 @@
|
|||||||
|
Version 2.0.0
|
||||||
|
2008-07-23
|
||||||
|
Added named properties (\p{foo})
|
||||||
|
Added Unicode support
|
||||||
|
Introduced test functions for character classes
|
||||||
|
Added optional test function optimization
|
||||||
|
Cleaned up test suite, removed performance cruft
|
||||||
|
Removed the various alternative system definitions (too much maintenance work)
|
||||||
|
Exported PARSE-STRING
|
||||||
|
General cleanup
|
||||||
|
Lots of documentation additions
|
||||||
|
|
||||||
Version 1.4.1
|
Version 1.4.1
|
||||||
2008-07-03
|
2008-07-03
|
||||||
Skip non-characters in CREATE-RANGES-FROM-SET
|
Skip non-characters in CREATE-RANGES-FROM-SET
|
||||||
|
|||||||
62
README
62
README
@ -1,62 +0,0 @@
|
|||||||
Complete documentation for CL-PPCRE can be found in the 'doc'
|
|
||||||
directory.
|
|
||||||
|
|
||||||
CL-PPCRE also supports Nikodemus Siivola's HYPERDOC, see
|
|
||||||
<http://common-lisp.net/project/hyperdoc/> and
|
|
||||||
<http://www.cliki.net/hyperdoc>.
|
|
||||||
|
|
||||||
1. Installation
|
|
||||||
|
|
||||||
1.1. Probably the easiest way is
|
|
||||||
|
|
||||||
(load "/path/to/cl-ppcre/load.lisp")
|
|
||||||
|
|
||||||
This should compile and load CL-PPCRE on most Common Lisp
|
|
||||||
implementations.
|
|
||||||
|
|
||||||
1.2. With MK:DEFSYSTEM you can make a symbolic link from
|
|
||||||
'cl-ppcre.system' and 'cl-ppcre-test.system' to your central registry
|
|
||||||
(which by default is in '/usr/local/lisp/Registry/') and then issue
|
|
||||||
the command
|
|
||||||
|
|
||||||
(mk:compile-system "cl-ppcre")
|
|
||||||
|
|
||||||
Note that this relies on TRUENAME returning the original file a
|
|
||||||
symbolic link is pointing to. This will only work with AllegroCL
|
|
||||||
6.2 if you've applied all patches with (SYS:UPDATE-ALLEGRO).
|
|
||||||
|
|
||||||
1.3. You can also use ASDF instead of MK:DEFSYSTEM in a similar way
|
|
||||||
(use the .asd files instead of the .system files).
|
|
||||||
|
|
||||||
1.4. For LispWorks there's a file 'lispworks-defsystem.lisp' which includes
|
|
||||||
a system definition for LispWork's Common Defsystem.
|
|
||||||
|
|
||||||
2. Test
|
|
||||||
|
|
||||||
CL-PPCRE comes with a test suite that can be used to check its
|
|
||||||
compatibility with Perl's regex syntax. See the documentation on how
|
|
||||||
to use this test suite for benchmarks and on how to write your own
|
|
||||||
tests.
|
|
||||||
|
|
||||||
2.1. If you've used 'load.lisp' to load CL-PPCRE you already have the
|
|
||||||
test suite loaded and can start the default tests with
|
|
||||||
|
|
||||||
(cl-ppcre-test:test)
|
|
||||||
|
|
||||||
2.2. With MK:DEFSYSTEM you need to compile the 'cl-ppcre-test' system
|
|
||||||
as well before you can proceed as in 2.1.
|
|
||||||
|
|
||||||
2.3. Same for ASDF.
|
|
||||||
|
|
||||||
Depending on your machine and your CL implementation the default test
|
|
||||||
will take between a few seconds and a couple of minutes. (It will
|
|
||||||
print a dot for every tenth test case while it proceeds to give some
|
|
||||||
visual feedback.) It should exactly report three 'errors' (662, 790,
|
|
||||||
and 1439) which are explained in the documentation.
|
|
||||||
|
|
||||||
MCL might report an error for the ninth test case which is also
|
|
||||||
explained in the docs.
|
|
||||||
|
|
||||||
Genera notes (thanks to Patrick O'Donnell): Some more tests will fail
|
|
||||||
because characters like #\Return, #\Linefeed, or #\Tab have encodings
|
|
||||||
which differ from Perl's (and thus CL-PPCRE's) expectations.
|
|
||||||
154
api.lisp
154
api.lisp
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.79 2008/07/03 08:39:10 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.84 2008/07/06 18:12:04 edi Exp $
|
||||||
|
|
||||||
;;; The external API for creating and using scanners.
|
;;; The external API for creating and using scanners.
|
||||||
|
|
||||||
@ -29,7 +29,7 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(defgeneric create-scanner (regex &key case-insensitive-mode
|
(defgeneric create-scanner (regex &key case-insensitive-mode
|
||||||
multi-line-mode
|
multi-line-mode
|
||||||
@ -39,10 +39,10 @@
|
|||||||
(:documentation "Accepts a regular expression - either as a
|
(:documentation "Accepts a regular expression - either as a
|
||||||
parse-tree or as a string - and returns a scan closure which will scan
|
parse-tree or as a string - and returns a scan closure which will scan
|
||||||
strings for this regular expression and a list mapping registers to
|
strings for this regular expression and a list mapping registers to
|
||||||
their names \(NIL stands for unnamed ones). The \"mode\" keyboard
|
their names \(NIL stands for unnamed ones). The \"mode\" keyboard
|
||||||
arguments are equivalent to the imsx modifiers in Perl. If DESTRUCTIVE
|
arguments are equivalent to the imsx modifiers in Perl. If
|
||||||
is not NIL the function is allowed to destructively modify its first
|
DESTRUCTIVE is not NIL, the function is allowed to destructively
|
||||||
argument \(but only if it's a parse tree)."))
|
modify its first argument \(but only if it's a parse tree)."))
|
||||||
|
|
||||||
#-:use-acl-regexp2-engine
|
#-:use-acl-regexp2-engine
|
||||||
(defmethod create-scanner ((regex-string string) &key case-insensitive-mode
|
(defmethod create-scanner ((regex-string string) &key case-insensitive-mode
|
||||||
@ -76,8 +76,7 @@ argument \(but only if it's a parse tree)."))
|
|||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (ignore destructive))
|
(declare (ignore destructive))
|
||||||
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
|
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
|
||||||
(signal-ppcre-invocation-error
|
(signal-invocation-error "You can't use the keyword arguments to modify an existing scanner."))
|
||||||
"You can't use the keyword arguments to modify an existing scanner."))
|
|
||||||
scanner)
|
scanner)
|
||||||
|
|
||||||
#-:use-acl-regexp2-engine
|
#-:use-acl-regexp2-engine
|
||||||
@ -88,8 +87,7 @@ argument \(but only if it's a parse tree)."))
|
|||||||
destructive)
|
destructive)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(when extended-mode
|
(when extended-mode
|
||||||
(signal-ppcre-invocation-error
|
(signal-invocation-error "Extended mode doesn't make sense in parse trees."))
|
||||||
"Extended mode doesn't make sense in parse trees."))
|
|
||||||
;; convert parse-tree into internal representation REGEX and at the
|
;; convert parse-tree into internal representation REGEX and at the
|
||||||
;; same time compute the number of registers and the constant string
|
;; same time compute the number of registers and the constant string
|
||||||
;; (or anchor) the regex starts with (if any)
|
;; (or anchor) the regex starts with (if any)
|
||||||
@ -180,7 +178,6 @@ argument \(but only if it's a parse tree)."))
|
|||||||
|
|
||||||
#+:use-acl-regexp2-engine
|
#+:use-acl-regexp2-engine
|
||||||
(declaim (inline create-scanner))
|
(declaim (inline create-scanner))
|
||||||
|
|
||||||
#+:use-acl-regexp2-engine
|
#+:use-acl-regexp2-engine
|
||||||
(defmethod create-scanner ((scanner regexp::regular-expression) &key case-insensitive-mode
|
(defmethod create-scanner ((scanner regexp::regular-expression) &key case-insensitive-mode
|
||||||
multi-line-mode
|
multi-line-mode
|
||||||
@ -190,8 +187,7 @@ argument \(but only if it's a parse tree)."))
|
|||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (ignore destructive))
|
(declare (ignore destructive))
|
||||||
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
|
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
|
||||||
(signal-ppcre-invocation-error
|
(signal-invocation-error "You can't use the keyword arguments to modify an existing scanner."))
|
||||||
"You can't use the keyword arguments to modify an existing scanner."))
|
|
||||||
scanner)
|
scanner)
|
||||||
|
|
||||||
#+:use-acl-regexp2-engine
|
#+:use-acl-regexp2-engine
|
||||||
@ -254,7 +250,6 @@ internal purposes."))
|
|||||||
|
|
||||||
#+:use-acl-regexp2-engine
|
#+:use-acl-regexp2-engine
|
||||||
(declaim (inline scan))
|
(declaim (inline scan))
|
||||||
|
|
||||||
#+:use-acl-regexp2-engine
|
#+:use-acl-regexp2-engine
|
||||||
(defmethod scan ((parse-tree t) target-string
|
(defmethod scan ((parse-tree t) target-string
|
||||||
&key (start 0)
|
&key (start 0)
|
||||||
@ -292,12 +287,12 @@ internal purposes."))
|
|||||||
(defun scan-to-strings (regex target-string &key (start 0)
|
(defun scan-to-strings (regex target-string &key (start 0)
|
||||||
(end (length target-string))
|
(end (length target-string))
|
||||||
sharedp)
|
sharedp)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Like SCAN but returns substrings of TARGET-STRING instead of
|
"Like SCAN but returns substrings of TARGET-STRING instead of
|
||||||
positions, i.e. this function returns two values on success: the whole
|
positions, i.e. this function returns two values on success: the whole
|
||||||
match as a string plus an array of substrings (or NILs) corresponding
|
match as a string plus an array of substrings (or NILs) corresponding
|
||||||
to the matched registers. If SHAREDP is true, the substrings may share
|
to the matched registers. If SHAREDP is true, the substrings may
|
||||||
structure with TARGET-STRING."
|
share structure with TARGET-STRING."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(multiple-value-bind (match-start match-end reg-starts reg-ends)
|
(multiple-value-bind (match-start match-end reg-starts reg-ends)
|
||||||
(scan regex target-string :start start :end end)
|
(scan regex target-string :start start :end end)
|
||||||
(unless match-start
|
(unless match-start
|
||||||
@ -329,11 +324,11 @@ structure with TARGET-STRING."
|
|||||||
"Executes BODY with the variables in VAR-LIST bound to the
|
"Executes BODY with the variables in VAR-LIST bound to the
|
||||||
corresponding register groups after TARGET-STRING has been matched
|
corresponding register groups after TARGET-STRING has been matched
|
||||||
against REGEX, i.e. each variable is either bound to a string or to
|
against REGEX, i.e. each variable is either bound to a string or to
|
||||||
NIL. If there is no match, BODY is _not_ executed. For each element of
|
NIL. If there is no match, BODY is _not_ executed. For each element
|
||||||
VAR-LIST which is NIL there's no binding to the corresponding register
|
of VAR-LIST which is NIL there's no binding to the corresponding
|
||||||
group. The number of variables in VAR-LIST must not be greater than
|
register group. The number of variables in VAR-LIST must not be
|
||||||
the number of register groups. If SHAREDP is true, the substrings may
|
greater than the number of register groups. If SHAREDP is true, the
|
||||||
share structure with TARGET-STRING."
|
substrings may share structure with TARGET-STRING."
|
||||||
(with-rebinding (target-string)
|
(with-rebinding (target-string)
|
||||||
(with-unique-names (match-start match-end reg-starts reg-ends
|
(with-unique-names (match-start match-end reg-starts reg-ends
|
||||||
start-index substr-fn)
|
start-index substr-fn)
|
||||||
@ -368,10 +363,10 @@ share structure with TARGET-STRING."
|
|||||||
&environment env)
|
&environment env)
|
||||||
"Iterates over TARGET-STRING and tries to match REGEX as often as
|
"Iterates over TARGET-STRING and tries to match REGEX as often as
|
||||||
possible evaluating BODY with MATCH-START, MATCH-END, REG-STARTS, and
|
possible evaluating BODY with MATCH-START, MATCH-END, REG-STARTS, and
|
||||||
REG-ENDS bound to the four return values of each match in turn. After
|
REG-ENDS bound to the four return values of each match in turn. After
|
||||||
the last match, returns RESULT-FORM if provided or NIL otherwise. An
|
the last match, returns RESULT-FORM if provided or NIL otherwise. An
|
||||||
implicit block named NIL surrounds DO-SCANS; RETURN may be used to
|
implicit block named NIL surrounds DO-SCANS; RETURN may be used to
|
||||||
terminate the loop immediately. If REGEX matches an empty string the
|
terminate the loop immediately. If REGEX matches an empty string the
|
||||||
scan is continued one position behind this match. BODY may start with
|
scan is continued one position behind this match. BODY may start with
|
||||||
declarations."
|
declarations."
|
||||||
(with-rebinding (target-string)
|
(with-rebinding (target-string)
|
||||||
@ -427,11 +422,11 @@ declarations."
|
|||||||
&body body)
|
&body body)
|
||||||
"Iterates over TARGET-STRING and tries to match REGEX as often as
|
"Iterates over TARGET-STRING and tries to match REGEX as often as
|
||||||
possible evaluating BODY with MATCH-START and MATCH-END bound to the
|
possible evaluating BODY with MATCH-START and MATCH-END bound to the
|
||||||
start/end positions of each match in turn. After the last match,
|
start/end positions of each match in turn. After the last match,
|
||||||
returns RESULT-FORM if provided or NIL otherwise. An implicit block
|
returns RESULT-FORM if provided or NIL otherwise. An implicit block
|
||||||
named NIL surrounds DO-MATCHES; RETURN may be used to terminate the
|
named NIL surrounds DO-MATCHES; RETURN may be used to terminate the
|
||||||
loop immediately. If REGEX matches an empty string the scan is
|
loop immediately. If REGEX matches an empty string the scan is
|
||||||
continued one position behind this match. BODY may start with
|
continued one position behind this match. BODY may start with
|
||||||
declarations."
|
declarations."
|
||||||
;; this is a simplified form of DO-SCANS - we just provide two dummy
|
;; this is a simplified form of DO-SCANS - we just provide two dummy
|
||||||
;; vars and ignore them
|
;; vars and ignore them
|
||||||
@ -450,12 +445,12 @@ declarations."
|
|||||||
&body body)
|
&body body)
|
||||||
"Iterates over TARGET-STRING and tries to match REGEX as often as
|
"Iterates over TARGET-STRING and tries to match REGEX as often as
|
||||||
possible evaluating BODY with MATCH-VAR bound to the substring of
|
possible evaluating BODY with MATCH-VAR bound to the substring of
|
||||||
TARGET-STRING corresponding to each match in turn. After the last
|
TARGET-STRING corresponding to each match in turn. After the last
|
||||||
match, returns RESULT-FORM if provided or NIL otherwise. An implicit
|
match, returns RESULT-FORM if provided or NIL otherwise. An implicit
|
||||||
block named NIL surrounds DO-MATCHES-AS-STRINGS; RETURN may be used to
|
block named NIL surrounds DO-MATCHES-AS-STRINGS; RETURN may be used to
|
||||||
terminate the loop immediately. If REGEX matches an empty string the
|
terminate the loop immediately. If REGEX matches an empty string the
|
||||||
scan is continued one position behind this match. If SHAREDP is true,
|
scan is continued one position behind this match. If SHAREDP is true,
|
||||||
the substrings may share structure with TARGET-STRING. BODY may start
|
the substrings may share structure with TARGET-STRING. BODY may start
|
||||||
with declarations."
|
with declarations."
|
||||||
(with-rebinding (target-string)
|
(with-rebinding (target-string)
|
||||||
(with-unique-names (match-start match-end substr-fn)
|
(with-unique-names (match-start match-end substr-fn)
|
||||||
@ -475,15 +470,16 @@ with declarations."
|
|||||||
"Iterates over TARGET-STRING and tries to match REGEX as often as
|
"Iterates over TARGET-STRING and tries to match REGEX as often as
|
||||||
possible evaluating BODY with the variables in VAR-LIST bound to the
|
possible evaluating BODY with the variables in VAR-LIST bound to the
|
||||||
corresponding register groups for each match in turn, i.e. each
|
corresponding register groups for each match in turn, i.e. each
|
||||||
variable is either bound to a string or to NIL. For each element of
|
variable is either bound to a string or to NIL. For each element of
|
||||||
VAR-LIST which is NIL there's no binding to the corresponding register
|
VAR-LIST which is NIL there's no binding to the corresponding register
|
||||||
group. The number of variables in VAR-LIST must not be greater than
|
group. The number of variables in VAR-LIST must not be greater than
|
||||||
the number of register groups. After the last match, returns
|
the number of register groups. After the last match, returns
|
||||||
RESULT-FORM if provided or NIL otherwise. An implicit block named NIL
|
RESULT-FORM if provided or NIL otherwise. An implicit block named NIL
|
||||||
surrounds DO-REGISTER-GROUPS; RETURN may be used to terminate the loop
|
surrounds DO-REGISTER-GROUPS; RETURN may be used to terminate the loop
|
||||||
immediately. If REGEX matches an empty string the scan is continued
|
immediately. If REGEX matches an empty string the scan is continued
|
||||||
one position behind this match. If SHAREDP is true, the substrings may
|
one position behind this match. If SHAREDP is true, the substrings
|
||||||
share structure with TARGET-STRING. BODY may start with declarations."
|
may share structure with TARGET-STRING. BODY may start with
|
||||||
|
declarations."
|
||||||
(with-rebinding (target-string)
|
(with-rebinding (target-string)
|
||||||
(with-unique-names (substr-fn match-start match-end
|
(with-unique-names (substr-fn match-start match-end
|
||||||
reg-starts reg-ends start-index)
|
reg-starts reg-ends start-index)
|
||||||
@ -510,11 +506,11 @@ share structure with TARGET-STRING. BODY may start with declarations."
|
|||||||
(defun all-matches (regex target-string
|
(defun all-matches (regex target-string
|
||||||
&key (start 0)
|
&key (start 0)
|
||||||
(end (length target-string)))
|
(end (length target-string)))
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Returns a list containing the start and end positions of all
|
"Returns a list containing the start and end positions of all
|
||||||
matches of REGEX against TARGET-STRING, i.e. if there are N matches
|
matches of REGEX against TARGET-STRING, i.e. if there are N matches
|
||||||
the list contains (* 2 N) elements. If REGEX matches an empty string
|
the list contains (* 2 N) elements. If REGEX matches an empty string
|
||||||
the scan is continued one position behind this match."
|
the scan is continued one position behind this match."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(let (result-list)
|
(let (result-list)
|
||||||
(do-matches (match-start match-end
|
(do-matches (match-start match-end
|
||||||
regex target-string
|
regex target-string
|
||||||
@ -536,11 +532,11 @@ compile time."
|
|||||||
&key (start 0)
|
&key (start 0)
|
||||||
(end (length target-string))
|
(end (length target-string))
|
||||||
sharedp)
|
sharedp)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Returns a list containing all substrings of TARGET-STRING which
|
"Returns a list containing all substrings of TARGET-STRING which
|
||||||
match REGEX. If REGEX matches an empty string the scan is continued
|
match REGEX. If REGEX matches an empty string the scan is continued
|
||||||
one position behind this match. If SHAREDP is true, the substrings may
|
one position behind this match. If SHAREDP is true, the substrings may
|
||||||
share structure with TARGET-STRING."
|
share structure with TARGET-STRING."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(let (result-list)
|
(let (result-list)
|
||||||
(do-matches-as-strings (match regex target-string (nreverse result-list)
|
(do-matches-as-strings (match regex target-string (nreverse result-list)
|
||||||
:start start :end end :sharedp sharedp)
|
:start start :end end :sharedp sharedp)
|
||||||
@ -563,18 +559,18 @@ compile time."
|
|||||||
with-registers-p
|
with-registers-p
|
||||||
omit-unmatched-p
|
omit-unmatched-p
|
||||||
sharedp)
|
sharedp)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Matches REGEX against TARGET-STRING as often as possible and
|
"Matches REGEX against TARGET-STRING as often as possible and
|
||||||
returns a list of the substrings between the matches. If
|
returns a list of the substrings between the matches. If
|
||||||
WITH-REGISTERS-P is true, substrings corresponding to matched
|
WITH-REGISTERS-P is true, substrings corresponding to matched
|
||||||
registers are inserted into the list as well. If OMIT-UNMATCHED-P is
|
registers are inserted into the list as well. If OMIT-UNMATCHED-P is
|
||||||
true, unmatched registers will simply be left out, otherwise they will
|
true, unmatched registers will simply be left out, otherwise they will
|
||||||
show up as NIL. LIMIT limits the number of elements returned -
|
show up as NIL. LIMIT limits the number of elements returned -
|
||||||
registers aren't counted. If LIMIT is NIL (or 0 which is equivalent),
|
registers aren't counted. If LIMIT is NIL \(or 0 which is
|
||||||
trailing empty strings are removed from the result list. If REGEX
|
equivalent), trailing empty strings are removed from the result list.
|
||||||
matches an empty string the scan is continued one position behind this
|
If REGEX matches an empty string the scan is continued one position
|
||||||
match. If SHAREDP is true, the substrings may share structure with
|
behind this match. If SHAREDP is true, the substrings may share
|
||||||
TARGET-STRING."
|
structure with TARGET-STRING."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
;; initialize list of positions POS-LIST to extract substrings with
|
;; initialize list of positions POS-LIST to extract substrings with
|
||||||
;; START so that the start of the next match will mark the end of
|
;; START so that the start of the next match will mark the end of
|
||||||
;; the first substring
|
;; the first substring
|
||||||
@ -637,13 +633,13 @@ TARGET-STRING."
|
|||||||
|
|
||||||
(defun string-case-modifier (str from to start end)
|
(defun string-case-modifier (str from to start end)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type fixnum from to start end))
|
(declare (fixnum from to start end))
|
||||||
"Checks whether all words in STR between FROM and TO are upcased,
|
"Checks whether all words in STR between FROM and TO are upcased,
|
||||||
downcased or capitalized and returns a function which applies a
|
downcased or capitalized and returns a function which applies a
|
||||||
corresponding case modification to strings. Returns #'IDENTITY
|
corresponding case modification to strings. Returns #'IDENTITY
|
||||||
otherwise, especially if words in the target area extend beyond FROM
|
otherwise, especially if words in the target area extend beyond FROM
|
||||||
or TO. STR is supposed to be bounded by START and END. It is assumed
|
or TO. STR is supposed to be bounded by START and END. It is assumed
|
||||||
that (<= START FROM TO END)."
|
that \(<= START FROM TO END)."
|
||||||
(case
|
(case
|
||||||
(if (or (<= to from)
|
(if (or (<= to from)
|
||||||
(and (< start from)
|
(and (< start from)
|
||||||
@ -740,9 +736,8 @@ S-expression."))
|
|||||||
((#\\) :backslash)))))
|
((#\\) :backslash)))))
|
||||||
(when (and (numberp token) (< token 0))
|
(when (and (numberp token) (< token 0))
|
||||||
;; make sure we don't accept something like "\\0"
|
;; make sure we don't accept something like "\\0"
|
||||||
(signal-ppcre-invocation-error
|
(signal-invocation-error "Illegal substring ~S in replacement string."
|
||||||
"Illegal substring ~S in replacement string"
|
(subseq replacement-string match-start match-end)))
|
||||||
(subseq replacement-string match-start match-end)))
|
|
||||||
(push token collector))
|
(push token collector))
|
||||||
;; remember where the match ended
|
;; remember where the match ended
|
||||||
(setq from match-end))
|
(setq from match-end))
|
||||||
@ -801,9 +796,8 @@ S-expression."))
|
|||||||
((#\\) :backslash)))))
|
((#\\) :backslash)))))
|
||||||
(when (and (numberp token) (< token 0))
|
(when (and (numberp token) (< token 0))
|
||||||
;; make sure we don't accept something like "\\0"
|
;; make sure we don't accept something like "\\0"
|
||||||
(signal-ppcre-invocation-error
|
(signal-invocation-error "Illegal substring ~S in replacement string."
|
||||||
"Illegal substring ~S in replacement string"
|
(subseq replacement match-start match-end)))
|
||||||
(subseq replacement match-start match-end)))
|
|
||||||
(push token collector))
|
(push token collector))
|
||||||
;; remember where the match ended
|
;; remember where the match ended
|
||||||
(setq from match-end))
|
(setq from match-end))
|
||||||
@ -843,9 +837,8 @@ corresponding string."
|
|||||||
(when (>= token reg-bound)
|
(when (>= token reg-bound)
|
||||||
;; but only if the register was referenced in the
|
;; but only if the register was referenced in the
|
||||||
;; regular expression
|
;; regular expression
|
||||||
(signal-ppcre-invocation-error
|
(signal-invocation-error "Reference to non-existent register ~A in replacement string."
|
||||||
"Reference to non-existent register ~A in replacement string"
|
(1+ token)))
|
||||||
(1+ token)))
|
|
||||||
(when (svref reg-starts token)
|
(when (svref reg-starts token)
|
||||||
;; and only if it matched, i.e. no match results
|
;; and only if it matched, i.e. no match results
|
||||||
;; in an empty string
|
;; in an empty string
|
||||||
@ -909,11 +902,11 @@ corresponding string."
|
|||||||
|
|
||||||
(defun replace-aux (target-string replacement pos-list reg-list start end
|
(defun replace-aux (target-string replacement pos-list reg-list start end
|
||||||
preserve-case simple-calls element-type)
|
preserve-case simple-calls element-type)
|
||||||
|
"Auxiliary function used by REGEX-REPLACE and REGEX-REPLACE-ALL.
|
||||||
|
POS-LIST contains a list with the start and end positions of all
|
||||||
|
matches while REG-LIST contains a list of arrays representing the
|
||||||
|
corresponding register start and end positions."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
"Auxiliary function used by REGEX-REPLACE and
|
|
||||||
REGEX-REPLACE-ALL. POS-LIST contains a list with the start and end
|
|
||||||
positions of all matches while REG-LIST contains a list of arrays
|
|
||||||
representing the corresponding register start and end positions."
|
|
||||||
;; build the template once before we start the loop
|
;; build the template once before we start the loop
|
||||||
(let ((replacement-template (build-replacement-template replacement)))
|
(let ((replacement-template (build-replacement-template replacement)))
|
||||||
(with-output-to-string (s nil :element-type element-type)
|
(with-output-to-string (s nil :element-type element-type)
|
||||||
@ -955,7 +948,6 @@ representing the corresponding register start and end positions."
|
|||||||
preserve-case
|
preserve-case
|
||||||
simple-calls
|
simple-calls
|
||||||
(element-type #+:lispworks 'lw:simple-char #-:lispworks 'character))
|
(element-type #+:lispworks 'lw:simple-char #-:lispworks 'character))
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Try to match TARGET-STRING between START and END against REGEX and
|
"Try to match TARGET-STRING between START and END against REGEX and
|
||||||
replace the first match with REPLACEMENT. Two values are returned;
|
replace the first match with REPLACEMENT. Two values are returned;
|
||||||
the modified string, and T if REGEX matched or NIL otherwise.
|
the modified string, and T if REGEX matched or NIL otherwise.
|
||||||
@ -985,6 +977,7 @@ match. The result will always be a fresh string, even if REGEX doesn't
|
|||||||
match.
|
match.
|
||||||
|
|
||||||
ELEMENT-TYPE is the element type of the resulting string."
|
ELEMENT-TYPE is the element type of the resulting string."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(multiple-value-bind (match-start match-end reg-starts reg-ends)
|
(multiple-value-bind (match-start match-end reg-starts reg-ends)
|
||||||
(scan regex target-string :start start :end end)
|
(scan regex target-string :start start :end end)
|
||||||
(if match-start
|
(if match-start
|
||||||
@ -1012,7 +1005,6 @@ match.
|
|||||||
preserve-case
|
preserve-case
|
||||||
simple-calls
|
simple-calls
|
||||||
(element-type #+:lispworks 'lw:simple-char #-:lispworks 'character))
|
(element-type #+:lispworks 'lw:simple-char #-:lispworks 'character))
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Try to match TARGET-STRING between START and END against REGEX and
|
"Try to match TARGET-STRING between START and END against REGEX and
|
||||||
replace all matches with REPLACEMENT. Two values are returned; the
|
replace all matches with REPLACEMENT. Two values are returned; the
|
||||||
modified string, and T if REGEX matched or NIL otherwise.
|
modified string, and T if REGEX matched or NIL otherwise.
|
||||||
@ -1042,6 +1034,7 @@ match. The result will always be a fresh string, even if REGEX doesn't
|
|||||||
match.
|
match.
|
||||||
|
|
||||||
ELEMENT-TYPE is the element type of the resulting string."
|
ELEMENT-TYPE is the element type of the resulting string."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(let ((pos-list '())
|
(let ((pos-list '())
|
||||||
(reg-list '()))
|
(reg-list '()))
|
||||||
(do-scans (match-start match-end reg-starts reg-ends regex target-string
|
(do-scans (match-start match-end reg-starts reg-ends regex target-string
|
||||||
@ -1102,6 +1095,9 @@ scanner, a case-insensitive scanner is used."
|
|||||||
|
|
||||||
#+:cormanlisp
|
#+:cormanlisp
|
||||||
(defmacro do-with-all-symbols ((variable package-or-packagelist) &body body)
|
(defmacro do-with-all-symbols ((variable package-or-packagelist) &body body)
|
||||||
|
"Executes BODY with VARIABLE bound to each symbol in
|
||||||
|
PACKAGE-OR-PACKAGELIST \(a designator for a list of packages) in
|
||||||
|
turn."
|
||||||
(with-unique-names (pack-var)
|
(with-unique-names (pack-var)
|
||||||
`(if (listp ,package-or-packagelist)
|
`(if (listp ,package-or-packagelist)
|
||||||
(dolist (,pack-var ,package-or-packagelist)
|
(dolist (,pack-var ,package-or-packagelist)
|
||||||
@ -1113,11 +1109,11 @@ scanner, a case-insensitive scanner is used."
|
|||||||
#+:cormanlisp
|
#+:cormanlisp
|
||||||
(defmacro regex-apropos-aux ((regex packages case-insensitive &optional return-form)
|
(defmacro regex-apropos-aux ((regex packages case-insensitive &optional return-form)
|
||||||
&body body)
|
&body body)
|
||||||
"Auxiliary macro used by REGEX-APROPOS and REGEX-APROPOS-LIST. Loops
|
"Auxiliary macro used by REGEX-APROPOS and REGEX-APROPOS-LIST.
|
||||||
through PACKAGES and executes BODY with SYMBOL bound to each symbol
|
Loops through PACKAGES and executes BODY with SYMBOL bound to each
|
||||||
which matches REGEX. Optionally evaluates and returns RETURN-FORM at
|
symbol which matches REGEX. Optionally evaluates and returns
|
||||||
the end. If CASE-INSENSITIVE is true and REGEX isn't already a
|
RETURN-FORM at the end. If CASE-INSENSITIVE is true and REGEX isn't
|
||||||
scanner, a case-insensitive scanner is used."
|
already a scanner, a case-insensitive scanner is used."
|
||||||
(with-rebinding (regex)
|
(with-rebinding (regex)
|
||||||
(with-unique-names (scanner %packages hash)
|
(with-unique-names (scanner %packages hash)
|
||||||
`(let* ((,scanner (create-scanner ,regex
|
`(let* ((,scanner (create-scanner ,regex
|
||||||
@ -1137,7 +1133,7 @@ scanner, a case-insensitive scanner is used."
|
|||||||
(defun regex-apropos-list (regex &optional packages &key (case-insensitive t))
|
(defun regex-apropos-list (regex &optional packages &key (case-insensitive t))
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
"Similar to the standard function APROPOS-LIST but returns a list of
|
"Similar to the standard function APROPOS-LIST but returns a list of
|
||||||
all symbols which match the regular expression REGEX. If
|
all symbols which match the regular expression REGEX. If
|
||||||
CASE-INSENSITIVE is true and REGEX isn't already a scanner, a
|
CASE-INSENSITIVE is true and REGEX isn't already a scanner, a
|
||||||
case-insensitive scanner is used."
|
case-insensitive scanner is used."
|
||||||
(let ((collector '()))
|
(let ((collector '()))
|
||||||
@ -1189,7 +1185,7 @@ meaningful information about a symbol."
|
|||||||
|
|
||||||
(defun regex-apropos (regex &optional packages &key (case-insensitive t))
|
(defun regex-apropos (regex &optional packages &key (case-insensitive t))
|
||||||
"Similar to the standard function APROPOS but returns a list of all
|
"Similar to the standard function APROPOS but returns a list of all
|
||||||
symbols which match the regular expression REGEX. If CASE-INSENSITIVE
|
symbols which match the regular expression REGEX. If CASE-INSENSITIVE
|
||||||
is true and REGEX isn't already a scanner, a case-insensitive scanner
|
is true and REGEX isn't already a scanner, a case-insensitive scanner
|
||||||
is used."
|
is used."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -1232,7 +1228,7 @@ sections. These sections may nest."
|
|||||||
(quote-token-replace-scanner "\\\\([QE])"))
|
(quote-token-replace-scanner "\\\\([QE])"))
|
||||||
(defun clean-comments (string &optional extended-mode)
|
(defun clean-comments (string &optional extended-mode)
|
||||||
"Clean \(?#...) comments within STRING for quoting, i.e. convert
|
"Clean \(?#...) comments within STRING for quoting, i.e. convert
|
||||||
\\Q to Q and \\E to E. If EXTENDED-MODE is true, also clean
|
\\Q to Q and \\E to E. If EXTENDED-MODE is true, also clean
|
||||||
end-of-line comments, i.e. those starting with #\\# and ending with
|
end-of-line comments, i.e. those starting with #\\# and ending with
|
||||||
#\\Newline."
|
#\\Newline."
|
||||||
(flet ((remove-tokens (target-string start end match-start
|
(flet ((remove-tokens (target-string start end match-start
|
||||||
@ -1251,7 +1247,7 @@ end-of-line comments, i.e. those starting with #\\# and ending with
|
|||||||
#'remove-tokens))))
|
#'remove-tokens))))
|
||||||
|
|
||||||
(defun parse-tree-synonym (symbol)
|
(defun parse-tree-synonym (symbol)
|
||||||
"Returns the parse tree the SYMBOL symbol is a synonym for. Returns
|
"Returns the parse tree the SYMBOL symbol is a synonym for. Returns
|
||||||
NIL is SYMBOL wasn't yet defined to be a synonym."
|
NIL is SYMBOL wasn't yet defined to be a synonym."
|
||||||
(get symbol 'parse-tree-synonym))
|
(get symbol 'parse-tree-synonym))
|
||||||
|
|
||||||
@ -1261,6 +1257,6 @@ NIL is SYMBOL wasn't yet defined to be a synonym."
|
|||||||
|
|
||||||
(defmacro define-parse-tree-synonym (name parse-tree)
|
(defmacro define-parse-tree-synonym (name parse-tree)
|
||||||
"Defines the symbol NAME to be a synonym for the parse tree
|
"Defines the symbol NAME to be a synonym for the parse tree
|
||||||
PARSE-TREE. Both arguments are quoted."
|
PARSE-TREE. Both arguments are quoted."
|
||||||
`(eval-when (:compile-toplevel :load-toplevel :execute)
|
`(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||||
(setf (parse-tree-synonym ',name) ',parse-tree)))
|
(setf (parse-tree-synonym ',name) ',parse-tree)))
|
||||||
|
|||||||
152
charmap.lisp
Normal file
152
charmap.lisp
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/charmap.lisp,v 1.18 2008/07/22 23:54:59 edi Exp $
|
||||||
|
|
||||||
|
;;; An optimized representation of sets of characters.
|
||||||
|
|
||||||
|
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;;; modification, are permitted provided that the following conditions
|
||||||
|
;;; are met:
|
||||||
|
|
||||||
|
;;; * Redistributions of source code must retain the above copyright
|
||||||
|
;;; notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
;;; * Redistributions in binary form must reproduce the above
|
||||||
|
;;; copyright notice, this list of conditions and the following
|
||||||
|
;;; disclaimer in the documentation and/or other materials
|
||||||
|
;;; provided with the distribution.
|
||||||
|
|
||||||
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||||
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
|
(defstruct (charmap (:constructor make-charmap%))
|
||||||
|
;; a bit vector mapping char codes to "booleans" (1 for set members,
|
||||||
|
;; 0 for others)
|
||||||
|
(vector #*0 :type simple-bit-vector)
|
||||||
|
;; the smallest character code of all characters in the set
|
||||||
|
(start 0 :type fixnum)
|
||||||
|
;; the upper (exclusive) bound of all character codes in the set
|
||||||
|
(end 0 :type fixnum)
|
||||||
|
;; the number of characters in the set, or NIL if this is unknown
|
||||||
|
(count nil :type (or fixnum null))
|
||||||
|
;; whether the charmap actually represents the complement of the set
|
||||||
|
(complementp nil :type boolean))
|
||||||
|
|
||||||
|
;; seems to be necessary for some Lisps like ClozureCL
|
||||||
|
(defmethod make-load-form ((map charmap) &optional environment)
|
||||||
|
(make-load-form-saving-slots map :environment environment))
|
||||||
|
|
||||||
|
(declaim (inline in-charmap-p))
|
||||||
|
(defun in-charmap-p (char charmap)
|
||||||
|
"Tests whether the character CHAR belongs to the set represented by CHARMAP."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (character char) (charmap charmap))
|
||||||
|
(let* ((char-code (char-code char))
|
||||||
|
(char-in-vector-p
|
||||||
|
(let ((charmap-start (charmap-start charmap)))
|
||||||
|
(declare (fixnum charmap-start))
|
||||||
|
(and (<= charmap-start char-code)
|
||||||
|
(< char-code (the fixnum (charmap-end charmap)))
|
||||||
|
(= 1 (sbit (the simple-bit-vector (charmap-vector charmap))
|
||||||
|
(- char-code charmap-start)))))))
|
||||||
|
(cond ((charmap-complementp charmap) (not char-in-vector-p))
|
||||||
|
(t char-in-vector-p))))
|
||||||
|
|
||||||
|
(defun charmap-contents (charmap)
|
||||||
|
"Returns a list of all characters belonging to a character map.
|
||||||
|
Only works for non-complement charmaps."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (charmap charmap))
|
||||||
|
(and (not (charmap-complementp charmap))
|
||||||
|
(loop for code of-type fixnum from (charmap-start charmap) to (charmap-end charmap)
|
||||||
|
for i across (the simple-bit-vector (charmap-vector charmap))
|
||||||
|
when (= i 1)
|
||||||
|
collect (code-char code))))
|
||||||
|
|
||||||
|
(defun make-charmap (start end test-function &optional complementp)
|
||||||
|
"Creates and returns a charmap representing all characters with
|
||||||
|
character codes in the interval [start end) that satisfy
|
||||||
|
TEST-FUNCTION. The COMPLEMENTP slot of the charmap is set to the
|
||||||
|
value of the optional argument, but this argument doesn't have an
|
||||||
|
effect on how TEST-FUNCTION is used."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (fixnum start end))
|
||||||
|
(let ((vector (make-array (- end start) :element-type 'bit))
|
||||||
|
(count 0))
|
||||||
|
(declare (fixnum count))
|
||||||
|
(loop for code from start below end
|
||||||
|
for char = (code-char code)
|
||||||
|
for index from 0
|
||||||
|
when char do
|
||||||
|
(incf count)
|
||||||
|
(setf (sbit vector index) (if (funcall test-function char) 1 0)))
|
||||||
|
(make-charmap% :vector vector
|
||||||
|
:start start
|
||||||
|
:end end
|
||||||
|
;; we don't know for sure if COMPLEMENTP is true as
|
||||||
|
;; there isn't a necessary a character for each
|
||||||
|
;; integer below *REGEX-CHAR-CODE-LIMIT*
|
||||||
|
:count (and (not complementp) count)
|
||||||
|
;; make sure it's boolean
|
||||||
|
:complementp (not (not complementp)))))
|
||||||
|
|
||||||
|
(defun create-charmap-from-test-function (test-function start end)
|
||||||
|
"Creates and returns a charmap representing all characters with
|
||||||
|
character codes between START and END which satisfy TEST-FUNCTION.
|
||||||
|
Tries to find the smallest interval which is necessary to represent
|
||||||
|
the character set and uses the complement representation if that
|
||||||
|
helps."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(let (start-in end-in start-out end-out)
|
||||||
|
;; determine the smallest intervals containing the set and its
|
||||||
|
;; complement, [start-in, end-in) and [start-out, end-out) - first
|
||||||
|
;; the lower bound
|
||||||
|
(loop for code from start below end
|
||||||
|
for char = (code-char code)
|
||||||
|
until (and start-in start-out)
|
||||||
|
when (and char
|
||||||
|
(not start-in)
|
||||||
|
(funcall test-function char))
|
||||||
|
do (setq start-in code)
|
||||||
|
when (and char
|
||||||
|
(not start-out)
|
||||||
|
(not (funcall test-function char)))
|
||||||
|
do (setq start-out code))
|
||||||
|
(unless start-in
|
||||||
|
;; no character satisfied the test, so return a "pseudo" charmap
|
||||||
|
;; where IN-CHARMAP-P is always false
|
||||||
|
(return-from create-charmap-from-test-function
|
||||||
|
(make-charmap% :count 0)))
|
||||||
|
(unless start-out
|
||||||
|
;; no character failed the test, so return a "pseudo" charmap
|
||||||
|
;; where IN-CHARMAP-P is always true
|
||||||
|
(return-from create-charmap-from-test-function
|
||||||
|
(make-charmap% :complementp t)))
|
||||||
|
;; now determine upper bound
|
||||||
|
(loop for code from (1- end) downto start
|
||||||
|
for char = (code-char code)
|
||||||
|
until (and end-in end-out)
|
||||||
|
when (and char
|
||||||
|
(not end-in)
|
||||||
|
(funcall test-function char))
|
||||||
|
do (setq end-in (1+ code))
|
||||||
|
when (and char
|
||||||
|
(not end-out)
|
||||||
|
(not (funcall test-function char)))
|
||||||
|
do (setq end-out (1+ code)))
|
||||||
|
;; use the smaller interval
|
||||||
|
(cond ((<= (- end-in start-in) (- end-out start-out))
|
||||||
|
(make-charmap start-in end-in test-function))
|
||||||
|
(t (make-charmap start-out end-out (complement* test-function) t)))))
|
||||||
85
charset.lisp
85
charset.lisp
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.4 2008/07/03 08:39:10 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.9 2008/07/23 00:47:58 edi Exp $
|
||||||
|
|
||||||
;;; A specialized set implementation for characters by Nikodemus Siivola.
|
;;; A specialized set implementation for characters by Nikodemus Siivola.
|
||||||
|
|
||||||
@ -30,7 +30,7 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(defconstant +probe-depth+ 3
|
(defconstant +probe-depth+ 3
|
||||||
"Maximum number of collisions \(for any element) we accept before we
|
"Maximum number of collisions \(for any element) we accept before we
|
||||||
@ -45,7 +45,7 @@ initialized to #\Null except for the first one which is initialized to
|
|||||||
#\?."
|
#\?."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type (integer 2 #.(1- array-total-size-limit)) size))
|
(declare (type (integer 2 #.(1- array-total-size-limit)) size))
|
||||||
;; Since #\Null always hashes to 0, store something else there
|
;; since #\Null always hashes to 0, store something else there
|
||||||
;; initially, and #\Null everywhere else
|
;; initially, and #\Null everywhere else
|
||||||
(let ((result (make-array size
|
(let ((result (make-array size
|
||||||
:element-type #-:lispworks 'character #+:lispworks 'lw:simple-char
|
:element-type #-:lispworks 'character #+:lispworks 'lw:simple-char
|
||||||
@ -53,7 +53,7 @@ initialized to #\Null except for the first one which is initialized to
|
|||||||
(setf (char result 0) #\?)
|
(setf (char result 0) #\?)
|
||||||
result))
|
result))
|
||||||
|
|
||||||
(defstruct (charset (:constructor make-charset))
|
(defstruct (charset (:constructor make-charset ()))
|
||||||
;; this is set to 0 when we stop hashing and just use a CHAR-CODE
|
;; this is set to 0 when we stop hashing and just use a CHAR-CODE
|
||||||
;; indexed vector
|
;; indexed vector
|
||||||
(depth +probe-depth+ :type fixnum)
|
(depth +probe-depth+ :type fixnum)
|
||||||
@ -92,7 +92,7 @@ to the hash code HASH."
|
|||||||
(depth (charset-depth set))
|
(depth (charset-depth set))
|
||||||
(code (char-code char)))
|
(code (char-code char)))
|
||||||
(declare (fixnum depth))
|
(declare (fixnum depth))
|
||||||
;; As long as the set remains reasonably small, we use non-linear
|
;; as long as the set remains reasonably small, we use non-linear
|
||||||
;; hashing - the first hash of any character is its CHAR-CODE, and
|
;; hashing - the first hash of any character is its CHAR-CODE, and
|
||||||
;; subsequent hashes are computed by MIX above
|
;; subsequent hashes are computed by MIX above
|
||||||
(cond ((or
|
(cond ((or
|
||||||
@ -129,14 +129,15 @@ to the hash code HASH."
|
|||||||
"Adds the character CHAR to the charset SET, extending SET if
|
"Adds the character CHAR to the charset SET, extending SET if
|
||||||
necessary. Returns CHAR."
|
necessary. Returns CHAR."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(or (%add-to-charset char set)
|
(or (%add-to-charset char set t)
|
||||||
(%add-to-charset/expand char set)
|
(%add-to-charset/expand char set)
|
||||||
(error "Oops, this should not happen..."))
|
(error "Oops, this should not happen..."))
|
||||||
char)
|
char)
|
||||||
|
|
||||||
(defun %add-to-charset (char set)
|
(defun %add-to-charset (char set count)
|
||||||
"Tries to add the character CHAR to the charset SET without
|
"Tries to add the character CHAR to the charset SET without
|
||||||
extending it. Returns NIL if this fails."
|
extending it. Returns NIL if this fails. Counts CHAR as new
|
||||||
|
if COUNT is true and it is added to SET."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (character char) (charset set))
|
(declare (character char) (charset set))
|
||||||
(let ((vector (charset-vector set))
|
(let ((vector (charset-vector set))
|
||||||
@ -144,8 +145,12 @@ extending it. Returns NIL if this fails."
|
|||||||
(code (char-code char)))
|
(code (char-code char)))
|
||||||
(declare (fixnum depth))
|
(declare (fixnum depth))
|
||||||
;; see comments in IN-CHARSET-P for algorithm
|
;; see comments in IN-CHARSET-P for algorithm
|
||||||
(cond ((or (zerop depth) (zerop code))
|
(cond ((or (zerop depth) (zerop code))
|
||||||
(setf (char vector code) char))
|
(unless (eq char (char vector code))
|
||||||
|
(setf (char vector code) char)
|
||||||
|
(when count
|
||||||
|
(incf (charset-count set))))
|
||||||
|
char)
|
||||||
(t
|
(t
|
||||||
(let ((hash code))
|
(let ((hash code))
|
||||||
(tagbody
|
(tagbody
|
||||||
@ -154,7 +159,8 @@ extending it. Returns NIL if this fails."
|
|||||||
(x (char vector index)))
|
(x (char vector index)))
|
||||||
(cond ((eq x (code-char 0))
|
(cond ((eq x (code-char 0))
|
||||||
(setf (char vector index) char)
|
(setf (char vector index) char)
|
||||||
(incf (charset-count set))
|
(when count
|
||||||
|
(incf (charset-count set)))
|
||||||
(return-from %add-to-charset char))
|
(return-from %add-to-charset char))
|
||||||
((eq x char)
|
((eq x char)
|
||||||
(return-from %add-to-charset char))
|
(return-from %add-to-charset char))
|
||||||
@ -184,7 +190,10 @@ extending it. Returns NIL if this fails."
|
|||||||
(setf (charset-depth set) new-depth
|
(setf (charset-depth set) new-depth
|
||||||
(charset-vector set) new-vector)
|
(charset-vector set) new-vector)
|
||||||
(flet ((try-add (x)
|
(flet ((try-add (x)
|
||||||
(unless (%add-to-charset x set)
|
;; don't count - old characters are already accounted
|
||||||
|
;; for, and might count the new one multiple times as
|
||||||
|
;; well
|
||||||
|
(unless (%add-to-charset x set nil)
|
||||||
(assert (not (zerop new-depth)))
|
(assert (not (zerop new-depth)))
|
||||||
(setf new-size (* 2 new-size))
|
(setf new-size (* 2 new-size))
|
||||||
(go :retry))))
|
(go :retry))))
|
||||||
@ -196,32 +205,38 @@ extending it. Returns NIL if this fails."
|
|||||||
(try-add x))
|
(try-add x))
|
||||||
(unless (zerop i)
|
(unless (zerop i)
|
||||||
(try-add x))))))))
|
(try-add x))))))))
|
||||||
|
;; added and expanded, /now/ count the new character.
|
||||||
|
(incf (charset-count set))
|
||||||
t))
|
t))
|
||||||
|
|
||||||
(defun all-characters (set)
|
(defun map-charset (function charset)
|
||||||
"Returns a list of all characters in the charset SET."
|
"Calls FUNCTION with all characters in SET. Returns NIL."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(loop with count = (charset-count set)
|
(declare (function function))
|
||||||
with counter = 0
|
(let* ((n (charset-count charset))
|
||||||
for code below char-code-limit
|
(vector (charset-vector charset))
|
||||||
for char = (code-char code)
|
(size (length vector)))
|
||||||
while (< counter count)
|
;; see comments in IN-CHARSET-P for algorithm
|
||||||
when (and char (in-charset-p char set))
|
(when (eq (code-char 0) (char vector 0))
|
||||||
do (incf counter)
|
(funcall function (code-char 0))
|
||||||
and collect char))
|
(decf n))
|
||||||
|
(loop for i from 1 below size
|
||||||
|
for char = (char vector i)
|
||||||
|
unless (eq (code-char 0) char) do
|
||||||
|
(funcall function char)
|
||||||
|
;; this early termination test should be worth it when
|
||||||
|
;; mapping across depth 0 charsets.
|
||||||
|
(when (zerop (decf n))
|
||||||
|
(return-from map-charset nil))))
|
||||||
|
nil)
|
||||||
|
|
||||||
(defun merge-set (set1 set2 &optional invertedp)
|
(defun create-charset-from-test-function (test-function start end)
|
||||||
"Returns the \"sum\" of two charsets. This is a destructive
|
"Creates and returns a charset representing all characters with
|
||||||
operation on SET1. If INVERTEDP is true, merges the \"inverse\" of
|
character codes between START and END which satisfy TEST-FUNCTION."
|
||||||
SET2 into SET1 instead."
|
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
;; we only consider values with character codes below
|
(loop with charset = (make-charset)
|
||||||
;; *REGEX-CHAR-CODE-LIMIT*
|
for code from start below end
|
||||||
(loop for code of-type fixnum from 0 below *regex-char-code-limit*
|
|
||||||
for char = (code-char code)
|
for char = (code-char code)
|
||||||
when (and char (if invertedp
|
when (and char (funcall test-function char))
|
||||||
(not (in-charset-p char set2))
|
do (add-to-charset char charset)
|
||||||
(in-charset-p char set2)))
|
finally (return charset)))
|
||||||
do (add-to-charset char set1))
|
|
||||||
set1)
|
|
||||||
|
|
||||||
|
|||||||
98
chartest.lisp
Normal file
98
chartest.lisp
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/chartest.lisp,v 1.3 2008/07/23 00:47:58 edi Exp $
|
||||||
|
|
||||||
|
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;;; modification, are permitted provided that the following conditions
|
||||||
|
;;; are met:
|
||||||
|
|
||||||
|
;;; * Redistributions of source code must retain the above copyright
|
||||||
|
;;; notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
;;; * Redistributions in binary form must reproduce the above
|
||||||
|
;;; copyright notice, this list of conditions and the following
|
||||||
|
;;; disclaimer in the documentation and/or other materials
|
||||||
|
;;; provided with the distribution.
|
||||||
|
|
||||||
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||||
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
|
(defun create-hash-table-from-test-function (test-function start end)
|
||||||
|
"Creates and returns a hash table representing all characters with
|
||||||
|
character codes between START and END which satisfy TEST-FUNCTION."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(loop with hash-table = (make-hash-table)
|
||||||
|
for code from start below end
|
||||||
|
for char = (code-char code)
|
||||||
|
when (and char (funcall test-function char))
|
||||||
|
do (setf (gethash char hash-table) t)
|
||||||
|
finally (return hash-table)))
|
||||||
|
|
||||||
|
(defun create-optimized-test-function (test-function &key
|
||||||
|
(start 0)
|
||||||
|
(end *regex-char-code-limit*)
|
||||||
|
(kind *optimize-char-classes*))
|
||||||
|
"Given a unary test function which is applicable to characters
|
||||||
|
returns a function which yields the same boolean results for all
|
||||||
|
characters with character codes from START to \(excluding) END. If
|
||||||
|
KIND is NIL, TEST-FUNCTION will simply be returned. Otherwise, KIND
|
||||||
|
should be one of:
|
||||||
|
|
||||||
|
* :HASH-TABLE - builds a hash table representing all characters which
|
||||||
|
satisfy the test and returns a closure which checks if
|
||||||
|
a character is in that hash table
|
||||||
|
|
||||||
|
* :CHARSET - instead of a hash table uses a \"charset\" which is a
|
||||||
|
data structure using non-linear hashing and optimized to
|
||||||
|
represent \(sparse) sets of characters in a fast and
|
||||||
|
space-efficient way \(contributed by Nikodemus Siivola)
|
||||||
|
|
||||||
|
* :CHARMAP - instead of a hash table uses a bit vector to represent
|
||||||
|
the set of characters
|
||||||
|
|
||||||
|
You can also use :HASH-TABLE* or :CHARSET* which are like :HASH-TABLE
|
||||||
|
and :CHARSET but use the complement of the set if the set contains
|
||||||
|
more than half of all characters between START and END. This saves
|
||||||
|
space but needs an additional pass across all characters to create the
|
||||||
|
data structure. There is no corresponding :CHARMAP* kind as the bit
|
||||||
|
vectors are already created to cover the smallest possible interval
|
||||||
|
which contains either the set or its complement."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(ecase kind
|
||||||
|
((nil) test-function)
|
||||||
|
(:charmap
|
||||||
|
(let ((charmap (create-charmap-from-test-function test-function start end)))
|
||||||
|
(lambda (char)
|
||||||
|
(in-charmap-p char charmap))))
|
||||||
|
((:charset :charset*)
|
||||||
|
(let ((charset (create-charset-from-test-function test-function start end)))
|
||||||
|
(cond ((or (eq kind :charset)
|
||||||
|
(<= (charset-count charset) (ceiling (- end start) 2)))
|
||||||
|
(lambda (char)
|
||||||
|
(in-charset-p char charset)))
|
||||||
|
(t (setq charset (create-charset-from-test-function (complement* test-function)
|
||||||
|
start end))
|
||||||
|
(lambda (char)
|
||||||
|
(not (in-charset-p char charset)))))))
|
||||||
|
((:hash-table :hash-table*)
|
||||||
|
(let ((hash-table (create-hash-table-from-test-function test-function start end)))
|
||||||
|
(cond ((or (eq kind :charset)
|
||||||
|
(<= (hash-table-count hash-table) (ceiling (- end start) 2)))
|
||||||
|
(lambda (char)
|
||||||
|
(gethash char hash-table)))
|
||||||
|
(t (setq hash-table (create-hash-table-from-test-function (complement* test-function)
|
||||||
|
start end))
|
||||||
|
(lambda (char)
|
||||||
|
(not (gethash char hash-table)))))))))
|
||||||
58
cl-ppcre-unicode.asd
Normal file
58
cl-ppcre-unicode.asd
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-unicode.asd,v 1.14 2008/07/22 14:19:44 edi Exp $
|
||||||
|
|
||||||
|
;;; This ASDF system definition was kindly provided by Marco Baringer.
|
||||||
|
|
||||||
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;;; modification, are permitted provided that the following conditions
|
||||||
|
;;; are met:
|
||||||
|
|
||||||
|
;;; * Redistributions of source code must retain the above copyright
|
||||||
|
;;; notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
;;; * Redistributions in binary form must reproduce the above
|
||||||
|
;;; copyright notice, this list of conditions and the following
|
||||||
|
;;; disclaimer in the documentation and/or other materials
|
||||||
|
;;; provided with the distribution.
|
||||||
|
|
||||||
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||||
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
(in-package :cl-user)
|
||||||
|
|
||||||
|
(defpackage :cl-ppcre-unicode-asd
|
||||||
|
(:use :cl :asdf))
|
||||||
|
|
||||||
|
(in-package :cl-ppcre-unicode-asd)
|
||||||
|
|
||||||
|
(defsystem :cl-ppcre-unicode
|
||||||
|
:components ((:module "cl-ppcre-unicode"
|
||||||
|
:serial t
|
||||||
|
:components ((:file "packages")
|
||||||
|
(:file "resolver"))))
|
||||||
|
:depends-on (:cl-ppcre :cl-unicode))
|
||||||
|
|
||||||
|
(defsystem :cl-ppcre-unicode-test
|
||||||
|
:depends-on (:cl-ppcre-unicode :cl-ppcre-test)
|
||||||
|
:components ((:module "test"
|
||||||
|
:serial t
|
||||||
|
:components ((:file "unicode-tests")))))
|
||||||
|
|
||||||
|
(defmethod perform ((o test-op) (c (eql (find-system :cl-ppcre-unicode))))
|
||||||
|
;; we must load CL-PPCRE explicitly so that the CL-PPCRE-TEST system
|
||||||
|
;; will be found
|
||||||
|
(operate 'load-op :cl-ppcre)
|
||||||
|
(operate 'load-op :cl-ppcre-unicode-test)
|
||||||
|
(funcall (intern (symbol-name :run-all-tests) (find-package :cl-ppcre-test))
|
||||||
|
:more-tests (intern (symbol-name :unicode-test) (find-package :cl-ppcre-test))))
|
||||||
@ -1,7 +1,7 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.system,v 1.11 2007/01/01 23:43:10 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-unicode/packages.lisp,v 1.2 2008/07/22 13:58:13 edi Exp $
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
;;; Redistribution and use in source and binary forms, with or without
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
;;; modification, are permitted provided that the following conditions
|
;;; modification, are permitted provided that the following conditions
|
||||||
@ -27,14 +27,12 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-user)
|
(in-package :cl-user)
|
||||||
|
|
||||||
(defparameter *cl-ppcre-test-base-directory*
|
(defpackage :cl-ppcre-unicode
|
||||||
(make-pathname :name nil :type nil :version nil
|
#+:genera
|
||||||
:defaults (parse-namestring *load-truename*)))
|
(:shadowing-import-from :common-lisp :lambda :string)
|
||||||
|
(:use #-:genera :cl #+:genera :future-common-lisp
|
||||||
(mk:defsystem #:cl-ppcre-test
|
:cl-ppcre :cl-unicode)
|
||||||
:source-pathname *cl-ppcre-test-base-directory*
|
(:import-from :cl-ppcre :signal-syntax-error)
|
||||||
:source-extension "lisp"
|
(:export :unicode-property-resolver))
|
||||||
:depends-on (#:cl-ppcre)
|
|
||||||
:components ((:file "ppcre-tests")))
|
|
||||||
61
cl-ppcre-unicode/resolver.lisp
Normal file
61
cl-ppcre-unicode/resolver.lisp
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-unicode/resolver.lisp,v 1.5 2008/07/23 02:14:08 edi Exp $
|
||||||
|
|
||||||
|
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;;; modification, are permitted provided that the following conditions
|
||||||
|
;;; are met:
|
||||||
|
|
||||||
|
;;; * Redistributions of source code must retain the above copyright
|
||||||
|
;;; notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
;;; * Redistributions in binary form must reproduce the above
|
||||||
|
;;; copyright notice, this list of conditions and the following
|
||||||
|
;;; disclaimer in the documentation and/or other materials
|
||||||
|
;;; provided with the distribution.
|
||||||
|
|
||||||
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||||
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
(in-package :cl-ppcre-unicode)
|
||||||
|
|
||||||
|
(defun unicode-property-resolver (property-name)
|
||||||
|
"A property resolver which understands Unicode properties using
|
||||||
|
CL-UNICODE's PROPERTY-TEST function. This resolver is automatically
|
||||||
|
installed in *PROPERTY-RESOLVER* when the CL-PPCRE-UNICODE system is
|
||||||
|
loaded."
|
||||||
|
(or (property-test property-name :errorp nil)
|
||||||
|
(signal-syntax-error "There is no property named ~S." property-name)))
|
||||||
|
|
||||||
|
(setq *property-resolver* 'unicode-property-resolver)
|
||||||
|
|
||||||
|
(pushnew :cl-ppcre-unicode *features*)
|
||||||
|
|
||||||
|
;; stuff for Nikodemus Siivola's HYPERDOC
|
||||||
|
;; see <http://common-lisp.net/project/hyperdoc/>
|
||||||
|
;; and <http://www.cliki.net/hyperdoc>
|
||||||
|
;; also used by LW-ADD-ONS
|
||||||
|
|
||||||
|
(defvar *hyperdoc-base-uri* "http://weitz.de/cl-ppcre/")
|
||||||
|
|
||||||
|
(let ((exported-symbols-alist
|
||||||
|
(loop for symbol being the external-symbols of :cl-ppcre-unicode
|
||||||
|
collect (cons symbol
|
||||||
|
(concatenate 'string
|
||||||
|
"#"
|
||||||
|
(string-downcase symbol))))))
|
||||||
|
(defun hyperdoc-lookup (symbol type)
|
||||||
|
(declare (ignore type))
|
||||||
|
(cdr (assoc symbol
|
||||||
|
exported-symbols-alist
|
||||||
|
:test #'eq))))
|
||||||
31
cl-ppcre.asd
31
cl-ppcre.asd
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.30 2008/07/03 10:06:15 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.45 2008/07/23 02:14:06 edi Exp $
|
||||||
|
|
||||||
;;; This ASDF system definition was kindly provided by Marco Baringer.
|
;;; This ASDF system definition was kindly provided by Marco Baringer.
|
||||||
|
|
||||||
@ -29,14 +29,23 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(asdf:defsystem :cl-ppcre
|
(in-package :cl-user)
|
||||||
:version "1.4.1"
|
|
||||||
|
(defpackage :cl-ppcre-asd
|
||||||
|
(:use :cl :asdf))
|
||||||
|
|
||||||
|
(in-package :cl-ppcre-asd)
|
||||||
|
|
||||||
|
(defsystem :cl-ppcre
|
||||||
|
:version "2.0.0"
|
||||||
:serial t
|
:serial t
|
||||||
:components ((:file "packages")
|
:components ((:file "packages")
|
||||||
(:file "specials")
|
(:file "specials")
|
||||||
(:file "charset")
|
|
||||||
(:file "util")
|
(:file "util")
|
||||||
(:file "errors")
|
(:file "errors")
|
||||||
|
(:file "charset")
|
||||||
|
(:file "charmap")
|
||||||
|
(:file "chartest")
|
||||||
#-:use-acl-regexp2-engine
|
#-:use-acl-regexp2-engine
|
||||||
(:file "lexer")
|
(:file "lexer")
|
||||||
#-:use-acl-regexp2-engine
|
#-:use-acl-regexp2-engine
|
||||||
@ -44,6 +53,8 @@
|
|||||||
#-:use-acl-regexp2-engine
|
#-:use-acl-regexp2-engine
|
||||||
(:file "regex-class")
|
(:file "regex-class")
|
||||||
#-:use-acl-regexp2-engine
|
#-:use-acl-regexp2-engine
|
||||||
|
(:file "regex-class-util")
|
||||||
|
#-:use-acl-regexp2-engine
|
||||||
(:file "convert")
|
(:file "convert")
|
||||||
#-:use-acl-regexp2-engine
|
#-:use-acl-regexp2-engine
|
||||||
(:file "optimize")
|
(:file "optimize")
|
||||||
@ -54,3 +65,15 @@
|
|||||||
#-:use-acl-regexp2-engine
|
#-:use-acl-regexp2-engine
|
||||||
(:file "scanner")
|
(:file "scanner")
|
||||||
(:file "api")))
|
(:file "api")))
|
||||||
|
|
||||||
|
(defsystem :cl-ppcre-test
|
||||||
|
:depends-on (:cl-ppcre :flexi-streams)
|
||||||
|
:components ((:module "test"
|
||||||
|
:serial t
|
||||||
|
:components ((:file "packages")
|
||||||
|
(:file "tests")
|
||||||
|
(:file "perl-tests")))))
|
||||||
|
|
||||||
|
(defmethod perform ((o test-op) (c (eql (find-system :cl-ppcre))))
|
||||||
|
(operate 'load-op :cl-ppcre-test)
|
||||||
|
(funcall (intern (symbol-name :run-all-tests) (find-package :cl-ppcre-test))))
|
||||||
|
|||||||
@ -1,59 +0,0 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.system,v 1.13 2007/01/01 23:43:10 edi Exp $
|
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.
|
|
||||||
|
|
||||||
;;; Redistribution and use in source and binary forms, with or without
|
|
||||||
;;; modification, are permitted provided that the following conditions
|
|
||||||
;;; are met:
|
|
||||||
|
|
||||||
;;; * Redistributions of source code must retain the above copyright
|
|
||||||
;;; notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
;;; * Redistributions in binary form must reproduce the above
|
|
||||||
;;; copyright notice, this list of conditions and the following
|
|
||||||
;;; disclaimer in the documentation and/or other materials
|
|
||||||
;;; provided with the distribution.
|
|
||||||
|
|
||||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
|
||||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
||||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
||||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
||||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
(in-package #:cl-user)
|
|
||||||
|
|
||||||
(defparameter *cl-ppcre-base-directory*
|
|
||||||
(make-pathname :name nil :type nil :version nil
|
|
||||||
:defaults (parse-namestring *load-truename*)))
|
|
||||||
|
|
||||||
(mk:defsystem #:cl-ppcre
|
|
||||||
:source-pathname *cl-ppcre-base-directory*
|
|
||||||
:source-extension "lisp"
|
|
||||||
:components ((:file "packages")
|
|
||||||
(:file "specials" :depends-on ("packages"))
|
|
||||||
(:file "util" :depends-on ("packages"))
|
|
||||||
(:file "errors" :depends-on ("util"))
|
|
||||||
#-:use-acl-regexp2-engine
|
|
||||||
(:file "lexer" :depends-on ("errors" "specials"))
|
|
||||||
#-:use-acl-regexp2-engine
|
|
||||||
(:file "parser" :depends-on ("lexer"))
|
|
||||||
#-:use-acl-regexp2-engine
|
|
||||||
(:file "regex-class" :depends-on ("parser"))
|
|
||||||
#-:use-acl-regexp2-engine
|
|
||||||
(:file "convert" :depends-on ("regex-class"))
|
|
||||||
#-:use-acl-regexp2-engine
|
|
||||||
(:file "optimize" :depends-on ("convert"))
|
|
||||||
#-:use-acl-regexp2-engine
|
|
||||||
(:file "closures" :depends-on ("optimize" "specials"))
|
|
||||||
#-:use-acl-regexp2-engine
|
|
||||||
(:file "repetition-closures" :depends-on ("closures"))
|
|
||||||
#-:use-acl-regexp2-engine
|
|
||||||
(:file "scanner" :depends-on ("repetition-closures"))
|
|
||||||
(:file "api" :depends-on ("scanner"))))
|
|
||||||
226
closures.lisp
226
closures.lisp
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.36 2008/07/03 07:44:06 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.44 2008/07/22 22:38:05 edi Exp $
|
||||||
|
|
||||||
;;; Here we create the closures which together build the final
|
;;; Here we create the closures which together build the final
|
||||||
;;; scanner.
|
;;; scanner.
|
||||||
@ -30,16 +30,15 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(declaim (inline *string*= *string*-equal))
|
(declaim (inline *string*= *string*-equal))
|
||||||
|
|
||||||
(defun *string*= (string2 start1 end1 start2 end2)
|
(defun *string*= (string2 start1 end1 start2 end2)
|
||||||
"Like STRING=, i.e. compares the special string *STRING* from START1
|
"Like STRING=, i.e. compares the special string *STRING* from START1
|
||||||
to END1 with STRING2 from START2 to END2. Note that there's no
|
to END1 with STRING2 from START2 to END2. Note that there's no
|
||||||
boundary check - this has to be implemented by the caller."
|
boundary check - this has to be implemented by the caller."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type fixnum start1 end1 start2 end2))
|
(declare (fixnum start1 end1 start2 end2))
|
||||||
(loop for string1-idx of-type fixnum from start1 below end1
|
(loop for string1-idx of-type fixnum from start1 below end1
|
||||||
for string2-idx of-type fixnum from start2 below end2
|
for string2-idx of-type fixnum from start2 below end2
|
||||||
always (char= (schar *string* string1-idx)
|
always (char= (schar *string* string1-idx)
|
||||||
@ -50,7 +49,7 @@ boundary check - this has to be implemented by the caller."
|
|||||||
START1 to END1 with STRING2 from START2 to END2. Note that there's no
|
START1 to END1 with STRING2 from START2 to END2. Note that there's no
|
||||||
boundary check - this has to be implemented by the caller."
|
boundary check - this has to be implemented by the caller."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type fixnum start1 end1 start2 end2))
|
(declare (fixnum start1 end1 start2 end2))
|
||||||
(loop for string1-idx of-type fixnum from start1 below end1
|
(loop for string1-idx of-type fixnum from start1 below end1
|
||||||
for string2-idx of-type fixnum from start2 below end2
|
for string2-idx of-type fixnum from start2 below end2
|
||||||
always (char-equal (schar *string* string1-idx)
|
always (char-equal (schar *string* string1-idx)
|
||||||
@ -81,7 +80,7 @@ such that the call to NEXT-FN after the match would succeed."))
|
|||||||
;; now create a closure which checks if one of the closures
|
;; now create a closure which checks if one of the closures
|
||||||
;; created above can succeed
|
;; created above can succeed
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(loop for matcher in all-matchers
|
(loop for matcher in all-matchers
|
||||||
thereis (funcall (the function matcher) start-pos)))))
|
thereis (funcall (the function matcher) start-pos)))))
|
||||||
|
|
||||||
@ -90,13 +89,13 @@ such that the call to NEXT-FN after the match would succeed."))
|
|||||||
;; the position of this REGISTER within the whole regex; we start to
|
;; the position of this REGISTER within the whole regex; we start to
|
||||||
;; count at 0
|
;; count at 0
|
||||||
(let ((num (num register)))
|
(let ((num (num register)))
|
||||||
(declare (type fixnum num))
|
(declare (fixnum num))
|
||||||
;; STORE-END-OF-REG is a thin wrapper around NEXT-FN which will
|
;; STORE-END-OF-REG is a thin wrapper around NEXT-FN which will
|
||||||
;; update the corresponding values of *REGS-START* and *REGS-END*
|
;; update the corresponding values of *REGS-START* and *REGS-END*
|
||||||
;; after the inner matcher has succeeded
|
;; after the inner matcher has succeeded
|
||||||
(flet ((store-end-of-reg (start-pos)
|
(flet ((store-end-of-reg (start-pos)
|
||||||
(declare (type fixnum start-pos)
|
(declare (fixnum start-pos)
|
||||||
(type function next-fn))
|
(function next-fn))
|
||||||
(setf (svref *reg-starts* num) (svref *regs-maybe-start* num)
|
(setf (svref *reg-starts* num) (svref *regs-maybe-start* num)
|
||||||
(svref *reg-ends* num) start-pos)
|
(svref *reg-ends* num) start-pos)
|
||||||
(funcall next-fn start-pos)))
|
(funcall next-fn start-pos)))
|
||||||
@ -104,10 +103,10 @@ such that the call to NEXT-FN after the match would succeed."))
|
|||||||
;; wrapped by this REGISTER
|
;; wrapped by this REGISTER
|
||||||
(let ((inner-matcher (create-matcher-aux (regex register)
|
(let ((inner-matcher (create-matcher-aux (regex register)
|
||||||
#'store-end-of-reg)))
|
#'store-end-of-reg)))
|
||||||
(declare (type function inner-matcher))
|
(declare (function inner-matcher))
|
||||||
;; here comes the actual closure for REGISTER
|
;; here comes the actual closure for REGISTER
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
;; remember the old values of *REGS-START* and friends in
|
;; remember the old values of *REGS-START* and friends in
|
||||||
;; case we cannot match
|
;; case we cannot match
|
||||||
(let ((old-*reg-starts* (svref *reg-starts* num))
|
(let ((old-*reg-starts* (svref *reg-starts* num))
|
||||||
@ -129,7 +128,7 @@ such that the call to NEXT-FN after the match would succeed."))
|
|||||||
;; create a closure which just checks for the inner regex and
|
;; create a closure which just checks for the inner regex and
|
||||||
;; doesn't care about NEXT-FN
|
;; doesn't care about NEXT-FN
|
||||||
(let ((test-matcher (create-matcher-aux (regex lookahead) #'identity)))
|
(let ((test-matcher (create-matcher-aux (regex lookahead) #'identity)))
|
||||||
(declare (type function next-fn test-matcher))
|
(declare (function next-fn test-matcher))
|
||||||
(if (positivep lookahead)
|
(if (positivep lookahead)
|
||||||
;; positive look-ahead: check success of inner regex, then call
|
;; positive look-ahead: check success of inner regex, then call
|
||||||
;; NEXT-FN
|
;; NEXT-FN
|
||||||
@ -148,152 +147,52 @@ such that the call to NEXT-FN after the match would succeed."))
|
|||||||
;; create a closure which just checks for the inner regex and
|
;; create a closure which just checks for the inner regex and
|
||||||
;; doesn't care about NEXT-FN
|
;; doesn't care about NEXT-FN
|
||||||
(test-matcher (create-matcher-aux (regex lookbehind) #'identity)))
|
(test-matcher (create-matcher-aux (regex lookbehind) #'identity)))
|
||||||
(declare (type function next-fn test-matcher)
|
(declare (function next-fn test-matcher)
|
||||||
(type fixnum len))
|
(fixnum len))
|
||||||
(if (positivep lookbehind)
|
(if (positivep lookbehind)
|
||||||
;; positive look-behind: check success of inner regex (if we're
|
;; positive look-behind: check success of inner regex (if we're
|
||||||
;; far enough from the start of *STRING*), then call NEXT-FN
|
;; far enough from the start of *STRING*), then call NEXT-FN
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (>= (- start-pos (or *real-start-pos* *start-pos*)) len)
|
(and (>= (- start-pos (or *real-start-pos* *start-pos*)) len)
|
||||||
(funcall test-matcher (- start-pos len))
|
(funcall test-matcher (- start-pos len))
|
||||||
(funcall next-fn start-pos)))
|
(funcall next-fn start-pos)))
|
||||||
;; negative look-behind: check failure of inner regex (if we're
|
;; negative look-behind: check failure of inner regex (if we're
|
||||||
;; far enough from the start of *STRING*), then call NEXT-FN
|
;; far enough from the start of *STRING*), then call NEXT-FN
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (or (< (- start-pos (or *real-start-pos* *start-pos*)) len)
|
(and (or (< (- start-pos (or *real-start-pos* *start-pos*)) len)
|
||||||
(not (funcall test-matcher (- start-pos len))))
|
(not (funcall test-matcher (- start-pos len))))
|
||||||
(funcall next-fn start-pos))))))
|
(funcall next-fn start-pos))))))
|
||||||
|
|
||||||
(defmacro insert-char-class-tester ((char-class chr-expr) &body body)
|
(defmacro insert-char-class-tester ((char-class chr-expr) &body body)
|
||||||
"Utility macro to replace each occurence of '(CHAR-CLASS-TEST)
|
"Utility macro to replace each occurence of '\(CHAR-CLASS-TEST)
|
||||||
within BODY with the correct test (corresponding to CHAR-CLASS)
|
within BODY with the correct test (corresponding to CHAR-CLASS)
|
||||||
against CHR-EXPR."
|
against CHR-EXPR."
|
||||||
(with-unique-names (%char-class)
|
(with-rebinding (char-class)
|
||||||
;; the actual substitution is done here: replace
|
(with-unique-names (test-function)
|
||||||
;; '(CHAR-CLASS-TEST) with NEW
|
(flet ((substitute-char-class-tester (new)
|
||||||
(flet ((substitute-char-class-tester (new)
|
|
||||||
(subst new '(char-class-test) body
|
(subst new '(char-class-test) body
|
||||||
:test #'equalp)))
|
:test #'equalp)))
|
||||||
`(let* ((,%char-class ,char-class)
|
`(let ((,test-function (test-function ,char-class)))
|
||||||
(set (charset ,%char-class))
|
,@(substitute-char-class-tester
|
||||||
(count (if set
|
`(funcall ,test-function ,chr-expr)))))))
|
||||||
(charset-count set)
|
|
||||||
most-positive-fixnum))
|
|
||||||
;; collect a list of "all" characters in the set if
|
|
||||||
;; there aren't more than two
|
|
||||||
(all-chars (if (<= count 2)
|
|
||||||
(all-characters set)
|
|
||||||
nil))
|
|
||||||
downcasedp)
|
|
||||||
(declare (type fixnum count))
|
|
||||||
;; check if we can partition the charset into three ranges (or
|
|
||||||
;; less)
|
|
||||||
(multiple-value-bind (min1 max1 min2 max2 min3 max3)
|
|
||||||
(create-ranges-from-set set)
|
|
||||||
;; if that didn't work and CHAR-CLASS is case-insensitive we
|
|
||||||
;; try it again with every character downcased
|
|
||||||
(when (and (not min1)
|
|
||||||
(case-insensitive-p ,%char-class))
|
|
||||||
(multiple-value-setq (min1 max1 min2 max2 min3 max3)
|
|
||||||
(create-ranges-from-set set :downcasep t))
|
|
||||||
(setq downcasedp t))
|
|
||||||
(cond ((= count 1)
|
|
||||||
;; charset contains exactly one character so we just
|
|
||||||
;; check for this single character; (note that this
|
|
||||||
;; actually can't happen because this case is
|
|
||||||
;; optimized away in CONVERT already...)
|
|
||||||
(let ((chr1 (first all-chars)))
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(char= ,chr-expr chr1))))
|
|
||||||
((= count 2)
|
|
||||||
;; set contains exactly two characters
|
|
||||||
(let ((chr1 (first all-chars))
|
|
||||||
(chr2 (second all-chars)))
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(let ((chr ,chr-expr))
|
|
||||||
(or (char= chr chr1)
|
|
||||||
(char= chr chr2))))))
|
|
||||||
((word-char-class-p ,%char-class)
|
|
||||||
;; special-case: set is \w, \W, [\w], [\W] or
|
|
||||||
;; something equivalent
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(word-char-p ,chr-expr)))
|
|
||||||
((= count *regex-char-code-limit*)
|
|
||||||
;; according to the ANSI standard we might have all
|
|
||||||
;; possible characters in the set even if it doesn't
|
|
||||||
;; contain CHAR-CODE-LIMIT characters but this
|
|
||||||
;; doesn't seem to be the case for current
|
|
||||||
;; implementations (also note that this optimization
|
|
||||||
;; implies that you must not have characters with
|
|
||||||
;; character codes beyond *REGEX-CHAR-CODE-LIMIT* in
|
|
||||||
;; your regexes if you've changed this limit); we
|
|
||||||
;; expect the compiler to optimize this T "test"
|
|
||||||
;; away
|
|
||||||
,@(substitute-char-class-tester t))
|
|
||||||
((and downcasedp min1 min2 min3)
|
|
||||||
;; three different ranges, downcased
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(let ((chr ,chr-expr))
|
|
||||||
(or (char-not-greaterp min1 chr max1)
|
|
||||||
(char-not-greaterp min2 chr max2)
|
|
||||||
(char-not-greaterp min3 chr max3)))))
|
|
||||||
((and downcasedp min1 min2)
|
|
||||||
;; two ranges, downcased
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(let ((chr ,chr-expr))
|
|
||||||
(or (char-not-greaterp min1 chr max1)
|
|
||||||
(char-not-greaterp min2 chr max2)))))
|
|
||||||
((and downcasedp min1)
|
|
||||||
;; one downcased range
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(char-not-greaterp min1 ,chr-expr max1)))
|
|
||||||
((and min1 min2 min3)
|
|
||||||
;; three ranges
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(let ((chr ,chr-expr))
|
|
||||||
(or (char<= min1 chr max1)
|
|
||||||
(char<= min2 chr max2)
|
|
||||||
(char<= min3 chr max3)))))
|
|
||||||
((and min1 min2)
|
|
||||||
;; two ranges
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(let ((chr ,chr-expr))
|
|
||||||
(or (char<= min1 chr max1)
|
|
||||||
(char<= min2 chr max2)))))
|
|
||||||
(min1
|
|
||||||
;; one range
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(char<= min1 ,chr-expr max1)))
|
|
||||||
(t
|
|
||||||
;; the general case; note that most of the above
|
|
||||||
;; "optimizations" are based on early (2002)
|
|
||||||
;; experiences and benchmarks with CMUCL
|
|
||||||
,@(substitute-char-class-tester
|
|
||||||
`(in-charset-p ,chr-expr set)))))))))
|
|
||||||
|
|
||||||
(defmethod create-matcher-aux ((char-class char-class) next-fn)
|
(defmethod create-matcher-aux ((char-class char-class) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type function next-fn))
|
(declare (function next-fn))
|
||||||
;; insert a test against the current character within *STRING*
|
;; insert a test against the current character within *STRING*
|
||||||
(insert-char-class-tester (char-class (schar *string* start-pos))
|
(insert-char-class-tester (char-class (schar *string* start-pos))
|
||||||
(if (invertedp char-class)
|
(lambda (start-pos)
|
||||||
(lambda (start-pos)
|
(declare (fixnum start-pos))
|
||||||
(declare (type fixnum start-pos))
|
(and (< start-pos *end-pos*)
|
||||||
(and (< start-pos *end-pos*)
|
(char-class-test)
|
||||||
(not (char-class-test))
|
(funcall next-fn (1+ start-pos))))))
|
||||||
(funcall next-fn (1+ start-pos))))
|
|
||||||
(lambda (start-pos)
|
|
||||||
(declare (type fixnum start-pos))
|
|
||||||
(and (< start-pos *end-pos*)
|
|
||||||
(char-class-test)
|
|
||||||
(funcall next-fn (1+ start-pos)))))))
|
|
||||||
|
|
||||||
(defmethod create-matcher-aux ((str str) next-fn)
|
(defmethod create-matcher-aux ((str str) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type fixnum *end-string-pos*)
|
(declare (fixnum *end-string-pos*)
|
||||||
(type function next-fn)
|
(function next-fn)
|
||||||
;; this special value is set by CREATE-SCANNER when the
|
;; this special value is set by CREATE-SCANNER when the
|
||||||
;; closures are built
|
;; closures are built
|
||||||
(special end-string))
|
(special end-string))
|
||||||
@ -307,15 +206,15 @@ against CHR-EXPR."
|
|||||||
(end-string-len (if end-string
|
(end-string-len (if end-string
|
||||||
(length end-string)
|
(length end-string)
|
||||||
nil)))
|
nil)))
|
||||||
(declare (type fixnum len))
|
(declare (fixnum len))
|
||||||
(cond ((and start-of-end-string-p case-insensitive-p)
|
(cond ((and start-of-end-string-p case-insensitive-p)
|
||||||
;; closure for the first STR which belongs to the constant
|
;; closure for the first STR which belongs to the constant
|
||||||
;; string at the end of the regular expression;
|
;; string at the end of the regular expression;
|
||||||
;; case-insensitive version
|
;; case-insensitive version
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos end-string-len))
|
(declare (fixnum start-pos end-string-len))
|
||||||
(let ((test-end-pos (+ start-pos end-string-len)))
|
(let ((test-end-pos (+ start-pos end-string-len)))
|
||||||
(declare (type fixnum test-end-pos))
|
(declare (fixnum test-end-pos))
|
||||||
;; either we're at *END-STRING-POS* (which means that
|
;; either we're at *END-STRING-POS* (which means that
|
||||||
;; it has already been confirmed that end-string
|
;; it has already been confirmed that end-string
|
||||||
;; starts here) or we really have to test
|
;; starts here) or we really have to test
|
||||||
@ -329,9 +228,9 @@ against CHR-EXPR."
|
|||||||
;; string at the end of the regular expression;
|
;; string at the end of the regular expression;
|
||||||
;; case-sensitive version
|
;; case-sensitive version
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos end-string-len))
|
(declare (fixnum start-pos end-string-len))
|
||||||
(let ((test-end-pos (+ start-pos end-string-len)))
|
(let ((test-end-pos (+ start-pos end-string-len)))
|
||||||
(declare (type fixnum test-end-pos))
|
(declare (fixnum test-end-pos))
|
||||||
;; either we're at *END-STRING-POS* (which means that
|
;; either we're at *END-STRING-POS* (which means that
|
||||||
;; it has already been confirmed that end-string
|
;; it has already been confirmed that end-string
|
||||||
;; starts here) or we really have to test
|
;; starts here) or we really have to test
|
||||||
@ -344,13 +243,13 @@ against CHR-EXPR."
|
|||||||
;; a STR which can be skipped because some other function
|
;; a STR which can be skipped because some other function
|
||||||
;; has already confirmed that it matches
|
;; has already confirmed that it matches
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(funcall next-fn (+ start-pos len))))
|
(funcall next-fn (+ start-pos len))))
|
||||||
((and (= len 1) case-insensitive-p)
|
((and (= len 1) case-insensitive-p)
|
||||||
;; STR represent exactly one character; case-insensitive
|
;; STR represent exactly one character; case-insensitive
|
||||||
;; version
|
;; version
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (< start-pos *end-pos*)
|
(and (< start-pos *end-pos*)
|
||||||
(char-equal (schar *string* start-pos) chr)
|
(char-equal (schar *string* start-pos) chr)
|
||||||
(funcall next-fn (1+ start-pos)))))
|
(funcall next-fn (1+ start-pos)))))
|
||||||
@ -358,35 +257,34 @@ against CHR-EXPR."
|
|||||||
;; STR represent exactly one character; case-sensitive
|
;; STR represent exactly one character; case-sensitive
|
||||||
;; version
|
;; version
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (< start-pos *end-pos*)
|
(and (< start-pos *end-pos*)
|
||||||
(char= (schar *string* start-pos) chr)
|
(char= (schar *string* start-pos) chr)
|
||||||
(funcall next-fn (1+ start-pos)))))
|
(funcall next-fn (1+ start-pos)))))
|
||||||
(case-insensitive-p
|
(case-insensitive-p
|
||||||
;; general case, case-insensitive version
|
;; general case, case-insensitive version
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((next-pos (+ start-pos len)))
|
(let ((next-pos (+ start-pos len)))
|
||||||
(declare (type fixnum next-pos))
|
(declare (fixnum next-pos))
|
||||||
(and (<= next-pos *end-pos*)
|
(and (<= next-pos *end-pos*)
|
||||||
(*string*-equal str start-pos next-pos 0 len)
|
(*string*-equal str start-pos next-pos 0 len)
|
||||||
(funcall next-fn next-pos)))))
|
(funcall next-fn next-pos)))))
|
||||||
(t
|
(t
|
||||||
;; general case, case-sensitive version
|
;; general case, case-sensitive version
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((next-pos (+ start-pos len)))
|
(let ((next-pos (+ start-pos len)))
|
||||||
(declare (type fixnum next-pos))
|
(declare (fixnum next-pos))
|
||||||
(and (<= next-pos *end-pos*)
|
(and (<= next-pos *end-pos*)
|
||||||
(*string*= str start-pos next-pos 0 len)
|
(*string*= str start-pos next-pos 0 len)
|
||||||
(funcall next-fn next-pos))))))))
|
(funcall next-fn next-pos))))))))
|
||||||
|
|
||||||
(declaim (inline word-boundary-p))
|
(declaim (inline word-boundary-p))
|
||||||
|
|
||||||
(defun word-boundary-p (start-pos)
|
(defun word-boundary-p (start-pos)
|
||||||
"Check whether START-POS is a word-boundary within *STRING*."
|
"Check whether START-POS is a word-boundary within *STRING*."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((1-start-pos (1- start-pos))
|
(let ((1-start-pos (1- start-pos))
|
||||||
(*start-pos* (or *real-start-pos* *start-pos*)))
|
(*start-pos* (or *real-start-pos* *start-pos*)))
|
||||||
;; either the character before START-POS is a word-constituent and
|
;; either the character before START-POS is a word-constituent and
|
||||||
@ -407,7 +305,7 @@ against CHR-EXPR."
|
|||||||
|
|
||||||
(defmethod create-matcher-aux ((word-boundary word-boundary) next-fn)
|
(defmethod create-matcher-aux ((word-boundary word-boundary) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type function next-fn))
|
(declare (function next-fn))
|
||||||
(if (negatedp word-boundary)
|
(if (negatedp word-boundary)
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(and (not (word-boundary-p start-pos))
|
(and (not (word-boundary-p start-pos))
|
||||||
@ -418,25 +316,25 @@ against CHR-EXPR."
|
|||||||
|
|
||||||
(defmethod create-matcher-aux ((everything everything) next-fn)
|
(defmethod create-matcher-aux ((everything everything) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type function next-fn))
|
(declare (function next-fn))
|
||||||
(if (single-line-p everything)
|
(if (single-line-p everything)
|
||||||
;; closure for single-line-mode: we really match everything, so we
|
;; closure for single-line-mode: we really match everything, so we
|
||||||
;; just advance the index into *STRING* by one and carry on
|
;; just advance the index into *STRING* by one and carry on
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (< start-pos *end-pos*)
|
(and (< start-pos *end-pos*)
|
||||||
(funcall next-fn (1+ start-pos))))
|
(funcall next-fn (1+ start-pos))))
|
||||||
;; not single-line-mode, so we have to make sure we don't match
|
;; not single-line-mode, so we have to make sure we don't match
|
||||||
;; #\Newline
|
;; #\Newline
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (< start-pos *end-pos*)
|
(and (< start-pos *end-pos*)
|
||||||
(char/= (schar *string* start-pos) #\Newline)
|
(char/= (schar *string* start-pos) #\Newline)
|
||||||
(funcall next-fn (1+ start-pos))))))
|
(funcall next-fn (1+ start-pos))))))
|
||||||
|
|
||||||
(defmethod create-matcher-aux ((anchor anchor) next-fn)
|
(defmethod create-matcher-aux ((anchor anchor) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type function next-fn))
|
(declare (function next-fn))
|
||||||
(let ((startp (startp anchor))
|
(let ((startp (startp anchor))
|
||||||
(multi-line-p (multi-line-p anchor)))
|
(multi-line-p (multi-line-p anchor)))
|
||||||
(cond ((no-newline-p anchor)
|
(cond ((no-newline-p anchor)
|
||||||
@ -444,14 +342,14 @@ against CHR-EXPR."
|
|||||||
;; we just have to check whether START-POS equals
|
;; we just have to check whether START-POS equals
|
||||||
;; *END-POS*
|
;; *END-POS*
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (= start-pos *end-pos*)
|
(and (= start-pos *end-pos*)
|
||||||
(funcall next-fn start-pos))))
|
(funcall next-fn start-pos))))
|
||||||
((and startp multi-line-p)
|
((and startp multi-line-p)
|
||||||
;; a start-anchor in multi-line-mode: check if we're at
|
;; a start-anchor in multi-line-mode: check if we're at
|
||||||
;; *START-POS* or if the last character was #\Newline
|
;; *START-POS* or if the last character was #\Newline
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((*start-pos* (or *real-start-pos* *start-pos*)))
|
(let ((*start-pos* (or *real-start-pos* *start-pos*)))
|
||||||
(and (or (= start-pos *start-pos*)
|
(and (or (= start-pos *start-pos*)
|
||||||
(and (<= start-pos *end-pos*)
|
(and (<= start-pos *end-pos*)
|
||||||
@ -463,7 +361,7 @@ against CHR-EXPR."
|
|||||||
;; a start-anchor which is not in multi-line-mode, so just
|
;; a start-anchor which is not in multi-line-mode, so just
|
||||||
;; check whether we're at *START-POS*
|
;; check whether we're at *START-POS*
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (= start-pos (or *real-start-pos* *start-pos*))
|
(and (= start-pos (or *real-start-pos* *start-pos*))
|
||||||
(funcall next-fn start-pos))))
|
(funcall next-fn start-pos))))
|
||||||
(multi-line-p
|
(multi-line-p
|
||||||
@ -471,7 +369,7 @@ against CHR-EXPR."
|
|||||||
;; *END-POS* or if the character we're looking at is
|
;; *END-POS* or if the character we're looking at is
|
||||||
;; #\Newline
|
;; #\Newline
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (or (= start-pos *end-pos*)
|
(and (or (= start-pos *end-pos*)
|
||||||
(and (< start-pos *end-pos*)
|
(and (< start-pos *end-pos*)
|
||||||
(char= #\Newline
|
(char= #\Newline
|
||||||
@ -482,7 +380,7 @@ against CHR-EXPR."
|
|||||||
;; check if we're at *END-POS* or if we're looking at
|
;; check if we're at *END-POS* or if we're looking at
|
||||||
;; #\Newline and there's nothing behind it
|
;; #\Newline and there's nothing behind it
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (or (= start-pos *end-pos*)
|
(and (or (= start-pos *end-pos*)
|
||||||
(and (= start-pos (1- *end-pos*))
|
(and (= start-pos (1- *end-pos*))
|
||||||
(char= #\Newline
|
(char= #\Newline
|
||||||
@ -491,14 +389,14 @@ against CHR-EXPR."
|
|||||||
|
|
||||||
(defmethod create-matcher-aux ((back-reference back-reference) next-fn)
|
(defmethod create-matcher-aux ((back-reference back-reference) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(declare (type function next-fn))
|
(declare (function next-fn))
|
||||||
;; the position of the corresponding REGISTER within the whole
|
;; the position of the corresponding REGISTER within the whole
|
||||||
;; regex; we start to count at 0
|
;; regex; we start to count at 0
|
||||||
(let ((num (num back-reference)))
|
(let ((num (num back-reference)))
|
||||||
(if (case-insensitive-p back-reference)
|
(if (case-insensitive-p back-reference)
|
||||||
;; the case-insensitive version
|
;; the case-insensitive version
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((reg-start (svref *reg-starts* num))
|
(let ((reg-start (svref *reg-starts* num))
|
||||||
(reg-end (svref *reg-ends* num)))
|
(reg-end (svref *reg-ends* num)))
|
||||||
;; only bother to check if the corresponding REGISTER as
|
;; only bother to check if the corresponding REGISTER as
|
||||||
@ -506,7 +404,7 @@ against CHR-EXPR."
|
|||||||
(and reg-start
|
(and reg-start
|
||||||
(let ((next-pos (+ start-pos (- (the fixnum reg-end)
|
(let ((next-pos (+ start-pos (- (the fixnum reg-end)
|
||||||
(the fixnum reg-start)))))
|
(the fixnum reg-start)))))
|
||||||
(declare (type fixnum next-pos))
|
(declare (fixnum next-pos))
|
||||||
(and
|
(and
|
||||||
(<= next-pos *end-pos*)
|
(<= next-pos *end-pos*)
|
||||||
(*string*-equal *string* start-pos next-pos
|
(*string*-equal *string* start-pos next-pos
|
||||||
@ -514,7 +412,7 @@ against CHR-EXPR."
|
|||||||
(funcall next-fn next-pos))))))
|
(funcall next-fn next-pos))))))
|
||||||
;; the case-sensitive version
|
;; the case-sensitive version
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((reg-start (svref *reg-starts* num))
|
(let ((reg-start (svref *reg-starts* num))
|
||||||
(reg-end (svref *reg-ends* num)))
|
(reg-end (svref *reg-ends* num)))
|
||||||
;; only bother to check if the corresponding REGISTER as
|
;; only bother to check if the corresponding REGISTER as
|
||||||
@ -522,7 +420,7 @@ against CHR-EXPR."
|
|||||||
(and reg-start
|
(and reg-start
|
||||||
(let ((next-pos (+ start-pos (- (the fixnum reg-end)
|
(let ((next-pos (+ start-pos (- (the fixnum reg-end)
|
||||||
(the fixnum reg-start)))))
|
(the fixnum reg-start)))))
|
||||||
(declare (type fixnum next-pos))
|
(declare (fixnum next-pos))
|
||||||
(and
|
(and
|
||||||
(<= next-pos *end-pos*)
|
(<= next-pos *end-pos*)
|
||||||
(*string*= *string* start-pos next-pos
|
(*string*= *string* start-pos next-pos
|
||||||
@ -534,17 +432,17 @@ against CHR-EXPR."
|
|||||||
(let* ((test (test branch))
|
(let* ((test (test branch))
|
||||||
(then-matcher (create-matcher-aux (then-regex branch) next-fn))
|
(then-matcher (create-matcher-aux (then-regex branch) next-fn))
|
||||||
(else-matcher (create-matcher-aux (else-regex branch) next-fn)))
|
(else-matcher (create-matcher-aux (else-regex branch) next-fn)))
|
||||||
(declare (type function then-matcher else-matcher))
|
(declare (function then-matcher else-matcher))
|
||||||
(cond ((numberp test)
|
(cond ((numberp test)
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum test))
|
(declare (fixnum test))
|
||||||
(if (and (< test (length *reg-starts*))
|
(if (and (< test (length *reg-starts*))
|
||||||
(svref *reg-starts* test))
|
(svref *reg-starts* test))
|
||||||
(funcall then-matcher start-pos)
|
(funcall then-matcher start-pos)
|
||||||
(funcall else-matcher start-pos))))
|
(funcall else-matcher start-pos))))
|
||||||
(t
|
(t
|
||||||
(let ((test-matcher (create-matcher-aux test #'identity)))
|
(let ((test-matcher (create-matcher-aux test #'identity)))
|
||||||
(declare (type function test-matcher))
|
(declare (function test-matcher))
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(if (funcall test-matcher start-pos)
|
(if (funcall test-matcher start-pos)
|
||||||
(funcall then-matcher start-pos)
|
(funcall then-matcher start-pos)
|
||||||
@ -553,7 +451,7 @@ against CHR-EXPR."
|
|||||||
(defmethod create-matcher-aux ((standalone standalone) next-fn)
|
(defmethod create-matcher-aux ((standalone standalone) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(let ((inner-matcher (create-matcher-aux (regex standalone) #'identity)))
|
(let ((inner-matcher (create-matcher-aux (regex standalone) #'identity)))
|
||||||
(declare (type function next-fn inner-matcher))
|
(declare (function next-fn inner-matcher))
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(let ((next-pos (funcall inner-matcher start-pos)))
|
(let ((next-pos (funcall inner-matcher start-pos)))
|
||||||
(and next-pos
|
(and next-pos
|
||||||
|
|||||||
1210
convert.lisp
1210
convert.lisp
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1522
doc/index.html
1522
doc/index.html
File diff suppressed because it is too large
Load Diff
12
errors.lisp
12
errors.lisp
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.18 2008/06/25 14:04:27 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.21 2008/07/06 18:12:04 edi Exp $
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
@ -27,7 +27,7 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(defvar *syntax-error-string* nil
|
(defvar *syntax-error-string* nil
|
||||||
"The string which caused the syntax error.")
|
"The string which caused the syntax error.")
|
||||||
@ -69,16 +69,16 @@ parse tree).")
|
|||||||
(:documentation "Signaled when CL-PPCRE functions are
|
(:documentation "Signaled when CL-PPCRE functions are
|
||||||
invoked with wrong arguments."))
|
invoked with wrong arguments."))
|
||||||
|
|
||||||
(defmacro signal-ppcre-syntax-error* (pos format-control &rest format-arguments)
|
(defmacro signal-syntax-error* (pos format-control &rest format-arguments)
|
||||||
`(error 'ppcre-syntax-error
|
`(error 'ppcre-syntax-error
|
||||||
:pos ,pos
|
:pos ,pos
|
||||||
:format-control ,format-control
|
:format-control ,format-control
|
||||||
:format-arguments (list ,@format-arguments)))
|
:format-arguments (list ,@format-arguments)))
|
||||||
|
|
||||||
(defmacro signal-ppcre-syntax-error (format-control &rest format-arguments)
|
(defmacro signal-syntax-error (format-control &rest format-arguments)
|
||||||
`(signal-ppcre-syntax-error* nil ,format-control ,@format-arguments))
|
`(signal-syntax-error* nil ,format-control ,@format-arguments))
|
||||||
|
|
||||||
(defmacro signal-ppcre-invocation-error (format-control &rest format-arguments)
|
(defmacro signal-invocation-error (format-control &rest format-arguments)
|
||||||
`(error 'ppcre-invocation-error
|
`(error 'ppcre-invocation-error
|
||||||
:format-control ,format-control
|
:format-control ,format-control
|
||||||
:format-arguments (list ,@format-arguments)))
|
:format-arguments (list ,@format-arguments)))
|
||||||
|
|||||||
275
lexer.lisp
275
lexer.lisp
@ -1,12 +1,12 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.28 2008/06/25 14:04:27 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.34 2008/07/06 22:36:30 edi Exp $
|
||||||
|
|
||||||
;;; The lexer's responsibility is to convert the regex string into a
|
;;; The lexer's responsibility is to convert the regex string into a
|
||||||
;;; sequence of tokens which are in turn consumed by the parser.
|
;;; sequence of tokens which are in turn consumed by the parser.
|
||||||
;;;
|
;;;
|
||||||
;;; The lexer is aware of Perl's 'extended mode' and it also 'knows'
|
;;; The lexer is aware of Perl's 'extended mode' and it also 'knows'
|
||||||
;;; (with a little help from the parser) how many register groups it
|
;;; (with a little help from the parser) how many register groups it
|
||||||
;;; has opened so far. (The latter is necessary for interpreting
|
;;; has opened so far. (The latter is necessary for interpreting
|
||||||
;;; strings like "\\10" correctly.)
|
;;; strings like "\\10" correctly.)
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
@ -35,7 +35,7 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(declaim (inline map-char-to-special-class))
|
(declaim (inline map-char-to-special-class))
|
||||||
(defun map-char-to-special-char-class (chr)
|
(defun map-char-to-special-char-class (chr)
|
||||||
@ -56,27 +56,18 @@ their associated character classes."
|
|||||||
((#\S)
|
((#\S)
|
||||||
:non-whitespace-char-class)))
|
:non-whitespace-char-class)))
|
||||||
|
|
||||||
(locally
|
(defstruct (lexer (:constructor make-lexer-internal))
|
||||||
(declare #.*standard-optimize-settings*)
|
"LEXER structures are used to hold the regex string which is
|
||||||
(defstruct (lexer (:constructor make-lexer-internal))
|
|
||||||
"LEXER structures are used to hold the regex string which is
|
|
||||||
currently lexed and to keep track of the lexer's state."
|
currently lexed and to keep track of the lexer's state."
|
||||||
(str ""
|
(str "" :type string :read-only t)
|
||||||
:type string
|
(len 0 :type fixnum :read-only t)
|
||||||
:read-only t)
|
(reg 0 :type fixnum)
|
||||||
(len 0
|
(pos 0 :type fixnum)
|
||||||
:type fixnum
|
(last-pos nil :type list))
|
||||||
:read-only t)
|
|
||||||
(reg 0
|
|
||||||
:type fixnum)
|
|
||||||
(pos 0
|
|
||||||
:type fixnum)
|
|
||||||
(last-pos nil
|
|
||||||
:type list)))
|
|
||||||
|
|
||||||
(defun make-lexer (string)
|
(defun make-lexer (string)
|
||||||
(declare (inline make-lexer-internal)
|
(declare (inline make-lexer-internal)
|
||||||
#-genera (type string string))
|
#-:genera (string string))
|
||||||
(make-lexer-internal :str (maybe-coerce-to-simple-string string)
|
(make-lexer-internal :str (maybe-coerce-to-simple-string string)
|
||||||
:len (length string)))
|
:len (length string)))
|
||||||
|
|
||||||
@ -101,12 +92,10 @@ Does not respect extended mode."
|
|||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
"Returns the next character which is to be examined and updates the
|
"Returns the next character which is to be examined and updates the
|
||||||
POS slot. Does not respect extended mode."
|
POS slot. Does not respect extended mode."
|
||||||
(cond ((end-of-string-p lexer)
|
(cond ((end-of-string-p lexer) nil)
|
||||||
nil)
|
(t (prog1
|
||||||
(t
|
(schar (lexer-str lexer) (lexer-pos lexer))
|
||||||
(prog1
|
(incf (lexer-pos lexer))))))
|
||||||
(schar (lexer-str lexer) (lexer-pos lexer))
|
|
||||||
(incf (lexer-pos lexer))))))
|
|
||||||
|
|
||||||
(defun next-char (lexer)
|
(defun next-char (lexer)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -135,9 +124,7 @@ nested comments are skipped if applicable."
|
|||||||
while (and skip-char
|
while (and skip-char
|
||||||
(char/= skip-char #\)))
|
(char/= skip-char #\)))
|
||||||
finally (return skip-char))
|
finally (return skip-char))
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* error-pos "Comment group not closed.")))
|
||||||
error-pos
|
|
||||||
"Comment group not closed")))
|
|
||||||
(setq next-char (next-char-non-extended lexer)))
|
(setq next-char (next-char-non-extended lexer)))
|
||||||
(t
|
(t
|
||||||
;; undo effect of previous INCF if we didn't see a #
|
;; undo effect of previous INCF if we didn't see a #
|
||||||
@ -177,7 +164,7 @@ nested comments are skipped if applicable."
|
|||||||
"Moves (LEXER-POS LEXER) back to the last position stored in
|
"Moves (LEXER-POS LEXER) back to the last position stored in
|
||||||
\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
|
\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
|
||||||
(unless (lexer-last-pos lexer)
|
(unless (lexer-last-pos lexer)
|
||||||
(signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer))
|
(signal-syntax-error "LAST-POS stack of LEXER ~A is empty." lexer))
|
||||||
(setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
|
(setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
|
||||||
nil)
|
nil)
|
||||||
|
|
||||||
@ -232,19 +219,16 @@ the corresponding number started within the regex string."
|
|||||||
(let ((code (logand #o377 (the fixnum (or number 0)))))
|
(let ((code (logand #o377 (the fixnum (or number 0)))))
|
||||||
(or (and (< code char-code-limit)
|
(or (and (< code char-code-limit)
|
||||||
(code-char code))
|
(code-char code))
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* error-pos "No character for hex-code ~X." number))))
|
||||||
error-pos
|
|
||||||
"No character for hex-code ~X"
|
|
||||||
number))))
|
|
||||||
|
|
||||||
(defun unescape-char (lexer)
|
(defun unescape-char (lexer)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
"Convert the characters(s) following a backslash into a token
|
"Convert the characters\(s) following a backslash into a token
|
||||||
which is returned. This function is to be called when the backslash
|
which is returned. This function is to be called when the backslash
|
||||||
has already been consumed. Special character classes like \\W are
|
has already been consumed. Special character classes like \\W are
|
||||||
handled elsewhere."
|
handled elsewhere."
|
||||||
(when (end-of-string-p lexer)
|
(when (end-of-string-p lexer)
|
||||||
(signal-ppcre-syntax-error "String ends with backslash"))
|
(signal-syntax-error "String ends with backslash."))
|
||||||
(let ((chr (next-char-non-extended lexer)))
|
(let ((chr (next-char-non-extended lexer)))
|
||||||
(case chr
|
(case chr
|
||||||
((#\E)
|
((#\E)
|
||||||
@ -257,9 +241,7 @@ handled elsewhere."
|
|||||||
;; \cx means control-x in Perl
|
;; \cx means control-x in Perl
|
||||||
(let ((next-char (next-char-non-extended lexer)))
|
(let ((next-char (next-char-non-extended lexer)))
|
||||||
(unless next-char
|
(unless next-char
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (lexer-pos lexer) "Character missing after '\\c' at position ~A."))
|
||||||
(lexer-pos lexer)
|
|
||||||
"Character missing after '\\c' at position ~A"))
|
|
||||||
(code-char (logxor #x40 (char-code (char-upcase next-char))))))
|
(code-char (logxor #x40 (char-code (char-upcase next-char))))))
|
||||||
((#\x)
|
((#\x)
|
||||||
;; \x should be followed by a hexadecimal char code,
|
;; \x should be followed by a hexadecimal char code,
|
||||||
@ -295,12 +277,28 @@ handled elsewhere."
|
|||||||
;; all other characters aren't affected by a backslash
|
;; all other characters aren't affected by a backslash
|
||||||
chr))))
|
chr))))
|
||||||
|
|
||||||
(defun collect-char-class (lexer)
|
(defun read-char-property (lexer first-char)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(unless (eql (next-char-non-extended lexer) #\{)
|
||||||
|
(signal-syntax-error* (lexer-pos lexer) "Expected left brace after \\~A." first-char))
|
||||||
|
(let ((name (with-output-to-string (out nil :element-type
|
||||||
|
#+:lispworks 'lw:simple-char #-:lispworks 'character)
|
||||||
|
(loop
|
||||||
|
(let ((char (or (next-char-non-extended lexer)
|
||||||
|
(signal-syntax-error "Unexpected EOF after \\~A{." first-char))))
|
||||||
|
(when (char= char #\})
|
||||||
|
(return))
|
||||||
|
(write-char char out))))))
|
||||||
|
(list (if (char= first-char #\p) :property :inverted-property)
|
||||||
|
;; we must reverse here because of what PARSE-STRING does
|
||||||
|
(nreverse name))))
|
||||||
|
|
||||||
|
(defun collect-char-class (lexer)
|
||||||
"Reads and consumes characters from regex string until a right
|
"Reads and consumes characters from regex string until a right
|
||||||
bracket is seen. Assembles them into a list \(which is returned) of
|
bracket is seen. Assembles them into a list \(which is returned) of
|
||||||
characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
|
characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
|
||||||
tokens representing special character classes."
|
tokens representing special character classes."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(let ((start-pos (lexer-pos lexer)) ; remember start for error message
|
(let ((start-pos (lexer-pos lexer)) ; remember start for error message
|
||||||
hyphen-seen
|
hyphen-seen
|
||||||
last-char
|
last-char
|
||||||
@ -309,72 +307,85 @@ tokens representing special character classes."
|
|||||||
"Do the right thing with character C depending on whether
|
"Do the right thing with character C depending on whether
|
||||||
we're inside a range or not."
|
we're inside a range or not."
|
||||||
(cond ((and hyphen-seen last-char)
|
(cond ((and hyphen-seen last-char)
|
||||||
(setf (car list) (list :range last-char c)
|
(setf (car list) (list :range last-char c)
|
||||||
last-char nil))
|
last-char nil))
|
||||||
(t
|
(t
|
||||||
(push c list)
|
(push c list)
|
||||||
(setq last-char c)))
|
(setq last-char c)))
|
||||||
(setq hyphen-seen nil)))
|
(setq hyphen-seen nil)))
|
||||||
(loop for first = t then nil
|
(loop for first = t then nil
|
||||||
for c = (next-char-non-extended lexer)
|
for c = (next-char-non-extended lexer)
|
||||||
;; leave loop if at end of string
|
;; leave loop if at end of string
|
||||||
while c
|
while c
|
||||||
do (cond
|
do (cond
|
||||||
((char= c #\\)
|
((char= c #\\)
|
||||||
;; we've seen a backslash
|
;; we've seen a backslash
|
||||||
(let ((next-char (next-char-non-extended lexer)))
|
(let ((next-char (next-char-non-extended lexer)))
|
||||||
(case next-char
|
(case next-char
|
||||||
((#\d #\D #\w #\W #\s #\S)
|
((#\d #\D #\w #\W #\s #\S)
|
||||||
;; a special character class
|
;; a special character class
|
||||||
(push (map-char-to-special-char-class next-char) list)
|
(push (map-char-to-special-char-class next-char) list)
|
||||||
;; if the last character was a hyphen
|
;; if the last character was a hyphen
|
||||||
;; just collect it literally
|
;; just collect it literally
|
||||||
(when hyphen-seen
|
(when hyphen-seen
|
||||||
(push #\- list))
|
(push #\- list))
|
||||||
;; if the next character is a hyphen do the same
|
;; if the next character is a hyphen do the same
|
||||||
(when (looking-at-p lexer #\-)
|
(when (looking-at-p lexer #\-)
|
||||||
(push #\- list)
|
(push #\- list)
|
||||||
(incf (lexer-pos lexer)))
|
(incf (lexer-pos lexer)))
|
||||||
(setq hyphen-seen nil))
|
(setq hyphen-seen nil))
|
||||||
((#\E)
|
((#\P #\p)
|
||||||
;; if \Q quoting is on we ignore \E,
|
;; maybe a character property
|
||||||
;; otherwise it's just a plain #\E
|
(cond ((null *property-resolver*)
|
||||||
(unless *allow-quoting*
|
(handle-char next-char))
|
||||||
(handle-char #\E)))
|
(t
|
||||||
(otherwise
|
(push (read-char-property lexer next-char) list)
|
||||||
;; otherwise unescape the following character(s)
|
;; if the last character was a hyphen
|
||||||
(decf (lexer-pos lexer))
|
;; just collect it literally
|
||||||
(handle-char (unescape-char lexer))))))
|
(when hyphen-seen
|
||||||
(first
|
(push #\- list))
|
||||||
;; the first character must not be a right bracket
|
;; if the next character is a hyphen do the same
|
||||||
;; and isn't treated specially if it's a hyphen
|
(when (looking-at-p lexer #\-)
|
||||||
(handle-char c))
|
(push #\- list)
|
||||||
((char= c #\])
|
(incf (lexer-pos lexer)))
|
||||||
;; end of character class
|
(setq hyphen-seen nil))))
|
||||||
;; make sure we collect a pending hyphen
|
((#\E)
|
||||||
(when hyphen-seen
|
;; if \Q quoting is on we ignore \E,
|
||||||
(setq hyphen-seen nil)
|
;; otherwise it's just a plain #\E
|
||||||
(handle-char #\-))
|
(unless *allow-quoting*
|
||||||
;; reverse the list to preserve the order intended
|
(handle-char #\E)))
|
||||||
;; by the author of the regex string
|
(otherwise
|
||||||
(return-from collect-char-class (nreverse list)))
|
;; otherwise unescape the following character(s)
|
||||||
((and (char= c #\-)
|
(decf (lexer-pos lexer))
|
||||||
last-char
|
(handle-char (unescape-char lexer))))))
|
||||||
(not hyphen-seen))
|
(first
|
||||||
;; if the last character was 'just a character'
|
;; the first character must not be a right bracket
|
||||||
;; we expect to be in the middle of a range
|
;; and isn't treated specially if it's a hyphen
|
||||||
(setq hyphen-seen t))
|
(handle-char c))
|
||||||
((char= c #\-)
|
((char= c #\])
|
||||||
;; otherwise this is just an ordinary hyphen
|
;; end of character class
|
||||||
|
;; make sure we collect a pending hyphen
|
||||||
|
(when hyphen-seen
|
||||||
|
(setq hyphen-seen nil)
|
||||||
(handle-char #\-))
|
(handle-char #\-))
|
||||||
(t
|
;; reverse the list to preserve the order intended
|
||||||
;; default case - just collect the character
|
;; by the author of the regex string
|
||||||
(handle-char c))))
|
(return-from collect-char-class (nreverse list)))
|
||||||
|
((and (char= c #\-)
|
||||||
|
last-char
|
||||||
|
(not hyphen-seen))
|
||||||
|
;; if the last character was 'just a character'
|
||||||
|
;; we expect to be in the middle of a range
|
||||||
|
(setq hyphen-seen t))
|
||||||
|
((char= c #\-)
|
||||||
|
;; otherwise this is just an ordinary hyphen
|
||||||
|
(handle-char #\-))
|
||||||
|
(t
|
||||||
|
;; default case - just collect the character
|
||||||
|
(handle-char c))))
|
||||||
;; we can only exit the loop normally if we've reached the end
|
;; we can only exit the loop normally if we've reached the end
|
||||||
;; of the regex string without seeing a right bracket
|
;; of the regex string without seeing a right bracket
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* start-pos "Missing right bracket to close character class."))))
|
||||||
start-pos
|
|
||||||
"Missing right bracket to close character class"))))
|
|
||||||
|
|
||||||
(defun maybe-parse-flags (lexer)
|
(defun maybe-parse-flags (lexer)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -387,7 +398,7 @@ the behaviour of the lexer itself via the special variable
|
|||||||
(loop with set = t
|
(loop with set = t
|
||||||
for chr = (next-char-non-extended lexer)
|
for chr = (next-char-non-extended lexer)
|
||||||
unless chr
|
unless chr
|
||||||
do (signal-ppcre-syntax-error "Unexpected end of string")
|
do (signal-syntax-error "Unexpected end of string.")
|
||||||
while (find chr "-imsx" :test #'char=)
|
while (find chr "-imsx" :test #'char=)
|
||||||
;; the first #\- will invert the meaning of all modifiers
|
;; the first #\- will invert the meaning of all modifiers
|
||||||
;; following it
|
;; following it
|
||||||
@ -473,9 +484,7 @@ closing #\> will also be consumed."
|
|||||||
:test #'char=)))
|
:test #'char=)))
|
||||||
(unless end-name
|
(unless end-name
|
||||||
;; there has to be > somewhere, syntax error otherwise
|
;; there has to be > somewhere, syntax error otherwise
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (1- (lexer-pos lexer)) "Opening #\< in named group has no closing #\>."))
|
||||||
(1- (lexer-pos lexer))
|
|
||||||
"Opening #\< in named group has no closing #\>"))
|
|
||||||
(let ((name (subseq (lexer-str lexer)
|
(let ((name (subseq (lexer-str lexer)
|
||||||
(lexer-pos lexer)
|
(lexer-pos lexer)
|
||||||
end-name)))
|
end-name)))
|
||||||
@ -484,9 +493,7 @@ closing #\> will also be consumed."
|
|||||||
(char= #\- char)))
|
(char= #\- char)))
|
||||||
name)
|
name)
|
||||||
;; register name can contain only alphanumeric characters or #\-
|
;; register name can contain only alphanumeric characters or #\-
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (lexer-pos lexer) "Invalid character in named register group."))
|
||||||
(lexer-pos lexer)
|
|
||||||
"Invalid character in named register group"))
|
|
||||||
;; advance lexer beyond "<name>" part
|
;; advance lexer beyond "<name>" part
|
||||||
(setf (lexer-pos lexer) (1+ end-name))
|
(setf (lexer-pos lexer) (1+ end-name))
|
||||||
name)))
|
name)))
|
||||||
@ -518,10 +525,7 @@ closing #\> will also be consumed."
|
|||||||
((#\+ #\*)
|
((#\+ #\*)
|
||||||
;; quantifiers will always be consumend by
|
;; quantifiers will always be consumend by
|
||||||
;; GET-QUANTIFIER, they must not appear here
|
;; GET-QUANTIFIER, they must not appear here
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (1- (lexer-pos lexer)) "Quantifier '~A' not allowed." next-char))
|
||||||
(1- (lexer-pos lexer))
|
|
||||||
"Quantifier '~A' not allowed"
|
|
||||||
next-char))
|
|
||||||
((#\{)
|
((#\{)
|
||||||
;; left brace isn't a special character in it's own
|
;; left brace isn't a special character in it's own
|
||||||
;; right but we must check if what follows might
|
;; right but we must check if what follows might
|
||||||
@ -530,12 +534,11 @@ closing #\> will also be consumed."
|
|||||||
(this-last-pos (lexer-last-pos lexer)))
|
(this-last-pos (lexer-last-pos lexer)))
|
||||||
(unget-token lexer)
|
(unget-token lexer)
|
||||||
(when (get-quantifier lexer)
|
(when (get-quantifier lexer)
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (car this-last-pos)
|
||||||
(car this-last-pos)
|
"Quantifier '~A' not allowed."
|
||||||
"Quantifier '~A' not allowed"
|
(subseq (lexer-str lexer)
|
||||||
(subseq (lexer-str lexer)
|
(car this-last-pos)
|
||||||
(car this-last-pos)
|
(lexer-pos lexer))))
|
||||||
(lexer-pos lexer))))
|
|
||||||
(setf (lexer-pos lexer) this-pos
|
(setf (lexer-pos lexer) this-pos
|
||||||
(lexer-last-pos lexer) this-last-pos)
|
(lexer-last-pos lexer) this-last-pos)
|
||||||
next-char))
|
next-char))
|
||||||
@ -580,7 +583,7 @@ closing #\> will also be consumed."
|
|||||||
(let* ((old-pos (decf (lexer-pos lexer)))
|
(let* ((old-pos (decf (lexer-pos lexer)))
|
||||||
;; ...so let's get the whole number first
|
;; ...so let's get the whole number first
|
||||||
(backref-number (get-number lexer)))
|
(backref-number (get-number lexer)))
|
||||||
(declare (type fixnum backref-number))
|
(declare (fixnum backref-number))
|
||||||
(cond ((and (> backref-number (lexer-reg lexer))
|
(cond ((and (> backref-number (lexer-reg lexer))
|
||||||
(<= 10 backref-number))
|
(<= 10 backref-number))
|
||||||
;; \10 and higher are treated as octal
|
;; \10 and higher are treated as octal
|
||||||
@ -603,6 +606,10 @@ closing #\> will also be consumed."
|
|||||||
(let ((old-pos (decf (lexer-pos lexer))))
|
(let ((old-pos (decf (lexer-pos lexer))))
|
||||||
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
|
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
|
||||||
old-pos)))
|
old-pos)))
|
||||||
|
((#\P #\p)
|
||||||
|
;; might be a named property
|
||||||
|
(cond (*property-resolver* (read-char-property lexer next-char))
|
||||||
|
(t next-char)))
|
||||||
(otherwise
|
(otherwise
|
||||||
;; in all other cases just unescape the
|
;; in all other cases just unescape the
|
||||||
;; character
|
;; character
|
||||||
@ -622,17 +629,15 @@ closing #\> will also be consumed."
|
|||||||
;; or a closing parenthesis are following
|
;; or a closing parenthesis are following
|
||||||
(when (and flags
|
(when (and flags
|
||||||
(not (find next-char ":)" :test #'char=)))
|
(not (find next-char ":)" :test #'char=)))
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (car (lexer-last-pos lexer))
|
||||||
(car (lexer-last-pos lexer))
|
"Sequence '~A' not recognized."
|
||||||
"Sequence '~A' not recognized"
|
(subseq (lexer-str lexer)
|
||||||
(subseq (lexer-str lexer)
|
(car (lexer-last-pos lexer))
|
||||||
(car (lexer-last-pos lexer))
|
(lexer-pos lexer))))
|
||||||
(lexer-pos lexer))))
|
|
||||||
(case next-char
|
(case next-char
|
||||||
((nil)
|
((nil)
|
||||||
;; syntax error
|
;; syntax error
|
||||||
(signal-ppcre-syntax-error
|
(signal-syntax-error "End of string following '(?'."))
|
||||||
"End of string following '(?'"))
|
|
||||||
((#\))
|
((#\))
|
||||||
;; an empty group except for the flags
|
;; an empty group except for the flags
|
||||||
;; (if there are any)
|
;; (if there are any)
|
||||||
@ -664,10 +669,9 @@ closing #\> will also be consumed."
|
|||||||
;; we have encountered a named group
|
;; we have encountered a named group
|
||||||
;; are we supporting register naming?
|
;; are we supporting register naming?
|
||||||
(unless *allow-named-registers*
|
(unless *allow-named-registers*
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (1- (lexer-pos lexer))
|
||||||
(1- (lexer-pos lexer))
|
"Character '~A' may not follow '(?<'."
|
||||||
"Character '~A' may not follow '(?<'"
|
next-char))
|
||||||
next-char))
|
|
||||||
;; put the letter back
|
;; put the letter back
|
||||||
(decf (lexer-pos lexer))
|
(decf (lexer-pos lexer))
|
||||||
;; named group
|
;; named group
|
||||||
@ -685,19 +689,16 @@ closing #\> will also be consumed."
|
|||||||
:void)
|
:void)
|
||||||
((nil)
|
((nil)
|
||||||
;; syntax error
|
;; syntax error
|
||||||
(signal-ppcre-syntax-error
|
(signal-syntax-error "End of string following '(?<'."))
|
||||||
"End of string following '(?<'"))
|
|
||||||
(t
|
(t
|
||||||
;; also syntax error
|
;; also syntax error
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (1- (lexer-pos lexer))
|
||||||
(1- (lexer-pos lexer))
|
"Character '~A' may not follow '(?<'."
|
||||||
"Character '~A' may not follow '(?<'"
|
next-char ))))))
|
||||||
next-char ))))))
|
|
||||||
(otherwise
|
(otherwise
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (1- (lexer-pos lexer))
|
||||||
(1- (lexer-pos lexer))
|
"Character '~A' may not follow '(?'."
|
||||||
"Character '~A' may not follow '(?'"
|
next-char)))))
|
||||||
next-char)))))
|
|
||||||
(t
|
(t
|
||||||
;; if next-char was not #\? (this is within
|
;; if next-char was not #\? (this is within
|
||||||
;; the first COND), we've just seen an opening
|
;; the first COND), we've just seen an opening
|
||||||
|
|||||||
67
load.lisp
67
load.lisp
@ -1,67 +0,0 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/load.lisp,v 1.16 2008/06/25 14:04:27 edi Exp $
|
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
|
||||||
|
|
||||||
;;; Redistribution and use in source and binary forms, with or without
|
|
||||||
;;; modification, are permitted provided that the following conditions
|
|
||||||
;;; are met:
|
|
||||||
|
|
||||||
;;; * Redistributions of source code must retain the above copyright
|
|
||||||
;;; notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
;;; * Redistributions in binary form must reproduce the above
|
|
||||||
;;; copyright notice, this list of conditions and the following
|
|
||||||
;;; disclaimer in the documentation and/or other materials
|
|
||||||
;;; provided with the distribution.
|
|
||||||
|
|
||||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
|
||||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
||||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
||||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
||||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
(in-package :cl-user)
|
|
||||||
|
|
||||||
(let ((cl-ppcre-base-directory
|
|
||||||
(make-pathname :name nil :type nil :version nil
|
|
||||||
:defaults (parse-namestring *load-truename*)))
|
|
||||||
must-compile)
|
|
||||||
(with-compilation-unit ()
|
|
||||||
(dolist (file '("packages"
|
|
||||||
"specials"
|
|
||||||
"util"
|
|
||||||
"errors"
|
|
||||||
#-:use-acl-regexp2-engine "lexer"
|
|
||||||
#-:use-acl-regexp2-engine "parser"
|
|
||||||
#-:use-acl-regexp2-engine "regex-class"
|
|
||||||
#-:use-acl-regexp2-engine "convert"
|
|
||||||
#-:use-acl-regexp2-engine "optimize"
|
|
||||||
#-:use-acl-regexp2-engine "closures"
|
|
||||||
#-:use-acl-regexp2-engine "repetition-closures"
|
|
||||||
#-:use-acl-regexp2-engine "scanner"
|
|
||||||
"api"
|
|
||||||
"ppcre-tests"))
|
|
||||||
(let ((pathname (make-pathname :name file :type "lisp" :version nil
|
|
||||||
:defaults cl-ppcre-base-directory)))
|
|
||||||
;; don't use COMPILE-FILE in Corman Lisp, it's broken - LOAD
|
|
||||||
;; will yield compiled functions anyway
|
|
||||||
#-:cormanlisp
|
|
||||||
(let ((compiled-pathname (compile-file-pathname pathname)))
|
|
||||||
(unless (and (not must-compile)
|
|
||||||
(probe-file compiled-pathname)
|
|
||||||
(< (file-write-date pathname)
|
|
||||||
(file-write-date compiled-pathname)))
|
|
||||||
(setq must-compile t)
|
|
||||||
(compile-file pathname))
|
|
||||||
(setq pathname compiled-pathname))
|
|
||||||
(load pathname)))))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.31 2008/06/25 14:04:27 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.35 2008/07/06 18:12:04 edi Exp $
|
||||||
|
|
||||||
;;; This file contains optimizations which can be applied to converted
|
;;; This file contains optimizations which can be applied to converted
|
||||||
;;; parse trees.
|
;;; parse trees.
|
||||||
@ -30,7 +30,7 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(defgeneric flatten (regex)
|
(defgeneric flatten (regex)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -96,8 +96,7 @@ operation on REGEX."))
|
|||||||
alternation)
|
alternation)
|
||||||
((cdr choices)
|
((cdr choices)
|
||||||
(first choices))
|
(first choices))
|
||||||
(t (signal-ppcre-syntax-error
|
(t (signal-syntax-error "Encountered alternation without choices.")))))
|
||||||
"Encountered alternation without choices.")))))
|
|
||||||
|
|
||||||
(defmethod flatten ((branch branch))
|
(defmethod flatten ((branch branch))
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -143,7 +142,7 @@ operation on REGEX."))
|
|||||||
collector-start
|
collector-start
|
||||||
(collector-length 0)
|
(collector-length 0)
|
||||||
skip)
|
skip)
|
||||||
(declare (type fixnum collector-length))
|
(declare (fixnum collector-length))
|
||||||
(loop
|
(loop
|
||||||
(let ((elements-rest (cdr curr-point)))
|
(let ((elements-rest (cdr curr-point)))
|
||||||
(unless elements-rest
|
(unless elements-rest
|
||||||
@ -394,7 +393,7 @@ function called by END-STRIN.)"))
|
|||||||
concatenated-string
|
concatenated-string
|
||||||
concatenated-start
|
concatenated-start
|
||||||
(concatenated-length 0))
|
(concatenated-length 0))
|
||||||
(declare (type fixnum concatenated-length))
|
(declare (fixnum concatenated-length))
|
||||||
(loop for element in (reverse (elements seq))
|
(loop for element in (reverse (elements seq))
|
||||||
;; remember the case-(in)sensitivity of the last relevant
|
;; remember the case-(in)sensitivity of the last relevant
|
||||||
;; STR object
|
;; STR object
|
||||||
@ -429,7 +428,7 @@ function called by END-STRIN.)"))
|
|||||||
concatenated-start nil))
|
concatenated-start nil))
|
||||||
(let ((len (len element-end))
|
(let ((len (len element-end))
|
||||||
(str (str element-end)))
|
(str (str element-end)))
|
||||||
(declare (type fixnum len))
|
(declare (fixnum len))
|
||||||
(incf concatenated-length len)
|
(incf concatenated-length len)
|
||||||
(loop for i of-type fixnum downfrom (1- len) to 0
|
(loop for i of-type fixnum downfrom (1- len) to 0
|
||||||
do (vector-push-extend (char str i)
|
do (vector-push-extend (char str i)
|
||||||
|
|||||||
114
packages.lisp
114
packages.lisp
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.24 2008/06/25 14:04:27 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.38 2008/07/22 23:54:59 edi Exp $
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
@ -29,78 +29,40 @@
|
|||||||
|
|
||||||
(in-package :cl-user)
|
(in-package :cl-user)
|
||||||
|
|
||||||
#-:cormanlisp
|
(defpackage :cl-ppcre
|
||||||
(defpackage #:cl-ppcre
|
(:nicknames :ppcre)
|
||||||
(:nicknames #:ppcre)
|
#+:genera
|
||||||
#+genera (:shadowing-import-from #:common-lisp #:lambda #:simple-string #:string)
|
(:shadowing-import-from :common-lisp :lambda :simple-string :string)
|
||||||
(:use #-genera #:cl #+genera #:future-common-lisp)
|
(:use #-:genera :cl #+:genera :future-common-lisp)
|
||||||
(:export #:create-scanner
|
(:shadow :digit-char-p :defconstant)
|
||||||
#:parse-tree-synonym
|
(:export :parse-string
|
||||||
#:define-parse-tree-synonym
|
:create-scanner
|
||||||
#:scan
|
:create-optimized-test-function
|
||||||
#:scan-to-strings
|
:parse-tree-synonym
|
||||||
#:do-scans
|
:define-parse-tree-synonym
|
||||||
#:do-matches
|
:scan
|
||||||
#:do-matches-as-strings
|
:scan-to-strings
|
||||||
#:all-matches
|
:do-scans
|
||||||
#:all-matches-as-strings
|
:do-matches
|
||||||
#:split
|
:do-matches-as-strings
|
||||||
#:regex-replace
|
:all-matches
|
||||||
#:regex-replace-all
|
:all-matches-as-strings
|
||||||
#:regex-apropos
|
:split
|
||||||
#:regex-apropos-list
|
:regex-replace
|
||||||
#:quote-meta-chars
|
:regex-replace-all
|
||||||
#:*regex-char-code-limit*
|
:regex-apropos
|
||||||
#:*use-bmh-matchers*
|
:regex-apropos-list
|
||||||
#:*allow-quoting*
|
:quote-meta-chars
|
||||||
#:*allow-named-registers*
|
:*regex-char-code-limit*
|
||||||
#:ppcre-error
|
:*use-bmh-matchers*
|
||||||
#:ppcre-invocation-error
|
:*allow-quoting*
|
||||||
#:ppcre-syntax-error
|
:*allow-named-registers*
|
||||||
#:ppcre-syntax-error-string
|
:*optimize-char-classes*
|
||||||
#:ppcre-syntax-error-pos
|
:*property-resolver*
|
||||||
#:register-groups-bind
|
:ppcre-error
|
||||||
#:do-register-groups))
|
:ppcre-invocation-error
|
||||||
|
:ppcre-syntax-error
|
||||||
#+:cormanlisp
|
:ppcre-syntax-error-string
|
||||||
(defpackage "CL-PPCRE"
|
:ppcre-syntax-error-pos
|
||||||
(:nicknames "PPCRE")
|
:register-groups-bind
|
||||||
(:use "CL")
|
:do-register-groups))
|
||||||
(:export "CREATE-SCANNER"
|
|
||||||
"PARSE-TREE-SYNONYM"
|
|
||||||
"DEFINE-PARSE-TREE-SYNONYM"
|
|
||||||
"SCAN"
|
|
||||||
"SCAN-TO-STRINGS"
|
|
||||||
"DO-SCANS"
|
|
||||||
"DO-MATCHES"
|
|
||||||
"DO-MATCHES-AS-STRINGS"
|
|
||||||
"ALL-MATCHES"
|
|
||||||
"ALL-MATCHES-AS-STRINGS"
|
|
||||||
"SPLIT"
|
|
||||||
"REGEX-REPLACE"
|
|
||||||
"REGEX-REPLACE-ALL"
|
|
||||||
"REGEX-APROPOS"
|
|
||||||
"REGEX-APROPOS-LIST"
|
|
||||||
"QUOTE-META-CHARS"
|
|
||||||
"*REGEX-CHAR-CODE-LIMIT*"
|
|
||||||
"*USE-BMH-MATCHERS*"
|
|
||||||
"*ALLOW-QUOTING*"
|
|
||||||
"*ALLOW-NAMED-REGISTERS*"
|
|
||||||
"PPCRE-ERROR"
|
|
||||||
"PPCRE-INVOCATION-ERROR"
|
|
||||||
"PPCRE-SYNTAX-ERROR"
|
|
||||||
"PPCRE-SYNTAX-ERROR-STRING"
|
|
||||||
"PPCRE-SYNTAX-ERROR-POS"
|
|
||||||
"REGISTER-GROUPS-BIND"
|
|
||||||
"DO-REGISTER-GROUPS"))
|
|
||||||
|
|
||||||
#-:cormanlisp
|
|
||||||
(defpackage #:cl-ppcre-test
|
|
||||||
#+genera (:shadowing-import-from #:common-lisp #:lambda)
|
|
||||||
(:use #-genera #:cl #+genera #:future-common-lisp #:cl-ppcre)
|
|
||||||
(:export #:test))
|
|
||||||
|
|
||||||
#+:cormanlisp
|
|
||||||
(defpackage "CL-PPCRE-TEST"
|
|
||||||
(:use "CL" "CL-PPCRE")
|
|
||||||
(:export "TEST"))
|
|
||||||
|
|||||||
50
parser.lisp
50
parser.lisp
@ -1,11 +1,11 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.25 2008/06/25 14:04:28 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.30 2008/07/06 18:12:05 edi Exp $
|
||||||
|
|
||||||
;;; The parser will - with the help of the lexer - parse a regex
|
;;; The parser will - with the help of the lexer - parse a regex
|
||||||
;;; string and convert it into a "parse tree" (see docs for details
|
;;; string and convert it into a "parse tree" (see docs for details
|
||||||
;;; about the syntax of these trees). Note that the lexer might return
|
;;; about the syntax of these trees). Note that the lexer might
|
||||||
;;; illegal parse trees. It is assumed that the conversion process
|
;;; return illegal parse trees. It is assumed that the conversion
|
||||||
;;; later on will track them down.
|
;;; process later on will track them down.
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
@ -33,10 +33,9 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(defun group (lexer)
|
(defun group (lexer)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Parses and consumes a <group>.
|
"Parses and consumes a <group>.
|
||||||
The productions are: <group> -> \"\(\"<regex>\")\"
|
The productions are: <group> -> \"\(\"<regex>\")\"
|
||||||
\"\(?:\"<regex>\")\"
|
\"\(?:\"<regex>\")\"
|
||||||
@ -53,6 +52,7 @@ The productions are: <group> -> \"\(\"<regex>\")\"
|
|||||||
where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
|
where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
|
||||||
Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
||||||
<grouping-type> is one of six keywords - see source for details."
|
<grouping-type> is one of six keywords - see source for details."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(multiple-value-bind (open-token flags)
|
(multiple-value-bind (open-token flags)
|
||||||
(get-token lexer)
|
(get-token lexer)
|
||||||
(cond ((eq open-token :open-paren-paren)
|
(cond ((eq open-token :open-paren-paren)
|
||||||
@ -65,7 +65,7 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
|||||||
(number (try-number lexer :no-whitespace-p t))
|
(number (try-number lexer :no-whitespace-p t))
|
||||||
;; make changes to extended-mode-p local
|
;; make changes to extended-mode-p local
|
||||||
(*extended-mode-p* *extended-mode-p*))
|
(*extended-mode-p* *extended-mode-p*))
|
||||||
(declare (type fixnum open-paren-pos))
|
(declare (fixnum open-paren-pos))
|
||||||
(cond (number
|
(cond (number
|
||||||
;; condition is a number (i.e. refers to a
|
;; condition is a number (i.e. refers to a
|
||||||
;; back-reference)
|
;; back-reference)
|
||||||
@ -73,13 +73,11 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
|||||||
(reg-expr (reg-expr lexer))
|
(reg-expr (reg-expr lexer))
|
||||||
(close-token (get-token lexer)))
|
(close-token (get-token lexer)))
|
||||||
(unless (eq inner-close-token :close-paren)
|
(unless (eq inner-close-token :close-paren)
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (+ open-paren-pos 2)
|
||||||
(+ open-paren-pos 2)
|
"Opening paren has no matching closing paren."))
|
||||||
"Opening paren has no matching closing paren"))
|
|
||||||
(unless (eq close-token :close-paren)
|
(unless (eq close-token :close-paren)
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* open-paren-pos
|
||||||
open-paren-pos
|
"Opening paren has no matching closing paren."))
|
||||||
"Opening paren has no matching closing paren"))
|
|
||||||
(list :branch number reg-expr)))
|
(list :branch number reg-expr)))
|
||||||
(t
|
(t
|
||||||
;; condition must be a full regex (actually a
|
;; condition must be a full regex (actually a
|
||||||
@ -94,9 +92,8 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
|||||||
(reg-expr (reg-expr lexer))
|
(reg-expr (reg-expr lexer))
|
||||||
(close-token (get-token lexer)))
|
(close-token (get-token lexer)))
|
||||||
(unless (eq close-token :close-paren)
|
(unless (eq close-token :close-paren)
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* open-paren-pos
|
||||||
open-paren-pos
|
"Opening paren has no matching closing paren."))
|
||||||
"Opening paren has no matching closing paren"))
|
|
||||||
(list :branch inner-reg-expr reg-expr))))))
|
(list :branch inner-reg-expr reg-expr))))))
|
||||||
((member open-token '(:open-paren
|
((member open-token '(:open-paren
|
||||||
:open-paren-colon
|
:open-paren-colon
|
||||||
@ -124,9 +121,8 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
|||||||
(unless (eq close-token :close-paren)
|
(unless (eq close-token :close-paren)
|
||||||
;; the token following <regex> must be the closing
|
;; the token following <regex> must be the closing
|
||||||
;; parenthesis or this is a syntax error
|
;; parenthesis or this is a syntax error
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* open-paren-pos
|
||||||
open-paren-pos
|
"Opening paren has no matching closing paren."))
|
||||||
"Opening paren has no matching closing paren"))
|
|
||||||
(if flags
|
(if flags
|
||||||
;; if the lexer has returned a list of flags this must
|
;; if the lexer has returned a list of flags this must
|
||||||
;; have been the "(?:"<regex>")" production
|
;; have been the "(?:"<regex>")" production
|
||||||
@ -160,11 +156,11 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
|||||||
open-token))))
|
open-token))))
|
||||||
|
|
||||||
(defun greedy-quant (lexer)
|
(defun greedy-quant (lexer)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Parses and consumes a <greedy-quant>.
|
"Parses and consumes a <greedy-quant>.
|
||||||
The productions are: <greedy-quant> -> <group> | <group><quantifier>
|
The productions are: <greedy-quant> -> <group> | <group><quantifier>
|
||||||
where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
|
where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
|
||||||
Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
|
Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(let* ((group (group lexer))
|
(let* ((group (group lexer))
|
||||||
(token (get-quantifier lexer)))
|
(token (get-quantifier lexer)))
|
||||||
(if token
|
(if token
|
||||||
@ -174,11 +170,11 @@ Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
|
|||||||
group)))
|
group)))
|
||||||
|
|
||||||
(defun quant (lexer)
|
(defun quant (lexer)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Parses and consumes a <quant>.
|
"Parses and consumes a <quant>.
|
||||||
The productions are: <quant> -> <greedy-quant> | <greedy-quant>\"?\".
|
The productions are: <quant> -> <greedy-quant> | <greedy-quant>\"?\".
|
||||||
Will return the <parse-tree> returned by GREEDY-QUANT and optionally
|
Will return the <parse-tree> returned by GREEDY-QUANT and optionally
|
||||||
change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
|
change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(let* ((greedy-quant (greedy-quant lexer))
|
(let* ((greedy-quant (greedy-quant lexer))
|
||||||
(pos (lexer-pos lexer))
|
(pos (lexer-pos lexer))
|
||||||
(next-char (next-char lexer)))
|
(next-char (next-char lexer)))
|
||||||
@ -189,10 +185,10 @@ change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
|
|||||||
greedy-quant))
|
greedy-quant))
|
||||||
|
|
||||||
(defun seq (lexer)
|
(defun seq (lexer)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Parses and consumes a <seq>.
|
"Parses and consumes a <seq>.
|
||||||
The productions are: <seq> -> <quant> | <quant><seq>.
|
The productions are: <seq> -> <quant> | <quant><seq>.
|
||||||
Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
|
Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(flet ((make-array-from-two-chars (char1 char2)
|
(flet ((make-array-from-two-chars (char1 char2)
|
||||||
(let ((string (make-array 2
|
(let ((string (make-array 2
|
||||||
:element-type 'character
|
:element-type 'character
|
||||||
@ -254,10 +250,10 @@ Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
|
|||||||
:void)))
|
:void)))
|
||||||
|
|
||||||
(defun reg-expr (lexer)
|
(defun reg-expr (lexer)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Parses and consumes a <regex>, a complete regular expression.
|
"Parses and consumes a <regex>, a complete regular expression.
|
||||||
The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
|
The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
|
||||||
Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
|
Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(let ((pos (lexer-pos lexer)))
|
(let ((pos (lexer-pos lexer)))
|
||||||
(case (next-char lexer)
|
(case (next-char lexer)
|
||||||
((nil)
|
((nil)
|
||||||
@ -299,6 +295,8 @@ Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
|
|||||||
seq)))))))
|
seq)))))))
|
||||||
|
|
||||||
(defun reverse-strings (parse-tree)
|
(defun reverse-strings (parse-tree)
|
||||||
|
"Recursively walks through PARSE-TREE and destructively reverses all
|
||||||
|
strings in it."
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(cond ((stringp parse-tree)
|
(cond ((stringp parse-tree)
|
||||||
(nreverse parse-tree))
|
(nreverse parse-tree))
|
||||||
@ -311,13 +309,11 @@ Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
|
|||||||
(t parse-tree)))
|
(t parse-tree)))
|
||||||
|
|
||||||
(defun parse-string (string)
|
(defun parse-string (string)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Translate the regex string STRING into a parse tree."
|
"Translate the regex string STRING into a parse tree."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(let* ((lexer (make-lexer string))
|
(let* ((lexer (make-lexer string))
|
||||||
(parse-tree (reverse-strings (reg-expr lexer))))
|
(parse-tree (reverse-strings (reg-expr lexer))))
|
||||||
;; check whether we've consumed the whole regex string
|
;; check whether we've consumed the whole regex string
|
||||||
(if (end-of-string-p lexer)
|
(if (end-of-string-p lexer)
|
||||||
parse-tree
|
parse-tree
|
||||||
(signal-ppcre-syntax-error*
|
(signal-syntax-error* (lexer-pos lexer) "Expected end of string."))))
|
||||||
(lexer-pos lexer)
|
|
||||||
"Expected end of string"))))
|
|
||||||
|
|||||||
269
ppcre-tests.lisp
269
ppcre-tests.lisp
@ -1,269 +0,0 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/ppcre-tests.lisp,v 1.36 2008/06/25 14:04:28 edi Exp $
|
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
|
||||||
|
|
||||||
;;; Redistribution and use in source and binary forms, with or without
|
|
||||||
;;; modification, are permitted provided that the following conditions
|
|
||||||
;;; are met:
|
|
||||||
|
|
||||||
;;; * Redistributions of source code must retain the above copyright
|
|
||||||
;;; notice, this list of conditions and the following disclaimer.
|
|
||||||
|
|
||||||
;;; * Redistributions in binary form must reproduce the above
|
|
||||||
;;; copyright notice, this list of conditions and the following
|
|
||||||
;;; disclaimer in the documentation and/or other materials
|
|
||||||
;;; provided with the distribution.
|
|
||||||
|
|
||||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
|
||||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
||||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
||||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
||||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
|
|
||||||
(in-package #:cl-ppcre-test)
|
|
||||||
|
|
||||||
(defparameter *cl-ppcre-test-base-directory*
|
|
||||||
(make-pathname :name nil :type nil :version nil
|
|
||||||
:defaults (parse-namestring *load-truename*)))
|
|
||||||
|
|
||||||
(defun full-gc ()
|
|
||||||
"Start a full garbage collection."
|
|
||||||
;; what are the corresponding values for MCL and OpenMCL?
|
|
||||||
#+:allegro (excl:gc t)
|
|
||||||
#+(or :cmu :scl) (ext:gc :full t)
|
|
||||||
#+:ecl (si:gc t)
|
|
||||||
#+:clisp (ext:gc)
|
|
||||||
#+:cormanlisp (loop for i from 0 to 3 do (cormanlisp:gc i))
|
|
||||||
#+:lispworks4 (hcl:mark-and-sweep 3)
|
|
||||||
#+:lispworks5 (hcl:gc-generation #+:lispworks-32bit 3 #+:lispworks-64bit :blocking-gen-num)
|
|
||||||
#+:sbcl (sb-ext:gc :full t))
|
|
||||||
|
|
||||||
;; warning: ugly code ahead!!
|
|
||||||
;; this is just a quick hack for testing purposes
|
|
||||||
|
|
||||||
(defun time-regex (factor regex string
|
|
||||||
&key case-insensitive-mode
|
|
||||||
multi-line-mode
|
|
||||||
single-line-mode
|
|
||||||
extended-mode)
|
|
||||||
(declare #.ppcre::*standard-optimize-settings*)
|
|
||||||
"Auxiliary function used by TEST to benchmark a regex scanner
|
|
||||||
against Perl timings."
|
|
||||||
(declare (type string string))
|
|
||||||
(let* ((scanner (create-scanner regex
|
|
||||||
:case-insensitive-mode case-insensitive-mode
|
|
||||||
:multi-line-mode multi-line-mode
|
|
||||||
:single-line-mode single-line-mode
|
|
||||||
:extended-mode extended-mode))
|
|
||||||
;; make sure GC doesn't invalidate our benchmarking
|
|
||||||
(dummy (full-gc))
|
|
||||||
(start (get-internal-real-time)))
|
|
||||||
(declare (ignore dummy))
|
|
||||||
(dotimes (i factor)
|
|
||||||
(funcall scanner string 0 (length string)))
|
|
||||||
(float (/ (- (get-internal-real-time) start) internal-time-units-per-second))))
|
|
||||||
|
|
||||||
#+(or scl
|
|
||||||
lispworks
|
|
||||||
(and sbcl sb-thread))
|
|
||||||
(defun threaded-scan (scanner target-string &key (threads 10) (repetitions 5000))
|
|
||||||
(declare #.ppcre::*standard-optimize-settings*)
|
|
||||||
"Auxiliary function used by TEST to check whether SCANNER is thread-safe."
|
|
||||||
(full-gc)
|
|
||||||
(let ((collector (make-array threads))
|
|
||||||
(counter 0))
|
|
||||||
(loop for i below threads
|
|
||||||
do (let* ((j i)
|
|
||||||
(fn
|
|
||||||
(lambda ()
|
|
||||||
(let ((r (random repetitions)))
|
|
||||||
(loop for k below repetitions
|
|
||||||
if (= k r)
|
|
||||||
do (setf (aref collector j)
|
|
||||||
(let ((result
|
|
||||||
(multiple-value-list
|
|
||||||
(cl-ppcre:scan scanner target-string))))
|
|
||||||
(unless (cdr result)
|
|
||||||
(setq result '(nil nil #() #())))
|
|
||||||
result))
|
|
||||||
else
|
|
||||||
do (cl-ppcre:scan scanner target-string))
|
|
||||||
(incf counter)))))
|
|
||||||
#+scl (thread:thread-create fn)
|
|
||||||
#+lispworks (mp:process-run-function "" nil fn)
|
|
||||||
#+(and sbcl sb-thread) (sb-thread:make-thread fn)))
|
|
||||||
(loop while (< counter threads)
|
|
||||||
do (sleep .1))
|
|
||||||
(destructuring-bind (first-start first-end first-reg-starts first-reg-ends)
|
|
||||||
(aref collector 0)
|
|
||||||
(loop for (start end reg-starts reg-ends) across collector
|
|
||||||
if (or (not (eql first-start start))
|
|
||||||
(not (eql first-end end))
|
|
||||||
(/= (length first-reg-starts) (length reg-starts))
|
|
||||||
(/= (length first-reg-ends) (length reg-ends))
|
|
||||||
(loop for first-reg-start across first-reg-starts
|
|
||||||
for reg-start across reg-starts
|
|
||||||
thereis (not (eql first-reg-start reg-start)))
|
|
||||||
(loop for first-reg-end across first-reg-ends
|
|
||||||
for reg-end across reg-ends
|
|
||||||
thereis (not (eql first-reg-end reg-end))))
|
|
||||||
do (return (format nil "~&Inconsistent results during multi-threading"))))))
|
|
||||||
|
|
||||||
(defun create-string-from-input (input)
|
|
||||||
(cond ((or (null input)
|
|
||||||
(stringp input))
|
|
||||||
input)
|
|
||||||
(t
|
|
||||||
(cl-ppcre::string-list-to-simple-string
|
|
||||||
(loop for element in input
|
|
||||||
if (stringp element)
|
|
||||||
collect element
|
|
||||||
else
|
|
||||||
collect (string (code-char element)))))))
|
|
||||||
|
|
||||||
(defun test (&key (file-name
|
|
||||||
(make-pathname :name "testdata"
|
|
||||||
:type nil :version nil
|
|
||||||
:defaults *cl-ppcre-test-base-directory*)
|
|
||||||
file-name-provided-p)
|
|
||||||
threaded)
|
|
||||||
(declare #.ppcre::*standard-optimize-settings*)
|
|
||||||
(declare (ignorable threaded))
|
|
||||||
"Loop through all test cases in FILE-NAME and print report. Only in
|
|
||||||
LispWorks and SCL: If THREADED is true, also test whether the scanners
|
|
||||||
work multi-threaded."
|
|
||||||
(with-open-file (stream file-name
|
|
||||||
#+(or :allegro :clisp :scl :sbcl)
|
|
||||||
:external-format
|
|
||||||
#+(or :allegro :clisp :scl :sbcl)
|
|
||||||
(if file-name-provided-p
|
|
||||||
:default
|
|
||||||
#+(or :allegro :scl :sbcl) :iso-8859-1
|
|
||||||
#+:clisp charset:iso-8859-1))
|
|
||||||
(loop with testcount of-type fixnum = 0
|
|
||||||
with *regex-char-code-limit* = (if file-name-provided-p
|
|
||||||
*regex-char-code-limit*
|
|
||||||
;; the standard test suite
|
|
||||||
;; doesn't need Unicode
|
|
||||||
;; support
|
|
||||||
256)
|
|
||||||
with *allow-quoting* = (if file-name-provided-p
|
|
||||||
*allow-quoting*
|
|
||||||
t)
|
|
||||||
for input-line = (read stream nil nil)
|
|
||||||
for (counter info-string regex
|
|
||||||
case-insensitive-mode multi-line-mode
|
|
||||||
single-line-mode extended-mode
|
|
||||||
string perl-error factor
|
|
||||||
perl-time ex-result ex-subs) = input-line
|
|
||||||
while input-line
|
|
||||||
do (let ((info-string (create-string-from-input info-string))
|
|
||||||
(regex (create-string-from-input regex))
|
|
||||||
(string (create-string-from-input string))
|
|
||||||
(ex-result (create-string-from-input ex-result))
|
|
||||||
(ex-subs (mapcar #'create-string-from-input ex-subs))
|
|
||||||
(errors '()))
|
|
||||||
;; provide some visual feedback for slow CL
|
|
||||||
;; implementations; suggested by JP Massar
|
|
||||||
(incf testcount)
|
|
||||||
#+(or scl
|
|
||||||
lispworks
|
|
||||||
(and sbcl sb-thread))
|
|
||||||
(when threaded
|
|
||||||
(format t "Test #~A (ID ~A)~%" testcount counter)
|
|
||||||
(force-output))
|
|
||||||
(unless #-(or scl
|
|
||||||
lispworks
|
|
||||||
(and sbcl sb-thread))
|
|
||||||
nil
|
|
||||||
#+(or scl
|
|
||||||
lispworks
|
|
||||||
(and sbcl sb-thread))
|
|
||||||
threaded
|
|
||||||
(when (zerop (mod testcount 10))
|
|
||||||
(format t ".")
|
|
||||||
(force-output))
|
|
||||||
(when (zerop (mod testcount 100))
|
|
||||||
(terpri)))
|
|
||||||
(handler-case
|
|
||||||
(let* ((*use-bmh-matchers* (if (and (> factor 1) (plusp perl-time))
|
|
||||||
*use-bmh-matchers*
|
|
||||||
;; if we only check for
|
|
||||||
;; correctness we don't
|
|
||||||
;; care about speed that
|
|
||||||
;; match (but rather
|
|
||||||
;; about space
|
|
||||||
;; constraints of the
|
|
||||||
;; trial versions)
|
|
||||||
nil))
|
|
||||||
(scanner (create-scanner regex
|
|
||||||
:case-insensitive-mode case-insensitive-mode
|
|
||||||
:multi-line-mode multi-line-mode
|
|
||||||
:single-line-mode single-line-mode
|
|
||||||
:extended-mode extended-mode)))
|
|
||||||
(multiple-value-bind (result1 result2 sub-starts sub-ends)
|
|
||||||
(scan scanner string)
|
|
||||||
(cond (perl-error
|
|
||||||
(push (format nil
|
|
||||||
"~&expected an error but got a result")
|
|
||||||
errors))
|
|
||||||
(t
|
|
||||||
(when (not (eq result1 ex-result))
|
|
||||||
(if result1
|
|
||||||
(let ((result (subseq string result1 result2)))
|
|
||||||
(unless (string= result ex-result)
|
|
||||||
(push (format nil
|
|
||||||
"~&expected ~S but got ~S"
|
|
||||||
ex-result result)
|
|
||||||
errors))
|
|
||||||
(setq sub-starts (coerce sub-starts 'list)
|
|
||||||
sub-ends (coerce sub-ends 'list))
|
|
||||||
(loop for i from 0
|
|
||||||
for ex-sub in ex-subs
|
|
||||||
for sub-start = (nth i sub-starts)
|
|
||||||
for sub-end = (nth i sub-ends)
|
|
||||||
for sub = (if (and sub-start sub-end)
|
|
||||||
(subseq string sub-start sub-end)
|
|
||||||
nil)
|
|
||||||
unless (string= ex-sub sub)
|
|
||||||
do (push (format nil
|
|
||||||
"~&\\~A: expected ~S but got ~S"
|
|
||||||
(1+ i) ex-sub sub) errors)))
|
|
||||||
(push (format nil
|
|
||||||
"~&expected ~S but got ~S"
|
|
||||||
ex-result result1)
|
|
||||||
errors)))))
|
|
||||||
#+(or scl
|
|
||||||
lispworks
|
|
||||||
(and sbcl sb-thread))
|
|
||||||
(when threaded
|
|
||||||
(let ((thread-result (threaded-scan scanner string)))
|
|
||||||
(when thread-result
|
|
||||||
(push thread-result errors))))))
|
|
||||||
(condition (msg)
|
|
||||||
(unless perl-error
|
|
||||||
(push (format nil "~&got an unexpected error: '~A'" msg)
|
|
||||||
errors))))
|
|
||||||
(setq errors (nreverse errors))
|
|
||||||
(cond (errors
|
|
||||||
(when (or (<= factor 1) (zerop perl-time))
|
|
||||||
(format t "~&~4@A (~A):~{~& ~A~}~%"
|
|
||||||
counter info-string errors)))
|
|
||||||
((and (> factor 1) (plusp perl-time))
|
|
||||||
(let ((result (time-regex factor regex string
|
|
||||||
:case-insensitive-mode case-insensitive-mode
|
|
||||||
:multi-line-mode multi-line-mode
|
|
||||||
:single-line-mode single-line-mode
|
|
||||||
:extended-mode extended-mode)))
|
|
||||||
(format t "~&~4@A: ~,4F (~A repetitions, Perl: ~,4F seconds, CL-PPCRE: ~,4F seconds)" counter
|
|
||||||
(float (/ result perl-time)) factor perl-time result)
|
|
||||||
#+:cormanlisp (force-output *standard-output*)))
|
|
||||||
(t nil))))
|
|
||||||
(values)))
|
|
||||||
555
regex-class-util.lisp
Normal file
555
regex-class-util.lisp
Normal file
@ -0,0 +1,555 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class-util.lisp,v 1.8 2008/07/22 22:38:05 edi Exp $
|
||||||
|
|
||||||
|
;;; This file contains some utility methods for REGEX objects.
|
||||||
|
|
||||||
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;;; modification, are permitted provided that the following conditions
|
||||||
|
;;; are met:
|
||||||
|
|
||||||
|
;;; * Redistributions of source code must retain the above copyright
|
||||||
|
;;; notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
;;; * Redistributions in binary form must reproduce the above
|
||||||
|
;;; copyright notice, this list of conditions and the following
|
||||||
|
;;; disclaimer in the documentation and/or other materials
|
||||||
|
;;; provided with the distribution.
|
||||||
|
|
||||||
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||||
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
|
;;; The following four methods allow a VOID object to behave like a
|
||||||
|
;;; zero-length STR object (only readers needed)
|
||||||
|
|
||||||
|
(defmethod len ((void void))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
0)
|
||||||
|
|
||||||
|
(defmethod str ((void void))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
"")
|
||||||
|
|
||||||
|
(defmethod skip ((void void))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
nil)
|
||||||
|
|
||||||
|
(defmethod start-of-end-string-p ((void void))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
nil)
|
||||||
|
|
||||||
|
(defgeneric case-mode (regex old-case-mode)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
|
||||||
|
Returns a keyword denoting the case-(in)sensitivity of a STR or its
|
||||||
|
second argument if the STR has length 0. Returns NIL for REGEX objects
|
||||||
|
which are not of type STR."))
|
||||||
|
|
||||||
|
(defmethod case-mode ((str str) old-case-mode)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(cond ((zerop (len str))
|
||||||
|
old-case-mode)
|
||||||
|
((case-insensitive-p str)
|
||||||
|
:case-insensitive)
|
||||||
|
(t
|
||||||
|
:case-sensitive)))
|
||||||
|
|
||||||
|
(defmethod case-mode ((regex regex) old-case-mode)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (ignore old-case-mode))
|
||||||
|
nil)
|
||||||
|
|
||||||
|
(defgeneric copy-regex (regex)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(:documentation "Implements a deep copy of a REGEX object."))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((anchor anchor))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'anchor
|
||||||
|
:startp (startp anchor)
|
||||||
|
:multi-line-p (multi-line-p anchor)
|
||||||
|
:no-newline-p (no-newline-p anchor)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((everything everything))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'everything
|
||||||
|
:single-line-p (single-line-p everything)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((word-boundary word-boundary))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'word-boundary
|
||||||
|
:negatedp (negatedp word-boundary)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((void void))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'void))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((lookahead lookahead))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'lookahead
|
||||||
|
:regex (copy-regex (regex lookahead))
|
||||||
|
:positivep (positivep lookahead)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((seq seq))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'seq
|
||||||
|
:elements (mapcar #'copy-regex (elements seq))))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((alternation alternation))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'alternation
|
||||||
|
:choices (mapcar #'copy-regex (choices alternation))))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((branch branch))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(with-slots (test)
|
||||||
|
branch
|
||||||
|
(make-instance 'branch
|
||||||
|
:test (if (typep test 'regex)
|
||||||
|
(copy-regex test)
|
||||||
|
test)
|
||||||
|
:then-regex (copy-regex (then-regex branch))
|
||||||
|
:else-regex (copy-regex (else-regex branch)))))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((lookbehind lookbehind))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'lookbehind
|
||||||
|
:regex (copy-regex (regex lookbehind))
|
||||||
|
:positivep (positivep lookbehind)
|
||||||
|
:len (len lookbehind)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((repetition repetition))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'repetition
|
||||||
|
:regex (copy-regex (regex repetition))
|
||||||
|
:greedyp (greedyp repetition)
|
||||||
|
:minimum (minimum repetition)
|
||||||
|
:maximum (maximum repetition)
|
||||||
|
:min-len (min-len repetition)
|
||||||
|
:len (len repetition)
|
||||||
|
:contains-register-p (contains-register-p repetition)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((register register))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'register
|
||||||
|
:regex (copy-regex (regex register))
|
||||||
|
:num (num register)
|
||||||
|
:name (name register)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((standalone standalone))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'standalone
|
||||||
|
:regex (copy-regex (regex standalone))))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((back-reference back-reference))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'back-reference
|
||||||
|
:num (num back-reference)
|
||||||
|
:case-insensitive-p (case-insensitive-p back-reference)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((char-class char-class))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'char-class
|
||||||
|
:test-function (test-function char-class)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((str str))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'str
|
||||||
|
:str (str str)
|
||||||
|
:case-insensitive-p (case-insensitive-p str)))
|
||||||
|
|
||||||
|
(defmethod copy-regex ((filter filter))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'filter
|
||||||
|
:fn (fn filter)
|
||||||
|
:len (len filter)))
|
||||||
|
|
||||||
|
;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
|
||||||
|
;;; wrapped into one function. Maybe in the next release...
|
||||||
|
|
||||||
|
;;; Further note that this function is used by CONVERT to factor out
|
||||||
|
;;; complicated repetitions, i.e. cases like
|
||||||
|
;;; (a)* -> (?:a*(a))?
|
||||||
|
;;; This won't work for, say,
|
||||||
|
;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
|
||||||
|
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
|
||||||
|
|
||||||
|
(defgeneric remove-registers (regex)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
|
||||||
|
optionally removes embedded REGISTER objects if possible and if the
|
||||||
|
special variable REMOVE-REGISTERS-P is true."))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((register register))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (special remove-registers-p reg-seen))
|
||||||
|
(cond (remove-registers-p
|
||||||
|
(remove-registers (regex register)))
|
||||||
|
(t
|
||||||
|
;; mark REG-SEEN as true so enclosing REPETITION objects
|
||||||
|
;; (see method below) know if they contain a register or not
|
||||||
|
(setq reg-seen t)
|
||||||
|
(copy-regex register))))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((repetition repetition))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(let* (reg-seen
|
||||||
|
(inner-regex (remove-registers (regex repetition))))
|
||||||
|
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
|
||||||
|
;; (REGEX REPETITION) contains a REGISTER
|
||||||
|
(declare (special reg-seen))
|
||||||
|
(make-instance 'repetition
|
||||||
|
:regex inner-regex
|
||||||
|
:greedyp (greedyp repetition)
|
||||||
|
:minimum (minimum repetition)
|
||||||
|
:maximum (maximum repetition)
|
||||||
|
:min-len (min-len repetition)
|
||||||
|
:len (len repetition)
|
||||||
|
:contains-register-p reg-seen)))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((standalone standalone))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'standalone
|
||||||
|
:regex (remove-registers (regex standalone))))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((lookahead lookahead))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'lookahead
|
||||||
|
:regex (remove-registers (regex lookahead))
|
||||||
|
:positivep (positivep lookahead)))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((lookbehind lookbehind))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'lookbehind
|
||||||
|
:regex (remove-registers (regex lookbehind))
|
||||||
|
:positivep (positivep lookbehind)
|
||||||
|
:len (len lookbehind)))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((branch branch))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(with-slots (test)
|
||||||
|
branch
|
||||||
|
(make-instance 'branch
|
||||||
|
:test (if (typep test 'regex)
|
||||||
|
(remove-registers test)
|
||||||
|
test)
|
||||||
|
:then-regex (remove-registers (then-regex branch))
|
||||||
|
:else-regex (remove-registers (else-regex branch)))))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((alternation alternation))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (special remove-registers-p))
|
||||||
|
;; an ALTERNATION, so we can't remove REGISTER objects further down
|
||||||
|
(setq remove-registers-p nil)
|
||||||
|
(copy-regex alternation))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((regex regex))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(copy-regex regex))
|
||||||
|
|
||||||
|
(defmethod remove-registers ((seq seq))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(make-instance 'seq
|
||||||
|
:elements (mapcar #'remove-registers (elements seq))))
|
||||||
|
|
||||||
|
(defgeneric everythingp (regex)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(:documentation "Returns an EVERYTHING object if REGEX is equivalent
|
||||||
|
to this object, otherwise NIL. So, \"(.){1}\" would return true
|
||||||
|
\(i.e. the object corresponding to \".\", for example."))
|
||||||
|
|
||||||
|
(defmethod everythingp ((seq seq))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
|
||||||
|
;; due to the parsing process
|
||||||
|
(let ((cleaned-elements (remove-if #'(lambda (element)
|
||||||
|
(typep element 'void))
|
||||||
|
(elements seq))))
|
||||||
|
(and (= 1 (length cleaned-elements))
|
||||||
|
(everythingp (first cleaned-elements)))))
|
||||||
|
|
||||||
|
(defmethod everythingp ((alternation alternation))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(with-slots (choices)
|
||||||
|
alternation
|
||||||
|
(and (= 1 (length choices))
|
||||||
|
;; this is unlikely to happen for human-generated regexes,
|
||||||
|
;; but machine-generated ones might look like this
|
||||||
|
(everythingp (first choices)))))
|
||||||
|
|
||||||
|
(defmethod everythingp ((repetition repetition))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(with-slots (maximum minimum regex)
|
||||||
|
repetition
|
||||||
|
(and maximum
|
||||||
|
(= 1 minimum maximum)
|
||||||
|
;; treat "<regex>{1,1}" like "<regex>"
|
||||||
|
(everythingp regex))))
|
||||||
|
|
||||||
|
(defmethod everythingp ((register register))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(everythingp (regex register)))
|
||||||
|
|
||||||
|
(defmethod everythingp ((standalone standalone))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(everythingp (regex standalone)))
|
||||||
|
|
||||||
|
(defmethod everythingp ((everything everything))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
everything)
|
||||||
|
|
||||||
|
(defmethod everythingp ((regex regex))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
|
||||||
|
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
|
||||||
|
nil)
|
||||||
|
|
||||||
|
(defgeneric regex-length (regex)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
|
||||||
|
|
||||||
|
(defmethod regex-length ((seq seq))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; simply add all inner lengths unless one of them is NIL
|
||||||
|
(loop for sub-regex in (elements seq)
|
||||||
|
for len = (regex-length sub-regex)
|
||||||
|
if (not len) do (return nil)
|
||||||
|
sum len))
|
||||||
|
|
||||||
|
(defmethod regex-length ((alternation alternation))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; only return a true value if all inner lengths are non-NIL and
|
||||||
|
;; mutually equal
|
||||||
|
(loop for sub-regex in (choices alternation)
|
||||||
|
for old-len = nil then len
|
||||||
|
for len = (regex-length sub-regex)
|
||||||
|
if (or (not len)
|
||||||
|
(and old-len (/= len old-len))) do (return nil)
|
||||||
|
finally (return len)))
|
||||||
|
|
||||||
|
(defmethod regex-length ((branch branch))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; only return a true value if both alternations have a length and
|
||||||
|
;; if they're equal
|
||||||
|
(let ((then-length (regex-length (then-regex branch))))
|
||||||
|
(and then-length
|
||||||
|
(eql then-length (regex-length (else-regex branch)))
|
||||||
|
then-length)))
|
||||||
|
|
||||||
|
(defmethod regex-length ((repetition repetition))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; we can only compute the length of a REPETITION object if the
|
||||||
|
;; number of repetitions is fixed; note that we don't call
|
||||||
|
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
|
||||||
|
;; always set correctly
|
||||||
|
(with-slots (len minimum maximum)
|
||||||
|
repetition
|
||||||
|
(if (and len
|
||||||
|
(eql minimum maximum))
|
||||||
|
(* minimum len)
|
||||||
|
nil)))
|
||||||
|
|
||||||
|
(defmethod regex-length ((register register))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(regex-length (regex register)))
|
||||||
|
|
||||||
|
(defmethod regex-length ((standalone standalone))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(regex-length (regex standalone)))
|
||||||
|
|
||||||
|
(defmethod regex-length ((back-reference back-reference))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; with enough effort we could possibly do better here, but
|
||||||
|
;; currently we just give up and return NIL
|
||||||
|
nil)
|
||||||
|
|
||||||
|
(defmethod regex-length ((char-class char-class))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
1)
|
||||||
|
|
||||||
|
(defmethod regex-length ((everything everything))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
1)
|
||||||
|
|
||||||
|
(defmethod regex-length ((str str))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(len str))
|
||||||
|
|
||||||
|
(defmethod regex-length ((filter filter))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(len filter))
|
||||||
|
|
||||||
|
(defmethod regex-length ((regex regex))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
||||||
|
;; WORD-BOUNDARY (which all have zero-length)
|
||||||
|
0)
|
||||||
|
|
||||||
|
(defgeneric regex-min-length (regex)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(:documentation "Returns the minimal length of REGEX."))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((seq seq))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; simply add all inner minimal lengths
|
||||||
|
(loop for sub-regex in (elements seq)
|
||||||
|
for len = (regex-min-length sub-regex)
|
||||||
|
sum len))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((alternation alternation))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; minimal length of an alternation is the minimal length of the
|
||||||
|
;; "shortest" element
|
||||||
|
(loop for sub-regex in (choices alternation)
|
||||||
|
for len = (regex-min-length sub-regex)
|
||||||
|
minimize len))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((branch branch))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; minimal length of both alternations
|
||||||
|
(min (regex-min-length (then-regex branch))
|
||||||
|
(regex-min-length (else-regex branch))))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((repetition repetition))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; obviously the product of the inner minimal length and the minimal
|
||||||
|
;; number of repetitions
|
||||||
|
(* (minimum repetition) (min-len repetition)))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((register register))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(regex-min-length (regex register)))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((standalone standalone))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(regex-min-length (regex standalone)))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((char-class char-class))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
1)
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((everything everything))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
1)
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((str str))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(len str))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((filter filter))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(or (len filter)
|
||||||
|
0))
|
||||||
|
|
||||||
|
(defmethod regex-min-length ((regex regex))
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
|
||||||
|
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
|
||||||
|
0)
|
||||||
|
|
||||||
|
(defgeneric compute-offsets (regex start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(:documentation "Returns the offset the following regex would have
|
||||||
|
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
|
||||||
|
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
|
||||||
|
slots of STR objects further down the tree."))
|
||||||
|
|
||||||
|
;; note that we're actually only interested in the offset of
|
||||||
|
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
|
||||||
|
;; can stop at variable-length alternations and don't need to descend
|
||||||
|
;; into repetitions
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((seq seq) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(loop for element in (elements seq)
|
||||||
|
;; advance offset argument for next call while looping through
|
||||||
|
;; the elements
|
||||||
|
for pos = start-pos then curr-offset
|
||||||
|
for curr-offset = (compute-offsets element pos)
|
||||||
|
while curr-offset
|
||||||
|
finally (return curr-offset)))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((alternation alternation) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(loop for choice in (choices alternation)
|
||||||
|
for old-offset = nil then curr-offset
|
||||||
|
for curr-offset = (compute-offsets choice start-pos)
|
||||||
|
;; we stop immediately if two alternations don't result in the
|
||||||
|
;; same offset
|
||||||
|
if (or (not curr-offset)
|
||||||
|
(and old-offset (/= curr-offset old-offset)))
|
||||||
|
do (return nil)
|
||||||
|
finally (return curr-offset)))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((branch branch) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; only return offset if both alternations have equal value
|
||||||
|
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
|
||||||
|
(and then-offset
|
||||||
|
(eql then-offset (compute-offsets (else-regex branch) start-pos))
|
||||||
|
then-offset)))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((repetition repetition) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; no need to descend into the inner regex
|
||||||
|
(with-slots (len minimum maximum)
|
||||||
|
repetition
|
||||||
|
(if (and len
|
||||||
|
(eq minimum maximum))
|
||||||
|
;; fixed number of repetitions, so we know how to proceed
|
||||||
|
(+ start-pos (* minimum len))
|
||||||
|
;; otherwise return NIL
|
||||||
|
nil)))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((register register) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(compute-offsets (regex register) start-pos))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((standalone standalone) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(compute-offsets (regex standalone) start-pos))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((char-class char-class) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(1+ start-pos))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((everything everything) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(1+ start-pos))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((str str) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(setf (offset str) start-pos)
|
||||||
|
(+ start-pos (len str)))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((back-reference back-reference) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; with enough effort we could possibly do better here, but
|
||||||
|
;; currently we just give up and return NIL
|
||||||
|
(declare (ignore start-pos))
|
||||||
|
nil)
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((filter filter) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(let ((len (len filter)))
|
||||||
|
(if len
|
||||||
|
(+ start-pos len)
|
||||||
|
nil)))
|
||||||
|
|
||||||
|
(defmethod compute-offsets ((regex regex) start-pos)
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
||||||
|
;; WORD-BOUNDARY (which all have zero-length)
|
||||||
|
start-pos)
|
||||||
912
regex-class.lisp
912
regex-class.lisp
@ -1,9 +1,8 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.34 2008/07/03 07:44:06 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.42 2008/07/22 22:38:05 edi Exp $
|
||||||
|
|
||||||
;;; This file defines the REGEX class and some utility methods for
|
;;; This file defines the REGEX class. REGEX objects are used to
|
||||||
;;; this class. REGEX objects are used to represent the (transformed)
|
;;; represent the (transformed) parse trees internally
|
||||||
;;; parse trees internally
|
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
@ -31,244 +30,218 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
;; Genera need the eval-when, here, or the types created by the class
|
(defclass regex ()
|
||||||
;; definitions aren't seen by the typep calls later in the file.
|
()
|
||||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
(:documentation "The REGEX base class. All other classes inherit
|
||||||
(locally
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(defclass regex ()
|
|
||||||
()
|
|
||||||
(:documentation "The REGEX base class. All other classes inherit
|
|
||||||
from this one."))
|
from this one."))
|
||||||
|
|
||||||
|
(defclass seq (regex)
|
||||||
|
((elements :initarg :elements
|
||||||
|
:accessor elements
|
||||||
|
:type cons
|
||||||
|
:documentation "A list of REGEX objects."))
|
||||||
|
(:documentation "SEQ objects represents sequences of regexes.
|
||||||
|
\(Like \"ab\" is the sequence of \"a\" and \"b\".)"))
|
||||||
|
|
||||||
(defclass seq (regex)
|
(defclass alternation (regex)
|
||||||
((elements :initarg :elements
|
((choices :initarg :choices
|
||||||
:accessor elements
|
:accessor choices
|
||||||
:type cons
|
:type cons
|
||||||
:documentation "A list of REGEX objects."))
|
:documentation "A list of REGEX objects"))
|
||||||
(:documentation "SEQ objects represents sequences of
|
(:documentation "ALTERNATION objects represent alternations of
|
||||||
regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)"))
|
regexes. \(Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))
|
||||||
|
|
||||||
(defclass alternation (regex)
|
(defclass lookahead (regex)
|
||||||
((choices :initarg :choices
|
((regex :initarg :regex
|
||||||
:accessor choices
|
:accessor regex
|
||||||
:type cons
|
:documentation "The REGEX object we're checking.")
|
||||||
:documentation "A list of REGEX objects"))
|
(positivep :initarg :positivep
|
||||||
(:documentation "ALTERNATION objects represent alternations of
|
:reader positivep
|
||||||
regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))
|
:documentation "Whether this assertion is positive."))
|
||||||
|
(:documentation "LOOKAHEAD objects represent look-ahead assertions."))
|
||||||
|
|
||||||
(defclass lookahead (regex)
|
(defclass lookbehind (regex)
|
||||||
((regex :initarg :regex
|
((regex :initarg :regex
|
||||||
:accessor regex
|
:accessor regex
|
||||||
:documentation "The REGEX object we're checking.")
|
:documentation "The REGEX object we're checking.")
|
||||||
(positivep :initarg :positivep
|
(positivep :initarg :positivep
|
||||||
:reader positivep
|
:reader positivep
|
||||||
:documentation "Whether this assertion is positive."))
|
:documentation "Whether this assertion is positive.")
|
||||||
(:documentation "LOOKAHEAD objects represent look-ahead assertions."))
|
(len :initarg :len
|
||||||
|
:accessor len
|
||||||
|
:type fixnum
|
||||||
|
:documentation "The \(fixed) length of the enclosed regex."))
|
||||||
|
(:documentation "LOOKBEHIND objects represent look-behind assertions."))
|
||||||
|
|
||||||
(defclass lookbehind (regex)
|
(defclass repetition (regex)
|
||||||
((regex :initarg :regex
|
((regex :initarg :regex
|
||||||
:accessor regex
|
:accessor regex
|
||||||
:documentation "The REGEX object we're checking.")
|
:documentation "The REGEX that's repeated.")
|
||||||
(positivep :initarg :positivep
|
(greedyp :initarg :greedyp
|
||||||
:reader positivep
|
:reader greedyp
|
||||||
:documentation "Whether this assertion is positive.")
|
:documentation "Whether the repetition is greedy.")
|
||||||
(len :initarg :len
|
(minimum :initarg :minimum
|
||||||
:accessor len
|
:accessor minimum
|
||||||
:type fixnum
|
:type fixnum
|
||||||
:documentation "The (fixed) length of the enclosed regex."))
|
:documentation "The minimal number of repetitions.")
|
||||||
(:documentation "LOOKBEHIND objects represent look-behind assertions."))
|
(maximum :initarg :maximum
|
||||||
|
:accessor maximum
|
||||||
(defclass repetition (regex)
|
:documentation "The maximal number of repetitions.
|
||||||
((regex :initarg :regex
|
|
||||||
:accessor regex
|
|
||||||
:documentation "The REGEX that's repeated.")
|
|
||||||
(greedyp :initarg :greedyp
|
|
||||||
:reader greedyp
|
|
||||||
:documentation "Whether the repetition is greedy.")
|
|
||||||
(minimum :initarg :minimum
|
|
||||||
:accessor minimum
|
|
||||||
:type fixnum
|
|
||||||
:documentation "The minimal number of repetitions.")
|
|
||||||
(maximum :initarg :maximum
|
|
||||||
:accessor maximum
|
|
||||||
:documentation "The maximal number of repetitions.
|
|
||||||
Can be NIL for unbounded.")
|
Can be NIL for unbounded.")
|
||||||
(min-len :initarg :min-len
|
(min-len :initarg :min-len
|
||||||
:reader min-len
|
:reader min-len
|
||||||
:documentation "The minimal length of the enclosed regex.")
|
:documentation "The minimal length of the enclosed regex.")
|
||||||
(len :initarg :len
|
(len :initarg :len
|
||||||
:reader len
|
:reader len
|
||||||
:documentation "The length of the enclosed regex. NIL
|
:documentation "The length of the enclosed regex. NIL if
|
||||||
if unknown.")
|
unknown.")
|
||||||
(min-rest :initform 0
|
(min-rest :initform 0
|
||||||
:accessor min-rest
|
:accessor min-rest
|
||||||
:type fixnum
|
:type fixnum
|
||||||
:documentation "The minimal number of characters which must
|
:documentation "The minimal number of characters which
|
||||||
appear after this repetition.")
|
must appear after this repetition.")
|
||||||
(contains-register-p :initarg :contains-register-p
|
(contains-register-p :initarg :contains-register-p
|
||||||
:reader contains-register-p
|
:reader contains-register-p
|
||||||
:documentation "If the regex contains a register."))
|
:documentation "Whether the regex contains a
|
||||||
(:documentation "REPETITION objects represent repetitions of regexes."))
|
register."))
|
||||||
|
(:documentation "REPETITION objects represent repetitions of regexes."))
|
||||||
|
|
||||||
(defclass register (regex)
|
(defclass register (regex)
|
||||||
((regex :initarg :regex
|
((regex :initarg :regex
|
||||||
:accessor regex
|
:accessor regex
|
||||||
:documentation "The inner regex.")
|
:documentation "The inner regex.")
|
||||||
(num :initarg :num
|
(num :initarg :num
|
||||||
:reader num
|
:reader num
|
||||||
:type fixnum
|
:type fixnum
|
||||||
:documentation "The number of this register, starting from 0.
|
:documentation "The number of this register, starting from 0.
|
||||||
This is the index into *REGS-START* and *REGS-END*.")
|
This is the index into *REGS-START* and *REGS-END*.")
|
||||||
(name :initarg :name
|
(name :initarg :name
|
||||||
:reader name
|
:reader name
|
||||||
:documentation "Name of this register or NIL."))
|
:documentation "Name of this register or NIL."))
|
||||||
(:documentation "REGISTER objects represent register groups."))
|
(:documentation "REGISTER objects represent register groups."))
|
||||||
|
|
||||||
(defclass standalone (regex)
|
(defclass standalone (regex)
|
||||||
((regex :initarg :regex
|
((regex :initarg :regex
|
||||||
:accessor regex
|
:accessor regex
|
||||||
:documentation "The inner regex."))
|
:documentation "The inner regex."))
|
||||||
(:documentation "A standalone regular expression."))
|
(:documentation "A standalone regular expression."))
|
||||||
|
|
||||||
(defclass back-reference (regex)
|
(defclass back-reference (regex)
|
||||||
((num :initarg :num
|
((num :initarg :num
|
||||||
:accessor num
|
:accessor num
|
||||||
:type fixnum
|
:type fixnum
|
||||||
:documentation "The number of the register this
|
:documentation "The number of the register this
|
||||||
reference refers to.")
|
reference refers to.")
|
||||||
(name :initarg :name
|
(name :initarg :name
|
||||||
:accessor name
|
:accessor name
|
||||||
:documentation "The name of the register this
|
:documentation "The name of the register this
|
||||||
reference refers to or NIL.")
|
reference refers to or NIL.")
|
||||||
(case-insensitive-p :initarg :case-insensitive-p
|
(case-insensitive-p :initarg :case-insensitive-p
|
||||||
:reader case-insensitive-p
|
:reader case-insensitive-p
|
||||||
:documentation "Whether we check
|
:documentation "Whether we check
|
||||||
case-insensitively."))
|
case-insensitively."))
|
||||||
(:documentation "BACK-REFERENCE objects represent backreferences."))
|
(:documentation "BACK-REFERENCE objects represent backreferences."))
|
||||||
(defclass char-class (regex)
|
|
||||||
((charset :initarg :charset
|
|
||||||
:reader charset
|
|
||||||
:type (or charset null)
|
|
||||||
:documentation "A charset denoting the characters
|
|
||||||
in the character class.")
|
|
||||||
(case-insensitive-p :initarg :case-insensitive-p
|
|
||||||
:reader case-insensitive-p
|
|
||||||
:documentation "If the char class
|
|
||||||
case-insensitive.")
|
|
||||||
(invertedp :initarg :invertedp
|
|
||||||
:reader invertedp
|
|
||||||
:documentation "Whether we mean the inverse of
|
|
||||||
the char class.")
|
|
||||||
(word-char-class-p :initarg :word-char-class-p
|
|
||||||
:reader word-char-class-p
|
|
||||||
:documentation "Whether this CHAR CLASS
|
|
||||||
represents the special class WORD-CHAR-CLASS."))
|
|
||||||
(:documentation "CHAR-CLASS objects represent character classes."))
|
|
||||||
|
|
||||||
(defclass str (regex)
|
(defclass char-class (regex)
|
||||||
((str :initarg :str
|
((test-function :initarg :test-function
|
||||||
:accessor str
|
:reader test-function
|
||||||
:type string
|
:type (or function symbol nil)
|
||||||
:documentation "The actual string.")
|
:documentation "A unary function \(accepting a
|
||||||
(len :initform 0
|
character) which stands in for the character class and does the work
|
||||||
:accessor len
|
of checking whether a character belongs to the class."))
|
||||||
:type fixnum
|
(:documentation "CHAR-CLASS objects represent character classes."))
|
||||||
:documentation "The length of the string.")
|
|
||||||
(case-insensitive-p :initarg :case-insensitive-p
|
(defclass str (regex)
|
||||||
:reader case-insensitive-p
|
((str :initarg :str
|
||||||
:documentation "If we match case-insensitively.")
|
:accessor str
|
||||||
(offset :initform nil
|
:type string
|
||||||
:accessor offset
|
:documentation "The actual string.")
|
||||||
:documentation "Offset from the left of the whole
|
(len :initform 0
|
||||||
|
:accessor len
|
||||||
|
:type fixnum
|
||||||
|
:documentation "The length of the string.")
|
||||||
|
(case-insensitive-p :initarg :case-insensitive-p
|
||||||
|
:reader case-insensitive-p
|
||||||
|
:documentation "If we match case-insensitively.")
|
||||||
|
(offset :initform nil
|
||||||
|
:accessor offset
|
||||||
|
:documentation "Offset from the left of the whole
|
||||||
parse tree. The first regex has offset 0. NIL if unknown, i.e. behind
|
parse tree. The first regex has offset 0. NIL if unknown, i.e. behind
|
||||||
a variable-length regex.")
|
a variable-length regex.")
|
||||||
(skip :initform nil
|
(skip :initform nil
|
||||||
:initarg :skip
|
:initarg :skip
|
||||||
:accessor skip
|
:accessor skip
|
||||||
:documentation "If we can avoid testing for this
|
:documentation "If we can avoid testing for this
|
||||||
string because the SCAN function has done this already.")
|
string because the SCAN function has done this already.")
|
||||||
(start-of-end-string-p :initform nil
|
(start-of-end-string-p :initform nil
|
||||||
:accessor start-of-end-string-p
|
:accessor start-of-end-string-p
|
||||||
:documentation "If this is the unique
|
:documentation "If this is the unique
|
||||||
STR which starts END-STRING (a slot of MATCHER)."))
|
STR which starts END-STRING (a slot of MATCHER)."))
|
||||||
(:documentation "STR objects represent string."))
|
(:documentation "STR objects represent string."))
|
||||||
|
|
||||||
(defclass anchor (regex)
|
(defclass anchor (regex)
|
||||||
((startp :initarg :startp
|
((startp :initarg :startp
|
||||||
:reader startp
|
:reader startp
|
||||||
:documentation "Whether this is a \"start anchor\".")
|
:documentation "Whether this is a \"start anchor\".")
|
||||||
(multi-line-p :initarg :multi-line-p
|
(multi-line-p :initarg :multi-line-p
|
||||||
:reader multi-line-p
|
:initform nil
|
||||||
:documentation "Whether we're in multi-line mode,
|
:reader multi-line-p
|
||||||
|
:documentation "Whether we're in multi-line mode,
|
||||||
i.e. whether each #\\Newline is surrounded by anchors.")
|
i.e. whether each #\\Newline is surrounded by anchors.")
|
||||||
(no-newline-p :initarg :no-newline-p
|
(no-newline-p :initarg :no-newline-p
|
||||||
:reader no-newline-p
|
:initform nil
|
||||||
:documentation "Whether we ignore #\\Newline at the end."))
|
:reader no-newline-p
|
||||||
(:documentation "ANCHOR objects represent anchors like \"^\" or \"$\"."))
|
:documentation "Whether we ignore #\\Newline at the end."))
|
||||||
|
(:documentation "ANCHOR objects represent anchors like \"^\" or \"$\"."))
|
||||||
|
|
||||||
(defclass everything (regex)
|
(defclass everything (regex)
|
||||||
((single-line-p :initarg :single-line-p
|
((single-line-p :initarg :single-line-p
|
||||||
:reader single-line-p
|
:reader single-line-p
|
||||||
:documentation "Whether we're in single-line mode,
|
:documentation "Whether we're in single-line mode,
|
||||||
i.e. whether we also match #\\Newline."))
|
i.e. whether we also match #\\Newline."))
|
||||||
(:documentation "EVERYTHING objects represent regexes matching
|
(:documentation "EVERYTHING objects represent regexes matching
|
||||||
\"everything\", i.e. dots."))
|
\"everything\", i.e. dots."))
|
||||||
|
|
||||||
(defclass word-boundary (regex)
|
(defclass word-boundary (regex)
|
||||||
((negatedp :initarg :negatedp
|
((negatedp :initarg :negatedp
|
||||||
:reader negatedp
|
:reader negatedp
|
||||||
:documentation "Whether we mean the opposite,
|
:documentation "Whether we mean the opposite,
|
||||||
i.e. no word-boundary."))
|
i.e. no word-boundary."))
|
||||||
(:documentation "WORD-BOUNDARY objects represent word-boundary assertions."))
|
(:documentation "WORD-BOUNDARY objects represent word-boundary assertions."))
|
||||||
|
|
||||||
(defclass branch (regex)
|
(defclass branch (regex)
|
||||||
((test :initarg :test
|
((test :initarg :test
|
||||||
:accessor test
|
:accessor test
|
||||||
:documentation "The test of this branch, one of
|
:documentation "The test of this branch, one of
|
||||||
LOOKAHEAD, LOOKBEHIND, or a number.")
|
LOOKAHEAD, LOOKBEHIND, or a number.")
|
||||||
(then-regex :initarg :then-regex
|
(then-regex :initarg :then-regex
|
||||||
:accessor then-regex
|
:accessor then-regex
|
||||||
:documentation "The regex that's to be matched if the
|
:documentation "The regex that's to be matched if the
|
||||||
test succeeds.")
|
test succeeds.")
|
||||||
(else-regex :initarg :else-regex
|
(else-regex :initarg :else-regex
|
||||||
:initform (make-instance 'void)
|
:initform (make-instance 'void)
|
||||||
:accessor else-regex
|
:accessor else-regex
|
||||||
:documentation "The regex that's to be matched if the
|
:documentation "The regex that's to be matched if the
|
||||||
test fails."))
|
test fails."))
|
||||||
(:documentation "BRANCH objects represent Perl's conditional regular
|
(:documentation "BRANCH objects represent Perl's conditional regular
|
||||||
expressions."))
|
expressions."))
|
||||||
|
|
||||||
(defclass filter (regex)
|
(defclass filter (regex)
|
||||||
((fn :initarg :fn
|
((fn :initarg :fn
|
||||||
:accessor fn
|
:accessor fn
|
||||||
:type (or function symbol)
|
:type (or function symbol)
|
||||||
:documentation "The user-defined function.")
|
:documentation "The user-defined function.")
|
||||||
(len :initarg :len
|
(len :initarg :len
|
||||||
:reader len
|
:reader len
|
||||||
:documentation "The fixed length of this filter or NIL."))
|
:documentation "The fixed length of this filter or NIL."))
|
||||||
(:documentation "FILTER objects represent arbitrary functions
|
(:documentation "FILTER objects represent arbitrary functions
|
||||||
defined by the user."))
|
defined by the user."))
|
||||||
|
|
||||||
(defclass void (regex)
|
(defclass void (regex)
|
||||||
()
|
()
|
||||||
(:documentation "VOID objects represent empty regular expressions."))))
|
(:documentation "VOID objects represent empty regular expressions."))
|
||||||
|
|
||||||
(defmethod initialize-instance :after ((char-class char-class) &rest init-args)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Make large charsets smaller, if possible."
|
|
||||||
(let ((set (getf init-args :charset)))
|
|
||||||
(when (and set
|
|
||||||
(> *regex-char-code-limit* 256)
|
|
||||||
(> (charset-count set)
|
|
||||||
(/ *regex-char-code-limit* 2)))
|
|
||||||
(setf (slot-value char-class 'set)
|
|
||||||
(merge-set (make-charset) set)
|
|
||||||
(slot-value char-class 'invertedp)
|
|
||||||
(not (slot-value char-class 'invertedp))))))
|
|
||||||
|
|
||||||
(defmethod initialize-instance :after ((str str) &rest init-args)
|
(defmethod initialize-instance :after ((str str) &rest init-args)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -279,528 +252,3 @@ defined by the user."))
|
|||||||
(setf (slot-value str 'str) (coerce str-slot 'simple-string))))
|
(setf (slot-value str 'str) (coerce str-slot 'simple-string))))
|
||||||
(setf (len str) (length (str str))))
|
(setf (len str) (length (str str))))
|
||||||
|
|
||||||
;;; The following four methods allow a VOID object to behave like a
|
|
||||||
;;; zero-length STR object (only readers needed)
|
|
||||||
|
|
||||||
(defmethod len ((void void))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
0)
|
|
||||||
|
|
||||||
(defmethod str ((void void))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"")
|
|
||||||
|
|
||||||
(defmethod skip ((void void))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
nil)
|
|
||||||
|
|
||||||
(defmethod start-of-end-string-p ((void void))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
nil)
|
|
||||||
|
|
||||||
(defgeneric case-mode (regex old-case-mode)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
|
|
||||||
Returns a keyword denoting the case-(in)sensitivity of a STR or its
|
|
||||||
second argument if the STR has length 0. Returns NIL for REGEX objects
|
|
||||||
which are not of type STR."))
|
|
||||||
|
|
||||||
(defmethod case-mode ((str str) old-case-mode)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(cond ((zerop (len str))
|
|
||||||
old-case-mode)
|
|
||||||
((case-insensitive-p str)
|
|
||||||
:case-insensitive)
|
|
||||||
(t
|
|
||||||
:case-sensitive)))
|
|
||||||
|
|
||||||
(defmethod case-mode ((regex regex) old-case-mode)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(declare (ignore old-case-mode))
|
|
||||||
nil)
|
|
||||||
|
|
||||||
(defgeneric copy-regex (regex)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(:documentation "Implements a deep copy of a REGEX object."))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((anchor anchor))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'anchor
|
|
||||||
:startp (startp anchor)
|
|
||||||
:multi-line-p (multi-line-p anchor)
|
|
||||||
:no-newline-p (no-newline-p anchor)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((everything everything))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'everything
|
|
||||||
:single-line-p (single-line-p everything)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((word-boundary word-boundary))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'word-boundary
|
|
||||||
:negatedp (negatedp word-boundary)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((void void))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'void))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((lookahead lookahead))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'lookahead
|
|
||||||
:regex (copy-regex (regex lookahead))
|
|
||||||
:positivep (positivep lookahead)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((seq seq))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'seq
|
|
||||||
:elements (mapcar #'copy-regex (elements seq))))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((alternation alternation))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'alternation
|
|
||||||
:choices (mapcar #'copy-regex (choices alternation))))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((branch branch))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(with-slots (test)
|
|
||||||
branch
|
|
||||||
(make-instance 'branch
|
|
||||||
:test (if (typep test 'regex)
|
|
||||||
(copy-regex test)
|
|
||||||
test)
|
|
||||||
:then-regex (copy-regex (then-regex branch))
|
|
||||||
:else-regex (copy-regex (else-regex branch)))))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((lookbehind lookbehind))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'lookbehind
|
|
||||||
:regex (copy-regex (regex lookbehind))
|
|
||||||
:positivep (positivep lookbehind)
|
|
||||||
:len (len lookbehind)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((repetition repetition))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'repetition
|
|
||||||
:regex (copy-regex (regex repetition))
|
|
||||||
:greedyp (greedyp repetition)
|
|
||||||
:minimum (minimum repetition)
|
|
||||||
:maximum (maximum repetition)
|
|
||||||
:min-len (min-len repetition)
|
|
||||||
:len (len repetition)
|
|
||||||
:contains-register-p (contains-register-p repetition)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((register register))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'register
|
|
||||||
:regex (copy-regex (regex register))
|
|
||||||
:num (num register)
|
|
||||||
:name (name register)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((standalone standalone))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'standalone
|
|
||||||
:regex (copy-regex (regex standalone))))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((back-reference back-reference))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'back-reference
|
|
||||||
:num (num back-reference)
|
|
||||||
:case-insensitive-p (case-insensitive-p back-reference)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((char-class char-class))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'char-class
|
|
||||||
:charset (charset char-class)
|
|
||||||
:case-insensitive-p (case-insensitive-p char-class)
|
|
||||||
:invertedp (invertedp char-class)
|
|
||||||
:word-char-class-p (word-char-class-p char-class)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((str str))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'str
|
|
||||||
:str (str str)
|
|
||||||
:case-insensitive-p (case-insensitive-p str)))
|
|
||||||
|
|
||||||
(defmethod copy-regex ((filter filter))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'filter
|
|
||||||
:fn (fn filter)
|
|
||||||
:len (len filter)))
|
|
||||||
|
|
||||||
;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
|
|
||||||
;;; wrapped into one function. Maybe in the next release...
|
|
||||||
|
|
||||||
;;; Further note that this function is used by CONVERT to factor out
|
|
||||||
;;; complicated repetitions, i.e. cases like
|
|
||||||
;;; (a)* -> (?:a*(a))?
|
|
||||||
;;; This won't work for, say,
|
|
||||||
;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
|
|
||||||
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
|
|
||||||
|
|
||||||
(defgeneric remove-registers (regex)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
|
|
||||||
optionally removes embedded REGISTER objects if possible and if the
|
|
||||||
special variable REMOVE-REGISTERS-P is true."))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((register register))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(declare (special remove-registers-p reg-seen))
|
|
||||||
(cond (remove-registers-p
|
|
||||||
(remove-registers (regex register)))
|
|
||||||
(t
|
|
||||||
;; mark REG-SEEN as true so enclosing REPETITION objects
|
|
||||||
;; (see method below) know if they contain a register or not
|
|
||||||
(setq reg-seen t)
|
|
||||||
(copy-regex register))))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((repetition repetition))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(let* (reg-seen
|
|
||||||
(inner-regex (remove-registers (regex repetition))))
|
|
||||||
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
|
|
||||||
;; (REGEX REPETITION) contains a REGISTER
|
|
||||||
(declare (special reg-seen))
|
|
||||||
(make-instance 'repetition
|
|
||||||
:regex inner-regex
|
|
||||||
:greedyp (greedyp repetition)
|
|
||||||
:minimum (minimum repetition)
|
|
||||||
:maximum (maximum repetition)
|
|
||||||
:min-len (min-len repetition)
|
|
||||||
:len (len repetition)
|
|
||||||
:contains-register-p reg-seen)))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((standalone standalone))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'standalone
|
|
||||||
:regex (remove-registers (regex standalone))))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((lookahead lookahead))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'lookahead
|
|
||||||
:regex (remove-registers (regex lookahead))
|
|
||||||
:positivep (positivep lookahead)))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((lookbehind lookbehind))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'lookbehind
|
|
||||||
:regex (remove-registers (regex lookbehind))
|
|
||||||
:positivep (positivep lookbehind)
|
|
||||||
:len (len lookbehind)))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((branch branch))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(with-slots (test)
|
|
||||||
branch
|
|
||||||
(make-instance 'branch
|
|
||||||
:test (if (typep test 'regex)
|
|
||||||
(remove-registers test)
|
|
||||||
test)
|
|
||||||
:then-regex (remove-registers (then-regex branch))
|
|
||||||
:else-regex (remove-registers (else-regex branch)))))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((alternation alternation))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(declare (special remove-registers-p))
|
|
||||||
;; an ALTERNATION, so we can't remove REGISTER objects further down
|
|
||||||
(setq remove-registers-p nil)
|
|
||||||
(copy-regex alternation))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((regex regex))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(copy-regex regex))
|
|
||||||
|
|
||||||
(defmethod remove-registers ((seq seq))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(make-instance 'seq
|
|
||||||
:elements (mapcar #'remove-registers (elements seq))))
|
|
||||||
|
|
||||||
(defgeneric everythingp (regex)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(:documentation "Returns an EVERYTHING object if REGEX is equivalent
|
|
||||||
to this object, otherwise NIL. So, \"(.){1}\" would return true
|
|
||||||
(i.e. the object corresponding to \".\", for example."))
|
|
||||||
|
|
||||||
(defmethod everythingp ((seq seq))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
|
|
||||||
;; due to the parsing process
|
|
||||||
(let ((cleaned-elements (remove-if #'(lambda (element)
|
|
||||||
(typep element 'void))
|
|
||||||
(elements seq))))
|
|
||||||
(and (= 1 (length cleaned-elements))
|
|
||||||
(everythingp (first cleaned-elements)))))
|
|
||||||
|
|
||||||
(defmethod everythingp ((alternation alternation))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(with-slots (choices)
|
|
||||||
alternation
|
|
||||||
(and (= 1 (length choices))
|
|
||||||
;; this is unlikely to happen for human-generated regexes,
|
|
||||||
;; but machine-generated ones might look like this
|
|
||||||
(everythingp (first choices)))))
|
|
||||||
|
|
||||||
(defmethod everythingp ((repetition repetition))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(with-slots (maximum minimum regex)
|
|
||||||
repetition
|
|
||||||
(and maximum
|
|
||||||
(= 1 minimum maximum)
|
|
||||||
;; treat "<regex>{1,1}" like "<regex>"
|
|
||||||
(everythingp regex))))
|
|
||||||
|
|
||||||
(defmethod everythingp ((register register))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(everythingp (regex register)))
|
|
||||||
|
|
||||||
(defmethod everythingp ((standalone standalone))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(everythingp (regex standalone)))
|
|
||||||
|
|
||||||
(defmethod everythingp ((everything everything))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
everything)
|
|
||||||
|
|
||||||
(defmethod everythingp ((regex regex))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
|
|
||||||
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
|
|
||||||
nil)
|
|
||||||
|
|
||||||
(defgeneric regex-length (regex)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
|
|
||||||
|
|
||||||
(defmethod regex-length ((seq seq))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; simply add all inner lengths unless one of them is NIL
|
|
||||||
(loop for sub-regex in (elements seq)
|
|
||||||
for len = (regex-length sub-regex)
|
|
||||||
if (not len) do (return nil)
|
|
||||||
sum len))
|
|
||||||
|
|
||||||
(defmethod regex-length ((alternation alternation))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; only return a true value if all inner lengths are non-NIL and
|
|
||||||
;; mutually equal
|
|
||||||
(loop for sub-regex in (choices alternation)
|
|
||||||
for old-len = nil then len
|
|
||||||
for len = (regex-length sub-regex)
|
|
||||||
if (or (not len)
|
|
||||||
(and old-len (/= len old-len))) do (return nil)
|
|
||||||
finally (return len)))
|
|
||||||
|
|
||||||
(defmethod regex-length ((branch branch))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; only return a true value if both alternations have a length and
|
|
||||||
;; if they're equal
|
|
||||||
(let ((then-length (regex-length (then-regex branch))))
|
|
||||||
(and then-length
|
|
||||||
(eql then-length (regex-length (else-regex branch)))
|
|
||||||
then-length)))
|
|
||||||
|
|
||||||
(defmethod regex-length ((repetition repetition))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; we can only compute the length of a REPETITION object if the
|
|
||||||
;; number of repetitions is fixed; note that we don't call
|
|
||||||
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
|
|
||||||
;; always set correctly
|
|
||||||
(with-slots (len minimum maximum)
|
|
||||||
repetition
|
|
||||||
(if (and len
|
|
||||||
(eql minimum maximum))
|
|
||||||
(* minimum len)
|
|
||||||
nil)))
|
|
||||||
|
|
||||||
(defmethod regex-length ((register register))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(regex-length (regex register)))
|
|
||||||
|
|
||||||
(defmethod regex-length ((standalone standalone))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(regex-length (regex standalone)))
|
|
||||||
|
|
||||||
(defmethod regex-length ((back-reference back-reference))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; with enough effort we could possibly do better here, but
|
|
||||||
;; currently we just give up and return NIL
|
|
||||||
nil)
|
|
||||||
|
|
||||||
(defmethod regex-length ((char-class char-class))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
1)
|
|
||||||
|
|
||||||
(defmethod regex-length ((everything everything))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
1)
|
|
||||||
|
|
||||||
(defmethod regex-length ((str str))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(len str))
|
|
||||||
|
|
||||||
(defmethod regex-length ((filter filter))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(len filter))
|
|
||||||
|
|
||||||
(defmethod regex-length ((regex regex))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
|
||||||
;; WORD-BOUNDARY (which all have zero-length)
|
|
||||||
0)
|
|
||||||
|
|
||||||
(defgeneric regex-min-length (regex)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(:documentation "Returns the minimal length of REGEX."))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((seq seq))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; simply add all inner minimal lengths
|
|
||||||
(loop for sub-regex in (elements seq)
|
|
||||||
for len = (regex-min-length sub-regex)
|
|
||||||
sum len))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((alternation alternation))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; minimal length of an alternation is the minimal length of the
|
|
||||||
;; "shortest" element
|
|
||||||
(loop for sub-regex in (choices alternation)
|
|
||||||
for len = (regex-min-length sub-regex)
|
|
||||||
minimize len))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((branch branch))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; minimal length of both alternations
|
|
||||||
(min (regex-min-length (then-regex branch))
|
|
||||||
(regex-min-length (else-regex branch))))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((repetition repetition))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; obviously the product of the inner minimal length and the minimal
|
|
||||||
;; number of repetitions
|
|
||||||
(* (minimum repetition) (min-len repetition)))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((register register))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(regex-min-length (regex register)))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((standalone standalone))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(regex-min-length (regex standalone)))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((char-class char-class))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
1)
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((everything everything))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
1)
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((str str))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(len str))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((filter filter))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(or (len filter)
|
|
||||||
0))
|
|
||||||
|
|
||||||
(defmethod regex-min-length ((regex regex))
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
|
|
||||||
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
|
|
||||||
0)
|
|
||||||
|
|
||||||
(defgeneric compute-offsets (regex start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(:documentation "Returns the offset the following regex would have
|
|
||||||
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
|
|
||||||
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
|
|
||||||
slots of STR objects further down the tree."))
|
|
||||||
|
|
||||||
;; note that we're actually only interested in the offset of
|
|
||||||
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
|
|
||||||
;; can stop at variable-length alternations and don't need to descend
|
|
||||||
;; into repetitions
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((seq seq) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(loop for element in (elements seq)
|
|
||||||
;; advance offset argument for next call while looping through
|
|
||||||
;; the elements
|
|
||||||
for pos = start-pos then curr-offset
|
|
||||||
for curr-offset = (compute-offsets element pos)
|
|
||||||
while curr-offset
|
|
||||||
finally (return curr-offset)))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((alternation alternation) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(loop for choice in (choices alternation)
|
|
||||||
for old-offset = nil then curr-offset
|
|
||||||
for curr-offset = (compute-offsets choice start-pos)
|
|
||||||
;; we stop immediately if two alternations don't result in the
|
|
||||||
;; same offset
|
|
||||||
if (or (not curr-offset)
|
|
||||||
(and old-offset (/= curr-offset old-offset)))
|
|
||||||
do (return nil)
|
|
||||||
finally (return curr-offset)))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((branch branch) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; only return offset if both alternations have equal value
|
|
||||||
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
|
|
||||||
(and then-offset
|
|
||||||
(eql then-offset (compute-offsets (else-regex branch) start-pos))
|
|
||||||
then-offset)))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((repetition repetition) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; no need to descend into the inner regex
|
|
||||||
(with-slots (len minimum maximum)
|
|
||||||
repetition
|
|
||||||
(if (and len
|
|
||||||
(eq minimum maximum))
|
|
||||||
;; fixed number of repetitions, so we know how to proceed
|
|
||||||
(+ start-pos (* minimum len))
|
|
||||||
;; otherwise return NIL
|
|
||||||
nil)))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((register register) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(compute-offsets (regex register) start-pos))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((standalone standalone) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(compute-offsets (regex standalone) start-pos))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((char-class char-class) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(1+ start-pos))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((everything everything) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(1+ start-pos))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((str str) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(setf (offset str) start-pos)
|
|
||||||
(+ start-pos (len str)))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((back-reference back-reference) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; with enough effort we could possibly do better here, but
|
|
||||||
;; currently we just give up and return NIL
|
|
||||||
(declare (ignore start-pos))
|
|
||||||
nil)
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((filter filter) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(let ((len (len filter)))
|
|
||||||
(if len
|
|
||||||
(+ start-pos len)
|
|
||||||
nil)))
|
|
||||||
|
|
||||||
(defmethod compute-offsets ((regex regex) start-pos)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
|
||||||
;; WORD-BOUNDARY (which all have zero-length)
|
|
||||||
start-pos)
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.29 2008/06/25 14:04:28 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.33 2008/07/06 18:12:05 edi Exp $
|
||||||
|
|
||||||
;;; This is actually a part of closures.lisp which we put into a
|
;;; This is actually a part of closures.lisp which we put into a
|
||||||
;;; separate file because it is rather complex. We only deal with
|
;;; separate file because it is rather complex. We only deal with
|
||||||
@ -33,7 +33,7 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(defmacro incf-after (place &optional (delta 1) &environment env)
|
(defmacro incf-after (place &optional (delta 1) &environment env)
|
||||||
"Utility macro inspired by C's \"place++\", i.e. first return the
|
"Utility macro inspired by C's \"place++\", i.e. first return the
|
||||||
@ -58,7 +58,7 @@ CHECK-CURR-POS is a form which checks whether the inner regex of the
|
|||||||
repetition matches at CURR-POS."
|
repetition matches at CURR-POS."
|
||||||
`(if maximum
|
`(if maximum
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos maximum))
|
(declare (fixnum start-pos maximum))
|
||||||
;; because we know LEN we know in advance where to stop at the
|
;; because we know LEN we know in advance where to stop at the
|
||||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||||
;; minimal length of the part behind the repetition
|
;; minimal length of the part behind the repetition
|
||||||
@ -68,7 +68,7 @@ repetition matches at CURR-POS."
|
|||||||
(+ start-pos
|
(+ start-pos
|
||||||
(the fixnum (* len maximum)))))
|
(the fixnum (* len maximum)))))
|
||||||
(curr-pos start-pos))
|
(curr-pos start-pos))
|
||||||
(declare (type fixnum target-end-pos curr-pos))
|
(declare (fixnum target-end-pos curr-pos))
|
||||||
(block greedy-constant-length-matcher
|
(block greedy-constant-length-matcher
|
||||||
;; we use an ugly TAGBODY construct because this might be a
|
;; we use an ugly TAGBODY construct because this might be a
|
||||||
;; tight loop and this version is a bit faster than our LOOP
|
;; tight loop and this version is a bit faster than our LOOP
|
||||||
@ -95,10 +95,10 @@ repetition matches at CURR-POS."
|
|||||||
;; basically the same code; it's just a bit easier because we're
|
;; basically the same code; it's just a bit easier because we're
|
||||||
;; not bounded by MAXIMUM
|
;; not bounded by MAXIMUM
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((target-end-pos (1+ (- *end-pos* len min-rest)))
|
(let ((target-end-pos (1+ (- *end-pos* len min-rest)))
|
||||||
(curr-pos start-pos))
|
(curr-pos start-pos))
|
||||||
(declare (type fixnum target-end-pos curr-pos))
|
(declare (fixnum target-end-pos curr-pos))
|
||||||
(block greedy-constant-length-matcher
|
(block greedy-constant-length-matcher
|
||||||
(tagbody
|
(tagbody
|
||||||
forward-loop
|
forward-loop
|
||||||
@ -117,20 +117,19 @@ repetition matches at CURR-POS."
|
|||||||
(go backward-loop)))))))
|
(go backward-loop)))))))
|
||||||
|
|
||||||
(defun create-greedy-everything-matcher (maximum min-rest next-fn)
|
(defun create-greedy-everything-matcher (maximum min-rest next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(declare (type fixnum min-rest)
|
|
||||||
(type function next-fn))
|
|
||||||
"Creates a closure which just matches as far ahead as possible,
|
"Creates a closure which just matches as far ahead as possible,
|
||||||
i.e. a closure for a dot in single-line mode."
|
i.e. a closure for a dot in single-line mode."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (fixnum min-rest) (function next-fn))
|
||||||
(if maximum
|
(if maximum
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos maximum))
|
(declare (fixnum start-pos maximum))
|
||||||
;; because we know LEN we know in advance where to stop at the
|
;; because we know LEN we know in advance where to stop at the
|
||||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||||
;; minimal length of the part behind the repetition
|
;; minimal length of the part behind the repetition
|
||||||
(let ((target-end-pos (min (+ start-pos maximum)
|
(let ((target-end-pos (min (+ start-pos maximum)
|
||||||
(- *end-pos* min-rest))))
|
(- *end-pos* min-rest))))
|
||||||
(declare (type fixnum target-end-pos))
|
(declare (fixnum target-end-pos))
|
||||||
;; start from the highest possible position and go backward
|
;; start from the highest possible position and go backward
|
||||||
;; until we're able to match the rest of the regex
|
;; until we're able to match the rest of the regex
|
||||||
(loop for curr-pos of-type fixnum from target-end-pos downto start-pos
|
(loop for curr-pos of-type fixnum from target-end-pos downto start-pos
|
||||||
@ -138,18 +137,18 @@ i.e. a closure for a dot in single-line mode."
|
|||||||
;; basically the same code; it's just a bit easier because we're
|
;; basically the same code; it's just a bit easier because we're
|
||||||
;; not bounded by MAXIMUM
|
;; not bounded by MAXIMUM
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((target-end-pos (- *end-pos* min-rest)))
|
(let ((target-end-pos (- *end-pos* min-rest)))
|
||||||
(declare (type fixnum target-end-pos))
|
(declare (fixnum target-end-pos))
|
||||||
(loop for curr-pos of-type fixnum from target-end-pos downto start-pos
|
(loop for curr-pos of-type fixnum from target-end-pos downto start-pos
|
||||||
thereis (funcall next-fn curr-pos))))))
|
thereis (funcall next-fn curr-pos))))))
|
||||||
|
|
||||||
(defgeneric create-greedy-constant-length-matcher (repetition next-fn)
|
(defgeneric create-greedy-constant-length-matcher (repetition next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
(:documentation "Creates a closure which tries to match REPETITION.
|
||||||
that REPETITION is greedy and the minimal number of repetitions is
|
It is assumed that REPETITION is greedy and the minimal number of
|
||||||
zero. It is furthermore assumed that the inner regex of REPETITION is
|
repetitions is zero. It is furthermore assumed that the inner regex
|
||||||
of fixed length and doesn't contain registers."))
|
of REPETITION is of fixed length and doesn't contain registers."))
|
||||||
|
|
||||||
(defmethod create-greedy-constant-length-matcher ((repetition repetition)
|
(defmethod create-greedy-constant-length-matcher ((repetition repetition)
|
||||||
next-fn)
|
next-fn)
|
||||||
@ -158,8 +157,8 @@ of fixed length and doesn't contain registers."))
|
|||||||
(maximum (maximum repetition))
|
(maximum (maximum repetition))
|
||||||
(regex (regex repetition))
|
(regex (regex repetition))
|
||||||
(min-rest (min-rest repetition)))
|
(min-rest (min-rest repetition)))
|
||||||
(declare (type fixnum len min-rest)
|
(declare (fixnum len min-rest)
|
||||||
(type function next-fn))
|
(function next-fn))
|
||||||
(cond ((zerop len)
|
(cond ((zerop len)
|
||||||
;; inner regex has zero-length, so we can discard it
|
;; inner regex has zero-length, so we can discard it
|
||||||
;; completely
|
;; completely
|
||||||
@ -186,11 +185,8 @@ of fixed length and doesn't contain registers."))
|
|||||||
(char-class
|
(char-class
|
||||||
;; a character class
|
;; a character class
|
||||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||||
(if (invertedp regex)
|
(greedy-constant-length-closure
|
||||||
(greedy-constant-length-closure
|
(char-class-test))))
|
||||||
(not (char-class-test)))
|
|
||||||
(greedy-constant-length-closure
|
|
||||||
(char-class-test)))))
|
|
||||||
(everything
|
(everything
|
||||||
;; an EVERYTHING object, i.e. a dot
|
;; an EVERYTHING object, i.e. a dot
|
||||||
(if (single-line-p regex)
|
(if (single-line-p regex)
|
||||||
@ -202,17 +198,17 @@ of fixed length and doesn't contain registers."))
|
|||||||
;; just checks for immediate success, i.e. NEXT-FN is
|
;; just checks for immediate success, i.e. NEXT-FN is
|
||||||
;; #'IDENTITY
|
;; #'IDENTITY
|
||||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||||
(declare (type function inner-matcher))
|
(declare (function inner-matcher))
|
||||||
(greedy-constant-length-closure
|
(greedy-constant-length-closure
|
||||||
(funcall inner-matcher curr-pos)))))))))
|
(funcall inner-matcher curr-pos)))))))))
|
||||||
|
|
||||||
(defgeneric create-greedy-no-zero-matcher (repetition next-fn)
|
(defgeneric create-greedy-no-zero-matcher (repetition next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
(:documentation "Creates a closure which tries to match REPETITION.
|
||||||
that REPETITION is greedy and the minimal number of repetitions is
|
It is assumed that REPETITION is greedy and the minimal number of
|
||||||
zero. It is furthermore assumed that the inner regex of REPETITION can
|
repetitions is zero. It is furthermore assumed that the inner regex
|
||||||
never match a zero-length string (or instead the maximal number of
|
of REPETITION can never match a zero-length string \(or instead the
|
||||||
repetitions is 1)."))
|
maximal number of repetitions is 1)."))
|
||||||
|
|
||||||
(defmethod create-greedy-no-zero-matcher ((repetition repetition) next-fn)
|
(defmethod create-greedy-no-zero-matcher ((repetition repetition) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -220,7 +216,7 @@ repetitions is 1)."))
|
|||||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||||
;; can only be defined after GREEDY-AUX is defined
|
;; can only be defined after GREEDY-AUX is defined
|
||||||
repeat-matcher)
|
repeat-matcher)
|
||||||
(declare (type function next-fn))
|
(declare (function next-fn))
|
||||||
(cond
|
(cond
|
||||||
((eql maximum 1)
|
((eql maximum 1)
|
||||||
;; this is essentially like the next case but with a known
|
;; this is essentially like the next case but with a known
|
||||||
@ -230,7 +226,7 @@ repetitions is 1)."))
|
|||||||
(setq repeat-matcher
|
(setq repeat-matcher
|
||||||
(create-matcher-aux (regex repetition) next-fn))
|
(create-matcher-aux (regex repetition) next-fn))
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type function repeat-matcher))
|
(declare (function repeat-matcher))
|
||||||
(or (funcall repeat-matcher start-pos)
|
(or (funcall repeat-matcher start-pos)
|
||||||
(funcall next-fn start-pos))))
|
(funcall next-fn start-pos))))
|
||||||
(maximum
|
(maximum
|
||||||
@ -239,8 +235,8 @@ repetitions is 1)."))
|
|||||||
;; repetitions
|
;; repetitions
|
||||||
(let ((rep-num (incf-after *rep-num*)))
|
(let ((rep-num (incf-after *rep-num*)))
|
||||||
(flet ((greedy-aux (start-pos)
|
(flet ((greedy-aux (start-pos)
|
||||||
(declare (type fixnum start-pos maximum rep-num)
|
(declare (fixnum start-pos maximum rep-num)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
;; the actual matcher which first tries to match the
|
;; the actual matcher which first tries to match the
|
||||||
;; inner regex of REPETITION (if we haven't done so
|
;; inner regex of REPETITION (if we haven't done so
|
||||||
;; too often) and on failure calls NEXT-FN
|
;; too often) and on failure calls NEXT-FN
|
||||||
@ -259,15 +255,15 @@ repetitions is 1)."))
|
|||||||
;; the closure we return is just a thin wrapper around
|
;; the closure we return is just a thin wrapper around
|
||||||
;; GREEDY-AUX to initialize the repetition counter
|
;; GREEDY-AUX to initialize the repetition counter
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(setf (aref *repeat-counters* rep-num) 0)
|
(setf (aref *repeat-counters* rep-num) 0)
|
||||||
(greedy-aux start-pos)))))
|
(greedy-aux start-pos)))))
|
||||||
(t
|
(t
|
||||||
;; easier code because we're not bounded by MAXIMUM, but
|
;; easier code because we're not bounded by MAXIMUM, but
|
||||||
;; basically the same
|
;; basically the same
|
||||||
(flet ((greedy-aux (start-pos)
|
(flet ((greedy-aux (start-pos)
|
||||||
(declare (type fixnum start-pos)
|
(declare (fixnum start-pos)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(or (funcall repeat-matcher start-pos)
|
(or (funcall repeat-matcher start-pos)
|
||||||
(funcall next-fn start-pos))))
|
(funcall next-fn start-pos))))
|
||||||
(setq repeat-matcher
|
(setq repeat-matcher
|
||||||
@ -276,9 +272,9 @@ repetitions is 1)."))
|
|||||||
|
|
||||||
(defgeneric create-greedy-matcher (repetition next-fn)
|
(defgeneric create-greedy-matcher (repetition next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
(:documentation "Creates a closure which tries to match REPETITION.
|
||||||
that REPETITION is greedy and the minimal number of repetitions is
|
It is assumed that REPETITION is greedy and the minimal number of
|
||||||
zero."))
|
repetitions is zero."))
|
||||||
|
|
||||||
(defmethod create-greedy-matcher ((repetition repetition) next-fn)
|
(defmethod create-greedy-matcher ((repetition repetition) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -290,8 +286,8 @@ zero."))
|
|||||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||||
;; can only be defined after GREEDY-AUX is defined
|
;; can only be defined after GREEDY-AUX is defined
|
||||||
repeat-matcher)
|
repeat-matcher)
|
||||||
(declare (type fixnum zero-length-num)
|
(declare (fixnum zero-length-num)
|
||||||
(type function next-fn))
|
(function next-fn))
|
||||||
(cond
|
(cond
|
||||||
(maximum
|
(maximum
|
||||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||||
@ -302,8 +298,8 @@ zero."))
|
|||||||
;; the actual matcher which first tries to match the
|
;; the actual matcher which first tries to match the
|
||||||
;; inner regex of REPETITION (if we haven't done so
|
;; inner regex of REPETITION (if we haven't done so
|
||||||
;; too often) and on failure calls NEXT-FN
|
;; too often) and on failure calls NEXT-FN
|
||||||
(declare (type fixnum start-pos maximum rep-num)
|
(declare (fixnum start-pos maximum rep-num)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(let ((old-last-pos
|
(let ((old-last-pos
|
||||||
(svref *last-pos-stores* zero-length-num)))
|
(svref *last-pos-stores* zero-length-num)))
|
||||||
(when (and old-last-pos
|
(when (and old-last-pos
|
||||||
@ -333,7 +329,7 @@ zero."))
|
|||||||
;; GREEDY-AUX to initialize the repetition counter and our
|
;; GREEDY-AUX to initialize the repetition counter and our
|
||||||
;; slot in *LAST-POS-STORES*
|
;; slot in *LAST-POS-STORES*
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(setf (aref *repeat-counters* rep-num) 0
|
(setf (aref *repeat-counters* rep-num) 0
|
||||||
(svref *last-pos-stores* zero-length-num) nil)
|
(svref *last-pos-stores* zero-length-num) nil)
|
||||||
(greedy-aux start-pos)))))
|
(greedy-aux start-pos)))))
|
||||||
@ -341,8 +337,8 @@ zero."))
|
|||||||
;; easier code because we're not bounded by MAXIMUM, but
|
;; easier code because we're not bounded by MAXIMUM, but
|
||||||
;; basically the same
|
;; basically the same
|
||||||
(flet ((greedy-aux (start-pos)
|
(flet ((greedy-aux (start-pos)
|
||||||
(declare (type fixnum start-pos)
|
(declare (fixnum start-pos)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(let ((old-last-pos
|
(let ((old-last-pos
|
||||||
(svref *last-pos-stores* zero-length-num)))
|
(svref *last-pos-stores* zero-length-num)))
|
||||||
(when (and old-last-pos
|
(when (and old-last-pos
|
||||||
@ -356,14 +352,14 @@ zero."))
|
|||||||
(setq repeat-matcher
|
(setq repeat-matcher
|
||||||
(create-matcher-aux (regex repetition) #'greedy-aux))
|
(create-matcher-aux (regex repetition) #'greedy-aux))
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(setf (svref *last-pos-stores* zero-length-num) nil)
|
(setf (svref *last-pos-stores* zero-length-num) nil)
|
||||||
(greedy-aux start-pos)))))))
|
(greedy-aux start-pos)))))))
|
||||||
|
|
||||||
;; code for non-greedy repetitions with minimum zero
|
;; code for non-greedy repetitions with minimum zero
|
||||||
|
|
||||||
(defmacro non-greedy-constant-length-closure (check-curr-pos)
|
(defmacro non-greedy-constant-length-closure (check-curr-pos)
|
||||||
"This is the template for simple non-greedy repetitions (where
|
"This is the template for simple non-greedy repetitions \(where
|
||||||
simple means that the minimum number of repetitions is zero, that the
|
simple means that the minimum number of repetitions is zero, that the
|
||||||
inner regex to be checked is of fixed length LEN, and that it doesn't
|
inner regex to be checked is of fixed length LEN, and that it doesn't
|
||||||
contain registers, i.e. there's no need for backtracking).
|
contain registers, i.e. there's no need for backtracking).
|
||||||
@ -371,7 +367,7 @@ CHECK-CURR-POS is a form which checks whether the inner regex of the
|
|||||||
repetition matches at CURR-POS."
|
repetition matches at CURR-POS."
|
||||||
`(if maximum
|
`(if maximum
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos maximum))
|
(declare (fixnum start-pos maximum))
|
||||||
;; because we know LEN we know in advance where to stop at the
|
;; because we know LEN we know in advance where to stop at the
|
||||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||||
;; minimal length of the part behind the repetition
|
;; minimal length of the part behind the repetition
|
||||||
@ -389,7 +385,7 @@ repetition matches at CURR-POS."
|
|||||||
;; basically the same code; it's just a bit easier because we're
|
;; basically the same code; it's just a bit easier because we're
|
||||||
;; not bounded by MAXIMUM
|
;; not bounded by MAXIMUM
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((target-end-pos (1+ (- *end-pos* len min-rest))))
|
(let ((target-end-pos (1+ (- *end-pos* len min-rest))))
|
||||||
(loop for curr-pos of-type fixnum from start-pos
|
(loop for curr-pos of-type fixnum from start-pos
|
||||||
below target-end-pos
|
below target-end-pos
|
||||||
@ -400,10 +396,10 @@ repetition matches at CURR-POS."
|
|||||||
|
|
||||||
(defgeneric create-non-greedy-constant-length-matcher (repetition next-fn)
|
(defgeneric create-non-greedy-constant-length-matcher (repetition next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
(:documentation "Creates a closure which tries to match REPETITION.
|
||||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
It is assumed that REPETITION is non-greedy and the minimal number of
|
||||||
zero. It is furthermore assumed that the inner regex of REPETITION is
|
repetitions is zero. It is furthermore assumed that the inner regex
|
||||||
of fixed length and doesn't contain registers."))
|
of REPETITION is of fixed length and doesn't contain registers."))
|
||||||
|
|
||||||
(defmethod create-non-greedy-constant-length-matcher ((repetition repetition) next-fn)
|
(defmethod create-non-greedy-constant-length-matcher ((repetition repetition) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -411,8 +407,8 @@ of fixed length and doesn't contain registers."))
|
|||||||
(maximum (maximum repetition))
|
(maximum (maximum repetition))
|
||||||
(regex (regex repetition))
|
(regex (regex repetition))
|
||||||
(min-rest (min-rest repetition)))
|
(min-rest (min-rest repetition)))
|
||||||
(declare (type fixnum len min-rest)
|
(declare (fixnum len min-rest)
|
||||||
(type function next-fn))
|
(function next-fn))
|
||||||
(cond ((zerop len)
|
(cond ((zerop len)
|
||||||
;; inner regex has zero-length, so we can discard it
|
;; inner regex has zero-length, so we can discard it
|
||||||
;; completely
|
;; completely
|
||||||
@ -439,11 +435,8 @@ of fixed length and doesn't contain registers."))
|
|||||||
(char-class
|
(char-class
|
||||||
;; a character class
|
;; a character class
|
||||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||||
(if (invertedp regex)
|
(non-greedy-constant-length-closure
|
||||||
(non-greedy-constant-length-closure
|
(char-class-test))))
|
||||||
(not (char-class-test)))
|
|
||||||
(non-greedy-constant-length-closure
|
|
||||||
(char-class-test)))))
|
|
||||||
(everything
|
(everything
|
||||||
(if (single-line-p regex)
|
(if (single-line-p regex)
|
||||||
;; a dot which really can match everything; we rely
|
;; a dot which really can match everything; we rely
|
||||||
@ -458,17 +451,17 @@ of fixed length and doesn't contain registers."))
|
|||||||
;; just checks for immediate success, i.e. NEXT-FN is
|
;; just checks for immediate success, i.e. NEXT-FN is
|
||||||
;; #'IDENTITY
|
;; #'IDENTITY
|
||||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||||
(declare (type function inner-matcher))
|
(declare (function inner-matcher))
|
||||||
(non-greedy-constant-length-closure
|
(non-greedy-constant-length-closure
|
||||||
(funcall inner-matcher curr-pos)))))))))
|
(funcall inner-matcher curr-pos)))))))))
|
||||||
|
|
||||||
(defgeneric create-non-greedy-no-zero-matcher (repetition next-fn)
|
(defgeneric create-non-greedy-no-zero-matcher (repetition next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
(:documentation "Creates a closure which tries to match REPETITION.
|
||||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
It is assumed that REPETITION is non-greedy and the minimal number of
|
||||||
zero. It is furthermore assumed that the inner regex of REPETITION can
|
repetitions is zero. It is furthermore assumed that the inner regex
|
||||||
never match a zero-length string (or instead the maximal number of
|
of REPETITION can never match a zero-length string \(or instead the
|
||||||
repetitions is 1)."))
|
maximal number of repetitions is 1)."))
|
||||||
|
|
||||||
(defmethod create-non-greedy-no-zero-matcher ((repetition repetition) next-fn)
|
(defmethod create-non-greedy-no-zero-matcher ((repetition repetition) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -476,7 +469,7 @@ repetitions is 1)."))
|
|||||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||||
;; can only be defined after NON-GREEDY-AUX is defined
|
;; can only be defined after NON-GREEDY-AUX is defined
|
||||||
repeat-matcher)
|
repeat-matcher)
|
||||||
(declare (type function next-fn))
|
(declare (function next-fn))
|
||||||
(cond
|
(cond
|
||||||
((eql maximum 1)
|
((eql maximum 1)
|
||||||
;; this is essentially like the next case but with a known
|
;; this is essentially like the next case but with a known
|
||||||
@ -484,7 +477,7 @@ repetitions is 1)."))
|
|||||||
(setq repeat-matcher
|
(setq repeat-matcher
|
||||||
(create-matcher-aux (regex repetition) next-fn))
|
(create-matcher-aux (regex repetition) next-fn))
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type function repeat-matcher))
|
(declare (function repeat-matcher))
|
||||||
(or (funcall next-fn start-pos)
|
(or (funcall next-fn start-pos)
|
||||||
(funcall repeat-matcher start-pos))))
|
(funcall repeat-matcher start-pos))))
|
||||||
(maximum
|
(maximum
|
||||||
@ -496,8 +489,8 @@ repetitions is 1)."))
|
|||||||
;; the actual matcher which first calls NEXT-FN and
|
;; the actual matcher which first calls NEXT-FN and
|
||||||
;; on failure tries to match the inner regex of
|
;; on failure tries to match the inner regex of
|
||||||
;; REPETITION (if we haven't done so too often)
|
;; REPETITION (if we haven't done so too often)
|
||||||
(declare (type fixnum start-pos maximum rep-num)
|
(declare (fixnum start-pos maximum rep-num)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(or (funcall next-fn start-pos)
|
(or (funcall next-fn start-pos)
|
||||||
(and (< (aref *repeat-counters* rep-num) maximum)
|
(and (< (aref *repeat-counters* rep-num) maximum)
|
||||||
(incf (aref *repeat-counters* rep-num))
|
(incf (aref *repeat-counters* rep-num))
|
||||||
@ -513,15 +506,15 @@ repetitions is 1)."))
|
|||||||
;; the closure we return is just a thin wrapper around
|
;; the closure we return is just a thin wrapper around
|
||||||
;; NON-GREEDY-AUX to initialize the repetition counter
|
;; NON-GREEDY-AUX to initialize the repetition counter
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(setf (aref *repeat-counters* rep-num) 0)
|
(setf (aref *repeat-counters* rep-num) 0)
|
||||||
(non-greedy-aux start-pos)))))
|
(non-greedy-aux start-pos)))))
|
||||||
(t
|
(t
|
||||||
;; easier code because we're not bounded by MAXIMUM, but
|
;; easier code because we're not bounded by MAXIMUM, but
|
||||||
;; basically the same
|
;; basically the same
|
||||||
(flet ((non-greedy-aux (start-pos)
|
(flet ((non-greedy-aux (start-pos)
|
||||||
(declare (type fixnum start-pos)
|
(declare (fixnum start-pos)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(or (funcall next-fn start-pos)
|
(or (funcall next-fn start-pos)
|
||||||
(funcall repeat-matcher start-pos))))
|
(funcall repeat-matcher start-pos))))
|
||||||
(setq repeat-matcher
|
(setq repeat-matcher
|
||||||
@ -530,9 +523,9 @@ repetitions is 1)."))
|
|||||||
|
|
||||||
(defgeneric create-non-greedy-matcher (repetition next-fn)
|
(defgeneric create-non-greedy-matcher (repetition next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
(:documentation "Creates a closure which tries to match REPETITION.
|
||||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
It is assumed that REPETITION is non-greedy and the minimal number of
|
||||||
zero."))
|
repetitions is zero."))
|
||||||
|
|
||||||
(defmethod create-non-greedy-matcher ((repetition repetition) next-fn)
|
(defmethod create-non-greedy-matcher ((repetition repetition) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -544,8 +537,8 @@ zero."))
|
|||||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||||
;; can only be defined after NON-GREEDY-AUX is defined
|
;; can only be defined after NON-GREEDY-AUX is defined
|
||||||
repeat-matcher)
|
repeat-matcher)
|
||||||
(declare (type fixnum zero-length-num)
|
(declare (fixnum zero-length-num)
|
||||||
(type function next-fn))
|
(function next-fn))
|
||||||
(cond
|
(cond
|
||||||
(maximum
|
(maximum
|
||||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||||
@ -556,8 +549,8 @@ zero."))
|
|||||||
;; the actual matcher which first calls NEXT-FN and
|
;; the actual matcher which first calls NEXT-FN and
|
||||||
;; on failure tries to match the inner regex of
|
;; on failure tries to match the inner regex of
|
||||||
;; REPETITION (if we haven't done so too often)
|
;; REPETITION (if we haven't done so too often)
|
||||||
(declare (type fixnum start-pos maximum rep-num)
|
(declare (fixnum start-pos maximum rep-num)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(let ((old-last-pos
|
(let ((old-last-pos
|
||||||
(svref *last-pos-stores* zero-length-num)))
|
(svref *last-pos-stores* zero-length-num)))
|
||||||
(when (and old-last-pos
|
(when (and old-last-pos
|
||||||
@ -587,7 +580,7 @@ zero."))
|
|||||||
;; NON-GREEDY-AUX to initialize the repetition counter and our
|
;; NON-GREEDY-AUX to initialize the repetition counter and our
|
||||||
;; slot in *LAST-POS-STORES*
|
;; slot in *LAST-POS-STORES*
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(setf (aref *repeat-counters* rep-num) 0
|
(setf (aref *repeat-counters* rep-num) 0
|
||||||
(svref *last-pos-stores* zero-length-num) nil)
|
(svref *last-pos-stores* zero-length-num) nil)
|
||||||
(non-greedy-aux start-pos)))))
|
(non-greedy-aux start-pos)))))
|
||||||
@ -595,8 +588,8 @@ zero."))
|
|||||||
;; easier code because we're not bounded by MAXIMUM, but
|
;; easier code because we're not bounded by MAXIMUM, but
|
||||||
;; basically the same
|
;; basically the same
|
||||||
(flet ((non-greedy-aux (start-pos)
|
(flet ((non-greedy-aux (start-pos)
|
||||||
(declare (type fixnum start-pos)
|
(declare (fixnum start-pos)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(let ((old-last-pos
|
(let ((old-last-pos
|
||||||
(svref *last-pos-stores* zero-length-num)))
|
(svref *last-pos-stores* zero-length-num)))
|
||||||
(when (and old-last-pos
|
(when (and old-last-pos
|
||||||
@ -611,7 +604,7 @@ zero."))
|
|||||||
(setq repeat-matcher
|
(setq repeat-matcher
|
||||||
(create-matcher-aux (regex repetition) #'non-greedy-aux))
|
(create-matcher-aux (regex repetition) #'non-greedy-aux))
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(setf (svref *last-pos-stores* zero-length-num) nil)
|
(setf (svref *last-pos-stores* zero-length-num) nil)
|
||||||
(non-greedy-aux start-pos)))))))
|
(non-greedy-aux start-pos)))))))
|
||||||
|
|
||||||
@ -622,13 +615,13 @@ zero."))
|
|||||||
means that the inner regex to be checked is of fixed length LEN, and
|
means that the inner regex to be checked is of fixed length LEN, and
|
||||||
that it doesn't contain registers, i.e. there's no need for
|
that it doesn't contain registers, i.e. there's no need for
|
||||||
backtracking) and where constant means that MINIMUM is equal to
|
backtracking) and where constant means that MINIMUM is equal to
|
||||||
MAXIMUM. CHECK-CURR-POS is a form which checks whether the inner regex
|
MAXIMUM. CHECK-CURR-POS is a form which checks whether the inner
|
||||||
of the repetition matches at CURR-POS."
|
regex of the repetition matches at CURR-POS."
|
||||||
`(lambda (start-pos)
|
`(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((target-end-pos (+ start-pos
|
(let ((target-end-pos (+ start-pos
|
||||||
(the fixnum (* len repetitions)))))
|
(the fixnum (* len repetitions)))))
|
||||||
(declare (type fixnum target-end-pos))
|
(declare (fixnum target-end-pos))
|
||||||
;; first check if we won't go beyond the end of the string
|
;; first check if we won't go beyond the end of the string
|
||||||
(and (>= *end-pos* target-end-pos)
|
(and (>= *end-pos* target-end-pos)
|
||||||
;; then loop through all repetitions step by step
|
;; then loop through all repetitions step by step
|
||||||
@ -642,10 +635,10 @@ of the repetition matches at CURR-POS."
|
|||||||
(defgeneric create-constant-repetition-constant-length-matcher
|
(defgeneric create-constant-repetition-constant-length-matcher
|
||||||
(repetition next-fn)
|
(repetition next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
(:documentation "Creates a closure which tries to match REPETITION.
|
||||||
that REPETITION has a constant number of repetitions. It is
|
It is assumed that REPETITION has a constant number of repetitions.
|
||||||
furthermore assumed that the inner regex of REPETITION is of fixed
|
It is furthermore assumed that the inner regex of REPETITION is of
|
||||||
length and doesn't contain registers."))
|
fixed length and doesn't contain registers."))
|
||||||
|
|
||||||
(defmethod create-constant-repetition-constant-length-matcher
|
(defmethod create-constant-repetition-constant-length-matcher
|
||||||
((repetition repetition) next-fn)
|
((repetition repetition) next-fn)
|
||||||
@ -653,8 +646,8 @@ length and doesn't contain registers."))
|
|||||||
(let ((len (len repetition))
|
(let ((len (len repetition))
|
||||||
(repetitions (minimum repetition))
|
(repetitions (minimum repetition))
|
||||||
(regex (regex repetition)))
|
(regex (regex repetition)))
|
||||||
(declare (type fixnum len repetitions)
|
(declare (fixnum len repetitions)
|
||||||
(type function next-fn))
|
(function next-fn))
|
||||||
(if (zerop len)
|
(if (zerop len)
|
||||||
;; if the length is zero it suffices to try once
|
;; if the length is zero it suffices to try once
|
||||||
(create-matcher-aux regex next-fn)
|
(create-matcher-aux regex next-fn)
|
||||||
@ -676,33 +669,29 @@ length and doesn't contain registers."))
|
|||||||
(if (case-insensitive-p regex)
|
(if (case-insensitive-p regex)
|
||||||
(constant-repetition-constant-length-closure
|
(constant-repetition-constant-length-closure
|
||||||
(let ((next-pos (+ curr-pos len)))
|
(let ((next-pos (+ curr-pos len)))
|
||||||
(declare (type fixnum next-pos))
|
(declare (fixnum next-pos))
|
||||||
(and (*string*-equal str curr-pos next-pos 0 len)
|
(and (*string*-equal str curr-pos next-pos 0 len)
|
||||||
next-pos)))
|
next-pos)))
|
||||||
(constant-repetition-constant-length-closure
|
(constant-repetition-constant-length-closure
|
||||||
(let ((next-pos (+ curr-pos len)))
|
(let ((next-pos (+ curr-pos len)))
|
||||||
(declare (type fixnum next-pos))
|
(declare (fixnum next-pos))
|
||||||
(and (*string*= str curr-pos next-pos 0 len)
|
(and (*string*= str curr-pos next-pos 0 len)
|
||||||
next-pos)))))))
|
next-pos)))))))
|
||||||
(char-class
|
(char-class
|
||||||
;; a character class
|
;; a character class
|
||||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||||
(if (invertedp regex)
|
(constant-repetition-constant-length-closure
|
||||||
(constant-repetition-constant-length-closure
|
(and (char-class-test)
|
||||||
(and (not (char-class-test))
|
(1+ curr-pos)))))
|
||||||
(1+ curr-pos)))
|
|
||||||
(constant-repetition-constant-length-closure
|
|
||||||
(and (char-class-test)
|
|
||||||
(1+ curr-pos))))))
|
|
||||||
(everything
|
(everything
|
||||||
(if (single-line-p regex)
|
(if (single-line-p regex)
|
||||||
;; a dot which really matches everything - we just have to
|
;; a dot which really matches everything - we just have to
|
||||||
;; advance the index into *STRING* accordingly and check
|
;; advance the index into *STRING* accordingly and check
|
||||||
;; if we didn't go past the end
|
;; if we didn't go past the end
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(let ((next-pos (+ start-pos repetitions)))
|
(let ((next-pos (+ start-pos repetitions)))
|
||||||
(declare (type fixnum next-pos))
|
(declare (fixnum next-pos))
|
||||||
(and (<= next-pos *end-pos*)
|
(and (<= next-pos *end-pos*)
|
||||||
(funcall next-fn next-pos))))
|
(funcall next-fn next-pos))))
|
||||||
;; a dot which is not in single-line-mode - make sure we
|
;; a dot which is not in single-line-mode - make sure we
|
||||||
@ -714,14 +703,14 @@ length and doesn't contain registers."))
|
|||||||
;; the general case - we build an inner matcher which just
|
;; the general case - we build an inner matcher which just
|
||||||
;; checks for immediate success, i.e. NEXT-FN is #'IDENTITY
|
;; checks for immediate success, i.e. NEXT-FN is #'IDENTITY
|
||||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||||
(declare (type function inner-matcher))
|
(declare (function inner-matcher))
|
||||||
(constant-repetition-constant-length-closure
|
(constant-repetition-constant-length-closure
|
||||||
(funcall inner-matcher curr-pos))))))))
|
(funcall inner-matcher curr-pos))))))))
|
||||||
|
|
||||||
(defgeneric create-constant-repetition-matcher (repetition next-fn)
|
(defgeneric create-constant-repetition-matcher (repetition next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
(:documentation "Creates a closure which tries to match REPETITION.
|
||||||
that REPETITION has a constant number of repetitions."))
|
It is assumed that REPETITION has a constant number of repetitions."))
|
||||||
|
|
||||||
(defmethod create-constant-repetition-matcher ((repetition repetition) next-fn)
|
(defmethod create-constant-repetition-matcher ((repetition repetition) next-fn)
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
@ -732,20 +721,20 @@ that REPETITION has a constant number of repetitions."))
|
|||||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||||
;; can only be defined after NON-GREEDY-AUX is defined
|
;; can only be defined after NON-GREEDY-AUX is defined
|
||||||
repeat-matcher)
|
repeat-matcher)
|
||||||
(declare (type fixnum repetitions rep-num)
|
(declare (fixnum repetitions rep-num)
|
||||||
(type function next-fn))
|
(function next-fn))
|
||||||
(if (zerop (min-len repetition))
|
(if (zerop (min-len repetition))
|
||||||
;; we make a reservation for our slot in *LAST-POS-STORES*
|
;; we make a reservation for our slot in *LAST-POS-STORES*
|
||||||
;; because we have to watch out for needless loops as the inner
|
;; because we have to watch out for needless loops as the inner
|
||||||
;; regex might match zero-length strings
|
;; regex might match zero-length strings
|
||||||
(let ((zero-length-num (incf-after *zero-length-num*)))
|
(let ((zero-length-num (incf-after *zero-length-num*)))
|
||||||
(declare (type fixnum zero-length-num))
|
(declare (fixnum zero-length-num))
|
||||||
(flet ((constant-aux (start-pos)
|
(flet ((constant-aux (start-pos)
|
||||||
;; the actual matcher which first calls NEXT-FN and
|
;; the actual matcher which first calls NEXT-FN and
|
||||||
;; on failure tries to match the inner regex of
|
;; on failure tries to match the inner regex of
|
||||||
;; REPETITION (if we haven't done so too often)
|
;; REPETITION (if we haven't done so too often)
|
||||||
(declare (type fixnum start-pos)
|
(declare (fixnum start-pos)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(let ((old-last-pos
|
(let ((old-last-pos
|
||||||
(svref *last-pos-stores* zero-length-num)))
|
(svref *last-pos-stores* zero-length-num)))
|
||||||
(when (and old-last-pos
|
(when (and old-last-pos
|
||||||
@ -778,15 +767,15 @@ that REPETITION has a constant number of repetitions."))
|
|||||||
;; the closure we return is just a thin wrapper around
|
;; the closure we return is just a thin wrapper around
|
||||||
;; CONSTANT-AUX to initialize the repetition counter
|
;; CONSTANT-AUX to initialize the repetition counter
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(setf (aref *repeat-counters* rep-num) 0
|
(setf (aref *repeat-counters* rep-num) 0
|
||||||
(aref *last-pos-stores* zero-length-num) nil)
|
(aref *last-pos-stores* zero-length-num) nil)
|
||||||
(constant-aux start-pos))))
|
(constant-aux start-pos))))
|
||||||
;; easier code because we don't have to care about zero-length
|
;; easier code because we don't have to care about zero-length
|
||||||
;; matches but basically the same
|
;; matches but basically the same
|
||||||
(flet ((constant-aux (start-pos)
|
(flet ((constant-aux (start-pos)
|
||||||
(declare (type fixnum start-pos)
|
(declare (fixnum start-pos)
|
||||||
(type function repeat-matcher))
|
(function repeat-matcher))
|
||||||
(cond ((< (aref *repeat-counters* rep-num) repetitions)
|
(cond ((< (aref *repeat-counters* rep-num) repetitions)
|
||||||
(incf (aref *repeat-counters* rep-num))
|
(incf (aref *repeat-counters* rep-num))
|
||||||
(prog1
|
(prog1
|
||||||
@ -796,7 +785,7 @@ that REPETITION has a constant number of repetitions."))
|
|||||||
(setq repeat-matcher
|
(setq repeat-matcher
|
||||||
(create-matcher-aux (regex repetition) #'constant-aux))
|
(create-matcher-aux (regex repetition) #'constant-aux))
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(setf (aref *repeat-counters* rep-num) 0)
|
(setf (aref *repeat-counters* rep-num) 0)
|
||||||
(constant-aux start-pos))))))
|
(constant-aux start-pos))))))
|
||||||
|
|
||||||
|
|||||||
75
scanner.lisp
75
scanner.lisp
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.29 2008/06/25 14:04:28 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.34 2008/07/06 18:12:05 edi Exp $
|
||||||
|
|
||||||
;;; Here the scanner for the actual regex as well as utility scanners
|
;;; Here the scanner for the actual regex as well as utility scanners
|
||||||
;;; for the constant start and end strings are created.
|
;;; for the constant start and end strings are created.
|
||||||
@ -30,13 +30,13 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
(defmacro bmh-matcher-aux (&key case-insensitive-p)
|
(defmacro bmh-matcher-aux (&key case-insensitive-p)
|
||||||
"Auxiliary macro used by CREATE-BMH-MATCHER."
|
"Auxiliary macro used by CREATE-BMH-MATCHER."
|
||||||
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
||||||
`(lambda (start-pos)
|
`(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(if (or (minusp start-pos)
|
(if (or (minusp start-pos)
|
||||||
(> (the fixnum (+ start-pos m)) *end-pos*))
|
(> (the fixnum (+ start-pos m)) *end-pos*))
|
||||||
nil
|
nil
|
||||||
@ -53,21 +53,21 @@
|
|||||||
(return-from bmh-matcher (1+ i)))))))))
|
(return-from bmh-matcher (1+ i)))))))))
|
||||||
|
|
||||||
(defun create-bmh-matcher (pattern case-insensitive-p)
|
(defun create-bmh-matcher (pattern case-insensitive-p)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Returns a Boyer-Moore-Horspool matcher which searches the (special)
|
"Returns a Boyer-Moore-Horspool matcher which searches the (special)
|
||||||
simple-string *STRING* for the first occurence of the substring
|
simple-string *STRING* for the first occurence of the substring
|
||||||
PATTERN. The search starts at the position START-POS within *STRING*
|
PATTERN. The search starts at the position START-POS within *STRING*
|
||||||
and stops before *END-POS* is reached. Depending on the second
|
and stops before *END-POS* is reached. Depending on the second
|
||||||
argument the search is case-insensitive or not. If the special
|
argument the search is case-insensitive or not. If the special
|
||||||
variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
|
variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
|
||||||
instead. (BMH matchers are faster but need much more space.)"
|
instead. \(BMH matchers are faster but need much more space.)"
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
|
;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
|
||||||
;; details
|
;; details
|
||||||
(unless *use-bmh-matchers*
|
(unless *use-bmh-matchers*
|
||||||
(let ((test (if case-insensitive-p #'char-equal #'char=)))
|
(let ((test (if case-insensitive-p #'char-equal #'char=)))
|
||||||
(return-from create-bmh-matcher
|
(return-from create-bmh-matcher
|
||||||
(lambda (start-pos)
|
(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (not (minusp start-pos))
|
(and (not (minusp start-pos))
|
||||||
(search pattern
|
(search pattern
|
||||||
*string*
|
*string*
|
||||||
@ -78,7 +78,7 @@ instead. (BMH matchers are faster but need much more space.)"
|
|||||||
(skip (make-array *regex-char-code-limit*
|
(skip (make-array *regex-char-code-limit*
|
||||||
:element-type 'fixnum
|
:element-type 'fixnum
|
||||||
:initial-element m)))
|
:initial-element m)))
|
||||||
(declare (type fixnum m))
|
(declare (fixnum m))
|
||||||
(loop for k of-type fixnum below m
|
(loop for k of-type fixnum below m
|
||||||
if case-insensitive-p
|
if case-insensitive-p
|
||||||
do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
|
do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
|
||||||
@ -93,29 +93,28 @@ instead. (BMH matchers are faster but need much more space.)"
|
|||||||
"Auxiliary macro used by CREATE-CHAR-SEARCHER."
|
"Auxiliary macro used by CREATE-CHAR-SEARCHER."
|
||||||
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
||||||
`(lambda (start-pos)
|
`(lambda (start-pos)
|
||||||
(declare (type fixnum start-pos))
|
(declare (fixnum start-pos))
|
||||||
(and (not (minusp start-pos))
|
(and (not (minusp start-pos))
|
||||||
(loop for i of-type fixnum from start-pos below *end-pos*
|
(loop for i of-type fixnum from start-pos below *end-pos*
|
||||||
thereis (and (,char-compare (schar *string* i) chr) i))))))
|
thereis (and (,char-compare (schar *string* i) chr) i))))))
|
||||||
|
|
||||||
(defun create-char-searcher (chr case-insensitive-p)
|
(defun create-char-searcher (chr case-insensitive-p)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Returns a function which searches the (special) simple-string
|
"Returns a function which searches the (special) simple-string
|
||||||
*STRING* for the first occurence of the character CHR. The search
|
*STRING* for the first occurence of the character CHR. The search
|
||||||
starts at the position START-POS within *STRING* and stops before
|
starts at the position START-POS within *STRING* and stops before
|
||||||
*END-POS* is reached. Depending on the second argument the search is
|
*END-POS* is reached. Depending on the second argument the search is
|
||||||
case-insensitive or not."
|
case-insensitive or not."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
(if case-insensitive-p
|
(if case-insensitive-p
|
||||||
(char-searcher-aux :case-insensitive-p t)
|
(char-searcher-aux :case-insensitive-p t)
|
||||||
(char-searcher-aux)))
|
(char-searcher-aux)))
|
||||||
|
|
||||||
(declaim (inline newline-skipper))
|
(declaim (inline newline-skipper))
|
||||||
|
|
||||||
(defun newline-skipper (start-pos)
|
(defun newline-skipper (start-pos)
|
||||||
(declare #.*standard-optimize-settings*)
|
"Finds the next occurence of a character in *STRING* which is behind
|
||||||
(declare (type fixnum start-pos))
|
|
||||||
"Find the next occurence of a character in *STRING* which is behind
|
|
||||||
a #\Newline."
|
a #\Newline."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (fixnum start-pos))
|
||||||
;; we can start with (1- START-POS) without testing for (PLUSP
|
;; we can start with (1- START-POS) without testing for (PLUSP
|
||||||
;; START-POS) because we know we'll never call NEWLINE-SKIPPER on
|
;; START-POS) because we know we'll never call NEWLINE-SKIPPER on
|
||||||
;; the first iteration
|
;; the first iteration
|
||||||
@ -127,7 +126,7 @@ a #\Newline."
|
|||||||
(defmacro insert-advance-fn (advance-fn)
|
(defmacro insert-advance-fn (advance-fn)
|
||||||
"Creates the actual closure returned by CREATE-SCANNER-AUX by
|
"Creates the actual closure returned by CREATE-SCANNER-AUX by
|
||||||
replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for
|
replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for
|
||||||
ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
||||||
(subst
|
(subst
|
||||||
advance-fn '(advance-fn-definition)
|
advance-fn '(advance-fn-definition)
|
||||||
'(lambda (string start end)
|
'(lambda (string start end)
|
||||||
@ -159,8 +158,8 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
|||||||
nil))
|
nil))
|
||||||
;; we don't need to try further than MAX-END-POS
|
;; we don't need to try further than MAX-END-POS
|
||||||
(max-end-pos (- *end-pos* min-len)))
|
(max-end-pos (- *end-pos* min-len)))
|
||||||
(declare (type fixnum scan-start-pos)
|
(declare (fixnum scan-start-pos)
|
||||||
(type function match-fn))
|
(function match-fn))
|
||||||
;; definition of ADVANCE-FN will be inserted here by macrology
|
;; definition of ADVANCE-FN will be inserted here by macrology
|
||||||
(labels ((advance-fn-definition))
|
(labels ((advance-fn-definition))
|
||||||
(declare (inline advance-fn))
|
(declare (inline advance-fn))
|
||||||
@ -185,8 +184,8 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
|||||||
;; is anchored at the very end of the target string
|
;; is anchored at the very end of the target string
|
||||||
;; (perhaps modulo a #\Newline)
|
;; (perhaps modulo a #\Newline)
|
||||||
(let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
|
(let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
|
||||||
(declare (type fixnum end-test-pos)
|
(declare (fixnum end-test-pos)
|
||||||
(type function end-string-test))
|
(function end-string-test))
|
||||||
(unless (setq *end-string-pos* (funcall end-string-test
|
(unless (setq *end-string-pos* (funcall end-string-test
|
||||||
end-test-pos))
|
end-test-pos))
|
||||||
(when (and (= 1 (the fixnum end-anchored-p))
|
(when (and (= 1 (the fixnum end-anchored-p))
|
||||||
@ -223,7 +222,7 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
|||||||
(return-from scan nil))
|
(return-from scan nil))
|
||||||
(when starts-with-str
|
(when starts-with-str
|
||||||
(locally
|
(locally
|
||||||
(declare (type fixnum starts-with-len))
|
(declare (fixnum starts-with-len))
|
||||||
(cond ((and (case-insensitive-p starts-with)
|
(cond ((and (case-insensitive-p starts-with)
|
||||||
(not (*string*-equal starts-with-str
|
(not (*string*-equal starts-with-str
|
||||||
*start-pos*
|
*start-pos*
|
||||||
@ -321,10 +320,10 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
|||||||
rep-num
|
rep-num
|
||||||
zero-length-num
|
zero-length-num
|
||||||
reg-num)
|
reg-num)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
(declare (type fixnum min-len zero-length-num rep-num reg-num))
|
|
||||||
"Auxiliary function to create and return a scanner \(which is
|
"Auxiliary function to create and return a scanner \(which is
|
||||||
actually a closure). Used by CREATE-SCANNER."
|
actually a closure). Used by CREATE-SCANNER."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(declare (fixnum min-len zero-length-num rep-num reg-num))
|
||||||
(let ((starts-with-len (if (typep starts-with 'str)
|
(let ((starts-with-len (if (typep starts-with 'str)
|
||||||
(len starts-with)))
|
(len starts-with)))
|
||||||
(starts-with-everything (typep starts-with 'everything)))
|
(starts-with-everything (typep starts-with 'everything)))
|
||||||
@ -341,8 +340,8 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
;; left)
|
;; left)
|
||||||
(insert-advance-fn
|
(insert-advance-fn
|
||||||
(advance-fn (pos)
|
(advance-fn (pos)
|
||||||
(declare (type fixnum end-string-offset starts-with-len)
|
(declare (fixnum end-string-offset starts-with-len)
|
||||||
(type function start-string-test end-string-test))
|
(function start-string-test end-string-test))
|
||||||
(loop
|
(loop
|
||||||
(unless (setq pos (funcall start-string-test pos))
|
(unless (setq pos (funcall start-string-test pos))
|
||||||
;; give up completely if we can't find a start string
|
;; give up completely if we can't find a start string
|
||||||
@ -350,7 +349,7 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
(return-from scan nil))
|
(return-from scan nil))
|
||||||
(locally
|
(locally
|
||||||
;; from here we know that POS is a FIXNUM
|
;; from here we know that POS is a FIXNUM
|
||||||
(declare (type fixnum pos))
|
(declare (fixnum pos))
|
||||||
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
||||||
;; if we already found an end string candidate the
|
;; if we already found an end string candidate the
|
||||||
;; position of which matches the start string
|
;; position of which matches the start string
|
||||||
@ -369,7 +368,7 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
;; according to the end string candidate
|
;; according to the end string candidate
|
||||||
(let ((new-pos (- (the fixnum *end-string-pos*)
|
(let ((new-pos (- (the fixnum *end-string-pos*)
|
||||||
end-string-offset)))
|
end-string-offset)))
|
||||||
(declare (type fixnum new-pos *end-string-pos*))
|
(declare (fixnum new-pos *end-string-pos*))
|
||||||
(cond ((= new-pos pos)
|
(cond ((= new-pos pos)
|
||||||
;; if POS and NEW-POS are equal then the
|
;; if POS and NEW-POS are equal then the
|
||||||
;; two candidates agree so we're fine
|
;; two candidates agree so we're fine
|
||||||
@ -394,15 +393,15 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
;; offset (from the left)
|
;; offset (from the left)
|
||||||
(insert-advance-fn
|
(insert-advance-fn
|
||||||
(advance-fn (pos)
|
(advance-fn (pos)
|
||||||
(declare (type fixnum end-string-offset)
|
(declare (fixnum end-string-offset)
|
||||||
(type function end-string-test))
|
(function end-string-test))
|
||||||
(loop
|
(loop
|
||||||
(unless (setq pos (newline-skipper pos))
|
(unless (setq pos (newline-skipper pos))
|
||||||
;; if we can't find a #\Newline we give up immediately
|
;; if we can't find a #\Newline we give up immediately
|
||||||
(return-from scan nil))
|
(return-from scan nil))
|
||||||
(locally
|
(locally
|
||||||
;; from here we know that POS is a FIXNUM
|
;; from here we know that POS is a FIXNUM
|
||||||
(declare (type fixnum pos))
|
(declare (fixnum pos))
|
||||||
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
||||||
;; if we already found an end string candidate the
|
;; if we already found an end string candidate the
|
||||||
;; position of which matches the place behind the
|
;; position of which matches the place behind the
|
||||||
@ -420,7 +419,7 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
;; according to the end string candidate
|
;; according to the end string candidate
|
||||||
(let ((new-pos (- (the fixnum *end-string-pos*)
|
(let ((new-pos (- (the fixnum *end-string-pos*)
|
||||||
end-string-offset)))
|
end-string-offset)))
|
||||||
(declare (type fixnum new-pos *end-string-pos*))
|
(declare (fixnum new-pos *end-string-pos*))
|
||||||
(cond ((= new-pos pos)
|
(cond ((= new-pos pos)
|
||||||
;; if POS and NEW-POS are equal then the
|
;; if POS and NEW-POS are equal then the
|
||||||
;; the end string candidate agrees with
|
;; the end string candidate agrees with
|
||||||
@ -446,7 +445,7 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
;; information to advance POS
|
;; information to advance POS
|
||||||
(insert-advance-fn
|
(insert-advance-fn
|
||||||
(advance-fn (pos)
|
(advance-fn (pos)
|
||||||
(declare (type function start-string-test end-string-test))
|
(declare (function start-string-test end-string-test))
|
||||||
(unless (setq pos (funcall start-string-test pos))
|
(unless (setq pos (funcall start-string-test pos))
|
||||||
(return-from scan nil))
|
(return-from scan nil))
|
||||||
(if (<= (the fixnum pos)
|
(if (<= (the fixnum pos)
|
||||||
@ -463,7 +462,7 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
;; enough information to advance POS
|
;; enough information to advance POS
|
||||||
(insert-advance-fn
|
(insert-advance-fn
|
||||||
(advance-fn (pos)
|
(advance-fn (pos)
|
||||||
(declare (type function end-string-test))
|
(declare (function end-string-test))
|
||||||
(unless (setq pos (newline-skipper pos))
|
(unless (setq pos (newline-skipper pos))
|
||||||
(return-from scan nil))
|
(return-from scan nil))
|
||||||
(if (<= (the fixnum pos)
|
(if (<= (the fixnum pos)
|
||||||
@ -476,7 +475,7 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
;; just check for constant start string candidate
|
;; just check for constant start string candidate
|
||||||
(insert-advance-fn
|
(insert-advance-fn
|
||||||
(advance-fn (pos)
|
(advance-fn (pos)
|
||||||
(declare (type function start-string-test))
|
(declare (function start-string-test))
|
||||||
(unless (setq pos (funcall start-string-test pos))
|
(unless (setq pos (funcall start-string-test pos))
|
||||||
(return-from scan nil))
|
(return-from scan nil))
|
||||||
pos)))
|
pos)))
|
||||||
@ -492,7 +491,7 @@ actually a closure). Used by CREATE-SCANNER."
|
|||||||
;; advanced beyond the last one
|
;; advanced beyond the last one
|
||||||
(insert-advance-fn
|
(insert-advance-fn
|
||||||
(advance-fn (pos)
|
(advance-fn (pos)
|
||||||
(declare (type function end-string-test))
|
(declare (function end-string-test))
|
||||||
(if (<= (the fixnum pos)
|
(if (<= (the fixnum pos)
|
||||||
(the fixnum *end-string-pos*))
|
(the fixnum *end-string-pos*))
|
||||||
(return-from advance-fn pos))
|
(return-from advance-fn pos))
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.27 2008/07/03 08:13:28 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.40 2008/07/23 02:14:06 edi Exp $
|
||||||
|
|
||||||
;;; globally declared special variables
|
;;; globally declared special variables
|
||||||
|
|
||||||
@ -29,7 +29,7 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
;;; special variables used to effect declarations
|
;;; special variables used to effect declarations
|
||||||
|
|
||||||
@ -51,7 +51,7 @@
|
|||||||
|
|
||||||
(defvar *extended-mode-p* nil
|
(defvar *extended-mode-p* nil
|
||||||
"Whether the parser will start in extended mode.")
|
"Whether the parser will start in extended mode.")
|
||||||
(declaim (type boolean *extended-mode-p*))
|
(declaim (boolean *extended-mode-p*))
|
||||||
|
|
||||||
;;; special variables used by the SCAN function and the matchers
|
;;; special variables used by the SCAN function and the matchers
|
||||||
|
|
||||||
@ -60,16 +60,16 @@
|
|||||||
occur in character classes. Change this value BEFORE creating
|
occur in character classes. Change this value BEFORE creating
|
||||||
scanners if you don't need the \(full) Unicode support of
|
scanners if you don't need the \(full) Unicode support of
|
||||||
implementations like AllegroCL, CLISP, LispWorks, or SBCL.")
|
implementations like AllegroCL, CLISP, LispWorks, or SBCL.")
|
||||||
(declaim (type fixnum *regex-char-code-limit*))
|
(declaim (fixnum *regex-char-code-limit*))
|
||||||
|
|
||||||
(defvar *string* ""
|
(defvar *string* ""
|
||||||
"The string which is currently scanned by SCAN.
|
"The string which is currently scanned by SCAN.
|
||||||
Will always be coerced to a SIMPLE-STRING.")
|
Will always be coerced to a SIMPLE-STRING.")
|
||||||
(declaim (type simple-string *string*))
|
(declaim (simple-string *string*))
|
||||||
|
|
||||||
(defvar *start-pos* 0
|
(defvar *start-pos* 0
|
||||||
"Where to start scanning within *STRING*.")
|
"Where to start scanning within *STRING*.")
|
||||||
(declaim (type fixnum *start-pos*))
|
(declaim (fixnum *start-pos*))
|
||||||
|
|
||||||
(defvar *real-start-pos* nil
|
(defvar *real-start-pos* nil
|
||||||
"The real start of *STRING*. This is for repeated scans and is only used internally.")
|
"The real start of *STRING*. This is for repeated scans and is only used internally.")
|
||||||
@ -77,22 +77,22 @@ Will always be coerced to a SIMPLE-STRING.")
|
|||||||
|
|
||||||
(defvar *end-pos* 0
|
(defvar *end-pos* 0
|
||||||
"Where to stop scanning within *STRING*.")
|
"Where to stop scanning within *STRING*.")
|
||||||
(declaim (type fixnum *end-pos*))
|
(declaim (fixnum *end-pos*))
|
||||||
|
|
||||||
(defvar *reg-starts* (make-array 0)
|
(defvar *reg-starts* (make-array 0)
|
||||||
"An array which holds the start positions
|
"An array which holds the start positions
|
||||||
of the current register candidates.")
|
of the current register candidates.")
|
||||||
(declaim (type simple-vector *reg-starts*))
|
(declaim (simple-vector *reg-starts*))
|
||||||
|
|
||||||
(defvar *regs-maybe-start* (make-array 0)
|
(defvar *regs-maybe-start* (make-array 0)
|
||||||
"An array which holds the next start positions
|
"An array which holds the next start positions
|
||||||
of the current register candidates.")
|
of the current register candidates.")
|
||||||
(declaim (type simple-vector *regs-maybe-start*))
|
(declaim (simple-vector *regs-maybe-start*))
|
||||||
|
|
||||||
(defvar *reg-ends* (make-array 0)
|
(defvar *reg-ends* (make-array 0)
|
||||||
"An array which holds the end positions
|
"An array which holds the end positions
|
||||||
of the current register candidates.")
|
of the current register candidates.")
|
||||||
(declaim (type simple-vector *reg-ends*))
|
(declaim (simple-vector *reg-ends*))
|
||||||
|
|
||||||
(defvar *end-string-pos* nil
|
(defvar *end-string-pos* nil
|
||||||
"Start of the next possible end-string candidate.")
|
"Start of the next possible end-string candidate.")
|
||||||
@ -100,12 +100,12 @@ of the current register candidates.")
|
|||||||
(defvar *rep-num* 0
|
(defvar *rep-num* 0
|
||||||
"Counts the number of \"complicated\" repetitions while the matchers
|
"Counts the number of \"complicated\" repetitions while the matchers
|
||||||
are built.")
|
are built.")
|
||||||
(declaim (type fixnum *rep-num*))
|
(declaim (fixnum *rep-num*))
|
||||||
|
|
||||||
(defvar *zero-length-num* 0
|
(defvar *zero-length-num* 0
|
||||||
"Counts the number of repetitions the inner regexes of which may
|
"Counts the number of repetitions the inner regexes of which may
|
||||||
have zero-length while the matchers are built.")
|
have zero-length while the matchers are built.")
|
||||||
(declaim (type fixnum *zero-length-num*))
|
(declaim (fixnum *zero-length-num*))
|
||||||
|
|
||||||
(defvar *repeat-counters* (make-array 0
|
(defvar *repeat-counters* (make-array 0
|
||||||
:initial-element 0
|
:initial-element 0
|
||||||
@ -118,12 +118,27 @@ repetitive patterns have been tested already.")
|
|||||||
"An array to keep track of the last positions
|
"An array to keep track of the last positions
|
||||||
where we saw repetitive patterns.
|
where we saw repetitive patterns.
|
||||||
Only used for patterns which might have zero length.")
|
Only used for patterns which might have zero length.")
|
||||||
(declaim (type simple-vector *last-pos-stores*))
|
(declaim (simple-vector *last-pos-stores*))
|
||||||
|
|
||||||
(defvar *use-bmh-matchers* t
|
(defvar *use-bmh-matchers* t
|
||||||
"Whether the scanners created by CREATE-SCANNER should use the \(fast
|
"Whether the scanners created by CREATE-SCANNER should use the \(fast
|
||||||
but large) Boyer-Moore-Horspool matchers.")
|
but large) Boyer-Moore-Horspool matchers.")
|
||||||
|
|
||||||
|
(defvar *optimize-char-classes* nil
|
||||||
|
"Whether character classes should be compiled into look-ups into
|
||||||
|
O\(1) data structures. This is usually fast but will be costly in
|
||||||
|
terms of scanner creation time and might be costly in terms of size if
|
||||||
|
*REGEX-CHAR-CODE-LIMIT* is high. This value will be used as the :KIND
|
||||||
|
keyword argument to CREATE-OPTIMIZED-TEST-FUNCTION - see there for the
|
||||||
|
possible non-NIL values.")
|
||||||
|
|
||||||
|
(defvar *property-resolver* nil
|
||||||
|
"Should be NIL or a designator for a function which accepts strings
|
||||||
|
and returns unary character test functions or NIL. This 'resolver' is
|
||||||
|
intended to handle `character properties' like \\p{IsAlpha}. If
|
||||||
|
*PROPERTY-RESOLVER* is NIL, then the parser will simply treat \\p and
|
||||||
|
\\P as #\\p and #\\P as in older versions of CL-PPCRE.")
|
||||||
|
|
||||||
(defvar *allow-quoting* nil
|
(defvar *allow-quoting* nil
|
||||||
"Whether the parser should support Perl's \\Q and \\E.")
|
"Whether the parser should support Perl's \\Q and \\E.")
|
||||||
|
|
||||||
|
|||||||
@ -1,9 +1,7 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/lispworks-defsystem.lisp,v 1.4 2008/06/25 14:04:27 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/packages.lisp,v 1.3 2008/07/22 12:58:52 edi Exp $
|
||||||
|
|
||||||
;;; This system definition for LispWorks was kindly provided by Wade Humeniuk
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
|
||||||
|
|
||||||
;;; Redistribution and use in source and binary forms, with or without
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
;;; modification, are permitted provided that the following conditions
|
;;; modification, are permitted provided that the following conditions
|
||||||
@ -29,29 +27,11 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-user)
|
(in-package :cl-user)
|
||||||
|
|
||||||
(defparameter *cl-ppcre-base-directory*
|
(defpackage :cl-ppcre-test
|
||||||
(make-pathname :name nil :type nil :version nil
|
#+genera (:shadowing-import-from :common-lisp :lambda)
|
||||||
:defaults (parse-namestring *load-truename*)))
|
(:use #-:genera :cl #+:genera :future-common-lisp :cl-ppcre)
|
||||||
|
(:import-from :cl-ppcre :*standard-optimize-settings*
|
||||||
(defsystem cl-ppcre
|
:string-list-to-simple-string)
|
||||||
(:default-pathname *cl-ppcre-base-directory*
|
(:export :run-all-tests :unicode-test))
|
||||||
:default-type :lisp-file)
|
|
||||||
:members ("packages"
|
|
||||||
"specials"
|
|
||||||
"util"
|
|
||||||
"errors"
|
|
||||||
"lexer"
|
|
||||||
"parser"
|
|
||||||
"regex-class"
|
|
||||||
"convert"
|
|
||||||
"optimize"
|
|
||||||
"closures"
|
|
||||||
"repetition-closures"
|
|
||||||
"scanner"
|
|
||||||
"api")
|
|
||||||
:rules ((:in-order-to :compile :all
|
|
||||||
(:requires (:load :previous)))
|
|
||||||
(:in-order-to :load :all
|
|
||||||
(:requires (:load :previous)))))
|
|
||||||
150
test/perl-tests.lisp
Normal file
150
test/perl-tests.lisp
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/perl-tests.lisp,v 1.7 2008/07/22 23:02:04 edi Exp $
|
||||||
|
|
||||||
|
;;; The tests in this file test CL-PPCRE against testdata generated by
|
||||||
|
;;; the Perl program `perltest.pl' from the input file `testinput' in
|
||||||
|
;;; order to check compatibility with Perl and correctness of the
|
||||||
|
;;; regex engine.
|
||||||
|
|
||||||
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;;; modification, are permitted provided that the following conditions
|
||||||
|
;;; are met:
|
||||||
|
|
||||||
|
;;; * Redistributions of source code must retain the above copyright
|
||||||
|
;;; notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
;;; * Redistributions in binary form must reproduce the above
|
||||||
|
;;; copyright notice, this list of conditions and the following
|
||||||
|
;;; disclaimer in the documentation and/or other materials
|
||||||
|
;;; provided with the distribution.
|
||||||
|
|
||||||
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||||
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
(in-package :cl-ppcre-test)
|
||||||
|
|
||||||
|
(defvar *tests-to-skip* '(662 790 1439)
|
||||||
|
"Some tests we skip because the testdata is generated by a Perl
|
||||||
|
program and CL-PPCRE differs from Perl for these tests - on purpose.")
|
||||||
|
|
||||||
|
(defun create-string-from-input (input)
|
||||||
|
"Converts INPUT to a string which can be used in TEST below. The
|
||||||
|
input file `testdata' encodes strings containing non-printable
|
||||||
|
characters as lists where those characters are represented by their
|
||||||
|
character code."
|
||||||
|
(etypecase input
|
||||||
|
((or null string) input)
|
||||||
|
(list (string-list-to-simple-string
|
||||||
|
(loop for element in input
|
||||||
|
if (stringp element)
|
||||||
|
collect element
|
||||||
|
else
|
||||||
|
collect (string (code-char element)))))))
|
||||||
|
|
||||||
|
(defun perl-test (&key (file-name
|
||||||
|
(make-pathname :name "perltestdata"
|
||||||
|
:type nil :version nil
|
||||||
|
:defaults *this-file*)
|
||||||
|
file-name-provided-p)
|
||||||
|
(external-format '(:latin-1 :eol-style :lf))
|
||||||
|
verbose)
|
||||||
|
"Loops through all test cases in FILE-NAME and prints a report if
|
||||||
|
VERBOSE is true. EXTERNAL-FORMAT is the FLEXI-STREAMS external format
|
||||||
|
which is used to read the file. Returns a true value if all tests
|
||||||
|
succeeded.
|
||||||
|
|
||||||
|
For the syntax of the tests in FILE-NAME refer to the source code of
|
||||||
|
this function and to the Perl script perltest.pl which generates such
|
||||||
|
test files."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(with-open-file (binary-stream file-name :element-type 'flex:octet)
|
||||||
|
(let ((stream (flex:make-flexi-stream binary-stream :external-format external-format))
|
||||||
|
;; the standard Perl tests don't need full Unicode support
|
||||||
|
(*regex-char-code-limit* (if file-name-provided-p *regex-char-code-limit* 256))
|
||||||
|
;; we need this for the standard test suite or otherwise we
|
||||||
|
;; might get stack overflows
|
||||||
|
(*optimize-char-classes* (if file-name-provided-p *optimize-char-classes* :charmap))
|
||||||
|
;; we only check for correctness and don't care about speed
|
||||||
|
;; that match (but rather about space constraints of the
|
||||||
|
;; trial versions)
|
||||||
|
(*use-bmh-matchers* (if file-name-provided-p *use-bmh-matchers* nil))
|
||||||
|
;; some tests in the Perl suite explicitly check for this
|
||||||
|
(*allow-quoting* (if file-name-provided-p *allow-quoting* t)))
|
||||||
|
(do-tests ((format nil "Running tests in file ~S" (file-namestring file-name))
|
||||||
|
(not verbose))
|
||||||
|
(let ((input-line (or (read stream nil) (done)))
|
||||||
|
errors)
|
||||||
|
(destructuring-bind (counter
|
||||||
|
info-string%
|
||||||
|
regex%
|
||||||
|
case-insensitive-mode
|
||||||
|
multi-line-mode
|
||||||
|
single-line-mode
|
||||||
|
extended-mode
|
||||||
|
target%
|
||||||
|
perl-error
|
||||||
|
expected-result%
|
||||||
|
expected-registers)
|
||||||
|
input-line
|
||||||
|
(destructuring-bind (info-string regex target expected-result)
|
||||||
|
(mapcar 'create-string-from-input
|
||||||
|
(list info-string% regex% target% expected-result%))
|
||||||
|
(setq expected-registers (mapcar 'create-string-from-input expected-registers))
|
||||||
|
(unless (find counter *tests-to-skip* :test #'=)
|
||||||
|
(when verbose
|
||||||
|
(format t "~&~4D: ~S" counter info-string))
|
||||||
|
(let ((scanner
|
||||||
|
(handler-bind ((error (lambda (condition)
|
||||||
|
(declare (ignore condition))
|
||||||
|
(when perl-error
|
||||||
|
;; we expected an
|
||||||
|
;; error, so we can
|
||||||
|
;; signal success
|
||||||
|
(return-from test-block)))))
|
||||||
|
(create-scanner regex
|
||||||
|
:case-insensitive-mode case-insensitive-mode
|
||||||
|
:multi-line-mode multi-line-mode
|
||||||
|
:single-line-mode single-line-mode
|
||||||
|
:extended-mode extended-mode))))
|
||||||
|
(block test-block
|
||||||
|
(multiple-value-bind (start end reg-starts reg-ends)
|
||||||
|
(scan scanner target)
|
||||||
|
(cond (perl-error
|
||||||
|
(push (format nil "expected an error but got a result.")
|
||||||
|
errors))
|
||||||
|
(t
|
||||||
|
(when (not (eq start expected-result))
|
||||||
|
(if start
|
||||||
|
(let ((result (subseq target start end)))
|
||||||
|
(unless (string= result expected-result)
|
||||||
|
(push (format nil "expected ~S but got ~S."
|
||||||
|
expected-result result)
|
||||||
|
errors))
|
||||||
|
(setq reg-starts (coerce reg-starts 'list)
|
||||||
|
reg-ends (coerce reg-ends 'list))
|
||||||
|
(loop for i from 0
|
||||||
|
for expected-register in expected-registers
|
||||||
|
for reg-start = (nth i reg-starts)
|
||||||
|
for reg-end = (nth i reg-ends)
|
||||||
|
for register = (if (and reg-start reg-end)
|
||||||
|
(subseq target reg-start reg-end)
|
||||||
|
nil)
|
||||||
|
unless (string= expected-register register)
|
||||||
|
do (push (format nil "\\~A: expected ~S but got ~S."
|
||||||
|
(1+ i) expected-register register)
|
||||||
|
errors)))
|
||||||
|
(push (format nil "expected ~S but got ~S."
|
||||||
|
expected-result start)
|
||||||
|
errors))))))
|
||||||
|
errors))))))))))
|
||||||
18
perltest.pl → test/perltest.pl
Executable file → Normal file
18
perltest.pl → test/perltest.pl
Executable file → Normal file
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/perl
|
#!/usr/bin/perl
|
||||||
|
# $Header: /usr/local/cvsrep/cl-ppcre/test/perltest.pl,v 1.1 2008/07/06 21:24:39 edi Exp $
|
||||||
|
|
||||||
# This is a heavily modified version of the file 'perltest' which
|
# This is a heavily modified version of the file 'perltest' which
|
||||||
# comes with the PCRE library package, which is open source software,
|
# comes with the PCRE library package, which is open source software,
|
||||||
@ -8,8 +9,6 @@
|
|||||||
# The PCRE library package is available from
|
# The PCRE library package is available from
|
||||||
# <ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/>
|
# <ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/>
|
||||||
|
|
||||||
use Time::HiRes qw(time);
|
|
||||||
|
|
||||||
sub string_for_lisp {
|
sub string_for_lisp {
|
||||||
my(@a, $t, $in_string, $switch);
|
my(@a, $t, $in_string, $switch);
|
||||||
|
|
||||||
@ -48,8 +47,6 @@ sub string_for_lisp {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$min_time = shift;
|
|
||||||
|
|
||||||
NEXT_RE: while (1) {
|
NEXT_RE: while (1) {
|
||||||
last
|
last
|
||||||
if !($_ = <>);
|
if !($_ = <>);
|
||||||
@ -132,8 +129,6 @@ if (\$x =~ ${pattern}) {
|
|||||||
};
|
};
|
||||||
END
|
END
|
||||||
|
|
||||||
$times = 1;
|
|
||||||
$used = 0;
|
|
||||||
$counter++;
|
$counter++;
|
||||||
print STDERR "$counter\n";
|
print STDERR "$counter\n";
|
||||||
|
|
||||||
@ -141,18 +136,9 @@ END
|
|||||||
$error = 't';
|
$error = 't';
|
||||||
} else {
|
} else {
|
||||||
$error = 'nil';
|
$error = 'nil';
|
||||||
if ($min_time) {
|
|
||||||
$times = 10;
|
|
||||||
while (1) {
|
|
||||||
$used = &$test($times);
|
|
||||||
last
|
|
||||||
if $used > $min_time;
|
|
||||||
$times *= 10;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
print "($counter $info_string \"$pattern_for_lisp\" $case_insensitive_mode $multi_line_mode $single_line_mode $extended_mode " . string_for_lisp($x) . " $error $times $used ";
|
print "($counter $info_string \"$pattern_for_lisp\" $case_insensitive_mode $multi_line_mode $single_line_mode $extended_mode " . string_for_lisp($x) . " $error ";
|
||||||
if (!@subs) {
|
if (!@subs) {
|
||||||
print 'nil nil';
|
print 'nil nil';
|
||||||
} else {
|
} else {
|
||||||
File diff suppressed because one or more lines are too long
349
test/simple
Normal file
349
test/simple
Normal file
@ -0,0 +1,349 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/simple,v 1.9 2008/07/23 00:48:00 edi Exp $
|
||||||
|
|
||||||
|
;;; some simple tests for CL-PPCRE - entered manually and to be read
|
||||||
|
;;; in the CL-PPCRE-TEST package; all forms are expected to return a
|
||||||
|
;;; true value on success when EVALuated
|
||||||
|
|
||||||
|
(equalp (multiple-value-list (scan "(a)*b" "xaaabd"))
|
||||||
|
(list 1 5 #(3) #(4)))
|
||||||
|
|
||||||
|
(equalp (multiple-value-list (scan "(a)*b" "xaaabd" :start 1))
|
||||||
|
(list 1 5 #(3) #(4)))
|
||||||
|
|
||||||
|
(equalp (multiple-value-list (scan "(a)*b" "xaaabd" :start 2))
|
||||||
|
(list 2 5 #(3) #(4)))
|
||||||
|
|
||||||
|
(null (scan "(a)*b" "xaaabd" :end 4))
|
||||||
|
|
||||||
|
(equalp (multiple-value-list (scan '(:greedy-repetition 0 nil #\b) "bbbc"))
|
||||||
|
(list 0 3 #() #()))
|
||||||
|
|
||||||
|
(null (scan '(:greedy-repetition 4 6 #\b) "bbbc"))
|
||||||
|
|
||||||
|
(let ((s (create-scanner "(([a-c])+)x")))
|
||||||
|
(equalp (multiple-value-list (scan s "abcxy"))
|
||||||
|
(list 0 4 #(0 2) #(3 3))))
|
||||||
|
|
||||||
|
(equalp (multiple-value-list (scan-to-strings "[^b]*b" "aaabd"))
|
||||||
|
(list "aaab" #()))
|
||||||
|
|
||||||
|
(equalp (multiple-value-list (scan-to-strings "([^b])*b" "aaabd"))
|
||||||
|
(list "aaab" #("a")))
|
||||||
|
|
||||||
|
(equalp (multiple-value-list (scan-to-strings "(([^b])*)b" "aaabd"))
|
||||||
|
(list "aaab" #("aaa" "a")))
|
||||||
|
|
||||||
|
(equalp (register-groups-bind (first second third fourth)
|
||||||
|
("((a)|(b)|(c))+" "abababc" :sharedp t)
|
||||||
|
(list first second third fourth))
|
||||||
|
(list "c" "a" "b" "c"))
|
||||||
|
|
||||||
|
(equalp (register-groups-bind (nil second third fourth)
|
||||||
|
("((a)|(b)|(c))()+" "abababc" :start 6)
|
||||||
|
(list second third fourth))
|
||||||
|
(list nil nil "c"))
|
||||||
|
|
||||||
|
(null (register-groups-bind (first)
|
||||||
|
("(a|b)+" "accc" :start 1)
|
||||||
|
first))
|
||||||
|
|
||||||
|
(equalp (register-groups-bind (fname lname (#'parse-integer date month year))
|
||||||
|
("(\\w+)\\s+(\\w+)\\s+(\\d{1,2})\\.(\\d{1,2})\\.(\\d{4})" "Frank Zappa 21.12.1940")
|
||||||
|
(list fname lname (encode-universal-time 0 0 0 date month year 0)))
|
||||||
|
(list "Frank" "Zappa" 1292889600))
|
||||||
|
|
||||||
|
(flet ((foo (regex target-string &key (start 0) (end (length target-string)))
|
||||||
|
(let ((sum 0))
|
||||||
|
(do-matches (s e regex target-string nil :start start :end end)
|
||||||
|
(incf sum (- e s)))
|
||||||
|
(/ sum (- end start)))))
|
||||||
|
(and (= 1/3 (foo "a" "abcabcabc"))
|
||||||
|
(= 5/9 (foo "aa|b" "aacabcbbc"))))
|
||||||
|
|
||||||
|
(labels ((crossfoot (target-string &key (start 0) (end (length target-string)))
|
||||||
|
(let ((sum 0))
|
||||||
|
(do-matches-as-strings (m :digit-class
|
||||||
|
target-string nil
|
||||||
|
:start start :end end)
|
||||||
|
(incf sum (parse-integer m)))
|
||||||
|
(if (< sum 10)
|
||||||
|
sum
|
||||||
|
(crossfoot (format nil "~A" sum))))))
|
||||||
|
(and (zerop (crossfoot "bar"))
|
||||||
|
(= 3 (crossfoot "a3x"))
|
||||||
|
(= 6 (crossfoot "12345"))))
|
||||||
|
|
||||||
|
(let (result)
|
||||||
|
(do-register-groups (first second third fourth)
|
||||||
|
("((a)|(b)|(c))" "abababc" nil :start 2 :sharedp t)
|
||||||
|
(push (list first second third fourth) result))
|
||||||
|
(equal (nreverse result)
|
||||||
|
'(("a" "a" nil nil)
|
||||||
|
("b" nil "b" nil)
|
||||||
|
("a" "a" nil nil)
|
||||||
|
("b" nil "b" nil)
|
||||||
|
("c" nil nil "c"))))
|
||||||
|
|
||||||
|
(let (result)
|
||||||
|
(do-register-groups ((#'parse-integer n) (#'intern sign) whitespace)
|
||||||
|
("(\\d+)|(\\+|-|\\*|/)|(\\s+)" "12*15 - 42/3")
|
||||||
|
(unless whitespace
|
||||||
|
(push (or n sign) result)))
|
||||||
|
(equal (nreverse result)
|
||||||
|
'(12 * 15 - 42 / 3)))
|
||||||
|
|
||||||
|
(equal (all-matches "a" "foo bar baz")
|
||||||
|
(list 5 6 9 10))
|
||||||
|
|
||||||
|
(equal (all-matches "\\w*" "foo bar baz")
|
||||||
|
(list 0 3 3 3 4 7 7 7 8 11 11 11))
|
||||||
|
|
||||||
|
(equal (all-matches-as-strings "a" "foo bar baz")
|
||||||
|
(list "a" "a"))
|
||||||
|
|
||||||
|
(equal (all-matches-as-strings "\\w*" "foo bar baz")
|
||||||
|
(list "foo" "" "bar" "" "baz" ""))
|
||||||
|
|
||||||
|
(equal (split "\\s+" "foo bar baz
|
||||||
|
frob")
|
||||||
|
'("foo" "bar" "baz" "frob"))
|
||||||
|
|
||||||
|
(equal (split "\\s*" "foo bar baz")
|
||||||
|
'("f" "o" "o" "b" "a" "r" "b" "a" "z"))
|
||||||
|
|
||||||
|
(equal (split "(\\s+)" "foo bar baz")
|
||||||
|
'("foo" "bar" "baz"))
|
||||||
|
|
||||||
|
(equal (split "(\\s+)" "foo bar baz" :with-registers-p t)
|
||||||
|
'("foo" " " "bar" " " "baz"))
|
||||||
|
|
||||||
|
(equal (split "(\\s)(\\s*)" "foo bar baz" :with-registers-p t)
|
||||||
|
'("foo" " " "" "bar" " " " " "baz"))
|
||||||
|
|
||||||
|
(equal (split "(,)|(;)" "foo,bar;baz" :with-registers-p t)
|
||||||
|
'("foo" "," nil "bar" nil ";" "baz"))
|
||||||
|
|
||||||
|
(equal (split "(,)|(;)" "foo,bar;baz" :with-registers-p t :omit-unmatched-p t)
|
||||||
|
'("foo" "," "bar" ";" "baz"))
|
||||||
|
|
||||||
|
(equal (split ":" "a:b:c:d:e:f:g::")
|
||||||
|
'("a" "b" "c" "d" "e" "f" "g"))
|
||||||
|
|
||||||
|
(equal (split ":" "a:b:c:d:e:f:g::" :limit 1)
|
||||||
|
'("a:b:c:d:e:f:g::"))
|
||||||
|
|
||||||
|
(equal (split ":" "a:b:c:d:e:f:g::" :limit 2)
|
||||||
|
'("a" "b:c:d:e:f:g::"))
|
||||||
|
|
||||||
|
(equal (split ":" "a:b:c:d:e:f:g::" :limit 3)
|
||||||
|
'("a" "b" "c:d:e:f:g::"))
|
||||||
|
|
||||||
|
(equal (split ":" "a:b:c:d:e:f:g::" :limit 1000)
|
||||||
|
'("a" "b" "c" "d" "e" "f" "g" "" ""))
|
||||||
|
|
||||||
|
(equal (multiple-value-list (regex-replace "fo+" "foo bar" "frob"))
|
||||||
|
(list "frob bar" t))
|
||||||
|
|
||||||
|
(equal (multiple-value-list (regex-replace "fo+" "FOO bar" "frob"))
|
||||||
|
(list "FOO bar" nil))
|
||||||
|
|
||||||
|
(equal (multiple-value-list (regex-replace "(?i)fo+" "FOO bar" "frob"))
|
||||||
|
(list "frob bar" t))
|
||||||
|
|
||||||
|
(equal (multiple-value-list (regex-replace "(?i)fo+" "FOO bar" "frob" :preserve-case t))
|
||||||
|
(list "FROB bar" t))
|
||||||
|
|
||||||
|
(equal (multiple-value-list (regex-replace "(?i)fo+" "Foo bar" "frob" :preserve-case t))
|
||||||
|
(list "Frob bar" t))
|
||||||
|
|
||||||
|
(equal (multiple-value-list (regex-replace "bar" "foo bar baz" "[frob (was '\\&' between '\\`' and '\\'')]"))
|
||||||
|
(list "foo [frob (was 'bar' between 'foo ' and ' baz')] baz" t))
|
||||||
|
|
||||||
|
(equal (multiple-value-list
|
||||||
|
(regex-replace "bar" "foo bar baz"
|
||||||
|
'("[frob (was '" :match "' between '" :before-match "' and '" :after-match "')]")))
|
||||||
|
(list "foo [frob (was 'bar' between 'foo ' and ' baz')] baz" t))
|
||||||
|
|
||||||
|
(equal (multiple-value-list (regex-replace "(be)(nev)(o)(lent)"
|
||||||
|
"benevolent: adj. generous, kind"
|
||||||
|
(lambda (match &rest registers)
|
||||||
|
(format nil "~A [~{~A~^.~}]" match registers))
|
||||||
|
:simple-calls t))
|
||||||
|
(list "benevolent [be.nev.o.lent]: adj. generous, kind" t))
|
||||||
|
|
||||||
|
(equal (multiple-value-list (regex-replace-all "(?i)fo+" "foo Fooo FOOOO bar" "frob" :preserve-case t))
|
||||||
|
(list "frob Frob FROB bar" t))
|
||||||
|
|
||||||
|
(string= (regex-replace-all "(?i)f(o+)" "foo Fooo FOOOO bar" "fr\\1b" :preserve-case t)
|
||||||
|
"froob Frooob FROOOOB bar")
|
||||||
|
|
||||||
|
(let ((qp-regex (create-scanner "[\\x80-\\xff]")))
|
||||||
|
(flet ((encode-quoted-printable (string)
|
||||||
|
"Converts 8-bit string to quoted-printable representation."
|
||||||
|
;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
|
||||||
|
(flet ((convert (target-string start end match-start match-end reg-starts reg-ends)
|
||||||
|
(declare (ignore start end match-end reg-starts reg-ends))
|
||||||
|
(format nil "=~2,'0x" (char-code (char target-string match-start)))))
|
||||||
|
(regex-replace-all qp-regex string #'convert))))
|
||||||
|
(string= (encode-quoted-printable "F<>te S<>rensen na<6E>ve H<>hner Stra<72>e")
|
||||||
|
"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe")))
|
||||||
|
|
||||||
|
(let ((url-regex (create-scanner "[^a-zA-Z0-9_\\-.]")))
|
||||||
|
(flet ((url-encode (string)
|
||||||
|
"URL-encodes a string."
|
||||||
|
;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
|
||||||
|
(flet ((convert (target-string start end match-start match-end reg-starts reg-ends)
|
||||||
|
(declare (ignore start end match-end reg-starts reg-ends))
|
||||||
|
(format nil "%~2,'0x" (char-code (char target-string match-start)))))
|
||||||
|
(regex-replace-all url-regex string #'convert))))
|
||||||
|
(string= (url-encode "F<>te S<>rensen na<6E>ve H<>hner Stra<72>e")
|
||||||
|
"F%EAte%20S%F8rensen%20na%EFve%20H%FChner%20Stra%DFe")))
|
||||||
|
|
||||||
|
(flet ((how-many (target-string start end match-start match-end reg-starts reg-ends)
|
||||||
|
(declare (ignore target-string start end match-start match-end))
|
||||||
|
(format nil "~A" (- (svref reg-ends 0)
|
||||||
|
(svref reg-starts 0)))))
|
||||||
|
(string= (regex-replace-all "{(.+?)}"
|
||||||
|
"foo{...}bar{.....}{..}baz{....}frob"
|
||||||
|
(list "[" #'how-many " dots]"))
|
||||||
|
"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"))
|
||||||
|
|
||||||
|
(let ((qp-regex (create-scanner "[\\x80-\\xff]")))
|
||||||
|
(flet ((encode-quoted-printable (string)
|
||||||
|
"Converts 8-bit string to quoted-printable representation.
|
||||||
|
Version using SIMPLE-CALLS keyword argument."
|
||||||
|
;; ;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
|
||||||
|
(flet ((convert (match)
|
||||||
|
(format nil "=~2,'0x" (char-code (char match 0)))))
|
||||||
|
(regex-replace-all qp-regex string #'convert
|
||||||
|
:simple-calls t))))
|
||||||
|
(string= (encode-quoted-printable "F<>te S<>rensen na<6E>ve H<>hner Stra<72>e")
|
||||||
|
"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe")))
|
||||||
|
|
||||||
|
(flet ((how-many (match first-register)
|
||||||
|
(declare (ignore match))
|
||||||
|
(format nil "~A" (length first-register))))
|
||||||
|
(string= (regex-replace-all "{(.+?)}"
|
||||||
|
"foo{...}bar{.....}{..}baz{....}frob"
|
||||||
|
(list "[" #'how-many " dots]")
|
||||||
|
:simple-calls t)
|
||||||
|
"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"))
|
||||||
|
|
||||||
|
(flet ((my-repetition (char min)
|
||||||
|
`(:greedy-repetition ,min nil ,char)))
|
||||||
|
(setf (parse-tree-synonym 'a*) (my-repetition #\a 0)
|
||||||
|
(parse-tree-synonym 'b+) (my-repetition #\b 1))
|
||||||
|
(unwind-protect
|
||||||
|
(let ((scanner (create-scanner '(:sequence a* b+))))
|
||||||
|
(equal (mapcar (lambda (target)
|
||||||
|
(scan scanner target))
|
||||||
|
'("ab" "b" "aab" "a" "x"))
|
||||||
|
(list 0 0 0 nil nil)))
|
||||||
|
(setf (parse-tree-synonym 'a*) nil
|
||||||
|
(parse-tree-synonym 'b+) nil)))
|
||||||
|
|
||||||
|
(null (scan "^a+$" "a+"))
|
||||||
|
|
||||||
|
(let ((*allow-quoting* t))
|
||||||
|
;;we use CREATE-SCANNER because of Lisps like SBCL that don't have an interpreter
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "^\\Qa+\\E$") "a+"))
|
||||||
|
(list 0 2 #() #())))
|
||||||
|
|
||||||
|
(string= (parse-string "\\k<reg>") "k<reg>")
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
(equal (nth-value 1 (create-scanner "((?<small>[a-z]*)(?<big>[A-Z]*))"))
|
||||||
|
(list nil "small" "big")))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
(equal (nth-value 1 (create-scanner '(:register
|
||||||
|
(:sequence
|
||||||
|
(:named-register "small"
|
||||||
|
(:greedy-repetition 0 nil (:char-class (:range #\a #\z))))
|
||||||
|
(:named-register "big"
|
||||||
|
(:greedy-repetition 0 nil (:char-class (:range #\a #\z))))))))
|
||||||
|
(list nil "small" "big")))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "((?<small>[a-z]*)(?<big>[A-Z]*))") "aaaBBB"))
|
||||||
|
(list 0 6 #(0 0 3) #(6 3 6))))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
;; multiple-choice back-reference
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>[ab])(?<reg>[12])\\k<reg>\\k<reg>$") "a1aa"))
|
||||||
|
(list 0 4 #(0 1) #(1 2))))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>[ab])(?<reg>[12])\\k<reg>\\k<reg>$") "a22a"))
|
||||||
|
(list 0 4 #(0 1) #(1 2))))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
;; demonstrating most-recently-seen-register-first property of back-reference;
|
||||||
|
;; "greedy" regex (analogous to "aa?")
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>)(?<reg>a)(\\k<reg>)") "a"))
|
||||||
|
(list 0 1 #(0 0 1) #(0 1 1))))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>)(?<reg>a)(\\k<reg>)") "aa"))
|
||||||
|
(list 0 2 #(0 0 1) #(0 1 2))))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
;; switched groups
|
||||||
|
;; "lazy" regex (analogous to "aa??")
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>a)(?<reg>)(\\k<reg>)") "a"))
|
||||||
|
(list 0 1 #(0 1 1) #(1 1 1))))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
;; scanner ignores the second "a"
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>a)(?<reg>)(\\k<reg>)") "aa"))
|
||||||
|
(list 0 1 #(0 1 1) #(1 1 1))))
|
||||||
|
|
||||||
|
(let ((*allow-named-registers* t))
|
||||||
|
;; "aa" will be matched only when forced by adding "$" at the end
|
||||||
|
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>a)(?<reg>)(\\k<reg>)$") "aa"))
|
||||||
|
(list 0 2 #(0 1 1) #(1 1 2))))
|
||||||
|
|
||||||
|
(string= (quote-meta-chars "[a-z]*") "\\[a\\-z\\]\\*")
|
||||||
|
|
||||||
|
(string= (handler-case
|
||||||
|
(create-scanner "foo**x")
|
||||||
|
(ppcre-syntax-error (condition)
|
||||||
|
(format nil "Houston, we've got a problem with the string ~S: Looks like something went wrong at position ~A. The last message we received was \"~?\"."
|
||||||
|
(ppcre-syntax-error-string condition)
|
||||||
|
(ppcre-syntax-error-pos condition)
|
||||||
|
(simple-condition-format-control condition)
|
||||||
|
(simple-condition-format-arguments condition))))
|
||||||
|
"Houston, we've got a problem with the string \"foo**x\": Looks like something went wrong at position 4. The last message we received was \"Quantifier '*' not allowed.\".")
|
||||||
|
|
||||||
|
(flet ((my-weird-filter (pos)
|
||||||
|
"Only match at this point if either pos is odd and the
|
||||||
|
character we're looking at is lowercase or if pos is even and the next
|
||||||
|
two characters we're looking at are uppercase. Consume these
|
||||||
|
characters if there's a match."
|
||||||
|
(cond ((and (oddp pos)
|
||||||
|
(< pos cl-ppcre::*end-pos*)
|
||||||
|
(lower-case-p (char cl-ppcre::*string* pos)))
|
||||||
|
(1+ pos))
|
||||||
|
((and (evenp pos)
|
||||||
|
(< (1+ pos) cl-ppcre::*end-pos*)
|
||||||
|
(upper-case-p (char cl-ppcre::*string* pos))
|
||||||
|
(upper-case-p (char cl-ppcre::*string* (1+ pos))))
|
||||||
|
(+ pos 2))
|
||||||
|
(t nil))))
|
||||||
|
(let ((weird-regex `(:sequence "+" (:filter ,#'my-weird-filter) "+")))
|
||||||
|
(equalp (multiple-value-list (scan weird-regex "+A++a+AA+"))
|
||||||
|
(list 5 9 #() #()))))
|
||||||
|
|
||||||
|
(let ((a "\\E*"))
|
||||||
|
(equalp (multiple-value-list (scan (concatenate 'string "(?:" (quote-meta-chars a) "){2}") "\\E*\\E*"))
|
||||||
|
(list 0 6 #() #())))
|
||||||
|
|
||||||
|
(let ((a "\\E*"))
|
||||||
|
(equalp (multiple-value-list (scan `(:greedy-repetition 2 2 ,a) "\\E*\\E*"))
|
||||||
|
(list 0 6 #() #())))
|
||||||
|
|
||||||
|
(loop for *optimize-char-classes* in '(:hash-table :hash-table* :charset :charset* :charmap)
|
||||||
|
for s = (create-scanner "(([a-c])+)x")
|
||||||
|
always (equalp (multiple-value-list (scan s "abcxy"))
|
||||||
|
(list 0 4 #(0 2) #(3 3))))
|
||||||
159
test/tests.lisp
Normal file
159
test/tests.lisp
Normal file
@ -0,0 +1,159 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/tests.lisp,v 1.12 2008/07/23 00:48:00 edi Exp $
|
||||||
|
|
||||||
|
;;; The tests in this file test CL-PPCRE against testdata generated by
|
||||||
|
;;; the Perl program `perltest.pl' from the input file `testinput' in
|
||||||
|
;;; order to check compatibility with Perl and correctness of the
|
||||||
|
;;; regex engine.
|
||||||
|
|
||||||
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;;; modification, are permitted provided that the following conditions
|
||||||
|
;;; are met:
|
||||||
|
|
||||||
|
;;; * Redistributions of source code must retain the above copyright
|
||||||
|
;;; notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
;;; * Redistributions in binary form must reproduce the above
|
||||||
|
;;; copyright notice, this list of conditions and the following
|
||||||
|
;;; disclaimer in the documentation and/or other materials
|
||||||
|
;;; provided with the distribution.
|
||||||
|
|
||||||
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||||
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
(in-package :cl-ppcre-test)
|
||||||
|
|
||||||
|
(defvar *this-file* (load-time-value
|
||||||
|
(or #.*compile-file-pathname* *load-pathname*))
|
||||||
|
"The location of this source file.")
|
||||||
|
|
||||||
|
(defmacro do-tests ((name &optional show-progress-p) &body body)
|
||||||
|
"Helper macro which repeatedly executes BODY until the code in body
|
||||||
|
calls the function DONE. It is assumed that each invocation of BODY
|
||||||
|
will be the execution of one test which returns NIL in case of success
|
||||||
|
and list of string describing errors otherwise.
|
||||||
|
|
||||||
|
The macro prints a simple progress indicator \(one dots for ten tests)
|
||||||
|
to *STANDARD-OUTPUT* unless SHOW-PROGRESS-P is NIL and returns a true
|
||||||
|
value iff all tests succeeded. Errors in BODY are caught and reported
|
||||||
|
\(and counted as failures)."
|
||||||
|
`(let ((successp t)
|
||||||
|
(testcount 1))
|
||||||
|
(block test-block
|
||||||
|
(flet ((done ()
|
||||||
|
(return-from test-block successp)))
|
||||||
|
(format t "~&Test: ~A~%" ,name)
|
||||||
|
(loop
|
||||||
|
(when (and ,show-progress-p (zerop (mod testcount 10)))
|
||||||
|
(format t ".")
|
||||||
|
(when (zerop (mod testcount 100))
|
||||||
|
(terpri))
|
||||||
|
(force-output))
|
||||||
|
(let ((errors
|
||||||
|
(handler-case
|
||||||
|
(progn ,@body)
|
||||||
|
(error (msg)
|
||||||
|
(list (format nil "~&got an unexpected error: ~A" msg))))))
|
||||||
|
(setq successp (and successp (null errors)))
|
||||||
|
(when errors
|
||||||
|
(format t "~&~4@A:~{~& ~A~}~%" testcount errors))
|
||||||
|
(incf testcount)))))
|
||||||
|
successp))
|
||||||
|
|
||||||
|
(defun simple-tests (&key (file-name
|
||||||
|
(make-pathname :name "simple"
|
||||||
|
:type nil :version nil
|
||||||
|
:defaults *this-file*))
|
||||||
|
(external-format '(:latin-1 :eol-style :lf))
|
||||||
|
verbose)
|
||||||
|
"Loops through all the forms in the file FILE-NAME and executes each
|
||||||
|
of them using EVAL. It is assumed that each FORM specifies a test
|
||||||
|
which returns a true value iff it succeeds. Prints each test form to
|
||||||
|
*STANDARD-OUTPUT* if VERBOSE is true and shows a simple progress
|
||||||
|
indicator otherwise. EXTERNAL-FORMAT is the FLEXI-STREAMS external
|
||||||
|
format which is used to read the file. Returns a true value iff all
|
||||||
|
tests succeeded."
|
||||||
|
(with-open-file (binary-stream file-name :element-type 'flex:octet)
|
||||||
|
(let ((stream (flex:make-flexi-stream binary-stream :external-format external-format))
|
||||||
|
(*package* (find-package :cl-ppcre-test)))
|
||||||
|
(do-tests ((format nil "Simple tests from file ~S" (file-namestring file-name))
|
||||||
|
(not verbose))
|
||||||
|
(let ((form (or (read stream nil) (done))))
|
||||||
|
(when verbose
|
||||||
|
(format t "~&~S" form))
|
||||||
|
(cond ((eval form) nil)
|
||||||
|
(t (list (format nil "~S returned NIL" form)))))))))
|
||||||
|
|
||||||
|
(defun random-test-function (probability)
|
||||||
|
"Returns a random character test function which contains each
|
||||||
|
character with probability PROBABILITY."
|
||||||
|
(let ((hash-table (make-hash-table)))
|
||||||
|
(dotimes (code char-code-limit)
|
||||||
|
(let ((char (code-char code)))
|
||||||
|
(when (and char (< (random 1.0d0) probability))
|
||||||
|
(setf (gethash (code-char code) hash-table) t))))
|
||||||
|
(lambda (char)
|
||||||
|
(gethash char hash-table))))
|
||||||
|
|
||||||
|
(defun test-optimized-test-functions% (probability)
|
||||||
|
"Creates a random test function with probability PROBABILITY and six
|
||||||
|
\(one for each possible \"kind\") corresponding optimized test
|
||||||
|
functions, then checks for each character in turn that all functions
|
||||||
|
agree on it."
|
||||||
|
(let* ((test-function (random-test-function probability))
|
||||||
|
(optimized-functions (loop for kind in '(nil
|
||||||
|
:hash-table
|
||||||
|
:hash-table*
|
||||||
|
:charset
|
||||||
|
:charset*
|
||||||
|
:charmap)
|
||||||
|
collect (create-optimized-test-function test-function :kind kind))))
|
||||||
|
(loop for code below char-code-limit
|
||||||
|
for char = (code-char code)
|
||||||
|
for expected-result = (and char (funcall test-function char))
|
||||||
|
always (or (null char)
|
||||||
|
(loop for optimized-function in optimized-functions
|
||||||
|
always (eq (not (funcall optimized-function char))
|
||||||
|
(not expected-result)))))))
|
||||||
|
|
||||||
|
(defun test-optimized-test-functions (&key verbose)
|
||||||
|
"Runs TEST-OPTIMIZED-TEST-FUNCTIONS% with different probabilities."
|
||||||
|
(let ((probabilities '(0 .001 .01 .1 1)))
|
||||||
|
(do-tests ("Optimized test functions - this might take some time..." (not verbose))
|
||||||
|
(let ((probability (or (pop probabilities) (done))))
|
||||||
|
(when verbose
|
||||||
|
(format t "~&Probability is ~A" probability))
|
||||||
|
(not (test-optimized-test-functions% probability))))))
|
||||||
|
|
||||||
|
(defun run-all-tests (&key more-tests verbose)
|
||||||
|
"Runs all tests for CL-PPCRE and returns a true value iff all tests
|
||||||
|
succeeded. VERBOSE is interpreted by the individual test suites.
|
||||||
|
MORE-TESTS can be a list of function designators designating
|
||||||
|
additional tests to run. This facility is used by the tests for
|
||||||
|
CL-PPCRE-UNICODE."
|
||||||
|
(let ((successp t))
|
||||||
|
(macrolet ((run-test-suite (&body body)
|
||||||
|
`(unless (progn ,@body)
|
||||||
|
(setq successp nil))))
|
||||||
|
;; run the automatically generated Perl tests
|
||||||
|
(run-test-suite (perl-test :verbose verbose))
|
||||||
|
(run-test-suite (test-optimized-test-functions :verbose verbose))
|
||||||
|
(run-test-suite (simple-tests :verbose verbose))
|
||||||
|
(when more-tests
|
||||||
|
(unless (listp more-tests)
|
||||||
|
(setq more-tests (list more-tests))
|
||||||
|
(dolist (test more-tests)
|
||||||
|
(run-test-suite (funcall test :verbose verbose))))))
|
||||||
|
(format t "~2&~:[Some tests failed~;All tests passed~]." successp)
|
||||||
|
successp))
|
||||||
80
test/unicode-tests.lisp
Normal file
80
test/unicode-tests.lisp
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/unicode-tests.lisp,v 1.8 2008/07/23 00:17:53 edi Exp $
|
||||||
|
|
||||||
|
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||||
|
|
||||||
|
;;; Redistribution and use in source and binary forms, with or without
|
||||||
|
;;; modification, are permitted provided that the following conditions
|
||||||
|
;;; are met:
|
||||||
|
|
||||||
|
;;; * Redistributions of source code must retain the above copyright
|
||||||
|
;;; notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
;;; * Redistributions in binary form must reproduce the above
|
||||||
|
;;; copyright notice, this list of conditions and the following
|
||||||
|
;;; disclaimer in the documentation and/or other materials
|
||||||
|
;;; provided with the distribution.
|
||||||
|
|
||||||
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||||
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
(in-package :cl-ppcre-test)
|
||||||
|
|
||||||
|
(defun unicode-test (&key (file-name
|
||||||
|
(make-pathname :name "unicodetestdata"
|
||||||
|
:type nil :version nil
|
||||||
|
:defaults *this-file*)
|
||||||
|
file-name-provided-p)
|
||||||
|
verbose)
|
||||||
|
"Loops through all test cases in FILE-NAME and prints a report if
|
||||||
|
VERBOSE is true. Returns a true value if all tests succeeded.
|
||||||
|
|
||||||
|
For the syntax of the tests in FILE-NAME refer to CL-UNICODE."
|
||||||
|
(with-open-file (stream file-name)
|
||||||
|
(let ((*regex-char-code-limit* (if file-name-provided-p *regex-char-code-limit* char-code-limit))
|
||||||
|
(*optimize-char-classes* (if file-name-provided-p *optimize-char-classes* nil))
|
||||||
|
;; we only check for correctness and don't care about speed
|
||||||
|
;; that match (but rather about space constraints of the
|
||||||
|
;; trial versions)
|
||||||
|
(*use-bmh-matchers* (if file-name-provided-p *use-bmh-matchers* nil)))
|
||||||
|
(do-tests ((format nil "Running Unicode tests in file ~S" (file-namestring file-name))
|
||||||
|
(not verbose))
|
||||||
|
(let ((input-line (or (read stream nil) (done)))
|
||||||
|
errors)
|
||||||
|
(destructuring-bind (char-code property-name expected-result)
|
||||||
|
input-line
|
||||||
|
(let ((char (and (< char-code char-code-limit) (code-char char-code))))
|
||||||
|
(when char
|
||||||
|
(when verbose
|
||||||
|
(format t "~&~A: #x~X" property-name char-code))
|
||||||
|
(let* ((string (string char))
|
||||||
|
(result-1 (scan (format nil "\\p{~A}" property-name) string))
|
||||||
|
(result-2 (scan (format nil "[\\p{~A}]" property-name) string))
|
||||||
|
(inverted-result-1 (scan (format nil "\\P{~A}" property-name) string))
|
||||||
|
(inverted-result-2 (scan (format nil "[\\P{~A}]" property-name) string)))
|
||||||
|
(unless (eq expected-result (not (not result-1)))
|
||||||
|
(push (format nil "\(code-char #x~X) should ~:[not ~;~]have matched \"\\p{~A}\""
|
||||||
|
char-code expected-result property-name)
|
||||||
|
errors))
|
||||||
|
(unless (eq expected-result (not (not result-2)))
|
||||||
|
(push (format nil "\(code-char #x~X) should ~:[not ~;~]have matched \"[\\p{~A}]\""
|
||||||
|
char-code expected-result property-name)
|
||||||
|
errors))
|
||||||
|
(unless (eq expected-result (not inverted-result-1))
|
||||||
|
(push (format nil "\(code-char #x~X) should ~:[~;not ~]have matched \"\\P{~A}\""
|
||||||
|
char-code expected-result property-name)
|
||||||
|
errors))
|
||||||
|
(unless (eq expected-result (not inverted-result-2))
|
||||||
|
(push (format nil "\(code-char #x~X) should ~:[~;not ~]have matched \"[\\P{~A}]\""
|
||||||
|
char-code expected-result property-name)
|
||||||
|
errors)))
|
||||||
|
errors))))))))
|
||||||
107
test/unicodetestdata
Normal file
107
test/unicodetestdata
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||||
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/unicodetestdata,v 1.10 2008/07/22 14:00:35 edi Exp $
|
||||||
|
|
||||||
|
;;; some arbitrary test data for Unicode properties - stolen from CL-UNICODE
|
||||||
|
|
||||||
|
(#x0001 "ASCII" t)
|
||||||
|
(#x0100 "ASCII" nil)
|
||||||
|
(#x000A "Alphabetic" nil)
|
||||||
|
(#x0061 "Alphabetic" t)
|
||||||
|
(#x0061 "Ll" t)
|
||||||
|
(#x0041 "Alphabetic" t)
|
||||||
|
(#x0041 "alphabetic" t)
|
||||||
|
(#x0041 "IsAlphabetic" t)
|
||||||
|
(#x02E4 "Alphabetic" t)
|
||||||
|
(#x0970 "Alphabetic" nil)
|
||||||
|
(#x030D "BidiClass:NonspacingMark" t)
|
||||||
|
(#x030D "NonspacingMark" t)
|
||||||
|
(#x030D "nonspacing mark" t)
|
||||||
|
(#xE0146 "BidiClass:NonspacingMark" t)
|
||||||
|
(#x000D "BidiClass:WhiteSpace" nil)
|
||||||
|
(#x0020 "BidiClass:WhiteSpace" t)
|
||||||
|
(#x2006 "BidiClass:WhiteSpace" t)
|
||||||
|
(#x12470 "Cuneiform" t)
|
||||||
|
(#x12470 "IsCuneiform" t)
|
||||||
|
(#x12470 "CuneiformNumbersAndPunctuation" t)
|
||||||
|
(#x12470 "Block:CuneiformNumbersAndPunctuation" t)
|
||||||
|
(#x12470 "InCuneiformNumbersAndPunctuation" t)
|
||||||
|
(#x12470 "Script:Cuneiform" t)
|
||||||
|
(#x0041 "Block:Hebrew" nil)
|
||||||
|
(#x0593 "Block:Hebrew" t)
|
||||||
|
(#x0593 "InHebrew" t)
|
||||||
|
(#x040D "Block:Cyrillic" t)
|
||||||
|
(#x040D "InCyrillic" t)
|
||||||
|
(#x0042 "Block:Cyrillic" nil)
|
||||||
|
(#x2011 "Dash" t)
|
||||||
|
(#x2011 "IsDash" t)
|
||||||
|
(#xFF0D "Dash" t)
|
||||||
|
(#x003D "Dash" nil)
|
||||||
|
(#x00F0 "Lowercase" t)
|
||||||
|
(#x00F0 "IsLowercase" t)
|
||||||
|
(#x00F0 "lowercase" t)
|
||||||
|
(#x00F0 "Ll" t)
|
||||||
|
(#x0067 "Lowercase" t)
|
||||||
|
(#x010A "Lowercase" nil)
|
||||||
|
(#x1D6C1 "Lowercase" nil)
|
||||||
|
(#x0023 "CurrencySymbol" nil)
|
||||||
|
(#x0024 "CurrencySymbol" t)
|
||||||
|
(#x0024 "IsCurrencySymbol" t)
|
||||||
|
(#x0024 "currency symbol" t)
|
||||||
|
(#x20AC "CurrencySymbol" t)
|
||||||
|
(#xFFE6 "CurrencySymbol" t)
|
||||||
|
(#x002B "Sm" t)
|
||||||
|
(#x002B "Math" t)
|
||||||
|
(#x002B "IsMath" t)
|
||||||
|
(#x002B "math" t)
|
||||||
|
(#x211C "Math" t)
|
||||||
|
(#x1D7D2 "Math" t)
|
||||||
|
(#x002A "Math" nil)
|
||||||
|
(#x25C9 "Math" nil)
|
||||||
|
(#x0000 "NonCharacterCodePoint" nil)
|
||||||
|
(#xFDD0 "NonCharacterCodePoint" t)
|
||||||
|
(#xFDD0 "Non-Character-Code-Point" t)
|
||||||
|
(#xFDD0 "non-character-code-point" t)
|
||||||
|
(#xFFFFF "NonCharacterCodePoint" t)
|
||||||
|
(#x0043 "Arabic" nil)
|
||||||
|
(#x0606 "Arabic" t)
|
||||||
|
(#x0606 "arabic" t)
|
||||||
|
(#x0606 "IsArabic" t)
|
||||||
|
(#x0606 "Script:Arabic" t)
|
||||||
|
(#x0044 "IsVariationSelector" nil)
|
||||||
|
(#x0044 "VariationSelector" nil)
|
||||||
|
(#x180B "VariationSelector" t)
|
||||||
|
(#x180B "Variation_Selector" t)
|
||||||
|
(#x180B "Variation-Selector" t)
|
||||||
|
(#x180B "variationselector" t)
|
||||||
|
(#x180B "variation selector" t)
|
||||||
|
(#x180B "IsVariationSelector" t)
|
||||||
|
(#x00B5 "XIDContinue" t)
|
||||||
|
(#x00B5 "IsXIDContinue" t)
|
||||||
|
(#x00B5 "IsXID_Continue" t)
|
||||||
|
(#x00B5 "Is_XID_Continue" t)
|
||||||
|
(#x00B5 "XID_Continue" t)
|
||||||
|
(#x33FF "Unified_Ideograph" nil)
|
||||||
|
(#x33FF "Ideographic" nil)
|
||||||
|
(#x3400 "Unified_Ideograph" t)
|
||||||
|
(#x3400 "Ideographic" t)
|
||||||
|
(#x3400 "Han" t)
|
||||||
|
(#x3400 "OtherLetter" t)
|
||||||
|
(#x3400 "Alphabetic" t)
|
||||||
|
(#x3400 "Common" nil)
|
||||||
|
(#x3400 "Assigned" t)
|
||||||
|
(#x3400 "Any" t)
|
||||||
|
(#x0378 "Cn" t)
|
||||||
|
(#x0378 "Unassigned" t)
|
||||||
|
(#x0377 "Cn" nil)
|
||||||
|
(#x0377 "Unassigned" nil)
|
||||||
|
(#x2800 "Braille" t)
|
||||||
|
(#x2800 "Script:Braille" t)
|
||||||
|
(#x2800 "OtherSymbol" t)
|
||||||
|
(#x0027 "QuotationMark" t)
|
||||||
|
(#x201C "QuotationMark" t)
|
||||||
|
(#x201C "OtherNeutral" t)
|
||||||
|
(#x201C "PatternSyntax" t)
|
||||||
|
(#x0028 "Bidi_Mirrored" t)
|
||||||
|
(#x0028 "BidiMirrored" t)
|
||||||
|
(#x0028 "IsBidiMirrored" t)
|
||||||
|
(#x0027 "Bidi_Mirrored" nil)
|
||||||
178
util.lisp
178
util.lisp
@ -1,5 +1,5 @@
|
|||||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.40 2008/07/03 10:06:16 edi Exp $
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.46 2008/07/06 18:12:05 edi Exp $
|
||||||
|
|
||||||
;;; Utility functions and constants dealing with the character sets we
|
;;; Utility functions and constants dealing with the character sets we
|
||||||
;;; use to encode character classes
|
;;; use to encode character classes
|
||||||
@ -30,7 +30,12 @@
|
|||||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
(in-package #:cl-ppcre)
|
(in-package :cl-ppcre)
|
||||||
|
|
||||||
|
(defmacro defconstant (name value &optional doc)
|
||||||
|
"Make sure VALUE is evaluated only once \(to appease SBCL)."
|
||||||
|
`(cl:defconstant ,name (if (boundp ',name) (symbol-value ',name) ,value)
|
||||||
|
,@(when doc (list doc))))
|
||||||
|
|
||||||
#+:lispworks
|
#+:lispworks
|
||||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||||
@ -100,120 +105,36 @@ are discarded \(that is, the body is an implicit PROGN)."
|
|||||||
`(let (,,@temps)
|
`(let (,,@temps)
|
||||||
,,@body))))))
|
,,@body))))))
|
||||||
|
|
||||||
(eval-when (:compile-toplevel :execute :load-toplevel)
|
(declaim (inline digit-char-p))
|
||||||
(defun make-char-set (test)
|
(defun digit-char-p (chr)
|
||||||
(declare #.*special-optimize-settings*)
|
|
||||||
"Returns a CHARSET for all characters satisfying test."
|
|
||||||
(loop with set = (make-charset)
|
|
||||||
for code of-type fixnum from 0 below char-code-limit
|
|
||||||
for char = (code-char code)
|
|
||||||
if (and char (funcall test char))
|
|
||||||
do (add-to-charset char set)
|
|
||||||
finally (return set)))
|
|
||||||
|
|
||||||
(declaim (inline word-char-p))
|
|
||||||
|
|
||||||
(defun word-char-p (chr)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Tests whether a character is a \"word\" character.
|
|
||||||
In the ASCII charset this is equivalent to a-z, A-Z, 0-9, or _,
|
|
||||||
i.e. the same as Perl's [\\w]."
|
|
||||||
(or (alphanumericp chr)
|
|
||||||
(char= chr #\_)))
|
|
||||||
|
|
||||||
(unless (boundp '+whitespace-char-string+)
|
|
||||||
(defconstant +whitespace-char-string+
|
|
||||||
(coerce
|
|
||||||
'(#\Space #\Tab #\Linefeed #\Return #\Page)
|
|
||||||
'string)
|
|
||||||
"A string of all characters which are considered to be whitespace.
|
|
||||||
Same as Perl's [\\s]."))
|
|
||||||
|
|
||||||
(defun whitespacep (chr)
|
|
||||||
(declare #.*special-optimize-settings*)
|
|
||||||
"Tests whether a character is whitespace,
|
|
||||||
i.e. whether it would match [\\s] in Perl."
|
|
||||||
(find chr +whitespace-char-string+ :test #'char=)))
|
|
||||||
|
|
||||||
;; the following DEFCONSTANT statements are wrapped with
|
|
||||||
;; (UNLESS (BOUNDP ...) ...) to make SBCL happy
|
|
||||||
|
|
||||||
(unless (boundp '+digit-set+)
|
|
||||||
(defconstant +digit-set+
|
|
||||||
(make-char-set (lambda (chr) (char<= #\0 chr #\9)))
|
|
||||||
"Character set containing the digits from 0 to 9."))
|
|
||||||
|
|
||||||
(unless (boundp '+word-char-set+)
|
|
||||||
(defconstant +word-char-set+
|
|
||||||
(make-char-set #'word-char-p)
|
|
||||||
"Character set containing all \"word\" characters."))
|
|
||||||
|
|
||||||
(unless (boundp '+whitespace-char-set+)
|
|
||||||
(defconstant +whitespace-char-set+
|
|
||||||
(make-char-set #'whitespacep)
|
|
||||||
"Character set containing all whitespace characters."))
|
|
||||||
|
|
||||||
(defun create-ranges-from-set (set &key downcasep)
|
|
||||||
(declare #.*standard-optimize-settings*)
|
(declare #.*standard-optimize-settings*)
|
||||||
"Tries to identify up to three intervals \(with respect to CHAR<)
|
"Tests whether a character is a decimal digit, i.e. the same as
|
||||||
which together comprise the charset SET. Returns NIL if this is not
|
Perl's [\\d]. Note that this function shadows the standard Common
|
||||||
possible. If DOWNCASEP is true it will treat the charset as if it
|
Lisp function CL:DIGIT-CHAR-P."
|
||||||
represents both the lower-case and the upper-case variants of its
|
(char<= #\0 chr #\9))
|
||||||
members and will only return the respective lower-case intervals."
|
|
||||||
;; discard empty charsets
|
(declaim (inline word-char-p))
|
||||||
(unless (and set (plusp (charset-count set)))
|
(defun word-char-p (chr)
|
||||||
(return-from create-ranges-from-set nil))
|
(declare #.*standard-optimize-settings*)
|
||||||
(loop with min1 and min2 and min3
|
"Tests whether a character is a \"word\" character. In the ASCII
|
||||||
and max1 and max2 and max3
|
charset this is equivalent to a-z, A-Z, 0-9, or _, i.e. the same as
|
||||||
;; loop through all characters in SET, sorted by CHAR<
|
Perl's [\\w]."
|
||||||
;; (actually by < on their character codes, see 13.1.6 in the
|
(or (alphanumericp chr)
|
||||||
;; ANSI standard)
|
(char= chr #\_)))
|
||||||
for code of-type fixnum below *regex-char-code-limit*
|
|
||||||
for char = (code-char code)
|
(defconstant +whitespace-char-string+
|
||||||
when (and char (in-charset-p (if downcasep (char-downcase char) char) set))
|
(coerce '(#\Space #\Tab #\Linefeed #\Return #\Page) 'string)
|
||||||
;; MIN1, MAX1, etc. are _exclusive_
|
"A string of all characters which are considered to be whitespace.
|
||||||
;; bounds of the intervals identified so far
|
Same as Perl's [\\s].")
|
||||||
do (cond
|
|
||||||
((not min1)
|
(defun whitespacep (chr)
|
||||||
;; this will only happen once, for the first character
|
(declare #.*special-optimize-settings*)
|
||||||
(setq min1 (1- code)
|
"Tests whether a character is whitespace, i.e. whether it would
|
||||||
max1 (1+ code)))
|
match [\\s] in Perl."
|
||||||
((<= (the fixnum min1) code (the fixnum max1))
|
(find chr +whitespace-char-string+ :test #'char=))
|
||||||
;; we're here as long as CHAR fits into the first interval
|
|
||||||
(setq min1 (min (the fixnum min1) (1- code))
|
|
||||||
max1 (max (the fixnum max1) (1+ code))))
|
|
||||||
((not min2)
|
|
||||||
;; we need to open a second interval
|
|
||||||
;; this'll also happen only once
|
|
||||||
(setq min2 (1- code)
|
|
||||||
max2 (1+ code)))
|
|
||||||
((<= (the fixnum min2) code (the fixnum max2))
|
|
||||||
;; CHAR fits into the second interval
|
|
||||||
(setq min2 (min (the fixnum min2) (1- code))
|
|
||||||
max2 (max (the fixnum max2) (1+ code))))
|
|
||||||
((not min3)
|
|
||||||
;; we need to open the third interval
|
|
||||||
;; happens only once
|
|
||||||
(setq min3 (1- code)
|
|
||||||
max3 (1+ code)))
|
|
||||||
((<= (the fixnum min3) code (the fixnum max3))
|
|
||||||
;; CHAR fits into the third interval
|
|
||||||
(setq min3 (min (the fixnum min3) (1- code))
|
|
||||||
max3 (max (the fixnum max3) (1+ code))))
|
|
||||||
(t
|
|
||||||
;; we're out of luck, CHAR doesn't fit
|
|
||||||
;; into one of the three intervals
|
|
||||||
(return nil)))
|
|
||||||
;; on success return all bounds
|
|
||||||
;; make them inclusive bounds before returning
|
|
||||||
finally (return (values (code-char (1+ min1))
|
|
||||||
(code-char (1- max1))
|
|
||||||
(and min2 (code-char (1+ min2)))
|
|
||||||
(and max2 (code-char (1- max2)))
|
|
||||||
(and min3 (code-char (1+ min3)))
|
|
||||||
(and max3 (code-char (1- max3)))))))
|
|
||||||
|
|
||||||
(defmacro maybe-coerce-to-simple-string (string)
|
(defmacro maybe-coerce-to-simple-string (string)
|
||||||
|
"Coerces STRING to a simple STRING unless it already is one."
|
||||||
(with-unique-names (=string=)
|
(with-unique-names (=string=)
|
||||||
`(let ((,=string= ,string))
|
`(let ((,=string= ,string))
|
||||||
(cond ((simple-string-p ,=string=)
|
(cond ((simple-string-p ,=string=)
|
||||||
@ -223,16 +144,16 @@ members and will only return the respective lower-case intervals."
|
|||||||
|
|
||||||
(declaim (inline nsubseq))
|
(declaim (inline nsubseq))
|
||||||
(defun nsubseq (sequence start &optional (end (length sequence)))
|
(defun nsubseq (sequence start &optional (end (length sequence)))
|
||||||
"Return a subsequence by pointing to location in original sequence."
|
"Returns a subsequence by pointing to location in original sequence."
|
||||||
(make-array (- end start)
|
(make-array (- end start)
|
||||||
:element-type (array-element-type sequence)
|
:element-type (array-element-type sequence)
|
||||||
:displaced-to sequence
|
:displaced-to sequence
|
||||||
:displaced-index-offset start))
|
:displaced-index-offset start))
|
||||||
|
|
||||||
(defun normalize-var-list (var-list)
|
(defun normalize-var-list (var-list)
|
||||||
"Utility function for REGISTER-GROUPS-BIND and
|
"Utility function for REGISTER-GROUPS-BIND and DO-REGISTER-GROUPS.
|
||||||
DO-REGISTER-GROUPS. Creates the long form \(a list of \(FUNCTION VAR)
|
Creates the long form \(a list of \(FUNCTION VAR) entries) out of the
|
||||||
entries) out of the short form of VAR-LIST."
|
short form of VAR-LIST."
|
||||||
(loop for element in var-list
|
(loop for element in var-list
|
||||||
if (consp element)
|
if (consp element)
|
||||||
nconc (loop for var in (rest element)
|
nconc (loop for var in (rest element)
|
||||||
@ -241,20 +162,33 @@ entries) out of the short form of VAR-LIST."
|
|||||||
collect (list '(function identity) element)))
|
collect (list '(function identity) element)))
|
||||||
|
|
||||||
(defun string-list-to-simple-string (string-list)
|
(defun string-list-to-simple-string (string-list)
|
||||||
(declare #.*standard-optimize-settings*)
|
|
||||||
"Concatenates a list of strings to one simple-string."
|
"Concatenates a list of strings to one simple-string."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
;; this function provided by JP Massar; note that we can't use APPLY
|
;; this function provided by JP Massar; note that we can't use APPLY
|
||||||
;; with CONCATENATE here because of CALL-ARGUMENTS-LIMIT
|
;; with CONCATENATE here because of CALL-ARGUMENTS-LIMIT
|
||||||
(let ((total-size 0))
|
(let ((total-size 0))
|
||||||
(declare (type fixnum total-size))
|
(declare (fixnum total-size))
|
||||||
(dolist (string string-list)
|
(dolist (string string-list)
|
||||||
#-genera (declare (type string string))
|
#-:genera (declare (string string))
|
||||||
(incf total-size (length string)))
|
(incf total-size (length string)))
|
||||||
(let ((result-string (make-sequence 'simple-string total-size))
|
(let ((result-string (make-sequence 'simple-string total-size))
|
||||||
(curr-pos 0))
|
(curr-pos 0))
|
||||||
(declare (type fixnum curr-pos))
|
(declare (fixnum curr-pos))
|
||||||
(dolist (string string-list)
|
(dolist (string string-list)
|
||||||
#-genera (declare (type string string))
|
#-:genera (declare (string string))
|
||||||
(replace result-string string :start1 curr-pos)
|
(replace result-string string :start1 curr-pos)
|
||||||
(incf curr-pos (length string)))
|
(incf curr-pos (length string)))
|
||||||
result-string)))
|
result-string)))
|
||||||
|
|
||||||
|
(defun complement* (test-function)
|
||||||
|
"Like COMPLEMENT but optimized for unary functions."
|
||||||
|
(declare #.*standard-optimize-settings*)
|
||||||
|
(typecase test-function
|
||||||
|
(function
|
||||||
|
(lambda (char)
|
||||||
|
(declare (character char))
|
||||||
|
(not (funcall (the function test-function) char))))
|
||||||
|
(otherwise
|
||||||
|
(lambda (char)
|
||||||
|
(declare (character char))
|
||||||
|
(not (funcall test-function char))))))
|
||||||
Reference in New Issue
Block a user