Update to current dev version
git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@3581 4281704c-cde7-0310-8518-8e2dc76b1ff0
This commit is contained in:
12
CHANGELOG
12
CHANGELOG
@ -1,3 +1,15 @@
|
||||
Version 2.0.0
|
||||
2008-07-23
|
||||
Added named properties (\p{foo})
|
||||
Added Unicode support
|
||||
Introduced test functions for character classes
|
||||
Added optional test function optimization
|
||||
Cleaned up test suite, removed performance cruft
|
||||
Removed the various alternative system definitions (too much maintenance work)
|
||||
Exported PARSE-STRING
|
||||
General cleanup
|
||||
Lots of documentation additions
|
||||
|
||||
Version 1.4.1
|
||||
2008-07-03
|
||||
Skip non-characters in CREATE-RANGES-FROM-SET
|
||||
|
||||
62
README
62
README
@ -1,62 +0,0 @@
|
||||
Complete documentation for CL-PPCRE can be found in the 'doc'
|
||||
directory.
|
||||
|
||||
CL-PPCRE also supports Nikodemus Siivola's HYPERDOC, see
|
||||
<http://common-lisp.net/project/hyperdoc/> and
|
||||
<http://www.cliki.net/hyperdoc>.
|
||||
|
||||
1. Installation
|
||||
|
||||
1.1. Probably the easiest way is
|
||||
|
||||
(load "/path/to/cl-ppcre/load.lisp")
|
||||
|
||||
This should compile and load CL-PPCRE on most Common Lisp
|
||||
implementations.
|
||||
|
||||
1.2. With MK:DEFSYSTEM you can make a symbolic link from
|
||||
'cl-ppcre.system' and 'cl-ppcre-test.system' to your central registry
|
||||
(which by default is in '/usr/local/lisp/Registry/') and then issue
|
||||
the command
|
||||
|
||||
(mk:compile-system "cl-ppcre")
|
||||
|
||||
Note that this relies on TRUENAME returning the original file a
|
||||
symbolic link is pointing to. This will only work with AllegroCL
|
||||
6.2 if you've applied all patches with (SYS:UPDATE-ALLEGRO).
|
||||
|
||||
1.3. You can also use ASDF instead of MK:DEFSYSTEM in a similar way
|
||||
(use the .asd files instead of the .system files).
|
||||
|
||||
1.4. For LispWorks there's a file 'lispworks-defsystem.lisp' which includes
|
||||
a system definition for LispWork's Common Defsystem.
|
||||
|
||||
2. Test
|
||||
|
||||
CL-PPCRE comes with a test suite that can be used to check its
|
||||
compatibility with Perl's regex syntax. See the documentation on how
|
||||
to use this test suite for benchmarks and on how to write your own
|
||||
tests.
|
||||
|
||||
2.1. If you've used 'load.lisp' to load CL-PPCRE you already have the
|
||||
test suite loaded and can start the default tests with
|
||||
|
||||
(cl-ppcre-test:test)
|
||||
|
||||
2.2. With MK:DEFSYSTEM you need to compile the 'cl-ppcre-test' system
|
||||
as well before you can proceed as in 2.1.
|
||||
|
||||
2.3. Same for ASDF.
|
||||
|
||||
Depending on your machine and your CL implementation the default test
|
||||
will take between a few seconds and a couple of minutes. (It will
|
||||
print a dot for every tenth test case while it proceeds to give some
|
||||
visual feedback.) It should exactly report three 'errors' (662, 790,
|
||||
and 1439) which are explained in the documentation.
|
||||
|
||||
MCL might report an error for the ninth test case which is also
|
||||
explained in the docs.
|
||||
|
||||
Genera notes (thanks to Patrick O'Donnell): Some more tests will fail
|
||||
because characters like #\Return, #\Linefeed, or #\Tab have encodings
|
||||
which differ from Perl's (and thus CL-PPCRE's) expectations.
|
||||
96
api.lisp
96
api.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.79 2008/07/03 08:39:10 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.84 2008/07/06 18:12:04 edi Exp $
|
||||
|
||||
;;; The external API for creating and using scanners.
|
||||
|
||||
@ -29,7 +29,7 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defgeneric create-scanner (regex &key case-insensitive-mode
|
||||
multi-line-mode
|
||||
@ -40,9 +40,9 @@
|
||||
parse-tree or as a string - and returns a scan closure which will scan
|
||||
strings for this regular expression and a list mapping registers to
|
||||
their names \(NIL stands for unnamed ones). The \"mode\" keyboard
|
||||
arguments are equivalent to the imsx modifiers in Perl. If DESTRUCTIVE
|
||||
is not NIL the function is allowed to destructively modify its first
|
||||
argument \(but only if it's a parse tree)."))
|
||||
arguments are equivalent to the imsx modifiers in Perl. If
|
||||
DESTRUCTIVE is not NIL, the function is allowed to destructively
|
||||
modify its first argument \(but only if it's a parse tree)."))
|
||||
|
||||
#-:use-acl-regexp2-engine
|
||||
(defmethod create-scanner ((regex-string string) &key case-insensitive-mode
|
||||
@ -76,8 +76,7 @@ argument \(but only if it's a parse tree)."))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (ignore destructive))
|
||||
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
|
||||
(signal-ppcre-invocation-error
|
||||
"You can't use the keyword arguments to modify an existing scanner."))
|
||||
(signal-invocation-error "You can't use the keyword arguments to modify an existing scanner."))
|
||||
scanner)
|
||||
|
||||
#-:use-acl-regexp2-engine
|
||||
@ -88,8 +87,7 @@ argument \(but only if it's a parse tree)."))
|
||||
destructive)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(when extended-mode
|
||||
(signal-ppcre-invocation-error
|
||||
"Extended mode doesn't make sense in parse trees."))
|
||||
(signal-invocation-error "Extended mode doesn't make sense in parse trees."))
|
||||
;; convert parse-tree into internal representation REGEX and at the
|
||||
;; same time compute the number of registers and the constant string
|
||||
;; (or anchor) the regex starts with (if any)
|
||||
@ -180,7 +178,6 @@ argument \(but only if it's a parse tree)."))
|
||||
|
||||
#+:use-acl-regexp2-engine
|
||||
(declaim (inline create-scanner))
|
||||
|
||||
#+:use-acl-regexp2-engine
|
||||
(defmethod create-scanner ((scanner regexp::regular-expression) &key case-insensitive-mode
|
||||
multi-line-mode
|
||||
@ -190,8 +187,7 @@ argument \(but only if it's a parse tree)."))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (ignore destructive))
|
||||
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
|
||||
(signal-ppcre-invocation-error
|
||||
"You can't use the keyword arguments to modify an existing scanner."))
|
||||
(signal-invocation-error "You can't use the keyword arguments to modify an existing scanner."))
|
||||
scanner)
|
||||
|
||||
#+:use-acl-regexp2-engine
|
||||
@ -254,7 +250,6 @@ internal purposes."))
|
||||
|
||||
#+:use-acl-regexp2-engine
|
||||
(declaim (inline scan))
|
||||
|
||||
#+:use-acl-regexp2-engine
|
||||
(defmethod scan ((parse-tree t) target-string
|
||||
&key (start 0)
|
||||
@ -292,12 +287,12 @@ internal purposes."))
|
||||
(defun scan-to-strings (regex target-string &key (start 0)
|
||||
(end (length target-string))
|
||||
sharedp)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Like SCAN but returns substrings of TARGET-STRING instead of
|
||||
positions, i.e. this function returns two values on success: the whole
|
||||
match as a string plus an array of substrings (or NILs) corresponding
|
||||
to the matched registers. If SHAREDP is true, the substrings may share
|
||||
structure with TARGET-STRING."
|
||||
to the matched registers. If SHAREDP is true, the substrings may
|
||||
share structure with TARGET-STRING."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(multiple-value-bind (match-start match-end reg-starts reg-ends)
|
||||
(scan regex target-string :start start :end end)
|
||||
(unless match-start
|
||||
@ -329,11 +324,11 @@ structure with TARGET-STRING."
|
||||
"Executes BODY with the variables in VAR-LIST bound to the
|
||||
corresponding register groups after TARGET-STRING has been matched
|
||||
against REGEX, i.e. each variable is either bound to a string or to
|
||||
NIL. If there is no match, BODY is _not_ executed. For each element of
|
||||
VAR-LIST which is NIL there's no binding to the corresponding register
|
||||
group. The number of variables in VAR-LIST must not be greater than
|
||||
the number of register groups. If SHAREDP is true, the substrings may
|
||||
share structure with TARGET-STRING."
|
||||
NIL. If there is no match, BODY is _not_ executed. For each element
|
||||
of VAR-LIST which is NIL there's no binding to the corresponding
|
||||
register group. The number of variables in VAR-LIST must not be
|
||||
greater than the number of register groups. If SHAREDP is true, the
|
||||
substrings may share structure with TARGET-STRING."
|
||||
(with-rebinding (target-string)
|
||||
(with-unique-names (match-start match-end reg-starts reg-ends
|
||||
start-index substr-fn)
|
||||
@ -482,8 +477,9 @@ the number of register groups. After the last match, returns
|
||||
RESULT-FORM if provided or NIL otherwise. An implicit block named NIL
|
||||
surrounds DO-REGISTER-GROUPS; RETURN may be used to terminate the loop
|
||||
immediately. If REGEX matches an empty string the scan is continued
|
||||
one position behind this match. If SHAREDP is true, the substrings may
|
||||
share structure with TARGET-STRING. BODY may start with declarations."
|
||||
one position behind this match. If SHAREDP is true, the substrings
|
||||
may share structure with TARGET-STRING. BODY may start with
|
||||
declarations."
|
||||
(with-rebinding (target-string)
|
||||
(with-unique-names (substr-fn match-start match-end
|
||||
reg-starts reg-ends start-index)
|
||||
@ -510,11 +506,11 @@ share structure with TARGET-STRING. BODY may start with declarations."
|
||||
(defun all-matches (regex target-string
|
||||
&key (start 0)
|
||||
(end (length target-string)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Returns a list containing the start and end positions of all
|
||||
matches of REGEX against TARGET-STRING, i.e. if there are N matches
|
||||
the list contains (* 2 N) elements. If REGEX matches an empty string
|
||||
the scan is continued one position behind this match."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let (result-list)
|
||||
(do-matches (match-start match-end
|
||||
regex target-string
|
||||
@ -536,11 +532,11 @@ compile time."
|
||||
&key (start 0)
|
||||
(end (length target-string))
|
||||
sharedp)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Returns a list containing all substrings of TARGET-STRING which
|
||||
match REGEX. If REGEX matches an empty string the scan is continued
|
||||
one position behind this match. If SHAREDP is true, the substrings may
|
||||
share structure with TARGET-STRING."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let (result-list)
|
||||
(do-matches-as-strings (match regex target-string (nreverse result-list)
|
||||
:start start :end end :sharedp sharedp)
|
||||
@ -563,18 +559,18 @@ compile time."
|
||||
with-registers-p
|
||||
omit-unmatched-p
|
||||
sharedp)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Matches REGEX against TARGET-STRING as often as possible and
|
||||
returns a list of the substrings between the matches. If
|
||||
WITH-REGISTERS-P is true, substrings corresponding to matched
|
||||
registers are inserted into the list as well. If OMIT-UNMATCHED-P is
|
||||
true, unmatched registers will simply be left out, otherwise they will
|
||||
show up as NIL. LIMIT limits the number of elements returned -
|
||||
registers aren't counted. If LIMIT is NIL (or 0 which is equivalent),
|
||||
trailing empty strings are removed from the result list. If REGEX
|
||||
matches an empty string the scan is continued one position behind this
|
||||
match. If SHAREDP is true, the substrings may share structure with
|
||||
TARGET-STRING."
|
||||
registers aren't counted. If LIMIT is NIL \(or 0 which is
|
||||
equivalent), trailing empty strings are removed from the result list.
|
||||
If REGEX matches an empty string the scan is continued one position
|
||||
behind this match. If SHAREDP is true, the substrings may share
|
||||
structure with TARGET-STRING."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; initialize list of positions POS-LIST to extract substrings with
|
||||
;; START so that the start of the next match will mark the end of
|
||||
;; the first substring
|
||||
@ -637,13 +633,13 @@ TARGET-STRING."
|
||||
|
||||
(defun string-case-modifier (str from to start end)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum from to start end))
|
||||
(declare (fixnum from to start end))
|
||||
"Checks whether all words in STR between FROM and TO are upcased,
|
||||
downcased or capitalized and returns a function which applies a
|
||||
corresponding case modification to strings. Returns #'IDENTITY
|
||||
otherwise, especially if words in the target area extend beyond FROM
|
||||
or TO. STR is supposed to be bounded by START and END. It is assumed
|
||||
that (<= START FROM TO END)."
|
||||
that \(<= START FROM TO END)."
|
||||
(case
|
||||
(if (or (<= to from)
|
||||
(and (< start from)
|
||||
@ -740,8 +736,7 @@ S-expression."))
|
||||
((#\\) :backslash)))))
|
||||
(when (and (numberp token) (< token 0))
|
||||
;; make sure we don't accept something like "\\0"
|
||||
(signal-ppcre-invocation-error
|
||||
"Illegal substring ~S in replacement string"
|
||||
(signal-invocation-error "Illegal substring ~S in replacement string."
|
||||
(subseq replacement-string match-start match-end)))
|
||||
(push token collector))
|
||||
;; remember where the match ended
|
||||
@ -801,8 +796,7 @@ S-expression."))
|
||||
((#\\) :backslash)))))
|
||||
(when (and (numberp token) (< token 0))
|
||||
;; make sure we don't accept something like "\\0"
|
||||
(signal-ppcre-invocation-error
|
||||
"Illegal substring ~S in replacement string"
|
||||
(signal-invocation-error "Illegal substring ~S in replacement string."
|
||||
(subseq replacement match-start match-end)))
|
||||
(push token collector))
|
||||
;; remember where the match ended
|
||||
@ -843,8 +837,7 @@ corresponding string."
|
||||
(when (>= token reg-bound)
|
||||
;; but only if the register was referenced in the
|
||||
;; regular expression
|
||||
(signal-ppcre-invocation-error
|
||||
"Reference to non-existent register ~A in replacement string"
|
||||
(signal-invocation-error "Reference to non-existent register ~A in replacement string."
|
||||
(1+ token)))
|
||||
(when (svref reg-starts token)
|
||||
;; and only if it matched, i.e. no match results
|
||||
@ -909,11 +902,11 @@ corresponding string."
|
||||
|
||||
(defun replace-aux (target-string replacement pos-list reg-list start end
|
||||
preserve-case simple-calls element-type)
|
||||
"Auxiliary function used by REGEX-REPLACE and REGEX-REPLACE-ALL.
|
||||
POS-LIST contains a list with the start and end positions of all
|
||||
matches while REG-LIST contains a list of arrays representing the
|
||||
corresponding register start and end positions."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Auxiliary function used by REGEX-REPLACE and
|
||||
REGEX-REPLACE-ALL. POS-LIST contains a list with the start and end
|
||||
positions of all matches while REG-LIST contains a list of arrays
|
||||
representing the corresponding register start and end positions."
|
||||
;; build the template once before we start the loop
|
||||
(let ((replacement-template (build-replacement-template replacement)))
|
||||
(with-output-to-string (s nil :element-type element-type)
|
||||
@ -955,7 +948,6 @@ representing the corresponding register start and end positions."
|
||||
preserve-case
|
||||
simple-calls
|
||||
(element-type #+:lispworks 'lw:simple-char #-:lispworks 'character))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Try to match TARGET-STRING between START and END against REGEX and
|
||||
replace the first match with REPLACEMENT. Two values are returned;
|
||||
the modified string, and T if REGEX matched or NIL otherwise.
|
||||
@ -985,6 +977,7 @@ match. The result will always be a fresh string, even if REGEX doesn't
|
||||
match.
|
||||
|
||||
ELEMENT-TYPE is the element type of the resulting string."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(multiple-value-bind (match-start match-end reg-starts reg-ends)
|
||||
(scan regex target-string :start start :end end)
|
||||
(if match-start
|
||||
@ -1012,7 +1005,6 @@ match.
|
||||
preserve-case
|
||||
simple-calls
|
||||
(element-type #+:lispworks 'lw:simple-char #-:lispworks 'character))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Try to match TARGET-STRING between START and END against REGEX and
|
||||
replace all matches with REPLACEMENT. Two values are returned; the
|
||||
modified string, and T if REGEX matched or NIL otherwise.
|
||||
@ -1042,6 +1034,7 @@ match. The result will always be a fresh string, even if REGEX doesn't
|
||||
match.
|
||||
|
||||
ELEMENT-TYPE is the element type of the resulting string."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let ((pos-list '())
|
||||
(reg-list '()))
|
||||
(do-scans (match-start match-end reg-starts reg-ends regex target-string
|
||||
@ -1102,6 +1095,9 @@ scanner, a case-insensitive scanner is used."
|
||||
|
||||
#+:cormanlisp
|
||||
(defmacro do-with-all-symbols ((variable package-or-packagelist) &body body)
|
||||
"Executes BODY with VARIABLE bound to each symbol in
|
||||
PACKAGE-OR-PACKAGELIST \(a designator for a list of packages) in
|
||||
turn."
|
||||
(with-unique-names (pack-var)
|
||||
`(if (listp ,package-or-packagelist)
|
||||
(dolist (,pack-var ,package-or-packagelist)
|
||||
@ -1113,11 +1109,11 @@ scanner, a case-insensitive scanner is used."
|
||||
#+:cormanlisp
|
||||
(defmacro regex-apropos-aux ((regex packages case-insensitive &optional return-form)
|
||||
&body body)
|
||||
"Auxiliary macro used by REGEX-APROPOS and REGEX-APROPOS-LIST. Loops
|
||||
through PACKAGES and executes BODY with SYMBOL bound to each symbol
|
||||
which matches REGEX. Optionally evaluates and returns RETURN-FORM at
|
||||
the end. If CASE-INSENSITIVE is true and REGEX isn't already a
|
||||
scanner, a case-insensitive scanner is used."
|
||||
"Auxiliary macro used by REGEX-APROPOS and REGEX-APROPOS-LIST.
|
||||
Loops through PACKAGES and executes BODY with SYMBOL bound to each
|
||||
symbol which matches REGEX. Optionally evaluates and returns
|
||||
RETURN-FORM at the end. If CASE-INSENSITIVE is true and REGEX isn't
|
||||
already a scanner, a case-insensitive scanner is used."
|
||||
(with-rebinding (regex)
|
||||
(with-unique-names (scanner %packages hash)
|
||||
`(let* ((,scanner (create-scanner ,regex
|
||||
|
||||
152
charmap.lisp
Normal file
152
charmap.lisp
Normal file
@ -0,0 +1,152 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/charmap.lisp,v 1.18 2008/07/22 23:54:59 edi Exp $
|
||||
|
||||
;;; An optimized representation of sets of characters.
|
||||
|
||||
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defstruct (charmap (:constructor make-charmap%))
|
||||
;; a bit vector mapping char codes to "booleans" (1 for set members,
|
||||
;; 0 for others)
|
||||
(vector #*0 :type simple-bit-vector)
|
||||
;; the smallest character code of all characters in the set
|
||||
(start 0 :type fixnum)
|
||||
;; the upper (exclusive) bound of all character codes in the set
|
||||
(end 0 :type fixnum)
|
||||
;; the number of characters in the set, or NIL if this is unknown
|
||||
(count nil :type (or fixnum null))
|
||||
;; whether the charmap actually represents the complement of the set
|
||||
(complementp nil :type boolean))
|
||||
|
||||
;; seems to be necessary for some Lisps like ClozureCL
|
||||
(defmethod make-load-form ((map charmap) &optional environment)
|
||||
(make-load-form-saving-slots map :environment environment))
|
||||
|
||||
(declaim (inline in-charmap-p))
|
||||
(defun in-charmap-p (char charmap)
|
||||
"Tests whether the character CHAR belongs to the set represented by CHARMAP."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (character char) (charmap charmap))
|
||||
(let* ((char-code (char-code char))
|
||||
(char-in-vector-p
|
||||
(let ((charmap-start (charmap-start charmap)))
|
||||
(declare (fixnum charmap-start))
|
||||
(and (<= charmap-start char-code)
|
||||
(< char-code (the fixnum (charmap-end charmap)))
|
||||
(= 1 (sbit (the simple-bit-vector (charmap-vector charmap))
|
||||
(- char-code charmap-start)))))))
|
||||
(cond ((charmap-complementp charmap) (not char-in-vector-p))
|
||||
(t char-in-vector-p))))
|
||||
|
||||
(defun charmap-contents (charmap)
|
||||
"Returns a list of all characters belonging to a character map.
|
||||
Only works for non-complement charmaps."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (charmap charmap))
|
||||
(and (not (charmap-complementp charmap))
|
||||
(loop for code of-type fixnum from (charmap-start charmap) to (charmap-end charmap)
|
||||
for i across (the simple-bit-vector (charmap-vector charmap))
|
||||
when (= i 1)
|
||||
collect (code-char code))))
|
||||
|
||||
(defun make-charmap (start end test-function &optional complementp)
|
||||
"Creates and returns a charmap representing all characters with
|
||||
character codes in the interval [start end) that satisfy
|
||||
TEST-FUNCTION. The COMPLEMENTP slot of the charmap is set to the
|
||||
value of the optional argument, but this argument doesn't have an
|
||||
effect on how TEST-FUNCTION is used."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (fixnum start end))
|
||||
(let ((vector (make-array (- end start) :element-type 'bit))
|
||||
(count 0))
|
||||
(declare (fixnum count))
|
||||
(loop for code from start below end
|
||||
for char = (code-char code)
|
||||
for index from 0
|
||||
when char do
|
||||
(incf count)
|
||||
(setf (sbit vector index) (if (funcall test-function char) 1 0)))
|
||||
(make-charmap% :vector vector
|
||||
:start start
|
||||
:end end
|
||||
;; we don't know for sure if COMPLEMENTP is true as
|
||||
;; there isn't a necessary a character for each
|
||||
;; integer below *REGEX-CHAR-CODE-LIMIT*
|
||||
:count (and (not complementp) count)
|
||||
;; make sure it's boolean
|
||||
:complementp (not (not complementp)))))
|
||||
|
||||
(defun create-charmap-from-test-function (test-function start end)
|
||||
"Creates and returns a charmap representing all characters with
|
||||
character codes between START and END which satisfy TEST-FUNCTION.
|
||||
Tries to find the smallest interval which is necessary to represent
|
||||
the character set and uses the complement representation if that
|
||||
helps."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let (start-in end-in start-out end-out)
|
||||
;; determine the smallest intervals containing the set and its
|
||||
;; complement, [start-in, end-in) and [start-out, end-out) - first
|
||||
;; the lower bound
|
||||
(loop for code from start below end
|
||||
for char = (code-char code)
|
||||
until (and start-in start-out)
|
||||
when (and char
|
||||
(not start-in)
|
||||
(funcall test-function char))
|
||||
do (setq start-in code)
|
||||
when (and char
|
||||
(not start-out)
|
||||
(not (funcall test-function char)))
|
||||
do (setq start-out code))
|
||||
(unless start-in
|
||||
;; no character satisfied the test, so return a "pseudo" charmap
|
||||
;; where IN-CHARMAP-P is always false
|
||||
(return-from create-charmap-from-test-function
|
||||
(make-charmap% :count 0)))
|
||||
(unless start-out
|
||||
;; no character failed the test, so return a "pseudo" charmap
|
||||
;; where IN-CHARMAP-P is always true
|
||||
(return-from create-charmap-from-test-function
|
||||
(make-charmap% :complementp t)))
|
||||
;; now determine upper bound
|
||||
(loop for code from (1- end) downto start
|
||||
for char = (code-char code)
|
||||
until (and end-in end-out)
|
||||
when (and char
|
||||
(not end-in)
|
||||
(funcall test-function char))
|
||||
do (setq end-in (1+ code))
|
||||
when (and char
|
||||
(not end-out)
|
||||
(not (funcall test-function char)))
|
||||
do (setq end-out (1+ code)))
|
||||
;; use the smaller interval
|
||||
(cond ((<= (- end-in start-in) (- end-out start-out))
|
||||
(make-charmap start-in end-in test-function))
|
||||
(t (make-charmap start-out end-out (complement* test-function) t)))))
|
||||
83
charset.lisp
83
charset.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.4 2008/07/03 08:39:10 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.9 2008/07/23 00:47:58 edi Exp $
|
||||
|
||||
;;; A specialized set implementation for characters by Nikodemus Siivola.
|
||||
|
||||
@ -30,7 +30,7 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defconstant +probe-depth+ 3
|
||||
"Maximum number of collisions \(for any element) we accept before we
|
||||
@ -45,7 +45,7 @@ initialized to #\Null except for the first one which is initialized to
|
||||
#\?."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type (integer 2 #.(1- array-total-size-limit)) size))
|
||||
;; Since #\Null always hashes to 0, store something else there
|
||||
;; since #\Null always hashes to 0, store something else there
|
||||
;; initially, and #\Null everywhere else
|
||||
(let ((result (make-array size
|
||||
:element-type #-:lispworks 'character #+:lispworks 'lw:simple-char
|
||||
@ -53,7 +53,7 @@ initialized to #\Null except for the first one which is initialized to
|
||||
(setf (char result 0) #\?)
|
||||
result))
|
||||
|
||||
(defstruct (charset (:constructor make-charset))
|
||||
(defstruct (charset (:constructor make-charset ()))
|
||||
;; this is set to 0 when we stop hashing and just use a CHAR-CODE
|
||||
;; indexed vector
|
||||
(depth +probe-depth+ :type fixnum)
|
||||
@ -92,7 +92,7 @@ to the hash code HASH."
|
||||
(depth (charset-depth set))
|
||||
(code (char-code char)))
|
||||
(declare (fixnum depth))
|
||||
;; As long as the set remains reasonably small, we use non-linear
|
||||
;; as long as the set remains reasonably small, we use non-linear
|
||||
;; hashing - the first hash of any character is its CHAR-CODE, and
|
||||
;; subsequent hashes are computed by MIX above
|
||||
(cond ((or
|
||||
@ -129,14 +129,15 @@ to the hash code HASH."
|
||||
"Adds the character CHAR to the charset SET, extending SET if
|
||||
necessary. Returns CHAR."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(or (%add-to-charset char set)
|
||||
(or (%add-to-charset char set t)
|
||||
(%add-to-charset/expand char set)
|
||||
(error "Oops, this should not happen..."))
|
||||
char)
|
||||
|
||||
(defun %add-to-charset (char set)
|
||||
(defun %add-to-charset (char set count)
|
||||
"Tries to add the character CHAR to the charset SET without
|
||||
extending it. Returns NIL if this fails."
|
||||
extending it. Returns NIL if this fails. Counts CHAR as new
|
||||
if COUNT is true and it is added to SET."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (character char) (charset set))
|
||||
(let ((vector (charset-vector set))
|
||||
@ -145,7 +146,11 @@ extending it. Returns NIL if this fails."
|
||||
(declare (fixnum depth))
|
||||
;; see comments in IN-CHARSET-P for algorithm
|
||||
(cond ((or (zerop depth) (zerop code))
|
||||
(setf (char vector code) char))
|
||||
(unless (eq char (char vector code))
|
||||
(setf (char vector code) char)
|
||||
(when count
|
||||
(incf (charset-count set))))
|
||||
char)
|
||||
(t
|
||||
(let ((hash code))
|
||||
(tagbody
|
||||
@ -154,7 +159,8 @@ extending it. Returns NIL if this fails."
|
||||
(x (char vector index)))
|
||||
(cond ((eq x (code-char 0))
|
||||
(setf (char vector index) char)
|
||||
(incf (charset-count set))
|
||||
(when count
|
||||
(incf (charset-count set)))
|
||||
(return-from %add-to-charset char))
|
||||
((eq x char)
|
||||
(return-from %add-to-charset char))
|
||||
@ -184,7 +190,10 @@ extending it. Returns NIL if this fails."
|
||||
(setf (charset-depth set) new-depth
|
||||
(charset-vector set) new-vector)
|
||||
(flet ((try-add (x)
|
||||
(unless (%add-to-charset x set)
|
||||
;; don't count - old characters are already accounted
|
||||
;; for, and might count the new one multiple times as
|
||||
;; well
|
||||
(unless (%add-to-charset x set nil)
|
||||
(assert (not (zerop new-depth)))
|
||||
(setf new-size (* 2 new-size))
|
||||
(go :retry))))
|
||||
@ -196,32 +205,38 @@ extending it. Returns NIL if this fails."
|
||||
(try-add x))
|
||||
(unless (zerop i)
|
||||
(try-add x))))))))
|
||||
;; added and expanded, /now/ count the new character.
|
||||
(incf (charset-count set))
|
||||
t))
|
||||
|
||||
(defun all-characters (set)
|
||||
"Returns a list of all characters in the charset SET."
|
||||
(defun map-charset (function charset)
|
||||
"Calls FUNCTION with all characters in SET. Returns NIL."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(loop with count = (charset-count set)
|
||||
with counter = 0
|
||||
for code below char-code-limit
|
||||
for char = (code-char code)
|
||||
while (< counter count)
|
||||
when (and char (in-charset-p char set))
|
||||
do (incf counter)
|
||||
and collect char))
|
||||
(declare (function function))
|
||||
(let* ((n (charset-count charset))
|
||||
(vector (charset-vector charset))
|
||||
(size (length vector)))
|
||||
;; see comments in IN-CHARSET-P for algorithm
|
||||
(when (eq (code-char 0) (char vector 0))
|
||||
(funcall function (code-char 0))
|
||||
(decf n))
|
||||
(loop for i from 1 below size
|
||||
for char = (char vector i)
|
||||
unless (eq (code-char 0) char) do
|
||||
(funcall function char)
|
||||
;; this early termination test should be worth it when
|
||||
;; mapping across depth 0 charsets.
|
||||
(when (zerop (decf n))
|
||||
(return-from map-charset nil))))
|
||||
nil)
|
||||
|
||||
(defun merge-set (set1 set2 &optional invertedp)
|
||||
"Returns the \"sum\" of two charsets. This is a destructive
|
||||
operation on SET1. If INVERTEDP is true, merges the \"inverse\" of
|
||||
SET2 into SET1 instead."
|
||||
(defun create-charset-from-test-function (test-function start end)
|
||||
"Creates and returns a charset representing all characters with
|
||||
character codes between START and END which satisfy TEST-FUNCTION."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; we only consider values with character codes below
|
||||
;; *REGEX-CHAR-CODE-LIMIT*
|
||||
(loop for code of-type fixnum from 0 below *regex-char-code-limit*
|
||||
(loop with charset = (make-charset)
|
||||
for code from start below end
|
||||
for char = (code-char code)
|
||||
when (and char (if invertedp
|
||||
(not (in-charset-p char set2))
|
||||
(in-charset-p char set2)))
|
||||
do (add-to-charset char set1))
|
||||
set1)
|
||||
|
||||
when (and char (funcall test-function char))
|
||||
do (add-to-charset char charset)
|
||||
finally (return charset)))
|
||||
|
||||
98
chartest.lisp
Normal file
98
chartest.lisp
Normal file
@ -0,0 +1,98 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/chartest.lisp,v 1.3 2008/07/23 00:47:58 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defun create-hash-table-from-test-function (test-function start end)
|
||||
"Creates and returns a hash table representing all characters with
|
||||
character codes between START and END which satisfy TEST-FUNCTION."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(loop with hash-table = (make-hash-table)
|
||||
for code from start below end
|
||||
for char = (code-char code)
|
||||
when (and char (funcall test-function char))
|
||||
do (setf (gethash char hash-table) t)
|
||||
finally (return hash-table)))
|
||||
|
||||
(defun create-optimized-test-function (test-function &key
|
||||
(start 0)
|
||||
(end *regex-char-code-limit*)
|
||||
(kind *optimize-char-classes*))
|
||||
"Given a unary test function which is applicable to characters
|
||||
returns a function which yields the same boolean results for all
|
||||
characters with character codes from START to \(excluding) END. If
|
||||
KIND is NIL, TEST-FUNCTION will simply be returned. Otherwise, KIND
|
||||
should be one of:
|
||||
|
||||
* :HASH-TABLE - builds a hash table representing all characters which
|
||||
satisfy the test and returns a closure which checks if
|
||||
a character is in that hash table
|
||||
|
||||
* :CHARSET - instead of a hash table uses a \"charset\" which is a
|
||||
data structure using non-linear hashing and optimized to
|
||||
represent \(sparse) sets of characters in a fast and
|
||||
space-efficient way \(contributed by Nikodemus Siivola)
|
||||
|
||||
* :CHARMAP - instead of a hash table uses a bit vector to represent
|
||||
the set of characters
|
||||
|
||||
You can also use :HASH-TABLE* or :CHARSET* which are like :HASH-TABLE
|
||||
and :CHARSET but use the complement of the set if the set contains
|
||||
more than half of all characters between START and END. This saves
|
||||
space but needs an additional pass across all characters to create the
|
||||
data structure. There is no corresponding :CHARMAP* kind as the bit
|
||||
vectors are already created to cover the smallest possible interval
|
||||
which contains either the set or its complement."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(ecase kind
|
||||
((nil) test-function)
|
||||
(:charmap
|
||||
(let ((charmap (create-charmap-from-test-function test-function start end)))
|
||||
(lambda (char)
|
||||
(in-charmap-p char charmap))))
|
||||
((:charset :charset*)
|
||||
(let ((charset (create-charset-from-test-function test-function start end)))
|
||||
(cond ((or (eq kind :charset)
|
||||
(<= (charset-count charset) (ceiling (- end start) 2)))
|
||||
(lambda (char)
|
||||
(in-charset-p char charset)))
|
||||
(t (setq charset (create-charset-from-test-function (complement* test-function)
|
||||
start end))
|
||||
(lambda (char)
|
||||
(not (in-charset-p char charset)))))))
|
||||
((:hash-table :hash-table*)
|
||||
(let ((hash-table (create-hash-table-from-test-function test-function start end)))
|
||||
(cond ((or (eq kind :charset)
|
||||
(<= (hash-table-count hash-table) (ceiling (- end start) 2)))
|
||||
(lambda (char)
|
||||
(gethash char hash-table)))
|
||||
(t (setq hash-table (create-hash-table-from-test-function (complement* test-function)
|
||||
start end))
|
||||
(lambda (char)
|
||||
(not (gethash char hash-table)))))))))
|
||||
58
cl-ppcre-unicode.asd
Normal file
58
cl-ppcre-unicode.asd
Normal file
@ -0,0 +1,58 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-unicode.asd,v 1.14 2008/07/22 14:19:44 edi Exp $
|
||||
|
||||
;;; This ASDF system definition was kindly provided by Marco Baringer.
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-user)
|
||||
|
||||
(defpackage :cl-ppcre-unicode-asd
|
||||
(:use :cl :asdf))
|
||||
|
||||
(in-package :cl-ppcre-unicode-asd)
|
||||
|
||||
(defsystem :cl-ppcre-unicode
|
||||
:components ((:module "cl-ppcre-unicode"
|
||||
:serial t
|
||||
:components ((:file "packages")
|
||||
(:file "resolver"))))
|
||||
:depends-on (:cl-ppcre :cl-unicode))
|
||||
|
||||
(defsystem :cl-ppcre-unicode-test
|
||||
:depends-on (:cl-ppcre-unicode :cl-ppcre-test)
|
||||
:components ((:module "test"
|
||||
:serial t
|
||||
:components ((:file "unicode-tests")))))
|
||||
|
||||
(defmethod perform ((o test-op) (c (eql (find-system :cl-ppcre-unicode))))
|
||||
;; we must load CL-PPCRE explicitly so that the CL-PPCRE-TEST system
|
||||
;; will be found
|
||||
(operate 'load-op :cl-ppcre)
|
||||
(operate 'load-op :cl-ppcre-unicode-test)
|
||||
(funcall (intern (symbol-name :run-all-tests) (find-package :cl-ppcre-test))
|
||||
:more-tests (intern (symbol-name :unicode-test) (find-package :cl-ppcre-test))))
|
||||
@ -1,7 +1,7 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.system,v 1.11 2007/01/01 23:43:10 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-unicode/packages.lisp,v 1.2 2008/07/22 13:58:13 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
@ -27,14 +27,12 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-user)
|
||||
(in-package :cl-user)
|
||||
|
||||
(defparameter *cl-ppcre-test-base-directory*
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
|
||||
(mk:defsystem #:cl-ppcre-test
|
||||
:source-pathname *cl-ppcre-test-base-directory*
|
||||
:source-extension "lisp"
|
||||
:depends-on (#:cl-ppcre)
|
||||
:components ((:file "ppcre-tests")))
|
||||
(defpackage :cl-ppcre-unicode
|
||||
#+:genera
|
||||
(:shadowing-import-from :common-lisp :lambda :string)
|
||||
(:use #-:genera :cl #+:genera :future-common-lisp
|
||||
:cl-ppcre :cl-unicode)
|
||||
(:import-from :cl-ppcre :signal-syntax-error)
|
||||
(:export :unicode-property-resolver))
|
||||
61
cl-ppcre-unicode/resolver.lisp
Normal file
61
cl-ppcre-unicode/resolver.lisp
Normal file
@ -0,0 +1,61 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-unicode/resolver.lisp,v 1.5 2008/07/23 02:14:08 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-ppcre-unicode)
|
||||
|
||||
(defun unicode-property-resolver (property-name)
|
||||
"A property resolver which understands Unicode properties using
|
||||
CL-UNICODE's PROPERTY-TEST function. This resolver is automatically
|
||||
installed in *PROPERTY-RESOLVER* when the CL-PPCRE-UNICODE system is
|
||||
loaded."
|
||||
(or (property-test property-name :errorp nil)
|
||||
(signal-syntax-error "There is no property named ~S." property-name)))
|
||||
|
||||
(setq *property-resolver* 'unicode-property-resolver)
|
||||
|
||||
(pushnew :cl-ppcre-unicode *features*)
|
||||
|
||||
;; stuff for Nikodemus Siivola's HYPERDOC
|
||||
;; see <http://common-lisp.net/project/hyperdoc/>
|
||||
;; and <http://www.cliki.net/hyperdoc>
|
||||
;; also used by LW-ADD-ONS
|
||||
|
||||
(defvar *hyperdoc-base-uri* "http://weitz.de/cl-ppcre/")
|
||||
|
||||
(let ((exported-symbols-alist
|
||||
(loop for symbol being the external-symbols of :cl-ppcre-unicode
|
||||
collect (cons symbol
|
||||
(concatenate 'string
|
||||
"#"
|
||||
(string-downcase symbol))))))
|
||||
(defun hyperdoc-lookup (symbol type)
|
||||
(declare (ignore type))
|
||||
(cdr (assoc symbol
|
||||
exported-symbols-alist
|
||||
:test #'eq))))
|
||||
31
cl-ppcre.asd
31
cl-ppcre.asd
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.30 2008/07/03 10:06:15 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.45 2008/07/23 02:14:06 edi Exp $
|
||||
|
||||
;;; This ASDF system definition was kindly provided by Marco Baringer.
|
||||
|
||||
@ -29,14 +29,23 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(asdf:defsystem :cl-ppcre
|
||||
:version "1.4.1"
|
||||
(in-package :cl-user)
|
||||
|
||||
(defpackage :cl-ppcre-asd
|
||||
(:use :cl :asdf))
|
||||
|
||||
(in-package :cl-ppcre-asd)
|
||||
|
||||
(defsystem :cl-ppcre
|
||||
:version "2.0.0"
|
||||
:serial t
|
||||
:components ((:file "packages")
|
||||
(:file "specials")
|
||||
(:file "charset")
|
||||
(:file "util")
|
||||
(:file "errors")
|
||||
(:file "charset")
|
||||
(:file "charmap")
|
||||
(:file "chartest")
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "lexer")
|
||||
#-:use-acl-regexp2-engine
|
||||
@ -44,6 +53,8 @@
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "regex-class")
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "regex-class-util")
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "convert")
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "optimize")
|
||||
@ -54,3 +65,15 @@
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "scanner")
|
||||
(:file "api")))
|
||||
|
||||
(defsystem :cl-ppcre-test
|
||||
:depends-on (:cl-ppcre :flexi-streams)
|
||||
:components ((:module "test"
|
||||
:serial t
|
||||
:components ((:file "packages")
|
||||
(:file "tests")
|
||||
(:file "perl-tests")))))
|
||||
|
||||
(defmethod perform ((o test-op) (c (eql (find-system :cl-ppcre))))
|
||||
(operate 'load-op :cl-ppcre-test)
|
||||
(funcall (intern (symbol-name :run-all-tests) (find-package :cl-ppcre-test))))
|
||||
|
||||
@ -1,59 +0,0 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.system,v 1.13 2007/01/01 23:43:10 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-user)
|
||||
|
||||
(defparameter *cl-ppcre-base-directory*
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
|
||||
(mk:defsystem #:cl-ppcre
|
||||
:source-pathname *cl-ppcre-base-directory*
|
||||
:source-extension "lisp"
|
||||
:components ((:file "packages")
|
||||
(:file "specials" :depends-on ("packages"))
|
||||
(:file "util" :depends-on ("packages"))
|
||||
(:file "errors" :depends-on ("util"))
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "lexer" :depends-on ("errors" "specials"))
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "parser" :depends-on ("lexer"))
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "regex-class" :depends-on ("parser"))
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "convert" :depends-on ("regex-class"))
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "optimize" :depends-on ("convert"))
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "closures" :depends-on ("optimize" "specials"))
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "repetition-closures" :depends-on ("closures"))
|
||||
#-:use-acl-regexp2-engine
|
||||
(:file "scanner" :depends-on ("repetition-closures"))
|
||||
(:file "api" :depends-on ("scanner"))))
|
||||
216
closures.lisp
216
closures.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.36 2008/07/03 07:44:06 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.44 2008/07/22 22:38:05 edi Exp $
|
||||
|
||||
;;; Here we create the closures which together build the final
|
||||
;;; scanner.
|
||||
@ -30,16 +30,15 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(declaim (inline *string*= *string*-equal))
|
||||
|
||||
(defun *string*= (string2 start1 end1 start2 end2)
|
||||
"Like STRING=, i.e. compares the special string *STRING* from START1
|
||||
to END1 with STRING2 from START2 to END2. Note that there's no
|
||||
boundary check - this has to be implemented by the caller."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum start1 end1 start2 end2))
|
||||
(declare (fixnum start1 end1 start2 end2))
|
||||
(loop for string1-idx of-type fixnum from start1 below end1
|
||||
for string2-idx of-type fixnum from start2 below end2
|
||||
always (char= (schar *string* string1-idx)
|
||||
@ -50,7 +49,7 @@ boundary check - this has to be implemented by the caller."
|
||||
START1 to END1 with STRING2 from START2 to END2. Note that there's no
|
||||
boundary check - this has to be implemented by the caller."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum start1 end1 start2 end2))
|
||||
(declare (fixnum start1 end1 start2 end2))
|
||||
(loop for string1-idx of-type fixnum from start1 below end1
|
||||
for string2-idx of-type fixnum from start2 below end2
|
||||
always (char-equal (schar *string* string1-idx)
|
||||
@ -81,7 +80,7 @@ such that the call to NEXT-FN after the match would succeed."))
|
||||
;; now create a closure which checks if one of the closures
|
||||
;; created above can succeed
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(loop for matcher in all-matchers
|
||||
thereis (funcall (the function matcher) start-pos)))))
|
||||
|
||||
@ -90,13 +89,13 @@ such that the call to NEXT-FN after the match would succeed."))
|
||||
;; the position of this REGISTER within the whole regex; we start to
|
||||
;; count at 0
|
||||
(let ((num (num register)))
|
||||
(declare (type fixnum num))
|
||||
(declare (fixnum num))
|
||||
;; STORE-END-OF-REG is a thin wrapper around NEXT-FN which will
|
||||
;; update the corresponding values of *REGS-START* and *REGS-END*
|
||||
;; after the inner matcher has succeeded
|
||||
(flet ((store-end-of-reg (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function next-fn))
|
||||
(declare (fixnum start-pos)
|
||||
(function next-fn))
|
||||
(setf (svref *reg-starts* num) (svref *regs-maybe-start* num)
|
||||
(svref *reg-ends* num) start-pos)
|
||||
(funcall next-fn start-pos)))
|
||||
@ -104,10 +103,10 @@ such that the call to NEXT-FN after the match would succeed."))
|
||||
;; wrapped by this REGISTER
|
||||
(let ((inner-matcher (create-matcher-aux (regex register)
|
||||
#'store-end-of-reg)))
|
||||
(declare (type function inner-matcher))
|
||||
(declare (function inner-matcher))
|
||||
;; here comes the actual closure for REGISTER
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
;; remember the old values of *REGS-START* and friends in
|
||||
;; case we cannot match
|
||||
(let ((old-*reg-starts* (svref *reg-starts* num))
|
||||
@ -129,7 +128,7 @@ such that the call to NEXT-FN after the match would succeed."))
|
||||
;; create a closure which just checks for the inner regex and
|
||||
;; doesn't care about NEXT-FN
|
||||
(let ((test-matcher (create-matcher-aux (regex lookahead) #'identity)))
|
||||
(declare (type function next-fn test-matcher))
|
||||
(declare (function next-fn test-matcher))
|
||||
(if (positivep lookahead)
|
||||
;; positive look-ahead: check success of inner regex, then call
|
||||
;; NEXT-FN
|
||||
@ -148,152 +147,52 @@ such that the call to NEXT-FN after the match would succeed."))
|
||||
;; create a closure which just checks for the inner regex and
|
||||
;; doesn't care about NEXT-FN
|
||||
(test-matcher (create-matcher-aux (regex lookbehind) #'identity)))
|
||||
(declare (type function next-fn test-matcher)
|
||||
(type fixnum len))
|
||||
(declare (function next-fn test-matcher)
|
||||
(fixnum len))
|
||||
(if (positivep lookbehind)
|
||||
;; positive look-behind: check success of inner regex (if we're
|
||||
;; far enough from the start of *STRING*), then call NEXT-FN
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (>= (- start-pos (or *real-start-pos* *start-pos*)) len)
|
||||
(funcall test-matcher (- start-pos len))
|
||||
(funcall next-fn start-pos)))
|
||||
;; negative look-behind: check failure of inner regex (if we're
|
||||
;; far enough from the start of *STRING*), then call NEXT-FN
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (or (< (- start-pos (or *real-start-pos* *start-pos*)) len)
|
||||
(not (funcall test-matcher (- start-pos len))))
|
||||
(funcall next-fn start-pos))))))
|
||||
|
||||
(defmacro insert-char-class-tester ((char-class chr-expr) &body body)
|
||||
"Utility macro to replace each occurence of '(CHAR-CLASS-TEST)
|
||||
"Utility macro to replace each occurence of '\(CHAR-CLASS-TEST)
|
||||
within BODY with the correct test (corresponding to CHAR-CLASS)
|
||||
against CHR-EXPR."
|
||||
(with-unique-names (%char-class)
|
||||
;; the actual substitution is done here: replace
|
||||
;; '(CHAR-CLASS-TEST) with NEW
|
||||
(with-rebinding (char-class)
|
||||
(with-unique-names (test-function)
|
||||
(flet ((substitute-char-class-tester (new)
|
||||
(subst new '(char-class-test) body
|
||||
:test #'equalp)))
|
||||
`(let* ((,%char-class ,char-class)
|
||||
(set (charset ,%char-class))
|
||||
(count (if set
|
||||
(charset-count set)
|
||||
most-positive-fixnum))
|
||||
;; collect a list of "all" characters in the set if
|
||||
;; there aren't more than two
|
||||
(all-chars (if (<= count 2)
|
||||
(all-characters set)
|
||||
nil))
|
||||
downcasedp)
|
||||
(declare (type fixnum count))
|
||||
;; check if we can partition the charset into three ranges (or
|
||||
;; less)
|
||||
(multiple-value-bind (min1 max1 min2 max2 min3 max3)
|
||||
(create-ranges-from-set set)
|
||||
;; if that didn't work and CHAR-CLASS is case-insensitive we
|
||||
;; try it again with every character downcased
|
||||
(when (and (not min1)
|
||||
(case-insensitive-p ,%char-class))
|
||||
(multiple-value-setq (min1 max1 min2 max2 min3 max3)
|
||||
(create-ranges-from-set set :downcasep t))
|
||||
(setq downcasedp t))
|
||||
(cond ((= count 1)
|
||||
;; charset contains exactly one character so we just
|
||||
;; check for this single character; (note that this
|
||||
;; actually can't happen because this case is
|
||||
;; optimized away in CONVERT already...)
|
||||
(let ((chr1 (first all-chars)))
|
||||
`(let ((,test-function (test-function ,char-class)))
|
||||
,@(substitute-char-class-tester
|
||||
`(char= ,chr-expr chr1))))
|
||||
((= count 2)
|
||||
;; set contains exactly two characters
|
||||
(let ((chr1 (first all-chars))
|
||||
(chr2 (second all-chars)))
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char= chr chr1)
|
||||
(char= chr chr2))))))
|
||||
((word-char-class-p ,%char-class)
|
||||
;; special-case: set is \w, \W, [\w], [\W] or
|
||||
;; something equivalent
|
||||
,@(substitute-char-class-tester
|
||||
`(word-char-p ,chr-expr)))
|
||||
((= count *regex-char-code-limit*)
|
||||
;; according to the ANSI standard we might have all
|
||||
;; possible characters in the set even if it doesn't
|
||||
;; contain CHAR-CODE-LIMIT characters but this
|
||||
;; doesn't seem to be the case for current
|
||||
;; implementations (also note that this optimization
|
||||
;; implies that you must not have characters with
|
||||
;; character codes beyond *REGEX-CHAR-CODE-LIMIT* in
|
||||
;; your regexes if you've changed this limit); we
|
||||
;; expect the compiler to optimize this T "test"
|
||||
;; away
|
||||
,@(substitute-char-class-tester t))
|
||||
((and downcasedp min1 min2 min3)
|
||||
;; three different ranges, downcased
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char-not-greaterp min1 chr max1)
|
||||
(char-not-greaterp min2 chr max2)
|
||||
(char-not-greaterp min3 chr max3)))))
|
||||
((and downcasedp min1 min2)
|
||||
;; two ranges, downcased
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char-not-greaterp min1 chr max1)
|
||||
(char-not-greaterp min2 chr max2)))))
|
||||
((and downcasedp min1)
|
||||
;; one downcased range
|
||||
,@(substitute-char-class-tester
|
||||
`(char-not-greaterp min1 ,chr-expr max1)))
|
||||
((and min1 min2 min3)
|
||||
;; three ranges
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char<= min1 chr max1)
|
||||
(char<= min2 chr max2)
|
||||
(char<= min3 chr max3)))))
|
||||
((and min1 min2)
|
||||
;; two ranges
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char<= min1 chr max1)
|
||||
(char<= min2 chr max2)))))
|
||||
(min1
|
||||
;; one range
|
||||
,@(substitute-char-class-tester
|
||||
`(char<= min1 ,chr-expr max1)))
|
||||
(t
|
||||
;; the general case; note that most of the above
|
||||
;; "optimizations" are based on early (2002)
|
||||
;; experiences and benchmarks with CMUCL
|
||||
,@(substitute-char-class-tester
|
||||
`(in-charset-p ,chr-expr set)))))))))
|
||||
`(funcall ,test-function ,chr-expr)))))))
|
||||
|
||||
(defmethod create-matcher-aux ((char-class char-class) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type function next-fn))
|
||||
(declare (function next-fn))
|
||||
;; insert a test against the current character within *STRING*
|
||||
(insert-char-class-tester (char-class (schar *string* start-pos))
|
||||
(if (invertedp char-class)
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(not (char-class-test))
|
||||
(funcall next-fn (1+ start-pos))))
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(char-class-test)
|
||||
(funcall next-fn (1+ start-pos)))))))
|
||||
(funcall next-fn (1+ start-pos))))))
|
||||
|
||||
(defmethod create-matcher-aux ((str str) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum *end-string-pos*)
|
||||
(type function next-fn)
|
||||
(declare (fixnum *end-string-pos*)
|
||||
(function next-fn)
|
||||
;; this special value is set by CREATE-SCANNER when the
|
||||
;; closures are built
|
||||
(special end-string))
|
||||
@ -307,15 +206,15 @@ against CHR-EXPR."
|
||||
(end-string-len (if end-string
|
||||
(length end-string)
|
||||
nil)))
|
||||
(declare (type fixnum len))
|
||||
(declare (fixnum len))
|
||||
(cond ((and start-of-end-string-p case-insensitive-p)
|
||||
;; closure for the first STR which belongs to the constant
|
||||
;; string at the end of the regular expression;
|
||||
;; case-insensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos end-string-len))
|
||||
(declare (fixnum start-pos end-string-len))
|
||||
(let ((test-end-pos (+ start-pos end-string-len)))
|
||||
(declare (type fixnum test-end-pos))
|
||||
(declare (fixnum test-end-pos))
|
||||
;; either we're at *END-STRING-POS* (which means that
|
||||
;; it has already been confirmed that end-string
|
||||
;; starts here) or we really have to test
|
||||
@ -329,9 +228,9 @@ against CHR-EXPR."
|
||||
;; string at the end of the regular expression;
|
||||
;; case-sensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos end-string-len))
|
||||
(declare (fixnum start-pos end-string-len))
|
||||
(let ((test-end-pos (+ start-pos end-string-len)))
|
||||
(declare (type fixnum test-end-pos))
|
||||
(declare (fixnum test-end-pos))
|
||||
;; either we're at *END-STRING-POS* (which means that
|
||||
;; it has already been confirmed that end-string
|
||||
;; starts here) or we really have to test
|
||||
@ -344,13 +243,13 @@ against CHR-EXPR."
|
||||
;; a STR which can be skipped because some other function
|
||||
;; has already confirmed that it matches
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(funcall next-fn (+ start-pos len))))
|
||||
((and (= len 1) case-insensitive-p)
|
||||
;; STR represent exactly one character; case-insensitive
|
||||
;; version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(char-equal (schar *string* start-pos) chr)
|
||||
(funcall next-fn (1+ start-pos)))))
|
||||
@ -358,35 +257,34 @@ against CHR-EXPR."
|
||||
;; STR represent exactly one character; case-sensitive
|
||||
;; version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(char= (schar *string* start-pos) chr)
|
||||
(funcall next-fn (1+ start-pos)))))
|
||||
(case-insensitive-p
|
||||
;; general case, case-insensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((next-pos (+ start-pos len)))
|
||||
(declare (type fixnum next-pos))
|
||||
(declare (fixnum next-pos))
|
||||
(and (<= next-pos *end-pos*)
|
||||
(*string*-equal str start-pos next-pos 0 len)
|
||||
(funcall next-fn next-pos)))))
|
||||
(t
|
||||
;; general case, case-sensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((next-pos (+ start-pos len)))
|
||||
(declare (type fixnum next-pos))
|
||||
(declare (fixnum next-pos))
|
||||
(and (<= next-pos *end-pos*)
|
||||
(*string*= str start-pos next-pos 0 len)
|
||||
(funcall next-fn next-pos))))))))
|
||||
|
||||
(declaim (inline word-boundary-p))
|
||||
|
||||
(defun word-boundary-p (start-pos)
|
||||
"Check whether START-POS is a word-boundary within *STRING*."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((1-start-pos (1- start-pos))
|
||||
(*start-pos* (or *real-start-pos* *start-pos*)))
|
||||
;; either the character before START-POS is a word-constituent and
|
||||
@ -407,7 +305,7 @@ against CHR-EXPR."
|
||||
|
||||
(defmethod create-matcher-aux ((word-boundary word-boundary) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type function next-fn))
|
||||
(declare (function next-fn))
|
||||
(if (negatedp word-boundary)
|
||||
(lambda (start-pos)
|
||||
(and (not (word-boundary-p start-pos))
|
||||
@ -418,25 +316,25 @@ against CHR-EXPR."
|
||||
|
||||
(defmethod create-matcher-aux ((everything everything) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type function next-fn))
|
||||
(declare (function next-fn))
|
||||
(if (single-line-p everything)
|
||||
;; closure for single-line-mode: we really match everything, so we
|
||||
;; just advance the index into *STRING* by one and carry on
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(funcall next-fn (1+ start-pos))))
|
||||
;; not single-line-mode, so we have to make sure we don't match
|
||||
;; #\Newline
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(char/= (schar *string* start-pos) #\Newline)
|
||||
(funcall next-fn (1+ start-pos))))))
|
||||
|
||||
(defmethod create-matcher-aux ((anchor anchor) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type function next-fn))
|
||||
(declare (function next-fn))
|
||||
(let ((startp (startp anchor))
|
||||
(multi-line-p (multi-line-p anchor)))
|
||||
(cond ((no-newline-p anchor)
|
||||
@ -444,14 +342,14 @@ against CHR-EXPR."
|
||||
;; we just have to check whether START-POS equals
|
||||
;; *END-POS*
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (= start-pos *end-pos*)
|
||||
(funcall next-fn start-pos))))
|
||||
((and startp multi-line-p)
|
||||
;; a start-anchor in multi-line-mode: check if we're at
|
||||
;; *START-POS* or if the last character was #\Newline
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((*start-pos* (or *real-start-pos* *start-pos*)))
|
||||
(and (or (= start-pos *start-pos*)
|
||||
(and (<= start-pos *end-pos*)
|
||||
@ -463,7 +361,7 @@ against CHR-EXPR."
|
||||
;; a start-anchor which is not in multi-line-mode, so just
|
||||
;; check whether we're at *START-POS*
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (= start-pos (or *real-start-pos* *start-pos*))
|
||||
(funcall next-fn start-pos))))
|
||||
(multi-line-p
|
||||
@ -471,7 +369,7 @@ against CHR-EXPR."
|
||||
;; *END-POS* or if the character we're looking at is
|
||||
;; #\Newline
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (or (= start-pos *end-pos*)
|
||||
(and (< start-pos *end-pos*)
|
||||
(char= #\Newline
|
||||
@ -482,7 +380,7 @@ against CHR-EXPR."
|
||||
;; check if we're at *END-POS* or if we're looking at
|
||||
;; #\Newline and there's nothing behind it
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (or (= start-pos *end-pos*)
|
||||
(and (= start-pos (1- *end-pos*))
|
||||
(char= #\Newline
|
||||
@ -491,14 +389,14 @@ against CHR-EXPR."
|
||||
|
||||
(defmethod create-matcher-aux ((back-reference back-reference) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type function next-fn))
|
||||
(declare (function next-fn))
|
||||
;; the position of the corresponding REGISTER within the whole
|
||||
;; regex; we start to count at 0
|
||||
(let ((num (num back-reference)))
|
||||
(if (case-insensitive-p back-reference)
|
||||
;; the case-insensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((reg-start (svref *reg-starts* num))
|
||||
(reg-end (svref *reg-ends* num)))
|
||||
;; only bother to check if the corresponding REGISTER as
|
||||
@ -506,7 +404,7 @@ against CHR-EXPR."
|
||||
(and reg-start
|
||||
(let ((next-pos (+ start-pos (- (the fixnum reg-end)
|
||||
(the fixnum reg-start)))))
|
||||
(declare (type fixnum next-pos))
|
||||
(declare (fixnum next-pos))
|
||||
(and
|
||||
(<= next-pos *end-pos*)
|
||||
(*string*-equal *string* start-pos next-pos
|
||||
@ -514,7 +412,7 @@ against CHR-EXPR."
|
||||
(funcall next-fn next-pos))))))
|
||||
;; the case-sensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((reg-start (svref *reg-starts* num))
|
||||
(reg-end (svref *reg-ends* num)))
|
||||
;; only bother to check if the corresponding REGISTER as
|
||||
@ -522,7 +420,7 @@ against CHR-EXPR."
|
||||
(and reg-start
|
||||
(let ((next-pos (+ start-pos (- (the fixnum reg-end)
|
||||
(the fixnum reg-start)))))
|
||||
(declare (type fixnum next-pos))
|
||||
(declare (fixnum next-pos))
|
||||
(and
|
||||
(<= next-pos *end-pos*)
|
||||
(*string*= *string* start-pos next-pos
|
||||
@ -534,17 +432,17 @@ against CHR-EXPR."
|
||||
(let* ((test (test branch))
|
||||
(then-matcher (create-matcher-aux (then-regex branch) next-fn))
|
||||
(else-matcher (create-matcher-aux (else-regex branch) next-fn)))
|
||||
(declare (type function then-matcher else-matcher))
|
||||
(declare (function then-matcher else-matcher))
|
||||
(cond ((numberp test)
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum test))
|
||||
(declare (fixnum test))
|
||||
(if (and (< test (length *reg-starts*))
|
||||
(svref *reg-starts* test))
|
||||
(funcall then-matcher start-pos)
|
||||
(funcall else-matcher start-pos))))
|
||||
(t
|
||||
(let ((test-matcher (create-matcher-aux test #'identity)))
|
||||
(declare (type function test-matcher))
|
||||
(declare (function test-matcher))
|
||||
(lambda (start-pos)
|
||||
(if (funcall test-matcher start-pos)
|
||||
(funcall then-matcher start-pos)
|
||||
@ -553,7 +451,7 @@ against CHR-EXPR."
|
||||
(defmethod create-matcher-aux ((standalone standalone) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let ((inner-matcher (create-matcher-aux (regex standalone) #'identity)))
|
||||
(declare (type function next-fn inner-matcher))
|
||||
(declare (function next-fn inner-matcher))
|
||||
(lambda (start-pos)
|
||||
(let ((next-pos (funcall inner-matcher start-pos)))
|
||||
(and next-pos
|
||||
|
||||
756
convert.lisp
756
convert.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.29 2008/07/03 07:44:06 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.54 2008/07/23 02:14:06 edi Exp $
|
||||
|
||||
;;; Here the parse tree is converted into its internal representation
|
||||
;;; using REGEX objects. At the same time some optimizations are
|
||||
@ -31,7 +31,7 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
;;; The flags that represent the "ism" modifiers are always kept
|
||||
;;; together in a three-element list. We use the following macros to
|
||||
@ -50,10 +50,10 @@
|
||||
`(third ,flags))
|
||||
|
||||
(defun set-flag (token)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags))
|
||||
"Reads a flag token and sets or unsets the corresponding entry in
|
||||
the special FLAGS list."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags))
|
||||
(case token
|
||||
((:case-insensitive-p)
|
||||
(setf (case-insensitive-mode-p flags) t))
|
||||
@ -68,70 +68,107 @@ the special FLAGS list."
|
||||
((:not-single-line-mode-p)
|
||||
(setf (single-line-mode-p flags) nil))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error "Unknown flag token ~A" token))))
|
||||
(signal-syntax-error "Unknown flag token ~A." token))))
|
||||
|
||||
(defun add-range-to-set (set from to)
|
||||
(defgeneric resolve-property (property)
|
||||
(:documentation "Resolves PROPERTY to a unary character test
|
||||
function. PROPERTY can either be a function designator or it can be a
|
||||
string which is resolved using *PROPERTY-RESOLVER*.")
|
||||
(:method ((property-name string))
|
||||
(funcall *property-resolver* property-name))
|
||||
(:method ((function-name symbol))
|
||||
function-name)
|
||||
(:method ((test-function function))
|
||||
test-function))
|
||||
|
||||
(defun convert-char-class-to-test-function (list invertedp case-insensitive-p)
|
||||
"Combines all items in LIST into test function and returns a
|
||||
logical-OR combination of these functions. Items can be single
|
||||
characters, character ranges like \(:RANGE #\\A #\\E), or special
|
||||
character classes like :DIGIT-CLASS. Does the right thing with
|
||||
respect to case-\(in)sensitivity as specified by the special variable
|
||||
FLAGS."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags))
|
||||
"Adds all characters from character FROM to character TO
|
||||
\(inclusive) to the charset SET. Does the right thing with respect to
|
||||
case-\(in)sensitivity as specified by the special variable FLAGS."
|
||||
(let ((from-code (char-code from))
|
||||
(to-code (char-code to)))
|
||||
(when (> from-code to-code)
|
||||
(signal-ppcre-syntax-error "Invalid range from ~A to ~A in char-class"
|
||||
from to))
|
||||
(cond ((case-insensitive-mode-p flags)
|
||||
(loop for code from from-code to to-code
|
||||
for char = (code-char code)
|
||||
do (add-to-charset (char-upcase char) set)
|
||||
(add-to-charset (char-downcase char) set)))
|
||||
(t
|
||||
(loop for code from from-code to to-code
|
||||
do (add-to-charset (code-char code) set))))
|
||||
set))
|
||||
|
||||
(defun convert-char-class-to-charset (list)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Combines all items in LIST into one charset and returns it. Items
|
||||
can be single characters, character ranges like \(:RANGE #\\A #\\E),
|
||||
or special character classes like :DIGIT-CLASS. Does the right thing
|
||||
with respect to case-\(in)sensitivity as specified by the special
|
||||
variable FLAGS."
|
||||
(loop with set = (make-charset)
|
||||
for item in list
|
||||
if (characterp item)
|
||||
;; treat a single character C like a range (:RANGE C C)
|
||||
do (add-range-to-set set item item)
|
||||
else if (symbolp item)
|
||||
;; special character classes
|
||||
do (setq set
|
||||
(let ((test-functions
|
||||
(loop for item in list
|
||||
collect (cond ((characterp item)
|
||||
;; rebind so closure captures the right one
|
||||
(let ((this-char item))
|
||||
(lambda (char)
|
||||
(declare (character char this-char))
|
||||
(char= char this-char))))
|
||||
((symbolp item)
|
||||
(case item
|
||||
((:digit-class)
|
||||
(merge-set set +digit-set+))
|
||||
((:non-digit-class)
|
||||
(merge-set set +digit-set+ t))
|
||||
((:whitespace-char-class)
|
||||
(merge-set set +whitespace-char-set+))
|
||||
((:non-whitespace-char-class)
|
||||
(merge-set set +whitespace-char-set+ t))
|
||||
((:word-char-class)
|
||||
(merge-set set +word-char-set+))
|
||||
((:non-word-char-class)
|
||||
(merge-set set +word-char-set+ t))
|
||||
((:digit-class) #'digit-char-p)
|
||||
((:non-digit-class) (complement* #'digit-char-p))
|
||||
((:whitespace-char-class) #'whitespacep)
|
||||
((:non-whitespace-char-class) (complement* #'whitespacep))
|
||||
((:word-char-class) #'word-char-p)
|
||||
((:non-word-char-class) (complement* #'word-char-p))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error
|
||||
"Unknown symbol ~A in character class"
|
||||
item))))
|
||||
else if (and (consp item)
|
||||
(eq (car item) :range))
|
||||
;; proper ranges
|
||||
do (add-range-to-set set
|
||||
(second item)
|
||||
(third item))
|
||||
else do (signal-ppcre-syntax-error "Unknown item ~A in char-class list"
|
||||
item)
|
||||
finally (return set)))
|
||||
(signal-syntax-error "Unknown symbol ~A in character class." item))))
|
||||
((and (consp item)
|
||||
(eq (first item) :property))
|
||||
(resolve-property (second item)))
|
||||
((and (consp item)
|
||||
(eq (first item) :inverted-property))
|
||||
(complement* (resolve-property (second item))))
|
||||
((and (consp item)
|
||||
(eq (first item) :range))
|
||||
(let ((from (second item))
|
||||
(to (third item)))
|
||||
(when (char> from to)
|
||||
(signal-syntax-error "Invalid range from ~S to ~S in char-class." from to))
|
||||
(lambda (char)
|
||||
(declare (character char from to))
|
||||
(char<= from char to))))
|
||||
(t (signal-syntax-error "Unknown item ~A in char-class list." item))))))
|
||||
(unless test-functions
|
||||
(signal-syntax-error "Empty character class."))
|
||||
(cond ((cdr test-functions)
|
||||
(cond ((and invertedp case-insensitive-p)
|
||||
(lambda (char)
|
||||
(declare (character char))
|
||||
(loop with both-case-p = (both-case-p char)
|
||||
with char-down = (if both-case-p (char-downcase char) char)
|
||||
with char-up = (if both-case-p (char-upcase char) nil)
|
||||
for test-function in test-functions
|
||||
never (or (funcall test-function char-down)
|
||||
(and char-up (funcall test-function char-up))))))
|
||||
(case-insensitive-p
|
||||
(lambda (char)
|
||||
(declare (character char))
|
||||
(loop with both-case-p = (both-case-p char)
|
||||
with char-down = (if both-case-p (char-downcase char) char)
|
||||
with char-up = (if both-case-p (char-upcase char) nil)
|
||||
for test-function in test-functions
|
||||
thereis (or (funcall test-function char-down)
|
||||
(and char-up (funcall test-function char-up))))))
|
||||
(invertedp
|
||||
(lambda (char)
|
||||
(loop for test-function in test-functions
|
||||
never (funcall test-function char))))
|
||||
(t
|
||||
(lambda (char)
|
||||
(loop for test-function in test-functions
|
||||
thereis (funcall test-function char))))))
|
||||
;; there's only one test-function
|
||||
(t (let ((test-function (first test-functions)))
|
||||
(cond ((and invertedp case-insensitive-p)
|
||||
(lambda (char)
|
||||
(declare (character char))
|
||||
(not (or (funcall test-function (char-downcase char))
|
||||
(and (both-case-p char)
|
||||
(funcall test-function (char-upcase char)))))))
|
||||
(case-insensitive-p
|
||||
(lambda (char)
|
||||
(declare (character char))
|
||||
(or (funcall test-function (char-downcase char))
|
||||
(and (both-case-p char)
|
||||
(funcall test-function (char-upcase char))))))
|
||||
(invertedp (complement* test-function))
|
||||
(t test-function)))))))
|
||||
|
||||
(defun maybe-split-repetition (regex
|
||||
greedyp
|
||||
@ -140,14 +177,14 @@ variable FLAGS."
|
||||
min-len
|
||||
length
|
||||
reg-seen)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum minimum)
|
||||
(type (or fixnum null) maximum))
|
||||
"Splits a REPETITION object into a constant and a varying part if
|
||||
applicable, i.e. something like
|
||||
a{3,} -> a{3}a*
|
||||
The arguments to this function correspond to the REPETITION slots of
|
||||
the same name."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (fixnum minimum)
|
||||
((or fixnum null) maximum))
|
||||
;; note the usage of COPY-REGEX here; we can't use the same REGEX
|
||||
;; object in both REPETITIONS because they will have different
|
||||
;; offsets
|
||||
@ -209,12 +246,12 @@ the same name."
|
||||
;; regex at the start (perhaps modulo #\Newline).
|
||||
|
||||
(defun maybe-accumulate (str)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p starts-with))
|
||||
(declare (ftype (function (t) fixnum) len))
|
||||
"Accumulate STR into the special variable STARTS-WITH if
|
||||
ACCUMULATE-START-P (also special) is true and STARTS-WITH is either
|
||||
NIL or a STR object of the same case mode. Always returns NIL."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p starts-with))
|
||||
(declare (ftype (function (t) fixnum) len))
|
||||
(when accumulate-start-p
|
||||
(etypecase starts-with
|
||||
(str
|
||||
@ -263,12 +300,11 @@ NIL or a STR object of the same case mode. Always returns NIL."
|
||||
(setq accumulate-start-p nil))))
|
||||
nil)
|
||||
|
||||
(declaim (inline convert-aux))
|
||||
(defun convert-aux (parse-tree)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags reg-num reg-names accumulate-start-p starts-with max-back-ref))
|
||||
"Converts the parse tree PARSE-TREE into a REGEX object and returns it.
|
||||
"Converts the parse tree PARSE-TREE into a REGEX object and returns
|
||||
it. Will also
|
||||
|
||||
Will also
|
||||
- split and optimize repetitions,
|
||||
- accumulate strings or EVERYTHING objects into the special variable
|
||||
STARTS-WITH,
|
||||
@ -279,10 +315,22 @@ Will also
|
||||
- maintain and adher to the currently applicable modifiers in the special
|
||||
variable FLAGS, and
|
||||
- maybe even wash your car..."
|
||||
(cond ((consp parse-tree)
|
||||
(case (first parse-tree)
|
||||
;; (:SEQUENCE {<regex>}*)
|
||||
((:sequence)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(if (consp parse-tree)
|
||||
(convert-compound-parse-tree (first parse-tree) parse-tree)
|
||||
(convert-simple-parse-tree parse-tree)))
|
||||
|
||||
(defgeneric convert-compound-parse-tree (token parse-tree &key)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Helper function for CONVERT-AUX which converts
|
||||
parse trees which are conses and dispatches on TOKEN which is the
|
||||
first element of the parse tree.")
|
||||
(:method (token parse-tree &key)
|
||||
(signal-syntax-error "Unknown token ~A in parse-tree." token)))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :sequence)) parse-tree &key)
|
||||
"The case for parse trees like \(:SEQUENCE {<regex>}*)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(cond ((cddr parse-tree)
|
||||
;; this is essentially like
|
||||
;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE))
|
||||
@ -291,15 +339,18 @@ Will also
|
||||
while parse-tree-rest
|
||||
do (setf (car parse-tree-rest)
|
||||
(convert-aux (car parse-tree-rest))))
|
||||
(make-instance 'seq
|
||||
:elements (rest parse-tree)))
|
||||
(make-instance 'seq :elements (rest parse-tree)))
|
||||
(t (convert-aux (second parse-tree)))))
|
||||
;; (:GROUP {<regex>}*)
|
||||
;; this is a syntactical construct equivalent to :SEQUENCE
|
||||
;; intended to keep the effect of modifiers local
|
||||
((:group)
|
||||
;; make a local copy of FLAGS and shadow the global
|
||||
;; value while we descend into the enclosed regexes
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :group)) parse-tree &key)
|
||||
"The case for parse trees like \(:GROUP {<regex>}*).
|
||||
|
||||
This is a syntactical construct equivalent to :SEQUENCE intended to
|
||||
keep the effect of modifiers local."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags))
|
||||
;; make a local copy of FLAGS and shadow the global value while we
|
||||
;; descend into the enclosed regexes
|
||||
(let ((flags (copy-list flags)))
|
||||
(declare (special flags))
|
||||
(cond ((cddr parse-tree)
|
||||
@ -307,47 +358,46 @@ Will also
|
||||
while parse-tree-rest
|
||||
do (setf (car parse-tree-rest)
|
||||
(convert-aux (car parse-tree-rest))))
|
||||
(make-instance 'seq
|
||||
:elements (rest parse-tree)))
|
||||
(make-instance 'seq :elements (rest parse-tree)))
|
||||
(t (convert-aux (second parse-tree))))))
|
||||
;; (:ALTERNATION {<regex>}*)
|
||||
((:alternation)
|
||||
;; we must stop accumulating objects into STARTS-WITH
|
||||
;; once we reach an alternation
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :alternation)) parse-tree &key)
|
||||
"The case for \(:ALTERNATION {<regex>}*)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
;; we must stop accumulating objects into STARTS-WITH once we reach
|
||||
;; an alternation
|
||||
(setq accumulate-start-p nil)
|
||||
(loop for parse-tree-rest on (rest parse-tree)
|
||||
while parse-tree-rest
|
||||
do (setf (car parse-tree-rest)
|
||||
(convert-aux (car parse-tree-rest))))
|
||||
(make-instance 'alternation
|
||||
:choices (rest parse-tree)))
|
||||
;; (:BRANCH <test> <regex>)
|
||||
;; <test> must be look-ahead, look-behind or number;
|
||||
;; if <regex> is an alternation it must have one or two
|
||||
;; choices
|
||||
((:branch)
|
||||
(make-instance 'alternation :choices (rest parse-tree)))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :branch)) parse-tree &key)
|
||||
"The case for \(:BRANCH <test> <regex>).
|
||||
|
||||
Here, <test> must be look-ahead, look-behind or number; if <regex> is
|
||||
an alternation it must have one or two choices."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
(setq accumulate-start-p nil)
|
||||
(let* ((test-candidate (second parse-tree))
|
||||
(test (cond ((numberp test-candidate)
|
||||
(when (zerop (the fixnum test-candidate))
|
||||
(signal-ppcre-syntax-error
|
||||
"Register 0 doesn't exist: ~S"
|
||||
parse-tree))
|
||||
(signal-syntax-error "Register 0 doesn't exist: ~S." parse-tree))
|
||||
(1- (the fixnum test-candidate)))
|
||||
(t (convert-aux test-candidate))))
|
||||
(alternations (convert-aux (third parse-tree))))
|
||||
(when (and (not (numberp test))
|
||||
(not (typep test 'lookahead))
|
||||
(not (typep test 'lookbehind)))
|
||||
(signal-ppcre-syntax-error
|
||||
"Branch test must be look-ahead, look-behind or number: ~S"
|
||||
parse-tree))
|
||||
(signal-syntax-error "Branch test must be look-ahead, look-behind or number: ~S." parse-tree))
|
||||
(typecase alternations
|
||||
(alternation
|
||||
(case (length (choices alternations))
|
||||
((0)
|
||||
(signal-ppcre-syntax-error "No choices in branch: ~S"
|
||||
parse-tree))
|
||||
(signal-syntax-error "No choices in branch: ~S." parse-tree))
|
||||
((1)
|
||||
(make-instance 'branch
|
||||
:test test
|
||||
@ -361,28 +411,39 @@ Will also
|
||||
:else-regex (second
|
||||
(choices alternations))))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error
|
||||
"Too much choices in branch: ~S"
|
||||
parse-tree))))
|
||||
(signal-syntax-error "Too much choices in branch: ~S." parse-tree))))
|
||||
(t
|
||||
(make-instance 'branch
|
||||
:test test
|
||||
:then-regex alternations)))))
|
||||
;; (:POSITIVE-LOOKAHEAD|:NEGATIVE-LOOKAHEAD <regex>)
|
||||
((:positive-lookahead :negative-lookahead)
|
||||
;; keep the effect of modifiers local to the enclosed
|
||||
;; regex and stop accumulating into STARTS-WITH
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :positive-lookahead)) parse-tree &key)
|
||||
"The case for \(:POSITIVE-LOOKAHEAD <regex>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags accumulate-start-p))
|
||||
;; keep the effect of modifiers local to the enclosed regex and stop
|
||||
;; accumulating into STARTS-WITH
|
||||
(setq accumulate-start-p nil)
|
||||
(let ((flags (copy-list flags)))
|
||||
(declare (special flags))
|
||||
(make-instance 'lookahead
|
||||
:regex (convert-aux (second parse-tree))
|
||||
:positivep (eq (first parse-tree)
|
||||
:positive-lookahead))))
|
||||
;; (:POSITIVE-LOOKBEHIND|:NEGATIVE-LOOKBEHIND <regex>)
|
||||
((:positive-lookbehind :negative-lookbehind)
|
||||
;; keep the effect of modifiers local to the enclosed
|
||||
;; regex and stop accumulating into STARTS-WITH
|
||||
:positivep t)))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :negative-lookahead)) parse-tree &key)
|
||||
"The case for \(:NEGATIVE-LOOKAHEAD <regex>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; do the same as for positive look-aheads and just switch afterwards
|
||||
(let ((regex (convert-compound-parse-tree :positive-lookahead parse-tree)))
|
||||
(setf (slot-value regex 'positivep) nil)
|
||||
regex))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :positive-lookbehind)) parse-tree &key)
|
||||
"The case for \(:POSITIVE-LOOKBEHIND <regex>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags accumulate-start-p))
|
||||
;; keep the effect of modifiers local to the enclosed regex and stop
|
||||
;; accumulating into STARTS-WITH
|
||||
(setq accumulate-start-p nil)
|
||||
(let* ((flags (copy-list flags))
|
||||
(regex (convert-aux (second parse-tree)))
|
||||
@ -390,21 +451,32 @@ Will also
|
||||
(declare (special flags))
|
||||
;; lookbehind assertions must be of fixed length
|
||||
(unless len
|
||||
(signal-ppcre-syntax-error
|
||||
"Variable length look-behind not implemented (yet): ~S"
|
||||
parse-tree))
|
||||
(signal-syntax-error "Variable length look-behind not implemented \(yet): ~S." parse-tree))
|
||||
(make-instance 'lookbehind
|
||||
:regex regex
|
||||
:positivep (eq (first parse-tree)
|
||||
:positive-lookbehind)
|
||||
:positivep t
|
||||
:len len)))
|
||||
;; (:GREEDY-REPETITION|:NON-GREEDY-REPETITION <min> <max> <regex>)
|
||||
((:greedy-repetition :non-greedy-repetition)
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :negative-lookbehind)) parse-tree &key)
|
||||
"The case for \(:NEGATIVE-LOOKBEHIND <regex>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; do the same as for positive look-behinds and just switch afterwards
|
||||
(let ((regex (convert-compound-parse-tree :positive-lookbehind parse-tree)))
|
||||
(setf (slot-value regex 'positivep) nil)
|
||||
regex))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :greedy-repetition)) parse-tree &key (greedyp t))
|
||||
"The case for \(:GREEDY-REPETITION|:NON-GREEDY-REPETITION <min> <max> <regex>).
|
||||
|
||||
This function is also used for the non-greedy case in which case it is
|
||||
called with GREEDYP set to NIL as you would expect."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p starts-with))
|
||||
;; remember the value of ACCUMULATE-START-P upon entering
|
||||
(let ((local-accumulate-start-p accumulate-start-p))
|
||||
(let ((minimum (second parse-tree))
|
||||
(maximum (third parse-tree)))
|
||||
(declare (type fixnum minimum))
|
||||
(declare (fixnum minimum))
|
||||
(declare (type (or null fixnum) maximum))
|
||||
(unless (and maximum
|
||||
(= 1 minimum maximum))
|
||||
@ -416,7 +488,6 @@ Will also
|
||||
(let* (reg-seen
|
||||
(regex (convert-aux (fourth parse-tree)))
|
||||
(min-len (regex-min-length regex))
|
||||
(greedyp (eq (first parse-tree) :greedy-repetition))
|
||||
(length (regex-length regex)))
|
||||
;; note that this declaration already applies to
|
||||
;; the call to CONVERT-AUX above
|
||||
@ -493,53 +564,66 @@ Will also
|
||||
min-len
|
||||
nil
|
||||
t))))))))
|
||||
;; (:REGISTER <regex>)
|
||||
;; (:NAMED-REGISTER <name> <regex>)
|
||||
((:register :named-register)
|
||||
;; keep the effect of modifiers local to the enclosed
|
||||
;; regex; also, assign the current value of REG-NUM to
|
||||
;; the corresponding slot of the REGISTER object and
|
||||
;; increase this counter afterwards; for named register
|
||||
;; update REG-NAMES and set the corresponding name slot
|
||||
;; of the REGISTER object too
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :non-greedy-repetition)) parse-tree &key)
|
||||
"The case for \(:NON-GREEDY-REPETITION <min> <max> <regex>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; just dispatch to the method above with GREEDYP explicitly set to NIL
|
||||
(convert-compound-parse-tree :greedy-repetition parse-tree :greedyp nil))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :register)) parse-tree &key name)
|
||||
"The case for \(:REGISTER <regex>). Also used for named registers
|
||||
when NAME is not NIL."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags reg-num reg-names))
|
||||
;; keep the effect of modifiers local to the enclosed regex; also,
|
||||
;; assign the current value of REG-NUM to the corresponding slot of
|
||||
;; the REGISTER object and increase this counter afterwards; for
|
||||
;; named register update REG-NAMES and set the corresponding name
|
||||
;; slot of the REGISTER object too
|
||||
(let ((flags (copy-list flags))
|
||||
(stored-reg-num reg-num)
|
||||
(reg-name (when (eq (first parse-tree) :named-register)
|
||||
(copy-seq (second parse-tree)))))
|
||||
(stored-reg-num reg-num))
|
||||
(declare (special flags reg-seen named-reg-seen))
|
||||
(setq reg-seen t)
|
||||
(when reg-name
|
||||
(setq named-reg-seen t))
|
||||
(when name (setq named-reg-seen t))
|
||||
(incf (the fixnum reg-num))
|
||||
(push reg-name
|
||||
reg-names)
|
||||
(push name reg-names)
|
||||
(make-instance 'register
|
||||
:regex (convert-aux (if (eq (first parse-tree) :named-register)
|
||||
(third parse-tree)
|
||||
(second parse-tree)))
|
||||
:regex (convert-aux (if name (third parse-tree) (second parse-tree)))
|
||||
:num stored-reg-num
|
||||
:name reg-name)))
|
||||
;; (:FILTER <function> &optional <length>)
|
||||
((:filter)
|
||||
:name name)))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :named-register)) parse-tree &key)
|
||||
"The case for \(:NAMED-REGISTER <regex>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; call the method above and use the :NAME keyword argument
|
||||
(convert-compound-parse-tree :register parse-tree :name (copy-seq (second parse-tree))))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :filter)) parse-tree &key)
|
||||
"The case for \(:FILTER <function> &optional <length>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
;; stop accumulating into STARTS-WITH
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'filter
|
||||
:fn (second parse-tree)
|
||||
:len (third parse-tree)))
|
||||
;; (:STANDALONE <regex>)
|
||||
((:standalone)
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :standalone)) parse-tree &key)
|
||||
"The case for \(:STANDALONE <regex>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags accumulate-start-p))
|
||||
;; stop accumulating into STARTS-WITH
|
||||
(setq accumulate-start-p nil)
|
||||
;; keep the effect of modifiers local to the enclosed
|
||||
;; regex
|
||||
;; keep the effect of modifiers local to the enclosed regex
|
||||
(let ((flags (copy-list flags)))
|
||||
(declare (special flags))
|
||||
(make-instance 'standalone
|
||||
:regex (convert-aux (second parse-tree)))))
|
||||
;; (:BACK-REFERENCE <number>)
|
||||
;; (:BACK-REFERENCE <name>)
|
||||
((:back-reference)
|
||||
(locally (declare (special reg-names reg-num))
|
||||
(make-instance 'standalone :regex (convert-aux (second parse-tree)))))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :back-reference)) parse-tree &key)
|
||||
"The case for \(:BACK-REFERENCE <number>|<name>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags accumulate-start-p reg-num reg-names max-back-ref))
|
||||
(let* ((backref-name (and (stringp (second parse-tree))
|
||||
(second parse-tree)))
|
||||
(referred-regs
|
||||
@ -547,25 +631,21 @@ Will also
|
||||
;; find which register corresponds to the given name
|
||||
;; we have to deal with case where several registers share
|
||||
;; the same name and collect their respective numbers
|
||||
(loop
|
||||
for name in reg-names
|
||||
(loop for name in reg-names
|
||||
for reg-index from 0
|
||||
when (string= name backref-name)
|
||||
;; NOTE: REG-NAMES stores register names in reversed order
|
||||
;; REG-NUM contains number of (any) registers seen so far
|
||||
;; 1- will be done later
|
||||
;; NOTE: REG-NAMES stores register names in reversed
|
||||
;; order REG-NUM contains number of (any) registers
|
||||
;; seen so far; 1- will be done later
|
||||
collect (- reg-num reg-index))))
|
||||
;; store the register number for the simple case
|
||||
(backref-number (or (first referred-regs)
|
||||
(second parse-tree))))
|
||||
(backref-number (or (first referred-regs) (second parse-tree))))
|
||||
(declare (type (or fixnum null) backref-number))
|
||||
(when (or (not (typep backref-number 'fixnum))
|
||||
(<= backref-number 0))
|
||||
(signal-ppcre-syntax-error
|
||||
"Illegal back-reference: ~S"
|
||||
parse-tree))
|
||||
;; stop accumulating into STARTS-WITH and increase
|
||||
;; MAX-BACK-REF if necessary
|
||||
(signal-syntax-error "Illegal back-reference: ~S." parse-tree))
|
||||
;; stop accumulating into STARTS-WITH and increase MAX-BACK-REF if
|
||||
;; necessary
|
||||
(setq accumulate-start-p nil
|
||||
max-back-ref (max (the fixnum max-back-ref)
|
||||
backref-number))
|
||||
@ -578,9 +658,8 @@ Will also
|
||||
:name (copy-seq backref-name))))
|
||||
(cond
|
||||
((cdr referred-regs)
|
||||
;; several registers share the same name
|
||||
;; we will try to match any of them, starting
|
||||
;; with the most recent first
|
||||
;; several registers share the same name we will try to match
|
||||
;; any of them, starting with the most recent first
|
||||
;; alternation is used to accomplish matching
|
||||
(make-instance 'alternation
|
||||
:choices (loop
|
||||
@ -588,72 +667,51 @@ Will also
|
||||
collect (make-back-ref reg-index))))
|
||||
;; simple case - backref corresponds to only one register
|
||||
(t
|
||||
(make-back-ref backref-number)))))))
|
||||
;; (:REGEX <string>)
|
||||
((:regex)
|
||||
(let ((regex (second parse-tree)))
|
||||
(convert-aux (parse-string regex))))
|
||||
;; (:CHAR-CLASS|:INVERTED-CHAR-CLASS {<item>}*)
|
||||
;; where item is one of
|
||||
;; - a character
|
||||
;; - a character range: (:RANGE <char1> <char2>)
|
||||
;; - a special char class symbol like :DIGIT-CHAR-CLASS
|
||||
((:char-class :inverted-char-class)
|
||||
;; first create the charset and some auxiliary values
|
||||
(let* (set set-contents
|
||||
(count most-positive-fixnum)
|
||||
(item-list (rest parse-tree))
|
||||
(invertedp (eq (first parse-tree) :inverted-char-class))
|
||||
word-char-class-p)
|
||||
(cond ((every (lambda (item) (eq item :word-char-class))
|
||||
item-list)
|
||||
;; treat "[\\w]" like "\\w"
|
||||
(setq word-char-class-p t))
|
||||
((every (lambda (item) (eq item :non-word-char-class))
|
||||
item-list)
|
||||
;; treat "[\\W]" like "\\W"
|
||||
(setq word-char-class-p t)
|
||||
(setq invertedp (not invertedp)))
|
||||
(t
|
||||
(setq set (convert-char-class-to-charset item-list)
|
||||
count (charset-count set))
|
||||
(when (<= count 2)
|
||||
;; collect the contents of SET into a list if
|
||||
;; COUNT is smaller than 3
|
||||
(setq set-contents (all-characters set)))))
|
||||
(cond ((and (not invertedp)
|
||||
(= count 1))
|
||||
;; convert one-element charset into a STR object
|
||||
;; and try to accumulate into STARTS-WITH
|
||||
(let ((str (make-instance 'str
|
||||
:str (string (first set-contents))
|
||||
:case-insensitive-p nil)))
|
||||
(maybe-accumulate str)
|
||||
str))
|
||||
((and (not invertedp)
|
||||
(= count 2)
|
||||
(char-equal (first set-contents) (second set-contents)))
|
||||
;; convert two-element charset into a
|
||||
;; case-insensitive STR object and try to
|
||||
;; accumulate into STARTS-WITH if the two
|
||||
;; characters are CHAR-EQUAL
|
||||
(let ((str (make-instance 'str
|
||||
:str (string (first set-contents))
|
||||
:case-insensitive-p t)))
|
||||
(maybe-accumulate str)
|
||||
str))
|
||||
(t
|
||||
;; the general case; stop accumulating into STARTS-WITH
|
||||
(make-back-ref backref-number))))))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :regex)) parse-tree &key)
|
||||
"The case for \(:REGEX <string>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(convert-aux (parse-string (second parse-tree))))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :char-class)) parse-tree &key invertedp)
|
||||
"The case for \(:CHAR-CLASS {<item>}*) where item is one of
|
||||
|
||||
- a character,
|
||||
- a character range: \(:RANGE <char1> <char2>), or
|
||||
- a special char class symbol like :DIGIT-CHAR-CLASS.
|
||||
|
||||
Also used for inverted char classes when INVERTEDP is true."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags accumulate-start-p))
|
||||
(let ((test-function
|
||||
(create-optimized-test-function
|
||||
(convert-char-class-to-test-function (rest parse-tree)
|
||||
invertedp
|
||||
(case-insensitive-mode-p flags)))))
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class
|
||||
:charset set
|
||||
:case-insensitive-p
|
||||
(case-insensitive-mode-p flags)
|
||||
:invertedp invertedp
|
||||
:word-char-class-p word-char-class-p)))))
|
||||
;; (:FLAGS {<flag>}*)
|
||||
;; where flag is a modifier symbol like :CASE-INSENSITIVE-P
|
||||
((:flags)
|
||||
(make-instance 'char-class :test-function test-function)))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :inverted-char-class)) parse-tree &key)
|
||||
"The case for \(:INVERTED-CHAR-CLASS {<item>}*)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; just dispatch to the "real" method
|
||||
(convert-compound-parse-tree :char-class parse-tree :invertedp t))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :property)) parse-tree &key)
|
||||
"The case for \(:PROPERTY <name>) where <name> is a string."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'char-class :test-function (resolve-property (second parse-tree))))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :inverted-property)) parse-tree &key)
|
||||
"The case for \(:INVERTED-PROPERTY <name>) where <name> is a string."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'char-class :test-function (complement* (resolve-property (second parse-tree)))))
|
||||
|
||||
(defmethod convert-compound-parse-tree ((token (eql :flags)) parse-tree &key)
|
||||
"The case for \(:FLAGS {<flag>}*) where flag is a modifier symbol
|
||||
like :CASE-INSENSITIVE-P."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; set/unset the flags corresponding to the symbols
|
||||
;; following :FLAGS
|
||||
(mapc #'set-flag (rest parse-tree))
|
||||
@ -662,119 +720,133 @@ Will also
|
||||
;; construct into a VOID object which'll be optimized
|
||||
;; away when creating the matcher
|
||||
(make-instance 'void))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error
|
||||
"Unknown token ~A in parse-tree"
|
||||
(first parse-tree)))))
|
||||
((or (characterp parse-tree) (stringp parse-tree))
|
||||
;; turn characters or strings into STR objects and try to
|
||||
;; accumulate into STARTS-WITH
|
||||
(let ((str (make-instance 'str
|
||||
:str (string parse-tree)
|
||||
:case-insensitive-p
|
||||
(case-insensitive-mode-p flags))))
|
||||
(maybe-accumulate str)
|
||||
str))
|
||||
(t
|
||||
;; and now for the tokens which are symbols
|
||||
(case parse-tree
|
||||
((:void)
|
||||
|
||||
(defgeneric convert-simple-parse-tree (parse-tree)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Helper function for CONVERT-AUX which converts
|
||||
parse trees which are atoms.")
|
||||
(:method ((parse-tree (eql :void)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'void))
|
||||
((:word-boundary)
|
||||
(:method ((parse-tree (eql :word-boundary)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'word-boundary :negatedp nil))
|
||||
((:non-word-boundary)
|
||||
(:method ((parse-tree (eql :non-word-boundary)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'word-boundary :negatedp t))
|
||||
;; the special character classes
|
||||
((:digit-class
|
||||
:non-digit-class
|
||||
:word-char-class
|
||||
:non-word-char-class
|
||||
:whitespace-char-class
|
||||
:non-whitespace-char-class)
|
||||
;; stop accumulating into STARTS-WITH
|
||||
(:method ((parse-tree (eql :everything)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags accumulate-start-p))
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class
|
||||
;; use the constants defined in util.lisp
|
||||
:charset (case parse-tree
|
||||
((:digit-class
|
||||
:non-digit-class)
|
||||
+digit-set+)
|
||||
((:word-char-class
|
||||
:non-word-char-class)
|
||||
nil)
|
||||
((:whitespace-char-class
|
||||
:non-whitespace-char-class)
|
||||
+whitespace-char-set+))
|
||||
;; this value doesn't really matter but
|
||||
;; NIL should result in slightly faster
|
||||
;; matchers
|
||||
:case-insensitive-p nil
|
||||
:invertedp (member parse-tree
|
||||
'(:non-digit-class
|
||||
:non-word-char-class
|
||||
:non-whitespace-char-class)
|
||||
:test #'eq)
|
||||
:word-char-class-p (member parse-tree
|
||||
'(:word-char-class
|
||||
:non-word-char-class)
|
||||
:test #'eq)))
|
||||
((:start-anchor ; Perl's "^"
|
||||
:end-anchor ; Perl's "$"
|
||||
:modeless-end-anchor-no-newline
|
||||
; Perl's "\z"
|
||||
:modeless-start-anchor ; Perl's "\A"
|
||||
:modeless-end-anchor) ; Perl's "\Z"
|
||||
(make-instance 'anchor
|
||||
:startp (member parse-tree
|
||||
'(:start-anchor
|
||||
:modeless-start-anchor)
|
||||
:test #'eq)
|
||||
;; set this value according to the
|
||||
;; current settings of FLAGS (unless it's
|
||||
;; a modeless anchor)
|
||||
:multi-line-p
|
||||
(and (multi-line-mode-p flags)
|
||||
(not (member parse-tree
|
||||
'(:modeless-start-anchor
|
||||
:modeless-end-anchor
|
||||
:modeless-end-anchor-no-newline)
|
||||
:test #'eq)))
|
||||
:no-newline-p
|
||||
(eq parse-tree
|
||||
:modeless-end-anchor-no-newline)))
|
||||
((:everything)
|
||||
;; stop accumulating into STARTS-WITHS
|
||||
(make-instance 'everything :single-line-p (single-line-mode-p flags)))
|
||||
(:method ((parse-tree (eql :digit-class)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'everything
|
||||
:single-line-p (single-line-mode-p flags)))
|
||||
;; special tokens corresponding to Perl's "ism" modifiers
|
||||
((:case-insensitive-p
|
||||
:case-sensitive-p
|
||||
:multi-line-mode-p
|
||||
:not-multi-line-mode-p
|
||||
:single-line-mode-p
|
||||
:not-single-line-mode-p)
|
||||
;; we're only interested in the side effect of
|
||||
;; setting/unsetting the flags and turn these tokens
|
||||
;; into VOID objects which'll be optimized away when
|
||||
;; creating the matcher
|
||||
(make-instance 'char-class :test-function #'digit-char-p))
|
||||
(:method ((parse-tree (eql :word-char-class)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class :test-function #'word-char-p))
|
||||
(:method ((parse-tree (eql :whitespace-char-class)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class :test-function #'whitespacep))
|
||||
(:method ((parse-tree (eql :non-digit-class)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class :test-function (complement* #'digit-char-p)))
|
||||
(:method ((parse-tree (eql :non-word-char-class)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class :test-function (complement* #'word-char-p)))
|
||||
(:method ((parse-tree (eql :non-whitespace-char-class)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special accumulate-start-p))
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class :test-function (complement* #'whitespacep)))
|
||||
(:method ((parse-tree (eql :start-anchor)))
|
||||
;; Perl's "^"
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags))
|
||||
(make-instance 'anchor :startp t :multi-line-p (multi-line-mode-p flags)))
|
||||
(:method ((parse-tree (eql :end-anchor)))
|
||||
;; Perl's "$"
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags))
|
||||
(make-instance 'anchor :startp nil :multi-line-p (multi-line-mode-p flags)))
|
||||
(:method ((parse-tree (eql :modeless-start-anchor)))
|
||||
;; Perl's "\A"
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'anchor :startp t))
|
||||
(:method ((parse-tree (eql :modeless-end-anchor)))
|
||||
;; Perl's "$\Z"
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'anchor :startp nil))
|
||||
(:method ((parse-tree (eql :modeless-end-anchor-no-newline)))
|
||||
;; Perl's "$\z"
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'anchor :startp nil :no-newline-p t))
|
||||
(:method ((parse-tree (eql :case-insensitive-p)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(set-flag parse-tree)
|
||||
(make-instance 'void))
|
||||
(otherwise
|
||||
(let ((translation (and (symbolp parse-tree)
|
||||
(parse-tree-synonym parse-tree))))
|
||||
(:method ((parse-tree (eql :case-sensitive-p)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(set-flag parse-tree)
|
||||
(make-instance 'void))
|
||||
(:method ((parse-tree (eql :multi-line-mode-p)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(set-flag parse-tree)
|
||||
(make-instance 'void))
|
||||
(:method ((parse-tree (eql :not-multi-line-mode-p)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(set-flag parse-tree)
|
||||
(make-instance 'void))
|
||||
(:method ((parse-tree (eql :single-line-mode-p)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(set-flag parse-tree)
|
||||
(make-instance 'void))
|
||||
(:method ((parse-tree (eql :not-single-line-mode-p)))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(set-flag parse-tree)
|
||||
(make-instance 'void)))
|
||||
|
||||
(defmethod convert-simple-parse-tree ((parse-tree string))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special flags))
|
||||
;; turn strings into STR objects and try to accumulate into
|
||||
;; STARTS-WITH
|
||||
(let ((str (make-instance 'str
|
||||
:str parse-tree
|
||||
:case-insensitive-p (case-insensitive-mode-p flags))))
|
||||
(maybe-accumulate str)
|
||||
str))
|
||||
|
||||
(defmethod convert-simple-parse-tree ((parse-tree character))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; dispatch to the method for strings
|
||||
(convert-simple-parse-tree (string parse-tree)))
|
||||
|
||||
(defmethod convert-simple-parse-tree (parse-tree)
|
||||
"The default method - check if there's a translation."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let ((translation (and (symbolp parse-tree) (parse-tree-synonym parse-tree))))
|
||||
(if translation
|
||||
(convert-aux (copy-tree translation))
|
||||
(signal-ppcre-syntax-error "Unknown token ~A in parse-tree"
|
||||
parse-tree))))))))
|
||||
(signal-syntax-error "Unknown token ~A in parse tree." parse-tree))))
|
||||
|
||||
(defun convert (parse-tree)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Converts the parse tree PARSE-TREE into an equivalent REGEX object
|
||||
and returns three values: the REGEX object, the number of registers
|
||||
seen and an object the regex starts with which is either a STR object
|
||||
or an EVERYTHING object (if the regex starts with something like
|
||||
or an EVERYTHING object \(if the regex starts with something like
|
||||
\".*\") or NIL."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; this function basically just initializes the special variables
|
||||
;; and then calls CONVERT-AUX to do all the work
|
||||
(let* ((flags (list nil nil nil))
|
||||
@ -790,12 +862,12 @@ or an EVERYTHING object (if the regex starts with something like
|
||||
;; make sure we don't reference registers which aren't there
|
||||
(when (> (the fixnum max-back-ref)
|
||||
(the fixnum reg-num))
|
||||
(signal-ppcre-syntax-error
|
||||
"Backreference to register ~A which has not been defined"
|
||||
max-back-ref))
|
||||
(signal-syntax-error "Backreference to register ~A which has not been defined." max-back-ref))
|
||||
(when (typep starts-with 'str)
|
||||
(setf (slot-value starts-with 'str)
|
||||
(coerce (slot-value starts-with 'str) 'simple-string)))
|
||||
(coerce (slot-value starts-with 'str)
|
||||
#+:lispworks 'lw:simple-text-string
|
||||
#-:lispworks 'simple-string)))
|
||||
(values converted-parse-tree reg-num starts-with
|
||||
;; we can't simply use *ALLOW-NAMED-REGISTERS*
|
||||
;; since parse-tree syntax ignores it
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
1468
doc/index.html
1468
doc/index.html
File diff suppressed because it is too large
Load Diff
12
errors.lisp
12
errors.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.18 2008/06/25 14:04:27 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.21 2008/07/06 18:12:04 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
@ -27,7 +27,7 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defvar *syntax-error-string* nil
|
||||
"The string which caused the syntax error.")
|
||||
@ -69,16 +69,16 @@ parse tree).")
|
||||
(:documentation "Signaled when CL-PPCRE functions are
|
||||
invoked with wrong arguments."))
|
||||
|
||||
(defmacro signal-ppcre-syntax-error* (pos format-control &rest format-arguments)
|
||||
(defmacro signal-syntax-error* (pos format-control &rest format-arguments)
|
||||
`(error 'ppcre-syntax-error
|
||||
:pos ,pos
|
||||
:format-control ,format-control
|
||||
:format-arguments (list ,@format-arguments)))
|
||||
|
||||
(defmacro signal-ppcre-syntax-error (format-control &rest format-arguments)
|
||||
`(signal-ppcre-syntax-error* nil ,format-control ,@format-arguments))
|
||||
(defmacro signal-syntax-error (format-control &rest format-arguments)
|
||||
`(signal-syntax-error* nil ,format-control ,@format-arguments))
|
||||
|
||||
(defmacro signal-ppcre-invocation-error (format-control &rest format-arguments)
|
||||
(defmacro signal-invocation-error (format-control &rest format-arguments)
|
||||
`(error 'ppcre-invocation-error
|
||||
:format-control ,format-control
|
||||
:format-arguments (list ,@format-arguments)))
|
||||
|
||||
139
lexer.lisp
139
lexer.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.28 2008/06/25 14:04:27 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.34 2008/07/06 22:36:30 edi Exp $
|
||||
|
||||
;;; The lexer's responsibility is to convert the regex string into a
|
||||
;;; sequence of tokens which are in turn consumed by the parser.
|
||||
@ -35,7 +35,7 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(declaim (inline map-char-to-special-class))
|
||||
(defun map-char-to-special-char-class (chr)
|
||||
@ -56,27 +56,18 @@ their associated character classes."
|
||||
((#\S)
|
||||
:non-whitespace-char-class)))
|
||||
|
||||
(locally
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(defstruct (lexer (:constructor make-lexer-internal))
|
||||
"LEXER structures are used to hold the regex string which is
|
||||
currently lexed and to keep track of the lexer's state."
|
||||
(str ""
|
||||
:type string
|
||||
:read-only t)
|
||||
(len 0
|
||||
:type fixnum
|
||||
:read-only t)
|
||||
(reg 0
|
||||
:type fixnum)
|
||||
(pos 0
|
||||
:type fixnum)
|
||||
(last-pos nil
|
||||
:type list)))
|
||||
(str "" :type string :read-only t)
|
||||
(len 0 :type fixnum :read-only t)
|
||||
(reg 0 :type fixnum)
|
||||
(pos 0 :type fixnum)
|
||||
(last-pos nil :type list))
|
||||
|
||||
(defun make-lexer (string)
|
||||
(declare (inline make-lexer-internal)
|
||||
#-genera (type string string))
|
||||
#-:genera (string string))
|
||||
(make-lexer-internal :str (maybe-coerce-to-simple-string string)
|
||||
:len (length string)))
|
||||
|
||||
@ -101,10 +92,8 @@ Does not respect extended mode."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Returns the next character which is to be examined and updates the
|
||||
POS slot. Does not respect extended mode."
|
||||
(cond ((end-of-string-p lexer)
|
||||
nil)
|
||||
(t
|
||||
(prog1
|
||||
(cond ((end-of-string-p lexer) nil)
|
||||
(t (prog1
|
||||
(schar (lexer-str lexer) (lexer-pos lexer))
|
||||
(incf (lexer-pos lexer))))))
|
||||
|
||||
@ -135,9 +124,7 @@ nested comments are skipped if applicable."
|
||||
while (and skip-char
|
||||
(char/= skip-char #\)))
|
||||
finally (return skip-char))
|
||||
(signal-ppcre-syntax-error*
|
||||
error-pos
|
||||
"Comment group not closed")))
|
||||
(signal-syntax-error* error-pos "Comment group not closed.")))
|
||||
(setq next-char (next-char-non-extended lexer)))
|
||||
(t
|
||||
;; undo effect of previous INCF if we didn't see a #
|
||||
@ -177,7 +164,7 @@ nested comments are skipped if applicable."
|
||||
"Moves (LEXER-POS LEXER) back to the last position stored in
|
||||
\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
|
||||
(unless (lexer-last-pos lexer)
|
||||
(signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer))
|
||||
(signal-syntax-error "LAST-POS stack of LEXER ~A is empty." lexer))
|
||||
(setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
|
||||
nil)
|
||||
|
||||
@ -232,19 +219,16 @@ the corresponding number started within the regex string."
|
||||
(let ((code (logand #o377 (the fixnum (or number 0)))))
|
||||
(or (and (< code char-code-limit)
|
||||
(code-char code))
|
||||
(signal-ppcre-syntax-error*
|
||||
error-pos
|
||||
"No character for hex-code ~X"
|
||||
number))))
|
||||
(signal-syntax-error* error-pos "No character for hex-code ~X." number))))
|
||||
|
||||
(defun unescape-char (lexer)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Convert the characters(s) following a backslash into a token
|
||||
"Convert the characters\(s) following a backslash into a token
|
||||
which is returned. This function is to be called when the backslash
|
||||
has already been consumed. Special character classes like \\W are
|
||||
handled elsewhere."
|
||||
(when (end-of-string-p lexer)
|
||||
(signal-ppcre-syntax-error "String ends with backslash"))
|
||||
(signal-syntax-error "String ends with backslash."))
|
||||
(let ((chr (next-char-non-extended lexer)))
|
||||
(case chr
|
||||
((#\E)
|
||||
@ -257,9 +241,7 @@ handled elsewhere."
|
||||
;; \cx means control-x in Perl
|
||||
(let ((next-char (next-char-non-extended lexer)))
|
||||
(unless next-char
|
||||
(signal-ppcre-syntax-error*
|
||||
(lexer-pos lexer)
|
||||
"Character missing after '\\c' at position ~A"))
|
||||
(signal-syntax-error* (lexer-pos lexer) "Character missing after '\\c' at position ~A."))
|
||||
(code-char (logxor #x40 (char-code (char-upcase next-char))))))
|
||||
((#\x)
|
||||
;; \x should be followed by a hexadecimal char code,
|
||||
@ -295,12 +277,28 @@ handled elsewhere."
|
||||
;; all other characters aren't affected by a backslash
|
||||
chr))))
|
||||
|
||||
(defun collect-char-class (lexer)
|
||||
(defun read-char-property (lexer first-char)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(unless (eql (next-char-non-extended lexer) #\{)
|
||||
(signal-syntax-error* (lexer-pos lexer) "Expected left brace after \\~A." first-char))
|
||||
(let ((name (with-output-to-string (out nil :element-type
|
||||
#+:lispworks 'lw:simple-char #-:lispworks 'character)
|
||||
(loop
|
||||
(let ((char (or (next-char-non-extended lexer)
|
||||
(signal-syntax-error "Unexpected EOF after \\~A{." first-char))))
|
||||
(when (char= char #\})
|
||||
(return))
|
||||
(write-char char out))))))
|
||||
(list (if (char= first-char #\p) :property :inverted-property)
|
||||
;; we must reverse here because of what PARSE-STRING does
|
||||
(nreverse name))))
|
||||
|
||||
(defun collect-char-class (lexer)
|
||||
"Reads and consumes characters from regex string until a right
|
||||
bracket is seen. Assembles them into a list \(which is returned) of
|
||||
characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
|
||||
tokens representing special character classes."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let ((start-pos (lexer-pos lexer)) ; remember start for error message
|
||||
hyphen-seen
|
||||
last-char
|
||||
@ -336,6 +334,21 @@ we're inside a range or not."
|
||||
(push #\- list)
|
||||
(incf (lexer-pos lexer)))
|
||||
(setq hyphen-seen nil))
|
||||
((#\P #\p)
|
||||
;; maybe a character property
|
||||
(cond ((null *property-resolver*)
|
||||
(handle-char next-char))
|
||||
(t
|
||||
(push (read-char-property lexer next-char) list)
|
||||
;; if the last character was a hyphen
|
||||
;; just collect it literally
|
||||
(when hyphen-seen
|
||||
(push #\- list))
|
||||
;; if the next character is a hyphen do the same
|
||||
(when (looking-at-p lexer #\-)
|
||||
(push #\- list)
|
||||
(incf (lexer-pos lexer)))
|
||||
(setq hyphen-seen nil))))
|
||||
((#\E)
|
||||
;; if \Q quoting is on we ignore \E,
|
||||
;; otherwise it's just a plain #\E
|
||||
@ -372,9 +385,7 @@ we're inside a range or not."
|
||||
(handle-char c))))
|
||||
;; we can only exit the loop normally if we've reached the end
|
||||
;; of the regex string without seeing a right bracket
|
||||
(signal-ppcre-syntax-error*
|
||||
start-pos
|
||||
"Missing right bracket to close character class"))))
|
||||
(signal-syntax-error* start-pos "Missing right bracket to close character class."))))
|
||||
|
||||
(defun maybe-parse-flags (lexer)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -387,7 +398,7 @@ the behaviour of the lexer itself via the special variable
|
||||
(loop with set = t
|
||||
for chr = (next-char-non-extended lexer)
|
||||
unless chr
|
||||
do (signal-ppcre-syntax-error "Unexpected end of string")
|
||||
do (signal-syntax-error "Unexpected end of string.")
|
||||
while (find chr "-imsx" :test #'char=)
|
||||
;; the first #\- will invert the meaning of all modifiers
|
||||
;; following it
|
||||
@ -473,9 +484,7 @@ closing #\> will also be consumed."
|
||||
:test #'char=)))
|
||||
(unless end-name
|
||||
;; there has to be > somewhere, syntax error otherwise
|
||||
(signal-ppcre-syntax-error*
|
||||
(1- (lexer-pos lexer))
|
||||
"Opening #\< in named group has no closing #\>"))
|
||||
(signal-syntax-error* (1- (lexer-pos lexer)) "Opening #\< in named group has no closing #\>."))
|
||||
(let ((name (subseq (lexer-str lexer)
|
||||
(lexer-pos lexer)
|
||||
end-name)))
|
||||
@ -484,9 +493,7 @@ closing #\> will also be consumed."
|
||||
(char= #\- char)))
|
||||
name)
|
||||
;; register name can contain only alphanumeric characters or #\-
|
||||
(signal-ppcre-syntax-error*
|
||||
(lexer-pos lexer)
|
||||
"Invalid character in named register group"))
|
||||
(signal-syntax-error* (lexer-pos lexer) "Invalid character in named register group."))
|
||||
;; advance lexer beyond "<name>" part
|
||||
(setf (lexer-pos lexer) (1+ end-name))
|
||||
name)))
|
||||
@ -518,10 +525,7 @@ closing #\> will also be consumed."
|
||||
((#\+ #\*)
|
||||
;; quantifiers will always be consumend by
|
||||
;; GET-QUANTIFIER, they must not appear here
|
||||
(signal-ppcre-syntax-error*
|
||||
(1- (lexer-pos lexer))
|
||||
"Quantifier '~A' not allowed"
|
||||
next-char))
|
||||
(signal-syntax-error* (1- (lexer-pos lexer)) "Quantifier '~A' not allowed." next-char))
|
||||
((#\{)
|
||||
;; left brace isn't a special character in it's own
|
||||
;; right but we must check if what follows might
|
||||
@ -530,9 +534,8 @@ closing #\> will also be consumed."
|
||||
(this-last-pos (lexer-last-pos lexer)))
|
||||
(unget-token lexer)
|
||||
(when (get-quantifier lexer)
|
||||
(signal-ppcre-syntax-error*
|
||||
(car this-last-pos)
|
||||
"Quantifier '~A' not allowed"
|
||||
(signal-syntax-error* (car this-last-pos)
|
||||
"Quantifier '~A' not allowed."
|
||||
(subseq (lexer-str lexer)
|
||||
(car this-last-pos)
|
||||
(lexer-pos lexer))))
|
||||
@ -580,7 +583,7 @@ closing #\> will also be consumed."
|
||||
(let* ((old-pos (decf (lexer-pos lexer)))
|
||||
;; ...so let's get the whole number first
|
||||
(backref-number (get-number lexer)))
|
||||
(declare (type fixnum backref-number))
|
||||
(declare (fixnum backref-number))
|
||||
(cond ((and (> backref-number (lexer-reg lexer))
|
||||
(<= 10 backref-number))
|
||||
;; \10 and higher are treated as octal
|
||||
@ -603,6 +606,10 @@ closing #\> will also be consumed."
|
||||
(let ((old-pos (decf (lexer-pos lexer))))
|
||||
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
|
||||
old-pos)))
|
||||
((#\P #\p)
|
||||
;; might be a named property
|
||||
(cond (*property-resolver* (read-char-property lexer next-char))
|
||||
(t next-char)))
|
||||
(otherwise
|
||||
;; in all other cases just unescape the
|
||||
;; character
|
||||
@ -622,17 +629,15 @@ closing #\> will also be consumed."
|
||||
;; or a closing parenthesis are following
|
||||
(when (and flags
|
||||
(not (find next-char ":)" :test #'char=)))
|
||||
(signal-ppcre-syntax-error*
|
||||
(car (lexer-last-pos lexer))
|
||||
"Sequence '~A' not recognized"
|
||||
(signal-syntax-error* (car (lexer-last-pos lexer))
|
||||
"Sequence '~A' not recognized."
|
||||
(subseq (lexer-str lexer)
|
||||
(car (lexer-last-pos lexer))
|
||||
(lexer-pos lexer))))
|
||||
(case next-char
|
||||
((nil)
|
||||
;; syntax error
|
||||
(signal-ppcre-syntax-error
|
||||
"End of string following '(?'"))
|
||||
(signal-syntax-error "End of string following '(?'."))
|
||||
((#\))
|
||||
;; an empty group except for the flags
|
||||
;; (if there are any)
|
||||
@ -664,9 +669,8 @@ closing #\> will also be consumed."
|
||||
;; we have encountered a named group
|
||||
;; are we supporting register naming?
|
||||
(unless *allow-named-registers*
|
||||
(signal-ppcre-syntax-error*
|
||||
(1- (lexer-pos lexer))
|
||||
"Character '~A' may not follow '(?<'"
|
||||
(signal-syntax-error* (1- (lexer-pos lexer))
|
||||
"Character '~A' may not follow '(?<'."
|
||||
next-char))
|
||||
;; put the letter back
|
||||
(decf (lexer-pos lexer))
|
||||
@ -685,18 +689,15 @@ closing #\> will also be consumed."
|
||||
:void)
|
||||
((nil)
|
||||
;; syntax error
|
||||
(signal-ppcre-syntax-error
|
||||
"End of string following '(?<'"))
|
||||
(signal-syntax-error "End of string following '(?<'."))
|
||||
(t
|
||||
;; also syntax error
|
||||
(signal-ppcre-syntax-error*
|
||||
(1- (lexer-pos lexer))
|
||||
"Character '~A' may not follow '(?<'"
|
||||
(signal-syntax-error* (1- (lexer-pos lexer))
|
||||
"Character '~A' may not follow '(?<'."
|
||||
next-char ))))))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error*
|
||||
(1- (lexer-pos lexer))
|
||||
"Character '~A' may not follow '(?'"
|
||||
(signal-syntax-error* (1- (lexer-pos lexer))
|
||||
"Character '~A' may not follow '(?'."
|
||||
next-char)))))
|
||||
(t
|
||||
;; if next-char was not #\? (this is within
|
||||
|
||||
67
load.lisp
67
load.lisp
@ -1,67 +0,0 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/load.lisp,v 1.16 2008/06/25 14:04:27 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-user)
|
||||
|
||||
(let ((cl-ppcre-base-directory
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
must-compile)
|
||||
(with-compilation-unit ()
|
||||
(dolist (file '("packages"
|
||||
"specials"
|
||||
"util"
|
||||
"errors"
|
||||
#-:use-acl-regexp2-engine "lexer"
|
||||
#-:use-acl-regexp2-engine "parser"
|
||||
#-:use-acl-regexp2-engine "regex-class"
|
||||
#-:use-acl-regexp2-engine "convert"
|
||||
#-:use-acl-regexp2-engine "optimize"
|
||||
#-:use-acl-regexp2-engine "closures"
|
||||
#-:use-acl-regexp2-engine "repetition-closures"
|
||||
#-:use-acl-regexp2-engine "scanner"
|
||||
"api"
|
||||
"ppcre-tests"))
|
||||
(let ((pathname (make-pathname :name file :type "lisp" :version nil
|
||||
:defaults cl-ppcre-base-directory)))
|
||||
;; don't use COMPILE-FILE in Corman Lisp, it's broken - LOAD
|
||||
;; will yield compiled functions anyway
|
||||
#-:cormanlisp
|
||||
(let ((compiled-pathname (compile-file-pathname pathname)))
|
||||
(unless (and (not must-compile)
|
||||
(probe-file compiled-pathname)
|
||||
(< (file-write-date pathname)
|
||||
(file-write-date compiled-pathname)))
|
||||
(setq must-compile t)
|
||||
(compile-file pathname))
|
||||
(setq pathname compiled-pathname))
|
||||
(load pathname)))))
|
||||
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.31 2008/06/25 14:04:27 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.35 2008/07/06 18:12:04 edi Exp $
|
||||
|
||||
;;; This file contains optimizations which can be applied to converted
|
||||
;;; parse trees.
|
||||
@ -30,7 +30,7 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defgeneric flatten (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -96,8 +96,7 @@ operation on REGEX."))
|
||||
alternation)
|
||||
((cdr choices)
|
||||
(first choices))
|
||||
(t (signal-ppcre-syntax-error
|
||||
"Encountered alternation without choices.")))))
|
||||
(t (signal-syntax-error "Encountered alternation without choices.")))))
|
||||
|
||||
(defmethod flatten ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -143,7 +142,7 @@ operation on REGEX."))
|
||||
collector-start
|
||||
(collector-length 0)
|
||||
skip)
|
||||
(declare (type fixnum collector-length))
|
||||
(declare (fixnum collector-length))
|
||||
(loop
|
||||
(let ((elements-rest (cdr curr-point)))
|
||||
(unless elements-rest
|
||||
@ -394,7 +393,7 @@ function called by END-STRIN.)"))
|
||||
concatenated-string
|
||||
concatenated-start
|
||||
(concatenated-length 0))
|
||||
(declare (type fixnum concatenated-length))
|
||||
(declare (fixnum concatenated-length))
|
||||
(loop for element in (reverse (elements seq))
|
||||
;; remember the case-(in)sensitivity of the last relevant
|
||||
;; STR object
|
||||
@ -429,7 +428,7 @@ function called by END-STRIN.)"))
|
||||
concatenated-start nil))
|
||||
(let ((len (len element-end))
|
||||
(str (str element-end)))
|
||||
(declare (type fixnum len))
|
||||
(declare (fixnum len))
|
||||
(incf concatenated-length len)
|
||||
(loop for i of-type fixnum downfrom (1- len) to 0
|
||||
do (vector-push-extend (char str i)
|
||||
|
||||
114
packages.lisp
114
packages.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.24 2008/06/25 14:04:27 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.38 2008/07/22 23:54:59 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
@ -29,78 +29,40 @@
|
||||
|
||||
(in-package :cl-user)
|
||||
|
||||
#-:cormanlisp
|
||||
(defpackage #:cl-ppcre
|
||||
(:nicknames #:ppcre)
|
||||
#+genera (:shadowing-import-from #:common-lisp #:lambda #:simple-string #:string)
|
||||
(:use #-genera #:cl #+genera #:future-common-lisp)
|
||||
(:export #:create-scanner
|
||||
#:parse-tree-synonym
|
||||
#:define-parse-tree-synonym
|
||||
#:scan
|
||||
#:scan-to-strings
|
||||
#:do-scans
|
||||
#:do-matches
|
||||
#:do-matches-as-strings
|
||||
#:all-matches
|
||||
#:all-matches-as-strings
|
||||
#:split
|
||||
#:regex-replace
|
||||
#:regex-replace-all
|
||||
#:regex-apropos
|
||||
#:regex-apropos-list
|
||||
#:quote-meta-chars
|
||||
#:*regex-char-code-limit*
|
||||
#:*use-bmh-matchers*
|
||||
#:*allow-quoting*
|
||||
#:*allow-named-registers*
|
||||
#:ppcre-error
|
||||
#:ppcre-invocation-error
|
||||
#:ppcre-syntax-error
|
||||
#:ppcre-syntax-error-string
|
||||
#:ppcre-syntax-error-pos
|
||||
#:register-groups-bind
|
||||
#:do-register-groups))
|
||||
|
||||
#+:cormanlisp
|
||||
(defpackage "CL-PPCRE"
|
||||
(:nicknames "PPCRE")
|
||||
(:use "CL")
|
||||
(:export "CREATE-SCANNER"
|
||||
"PARSE-TREE-SYNONYM"
|
||||
"DEFINE-PARSE-TREE-SYNONYM"
|
||||
"SCAN"
|
||||
"SCAN-TO-STRINGS"
|
||||
"DO-SCANS"
|
||||
"DO-MATCHES"
|
||||
"DO-MATCHES-AS-STRINGS"
|
||||
"ALL-MATCHES"
|
||||
"ALL-MATCHES-AS-STRINGS"
|
||||
"SPLIT"
|
||||
"REGEX-REPLACE"
|
||||
"REGEX-REPLACE-ALL"
|
||||
"REGEX-APROPOS"
|
||||
"REGEX-APROPOS-LIST"
|
||||
"QUOTE-META-CHARS"
|
||||
"*REGEX-CHAR-CODE-LIMIT*"
|
||||
"*USE-BMH-MATCHERS*"
|
||||
"*ALLOW-QUOTING*"
|
||||
"*ALLOW-NAMED-REGISTERS*"
|
||||
"PPCRE-ERROR"
|
||||
"PPCRE-INVOCATION-ERROR"
|
||||
"PPCRE-SYNTAX-ERROR"
|
||||
"PPCRE-SYNTAX-ERROR-STRING"
|
||||
"PPCRE-SYNTAX-ERROR-POS"
|
||||
"REGISTER-GROUPS-BIND"
|
||||
"DO-REGISTER-GROUPS"))
|
||||
|
||||
#-:cormanlisp
|
||||
(defpackage #:cl-ppcre-test
|
||||
#+genera (:shadowing-import-from #:common-lisp #:lambda)
|
||||
(:use #-genera #:cl #+genera #:future-common-lisp #:cl-ppcre)
|
||||
(:export #:test))
|
||||
|
||||
#+:cormanlisp
|
||||
(defpackage "CL-PPCRE-TEST"
|
||||
(:use "CL" "CL-PPCRE")
|
||||
(:export "TEST"))
|
||||
(defpackage :cl-ppcre
|
||||
(:nicknames :ppcre)
|
||||
#+:genera
|
||||
(:shadowing-import-from :common-lisp :lambda :simple-string :string)
|
||||
(:use #-:genera :cl #+:genera :future-common-lisp)
|
||||
(:shadow :digit-char-p :defconstant)
|
||||
(:export :parse-string
|
||||
:create-scanner
|
||||
:create-optimized-test-function
|
||||
:parse-tree-synonym
|
||||
:define-parse-tree-synonym
|
||||
:scan
|
||||
:scan-to-strings
|
||||
:do-scans
|
||||
:do-matches
|
||||
:do-matches-as-strings
|
||||
:all-matches
|
||||
:all-matches-as-strings
|
||||
:split
|
||||
:regex-replace
|
||||
:regex-replace-all
|
||||
:regex-apropos
|
||||
:regex-apropos-list
|
||||
:quote-meta-chars
|
||||
:*regex-char-code-limit*
|
||||
:*use-bmh-matchers*
|
||||
:*allow-quoting*
|
||||
:*allow-named-registers*
|
||||
:*optimize-char-classes*
|
||||
:*property-resolver*
|
||||
:ppcre-error
|
||||
:ppcre-invocation-error
|
||||
:ppcre-syntax-error
|
||||
:ppcre-syntax-error-string
|
||||
:ppcre-syntax-error-pos
|
||||
:register-groups-bind
|
||||
:do-register-groups))
|
||||
|
||||
50
parser.lisp
50
parser.lisp
@ -1,11 +1,11 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.25 2008/06/25 14:04:28 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.30 2008/07/06 18:12:05 edi Exp $
|
||||
|
||||
;;; The parser will - with the help of the lexer - parse a regex
|
||||
;;; string and convert it into a "parse tree" (see docs for details
|
||||
;;; about the syntax of these trees). Note that the lexer might return
|
||||
;;; illegal parse trees. It is assumed that the conversion process
|
||||
;;; later on will track them down.
|
||||
;;; about the syntax of these trees). Note that the lexer might
|
||||
;;; return illegal parse trees. It is assumed that the conversion
|
||||
;;; process later on will track them down.
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
@ -33,10 +33,9 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defun group (lexer)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Parses and consumes a <group>.
|
||||
The productions are: <group> -> \"\(\"<regex>\")\"
|
||||
\"\(?:\"<regex>\")\"
|
||||
@ -53,6 +52,7 @@ The productions are: <group> -> \"\(\"<regex>\")\"
|
||||
where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
|
||||
Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
||||
<grouping-type> is one of six keywords - see source for details."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(multiple-value-bind (open-token flags)
|
||||
(get-token lexer)
|
||||
(cond ((eq open-token :open-paren-paren)
|
||||
@ -65,7 +65,7 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
||||
(number (try-number lexer :no-whitespace-p t))
|
||||
;; make changes to extended-mode-p local
|
||||
(*extended-mode-p* *extended-mode-p*))
|
||||
(declare (type fixnum open-paren-pos))
|
||||
(declare (fixnum open-paren-pos))
|
||||
(cond (number
|
||||
;; condition is a number (i.e. refers to a
|
||||
;; back-reference)
|
||||
@ -73,13 +73,11 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
||||
(reg-expr (reg-expr lexer))
|
||||
(close-token (get-token lexer)))
|
||||
(unless (eq inner-close-token :close-paren)
|
||||
(signal-ppcre-syntax-error*
|
||||
(+ open-paren-pos 2)
|
||||
"Opening paren has no matching closing paren"))
|
||||
(signal-syntax-error* (+ open-paren-pos 2)
|
||||
"Opening paren has no matching closing paren."))
|
||||
(unless (eq close-token :close-paren)
|
||||
(signal-ppcre-syntax-error*
|
||||
open-paren-pos
|
||||
"Opening paren has no matching closing paren"))
|
||||
(signal-syntax-error* open-paren-pos
|
||||
"Opening paren has no matching closing paren."))
|
||||
(list :branch number reg-expr)))
|
||||
(t
|
||||
;; condition must be a full regex (actually a
|
||||
@ -94,9 +92,8 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
||||
(reg-expr (reg-expr lexer))
|
||||
(close-token (get-token lexer)))
|
||||
(unless (eq close-token :close-paren)
|
||||
(signal-ppcre-syntax-error*
|
||||
open-paren-pos
|
||||
"Opening paren has no matching closing paren"))
|
||||
(signal-syntax-error* open-paren-pos
|
||||
"Opening paren has no matching closing paren."))
|
||||
(list :branch inner-reg-expr reg-expr))))))
|
||||
((member open-token '(:open-paren
|
||||
:open-paren-colon
|
||||
@ -124,9 +121,8 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
||||
(unless (eq close-token :close-paren)
|
||||
;; the token following <regex> must be the closing
|
||||
;; parenthesis or this is a syntax error
|
||||
(signal-ppcre-syntax-error*
|
||||
open-paren-pos
|
||||
"Opening paren has no matching closing paren"))
|
||||
(signal-syntax-error* open-paren-pos
|
||||
"Opening paren has no matching closing paren."))
|
||||
(if flags
|
||||
;; if the lexer has returned a list of flags this must
|
||||
;; have been the "(?:"<regex>")" production
|
||||
@ -160,11 +156,11 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
|
||||
open-token))))
|
||||
|
||||
(defun greedy-quant (lexer)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Parses and consumes a <greedy-quant>.
|
||||
The productions are: <greedy-quant> -> <group> | <group><quantifier>
|
||||
where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
|
||||
Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let* ((group (group lexer))
|
||||
(token (get-quantifier lexer)))
|
||||
(if token
|
||||
@ -174,11 +170,11 @@ Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
|
||||
group)))
|
||||
|
||||
(defun quant (lexer)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Parses and consumes a <quant>.
|
||||
The productions are: <quant> -> <greedy-quant> | <greedy-quant>\"?\".
|
||||
Will return the <parse-tree> returned by GREEDY-QUANT and optionally
|
||||
change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let* ((greedy-quant (greedy-quant lexer))
|
||||
(pos (lexer-pos lexer))
|
||||
(next-char (next-char lexer)))
|
||||
@ -189,10 +185,10 @@ change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
|
||||
greedy-quant))
|
||||
|
||||
(defun seq (lexer)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Parses and consumes a <seq>.
|
||||
The productions are: <seq> -> <quant> | <quant><seq>.
|
||||
Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(flet ((make-array-from-two-chars (char1 char2)
|
||||
(let ((string (make-array 2
|
||||
:element-type 'character
|
||||
@ -254,10 +250,10 @@ Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
|
||||
:void)))
|
||||
|
||||
(defun reg-expr (lexer)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Parses and consumes a <regex>, a complete regular expression.
|
||||
The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
|
||||
Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let ((pos (lexer-pos lexer)))
|
||||
(case (next-char lexer)
|
||||
((nil)
|
||||
@ -299,6 +295,8 @@ Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
|
||||
seq)))))))
|
||||
|
||||
(defun reverse-strings (parse-tree)
|
||||
"Recursively walks through PARSE-TREE and destructively reverses all
|
||||
strings in it."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(cond ((stringp parse-tree)
|
||||
(nreverse parse-tree))
|
||||
@ -311,13 +309,11 @@ Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
|
||||
(t parse-tree)))
|
||||
|
||||
(defun parse-string (string)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Translate the regex string STRING into a parse tree."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let* ((lexer (make-lexer string))
|
||||
(parse-tree (reverse-strings (reg-expr lexer))))
|
||||
;; check whether we've consumed the whole regex string
|
||||
(if (end-of-string-p lexer)
|
||||
parse-tree
|
||||
(signal-ppcre-syntax-error*
|
||||
(lexer-pos lexer)
|
||||
"Expected end of string"))))
|
||||
(signal-syntax-error* (lexer-pos lexer) "Expected end of string."))))
|
||||
|
||||
269
ppcre-tests.lisp
269
ppcre-tests.lisp
@ -1,269 +0,0 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/ppcre-tests.lisp,v 1.36 2008/06/25 14:04:28 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre-test)
|
||||
|
||||
(defparameter *cl-ppcre-test-base-directory*
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
|
||||
(defun full-gc ()
|
||||
"Start a full garbage collection."
|
||||
;; what are the corresponding values for MCL and OpenMCL?
|
||||
#+:allegro (excl:gc t)
|
||||
#+(or :cmu :scl) (ext:gc :full t)
|
||||
#+:ecl (si:gc t)
|
||||
#+:clisp (ext:gc)
|
||||
#+:cormanlisp (loop for i from 0 to 3 do (cormanlisp:gc i))
|
||||
#+:lispworks4 (hcl:mark-and-sweep 3)
|
||||
#+:lispworks5 (hcl:gc-generation #+:lispworks-32bit 3 #+:lispworks-64bit :blocking-gen-num)
|
||||
#+:sbcl (sb-ext:gc :full t))
|
||||
|
||||
;; warning: ugly code ahead!!
|
||||
;; this is just a quick hack for testing purposes
|
||||
|
||||
(defun time-regex (factor regex string
|
||||
&key case-insensitive-mode
|
||||
multi-line-mode
|
||||
single-line-mode
|
||||
extended-mode)
|
||||
(declare #.ppcre::*standard-optimize-settings*)
|
||||
"Auxiliary function used by TEST to benchmark a regex scanner
|
||||
against Perl timings."
|
||||
(declare (type string string))
|
||||
(let* ((scanner (create-scanner regex
|
||||
:case-insensitive-mode case-insensitive-mode
|
||||
:multi-line-mode multi-line-mode
|
||||
:single-line-mode single-line-mode
|
||||
:extended-mode extended-mode))
|
||||
;; make sure GC doesn't invalidate our benchmarking
|
||||
(dummy (full-gc))
|
||||
(start (get-internal-real-time)))
|
||||
(declare (ignore dummy))
|
||||
(dotimes (i factor)
|
||||
(funcall scanner string 0 (length string)))
|
||||
(float (/ (- (get-internal-real-time) start) internal-time-units-per-second))))
|
||||
|
||||
#+(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
(defun threaded-scan (scanner target-string &key (threads 10) (repetitions 5000))
|
||||
(declare #.ppcre::*standard-optimize-settings*)
|
||||
"Auxiliary function used by TEST to check whether SCANNER is thread-safe."
|
||||
(full-gc)
|
||||
(let ((collector (make-array threads))
|
||||
(counter 0))
|
||||
(loop for i below threads
|
||||
do (let* ((j i)
|
||||
(fn
|
||||
(lambda ()
|
||||
(let ((r (random repetitions)))
|
||||
(loop for k below repetitions
|
||||
if (= k r)
|
||||
do (setf (aref collector j)
|
||||
(let ((result
|
||||
(multiple-value-list
|
||||
(cl-ppcre:scan scanner target-string))))
|
||||
(unless (cdr result)
|
||||
(setq result '(nil nil #() #())))
|
||||
result))
|
||||
else
|
||||
do (cl-ppcre:scan scanner target-string))
|
||||
(incf counter)))))
|
||||
#+scl (thread:thread-create fn)
|
||||
#+lispworks (mp:process-run-function "" nil fn)
|
||||
#+(and sbcl sb-thread) (sb-thread:make-thread fn)))
|
||||
(loop while (< counter threads)
|
||||
do (sleep .1))
|
||||
(destructuring-bind (first-start first-end first-reg-starts first-reg-ends)
|
||||
(aref collector 0)
|
||||
(loop for (start end reg-starts reg-ends) across collector
|
||||
if (or (not (eql first-start start))
|
||||
(not (eql first-end end))
|
||||
(/= (length first-reg-starts) (length reg-starts))
|
||||
(/= (length first-reg-ends) (length reg-ends))
|
||||
(loop for first-reg-start across first-reg-starts
|
||||
for reg-start across reg-starts
|
||||
thereis (not (eql first-reg-start reg-start)))
|
||||
(loop for first-reg-end across first-reg-ends
|
||||
for reg-end across reg-ends
|
||||
thereis (not (eql first-reg-end reg-end))))
|
||||
do (return (format nil "~&Inconsistent results during multi-threading"))))))
|
||||
|
||||
(defun create-string-from-input (input)
|
||||
(cond ((or (null input)
|
||||
(stringp input))
|
||||
input)
|
||||
(t
|
||||
(cl-ppcre::string-list-to-simple-string
|
||||
(loop for element in input
|
||||
if (stringp element)
|
||||
collect element
|
||||
else
|
||||
collect (string (code-char element)))))))
|
||||
|
||||
(defun test (&key (file-name
|
||||
(make-pathname :name "testdata"
|
||||
:type nil :version nil
|
||||
:defaults *cl-ppcre-test-base-directory*)
|
||||
file-name-provided-p)
|
||||
threaded)
|
||||
(declare #.ppcre::*standard-optimize-settings*)
|
||||
(declare (ignorable threaded))
|
||||
"Loop through all test cases in FILE-NAME and print report. Only in
|
||||
LispWorks and SCL: If THREADED is true, also test whether the scanners
|
||||
work multi-threaded."
|
||||
(with-open-file (stream file-name
|
||||
#+(or :allegro :clisp :scl :sbcl)
|
||||
:external-format
|
||||
#+(or :allegro :clisp :scl :sbcl)
|
||||
(if file-name-provided-p
|
||||
:default
|
||||
#+(or :allegro :scl :sbcl) :iso-8859-1
|
||||
#+:clisp charset:iso-8859-1))
|
||||
(loop with testcount of-type fixnum = 0
|
||||
with *regex-char-code-limit* = (if file-name-provided-p
|
||||
*regex-char-code-limit*
|
||||
;; the standard test suite
|
||||
;; doesn't need Unicode
|
||||
;; support
|
||||
256)
|
||||
with *allow-quoting* = (if file-name-provided-p
|
||||
*allow-quoting*
|
||||
t)
|
||||
for input-line = (read stream nil nil)
|
||||
for (counter info-string regex
|
||||
case-insensitive-mode multi-line-mode
|
||||
single-line-mode extended-mode
|
||||
string perl-error factor
|
||||
perl-time ex-result ex-subs) = input-line
|
||||
while input-line
|
||||
do (let ((info-string (create-string-from-input info-string))
|
||||
(regex (create-string-from-input regex))
|
||||
(string (create-string-from-input string))
|
||||
(ex-result (create-string-from-input ex-result))
|
||||
(ex-subs (mapcar #'create-string-from-input ex-subs))
|
||||
(errors '()))
|
||||
;; provide some visual feedback for slow CL
|
||||
;; implementations; suggested by JP Massar
|
||||
(incf testcount)
|
||||
#+(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
(when threaded
|
||||
(format t "Test #~A (ID ~A)~%" testcount counter)
|
||||
(force-output))
|
||||
(unless #-(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
nil
|
||||
#+(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
threaded
|
||||
(when (zerop (mod testcount 10))
|
||||
(format t ".")
|
||||
(force-output))
|
||||
(when (zerop (mod testcount 100))
|
||||
(terpri)))
|
||||
(handler-case
|
||||
(let* ((*use-bmh-matchers* (if (and (> factor 1) (plusp perl-time))
|
||||
*use-bmh-matchers*
|
||||
;; if we only check for
|
||||
;; correctness we don't
|
||||
;; care about speed that
|
||||
;; match (but rather
|
||||
;; about space
|
||||
;; constraints of the
|
||||
;; trial versions)
|
||||
nil))
|
||||
(scanner (create-scanner regex
|
||||
:case-insensitive-mode case-insensitive-mode
|
||||
:multi-line-mode multi-line-mode
|
||||
:single-line-mode single-line-mode
|
||||
:extended-mode extended-mode)))
|
||||
(multiple-value-bind (result1 result2 sub-starts sub-ends)
|
||||
(scan scanner string)
|
||||
(cond (perl-error
|
||||
(push (format nil
|
||||
"~&expected an error but got a result")
|
||||
errors))
|
||||
(t
|
||||
(when (not (eq result1 ex-result))
|
||||
(if result1
|
||||
(let ((result (subseq string result1 result2)))
|
||||
(unless (string= result ex-result)
|
||||
(push (format nil
|
||||
"~&expected ~S but got ~S"
|
||||
ex-result result)
|
||||
errors))
|
||||
(setq sub-starts (coerce sub-starts 'list)
|
||||
sub-ends (coerce sub-ends 'list))
|
||||
(loop for i from 0
|
||||
for ex-sub in ex-subs
|
||||
for sub-start = (nth i sub-starts)
|
||||
for sub-end = (nth i sub-ends)
|
||||
for sub = (if (and sub-start sub-end)
|
||||
(subseq string sub-start sub-end)
|
||||
nil)
|
||||
unless (string= ex-sub sub)
|
||||
do (push (format nil
|
||||
"~&\\~A: expected ~S but got ~S"
|
||||
(1+ i) ex-sub sub) errors)))
|
||||
(push (format nil
|
||||
"~&expected ~S but got ~S"
|
||||
ex-result result1)
|
||||
errors)))))
|
||||
#+(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
(when threaded
|
||||
(let ((thread-result (threaded-scan scanner string)))
|
||||
(when thread-result
|
||||
(push thread-result errors))))))
|
||||
(condition (msg)
|
||||
(unless perl-error
|
||||
(push (format nil "~&got an unexpected error: '~A'" msg)
|
||||
errors))))
|
||||
(setq errors (nreverse errors))
|
||||
(cond (errors
|
||||
(when (or (<= factor 1) (zerop perl-time))
|
||||
(format t "~&~4@A (~A):~{~& ~A~}~%"
|
||||
counter info-string errors)))
|
||||
((and (> factor 1) (plusp perl-time))
|
||||
(let ((result (time-regex factor regex string
|
||||
:case-insensitive-mode case-insensitive-mode
|
||||
:multi-line-mode multi-line-mode
|
||||
:single-line-mode single-line-mode
|
||||
:extended-mode extended-mode)))
|
||||
(format t "~&~4@A: ~,4F (~A repetitions, Perl: ~,4F seconds, CL-PPCRE: ~,4F seconds)" counter
|
||||
(float (/ result perl-time)) factor perl-time result)
|
||||
#+:cormanlisp (force-output *standard-output*)))
|
||||
(t nil))))
|
||||
(values)))
|
||||
555
regex-class-util.lisp
Normal file
555
regex-class-util.lisp
Normal file
@ -0,0 +1,555 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class-util.lisp,v 1.8 2008/07/22 22:38:05 edi Exp $
|
||||
|
||||
;;; This file contains some utility methods for REGEX objects.
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
;;; The following four methods allow a VOID object to behave like a
|
||||
;;; zero-length STR object (only readers needed)
|
||||
|
||||
(defmethod len ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
0)
|
||||
|
||||
(defmethod str ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"")
|
||||
|
||||
(defmethod skip ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
nil)
|
||||
|
||||
(defmethod start-of-end-string-p ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
nil)
|
||||
|
||||
(defgeneric case-mode (regex old-case-mode)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
|
||||
Returns a keyword denoting the case-(in)sensitivity of a STR or its
|
||||
second argument if the STR has length 0. Returns NIL for REGEX objects
|
||||
which are not of type STR."))
|
||||
|
||||
(defmethod case-mode ((str str) old-case-mode)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(cond ((zerop (len str))
|
||||
old-case-mode)
|
||||
((case-insensitive-p str)
|
||||
:case-insensitive)
|
||||
(t
|
||||
:case-sensitive)))
|
||||
|
||||
(defmethod case-mode ((regex regex) old-case-mode)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (ignore old-case-mode))
|
||||
nil)
|
||||
|
||||
(defgeneric copy-regex (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Implements a deep copy of a REGEX object."))
|
||||
|
||||
(defmethod copy-regex ((anchor anchor))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'anchor
|
||||
:startp (startp anchor)
|
||||
:multi-line-p (multi-line-p anchor)
|
||||
:no-newline-p (no-newline-p anchor)))
|
||||
|
||||
(defmethod copy-regex ((everything everything))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'everything
|
||||
:single-line-p (single-line-p everything)))
|
||||
|
||||
(defmethod copy-regex ((word-boundary word-boundary))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'word-boundary
|
||||
:negatedp (negatedp word-boundary)))
|
||||
|
||||
(defmethod copy-regex ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'void))
|
||||
|
||||
(defmethod copy-regex ((lookahead lookahead))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'lookahead
|
||||
:regex (copy-regex (regex lookahead))
|
||||
:positivep (positivep lookahead)))
|
||||
|
||||
(defmethod copy-regex ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'seq
|
||||
:elements (mapcar #'copy-regex (elements seq))))
|
||||
|
||||
(defmethod copy-regex ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'alternation
|
||||
:choices (mapcar #'copy-regex (choices alternation))))
|
||||
|
||||
(defmethod copy-regex ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-slots (test)
|
||||
branch
|
||||
(make-instance 'branch
|
||||
:test (if (typep test 'regex)
|
||||
(copy-regex test)
|
||||
test)
|
||||
:then-regex (copy-regex (then-regex branch))
|
||||
:else-regex (copy-regex (else-regex branch)))))
|
||||
|
||||
(defmethod copy-regex ((lookbehind lookbehind))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'lookbehind
|
||||
:regex (copy-regex (regex lookbehind))
|
||||
:positivep (positivep lookbehind)
|
||||
:len (len lookbehind)))
|
||||
|
||||
(defmethod copy-regex ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'repetition
|
||||
:regex (copy-regex (regex repetition))
|
||||
:greedyp (greedyp repetition)
|
||||
:minimum (minimum repetition)
|
||||
:maximum (maximum repetition)
|
||||
:min-len (min-len repetition)
|
||||
:len (len repetition)
|
||||
:contains-register-p (contains-register-p repetition)))
|
||||
|
||||
(defmethod copy-regex ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'register
|
||||
:regex (copy-regex (regex register))
|
||||
:num (num register)
|
||||
:name (name register)))
|
||||
|
||||
(defmethod copy-regex ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'standalone
|
||||
:regex (copy-regex (regex standalone))))
|
||||
|
||||
(defmethod copy-regex ((back-reference back-reference))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'back-reference
|
||||
:num (num back-reference)
|
||||
:case-insensitive-p (case-insensitive-p back-reference)))
|
||||
|
||||
(defmethod copy-regex ((char-class char-class))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'char-class
|
||||
:test-function (test-function char-class)))
|
||||
|
||||
(defmethod copy-regex ((str str))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'str
|
||||
:str (str str)
|
||||
:case-insensitive-p (case-insensitive-p str)))
|
||||
|
||||
(defmethod copy-regex ((filter filter))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'filter
|
||||
:fn (fn filter)
|
||||
:len (len filter)))
|
||||
|
||||
;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
|
||||
;;; wrapped into one function. Maybe in the next release...
|
||||
|
||||
;;; Further note that this function is used by CONVERT to factor out
|
||||
;;; complicated repetitions, i.e. cases like
|
||||
;;; (a)* -> (?:a*(a))?
|
||||
;;; This won't work for, say,
|
||||
;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
|
||||
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
|
||||
|
||||
(defgeneric remove-registers (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
|
||||
optionally removes embedded REGISTER objects if possible and if the
|
||||
special variable REMOVE-REGISTERS-P is true."))
|
||||
|
||||
(defmethod remove-registers ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special remove-registers-p reg-seen))
|
||||
(cond (remove-registers-p
|
||||
(remove-registers (regex register)))
|
||||
(t
|
||||
;; mark REG-SEEN as true so enclosing REPETITION objects
|
||||
;; (see method below) know if they contain a register or not
|
||||
(setq reg-seen t)
|
||||
(copy-regex register))))
|
||||
|
||||
(defmethod remove-registers ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let* (reg-seen
|
||||
(inner-regex (remove-registers (regex repetition))))
|
||||
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
|
||||
;; (REGEX REPETITION) contains a REGISTER
|
||||
(declare (special reg-seen))
|
||||
(make-instance 'repetition
|
||||
:regex inner-regex
|
||||
:greedyp (greedyp repetition)
|
||||
:minimum (minimum repetition)
|
||||
:maximum (maximum repetition)
|
||||
:min-len (min-len repetition)
|
||||
:len (len repetition)
|
||||
:contains-register-p reg-seen)))
|
||||
|
||||
(defmethod remove-registers ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'standalone
|
||||
:regex (remove-registers (regex standalone))))
|
||||
|
||||
(defmethod remove-registers ((lookahead lookahead))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'lookahead
|
||||
:regex (remove-registers (regex lookahead))
|
||||
:positivep (positivep lookahead)))
|
||||
|
||||
(defmethod remove-registers ((lookbehind lookbehind))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'lookbehind
|
||||
:regex (remove-registers (regex lookbehind))
|
||||
:positivep (positivep lookbehind)
|
||||
:len (len lookbehind)))
|
||||
|
||||
(defmethod remove-registers ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-slots (test)
|
||||
branch
|
||||
(make-instance 'branch
|
||||
:test (if (typep test 'regex)
|
||||
(remove-registers test)
|
||||
test)
|
||||
:then-regex (remove-registers (then-regex branch))
|
||||
:else-regex (remove-registers (else-regex branch)))))
|
||||
|
||||
(defmethod remove-registers ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special remove-registers-p))
|
||||
;; an ALTERNATION, so we can't remove REGISTER objects further down
|
||||
(setq remove-registers-p nil)
|
||||
(copy-regex alternation))
|
||||
|
||||
(defmethod remove-registers ((regex regex))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(copy-regex regex))
|
||||
|
||||
(defmethod remove-registers ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'seq
|
||||
:elements (mapcar #'remove-registers (elements seq))))
|
||||
|
||||
(defgeneric everythingp (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Returns an EVERYTHING object if REGEX is equivalent
|
||||
to this object, otherwise NIL. So, \"(.){1}\" would return true
|
||||
\(i.e. the object corresponding to \".\", for example."))
|
||||
|
||||
(defmethod everythingp ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
|
||||
;; due to the parsing process
|
||||
(let ((cleaned-elements (remove-if #'(lambda (element)
|
||||
(typep element 'void))
|
||||
(elements seq))))
|
||||
(and (= 1 (length cleaned-elements))
|
||||
(everythingp (first cleaned-elements)))))
|
||||
|
||||
(defmethod everythingp ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-slots (choices)
|
||||
alternation
|
||||
(and (= 1 (length choices))
|
||||
;; this is unlikely to happen for human-generated regexes,
|
||||
;; but machine-generated ones might look like this
|
||||
(everythingp (first choices)))))
|
||||
|
||||
(defmethod everythingp ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-slots (maximum minimum regex)
|
||||
repetition
|
||||
(and maximum
|
||||
(= 1 minimum maximum)
|
||||
;; treat "<regex>{1,1}" like "<regex>"
|
||||
(everythingp regex))))
|
||||
|
||||
(defmethod everythingp ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(everythingp (regex register)))
|
||||
|
||||
(defmethod everythingp ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(everythingp (regex standalone)))
|
||||
|
||||
(defmethod everythingp ((everything everything))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
everything)
|
||||
|
||||
(defmethod everythingp ((regex regex))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
|
||||
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
|
||||
nil)
|
||||
|
||||
(defgeneric regex-length (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
|
||||
|
||||
(defmethod regex-length ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; simply add all inner lengths unless one of them is NIL
|
||||
(loop for sub-regex in (elements seq)
|
||||
for len = (regex-length sub-regex)
|
||||
if (not len) do (return nil)
|
||||
sum len))
|
||||
|
||||
(defmethod regex-length ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; only return a true value if all inner lengths are non-NIL and
|
||||
;; mutually equal
|
||||
(loop for sub-regex in (choices alternation)
|
||||
for old-len = nil then len
|
||||
for len = (regex-length sub-regex)
|
||||
if (or (not len)
|
||||
(and old-len (/= len old-len))) do (return nil)
|
||||
finally (return len)))
|
||||
|
||||
(defmethod regex-length ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; only return a true value if both alternations have a length and
|
||||
;; if they're equal
|
||||
(let ((then-length (regex-length (then-regex branch))))
|
||||
(and then-length
|
||||
(eql then-length (regex-length (else-regex branch)))
|
||||
then-length)))
|
||||
|
||||
(defmethod regex-length ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; we can only compute the length of a REPETITION object if the
|
||||
;; number of repetitions is fixed; note that we don't call
|
||||
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
|
||||
;; always set correctly
|
||||
(with-slots (len minimum maximum)
|
||||
repetition
|
||||
(if (and len
|
||||
(eql minimum maximum))
|
||||
(* minimum len)
|
||||
nil)))
|
||||
|
||||
(defmethod regex-length ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(regex-length (regex register)))
|
||||
|
||||
(defmethod regex-length ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(regex-length (regex standalone)))
|
||||
|
||||
(defmethod regex-length ((back-reference back-reference))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; with enough effort we could possibly do better here, but
|
||||
;; currently we just give up and return NIL
|
||||
nil)
|
||||
|
||||
(defmethod regex-length ((char-class char-class))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
1)
|
||||
|
||||
(defmethod regex-length ((everything everything))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
1)
|
||||
|
||||
(defmethod regex-length ((str str))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(len str))
|
||||
|
||||
(defmethod regex-length ((filter filter))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(len filter))
|
||||
|
||||
(defmethod regex-length ((regex regex))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
||||
;; WORD-BOUNDARY (which all have zero-length)
|
||||
0)
|
||||
|
||||
(defgeneric regex-min-length (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Returns the minimal length of REGEX."))
|
||||
|
||||
(defmethod regex-min-length ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; simply add all inner minimal lengths
|
||||
(loop for sub-regex in (elements seq)
|
||||
for len = (regex-min-length sub-regex)
|
||||
sum len))
|
||||
|
||||
(defmethod regex-min-length ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; minimal length of an alternation is the minimal length of the
|
||||
;; "shortest" element
|
||||
(loop for sub-regex in (choices alternation)
|
||||
for len = (regex-min-length sub-regex)
|
||||
minimize len))
|
||||
|
||||
(defmethod regex-min-length ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; minimal length of both alternations
|
||||
(min (regex-min-length (then-regex branch))
|
||||
(regex-min-length (else-regex branch))))
|
||||
|
||||
(defmethod regex-min-length ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; obviously the product of the inner minimal length and the minimal
|
||||
;; number of repetitions
|
||||
(* (minimum repetition) (min-len repetition)))
|
||||
|
||||
(defmethod regex-min-length ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(regex-min-length (regex register)))
|
||||
|
||||
(defmethod regex-min-length ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(regex-min-length (regex standalone)))
|
||||
|
||||
(defmethod regex-min-length ((char-class char-class))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
1)
|
||||
|
||||
(defmethod regex-min-length ((everything everything))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
1)
|
||||
|
||||
(defmethod regex-min-length ((str str))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(len str))
|
||||
|
||||
(defmethod regex-min-length ((filter filter))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(or (len filter)
|
||||
0))
|
||||
|
||||
(defmethod regex-min-length ((regex regex))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
|
||||
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
|
||||
0)
|
||||
|
||||
(defgeneric compute-offsets (regex start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Returns the offset the following regex would have
|
||||
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
|
||||
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
|
||||
slots of STR objects further down the tree."))
|
||||
|
||||
;; note that we're actually only interested in the offset of
|
||||
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
|
||||
;; can stop at variable-length alternations and don't need to descend
|
||||
;; into repetitions
|
||||
|
||||
(defmethod compute-offsets ((seq seq) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(loop for element in (elements seq)
|
||||
;; advance offset argument for next call while looping through
|
||||
;; the elements
|
||||
for pos = start-pos then curr-offset
|
||||
for curr-offset = (compute-offsets element pos)
|
||||
while curr-offset
|
||||
finally (return curr-offset)))
|
||||
|
||||
(defmethod compute-offsets ((alternation alternation) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(loop for choice in (choices alternation)
|
||||
for old-offset = nil then curr-offset
|
||||
for curr-offset = (compute-offsets choice start-pos)
|
||||
;; we stop immediately if two alternations don't result in the
|
||||
;; same offset
|
||||
if (or (not curr-offset)
|
||||
(and old-offset (/= curr-offset old-offset)))
|
||||
do (return nil)
|
||||
finally (return curr-offset)))
|
||||
|
||||
(defmethod compute-offsets ((branch branch) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; only return offset if both alternations have equal value
|
||||
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
|
||||
(and then-offset
|
||||
(eql then-offset (compute-offsets (else-regex branch) start-pos))
|
||||
then-offset)))
|
||||
|
||||
(defmethod compute-offsets ((repetition repetition) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; no need to descend into the inner regex
|
||||
(with-slots (len minimum maximum)
|
||||
repetition
|
||||
(if (and len
|
||||
(eq minimum maximum))
|
||||
;; fixed number of repetitions, so we know how to proceed
|
||||
(+ start-pos (* minimum len))
|
||||
;; otherwise return NIL
|
||||
nil)))
|
||||
|
||||
(defmethod compute-offsets ((register register) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(compute-offsets (regex register) start-pos))
|
||||
|
||||
(defmethod compute-offsets ((standalone standalone) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(compute-offsets (regex standalone) start-pos))
|
||||
|
||||
(defmethod compute-offsets ((char-class char-class) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(1+ start-pos))
|
||||
|
||||
(defmethod compute-offsets ((everything everything) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(1+ start-pos))
|
||||
|
||||
(defmethod compute-offsets ((str str) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(setf (offset str) start-pos)
|
||||
(+ start-pos (len str)))
|
||||
|
||||
(defmethod compute-offsets ((back-reference back-reference) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; with enough effort we could possibly do better here, but
|
||||
;; currently we just give up and return NIL
|
||||
(declare (ignore start-pos))
|
||||
nil)
|
||||
|
||||
(defmethod compute-offsets ((filter filter) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let ((len (len filter)))
|
||||
(if len
|
||||
(+ start-pos len)
|
||||
nil)))
|
||||
|
||||
(defmethod compute-offsets ((regex regex) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
||||
;; WORD-BOUNDARY (which all have zero-length)
|
||||
start-pos)
|
||||
600
regex-class.lisp
600
regex-class.lisp
@ -1,9 +1,8 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.34 2008/07/03 07:44:06 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.42 2008/07/22 22:38:05 edi Exp $
|
||||
|
||||
;;; This file defines the REGEX class and some utility methods for
|
||||
;;; this class. REGEX objects are used to represent the (transformed)
|
||||
;;; parse trees internally
|
||||
;;; This file defines the REGEX class. REGEX objects are used to
|
||||
;;; represent the (transformed) parse trees internally
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
@ -31,26 +30,20 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
;; Genera need the eval-when, here, or the types created by the class
|
||||
;; definitions aren't seen by the typep calls later in the file.
|
||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||
(locally
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(defclass regex ()
|
||||
()
|
||||
(:documentation "The REGEX base class. All other classes inherit
|
||||
from this one."))
|
||||
|
||||
|
||||
(defclass seq (regex)
|
||||
((elements :initarg :elements
|
||||
:accessor elements
|
||||
:type cons
|
||||
:documentation "A list of REGEX objects."))
|
||||
(:documentation "SEQ objects represents sequences of
|
||||
regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)"))
|
||||
(:documentation "SEQ objects represents sequences of regexes.
|
||||
\(Like \"ab\" is the sequence of \"a\" and \"b\".)"))
|
||||
|
||||
(defclass alternation (regex)
|
||||
((choices :initarg :choices
|
||||
@ -58,7 +51,7 @@ regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)"))
|
||||
:type cons
|
||||
:documentation "A list of REGEX objects"))
|
||||
(:documentation "ALTERNATION objects represent alternations of
|
||||
regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))
|
||||
regexes. \(Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))
|
||||
|
||||
(defclass lookahead (regex)
|
||||
((regex :initarg :regex
|
||||
@ -79,7 +72,7 @@ regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))
|
||||
(len :initarg :len
|
||||
:accessor len
|
||||
:type fixnum
|
||||
:documentation "The (fixed) length of the enclosed regex."))
|
||||
:documentation "The \(fixed) length of the enclosed regex."))
|
||||
(:documentation "LOOKBEHIND objects represent look-behind assertions."))
|
||||
|
||||
(defclass repetition (regex)
|
||||
@ -102,16 +95,17 @@ Can be NIL for unbounded.")
|
||||
:documentation "The minimal length of the enclosed regex.")
|
||||
(len :initarg :len
|
||||
:reader len
|
||||
:documentation "The length of the enclosed regex. NIL
|
||||
if unknown.")
|
||||
:documentation "The length of the enclosed regex. NIL if
|
||||
unknown.")
|
||||
(min-rest :initform 0
|
||||
:accessor min-rest
|
||||
:type fixnum
|
||||
:documentation "The minimal number of characters which must
|
||||
appear after this repetition.")
|
||||
:documentation "The minimal number of characters which
|
||||
must appear after this repetition.")
|
||||
(contains-register-p :initarg :contains-register-p
|
||||
:reader contains-register-p
|
||||
:documentation "If the regex contains a register."))
|
||||
:documentation "Whether the regex contains a
|
||||
register."))
|
||||
(:documentation "REPETITION objects represent repetitions of regexes."))
|
||||
|
||||
(defclass register (regex)
|
||||
@ -149,24 +143,14 @@ reference refers to or NIL.")
|
||||
:documentation "Whether we check
|
||||
case-insensitively."))
|
||||
(:documentation "BACK-REFERENCE objects represent backreferences."))
|
||||
|
||||
(defclass char-class (regex)
|
||||
((charset :initarg :charset
|
||||
:reader charset
|
||||
:type (or charset null)
|
||||
:documentation "A charset denoting the characters
|
||||
in the character class.")
|
||||
(case-insensitive-p :initarg :case-insensitive-p
|
||||
:reader case-insensitive-p
|
||||
:documentation "If the char class
|
||||
case-insensitive.")
|
||||
(invertedp :initarg :invertedp
|
||||
:reader invertedp
|
||||
:documentation "Whether we mean the inverse of
|
||||
the char class.")
|
||||
(word-char-class-p :initarg :word-char-class-p
|
||||
:reader word-char-class-p
|
||||
:documentation "Whether this CHAR CLASS
|
||||
represents the special class WORD-CHAR-CLASS."))
|
||||
((test-function :initarg :test-function
|
||||
:reader test-function
|
||||
:type (or function symbol nil)
|
||||
:documentation "A unary function \(accepting a
|
||||
character) which stands in for the character class and does the work
|
||||
of checking whether a character belongs to the class."))
|
||||
(:documentation "CHAR-CLASS objects represent character classes."))
|
||||
|
||||
(defclass str (regex)
|
||||
@ -202,10 +186,12 @@ STR which starts END-STRING (a slot of MATCHER)."))
|
||||
:reader startp
|
||||
:documentation "Whether this is a \"start anchor\".")
|
||||
(multi-line-p :initarg :multi-line-p
|
||||
:initform nil
|
||||
:reader multi-line-p
|
||||
:documentation "Whether we're in multi-line mode,
|
||||
i.e. whether each #\\Newline is surrounded by anchors.")
|
||||
(no-newline-p :initarg :no-newline-p
|
||||
:initform nil
|
||||
:reader no-newline-p
|
||||
:documentation "Whether we ignore #\\Newline at the end."))
|
||||
(:documentation "ANCHOR objects represent anchors like \"^\" or \"$\"."))
|
||||
@ -255,20 +241,7 @@ defined by the user."))
|
||||
|
||||
(defclass void (regex)
|
||||
()
|
||||
(:documentation "VOID objects represent empty regular expressions."))))
|
||||
|
||||
(defmethod initialize-instance :after ((char-class char-class) &rest init-args)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Make large charsets smaller, if possible."
|
||||
(let ((set (getf init-args :charset)))
|
||||
(when (and set
|
||||
(> *regex-char-code-limit* 256)
|
||||
(> (charset-count set)
|
||||
(/ *regex-char-code-limit* 2)))
|
||||
(setf (slot-value char-class 'set)
|
||||
(merge-set (make-charset) set)
|
||||
(slot-value char-class 'invertedp)
|
||||
(not (slot-value char-class 'invertedp))))))
|
||||
(:documentation "VOID objects represent empty regular expressions."))
|
||||
|
||||
(defmethod initialize-instance :after ((str str) &rest init-args)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -279,528 +252,3 @@ defined by the user."))
|
||||
(setf (slot-value str 'str) (coerce str-slot 'simple-string))))
|
||||
(setf (len str) (length (str str))))
|
||||
|
||||
;;; The following four methods allow a VOID object to behave like a
|
||||
;;; zero-length STR object (only readers needed)
|
||||
|
||||
(defmethod len ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
0)
|
||||
|
||||
(defmethod str ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"")
|
||||
|
||||
(defmethod skip ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
nil)
|
||||
|
||||
(defmethod start-of-end-string-p ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
nil)
|
||||
|
||||
(defgeneric case-mode (regex old-case-mode)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
|
||||
Returns a keyword denoting the case-(in)sensitivity of a STR or its
|
||||
second argument if the STR has length 0. Returns NIL for REGEX objects
|
||||
which are not of type STR."))
|
||||
|
||||
(defmethod case-mode ((str str) old-case-mode)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(cond ((zerop (len str))
|
||||
old-case-mode)
|
||||
((case-insensitive-p str)
|
||||
:case-insensitive)
|
||||
(t
|
||||
:case-sensitive)))
|
||||
|
||||
(defmethod case-mode ((regex regex) old-case-mode)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (ignore old-case-mode))
|
||||
nil)
|
||||
|
||||
(defgeneric copy-regex (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Implements a deep copy of a REGEX object."))
|
||||
|
||||
(defmethod copy-regex ((anchor anchor))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'anchor
|
||||
:startp (startp anchor)
|
||||
:multi-line-p (multi-line-p anchor)
|
||||
:no-newline-p (no-newline-p anchor)))
|
||||
|
||||
(defmethod copy-regex ((everything everything))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'everything
|
||||
:single-line-p (single-line-p everything)))
|
||||
|
||||
(defmethod copy-regex ((word-boundary word-boundary))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'word-boundary
|
||||
:negatedp (negatedp word-boundary)))
|
||||
|
||||
(defmethod copy-regex ((void void))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'void))
|
||||
|
||||
(defmethod copy-regex ((lookahead lookahead))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'lookahead
|
||||
:regex (copy-regex (regex lookahead))
|
||||
:positivep (positivep lookahead)))
|
||||
|
||||
(defmethod copy-regex ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'seq
|
||||
:elements (mapcar #'copy-regex (elements seq))))
|
||||
|
||||
(defmethod copy-regex ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'alternation
|
||||
:choices (mapcar #'copy-regex (choices alternation))))
|
||||
|
||||
(defmethod copy-regex ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-slots (test)
|
||||
branch
|
||||
(make-instance 'branch
|
||||
:test (if (typep test 'regex)
|
||||
(copy-regex test)
|
||||
test)
|
||||
:then-regex (copy-regex (then-regex branch))
|
||||
:else-regex (copy-regex (else-regex branch)))))
|
||||
|
||||
(defmethod copy-regex ((lookbehind lookbehind))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'lookbehind
|
||||
:regex (copy-regex (regex lookbehind))
|
||||
:positivep (positivep lookbehind)
|
||||
:len (len lookbehind)))
|
||||
|
||||
(defmethod copy-regex ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'repetition
|
||||
:regex (copy-regex (regex repetition))
|
||||
:greedyp (greedyp repetition)
|
||||
:minimum (minimum repetition)
|
||||
:maximum (maximum repetition)
|
||||
:min-len (min-len repetition)
|
||||
:len (len repetition)
|
||||
:contains-register-p (contains-register-p repetition)))
|
||||
|
||||
(defmethod copy-regex ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'register
|
||||
:regex (copy-regex (regex register))
|
||||
:num (num register)
|
||||
:name (name register)))
|
||||
|
||||
(defmethod copy-regex ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'standalone
|
||||
:regex (copy-regex (regex standalone))))
|
||||
|
||||
(defmethod copy-regex ((back-reference back-reference))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'back-reference
|
||||
:num (num back-reference)
|
||||
:case-insensitive-p (case-insensitive-p back-reference)))
|
||||
|
||||
(defmethod copy-regex ((char-class char-class))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'char-class
|
||||
:charset (charset char-class)
|
||||
:case-insensitive-p (case-insensitive-p char-class)
|
||||
:invertedp (invertedp char-class)
|
||||
:word-char-class-p (word-char-class-p char-class)))
|
||||
|
||||
(defmethod copy-regex ((str str))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'str
|
||||
:str (str str)
|
||||
:case-insensitive-p (case-insensitive-p str)))
|
||||
|
||||
(defmethod copy-regex ((filter filter))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'filter
|
||||
:fn (fn filter)
|
||||
:len (len filter)))
|
||||
|
||||
;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
|
||||
;;; wrapped into one function. Maybe in the next release...
|
||||
|
||||
;;; Further note that this function is used by CONVERT to factor out
|
||||
;;; complicated repetitions, i.e. cases like
|
||||
;;; (a)* -> (?:a*(a))?
|
||||
;;; This won't work for, say,
|
||||
;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
|
||||
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
|
||||
|
||||
(defgeneric remove-registers (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
|
||||
optionally removes embedded REGISTER objects if possible and if the
|
||||
special variable REMOVE-REGISTERS-P is true."))
|
||||
|
||||
(defmethod remove-registers ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special remove-registers-p reg-seen))
|
||||
(cond (remove-registers-p
|
||||
(remove-registers (regex register)))
|
||||
(t
|
||||
;; mark REG-SEEN as true so enclosing REPETITION objects
|
||||
;; (see method below) know if they contain a register or not
|
||||
(setq reg-seen t)
|
||||
(copy-regex register))))
|
||||
|
||||
(defmethod remove-registers ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let* (reg-seen
|
||||
(inner-regex (remove-registers (regex repetition))))
|
||||
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
|
||||
;; (REGEX REPETITION) contains a REGISTER
|
||||
(declare (special reg-seen))
|
||||
(make-instance 'repetition
|
||||
:regex inner-regex
|
||||
:greedyp (greedyp repetition)
|
||||
:minimum (minimum repetition)
|
||||
:maximum (maximum repetition)
|
||||
:min-len (min-len repetition)
|
||||
:len (len repetition)
|
||||
:contains-register-p reg-seen)))
|
||||
|
||||
(defmethod remove-registers ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'standalone
|
||||
:regex (remove-registers (regex standalone))))
|
||||
|
||||
(defmethod remove-registers ((lookahead lookahead))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'lookahead
|
||||
:regex (remove-registers (regex lookahead))
|
||||
:positivep (positivep lookahead)))
|
||||
|
||||
(defmethod remove-registers ((lookbehind lookbehind))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'lookbehind
|
||||
:regex (remove-registers (regex lookbehind))
|
||||
:positivep (positivep lookbehind)
|
||||
:len (len lookbehind)))
|
||||
|
||||
(defmethod remove-registers ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-slots (test)
|
||||
branch
|
||||
(make-instance 'branch
|
||||
:test (if (typep test 'regex)
|
||||
(remove-registers test)
|
||||
test)
|
||||
:then-regex (remove-registers (then-regex branch))
|
||||
:else-regex (remove-registers (else-regex branch)))))
|
||||
|
||||
(defmethod remove-registers ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (special remove-registers-p))
|
||||
;; an ALTERNATION, so we can't remove REGISTER objects further down
|
||||
(setq remove-registers-p nil)
|
||||
(copy-regex alternation))
|
||||
|
||||
(defmethod remove-registers ((regex regex))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(copy-regex regex))
|
||||
|
||||
(defmethod remove-registers ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(make-instance 'seq
|
||||
:elements (mapcar #'remove-registers (elements seq))))
|
||||
|
||||
(defgeneric everythingp (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Returns an EVERYTHING object if REGEX is equivalent
|
||||
to this object, otherwise NIL. So, \"(.){1}\" would return true
|
||||
(i.e. the object corresponding to \".\", for example."))
|
||||
|
||||
(defmethod everythingp ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
|
||||
;; due to the parsing process
|
||||
(let ((cleaned-elements (remove-if #'(lambda (element)
|
||||
(typep element 'void))
|
||||
(elements seq))))
|
||||
(and (= 1 (length cleaned-elements))
|
||||
(everythingp (first cleaned-elements)))))
|
||||
|
||||
(defmethod everythingp ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-slots (choices)
|
||||
alternation
|
||||
(and (= 1 (length choices))
|
||||
;; this is unlikely to happen for human-generated regexes,
|
||||
;; but machine-generated ones might look like this
|
||||
(everythingp (first choices)))))
|
||||
|
||||
(defmethod everythingp ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-slots (maximum minimum regex)
|
||||
repetition
|
||||
(and maximum
|
||||
(= 1 minimum maximum)
|
||||
;; treat "<regex>{1,1}" like "<regex>"
|
||||
(everythingp regex))))
|
||||
|
||||
(defmethod everythingp ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(everythingp (regex register)))
|
||||
|
||||
(defmethod everythingp ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(everythingp (regex standalone)))
|
||||
|
||||
(defmethod everythingp ((everything everything))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
everything)
|
||||
|
||||
(defmethod everythingp ((regex regex))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
|
||||
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
|
||||
nil)
|
||||
|
||||
(defgeneric regex-length (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
|
||||
|
||||
(defmethod regex-length ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; simply add all inner lengths unless one of them is NIL
|
||||
(loop for sub-regex in (elements seq)
|
||||
for len = (regex-length sub-regex)
|
||||
if (not len) do (return nil)
|
||||
sum len))
|
||||
|
||||
(defmethod regex-length ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; only return a true value if all inner lengths are non-NIL and
|
||||
;; mutually equal
|
||||
(loop for sub-regex in (choices alternation)
|
||||
for old-len = nil then len
|
||||
for len = (regex-length sub-regex)
|
||||
if (or (not len)
|
||||
(and old-len (/= len old-len))) do (return nil)
|
||||
finally (return len)))
|
||||
|
||||
(defmethod regex-length ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; only return a true value if both alternations have a length and
|
||||
;; if they're equal
|
||||
(let ((then-length (regex-length (then-regex branch))))
|
||||
(and then-length
|
||||
(eql then-length (regex-length (else-regex branch)))
|
||||
then-length)))
|
||||
|
||||
(defmethod regex-length ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; we can only compute the length of a REPETITION object if the
|
||||
;; number of repetitions is fixed; note that we don't call
|
||||
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
|
||||
;; always set correctly
|
||||
(with-slots (len minimum maximum)
|
||||
repetition
|
||||
(if (and len
|
||||
(eql minimum maximum))
|
||||
(* minimum len)
|
||||
nil)))
|
||||
|
||||
(defmethod regex-length ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(regex-length (regex register)))
|
||||
|
||||
(defmethod regex-length ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(regex-length (regex standalone)))
|
||||
|
||||
(defmethod regex-length ((back-reference back-reference))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; with enough effort we could possibly do better here, but
|
||||
;; currently we just give up and return NIL
|
||||
nil)
|
||||
|
||||
(defmethod regex-length ((char-class char-class))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
1)
|
||||
|
||||
(defmethod regex-length ((everything everything))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
1)
|
||||
|
||||
(defmethod regex-length ((str str))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(len str))
|
||||
|
||||
(defmethod regex-length ((filter filter))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(len filter))
|
||||
|
||||
(defmethod regex-length ((regex regex))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
||||
;; WORD-BOUNDARY (which all have zero-length)
|
||||
0)
|
||||
|
||||
(defgeneric regex-min-length (regex)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Returns the minimal length of REGEX."))
|
||||
|
||||
(defmethod regex-min-length ((seq seq))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; simply add all inner minimal lengths
|
||||
(loop for sub-regex in (elements seq)
|
||||
for len = (regex-min-length sub-regex)
|
||||
sum len))
|
||||
|
||||
(defmethod regex-min-length ((alternation alternation))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; minimal length of an alternation is the minimal length of the
|
||||
;; "shortest" element
|
||||
(loop for sub-regex in (choices alternation)
|
||||
for len = (regex-min-length sub-regex)
|
||||
minimize len))
|
||||
|
||||
(defmethod regex-min-length ((branch branch))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; minimal length of both alternations
|
||||
(min (regex-min-length (then-regex branch))
|
||||
(regex-min-length (else-regex branch))))
|
||||
|
||||
(defmethod regex-min-length ((repetition repetition))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; obviously the product of the inner minimal length and the minimal
|
||||
;; number of repetitions
|
||||
(* (minimum repetition) (min-len repetition)))
|
||||
|
||||
(defmethod regex-min-length ((register register))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(regex-min-length (regex register)))
|
||||
|
||||
(defmethod regex-min-length ((standalone standalone))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(regex-min-length (regex standalone)))
|
||||
|
||||
(defmethod regex-min-length ((char-class char-class))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
1)
|
||||
|
||||
(defmethod regex-min-length ((everything everything))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
1)
|
||||
|
||||
(defmethod regex-min-length ((str str))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(len str))
|
||||
|
||||
(defmethod regex-min-length ((filter filter))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(or (len filter)
|
||||
0))
|
||||
|
||||
(defmethod regex-min-length ((regex regex))
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
|
||||
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
|
||||
0)
|
||||
|
||||
(defgeneric compute-offsets (regex start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Returns the offset the following regex would have
|
||||
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
|
||||
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
|
||||
slots of STR objects further down the tree."))
|
||||
|
||||
;; note that we're actually only interested in the offset of
|
||||
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
|
||||
;; can stop at variable-length alternations and don't need to descend
|
||||
;; into repetitions
|
||||
|
||||
(defmethod compute-offsets ((seq seq) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(loop for element in (elements seq)
|
||||
;; advance offset argument for next call while looping through
|
||||
;; the elements
|
||||
for pos = start-pos then curr-offset
|
||||
for curr-offset = (compute-offsets element pos)
|
||||
while curr-offset
|
||||
finally (return curr-offset)))
|
||||
|
||||
(defmethod compute-offsets ((alternation alternation) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(loop for choice in (choices alternation)
|
||||
for old-offset = nil then curr-offset
|
||||
for curr-offset = (compute-offsets choice start-pos)
|
||||
;; we stop immediately if two alternations don't result in the
|
||||
;; same offset
|
||||
if (or (not curr-offset)
|
||||
(and old-offset (/= curr-offset old-offset)))
|
||||
do (return nil)
|
||||
finally (return curr-offset)))
|
||||
|
||||
(defmethod compute-offsets ((branch branch) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; only return offset if both alternations have equal value
|
||||
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
|
||||
(and then-offset
|
||||
(eql then-offset (compute-offsets (else-regex branch) start-pos))
|
||||
then-offset)))
|
||||
|
||||
(defmethod compute-offsets ((repetition repetition) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; no need to descend into the inner regex
|
||||
(with-slots (len minimum maximum)
|
||||
repetition
|
||||
(if (and len
|
||||
(eq minimum maximum))
|
||||
;; fixed number of repetitions, so we know how to proceed
|
||||
(+ start-pos (* minimum len))
|
||||
;; otherwise return NIL
|
||||
nil)))
|
||||
|
||||
(defmethod compute-offsets ((register register) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(compute-offsets (regex register) start-pos))
|
||||
|
||||
(defmethod compute-offsets ((standalone standalone) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(compute-offsets (regex standalone) start-pos))
|
||||
|
||||
(defmethod compute-offsets ((char-class char-class) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(1+ start-pos))
|
||||
|
||||
(defmethod compute-offsets ((everything everything) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(1+ start-pos))
|
||||
|
||||
(defmethod compute-offsets ((str str) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(setf (offset str) start-pos)
|
||||
(+ start-pos (len str)))
|
||||
|
||||
(defmethod compute-offsets ((back-reference back-reference) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; with enough effort we could possibly do better here, but
|
||||
;; currently we just give up and return NIL
|
||||
(declare (ignore start-pos))
|
||||
nil)
|
||||
|
||||
(defmethod compute-offsets ((filter filter) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(let ((len (len filter)))
|
||||
(if len
|
||||
(+ start-pos len)
|
||||
nil)))
|
||||
|
||||
(defmethod compute-offsets ((regex regex) start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
||||
;; WORD-BOUNDARY (which all have zero-length)
|
||||
start-pos)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.29 2008/06/25 14:04:28 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.33 2008/07/06 18:12:05 edi Exp $
|
||||
|
||||
;;; This is actually a part of closures.lisp which we put into a
|
||||
;;; separate file because it is rather complex. We only deal with
|
||||
@ -33,7 +33,7 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defmacro incf-after (place &optional (delta 1) &environment env)
|
||||
"Utility macro inspired by C's \"place++\", i.e. first return the
|
||||
@ -58,7 +58,7 @@ CHECK-CURR-POS is a form which checks whether the inner regex of the
|
||||
repetition matches at CURR-POS."
|
||||
`(if maximum
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos maximum))
|
||||
(declare (fixnum start-pos maximum))
|
||||
;; because we know LEN we know in advance where to stop at the
|
||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||
;; minimal length of the part behind the repetition
|
||||
@ -68,7 +68,7 @@ repetition matches at CURR-POS."
|
||||
(+ start-pos
|
||||
(the fixnum (* len maximum)))))
|
||||
(curr-pos start-pos))
|
||||
(declare (type fixnum target-end-pos curr-pos))
|
||||
(declare (fixnum target-end-pos curr-pos))
|
||||
(block greedy-constant-length-matcher
|
||||
;; we use an ugly TAGBODY construct because this might be a
|
||||
;; tight loop and this version is a bit faster than our LOOP
|
||||
@ -95,10 +95,10 @@ repetition matches at CURR-POS."
|
||||
;; basically the same code; it's just a bit easier because we're
|
||||
;; not bounded by MAXIMUM
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((target-end-pos (1+ (- *end-pos* len min-rest)))
|
||||
(curr-pos start-pos))
|
||||
(declare (type fixnum target-end-pos curr-pos))
|
||||
(declare (fixnum target-end-pos curr-pos))
|
||||
(block greedy-constant-length-matcher
|
||||
(tagbody
|
||||
forward-loop
|
||||
@ -117,20 +117,19 @@ repetition matches at CURR-POS."
|
||||
(go backward-loop)))))))
|
||||
|
||||
(defun create-greedy-everything-matcher (maximum min-rest next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum min-rest)
|
||||
(type function next-fn))
|
||||
"Creates a closure which just matches as far ahead as possible,
|
||||
i.e. a closure for a dot in single-line mode."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (fixnum min-rest) (function next-fn))
|
||||
(if maximum
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos maximum))
|
||||
(declare (fixnum start-pos maximum))
|
||||
;; because we know LEN we know in advance where to stop at the
|
||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||
;; minimal length of the part behind the repetition
|
||||
(let ((target-end-pos (min (+ start-pos maximum)
|
||||
(- *end-pos* min-rest))))
|
||||
(declare (type fixnum target-end-pos))
|
||||
(declare (fixnum target-end-pos))
|
||||
;; start from the highest possible position and go backward
|
||||
;; until we're able to match the rest of the regex
|
||||
(loop for curr-pos of-type fixnum from target-end-pos downto start-pos
|
||||
@ -138,18 +137,18 @@ i.e. a closure for a dot in single-line mode."
|
||||
;; basically the same code; it's just a bit easier because we're
|
||||
;; not bounded by MAXIMUM
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((target-end-pos (- *end-pos* min-rest)))
|
||||
(declare (type fixnum target-end-pos))
|
||||
(declare (fixnum target-end-pos))
|
||||
(loop for curr-pos of-type fixnum from target-end-pos downto start-pos
|
||||
thereis (funcall next-fn curr-pos))))))
|
||||
|
||||
(defgeneric create-greedy-constant-length-matcher (repetition next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is greedy and the minimal number of repetitions is
|
||||
zero. It is furthermore assumed that the inner regex of REPETITION is
|
||||
of fixed length and doesn't contain registers."))
|
||||
(:documentation "Creates a closure which tries to match REPETITION.
|
||||
It is assumed that REPETITION is greedy and the minimal number of
|
||||
repetitions is zero. It is furthermore assumed that the inner regex
|
||||
of REPETITION is of fixed length and doesn't contain registers."))
|
||||
|
||||
(defmethod create-greedy-constant-length-matcher ((repetition repetition)
|
||||
next-fn)
|
||||
@ -158,8 +157,8 @@ of fixed length and doesn't contain registers."))
|
||||
(maximum (maximum repetition))
|
||||
(regex (regex repetition))
|
||||
(min-rest (min-rest repetition)))
|
||||
(declare (type fixnum len min-rest)
|
||||
(type function next-fn))
|
||||
(declare (fixnum len min-rest)
|
||||
(function next-fn))
|
||||
(cond ((zerop len)
|
||||
;; inner regex has zero-length, so we can discard it
|
||||
;; completely
|
||||
@ -186,11 +185,8 @@ of fixed length and doesn't contain registers."))
|
||||
(char-class
|
||||
;; a character class
|
||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||
(if (invertedp regex)
|
||||
(greedy-constant-length-closure
|
||||
(not (char-class-test)))
|
||||
(greedy-constant-length-closure
|
||||
(char-class-test)))))
|
||||
(char-class-test))))
|
||||
(everything
|
||||
;; an EVERYTHING object, i.e. a dot
|
||||
(if (single-line-p regex)
|
||||
@ -202,17 +198,17 @@ of fixed length and doesn't contain registers."))
|
||||
;; just checks for immediate success, i.e. NEXT-FN is
|
||||
;; #'IDENTITY
|
||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||
(declare (type function inner-matcher))
|
||||
(declare (function inner-matcher))
|
||||
(greedy-constant-length-closure
|
||||
(funcall inner-matcher curr-pos)))))))))
|
||||
|
||||
(defgeneric create-greedy-no-zero-matcher (repetition next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is greedy and the minimal number of repetitions is
|
||||
zero. It is furthermore assumed that the inner regex of REPETITION can
|
||||
never match a zero-length string (or instead the maximal number of
|
||||
repetitions is 1)."))
|
||||
(:documentation "Creates a closure which tries to match REPETITION.
|
||||
It is assumed that REPETITION is greedy and the minimal number of
|
||||
repetitions is zero. It is furthermore assumed that the inner regex
|
||||
of REPETITION can never match a zero-length string \(or instead the
|
||||
maximal number of repetitions is 1)."))
|
||||
|
||||
(defmethod create-greedy-no-zero-matcher ((repetition repetition) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -220,7 +216,7 @@ repetitions is 1)."))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type function next-fn))
|
||||
(declare (function next-fn))
|
||||
(cond
|
||||
((eql maximum 1)
|
||||
;; this is essentially like the next case but with a known
|
||||
@ -230,7 +226,7 @@ repetitions is 1)."))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) next-fn))
|
||||
(lambda (start-pos)
|
||||
(declare (type function repeat-matcher))
|
||||
(declare (function repeat-matcher))
|
||||
(or (funcall repeat-matcher start-pos)
|
||||
(funcall next-fn start-pos))))
|
||||
(maximum
|
||||
@ -239,8 +235,8 @@ repetitions is 1)."))
|
||||
;; repetitions
|
||||
(let ((rep-num (incf-after *rep-num*)))
|
||||
(flet ((greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos maximum rep-num)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos maximum rep-num)
|
||||
(function repeat-matcher))
|
||||
;; the actual matcher which first tries to match the
|
||||
;; inner regex of REPETITION (if we haven't done so
|
||||
;; too often) and on failure calls NEXT-FN
|
||||
@ -259,15 +255,15 @@ repetitions is 1)."))
|
||||
;; the closure we return is just a thin wrapper around
|
||||
;; GREEDY-AUX to initialize the repetition counter
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0)
|
||||
(greedy-aux start-pos)))))
|
||||
(t
|
||||
;; easier code because we're not bounded by MAXIMUM, but
|
||||
;; basically the same
|
||||
(flet ((greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos)
|
||||
(function repeat-matcher))
|
||||
(or (funcall repeat-matcher start-pos)
|
||||
(funcall next-fn start-pos))))
|
||||
(setq repeat-matcher
|
||||
@ -276,9 +272,9 @@ repetitions is 1)."))
|
||||
|
||||
(defgeneric create-greedy-matcher (repetition next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is greedy and the minimal number of repetitions is
|
||||
zero."))
|
||||
(:documentation "Creates a closure which tries to match REPETITION.
|
||||
It is assumed that REPETITION is greedy and the minimal number of
|
||||
repetitions is zero."))
|
||||
|
||||
(defmethod create-greedy-matcher ((repetition repetition) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -290,8 +286,8 @@ zero."))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type fixnum zero-length-num)
|
||||
(type function next-fn))
|
||||
(declare (fixnum zero-length-num)
|
||||
(function next-fn))
|
||||
(cond
|
||||
(maximum
|
||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||
@ -302,8 +298,8 @@ zero."))
|
||||
;; the actual matcher which first tries to match the
|
||||
;; inner regex of REPETITION (if we haven't done so
|
||||
;; too often) and on failure calls NEXT-FN
|
||||
(declare (type fixnum start-pos maximum rep-num)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos maximum rep-num)
|
||||
(function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
@ -333,7 +329,7 @@ zero."))
|
||||
;; GREEDY-AUX to initialize the repetition counter and our
|
||||
;; slot in *LAST-POS-STORES*
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0
|
||||
(svref *last-pos-stores* zero-length-num) nil)
|
||||
(greedy-aux start-pos)))))
|
||||
@ -341,8 +337,8 @@ zero."))
|
||||
;; easier code because we're not bounded by MAXIMUM, but
|
||||
;; basically the same
|
||||
(flet ((greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos)
|
||||
(function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
@ -356,14 +352,14 @@ zero."))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'greedy-aux))
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(setf (svref *last-pos-stores* zero-length-num) nil)
|
||||
(greedy-aux start-pos)))))))
|
||||
|
||||
;; code for non-greedy repetitions with minimum zero
|
||||
|
||||
(defmacro non-greedy-constant-length-closure (check-curr-pos)
|
||||
"This is the template for simple non-greedy repetitions (where
|
||||
"This is the template for simple non-greedy repetitions \(where
|
||||
simple means that the minimum number of repetitions is zero, that the
|
||||
inner regex to be checked is of fixed length LEN, and that it doesn't
|
||||
contain registers, i.e. there's no need for backtracking).
|
||||
@ -371,7 +367,7 @@ CHECK-CURR-POS is a form which checks whether the inner regex of the
|
||||
repetition matches at CURR-POS."
|
||||
`(if maximum
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos maximum))
|
||||
(declare (fixnum start-pos maximum))
|
||||
;; because we know LEN we know in advance where to stop at the
|
||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||
;; minimal length of the part behind the repetition
|
||||
@ -389,7 +385,7 @@ repetition matches at CURR-POS."
|
||||
;; basically the same code; it's just a bit easier because we're
|
||||
;; not bounded by MAXIMUM
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((target-end-pos (1+ (- *end-pos* len min-rest))))
|
||||
(loop for curr-pos of-type fixnum from start-pos
|
||||
below target-end-pos
|
||||
@ -400,10 +396,10 @@ repetition matches at CURR-POS."
|
||||
|
||||
(defgeneric create-non-greedy-constant-length-matcher (repetition next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
||||
zero. It is furthermore assumed that the inner regex of REPETITION is
|
||||
of fixed length and doesn't contain registers."))
|
||||
(:documentation "Creates a closure which tries to match REPETITION.
|
||||
It is assumed that REPETITION is non-greedy and the minimal number of
|
||||
repetitions is zero. It is furthermore assumed that the inner regex
|
||||
of REPETITION is of fixed length and doesn't contain registers."))
|
||||
|
||||
(defmethod create-non-greedy-constant-length-matcher ((repetition repetition) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -411,8 +407,8 @@ of fixed length and doesn't contain registers."))
|
||||
(maximum (maximum repetition))
|
||||
(regex (regex repetition))
|
||||
(min-rest (min-rest repetition)))
|
||||
(declare (type fixnum len min-rest)
|
||||
(type function next-fn))
|
||||
(declare (fixnum len min-rest)
|
||||
(function next-fn))
|
||||
(cond ((zerop len)
|
||||
;; inner regex has zero-length, so we can discard it
|
||||
;; completely
|
||||
@ -439,11 +435,8 @@ of fixed length and doesn't contain registers."))
|
||||
(char-class
|
||||
;; a character class
|
||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||
(if (invertedp regex)
|
||||
(non-greedy-constant-length-closure
|
||||
(not (char-class-test)))
|
||||
(non-greedy-constant-length-closure
|
||||
(char-class-test)))))
|
||||
(char-class-test))))
|
||||
(everything
|
||||
(if (single-line-p regex)
|
||||
;; a dot which really can match everything; we rely
|
||||
@ -458,17 +451,17 @@ of fixed length and doesn't contain registers."))
|
||||
;; just checks for immediate success, i.e. NEXT-FN is
|
||||
;; #'IDENTITY
|
||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||
(declare (type function inner-matcher))
|
||||
(declare (function inner-matcher))
|
||||
(non-greedy-constant-length-closure
|
||||
(funcall inner-matcher curr-pos)))))))))
|
||||
|
||||
(defgeneric create-non-greedy-no-zero-matcher (repetition next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
||||
zero. It is furthermore assumed that the inner regex of REPETITION can
|
||||
never match a zero-length string (or instead the maximal number of
|
||||
repetitions is 1)."))
|
||||
(:documentation "Creates a closure which tries to match REPETITION.
|
||||
It is assumed that REPETITION is non-greedy and the minimal number of
|
||||
repetitions is zero. It is furthermore assumed that the inner regex
|
||||
of REPETITION can never match a zero-length string \(or instead the
|
||||
maximal number of repetitions is 1)."))
|
||||
|
||||
(defmethod create-non-greedy-no-zero-matcher ((repetition repetition) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -476,7 +469,7 @@ repetitions is 1)."))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after NON-GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type function next-fn))
|
||||
(declare (function next-fn))
|
||||
(cond
|
||||
((eql maximum 1)
|
||||
;; this is essentially like the next case but with a known
|
||||
@ -484,7 +477,7 @@ repetitions is 1)."))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) next-fn))
|
||||
(lambda (start-pos)
|
||||
(declare (type function repeat-matcher))
|
||||
(declare (function repeat-matcher))
|
||||
(or (funcall next-fn start-pos)
|
||||
(funcall repeat-matcher start-pos))))
|
||||
(maximum
|
||||
@ -496,8 +489,8 @@ repetitions is 1)."))
|
||||
;; the actual matcher which first calls NEXT-FN and
|
||||
;; on failure tries to match the inner regex of
|
||||
;; REPETITION (if we haven't done so too often)
|
||||
(declare (type fixnum start-pos maximum rep-num)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos maximum rep-num)
|
||||
(function repeat-matcher))
|
||||
(or (funcall next-fn start-pos)
|
||||
(and (< (aref *repeat-counters* rep-num) maximum)
|
||||
(incf (aref *repeat-counters* rep-num))
|
||||
@ -513,15 +506,15 @@ repetitions is 1)."))
|
||||
;; the closure we return is just a thin wrapper around
|
||||
;; NON-GREEDY-AUX to initialize the repetition counter
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0)
|
||||
(non-greedy-aux start-pos)))))
|
||||
(t
|
||||
;; easier code because we're not bounded by MAXIMUM, but
|
||||
;; basically the same
|
||||
(flet ((non-greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos)
|
||||
(function repeat-matcher))
|
||||
(or (funcall next-fn start-pos)
|
||||
(funcall repeat-matcher start-pos))))
|
||||
(setq repeat-matcher
|
||||
@ -530,9 +523,9 @@ repetitions is 1)."))
|
||||
|
||||
(defgeneric create-non-greedy-matcher (repetition next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
||||
zero."))
|
||||
(:documentation "Creates a closure which tries to match REPETITION.
|
||||
It is assumed that REPETITION is non-greedy and the minimal number of
|
||||
repetitions is zero."))
|
||||
|
||||
(defmethod create-non-greedy-matcher ((repetition repetition) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -544,8 +537,8 @@ zero."))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after NON-GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type fixnum zero-length-num)
|
||||
(type function next-fn))
|
||||
(declare (fixnum zero-length-num)
|
||||
(function next-fn))
|
||||
(cond
|
||||
(maximum
|
||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||
@ -556,8 +549,8 @@ zero."))
|
||||
;; the actual matcher which first calls NEXT-FN and
|
||||
;; on failure tries to match the inner regex of
|
||||
;; REPETITION (if we haven't done so too often)
|
||||
(declare (type fixnum start-pos maximum rep-num)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos maximum rep-num)
|
||||
(function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
@ -587,7 +580,7 @@ zero."))
|
||||
;; NON-GREEDY-AUX to initialize the repetition counter and our
|
||||
;; slot in *LAST-POS-STORES*
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0
|
||||
(svref *last-pos-stores* zero-length-num) nil)
|
||||
(non-greedy-aux start-pos)))))
|
||||
@ -595,8 +588,8 @@ zero."))
|
||||
;; easier code because we're not bounded by MAXIMUM, but
|
||||
;; basically the same
|
||||
(flet ((non-greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos)
|
||||
(function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
@ -611,7 +604,7 @@ zero."))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'non-greedy-aux))
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(setf (svref *last-pos-stores* zero-length-num) nil)
|
||||
(non-greedy-aux start-pos)))))))
|
||||
|
||||
@ -622,13 +615,13 @@ zero."))
|
||||
means that the inner regex to be checked is of fixed length LEN, and
|
||||
that it doesn't contain registers, i.e. there's no need for
|
||||
backtracking) and where constant means that MINIMUM is equal to
|
||||
MAXIMUM. CHECK-CURR-POS is a form which checks whether the inner regex
|
||||
of the repetition matches at CURR-POS."
|
||||
MAXIMUM. CHECK-CURR-POS is a form which checks whether the inner
|
||||
regex of the repetition matches at CURR-POS."
|
||||
`(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((target-end-pos (+ start-pos
|
||||
(the fixnum (* len repetitions)))))
|
||||
(declare (type fixnum target-end-pos))
|
||||
(declare (fixnum target-end-pos))
|
||||
;; first check if we won't go beyond the end of the string
|
||||
(and (>= *end-pos* target-end-pos)
|
||||
;; then loop through all repetitions step by step
|
||||
@ -642,10 +635,10 @@ of the repetition matches at CURR-POS."
|
||||
(defgeneric create-constant-repetition-constant-length-matcher
|
||||
(repetition next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION has a constant number of repetitions. It is
|
||||
furthermore assumed that the inner regex of REPETITION is of fixed
|
||||
length and doesn't contain registers."))
|
||||
(:documentation "Creates a closure which tries to match REPETITION.
|
||||
It is assumed that REPETITION has a constant number of repetitions.
|
||||
It is furthermore assumed that the inner regex of REPETITION is of
|
||||
fixed length and doesn't contain registers."))
|
||||
|
||||
(defmethod create-constant-repetition-constant-length-matcher
|
||||
((repetition repetition) next-fn)
|
||||
@ -653,8 +646,8 @@ length and doesn't contain registers."))
|
||||
(let ((len (len repetition))
|
||||
(repetitions (minimum repetition))
|
||||
(regex (regex repetition)))
|
||||
(declare (type fixnum len repetitions)
|
||||
(type function next-fn))
|
||||
(declare (fixnum len repetitions)
|
||||
(function next-fn))
|
||||
(if (zerop len)
|
||||
;; if the length is zero it suffices to try once
|
||||
(create-matcher-aux regex next-fn)
|
||||
@ -676,33 +669,29 @@ length and doesn't contain registers."))
|
||||
(if (case-insensitive-p regex)
|
||||
(constant-repetition-constant-length-closure
|
||||
(let ((next-pos (+ curr-pos len)))
|
||||
(declare (type fixnum next-pos))
|
||||
(declare (fixnum next-pos))
|
||||
(and (*string*-equal str curr-pos next-pos 0 len)
|
||||
next-pos)))
|
||||
(constant-repetition-constant-length-closure
|
||||
(let ((next-pos (+ curr-pos len)))
|
||||
(declare (type fixnum next-pos))
|
||||
(declare (fixnum next-pos))
|
||||
(and (*string*= str curr-pos next-pos 0 len)
|
||||
next-pos)))))))
|
||||
(char-class
|
||||
;; a character class
|
||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||
(if (invertedp regex)
|
||||
(constant-repetition-constant-length-closure
|
||||
(and (not (char-class-test))
|
||||
(1+ curr-pos)))
|
||||
(constant-repetition-constant-length-closure
|
||||
(and (char-class-test)
|
||||
(1+ curr-pos))))))
|
||||
(1+ curr-pos)))))
|
||||
(everything
|
||||
(if (single-line-p regex)
|
||||
;; a dot which really matches everything - we just have to
|
||||
;; advance the index into *STRING* accordingly and check
|
||||
;; if we didn't go past the end
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(let ((next-pos (+ start-pos repetitions)))
|
||||
(declare (type fixnum next-pos))
|
||||
(declare (fixnum next-pos))
|
||||
(and (<= next-pos *end-pos*)
|
||||
(funcall next-fn next-pos))))
|
||||
;; a dot which is not in single-line-mode - make sure we
|
||||
@ -714,14 +703,14 @@ length and doesn't contain registers."))
|
||||
;; the general case - we build an inner matcher which just
|
||||
;; checks for immediate success, i.e. NEXT-FN is #'IDENTITY
|
||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||
(declare (type function inner-matcher))
|
||||
(declare (function inner-matcher))
|
||||
(constant-repetition-constant-length-closure
|
||||
(funcall inner-matcher curr-pos))))))))
|
||||
|
||||
(defgeneric create-constant-repetition-matcher (repetition next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(:documentation "Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION has a constant number of repetitions."))
|
||||
(:documentation "Creates a closure which tries to match REPETITION.
|
||||
It is assumed that REPETITION has a constant number of repetitions."))
|
||||
|
||||
(defmethod create-constant-repetition-matcher ((repetition repetition) next-fn)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
@ -732,20 +721,20 @@ that REPETITION has a constant number of repetitions."))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after NON-GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type fixnum repetitions rep-num)
|
||||
(type function next-fn))
|
||||
(declare (fixnum repetitions rep-num)
|
||||
(function next-fn))
|
||||
(if (zerop (min-len repetition))
|
||||
;; we make a reservation for our slot in *LAST-POS-STORES*
|
||||
;; because we have to watch out for needless loops as the inner
|
||||
;; regex might match zero-length strings
|
||||
(let ((zero-length-num (incf-after *zero-length-num*)))
|
||||
(declare (type fixnum zero-length-num))
|
||||
(declare (fixnum zero-length-num))
|
||||
(flet ((constant-aux (start-pos)
|
||||
;; the actual matcher which first calls NEXT-FN and
|
||||
;; on failure tries to match the inner regex of
|
||||
;; REPETITION (if we haven't done so too often)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos)
|
||||
(function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
@ -778,15 +767,15 @@ that REPETITION has a constant number of repetitions."))
|
||||
;; the closure we return is just a thin wrapper around
|
||||
;; CONSTANT-AUX to initialize the repetition counter
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0
|
||||
(aref *last-pos-stores* zero-length-num) nil)
|
||||
(constant-aux start-pos))))
|
||||
;; easier code because we don't have to care about zero-length
|
||||
;; matches but basically the same
|
||||
(flet ((constant-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(declare (fixnum start-pos)
|
||||
(function repeat-matcher))
|
||||
(cond ((< (aref *repeat-counters* rep-num) repetitions)
|
||||
(incf (aref *repeat-counters* rep-num))
|
||||
(prog1
|
||||
@ -796,7 +785,7 @@ that REPETITION has a constant number of repetitions."))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'constant-aux))
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0)
|
||||
(constant-aux start-pos))))))
|
||||
|
||||
|
||||
63
scanner.lisp
63
scanner.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.29 2008/06/25 14:04:28 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.34 2008/07/06 18:12:05 edi Exp $
|
||||
|
||||
;;; Here the scanner for the actual regex as well as utility scanners
|
||||
;;; for the constant start and end strings are created.
|
||||
@ -30,13 +30,13 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defmacro bmh-matcher-aux (&key case-insensitive-p)
|
||||
"Auxiliary macro used by CREATE-BMH-MATCHER."
|
||||
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
||||
`(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(if (or (minusp start-pos)
|
||||
(> (the fixnum (+ start-pos m)) *end-pos*))
|
||||
nil
|
||||
@ -53,21 +53,21 @@
|
||||
(return-from bmh-matcher (1+ i)))))))))
|
||||
|
||||
(defun create-bmh-matcher (pattern case-insensitive-p)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Returns a Boyer-Moore-Horspool matcher which searches the (special)
|
||||
simple-string *STRING* for the first occurence of the substring
|
||||
PATTERN. The search starts at the position START-POS within *STRING*
|
||||
and stops before *END-POS* is reached. Depending on the second
|
||||
argument the search is case-insensitive or not. If the special
|
||||
variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
|
||||
instead. (BMH matchers are faster but need much more space.)"
|
||||
instead. \(BMH matchers are faster but need much more space.)"
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
|
||||
;; details
|
||||
(unless *use-bmh-matchers*
|
||||
(let ((test (if case-insensitive-p #'char-equal #'char=)))
|
||||
(return-from create-bmh-matcher
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (not (minusp start-pos))
|
||||
(search pattern
|
||||
*string*
|
||||
@ -78,7 +78,7 @@ instead. (BMH matchers are faster but need much more space.)"
|
||||
(skip (make-array *regex-char-code-limit*
|
||||
:element-type 'fixnum
|
||||
:initial-element m)))
|
||||
(declare (type fixnum m))
|
||||
(declare (fixnum m))
|
||||
(loop for k of-type fixnum below m
|
||||
if case-insensitive-p
|
||||
do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
|
||||
@ -93,29 +93,28 @@ instead. (BMH matchers are faster but need much more space.)"
|
||||
"Auxiliary macro used by CREATE-CHAR-SEARCHER."
|
||||
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
||||
`(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(declare (fixnum start-pos))
|
||||
(and (not (minusp start-pos))
|
||||
(loop for i of-type fixnum from start-pos below *end-pos*
|
||||
thereis (and (,char-compare (schar *string* i) chr) i))))))
|
||||
|
||||
(defun create-char-searcher (chr case-insensitive-p)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Returns a function which searches the (special) simple-string
|
||||
*STRING* for the first occurence of the character CHR. The search
|
||||
starts at the position START-POS within *STRING* and stops before
|
||||
*END-POS* is reached. Depending on the second argument the search is
|
||||
case-insensitive or not."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(if case-insensitive-p
|
||||
(char-searcher-aux :case-insensitive-p t)
|
||||
(char-searcher-aux)))
|
||||
|
||||
(declaim (inline newline-skipper))
|
||||
|
||||
(defun newline-skipper (start-pos)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum start-pos))
|
||||
"Find the next occurence of a character in *STRING* which is behind
|
||||
"Finds the next occurence of a character in *STRING* which is behind
|
||||
a #\Newline."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (fixnum start-pos))
|
||||
;; we can start with (1- START-POS) without testing for (PLUSP
|
||||
;; START-POS) because we know we'll never call NEWLINE-SKIPPER on
|
||||
;; the first iteration
|
||||
@ -159,8 +158,8 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
||||
nil))
|
||||
;; we don't need to try further than MAX-END-POS
|
||||
(max-end-pos (- *end-pos* min-len)))
|
||||
(declare (type fixnum scan-start-pos)
|
||||
(type function match-fn))
|
||||
(declare (fixnum scan-start-pos)
|
||||
(function match-fn))
|
||||
;; definition of ADVANCE-FN will be inserted here by macrology
|
||||
(labels ((advance-fn-definition))
|
||||
(declare (inline advance-fn))
|
||||
@ -185,8 +184,8 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
||||
;; is anchored at the very end of the target string
|
||||
;; (perhaps modulo a #\Newline)
|
||||
(let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
|
||||
(declare (type fixnum end-test-pos)
|
||||
(type function end-string-test))
|
||||
(declare (fixnum end-test-pos)
|
||||
(function end-string-test))
|
||||
(unless (setq *end-string-pos* (funcall end-string-test
|
||||
end-test-pos))
|
||||
(when (and (= 1 (the fixnum end-anchored-p))
|
||||
@ -223,7 +222,7 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
||||
(return-from scan nil))
|
||||
(when starts-with-str
|
||||
(locally
|
||||
(declare (type fixnum starts-with-len))
|
||||
(declare (fixnum starts-with-len))
|
||||
(cond ((and (case-insensitive-p starts-with)
|
||||
(not (*string*-equal starts-with-str
|
||||
*start-pos*
|
||||
@ -321,10 +320,10 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
||||
rep-num
|
||||
zero-length-num
|
||||
reg-num)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (type fixnum min-len zero-length-num rep-num reg-num))
|
||||
"Auxiliary function to create and return a scanner \(which is
|
||||
actually a closure). Used by CREATE-SCANNER."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(declare (fixnum min-len zero-length-num rep-num reg-num))
|
||||
(let ((starts-with-len (if (typep starts-with 'str)
|
||||
(len starts-with)))
|
||||
(starts-with-everything (typep starts-with 'everything)))
|
||||
@ -341,8 +340,8 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
;; left)
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type fixnum end-string-offset starts-with-len)
|
||||
(type function start-string-test end-string-test))
|
||||
(declare (fixnum end-string-offset starts-with-len)
|
||||
(function start-string-test end-string-test))
|
||||
(loop
|
||||
(unless (setq pos (funcall start-string-test pos))
|
||||
;; give up completely if we can't find a start string
|
||||
@ -350,7 +349,7 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
(return-from scan nil))
|
||||
(locally
|
||||
;; from here we know that POS is a FIXNUM
|
||||
(declare (type fixnum pos))
|
||||
(declare (fixnum pos))
|
||||
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
||||
;; if we already found an end string candidate the
|
||||
;; position of which matches the start string
|
||||
@ -369,7 +368,7 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
;; according to the end string candidate
|
||||
(let ((new-pos (- (the fixnum *end-string-pos*)
|
||||
end-string-offset)))
|
||||
(declare (type fixnum new-pos *end-string-pos*))
|
||||
(declare (fixnum new-pos *end-string-pos*))
|
||||
(cond ((= new-pos pos)
|
||||
;; if POS and NEW-POS are equal then the
|
||||
;; two candidates agree so we're fine
|
||||
@ -394,15 +393,15 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
;; offset (from the left)
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type fixnum end-string-offset)
|
||||
(type function end-string-test))
|
||||
(declare (fixnum end-string-offset)
|
||||
(function end-string-test))
|
||||
(loop
|
||||
(unless (setq pos (newline-skipper pos))
|
||||
;; if we can't find a #\Newline we give up immediately
|
||||
(return-from scan nil))
|
||||
(locally
|
||||
;; from here we know that POS is a FIXNUM
|
||||
(declare (type fixnum pos))
|
||||
(declare (fixnum pos))
|
||||
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
||||
;; if we already found an end string candidate the
|
||||
;; position of which matches the place behind the
|
||||
@ -420,7 +419,7 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
;; according to the end string candidate
|
||||
(let ((new-pos (- (the fixnum *end-string-pos*)
|
||||
end-string-offset)))
|
||||
(declare (type fixnum new-pos *end-string-pos*))
|
||||
(declare (fixnum new-pos *end-string-pos*))
|
||||
(cond ((= new-pos pos)
|
||||
;; if POS and NEW-POS are equal then the
|
||||
;; the end string candidate agrees with
|
||||
@ -446,7 +445,7 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
;; information to advance POS
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type function start-string-test end-string-test))
|
||||
(declare (function start-string-test end-string-test))
|
||||
(unless (setq pos (funcall start-string-test pos))
|
||||
(return-from scan nil))
|
||||
(if (<= (the fixnum pos)
|
||||
@ -463,7 +462,7 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
;; enough information to advance POS
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type function end-string-test))
|
||||
(declare (function end-string-test))
|
||||
(unless (setq pos (newline-skipper pos))
|
||||
(return-from scan nil))
|
||||
(if (<= (the fixnum pos)
|
||||
@ -476,7 +475,7 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
;; just check for constant start string candidate
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type function start-string-test))
|
||||
(declare (function start-string-test))
|
||||
(unless (setq pos (funcall start-string-test pos))
|
||||
(return-from scan nil))
|
||||
pos)))
|
||||
@ -492,7 +491,7 @@ actually a closure). Used by CREATE-SCANNER."
|
||||
;; advanced beyond the last one
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type function end-string-test))
|
||||
(declare (function end-string-test))
|
||||
(if (<= (the fixnum pos)
|
||||
(the fixnum *end-string-pos*))
|
||||
(return-from advance-fn pos))
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.27 2008/07/03 08:13:28 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.40 2008/07/23 02:14:06 edi Exp $
|
||||
|
||||
;;; globally declared special variables
|
||||
|
||||
@ -29,7 +29,7 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
;;; special variables used to effect declarations
|
||||
|
||||
@ -51,7 +51,7 @@
|
||||
|
||||
(defvar *extended-mode-p* nil
|
||||
"Whether the parser will start in extended mode.")
|
||||
(declaim (type boolean *extended-mode-p*))
|
||||
(declaim (boolean *extended-mode-p*))
|
||||
|
||||
;;; special variables used by the SCAN function and the matchers
|
||||
|
||||
@ -60,16 +60,16 @@
|
||||
occur in character classes. Change this value BEFORE creating
|
||||
scanners if you don't need the \(full) Unicode support of
|
||||
implementations like AllegroCL, CLISP, LispWorks, or SBCL.")
|
||||
(declaim (type fixnum *regex-char-code-limit*))
|
||||
(declaim (fixnum *regex-char-code-limit*))
|
||||
|
||||
(defvar *string* ""
|
||||
"The string which is currently scanned by SCAN.
|
||||
Will always be coerced to a SIMPLE-STRING.")
|
||||
(declaim (type simple-string *string*))
|
||||
(declaim (simple-string *string*))
|
||||
|
||||
(defvar *start-pos* 0
|
||||
"Where to start scanning within *STRING*.")
|
||||
(declaim (type fixnum *start-pos*))
|
||||
(declaim (fixnum *start-pos*))
|
||||
|
||||
(defvar *real-start-pos* nil
|
||||
"The real start of *STRING*. This is for repeated scans and is only used internally.")
|
||||
@ -77,22 +77,22 @@ Will always be coerced to a SIMPLE-STRING.")
|
||||
|
||||
(defvar *end-pos* 0
|
||||
"Where to stop scanning within *STRING*.")
|
||||
(declaim (type fixnum *end-pos*))
|
||||
(declaim (fixnum *end-pos*))
|
||||
|
||||
(defvar *reg-starts* (make-array 0)
|
||||
"An array which holds the start positions
|
||||
of the current register candidates.")
|
||||
(declaim (type simple-vector *reg-starts*))
|
||||
(declaim (simple-vector *reg-starts*))
|
||||
|
||||
(defvar *regs-maybe-start* (make-array 0)
|
||||
"An array which holds the next start positions
|
||||
of the current register candidates.")
|
||||
(declaim (type simple-vector *regs-maybe-start*))
|
||||
(declaim (simple-vector *regs-maybe-start*))
|
||||
|
||||
(defvar *reg-ends* (make-array 0)
|
||||
"An array which holds the end positions
|
||||
of the current register candidates.")
|
||||
(declaim (type simple-vector *reg-ends*))
|
||||
(declaim (simple-vector *reg-ends*))
|
||||
|
||||
(defvar *end-string-pos* nil
|
||||
"Start of the next possible end-string candidate.")
|
||||
@ -100,12 +100,12 @@ of the current register candidates.")
|
||||
(defvar *rep-num* 0
|
||||
"Counts the number of \"complicated\" repetitions while the matchers
|
||||
are built.")
|
||||
(declaim (type fixnum *rep-num*))
|
||||
(declaim (fixnum *rep-num*))
|
||||
|
||||
(defvar *zero-length-num* 0
|
||||
"Counts the number of repetitions the inner regexes of which may
|
||||
have zero-length while the matchers are built.")
|
||||
(declaim (type fixnum *zero-length-num*))
|
||||
(declaim (fixnum *zero-length-num*))
|
||||
|
||||
(defvar *repeat-counters* (make-array 0
|
||||
:initial-element 0
|
||||
@ -118,12 +118,27 @@ repetitive patterns have been tested already.")
|
||||
"An array to keep track of the last positions
|
||||
where we saw repetitive patterns.
|
||||
Only used for patterns which might have zero length.")
|
||||
(declaim (type simple-vector *last-pos-stores*))
|
||||
(declaim (simple-vector *last-pos-stores*))
|
||||
|
||||
(defvar *use-bmh-matchers* t
|
||||
"Whether the scanners created by CREATE-SCANNER should use the \(fast
|
||||
but large) Boyer-Moore-Horspool matchers.")
|
||||
|
||||
(defvar *optimize-char-classes* nil
|
||||
"Whether character classes should be compiled into look-ups into
|
||||
O\(1) data structures. This is usually fast but will be costly in
|
||||
terms of scanner creation time and might be costly in terms of size if
|
||||
*REGEX-CHAR-CODE-LIMIT* is high. This value will be used as the :KIND
|
||||
keyword argument to CREATE-OPTIMIZED-TEST-FUNCTION - see there for the
|
||||
possible non-NIL values.")
|
||||
|
||||
(defvar *property-resolver* nil
|
||||
"Should be NIL or a designator for a function which accepts strings
|
||||
and returns unary character test functions or NIL. This 'resolver' is
|
||||
intended to handle `character properties' like \\p{IsAlpha}. If
|
||||
*PROPERTY-RESOLVER* is NIL, then the parser will simply treat \\p and
|
||||
\\P as #\\p and #\\P as in older versions of CL-PPCRE.")
|
||||
|
||||
(defvar *allow-quoting* nil
|
||||
"Whether the parser should support Perl's \\Q and \\E.")
|
||||
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/lispworks-defsystem.lisp,v 1.4 2008/06/25 14:04:27 edi Exp $
|
||||
|
||||
;;; This system definition for LispWorks was kindly provided by Wade Humeniuk
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/packages.lisp,v 1.3 2008/07/22 12:58:52 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
@ -29,29 +27,11 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-user)
|
||||
(in-package :cl-user)
|
||||
|
||||
(defparameter *cl-ppcre-base-directory*
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
|
||||
(defsystem cl-ppcre
|
||||
(:default-pathname *cl-ppcre-base-directory*
|
||||
:default-type :lisp-file)
|
||||
:members ("packages"
|
||||
"specials"
|
||||
"util"
|
||||
"errors"
|
||||
"lexer"
|
||||
"parser"
|
||||
"regex-class"
|
||||
"convert"
|
||||
"optimize"
|
||||
"closures"
|
||||
"repetition-closures"
|
||||
"scanner"
|
||||
"api")
|
||||
:rules ((:in-order-to :compile :all
|
||||
(:requires (:load :previous)))
|
||||
(:in-order-to :load :all
|
||||
(:requires (:load :previous)))))
|
||||
(defpackage :cl-ppcre-test
|
||||
#+genera (:shadowing-import-from :common-lisp :lambda)
|
||||
(:use #-:genera :cl #+:genera :future-common-lisp :cl-ppcre)
|
||||
(:import-from :cl-ppcre :*standard-optimize-settings*
|
||||
:string-list-to-simple-string)
|
||||
(:export :run-all-tests :unicode-test))
|
||||
150
test/perl-tests.lisp
Normal file
150
test/perl-tests.lisp
Normal file
@ -0,0 +1,150 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/perl-tests.lisp,v 1.7 2008/07/22 23:02:04 edi Exp $
|
||||
|
||||
;;; The tests in this file test CL-PPCRE against testdata generated by
|
||||
;;; the Perl program `perltest.pl' from the input file `testinput' in
|
||||
;;; order to check compatibility with Perl and correctness of the
|
||||
;;; regex engine.
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-ppcre-test)
|
||||
|
||||
(defvar *tests-to-skip* '(662 790 1439)
|
||||
"Some tests we skip because the testdata is generated by a Perl
|
||||
program and CL-PPCRE differs from Perl for these tests - on purpose.")
|
||||
|
||||
(defun create-string-from-input (input)
|
||||
"Converts INPUT to a string which can be used in TEST below. The
|
||||
input file `testdata' encodes strings containing non-printable
|
||||
characters as lists where those characters are represented by their
|
||||
character code."
|
||||
(etypecase input
|
||||
((or null string) input)
|
||||
(list (string-list-to-simple-string
|
||||
(loop for element in input
|
||||
if (stringp element)
|
||||
collect element
|
||||
else
|
||||
collect (string (code-char element)))))))
|
||||
|
||||
(defun perl-test (&key (file-name
|
||||
(make-pathname :name "perltestdata"
|
||||
:type nil :version nil
|
||||
:defaults *this-file*)
|
||||
file-name-provided-p)
|
||||
(external-format '(:latin-1 :eol-style :lf))
|
||||
verbose)
|
||||
"Loops through all test cases in FILE-NAME and prints a report if
|
||||
VERBOSE is true. EXTERNAL-FORMAT is the FLEXI-STREAMS external format
|
||||
which is used to read the file. Returns a true value if all tests
|
||||
succeeded.
|
||||
|
||||
For the syntax of the tests in FILE-NAME refer to the source code of
|
||||
this function and to the Perl script perltest.pl which generates such
|
||||
test files."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(with-open-file (binary-stream file-name :element-type 'flex:octet)
|
||||
(let ((stream (flex:make-flexi-stream binary-stream :external-format external-format))
|
||||
;; the standard Perl tests don't need full Unicode support
|
||||
(*regex-char-code-limit* (if file-name-provided-p *regex-char-code-limit* 256))
|
||||
;; we need this for the standard test suite or otherwise we
|
||||
;; might get stack overflows
|
||||
(*optimize-char-classes* (if file-name-provided-p *optimize-char-classes* :charmap))
|
||||
;; we only check for correctness and don't care about speed
|
||||
;; that match (but rather about space constraints of the
|
||||
;; trial versions)
|
||||
(*use-bmh-matchers* (if file-name-provided-p *use-bmh-matchers* nil))
|
||||
;; some tests in the Perl suite explicitly check for this
|
||||
(*allow-quoting* (if file-name-provided-p *allow-quoting* t)))
|
||||
(do-tests ((format nil "Running tests in file ~S" (file-namestring file-name))
|
||||
(not verbose))
|
||||
(let ((input-line (or (read stream nil) (done)))
|
||||
errors)
|
||||
(destructuring-bind (counter
|
||||
info-string%
|
||||
regex%
|
||||
case-insensitive-mode
|
||||
multi-line-mode
|
||||
single-line-mode
|
||||
extended-mode
|
||||
target%
|
||||
perl-error
|
||||
expected-result%
|
||||
expected-registers)
|
||||
input-line
|
||||
(destructuring-bind (info-string regex target expected-result)
|
||||
(mapcar 'create-string-from-input
|
||||
(list info-string% regex% target% expected-result%))
|
||||
(setq expected-registers (mapcar 'create-string-from-input expected-registers))
|
||||
(unless (find counter *tests-to-skip* :test #'=)
|
||||
(when verbose
|
||||
(format t "~&~4D: ~S" counter info-string))
|
||||
(let ((scanner
|
||||
(handler-bind ((error (lambda (condition)
|
||||
(declare (ignore condition))
|
||||
(when perl-error
|
||||
;; we expected an
|
||||
;; error, so we can
|
||||
;; signal success
|
||||
(return-from test-block)))))
|
||||
(create-scanner regex
|
||||
:case-insensitive-mode case-insensitive-mode
|
||||
:multi-line-mode multi-line-mode
|
||||
:single-line-mode single-line-mode
|
||||
:extended-mode extended-mode))))
|
||||
(block test-block
|
||||
(multiple-value-bind (start end reg-starts reg-ends)
|
||||
(scan scanner target)
|
||||
(cond (perl-error
|
||||
(push (format nil "expected an error but got a result.")
|
||||
errors))
|
||||
(t
|
||||
(when (not (eq start expected-result))
|
||||
(if start
|
||||
(let ((result (subseq target start end)))
|
||||
(unless (string= result expected-result)
|
||||
(push (format nil "expected ~S but got ~S."
|
||||
expected-result result)
|
||||
errors))
|
||||
(setq reg-starts (coerce reg-starts 'list)
|
||||
reg-ends (coerce reg-ends 'list))
|
||||
(loop for i from 0
|
||||
for expected-register in expected-registers
|
||||
for reg-start = (nth i reg-starts)
|
||||
for reg-end = (nth i reg-ends)
|
||||
for register = (if (and reg-start reg-end)
|
||||
(subseq target reg-start reg-end)
|
||||
nil)
|
||||
unless (string= expected-register register)
|
||||
do (push (format nil "\\~A: expected ~S but got ~S."
|
||||
(1+ i) expected-register register)
|
||||
errors)))
|
||||
(push (format nil "expected ~S but got ~S."
|
||||
expected-result start)
|
||||
errors))))))
|
||||
errors))))))))))
|
||||
18
perltest.pl → test/perltest.pl
Executable file → Normal file
18
perltest.pl → test/perltest.pl
Executable file → Normal file
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/perl
|
||||
# $Header: /usr/local/cvsrep/cl-ppcre/test/perltest.pl,v 1.1 2008/07/06 21:24:39 edi Exp $
|
||||
|
||||
# This is a heavily modified version of the file 'perltest' which
|
||||
# comes with the PCRE library package, which is open source software,
|
||||
@ -8,8 +9,6 @@
|
||||
# The PCRE library package is available from
|
||||
# <ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/>
|
||||
|
||||
use Time::HiRes qw(time);
|
||||
|
||||
sub string_for_lisp {
|
||||
my(@a, $t, $in_string, $switch);
|
||||
|
||||
@ -48,8 +47,6 @@ sub string_for_lisp {
|
||||
}
|
||||
}
|
||||
|
||||
$min_time = shift;
|
||||
|
||||
NEXT_RE: while (1) {
|
||||
last
|
||||
if !($_ = <>);
|
||||
@ -132,8 +129,6 @@ if (\$x =~ ${pattern}) {
|
||||
};
|
||||
END
|
||||
|
||||
$times = 1;
|
||||
$used = 0;
|
||||
$counter++;
|
||||
print STDERR "$counter\n";
|
||||
|
||||
@ -141,18 +136,9 @@ END
|
||||
$error = 't';
|
||||
} else {
|
||||
$error = 'nil';
|
||||
if ($min_time) {
|
||||
$times = 10;
|
||||
while (1) {
|
||||
$used = &$test($times);
|
||||
last
|
||||
if $used > $min_time;
|
||||
$times *= 10;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
print "($counter $info_string \"$pattern_for_lisp\" $case_insensitive_mode $multi_line_mode $single_line_mode $extended_mode " . string_for_lisp($x) . " $error $times $used ";
|
||||
print "($counter $info_string \"$pattern_for_lisp\" $case_insensitive_mode $multi_line_mode $single_line_mode $extended_mode " . string_for_lisp($x) . " $error ";
|
||||
if (!@subs) {
|
||||
print 'nil nil';
|
||||
} else {
|
||||
File diff suppressed because one or more lines are too long
349
test/simple
Normal file
349
test/simple
Normal file
@ -0,0 +1,349 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/simple,v 1.9 2008/07/23 00:48:00 edi Exp $
|
||||
|
||||
;;; some simple tests for CL-PPCRE - entered manually and to be read
|
||||
;;; in the CL-PPCRE-TEST package; all forms are expected to return a
|
||||
;;; true value on success when EVALuated
|
||||
|
||||
(equalp (multiple-value-list (scan "(a)*b" "xaaabd"))
|
||||
(list 1 5 #(3) #(4)))
|
||||
|
||||
(equalp (multiple-value-list (scan "(a)*b" "xaaabd" :start 1))
|
||||
(list 1 5 #(3) #(4)))
|
||||
|
||||
(equalp (multiple-value-list (scan "(a)*b" "xaaabd" :start 2))
|
||||
(list 2 5 #(3) #(4)))
|
||||
|
||||
(null (scan "(a)*b" "xaaabd" :end 4))
|
||||
|
||||
(equalp (multiple-value-list (scan '(:greedy-repetition 0 nil #\b) "bbbc"))
|
||||
(list 0 3 #() #()))
|
||||
|
||||
(null (scan '(:greedy-repetition 4 6 #\b) "bbbc"))
|
||||
|
||||
(let ((s (create-scanner "(([a-c])+)x")))
|
||||
(equalp (multiple-value-list (scan s "abcxy"))
|
||||
(list 0 4 #(0 2) #(3 3))))
|
||||
|
||||
(equalp (multiple-value-list (scan-to-strings "[^b]*b" "aaabd"))
|
||||
(list "aaab" #()))
|
||||
|
||||
(equalp (multiple-value-list (scan-to-strings "([^b])*b" "aaabd"))
|
||||
(list "aaab" #("a")))
|
||||
|
||||
(equalp (multiple-value-list (scan-to-strings "(([^b])*)b" "aaabd"))
|
||||
(list "aaab" #("aaa" "a")))
|
||||
|
||||
(equalp (register-groups-bind (first second third fourth)
|
||||
("((a)|(b)|(c))+" "abababc" :sharedp t)
|
||||
(list first second third fourth))
|
||||
(list "c" "a" "b" "c"))
|
||||
|
||||
(equalp (register-groups-bind (nil second third fourth)
|
||||
("((a)|(b)|(c))()+" "abababc" :start 6)
|
||||
(list second third fourth))
|
||||
(list nil nil "c"))
|
||||
|
||||
(null (register-groups-bind (first)
|
||||
("(a|b)+" "accc" :start 1)
|
||||
first))
|
||||
|
||||
(equalp (register-groups-bind (fname lname (#'parse-integer date month year))
|
||||
("(\\w+)\\s+(\\w+)\\s+(\\d{1,2})\\.(\\d{1,2})\\.(\\d{4})" "Frank Zappa 21.12.1940")
|
||||
(list fname lname (encode-universal-time 0 0 0 date month year 0)))
|
||||
(list "Frank" "Zappa" 1292889600))
|
||||
|
||||
(flet ((foo (regex target-string &key (start 0) (end (length target-string)))
|
||||
(let ((sum 0))
|
||||
(do-matches (s e regex target-string nil :start start :end end)
|
||||
(incf sum (- e s)))
|
||||
(/ sum (- end start)))))
|
||||
(and (= 1/3 (foo "a" "abcabcabc"))
|
||||
(= 5/9 (foo "aa|b" "aacabcbbc"))))
|
||||
|
||||
(labels ((crossfoot (target-string &key (start 0) (end (length target-string)))
|
||||
(let ((sum 0))
|
||||
(do-matches-as-strings (m :digit-class
|
||||
target-string nil
|
||||
:start start :end end)
|
||||
(incf sum (parse-integer m)))
|
||||
(if (< sum 10)
|
||||
sum
|
||||
(crossfoot (format nil "~A" sum))))))
|
||||
(and (zerop (crossfoot "bar"))
|
||||
(= 3 (crossfoot "a3x"))
|
||||
(= 6 (crossfoot "12345"))))
|
||||
|
||||
(let (result)
|
||||
(do-register-groups (first second third fourth)
|
||||
("((a)|(b)|(c))" "abababc" nil :start 2 :sharedp t)
|
||||
(push (list first second third fourth) result))
|
||||
(equal (nreverse result)
|
||||
'(("a" "a" nil nil)
|
||||
("b" nil "b" nil)
|
||||
("a" "a" nil nil)
|
||||
("b" nil "b" nil)
|
||||
("c" nil nil "c"))))
|
||||
|
||||
(let (result)
|
||||
(do-register-groups ((#'parse-integer n) (#'intern sign) whitespace)
|
||||
("(\\d+)|(\\+|-|\\*|/)|(\\s+)" "12*15 - 42/3")
|
||||
(unless whitespace
|
||||
(push (or n sign) result)))
|
||||
(equal (nreverse result)
|
||||
'(12 * 15 - 42 / 3)))
|
||||
|
||||
(equal (all-matches "a" "foo bar baz")
|
||||
(list 5 6 9 10))
|
||||
|
||||
(equal (all-matches "\\w*" "foo bar baz")
|
||||
(list 0 3 3 3 4 7 7 7 8 11 11 11))
|
||||
|
||||
(equal (all-matches-as-strings "a" "foo bar baz")
|
||||
(list "a" "a"))
|
||||
|
||||
(equal (all-matches-as-strings "\\w*" "foo bar baz")
|
||||
(list "foo" "" "bar" "" "baz" ""))
|
||||
|
||||
(equal (split "\\s+" "foo bar baz
|
||||
frob")
|
||||
'("foo" "bar" "baz" "frob"))
|
||||
|
||||
(equal (split "\\s*" "foo bar baz")
|
||||
'("f" "o" "o" "b" "a" "r" "b" "a" "z"))
|
||||
|
||||
(equal (split "(\\s+)" "foo bar baz")
|
||||
'("foo" "bar" "baz"))
|
||||
|
||||
(equal (split "(\\s+)" "foo bar baz" :with-registers-p t)
|
||||
'("foo" " " "bar" " " "baz"))
|
||||
|
||||
(equal (split "(\\s)(\\s*)" "foo bar baz" :with-registers-p t)
|
||||
'("foo" " " "" "bar" " " " " "baz"))
|
||||
|
||||
(equal (split "(,)|(;)" "foo,bar;baz" :with-registers-p t)
|
||||
'("foo" "," nil "bar" nil ";" "baz"))
|
||||
|
||||
(equal (split "(,)|(;)" "foo,bar;baz" :with-registers-p t :omit-unmatched-p t)
|
||||
'("foo" "," "bar" ";" "baz"))
|
||||
|
||||
(equal (split ":" "a:b:c:d:e:f:g::")
|
||||
'("a" "b" "c" "d" "e" "f" "g"))
|
||||
|
||||
(equal (split ":" "a:b:c:d:e:f:g::" :limit 1)
|
||||
'("a:b:c:d:e:f:g::"))
|
||||
|
||||
(equal (split ":" "a:b:c:d:e:f:g::" :limit 2)
|
||||
'("a" "b:c:d:e:f:g::"))
|
||||
|
||||
(equal (split ":" "a:b:c:d:e:f:g::" :limit 3)
|
||||
'("a" "b" "c:d:e:f:g::"))
|
||||
|
||||
(equal (split ":" "a:b:c:d:e:f:g::" :limit 1000)
|
||||
'("a" "b" "c" "d" "e" "f" "g" "" ""))
|
||||
|
||||
(equal (multiple-value-list (regex-replace "fo+" "foo bar" "frob"))
|
||||
(list "frob bar" t))
|
||||
|
||||
(equal (multiple-value-list (regex-replace "fo+" "FOO bar" "frob"))
|
||||
(list "FOO bar" nil))
|
||||
|
||||
(equal (multiple-value-list (regex-replace "(?i)fo+" "FOO bar" "frob"))
|
||||
(list "frob bar" t))
|
||||
|
||||
(equal (multiple-value-list (regex-replace "(?i)fo+" "FOO bar" "frob" :preserve-case t))
|
||||
(list "FROB bar" t))
|
||||
|
||||
(equal (multiple-value-list (regex-replace "(?i)fo+" "Foo bar" "frob" :preserve-case t))
|
||||
(list "Frob bar" t))
|
||||
|
||||
(equal (multiple-value-list (regex-replace "bar" "foo bar baz" "[frob (was '\\&' between '\\`' and '\\'')]"))
|
||||
(list "foo [frob (was 'bar' between 'foo ' and ' baz')] baz" t))
|
||||
|
||||
(equal (multiple-value-list
|
||||
(regex-replace "bar" "foo bar baz"
|
||||
'("[frob (was '" :match "' between '" :before-match "' and '" :after-match "')]")))
|
||||
(list "foo [frob (was 'bar' between 'foo ' and ' baz')] baz" t))
|
||||
|
||||
(equal (multiple-value-list (regex-replace "(be)(nev)(o)(lent)"
|
||||
"benevolent: adj. generous, kind"
|
||||
(lambda (match &rest registers)
|
||||
(format nil "~A [~{~A~^.~}]" match registers))
|
||||
:simple-calls t))
|
||||
(list "benevolent [be.nev.o.lent]: adj. generous, kind" t))
|
||||
|
||||
(equal (multiple-value-list (regex-replace-all "(?i)fo+" "foo Fooo FOOOO bar" "frob" :preserve-case t))
|
||||
(list "frob Frob FROB bar" t))
|
||||
|
||||
(string= (regex-replace-all "(?i)f(o+)" "foo Fooo FOOOO bar" "fr\\1b" :preserve-case t)
|
||||
"froob Frooob FROOOOB bar")
|
||||
|
||||
(let ((qp-regex (create-scanner "[\\x80-\\xff]")))
|
||||
(flet ((encode-quoted-printable (string)
|
||||
"Converts 8-bit string to quoted-printable representation."
|
||||
;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
|
||||
(flet ((convert (target-string start end match-start match-end reg-starts reg-ends)
|
||||
(declare (ignore start end match-end reg-starts reg-ends))
|
||||
(format nil "=~2,'0x" (char-code (char target-string match-start)))))
|
||||
(regex-replace-all qp-regex string #'convert))))
|
||||
(string= (encode-quoted-printable "F<>te S<>rensen na<6E>ve H<>hner Stra<72>e")
|
||||
"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe")))
|
||||
|
||||
(let ((url-regex (create-scanner "[^a-zA-Z0-9_\\-.]")))
|
||||
(flet ((url-encode (string)
|
||||
"URL-encodes a string."
|
||||
;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
|
||||
(flet ((convert (target-string start end match-start match-end reg-starts reg-ends)
|
||||
(declare (ignore start end match-end reg-starts reg-ends))
|
||||
(format nil "%~2,'0x" (char-code (char target-string match-start)))))
|
||||
(regex-replace-all url-regex string #'convert))))
|
||||
(string= (url-encode "F<>te S<>rensen na<6E>ve H<>hner Stra<72>e")
|
||||
"F%EAte%20S%F8rensen%20na%EFve%20H%FChner%20Stra%DFe")))
|
||||
|
||||
(flet ((how-many (target-string start end match-start match-end reg-starts reg-ends)
|
||||
(declare (ignore target-string start end match-start match-end))
|
||||
(format nil "~A" (- (svref reg-ends 0)
|
||||
(svref reg-starts 0)))))
|
||||
(string= (regex-replace-all "{(.+?)}"
|
||||
"foo{...}bar{.....}{..}baz{....}frob"
|
||||
(list "[" #'how-many " dots]"))
|
||||
"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"))
|
||||
|
||||
(let ((qp-regex (create-scanner "[\\x80-\\xff]")))
|
||||
(flet ((encode-quoted-printable (string)
|
||||
"Converts 8-bit string to quoted-printable representation.
|
||||
Version using SIMPLE-CALLS keyword argument."
|
||||
;; ;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there
|
||||
(flet ((convert (match)
|
||||
(format nil "=~2,'0x" (char-code (char match 0)))))
|
||||
(regex-replace-all qp-regex string #'convert
|
||||
:simple-calls t))))
|
||||
(string= (encode-quoted-printable "F<>te S<>rensen na<6E>ve H<>hner Stra<72>e")
|
||||
"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe")))
|
||||
|
||||
(flet ((how-many (match first-register)
|
||||
(declare (ignore match))
|
||||
(format nil "~A" (length first-register))))
|
||||
(string= (regex-replace-all "{(.+?)}"
|
||||
"foo{...}bar{.....}{..}baz{....}frob"
|
||||
(list "[" #'how-many " dots]")
|
||||
:simple-calls t)
|
||||
"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"))
|
||||
|
||||
(flet ((my-repetition (char min)
|
||||
`(:greedy-repetition ,min nil ,char)))
|
||||
(setf (parse-tree-synonym 'a*) (my-repetition #\a 0)
|
||||
(parse-tree-synonym 'b+) (my-repetition #\b 1))
|
||||
(unwind-protect
|
||||
(let ((scanner (create-scanner '(:sequence a* b+))))
|
||||
(equal (mapcar (lambda (target)
|
||||
(scan scanner target))
|
||||
'("ab" "b" "aab" "a" "x"))
|
||||
(list 0 0 0 nil nil)))
|
||||
(setf (parse-tree-synonym 'a*) nil
|
||||
(parse-tree-synonym 'b+) nil)))
|
||||
|
||||
(null (scan "^a+$" "a+"))
|
||||
|
||||
(let ((*allow-quoting* t))
|
||||
;;we use CREATE-SCANNER because of Lisps like SBCL that don't have an interpreter
|
||||
(equalp (multiple-value-list (scan (create-scanner "^\\Qa+\\E$") "a+"))
|
||||
(list 0 2 #() #())))
|
||||
|
||||
(string= (parse-string "\\k<reg>") "k<reg>")
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
(equal (nth-value 1 (create-scanner "((?<small>[a-z]*)(?<big>[A-Z]*))"))
|
||||
(list nil "small" "big")))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
(equal (nth-value 1 (create-scanner '(:register
|
||||
(:sequence
|
||||
(:named-register "small"
|
||||
(:greedy-repetition 0 nil (:char-class (:range #\a #\z))))
|
||||
(:named-register "big"
|
||||
(:greedy-repetition 0 nil (:char-class (:range #\a #\z))))))))
|
||||
(list nil "small" "big")))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
(equalp (multiple-value-list (scan (create-scanner "((?<small>[a-z]*)(?<big>[A-Z]*))") "aaaBBB"))
|
||||
(list 0 6 #(0 0 3) #(6 3 6))))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
;; multiple-choice back-reference
|
||||
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>[ab])(?<reg>[12])\\k<reg>\\k<reg>$") "a1aa"))
|
||||
(list 0 4 #(0 1) #(1 2))))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>[ab])(?<reg>[12])\\k<reg>\\k<reg>$") "a22a"))
|
||||
(list 0 4 #(0 1) #(1 2))))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
;; demonstrating most-recently-seen-register-first property of back-reference;
|
||||
;; "greedy" regex (analogous to "aa?")
|
||||
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>)(?<reg>a)(\\k<reg>)") "a"))
|
||||
(list 0 1 #(0 0 1) #(0 1 1))))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>)(?<reg>a)(\\k<reg>)") "aa"))
|
||||
(list 0 2 #(0 0 1) #(0 1 2))))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
;; switched groups
|
||||
;; "lazy" regex (analogous to "aa??")
|
||||
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>a)(?<reg>)(\\k<reg>)") "a"))
|
||||
(list 0 1 #(0 1 1) #(1 1 1))))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
;; scanner ignores the second "a"
|
||||
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>a)(?<reg>)(\\k<reg>)") "aa"))
|
||||
(list 0 1 #(0 1 1) #(1 1 1))))
|
||||
|
||||
(let ((*allow-named-registers* t))
|
||||
;; "aa" will be matched only when forced by adding "$" at the end
|
||||
(equalp (multiple-value-list (scan (create-scanner "^(?<reg>a)(?<reg>)(\\k<reg>)$") "aa"))
|
||||
(list 0 2 #(0 1 1) #(1 1 2))))
|
||||
|
||||
(string= (quote-meta-chars "[a-z]*") "\\[a\\-z\\]\\*")
|
||||
|
||||
(string= (handler-case
|
||||
(create-scanner "foo**x")
|
||||
(ppcre-syntax-error (condition)
|
||||
(format nil "Houston, we've got a problem with the string ~S: Looks like something went wrong at position ~A. The last message we received was \"~?\"."
|
||||
(ppcre-syntax-error-string condition)
|
||||
(ppcre-syntax-error-pos condition)
|
||||
(simple-condition-format-control condition)
|
||||
(simple-condition-format-arguments condition))))
|
||||
"Houston, we've got a problem with the string \"foo**x\": Looks like something went wrong at position 4. The last message we received was \"Quantifier '*' not allowed.\".")
|
||||
|
||||
(flet ((my-weird-filter (pos)
|
||||
"Only match at this point if either pos is odd and the
|
||||
character we're looking at is lowercase or if pos is even and the next
|
||||
two characters we're looking at are uppercase. Consume these
|
||||
characters if there's a match."
|
||||
(cond ((and (oddp pos)
|
||||
(< pos cl-ppcre::*end-pos*)
|
||||
(lower-case-p (char cl-ppcre::*string* pos)))
|
||||
(1+ pos))
|
||||
((and (evenp pos)
|
||||
(< (1+ pos) cl-ppcre::*end-pos*)
|
||||
(upper-case-p (char cl-ppcre::*string* pos))
|
||||
(upper-case-p (char cl-ppcre::*string* (1+ pos))))
|
||||
(+ pos 2))
|
||||
(t nil))))
|
||||
(let ((weird-regex `(:sequence "+" (:filter ,#'my-weird-filter) "+")))
|
||||
(equalp (multiple-value-list (scan weird-regex "+A++a+AA+"))
|
||||
(list 5 9 #() #()))))
|
||||
|
||||
(let ((a "\\E*"))
|
||||
(equalp (multiple-value-list (scan (concatenate 'string "(?:" (quote-meta-chars a) "){2}") "\\E*\\E*"))
|
||||
(list 0 6 #() #())))
|
||||
|
||||
(let ((a "\\E*"))
|
||||
(equalp (multiple-value-list (scan `(:greedy-repetition 2 2 ,a) "\\E*\\E*"))
|
||||
(list 0 6 #() #())))
|
||||
|
||||
(loop for *optimize-char-classes* in '(:hash-table :hash-table* :charset :charset* :charmap)
|
||||
for s = (create-scanner "(([a-c])+)x")
|
||||
always (equalp (multiple-value-list (scan s "abcxy"))
|
||||
(list 0 4 #(0 2) #(3 3))))
|
||||
159
test/tests.lisp
Normal file
159
test/tests.lisp
Normal file
@ -0,0 +1,159 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/tests.lisp,v 1.12 2008/07/23 00:48:00 edi Exp $
|
||||
|
||||
;;; The tests in this file test CL-PPCRE against testdata generated by
|
||||
;;; the Perl program `perltest.pl' from the input file `testinput' in
|
||||
;;; order to check compatibility with Perl and correctness of the
|
||||
;;; regex engine.
|
||||
|
||||
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-ppcre-test)
|
||||
|
||||
(defvar *this-file* (load-time-value
|
||||
(or #.*compile-file-pathname* *load-pathname*))
|
||||
"The location of this source file.")
|
||||
|
||||
(defmacro do-tests ((name &optional show-progress-p) &body body)
|
||||
"Helper macro which repeatedly executes BODY until the code in body
|
||||
calls the function DONE. It is assumed that each invocation of BODY
|
||||
will be the execution of one test which returns NIL in case of success
|
||||
and list of string describing errors otherwise.
|
||||
|
||||
The macro prints a simple progress indicator \(one dots for ten tests)
|
||||
to *STANDARD-OUTPUT* unless SHOW-PROGRESS-P is NIL and returns a true
|
||||
value iff all tests succeeded. Errors in BODY are caught and reported
|
||||
\(and counted as failures)."
|
||||
`(let ((successp t)
|
||||
(testcount 1))
|
||||
(block test-block
|
||||
(flet ((done ()
|
||||
(return-from test-block successp)))
|
||||
(format t "~&Test: ~A~%" ,name)
|
||||
(loop
|
||||
(when (and ,show-progress-p (zerop (mod testcount 10)))
|
||||
(format t ".")
|
||||
(when (zerop (mod testcount 100))
|
||||
(terpri))
|
||||
(force-output))
|
||||
(let ((errors
|
||||
(handler-case
|
||||
(progn ,@body)
|
||||
(error (msg)
|
||||
(list (format nil "~&got an unexpected error: ~A" msg))))))
|
||||
(setq successp (and successp (null errors)))
|
||||
(when errors
|
||||
(format t "~&~4@A:~{~& ~A~}~%" testcount errors))
|
||||
(incf testcount)))))
|
||||
successp))
|
||||
|
||||
(defun simple-tests (&key (file-name
|
||||
(make-pathname :name "simple"
|
||||
:type nil :version nil
|
||||
:defaults *this-file*))
|
||||
(external-format '(:latin-1 :eol-style :lf))
|
||||
verbose)
|
||||
"Loops through all the forms in the file FILE-NAME and executes each
|
||||
of them using EVAL. It is assumed that each FORM specifies a test
|
||||
which returns a true value iff it succeeds. Prints each test form to
|
||||
*STANDARD-OUTPUT* if VERBOSE is true and shows a simple progress
|
||||
indicator otherwise. EXTERNAL-FORMAT is the FLEXI-STREAMS external
|
||||
format which is used to read the file. Returns a true value iff all
|
||||
tests succeeded."
|
||||
(with-open-file (binary-stream file-name :element-type 'flex:octet)
|
||||
(let ((stream (flex:make-flexi-stream binary-stream :external-format external-format))
|
||||
(*package* (find-package :cl-ppcre-test)))
|
||||
(do-tests ((format nil "Simple tests from file ~S" (file-namestring file-name))
|
||||
(not verbose))
|
||||
(let ((form (or (read stream nil) (done))))
|
||||
(when verbose
|
||||
(format t "~&~S" form))
|
||||
(cond ((eval form) nil)
|
||||
(t (list (format nil "~S returned NIL" form)))))))))
|
||||
|
||||
(defun random-test-function (probability)
|
||||
"Returns a random character test function which contains each
|
||||
character with probability PROBABILITY."
|
||||
(let ((hash-table (make-hash-table)))
|
||||
(dotimes (code char-code-limit)
|
||||
(let ((char (code-char code)))
|
||||
(when (and char (< (random 1.0d0) probability))
|
||||
(setf (gethash (code-char code) hash-table) t))))
|
||||
(lambda (char)
|
||||
(gethash char hash-table))))
|
||||
|
||||
(defun test-optimized-test-functions% (probability)
|
||||
"Creates a random test function with probability PROBABILITY and six
|
||||
\(one for each possible \"kind\") corresponding optimized test
|
||||
functions, then checks for each character in turn that all functions
|
||||
agree on it."
|
||||
(let* ((test-function (random-test-function probability))
|
||||
(optimized-functions (loop for kind in '(nil
|
||||
:hash-table
|
||||
:hash-table*
|
||||
:charset
|
||||
:charset*
|
||||
:charmap)
|
||||
collect (create-optimized-test-function test-function :kind kind))))
|
||||
(loop for code below char-code-limit
|
||||
for char = (code-char code)
|
||||
for expected-result = (and char (funcall test-function char))
|
||||
always (or (null char)
|
||||
(loop for optimized-function in optimized-functions
|
||||
always (eq (not (funcall optimized-function char))
|
||||
(not expected-result)))))))
|
||||
|
||||
(defun test-optimized-test-functions (&key verbose)
|
||||
"Runs TEST-OPTIMIZED-TEST-FUNCTIONS% with different probabilities."
|
||||
(let ((probabilities '(0 .001 .01 .1 1)))
|
||||
(do-tests ("Optimized test functions - this might take some time..." (not verbose))
|
||||
(let ((probability (or (pop probabilities) (done))))
|
||||
(when verbose
|
||||
(format t "~&Probability is ~A" probability))
|
||||
(not (test-optimized-test-functions% probability))))))
|
||||
|
||||
(defun run-all-tests (&key more-tests verbose)
|
||||
"Runs all tests for CL-PPCRE and returns a true value iff all tests
|
||||
succeeded. VERBOSE is interpreted by the individual test suites.
|
||||
MORE-TESTS can be a list of function designators designating
|
||||
additional tests to run. This facility is used by the tests for
|
||||
CL-PPCRE-UNICODE."
|
||||
(let ((successp t))
|
||||
(macrolet ((run-test-suite (&body body)
|
||||
`(unless (progn ,@body)
|
||||
(setq successp nil))))
|
||||
;; run the automatically generated Perl tests
|
||||
(run-test-suite (perl-test :verbose verbose))
|
||||
(run-test-suite (test-optimized-test-functions :verbose verbose))
|
||||
(run-test-suite (simple-tests :verbose verbose))
|
||||
(when more-tests
|
||||
(unless (listp more-tests)
|
||||
(setq more-tests (list more-tests))
|
||||
(dolist (test more-tests)
|
||||
(run-test-suite (funcall test :verbose verbose))))))
|
||||
(format t "~2&~:[Some tests failed~;All tests passed~]." successp)
|
||||
successp))
|
||||
80
test/unicode-tests.lisp
Normal file
80
test/unicode-tests.lisp
Normal file
@ -0,0 +1,80 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/unicode-tests.lisp,v 1.8 2008/07/23 00:17:53 edi Exp $
|
||||
|
||||
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package :cl-ppcre-test)
|
||||
|
||||
(defun unicode-test (&key (file-name
|
||||
(make-pathname :name "unicodetestdata"
|
||||
:type nil :version nil
|
||||
:defaults *this-file*)
|
||||
file-name-provided-p)
|
||||
verbose)
|
||||
"Loops through all test cases in FILE-NAME and prints a report if
|
||||
VERBOSE is true. Returns a true value if all tests succeeded.
|
||||
|
||||
For the syntax of the tests in FILE-NAME refer to CL-UNICODE."
|
||||
(with-open-file (stream file-name)
|
||||
(let ((*regex-char-code-limit* (if file-name-provided-p *regex-char-code-limit* char-code-limit))
|
||||
(*optimize-char-classes* (if file-name-provided-p *optimize-char-classes* nil))
|
||||
;; we only check for correctness and don't care about speed
|
||||
;; that match (but rather about space constraints of the
|
||||
;; trial versions)
|
||||
(*use-bmh-matchers* (if file-name-provided-p *use-bmh-matchers* nil)))
|
||||
(do-tests ((format nil "Running Unicode tests in file ~S" (file-namestring file-name))
|
||||
(not verbose))
|
||||
(let ((input-line (or (read stream nil) (done)))
|
||||
errors)
|
||||
(destructuring-bind (char-code property-name expected-result)
|
||||
input-line
|
||||
(let ((char (and (< char-code char-code-limit) (code-char char-code))))
|
||||
(when char
|
||||
(when verbose
|
||||
(format t "~&~A: #x~X" property-name char-code))
|
||||
(let* ((string (string char))
|
||||
(result-1 (scan (format nil "\\p{~A}" property-name) string))
|
||||
(result-2 (scan (format nil "[\\p{~A}]" property-name) string))
|
||||
(inverted-result-1 (scan (format nil "\\P{~A}" property-name) string))
|
||||
(inverted-result-2 (scan (format nil "[\\P{~A}]" property-name) string)))
|
||||
(unless (eq expected-result (not (not result-1)))
|
||||
(push (format nil "\(code-char #x~X) should ~:[not ~;~]have matched \"\\p{~A}\""
|
||||
char-code expected-result property-name)
|
||||
errors))
|
||||
(unless (eq expected-result (not (not result-2)))
|
||||
(push (format nil "\(code-char #x~X) should ~:[not ~;~]have matched \"[\\p{~A}]\""
|
||||
char-code expected-result property-name)
|
||||
errors))
|
||||
(unless (eq expected-result (not inverted-result-1))
|
||||
(push (format nil "\(code-char #x~X) should ~:[~;not ~]have matched \"\\P{~A}\""
|
||||
char-code expected-result property-name)
|
||||
errors))
|
||||
(unless (eq expected-result (not inverted-result-2))
|
||||
(push (format nil "\(code-char #x~X) should ~:[~;not ~]have matched \"[\\P{~A}]\""
|
||||
char-code expected-result property-name)
|
||||
errors)))
|
||||
errors))))))))
|
||||
107
test/unicodetestdata
Normal file
107
test/unicodetestdata
Normal file
@ -0,0 +1,107 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/test/unicodetestdata,v 1.10 2008/07/22 14:00:35 edi Exp $
|
||||
|
||||
;;; some arbitrary test data for Unicode properties - stolen from CL-UNICODE
|
||||
|
||||
(#x0001 "ASCII" t)
|
||||
(#x0100 "ASCII" nil)
|
||||
(#x000A "Alphabetic" nil)
|
||||
(#x0061 "Alphabetic" t)
|
||||
(#x0061 "Ll" t)
|
||||
(#x0041 "Alphabetic" t)
|
||||
(#x0041 "alphabetic" t)
|
||||
(#x0041 "IsAlphabetic" t)
|
||||
(#x02E4 "Alphabetic" t)
|
||||
(#x0970 "Alphabetic" nil)
|
||||
(#x030D "BidiClass:NonspacingMark" t)
|
||||
(#x030D "NonspacingMark" t)
|
||||
(#x030D "nonspacing mark" t)
|
||||
(#xE0146 "BidiClass:NonspacingMark" t)
|
||||
(#x000D "BidiClass:WhiteSpace" nil)
|
||||
(#x0020 "BidiClass:WhiteSpace" t)
|
||||
(#x2006 "BidiClass:WhiteSpace" t)
|
||||
(#x12470 "Cuneiform" t)
|
||||
(#x12470 "IsCuneiform" t)
|
||||
(#x12470 "CuneiformNumbersAndPunctuation" t)
|
||||
(#x12470 "Block:CuneiformNumbersAndPunctuation" t)
|
||||
(#x12470 "InCuneiformNumbersAndPunctuation" t)
|
||||
(#x12470 "Script:Cuneiform" t)
|
||||
(#x0041 "Block:Hebrew" nil)
|
||||
(#x0593 "Block:Hebrew" t)
|
||||
(#x0593 "InHebrew" t)
|
||||
(#x040D "Block:Cyrillic" t)
|
||||
(#x040D "InCyrillic" t)
|
||||
(#x0042 "Block:Cyrillic" nil)
|
||||
(#x2011 "Dash" t)
|
||||
(#x2011 "IsDash" t)
|
||||
(#xFF0D "Dash" t)
|
||||
(#x003D "Dash" nil)
|
||||
(#x00F0 "Lowercase" t)
|
||||
(#x00F0 "IsLowercase" t)
|
||||
(#x00F0 "lowercase" t)
|
||||
(#x00F0 "Ll" t)
|
||||
(#x0067 "Lowercase" t)
|
||||
(#x010A "Lowercase" nil)
|
||||
(#x1D6C1 "Lowercase" nil)
|
||||
(#x0023 "CurrencySymbol" nil)
|
||||
(#x0024 "CurrencySymbol" t)
|
||||
(#x0024 "IsCurrencySymbol" t)
|
||||
(#x0024 "currency symbol" t)
|
||||
(#x20AC "CurrencySymbol" t)
|
||||
(#xFFE6 "CurrencySymbol" t)
|
||||
(#x002B "Sm" t)
|
||||
(#x002B "Math" t)
|
||||
(#x002B "IsMath" t)
|
||||
(#x002B "math" t)
|
||||
(#x211C "Math" t)
|
||||
(#x1D7D2 "Math" t)
|
||||
(#x002A "Math" nil)
|
||||
(#x25C9 "Math" nil)
|
||||
(#x0000 "NonCharacterCodePoint" nil)
|
||||
(#xFDD0 "NonCharacterCodePoint" t)
|
||||
(#xFDD0 "Non-Character-Code-Point" t)
|
||||
(#xFDD0 "non-character-code-point" t)
|
||||
(#xFFFFF "NonCharacterCodePoint" t)
|
||||
(#x0043 "Arabic" nil)
|
||||
(#x0606 "Arabic" t)
|
||||
(#x0606 "arabic" t)
|
||||
(#x0606 "IsArabic" t)
|
||||
(#x0606 "Script:Arabic" t)
|
||||
(#x0044 "IsVariationSelector" nil)
|
||||
(#x0044 "VariationSelector" nil)
|
||||
(#x180B "VariationSelector" t)
|
||||
(#x180B "Variation_Selector" t)
|
||||
(#x180B "Variation-Selector" t)
|
||||
(#x180B "variationselector" t)
|
||||
(#x180B "variation selector" t)
|
||||
(#x180B "IsVariationSelector" t)
|
||||
(#x00B5 "XIDContinue" t)
|
||||
(#x00B5 "IsXIDContinue" t)
|
||||
(#x00B5 "IsXID_Continue" t)
|
||||
(#x00B5 "Is_XID_Continue" t)
|
||||
(#x00B5 "XID_Continue" t)
|
||||
(#x33FF "Unified_Ideograph" nil)
|
||||
(#x33FF "Ideographic" nil)
|
||||
(#x3400 "Unified_Ideograph" t)
|
||||
(#x3400 "Ideographic" t)
|
||||
(#x3400 "Han" t)
|
||||
(#x3400 "OtherLetter" t)
|
||||
(#x3400 "Alphabetic" t)
|
||||
(#x3400 "Common" nil)
|
||||
(#x3400 "Assigned" t)
|
||||
(#x3400 "Any" t)
|
||||
(#x0378 "Cn" t)
|
||||
(#x0378 "Unassigned" t)
|
||||
(#x0377 "Cn" nil)
|
||||
(#x0377 "Unassigned" nil)
|
||||
(#x2800 "Braille" t)
|
||||
(#x2800 "Script:Braille" t)
|
||||
(#x2800 "OtherSymbol" t)
|
||||
(#x0027 "QuotationMark" t)
|
||||
(#x201C "QuotationMark" t)
|
||||
(#x201C "OtherNeutral" t)
|
||||
(#x201C "PatternSyntax" t)
|
||||
(#x0028 "Bidi_Mirrored" t)
|
||||
(#x0028 "BidiMirrored" t)
|
||||
(#x0028 "IsBidiMirrored" t)
|
||||
(#x0027 "Bidi_Mirrored" nil)
|
||||
156
util.lisp
156
util.lisp
@ -1,5 +1,5 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.40 2008/07/03 10:06:16 edi Exp $
|
||||
;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.46 2008/07/06 18:12:05 edi Exp $
|
||||
|
||||
;;; Utility functions and constants dealing with the character sets we
|
||||
;;; use to encode character classes
|
||||
@ -30,7 +30,12 @@
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
(in-package :cl-ppcre)
|
||||
|
||||
(defmacro defconstant (name value &optional doc)
|
||||
"Make sure VALUE is evaluated only once \(to appease SBCL)."
|
||||
`(cl:defconstant ,name (if (boundp ',name) (symbol-value ',name) ,value)
|
||||
,@(when doc (list doc))))
|
||||
|
||||
#+:lispworks
|
||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||
@ -100,120 +105,36 @@ are discarded \(that is, the body is an implicit PROGN)."
|
||||
`(let (,,@temps)
|
||||
,,@body))))))
|
||||
|
||||
(eval-when (:compile-toplevel :execute :load-toplevel)
|
||||
(defun make-char-set (test)
|
||||
(declare #.*special-optimize-settings*)
|
||||
"Returns a CHARSET for all characters satisfying test."
|
||||
(loop with set = (make-charset)
|
||||
for code of-type fixnum from 0 below char-code-limit
|
||||
for char = (code-char code)
|
||||
if (and char (funcall test char))
|
||||
do (add-to-charset char set)
|
||||
finally (return set)))
|
||||
(declaim (inline digit-char-p))
|
||||
(defun digit-char-p (chr)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Tests whether a character is a decimal digit, i.e. the same as
|
||||
Perl's [\\d]. Note that this function shadows the standard Common
|
||||
Lisp function CL:DIGIT-CHAR-P."
|
||||
(char<= #\0 chr #\9))
|
||||
|
||||
(declaim (inline word-char-p))
|
||||
|
||||
(defun word-char-p (chr)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Tests whether a character is a \"word\" character.
|
||||
In the ASCII charset this is equivalent to a-z, A-Z, 0-9, or _,
|
||||
i.e. the same as Perl's [\\w]."
|
||||
"Tests whether a character is a \"word\" character. In the ASCII
|
||||
charset this is equivalent to a-z, A-Z, 0-9, or _, i.e. the same as
|
||||
Perl's [\\w]."
|
||||
(or (alphanumericp chr)
|
||||
(char= chr #\_)))
|
||||
|
||||
(unless (boundp '+whitespace-char-string+)
|
||||
(defconstant +whitespace-char-string+
|
||||
(coerce
|
||||
'(#\Space #\Tab #\Linefeed #\Return #\Page)
|
||||
'string)
|
||||
(coerce '(#\Space #\Tab #\Linefeed #\Return #\Page) 'string)
|
||||
"A string of all characters which are considered to be whitespace.
|
||||
Same as Perl's [\\s]."))
|
||||
Same as Perl's [\\s].")
|
||||
|
||||
(defun whitespacep (chr)
|
||||
(declare #.*special-optimize-settings*)
|
||||
"Tests whether a character is whitespace,
|
||||
i.e. whether it would match [\\s] in Perl."
|
||||
(find chr +whitespace-char-string+ :test #'char=)))
|
||||
|
||||
;; the following DEFCONSTANT statements are wrapped with
|
||||
;; (UNLESS (BOUNDP ...) ...) to make SBCL happy
|
||||
|
||||
(unless (boundp '+digit-set+)
|
||||
(defconstant +digit-set+
|
||||
(make-char-set (lambda (chr) (char<= #\0 chr #\9)))
|
||||
"Character set containing the digits from 0 to 9."))
|
||||
|
||||
(unless (boundp '+word-char-set+)
|
||||
(defconstant +word-char-set+
|
||||
(make-char-set #'word-char-p)
|
||||
"Character set containing all \"word\" characters."))
|
||||
|
||||
(unless (boundp '+whitespace-char-set+)
|
||||
(defconstant +whitespace-char-set+
|
||||
(make-char-set #'whitespacep)
|
||||
"Character set containing all whitespace characters."))
|
||||
|
||||
(defun create-ranges-from-set (set &key downcasep)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Tries to identify up to three intervals \(with respect to CHAR<)
|
||||
which together comprise the charset SET. Returns NIL if this is not
|
||||
possible. If DOWNCASEP is true it will treat the charset as if it
|
||||
represents both the lower-case and the upper-case variants of its
|
||||
members and will only return the respective lower-case intervals."
|
||||
;; discard empty charsets
|
||||
(unless (and set (plusp (charset-count set)))
|
||||
(return-from create-ranges-from-set nil))
|
||||
(loop with min1 and min2 and min3
|
||||
and max1 and max2 and max3
|
||||
;; loop through all characters in SET, sorted by CHAR<
|
||||
;; (actually by < on their character codes, see 13.1.6 in the
|
||||
;; ANSI standard)
|
||||
for code of-type fixnum below *regex-char-code-limit*
|
||||
for char = (code-char code)
|
||||
when (and char (in-charset-p (if downcasep (char-downcase char) char) set))
|
||||
;; MIN1, MAX1, etc. are _exclusive_
|
||||
;; bounds of the intervals identified so far
|
||||
do (cond
|
||||
((not min1)
|
||||
;; this will only happen once, for the first character
|
||||
(setq min1 (1- code)
|
||||
max1 (1+ code)))
|
||||
((<= (the fixnum min1) code (the fixnum max1))
|
||||
;; we're here as long as CHAR fits into the first interval
|
||||
(setq min1 (min (the fixnum min1) (1- code))
|
||||
max1 (max (the fixnum max1) (1+ code))))
|
||||
((not min2)
|
||||
;; we need to open a second interval
|
||||
;; this'll also happen only once
|
||||
(setq min2 (1- code)
|
||||
max2 (1+ code)))
|
||||
((<= (the fixnum min2) code (the fixnum max2))
|
||||
;; CHAR fits into the second interval
|
||||
(setq min2 (min (the fixnum min2) (1- code))
|
||||
max2 (max (the fixnum max2) (1+ code))))
|
||||
((not min3)
|
||||
;; we need to open the third interval
|
||||
;; happens only once
|
||||
(setq min3 (1- code)
|
||||
max3 (1+ code)))
|
||||
((<= (the fixnum min3) code (the fixnum max3))
|
||||
;; CHAR fits into the third interval
|
||||
(setq min3 (min (the fixnum min3) (1- code))
|
||||
max3 (max (the fixnum max3) (1+ code))))
|
||||
(t
|
||||
;; we're out of luck, CHAR doesn't fit
|
||||
;; into one of the three intervals
|
||||
(return nil)))
|
||||
;; on success return all bounds
|
||||
;; make them inclusive bounds before returning
|
||||
finally (return (values (code-char (1+ min1))
|
||||
(code-char (1- max1))
|
||||
(and min2 (code-char (1+ min2)))
|
||||
(and max2 (code-char (1- max2)))
|
||||
(and min3 (code-char (1+ min3)))
|
||||
(and max3 (code-char (1- max3)))))))
|
||||
"Tests whether a character is whitespace, i.e. whether it would
|
||||
match [\\s] in Perl."
|
||||
(find chr +whitespace-char-string+ :test #'char=))
|
||||
|
||||
(defmacro maybe-coerce-to-simple-string (string)
|
||||
"Coerces STRING to a simple STRING unless it already is one."
|
||||
(with-unique-names (=string=)
|
||||
`(let ((,=string= ,string))
|
||||
(cond ((simple-string-p ,=string=)
|
||||
@ -223,16 +144,16 @@ members and will only return the respective lower-case intervals."
|
||||
|
||||
(declaim (inline nsubseq))
|
||||
(defun nsubseq (sequence start &optional (end (length sequence)))
|
||||
"Return a subsequence by pointing to location in original sequence."
|
||||
"Returns a subsequence by pointing to location in original sequence."
|
||||
(make-array (- end start)
|
||||
:element-type (array-element-type sequence)
|
||||
:displaced-to sequence
|
||||
:displaced-index-offset start))
|
||||
|
||||
(defun normalize-var-list (var-list)
|
||||
"Utility function for REGISTER-GROUPS-BIND and
|
||||
DO-REGISTER-GROUPS. Creates the long form \(a list of \(FUNCTION VAR)
|
||||
entries) out of the short form of VAR-LIST."
|
||||
"Utility function for REGISTER-GROUPS-BIND and DO-REGISTER-GROUPS.
|
||||
Creates the long form \(a list of \(FUNCTION VAR) entries) out of the
|
||||
short form of VAR-LIST."
|
||||
(loop for element in var-list
|
||||
if (consp element)
|
||||
nconc (loop for var in (rest element)
|
||||
@ -241,20 +162,33 @@ entries) out of the short form of VAR-LIST."
|
||||
collect (list '(function identity) element)))
|
||||
|
||||
(defun string-list-to-simple-string (string-list)
|
||||
(declare #.*standard-optimize-settings*)
|
||||
"Concatenates a list of strings to one simple-string."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
;; this function provided by JP Massar; note that we can't use APPLY
|
||||
;; with CONCATENATE here because of CALL-ARGUMENTS-LIMIT
|
||||
(let ((total-size 0))
|
||||
(declare (type fixnum total-size))
|
||||
(declare (fixnum total-size))
|
||||
(dolist (string string-list)
|
||||
#-genera (declare (type string string))
|
||||
#-:genera (declare (string string))
|
||||
(incf total-size (length string)))
|
||||
(let ((result-string (make-sequence 'simple-string total-size))
|
||||
(curr-pos 0))
|
||||
(declare (type fixnum curr-pos))
|
||||
(declare (fixnum curr-pos))
|
||||
(dolist (string string-list)
|
||||
#-genera (declare (type string string))
|
||||
#-:genera (declare (string string))
|
||||
(replace result-string string :start1 curr-pos)
|
||||
(incf curr-pos (length string)))
|
||||
result-string)))
|
||||
|
||||
(defun complement* (test-function)
|
||||
"Like COMPLEMENT but optimized for unary functions."
|
||||
(declare #.*standard-optimize-settings*)
|
||||
(typecase test-function
|
||||
(function
|
||||
(lambda (char)
|
||||
(declare (character char))
|
||||
(not (funcall (the function test-function) char))))
|
||||
(otherwise
|
||||
(lambda (char)
|
||||
(declare (character char))
|
||||
(not (funcall test-function char))))))
|
||||
Reference in New Issue
Block a user