Initial revision
git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@12 4281704c-cde7-0310-8518-8e2dc76b1ff0
This commit is contained in:
156
CHANGELOG
Normal file
156
CHANGELOG
Normal file
@ -0,0 +1,156 @@
|
||||
Version 0.7.4
|
||||
2004-02-16
|
||||
Fixed wrong call to SIGNAL-PPCRE-SIGNAL-ERROR in lexer.lisp (caught by Peter Graves)
|
||||
Added :CL-PPCRE to *FEATURES* (for CL-INTERPOL)
|
||||
Compiler macro for SPLIT
|
||||
|
||||
Version 0.7.3
|
||||
2004-01-28
|
||||
Fixed bug in CURRENT-MIN-REST for lookaheads (reported by Thomas-Paz Hartman)
|
||||
Added tests for this bug
|
||||
|
||||
Version 0.7.2
|
||||
2004-01-27
|
||||
Fixed typo (SUBSEQ/NSUBSEQ) in SPLIT (thanks to Alan Ruttenberg)
|
||||
Updated docs with respect to ECL (thanks to Alex Mizrahi)
|
||||
Mention FreeBSD port in docs
|
||||
|
||||
Version 0.7.1
|
||||
2003-10-24
|
||||
Fixed version numbers in docs (thanks to S<>bastien Saint-Sevin)
|
||||
|
||||
Version 0.7.0
|
||||
2003-10-23
|
||||
New macros REGISTER-GROUPS-BIND and DO-REGISTER-GROUPS
|
||||
Added SHAREP keyword argument to most API functions and macros
|
||||
Mention CL-INTERPOL in docs
|
||||
Partial code cleanup (using WITH-UNIQUE-NAMES and REBINDING)
|
||||
|
||||
Version 0.6.1
|
||||
2003-10-11
|
||||
Added EXTERNAL-FORMAT keyword args to CL-PPCRE-TEST:TEST for some CLs (thanks to JP Massar and Scott D. Kalter)
|
||||
Fixed bug with REGEX-REPLACE and REGEX-REPLACE-ALL when (= START END) was true
|
||||
Added doc sections for quoting problems and backslash confusions (thanks to conversations with Peter Seibel)
|
||||
Disable quoting in definition of QUOTE-SECTIONS so you can always safely rebuild CL-PPCRE
|
||||
|
||||
Version 0.6.0
|
||||
2003-10-07
|
||||
CL-PPCRE now has its own condition types
|
||||
Added support for Perl's \Q and \E (Peter Seibel convinced me to do it) - see QUOTE-META-CHARS and *ALLOW-QUOTING*
|
||||
Added tests for this new feature
|
||||
Threaded tests are more verbose now and use only keyword args
|
||||
|
||||
Version 0.5.9
|
||||
2003-10-03
|
||||
Changed "^" optimizations with respect to constant end strings with offsets (bug caught by Yexuan Gui)
|
||||
Added tests for this bug
|
||||
Removed *.dos files from CL-PPCRE-TEST tests (thanks to JP Massar)
|
||||
Added threaded tests for SBCL (thanks to Christophe Rhodes)
|
||||
|
||||
Version 0.5.8
|
||||
2003-09-17
|
||||
Optimizations for ".*" were too optimistic when look-behinds were involved
|
||||
Added tests for this bug
|
||||
Removed *.dos files
|
||||
|
||||
Version 0.5.7
|
||||
2003-08-20
|
||||
Fixed (CL-PPCRE:SCAN "(.)X$" "ABCX" :START 4) bug (spotted by Tibor Simko)
|
||||
Forgot to export *REGEX-CHAR-CODE-LIMIT* in Corman version of DEFPACKAGE
|
||||
Removed Emacs local variables from source code (finally...)
|
||||
Mention Gentoo in docs
|
||||
|
||||
Version 0.5.6
|
||||
2003-06-30
|
||||
Replaced wrong COPY-REGEX code for WORD-BOUNDARY objects (detected by Max Goldberg)
|
||||
Added info about possible TRUENAME problems with ACL in README (thanks to Kevin Layer for providing a patch for this)
|
||||
|
||||
Version 0.5.5
|
||||
2003-06-09
|
||||
Patch for SBCL/Debian compatibility by Kevin Rosenberg
|
||||
Simpler version of compiler macro
|
||||
Availability through asdf-install
|
||||
|
||||
Version 0.5.4
|
||||
2003-04-09
|
||||
Added DESTRUCTIVE keyword to CREATE-SCANNER
|
||||
|
||||
Version 0.5.3
|
||||
2003-03-31
|
||||
Fixed bug in REGEX-REPLACE (replacement string couldn't contain literal backslash)
|
||||
Fixed bug in definition of CHAR-CLASS (since 0.5.0 the hash slot may be NIL - CMUCL's new PCL detects this)
|
||||
Micro-optimization in INSERT-CHAR-CLASS-TESTER: CHAR-NOT-GREATERP instead of CHAR-DOWNCASE
|
||||
|
||||
Version 0.5.2
|
||||
2003-03-28
|
||||
Better compiler macro (thanks to Kent M. Pitman)
|
||||
|
||||
Version 0.5.1
|
||||
2003-03-27
|
||||
Removed compiler macro
|
||||
|
||||
Version 0.5.0
|
||||
2003-03-27
|
||||
Lexer, parser, and converter mostly re-written to reduce consing and increase speed
|
||||
Get rid of FIX-POS in lexer and parser, "ism" flags are handled after parsing now
|
||||
Smaller test suite (again) due to literal embedding of line breaks
|
||||
Seperate test files for DOS line endings
|
||||
Replaced constant +REGEX-CHAR-CODE-LIMIT+ with special variable *REGEX-CHAR-CODE-LIMIT*
|
||||
|
||||
Version 0.4.1
|
||||
2003-03-19
|
||||
Added compiler macro for SCAN
|
||||
Changed test suite to be nicer to Corman Lisp and ECL (see docs for new syntax)
|
||||
Incorporated visual feedback (dots) in test suite (thanks to JP Massar)
|
||||
Added README file
|
||||
Replaced STRING-LIST-TO-SIMPLE-STRING with a much improved version by JP Massar
|
||||
|
||||
Version 0.4.0
|
||||
2003-02-27
|
||||
Added *USE-BMH-MATCHER*
|
||||
|
||||
Version 0.3.2
|
||||
2003-02-21
|
||||
Added load.lisp
|
||||
Various minor changes for Corman Lisp compatibility (thanks to Karsten Poeck and JP Massar)
|
||||
|
||||
Version 0.3.1
|
||||
2003-01-18
|
||||
Bugfix in CREATE-SCANNER (didn't work if flags were given and arg was a parse-tree)
|
||||
|
||||
Version 0.3.0
|
||||
2003-01-12
|
||||
Added new features to REGEX-REPLACE and REGEX-REPLACE-ALL
|
||||
|
||||
Version 0.2.0
|
||||
2003-01-11
|
||||
Make SPLIT more Perl-compatible, including new keyword parameters
|
||||
|
||||
Version 0.1.4
|
||||
2003-01-10
|
||||
Don't move "^" and "\A" while iterating with DO-SCANS
|
||||
Added link to Debian package
|
||||
|
||||
Version 0.1.3
|
||||
2002-12-25
|
||||
More usable MK:DEFSYSTEM files (courtesy of Hannu Koivisto)
|
||||
Fixed typo in documentation
|
||||
|
||||
Version 0.1.2
|
||||
2002-12-22
|
||||
Added version numbers for Debian packaging
|
||||
Be friendly to case-sensitive ACL images (courtesy of Kevin Rosenberg and Douglas Crosher)
|
||||
"Fixed" two cases where declarations came after docstrings (because of bugs in Corman Lisp and older CMUCL versions)
|
||||
Added #-cormanlisp to hide (INCF (THE FIXNUM POS)) from Corman Lisp
|
||||
Added file doc/benchmarks.2002-12-22.txt
|
||||
|
||||
Version 0.1.1
|
||||
2002-12-21
|
||||
Added asdf system definitions by Marco Baringer
|
||||
Small additions to documentation
|
||||
Correct (Emacs) local variables list in closures.lisp and api.lisp
|
||||
Added this CHANGELOG
|
||||
|
||||
Version 0.1.0
|
||||
2002-12-20
|
||||
Initial release
|
||||
51
README
Normal file
51
README
Normal file
@ -0,0 +1,51 @@
|
||||
Complete documentation for CL-PPCRE can be found in the 'doc'
|
||||
directory.
|
||||
|
||||
1. Installation
|
||||
|
||||
1.1. Probably the easiest way is
|
||||
|
||||
(load "/path/to/cl-ppcre/load.lisp")
|
||||
|
||||
This should compile and load CL-PPCRE on most Common Lisp
|
||||
implementations.
|
||||
|
||||
1.2. With MK:DEFSYSTEM you can make a symbolic link from
|
||||
'cl-ppcre.system' and 'cl-ppcre-test.system' to your central registry
|
||||
(which by default is in '/usr/local/lisp/Registry/') and then issue
|
||||
the command
|
||||
|
||||
(mk:compile-system "cl-ppcre")
|
||||
|
||||
Note that this relies on TRUENAME returning the original file a
|
||||
symbolic link is pointing to. This will only work with AllegroCL
|
||||
6.2 if you've applied all patches with (SYS:UPDATE-ALLEGRO).
|
||||
|
||||
1.3. You can also use ASDF instead of MK:DEFSYSTEM in a similar way
|
||||
(use the .asd files instead of the .system files).
|
||||
|
||||
2. Test
|
||||
|
||||
CL-PPCRE comes with a test suite that can be used to check its
|
||||
compatibility with Perl's regex syntax. See the documentation on how
|
||||
to use this test suite for benchmarks and on how to write your own
|
||||
tests.
|
||||
|
||||
2.1. If you've used 'load.lisp' to load CL-PPCRE you already have the
|
||||
test suite loaded and can start the default tests with
|
||||
|
||||
(cl-ppcre-test:test)
|
||||
|
||||
2.2. With MK:DEFSYSTEM you need to compile the 'cl-ppcre-test' system
|
||||
as well before you can proceed as in 2.1.
|
||||
|
||||
2.3. Same for ASDF.
|
||||
|
||||
Depending on your machine and your CL implementation the default test
|
||||
will take between a few seconds and a couple of minutes. (It will
|
||||
print a dot for every tenth test case while it proceeds to give some
|
||||
visual feedback.) It should exactly report three 'errors' (662, 790,
|
||||
and 1439) which are explained in the documentation.
|
||||
|
||||
MCL might report an error for the ninth test case which is also
|
||||
explained in the docs.
|
||||
40
cl-ppcre-test.asd
Normal file
40
cl-ppcre-test.asd
Normal file
@ -0,0 +1,40 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/cl-ppcre-test.asd,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; This ASDF system definition was kindly provided by Marco Baringer.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(defpackage #:cl-ppcre-test.system
|
||||
(:use #:cl
|
||||
#:asdf))
|
||||
|
||||
(in-package #:cl-ppcre-test.system)
|
||||
|
||||
(defsystem #:cl-ppcre-test
|
||||
:depends-on (#:cl-ppcre)
|
||||
:components ((:file "ppcre-tests")))
|
||||
40
cl-ppcre-test.system
Normal file
40
cl-ppcre-test.system
Normal file
@ -0,0 +1,40 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/cl-ppcre-test.system,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-user)
|
||||
|
||||
(defparameter *cl-ppcre-test-base-directory*
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
|
||||
(mk:defsystem #:cl-ppcre-test
|
||||
:source-pathname *cl-ppcre-test-base-directory*
|
||||
:source-extension "lisp"
|
||||
:depends-on (#:cl-ppcre)
|
||||
:components ((:file "ppcre-tests")))
|
||||
51
cl-ppcre.asd
Normal file
51
cl-ppcre.asd
Normal file
@ -0,0 +1,51 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/cl-ppcre.asd,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; This ASDF system definition was kindly provided by Marco Baringer.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(defpackage #:cl-ppcre.system
|
||||
(:use #:cl
|
||||
#:asdf))
|
||||
|
||||
(in-package #:cl-ppcre.system)
|
||||
|
||||
(defsystem #:cl-ppcre
|
||||
:components ((:file "packages")
|
||||
(:file "specials" :depends-on ("packages"))
|
||||
(:file "util" :depends-on ("packages"))
|
||||
(:file "errors" :depends-on ("util"))
|
||||
(:file "lexer" :depends-on ("errors" "specials"))
|
||||
(:file "parser" :depends-on ("lexer"))
|
||||
(:file "regex-class" :depends-on ("parser"))
|
||||
(:file "convert" :depends-on ("regex-class"))
|
||||
(:file "optimize" :depends-on ("convert"))
|
||||
(:file "closures" :depends-on ("optimize" "specials"))
|
||||
(:file "repetition-closures" :depends-on ("closures"))
|
||||
(:file "scanner" :depends-on ("repetition-closures"))
|
||||
(:file "api" :depends-on ("scanner"))))
|
||||
51
cl-ppcre.system
Normal file
51
cl-ppcre.system
Normal file
@ -0,0 +1,51 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/cl-ppcre.system,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-user)
|
||||
|
||||
(defparameter *cl-ppcre-base-directory*
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
|
||||
(mk:defsystem #:cl-ppcre
|
||||
:source-pathname *cl-ppcre-base-directory*
|
||||
:source-extension "lisp"
|
||||
:components ((:file "packages")
|
||||
(:file "specials" :depends-on ("packages"))
|
||||
(:file "util" :depends-on ("packages"))
|
||||
(:file "errors" :depends-on ("util"))
|
||||
(:file "lexer" :depends-on ("errors" "specials"))
|
||||
(:file "parser" :depends-on ("lexer"))
|
||||
(:file "regex-class" :depends-on ("parser"))
|
||||
(:file "convert" :depends-on ("regex-class"))
|
||||
(:file "optimize" :depends-on ("convert"))
|
||||
(:file "closures" :depends-on ("optimize" "specials"))
|
||||
(:file "repetition-closures" :depends-on ("closures"))
|
||||
(:file "scanner" :depends-on ("repetition-closures"))
|
||||
(:file "api" :depends-on ("scanner"))))
|
||||
576
closures.lisp
Normal file
576
closures.lisp
Normal file
@ -0,0 +1,576 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/closures.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Here we create the closures which together build the final
|
||||
;;; scanner.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(declaim (inline *string*= *string*-equal))
|
||||
|
||||
(defun *string*= (string2 start1 end1 start2 end2)
|
||||
"Like STRING=, i.e. compares the special string *STRING* from START1
|
||||
to END1 with STRING2 from START2 to END2. Note that there's no
|
||||
boundary check - this has to be implemented by the caller."
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (type fixnum start1 end1 start2 end2))
|
||||
(loop for string1-idx of-type fixnum from start1 below end1
|
||||
for string2-idx of-type fixnum from start2 below end2
|
||||
always (char= (schar *string* string1-idx)
|
||||
(schar string2 string2-idx))))
|
||||
|
||||
(defun *string*-equal (string2 start1 end1 start2 end2)
|
||||
"Like STRING-EQUAL, i.e. compares the special string *STRING* from
|
||||
START1 to END1 with STRING2 from START2 to END2. Note that there's no
|
||||
boundary check - this has to be implemented by the caller."
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (type fixnum start1 end1 start2 end2))
|
||||
(loop for string1-idx of-type fixnum from start1 below end1
|
||||
for string2-idx of-type fixnum from start2 below end2
|
||||
always (char-equal (schar *string* string1-idx)
|
||||
(schar string2 string2-idx))))
|
||||
|
||||
(defgeneric create-matcher-aux (regex next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Creates a closure which takes one parameter,
|
||||
START-POS, and tests whether REGEX can match *STRING* at START-POS
|
||||
such that the call to NEXT-FN after the match would succeed."))
|
||||
|
||||
(defmethod create-matcher-aux ((seq seq) next-fn)
|
||||
;; the closure for a SEQ is a chain of closures for the elements of
|
||||
;; this sequence which call each other in turn; the last closure
|
||||
;; calls NEXT-FN
|
||||
(loop for element in (reverse (elements seq))
|
||||
for curr-matcher = next-fn then next-matcher
|
||||
for next-matcher = (create-matcher-aux element curr-matcher)
|
||||
finally (return next-matcher)))
|
||||
|
||||
(defmethod create-matcher-aux ((alternation alternation) next-fn)
|
||||
;; first create closures for all alternations of ALTERNATION
|
||||
(let ((all-matchers (mapcar #'(lambda (choice)
|
||||
(create-matcher-aux choice next-fn))
|
||||
(choices alternation))))
|
||||
;; now create a closure which checks if one of the closures
|
||||
;; created above can succeed
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(loop for matcher in all-matchers
|
||||
thereis (funcall (the function matcher) start-pos)))))
|
||||
|
||||
(defmethod create-matcher-aux ((register register) next-fn)
|
||||
;; the position of this REGISTER within the whole regex; we start to
|
||||
;; count at 0
|
||||
(let ((num (num register)))
|
||||
(declare (type fixnum num))
|
||||
;; STORE-END-OF-REG is a thin wrapper around NEXT-FN which will
|
||||
;; update the corresponding values of *REGS-START* and *REGS-END*
|
||||
;; after the inner matcher has succeeded
|
||||
(flet ((store-end-of-reg (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function next-fn))
|
||||
(setf (svref *reg-starts* num) (svref *regs-maybe-start* num)
|
||||
(svref *reg-ends* num) start-pos)
|
||||
(funcall next-fn start-pos)))
|
||||
;; the inner matcher is a closure corresponding to the regex
|
||||
;; wrapped by this REGISTER
|
||||
(let ((inner-matcher (create-matcher-aux (regex register)
|
||||
#'store-end-of-reg)))
|
||||
(declare (type function inner-matcher))
|
||||
;; here comes the actual closure for REGISTER
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
;; remember the old values of *REGS-START* and friends in
|
||||
;; case we cannot match
|
||||
(let ((old-*reg-starts* (svref *reg-starts* num))
|
||||
(old-*regs-maybe-start* (svref *regs-maybe-start* num))
|
||||
(old-*reg-ends* (svref *reg-ends* num)))
|
||||
;; we cannot use *REGS-START* here because Perl allows
|
||||
;; regular expressions like /(a|\1x)*/
|
||||
(setf (svref *regs-maybe-start* num) start-pos)
|
||||
(let ((next-pos (funcall inner-matcher start-pos)))
|
||||
(unless next-pos
|
||||
;; restore old values on failure
|
||||
(setf (svref *reg-starts* num) old-*reg-starts*
|
||||
(svref *regs-maybe-start* num) old-*regs-maybe-start*
|
||||
(svref *reg-ends* num) old-*reg-ends*))
|
||||
next-pos)))))))
|
||||
|
||||
(defmethod create-matcher-aux ((lookahead lookahead) next-fn)
|
||||
;; create a closure which just checks for the inner regex and
|
||||
;; doesn't care about NEXT-FN
|
||||
(let ((test-matcher (create-matcher-aux (regex lookahead) #'identity)))
|
||||
(declare (type function next-fn test-matcher))
|
||||
(if (positivep lookahead)
|
||||
;; positive look-ahead: check success of inner regex, then call
|
||||
;; NEXT-FN
|
||||
(lambda (start-pos)
|
||||
(and (funcall test-matcher start-pos)
|
||||
(funcall next-fn start-pos)))
|
||||
;; negative look-ahead: check failure of inner regex, then call
|
||||
;; NEXT-FN
|
||||
(lambda (start-pos)
|
||||
(and (not (funcall test-matcher start-pos))
|
||||
(funcall next-fn start-pos))))))
|
||||
|
||||
(defmethod create-matcher-aux ((lookbehind lookbehind) next-fn)
|
||||
(let ((len (len lookbehind))
|
||||
;; create a closure which just checks for the inner regex and
|
||||
;; doesn't care about NEXT-FN
|
||||
(test-matcher (create-matcher-aux (regex lookbehind) #'identity)))
|
||||
(declare (type function next-fn test-matcher)
|
||||
(type fixnum len))
|
||||
(if (positivep lookbehind)
|
||||
;; positive look-behind: check success of inner regex (if we're
|
||||
;; far enough from the start of *STRING*), then call NEXT-FN
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (>= (- start-pos *start-pos*) len)
|
||||
(funcall test-matcher (- start-pos len))
|
||||
(funcall next-fn start-pos)))
|
||||
;; negative look-behind: check failure of inner regex (if we're
|
||||
;; far enough from the start of *STRING*), then call NEXT-FN
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (or (< start-pos len)
|
||||
(not (funcall test-matcher (- start-pos len))))
|
||||
(funcall next-fn start-pos))))))
|
||||
|
||||
(defmacro insert-char-class-tester ((char-class chr-expr) &body body)
|
||||
"Utility macro to replace each occurence of '(CHAR-CLASS-TEST)
|
||||
within BODY with the correct test (corresponding to CHAR-CLASS)
|
||||
against CHR-EXPR."
|
||||
(with-unique-names (%char-class)
|
||||
;; the actual substitution is done here: replace
|
||||
;; '(CHAR-CLASS-TEST) with NEW
|
||||
(flet ((substitute-char-class-tester (new)
|
||||
(subst new '(char-class-test) body
|
||||
:test #'equalp)))
|
||||
`(let* ((,%char-class ,char-class)
|
||||
(hash (hash ,%char-class))
|
||||
(count (if hash
|
||||
(hash-table-count hash)
|
||||
most-positive-fixnum))
|
||||
;; collect a list of "all" characters in the hash if
|
||||
;; there aren't more than two
|
||||
(key-list (if (<= count 2)
|
||||
(loop for chr being the hash-keys of hash
|
||||
collect chr)
|
||||
nil))
|
||||
downcasedp)
|
||||
(declare (type fixnum count))
|
||||
;; check if we can partition the hash into three ranges (or
|
||||
;; less)
|
||||
(multiple-value-bind (min1 max1 min2 max2 min3 max3)
|
||||
(create-ranges-from-hash hash)
|
||||
;; if that didn't work and CHAR-CLASS is case-insensitive we
|
||||
;; try it again with every character downcased
|
||||
(when (and (not min1)
|
||||
(case-insensitive-p ,%char-class))
|
||||
(multiple-value-setq (min1 max1 min2 max2 min3 max3)
|
||||
(create-ranges-from-hash hash :downcasep t))
|
||||
(setq downcasedp t))
|
||||
(cond ((= count 1)
|
||||
;; hash contains exactly one character so we just
|
||||
;; check for this single character; (note that this
|
||||
;; actually can't happen because this case is
|
||||
;; optimized away in CONVERT already...)
|
||||
(let ((chr1 (first key-list)))
|
||||
,@(substitute-char-class-tester
|
||||
`(char= ,chr-expr chr1))))
|
||||
((= count 2)
|
||||
;; hash contains exactly two characters
|
||||
(let ((chr1 (first key-list))
|
||||
(chr2 (second key-list)))
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char= chr chr1)
|
||||
(char= chr chr2))))))
|
||||
((word-char-class-p ,%char-class)
|
||||
;; special-case: hash is \w, \W, [\w], [\W] or
|
||||
;; something equivalent
|
||||
,@(substitute-char-class-tester
|
||||
`(word-char-p ,chr-expr)))
|
||||
((= count *regex-char-code-limit*)
|
||||
;; according to the ANSI standard we might have all
|
||||
;; possible characters in the hash even if it
|
||||
;; doesn't contain CHAR-CODE-LIMIT characters but
|
||||
;; this doesn't seem to be the case for current
|
||||
;; implementations (also note that this optimization
|
||||
;; implies that you must not have characters with
|
||||
;; character codes beyond *REGEX-CHAR-CODE-LIMIT* in
|
||||
;; your regexes if you've changed this limit); we
|
||||
;; expect the compiler to optimize this T "test"
|
||||
;; away
|
||||
,@(substitute-char-class-tester t))
|
||||
((and downcasedp min1 min2 min3)
|
||||
;; three different ranges, downcased
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char-not-greaterp min1 chr max1)
|
||||
(char-not-greaterp min2 chr max2)
|
||||
(char-not-greaterp min3 chr max3)))))
|
||||
((and downcasedp min1 min2)
|
||||
;; two ranges, downcased
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char-not-greaterp min1 chr max1)
|
||||
(char-not-greaterp min2 chr max2)))))
|
||||
((and downcasedp min1)
|
||||
;; one downcased range
|
||||
,@(substitute-char-class-tester
|
||||
`(char-not-greaterp min1 ,chr-expr max1)))
|
||||
((and min1 min2 min3)
|
||||
;; three ranges
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char<= min1 chr max1)
|
||||
(char<= min2 chr max2)
|
||||
(char<= min3 chr max3)))))
|
||||
((and min1 min2)
|
||||
;; two ranges
|
||||
,@(substitute-char-class-tester
|
||||
`(let ((chr ,chr-expr))
|
||||
(or (char<= min1 chr max1)
|
||||
(char<= min2 chr max2)))))
|
||||
(min1
|
||||
;; one range
|
||||
,@(substitute-char-class-tester
|
||||
`(char<= min1 ,chr-expr max1)))
|
||||
(t
|
||||
;; the general case; note that most of the above
|
||||
;; "optimizations" are based on experiences and
|
||||
;; benchmarks with CMUCL - if you're really
|
||||
;; concerned with speed you might find out that the
|
||||
;; general case is almost always the best one for
|
||||
;; other implementations (because the speed of their
|
||||
;; hash-table access in relation to other operations
|
||||
;; might be better than in CMUCL)
|
||||
,@(substitute-char-class-tester
|
||||
`(gethash ,chr-expr hash)))))))))
|
||||
|
||||
(defmethod create-matcher-aux ((char-class char-class) next-fn)
|
||||
(declare (type function next-fn))
|
||||
;; insert a test against the current character within *STRING*
|
||||
(insert-char-class-tester (char-class (schar *string* start-pos))
|
||||
(if (invertedp char-class)
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(not (char-class-test))
|
||||
(funcall next-fn (1+ start-pos))))
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(char-class-test)
|
||||
(funcall next-fn (1+ start-pos)))))))
|
||||
|
||||
(defmethod create-matcher-aux ((str str) next-fn)
|
||||
(declare (type fixnum *end-string-pos*)
|
||||
(type function next-fn)
|
||||
;; this special value is set by CREATE-SCANNER when the
|
||||
;; closures are built
|
||||
(special end-string))
|
||||
(let* ((len (len str))
|
||||
(case-insensitive-p (case-insensitive-p str))
|
||||
(start-of-end-string-p (start-of-end-string-p str))
|
||||
(skip (skip str))
|
||||
(str (str str))
|
||||
(chr (schar str 0))
|
||||
(end-string (and end-string (str end-string)))
|
||||
(end-string-len (if end-string
|
||||
(length end-string)
|
||||
nil)))
|
||||
(declare (type fixnum len))
|
||||
(cond ((and start-of-end-string-p case-insensitive-p)
|
||||
;; closure for the first STR which belongs to the constant
|
||||
;; string at the end of the regular expression;
|
||||
;; case-insensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos end-string-len))
|
||||
(let ((test-end-pos (+ start-pos end-string-len)))
|
||||
(declare (type fixnum test-end-pos))
|
||||
;; either we're at *END-STRING-POS* (which means that
|
||||
;; it has already been confirmed that end-string
|
||||
;; starts here) or we really have to test
|
||||
(and (or (= start-pos *end-string-pos*)
|
||||
(and (<= test-end-pos *end-pos*)
|
||||
(*string*-equal end-string start-pos test-end-pos
|
||||
0 end-string-len)))
|
||||
(funcall next-fn (+ start-pos len))))))
|
||||
(start-of-end-string-p
|
||||
;; closure for the first STR which belongs to the constant
|
||||
;; string at the end of the regular expression;
|
||||
;; case-sensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos end-string-len))
|
||||
(let ((test-end-pos (+ start-pos end-string-len)))
|
||||
(declare (type fixnum test-end-pos))
|
||||
;; either we're at *END-STRING-POS* (which means that
|
||||
;; it has already been confirmed that end-string
|
||||
;; starts here) or we really have to test
|
||||
(and (or (= start-pos *end-string-pos*)
|
||||
(and (<= test-end-pos *end-pos*)
|
||||
(*string*= end-string start-pos test-end-pos
|
||||
0 end-string-len)))
|
||||
(funcall next-fn (+ start-pos len))))))
|
||||
(skip
|
||||
;; a STR which can be skipped because some other function
|
||||
;; has already confirmed that it matches
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(funcall next-fn (+ start-pos len))))
|
||||
((and (= len 1) case-insensitive-p)
|
||||
;; STR represent exactly one character; case-insensitive
|
||||
;; version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(char-equal (schar *string* start-pos) chr)
|
||||
(funcall next-fn (1+ start-pos)))))
|
||||
((= len 1)
|
||||
;; STR represent exactly one character; case-sensitive
|
||||
;; version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(char= (schar *string* start-pos) chr)
|
||||
(funcall next-fn (1+ start-pos)))))
|
||||
(case-insensitive-p
|
||||
;; general case, case-insensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((next-pos (+ start-pos len)))
|
||||
(declare (type fixnum next-pos))
|
||||
(and (<= next-pos *end-pos*)
|
||||
(*string*-equal str start-pos next-pos 0 len)
|
||||
(funcall next-fn next-pos)))))
|
||||
(t
|
||||
;; general case, case-sensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((next-pos (+ start-pos len)))
|
||||
(declare (type fixnum next-pos))
|
||||
(and (<= next-pos *end-pos*)
|
||||
(*string*= str start-pos next-pos 0 len)
|
||||
(funcall next-fn next-pos))))))))
|
||||
|
||||
(declaim (inline word-boundary-p))
|
||||
|
||||
(defun word-boundary-p (start-pos)
|
||||
"Check whether START-POS is a word-boundary within *STRING*."
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((1-start-pos (1- start-pos)))
|
||||
;; either the character before START-POS is a word-constituent and
|
||||
;; the character at START-POS isn't...
|
||||
(or (and (or (= start-pos *end-pos*)
|
||||
(and (< start-pos *end-pos*)
|
||||
(not (word-char-p (schar *string* start-pos)))))
|
||||
(and (< 1-start-pos *end-pos*)
|
||||
(<= *start-pos* 1-start-pos)
|
||||
(word-char-p (schar *string* 1-start-pos))))
|
||||
;; ...or vice versa
|
||||
(and (or (= start-pos *start-pos*)
|
||||
(and (< 1-start-pos *end-pos*)
|
||||
(<= *start-pos* 1-start-pos)
|
||||
(not (word-char-p (schar *string* 1-start-pos)))))
|
||||
(and (< start-pos *end-pos*)
|
||||
(word-char-p (schar *string* start-pos)))))))
|
||||
|
||||
(defmethod create-matcher-aux ((word-boundary word-boundary) next-fn)
|
||||
(declare (type function next-fn))
|
||||
(if (negatedp word-boundary)
|
||||
(lambda (start-pos)
|
||||
(and (not (word-boundary-p start-pos))
|
||||
(funcall next-fn start-pos)))
|
||||
(lambda (start-pos)
|
||||
(and (word-boundary-p start-pos)
|
||||
(funcall next-fn start-pos)))))
|
||||
|
||||
(defmethod create-matcher-aux ((everything everything) next-fn)
|
||||
(declare (type function next-fn))
|
||||
(if (single-line-p everything)
|
||||
;; closure for single-line-mode: we really match everything, so we
|
||||
;; just advance the index into *STRING* by one and carry on
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(funcall next-fn (1+ start-pos))))
|
||||
;; not single-line-mode, so we have to make sure we don't match
|
||||
;; #\Newline
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (< start-pos *end-pos*)
|
||||
(char/= (schar *string* start-pos) #\Newline)
|
||||
(funcall next-fn (1+ start-pos))))))
|
||||
|
||||
(defmethod create-matcher-aux ((anchor anchor) next-fn)
|
||||
(declare (type function next-fn))
|
||||
(let ((startp (startp anchor))
|
||||
(multi-line-p (multi-line-p anchor)))
|
||||
(cond ((no-newline-p anchor)
|
||||
;; this must be and end-anchor and it must be modeless, so
|
||||
;; we just have to check whether START-POS equals
|
||||
;; *END-POS*
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (= start-pos *end-pos*)
|
||||
(funcall next-fn start-pos))))
|
||||
((and startp multi-line-p)
|
||||
;; a start-anchor in multi-line-mode: check if we're at
|
||||
;; *START-POS* or if the last character was #\Newline
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((*start-pos* (or *real-start-pos* *start-pos*)))
|
||||
(and (or (= start-pos *start-pos*)
|
||||
(and (<= start-pos *end-pos*)
|
||||
(> start-pos *start-pos*)
|
||||
(char= #\Newline
|
||||
(schar *string* (1- start-pos)))))
|
||||
(funcall next-fn start-pos)))))
|
||||
(startp
|
||||
;; a start-anchor which is not in multi-line-mode, so just
|
||||
;; check whether we're at *START-POS*
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (= start-pos (or *real-start-pos* *start-pos*))
|
||||
(funcall next-fn start-pos))))
|
||||
(multi-line-p
|
||||
;; an end-anchor in multi-line-mode: check if we're at
|
||||
;; *END-POS* or if the character we're looking at is
|
||||
;; #\Newline
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (or (= start-pos *end-pos*)
|
||||
(and (< start-pos *end-pos*)
|
||||
(char= #\Newline
|
||||
(schar *string* start-pos))))
|
||||
(funcall next-fn start-pos))))
|
||||
(t
|
||||
;; an end-anchor which is not in multi-line-mode, so just
|
||||
;; check if we're at *END-POS* or if we're looking at
|
||||
;; #\Newline and there's nothing behind it
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(and (or (= start-pos *end-pos*)
|
||||
(and (= start-pos (1- *end-pos*))
|
||||
(char= #\Newline
|
||||
(schar *string* start-pos))))
|
||||
(funcall next-fn start-pos)))))))
|
||||
|
||||
(defmethod create-matcher-aux ((back-reference back-reference) next-fn)
|
||||
(declare (type function next-fn))
|
||||
;; the position of the corresponding REGISTER within the whole
|
||||
;; regex; we start to count at 0
|
||||
(let ((num (num back-reference)))
|
||||
(if (case-insensitive-p back-reference)
|
||||
;; the case-insensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((reg-start (svref *reg-starts* num))
|
||||
(reg-end (svref *reg-ends* num)))
|
||||
;; only bother to check if the corresponding REGISTER as
|
||||
;; matched successfully already
|
||||
(and reg-start
|
||||
(let ((next-pos (+ start-pos (- (the fixnum reg-end)
|
||||
(the fixnum reg-start)))))
|
||||
(declare (type fixnum next-pos))
|
||||
(and
|
||||
(<= next-pos *end-pos*)
|
||||
(*string*-equal *string* start-pos next-pos
|
||||
reg-start reg-end)
|
||||
(funcall next-fn next-pos))))))
|
||||
;; the case-sensitive version
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((reg-start (svref *reg-starts* num))
|
||||
(reg-end (svref *reg-ends* num)))
|
||||
;; only bother to check if the corresponding REGISTER as
|
||||
;; matched successfully already
|
||||
(and reg-start
|
||||
(let ((next-pos (+ start-pos (- (the fixnum reg-end)
|
||||
(the fixnum reg-start)))))
|
||||
(declare (type fixnum next-pos))
|
||||
(and
|
||||
(<= next-pos *end-pos*)
|
||||
(*string*= *string* start-pos next-pos
|
||||
reg-start reg-end)
|
||||
(funcall next-fn next-pos)))))))))
|
||||
|
||||
(defmethod create-matcher-aux ((branch branch) next-fn)
|
||||
(let* ((test (test branch))
|
||||
(then-matcher (create-matcher-aux (then-regex branch) next-fn))
|
||||
(else-matcher (create-matcher-aux (else-regex branch) next-fn)))
|
||||
(declare (type function then-matcher else-matcher))
|
||||
(cond ((numberp test)
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum test))
|
||||
(if (and (< test (length *reg-starts*))
|
||||
(svref *reg-starts* test))
|
||||
(funcall then-matcher start-pos)
|
||||
(funcall else-matcher start-pos))))
|
||||
(t
|
||||
(let ((test-matcher (create-matcher-aux test #'identity)))
|
||||
(declare (type function test-matcher))
|
||||
(lambda (start-pos)
|
||||
(if (funcall test-matcher start-pos)
|
||||
(funcall then-matcher start-pos)
|
||||
(funcall else-matcher start-pos))))))))
|
||||
|
||||
(defmethod create-matcher-aux ((standalone standalone) next-fn)
|
||||
(let ((inner-matcher (create-matcher-aux (regex standalone) #'identity)))
|
||||
(declare (type function next-fn inner-matcher))
|
||||
(lambda (start-pos)
|
||||
(let ((next-pos (funcall inner-matcher start-pos)))
|
||||
(and next-pos
|
||||
(funcall next-fn next-pos))))))
|
||||
|
||||
(defmethod create-matcher-aux ((void void) next-fn)
|
||||
;; optimize away VOIDs: don't create a closure, just return NEXT-FN
|
||||
next-fn)
|
||||
775
convert.lisp
Normal file
775
convert.lisp
Normal file
@ -0,0 +1,775 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/convert.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Here the parse tree is converted into its internal representation
|
||||
;;; using REGEX objects. At the same time some optimizations are
|
||||
;;; already applied.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
;;; The flags that represent the "ism" modifiers are always kept
|
||||
;;; together in a three-element list. We use the following macros to
|
||||
;;; access individual elements.
|
||||
|
||||
(defmacro case-insensitive-mode-p (flags)
|
||||
"Accessor macro to extract the first flag out of a three-element flag list."
|
||||
`(first ,flags))
|
||||
|
||||
(defmacro multi-line-mode-p (flags)
|
||||
"Accessor macro to extract the second flag out of a three-element flag list."
|
||||
`(second ,flags))
|
||||
|
||||
(defmacro single-line-mode-p (flags)
|
||||
"Accessor macro to extract the third flag out of a three-element flag list."
|
||||
`(third ,flags))
|
||||
|
||||
(defun set-flag (token)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (special flags))
|
||||
"Reads a flag token and sets or unsets the corresponding entry in
|
||||
the special FLAGS list."
|
||||
(case token
|
||||
((:case-insensitive-p)
|
||||
(setf (case-insensitive-mode-p flags) t))
|
||||
((:case-sensitive-p)
|
||||
(setf (case-insensitive-mode-p flags) nil))
|
||||
((:multi-line-mode-p)
|
||||
(setf (multi-line-mode-p flags) t))
|
||||
((:not-multi-line-mode-p)
|
||||
(setf (multi-line-mode-p flags) nil))
|
||||
((:single-line-mode-p)
|
||||
(setf (single-line-mode-p flags) t))
|
||||
((:not-single-line-mode-p)
|
||||
(setf (single-line-mode-p flags) nil))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error "Unknown flag token ~A" token))))
|
||||
|
||||
(defun add-range-to-hash (hash from to)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (special flags))
|
||||
"Adds all characters from character FROM to character TO (inclusive)
|
||||
to the char class hash HASH. Does the right thing with respect to
|
||||
case-(in)sensitivity as specified by the special variable FLAGS."
|
||||
(let ((from-code (char-code from))
|
||||
(to-code (char-code to)))
|
||||
(when (> from-code to-code)
|
||||
(signal-ppcre-syntax-error "Invalid range from ~A to ~A in char-class"
|
||||
from to))
|
||||
(cond ((case-insensitive-mode-p flags)
|
||||
(loop for code from from-code to to-code
|
||||
for chr = (code-char code)
|
||||
do (setf (gethash (char-upcase chr) hash) t
|
||||
(gethash (char-downcase chr) hash) t)))
|
||||
(t
|
||||
(loop for code from from-code to to-code
|
||||
do (setf (gethash (code-char code) hash) t))))
|
||||
hash))
|
||||
|
||||
(defun convert-char-class-to-hash (list)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Combines all items in LIST into one char class hash and returns it.
|
||||
Items can be single characters, character ranges like \(:RANGE #\\A
|
||||
#\\E), or special character classes like :DIGIT-CLASS. Does the right
|
||||
thing with respect to case-\(in)sensitivity as specified by the
|
||||
special variable FLAGS."
|
||||
(loop with hash = (make-hash-table :size (ceiling (expt *regex-char-code-limit* (/ 1 4)))
|
||||
:rehash-size (float (expt *regex-char-code-limit* (/ 1 4)))
|
||||
:rehash-threshold 1.0)
|
||||
for item in list
|
||||
if (characterp item)
|
||||
;; treat a single character C like a range (:RANGE C C)
|
||||
do (add-range-to-hash hash item item)
|
||||
else if (symbolp item)
|
||||
;; special character classes
|
||||
do (setq hash
|
||||
(case item
|
||||
((:digit-class)
|
||||
(merge-hash hash +digit-hash+))
|
||||
((:non-digit-class)
|
||||
(merge-inverted-hash hash +digit-hash+))
|
||||
((:whitespace-char-class)
|
||||
(merge-hash hash +whitespace-char-hash+))
|
||||
((:non-whitespace-char-class)
|
||||
(merge-inverted-hash hash +whitespace-char-hash+))
|
||||
((:word-char-class)
|
||||
(merge-hash hash +word-char-hash+))
|
||||
((:non-word-char-class)
|
||||
(merge-inverted-hash hash +word-char-hash+))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error
|
||||
"Unknown symbol ~A in character class"
|
||||
item))))
|
||||
else if (and (consp item)
|
||||
(eq (car item) :range))
|
||||
;; proper ranges
|
||||
do (add-range-to-hash hash
|
||||
(second item)
|
||||
(third item))
|
||||
else do (signal-ppcre-syntax-error "Unknown item ~A in char-class list"
|
||||
item)
|
||||
finally (return hash)))
|
||||
|
||||
(defun maybe-split-repetition (regex
|
||||
greedyp
|
||||
minimum
|
||||
maximum
|
||||
min-len
|
||||
length
|
||||
reg-seen)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (type fixnum minimum)
|
||||
(type (or fixnum null) maximum))
|
||||
"Splits a REPETITION object into a constant and a varying part if
|
||||
applicable, i.e. something like
|
||||
a{3,} -> a{3}a*
|
||||
The arguments to this function correspond to the REPETITION slots of
|
||||
the same name."
|
||||
;; note the usage of COPY-REGEX here; we can't use the same REGEX
|
||||
;; object in both REPETITIONS because they will have different
|
||||
;; offsets
|
||||
(when maximum
|
||||
(when (zerop maximum)
|
||||
;; trivial case: don't repeat at all
|
||||
(return-from maybe-split-repetition
|
||||
(make-instance 'void)))
|
||||
(when (= 1 minimum maximum)
|
||||
;; another trivial case: "repeat" exactly once
|
||||
(return-from maybe-split-repetition
|
||||
regex)))
|
||||
;; first set up the constant part of the repetition
|
||||
;; maybe that's all we need
|
||||
(let ((constant-repetition (if (plusp minimum)
|
||||
(make-instance 'repetition
|
||||
:regex (copy-regex regex)
|
||||
:greedyp greedyp
|
||||
:minimum minimum
|
||||
:maximum minimum
|
||||
:min-len min-len
|
||||
:len length
|
||||
:contains-register-p reg-seen)
|
||||
;; don't create garbage if minimum is 0
|
||||
nil)))
|
||||
(when (and maximum
|
||||
(= maximum minimum))
|
||||
(return-from maybe-split-repetition
|
||||
;; no varying part needed because min = max
|
||||
constant-repetition))
|
||||
;; now construct the varying part
|
||||
(let ((varying-repetition
|
||||
(make-instance 'repetition
|
||||
:regex regex
|
||||
:greedyp greedyp
|
||||
:minimum 0
|
||||
:maximum (if maximum (- maximum minimum) nil)
|
||||
:min-len min-len
|
||||
:len length
|
||||
:contains-register-p reg-seen)))
|
||||
(cond ((zerop minimum)
|
||||
;; min = 0, no constant part needed
|
||||
varying-repetition)
|
||||
((= 1 minimum)
|
||||
;; min = 1, constant part needs no REPETITION wrapped around
|
||||
(make-instance 'seq
|
||||
:elements (list (copy-regex regex)
|
||||
varying-repetition)))
|
||||
(t
|
||||
;; general case
|
||||
(make-instance 'seq
|
||||
:elements (list constant-repetition
|
||||
varying-repetition)))))))
|
||||
|
||||
;; During the conversion of the parse tree we keep track of the start
|
||||
;; of the parse tree in the special variable STARTS-WITH which'll
|
||||
;; either hold a STR object or an EVERYTHING object. The latter is the
|
||||
;; case if the regex starts with ".*" which implicitely anchors the
|
||||
;; regex at the start (perhaps modulo #\Newline).
|
||||
|
||||
(defmethod maybe-accumulate ((str str))
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (special accumulate-start-p starts-with))
|
||||
(declare (ftype (function (t) fixnum) len))
|
||||
"Accumulate STR into the special variable STARTS-WITH if
|
||||
ACCUMULATE-START-P (also special) is true and STARTS-WITH is either
|
||||
NIL or a STR object of the same case mode. Always returns NIL."
|
||||
(when accumulate-start-p
|
||||
(etypecase starts-with
|
||||
(str
|
||||
;; STARTS-WITH already holds a STR, so we check if we can
|
||||
;; concatenate
|
||||
(cond ((eq (case-insensitive-p starts-with)
|
||||
(case-insensitive-p str))
|
||||
;; we modify STARTS-WITH in place
|
||||
(setf (len starts-with)
|
||||
(+ (len starts-with) (len str)))
|
||||
;; note that we use SLOT-VALUE because the accessor
|
||||
;; STR has a declared FTYPE which doesn't fit here
|
||||
(adjust-array (slot-value starts-with 'str)
|
||||
(len starts-with)
|
||||
:fill-pointer t)
|
||||
(setf (subseq (slot-value starts-with 'str)
|
||||
(- (len starts-with) (len str)))
|
||||
(str str)
|
||||
;; STR objects that are parts of STARTS-WITH
|
||||
;; always have their SKIP slot set to true
|
||||
;; because the SCAN function will take care of
|
||||
;; them, i.e. the matcher can ignore them
|
||||
(skip str) t))
|
||||
(t (setq accumulate-start-p nil))))
|
||||
(null
|
||||
;; STARTS-WITH is still empty, so we create a new STR object
|
||||
(setf starts-with
|
||||
(make-instance 'str
|
||||
:str ""
|
||||
:case-insensitive-p (case-insensitive-p str))
|
||||
;; INITIALIZE-INSTANCE will coerce the STR to a simple
|
||||
;; string, so we have to fill it afterwards
|
||||
(slot-value starts-with 'str)
|
||||
(make-array (len str)
|
||||
:initial-contents (str str)
|
||||
:element-type 'character
|
||||
:fill-pointer t
|
||||
:adjustable t)
|
||||
(len starts-with)
|
||||
(len str)
|
||||
;; see remark about SKIP above
|
||||
(skip str) t))
|
||||
(everything
|
||||
;; STARTS-WITH already holds an EVERYTHING object - we can't
|
||||
;; concatenate
|
||||
(setq accumulate-start-p nil))))
|
||||
nil)
|
||||
|
||||
(defun convert-aux (parse-tree)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (special flags reg-num accumulate-start-p starts-with max-back-ref))
|
||||
"Converts the parse tree PARSE-TREE into a REGEX object and returns it.
|
||||
|
||||
Will also
|
||||
- split and optimize repetitions,
|
||||
- accumulate strings or EVERYTHING objects into the special variable
|
||||
STARTS-WITH,
|
||||
- keep track of all registers seen in the special variable REG-NUM,
|
||||
- keep track of the highest backreference seen in the special
|
||||
variable MAX-BACK-REF,
|
||||
- maintain and adher to the currently applicable modifiers in the special
|
||||
variable FLAGS, and
|
||||
- maybe even wash your car..."
|
||||
(cond ((consp parse-tree)
|
||||
(case (first parse-tree)
|
||||
;; (:SEQUENCE {<regex>}*)
|
||||
((:sequence)
|
||||
(cond ((cddr parse-tree)
|
||||
;; this is essentially like
|
||||
;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE))
|
||||
;; but we don't cons a new list
|
||||
(loop for parse-tree-rest on (rest parse-tree)
|
||||
while parse-tree-rest
|
||||
do (setf (car parse-tree-rest)
|
||||
(convert-aux (car parse-tree-rest))))
|
||||
(make-instance 'seq
|
||||
:elements (rest parse-tree)))
|
||||
(t (convert-aux (second parse-tree)))))
|
||||
;; (:GROUP {<regex>}*)
|
||||
;; this is a syntactical construct equivalent to :SEQUENCE
|
||||
;; intended to keep the effect of modifiers local
|
||||
((:group)
|
||||
;; make a local copy of FLAGS and shadow the global
|
||||
;; value while we descend into the enclosed regexes
|
||||
(let ((flags (copy-list flags)))
|
||||
(declare (special flags))
|
||||
(cond ((cddr parse-tree)
|
||||
(loop for parse-tree-rest on (rest parse-tree)
|
||||
while parse-tree-rest
|
||||
do (setf (car parse-tree-rest)
|
||||
(convert-aux (car parse-tree-rest))))
|
||||
(make-instance 'seq
|
||||
:elements (rest parse-tree)))
|
||||
(t (convert-aux (second parse-tree))))))
|
||||
;; (:ALTERNATION {<regex>}*)
|
||||
((:alternation)
|
||||
;; we must stop accumulating objects into STARTS-WITH
|
||||
;; once we reach an alternation
|
||||
(setq accumulate-start-p nil)
|
||||
(loop for parse-tree-rest on (rest parse-tree)
|
||||
while parse-tree-rest
|
||||
do (setf (car parse-tree-rest)
|
||||
(convert-aux (car parse-tree-rest))))
|
||||
(make-instance 'alternation
|
||||
:choices (rest parse-tree)))
|
||||
;; (:BRANCH <test> <regex>)
|
||||
;; <test> must be look-ahead, look-behind or number;
|
||||
;; if <regex> is an alternation it must have one or two
|
||||
;; choices
|
||||
((:branch)
|
||||
(setq accumulate-start-p nil)
|
||||
(let* ((test-candidate (second parse-tree))
|
||||
(test (cond ((numberp test-candidate)
|
||||
(when (zerop (the fixnum test-candidate))
|
||||
(signal-ppcre-syntax-error
|
||||
"Register 0 doesn't exist: ~S"
|
||||
parse-tree))
|
||||
(1- (the fixnum test-candidate)))
|
||||
(t (convert-aux test-candidate))))
|
||||
(alternations (convert-aux (third parse-tree))))
|
||||
(when (and (not (numberp test))
|
||||
(not (typep test 'lookahead))
|
||||
(not (typep test 'lookbehind)))
|
||||
(signal-ppcre-syntax-error
|
||||
"Branch test must be look-ahead, look-behind or number: ~S"
|
||||
parse-tree))
|
||||
(typecase alternations
|
||||
(alternation
|
||||
(case (length (choices alternations))
|
||||
((0)
|
||||
(signal-ppcre-syntax-error "No choices in branch: ~S"
|
||||
parse-tree))
|
||||
((1)
|
||||
(make-instance 'branch
|
||||
:test test
|
||||
:then-regex (first
|
||||
(choices alternations))))
|
||||
((2)
|
||||
(make-instance 'branch
|
||||
:test test
|
||||
:then-regex (first
|
||||
(choices alternations))
|
||||
:else-regex (second
|
||||
(choices alternations))))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error
|
||||
"Too much choices in branch: ~S"
|
||||
parse-tree))))
|
||||
(t
|
||||
(make-instance 'branch
|
||||
:test test
|
||||
:then-regex alternations)))))
|
||||
;; (:POSITIVE-LOOKAHEAD|:NEGATIVE-LOOKAHEAD <regex>)
|
||||
((:positive-lookahead :negative-lookahead)
|
||||
;; keep the effect of modifiers local to the enclosed
|
||||
;; regex and stop accumulating into STARTS-WITH
|
||||
(setq accumulate-start-p nil)
|
||||
(let ((flags (copy-list flags)))
|
||||
(declare (special flags))
|
||||
(make-instance 'lookahead
|
||||
:regex (convert-aux (second parse-tree))
|
||||
:positivep (eq (first parse-tree)
|
||||
:positive-lookahead))))
|
||||
;; (:POSITIVE-LOOKBEHIND|:NEGATIVE-LOOKBEHIND <regex>)
|
||||
((:positive-lookbehind :negative-lookbehind)
|
||||
;; keep the effect of modifiers local to the enclosed
|
||||
;; regex and stop accumulating into STARTS-WITH
|
||||
(setq accumulate-start-p nil)
|
||||
(let* ((flags (copy-list flags))
|
||||
(regex (convert-aux (second parse-tree)))
|
||||
(len (regex-length regex)))
|
||||
(declare (special flags))
|
||||
;; lookbehind assertions must be of fixed length
|
||||
(unless len
|
||||
(signal-ppcre-syntax-error
|
||||
"Variable length look-behind not implemented (yet): ~S"
|
||||
parse-tree))
|
||||
(make-instance 'lookbehind
|
||||
:regex regex
|
||||
:positivep (eq (first parse-tree)
|
||||
:positive-lookbehind)
|
||||
:len len)))
|
||||
;; (:GREEDY-REPETITION|:NON-GREEDY-REPETITION <min> <max> <regex>)
|
||||
((:greedy-repetition :non-greedy-repetition)
|
||||
;; remember the value of ACCUMULATE-START-P upon entering
|
||||
(let ((local-accumulate-start-p accumulate-start-p))
|
||||
(let ((minimum (second parse-tree))
|
||||
(maximum (third parse-tree)))
|
||||
(declare (type fixnum minimum))
|
||||
(declare (type (or null fixnum) maximum))
|
||||
(unless (and maximum
|
||||
(= 1 minimum maximum))
|
||||
;; set ACCUMULATE-START-P to NIL for the rest of
|
||||
;; the conversion because we can't continue to
|
||||
;; accumulate inside as well as after a proper
|
||||
;; repetition
|
||||
(setq accumulate-start-p nil))
|
||||
(let* (reg-seen
|
||||
(regex (convert-aux (fourth parse-tree)))
|
||||
(min-len (regex-min-length regex))
|
||||
(greedyp (eq (first parse-tree) :greedy-repetition))
|
||||
(length (regex-length regex)))
|
||||
;; note that this declaration already applies to
|
||||
;; the call to CONVERT-AUX above
|
||||
(declare (special reg-seen))
|
||||
(when (and local-accumulate-start-p
|
||||
(not starts-with)
|
||||
(zerop minimum)
|
||||
(not maximum))
|
||||
;; if this repetition is (equivalent to) ".*"
|
||||
;; and if we're at the start of the regex we
|
||||
;; remember it for ADVANCE-FN (see the SCAN
|
||||
;; function)
|
||||
(setq starts-with (everythingp regex)))
|
||||
(if (or (not reg-seen)
|
||||
(not greedyp)
|
||||
(not length)
|
||||
(zerop length)
|
||||
(and maximum (= minimum maximum)))
|
||||
;; the repetition doesn't enclose a register, or
|
||||
;; it's not greedy, or we can't determine it's
|
||||
;; (inner) length, or the length is zero, or the
|
||||
;; number of repetitions is fixed; in all of
|
||||
;; these cases we don't bother to optimize
|
||||
(maybe-split-repetition regex
|
||||
greedyp
|
||||
minimum
|
||||
maximum
|
||||
min-len
|
||||
length
|
||||
reg-seen)
|
||||
;; otherwise we make a transformation that looks
|
||||
;; roughly like one of
|
||||
;; <regex>* -> (?:<regex'>*<regex>)?
|
||||
;; <regex>+ -> <regex'>*<regex>
|
||||
;; where the trick is that as much as possible
|
||||
;; registers from <regex> are removed in
|
||||
;; <regex'>
|
||||
(let* (reg-seen ; new instance for REMOVE-REGISTERS
|
||||
(remove-registers-p t)
|
||||
(inner-regex (remove-registers regex))
|
||||
(inner-repetition
|
||||
;; this is the "<regex'>" part
|
||||
(maybe-split-repetition inner-regex
|
||||
;; always greedy
|
||||
t
|
||||
;; reduce minimum by 1
|
||||
;; unless it's already 0
|
||||
(if (zerop minimum)
|
||||
0
|
||||
(1- minimum))
|
||||
;; reduce maximum by 1
|
||||
;; unless it's NIL
|
||||
(and maximum
|
||||
(1- maximum))
|
||||
min-len
|
||||
length
|
||||
reg-seen))
|
||||
(inner-seq
|
||||
;; this is the "<regex'>*<regex>" part
|
||||
(make-instance 'seq
|
||||
:elements (list inner-repetition
|
||||
regex))))
|
||||
;; note that this declaration already applies
|
||||
;; to the call to REMOVE-REGISTERS above
|
||||
(declare (special remove-registers-p reg-seen))
|
||||
;; wrap INNER-SEQ with a greedy
|
||||
;; {0,1}-repetition (i.e. "?") if necessary
|
||||
(if (plusp minimum)
|
||||
inner-seq
|
||||
(maybe-split-repetition inner-seq
|
||||
t
|
||||
0
|
||||
1
|
||||
min-len
|
||||
nil
|
||||
t))))))))
|
||||
;; (:REGISTER <regex>)
|
||||
((:register)
|
||||
;; keep the effect of modifiers local to the enclosed
|
||||
;; regex; also, assign the current value of REG-NUM to
|
||||
;; the corresponding slot of the REGISTER object and
|
||||
;; increase this counter afterwards
|
||||
(let ((flags (copy-list flags))
|
||||
(stored-reg-num reg-num))
|
||||
(declare (special flags reg-seen))
|
||||
(setq reg-seen t)
|
||||
(incf (the fixnum reg-num))
|
||||
(make-instance 'register
|
||||
:regex (convert-aux (second parse-tree))
|
||||
:num stored-reg-num)))
|
||||
;; (:STANDALONE <regex>)
|
||||
((:standalone)
|
||||
;; keep the effect of modifiers local to the enclosed
|
||||
;; regex
|
||||
(let ((flags (copy-list flags)))
|
||||
(declare (special flags))
|
||||
(make-instance 'standalone
|
||||
:regex (convert-aux (second parse-tree)))))
|
||||
;; (:BACK-REFERENCE <number>)
|
||||
((:back-reference)
|
||||
(let ((backref-number (second parse-tree)))
|
||||
(declare (type fixnum backref-number))
|
||||
(when (or (not (typep backref-number 'fixnum))
|
||||
(<= backref-number 0))
|
||||
(signal-ppcre-syntax-error
|
||||
"Illegal back-reference: ~S"
|
||||
parse-tree))
|
||||
;; stop accumulating into STARTS-WITH and increase
|
||||
;; MAX-BACK-REF if necessary
|
||||
(setq accumulate-start-p nil
|
||||
max-back-ref (max (the fixnum max-back-ref)
|
||||
backref-number))
|
||||
(make-instance 'back-reference
|
||||
;; we start counting from 0 internally
|
||||
:num (1- backref-number)
|
||||
:case-insensitive-p (case-insensitive-mode-p
|
||||
flags))))
|
||||
;; (:CHAR-CLASS|:INVERTED-CHAR-CLASS {<item>}*)
|
||||
;; where item is one of
|
||||
;; - a character
|
||||
;; - a character range: (:RANGE <char1> <char2>)
|
||||
;; - a special char class symbol like :DIGIT-CHAR-CLASS
|
||||
((:char-class :inverted-char-class)
|
||||
;; first create the hash-table and some auxiliary values
|
||||
(let* (hash
|
||||
hash-keys
|
||||
(count most-positive-fixnum)
|
||||
(item-list (rest parse-tree))
|
||||
(invertedp (eq (first parse-tree) :inverted-char-class))
|
||||
word-char-class-p)
|
||||
(cond ((every (lambda (item) (eq item :word-char-class))
|
||||
item-list)
|
||||
;; treat "[\\w]" like "\\w"
|
||||
(setq word-char-class-p t))
|
||||
((every (lambda (item) (eq item :non-word-char-class))
|
||||
item-list)
|
||||
;; treat "[\\W]" like "\\W"
|
||||
(setq word-char-class-p t)
|
||||
(setq invertedp (not invertedp)))
|
||||
(t
|
||||
(setq hash (convert-char-class-to-hash item-list)
|
||||
count (hash-table-count hash))
|
||||
(when (<= count 2)
|
||||
;; collect the hash-table keys into a list if
|
||||
;; COUNT is smaller than 3
|
||||
(setq hash-keys
|
||||
(loop for chr being the hash-keys of hash
|
||||
collect chr)))))
|
||||
(cond ((and (not invertedp)
|
||||
(= count 1))
|
||||
;; convert one-element hash table into a STR
|
||||
;; object and try to accumulate into
|
||||
;; STARTS-WITH
|
||||
(let ((str (make-instance 'str
|
||||
:str (string
|
||||
(first hash-keys))
|
||||
:case-insensitive-p nil)))
|
||||
(maybe-accumulate str)
|
||||
str))
|
||||
((and (not invertedp)
|
||||
(= count 2)
|
||||
(char-equal (first hash-keys) (second hash-keys)))
|
||||
;; convert two-element hash table into a
|
||||
;; case-insensitive STR object and try to
|
||||
;; accumulate into STARTS-WITH if the two
|
||||
;; characters are CHAR-EQUAL
|
||||
(let ((str (make-instance 'str
|
||||
:str (string
|
||||
(first hash-keys))
|
||||
:case-insensitive-p t)))
|
||||
(maybe-accumulate str)
|
||||
str))
|
||||
(t
|
||||
;; the general case; stop accumulating into STARTS-WITH
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class
|
||||
:hash hash
|
||||
:case-insensitive-p
|
||||
(case-insensitive-mode-p flags)
|
||||
:invertedp invertedp
|
||||
:word-char-class-p word-char-class-p)))))
|
||||
;; (:FLAGS {<flag>}*)
|
||||
;; where flag is a modifier symbol like :CASE-INSENSITIVE-P
|
||||
((:flags)
|
||||
;; set/unset the flags corresponding to the symbols
|
||||
;; following :FLAGS
|
||||
(mapc #'set-flag (rest parse-tree))
|
||||
;; we're only interested in the side effect of
|
||||
;; setting/unsetting the flags and turn this syntactical
|
||||
;; construct into a VOID object which'll be optimized
|
||||
;; away when creating the matcher
|
||||
(make-instance 'void))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error
|
||||
"Unknown token ~A in parse-tree"
|
||||
(first parse-tree)))))
|
||||
((or (characterp parse-tree) (stringp parse-tree))
|
||||
;; turn characters or strings into STR objects and try to
|
||||
;; accumulate into STARTS-WITH
|
||||
(let ((str (make-instance 'str
|
||||
:str (string parse-tree)
|
||||
:case-insensitive-p
|
||||
(case-insensitive-mode-p flags))))
|
||||
(maybe-accumulate str)
|
||||
str))
|
||||
(t
|
||||
;; and now for the tokens which are symbols
|
||||
(case parse-tree
|
||||
((:void)
|
||||
(make-instance 'void))
|
||||
((:word-boundary)
|
||||
(make-instance 'word-boundary :negatedp nil))
|
||||
((:non-word-boundary)
|
||||
(make-instance 'word-boundary :negatedp t))
|
||||
;; the special character classes
|
||||
((:digit-class
|
||||
:non-digit-class
|
||||
:word-char-class
|
||||
:non-word-char-class
|
||||
:whitespace-char-class
|
||||
:non-whitespace-char-class)
|
||||
;; stop accumulating into STARTS-WITH
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'char-class
|
||||
;; use the constants defined in util.lisp
|
||||
:hash (case parse-tree
|
||||
((:digit-class
|
||||
:non-digit-class)
|
||||
+digit-hash+)
|
||||
((:word-char-class
|
||||
:non-word-char-class)
|
||||
nil)
|
||||
((:whitespace-char-class
|
||||
:non-whitespace-char-class)
|
||||
+whitespace-char-hash+))
|
||||
;; this value doesn't really matter but
|
||||
;; NIL should result in slightly faster
|
||||
;; matchers
|
||||
:case-insensitive-p nil
|
||||
:invertedp (member parse-tree
|
||||
'(:non-digit-class
|
||||
:non-word-char-class
|
||||
:non-whitespace-char-class)
|
||||
:test #'eq)
|
||||
:word-char-class-p (member parse-tree
|
||||
'(:word-char-class
|
||||
:non-word-char-class)
|
||||
:test #'eq)))
|
||||
((:start-anchor ; Perl's "^"
|
||||
:end-anchor ; Perl's "$"
|
||||
:modeless-end-anchor-no-newline
|
||||
; Perl's "\z"
|
||||
:modeless-start-anchor ; Perl's "\A"
|
||||
:modeless-end-anchor) ; Perl's "\Z"
|
||||
(make-instance 'anchor
|
||||
:startp (member parse-tree
|
||||
'(:start-anchor
|
||||
:modeless-start-anchor)
|
||||
:test #'eq)
|
||||
;; set this value according to the
|
||||
;; current settings of FLAGS (unless it's
|
||||
;; a modeless anchor)
|
||||
:multi-line-p
|
||||
(and (multi-line-mode-p flags)
|
||||
(not (member parse-tree
|
||||
'(:modeless-start-anchor
|
||||
:modeless-end-anchor
|
||||
:modeless-end-anchor-no-newline)
|
||||
:test #'eq)))
|
||||
:no-newline-p
|
||||
(eq parse-tree
|
||||
:modeless-end-anchor-no-newline)))
|
||||
((:everything)
|
||||
;; stop accumulating into STARTS-WITHS
|
||||
(setq accumulate-start-p nil)
|
||||
(make-instance 'everything
|
||||
:single-line-p (single-line-mode-p flags)))
|
||||
;; special tokens corresponding to Perl's "ism" modifiers
|
||||
((:case-insensitive-p
|
||||
:case-sensitive-p
|
||||
:multi-line-mode-p
|
||||
:not-multi-line-mode-p
|
||||
:single-line-mode-p
|
||||
:not-single-line-mode-p)
|
||||
;; we're only interested in the side effect of
|
||||
;; setting/unsetting the flags and turn these tokens
|
||||
;; into VOID objects which'll be optimized away when
|
||||
;; creating the matcher
|
||||
(set-flag parse-tree)
|
||||
(make-instance 'void))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error "Unknown token ~A in parse-tree"
|
||||
parse-tree))))))
|
||||
|
||||
(defun convert (parse-tree)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Converts the parse tree PARSE-TREE into an equivalent REGEX object
|
||||
and returns three values: the REGEX object, the number of registers
|
||||
seen and an object the regex starts with which is either a STR object
|
||||
or an EVERYTHING object (if the regex starts with something like
|
||||
\".*\") or NIL."
|
||||
;; this function basically just initializes the special variables
|
||||
;; and then calls CONVERT-AUX to do all the work
|
||||
(let* ((flags (list nil nil nil))
|
||||
(reg-num 0)
|
||||
(accumulate-start-p t)
|
||||
starts-with
|
||||
(max-back-ref 0)
|
||||
(converted-parse-tree (convert-aux parse-tree)))
|
||||
(declare (special flags reg-num accumulate-start-p starts-with max-back-ref))
|
||||
;; make sure we don't reference registers which aren't there
|
||||
(when (> (the fixnum max-back-ref)
|
||||
(the fixnum reg-num))
|
||||
(signal-ppcre-syntax-error
|
||||
"Backreference to register ~A which has not been defined"
|
||||
max-back-ref))
|
||||
(when (typep starts-with 'str)
|
||||
(setf (slot-value starts-with 'str)
|
||||
(coerce (slot-value starts-with 'str) 'simple-string)))
|
||||
(values converted-parse-tree reg-num starts-with)))
|
||||
1546
doc/benchmarks.2002-12-22.txt
Normal file
1546
doc/benchmarks.2002-12-22.txt
Normal file
File diff suppressed because it is too large
Load Diff
1934
doc/index.html
Normal file
1934
doc/index.html
Normal file
File diff suppressed because it is too large
Load Diff
72
errors.lisp
Normal file
72
errors.lisp
Normal file
@ -0,0 +1,72 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-LISP; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/errors.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(defvar *syntax-error-string* nil
|
||||
"The string which caused the syntax error.")
|
||||
|
||||
(define-condition ppcre-error (simple-error)
|
||||
()
|
||||
(:documentation "All errors signaled by CL-PPCRE are of
|
||||
this type."))
|
||||
|
||||
(define-condition ppcre-syntax-error (ppcre-error)
|
||||
((string :initarg :string
|
||||
:reader ppcre-syntax-error-string)
|
||||
(pos :initarg :pos
|
||||
:reader ppcre-syntax-error-pos))
|
||||
(:default-initargs
|
||||
:pos nil
|
||||
:string *syntax-error-string*)
|
||||
(:report (lambda (condition stream)
|
||||
(format stream "~?~@[ at position ~A~]~@[ in string ~S~]"
|
||||
(simple-condition-format-control condition)
|
||||
(simple-condition-format-arguments condition)
|
||||
(ppcre-syntax-error-pos condition)
|
||||
(ppcre-syntax-error-string condition)))))
|
||||
|
||||
(define-condition ppcre-invocation-error (ppcre-error)
|
||||
()
|
||||
(:documentation "Signaled when CL-PPCRE functions are
|
||||
invoked with wrong arguments."))
|
||||
|
||||
(defmacro signal-ppcre-syntax-error* (pos format-control &rest format-arguments)
|
||||
`(error 'ppcre-syntax-error
|
||||
:pos ,pos
|
||||
:format-control ,format-control
|
||||
:format-arguments (list ,@format-arguments)))
|
||||
|
||||
(defmacro signal-ppcre-syntax-error (format-control &rest format-arguments)
|
||||
`(signal-ppcre-syntax-error* nil ,format-control ,@format-arguments))
|
||||
|
||||
(defmacro signal-ppcre-invocation-error (format-control &rest format-arguments)
|
||||
`(error 'ppcre-invocation-error
|
||||
:format-control ,format-control
|
||||
:format-arguments (list ,@format-arguments)))
|
||||
769
lexer.lisp
Normal file
769
lexer.lisp
Normal file
@ -0,0 +1,769 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/lexer.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; The lexer's responsibility is to convert the regex string into a
|
||||
;;; sequence of tokens which are in turn consumed by the parser.
|
||||
;;;
|
||||
;;; The lexer is aware of Perl's 'extended mode' and it also 'knows'
|
||||
;;; (with a little help from the parser) how many register groups it
|
||||
;;; has opened so far. (The latter is necessary for interpreting
|
||||
;;; strings like "\\10" correctly.)
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(declaim (inline map-char-to-special-class))
|
||||
(defun map-char-to-special-char-class (chr)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Maps escaped characters like \"\\d\" to the tokens which represent
|
||||
their associated character classes."
|
||||
(case chr
|
||||
((#\d)
|
||||
:digit-class)
|
||||
((#\D)
|
||||
:non-digit-class)
|
||||
((#\w)
|
||||
:word-char-class)
|
||||
((#\W)
|
||||
:non-word-char-class)
|
||||
((#\s)
|
||||
:whitespace-char-class)
|
||||
((#\S)
|
||||
:non-whitespace-char-class)))
|
||||
|
||||
(locally
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(defstruct (lexer (:constructor make-lexer-internal))
|
||||
"LEXER structures are used to hold the regex string which is
|
||||
currently lexed and to keep track of the lexer's state."
|
||||
(str ""
|
||||
:type string
|
||||
:read-only t)
|
||||
(len 0
|
||||
:type fixnum
|
||||
:read-only t)
|
||||
(reg 0
|
||||
:type fixnum)
|
||||
(pos 0
|
||||
:type fixnum)
|
||||
(last-pos nil
|
||||
:type list)))
|
||||
|
||||
(defun make-lexer (string)
|
||||
(declare (inline make-lexer-internal)
|
||||
(type string string))
|
||||
(make-lexer-internal :str (maybe-coerce-to-simple-string string)
|
||||
:len (length string)))
|
||||
|
||||
(declaim (inline end-of-string-p))
|
||||
(defun end-of-string-p (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Tests whether we're at the end of the regex string."
|
||||
(<= (lexer-len lexer)
|
||||
(lexer-pos lexer)))
|
||||
|
||||
(declaim (inline looking-at-p))
|
||||
(defun looking-at-p (lexer chr)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Tests whether the next character the lexer would see is CHR.
|
||||
Does not respect extended mode."
|
||||
(and (not (end-of-string-p lexer))
|
||||
(char= (schar (lexer-str lexer) (lexer-pos lexer))
|
||||
chr)))
|
||||
|
||||
(declaim (inline next-char-non-extended))
|
||||
(defun next-char-non-extended (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns the next character which is to be examined and updates the
|
||||
POS slot. Does not respect extended mode."
|
||||
(cond ((end-of-string-p lexer)
|
||||
nil)
|
||||
(t
|
||||
(prog1
|
||||
(schar (lexer-str lexer) (lexer-pos lexer))
|
||||
(incf (lexer-pos lexer))))))
|
||||
|
||||
(defun next-char (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns the next character which is to be examined and updates the
|
||||
POS slot. Respects extended mode, i.e. whitespace, comments, and also
|
||||
nested comments are skipped if applicable."
|
||||
(let ((next-char (next-char-non-extended lexer))
|
||||
last-loop-pos)
|
||||
(loop
|
||||
;; remember where we started
|
||||
(setq last-loop-pos (lexer-pos lexer))
|
||||
;; first we look for nested comments like (?#foo)
|
||||
(when (and next-char
|
||||
(char= next-char #\()
|
||||
(looking-at-p lexer #\?))
|
||||
(incf (lexer-pos lexer))
|
||||
(cond ((looking-at-p lexer #\#)
|
||||
;; must be a nested comment - so we have to search for
|
||||
;; the closing parenthesis
|
||||
(let ((error-pos (- (lexer-pos lexer) 2)))
|
||||
(unless
|
||||
;; loop 'til ')' or end of regex string and
|
||||
;; return NIL if ')' wasn't encountered
|
||||
(loop for skip-char = next-char
|
||||
then (next-char-non-extended lexer)
|
||||
while (and skip-char
|
||||
(char/= skip-char #\)))
|
||||
finally (return skip-char))
|
||||
(signal-ppcre-syntax-error*
|
||||
error-pos
|
||||
"Comment group not closed")))
|
||||
(setq next-char (next-char-non-extended lexer)))
|
||||
(t
|
||||
;; undo effect of previous INCF if we didn't see a #
|
||||
(decf (lexer-pos lexer)))))
|
||||
(when *extended-mode-p*
|
||||
;; now - if we're in extended mode - we skip whitespace and
|
||||
;; comments; repeat the following loop while we look at
|
||||
;; whitespace or #\#
|
||||
(loop while (and next-char
|
||||
(or (char= next-char #\#)
|
||||
(whitespacep next-char)))
|
||||
do (setq next-char
|
||||
(if (char= next-char #\#)
|
||||
;; if we saw a comment marker skip until
|
||||
;; we're behind #\Newline...
|
||||
(loop for skip-char = next-char
|
||||
then (next-char-non-extended lexer)
|
||||
while (and skip-char
|
||||
(char/= skip-char #\Newline))
|
||||
finally (return (next-char-non-extended lexer)))
|
||||
;; ...otherwise (whitespace) skip until we
|
||||
;; see the next non-whitespace character
|
||||
(loop for skip-char = next-char
|
||||
then (next-char-non-extended lexer)
|
||||
while (and skip-char
|
||||
(whitespacep skip-char))
|
||||
finally (return skip-char))))))
|
||||
;; if the position has moved we have to repeat our tests
|
||||
;; because of cases like /^a (?#xxx) (?#yyy) {3}c/x which
|
||||
;; would be equivalent to /^a{3}c/ in Perl
|
||||
(unless (> (lexer-pos lexer) last-loop-pos)
|
||||
(return next-char)))))
|
||||
|
||||
(declaim (inline fail))
|
||||
(defun fail (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Moves (LEXER-POS LEXER) back to the last position stored in
|
||||
\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
|
||||
(unless (lexer-last-pos lexer)
|
||||
(signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer))
|
||||
(setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
|
||||
nil)
|
||||
|
||||
(defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Read and consume the number the lexer is currently looking at and
|
||||
return it. Returns NIL if no number could be identified.
|
||||
RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
|
||||
at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
|
||||
we don't tolerate whitespace in front of the number."
|
||||
(when (or (end-of-string-p lexer)
|
||||
(and no-whitespace-p
|
||||
(whitespacep (schar (lexer-str lexer) (lexer-pos lexer)))))
|
||||
(return-from get-number nil))
|
||||
(multiple-value-bind (integer new-pos)
|
||||
(parse-integer (lexer-str lexer)
|
||||
:start (lexer-pos lexer)
|
||||
:end (if max-length
|
||||
(let ((end-pos (+ (lexer-pos lexer)
|
||||
(the fixnum max-length)))
|
||||
(lexer-len (lexer-len lexer)))
|
||||
(if (< end-pos lexer-len)
|
||||
end-pos
|
||||
lexer-len))
|
||||
(lexer-len lexer))
|
||||
:radix radix
|
||||
:junk-allowed t)
|
||||
(cond ((and integer (>= (the fixnum integer) 0))
|
||||
(setf (lexer-pos lexer) new-pos)
|
||||
integer)
|
||||
(t nil))))
|
||||
|
||||
(declaim (inline try-number))
|
||||
(defun try-number (lexer &key (radix 10) max-length no-whitespace-p)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Like GET-NUMBER but won't consume anything if no number is seen."
|
||||
;; remember current position
|
||||
(push (lexer-pos lexer) (lexer-last-pos lexer))
|
||||
(let ((number (get-number lexer
|
||||
:radix radix
|
||||
:max-length max-length
|
||||
:no-whitespace-p no-whitespace-p)))
|
||||
(or number (fail lexer))))
|
||||
|
||||
(declaim (inline make-char-from-code))
|
||||
(defun make-char-from-code (number error-pos)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Create character from char-code NUMBER. NUMBER can be NIL
|
||||
which is interpreted as 0. ERROR-POS is the position where
|
||||
the corresponding number started within the regex string."
|
||||
;; Only look at rightmost eight bits in compliance with Perl
|
||||
(let ((code (logand #o377 (the fixnum (or number 0)))))
|
||||
(or (and (< code char-code-limit)
|
||||
(code-char code))
|
||||
(signal-ppcre-syntax-error*
|
||||
error-pos
|
||||
"No character for hex-code ~X"
|
||||
number))))
|
||||
|
||||
(defun unescape-char (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Convert the characters(s) following a backslash into a token
|
||||
which is returned. This function is to be called when the backslash
|
||||
has already been consumed. Special character classes like \\W are
|
||||
handled elsewhere."
|
||||
(when (end-of-string-p lexer)
|
||||
(signal-ppcre-syntax-error "String ends with backslash"))
|
||||
(let ((chr (next-char-non-extended lexer)))
|
||||
(case chr
|
||||
((#\E)
|
||||
;; if \Q quoting is on this is ignored, otherwise it's just an
|
||||
;; #\E
|
||||
(if *allow-quoting*
|
||||
:void
|
||||
#\E))
|
||||
((#\c)
|
||||
;; \cx means control-x in Perl
|
||||
(let ((next-char (next-char-non-extended lexer)))
|
||||
(unless next-char
|
||||
(signal-ppcre-syntax-error*
|
||||
(lexer-pos lexer)
|
||||
"Character missing after '\\c' at position ~A"))
|
||||
(code-char (logxor #x40 (char-code (char-upcase next-char))))))
|
||||
((#\x)
|
||||
;; \x should be followed by a hexadecimal char code,
|
||||
;; two digits or less
|
||||
(let* ((error-pos (lexer-pos lexer))
|
||||
(number (get-number lexer :radix 16 :max-length 2 :no-whitespace-p t)))
|
||||
;; note that it is OK if \x is followed by zero digits
|
||||
(make-char-from-code number error-pos)))
|
||||
((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
|
||||
;; \x should be followed by an octal char code,
|
||||
;; three digits or less
|
||||
(let* ((error-pos (decf (lexer-pos lexer)))
|
||||
(number (get-number lexer :radix 8 :max-length 3)))
|
||||
(make-char-from-code number error-pos)))
|
||||
;; the following five character names are 'semi-standard'
|
||||
;; according to the CLHS but I'm not aware of any implementation
|
||||
;; that doesn't implement them
|
||||
((#\t)
|
||||
#\Tab)
|
||||
((#\n)
|
||||
#\Newline)
|
||||
((#\r)
|
||||
#\Return)
|
||||
((#\f)
|
||||
#\Page)
|
||||
((#\b)
|
||||
#\Backspace)
|
||||
((#\a)
|
||||
(code-char 7)) ; ASCII bell
|
||||
((#\e)
|
||||
(code-char 27)) ; ASCII escape
|
||||
(otherwise
|
||||
;; all other characters aren't affected by a backslash
|
||||
chr))))
|
||||
|
||||
(defun collect-char-class (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Reads and consumes characters from regex string until a right
|
||||
bracket is seen. Assembles them into a list \(which is returned) of
|
||||
characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
|
||||
tokens representing special character classes."
|
||||
(let ((start-pos (lexer-pos lexer)) ; remember start for error message
|
||||
hyphen-seen
|
||||
last-char
|
||||
list)
|
||||
(flet ((handle-char (c)
|
||||
"Do the right thing with character C depending on whether
|
||||
we're inside a range or not."
|
||||
(cond ((and hyphen-seen last-char)
|
||||
(setf (car list) (list :range last-char c)
|
||||
last-char nil))
|
||||
(t
|
||||
(push c list)
|
||||
(setq last-char c)))
|
||||
(setq hyphen-seen nil)))
|
||||
(loop for first = t then nil
|
||||
for c = (next-char-non-extended lexer)
|
||||
;; leave loop if at end of string
|
||||
while c
|
||||
do (cond
|
||||
((char= c #\\)
|
||||
;; we've seen a backslash
|
||||
(let ((next-char (next-char-non-extended lexer)))
|
||||
(case next-char
|
||||
((#\d #\D #\w #\W #\s #\S)
|
||||
;; a special character class
|
||||
(push (map-char-to-special-char-class next-char) list)
|
||||
;; if the last character was a hyphen
|
||||
;; just collect it literally
|
||||
(when hyphen-seen
|
||||
(push #\- list))
|
||||
;; if the next character is a hyphen do the same
|
||||
(when (looking-at-p lexer #\-)
|
||||
(push #\- list)
|
||||
(incf (lexer-pos lexer)))
|
||||
(setq hyphen-seen nil))
|
||||
((#\E)
|
||||
;; if \Q quoting is on we ignore \E,
|
||||
;; otherwise it's just a plain #\E
|
||||
(unless *allow-quoting*
|
||||
(handle-char #\E)))
|
||||
(otherwise
|
||||
;; otherwise unescape the following character(s)
|
||||
(decf (lexer-pos lexer))
|
||||
(handle-char (unescape-char lexer))))))
|
||||
(first
|
||||
;; the first character must not be a right bracket
|
||||
;; and isn't treated specially if it's a hyphen
|
||||
(handle-char c))
|
||||
((char= c #\])
|
||||
;; end of character class
|
||||
;; make sure we collect a pending hyphen
|
||||
(when hyphen-seen
|
||||
(setq hyphen-seen nil)
|
||||
(handle-char #\-))
|
||||
;; reverse the list to preserve the order intended
|
||||
;; by the author of the regex string
|
||||
(return-from collect-char-class (nreverse list)))
|
||||
((and (char= c #\-)
|
||||
last-char
|
||||
(not hyphen-seen))
|
||||
;; if the last character was 'just a character'
|
||||
;; we expect to be in the middle of a range
|
||||
(setq hyphen-seen t))
|
||||
((char= c #\-)
|
||||
;; otherwise this is just an ordinary hyphen
|
||||
(handle-char #\-))
|
||||
(t
|
||||
;; default case - just collect the character
|
||||
(handle-char c))))
|
||||
;; we can only exit the loop normally if we've reached the end
|
||||
;; of the regex string without seeing a right bracket
|
||||
(signal-ppcre-syntax-error*
|
||||
start-pos
|
||||
"Missing right bracket to close character class"))))
|
||||
|
||||
(defun maybe-parse-flags (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Reads a sequence of modifiers \(including #\\- to reverse their
|
||||
meaning) and returns a corresponding list of \"flag\" tokens. The
|
||||
\"x\" modifier is treated specially in that it dynamically modifies
|
||||
the behaviour of the lexer itself via the special variable
|
||||
*EXTENDED-MODE-P*."
|
||||
(prog1
|
||||
(loop with set = t
|
||||
for chr = (next-char-non-extended lexer)
|
||||
unless chr
|
||||
do (signal-ppcre-syntax-error "Unexpected end of string")
|
||||
while (find chr "-imsx" :test #'char=)
|
||||
;; the first #\- will invert the meaning of all modifiers
|
||||
;; following it
|
||||
if (char= chr #\-)
|
||||
do (setq set nil)
|
||||
else if (char= chr #\x)
|
||||
do (setq *extended-mode-p* set)
|
||||
else collect (if set
|
||||
(case chr
|
||||
((#\i)
|
||||
:case-insensitive-p)
|
||||
((#\m)
|
||||
:multi-line-mode-p)
|
||||
((#\s)
|
||||
:single-line-mode-p))
|
||||
(case chr
|
||||
((#\i)
|
||||
:case-sensitive-p)
|
||||
((#\m)
|
||||
:not-multi-line-mode-p)
|
||||
((#\s)
|
||||
:not-single-line-mode-p))))
|
||||
(decf (lexer-pos lexer))))
|
||||
|
||||
(defun get-quantifier (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns a list of two values (min max) if what the lexer is looking
|
||||
at can be interpreted as a quantifier. Otherwise returns NIL and
|
||||
resets the lexer to its old position."
|
||||
;; remember starting position for FAIL and UNGET-TOKEN functions
|
||||
(push (lexer-pos lexer) (lexer-last-pos lexer))
|
||||
(let ((next-char (next-char lexer)))
|
||||
(case next-char
|
||||
((#\*)
|
||||
;; * (Kleene star): match 0 or more times
|
||||
'(0 nil))
|
||||
((#\+)
|
||||
;; +: match 1 or more times
|
||||
'(1 nil))
|
||||
((#\?)
|
||||
;; ?: match 0 or 1 times
|
||||
'(0 1))
|
||||
((#\{)
|
||||
;; one of
|
||||
;; {n}: match exactly n times
|
||||
;; {n,}: match at least n times
|
||||
;; {n,m}: match at least n but not more than m times
|
||||
;; note that anything not matching one of these patterns will
|
||||
;; be interpreted literally - even whitespace isn't allowed
|
||||
(let ((num1 (get-number lexer :no-whitespace-p t)))
|
||||
(if num1
|
||||
(let ((next-char (next-char-non-extended lexer)))
|
||||
(case next-char
|
||||
((#\,)
|
||||
(let* ((num2 (get-number lexer :no-whitespace-p t))
|
||||
(next-char (next-char-non-extended lexer)))
|
||||
(case next-char
|
||||
((#\})
|
||||
;; this is the case {n,} (NUM2 is NIL) or {n,m}
|
||||
(list num1 num2))
|
||||
(otherwise
|
||||
(fail lexer)))))
|
||||
((#\})
|
||||
;; this is the case {n}
|
||||
(list num1 num1))
|
||||
(otherwise
|
||||
(fail lexer))))
|
||||
;; no number following left curly brace, so we treat it
|
||||
;; like a normal character
|
||||
(fail lexer))))
|
||||
;; cannot be a quantifier
|
||||
(otherwise
|
||||
(fail lexer)))))
|
||||
|
||||
(defun get-token (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns and consumes the next token from the regex string (or NIL)."
|
||||
;; remember starting position for UNGET-TOKEN function
|
||||
(push (lexer-pos lexer)
|
||||
(lexer-last-pos lexer))
|
||||
(let ((next-char (next-char lexer)))
|
||||
(cond (next-char
|
||||
(case next-char
|
||||
;; the easy cases first - the following six characters
|
||||
;; always have a special meaning and get translated
|
||||
;; into tokens immediately
|
||||
((#\))
|
||||
:close-paren)
|
||||
((#\|)
|
||||
:vertical-bar)
|
||||
((#\?)
|
||||
:question-mark)
|
||||
((#\.)
|
||||
:everything)
|
||||
((#\^)
|
||||
:start-anchor)
|
||||
((#\$)
|
||||
:end-anchor)
|
||||
((#\+ #\*)
|
||||
;; quantifiers will always be consumend by
|
||||
;; GET-QUANTIFIER, they must not appear here
|
||||
(signal-ppcre-syntax-error*
|
||||
(1- (lexer-pos lexer))
|
||||
"Quantifier '~A' not allowed"
|
||||
next-char))
|
||||
((#\{)
|
||||
;; left brace isn't a special character in it's own
|
||||
;; right but we must check if what follows might
|
||||
;; look like a quantifier
|
||||
(let ((this-pos (lexer-pos lexer))
|
||||
(this-last-pos (lexer-last-pos lexer)))
|
||||
(unget-token lexer)
|
||||
(when (get-quantifier lexer)
|
||||
(signal-ppcre-syntax-error*
|
||||
(car this-last-pos)
|
||||
"Quantifier '~A' not allowed"
|
||||
(subseq (lexer-str lexer)
|
||||
(car this-last-pos)
|
||||
(lexer-pos lexer))))
|
||||
(setf (lexer-pos lexer) this-pos
|
||||
(lexer-last-pos lexer) this-last-pos)
|
||||
next-char))
|
||||
((#\[)
|
||||
;; left bracket always starts a character class
|
||||
(cons (cond ((looking-at-p lexer #\^)
|
||||
(incf (lexer-pos lexer))
|
||||
:inverted-char-class)
|
||||
(t
|
||||
:char-class))
|
||||
(collect-char-class lexer)))
|
||||
((#\\)
|
||||
;; backslash might mean different things so we have
|
||||
;; to peek one char ahead:
|
||||
(let ((next-char (next-char-non-extended lexer)))
|
||||
(case next-char
|
||||
((#\A)
|
||||
:modeless-start-anchor)
|
||||
((#\Z)
|
||||
:modeless-end-anchor)
|
||||
((#\z)
|
||||
:modeless-end-anchor-no-newline)
|
||||
((#\b)
|
||||
:word-boundary)
|
||||
((#\B)
|
||||
:non-word-boundary)
|
||||
((#\d #\D #\w #\W #\s #\S)
|
||||
;; these will be treated like character classes
|
||||
(map-char-to-special-char-class next-char))
|
||||
((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
|
||||
;; uh, a digit...
|
||||
(let* ((old-pos (decf (lexer-pos lexer)))
|
||||
;; ...so let's get the whole number first
|
||||
(backref-number (get-number lexer)))
|
||||
(declare (type fixnum backref-number))
|
||||
(cond ((and (> backref-number (lexer-reg lexer))
|
||||
(<= 10 backref-number))
|
||||
;; \10 and higher are treated as octal
|
||||
;; character codes if we haven't
|
||||
;; opened that much register groups
|
||||
;; yet
|
||||
(setf (lexer-pos lexer) old-pos)
|
||||
;; re-read the number from the old
|
||||
;; position and convert it to its
|
||||
;; corresponding character
|
||||
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
|
||||
old-pos))
|
||||
(t
|
||||
;; otherwise this must refer to a
|
||||
;; backreference
|
||||
(list :back-reference backref-number)))))
|
||||
((#\0)
|
||||
;; this always means an octal character code
|
||||
;; (at most three digits)
|
||||
(let ((old-pos (decf (lexer-pos lexer))))
|
||||
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
|
||||
old-pos)))
|
||||
(otherwise
|
||||
;; in all other cases just unescape the
|
||||
;; character
|
||||
(decf (lexer-pos lexer))
|
||||
(unescape-char lexer)))))
|
||||
((#\()
|
||||
;; an open parenthesis might mean different things
|
||||
;; depending on what follows...
|
||||
(cond ((looking-at-p lexer #\?)
|
||||
;; this is the case '(?' (and probably more behind)
|
||||
(incf (lexer-pos lexer))
|
||||
;; we have to check for modifiers first
|
||||
;; because a colon might follow
|
||||
(let* ((flags (maybe-parse-flags lexer))
|
||||
(next-char (next-char-non-extended lexer)))
|
||||
;; modifiers are only allowed if a colon
|
||||
;; or a closing parenthesis are following
|
||||
(when (and flags
|
||||
(not (find next-char ":)" :test #'char=)))
|
||||
(signal-ppcre-syntax-error*
|
||||
(car (lexer-last-pos lexer))
|
||||
"Sequence '~A' not recognized"
|
||||
(subseq (lexer-str lexer)
|
||||
(car (lexer-last-pos lexer))
|
||||
(lexer-pos lexer))))
|
||||
(case next-char
|
||||
((nil)
|
||||
;; syntax error
|
||||
(signal-ppcre-syntax-error
|
||||
"End of string following '(?'"))
|
||||
((#\))
|
||||
;; an empty group except for the flags
|
||||
;; (if there are any)
|
||||
(or (and flags
|
||||
(cons :flags flags))
|
||||
:void))
|
||||
((#\()
|
||||
;; branch
|
||||
:open-paren-paren)
|
||||
((#\>)
|
||||
;; standalone
|
||||
:open-paren-greater)
|
||||
((#\=)
|
||||
;; positive look-ahead
|
||||
:open-paren-equal)
|
||||
((#\!)
|
||||
;; negative look-ahead
|
||||
:open-paren-exclamation)
|
||||
((#\:)
|
||||
;; non-capturing group - return flags as
|
||||
;; second value
|
||||
(values :open-paren-colon flags))
|
||||
((#\<)
|
||||
;; might be a look-behind assertion, so
|
||||
;; check next character
|
||||
(let ((next-char (next-char-non-extended lexer)))
|
||||
(case next-char
|
||||
((#\=)
|
||||
;; positive look-behind
|
||||
:open-paren-less-equal)
|
||||
((#\!)
|
||||
;; negative look-behind
|
||||
:open-paren-less-exclamation)
|
||||
((#\))
|
||||
;; Perl allows "(?<)" and treats
|
||||
;; it like a null string
|
||||
:void)
|
||||
((nil)
|
||||
;; syntax error
|
||||
(signal-ppcre-syntax-error
|
||||
"End of string following '(?<'"))
|
||||
(t
|
||||
;; also syntax error
|
||||
(signal-ppcre-syntax-error*
|
||||
(1- (lexer-pos lexer))
|
||||
"Character '~A' may not follow '(?<'"
|
||||
next-char )))))
|
||||
(otherwise
|
||||
(signal-ppcre-syntax-error*
|
||||
(1- (lexer-pos lexer))
|
||||
"Character '~A' may not follow '(?'"
|
||||
next-char)))))
|
||||
(t
|
||||
;; if next-char was not #\? (this is within
|
||||
;; the first COND), we've just seen an opening
|
||||
;; parenthesis and leave it like that
|
||||
:open-paren)))
|
||||
(otherwise
|
||||
;; all other characters are their own tokens
|
||||
next-char)))
|
||||
;; we didn't get a character (this if the "else" branch from
|
||||
;; the first IF), so we don't return a token but NIL
|
||||
(t
|
||||
(pop (lexer-last-pos lexer))
|
||||
nil))))
|
||||
|
||||
(declaim (inline unget-token))
|
||||
(defun unget-token (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Moves the lexer back to the last position stored in the LAST-POS stack."
|
||||
(if (lexer-last-pos lexer)
|
||||
(setf (lexer-pos lexer)
|
||||
(pop (lexer-last-pos lexer)))
|
||||
(error "No token to unget \(this should not happen)")))
|
||||
|
||||
(declaim (inline start-of-subexpr-p))
|
||||
(defun start-of-subexpr-p (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Tests whether the next token can start a valid sub-expression, i.e.
|
||||
a stand-alone regex."
|
||||
(let* ((pos (lexer-pos lexer))
|
||||
(next-char (next-char lexer)))
|
||||
(not (or (null next-char)
|
||||
(prog1
|
||||
(member (the character next-char)
|
||||
'(#\) #\|)
|
||||
:test #'char=)
|
||||
(setf (lexer-pos lexer) pos))))))
|
||||
62
load.lisp
Executable file
62
load.lisp
Executable file
@ -0,0 +1,62 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/load.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-user)
|
||||
|
||||
(defparameter *cl-ppcre-base-directory*
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
|
||||
(loop for file in '("packages"
|
||||
"specials"
|
||||
"util"
|
||||
"errors"
|
||||
"lexer"
|
||||
"parser"
|
||||
"regex-class"
|
||||
"convert"
|
||||
"optimize"
|
||||
"closures"
|
||||
"repetition-closures"
|
||||
"scanner"
|
||||
"api"
|
||||
"ppcre-tests")
|
||||
do (let ((pathname (make-pathname :name file :type "lisp" :version nil
|
||||
:defaults *cl-ppcre-base-directory*)))
|
||||
#-:cormanlisp
|
||||
(let ((compiled-pathname (compile-file-pathname pathname)))
|
||||
(unless (probe-file compiled-pathname)
|
||||
(compile-file pathname))
|
||||
(setq pathname compiled-pathname))
|
||||
(load pathname)))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
597
optimize.lisp
Normal file
597
optimize.lisp
Normal file
@ -0,0 +1,597 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/optimize.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; This file contains optimizations which can be applied to converted
|
||||
;;; parse trees.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(defun string-list-to-simple-string (string-list)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Concatenates a list of strings to one simple-string."
|
||||
;; this function provided by JP Massar; note that we can't use APPLY
|
||||
;; with CONCATENATE here because of CALL-ARGUMENTS-LIMIT
|
||||
(let ((total-size 0))
|
||||
(declare (type fixnum total-size))
|
||||
(dolist (string string-list)
|
||||
(declare (type string string))
|
||||
(incf total-size (length string)))
|
||||
(let ((result-string (make-sequence 'simple-string total-size))
|
||||
(curr-pos 0))
|
||||
(declare (type fixnum curr-pos))
|
||||
(dolist (string string-list)
|
||||
(declare (type string string))
|
||||
(replace result-string string :start1 curr-pos)
|
||||
(incf curr-pos (length string)))
|
||||
result-string)))
|
||||
|
||||
(defgeneric flatten (regex)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Merges adjacent sequences and alternations, i.e. it
|
||||
transforms #<SEQ #<STR \"a\"> #<SEQ #<STR \"b\"> #<STR \"c\">>> to
|
||||
#<SEQ #<STR \"a\"> #<STR \"b\"> #<STR \"c\">>. This is a destructive
|
||||
operation on REGEX."))
|
||||
|
||||
(defmethod flatten ((seq seq))
|
||||
;; this looks more complicated than it is because we modify SEQ in
|
||||
;; place to avoid unnecessary consing
|
||||
(let ((elements-rest (elements seq)))
|
||||
(loop
|
||||
(unless elements-rest
|
||||
(return))
|
||||
(let ((flattened-element (flatten (car elements-rest)))
|
||||
(next-elements-rest (cdr elements-rest)))
|
||||
(cond ((typep flattened-element 'seq)
|
||||
;; FLATTENED-ELEMENT is a SEQ object, so we "splice"
|
||||
;; it into out list of elements
|
||||
(let ((flattened-element-elements
|
||||
(elements flattened-element)))
|
||||
(setf (car elements-rest)
|
||||
(car flattened-element-elements)
|
||||
(cdr elements-rest)
|
||||
(nconc (cdr flattened-element-elements)
|
||||
(cdr elements-rest)))))
|
||||
(t
|
||||
;; otherwise we just replace the current element with
|
||||
;; its flattened counterpart
|
||||
(setf (car elements-rest) flattened-element)))
|
||||
(setq elements-rest next-elements-rest))))
|
||||
(let ((elements (elements seq)))
|
||||
(cond ((cadr elements)
|
||||
seq)
|
||||
((cdr elements)
|
||||
(first elements))
|
||||
(t (make-instance 'void)))))
|
||||
|
||||
(defmethod flatten ((alternation alternation))
|
||||
;; same algorithm as above
|
||||
(let ((choices-rest (choices alternation)))
|
||||
(loop
|
||||
(unless choices-rest
|
||||
(return))
|
||||
(let ((flattened-choice (flatten (car choices-rest)))
|
||||
(next-choices-rest (cdr choices-rest)))
|
||||
(cond ((typep flattened-choice 'alternation)
|
||||
(let ((flattened-choice-choices
|
||||
(choices flattened-choice)))
|
||||
(setf (car choices-rest)
|
||||
(car flattened-choice-choices)
|
||||
(cdr choices-rest)
|
||||
(nconc (cdr flattened-choice-choices)
|
||||
(cdr choices-rest)))))
|
||||
(t
|
||||
(setf (car choices-rest) flattened-choice)))
|
||||
(setq choices-rest next-choices-rest))))
|
||||
(let ((choices (choices alternation)))
|
||||
(cond ((cadr choices)
|
||||
alternation)
|
||||
((cdr choices)
|
||||
(first choices))
|
||||
(t (signal-ppcre-syntax-error
|
||||
"Encountered alternation without choices.")))))
|
||||
|
||||
(defmethod flatten ((branch branch))
|
||||
(with-slots ((test test)
|
||||
(then-regex then-regex)
|
||||
(else-regex else-regex))
|
||||
branch
|
||||
(setq test
|
||||
(if (numberp test)
|
||||
test
|
||||
(flatten test))
|
||||
then-regex (flatten then-regex)
|
||||
else-regex (flatten else-regex))
|
||||
branch))
|
||||
|
||||
(defmethod flatten ((regex regex))
|
||||
(typecase regex
|
||||
((or repetition register lookahead lookbehind standalone)
|
||||
;; if REGEX contains exactly one inner REGEX object flatten it
|
||||
(setf (regex regex)
|
||||
(flatten (regex regex)))
|
||||
regex)
|
||||
(t
|
||||
;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING,
|
||||
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, and WORD-BOUNDARY) do
|
||||
;; nothing
|
||||
regex)))
|
||||
|
||||
(defgeneric gather-strings (regex)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Collects adjacent strings or characters into one
|
||||
string provided they have the same case mode. This is a destructive
|
||||
operation on REGEX."))
|
||||
|
||||
(defmethod gather-strings ((seq seq))
|
||||
;; note that GATHER-STRINGS is to be applied after FLATTEN, i.e. it
|
||||
;; expects SEQ to be flattened already; in particular, SEQ cannot be
|
||||
;; empty and cannot contain embedded SEQ objects
|
||||
(let* ((start-point (cons nil (elements seq)))
|
||||
(curr-point start-point)
|
||||
old-case-mode
|
||||
collector
|
||||
collector-start
|
||||
(collector-length 0)
|
||||
skip)
|
||||
(declare (type fixnum collector-length))
|
||||
(loop
|
||||
(let ((elements-rest (cdr curr-point)))
|
||||
(unless elements-rest
|
||||
(return))
|
||||
(let* ((element (car elements-rest))
|
||||
(case-mode (case-mode element old-case-mode)))
|
||||
(cond ((and case-mode
|
||||
(eq case-mode old-case-mode))
|
||||
;; if ELEMENT is a STR and we have collected a STR of
|
||||
;; the same case mode in the last iteration we
|
||||
;; concatenate ELEMENT onto COLLECTOR and remember the
|
||||
;; value of its SKIP slot
|
||||
(let ((old-collector-length collector-length))
|
||||
(unless (and (adjustable-array-p collector)
|
||||
(array-has-fill-pointer-p collector))
|
||||
(setq collector
|
||||
(make-array collector-length
|
||||
:initial-contents collector
|
||||
:element-type 'character
|
||||
:fill-pointer t
|
||||
:adjustable t)
|
||||
collector-start nil))
|
||||
(adjust-array collector
|
||||
(incf collector-length (len element))
|
||||
:fill-pointer t)
|
||||
(setf (subseq collector
|
||||
old-collector-length)
|
||||
(str element)
|
||||
;; it suffices to remember the last SKIP slot
|
||||
;; because due to the way MAYBE-ACCUMULATE
|
||||
;; works adjacent STR objects have the same
|
||||
;; SKIP value
|
||||
skip (skip element)))
|
||||
(setf (cdr curr-point) (cdr elements-rest)))
|
||||
(t
|
||||
(let ((collected-string
|
||||
(cond (collector-start
|
||||
collector-start)
|
||||
(collector
|
||||
;; if we have collected something already
|
||||
;; we convert it into a STR
|
||||
(make-instance 'str
|
||||
:skip skip
|
||||
:str collector
|
||||
:case-insensitive-p
|
||||
(eq old-case-mode
|
||||
:case-insensitive)))
|
||||
(t nil))))
|
||||
(cond (case-mode
|
||||
;; if ELEMENT is a string with a different case
|
||||
;; mode than the last one we have either just
|
||||
;; converted COLLECTOR into a STR or COLLECTOR
|
||||
;; is still empty; in both cases we can now
|
||||
;; begin to fill it anew
|
||||
(setq collector (str element)
|
||||
collector-start element
|
||||
;; and we remember the SKIP value as above
|
||||
skip (skip element)
|
||||
collector-length (len element))
|
||||
(cond (collected-string
|
||||
(setf (car elements-rest)
|
||||
collected-string
|
||||
curr-point
|
||||
(cdr curr-point)))
|
||||
(t
|
||||
(setf (cdr curr-point)
|
||||
(cdr elements-rest)))))
|
||||
(t
|
||||
;; otherwise this is not a STR so we apply
|
||||
;; GATHER-STRINGS to it and collect it directly
|
||||
;; into RESULT
|
||||
(cond (collected-string
|
||||
(setf (car elements-rest)
|
||||
collected-string
|
||||
curr-point
|
||||
(cdr curr-point)
|
||||
(cdr curr-point)
|
||||
(cons (gather-strings element)
|
||||
(cdr curr-point))
|
||||
curr-point
|
||||
(cdr curr-point)))
|
||||
(t
|
||||
(setf (car elements-rest)
|
||||
(gather-strings element)
|
||||
curr-point
|
||||
(cdr curr-point))))
|
||||
;; we also have to empty COLLECTOR here in case
|
||||
;; it was still filled from the last iteration
|
||||
(setq collector nil
|
||||
collector-start nil))))))
|
||||
(setq old-case-mode case-mode))))
|
||||
(when collector
|
||||
(setf (cdr curr-point)
|
||||
(cons
|
||||
(make-instance 'str
|
||||
:skip skip
|
||||
:str collector
|
||||
:case-insensitive-p
|
||||
(eq old-case-mode
|
||||
:case-insensitive))
|
||||
nil)))
|
||||
(setf (elements seq) (cdr start-point))
|
||||
seq))
|
||||
|
||||
(defmethod gather-strings ((alternation alternation))
|
||||
;; loop ON the choices of ALTERNATION so we can modify them directly
|
||||
(loop for choices-rest on (choices alternation)
|
||||
while choices-rest
|
||||
do (setf (car choices-rest)
|
||||
(gather-strings (car choices-rest))))
|
||||
alternation)
|
||||
|
||||
(defmethod gather-strings ((branch branch))
|
||||
(with-slots ((test test)
|
||||
(then-regex then-regex)
|
||||
(else-regex else-regex))
|
||||
branch
|
||||
(setq test
|
||||
(if (numberp test)
|
||||
test
|
||||
(gather-strings test))
|
||||
then-regex (gather-strings then-regex)
|
||||
else-regex (gather-strings else-regex))
|
||||
branch))
|
||||
|
||||
(defmethod gather-strings ((regex regex))
|
||||
(typecase regex
|
||||
((or repetition register lookahead lookbehind standalone)
|
||||
;; if REGEX contains exactly one inner REGEX object apply
|
||||
;; GATHER-STRINGS to it
|
||||
(setf (regex regex)
|
||||
(gather-strings (regex regex)))
|
||||
regex)
|
||||
(t
|
||||
;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING,
|
||||
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, and WORD-BOUNDARY) do
|
||||
;; nothing
|
||||
regex)))
|
||||
|
||||
;; Note that START-ANCHORED-P will be called after FLATTEN and GATHER-STRINGS.
|
||||
|
||||
(defgeneric start-anchored-p (regex &optional in-seq-p)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Returns T if REGEX starts with a \"real\" start
|
||||
anchor, i.e. one that's not in multi-line mode, NIL otherwise. If
|
||||
IN-SEQ-P is true the function will return :ZERO-LENGTH if REGEX is a
|
||||
zero-length assertion."))
|
||||
|
||||
(defmethod start-anchored-p ((seq seq) &optional in-seq-p)
|
||||
(declare (ignore in-seq-p))
|
||||
;; note that START-ANCHORED-P is to be applied after FLATTEN and
|
||||
;; GATHER-STRINGS, i.e. SEQ cannot be empty and cannot contain
|
||||
;; embedded SEQ objects
|
||||
(loop for element in (elements seq)
|
||||
for anchored-p = (start-anchored-p element t)
|
||||
;; skip zero-length elements because they won't affect the
|
||||
;; "anchoredness" of the sequence
|
||||
while (eq anchored-p :zero-length)
|
||||
finally (return (and anchored-p (not (eq anchored-p :zero-length))))))
|
||||
|
||||
(defmethod start-anchored-p ((alternation alternation) &optional in-seq-p)
|
||||
(declare (ignore in-seq-p))
|
||||
;; clearly an alternation can only be start-anchored if all of its
|
||||
;; choices are start-anchored
|
||||
(loop for choice in (choices alternation)
|
||||
always (start-anchored-p choice)))
|
||||
|
||||
(defmethod start-anchored-p ((branch branch) &optional in-seq-p)
|
||||
(declare (ignore in-seq-p))
|
||||
(and (start-anchored-p (then-regex branch))
|
||||
(start-anchored-p (else-regex branch))))
|
||||
|
||||
(defmethod start-anchored-p ((repetition repetition) &optional in-seq-p)
|
||||
(declare (ignore in-seq-p))
|
||||
;; well, this wouldn't make much sense, but anyway...
|
||||
(and (plusp (minimum repetition))
|
||||
(start-anchored-p (regex repetition))))
|
||||
|
||||
(defmethod start-anchored-p ((register register) &optional in-seq-p)
|
||||
(declare (ignore in-seq-p))
|
||||
(start-anchored-p (regex register)))
|
||||
|
||||
(defmethod start-anchored-p ((standalone standalone) &optional in-seq-p)
|
||||
(declare (ignore in-seq-p))
|
||||
(start-anchored-p (regex standalone)))
|
||||
|
||||
(defmethod start-anchored-p ((anchor anchor) &optional in-seq-p)
|
||||
(declare (ignore in-seq-p))
|
||||
(and (startp anchor)
|
||||
(not (multi-line-p anchor))))
|
||||
|
||||
(defmethod start-anchored-p ((regex regex) &optional in-seq-p)
|
||||
(typecase regex
|
||||
((or lookahead lookbehind word-boundary void)
|
||||
;; zero-length assertions
|
||||
(if in-seq-p
|
||||
:zero-length
|
||||
nil))
|
||||
(t
|
||||
;; BACK-REFERENCE, CHAR-CLASS, EVERYTHING, and STR
|
||||
nil)))
|
||||
|
||||
;; Note that END-STRING-AUX will be called after FLATTEN and GATHER-STRINGS.
|
||||
|
||||
(defgeneric end-string-aux (regex &optional old-case-insensitive-p)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Returns the constant string (if it exists) REGEX
|
||||
ends with wrapped into a STR object, otherwise NIL.
|
||||
OLD-CASE-INSENSITIVE-P is the CASE-INSENSITIVE-P slot of the last STR
|
||||
collected or :VOID if no STR has been collected yet. (This is a helper
|
||||
function called by END-STRIN.)"))
|
||||
|
||||
(defmethod end-string-aux ((str str)
|
||||
&optional (old-case-insensitive-p :void))
|
||||
(declare (special last-str))
|
||||
(cond ((and (not (skip str)) ; avoid constituents of STARTS-WITH
|
||||
;; only use STR if nothing has been collected yet or if
|
||||
;; the collected string has the same value for
|
||||
;; CASE-INSENSITIVE-P
|
||||
(or (eq old-case-insensitive-p :void)
|
||||
(eq (case-insensitive-p str) old-case-insensitive-p)))
|
||||
(setf last-str str
|
||||
;; set the SKIP property of this STR
|
||||
(skip str) t)
|
||||
str)
|
||||
(t nil)))
|
||||
|
||||
(defmethod end-string-aux ((seq seq)
|
||||
&optional (old-case-insensitive-p :void))
|
||||
(declare (special continuep))
|
||||
(let (case-insensitive-p
|
||||
concatenated-string
|
||||
concatenated-start
|
||||
(concatenated-length 0))
|
||||
(declare (type fixnum concatenated-length))
|
||||
(loop for element in (reverse (elements seq))
|
||||
;; remember the case-(in)sensitivity of the last relevant
|
||||
;; STR object
|
||||
for loop-old-case-insensitive-p = old-case-insensitive-p
|
||||
then (if skip
|
||||
loop-old-case-insensitive-p
|
||||
(case-insensitive-p element-end))
|
||||
;; the end-string of the current element
|
||||
for element-end = (end-string-aux element
|
||||
loop-old-case-insensitive-p)
|
||||
;; whether we encountered a zero-length element
|
||||
for skip = (if element-end
|
||||
(zerop (len element-end))
|
||||
nil)
|
||||
;; set CONTINUEP to NIL if we have to stop collecting to
|
||||
;; alert END-STRING-AUX methods on enclosing SEQ objects
|
||||
unless element-end
|
||||
do (setq continuep nil)
|
||||
;; end loop if we neither got a STR nor a zero-length
|
||||
;; element
|
||||
while element-end
|
||||
;; only collect if not zero-length
|
||||
unless skip
|
||||
do (cond (concatenated-string
|
||||
(when concatenated-start
|
||||
(setf concatenated-string
|
||||
(make-array concatenated-length
|
||||
:initial-contents (reverse (str concatenated-start))
|
||||
:element-type 'character
|
||||
:fill-pointer t
|
||||
:adjustable t)
|
||||
concatenated-start nil))
|
||||
(let ((len (len element-end))
|
||||
(str (str element-end)))
|
||||
(declare (type fixnum len))
|
||||
(incf concatenated-length len)
|
||||
(loop for i of-type fixnum downfrom (1- len) to 0
|
||||
do (vector-push-extend (char str i)
|
||||
concatenated-string))))
|
||||
(t
|
||||
(setf concatenated-string
|
||||
t
|
||||
concatenated-start
|
||||
element-end
|
||||
concatenated-length
|
||||
(len element-end)
|
||||
case-insensitive-p
|
||||
(case-insensitive-p element-end))))
|
||||
;; stop collecting if END-STRING-AUX on inner SEQ has said so
|
||||
while continuep)
|
||||
(cond ((zerop concatenated-length)
|
||||
;; don't bother to return zero-length strings
|
||||
nil)
|
||||
(concatenated-start
|
||||
concatenated-start)
|
||||
(t
|
||||
(make-instance 'str
|
||||
:str (nreverse concatenated-string)
|
||||
:case-insensitive-p case-insensitive-p)))))
|
||||
|
||||
(defmethod end-string-aux ((register register)
|
||||
&optional (old-case-insensitive-p :void))
|
||||
(end-string-aux (regex register) old-case-insensitive-p))
|
||||
|
||||
(defmethod end-string-aux ((standalone standalone)
|
||||
&optional (old-case-insensitive-p :void))
|
||||
(end-string-aux (regex standalone) old-case-insensitive-p))
|
||||
|
||||
(defmethod end-string-aux ((regex regex)
|
||||
&optional (old-case-insensitive-p :void))
|
||||
(declare (special last-str end-anchored-p continuep))
|
||||
(typecase regex
|
||||
((or anchor lookahead lookbehind word-boundary void)
|
||||
;; a zero-length REGEX object - for the sake of END-STRING-AUX
|
||||
;; this is a zero-length string
|
||||
(when (and (typep regex 'anchor)
|
||||
(not (startp regex))
|
||||
(or (no-newline-p regex)
|
||||
(not (multi-line-p regex)))
|
||||
(eq old-case-insensitive-p :void))
|
||||
;; if this is a "real" end-anchor and we haven't collected
|
||||
;; anything so far we can set END-ANCHORED-P (where 1 or 0
|
||||
;; indicate whether we accept a #\Newline at the end or not)
|
||||
(setq end-anchored-p (if (no-newline-p regex) 0 1)))
|
||||
(make-instance 'str
|
||||
:str ""
|
||||
:case-insensitive-p :void))
|
||||
(t
|
||||
;; (ALTERNATION, BACK-REFERENCE, BRANCH, CHAR-CLASS, EVERYTHING,
|
||||
;; REPETITION)
|
||||
nil)))
|
||||
|
||||
(defmethod end-string ((regex regex))
|
||||
(declare (special end-string-offset))
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns the constant string (if it exists) REGEX ends with wrapped
|
||||
into a STR object, otherwise NIL."
|
||||
;; LAST-STR points to the last STR object (seen from the end) that's
|
||||
;; part of END-STRING; CONTINUEP is set to T if we stop collecting
|
||||
;; in the middle of a SEQ
|
||||
(let ((continuep t)
|
||||
last-str)
|
||||
(declare (special continuep last-str))
|
||||
(prog1
|
||||
(end-string-aux regex)
|
||||
(when last-str
|
||||
;; if we've found something set the START-OF-END-STRING-P of
|
||||
;; the leftmost STR collected accordingly and remember the
|
||||
;; OFFSET of this STR (in a special variable provided by the
|
||||
;; caller of this function)
|
||||
(setf (start-of-end-string-p last-str) t
|
||||
end-string-offset (offset last-str))))))
|
||||
|
||||
(defgeneric compute-min-rest (regex current-min-rest)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Returns the minimal length of REGEX plus
|
||||
CURRENT-MIN-REST. This is similar to REGEX-MIN-LENGTH except that it
|
||||
recurses down into REGEX and sets the MIN-REST slots of REPETITION
|
||||
objects."))
|
||||
|
||||
(defmethod compute-min-rest ((seq seq) current-min-rest)
|
||||
(loop for element in (reverse (elements seq))
|
||||
for last-min-rest = current-min-rest then this-min-rest
|
||||
for this-min-rest = (compute-min-rest element last-min-rest)
|
||||
finally (return this-min-rest)))
|
||||
|
||||
(defmethod compute-min-rest ((alternation alternation) current-min-rest)
|
||||
(loop for choice in (choices alternation)
|
||||
minimize (compute-min-rest choice current-min-rest)))
|
||||
|
||||
(defmethod compute-min-rest ((branch branch) current-min-rest)
|
||||
(min (compute-min-rest (then-regex branch) current-min-rest)
|
||||
(compute-min-rest (else-regex branch) current-min-rest)))
|
||||
|
||||
(defmethod compute-min-rest ((str str) current-min-rest)
|
||||
(+ current-min-rest (len str)))
|
||||
|
||||
(defmethod compute-min-rest ((repetition repetition) current-min-rest)
|
||||
(setf (min-rest repetition) current-min-rest)
|
||||
(compute-min-rest (regex repetition) current-min-rest)
|
||||
(+ current-min-rest (* (minimum repetition) (min-len repetition))))
|
||||
|
||||
(defmethod compute-min-rest ((register register) current-min-rest)
|
||||
(compute-min-rest (regex register) current-min-rest))
|
||||
|
||||
(defmethod compute-min-rest ((standalone standalone) current-min-rest)
|
||||
(declare (ignore current-min-rest))
|
||||
(compute-min-rest (regex standalone) 0))
|
||||
|
||||
(defmethod compute-min-rest ((lookahead lookahead) current-min-rest)
|
||||
(compute-min-rest (regex lookahead) 0)
|
||||
current-min-rest)
|
||||
|
||||
(defmethod compute-min-rest ((lookbehind lookbehind) current-min-rest)
|
||||
(compute-min-rest (regex lookbehind) (+ current-min-rest (len lookbehind)))
|
||||
current-min-rest)
|
||||
|
||||
(defmethod compute-min-rest ((regex regex) current-min-rest)
|
||||
(typecase regex
|
||||
((or char-class everything)
|
||||
(1+ current-min-rest))
|
||||
(t
|
||||
;; zero min-len and no embedded regexes (ANCHOR,
|
||||
;; BACK-REFERENCE, VOID, and WORD-BOUNDARY)
|
||||
current-min-rest)))
|
||||
88
packages.lisp
Normal file
88
packages.lisp
Normal file
@ -0,0 +1,88 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/packages.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-user)
|
||||
|
||||
#-:cormanlisp
|
||||
(defpackage #:cl-ppcre
|
||||
(:nicknames #:ppcre)
|
||||
(:use #:cl)
|
||||
(:export #:create-scanner
|
||||
#:scan
|
||||
#:scan-to-strings
|
||||
#:do-scans
|
||||
#:do-matches
|
||||
#:do-matches-as-strings
|
||||
#:all-matches
|
||||
#:all-matches-as-strings
|
||||
#:split
|
||||
#:regex-replace
|
||||
#:regex-replace-all
|
||||
#:regex-apropos
|
||||
#:regex-apropos-list
|
||||
#:quote-meta-chars
|
||||
#:*regex-char-code-limit*
|
||||
#:*use-bmh-matchers*
|
||||
#:*allow-quoting*
|
||||
#:ppcre-error
|
||||
#:ppcre-invocation-error
|
||||
#:ppcre-syntax-error
|
||||
#:ppcre-syntax-error-string
|
||||
#:ppcre-syntax-error-pos
|
||||
#:register-groups-bind
|
||||
#:do-register-groups))
|
||||
|
||||
#+:cormanlisp
|
||||
(defpackage "CL-PPCRE"
|
||||
(:nicknames "PPCRE")
|
||||
(:use "CL")
|
||||
(:export "CREATE-SCANNER"
|
||||
"SCAN"
|
||||
"SCAN-TO-STRINGS"
|
||||
"DO-SCANS"
|
||||
"DO-MATCHES"
|
||||
"DO-MATCHES-AS-STRINGS"
|
||||
"ALL-MATCHES"
|
||||
"ALL-MATCHES-AS-STRINGS"
|
||||
"SPLIT"
|
||||
"REGEX-REPLACE"
|
||||
"REGEX-REPLACE-ALL"
|
||||
"REGEX-APROPOS"
|
||||
"REGEX-APROPOS-LIST"
|
||||
"QUOTE-META-CHARS"
|
||||
"*REGEX-CHAR-CODE-LIMIT*"
|
||||
"*USE-BMH-MATCHERS*"
|
||||
"*ALLOW-QUOTING*"
|
||||
"PPCRE-ERROR"
|
||||
"PPCRE-INVOCATION-ERROR"
|
||||
"PPCRE-SYNTAX-ERROR"
|
||||
"PPCRE-SYNTAX-ERROR-STRING"
|
||||
"PPCRE-SYNTAX-ERROR-POS"
|
||||
"REGISTER-GROUPS-BIND"
|
||||
"DO-REGISTER-GROUPS"))
|
||||
347
parser.lisp
Normal file
347
parser.lisp
Normal file
@ -0,0 +1,347 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/parser.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; The parser will - with the help of the lexer - parse a regex
|
||||
;;; string and convert it into a "parse tree" (see docs for details
|
||||
;;; about the syntax of these trees). Note that the lexer might return
|
||||
;;; illegal parse trees. It is assumed that the conversion process
|
||||
;;; later on will track them down.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(defun group (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Parses and consumes a <group>.
|
||||
The productions are: <group> -> \"(\"<regex>\")\"
|
||||
\"(?:\"<regex>\")\"
|
||||
\"(?<\"<regex>\")\"
|
||||
\"(?<flags>:\"<regex>\")\"
|
||||
\"(?=\"<regex>\")\"
|
||||
\"(?!\"<regex>\")\"
|
||||
\"(?<=\"<regex>\")\"
|
||||
\"(?<!\"<regex>\")\"
|
||||
\"(?(\"<num>\")\"<regex>\")\"
|
||||
\"(?(\"<regex>\")\"<regex>\")\"
|
||||
<legal-token>
|
||||
where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
|
||||
Will return <parse-tree> or (<grouping-type> <parse-tree>) where
|
||||
<grouping-type> is one of six keywords - see source for details."
|
||||
(multiple-value-bind (open-token flags)
|
||||
(get-token lexer)
|
||||
(cond ((eq open-token :open-paren-paren)
|
||||
;; special case for conditional regular expressions; note
|
||||
;; that at this point we accept a couple of illegal
|
||||
;; combinations which'll be sorted out later by the
|
||||
;; converter
|
||||
(let* ((open-paren-pos (car (lexer-last-pos lexer)))
|
||||
;; check if what follows "(?(" is a number
|
||||
(number (try-number lexer :no-whitespace-p t))
|
||||
;; make changes to extended-mode-p local
|
||||
(*extended-mode-p* *extended-mode-p*))
|
||||
(declare (type fixnum open-paren-pos))
|
||||
(cond (number
|
||||
;; condition is a number (i.e. refers to a
|
||||
;; back-reference)
|
||||
(let* ((inner-close-token (get-token lexer))
|
||||
(reg-expr (reg-expr lexer))
|
||||
(close-token (get-token lexer)))
|
||||
(unless (eq inner-close-token :close-paren)
|
||||
(signal-ppcre-syntax-error*
|
||||
(+ open-paren-pos 2)
|
||||
"Opening paren has no matching closing paren"))
|
||||
(unless (eq close-token :close-paren)
|
||||
(signal-ppcre-syntax-error*
|
||||
open-paren-pos
|
||||
"Opening paren has no matching closing paren"))
|
||||
(list :branch number reg-expr)))
|
||||
(t
|
||||
;; condition must be a full regex (actually a
|
||||
;; look-behind or look-ahead); and here comes a
|
||||
;; terrible kludge: instead of being cleanly
|
||||
;; separated from the lexer, the parser pushes
|
||||
;; back the lexer by one position, thereby
|
||||
;; landing in the middle of the 'token' "(?(" -
|
||||
;; yuck!!
|
||||
(decf (lexer-pos lexer))
|
||||
(let* ((inner-reg-expr (group lexer))
|
||||
(reg-expr (reg-expr lexer))
|
||||
(close-token (get-token lexer)))
|
||||
(unless (eq close-token :close-paren)
|
||||
(signal-ppcre-syntax-error*
|
||||
open-paren-pos
|
||||
"Opening paren has no matching closing paren"))
|
||||
(list :branch inner-reg-expr reg-expr))))))
|
||||
((member open-token '(:open-paren
|
||||
:open-paren-colon
|
||||
:open-paren-greater
|
||||
:open-paren-equal
|
||||
:open-paren-exclamation
|
||||
:open-paren-less-equal
|
||||
:open-paren-less-exclamation)
|
||||
:test #'eq)
|
||||
;; make changes to extended-mode-p local
|
||||
(let ((*extended-mode-p* *extended-mode-p*))
|
||||
;; we saw one of the six token representing opening
|
||||
;; parentheses
|
||||
(let* ((open-paren-pos (car (lexer-last-pos lexer)))
|
||||
(reg-expr (reg-expr lexer))
|
||||
(close-token (get-token lexer)))
|
||||
(when (eq open-token :open-paren)
|
||||
;; if this is the "("<regex>")" production we have to
|
||||
;; increment the register counter of the lexer
|
||||
(incf (lexer-reg lexer)))
|
||||
(unless (eq close-token :close-paren)
|
||||
;; the token following <regex> must be the closing
|
||||
;; parenthesis or this is a syntax error
|
||||
(signal-ppcre-syntax-error*
|
||||
open-paren-pos
|
||||
"Opening paren has no matching closing paren"))
|
||||
(if flags
|
||||
;; if the lexer has returned a list of flags this must
|
||||
;; have been the "(?:"<regex>")" production
|
||||
(cons :group (nconc flags (list reg-expr)))
|
||||
(list (case open-token
|
||||
((:open-paren)
|
||||
:register)
|
||||
((:open-paren-colon)
|
||||
:group)
|
||||
((:open-paren-greater)
|
||||
:standalone)
|
||||
((:open-paren-equal)
|
||||
:positive-lookahead)
|
||||
((:open-paren-exclamation)
|
||||
:negative-lookahead)
|
||||
((:open-paren-less-equal)
|
||||
:positive-lookbehind)
|
||||
((:open-paren-less-exclamation)
|
||||
:negative-lookbehind))
|
||||
reg-expr)))))
|
||||
(t
|
||||
;; this is the <legal-token> production; <legal-token> is
|
||||
;; any token which passes START-OF-SUBEXPR-P (otherwise
|
||||
;; parsing had already stopped in the SEQ method)
|
||||
open-token))))
|
||||
|
||||
(defun greedy-quant (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Parses and consumes a <greedy-quant>.
|
||||
The productions are: <greedy-quant> -> <group> | <group><quantifier>
|
||||
where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
|
||||
Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
|
||||
(let* ((group (group lexer))
|
||||
(token (get-quantifier lexer)))
|
||||
(if token
|
||||
;; if GET-QUANTIFIER returned a non-NIL value it's the
|
||||
;; two-element list (<min> <max>)
|
||||
(list :greedy-repetition (first token) (second token) group)
|
||||
group)))
|
||||
|
||||
(defun quant (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Parses and consumes a <quant>.
|
||||
The productions are: <quant> -> <greedy-quant> | <greedy-quant>\"?\".
|
||||
Will return the <parse-tree> returned by GREEDY-QUANT and optionally
|
||||
change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
|
||||
(let* ((greedy-quant (greedy-quant lexer))
|
||||
(pos (lexer-pos lexer))
|
||||
(next-char (next-char lexer)))
|
||||
(when next-char
|
||||
(if (char= next-char #\?)
|
||||
(setf (car greedy-quant) :non-greedy-repetition)
|
||||
(setf (lexer-pos lexer) pos)))
|
||||
greedy-quant))
|
||||
|
||||
(defun seq (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Parses and consumes a <seq>.
|
||||
The productions are: <seq> -> <quant> | <quant><seq>.
|
||||
Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
|
||||
(flet ((make-array-from-two-chars (char1 char2)
|
||||
(let ((string (make-array 2
|
||||
:element-type 'character
|
||||
:fill-pointer t
|
||||
:adjustable t)))
|
||||
(setf (aref string 0) char1)
|
||||
(setf (aref string 1) char2)
|
||||
string)))
|
||||
;; Note that we're calling START-OF-SUBEXPR-P before we actually try
|
||||
;; to parse a <seq> or <quant> in order to catch empty regular
|
||||
;; expressions
|
||||
(if (start-of-subexpr-p lexer)
|
||||
(let ((quant (quant lexer)))
|
||||
(if (start-of-subexpr-p lexer)
|
||||
(let* ((seq (seq lexer))
|
||||
(quant-is-char-p (characterp quant))
|
||||
(seq-is-sequence-p (and (consp seq)
|
||||
(eq (first seq) :sequence))))
|
||||
(cond ((and quant-is-char-p
|
||||
(characterp seq))
|
||||
(make-array-from-two-chars seq quant))
|
||||
((and quant-is-char-p
|
||||
(stringp seq))
|
||||
(vector-push-extend quant seq)
|
||||
seq)
|
||||
((and quant-is-char-p
|
||||
seq-is-sequence-p
|
||||
(characterp (second seq)))
|
||||
(cond ((cddr seq)
|
||||
(setf (cdr seq)
|
||||
(cons
|
||||
(make-array-from-two-chars (second seq)
|
||||
quant)
|
||||
(cddr seq)))
|
||||
seq)
|
||||
(t (make-array-from-two-chars (second seq) quant))))
|
||||
((and quant-is-char-p
|
||||
seq-is-sequence-p
|
||||
(stringp (second seq)))
|
||||
(cond ((cddr seq)
|
||||
(setf (cdr seq)
|
||||
(cons
|
||||
(progn
|
||||
(vector-push-extend quant (second seq))
|
||||
(second seq))
|
||||
(cddr seq)))
|
||||
seq)
|
||||
(t
|
||||
(vector-push-extend quant (second seq))
|
||||
(second seq))))
|
||||
(seq-is-sequence-p
|
||||
;; if <seq> is also a :SEQUENCE parse tree we merge
|
||||
;; both lists into one to avoid unnecessary consing
|
||||
(setf (cdr seq)
|
||||
(cons quant (cdr seq)))
|
||||
seq)
|
||||
(t (list :sequence quant seq))))
|
||||
quant))
|
||||
:void)))
|
||||
|
||||
(defun reg-expr (lexer)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Parses and consumes a <regex>, a complete regular expression.
|
||||
The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
|
||||
Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
|
||||
(let ((pos (lexer-pos lexer)))
|
||||
(case (next-char lexer)
|
||||
((nil)
|
||||
;; if we didn't get any token we return :VOID which stands for
|
||||
;; "empty regular expression"
|
||||
:void)
|
||||
((#\|)
|
||||
;; now check whether the expression started with a vertical
|
||||
;; bar, i.e. <seq> - the left alternation - is empty
|
||||
(list :alternation :void (reg-expr lexer)))
|
||||
(otherwise
|
||||
;; otherwise un-read the character we just saw and parse a
|
||||
;; <seq> plus the character following it
|
||||
(setf (lexer-pos lexer) pos)
|
||||
(let* ((seq (seq lexer))
|
||||
(pos (lexer-pos lexer)))
|
||||
(case (next-char lexer)
|
||||
((nil)
|
||||
;; no further character, just a <seq>
|
||||
seq)
|
||||
((#\|)
|
||||
;; if the character was a vertical bar, this is an
|
||||
;; alternation and we have the second production
|
||||
(let ((reg-expr (reg-expr lexer)))
|
||||
(cond ((and (consp reg-expr)
|
||||
(eq (first reg-expr) :alternation))
|
||||
;; again we try to merge as above in SEQ
|
||||
(setf (cdr reg-expr)
|
||||
(cons seq (cdr reg-expr)))
|
||||
reg-expr)
|
||||
(t (list :alternation seq reg-expr)))))
|
||||
(otherwise
|
||||
;; a character which is not a vertical bar - this is
|
||||
;; either a syntax error or we're inside of a group and
|
||||
;; the next character is a closing parenthesis; so we
|
||||
;; just un-read the character and let another function
|
||||
;; take care of it
|
||||
(setf (lexer-pos lexer) pos)
|
||||
seq)))))))
|
||||
|
||||
(defun reverse-strings (parse-tree)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(cond ((stringp parse-tree)
|
||||
(nreverse parse-tree))
|
||||
((consp parse-tree)
|
||||
(loop for parse-tree-rest on parse-tree
|
||||
while parse-tree-rest
|
||||
do (setf (car parse-tree-rest)
|
||||
(reverse-strings (car parse-tree-rest))))
|
||||
parse-tree)
|
||||
(t parse-tree)))
|
||||
|
||||
(defun parse-string (string)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Translate the regex string STRING into a parse tree."
|
||||
(let* ((lexer (make-lexer string))
|
||||
(parse-tree (reverse-strings (reg-expr lexer))))
|
||||
;; check whether we've consumed the whole regex string
|
||||
(if (end-of-string-p lexer)
|
||||
parse-tree
|
||||
(signal-ppcre-syntax-error*
|
||||
(lexer-pos lexer)
|
||||
"Expected end of string"))))
|
||||
174
perltest.pl
Executable file
174
perltest.pl
Executable file
@ -0,0 +1,174 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
# This is a heavily modified version of the file 'perltest' which
|
||||
# comes with the PCRE library package, which is open source software,
|
||||
# written by Philip Hazel, and copyright by the University of
|
||||
# Cambridge, England.
|
||||
|
||||
# The PCRE library package is available from
|
||||
# <ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/>
|
||||
|
||||
use Time::HiRes qw(time);
|
||||
|
||||
sub string_for_lisp {
|
||||
my(@a, $t, $in_string, $switch);
|
||||
|
||||
my $string = shift;
|
||||
$string =~ s/\\/\\\\/g;
|
||||
$string =~ s/"/\\"/g;
|
||||
|
||||
return "\"$string\""
|
||||
if $string =~ /^[\n\x20-\x7f]*$/;
|
||||
|
||||
$in_string = 1;
|
||||
foreach $c (split(//, $string)) {
|
||||
if (ord $c >= 32 and ord $c < 127) {
|
||||
if ($in_string) {
|
||||
$t .= $c;
|
||||
} else {
|
||||
$in_string = 1;
|
||||
$t = $c;
|
||||
}
|
||||
} else {
|
||||
if ($in_string) {
|
||||
push @a, "\"$t\"";
|
||||
$in_string = 0;
|
||||
$switch = 1;
|
||||
}
|
||||
push @a, ord $c;
|
||||
}
|
||||
}
|
||||
if ($switch) {
|
||||
if ($in_string) {
|
||||
push @a, "\"$t\"";
|
||||
}
|
||||
'(' . (join ' ', @a) . ')';
|
||||
} else {
|
||||
"\"$t\"";
|
||||
}
|
||||
}
|
||||
|
||||
$min_time = shift;
|
||||
|
||||
NEXT_RE: while (1) {
|
||||
last
|
||||
if !($_ = <>);
|
||||
next
|
||||
if $_ eq "";
|
||||
|
||||
$pattern = $_;
|
||||
|
||||
while ($pattern !~ /^\s*(.).*\1/s) {
|
||||
last
|
||||
if !($_ = <>);
|
||||
$pattern .= $_;
|
||||
}
|
||||
|
||||
chomp($pattern);
|
||||
$pattern =~ s/\s+$//;
|
||||
$pattern =~ s/\+(?=[a-z]*$)//;
|
||||
|
||||
$multi_line_mode = ($pattern =~ /m[a-z]*$/) ? 't' : 'nil';
|
||||
$single_line_mode = ($pattern =~ /s[a-z]*$/) ? 't' : 'nil';
|
||||
$extended_mode = ($pattern =~ /x[a-z]*$/) ? 't' : 'nil';
|
||||
$case_insensitive_mode = ($pattern =~ /i[a-z]*$/) ? 't' : 'nil';
|
||||
$pattern =~ s/^(.*)g([a-z]*)$/\1\2/;
|
||||
|
||||
$pattern_for_lisp = $pattern;
|
||||
$pattern_for_lisp =~ s/[a-z]*$//;
|
||||
$pattern_for_lisp =~ s/^\s*(.)(.*)\1/$2/s;
|
||||
$pattern_for_lisp =~ s/\\/\\\\/g;
|
||||
$pattern_for_lisp =~ s/"/\\"/g;
|
||||
|
||||
$pattern = "/(?#)/$2"
|
||||
if ($pattern =~ /^(.)\1(.*)$/);
|
||||
|
||||
while (1) {
|
||||
last NEXT_RE
|
||||
if !($_ = <>);
|
||||
|
||||
chomp;
|
||||
|
||||
s/\s+$//;
|
||||
s/^\s+//;
|
||||
|
||||
last
|
||||
if ($_ eq "");
|
||||
|
||||
$info_string = string_for_lisp "\"$_\" =~ $pattern";
|
||||
$x = eval "\"$_\"";
|
||||
|
||||
@subs = ();
|
||||
|
||||
eval <<"END";
|
||||
if (\$x =~ ${pattern}) {
|
||||
push \@subs,\$&;
|
||||
push \@subs,\$1;
|
||||
push \@subs,\$2;
|
||||
push \@subs,\$3;
|
||||
push \@subs,\$4;
|
||||
push \@subs,\$5;
|
||||
push \@subs,\$6;
|
||||
push \@subs,\$7;
|
||||
push \@subs,\$8;
|
||||
push \@subs,\$9;
|
||||
push \@subs,\$10;
|
||||
push \@subs,\$11;
|
||||
push \@subs,\$12;
|
||||
push \@subs,\$13;
|
||||
push \@subs,\$14;
|
||||
push \@subs,\$15;
|
||||
push \@subs,\$16;
|
||||
}
|
||||
|
||||
\$test = sub {
|
||||
my \$times = shift;
|
||||
|
||||
my \$start = time;
|
||||
for (my \$i = 0; \$i < \$times; \$i++) {
|
||||
\$x =~ ${pattern};
|
||||
}
|
||||
return time - \$start;
|
||||
};
|
||||
END
|
||||
|
||||
$times = 1;
|
||||
$used = 0;
|
||||
$counter++;
|
||||
print STDERR "$counter\n";
|
||||
|
||||
if ($@) {
|
||||
$error = 't';
|
||||
} else {
|
||||
$error = 'nil';
|
||||
if ($min_time) {
|
||||
$times = 10;
|
||||
while (1) {
|
||||
$used = &$test($times);
|
||||
last
|
||||
if $used > $min_time;
|
||||
$times *= 10;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
print "($counter $info_string \"$pattern_for_lisp\" $case_insensitive_mode $multi_line_mode $single_line_mode $extended_mode " . string_for_lisp($x) . " $error $times $used ";
|
||||
if (!@subs) {
|
||||
print 'nil nil';
|
||||
} else {
|
||||
print string_for_lisp($subs[0]) . ' (';
|
||||
undef $not_first;
|
||||
for ($i = 1; $i <= 16; $i++) {
|
||||
print ' '
|
||||
unless $i == 1;
|
||||
if (defined $subs[$i]) {
|
||||
print string_for_lisp $subs[$i];
|
||||
} else {
|
||||
print 'nil';
|
||||
}
|
||||
}
|
||||
print ')';
|
||||
}
|
||||
print ")\n";
|
||||
}
|
||||
}
|
||||
296
ppcre-tests.lisp
Normal file
296
ppcre-tests.lisp
Normal file
@ -0,0 +1,296 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/ppcre-tests.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-user)
|
||||
|
||||
#-:cormanlisp
|
||||
(defpackage #:cl-ppcre-test
|
||||
(:use #:cl #:cl-ppcre)
|
||||
(:export #:test))
|
||||
|
||||
#+:cormanlisp
|
||||
(defpackage "CL-PPCRE-TEST"
|
||||
(:use "CL" "CL-PPCRE")
|
||||
(:export "TEST"))
|
||||
|
||||
(in-package #:cl-ppcre-test)
|
||||
|
||||
(defparameter *cl-ppcre-test-base-directory*
|
||||
(make-pathname :name nil :type nil :version nil
|
||||
:defaults (parse-namestring *load-truename*)))
|
||||
|
||||
(defun full-gc ()
|
||||
"Start a full garbage collection."
|
||||
;; what are the corresponding values for MCL and OpenMCL?
|
||||
#+:allegro (excl:gc t)
|
||||
#+(or :cmu :scl) (ext:gc :full t)
|
||||
#+:ecl (si:gc t)
|
||||
#+:clisp (ext:gc)
|
||||
#+:cormanlisp (loop for i from 0 to 3 do (cormanlisp:gc i))
|
||||
#+:lispworks (hcl:mark-and-sweep 3)
|
||||
#+:sbcl (sb-ext:gc :full t))
|
||||
|
||||
;; warning: ugly code ahead!!
|
||||
;; this is just a quick hack for testing purposes
|
||||
|
||||
(defun time-regex (factor regex string
|
||||
&key case-insensitive-mode
|
||||
multi-line-mode
|
||||
single-line-mode
|
||||
extended-mode)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Auxiliary function used by TEST to benchmark a regex scanner
|
||||
against Perl timings."
|
||||
(declare (type string string))
|
||||
(let* ((scanner (create-scanner regex
|
||||
:case-insensitive-mode case-insensitive-mode
|
||||
:multi-line-mode multi-line-mode
|
||||
:single-line-mode single-line-mode
|
||||
:extended-mode extended-mode))
|
||||
;; make sure GC doesn't invalidate our benchmarking
|
||||
(dummy (full-gc))
|
||||
(start (get-internal-real-time)))
|
||||
(declare (ignore dummy))
|
||||
(dotimes (i factor)
|
||||
(funcall scanner string 0 (length string)))
|
||||
(float (/ (- (get-internal-real-time) start) internal-time-units-per-second))))
|
||||
|
||||
#+(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
(defun threaded-scan (scanner target-string &key (threads 10) (repetitions 5000))
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Auxiliary function used by TEST to check whether SCANNER is thread-safe."
|
||||
(full-gc)
|
||||
(let ((collector (make-array threads))
|
||||
(counter 0))
|
||||
(loop for i below threads
|
||||
do (let* ((j i)
|
||||
(fn
|
||||
(lambda ()
|
||||
(let ((r (random repetitions)))
|
||||
(loop for k below repetitions
|
||||
if (= k r)
|
||||
do (setf (aref collector j)
|
||||
(let ((result
|
||||
(multiple-value-list
|
||||
(cl-ppcre:scan scanner target-string))))
|
||||
(unless (cdr result)
|
||||
(setq result '(nil nil #() #())))
|
||||
result))
|
||||
else
|
||||
do (cl-ppcre:scan scanner target-string))
|
||||
(incf counter)))))
|
||||
#+scl (thread:thread-create fn)
|
||||
#+lispworks (mp:process-run-function "" nil fn)
|
||||
#+(and sbcl sb-thread) (sb-thread:make-thread fn)))
|
||||
(loop while (< counter threads)
|
||||
do (sleep .1))
|
||||
(destructuring-bind (first-start first-end first-reg-starts first-reg-ends)
|
||||
(aref collector 0)
|
||||
(loop for (start end reg-starts reg-ends) across collector
|
||||
if (or (not (eql first-start start))
|
||||
(not (eql first-end end))
|
||||
(/= (length first-reg-starts) (length reg-starts))
|
||||
(/= (length first-reg-ends) (length reg-ends))
|
||||
(loop for first-reg-start across first-reg-starts
|
||||
for reg-start across reg-starts
|
||||
thereis (not (eql first-reg-start reg-start)))
|
||||
(loop for first-reg-end across first-reg-ends
|
||||
for reg-end across reg-ends
|
||||
thereis (not (eql first-reg-end reg-end))))
|
||||
do (return (format nil "~&Inconsistent results during multi-threading"))))))
|
||||
|
||||
(defun create-string-from-input (input)
|
||||
(cond ((or (null input)
|
||||
(stringp input))
|
||||
input)
|
||||
(t
|
||||
(cl-ppcre::string-list-to-simple-string
|
||||
(loop for element in input
|
||||
if (stringp element)
|
||||
collect element
|
||||
else
|
||||
collect (string (code-char element)))))))
|
||||
|
||||
(defun test (&key (file-name
|
||||
(make-pathname :name "testdata"
|
||||
:type nil :version nil
|
||||
:defaults *cl-ppcre-test-base-directory*)
|
||||
file-name-provided-p)
|
||||
threaded)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (ignorable threaded))
|
||||
"Loop through all test cases in FILE-NAME and print report. Only in
|
||||
LispWorks and SCL: If THREADED is true, also test whether the scanners
|
||||
work multi-threaded."
|
||||
(with-open-file (stream file-name
|
||||
#+(or :allegro :clisp :scl)
|
||||
:external-format
|
||||
#+(or :allegro :clisp :scl)
|
||||
(if file-name-provided-p
|
||||
:default
|
||||
#+:allegro :iso-8859-1
|
||||
#+:clisp charset:iso-8859-1
|
||||
#+:scl :iso-8859-1))
|
||||
(loop with testcount of-type fixnum = 0
|
||||
with *regex-char-code-limit* = (if file-name-provided-p
|
||||
*regex-char-code-limit*
|
||||
;; the standard test suite
|
||||
;; doesn't need full
|
||||
;; Unicode support
|
||||
255)
|
||||
with *allow-quoting* = (if file-name-provided-p
|
||||
*allow-quoting*
|
||||
t)
|
||||
for input-line = (read stream nil nil)
|
||||
for (counter info-string regex
|
||||
case-insensitive-mode multi-line-mode
|
||||
single-line-mode extended-mode
|
||||
string perl-error factor
|
||||
perl-time ex-result ex-subs) = input-line
|
||||
while input-line
|
||||
do (let ((info-string (create-string-from-input info-string))
|
||||
(regex (create-string-from-input regex))
|
||||
(string (create-string-from-input string))
|
||||
(ex-result (create-string-from-input ex-result))
|
||||
(ex-subs (mapcar #'create-string-from-input ex-subs))
|
||||
(errors '()))
|
||||
;; provide some visual feedback for slow CL
|
||||
;; implementations; suggested by JP Massar
|
||||
(incf testcount)
|
||||
#+(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
(when threaded
|
||||
(format t "Test #~A (ID ~A)~%" testcount counter)
|
||||
(force-output))
|
||||
(unless #-(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
nil
|
||||
#+(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
threaded
|
||||
(when (zerop (mod testcount 10))
|
||||
(format t ".")
|
||||
(force-output))
|
||||
(when (zerop (mod testcount 100))
|
||||
(terpri)))
|
||||
(handler-case
|
||||
(let* ((*use-bmh-matchers* (if (and (> factor 1) (plusp perl-time))
|
||||
*use-bmh-matchers*
|
||||
;; if we only check for
|
||||
;; correctness we don't
|
||||
;; care about speed that
|
||||
;; match (but rather
|
||||
;; about space
|
||||
;; constraints of the
|
||||
;; trial versions)
|
||||
nil))
|
||||
(scanner (create-scanner regex
|
||||
:case-insensitive-mode case-insensitive-mode
|
||||
:multi-line-mode multi-line-mode
|
||||
:single-line-mode single-line-mode
|
||||
:extended-mode extended-mode)))
|
||||
(multiple-value-bind (result1 result2 sub-starts sub-ends)
|
||||
(scan scanner string)
|
||||
(cond (perl-error
|
||||
(push (format nil
|
||||
"~&expected an error but got a result")
|
||||
errors))
|
||||
(t
|
||||
(when (not (eq result1 ex-result))
|
||||
(if result1
|
||||
(let ((result (subseq string result1 result2)))
|
||||
(unless (string= result ex-result)
|
||||
(push (format nil
|
||||
"~&expected ~S but got ~S"
|
||||
ex-result result)
|
||||
errors))
|
||||
(setq sub-starts (coerce sub-starts 'list)
|
||||
sub-ends (coerce sub-ends 'list))
|
||||
(loop for i from 0
|
||||
for ex-sub in ex-subs
|
||||
for sub-start = (nth i sub-starts)
|
||||
for sub-end = (nth i sub-ends)
|
||||
for sub = (if (and sub-start sub-end)
|
||||
(subseq string sub-start sub-end)
|
||||
nil)
|
||||
unless (string= ex-sub sub)
|
||||
do (push (format nil
|
||||
"~&\\~A: expected ~S but got ~S"
|
||||
(1+ i) ex-sub sub) errors)))
|
||||
(push (format nil
|
||||
"~&expected ~S but got ~S"
|
||||
ex-result result1)
|
||||
errors)))))
|
||||
#+(or scl
|
||||
lispworks
|
||||
(and sbcl sb-thread))
|
||||
(when threaded
|
||||
(let ((thread-result (threaded-scan scanner string)))
|
||||
(when thread-result
|
||||
(push thread-result errors))))))
|
||||
(condition (msg)
|
||||
(unless perl-error
|
||||
(push (format nil "~&got an unexpected error: '~A'" msg)
|
||||
errors))))
|
||||
(setq errors (nreverse errors))
|
||||
(cond (errors
|
||||
(when (or (<= factor 1) (zerop perl-time))
|
||||
(format t "~&~4@A (~A):~{~& ~A~}~%"
|
||||
counter info-string errors)))
|
||||
((and (> factor 1) (plusp perl-time))
|
||||
(let ((result (time-regex factor regex string
|
||||
:case-insensitive-mode case-insensitive-mode
|
||||
:multi-line-mode multi-line-mode
|
||||
:single-line-mode single-line-mode
|
||||
:extended-mode extended-mode)))
|
||||
(format t "~&~4@A: ~,4F (~A repetitions, Perl: ~,4F seconds, CL-PPCRE: ~,4F seconds)" counter
|
||||
(float (/ result perl-time)) factor perl-time result)
|
||||
#+:cormanlisp (force-output *standard-output*)))
|
||||
(t nil))))
|
||||
(values)))
|
||||
752
regex-class.lisp
Normal file
752
regex-class.lisp
Normal file
@ -0,0 +1,752 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/regex-class.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; This file defines the REGEX class and some utility methods for
|
||||
;;; this class. REGEX objects are used to represent the (transformed)
|
||||
;;; parse trees internally
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(locally
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(defclass regex ()
|
||||
()
|
||||
(:documentation "The REGEX base class. All other classes inherit from this one."))
|
||||
|
||||
|
||||
(defclass seq (regex)
|
||||
((elements :initarg :elements
|
||||
:accessor elements
|
||||
:type cons
|
||||
:documentation "A list of REGEX objects."))
|
||||
(:documentation "SEQ objects represents sequences of
|
||||
regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)"))
|
||||
|
||||
(defclass alternation (regex)
|
||||
((choices :initarg :choices
|
||||
:accessor choices
|
||||
:type cons
|
||||
:documentation "A list of REGEX objects"))
|
||||
(:documentation "ALTERNATION objects represent alternations of
|
||||
regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))
|
||||
|
||||
(defclass lookahead (regex)
|
||||
((regex :initarg :regex
|
||||
:accessor regex
|
||||
:documentation "The REGEX object we're checking.")
|
||||
(positivep :initarg :positivep
|
||||
:reader positivep
|
||||
:documentation "Whether this assertion is positive."))
|
||||
(:documentation "LOOKAHEAD objects represent look-ahead assertions."))
|
||||
|
||||
(defclass lookbehind (regex)
|
||||
((regex :initarg :regex
|
||||
:accessor regex
|
||||
:documentation "The REGEX object we're checking.")
|
||||
(positivep :initarg :positivep
|
||||
:reader positivep
|
||||
:documentation "Whether this assertion is positive.")
|
||||
(len :initarg :len
|
||||
:accessor len
|
||||
:type fixnum
|
||||
:documentation "The (fixed) length of the enclosed regex."))
|
||||
(:documentation "LOOKBEHIND objects represent look-behind assertions."))
|
||||
|
||||
(defclass repetition (regex)
|
||||
((regex :initarg :regex
|
||||
:accessor regex
|
||||
:documentation "The REGEX that's repeated.")
|
||||
(greedyp :initarg :greedyp
|
||||
:reader greedyp
|
||||
:documentation "Whether the repetition is greedy.")
|
||||
(minimum :initarg :minimum
|
||||
:accessor minimum
|
||||
:type fixnum
|
||||
:documentation "The minimal number of repetitions.")
|
||||
(maximum :initarg :maximum
|
||||
:accessor maximum
|
||||
:documentation "The maximal number of repetitions.
|
||||
Can be NIL for unbounded.")
|
||||
(min-len :initarg :min-len
|
||||
:reader min-len
|
||||
:documentation "The minimal length of the enclosed regex.")
|
||||
(len :initarg :len
|
||||
:reader len
|
||||
:documentation "The length of the enclosed regex. NIL if unknown.")
|
||||
(min-rest :initform 0
|
||||
:accessor min-rest
|
||||
:type fixnum
|
||||
:documentation "The minimal number of characters which must
|
||||
appear after this repetition.")
|
||||
(contains-register-p :initarg :contains-register-p
|
||||
:reader contains-register-p
|
||||
:documentation "If the regex contains a register."))
|
||||
(:documentation "REPETITION objects represent repetitions of regexes."))
|
||||
|
||||
(defclass register (regex)
|
||||
((regex :initarg :regex
|
||||
:accessor regex
|
||||
:documentation "The inner regex.")
|
||||
(num :initarg :num
|
||||
:reader num
|
||||
:type fixnum
|
||||
:documentation "The number of this register, starting from 0.
|
||||
This is the index into *REGS-START* and *REGS-END*."))
|
||||
(:documentation "REGISTER objects represent register groups."))
|
||||
|
||||
(defclass standalone (regex)
|
||||
((regex :initarg :regex
|
||||
:accessor regex
|
||||
:documentation "The inner regex."))
|
||||
(:documentation "A standalone regular expression."))
|
||||
|
||||
(defclass back-reference (regex)
|
||||
((num :initarg :num
|
||||
:accessor num
|
||||
:type fixnum
|
||||
:documentation "The number of the register this reference refers to.")
|
||||
(case-insensitive-p :initarg :case-insensitive-p
|
||||
:reader case-insensitive-p
|
||||
:documentation "Whether we check case-insensitively."))
|
||||
(:documentation "BACK-REFERENCE objects represent backreferences."))
|
||||
|
||||
(defclass char-class (regex)
|
||||
((hash :initarg :hash
|
||||
:reader hash
|
||||
:type (or hash-table null)
|
||||
:documentation "A hash table the keys of which are the characters;
|
||||
the values are always T.")
|
||||
(case-insensitive-p :initarg :case-insensitive-p
|
||||
:reader case-insensitive-p
|
||||
:documentation "If the char class case-insensitive.")
|
||||
(invertedp :initarg :invertedp
|
||||
:reader invertedp
|
||||
:documentation "Whether we mean the inverse of the char class.")
|
||||
(word-char-class-p :initarg :word-char-class-p
|
||||
:reader word-char-class-p
|
||||
:documentation "Whether this CHAR CLASS
|
||||
represents the special class WORD-CHAR-CLASS."))
|
||||
(:documentation "CHAR-CLASS objects represent character classes."))
|
||||
|
||||
(defclass str (regex)
|
||||
((str :initarg :str
|
||||
:accessor str
|
||||
:type string
|
||||
:documentation "The actual string.")
|
||||
(len :initform 0
|
||||
:accessor len
|
||||
:type fixnum
|
||||
:documentation "The length of the string.")
|
||||
(case-insensitive-p :initarg :case-insensitive-p
|
||||
:reader case-insensitive-p
|
||||
:documentation "If we match case-insensitively.")
|
||||
(offset :initform nil
|
||||
:accessor offset
|
||||
:documentation "Offset from the left of the whole parse tree.
|
||||
The first regex has offset 0.
|
||||
NIL if unknown, i.e. behind a variable-length regex.")
|
||||
(skip :initform nil
|
||||
:initarg :skip
|
||||
:accessor skip
|
||||
:documentation "If we can avoid testing for this string
|
||||
because the SCAN function has done this already.")
|
||||
(start-of-end-string-p :initform nil
|
||||
:accessor start-of-end-string-p
|
||||
:documentation "If this is the unique STR which
|
||||
starts END-STRING (a slot of MATCHER)."))
|
||||
(:documentation "STR objects represent string."))
|
||||
|
||||
(defclass anchor (regex)
|
||||
((startp :initarg :startp
|
||||
:reader startp
|
||||
:documentation "Whether this is a \"start anchor\".")
|
||||
(multi-line-p :initarg :multi-line-p
|
||||
:reader multi-line-p
|
||||
:documentation "Whether we're in multi-line mode,
|
||||
i.e. whether each #\\Newline is surrounded by anchors.")
|
||||
(no-newline-p :initarg :no-newline-p
|
||||
:reader no-newline-p
|
||||
:documentation "Whether we ignore #\\Newline at the end."))
|
||||
(:documentation "ANCHOR objects represent anchors like \"^\" or \"$\"."))
|
||||
|
||||
(defclass everything (regex)
|
||||
((single-line-p :initarg :single-line-p
|
||||
:reader single-line-p
|
||||
:documentation "Whether we're in single-line mode,
|
||||
i.e. whether we also match #\\Newline."))
|
||||
(:documentation "EVERYTHING objects represent regexes matching
|
||||
\"everything\", i.e. dots."))
|
||||
|
||||
(defclass word-boundary (regex)
|
||||
((negatedp :initarg :negatedp
|
||||
:reader negatedp
|
||||
:documentation "Whether we mean the opposite,
|
||||
i.e. no word-boundary."))
|
||||
(:documentation "WORD-BOUNDARY objects represent word-boundary assertions."))
|
||||
|
||||
(defclass branch (regex)
|
||||
((test :initarg :test
|
||||
:accessor test
|
||||
:documentation "The test of this branch, one of LOOKAHEAD,
|
||||
LOOKBEHIND, or a number.")
|
||||
(then-regex :initarg :then-regex
|
||||
:accessor then-regex
|
||||
:documentation "The regex that's to be matched if the
|
||||
test succeeds.")
|
||||
(else-regex :initarg :else-regex
|
||||
:initform (make-instance 'void)
|
||||
:accessor else-regex
|
||||
:documentation "The regex that's to be matched if the
|
||||
test fails."))
|
||||
(:documentation "BRANCH objects represent Perl's conditional regular
|
||||
expressions."))
|
||||
|
||||
(defclass void (regex)
|
||||
()
|
||||
(:documentation "VOID objects represent empty regular expressions.")))
|
||||
|
||||
(declaim (ftype (function (t) simple-string) str))
|
||||
|
||||
;;; The following four methods allow a VOID object to behave like a
|
||||
;;; zero-length STR object (only readers needed)
|
||||
|
||||
(defmethod initialize-instance :after ((str str) &rest init-args)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (ignore init-args))
|
||||
"Automatically computes the length of a STR after initialization."
|
||||
(let ((str-slot (slot-value str 'str)))
|
||||
(unless (typep str-slot 'simple-string)
|
||||
(setf (slot-value str 'str) (coerce str-slot 'simple-string))))
|
||||
(setf (len str) (length (str str))))
|
||||
|
||||
(defmethod len ((void void))
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
0)
|
||||
|
||||
(defmethod str ((void void))
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"")
|
||||
|
||||
(defmethod skip ((void void))
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
nil)
|
||||
|
||||
(defmethod start-of-end-string-p ((void void))
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
nil)
|
||||
|
||||
(defgeneric case-mode (regex old-case-mode)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
|
||||
Returns a keyword denoting the case-(in)sensitivity of a STR or its
|
||||
second argument if the STR has length 0. Returns NIL for REGEX objects
|
||||
which are not of type STR."))
|
||||
|
||||
(defmethod case-mode ((str str) old-case-mode)
|
||||
(cond ((zerop (len str))
|
||||
old-case-mode)
|
||||
((case-insensitive-p str)
|
||||
:case-insensitive)
|
||||
(t
|
||||
:case-sensitive)))
|
||||
|
||||
(defmethod case-mode ((regex regex) old-case-mode)
|
||||
(declare (ignore old-case-mode))
|
||||
nil)
|
||||
|
||||
(defgeneric copy-regex (regex)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Implements a deep copy of a REGEX object."))
|
||||
|
||||
(defmethod copy-regex ((anchor anchor))
|
||||
(make-instance 'anchor
|
||||
:startp (startp anchor)
|
||||
:multi-line-p (multi-line-p anchor)
|
||||
:no-newline-p (no-newline-p anchor)))
|
||||
|
||||
(defmethod copy-regex ((everything everything))
|
||||
(make-instance 'everything
|
||||
:single-line-p (single-line-p everything)))
|
||||
|
||||
(defmethod copy-regex ((word-boundary word-boundary))
|
||||
(make-instance 'word-boundary
|
||||
:negatedp (negatedp word-boundary)))
|
||||
|
||||
(defmethod copy-regex ((void void))
|
||||
(make-instance 'void))
|
||||
|
||||
(defmethod copy-regex ((lookahead lookahead))
|
||||
(make-instance 'lookahead
|
||||
:regex (copy-regex (regex lookahead))
|
||||
:positivep (positivep lookahead)))
|
||||
|
||||
(defmethod copy-regex ((seq seq))
|
||||
(make-instance 'seq
|
||||
:elements (mapcar #'copy-regex (elements seq))))
|
||||
|
||||
(defmethod copy-regex ((alternation alternation))
|
||||
(make-instance 'alternation
|
||||
:choices (mapcar #'copy-regex (choices alternation))))
|
||||
|
||||
(defmethod copy-regex ((branch branch))
|
||||
(with-slots ((test test))
|
||||
branch
|
||||
(make-instance 'branch
|
||||
:test (if (typep test 'regex)
|
||||
(copy-regex test)
|
||||
test)
|
||||
:then-regex (copy-regex (then-regex branch))
|
||||
:else-regex (copy-regex (else-regex branch)))))
|
||||
|
||||
(defmethod copy-regex ((lookbehind lookbehind))
|
||||
(make-instance 'lookbehind
|
||||
:regex (copy-regex (regex lookbehind))
|
||||
:positivep (positivep lookbehind)
|
||||
:len (len lookbehind)))
|
||||
|
||||
(defmethod copy-regex ((repetition repetition))
|
||||
(make-instance 'repetition
|
||||
:regex (copy-regex (regex repetition))
|
||||
:greedyp (greedyp repetition)
|
||||
:minimum (minimum repetition)
|
||||
:maximum (maximum repetition)
|
||||
:min-len (min-len repetition)
|
||||
:len (len repetition)
|
||||
:contains-register-p (contains-register-p repetition)))
|
||||
|
||||
(defmethod copy-regex ((register register))
|
||||
(make-instance 'register
|
||||
:regex (copy-regex (regex register))
|
||||
:num (num register)))
|
||||
|
||||
(defmethod copy-regex ((standalone standalone))
|
||||
(make-instance 'standalone
|
||||
:regex (copy-regex (regex standalone))))
|
||||
|
||||
(defmethod copy-regex ((back-reference back-reference))
|
||||
(make-instance 'back-reference
|
||||
:num (num back-reference)
|
||||
:case-insensitive-p (case-insensitive-p back-reference)))
|
||||
|
||||
(defmethod copy-regex ((char-class char-class))
|
||||
(make-instance 'char-class
|
||||
:hash (hash char-class)
|
||||
:case-insensitive-p (case-insensitive-p char-class)
|
||||
:invertedp (invertedp char-class)
|
||||
:word-char-class-p (word-char-class-p char-class)))
|
||||
|
||||
(defmethod copy-regex ((str str))
|
||||
(make-instance 'str
|
||||
:str (str str)
|
||||
:case-insensitive-p (case-insensitive-p str)))
|
||||
|
||||
;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
|
||||
;;; wrapped into one function. Maybe in the next release...
|
||||
|
||||
;;; Further note that this function is used by CONVERT to factor out
|
||||
;;; complicated repetitions, i.e. cases like
|
||||
;;; (a)* -> (?:a*(a))?
|
||||
;;; This won't work for, say,
|
||||
;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
|
||||
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
|
||||
|
||||
(defgeneric remove-registers (regex)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
|
||||
optionally removes embedded REGISTER objects if possible and if the
|
||||
special variable REMOVE-REGISTERS-P is true."))
|
||||
|
||||
(defmethod remove-registers ((register register))
|
||||
(declare (special remove-registers-p reg-seen))
|
||||
(cond (remove-registers-p
|
||||
(remove-registers (regex register)))
|
||||
(t
|
||||
;; mark REG-SEEN as true so enclosing REPETITION objects
|
||||
;; (see method below) know if they contain a register or not
|
||||
(setq reg-seen t)
|
||||
(copy-regex register))))
|
||||
|
||||
(defmethod remove-registers ((repetition repetition))
|
||||
(let* (reg-seen
|
||||
(inner-regex (remove-registers (regex repetition))))
|
||||
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
|
||||
;; (REGEX REPETITION) contains a REGISTER
|
||||
(declare (special reg-seen))
|
||||
(make-instance 'repetition
|
||||
:regex inner-regex
|
||||
:greedyp (greedyp repetition)
|
||||
:minimum (minimum repetition)
|
||||
:maximum (maximum repetition)
|
||||
:min-len (min-len repetition)
|
||||
:len (len repetition)
|
||||
:contains-register-p reg-seen)))
|
||||
|
||||
(defmethod remove-registers ((standalone standalone))
|
||||
(make-instance 'standalone
|
||||
:regex (remove-registers (regex standalone))))
|
||||
|
||||
(defmethod remove-registers ((lookahead lookahead))
|
||||
(make-instance 'lookahead
|
||||
:regex (remove-registers (regex lookahead))
|
||||
:positivep (positivep lookahead)))
|
||||
|
||||
(defmethod remove-registers ((lookbehind lookbehind))
|
||||
(make-instance 'lookbehind
|
||||
:regex (remove-registers (regex lookbehind))
|
||||
:positivep (positivep lookbehind)
|
||||
:len (len lookbehind)))
|
||||
|
||||
(defmethod remove-registers ((branch branch))
|
||||
(with-slots ((test test))
|
||||
branch
|
||||
(make-instance 'branch
|
||||
:test (if (typep test 'regex)
|
||||
(remove-registers test)
|
||||
test)
|
||||
:then-regex (remove-registers (then-regex branch))
|
||||
:else-regex (remove-registers (else-regex branch)))))
|
||||
|
||||
(defmethod remove-registers ((alternation alternation))
|
||||
(declare (special remove-registers-p))
|
||||
;; an ALTERNATION, so we can't remove REGISTER objects further down
|
||||
(setq remove-registers-p nil)
|
||||
(copy-regex alternation))
|
||||
|
||||
(defmethod remove-registers ((regex regex))
|
||||
(copy-regex regex))
|
||||
|
||||
(defmethod remove-registers ((seq seq))
|
||||
(make-instance 'seq
|
||||
:elements (mapcar #'remove-registers (elements seq))))
|
||||
|
||||
(defgeneric everythingp (regex)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Returns an EVERYTHING object if REGEX is equivalent
|
||||
to this object, otherwise NIL. So, \"(.){1}\" would return true
|
||||
(i.e. the object corresponding to \".\", for example."))
|
||||
|
||||
(defmethod everythingp ((seq seq))
|
||||
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
|
||||
;; due to the parsing process
|
||||
(let ((cleaned-elements (remove-if #'(lambda (element)
|
||||
(typep element 'void))
|
||||
(elements seq))))
|
||||
(and (= 1 (length cleaned-elements))
|
||||
(everythingp (first cleaned-elements)))))
|
||||
|
||||
(defmethod everythingp ((alternation alternation))
|
||||
(with-slots ((choices choices))
|
||||
alternation
|
||||
(and (= 1 (length choices))
|
||||
;; this is unlikely to happen for human-generated regexes,
|
||||
;; but machine-generated ones might look like this
|
||||
(everythingp (first choices)))))
|
||||
|
||||
(defmethod everythingp ((repetition repetition))
|
||||
(with-slots ((maximum maximum)
|
||||
(minimum minimum)
|
||||
(regex regex))
|
||||
repetition
|
||||
(and maximum
|
||||
(= 1 minimum maximum)
|
||||
;; treat "<regex>{1,1}" like "<regex>"
|
||||
(everythingp regex))))
|
||||
|
||||
(defmethod everythingp ((register register))
|
||||
(everythingp (regex register)))
|
||||
|
||||
(defmethod everythingp ((standalone standalone))
|
||||
(everythingp (regex standalone)))
|
||||
|
||||
(defmethod everythingp ((everything everything))
|
||||
everything)
|
||||
|
||||
(defmethod everythingp ((regex regex))
|
||||
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
|
||||
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, and WORD-BOUNDARY
|
||||
nil)
|
||||
|
||||
(defgeneric regex-length (regex)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
|
||||
|
||||
(defmethod regex-length ((seq seq))
|
||||
;; simply add all inner lengths unless one of them is NIL
|
||||
(loop for sub-regex in (elements seq)
|
||||
for len = (regex-length sub-regex)
|
||||
if (not len) do (return nil)
|
||||
sum len))
|
||||
|
||||
(defmethod regex-length ((alternation alternation))
|
||||
;; only return a true value if all inner lengths are non-NIL and
|
||||
;; mutually equal
|
||||
(loop for sub-regex in (choices alternation)
|
||||
for old-len = nil then len
|
||||
for len = (regex-length sub-regex)
|
||||
if (or (not len)
|
||||
(and old-len (/= len old-len))) do (return nil)
|
||||
finally (return len)))
|
||||
|
||||
(defmethod regex-length ((branch branch))
|
||||
;; only return a true value if both alternations have a length and
|
||||
;; if they're equal
|
||||
(let ((then-length (regex-length (then-regex branch))))
|
||||
(and then-length
|
||||
(eql then-length (regex-length (else-regex branch)))
|
||||
then-length)))
|
||||
|
||||
(defmethod regex-length ((repetition repetition))
|
||||
;; we can only compute the length of a REPETITION object if the
|
||||
;; number of repetitions is fixed; note that we don't call
|
||||
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
|
||||
;; always set correctly
|
||||
(with-slots ((len len)
|
||||
(minimum minimum)
|
||||
(maximum maximum))
|
||||
repetition
|
||||
(if (and len
|
||||
(eq minimum maximum))
|
||||
(* minimum len)
|
||||
nil)))
|
||||
|
||||
(defmethod regex-length ((register register))
|
||||
(regex-length (regex register)))
|
||||
|
||||
(defmethod regex-length ((standalone standalone))
|
||||
(regex-length (regex standalone)))
|
||||
|
||||
(defmethod regex-length ((back-reference back-reference))
|
||||
;; with enough effort we could possibly do better here, but
|
||||
;; currently we just give up and return NIL
|
||||
nil)
|
||||
|
||||
(defmethod regex-length ((char-class char-class))
|
||||
1)
|
||||
|
||||
(defmethod regex-length ((everything everything))
|
||||
1)
|
||||
|
||||
(defmethod regex-length ((str str))
|
||||
(len str))
|
||||
|
||||
(defmethod regex-length ((regex regex))
|
||||
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
||||
;; WORD-BOUNDARY (which all have zero-length)
|
||||
0)
|
||||
|
||||
(defgeneric regex-min-length (regex)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Returns the minimal length of REGEX."))
|
||||
|
||||
(defmethod regex-min-length ((seq seq))
|
||||
;; simply add all inner minimal lengths
|
||||
(loop for sub-regex in (elements seq)
|
||||
for len = (regex-min-length sub-regex)
|
||||
sum len))
|
||||
|
||||
(defmethod regex-min-length ((alternation alternation))
|
||||
;; minimal length of an alternation is the minimal length of the
|
||||
;; "shortest" element
|
||||
(loop for sub-regex in (choices alternation)
|
||||
for len = (regex-min-length sub-regex)
|
||||
minimize len))
|
||||
|
||||
(defmethod regex-min-length ((branch branch))
|
||||
;; minimal length of both alternations
|
||||
(min (regex-min-length (then-regex branch))
|
||||
(regex-min-length (else-regex branch))))
|
||||
|
||||
(defmethod regex-min-length ((repetition repetition))
|
||||
;; obviously the product of the inner minimal length and the minimal
|
||||
;; number of repetitions
|
||||
(* (minimum repetition) (min-len repetition)))
|
||||
|
||||
(defmethod regex-min-length ((register register))
|
||||
(regex-min-length (regex register)))
|
||||
|
||||
(defmethod regex-min-length ((standalone standalone))
|
||||
(regex-min-length (regex standalone)))
|
||||
|
||||
(defmethod regex-min-length ((char-class char-class))
|
||||
1)
|
||||
|
||||
(defmethod regex-min-length ((everything everything))
|
||||
1)
|
||||
|
||||
(defmethod regex-min-length ((str str))
|
||||
(len str))
|
||||
|
||||
(defmethod regex-min-length ((regex regex))
|
||||
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
|
||||
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
|
||||
0)
|
||||
|
||||
(defgeneric compute-offsets (regex start-pos)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(:documentation "Returns the offset the following regex would have
|
||||
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
|
||||
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
|
||||
slots of STR objects further down the tree."))
|
||||
|
||||
;; note that we're actually only interested in the offset of
|
||||
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
|
||||
;; can stop at variable-length alternations and don't need to descend
|
||||
;; into repetitions
|
||||
|
||||
(defmethod compute-offsets ((seq seq) start-pos)
|
||||
(loop for element in (elements seq)
|
||||
;; advance offset argument for next call while looping through
|
||||
;; the elements
|
||||
for pos = start-pos then curr-offset
|
||||
for curr-offset = (compute-offsets element pos)
|
||||
while curr-offset
|
||||
finally (return curr-offset)))
|
||||
|
||||
(defmethod compute-offsets ((alternation alternation) start-pos)
|
||||
(loop for choice in (choices alternation)
|
||||
for old-offset = nil then curr-offset
|
||||
for curr-offset = (compute-offsets choice start-pos)
|
||||
;; we stop immediately if two alternations don't result in the
|
||||
;; same offset
|
||||
if (or (not curr-offset)
|
||||
(and old-offset (/= curr-offset old-offset)))
|
||||
do (return nil)
|
||||
finally (return curr-offset)))
|
||||
|
||||
(defmethod compute-offsets ((branch branch) start-pos)
|
||||
;; only return offset if both alternations have equal value
|
||||
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
|
||||
(and then-offset
|
||||
(eql then-offset (compute-offsets (else-regex branch) start-pos))
|
||||
then-offset)))
|
||||
|
||||
(defmethod compute-offsets ((repetition repetition) start-pos)
|
||||
;; no need to descend into the inner regex
|
||||
(with-slots ((len len)
|
||||
(minimum minimum)
|
||||
(maximum maximum))
|
||||
repetition
|
||||
(if (and len
|
||||
(eq minimum maximum))
|
||||
;; fixed number of repetitions, so we know how to proceed
|
||||
(+ start-pos (* minimum len))
|
||||
;; otherwise return NIL
|
||||
nil)))
|
||||
|
||||
(defmethod compute-offsets ((register register) start-pos)
|
||||
(compute-offsets (regex register) start-pos))
|
||||
|
||||
(defmethod compute-offsets ((standalone standalone) start-pos)
|
||||
(compute-offsets (regex standalone) start-pos))
|
||||
|
||||
(defmethod compute-offsets ((char-class char-class) start-pos)
|
||||
(1+ start-pos))
|
||||
|
||||
(defmethod compute-offsets ((everything everything) start-pos)
|
||||
(1+ start-pos))
|
||||
|
||||
(defmethod compute-offsets ((str str) start-pos)
|
||||
(setf (offset str) start-pos)
|
||||
(+ start-pos (len str)))
|
||||
|
||||
(defmethod compute-offsets ((back-reference back-reference) start-pos)
|
||||
;; with enough effort we could possibly do better here, but
|
||||
;; currently we just give up and return NIL
|
||||
(declare (ignore start-pos))
|
||||
nil)
|
||||
|
||||
(defmethod compute-offsets ((regex regex) start-pos)
|
||||
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
||||
;; WORD-BOUNDARY (which all have zero-length)
|
||||
start-pos)
|
||||
868
repetition-closures.lisp
Normal file
868
repetition-closures.lisp
Normal file
@ -0,0 +1,868 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/repetition-closures.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; This is actually a part of closures.lisp which we put into a
|
||||
;;; separate file because it is rather complex. We only deal with
|
||||
;;; REPETITIONs here. Note that this part of the code contains some
|
||||
;;; rather crazy micro-optimizations which were introduced to be as
|
||||
;;; competitive with Perl as possible in tight loops.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(defmacro incf-after (place &optional (delta 1) &environment env)
|
||||
"Utility macro inspired by C's \"place++\", i.e. first return the
|
||||
value of PLACE and afterwards increment it by DELTA."
|
||||
(with-unique-names (%temp)
|
||||
(multiple-value-bind (vars vals store-vars writer-form reader-form)
|
||||
(get-setf-expansion place env)
|
||||
`(let* (,@(mapcar #'list vars vals)
|
||||
(,%temp ,reader-form)
|
||||
(,(car store-vars) (+ ,%temp ,delta)))
|
||||
,writer-form
|
||||
,%temp))))
|
||||
|
||||
;; code for greedy repetitions with minimum zero
|
||||
|
||||
(defmacro greedy-constant-length-closure (check-curr-pos)
|
||||
"This is the template for simple greedy repetitions (where simple
|
||||
means that the minimum number of repetitions is zero, that the inner
|
||||
regex to be checked is of fixed length LEN, and that it doesn't
|
||||
contain registers, i.e. there's no need for backtracking).
|
||||
CHECK-CURR-POS is a form which checks whether the inner regex of the
|
||||
repetition matches at CURR-POS."
|
||||
`(if maximum
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos maximum))
|
||||
;; because we know LEN we know in advance where to stop at the
|
||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||
;; minimal length of the part behind the repetition
|
||||
(let ((target-end-pos (min (1+ (- *end-pos* len min-rest))
|
||||
;; don't go further than MAXIMUM
|
||||
;; repetitions, of course
|
||||
(+ start-pos
|
||||
(the fixnum (* len maximum)))))
|
||||
(curr-pos start-pos))
|
||||
(declare (type fixnum target-end-pos curr-pos))
|
||||
(block greedy-constant-length-matcher
|
||||
;; we use an ugly TAGBODY construct because this might be a
|
||||
;; tight loop and this version is a bit faster than our LOOP
|
||||
;; version (at least in CMUCL)
|
||||
(tagbody
|
||||
forward-loop
|
||||
;; first go forward as far as possible, i.e. while
|
||||
;; the inner regex matches
|
||||
(when (>= curr-pos target-end-pos)
|
||||
(go backward-loop))
|
||||
(when ,check-curr-pos
|
||||
(incf curr-pos len)
|
||||
(go forward-loop))
|
||||
backward-loop
|
||||
;; now go back LEN steps each until we're able to match
|
||||
;; the rest of the regex
|
||||
(when (< curr-pos start-pos)
|
||||
(return-from greedy-constant-length-matcher nil))
|
||||
(let ((result (funcall next-fn curr-pos)))
|
||||
(when result
|
||||
(return-from greedy-constant-length-matcher result)))
|
||||
(decf curr-pos len)
|
||||
(go backward-loop)))))
|
||||
;; basically the same code; it's just a bit easier because we're
|
||||
;; not bounded by MAXIMUM
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((target-end-pos (1+ (- *end-pos* len min-rest)))
|
||||
(curr-pos start-pos))
|
||||
(declare (type fixnum target-end-pos curr-pos))
|
||||
(block greedy-constant-length-matcher
|
||||
(tagbody
|
||||
forward-loop
|
||||
(when (>= curr-pos target-end-pos)
|
||||
(go backward-loop))
|
||||
(when ,check-curr-pos
|
||||
(incf curr-pos len)
|
||||
(go forward-loop))
|
||||
backward-loop
|
||||
(when (< curr-pos start-pos)
|
||||
(return-from greedy-constant-length-matcher nil))
|
||||
(let ((result (funcall next-fn curr-pos)))
|
||||
(when result
|
||||
(return-from greedy-constant-length-matcher result)))
|
||||
(decf curr-pos len)
|
||||
(go backward-loop)))))))
|
||||
|
||||
(defun create-greedy-everything-matcher (maximum min-rest next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (type fixnum min-rest)
|
||||
(type function next-fn))
|
||||
"Creates a closure which just matches as far ahead as possible,
|
||||
i.e. a closure for a dot in single-line mode."
|
||||
(if maximum
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos maximum))
|
||||
;; because we know LEN we know in advance where to stop at the
|
||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||
;; minimal length of the part behind the repetition
|
||||
(let ((target-end-pos (min (+ start-pos maximum)
|
||||
(- *end-pos* min-rest))))
|
||||
(declare (type fixnum target-end-pos))
|
||||
;; start from the highest possible position and go backward
|
||||
;; until we're able to match the rest of the regex
|
||||
(loop for curr-pos of-type fixnum from target-end-pos downto start-pos
|
||||
thereis (funcall next-fn curr-pos))))
|
||||
;; basically the same code; it's just a bit easier because we're
|
||||
;; not bounded by MAXIMUM
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((target-end-pos (- *end-pos* min-rest)))
|
||||
(declare (type fixnum target-end-pos))
|
||||
(loop for curr-pos of-type fixnum from target-end-pos downto start-pos
|
||||
thereis (funcall next-fn curr-pos))))))
|
||||
|
||||
(defmethod create-greedy-constant-length-matcher ((repetition repetition)
|
||||
next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is greedy and the minimal number of repetitions is
|
||||
zero. It is furthermore assumed that the inner regex of REPETITION is
|
||||
of fixed length and doesn't contain registers."
|
||||
(let ((len (len repetition))
|
||||
(maximum (maximum repetition))
|
||||
(regex (regex repetition))
|
||||
(min-rest (min-rest repetition)))
|
||||
(declare (type fixnum len min-rest)
|
||||
(type function next-fn))
|
||||
(cond ((zerop len)
|
||||
;; inner regex has zero-length, so we can discard it
|
||||
;; completely
|
||||
next-fn)
|
||||
(t
|
||||
;; now first try to optimize for a couple of common cases
|
||||
(typecase regex
|
||||
(str
|
||||
(let ((str (str regex)))
|
||||
(if (= 1 len)
|
||||
;; a single character
|
||||
(let ((chr (schar str 0)))
|
||||
(if (case-insensitive-p regex)
|
||||
(greedy-constant-length-closure
|
||||
(char-equal chr (schar *string* curr-pos)))
|
||||
(greedy-constant-length-closure
|
||||
(char= chr (schar *string* curr-pos)))))
|
||||
;; a string
|
||||
(if (case-insensitive-p regex)
|
||||
(greedy-constant-length-closure
|
||||
(*string*-equal str curr-pos (+ curr-pos len) 0 len))
|
||||
(greedy-constant-length-closure
|
||||
(*string*= str curr-pos (+ curr-pos len) 0 len))))))
|
||||
(char-class
|
||||
;; a character class
|
||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||
(if (invertedp regex)
|
||||
(greedy-constant-length-closure
|
||||
(not (char-class-test)))
|
||||
(greedy-constant-length-closure
|
||||
(char-class-test)))))
|
||||
(everything
|
||||
;; an EVERYTHING object, i.e. a dot
|
||||
(if (single-line-p regex)
|
||||
(create-greedy-everything-matcher maximum min-rest next-fn)
|
||||
(greedy-constant-length-closure
|
||||
(char/= #\Newline (schar *string* curr-pos)))))
|
||||
(t
|
||||
;; the general case - we build an inner matcher which
|
||||
;; just checks for immediate success, i.e. NEXT-FN is
|
||||
;; #'IDENTITY
|
||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||
(declare (type function inner-matcher))
|
||||
(greedy-constant-length-closure
|
||||
(funcall inner-matcher curr-pos)))))))))
|
||||
|
||||
(defmethod create-greedy-no-zero-matcher ((repetition repetition) next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is greedy and the minimal number of repetitions is
|
||||
zero. It is furthermore assumed that the inner regex of REPETITION can
|
||||
never match a zero-length string (or instead the maximal number of
|
||||
repetitions is 1)."
|
||||
(let ((maximum (maximum repetition))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type function next-fn))
|
||||
(cond
|
||||
((eql maximum 1)
|
||||
;; this is essentially like the next case but with a known
|
||||
;; MAXIMUM of 1 we can get away without a counter; note that
|
||||
;; we always arrive here if CONVERT optimizes <regex>* to
|
||||
;; (?:<regex'>*<regex>)?
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) next-fn))
|
||||
(lambda (start-pos)
|
||||
(declare (type function repeat-matcher))
|
||||
(or (funcall repeat-matcher start-pos)
|
||||
(funcall next-fn start-pos))))
|
||||
(maximum
|
||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||
;; because we need to keep track whether we've reached MAXIMUM
|
||||
;; repetitions
|
||||
(let ((rep-num (incf-after *rep-num*)))
|
||||
(flet ((greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos maximum rep-num)
|
||||
(type function repeat-matcher))
|
||||
;; the actual matcher which first tries to match the
|
||||
;; inner regex of REPETITION (if we haven't done so
|
||||
;; too often) and on failure calls NEXT-FN
|
||||
(or (and (< (aref *repeat-counters* rep-num) maximum)
|
||||
(incf (aref *repeat-counters* rep-num))
|
||||
;; note that REPEAT-MATCHER will call
|
||||
;; GREEDY-AUX again recursively
|
||||
(prog1
|
||||
(funcall repeat-matcher start-pos)
|
||||
(decf (aref *repeat-counters* rep-num))))
|
||||
(funcall next-fn start-pos))))
|
||||
;; create a closure to match the inner regex and to
|
||||
;; implement backtracking via GREEDY-AUX
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'greedy-aux))
|
||||
;; the closure we return is just a thin wrapper around
|
||||
;; GREEDY-AUX to initialize the repetition counter
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0)
|
||||
(greedy-aux start-pos)))))
|
||||
(t
|
||||
;; easier code because we're not bounded by MAXIMUM, but
|
||||
;; basically the same
|
||||
(flet ((greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(or (funcall repeat-matcher start-pos)
|
||||
(funcall next-fn start-pos))))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'greedy-aux))
|
||||
#'greedy-aux)))))
|
||||
|
||||
(defmethod create-greedy-matcher ((repetition repetition) next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is greedy and the minimal number of repetitions is
|
||||
zero."
|
||||
(let ((maximum (maximum repetition))
|
||||
;; we make a reservation for our slot in *LAST-POS-STORES* because
|
||||
;; we have to watch out for endless loops as the inner regex might
|
||||
;; match zero-length strings
|
||||
(zero-length-num (incf-after *zero-length-num*))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type fixnum zero-length-num)
|
||||
(type function next-fn))
|
||||
(cond
|
||||
(maximum
|
||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||
;; because we need to keep track whether we've reached MAXIMUM
|
||||
;; repetitions
|
||||
(let ((rep-num (incf-after *rep-num*)))
|
||||
(flet ((greedy-aux (start-pos)
|
||||
;; the actual matcher which first tries to match the
|
||||
;; inner regex of REPETITION (if we haven't done so
|
||||
;; too often) and on failure calls NEXT-FN
|
||||
(declare (type fixnum start-pos maximum rep-num)
|
||||
(type function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
(= (the fixnum old-last-pos) start-pos))
|
||||
;; stop immediately if we've been here before,
|
||||
;; i.e. if the last attempt matched a zero-length
|
||||
;; string
|
||||
(return-from greedy-aux (funcall next-fn start-pos)))
|
||||
;; otherwise remember this position for the next
|
||||
;; repetition
|
||||
(setf (svref *last-pos-stores* zero-length-num) start-pos)
|
||||
(or (and (< (aref *repeat-counters* rep-num) maximum)
|
||||
(incf (aref *repeat-counters* rep-num))
|
||||
;; note that REPEAT-MATCHER will call
|
||||
;; GREEDY-AUX again recursively
|
||||
(prog1
|
||||
(funcall repeat-matcher start-pos)
|
||||
(decf (aref *repeat-counters* rep-num))
|
||||
(setf (svref *last-pos-stores* zero-length-num)
|
||||
old-last-pos)))
|
||||
(funcall next-fn start-pos)))))
|
||||
;; create a closure to match the inner regex and to
|
||||
;; implement backtracking via GREEDY-AUX
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'greedy-aux))
|
||||
;; the closure we return is just a thin wrapper around
|
||||
;; GREEDY-AUX to initialize the repetition counter and our
|
||||
;; slot in *LAST-POS-STORES*
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0
|
||||
(svref *last-pos-stores* zero-length-num) nil)
|
||||
(greedy-aux start-pos)))))
|
||||
(t
|
||||
;; easier code because we're not bounded by MAXIMUM, but
|
||||
;; basically the same
|
||||
(flet ((greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
(= (the fixnum old-last-pos) start-pos))
|
||||
(return-from greedy-aux (funcall next-fn start-pos)))
|
||||
(setf (svref *last-pos-stores* zero-length-num) start-pos)
|
||||
(or (prog1
|
||||
(funcall repeat-matcher start-pos)
|
||||
(setf (svref *last-pos-stores* zero-length-num) old-last-pos))
|
||||
(funcall next-fn start-pos)))))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'greedy-aux))
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(setf (svref *last-pos-stores* zero-length-num) nil)
|
||||
(greedy-aux start-pos)))))))
|
||||
|
||||
;; code for non-greedy repetitions with minimum zero
|
||||
|
||||
(defmacro non-greedy-constant-length-closure (check-curr-pos)
|
||||
"This is the template for simple non-greedy repetitions (where
|
||||
simple means that the minimum number of repetitions is zero, that the
|
||||
inner regex to be checked is of fixed length LEN, and that it doesn't
|
||||
contain registers, i.e. there's no need for backtracking).
|
||||
CHECK-CURR-POS is a form which checks whether the inner regex of the
|
||||
repetition matches at CURR-POS."
|
||||
`(if maximum
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos maximum))
|
||||
;; because we know LEN we know in advance where to stop at the
|
||||
;; latest; we also take into consideration MIN-REST, i.e. the
|
||||
;; minimal length of the part behind the repetition
|
||||
(let ((target-end-pos (min (1+ (- *end-pos* len min-rest))
|
||||
(+ start-pos
|
||||
(the fixnum (* len maximum))))))
|
||||
;; move forward by LEN and always try NEXT-FN first, then
|
||||
;; CHECK-CUR-POS
|
||||
(loop for curr-pos of-type fixnum from start-pos
|
||||
below target-end-pos
|
||||
by len
|
||||
thereis (funcall next-fn curr-pos)
|
||||
while ,check-curr-pos
|
||||
finally (return (funcall next-fn curr-pos)))))
|
||||
;; basically the same code; it's just a bit easier because we're
|
||||
;; not bounded by MAXIMUM
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((target-end-pos (1+ (- *end-pos* len min-rest))))
|
||||
(loop for curr-pos of-type fixnum from start-pos
|
||||
below target-end-pos
|
||||
by len
|
||||
thereis (funcall next-fn curr-pos)
|
||||
while ,check-curr-pos
|
||||
finally (return (funcall next-fn curr-pos)))))))
|
||||
|
||||
(defmethod create-non-greedy-constant-length-matcher ((repetition repetition) next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
||||
zero. It is furthermore assumed that the inner regex of REPETITION is
|
||||
of fixed length and doesn't contain registers."
|
||||
(let ((len (len repetition))
|
||||
(maximum (maximum repetition))
|
||||
(regex (regex repetition))
|
||||
(min-rest (min-rest repetition)))
|
||||
(declare (type fixnum len min-rest)
|
||||
(type function next-fn))
|
||||
(cond ((zerop len)
|
||||
;; inner regex has zero-length, so we can discard it
|
||||
;; completely
|
||||
next-fn)
|
||||
(t
|
||||
;; now first try to optimize for a couple of common cases
|
||||
(typecase regex
|
||||
(str
|
||||
(let ((str (str regex)))
|
||||
(if (= 1 len)
|
||||
;; a single character
|
||||
(let ((chr (schar str 0)))
|
||||
(if (case-insensitive-p regex)
|
||||
(non-greedy-constant-length-closure
|
||||
(char-equal chr (schar *string* curr-pos)))
|
||||
(non-greedy-constant-length-closure
|
||||
(char= chr (schar *string* curr-pos)))))
|
||||
;; a string
|
||||
(if (case-insensitive-p regex)
|
||||
(non-greedy-constant-length-closure
|
||||
(*string*-equal str curr-pos (+ curr-pos len) 0 len))
|
||||
(non-greedy-constant-length-closure
|
||||
(*string*= str curr-pos (+ curr-pos len) 0 len))))))
|
||||
(char-class
|
||||
;; a character class
|
||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||
(if (invertedp regex)
|
||||
(non-greedy-constant-length-closure
|
||||
(not (char-class-test)))
|
||||
(non-greedy-constant-length-closure
|
||||
(char-class-test)))))
|
||||
(everything
|
||||
(if (single-line-p regex)
|
||||
;; a dot which really can match everything; we rely
|
||||
;; on the compiler to optimize this away
|
||||
(non-greedy-constant-length-closure
|
||||
t)
|
||||
;; a dot which has to watch out for #\Newline
|
||||
(non-greedy-constant-length-closure
|
||||
(char/= #\Newline (schar *string* curr-pos)))))
|
||||
(t
|
||||
;; the general case - we build an inner matcher which
|
||||
;; just checks for immediate success, i.e. NEXT-FN is
|
||||
;; #'IDENTITY
|
||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||
(declare (type function inner-matcher))
|
||||
(non-greedy-constant-length-closure
|
||||
(funcall inner-matcher curr-pos)))))))))
|
||||
|
||||
(defmethod create-non-greedy-no-zero-matcher ((repetition repetition) next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
||||
zero. It is furthermore assumed that the inner regex of REPETITION can
|
||||
never match a zero-length string (or instead the maximal number of
|
||||
repetitions is 1)."
|
||||
(let ((maximum (maximum repetition))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after NON-GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type function next-fn))
|
||||
(cond
|
||||
((eql maximum 1)
|
||||
;; this is essentially like the next case but with a known
|
||||
;; MAXIMUM of 1 we can get away without a counter
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) next-fn))
|
||||
(lambda (start-pos)
|
||||
(declare (type function repeat-matcher))
|
||||
(or (funcall next-fn start-pos)
|
||||
(funcall repeat-matcher start-pos))))
|
||||
(maximum
|
||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||
;; because we need to keep track whether we've reached MAXIMUM
|
||||
;; repetitions
|
||||
(let ((rep-num (incf-after *rep-num*)))
|
||||
(flet ((non-greedy-aux (start-pos)
|
||||
;; the actual matcher which first calls NEXT-FN and
|
||||
;; on failure tries to match the inner regex of
|
||||
;; REPETITION (if we haven't done so too often)
|
||||
(declare (type fixnum start-pos maximum rep-num)
|
||||
(type function repeat-matcher))
|
||||
(or (funcall next-fn start-pos)
|
||||
(and (< (aref *repeat-counters* rep-num) maximum)
|
||||
(incf (aref *repeat-counters* rep-num))
|
||||
;; note that REPEAT-MATCHER will call
|
||||
;; NON-GREEDY-AUX again recursively
|
||||
(prog1
|
||||
(funcall repeat-matcher start-pos)
|
||||
(decf (aref *repeat-counters* rep-num)))))))
|
||||
;; create a closure to match the inner regex and to
|
||||
;; implement backtracking via NON-GREEDY-AUX
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'non-greedy-aux))
|
||||
;; the closure we return is just a thin wrapper around
|
||||
;; NON-GREEDY-AUX to initialize the repetition counter
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0)
|
||||
(non-greedy-aux start-pos)))))
|
||||
(t
|
||||
;; easier code because we're not bounded by MAXIMUM, but
|
||||
;; basically the same
|
||||
(flet ((non-greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(or (funcall next-fn start-pos)
|
||||
(funcall repeat-matcher start-pos))))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'non-greedy-aux))
|
||||
#'non-greedy-aux)))))
|
||||
|
||||
(defmethod create-non-greedy-matcher ((repetition repetition) next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION is non-greedy and the minimal number of repetitions is
|
||||
zero."
|
||||
;; we make a reservation for our slot in *LAST-POS-STORES* because
|
||||
;; we have to watch out for endless loops as the inner regex might
|
||||
;; match zero-length strings
|
||||
(let ((zero-length-num (incf-after *zero-length-num*))
|
||||
(maximum (maximum repetition))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after NON-GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type fixnum zero-length-num)
|
||||
(type function next-fn))
|
||||
(cond
|
||||
(maximum
|
||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||
;; because we need to keep track whether we've reached MAXIMUM
|
||||
;; repetitions
|
||||
(let ((rep-num (incf-after *rep-num*)))
|
||||
(flet ((non-greedy-aux (start-pos)
|
||||
;; the actual matcher which first calls NEXT-FN and
|
||||
;; on failure tries to match the inner regex of
|
||||
;; REPETITION (if we haven't done so too often)
|
||||
(declare (type fixnum start-pos maximum rep-num)
|
||||
(type function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
(= (the fixnum old-last-pos) start-pos))
|
||||
;; stop immediately if we've been here before,
|
||||
;; i.e. if the last attempt matched a zero-length
|
||||
;; string
|
||||
(return-from non-greedy-aux (funcall next-fn start-pos)))
|
||||
;; otherwise remember this position for the next
|
||||
;; repetition
|
||||
(setf (svref *last-pos-stores* zero-length-num) start-pos)
|
||||
(or (funcall next-fn start-pos)
|
||||
(and (< (aref *repeat-counters* rep-num) maximum)
|
||||
(incf (aref *repeat-counters* rep-num))
|
||||
;; note that REPEAT-MATCHER will call
|
||||
;; NON-GREEDY-AUX again recursively
|
||||
(prog1
|
||||
(funcall repeat-matcher start-pos)
|
||||
(decf (aref *repeat-counters* rep-num))
|
||||
(setf (svref *last-pos-stores* zero-length-num)
|
||||
old-last-pos)))))))
|
||||
;; create a closure to match the inner regex and to
|
||||
;; implement backtracking via NON-GREEDY-AUX
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'non-greedy-aux))
|
||||
;; the closure we return is just a thin wrapper around
|
||||
;; NON-GREEDY-AUX to initialize the repetition counter and our
|
||||
;; slot in *LAST-POS-STORES*
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0
|
||||
(svref *last-pos-stores* zero-length-num) nil)
|
||||
(non-greedy-aux start-pos)))))
|
||||
(t
|
||||
;; easier code because we're not bounded by MAXIMUM, but
|
||||
;; basically the same
|
||||
(flet ((non-greedy-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
(= (the fixnum old-last-pos) start-pos))
|
||||
(return-from non-greedy-aux (funcall next-fn start-pos)))
|
||||
(setf (svref *last-pos-stores* zero-length-num) start-pos)
|
||||
(or (funcall next-fn start-pos)
|
||||
(prog1
|
||||
(funcall repeat-matcher start-pos)
|
||||
(setf (svref *last-pos-stores* zero-length-num)
|
||||
old-last-pos))))))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'non-greedy-aux))
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(setf (svref *last-pos-stores* zero-length-num) nil)
|
||||
(non-greedy-aux start-pos)))))))
|
||||
|
||||
;; code for constant repetitions, i.e. those with a fixed number of repetitions
|
||||
|
||||
(defmacro constant-repetition-constant-length-closure (check-curr-pos)
|
||||
"This is the template for simple constant repetitions (where simple
|
||||
means that the inner regex to be checked is of fixed length LEN, and
|
||||
that it doesn't contain registers, i.e. there's no need for
|
||||
backtracking) and where constant means that MINIMUM is equal to
|
||||
MAXIMUM. CHECK-CURR-POS is a form which checks whether the inner regex
|
||||
of the repetition matches at CURR-POS."
|
||||
`(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((target-end-pos (+ start-pos
|
||||
(the fixnum (* len repetitions)))))
|
||||
(declare (type fixnum target-end-pos))
|
||||
;; first check if we won't go beyond the end of the string
|
||||
(and (>= *end-pos* target-end-pos)
|
||||
;; then loop through all repetitions step by step
|
||||
(loop for curr-pos of-type fixnum from start-pos
|
||||
below target-end-pos
|
||||
by len
|
||||
always ,check-curr-pos)
|
||||
;; finally call NEXT-FN if we made it that far
|
||||
(funcall next-fn target-end-pos)))))
|
||||
|
||||
(defmethod create-constant-repetition-constant-length-matcher
|
||||
((repetition repetition) next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION has a constant number of repetitions. It is
|
||||
furthermore assumed that the inner regex of REPETITION is of fixed
|
||||
length and doesn't contain registers."
|
||||
(let ((len (len repetition))
|
||||
(repetitions (minimum repetition))
|
||||
(regex (regex repetition)))
|
||||
(declare (type fixnum len repetitions)
|
||||
(type function next-fn))
|
||||
(if (zerop len)
|
||||
;; if the length is zero it suffices to try once
|
||||
(create-matcher-aux regex next-fn)
|
||||
;; otherwise try to optimize for a couple of common cases
|
||||
(typecase regex
|
||||
(str
|
||||
(let ((str (str regex)))
|
||||
(if (= 1 len)
|
||||
;; a single character
|
||||
(let ((chr (schar str 0)))
|
||||
(if (case-insensitive-p regex)
|
||||
(constant-repetition-constant-length-closure
|
||||
(and (char-equal chr (schar *string* curr-pos))
|
||||
(1+ curr-pos)))
|
||||
(constant-repetition-constant-length-closure
|
||||
(and (char= chr (schar *string* curr-pos))
|
||||
(1+ curr-pos)))))
|
||||
;; a string
|
||||
(if (case-insensitive-p regex)
|
||||
(constant-repetition-constant-length-closure
|
||||
(let ((next-pos (+ curr-pos len)))
|
||||
(declare (type fixnum next-pos))
|
||||
(and (*string*-equal str curr-pos next-pos 0 len)
|
||||
next-pos)))
|
||||
(constant-repetition-constant-length-closure
|
||||
(let ((next-pos (+ curr-pos len)))
|
||||
(declare (type fixnum next-pos))
|
||||
(and (*string*= str curr-pos next-pos 0 len)
|
||||
next-pos)))))))
|
||||
(char-class
|
||||
;; a character class
|
||||
(insert-char-class-tester (regex (schar *string* curr-pos))
|
||||
(if (invertedp regex)
|
||||
(constant-repetition-constant-length-closure
|
||||
(and (not (char-class-test))
|
||||
(1+ curr-pos)))
|
||||
(constant-repetition-constant-length-closure
|
||||
(and (char-class-test)
|
||||
(1+ curr-pos))))))
|
||||
(everything
|
||||
(if (single-line-p regex)
|
||||
;; a dot which really matches everything - we just have to
|
||||
;; advance the index into *STRING* accordingly and check
|
||||
;; if we didn't go past the end
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(let ((next-pos (+ start-pos repetitions)))
|
||||
(declare (type fixnum next-pos))
|
||||
(or (<= next-pos *end-pos*)
|
||||
(funcall next-fn next-pos))))
|
||||
;; a dot which is not in single-line-mode - make sure we
|
||||
;; don't match #\Newline
|
||||
(constant-repetition-constant-length-closure
|
||||
(and (char/= #\Newline (schar *string* curr-pos))
|
||||
(1+ curr-pos)))))
|
||||
(t
|
||||
;; the general case - we build an inner matcher which just
|
||||
;; checks for immediate success, i.e. NEXT-FN is #'IDENTITY
|
||||
(let ((inner-matcher (create-matcher-aux regex #'identity)))
|
||||
(declare (type function inner-matcher))
|
||||
(constant-repetition-constant-length-closure
|
||||
(funcall inner-matcher curr-pos))))))))
|
||||
|
||||
(defmethod create-constant-repetition-matcher ((repetition repetition) next-fn)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Creates a closure which tries to match REPETITION. It is assumed
|
||||
that REPETITION has a constant number of repetitions."
|
||||
(let ((repetitions (minimum repetition))
|
||||
;; we make a reservation for our slot in *REPEAT-COUNTERS*
|
||||
;; because we need to keep track of the number of repetitions
|
||||
(rep-num (incf-after *rep-num*))
|
||||
;; REPEAT-MATCHER is part of the closure's environment but it
|
||||
;; can only be defined after NON-GREEDY-AUX is defined
|
||||
repeat-matcher)
|
||||
(declare (type fixnum repetitions rep-num)
|
||||
(type function next-fn))
|
||||
(if (zerop (min-len repetition))
|
||||
;; we make a reservation for our slot in *LAST-POS-STORES*
|
||||
;; because we have to watch out for needless loops as the inner
|
||||
;; regex might match zero-length strings
|
||||
(let ((zero-length-num (incf-after *zero-length-num*)))
|
||||
(declare (type fixnum zero-length-num))
|
||||
(flet ((constant-aux (start-pos)
|
||||
;; the actual matcher which first calls NEXT-FN and
|
||||
;; on failure tries to match the inner regex of
|
||||
;; REPETITION (if we haven't done so too often)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(let ((old-last-pos
|
||||
(svref *last-pos-stores* zero-length-num)))
|
||||
(when (and old-last-pos
|
||||
(= (the fixnum old-last-pos) start-pos))
|
||||
;; if we've been here before we matched a
|
||||
;; zero-length string the last time, so we can
|
||||
;; just carry on because we will definitely be
|
||||
;; able to do this again often enough
|
||||
(return-from constant-aux (funcall next-fn start-pos)))
|
||||
;; otherwise remember this position for the next
|
||||
;; repetition
|
||||
(setf (svref *last-pos-stores* zero-length-num) start-pos)
|
||||
(cond ((< (aref *repeat-counters* rep-num) repetitions)
|
||||
;; not enough repetitions yet, try it again
|
||||
(incf (aref *repeat-counters* rep-num))
|
||||
;; note that REPEAT-MATCHER will call
|
||||
;; CONSTANT-AUX again recursively
|
||||
(prog1
|
||||
(funcall repeat-matcher start-pos)
|
||||
(decf (aref *repeat-counters* rep-num))
|
||||
(setf (svref *last-pos-stores* zero-length-num)
|
||||
old-last-pos)))
|
||||
(t
|
||||
;; we're done - call NEXT-FN
|
||||
(funcall next-fn start-pos))))))
|
||||
;; create a closure to match the inner regex and to
|
||||
;; implement backtracking via CONSTANT-AUX
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'constant-aux))
|
||||
;; the closure we return is just a thin wrapper around
|
||||
;; CONSTANT-AUX to initialize the repetition counter
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0
|
||||
(aref *last-pos-stores* zero-length-num) nil)
|
||||
(constant-aux start-pos))))
|
||||
;; easier code because we don't have to care about zero-length
|
||||
;; matches but basically the same
|
||||
(flet ((constant-aux (start-pos)
|
||||
(declare (type fixnum start-pos)
|
||||
(type function repeat-matcher))
|
||||
(cond ((< (aref *repeat-counters* rep-num) repetitions)
|
||||
(incf (aref *repeat-counters* rep-num))
|
||||
(prog1
|
||||
(funcall repeat-matcher start-pos)
|
||||
(decf (aref *repeat-counters* rep-num))))
|
||||
(t (funcall next-fn start-pos)))))
|
||||
(setq repeat-matcher
|
||||
(create-matcher-aux (regex repetition) #'constant-aux))
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(setf (aref *repeat-counters* rep-num) 0)
|
||||
(constant-aux start-pos))))))
|
||||
|
||||
;; the actual CREATE-MATCHER-AUX method for REPETITION objects which
|
||||
;; utilizes all the functions and macros defined above
|
||||
|
||||
(defmethod create-matcher-aux ((repetition repetition) next-fn)
|
||||
(with-slots ((minimum minimum)
|
||||
(maximum maximum)
|
||||
(len len)
|
||||
(min-len min-len)
|
||||
(greedyp greedyp)
|
||||
(contains-register-p contains-register-p))
|
||||
repetition
|
||||
(cond ((and maximum
|
||||
(zerop maximum))
|
||||
;; this should have been optimized away by CONVERT but just
|
||||
;; in case...
|
||||
(error "Got REPETITION with MAXIMUM 0 \(should not happen)"))
|
||||
((and maximum
|
||||
(= minimum maximum 1))
|
||||
;; this should have been optimized away by CONVERT but just
|
||||
;; in case...
|
||||
(error "Got REPETITION with MAXIMUM 1 and MINIMUM 1 \(should not happen)"))
|
||||
((and (eql minimum maximum)
|
||||
len
|
||||
(not contains-register-p))
|
||||
(create-constant-repetition-constant-length-matcher repetition next-fn))
|
||||
((eql minimum maximum)
|
||||
(create-constant-repetition-matcher repetition next-fn))
|
||||
((and greedyp
|
||||
len
|
||||
(not contains-register-p))
|
||||
(create-greedy-constant-length-matcher repetition next-fn))
|
||||
((and greedyp
|
||||
(or (plusp min-len)
|
||||
(eql maximum 1)))
|
||||
(create-greedy-no-zero-matcher repetition next-fn))
|
||||
(greedyp
|
||||
(create-greedy-matcher repetition next-fn))
|
||||
((and len
|
||||
(plusp len)
|
||||
(not contains-register-p))
|
||||
(create-non-greedy-constant-length-matcher repetition next-fn))
|
||||
((or (plusp min-len)
|
||||
(eql maximum 1))
|
||||
(create-non-greedy-no-zero-matcher repetition next-fn))
|
||||
(t
|
||||
(create-non-greedy-matcher repetition next-fn)))))
|
||||
519
scanner.lisp
Normal file
519
scanner.lisp
Normal file
@ -0,0 +1,519 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/scanner.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Here the scanner for the actual regex as well as utility scanners
|
||||
;;; for the constant start and end strings are created.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(defmacro bmh-matcher-aux (&key case-insensitive-p)
|
||||
"Auxiliary macro used by CREATE-BMH-MATCHER."
|
||||
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
||||
`(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(if (> (the fixnum (+ start-pos m)) *end-pos*)
|
||||
nil
|
||||
(loop named bmh-matcher
|
||||
for k of-type fixnum = (+ start-pos m -1)
|
||||
then (+ k (max 1 (aref skip (char-code (schar *string* k)))))
|
||||
while (< k *end-pos*)
|
||||
do (loop for j of-type fixnum downfrom (1- m)
|
||||
for i of-type fixnum downfrom k
|
||||
while (and (>= j 0)
|
||||
(,char-compare (schar *string* i)
|
||||
(schar pattern j)))
|
||||
finally (if (minusp j)
|
||||
(return-from bmh-matcher (1+ i)))))))))
|
||||
|
||||
(defun create-bmh-matcher (pattern case-insensitive-p)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns a Boyer-Moore-Horspool matcher which searches the (special)
|
||||
simple-string *STRING* for the first occurence of the substring
|
||||
PATTERN. The search starts at the position START-POS within *STRING*
|
||||
and stops before *END-POS* is reached. Depending on the second
|
||||
argument the search is case-insensitive or not. If the special
|
||||
variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
|
||||
instead. (BMH matchers are faster but need much more space.)"
|
||||
;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
|
||||
;; details
|
||||
(unless *use-bmh-matchers*
|
||||
(let ((test (if case-insensitive-p #'char-equal #'char=)))
|
||||
(return-from create-bmh-matcher
|
||||
(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(search pattern
|
||||
*string*
|
||||
:start2 start-pos
|
||||
:end2 *end-pos*
|
||||
:test test)))))
|
||||
(let* ((m (length pattern))
|
||||
(skip (make-array *regex-char-code-limit*
|
||||
:element-type 'fixnum
|
||||
:initial-element m)))
|
||||
(declare (type fixnum m))
|
||||
(loop for k of-type fixnum below m
|
||||
if case-insensitive-p
|
||||
do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
|
||||
(aref skip (char-code (char-downcase (schar pattern k)))) (- m k 1))
|
||||
else
|
||||
do (setf (aref skip (char-code (schar pattern k))) (- m k 1)))
|
||||
(if case-insensitive-p
|
||||
(bmh-matcher-aux :case-insensitive-p t)
|
||||
(bmh-matcher-aux))))
|
||||
|
||||
(defmacro char-searcher-aux (&key case-insensitive-p)
|
||||
"Auxiliary macro used by CREATE-CHAR-SEARCHER."
|
||||
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
||||
`(lambda (start-pos)
|
||||
(declare (type fixnum start-pos))
|
||||
(loop for i of-type fixnum from start-pos below *end-pos*
|
||||
thereis (and (,char-compare (schar *string* i) chr) i)))))
|
||||
|
||||
(defun create-char-searcher (chr case-insensitive-p)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns a function which searches the (special) simple-string
|
||||
*STRING* for the first occurence of the character CHR. The search
|
||||
starts at the position START-POS within *STRING* and stops before
|
||||
*END-POS* is reached. Depending on the second argument the search is
|
||||
case-insensitive or not."
|
||||
(if case-insensitive-p
|
||||
(char-searcher-aux :case-insensitive-p t)
|
||||
(char-searcher-aux)))
|
||||
|
||||
(declaim (inline newline-skipper))
|
||||
|
||||
(defun newline-skipper (start-pos)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (type fixnum start-pos))
|
||||
"Find the next occurence of a character in *STRING* which is behind
|
||||
a #\Newline."
|
||||
(loop for i of-type fixnum from start-pos below *end-pos*
|
||||
thereis (and (char= (schar *string* i) #\Newline)
|
||||
(1+ i))))
|
||||
|
||||
(defmacro insert-advance-fn (advance-fn)
|
||||
"Creates the actual closure returned by CREATE-SCANNER-AUX by
|
||||
replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for
|
||||
ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
||||
(subst
|
||||
advance-fn '(advance-fn-definition)
|
||||
'(lambda (string start end)
|
||||
(block scan
|
||||
;; initialize a couple of special variables used by the
|
||||
;; matchers (see file specials.lisp)
|
||||
(let* ((*string* string)
|
||||
(*start-pos* start)
|
||||
(*end-pos* end)
|
||||
;; we will search forward for END-STRING if this value
|
||||
;; isn't at least as big as POS (see ADVANCE-FN), so it
|
||||
;; is safe to start to the left of *START-POS*; note
|
||||
;; that this value will _never_ be decremented - this
|
||||
;; is crucial to the scanning process
|
||||
(*end-string-pos* (1- *start-pos*))
|
||||
;; the next five will shadow the variables defined by
|
||||
;; DEFPARAMETER; at this point, we don't know if we'll
|
||||
;; actually use them, though
|
||||
(*repeat-counters* *repeat-counters*)
|
||||
(*last-pos-stores* *last-pos-stores*)
|
||||
(*reg-starts* *reg-starts*)
|
||||
(*regs-maybe-start* *regs-maybe-start*)
|
||||
(*reg-ends* *reg-ends*)
|
||||
;; we might be able to optimize the scanning process by
|
||||
;; (virtually) shifting *START-POS* to the right
|
||||
(scan-start-pos *start-pos*)
|
||||
(starts-with-str (if start-string-test
|
||||
(str starts-with)
|
||||
nil))
|
||||
;; we don't need to try further than MAX-END-POS
|
||||
(max-end-pos (- *end-pos* min-len)))
|
||||
(declare (type fixnum scan-start-pos)
|
||||
(type function match-fn))
|
||||
;; definition of ADVANCE-FN will be inserted here by macrology
|
||||
(labels ((advance-fn-definition))
|
||||
(declare (inline advance-fn))
|
||||
(when (plusp rep-num)
|
||||
;; we have at least one REPETITION which needs to count
|
||||
;; the number of repetitions
|
||||
(setq *repeat-counters* (make-array rep-num
|
||||
:initial-element 0
|
||||
:element-type 'fixnum)))
|
||||
(when (plusp zero-length-num)
|
||||
;; we have at least one REPETITION which needs to watch
|
||||
;; out for zero-length repetitions
|
||||
(setq *last-pos-stores* (make-array zero-length-num
|
||||
:initial-element nil)))
|
||||
(when (plusp reg-num)
|
||||
;; we have registers in our regular expression
|
||||
(setq *reg-starts* (make-array reg-num :initial-element nil)
|
||||
*regs-maybe-start* (make-array reg-num :initial-element nil)
|
||||
*reg-ends* (make-array reg-num :initial-element nil)))
|
||||
(when end-anchored-p
|
||||
;; the regular expression has a constant end string which
|
||||
;; is anchored at the very end of the target string
|
||||
;; (perhaps modulo a #\Newline)
|
||||
(let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
|
||||
(declare (type fixnum end-test-pos)
|
||||
(type function end-string-test))
|
||||
(unless (setq *end-string-pos* (funcall end-string-test
|
||||
end-test-pos))
|
||||
(when (and (= 1 (the fixnum end-anchored-p))
|
||||
(char= #\Newline (schar *string* (1- *end-pos*))))
|
||||
;; if we didn't find an end string candidate from
|
||||
;; END-TEST-POS and if a #\Newline at the end is
|
||||
;; allowed we try it again from one position to the
|
||||
;; left
|
||||
(setq *end-string-pos* (funcall end-string-test
|
||||
(1- end-test-pos))))))
|
||||
(unless (and *end-string-pos*
|
||||
(<= *start-pos* *end-string-pos*))
|
||||
;; no end string candidate found, so give up
|
||||
(return-from scan nil))
|
||||
(when end-string-offset
|
||||
;; if the offset of the constant end string from the
|
||||
;; left of the regular expression is known we can start
|
||||
;; scanning further to the right; this is similar to
|
||||
;; what we might do in ADVANCE-FN
|
||||
(setq scan-start-pos (max scan-start-pos
|
||||
(- (the fixnum *end-string-pos*)
|
||||
(the fixnum end-string-offset))))))
|
||||
(cond
|
||||
(start-anchored-p
|
||||
;; we're anchored at the start of the target string,
|
||||
;; so no need to try again after first failure
|
||||
(when (or (/= *start-pos* scan-start-pos)
|
||||
(< max-end-pos *start-pos*))
|
||||
;; if END-STRING-OFFSET has proven that we don't
|
||||
;; need to bother to scan from *START-POS* or if the
|
||||
;; minimal length of the regular expression is
|
||||
;; longer than the target string we give up
|
||||
(return-from scan nil))
|
||||
(when starts-with-str
|
||||
(locally
|
||||
(declare (type fixnum starts-with-len))
|
||||
(cond ((and (case-insensitive-p starts-with)
|
||||
(not (*string*-equal starts-with-str
|
||||
*start-pos*
|
||||
(+ *start-pos*
|
||||
starts-with-len)
|
||||
0 starts-with-len)))
|
||||
;; the regular expression has a
|
||||
;; case-insensitive constant start string
|
||||
;; and we didn't find it
|
||||
(return-from scan nil))
|
||||
((and (not (case-insensitive-p starts-with))
|
||||
(not (*string*= starts-with-str
|
||||
*start-pos*
|
||||
(+ *start-pos* starts-with-len)
|
||||
0 starts-with-len)))
|
||||
;; the regular expression has a
|
||||
;; case-sensitive constant start string
|
||||
;; and we didn't find it
|
||||
(return-from scan nil))
|
||||
(t nil))))
|
||||
(when (and end-string-test
|
||||
(not end-anchored-p))
|
||||
;; the regular expression has a constant end string
|
||||
;; which isn't anchored so we didn't check for it
|
||||
;; already
|
||||
(block end-string-loop
|
||||
;; we temporarily use *END-STRING-POS* as our
|
||||
;; starting position to look for end string
|
||||
;; candidates
|
||||
(setq *end-string-pos* *start-pos*)
|
||||
(loop
|
||||
(unless (setq *end-string-pos*
|
||||
(funcall (the function end-string-test)
|
||||
*end-string-pos*))
|
||||
;; no end string candidate found, so give up
|
||||
(return-from scan nil))
|
||||
(unless end-string-offset
|
||||
;; end string doesn't have an offset so we
|
||||
;; can start scanning now
|
||||
(return-from end-string-loop))
|
||||
(let ((maybe-start-pos (- (the fixnum *end-string-pos*)
|
||||
(the fixnum end-string-offset))))
|
||||
(cond ((= maybe-start-pos *start-pos*)
|
||||
;; offset of end string into regular
|
||||
;; expression matches start anchor -
|
||||
;; fine...
|
||||
(return-from end-string-loop))
|
||||
((and (< maybe-start-pos *start-pos*)
|
||||
(< (+ *end-string-pos* end-string-len) *end-pos*))
|
||||
;; no match but maybe we find another
|
||||
;; one to the right - try again
|
||||
(incf *end-string-pos*))
|
||||
(t
|
||||
;; otherwise give up
|
||||
(return-from scan nil)))))))
|
||||
;; if we got here we scan exactly once
|
||||
(let ((next-pos (funcall match-fn *start-pos*)))
|
||||
(when next-pos
|
||||
(values (if next-pos *start-pos* nil)
|
||||
next-pos
|
||||
*reg-starts*
|
||||
*reg-ends*))))
|
||||
(t
|
||||
(loop for pos = (if starts-with-everything
|
||||
;; don't jump to the next
|
||||
;; #\Newline on the first
|
||||
;; iteration
|
||||
scan-start-pos
|
||||
(advance-fn scan-start-pos))
|
||||
then (advance-fn pos)
|
||||
;; give up if the regular expression can't fit
|
||||
;; into the rest of the target string
|
||||
while (and pos
|
||||
(<= (the fixnum pos) max-end-pos))
|
||||
do (let ((next-pos (funcall match-fn pos)))
|
||||
(when next-pos
|
||||
(return-from scan (values pos
|
||||
next-pos
|
||||
*reg-starts*
|
||||
*reg-ends*)))
|
||||
;; not yet found, increment POS
|
||||
#-cormanlisp (incf (the fixnum pos))
|
||||
#+cormanlisp (incf pos)))))))))
|
||||
:test #'equalp))
|
||||
|
||||
(defun create-scanner-aux (match-fn
|
||||
min-len
|
||||
start-anchored-p
|
||||
starts-with
|
||||
start-string-test
|
||||
end-anchored-p
|
||||
end-string-test
|
||||
end-string-len
|
||||
end-string-offset
|
||||
rep-num
|
||||
zero-length-num
|
||||
reg-num)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
(declare (type fixnum min-len zero-length-num rep-num reg-num))
|
||||
"Auxiliary function to create and return a scanner \(which is
|
||||
actually a closure). Used by CREATE-SCANNER."
|
||||
(let ((starts-with-len (if (typep starts-with 'str)
|
||||
(len starts-with)))
|
||||
(starts-with-everything (typep starts-with 'everything)))
|
||||
(cond
|
||||
;; this COND statement dispatches on the different versions we
|
||||
;; have for ADVANCE-FN and creates different closures for each;
|
||||
;; note that you see only the bodies of ADVANCE-FN below - the
|
||||
;; actual scanner is defined in INSERT-ADVANCE-FN above; (we
|
||||
;; could have done this with closures instead of macrology but
|
||||
;; would have consed a lot more)
|
||||
((and start-string-test end-string-test end-string-offset)
|
||||
;; we know that the regular expression has constant start and
|
||||
;; end strings and we know the end string's offset (from the
|
||||
;; left)
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type fixnum end-string-offset starts-with-len)
|
||||
(type function start-string-test end-string-test))
|
||||
(loop
|
||||
(unless (setq pos (funcall start-string-test pos))
|
||||
;; give up completely if we can't find a start string
|
||||
;; candidate
|
||||
(return-from scan nil))
|
||||
(locally
|
||||
;; from here we know that POS is a FIXNUM
|
||||
(declare (type fixnum pos))
|
||||
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
||||
;; if we already found an end string candidate the
|
||||
;; position of which matches the start string
|
||||
;; candidate we're done
|
||||
(return-from advance-fn pos))
|
||||
(let ((try-pos (+ pos starts-with-len)))
|
||||
;; otherwise try (again) to find an end string
|
||||
;; candidate which starts behind the start string
|
||||
;; candidate
|
||||
(loop
|
||||
(unless (setq *end-string-pos*
|
||||
(funcall end-string-test try-pos))
|
||||
;; no end string candidate found, so give up
|
||||
(return-from scan nil))
|
||||
;; NEW-POS is where we should start scanning
|
||||
;; according to the end string candidate
|
||||
(let ((new-pos (- (the fixnum *end-string-pos*)
|
||||
end-string-offset)))
|
||||
(declare (type fixnum new-pos *end-string-pos*))
|
||||
(cond ((= new-pos pos)
|
||||
;; if POS and NEW-POS are equal then the
|
||||
;; two candidates agree so we're fine
|
||||
(return-from advance-fn pos))
|
||||
((> new-pos pos)
|
||||
;; if NEW-POS is further to the right we
|
||||
;; advance POS and try again, i.e. we go
|
||||
;; back to the start of the outer LOOP
|
||||
(setq pos new-pos)
|
||||
;; this means "return from inner LOOP"
|
||||
(return))
|
||||
(t
|
||||
;; otherwise NEW-POS is smaller than POS,
|
||||
;; so we have to redo the inner LOOP to
|
||||
;; find another end string candidate
|
||||
;; further to the right
|
||||
(setq try-pos (1+ *end-string-pos*))))))))))))
|
||||
((and starts-with-everything end-string-test end-string-offset)
|
||||
;; we know that the regular expression starts with ".*" (which
|
||||
;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
|
||||
;; with a constant end string and we know the end string's
|
||||
;; offset (from the left)
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type fixnum end-string-offset)
|
||||
(type function end-string-test))
|
||||
(loop
|
||||
(unless (setq pos (newline-skipper pos))
|
||||
;; if we can't find a #\Newline we give up immediately
|
||||
(return-from scan nil))
|
||||
(locally
|
||||
;; from here we know that POS is a FIXNUM
|
||||
(declare (type fixnum pos))
|
||||
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
||||
;; if we already found an end string candidate the
|
||||
;; position of which matches the place behind the
|
||||
;; #\Newline we're done
|
||||
(return-from advance-fn pos))
|
||||
(let ((try-pos pos))
|
||||
;; otherwise try (again) to find an end string
|
||||
;; candidate which starts behind the #\Newline
|
||||
(loop
|
||||
(unless (setq *end-string-pos*
|
||||
(funcall end-string-test try-pos))
|
||||
;; no end string candidate found, so we give up
|
||||
(return-from scan nil))
|
||||
;; NEW-POS is where we should start scanning
|
||||
;; according to the end string candidate
|
||||
(let ((new-pos (- (the fixnum *end-string-pos*)
|
||||
end-string-offset)))
|
||||
(declare (type fixnum new-pos *end-string-pos*))
|
||||
(cond ((= new-pos pos)
|
||||
;; if POS and NEW-POS are equal then the
|
||||
;; the end string candidate agrees with
|
||||
;; the #\Newline so we're fine
|
||||
(return-from advance-fn pos))
|
||||
((> new-pos pos)
|
||||
;; if NEW-POS is further to the right we
|
||||
;; advance POS and try again, i.e. we go
|
||||
;; back to the start of the outer LOOP
|
||||
(setq pos new-pos)
|
||||
;; this means "return from inner LOOP"
|
||||
(return))
|
||||
(t
|
||||
;; otherwise NEW-POS is smaller than POS,
|
||||
;; so we have to redo the inner LOOP to
|
||||
;; find another end string candidate
|
||||
;; further to the right
|
||||
(setq try-pos (1+ *end-string-pos*))))))))))))
|
||||
((and start-string-test end-string-test)
|
||||
;; we know that the regular expression has constant start and
|
||||
;; end strings; similar to the first case but we only need to
|
||||
;; check for the end string, it doesn't provide enough
|
||||
;; information to advance POS
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type function start-string-test end-string-test))
|
||||
(unless (setq pos (funcall start-string-test pos))
|
||||
(return-from scan nil))
|
||||
(if (<= (the fixnum pos)
|
||||
(the fixnum *end-string-pos*))
|
||||
(return-from advance-fn pos))
|
||||
(unless (setq *end-string-pos* (funcall end-string-test pos))
|
||||
(return-from scan nil))
|
||||
pos)))
|
||||
((and starts-with-everything end-string-test)
|
||||
;; we know that the regular expression starts with ".*" (which
|
||||
;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
|
||||
;; with a constant end string; similar to the second case but we
|
||||
;; only need to check for the end string, it doesn't provide
|
||||
;; enough information to advance POS
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type function end-string-test))
|
||||
(unless (setq pos (newline-skipper pos))
|
||||
(return-from scan nil))
|
||||
(if (<= (the fixnum pos)
|
||||
(the fixnum *end-string-pos*))
|
||||
(return-from advance-fn pos))
|
||||
(unless (setq *end-string-pos* (funcall end-string-test pos))
|
||||
(return-from scan nil))
|
||||
pos)))
|
||||
(start-string-test
|
||||
;; just check for constant start string candidate
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type function start-string-test))
|
||||
(unless (setq pos (funcall start-string-test pos))
|
||||
(return-from scan nil))
|
||||
pos)))
|
||||
(starts-with-everything
|
||||
;; just advance POS with NEWLINE-SKIPPER
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(unless (setq pos (newline-skipper pos))
|
||||
(return-from scan nil))
|
||||
pos)))
|
||||
(end-string-test
|
||||
;; just check for the next end string candidate if POS has
|
||||
;; advanced beyond the last one
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
(declare (type function end-string-test))
|
||||
(if (<= (the fixnum pos)
|
||||
(the fixnum *end-string-pos*))
|
||||
(return-from advance-fn pos))
|
||||
(unless (setq *end-string-pos* (funcall end-string-test pos))
|
||||
(return-from scan nil))
|
||||
pos)))
|
||||
(t
|
||||
;; not enough optimization information about the regular
|
||||
;; expression to optimize so we just return POS
|
||||
(insert-advance-fn
|
||||
(advance-fn (pos)
|
||||
pos))))))
|
||||
107
specials.lisp
Normal file
107
specials.lisp
Normal file
@ -0,0 +1,107 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/specials.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; globally declared special variables
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
;;; special variables used by the lexer/parser combo
|
||||
|
||||
(defvar *extended-mode-p* nil
|
||||
"Whether the parser will start in extended mode.")
|
||||
(declaim (type boolean *extended-mode-p*))
|
||||
|
||||
;;; special variables used by the SCAN function and the matchers
|
||||
|
||||
(defvar *string* ""
|
||||
"The string which is currently scanned by SCAN.
|
||||
Will always be coerced to a SIMPLE-STRING.")
|
||||
(declaim (type simple-string *string*))
|
||||
|
||||
(defvar *start-pos* 0
|
||||
"Where to start scanning within *STRING*.")
|
||||
(declaim (type fixnum *start-pos*))
|
||||
|
||||
(defvar *real-start-pos* nil
|
||||
"The real start of *STRING*. This is for repeated scans and is only used internally.")
|
||||
(declaim (type (or null fixnum) *real-start-pos*))
|
||||
|
||||
(defvar *end-pos* 0
|
||||
"Where to stop scanning within *STRING*.")
|
||||
(declaim (type fixnum *end-pos*))
|
||||
|
||||
(defvar *reg-starts* (make-array 0)
|
||||
"An array which holds the start positions
|
||||
of the current register candidates.")
|
||||
(declaim (type simple-vector *reg-starts*))
|
||||
|
||||
(defvar *regs-maybe-start* (make-array 0)
|
||||
"An array which holds the next start positions
|
||||
of the current register candidates.")
|
||||
(declaim (type simple-vector *regs-maybe-start*))
|
||||
|
||||
(defvar *reg-ends* (make-array 0)
|
||||
"An array which holds the end positions
|
||||
of the current register candidates.")
|
||||
(declaim (type simple-vector *reg-ends*))
|
||||
|
||||
(defvar *end-string-pos* nil
|
||||
"Start of the next possible end-string candidate.")
|
||||
|
||||
(defvar *rep-num* 0
|
||||
"Counts the number of \"complicated\" repetitions while the matchers
|
||||
are built.")
|
||||
(declaim (type fixnum *rep-num*))
|
||||
|
||||
(defvar *zero-length-num* 0
|
||||
"Counts the number of repetitions the inner regexes of which may
|
||||
have zero-length while the matchers are built.")
|
||||
(declaim (type fixnum *zero-length-num*))
|
||||
|
||||
(defvar *repeat-counters* (make-array 0
|
||||
:initial-element 0
|
||||
:element-type 'fixnum)
|
||||
"An array to keep track of how often
|
||||
repetitive patterns have been tested already.")
|
||||
(declaim (type (array fixnum (*)) *repeat-counters*))
|
||||
|
||||
(defvar *last-pos-stores* (make-array 0)
|
||||
"An array to keep track of the last positions
|
||||
where we saw repetitive patterns.
|
||||
Only used for patterns which might have zero length.")
|
||||
(declaim (type simple-vector *last-pos-stores*))
|
||||
|
||||
(defvar *use-bmh-matchers* t
|
||||
"Whether the scanners created by CREATE-SCANNER should use the \(fast
|
||||
but large) Boyer-Moore-Horspool matchers.")
|
||||
|
||||
(defvar *allow-quoting* nil
|
||||
"Whether the parser should support Perl's \\Q and \\E.")
|
||||
|
||||
(pushnew :cl-ppcre *features*)
|
||||
278
util.lisp
Normal file
278
util.lisp
Normal file
@ -0,0 +1,278 @@
|
||||
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
||||
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/util.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
||||
|
||||
;;; Utility functions and constants dealing with the hash-tables
|
||||
;;; we use to encode character classes
|
||||
|
||||
;;; Hash-tables are treated like sets, i.e. a character C is a member of the
|
||||
;;; hash-table H iff (GETHASH C H) is true.
|
||||
|
||||
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
||||
|
||||
;;; Redistribution and use in source and binary forms, with or without
|
||||
;;; modification, are permitted provided that the following conditions
|
||||
;;; are met:
|
||||
|
||||
;;; * Redistributions of source code must retain the above copyright
|
||||
;;; notice, this list of conditions and the following disclaimer.
|
||||
|
||||
;;; * Redistributions in binary form must reproduce the above
|
||||
;;; copyright notice, this list of conditions and the following
|
||||
;;; disclaimer in the documentation and/or other materials
|
||||
;;; provided with the distribution.
|
||||
|
||||
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
||||
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
(in-package #:cl-ppcre)
|
||||
|
||||
(defmacro with-unique-names ((&rest bindings) &body body)
|
||||
"Syntax: WITH-UNIQUE-NAMES ( { var | (var x) }* ) declaration* form*
|
||||
|
||||
Executes a series of forms with each VAR bound to a fresh,
|
||||
uninterned symbol. The uninterned symbol is as if returned by a call
|
||||
to GENSYM with the string denoted by X - or, if X is not supplied, the
|
||||
string denoted by VAR - as argument.
|
||||
|
||||
The variable bindings created are lexical unless special declarations
|
||||
are specified. The scopes of the name bindings and declarations do not
|
||||
include the Xs.
|
||||
|
||||
The forms are evaluated in order, and the values of all but the last
|
||||
are discarded \(that is, the body is an implicit PROGN)."
|
||||
;; reference implementation posted to comp.lang.lisp as
|
||||
;; <cy3bshuf30f.fsf@ljosa.com> by Vebjorn Ljosa - see also
|
||||
;; <http://www.cliki.net/Common%20Lisp%20Utilities>
|
||||
`(let ,(mapcar #'(lambda (binding)
|
||||
(check-type binding (or cons symbol))
|
||||
(if (consp binding)
|
||||
(destructuring-bind (var x) binding
|
||||
(check-type var symbol)
|
||||
`(,var (gensym ,(etypecase x
|
||||
(symbol (symbol-name x))
|
||||
(character (string x))
|
||||
(string x)))))
|
||||
`(,binding (gensym ,(symbol-name binding)))))
|
||||
bindings)
|
||||
,@body))
|
||||
|
||||
(defmacro rebinding (bindings &body body)
|
||||
"REBINDING ( { var | (var prefix) }* ) form*
|
||||
|
||||
Evaluates a series of forms in the lexical environment that is
|
||||
formed by adding the binding of each VAR to a fresh, uninterned
|
||||
symbol, and the binding of that fresh, uninterned symbol to VAR's
|
||||
original value, i.e., its value in the current lexical environment.
|
||||
|
||||
The uninterned symbol is created as if by a call to GENSYM with the
|
||||
string denoted by PREFIX - or, if PREFIX is not supplied, the string
|
||||
denoted by VAR - as argument.
|
||||
|
||||
The forms are evaluated in order, and the values of all but the last
|
||||
are discarded \(that is, the body is an implicit PROGN)."
|
||||
;; reference implementation posted to comp.lang.lisp as
|
||||
;; <cy3wv0fya0p.fsf@ljosa.com> by Vebjorn Ljosa - see also
|
||||
;; <http://www.cliki.net/Common%20Lisp%20Utilities>
|
||||
(loop for binding in bindings
|
||||
for var = (if (consp binding) (car binding) binding)
|
||||
for name = (gensym)
|
||||
collect `(,name ,var) into renames
|
||||
collect ``(,,var ,,name) into temps
|
||||
finally (return `(let ,renames
|
||||
(with-unique-names ,bindings
|
||||
`(let (,,@temps)
|
||||
,,@body))))))
|
||||
|
||||
(eval-when (:compile-toplevel :execute :load-toplevel)
|
||||
(defvar *regex-char-code-limit* char-code-limit
|
||||
"The upper exclusive bound on the char-codes of characters
|
||||
which can occur in character classes.
|
||||
Change this value BEFORE creating scanners if you don't need
|
||||
the full Unicode support of LW, ACL, or CLISP.")
|
||||
(declaim (type fixnum *regex-char-code-limit*))
|
||||
|
||||
(defun make-char-hash (test)
|
||||
(declare (optimize speed space))
|
||||
"Returns a hash-table of all characters satisfying test."
|
||||
(loop with hash = (make-hash-table)
|
||||
for c of-type fixnum from 0 below char-code-limit
|
||||
for chr = (code-char c)
|
||||
if (and chr (funcall test chr))
|
||||
do (setf (gethash chr hash) t)
|
||||
finally (return hash)))
|
||||
|
||||
(declaim (inline word-char-p))
|
||||
|
||||
(defun word-char-p (chr)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Tests whether a character is a \"word\" character.
|
||||
In the ASCII charset this is equivalent to a-z, A-Z, 0-9, or _,
|
||||
i.e. the same as Perl's [\\w]."
|
||||
(or (alphanumericp chr)
|
||||
(char= chr #\_)))
|
||||
|
||||
(unless (boundp '+whitespace-char-string+)
|
||||
(defconstant +whitespace-char-string+
|
||||
(coerce
|
||||
'(#\Space #\Tab #\Linefeed #\Return #\Page)
|
||||
'string)
|
||||
"A string of all characters which are considered to be whitespace.
|
||||
Same as Perl's [\\s]."))
|
||||
|
||||
(defun whitespacep (chr)
|
||||
(declare (optimize speed space))
|
||||
"Tests whether a character is whitespace,
|
||||
i.e. whether it would match [\\s] in Perl."
|
||||
(find chr +whitespace-char-string+ :test #'char=)))
|
||||
|
||||
;; the following DEFCONSTANT statements are wrapped with
|
||||
;; (UNLESS (BOUNDP ...) ...) to make SBCL happy
|
||||
|
||||
(unless (boundp '+digit-hash+)
|
||||
(defconstant +digit-hash+
|
||||
(make-char-hash (lambda (chr) (char<= #\0 chr #\9)))
|
||||
"Hash-table containing the digits from 0 to 9."))
|
||||
|
||||
(unless (boundp '+word-char-hash+)
|
||||
(defconstant +word-char-hash+
|
||||
(make-char-hash #'word-char-p)
|
||||
"Hash-table containing all \"word\" characters."))
|
||||
|
||||
(unless (boundp '+whitespace-char-hash+)
|
||||
(defconstant +whitespace-char-hash+
|
||||
(make-char-hash #'whitespacep)
|
||||
"Hash-table containing all whitespace characters."))
|
||||
|
||||
(defun merge-hash (hash1 hash2)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns the \"sum\" of two hashes. This is a destructive operation
|
||||
on HASH1."
|
||||
(cond ((> (hash-table-count hash2)
|
||||
*regex-char-code-limit*)
|
||||
;; don't walk through, e.g., the whole +WORD-CHAR-HASH+ if
|
||||
;; the user has set *REGEX-CHAR-CODE-LIMIT* to a lower value
|
||||
(loop for c of-type fixnum from 0 below *regex-char-code-limit*
|
||||
for chr = (code-char c)
|
||||
if (and chr (gethash chr hash2))
|
||||
do (setf (gethash chr hash1) t)))
|
||||
(t
|
||||
(loop for chr being the hash-keys of hash2
|
||||
do (setf (gethash chr hash1) t))))
|
||||
hash1)
|
||||
|
||||
(defun merge-inverted-hash (hash1 hash2)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Returns the \"sum\" of HASH1 and the \"inverse\" of HASH2. This is
|
||||
a destructive operation on HASH1."
|
||||
(loop for c of-type fixnum from 0 below *regex-char-code-limit*
|
||||
for chr = (code-char c)
|
||||
if (and chr (not (gethash chr hash2)))
|
||||
do (setf (gethash chr hash1) t))
|
||||
hash1)
|
||||
|
||||
(defun create-ranges-from-hash (hash &key downcasep)
|
||||
(declare (optimize speed
|
||||
(safety 0)
|
||||
(space 0)
|
||||
(debug 0)
|
||||
(compilation-speed 0)
|
||||
#+:lispworks (hcl:fixnum-safety 0)))
|
||||
"Tries to identify up to three intervals (with respect to CHAR<)
|
||||
which together comprise HASH. Returns NIL if this is not possible.
|
||||
If DOWNCASEP is true it will treat the hash-table as if it represents
|
||||
both the lower-case and the upper-case variants of its members and
|
||||
will only return the respective lower-case intervals."
|
||||
;; discard empty hash-tables
|
||||
(unless (and hash (plusp (hash-table-count hash)))
|
||||
(return-from create-ranges-from-hash nil))
|
||||
(loop with min1 and min2 and min3
|
||||
and max1 and max2 and max3
|
||||
;; loop through all characters in HASH, sorted by CHAR<
|
||||
for chr in (sort (the list
|
||||
(loop for chr being the hash-keys of hash
|
||||
collect (if downcasep
|
||||
(char-downcase chr)
|
||||
chr)))
|
||||
#'char<)
|
||||
for code = (char-code chr)
|
||||
;; MIN1, MAX1, etc. are _exclusive_
|
||||
;; bounds of the intervals identified so far
|
||||
do (cond
|
||||
((not min1)
|
||||
;; this will only happen once, for the first character
|
||||
(setq min1 (1- code)
|
||||
max1 (1+ code)))
|
||||
((<= (the fixnum min1) code (the fixnum max1))
|
||||
;; we're here as long as CHR fits into the first interval
|
||||
(setq min1 (min (the fixnum min1) (1- code))
|
||||
max1 (max (the fixnum max1) (1+ code))))
|
||||
((not min2)
|
||||
;; we need to open a second interval
|
||||
;; this'll also happen only once
|
||||
(setq min2 (1- code)
|
||||
max2 (1+ code)))
|
||||
((<= (the fixnum min2) code (the fixnum max2))
|
||||
;; CHR fits into the second interval
|
||||
(setq min2 (min (the fixnum min2) (1- code))
|
||||
max2 (max (the fixnum max2) (1+ code))))
|
||||
((not min3)
|
||||
;; we need to open the third interval
|
||||
;; happens only once
|
||||
(setq min3 (1- code)
|
||||
max3 (1+ code)))
|
||||
((<= (the fixnum min3) code (the fixnum max3))
|
||||
;; CHR fits into the third interval
|
||||
(setq min3 (min (the fixnum min3) (1- code))
|
||||
max3 (max (the fixnum max3) (1+ code))))
|
||||
(t
|
||||
;; we're out of luck, CHR doesn't fit
|
||||
;; into one of the three intervals
|
||||
(return nil)))
|
||||
;; on success return all bounds
|
||||
;; make them inclusive bounds before returning
|
||||
finally (return (values (code-char (1+ min1))
|
||||
(code-char (1- max1))
|
||||
(and min2 (code-char (1+ min2)))
|
||||
(and max2 (code-char (1- max2)))
|
||||
(and min3 (code-char (1+ min3)))
|
||||
(and max3 (code-char (1- max3)))))))
|
||||
|
||||
(defmacro maybe-coerce-to-simple-string (string)
|
||||
(with-unique-names (=string=)
|
||||
`(let ((,=string= ,string))
|
||||
(cond ((simple-string-p ,=string=)
|
||||
,=string=)
|
||||
(t
|
||||
(coerce ,=string= 'simple-string))))))
|
||||
|
||||
(declaim (inline nsubseq))
|
||||
(defun nsubseq (sequence start &optional (end (length sequence)))
|
||||
"Return a subsequence by pointing to location in original sequence."
|
||||
(make-array (- end start)
|
||||
:element-type (array-element-type sequence)
|
||||
:displaced-to sequence
|
||||
:displaced-index-offset start))
|
||||
Reference in New Issue
Block a user