git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@3581 4281704c-cde7-0310-8518-8e2dc76b1ff0
507 lines
26 KiB
Common Lisp
507 lines
26 KiB
Common Lisp
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.34 2008/07/06 18:12:05 edi Exp $
|
|
|
|
;;; Here the scanner for the actual regex as well as utility scanners
|
|
;;; for the constant start and end strings are created.
|
|
|
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
|
|
|
;;; Redistribution and use in source and binary forms, with or without
|
|
;;; modification, are permitted provided that the following conditions
|
|
;;; are met:
|
|
|
|
;;; * Redistributions of source code must retain the above copyright
|
|
;;; notice, this list of conditions and the following disclaimer.
|
|
|
|
;;; * Redistributions in binary form must reproduce the above
|
|
;;; copyright notice, this list of conditions and the following
|
|
;;; disclaimer in the documentation and/or other materials
|
|
;;; provided with the distribution.
|
|
|
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
(in-package :cl-ppcre)
|
|
|
|
(defmacro bmh-matcher-aux (&key case-insensitive-p)
|
|
"Auxiliary macro used by CREATE-BMH-MATCHER."
|
|
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
|
`(lambda (start-pos)
|
|
(declare (fixnum start-pos))
|
|
(if (or (minusp start-pos)
|
|
(> (the fixnum (+ start-pos m)) *end-pos*))
|
|
nil
|
|
(loop named bmh-matcher
|
|
for k of-type fixnum = (+ start-pos m -1)
|
|
then (+ k (max 1 (aref skip (char-code (schar *string* k)))))
|
|
while (< k *end-pos*)
|
|
do (loop for j of-type fixnum downfrom (1- m)
|
|
for i of-type fixnum downfrom k
|
|
while (and (>= j 0)
|
|
(,char-compare (schar *string* i)
|
|
(schar pattern j)))
|
|
finally (if (minusp j)
|
|
(return-from bmh-matcher (1+ i)))))))))
|
|
|
|
(defun create-bmh-matcher (pattern case-insensitive-p)
|
|
"Returns a Boyer-Moore-Horspool matcher which searches the (special)
|
|
simple-string *STRING* for the first occurence of the substring
|
|
PATTERN. The search starts at the position START-POS within *STRING*
|
|
and stops before *END-POS* is reached. Depending on the second
|
|
argument the search is case-insensitive or not. If the special
|
|
variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
|
|
instead. \(BMH matchers are faster but need much more space.)"
|
|
(declare #.*standard-optimize-settings*)
|
|
;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
|
|
;; details
|
|
(unless *use-bmh-matchers*
|
|
(let ((test (if case-insensitive-p #'char-equal #'char=)))
|
|
(return-from create-bmh-matcher
|
|
(lambda (start-pos)
|
|
(declare (fixnum start-pos))
|
|
(and (not (minusp start-pos))
|
|
(search pattern
|
|
*string*
|
|
:start2 start-pos
|
|
:end2 *end-pos*
|
|
:test test))))))
|
|
(let* ((m (length pattern))
|
|
(skip (make-array *regex-char-code-limit*
|
|
:element-type 'fixnum
|
|
:initial-element m)))
|
|
(declare (fixnum m))
|
|
(loop for k of-type fixnum below m
|
|
if case-insensitive-p
|
|
do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
|
|
(aref skip (char-code (char-downcase (schar pattern k)))) (- m k 1))
|
|
else
|
|
do (setf (aref skip (char-code (schar pattern k))) (- m k 1)))
|
|
(if case-insensitive-p
|
|
(bmh-matcher-aux :case-insensitive-p t)
|
|
(bmh-matcher-aux))))
|
|
|
|
(defmacro char-searcher-aux (&key case-insensitive-p)
|
|
"Auxiliary macro used by CREATE-CHAR-SEARCHER."
|
|
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
|
|
`(lambda (start-pos)
|
|
(declare (fixnum start-pos))
|
|
(and (not (minusp start-pos))
|
|
(loop for i of-type fixnum from start-pos below *end-pos*
|
|
thereis (and (,char-compare (schar *string* i) chr) i))))))
|
|
|
|
(defun create-char-searcher (chr case-insensitive-p)
|
|
"Returns a function which searches the (special) simple-string
|
|
*STRING* for the first occurence of the character CHR. The search
|
|
starts at the position START-POS within *STRING* and stops before
|
|
*END-POS* is reached. Depending on the second argument the search is
|
|
case-insensitive or not."
|
|
(declare #.*standard-optimize-settings*)
|
|
(if case-insensitive-p
|
|
(char-searcher-aux :case-insensitive-p t)
|
|
(char-searcher-aux)))
|
|
|
|
(declaim (inline newline-skipper))
|
|
(defun newline-skipper (start-pos)
|
|
"Finds the next occurence of a character in *STRING* which is behind
|
|
a #\Newline."
|
|
(declare #.*standard-optimize-settings*)
|
|
(declare (fixnum start-pos))
|
|
;; we can start with (1- START-POS) without testing for (PLUSP
|
|
;; START-POS) because we know we'll never call NEWLINE-SKIPPER on
|
|
;; the first iteration
|
|
(loop for i of-type fixnum from (1- start-pos) below *end-pos*
|
|
thereis (and (char= (schar *string* i)
|
|
#\Newline)
|
|
(1+ i))))
|
|
|
|
(defmacro insert-advance-fn (advance-fn)
|
|
"Creates the actual closure returned by CREATE-SCANNER-AUX by
|
|
replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for
|
|
ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
|
|
(subst
|
|
advance-fn '(advance-fn-definition)
|
|
'(lambda (string start end)
|
|
(block scan
|
|
;; initialize a couple of special variables used by the
|
|
;; matchers (see file specials.lisp)
|
|
(let* ((*string* string)
|
|
(*start-pos* start)
|
|
(*end-pos* end)
|
|
;; we will search forward for END-STRING if this value
|
|
;; isn't at least as big as POS (see ADVANCE-FN), so it
|
|
;; is safe to start to the left of *START-POS*; note
|
|
;; that this value will _never_ be decremented - this
|
|
;; is crucial to the scanning process
|
|
(*end-string-pos* (1- *start-pos*))
|
|
;; the next five will shadow the variables defined by
|
|
;; DEFPARAMETER; at this point, we don't know if we'll
|
|
;; actually use them, though
|
|
(*repeat-counters* *repeat-counters*)
|
|
(*last-pos-stores* *last-pos-stores*)
|
|
(*reg-starts* *reg-starts*)
|
|
(*regs-maybe-start* *regs-maybe-start*)
|
|
(*reg-ends* *reg-ends*)
|
|
;; we might be able to optimize the scanning process by
|
|
;; (virtually) shifting *START-POS* to the right
|
|
(scan-start-pos *start-pos*)
|
|
(starts-with-str (if start-string-test
|
|
(str starts-with)
|
|
nil))
|
|
;; we don't need to try further than MAX-END-POS
|
|
(max-end-pos (- *end-pos* min-len)))
|
|
(declare (fixnum scan-start-pos)
|
|
(function match-fn))
|
|
;; definition of ADVANCE-FN will be inserted here by macrology
|
|
(labels ((advance-fn-definition))
|
|
(declare (inline advance-fn))
|
|
(when (plusp rep-num)
|
|
;; we have at least one REPETITION which needs to count
|
|
;; the number of repetitions
|
|
(setq *repeat-counters* (make-array rep-num
|
|
:initial-element 0
|
|
:element-type 'fixnum)))
|
|
(when (plusp zero-length-num)
|
|
;; we have at least one REPETITION which needs to watch
|
|
;; out for zero-length repetitions
|
|
(setq *last-pos-stores* (make-array zero-length-num
|
|
:initial-element nil)))
|
|
(when (plusp reg-num)
|
|
;; we have registers in our regular expression
|
|
(setq *reg-starts* (make-array reg-num :initial-element nil)
|
|
*regs-maybe-start* (make-array reg-num :initial-element nil)
|
|
*reg-ends* (make-array reg-num :initial-element nil)))
|
|
(when end-anchored-p
|
|
;; the regular expression has a constant end string which
|
|
;; is anchored at the very end of the target string
|
|
;; (perhaps modulo a #\Newline)
|
|
(let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
|
|
(declare (fixnum end-test-pos)
|
|
(function end-string-test))
|
|
(unless (setq *end-string-pos* (funcall end-string-test
|
|
end-test-pos))
|
|
(when (and (= 1 (the fixnum end-anchored-p))
|
|
(> *end-pos* scan-start-pos)
|
|
(char= #\Newline (schar *string* (1- *end-pos*))))
|
|
;; if we didn't find an end string candidate from
|
|
;; END-TEST-POS and if a #\Newline at the end is
|
|
;; allowed we try it again from one position to the
|
|
;; left
|
|
(setq *end-string-pos* (funcall end-string-test
|
|
(1- end-test-pos))))))
|
|
(unless (and *end-string-pos*
|
|
(<= *start-pos* *end-string-pos*))
|
|
;; no end string candidate found, so give up
|
|
(return-from scan nil))
|
|
(when end-string-offset
|
|
;; if the offset of the constant end string from the
|
|
;; left of the regular expression is known we can start
|
|
;; scanning further to the right; this is similar to
|
|
;; what we might do in ADVANCE-FN
|
|
(setq scan-start-pos (max scan-start-pos
|
|
(- (the fixnum *end-string-pos*)
|
|
(the fixnum end-string-offset))))))
|
|
(cond
|
|
(start-anchored-p
|
|
;; we're anchored at the start of the target string,
|
|
;; so no need to try again after first failure
|
|
(when (or (/= *start-pos* scan-start-pos)
|
|
(< max-end-pos *start-pos*))
|
|
;; if END-STRING-OFFSET has proven that we don't
|
|
;; need to bother to scan from *START-POS* or if the
|
|
;; minimal length of the regular expression is
|
|
;; longer than the target string we give up
|
|
(return-from scan nil))
|
|
(when starts-with-str
|
|
(locally
|
|
(declare (fixnum starts-with-len))
|
|
(cond ((and (case-insensitive-p starts-with)
|
|
(not (*string*-equal starts-with-str
|
|
*start-pos*
|
|
(+ *start-pos*
|
|
starts-with-len)
|
|
0 starts-with-len)))
|
|
;; the regular expression has a
|
|
;; case-insensitive constant start string
|
|
;; and we didn't find it
|
|
(return-from scan nil))
|
|
((and (not (case-insensitive-p starts-with))
|
|
(not (*string*= starts-with-str
|
|
*start-pos*
|
|
(+ *start-pos* starts-with-len)
|
|
0 starts-with-len)))
|
|
;; the regular expression has a
|
|
;; case-sensitive constant start string
|
|
;; and we didn't find it
|
|
(return-from scan nil))
|
|
(t nil))))
|
|
(when (and end-string-test
|
|
(not end-anchored-p))
|
|
;; the regular expression has a constant end string
|
|
;; which isn't anchored so we didn't check for it
|
|
;; already
|
|
(block end-string-loop
|
|
;; we temporarily use *END-STRING-POS* as our
|
|
;; starting position to look for end string
|
|
;; candidates
|
|
(setq *end-string-pos* *start-pos*)
|
|
(loop
|
|
(unless (setq *end-string-pos*
|
|
(funcall (the function end-string-test)
|
|
*end-string-pos*))
|
|
;; no end string candidate found, so give up
|
|
(return-from scan nil))
|
|
(unless end-string-offset
|
|
;; end string doesn't have an offset so we
|
|
;; can start scanning now
|
|
(return-from end-string-loop))
|
|
(let ((maybe-start-pos (- (the fixnum *end-string-pos*)
|
|
(the fixnum end-string-offset))))
|
|
(cond ((= maybe-start-pos *start-pos*)
|
|
;; offset of end string into regular
|
|
;; expression matches start anchor -
|
|
;; fine...
|
|
(return-from end-string-loop))
|
|
((and (< maybe-start-pos *start-pos*)
|
|
(< (+ *end-string-pos* end-string-len) *end-pos*))
|
|
;; no match but maybe we find another
|
|
;; one to the right - try again
|
|
(incf *end-string-pos*))
|
|
(t
|
|
;; otherwise give up
|
|
(return-from scan nil)))))))
|
|
;; if we got here we scan exactly once
|
|
(let ((next-pos (funcall match-fn *start-pos*)))
|
|
(when next-pos
|
|
(values (if next-pos *start-pos* nil)
|
|
next-pos
|
|
*reg-starts*
|
|
*reg-ends*))))
|
|
(t
|
|
(loop for pos = (if starts-with-everything
|
|
;; don't jump to the next
|
|
;; #\Newline on the first
|
|
;; iteration
|
|
scan-start-pos
|
|
(advance-fn scan-start-pos))
|
|
then (advance-fn pos)
|
|
;; give up if the regular expression can't fit
|
|
;; into the rest of the target string
|
|
while (and pos
|
|
(<= (the fixnum pos) max-end-pos))
|
|
do (let ((next-pos (funcall match-fn pos)))
|
|
(when next-pos
|
|
(return-from scan (values pos
|
|
next-pos
|
|
*reg-starts*
|
|
*reg-ends*)))
|
|
;; not yet found, increment POS
|
|
#-cormanlisp (incf (the fixnum pos))
|
|
#+cormanlisp (incf pos)))))))))
|
|
:test #'equalp))
|
|
|
|
(defun create-scanner-aux (match-fn
|
|
min-len
|
|
start-anchored-p
|
|
starts-with
|
|
start-string-test
|
|
end-anchored-p
|
|
end-string-test
|
|
end-string-len
|
|
end-string-offset
|
|
rep-num
|
|
zero-length-num
|
|
reg-num)
|
|
"Auxiliary function to create and return a scanner \(which is
|
|
actually a closure). Used by CREATE-SCANNER."
|
|
(declare #.*standard-optimize-settings*)
|
|
(declare (fixnum min-len zero-length-num rep-num reg-num))
|
|
(let ((starts-with-len (if (typep starts-with 'str)
|
|
(len starts-with)))
|
|
(starts-with-everything (typep starts-with 'everything)))
|
|
(cond
|
|
;; this COND statement dispatches on the different versions we
|
|
;; have for ADVANCE-FN and creates different closures for each;
|
|
;; note that you see only the bodies of ADVANCE-FN below - the
|
|
;; actual scanner is defined in INSERT-ADVANCE-FN above; (we
|
|
;; could have done this with closures instead of macrology but
|
|
;; would have consed a lot more)
|
|
((and start-string-test end-string-test end-string-offset)
|
|
;; we know that the regular expression has constant start and
|
|
;; end strings and we know the end string's offset (from the
|
|
;; left)
|
|
(insert-advance-fn
|
|
(advance-fn (pos)
|
|
(declare (fixnum end-string-offset starts-with-len)
|
|
(function start-string-test end-string-test))
|
|
(loop
|
|
(unless (setq pos (funcall start-string-test pos))
|
|
;; give up completely if we can't find a start string
|
|
;; candidate
|
|
(return-from scan nil))
|
|
(locally
|
|
;; from here we know that POS is a FIXNUM
|
|
(declare (fixnum pos))
|
|
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
|
;; if we already found an end string candidate the
|
|
;; position of which matches the start string
|
|
;; candidate we're done
|
|
(return-from advance-fn pos))
|
|
(let ((try-pos (+ pos starts-with-len)))
|
|
;; otherwise try (again) to find an end string
|
|
;; candidate which starts behind the start string
|
|
;; candidate
|
|
(loop
|
|
(unless (setq *end-string-pos*
|
|
(funcall end-string-test try-pos))
|
|
;; no end string candidate found, so give up
|
|
(return-from scan nil))
|
|
;; NEW-POS is where we should start scanning
|
|
;; according to the end string candidate
|
|
(let ((new-pos (- (the fixnum *end-string-pos*)
|
|
end-string-offset)))
|
|
(declare (fixnum new-pos *end-string-pos*))
|
|
(cond ((= new-pos pos)
|
|
;; if POS and NEW-POS are equal then the
|
|
;; two candidates agree so we're fine
|
|
(return-from advance-fn pos))
|
|
((> new-pos pos)
|
|
;; if NEW-POS is further to the right we
|
|
;; advance POS and try again, i.e. we go
|
|
;; back to the start of the outer LOOP
|
|
(setq pos new-pos)
|
|
;; this means "return from inner LOOP"
|
|
(return))
|
|
(t
|
|
;; otherwise NEW-POS is smaller than POS,
|
|
;; so we have to redo the inner LOOP to
|
|
;; find another end string candidate
|
|
;; further to the right
|
|
(setq try-pos (1+ *end-string-pos*))))))))))))
|
|
((and starts-with-everything end-string-test end-string-offset)
|
|
;; we know that the regular expression starts with ".*" (which
|
|
;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
|
|
;; with a constant end string and we know the end string's
|
|
;; offset (from the left)
|
|
(insert-advance-fn
|
|
(advance-fn (pos)
|
|
(declare (fixnum end-string-offset)
|
|
(function end-string-test))
|
|
(loop
|
|
(unless (setq pos (newline-skipper pos))
|
|
;; if we can't find a #\Newline we give up immediately
|
|
(return-from scan nil))
|
|
(locally
|
|
;; from here we know that POS is a FIXNUM
|
|
(declare (fixnum pos))
|
|
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
|
|
;; if we already found an end string candidate the
|
|
;; position of which matches the place behind the
|
|
;; #\Newline we're done
|
|
(return-from advance-fn pos))
|
|
(let ((try-pos pos))
|
|
;; otherwise try (again) to find an end string
|
|
;; candidate which starts behind the #\Newline
|
|
(loop
|
|
(unless (setq *end-string-pos*
|
|
(funcall end-string-test try-pos))
|
|
;; no end string candidate found, so we give up
|
|
(return-from scan nil))
|
|
;; NEW-POS is where we should start scanning
|
|
;; according to the end string candidate
|
|
(let ((new-pos (- (the fixnum *end-string-pos*)
|
|
end-string-offset)))
|
|
(declare (fixnum new-pos *end-string-pos*))
|
|
(cond ((= new-pos pos)
|
|
;; if POS and NEW-POS are equal then the
|
|
;; the end string candidate agrees with
|
|
;; the #\Newline so we're fine
|
|
(return-from advance-fn pos))
|
|
((> new-pos pos)
|
|
;; if NEW-POS is further to the right we
|
|
;; advance POS and try again, i.e. we go
|
|
;; back to the start of the outer LOOP
|
|
(setq pos new-pos)
|
|
;; this means "return from inner LOOP"
|
|
(return))
|
|
(t
|
|
;; otherwise NEW-POS is smaller than POS,
|
|
;; so we have to redo the inner LOOP to
|
|
;; find another end string candidate
|
|
;; further to the right
|
|
(setq try-pos (1+ *end-string-pos*))))))))))))
|
|
((and start-string-test end-string-test)
|
|
;; we know that the regular expression has constant start and
|
|
;; end strings; similar to the first case but we only need to
|
|
;; check for the end string, it doesn't provide enough
|
|
;; information to advance POS
|
|
(insert-advance-fn
|
|
(advance-fn (pos)
|
|
(declare (function start-string-test end-string-test))
|
|
(unless (setq pos (funcall start-string-test pos))
|
|
(return-from scan nil))
|
|
(if (<= (the fixnum pos)
|
|
(the fixnum *end-string-pos*))
|
|
(return-from advance-fn pos))
|
|
(unless (setq *end-string-pos* (funcall end-string-test pos))
|
|
(return-from scan nil))
|
|
pos)))
|
|
((and starts-with-everything end-string-test)
|
|
;; we know that the regular expression starts with ".*" (which
|
|
;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
|
|
;; with a constant end string; similar to the second case but we
|
|
;; only need to check for the end string, it doesn't provide
|
|
;; enough information to advance POS
|
|
(insert-advance-fn
|
|
(advance-fn (pos)
|
|
(declare (function end-string-test))
|
|
(unless (setq pos (newline-skipper pos))
|
|
(return-from scan nil))
|
|
(if (<= (the fixnum pos)
|
|
(the fixnum *end-string-pos*))
|
|
(return-from advance-fn pos))
|
|
(unless (setq *end-string-pos* (funcall end-string-test pos))
|
|
(return-from scan nil))
|
|
pos)))
|
|
(start-string-test
|
|
;; just check for constant start string candidate
|
|
(insert-advance-fn
|
|
(advance-fn (pos)
|
|
(declare (function start-string-test))
|
|
(unless (setq pos (funcall start-string-test pos))
|
|
(return-from scan nil))
|
|
pos)))
|
|
(starts-with-everything
|
|
;; just advance POS with NEWLINE-SKIPPER
|
|
(insert-advance-fn
|
|
(advance-fn (pos)
|
|
(unless (setq pos (newline-skipper pos))
|
|
(return-from scan nil))
|
|
pos)))
|
|
(end-string-test
|
|
;; just check for the next end string candidate if POS has
|
|
;; advanced beyond the last one
|
|
(insert-advance-fn
|
|
(advance-fn (pos)
|
|
(declare (function end-string-test))
|
|
(if (<= (the fixnum pos)
|
|
(the fixnum *end-string-pos*))
|
|
(return-from advance-fn pos))
|
|
(unless (setq *end-string-pos* (funcall end-string-test pos))
|
|
(return-from scan nil))
|
|
pos)))
|
|
(t
|
|
;; not enough optimization information about the regular
|
|
;; expression to optimize so we just return POS
|
|
(insert-advance-fn
|
|
(advance-fn (pos)
|
|
pos))))))
|