git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@12 4281704c-cde7-0310-8518-8e2dc76b1ff0
752 lines
28 KiB
Common Lisp
752 lines
28 KiB
Common Lisp
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
|
;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/regex-class.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $
|
|
|
|
;;; This file defines the REGEX class and some utility methods for
|
|
;;; this class. REGEX objects are used to represent the (transformed)
|
|
;;; parse trees internally
|
|
|
|
;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved.
|
|
|
|
;;; Redistribution and use in source and binary forms, with or without
|
|
;;; modification, are permitted provided that the following conditions
|
|
;;; are met:
|
|
|
|
;;; * Redistributions of source code must retain the above copyright
|
|
;;; notice, this list of conditions and the following disclaimer.
|
|
|
|
;;; * Redistributions in binary form must reproduce the above
|
|
;;; copyright notice, this list of conditions and the following
|
|
;;; disclaimer in the documentation and/or other materials
|
|
;;; provided with the distribution.
|
|
|
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
(in-package #:cl-ppcre)
|
|
|
|
(locally
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(defclass regex ()
|
|
()
|
|
(:documentation "The REGEX base class. All other classes inherit from this one."))
|
|
|
|
|
|
(defclass seq (regex)
|
|
((elements :initarg :elements
|
|
:accessor elements
|
|
:type cons
|
|
:documentation "A list of REGEX objects."))
|
|
(:documentation "SEQ objects represents sequences of
|
|
regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)"))
|
|
|
|
(defclass alternation (regex)
|
|
((choices :initarg :choices
|
|
:accessor choices
|
|
:type cons
|
|
:documentation "A list of REGEX objects"))
|
|
(:documentation "ALTERNATION objects represent alternations of
|
|
regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))
|
|
|
|
(defclass lookahead (regex)
|
|
((regex :initarg :regex
|
|
:accessor regex
|
|
:documentation "The REGEX object we're checking.")
|
|
(positivep :initarg :positivep
|
|
:reader positivep
|
|
:documentation "Whether this assertion is positive."))
|
|
(:documentation "LOOKAHEAD objects represent look-ahead assertions."))
|
|
|
|
(defclass lookbehind (regex)
|
|
((regex :initarg :regex
|
|
:accessor regex
|
|
:documentation "The REGEX object we're checking.")
|
|
(positivep :initarg :positivep
|
|
:reader positivep
|
|
:documentation "Whether this assertion is positive.")
|
|
(len :initarg :len
|
|
:accessor len
|
|
:type fixnum
|
|
:documentation "The (fixed) length of the enclosed regex."))
|
|
(:documentation "LOOKBEHIND objects represent look-behind assertions."))
|
|
|
|
(defclass repetition (regex)
|
|
((regex :initarg :regex
|
|
:accessor regex
|
|
:documentation "The REGEX that's repeated.")
|
|
(greedyp :initarg :greedyp
|
|
:reader greedyp
|
|
:documentation "Whether the repetition is greedy.")
|
|
(minimum :initarg :minimum
|
|
:accessor minimum
|
|
:type fixnum
|
|
:documentation "The minimal number of repetitions.")
|
|
(maximum :initarg :maximum
|
|
:accessor maximum
|
|
:documentation "The maximal number of repetitions.
|
|
Can be NIL for unbounded.")
|
|
(min-len :initarg :min-len
|
|
:reader min-len
|
|
:documentation "The minimal length of the enclosed regex.")
|
|
(len :initarg :len
|
|
:reader len
|
|
:documentation "The length of the enclosed regex. NIL if unknown.")
|
|
(min-rest :initform 0
|
|
:accessor min-rest
|
|
:type fixnum
|
|
:documentation "The minimal number of characters which must
|
|
appear after this repetition.")
|
|
(contains-register-p :initarg :contains-register-p
|
|
:reader contains-register-p
|
|
:documentation "If the regex contains a register."))
|
|
(:documentation "REPETITION objects represent repetitions of regexes."))
|
|
|
|
(defclass register (regex)
|
|
((regex :initarg :regex
|
|
:accessor regex
|
|
:documentation "The inner regex.")
|
|
(num :initarg :num
|
|
:reader num
|
|
:type fixnum
|
|
:documentation "The number of this register, starting from 0.
|
|
This is the index into *REGS-START* and *REGS-END*."))
|
|
(:documentation "REGISTER objects represent register groups."))
|
|
|
|
(defclass standalone (regex)
|
|
((regex :initarg :regex
|
|
:accessor regex
|
|
:documentation "The inner regex."))
|
|
(:documentation "A standalone regular expression."))
|
|
|
|
(defclass back-reference (regex)
|
|
((num :initarg :num
|
|
:accessor num
|
|
:type fixnum
|
|
:documentation "The number of the register this reference refers to.")
|
|
(case-insensitive-p :initarg :case-insensitive-p
|
|
:reader case-insensitive-p
|
|
:documentation "Whether we check case-insensitively."))
|
|
(:documentation "BACK-REFERENCE objects represent backreferences."))
|
|
|
|
(defclass char-class (regex)
|
|
((hash :initarg :hash
|
|
:reader hash
|
|
:type (or hash-table null)
|
|
:documentation "A hash table the keys of which are the characters;
|
|
the values are always T.")
|
|
(case-insensitive-p :initarg :case-insensitive-p
|
|
:reader case-insensitive-p
|
|
:documentation "If the char class case-insensitive.")
|
|
(invertedp :initarg :invertedp
|
|
:reader invertedp
|
|
:documentation "Whether we mean the inverse of the char class.")
|
|
(word-char-class-p :initarg :word-char-class-p
|
|
:reader word-char-class-p
|
|
:documentation "Whether this CHAR CLASS
|
|
represents the special class WORD-CHAR-CLASS."))
|
|
(:documentation "CHAR-CLASS objects represent character classes."))
|
|
|
|
(defclass str (regex)
|
|
((str :initarg :str
|
|
:accessor str
|
|
:type string
|
|
:documentation "The actual string.")
|
|
(len :initform 0
|
|
:accessor len
|
|
:type fixnum
|
|
:documentation "The length of the string.")
|
|
(case-insensitive-p :initarg :case-insensitive-p
|
|
:reader case-insensitive-p
|
|
:documentation "If we match case-insensitively.")
|
|
(offset :initform nil
|
|
:accessor offset
|
|
:documentation "Offset from the left of the whole parse tree.
|
|
The first regex has offset 0.
|
|
NIL if unknown, i.e. behind a variable-length regex.")
|
|
(skip :initform nil
|
|
:initarg :skip
|
|
:accessor skip
|
|
:documentation "If we can avoid testing for this string
|
|
because the SCAN function has done this already.")
|
|
(start-of-end-string-p :initform nil
|
|
:accessor start-of-end-string-p
|
|
:documentation "If this is the unique STR which
|
|
starts END-STRING (a slot of MATCHER)."))
|
|
(:documentation "STR objects represent string."))
|
|
|
|
(defclass anchor (regex)
|
|
((startp :initarg :startp
|
|
:reader startp
|
|
:documentation "Whether this is a \"start anchor\".")
|
|
(multi-line-p :initarg :multi-line-p
|
|
:reader multi-line-p
|
|
:documentation "Whether we're in multi-line mode,
|
|
i.e. whether each #\\Newline is surrounded by anchors.")
|
|
(no-newline-p :initarg :no-newline-p
|
|
:reader no-newline-p
|
|
:documentation "Whether we ignore #\\Newline at the end."))
|
|
(:documentation "ANCHOR objects represent anchors like \"^\" or \"$\"."))
|
|
|
|
(defclass everything (regex)
|
|
((single-line-p :initarg :single-line-p
|
|
:reader single-line-p
|
|
:documentation "Whether we're in single-line mode,
|
|
i.e. whether we also match #\\Newline."))
|
|
(:documentation "EVERYTHING objects represent regexes matching
|
|
\"everything\", i.e. dots."))
|
|
|
|
(defclass word-boundary (regex)
|
|
((negatedp :initarg :negatedp
|
|
:reader negatedp
|
|
:documentation "Whether we mean the opposite,
|
|
i.e. no word-boundary."))
|
|
(:documentation "WORD-BOUNDARY objects represent word-boundary assertions."))
|
|
|
|
(defclass branch (regex)
|
|
((test :initarg :test
|
|
:accessor test
|
|
:documentation "The test of this branch, one of LOOKAHEAD,
|
|
LOOKBEHIND, or a number.")
|
|
(then-regex :initarg :then-regex
|
|
:accessor then-regex
|
|
:documentation "The regex that's to be matched if the
|
|
test succeeds.")
|
|
(else-regex :initarg :else-regex
|
|
:initform (make-instance 'void)
|
|
:accessor else-regex
|
|
:documentation "The regex that's to be matched if the
|
|
test fails."))
|
|
(:documentation "BRANCH objects represent Perl's conditional regular
|
|
expressions."))
|
|
|
|
(defclass void (regex)
|
|
()
|
|
(:documentation "VOID objects represent empty regular expressions.")))
|
|
|
|
(declaim (ftype (function (t) simple-string) str))
|
|
|
|
;;; The following four methods allow a VOID object to behave like a
|
|
;;; zero-length STR object (only readers needed)
|
|
|
|
(defmethod initialize-instance :after ((str str) &rest init-args)
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(declare (ignore init-args))
|
|
"Automatically computes the length of a STR after initialization."
|
|
(let ((str-slot (slot-value str 'str)))
|
|
(unless (typep str-slot 'simple-string)
|
|
(setf (slot-value str 'str) (coerce str-slot 'simple-string))))
|
|
(setf (len str) (length (str str))))
|
|
|
|
(defmethod len ((void void))
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
0)
|
|
|
|
(defmethod str ((void void))
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
"")
|
|
|
|
(defmethod skip ((void void))
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
nil)
|
|
|
|
(defmethod start-of-end-string-p ((void void))
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
nil)
|
|
|
|
(defgeneric case-mode (regex old-case-mode)
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
|
|
Returns a keyword denoting the case-(in)sensitivity of a STR or its
|
|
second argument if the STR has length 0. Returns NIL for REGEX objects
|
|
which are not of type STR."))
|
|
|
|
(defmethod case-mode ((str str) old-case-mode)
|
|
(cond ((zerop (len str))
|
|
old-case-mode)
|
|
((case-insensitive-p str)
|
|
:case-insensitive)
|
|
(t
|
|
:case-sensitive)))
|
|
|
|
(defmethod case-mode ((regex regex) old-case-mode)
|
|
(declare (ignore old-case-mode))
|
|
nil)
|
|
|
|
(defgeneric copy-regex (regex)
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(:documentation "Implements a deep copy of a REGEX object."))
|
|
|
|
(defmethod copy-regex ((anchor anchor))
|
|
(make-instance 'anchor
|
|
:startp (startp anchor)
|
|
:multi-line-p (multi-line-p anchor)
|
|
:no-newline-p (no-newline-p anchor)))
|
|
|
|
(defmethod copy-regex ((everything everything))
|
|
(make-instance 'everything
|
|
:single-line-p (single-line-p everything)))
|
|
|
|
(defmethod copy-regex ((word-boundary word-boundary))
|
|
(make-instance 'word-boundary
|
|
:negatedp (negatedp word-boundary)))
|
|
|
|
(defmethod copy-regex ((void void))
|
|
(make-instance 'void))
|
|
|
|
(defmethod copy-regex ((lookahead lookahead))
|
|
(make-instance 'lookahead
|
|
:regex (copy-regex (regex lookahead))
|
|
:positivep (positivep lookahead)))
|
|
|
|
(defmethod copy-regex ((seq seq))
|
|
(make-instance 'seq
|
|
:elements (mapcar #'copy-regex (elements seq))))
|
|
|
|
(defmethod copy-regex ((alternation alternation))
|
|
(make-instance 'alternation
|
|
:choices (mapcar #'copy-regex (choices alternation))))
|
|
|
|
(defmethod copy-regex ((branch branch))
|
|
(with-slots ((test test))
|
|
branch
|
|
(make-instance 'branch
|
|
:test (if (typep test 'regex)
|
|
(copy-regex test)
|
|
test)
|
|
:then-regex (copy-regex (then-regex branch))
|
|
:else-regex (copy-regex (else-regex branch)))))
|
|
|
|
(defmethod copy-regex ((lookbehind lookbehind))
|
|
(make-instance 'lookbehind
|
|
:regex (copy-regex (regex lookbehind))
|
|
:positivep (positivep lookbehind)
|
|
:len (len lookbehind)))
|
|
|
|
(defmethod copy-regex ((repetition repetition))
|
|
(make-instance 'repetition
|
|
:regex (copy-regex (regex repetition))
|
|
:greedyp (greedyp repetition)
|
|
:minimum (minimum repetition)
|
|
:maximum (maximum repetition)
|
|
:min-len (min-len repetition)
|
|
:len (len repetition)
|
|
:contains-register-p (contains-register-p repetition)))
|
|
|
|
(defmethod copy-regex ((register register))
|
|
(make-instance 'register
|
|
:regex (copy-regex (regex register))
|
|
:num (num register)))
|
|
|
|
(defmethod copy-regex ((standalone standalone))
|
|
(make-instance 'standalone
|
|
:regex (copy-regex (regex standalone))))
|
|
|
|
(defmethod copy-regex ((back-reference back-reference))
|
|
(make-instance 'back-reference
|
|
:num (num back-reference)
|
|
:case-insensitive-p (case-insensitive-p back-reference)))
|
|
|
|
(defmethod copy-regex ((char-class char-class))
|
|
(make-instance 'char-class
|
|
:hash (hash char-class)
|
|
:case-insensitive-p (case-insensitive-p char-class)
|
|
:invertedp (invertedp char-class)
|
|
:word-char-class-p (word-char-class-p char-class)))
|
|
|
|
(defmethod copy-regex ((str str))
|
|
(make-instance 'str
|
|
:str (str str)
|
|
:case-insensitive-p (case-insensitive-p str)))
|
|
|
|
;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
|
|
;;; wrapped into one function. Maybe in the next release...
|
|
|
|
;;; Further note that this function is used by CONVERT to factor out
|
|
;;; complicated repetitions, i.e. cases like
|
|
;;; (a)* -> (?:a*(a))?
|
|
;;; This won't work for, say,
|
|
;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
|
|
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
|
|
|
|
(defgeneric remove-registers (regex)
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
|
|
optionally removes embedded REGISTER objects if possible and if the
|
|
special variable REMOVE-REGISTERS-P is true."))
|
|
|
|
(defmethod remove-registers ((register register))
|
|
(declare (special remove-registers-p reg-seen))
|
|
(cond (remove-registers-p
|
|
(remove-registers (regex register)))
|
|
(t
|
|
;; mark REG-SEEN as true so enclosing REPETITION objects
|
|
;; (see method below) know if they contain a register or not
|
|
(setq reg-seen t)
|
|
(copy-regex register))))
|
|
|
|
(defmethod remove-registers ((repetition repetition))
|
|
(let* (reg-seen
|
|
(inner-regex (remove-registers (regex repetition))))
|
|
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
|
|
;; (REGEX REPETITION) contains a REGISTER
|
|
(declare (special reg-seen))
|
|
(make-instance 'repetition
|
|
:regex inner-regex
|
|
:greedyp (greedyp repetition)
|
|
:minimum (minimum repetition)
|
|
:maximum (maximum repetition)
|
|
:min-len (min-len repetition)
|
|
:len (len repetition)
|
|
:contains-register-p reg-seen)))
|
|
|
|
(defmethod remove-registers ((standalone standalone))
|
|
(make-instance 'standalone
|
|
:regex (remove-registers (regex standalone))))
|
|
|
|
(defmethod remove-registers ((lookahead lookahead))
|
|
(make-instance 'lookahead
|
|
:regex (remove-registers (regex lookahead))
|
|
:positivep (positivep lookahead)))
|
|
|
|
(defmethod remove-registers ((lookbehind lookbehind))
|
|
(make-instance 'lookbehind
|
|
:regex (remove-registers (regex lookbehind))
|
|
:positivep (positivep lookbehind)
|
|
:len (len lookbehind)))
|
|
|
|
(defmethod remove-registers ((branch branch))
|
|
(with-slots ((test test))
|
|
branch
|
|
(make-instance 'branch
|
|
:test (if (typep test 'regex)
|
|
(remove-registers test)
|
|
test)
|
|
:then-regex (remove-registers (then-regex branch))
|
|
:else-regex (remove-registers (else-regex branch)))))
|
|
|
|
(defmethod remove-registers ((alternation alternation))
|
|
(declare (special remove-registers-p))
|
|
;; an ALTERNATION, so we can't remove REGISTER objects further down
|
|
(setq remove-registers-p nil)
|
|
(copy-regex alternation))
|
|
|
|
(defmethod remove-registers ((regex regex))
|
|
(copy-regex regex))
|
|
|
|
(defmethod remove-registers ((seq seq))
|
|
(make-instance 'seq
|
|
:elements (mapcar #'remove-registers (elements seq))))
|
|
|
|
(defgeneric everythingp (regex)
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(:documentation "Returns an EVERYTHING object if REGEX is equivalent
|
|
to this object, otherwise NIL. So, \"(.){1}\" would return true
|
|
(i.e. the object corresponding to \".\", for example."))
|
|
|
|
(defmethod everythingp ((seq seq))
|
|
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
|
|
;; due to the parsing process
|
|
(let ((cleaned-elements (remove-if #'(lambda (element)
|
|
(typep element 'void))
|
|
(elements seq))))
|
|
(and (= 1 (length cleaned-elements))
|
|
(everythingp (first cleaned-elements)))))
|
|
|
|
(defmethod everythingp ((alternation alternation))
|
|
(with-slots ((choices choices))
|
|
alternation
|
|
(and (= 1 (length choices))
|
|
;; this is unlikely to happen for human-generated regexes,
|
|
;; but machine-generated ones might look like this
|
|
(everythingp (first choices)))))
|
|
|
|
(defmethod everythingp ((repetition repetition))
|
|
(with-slots ((maximum maximum)
|
|
(minimum minimum)
|
|
(regex regex))
|
|
repetition
|
|
(and maximum
|
|
(= 1 minimum maximum)
|
|
;; treat "<regex>{1,1}" like "<regex>"
|
|
(everythingp regex))))
|
|
|
|
(defmethod everythingp ((register register))
|
|
(everythingp (regex register)))
|
|
|
|
(defmethod everythingp ((standalone standalone))
|
|
(everythingp (regex standalone)))
|
|
|
|
(defmethod everythingp ((everything everything))
|
|
everything)
|
|
|
|
(defmethod everythingp ((regex regex))
|
|
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
|
|
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, and WORD-BOUNDARY
|
|
nil)
|
|
|
|
(defgeneric regex-length (regex)
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
|
|
|
|
(defmethod regex-length ((seq seq))
|
|
;; simply add all inner lengths unless one of them is NIL
|
|
(loop for sub-regex in (elements seq)
|
|
for len = (regex-length sub-regex)
|
|
if (not len) do (return nil)
|
|
sum len))
|
|
|
|
(defmethod regex-length ((alternation alternation))
|
|
;; only return a true value if all inner lengths are non-NIL and
|
|
;; mutually equal
|
|
(loop for sub-regex in (choices alternation)
|
|
for old-len = nil then len
|
|
for len = (regex-length sub-regex)
|
|
if (or (not len)
|
|
(and old-len (/= len old-len))) do (return nil)
|
|
finally (return len)))
|
|
|
|
(defmethod regex-length ((branch branch))
|
|
;; only return a true value if both alternations have a length and
|
|
;; if they're equal
|
|
(let ((then-length (regex-length (then-regex branch))))
|
|
(and then-length
|
|
(eql then-length (regex-length (else-regex branch)))
|
|
then-length)))
|
|
|
|
(defmethod regex-length ((repetition repetition))
|
|
;; we can only compute the length of a REPETITION object if the
|
|
;; number of repetitions is fixed; note that we don't call
|
|
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
|
|
;; always set correctly
|
|
(with-slots ((len len)
|
|
(minimum minimum)
|
|
(maximum maximum))
|
|
repetition
|
|
(if (and len
|
|
(eq minimum maximum))
|
|
(* minimum len)
|
|
nil)))
|
|
|
|
(defmethod regex-length ((register register))
|
|
(regex-length (regex register)))
|
|
|
|
(defmethod regex-length ((standalone standalone))
|
|
(regex-length (regex standalone)))
|
|
|
|
(defmethod regex-length ((back-reference back-reference))
|
|
;; with enough effort we could possibly do better here, but
|
|
;; currently we just give up and return NIL
|
|
nil)
|
|
|
|
(defmethod regex-length ((char-class char-class))
|
|
1)
|
|
|
|
(defmethod regex-length ((everything everything))
|
|
1)
|
|
|
|
(defmethod regex-length ((str str))
|
|
(len str))
|
|
|
|
(defmethod regex-length ((regex regex))
|
|
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
|
;; WORD-BOUNDARY (which all have zero-length)
|
|
0)
|
|
|
|
(defgeneric regex-min-length (regex)
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(:documentation "Returns the minimal length of REGEX."))
|
|
|
|
(defmethod regex-min-length ((seq seq))
|
|
;; simply add all inner minimal lengths
|
|
(loop for sub-regex in (elements seq)
|
|
for len = (regex-min-length sub-regex)
|
|
sum len))
|
|
|
|
(defmethod regex-min-length ((alternation alternation))
|
|
;; minimal length of an alternation is the minimal length of the
|
|
;; "shortest" element
|
|
(loop for sub-regex in (choices alternation)
|
|
for len = (regex-min-length sub-regex)
|
|
minimize len))
|
|
|
|
(defmethod regex-min-length ((branch branch))
|
|
;; minimal length of both alternations
|
|
(min (regex-min-length (then-regex branch))
|
|
(regex-min-length (else-regex branch))))
|
|
|
|
(defmethod regex-min-length ((repetition repetition))
|
|
;; obviously the product of the inner minimal length and the minimal
|
|
;; number of repetitions
|
|
(* (minimum repetition) (min-len repetition)))
|
|
|
|
(defmethod regex-min-length ((register register))
|
|
(regex-min-length (regex register)))
|
|
|
|
(defmethod regex-min-length ((standalone standalone))
|
|
(regex-min-length (regex standalone)))
|
|
|
|
(defmethod regex-min-length ((char-class char-class))
|
|
1)
|
|
|
|
(defmethod regex-min-length ((everything everything))
|
|
1)
|
|
|
|
(defmethod regex-min-length ((str str))
|
|
(len str))
|
|
|
|
(defmethod regex-min-length ((regex regex))
|
|
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
|
|
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
|
|
0)
|
|
|
|
(defgeneric compute-offsets (regex start-pos)
|
|
(declare (optimize speed
|
|
(safety 0)
|
|
(space 0)
|
|
(debug 0)
|
|
(compilation-speed 0)
|
|
#+:lispworks (hcl:fixnum-safety 0)))
|
|
(:documentation "Returns the offset the following regex would have
|
|
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
|
|
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
|
|
slots of STR objects further down the tree."))
|
|
|
|
;; note that we're actually only interested in the offset of
|
|
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
|
|
;; can stop at variable-length alternations and don't need to descend
|
|
;; into repetitions
|
|
|
|
(defmethod compute-offsets ((seq seq) start-pos)
|
|
(loop for element in (elements seq)
|
|
;; advance offset argument for next call while looping through
|
|
;; the elements
|
|
for pos = start-pos then curr-offset
|
|
for curr-offset = (compute-offsets element pos)
|
|
while curr-offset
|
|
finally (return curr-offset)))
|
|
|
|
(defmethod compute-offsets ((alternation alternation) start-pos)
|
|
(loop for choice in (choices alternation)
|
|
for old-offset = nil then curr-offset
|
|
for curr-offset = (compute-offsets choice start-pos)
|
|
;; we stop immediately if two alternations don't result in the
|
|
;; same offset
|
|
if (or (not curr-offset)
|
|
(and old-offset (/= curr-offset old-offset)))
|
|
do (return nil)
|
|
finally (return curr-offset)))
|
|
|
|
(defmethod compute-offsets ((branch branch) start-pos)
|
|
;; only return offset if both alternations have equal value
|
|
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
|
|
(and then-offset
|
|
(eql then-offset (compute-offsets (else-regex branch) start-pos))
|
|
then-offset)))
|
|
|
|
(defmethod compute-offsets ((repetition repetition) start-pos)
|
|
;; no need to descend into the inner regex
|
|
(with-slots ((len len)
|
|
(minimum minimum)
|
|
(maximum maximum))
|
|
repetition
|
|
(if (and len
|
|
(eq minimum maximum))
|
|
;; fixed number of repetitions, so we know how to proceed
|
|
(+ start-pos (* minimum len))
|
|
;; otherwise return NIL
|
|
nil)))
|
|
|
|
(defmethod compute-offsets ((register register) start-pos)
|
|
(compute-offsets (regex register) start-pos))
|
|
|
|
(defmethod compute-offsets ((standalone standalone) start-pos)
|
|
(compute-offsets (regex standalone) start-pos))
|
|
|
|
(defmethod compute-offsets ((char-class char-class) start-pos)
|
|
(1+ start-pos))
|
|
|
|
(defmethod compute-offsets ((everything everything) start-pos)
|
|
(1+ start-pos))
|
|
|
|
(defmethod compute-offsets ((str str) start-pos)
|
|
(setf (offset str) start-pos)
|
|
(+ start-pos (len str)))
|
|
|
|
(defmethod compute-offsets ((back-reference back-reference) start-pos)
|
|
;; with enough effort we could possibly do better here, but
|
|
;; currently we just give up and return NIL
|
|
(declare (ignore start-pos))
|
|
nil)
|
|
|
|
(defmethod compute-offsets ((regex regex) start-pos)
|
|
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
|
|
;; WORD-BOUNDARY (which all have zero-length)
|
|
start-pos) |