;;; git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@4461 4281704c-cde7-0310-8518-8e2dc76b1ff0
;;; 260 lines, 9.8 KiB, Common Lisp
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.44 2009/10/28 07:36:15 edi Exp $

;;; This file defines the REGEX class. REGEX objects are used to
;;; represent the (transformed) parse trees internally.

;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved.

;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:

;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.

;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;; Make CL-PPCRE the current package for the rest of this file.
(in-package :cl-ppcre)

;; Root of the class hierarchy in this file; it has no slots of its own.
(defclass regex ()
  ()
  (:documentation "The REGEX base class. All other classes inherit
from this one."))

(defclass seq (regex)
  ;; ELEMENTS is declared as type CONS, i.e. a non-empty list.
  ((elements :initarg :elements
             :accessor elements
             :type cons
             :documentation "A list of REGEX objects."))
  (:documentation "SEQ objects represent sequences of regexes.
\(Like \"ab\" is the sequence of \"a\" and \"b\".)"))

(defclass alternation (regex)
  ;; CHOICES is declared as type CONS, i.e. a non-empty list.
  ((choices :initarg :choices
            :accessor choices
            :type cons
            :documentation "A list of REGEX objects."))
  (:documentation "ALTERNATION objects represent alternations of
regexes. \(Like \"a|b\" is the alternation of \"a\" or \"b\".)"))

(defclass lookahead (regex)
  ((regex :initarg :regex
          :accessor regex
          :documentation "The REGEX object we're checking.")
   ;; POSITIVEP only has a reader, so no setter is generated for it.
   (positivep :initarg :positivep
              :reader positivep
              :documentation "Whether this assertion is positive."))
  (:documentation "LOOKAHEAD objects represent look-ahead assertions."))

(defclass lookbehind (regex)
  ((regex :initarg :regex
          :accessor regex
          :documentation "The REGEX object we're checking.")
   ;; POSITIVEP only has a reader, so no setter is generated for it.
   (positivep :initarg :positivep
              :reader positivep
              :documentation "Whether this assertion is positive.")
   (len :initarg :len
        :accessor len
        :type fixnum
        :documentation "The \(fixed) length of the enclosed regex."))
  (:documentation "LOOKBEHIND objects represent look-behind assertions."))

(defclass repetition (regex)
  ((regex :initarg :regex
          :accessor regex
          :documentation "The REGEX that's repeated.")
   (greedyp :initarg :greedyp
            :reader greedyp
            :documentation "Whether the repetition is greedy.")
   (minimum :initarg :minimum
            :accessor minimum
            :type fixnum
            :documentation "The minimal number of repetitions.")
   ;; MAXIMUM carries no :TYPE declaration; its docstring allows NIL.
   (maximum :initarg :maximum
            :accessor maximum
            :documentation "The maximal number of repetitions.
Can be NIL for unbounded.")
   (min-len :initarg :min-len
            :reader min-len
            :documentation "The minimal length of the enclosed regex.")
   (len :initarg :len
        :reader len
        :documentation "The length of the enclosed regex. NIL if
unknown.")
   ;; MIN-REST has no initarg: it starts at 0 and can only be changed
   ;; through its accessor.
   (min-rest :initform 0
             :accessor min-rest
             :type fixnum
             :documentation "The minimal number of characters which
must appear after this repetition.")
   (contains-register-p :initarg :contains-register-p
                        :reader contains-register-p
                        :documentation "Whether the regex contains a
register."))
  (:documentation "REPETITION objects represent repetitions of regexes."))

(defclass register (regex)
  ((regex :initarg :regex
          :accessor regex
          :documentation "The inner regex.")
   ;; NUM and NAME only have readers, so no setters are generated.
   (num :initarg :num
        :reader num
        :type fixnum
        :documentation "The number of this register, starting from 0.
This is the index into *REGS-START* and *REGS-END*.")
   (name :initarg :name
         :reader name
         :documentation "Name of this register or NIL."))
  (:documentation "REGISTER objects represent register groups."))

(defclass standalone (regex)
  ((regex :initarg :regex
          :accessor regex
          :documentation "The inner regex."))
  (:documentation "A standalone regular expression."))

(defclass back-reference (regex)
  ;; NUM and NAME are full accessors here, so both can be SETF'd.
  ((num :initarg :num
        :accessor num
        :type fixnum
        :documentation "The number of the register this
reference refers to.")
   (name :initarg :name
         :accessor name
         :documentation "The name of the register this
reference refers to or NIL.")
   (case-insensitive-p :initarg :case-insensitive-p
                       :reader case-insensitive-p
                       :documentation "Whether we check
case-insensitively."))
  (:documentation "BACK-REFERENCE objects represent backreferences."))

(defclass char-class (regex)
  ;; The slot type is (OR FUNCTION SYMBOL).  The value NIL is itself a
  ;; symbol, so it is still admitted; the type specifier NIL denotes
  ;; the empty type and would add nothing to the union.
  ((test-function :initarg :test-function
                  :reader test-function
                  :type (or function symbol)
                  :documentation "A unary function \(accepting a
character) which stands in for the character class and does the work
of checking whether a character belongs to the class."))
  (:documentation "CHAR-CLASS objects represent character classes."))

(defclass str (regex)
  ((str :initarg :str
        :accessor str
        :type string
        :documentation "The actual string.")
   ;; LEN has no initarg; the INITIALIZE-INSTANCE :AFTER method for STR
   ;; in this file overwrites the initial 0 with the string's length.
   (len :initform 0
        :accessor len
        :type fixnum
        :documentation "The length of the string.")
   (case-insensitive-p :initarg :case-insensitive-p
                       :reader case-insensitive-p
                       :documentation "If we match case-insensitively.")
   ;; OFFSET has no initarg; it starts as NIL and is set via its accessor.
   (offset :initform nil
           :accessor offset
           :documentation "Offset from the left of the whole
parse tree. The first regex has offset 0. NIL if unknown, i.e. behind
a variable-length regex.")
   (skip :initform nil
         :initarg :skip
         :accessor skip
         :documentation "If we can avoid testing for this
string because the SCAN function has done this already.")
   ;; No initarg here either; the slot starts as NIL.
   (start-of-end-string-p :initform nil
                          :accessor start-of-end-string-p
                          :documentation "If this is the unique
STR which starts END-STRING (a slot of MATCHER)."))
  (:documentation "STR objects represent strings."))

(defclass anchor (regex)
  ;; STARTP has no initform; the two flags below default to NIL.
  ((startp :initarg :startp
           :reader startp
           :documentation "Whether this is a \"start anchor\".")
   (multi-line-p :initarg :multi-line-p
                 :initform nil
                 :reader multi-line-p
                 :documentation "Whether we're in multi-line mode,
i.e. whether each #\\Newline is surrounded by anchors.")
   (no-newline-p :initarg :no-newline-p
                 :initform nil
                 :reader no-newline-p
                 :documentation "Whether we ignore #\\Newline at the end."))
  (:documentation "ANCHOR objects represent anchors like \"^\" or \"$\"."))

(defclass everything (regex)
  ((single-line-p :initarg :single-line-p
                  :reader single-line-p
                  :documentation "Whether we're in single-line mode,
i.e. whether we also match #\\Newline."))
  (:documentation "EVERYTHING objects represent regexes matching
\"everything\", i.e. dots."))

(defclass word-boundary (regex)
  ((negatedp :initarg :negatedp
             :reader negatedp
             :documentation "Whether we mean the opposite,
i.e. no word-boundary."))
  (:documentation "WORD-BOUNDARY objects represent word-boundary assertions."))

(defclass branch (regex)
  ((test :initarg :test
         :accessor test
         :documentation "The test of this branch, one of
LOOKAHEAD, LOOKBEHIND, or a number.")
   (then-regex :initarg :then-regex
               :accessor then-regex
               :documentation "The regex that's to be matched if the
test succeeds.")
   ;; When :ELSE-REGEX is not supplied, a fresh VOID instance is created
   ;; for each new BRANCH (the initform is evaluated per instance).
   (else-regex :initarg :else-regex
               :initform (make-instance 'void)
               :accessor else-regex
               :documentation "The regex that's to be matched if the
test fails."))
  (:documentation "BRANCH objects represent Perl's conditional regular
expressions."))

(defclass filter (regex)
  ;; FN may hold a function object or a symbol.
  ((fn :initarg :fn
       :accessor fn
       :type (or function symbol)
       :documentation "The user-defined function.")
   (len :initarg :len
        :reader len
        :documentation "The fixed length of this filter or NIL."))
  (:documentation "FILTER objects represent arbitrary functions
defined by the user."))

;; Slot-less class; BRANCH instantiates it as its default ELSE-REGEX.
(defclass void (regex)
  ()
  (:documentation "VOID objects represent empty regular expressions."))

(defmethod initialize-instance :after ((str str) &rest init-args)
  "Automatically computes the length of a STR after initialization.
Before that, the STR slot is coerced to a simple string \(a simple
text string on LispWorks) unless it already is one."
  (declare #.*standard-optimize-settings*)
  (declare (ignore init-args))
  ;; SLOT-VALUE is used for the read and the write of the STR slot, so
  ;; this step does not go through the STR accessor.
  (let ((str-slot (slot-value str 'str)))
    (unless (typep str-slot
                   #-:lispworks 'simple-string
                   #+:lispworks 'lw:simple-text-string)
      (setf (slot-value str 'str)
            (coerce str-slot
                    #-:lispworks 'simple-string
                    #+:lispworks 'lw:simple-text-string))))
  ;; LEN has no initarg, so this is where it gets its real value.
  (setf (len str) (length (str str))))
