Import 1.4.1 version of CL-PPCRE

git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@3577 4281704c-cde7-0310-8518-8e2dc76b1ff0
This commit is contained in:
Edi Weitz
2008-07-23 11:29:40 +00:00
parent bf6913769f
commit 2974af4010
25 changed files with 1907 additions and 1223 deletions

View File

@ -1,11 +1,11 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.26 2005/06/10 10:23:42 edi Exp $
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.34 2008/07/03 07:44:06 edi Exp $
;;; This file defines the REGEX class and some utility methods for
;;; this class. REGEX objects are used to represent the (transformed)
;;; parse trees internally
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
@ -122,7 +122,10 @@ appear after this repetition.")
:reader num
:type fixnum
:documentation "The number of this register, starting from 0.
This is the index into *REGS-START* and *REGS-END*."))
This is the index into *REGS-START* and *REGS-END*.")
(name :initarg :name
:reader name
:documentation "Name of this register or NIL."))
(:documentation "REGISTER objects represent register groups."))
(defclass standalone (regex)
@ -137,18 +140,21 @@ This is the index into *REGS-START* and *REGS-END*."))
:type fixnum
:documentation "The number of the register this
reference refers to.")
(name :initarg :name
:accessor name
:documentation "The name of the register this
reference refers to or NIL.")
(case-insensitive-p :initarg :case-insensitive-p
:reader case-insensitive-p
:documentation "Whether we check
case-insensitively."))
(:documentation "BACK-REFERENCE objects represent backreferences."))
(defclass char-class (regex)
((hash :initarg :hash
:reader hash
:type (or hash-table null)
:documentation "A hash table the keys of which are the
characters; the values are always T.")
((charset :initarg :charset
:reader charset
:type (or charset null)
:documentation "A charset denoting the characters
in the character class.")
(case-insensitive-p :initarg :case-insensitive-p
:reader case-insensitive-p
:documentation "If the char class
@ -253,20 +259,16 @@ defined by the user."))
(defmethod initialize-instance :after ((char-class char-class) &rest init-args)
(declare #.*standard-optimize-settings*)
"Make large hash tables smaller, if possible."
(let ((hash (getf init-args :hash)))
(when (and hash
"Make large charsets smaller, if possible."
(let ((set (getf init-args :charset)))
(when (and set
(> *regex-char-code-limit* 256)
(> (hash-table-count hash)
(> (charset-count set)
(/ *regex-char-code-limit* 2)))
(setf (slot-value char-class 'hash)
(merge-inverted-hash (make-hash-table)
hash)
(setf (slot-value char-class 'set)
(merge-set (make-charset) set)
(slot-value char-class 'invertedp)
(not (slot-value char-class 'invertedp))))))
;;; The following four methods allow a VOID object to behave like a
;;; zero-length STR object (only readers needed)
(not (slot-value char-class 'invertedp))))))
(defmethod initialize-instance :after ((str str) &rest init-args)
(declare #.*standard-optimize-settings*)
@ -277,6 +279,9 @@ defined by the user."))
(setf (slot-value str 'str) (coerce str-slot 'simple-string))))
(setf (len str) (length (str str))))
;;; The following four methods allow a VOID object to behave like a
;;; zero-length STR object (only readers needed)
(defmethod len ((void void))
(declare #.*standard-optimize-settings*)
0)
@ -301,6 +306,7 @@ second argument if the STR has length 0. Returns NIL for REGEX objects
which are not of type STR."))
(defmethod case-mode ((str str) old-case-mode)
(declare #.*standard-optimize-settings*)
(cond ((zerop (len str))
old-case-mode)
((case-insensitive-p str)
@ -309,6 +315,7 @@ which are not of type STR."))
:case-sensitive)))
(defmethod case-mode ((regex regex) old-case-mode)
(declare #.*standard-optimize-settings*)
(declare (ignore old-case-mode))
nil)
@ -317,37 +324,45 @@ which are not of type STR."))
(:documentation "Implements a deep copy of a REGEX object."))
(defmethod copy-regex ((anchor anchor))
(declare #.*standard-optimize-settings*)
(make-instance 'anchor
:startp (startp anchor)
:multi-line-p (multi-line-p anchor)
:no-newline-p (no-newline-p anchor)))
(defmethod copy-regex ((everything everything))
(declare #.*standard-optimize-settings*)
(make-instance 'everything
:single-line-p (single-line-p everything)))
(defmethod copy-regex ((word-boundary word-boundary))
(declare #.*standard-optimize-settings*)
(make-instance 'word-boundary
:negatedp (negatedp word-boundary)))
(defmethod copy-regex ((void void))
(declare #.*standard-optimize-settings*)
(make-instance 'void))
(defmethod copy-regex ((lookahead lookahead))
(declare #.*standard-optimize-settings*)
(make-instance 'lookahead
:regex (copy-regex (regex lookahead))
:positivep (positivep lookahead)))
(defmethod copy-regex ((seq seq))
(declare #.*standard-optimize-settings*)
(make-instance 'seq
:elements (mapcar #'copy-regex (elements seq))))
(defmethod copy-regex ((alternation alternation))
(declare #.*standard-optimize-settings*)
(make-instance 'alternation
:choices (mapcar #'copy-regex (choices alternation))))
(defmethod copy-regex ((branch branch))
(with-slots ((test test))
(declare #.*standard-optimize-settings*)
(with-slots (test)
branch
(make-instance 'branch
:test (if (typep test 'regex)
@ -357,12 +372,14 @@ which are not of type STR."))
:else-regex (copy-regex (else-regex branch)))))
(defmethod copy-regex ((lookbehind lookbehind))
(declare #.*standard-optimize-settings*)
(make-instance 'lookbehind
:regex (copy-regex (regex lookbehind))
:positivep (positivep lookbehind)
:len (len lookbehind)))
(defmethod copy-regex ((repetition repetition))
(declare #.*standard-optimize-settings*)
(make-instance 'repetition
:regex (copy-regex (regex repetition))
:greedyp (greedyp repetition)
@ -373,32 +390,39 @@ which are not of type STR."))
:contains-register-p (contains-register-p repetition)))
(defmethod copy-regex ((register register))
(declare #.*standard-optimize-settings*)
(make-instance 'register
:regex (copy-regex (regex register))
:num (num register)))
:num (num register)
:name (name register)))
(defmethod copy-regex ((standalone standalone))
(declare #.*standard-optimize-settings*)
(make-instance 'standalone
:regex (copy-regex (regex standalone))))
(defmethod copy-regex ((back-reference back-reference))
(declare #.*standard-optimize-settings*)
(make-instance 'back-reference
:num (num back-reference)
:case-insensitive-p (case-insensitive-p back-reference)))
(defmethod copy-regex ((char-class char-class))
(declare #.*standard-optimize-settings*)
(make-instance 'char-class
:hash (hash char-class)
:charset (charset char-class)
:case-insensitive-p (case-insensitive-p char-class)
:invertedp (invertedp char-class)
:word-char-class-p (word-char-class-p char-class)))
(defmethod copy-regex ((str str))
(declare #.*standard-optimize-settings*)
(make-instance 'str
:str (str str)
:case-insensitive-p (case-insensitive-p str)))
(defmethod copy-regex ((filter filter))
(declare #.*standard-optimize-settings*)
(make-instance 'filter
:fn (fn filter)
:len (len filter)))
@ -420,6 +444,7 @@ optionally removes embedded REGISTER objects if possible and if the
special variable REMOVE-REGISTERS-P is true."))
(defmethod remove-registers ((register register))
(declare #.*standard-optimize-settings*)
(declare (special remove-registers-p reg-seen))
(cond (remove-registers-p
(remove-registers (regex register)))
@ -430,6 +455,7 @@ special variable REMOVE-REGISTERS-P is true."))
(copy-regex register))))
(defmethod remove-registers ((repetition repetition))
(declare #.*standard-optimize-settings*)
(let* (reg-seen
(inner-regex (remove-registers (regex repetition))))
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
@ -445,22 +471,26 @@ special variable REMOVE-REGISTERS-P is true."))
:contains-register-p reg-seen)))
(defmethod remove-registers ((standalone standalone))
(declare #.*standard-optimize-settings*)
(make-instance 'standalone
:regex (remove-registers (regex standalone))))
(defmethod remove-registers ((lookahead lookahead))
(declare #.*standard-optimize-settings*)
(make-instance 'lookahead
:regex (remove-registers (regex lookahead))
:positivep (positivep lookahead)))
(defmethod remove-registers ((lookbehind lookbehind))
(declare #.*standard-optimize-settings*)
(make-instance 'lookbehind
:regex (remove-registers (regex lookbehind))
:positivep (positivep lookbehind)
:len (len lookbehind)))
(defmethod remove-registers ((branch branch))
(with-slots ((test test))
(declare #.*standard-optimize-settings*)
(with-slots (test)
branch
(make-instance 'branch
:test (if (typep test 'regex)
@ -470,15 +500,18 @@ special variable REMOVE-REGISTERS-P is true."))
:else-regex (remove-registers (else-regex branch)))))
(defmethod remove-registers ((alternation alternation))
(declare #.*standard-optimize-settings*)
(declare (special remove-registers-p))
;; an ALTERNATION, so we can't remove REGISTER objects further down
(setq remove-registers-p nil)
(copy-regex alternation))
(defmethod remove-registers ((regex regex))
(declare #.*standard-optimize-settings*)
(copy-regex regex))
(defmethod remove-registers ((seq seq))
(declare #.*standard-optimize-settings*)
(make-instance 'seq
:elements (mapcar #'remove-registers (elements seq))))
@ -489,6 +522,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(i.e. the object corresponding to \".\", for example."))
(defmethod everythingp ((seq seq))
(declare #.*standard-optimize-settings*)
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
;; due to the parsing process
(let ((cleaned-elements (remove-if #'(lambda (element)
@ -498,7 +532,8 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(everythingp (first cleaned-elements)))))
(defmethod everythingp ((alternation alternation))
(with-slots ((choices choices))
(declare #.*standard-optimize-settings*)
(with-slots (choices)
alternation
(and (= 1 (length choices))
;; this is unlikely to happen for human-generated regexes,
@ -506,9 +541,8 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(everythingp (first choices)))))
(defmethod everythingp ((repetition repetition))
(with-slots ((maximum maximum)
(minimum minimum)
(regex regex))
(declare #.*standard-optimize-settings*)
(with-slots (maximum minimum regex)
repetition
(and maximum
(= 1 minimum maximum)
@ -516,15 +550,19 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(everythingp regex))))
(defmethod everythingp ((register register))
(declare #.*standard-optimize-settings*)
(everythingp (regex register)))
(defmethod everythingp ((standalone standalone))
(declare #.*standard-optimize-settings*)
(everythingp (regex standalone)))
(defmethod everythingp ((everything everything))
(declare #.*standard-optimize-settings*)
everything)
(defmethod everythingp ((regex regex))
(declare #.*standard-optimize-settings*)
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
nil)
@ -534,6 +572,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
(defmethod regex-length ((seq seq))
(declare #.*standard-optimize-settings*)
;; simply add all inner lengths unless one of them is NIL
(loop for sub-regex in (elements seq)
for len = (regex-length sub-regex)
@ -541,6 +580,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
sum len))
(defmethod regex-length ((alternation alternation))
(declare #.*standard-optimize-settings*)
;; only return a true value if all inner lengths are non-NIL and
;; mutually equal
(loop for sub-regex in (choices alternation)
@ -551,6 +591,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
finally (return len)))
(defmethod regex-length ((branch branch))
(declare #.*standard-optimize-settings*)
;; only return a true value if both alternations have a length and
;; if they're equal
(let ((then-length (regex-length (then-regex branch))))
@ -559,13 +600,12 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
then-length)))
(defmethod regex-length ((repetition repetition))
(declare #.*standard-optimize-settings*)
;; we can only compute the length of a REPETITION object if the
;; number of repetitions is fixed; note that we don't call
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
;; always set correctly
(with-slots ((len len)
(minimum minimum)
(maximum maximum))
(with-slots (len minimum maximum)
repetition
(if (and len
(eql minimum maximum))
@ -573,29 +613,37 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
nil)))
(defmethod regex-length ((register register))
(declare #.*standard-optimize-settings*)
(regex-length (regex register)))
(defmethod regex-length ((standalone standalone))
(declare #.*standard-optimize-settings*)
(regex-length (regex standalone)))
(defmethod regex-length ((back-reference back-reference))
(declare #.*standard-optimize-settings*)
;; with enough effort we could possibly do better here, but
;; currently we just give up and return NIL
nil)
(defmethod regex-length ((char-class char-class))
(declare #.*standard-optimize-settings*)
1)
(defmethod regex-length ((everything everything))
(declare #.*standard-optimize-settings*)
1)
(defmethod regex-length ((str str))
(declare #.*standard-optimize-settings*)
(len str))
(defmethod regex-length ((filter filter))
(declare #.*standard-optimize-settings*)
(len filter))
(defmethod regex-length ((regex regex))
(declare #.*standard-optimize-settings*)
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
;; WORD-BOUNDARY (which all have zero-length)
0)
@ -605,12 +653,14 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
(:documentation "Returns the minimal length of REGEX."))
(defmethod regex-min-length ((seq seq))
(declare #.*standard-optimize-settings*)
;; simply add all inner minimal lengths
(loop for sub-regex in (elements seq)
for len = (regex-min-length sub-regex)
sum len))
(defmethod regex-min-length ((alternation alternation))
(declare #.*standard-optimize-settings*)
;; minimal length of an alternation is the minimal length of the
;; "shortest" element
(loop for sub-regex in (choices alternation)
@ -618,35 +668,44 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true
minimize len))
(defmethod regex-min-length ((branch branch))
(declare #.*standard-optimize-settings*)
;; minimal length of both alternations
(min (regex-min-length (then-regex branch))
(regex-min-length (else-regex branch))))
(defmethod regex-min-length ((repetition repetition))
(declare #.*standard-optimize-settings*)
;; obviously the product of the inner minimal length and the minimal
;; number of repetitions
(* (minimum repetition) (min-len repetition)))
(defmethod regex-min-length ((register register))
(declare #.*standard-optimize-settings*)
(regex-min-length (regex register)))
(defmethod regex-min-length ((standalone standalone))
(declare #.*standard-optimize-settings*)
(regex-min-length (regex standalone)))
(defmethod regex-min-length ((char-class char-class))
(declare #.*standard-optimize-settings*)
1)
(defmethod regex-min-length ((everything everything))
(declare #.*standard-optimize-settings*)
1)
(defmethod regex-min-length ((str str))
(declare #.*standard-optimize-settings*)
(len str))
(defmethod regex-min-length ((filter filter))
(declare #.*standard-optimize-settings*)
(or (len filter)
0))
(defmethod regex-min-length ((regex regex))
(declare #.*standard-optimize-settings*)
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
0)
@ -664,6 +723,7 @@ slots of STR objects further down the tree."))
;; into repetitions
(defmethod compute-offsets ((seq seq) start-pos)
(declare #.*standard-optimize-settings*)
(loop for element in (elements seq)
;; advance offset argument for next call while looping through
;; the elements
@ -673,6 +733,7 @@ slots of STR objects further down the tree."))
finally (return curr-offset)))
(defmethod compute-offsets ((alternation alternation) start-pos)
(declare #.*standard-optimize-settings*)
(loop for choice in (choices alternation)
for old-offset = nil then curr-offset
for curr-offset = (compute-offsets choice start-pos)
@ -684,6 +745,7 @@ slots of STR objects further down the tree."))
finally (return curr-offset)))
(defmethod compute-offsets ((branch branch) start-pos)
(declare #.*standard-optimize-settings*)
;; only return offset if both alternations have equal value
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
(and then-offset
@ -691,10 +753,9 @@ slots of STR objects further down the tree."))
then-offset)))
(defmethod compute-offsets ((repetition repetition) start-pos)
(declare #.*standard-optimize-settings*)
;; no need to descend into the inner regex
(with-slots ((len len)
(minimum minimum)
(maximum maximum))
(with-slots (len minimum maximum)
repetition
(if (and len
(eq minimum maximum))
@ -704,34 +765,42 @@ slots of STR objects further down the tree."))
nil)))
(defmethod compute-offsets ((register register) start-pos)
(declare #.*standard-optimize-settings*)
(compute-offsets (regex register) start-pos))
(defmethod compute-offsets ((standalone standalone) start-pos)
(declare #.*standard-optimize-settings*)
(compute-offsets (regex standalone) start-pos))
(defmethod compute-offsets ((char-class char-class) start-pos)
(declare #.*standard-optimize-settings*)
(1+ start-pos))
(defmethod compute-offsets ((everything everything) start-pos)
(declare #.*standard-optimize-settings*)
(1+ start-pos))
(defmethod compute-offsets ((str str) start-pos)
(declare #.*standard-optimize-settings*)
(setf (offset str) start-pos)
(+ start-pos (len str)))
(defmethod compute-offsets ((back-reference back-reference) start-pos)
(declare #.*standard-optimize-settings*)
;; with enough effort we could possibly do better here, but
;; currently we just give up and return NIL
(declare (ignore start-pos))
nil)
(defmethod compute-offsets ((filter filter) start-pos)
(declare #.*standard-optimize-settings*)
(let ((len (len filter)))
(if len
(+ start-pos len)
nil)))
(defmethod compute-offsets ((regex regex) start-pos)
(declare #.*standard-optimize-settings*)
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
;; WORD-BOUNDARY (which all have zero-length)
start-pos)