diff --git a/CHANGELOG b/CHANGELOG index b1ccf2d..a2f14e8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,61 @@ +Version 1.4.1 +2008-07-03 +Skip non-characters in CREATE-RANGES-FROM-SET + +Version 1.4.0 +2008-07-03 +Replaced hash tables with charsets (by Nikodemus Siivola) +Get rid of duplicates in REGEX-APROPOS(-LIST) + +Version 1.3.3 +2008-06-25 +Let the Lisp decide how it wants to enlarge its hash tables +Fixed anchors for special variables in docs +Fixed typo in docs (thanks to Jason S. Cornez) + +Version 1.3.2 +2007-09-13 +Updated docs and ChangeLog to be really in sync with 1.3.1 changes (thanks to Sébastien Saint-Sevin) + +Version 1.3.1 +2007-08-24 +Second return value for REGEX-REPLACE and REGEX-REPLACE-ALL (patch by Matthew Sachs) + +Version 1.3.0 +2007-03-24 +Optional support for named registers (patch by Ondrej Svitek) + +Version 1.2.19 +2007-01-16 +Fixed behaviour of look-behind in repeated scans (caught by RegexCoach user Hans Jud) + +Version 1.2.18 +2006-10-12 +Changed default element type for LispWorks +Fixed documentation for REGEX-REPLACE-ALL + +Version 1.2.17 +2006-10-11 +Fixed bug in DO-SCANS which affected anchors (caught by RegexCoach user Laurent Taupiac) +Update link for 'man perlre' (thanks to Ricardo Boccato Alves) + +Version 1.2.16 +2006-07-16 +Added :ELEMENT-TYPE to REGEX-REPLACE(-ALL) + +Version 1.2.15 +2006-07-03 +Added :REGEX tag to parse tree syntax (thanks to Frédéric Jolliton) + +Version 1.2.14 +2006-05-24 +Added missing tag in docs (thanks to Wojciech Kaczmarek) +Fixed IMPORT statement for LW + +Version 1.2.13 +2005-12-06 +Fixed bug involving *REAL-START-POS* (caught by "tichy") + Version 1.2.12 2005-11-01 REGEX-APROPOS-AUX now also uses :INHERITED diff --git a/api.lisp b/api.lisp index 9afe62a..e3523c3 100644 --- a/api.lisp +++ b/api.lisp @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.60 2005/11/01 09:51:01 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.79 2008/07/03 08:39:10 edi Exp $ ;;; The external API for creating and using scanners. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -38,10 +38,11 @@ destructive) (:documentation "Accepts a regular expression - either as a parse-tree or as a string - and returns a scan closure which will scan -strings for this regular expression. The \"mode\" keyboard arguments -are equivalent to the imsx modifiers in Perl. If DESTRUCTIVE is not -NIL the function is allowed to destructively modify its first argument -\(but only if it's a parse tree).")) +strings for this regular expression and a list mapping registers to +their names \(NIL stands for unnamed ones). The \"mode\" keyboard +arguments are equivalent to the imsx modifiers in Perl. If DESTRUCTIVE +is not NIL the function is allowed to destructively modify its first +argument \(but only if it's a parse tree).")) #-:use-acl-regexp2-engine (defmethod create-scanner ((regex-string string) &key case-insensitive-mode @@ -104,7 +105,7 @@ NIL the function is allowed to destructively modify its first argument (when flags (setq parse-tree (list :group (cons :flags flags) parse-tree)))) (let ((*syntax-error-string* nil)) - (multiple-value-bind (regex reg-num starts-with) + (multiple-value-bind (regex reg-num starts-with reg-names) (convert parse-tree) ;; simplify REGEX by flattening nested SEQ and ALTERNATION ;; constructs and gathering STR objects @@ -151,30 +152,31 @@ NIL the function is allowed to destructively modify its first argument (case-insensitive-p starts-with)))))) (declare (special end-string-offset end-anchored-p end-string)) ;; now create the scanner and return it - (create-scanner-aux match-fn - (regex-min-length regex) - (or (start-anchored-p regex) - ;; a dot in single-line-mode also - ;; implicitely anchors the regex at - ;; the start, i.e. if we can't match - ;; from the first position we won't - ;; match at all - (and (typep starts-with 'everything) - (single-line-p starts-with))) - starts-with - start-string-test - ;; only mark regex as end-anchored if we - ;; found a non-zero-length string before - ;; the anchor - (and end-string-test end-anchored-p) - end-string-test - (if end-string-test - (len end-string) - nil) - end-string-offset - *rep-num* - *zero-length-num* - reg-num)))))) + (values (create-scanner-aux match-fn + (regex-min-length regex) + (or (start-anchored-p regex) + ;; a dot in single-line-mode also + ;; implicitly anchors the regex at + ;; the start, i.e. if we can't match + ;; from the first position we won't + ;; match at all + (and (typep starts-with 'everything) + (single-line-p starts-with))) + starts-with + start-string-test + ;; only mark regex as end-anchored if we + ;; found a non-zero-length string before + ;; the anchor + (and end-string-test end-anchored-p) + end-string-test + (if end-string-test + (len end-string) + nil) + end-string-offset + *rep-num* + *zero-length-num* + reg-num) + reg-names)))))) #+:use-acl-regexp2-engine (declaim (inline create-scanner)) @@ -185,6 +187,7 @@ NIL the function is allowed to destructively modify its first argument single-line-mode extended-mode destructive) + (declare #.*standard-optimize-settings*) (declare (ignore destructive)) (when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode) (signal-ppcre-invocation-error @@ -197,6 +200,7 @@ NIL the function is allowed to destructively modify its first argument single-line-mode extended-mode destructive) + (declare #.*standard-optimize-settings*) (declare (ignore destructive)) (excl:compile-re parse-tree :case-fold case-insensitive-mode @@ -205,19 +209,22 @@ NIL the function is allowed to destructively modify its first argument :single-line single-line-mode :return :index)) -(defgeneric scan (regex target-string &key start end) +(defgeneric scan (regex target-string &key start end real-start-pos) (:documentation "Searches TARGET-STRING from START to END and tries -to match REGEX. On success returns four values - the start of the +to match REGEX. On success returns four values - the start of the match, the end of the match, and two arrays denoting the beginnings -and ends of register matches. On failure returns NIL. REGEX can be a +and ends of register matches. On failure returns NIL. REGEX can be a string which will be parsed according to Perl syntax, a parse tree, or -a pre-compiled scanner created by CREATE-SCANNER. TARGET-STRING will -be coerced to a simple string if it isn't one already.")) +a pre-compiled scanner created by CREATE-SCANNER. TARGET-STRING will +be coerced to a simple string if it isn't one already. The +REAL-START-POS parameter should be ignored - it exists only for +internal purposes.")) #-:use-acl-regexp2-engine (defmethod scan ((regex-string string) target-string &key (start 0) - (end (length target-string))) + (end (length target-string)) + ((:real-start-pos *real-start-pos*) nil)) (declare #.*standard-optimize-settings*) ;; note that the scanners are optimized for simple strings so we ;; have to coerce TARGET-STRING into one if it isn't already @@ -228,7 +235,8 @@ be coerced to a simple string if it isn't one already.")) #-:use-acl-regexp2-engine (defmethod scan ((scanner function) target-string &key (start 0) - (end (length target-string))) + (end (length target-string)) + ((:real-start-pos *real-start-pos*) nil)) (declare #.*standard-optimize-settings*) (funcall scanner (maybe-coerce-to-simple-string target-string) @@ -237,7 +245,8 @@ be coerced to a simple string if it isn't one already.")) #-:use-acl-regexp2-engine (defmethod scan ((parse-tree t) target-string &key (start 0) - (end (length target-string))) + (end (length target-string)) + ((:real-start-pos *real-start-pos*) nil)) (declare #.*standard-optimize-settings*) (funcall (create-scanner parse-tree) (maybe-coerce-to-simple-string target-string) @@ -249,7 +258,9 @@ be coerced to a simple string if it isn't one already.")) #+:use-acl-regexp2-engine (defmethod scan ((parse-tree t) target-string &key (start 0) - (end (length target-string))) + (end (length target-string)) + ((:real-start-pos *real-start-pos*) nil)) + (declare #.*standard-optimize-settings*) (when (< end start) (return-from scan nil)) (let ((results (multiple-value-list (excl:match-re parse-tree target-string @@ -274,9 +285,8 @@ be coerced to a simple string if it isn't one already.")) (define-compiler-macro scan (&whole form &environment env regex target-string &rest rest) "Make sure that constant forms are compiled into scanners at compile time." (cond ((constantp regex env) - `(scan (load-time-value - (create-scanner ,regex)) - ,target-string ,@rest)) + `(scan (load-time-value (create-scanner ,regex)) + ,target-string ,@rest)) (t form))) (defun scan-to-strings (regex target-string &key (start 0) @@ -309,8 +319,7 @@ structure with TARGET-STRING." (&whole form &environment env regex target-string &rest rest) "Make sure that constant forms are compiled into scanners at compile time." (cond ((constantp regex env) - `(scan-to-strings (load-time-value - (create-scanner ,regex)) + `(scan-to-strings (load-time-value (create-scanner ,regex)) ,target-string ,@rest)) (t form))) @@ -366,54 +375,50 @@ terminate the loop immediately. If REGEX matches an empty string the scan is continued one position behind this match. BODY may start with declarations." (with-rebinding (target-string) - (with-unique-names (%start %end %regex scanner loop-tag block-name) + (with-unique-names (%start %end %regex scanner) (declare (ignorable %regex scanner)) ;; the NIL BLOCK to enable exits via (RETURN ...) `(block nil - (let* ((,%start (or ,start 0)) - (*real-start-pos* ,%start) - (,%end (or ,end (length ,target-string))) - ,@(unless (constantp regex env) - ;; leave constant regular expressions as they are - - ;; SCAN's compiler macro will take care of them; - ;; otherwise create a scanner unless the regex is - ;; already a function (otherwise SCAN will do this - ;; on each iteration) - `((,%regex ,regex) - (,scanner (typecase ,%regex - (function ,%regex) - (t (create-scanner ,%regex))))))) - ;; coerce TARGET-STRING to a simple string unless it is one - ;; already (otherwise SCAN will do this on each iteration) - (setq ,target-string - (maybe-coerce-to-simple-string ,target-string)) - ;; a named BLOCK so we can exit the TAGBODY - (block ,block-name - (tagbody - ,loop-tag - ;; invoke SCAN and bind the returned values to the - ;; provided variables - (multiple-value-bind - (,match-start ,match-end ,reg-starts ,reg-ends) - (scan ,(cond ((constantp regex env) regex) - (t scanner)) - ,target-string :start ,%start :end ,%end) - ;; declare the variables to be IGNORABLE to prevent the - ;; compiler from issuing warnings - (declare - (ignorable ,match-start ,match-end ,reg-starts ,reg-ends)) - (unless ,match-start - ;; stop iteration on first failure - (return-from ,block-name ,result-form)) - ;; execute BODY (wrapped in LOCALLY so it can start with - ;; declarations) - (locally - ,@body) - ;; advance by one position if we had a zero-length match - (setq ,%start (if (= ,match-start ,match-end) - (1+ ,match-end) - ,match-end))) - (go ,loop-tag)))))))) + (let* ((,%start (or ,start 0)) + (,%end (or ,end (length ,target-string))) + ,@(unless (constantp regex env) + ;; leave constant regular expressions as they are - + ;; SCAN's compiler macro will take care of them; + ;; otherwise create a scanner unless the regex is + ;; already a function (otherwise SCAN will do this + ;; on each iteration) + `((,%regex ,regex) + (,scanner (typecase ,%regex + (function ,%regex) + (t (create-scanner ,%regex))))))) + ;; coerce TARGET-STRING to a simple string unless it is one + ;; already (otherwise SCAN will do this on each iteration) + (setq ,target-string + (maybe-coerce-to-simple-string ,target-string)) + (loop + ;; invoke SCAN and bind the returned values to the + ;; provided variables + (multiple-value-bind + (,match-start ,match-end ,reg-starts ,reg-ends) + (scan ,(cond ((constantp regex env) regex) + (t scanner)) + ,target-string :start ,%start :end ,%end + :real-start-pos (or ,start 0)) + ;; declare the variables to be IGNORABLE to prevent the + ;; compiler from issuing warnings + (declare + (ignorable ,match-start ,match-end ,reg-starts ,reg-ends)) + (unless ,match-start + ;; stop iteration on first failure + (return ,result-form)) + ;; execute BODY (wrapped in LOCALLY so it can start with + ;; declarations) + (locally + ,@body) + ;; advance by one position if we had a zero-length match + (setq ,%start (if (= ,match-start ,match-end) + (1+ ,match-end) + ,match-end))))))))) (defmacro do-matches ((match-start match-end regex target-string @@ -523,9 +528,8 @@ the scan is continued one position behind this match." "Make sure that constant forms are compiled into scanners at compile time." (cond ((constantp regex env) - `(all-matches (load-time-value - (create-scanner ,regex)) - ,@rest)) + `(all-matches (load-time-value (create-scanner ,regex)) + ,@rest)) (t form))) (defun all-matches-as-strings (regex target-string @@ -548,9 +552,8 @@ share structure with TARGET-STRING." compile time." (cond ((constantp regex env) `(all-matches-as-strings - (load-time-value - (create-scanner ,regex)) - ,@rest)) + (load-time-value (create-scanner ,regex)) + ,@rest)) (t form))) (defun split (regex target-string @@ -628,8 +631,7 @@ TARGET-STRING." (define-compiler-macro split (&whole form &environment env regex target-string &rest rest) "Make sure that constant forms are compiled into scanners at compile time." (cond ((constantp regex env) - `(split (load-time-value - (create-scanner ,regex)) + `(split (load-time-value (create-scanner ,regex)) ,target-string ,@rest)) (t form))) @@ -751,14 +753,17 @@ S-expression.")) #-:cormanlisp (defmethod build-replacement-template ((replacement-function function)) + (declare #.*standard-optimize-settings*) (list replacement-function)) #-:cormanlisp (defmethod build-replacement-template ((replacement-function-symbol symbol)) + (declare #.*standard-optimize-settings*) (list replacement-function-symbol)) #-:cormanlisp (defmethod build-replacement-template ((replacement-list list)) + (declare #.*standard-optimize-settings*) replacement-list) ;;; Corman Lisp's methods can't be closures... :( @@ -816,17 +821,18 @@ S-expression.")) start end match-start match-end reg-starts reg-ends - simple-calls) + simple-calls + element-type) (declare #.*standard-optimize-settings*) "Accepts a replacement template and the current values from the matching process in REGEX-REPLACE or REGEX-REPLACE-ALL and returns the -corresponding template." +corresponding string." ;; the upper exclusive bound of the register numbers in the regular ;; expression (let ((reg-bound (if reg-starts (array-dimension reg-starts 0) 0))) - (with-output-to-string (s) + (with-output-to-string (s nil :element-type element-type) (loop for token in replacement-template do (typecase token (string @@ -901,8 +907,8 @@ corresponding template." reg-starts reg-ends))) s))))))))) -(defun replace-aux (target-string replacement pos-list reg-list - start end preserve-case simple-calls) +(defun replace-aux (target-string replacement pos-list reg-list start end + preserve-case simple-calls element-type) (declare #.*standard-optimize-settings*) "Auxiliary function used by REGEX-REPLACE and REGEX-REPLACE-ALL. POS-LIST contains a list with the start and end @@ -910,7 +916,7 @@ positions of all matches while REG-LIST contains a list of arrays representing the corresponding register start and end positions." ;; build the template once before we start the loop (let ((replacement-template (build-replacement-template replacement))) - (with-output-to-string (s) + (with-output-to-string (s nil :element-type element-type) ;; loop through all matches and take the start and end of the ;; whole string into account (loop for (from to) on (append (list start) pos-list (list end)) @@ -925,7 +931,8 @@ representing the corresponding register start and end positions." start end from to reg-starts reg-ends - simple-calls) + simple-calls + element-type) nil) while to if replace @@ -942,14 +949,16 @@ representing the corresponding register start and end positions." ;; no replacement do (write-string target-string s :start from :end to))))) -(defun regex-replace (regex target-string replacement - &key (start 0) +(defun regex-replace (regex target-string replacement &key + (start 0) (end (length target-string)) preserve-case - simple-calls) + simple-calls + (element-type #+:lispworks 'lw:simple-char #-:lispworks 'character)) (declare #.*standard-optimize-settings*) "Try to match TARGET-STRING between START and END against REGEX and -replace the first match with REPLACEMENT. +replace the first match with REPLACEMENT. Two values are returned; +the modified string, and T if REGEX matched or NIL otherwise. REPLACEMENT can be a string which may contain the special substrings \"\\&\" for the whole match, \"\\`\" for the part of TARGET-STRING @@ -973,34 +982,40 @@ representing register (1+ N) -, or a function designator. If PRESERVE-CASE is true, the replacement will try to preserve the case (all upper case, all lower case, or capitalized) of the match. The result will always be a fresh string, even if REGEX doesn't -match." +match. + + ELEMENT-TYPE is the element type of the resulting string." (multiple-value-bind (match-start match-end reg-starts reg-ends) (scan regex target-string :start start :end end) (if match-start - (replace-aux target-string replacement - (list match-start match-end) - (list reg-starts reg-ends) - start end preserve-case simple-calls) - (subseq target-string start end)))) + (values (replace-aux target-string replacement + (list match-start match-end) + (list reg-starts reg-ends) + start end preserve-case + simple-calls element-type) + t) + (values (subseq target-string start end) + nil)))) #-:cormanlisp (define-compiler-macro regex-replace (&whole form &environment env regex target-string replacement &rest rest) "Make sure that constant forms are compiled into scanners at compile time." (cond ((constantp regex env) - `(regex-replace (load-time-value - (create-scanner ,regex)) + `(regex-replace (load-time-value (create-scanner ,regex)) ,target-string ,replacement ,@rest)) (t form))) -(defun regex-replace-all (regex target-string replacement - &key (start 0) - (end (length target-string)) - preserve-case - simple-calls) +(defun regex-replace-all (regex target-string replacement &key + (start 0) + (end (length target-string)) + preserve-case + simple-calls + (element-type #+:lispworks 'lw:simple-char #-:lispworks 'character)) (declare #.*standard-optimize-settings*) "Try to match TARGET-STRING between START and END against REGEX and -replace all matches with REPLACEMENT. +replace all matches with REPLACEMENT. Two values are returned; the +modified string, and T if REGEX matched or NIL otherwise. REPLACEMENT can be a string which may contain the special substrings \"\\&\" for the whole match, \"\\`\" for the part of TARGET-STRING @@ -1024,30 +1039,34 @@ representing register (1+ N) -, or a function designator. If PRESERVE-CASE is true, the replacement will try to preserve the case (all upper case, all lower case, or capitalized) of the match. The result will always be a fresh string, even if REGEX doesn't -match." +match. + + ELEMENT-TYPE is the element type of the resulting string." (let ((pos-list '()) (reg-list '())) (do-scans (match-start match-end reg-starts reg-ends regex target-string - nil - :start start :end end) + nil + :start start :end end) (push match-start pos-list) (push match-end pos-list) (push reg-starts reg-list) (push reg-ends reg-list)) (if pos-list - (replace-aux target-string replacement - (nreverse pos-list) - (nreverse reg-list) - start end preserve-case simple-calls) - (subseq target-string start end)))) + (values (replace-aux target-string replacement + (nreverse pos-list) + (nreverse reg-list) + start end preserve-case + simple-calls element-type) + t) + (values (subseq target-string start end) + nil)))) #-:cormanlisp (define-compiler-macro regex-replace-all (&whole form &environment env regex target-string replacement &rest rest) "Make sure that constant forms are compiled into scanners at compile time." (cond ((constantp regex env) - `(regex-replace-all (load-time-value - (create-scanner ,regex)) + `(regex-replace-all (load-time-value (create-scanner ,regex)) ,target-string ,replacement ,@rest)) (t form))) @@ -1060,32 +1079,35 @@ which matches REGEX. Optionally evaluates and returns RETURN-FORM at the end. If CASE-INSENSITIVE is true and REGEX isn't already a scanner, a case-insensitive scanner is used." (with-rebinding (regex) - (with-unique-names (scanner %packages next morep) + (with-unique-names (scanner %packages next morep hash) `(let* ((,scanner (create-scanner ,regex :case-insensitive-mode (and ,case-insensitive (not (functionp ,regex))))) (,%packages (or ,packages - (list-all-packages)))) + (list-all-packages))) + (,hash (make-hash-table :test #'eq))) (with-package-iterator (,next ,%packages :external :internal :inherited) (loop (multiple-value-bind (,morep symbol) (,next) (unless ,morep (return ,return-form)) - (when (scan ,scanner (symbol-name symbol)) - ,@body)))))))) + (unless (gethash symbol ,hash) + (when (scan ,scanner (symbol-name symbol)) + (setf (gethash symbol ,hash) t) + ,@body))))))))) ;;; The following two functions were provided by Karsten Poeck #+:cormanlisp -(defmacro do-with-all-symbols ((variable package-packagelist) &body body) - (with-unique-names (pack-var iter-sym) - `(if (listp ,package-packagelist) - (dolist (,pack-var ,package-packagelist) - (do-symbols (,iter-sym ,pack-var) +(defmacro do-with-all-symbols ((variable package-or-packagelist) &body body) + (with-unique-names (pack-var) + `(if (listp ,package-or-packagelist) + (dolist (,pack-var ,package-or-packagelist) + (do-symbols (,variable ,pack-var) ,@body)) - (do-symbols (,iter-sym ,package-packagelist) + (do-symbols (,variable ,package-or-packagelist) ,@body)))) #+:cormanlisp @@ -1097,16 +1119,19 @@ which matches REGEX. Optionally evaluates and returns RETURN-FORM at the end. If CASE-INSENSITIVE is true and REGEX isn't already a scanner, a case-insensitive scanner is used." (with-rebinding (regex) - (with-unique-names (scanner %packages) + (with-unique-names (scanner %packages hash) `(let* ((,scanner (create-scanner ,regex :case-insensitive-mode (and ,case-insensitive (not (functionp ,regex))))) (,%packages (or ,packages - (list-all-packages)))) + (list-all-packages))) + (,hash (make-hash-table :test #'eq))) (do-with-all-symbols (symbol ,%packages) - (when (scan ,scanner (symbol-name symbol)) - ,@body)) + (unless (gethash symbol ,hash) + (when (scan ,scanner (symbol-name symbol)) + (setf (gethash symbol ,hash) t) + ,@body))) ,return-form)))) (defun regex-apropos-list (regex &optional packages &key (case-insensitive t)) @@ -1148,8 +1173,8 @@ meaningful information about a symbol." (push (format nil "[constant]~:[~; value: ~S~]" (boundp symbol) (symbol-value symbol)) output-list)) ((boundp symbol) - (push #+(or LispWorks CLISP) "[variable]" - #-(or LispWorks CLISP) (format nil "[variable] value: ~S" + (push #+(or :lispworks :clisp) "[variable]" + #-(or :lispworks :clisp) (format nil "[variable] value: ~S" (symbol-value symbol)) output-list))) #-(or :cormanlisp :clisp) @@ -1160,7 +1185,7 @@ meaningful information about a symbol." (condition () ;; this seems to be necessary due to some errors I encountered ;; with LispWorks - (format t "~&~S [an error occured while trying to print more info]" symbol)))) + (format t "~&~S [an error occurred while trying to print more info]" symbol)))) (defun regex-apropos (regex &optional packages &key (case-insensitive t)) "Similar to the standard function APROPOS but returns a list of all diff --git a/charset.lisp b/charset.lisp new file mode 100755 index 0000000..11ccb0a --- /dev/null +++ b/charset.lisp @@ -0,0 +1,227 @@ +;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- +;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.4 2008/07/03 08:39:10 edi Exp $ + +;;; A specialized set implementation for characters by Nikodemus Siivola. + +;;; Copyright (c) 2008, Nikodemus Siivola. All rights reserved. +;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved. + +;;; Redistribution and use in source and binary forms, with or without +;;; modification, are permitted provided that the following conditions +;;; are met: + +;;; * Redistributions of source code must retain the above copyright +;;; notice, this list of conditions and the following disclaimer. + +;;; * Redistributions in binary form must reproduce the above +;;; copyright notice, this list of conditions and the following +;;; disclaimer in the documentation and/or other materials +;;; provided with the distribution. + +;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED +;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +(in-package #:cl-ppcre) + +(defconstant +probe-depth+ 3 + "Maximum number of collisions \(for any element) we accept before we +allocate more storage. This is now fixed, but could be made to vary +depending on the size of the storage vector \(e.g. in the range of +1-4). Larger probe-depths mean more collisions are tolerated before +the table grows, but increase the constant factor.") + +(defun make-char-vector (size) + "Returns a vector of size SIZE to hold characters. All elements are +initialized to #\Null except for the first one which is initialized to +#\?." + (declare #.*standard-optimize-settings*) + (declare (type (integer 2 #.(1- array-total-size-limit)) size)) + ;; Since #\Null always hashes to 0, store something else there + ;; initially, and #\Null everywhere else + (let ((result (make-array size + :element-type #-:lispworks 'character #+:lispworks 'lw:simple-char + :initial-element (code-char 0)))) + (setf (char result 0) #\?) + result)) + +(defstruct (charset (:constructor make-charset)) + ;; this is set to 0 when we stop hashing and just use a CHAR-CODE + ;; indexed vector + (depth +probe-depth+ :type fixnum) + ;; the number of characters in this set + (count 0 :type fixnum) + ;; the storage vector + (vector (make-char-vector 12) :type (simple-array character (*)))) + +;; seems to be necessary for some Lisps like ClozureCL +(defmethod make-load-form ((set charset) &optional environment) + (make-load-form-saving-slots set :environment environment)) + +(declaim (inline mix)) +(defun mix (code hash) + "Given a character code CODE and a hash code HASH, computes and +returns the \"next\" hash code. See comments below." + (declare #.*standard-optimize-settings*) + ;; mixing the CHAR-CODE back in at each step makes sure that if two + ;; characters collide (their hashes end up pointing in the same + ;; storage vector index) on one round, they should (hopefully!) not + ;; collide on the next + (sxhash (logand most-positive-fixnum (+ code hash)))) + +(declaim (inline compute-index)) +(defun compute-index (hash vector) + "Computes and returns the index into the vector VECTOR corresponding +to the hash code HASH." + (declare #.*standard-optimize-settings*) + (1+ (mod hash (1- (length vector))))) + +(defun in-charset-p (char set) + "Checks whether the character CHAR is in the charset SET." + (declare #.*standard-optimize-settings*) + (declare (character char) (charset set)) + (let ((vector (charset-vector set)) + (depth (charset-depth set)) + (code (char-code char))) + (declare (fixnum depth)) + ;; As long as the set remains reasonably small, we use non-linear + ;; hashing - the first hash of any character is its CHAR-CODE, and + ;; subsequent hashes are computed by MIX above + (cond ((or + ;; depth 0 is special - each char maps only to its code, + ;; nothing else + (zerop depth) + ;; index 0 is special - only #\Null maps to it, no matter + ;; what the depth is + (zerop code)) + (eq char (char vector code))) + (t + ;; otherwise hash starts out as the character code, but + ;; maps to indexes 1-N + (let ((hash code)) + (tagbody + :retry + (let* ((index (compute-index hash vector)) + (x (char vector index))) + (cond ((eq x (code-char 0)) + ;; empty, no need to probe further + (return-from in-charset-p nil)) + ((eq x char) + ;; got it + (return-from in-charset-p t)) + ((zerop (decf depth)) + ;; max probe depth reached, nothing found + (return-from in-charset-p nil)) + (t + ;; nothing yet, try next place + (setf hash (mix code hash)) + (go :retry)))))))))) + +(defun add-to-charset (char set) + "Adds the character CHAR to the charset SET, extending SET if +necessary. Returns CHAR." + (declare #.*standard-optimize-settings*) + (or (%add-to-charset char set) + (%add-to-charset/expand char set) + (error "Oops, this should not happen...")) + char) + +(defun %add-to-charset (char set) + "Tries to add the character CHAR to the charset SET without +extending it. Returns NIL if this fails." + (declare #.*standard-optimize-settings*) + (declare (character char) (charset set)) + (let ((vector (charset-vector set)) + (depth (charset-depth set)) + (code (char-code char))) + (declare (fixnum depth)) + ;; see comments in IN-CHARSET-P for algorithm + (cond ((or (zerop depth) (zerop code)) + (setf (char vector code) char)) + (t + (let ((hash code)) + (tagbody + :retry + (let* ((index (compute-index hash vector)) + (x (char vector index))) + (cond ((eq x (code-char 0)) + (setf (char vector index) char) + (incf (charset-count set)) + (return-from %add-to-charset char)) + ((eq x char) + (return-from %add-to-charset char)) + ((zerop (decf depth)) + ;; need to expand the table + (return-from %add-to-charset nil)) + (t + (setf hash (mix code hash)) + (go :retry)))))))))) + +(defun %add-to-charset/expand (char set) + "Extends the charset SET and then adds the character CHAR to it." + (declare #.*standard-optimize-settings*) + (declare (character char) (charset set)) + (let* ((old-vector (charset-vector set)) + (new-size (* 2 (length old-vector)))) + (tagbody + :retry + ;; when the table grows large (currently over 1/3 of + ;; CHAR-CODE-LIMIT), we dispense with hashing and just allocate a + ;; storage vector with space for all characters, so that each + ;; character always uses only the CHAR-CODE + (multiple-value-bind (new-depth new-vector) + (if (>= new-size #.(truncate char-code-limit 3)) + (values 0 (make-char-vector char-code-limit)) + (values +probe-depth+ (make-char-vector new-size))) + (setf (charset-depth set) new-depth + (charset-vector set) new-vector) + (flet ((try-add (x) + (unless (%add-to-charset x set) + (assert (not (zerop new-depth))) + (setf new-size (* 2 new-size)) + (go :retry)))) + (try-add char) + (dotimes (i (length old-vector)) + (let ((x (char old-vector i))) + (if (eq x (code-char 0)) + (when (zerop i) + (try-add x)) + (unless (zerop i) + (try-add x)))))))) + t)) + +(defun all-characters (set) + "Returns a list of all characters in the charset SET." + (declare #.*standard-optimize-settings*) + (loop with count = (charset-count set) + with counter = 0 + for code below char-code-limit + for char = (code-char code) + while (< counter count) + when (and char (in-charset-p char set)) + do (incf counter) + and collect char)) + +(defun merge-set (set1 set2 &optional invertedp) + "Returns the \"sum\" of two charsets. This is a destructive +operation on SET1. If INVERTEDP is true, merges the \"inverse\" of +SET2 into SET1 instead." + (declare #.*standard-optimize-settings*) + ;; we only consider values with character codes below + ;; *REGEX-CHAR-CODE-LIMIT* + (loop for code of-type fixnum from 0 below *regex-char-code-limit* + for char = (code-char code) + when (and char (if invertedp + (not (in-charset-p char set2)) + (in-charset-p char set2))) + do (add-to-charset char set1)) + set1) + diff --git a/cl-ppcre-test.asd b/cl-ppcre-test.asd index 271eac4..4690319 100644 --- a/cl-ppcre-test.asd +++ b/cl-ppcre-test.asd @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.asd,v 1.8 2005/11/01 09:51:01 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.asd,v 1.14 2008/06/25 14:04:27 edi Exp $ ;;; This ASDF system definition was kindly provided by Marco Baringer. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -29,13 +29,6 @@ ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -(defpackage #:cl-ppcre-test.system - (:use #:cl - #:asdf)) - -(in-package #:cl-ppcre-test.system) - -(defsystem #:cl-ppcre-test - :version "1.2.12" - :depends-on (#:cl-ppcre) +(asdf:defsystem :cl-ppcre-test + :depends-on (:cl-ppcre) :components ((:file "ppcre-tests"))) diff --git a/cl-ppcre-test.system b/cl-ppcre-test.system index 8e6cea4..f521ff5 100644 --- a/cl-ppcre-test.system +++ b/cl-ppcre-test.system @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.system,v 1.9 2005/04/01 21:29:09 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.system,v 1.11 2007/01/01 23:43:10 edi Exp $ -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions diff --git a/cl-ppcre.asd b/cl-ppcre.asd index 60a8f45..40dd700 100644 --- a/cl-ppcre.asd +++ b/cl-ppcre.asd @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.12 2005/11/01 09:51:01 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.30 2008/07/03 10:06:15 edi Exp $ ;;; This ASDF system definition was kindly provided by Marco Baringer. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -29,17 +29,12 @@ ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -(defpackage #:cl-ppcre.system - (:use #:cl - #:asdf)) - -(in-package #:cl-ppcre.system) - -(defsystem #:cl-ppcre - :version "1.2.12" +(asdf:defsystem :cl-ppcre + :version "1.4.1" :serial t :components ((:file "packages") (:file "specials") + (:file "charset") (:file "util") (:file "errors") #-:use-acl-regexp2-engine diff --git a/cl-ppcre.system b/cl-ppcre.system index 3aed698..5f01431 100644 --- a/cl-ppcre.system +++ b/cl-ppcre.system @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.system,v 1.11 2005/04/01 21:29:09 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.system,v 1.13 2007/01/01 23:43:10 edi Exp $ -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions diff --git a/closures.lisp b/closures.lisp index 3ed8354..7606197 100644 --- a/closures.lisp +++ b/closures.lisp @@ -1,10 +1,10 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.29 2005/05/16 16:29:23 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.36 2008/07/03 07:44:06 edi Exp $ ;;; Here we create the closures which together build the final ;;; scanner. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -63,6 +63,7 @@ START-POS, and tests whether REGEX can match *STRING* at START-POS such that the call to NEXT-FN after the match would succeed.")) (defmethod create-matcher-aux ((seq seq) next-fn) + (declare #.*standard-optimize-settings*) ;; the closure for a SEQ is a chain of closures for the elements of ;; this sequence which call each other in turn; the last closure ;; calls NEXT-FN @@ -72,6 +73,7 @@ such that the call to NEXT-FN after the match would succeed.")) finally (return next-matcher))) (defmethod create-matcher-aux ((alternation alternation) next-fn) + (declare #.*standard-optimize-settings*) ;; first create closures for all alternations of ALTERNATION (let ((all-matchers (mapcar #'(lambda (choice) (create-matcher-aux choice next-fn)) @@ -84,6 +86,7 @@ such that the call to NEXT-FN after the match would succeed.")) thereis (funcall (the function matcher) start-pos))))) (defmethod create-matcher-aux ((register register) next-fn) + (declare #.*standard-optimize-settings*) ;; the position of this REGISTER within the whole regex; we start to ;; count at 0 (let ((num (num register))) @@ -122,6 +125,7 @@ such that the call to NEXT-FN after the match would succeed.")) next-pos))))))) (defmethod create-matcher-aux ((lookahead lookahead) next-fn) + (declare #.*standard-optimize-settings*) ;; create a closure which just checks for the inner regex and ;; doesn't care about NEXT-FN (let ((test-matcher (create-matcher-aux (regex lookahead) #'identity))) @@ -139,6 +143,7 @@ such that the call to NEXT-FN after the match would succeed.")) (funcall next-fn start-pos)))))) (defmethod create-matcher-aux ((lookbehind lookbehind) next-fn) + (declare #.*standard-optimize-settings*) (let ((len (len lookbehind)) ;; create a closure which just checks for the inner regex and ;; doesn't care about NEXT-FN @@ -150,14 +155,14 @@ such that the call to NEXT-FN after the match would succeed.")) ;; far enough from the start of *STRING*), then call NEXT-FN (lambda (start-pos) (declare (type fixnum start-pos)) - (and (>= (- start-pos *start-pos*) len) + (and (>= (- start-pos (or *real-start-pos* *start-pos*)) len) (funcall test-matcher (- start-pos len)) (funcall next-fn start-pos))) ;; negative look-behind: check failure of inner regex (if we're ;; far enough from the start of *STRING*), then call NEXT-FN (lambda (start-pos) (declare (type fixnum start-pos)) - (and (or (< start-pos len) + (and (or (< (- start-pos (or *real-start-pos* *start-pos*)) len) (not (funcall test-matcher (- start-pos len)))) (funcall next-fn start-pos)))))) @@ -172,55 +177,54 @@ against CHR-EXPR." (subst new '(char-class-test) body :test #'equalp))) `(let* ((,%char-class ,char-class) - (hash (hash ,%char-class)) - (count (if hash - (hash-table-count hash) + (set (charset ,%char-class)) + (count (if set + (charset-count set) most-positive-fixnum)) - ;; collect a list of "all" characters in the hash if + ;; collect a list of "all" characters in the set if ;; there aren't more than two - (key-list (if (<= count 2) - (loop for chr being the hash-keys of hash - collect chr) - nil)) + (all-chars (if (<= count 2) + (all-characters set) + nil)) downcasedp) (declare (type fixnum count)) - ;; check if we can partition the hash into three ranges (or + ;; check if we can partition the charset into three ranges (or ;; less) (multiple-value-bind (min1 max1 min2 max2 min3 max3) - (create-ranges-from-hash hash) + (create-ranges-from-set set) ;; if that didn't work and CHAR-CLASS is case-insensitive we ;; try it again with every character downcased (when (and (not min1) (case-insensitive-p ,%char-class)) (multiple-value-setq (min1 max1 min2 max2 min3 max3) - (create-ranges-from-hash hash :downcasep t)) + (create-ranges-from-set set :downcasep t)) (setq downcasedp t)) (cond ((= count 1) - ;; hash contains exactly one character so we just + ;; charset contains exactly one character so we just ;; check for this single character; (note that this ;; actually can't happen because this case is ;; optimized away in CONVERT already...) - (let ((chr1 (first key-list))) + (let ((chr1 (first all-chars))) ,@(substitute-char-class-tester `(char= ,chr-expr chr1)))) ((= count 2) - ;; hash contains exactly two characters - (let ((chr1 (first key-list)) - (chr2 (second key-list))) + ;; set contains exactly two characters + (let ((chr1 (first all-chars)) + (chr2 (second all-chars))) ,@(substitute-char-class-tester `(let ((chr ,chr-expr)) (or (char= chr chr1) (char= chr chr2)))))) ((word-char-class-p ,%char-class) - ;; special-case: hash is \w, \W, [\w], [\W] or + ;; special-case: set is \w, \W, [\w], [\W] or ;; something equivalent ,@(substitute-char-class-tester `(word-char-p ,chr-expr))) ((= count *regex-char-code-limit*) ;; according to the ANSI standard we might have all - ;; possible characters in the hash even if it - ;; doesn't contain CHAR-CODE-LIMIT characters but - ;; this doesn't seem to be the case for current + ;; possible characters in the set even if it doesn't + ;; contain CHAR-CODE-LIMIT characters but this + ;; doesn't seem to be the case for current ;; implementations (also note that this optimization ;; implies that you must not have characters with ;; character codes beyond *REGEX-CHAR-CODE-LIMIT* in @@ -264,17 +268,13 @@ against CHR-EXPR." `(char<= min1 ,chr-expr max1))) (t ;; the general case; note that most of the above - ;; "optimizations" are based on experiences and - ;; benchmarks with CMUCL - if you're really - ;; concerned with speed you might find out that the - ;; general case is almost always the best one for - ;; other implementations (because the speed of their - ;; hash-table access in relation to other operations - ;; might be better than in CMUCL) + ;; "optimizations" are based on early (2002) + ;; experiences and benchmarks with CMUCL ,@(substitute-char-class-tester - `(gethash ,chr-expr hash))))))))) + `(in-charset-p ,chr-expr set))))))))) (defmethod create-matcher-aux ((char-class char-class) next-fn) + (declare #.*standard-optimize-settings*) (declare (type function next-fn)) ;; insert a test against the current character within *STRING* (insert-char-class-tester (char-class (schar *string* start-pos)) @@ -291,6 +291,7 @@ against CHR-EXPR." (funcall next-fn (1+ start-pos))))))) (defmethod create-matcher-aux ((str str) next-fn) + (declare #.*standard-optimize-settings*) (declare (type fixnum *end-string-pos*) (type function next-fn) ;; this special value is set by CREATE-SCANNER when the @@ -405,6 +406,7 @@ against CHR-EXPR." (word-char-p (schar *string* start-pos))))))) (defmethod create-matcher-aux ((word-boundary word-boundary) next-fn) + (declare #.*standard-optimize-settings*) (declare (type function next-fn)) (if (negatedp word-boundary) (lambda (start-pos) @@ -415,6 +417,7 @@ against CHR-EXPR." (funcall next-fn start-pos))))) (defmethod create-matcher-aux ((everything everything) next-fn) + (declare #.*standard-optimize-settings*) (declare (type function next-fn)) (if (single-line-p everything) ;; closure for single-line-mode: we really match everything, so we @@ -432,11 +435,12 @@ against CHR-EXPR." (funcall next-fn (1+ start-pos)))))) (defmethod create-matcher-aux ((anchor anchor) next-fn) + (declare #.*standard-optimize-settings*) (declare (type function next-fn)) (let ((startp (startp anchor)) (multi-line-p (multi-line-p anchor))) (cond ((no-newline-p anchor) - ;; this must be and end-anchor and it must be modeless, so + ;; this must be an end-anchor and it must be modeless, so ;; we just have to check whether START-POS equals ;; *END-POS* (lambda (start-pos) @@ -486,6 +490,7 @@ against CHR-EXPR." (funcall next-fn start-pos))))))) (defmethod create-matcher-aux ((back-reference back-reference) next-fn) + (declare #.*standard-optimize-settings*) (declare (type function next-fn)) ;; the position of the corresponding REGISTER within the whole ;; regex; we start to count at 0 @@ -525,6 +530,7 @@ against CHR-EXPR." (funcall next-fn next-pos))))))))) (defmethod create-matcher-aux ((branch branch) next-fn) + (declare #.*standard-optimize-settings*) (let* ((test (test branch)) (then-matcher (create-matcher-aux (then-regex branch) next-fn)) (else-matcher (create-matcher-aux (else-regex branch) next-fn))) @@ -545,6 +551,7 @@ against CHR-EXPR." (funcall else-matcher start-pos)))))))) (defmethod create-matcher-aux ((standalone standalone) next-fn) + (declare #.*standard-optimize-settings*) (let ((inner-matcher (create-matcher-aux (regex standalone) #'identity))) (declare (type function next-fn inner-matcher)) (lambda (start-pos) @@ -553,6 +560,7 @@ against CHR-EXPR." (funcall next-fn next-pos)))))) (defmethod create-matcher-aux ((filter filter) next-fn) + (declare #.*standard-optimize-settings*) (let ((fn (fn filter))) (lambda (start-pos) (let ((next-pos (funcall fn start-pos))) @@ -560,5 +568,6 @@ against CHR-EXPR." (funcall next-fn next-pos)))))) (defmethod create-matcher-aux ((void void) next-fn) + (declare #.*standard-optimize-settings*) ;; optimize away VOIDs: don't create a closure, just return NEXT-FN next-fn) diff --git a/convert.lisp b/convert.lisp index c512877..bf41f13 100644 --- a/convert.lisp +++ b/convert.lisp @@ -1,11 +1,11 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.22 2005/04/01 21:29:09 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.29 2008/07/03 07:44:06 edi Exp $ ;;; Here the parse tree is converted into its internal representation ;;; using REGEX objects. At the same time some optimizations are ;;; already applied. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -70,12 +70,12 @@ the special FLAGS list." (otherwise (signal-ppcre-syntax-error "Unknown flag token ~A" token)))) -(defun add-range-to-hash (hash from to) +(defun add-range-to-set (set from to) (declare #.*standard-optimize-settings*) (declare (special flags)) - "Adds all characters from character FROM to character TO (inclusive) -to the char class hash HASH. Does the right thing with respect to -case-(in)sensitivity as specified by the special variable FLAGS." + "Adds all characters from character FROM to character TO +\(inclusive) to the charset SET. Does the right thing with respect to +case-\(in)sensitivity as specified by the special variable FLAGS." (let ((from-code (char-code from)) (to-code (char-code to))) (when (> from-code to-code) @@ -83,57 +83,55 @@ case-(in)sensitivity as specified by the special variable FLAGS." from to)) (cond ((case-insensitive-mode-p flags) (loop for code from from-code to to-code - for chr = (code-char code) - do (setf (gethash (char-upcase chr) hash) t - (gethash (char-downcase chr) hash) t))) + for char = (code-char code) + do (add-to-charset (char-upcase char) set) + (add-to-charset (char-downcase char) set))) (t (loop for code from from-code to to-code - do (setf (gethash (code-char code) hash) t)))) - hash)) + do (add-to-charset (code-char code) set)))) + set)) -(defun convert-char-class-to-hash (list) +(defun convert-char-class-to-charset (list) (declare #.*standard-optimize-settings*) - "Combines all items in LIST into one char class hash and returns it. -Items can be single characters, character ranges like \(:RANGE #\\A -#\\E), or special character classes like :DIGIT-CLASS. Does the right -thing with respect to case-\(in)sensitivity as specified by the -special variable FLAGS." - (loop with hash = (make-hash-table :size (ceiling (expt *regex-char-code-limit* (/ 1 4))) - :rehash-size (float (expt *regex-char-code-limit* (/ 1 4))) - :rehash-threshold #-genera 1.0 #+genera 0.99) + "Combines all items in LIST into one charset and returns it. Items +can be single characters, character ranges like \(:RANGE #\\A #\\E), +or special character classes like :DIGIT-CLASS. Does the right thing +with respect to case-\(in)sensitivity as specified by the special +variable FLAGS." + (loop with set = (make-charset) for item in list if (characterp item) - ;; treat a single character C like a range (:RANGE C C) - do (add-range-to-hash hash item item) + ;; treat a single character C like a range (:RANGE C C) + do (add-range-to-set set item item) else if (symbolp item) - ;; special character classes - do (setq hash - (case item - ((:digit-class) - (merge-hash hash +digit-hash+)) - ((:non-digit-class) - (merge-inverted-hash hash +digit-hash+)) - ((:whitespace-char-class) - (merge-hash hash +whitespace-char-hash+)) - ((:non-whitespace-char-class) - (merge-inverted-hash hash +whitespace-char-hash+)) - ((:word-char-class) - (merge-hash hash +word-char-hash+)) - ((:non-word-char-class) - (merge-inverted-hash hash +word-char-hash+)) - (otherwise - (signal-ppcre-syntax-error - "Unknown symbol ~A in character class" - item)))) + ;; special character classes + do (setq set + (case item + ((:digit-class) + (merge-set set +digit-set+)) + ((:non-digit-class) + (merge-set set +digit-set+ t)) + ((:whitespace-char-class) + (merge-set set +whitespace-char-set+)) + ((:non-whitespace-char-class) + (merge-set set +whitespace-char-set+ t)) + ((:word-char-class) + (merge-set set +word-char-set+)) + ((:non-word-char-class) + (merge-set set +word-char-set+ t)) + (otherwise + (signal-ppcre-syntax-error + "Unknown symbol ~A in character class" + item)))) else if (and (consp item) (eq (car item) :range)) - ;; proper ranges - do (add-range-to-hash hash - (second item) - (third item)) + ;; proper ranges + do (add-range-to-set set + (second item) + (third item)) else do (signal-ppcre-syntax-error "Unknown item ~A in char-class list" item) - finally (return hash))) + finally (return set))) (defun maybe-split-repetition (regex greedyp @@ -207,7 +205,7 @@ the same name." ;; During the conversion of the parse tree we keep track of the start ;; of the parse tree in the special variable STARTS-WITH which'll ;; either hold a STR object or an EVERYTHING object. The latter is the -;; case if the regex starts with ".*" which implicitely anchors the +;; case if the regex starts with ".*" which implicitly anchors the ;; regex at the start (perhaps modulo #\Newline). (defun maybe-accumulate (str) @@ -267,7 +265,7 @@ NIL or a STR object of the same case mode. Always returns NIL." (defun convert-aux (parse-tree) (declare #.*standard-optimize-settings*) - (declare (special flags reg-num accumulate-start-p starts-with max-back-ref)) + (declare (special flags reg-num reg-names accumulate-start-p starts-with max-back-ref)) "Converts the parse tree PARSE-TREE into a REGEX object and returns it. Will also @@ -275,260 +273,292 @@ Will also - accumulate strings or EVERYTHING objects into the special variable STARTS-WITH, - keep track of all registers seen in the special variable REG-NUM, + - keep track of all named registers seen in the special variable REG-NAMES - keep track of the highest backreference seen in the special variable MAX-BACK-REF, - maintain and adher to the currently applicable modifiers in the special variable FLAGS, and - maybe even wash your car..." (cond ((consp parse-tree) - (case (first parse-tree) - ;; (:SEQUENCE {}*) - ((:sequence) + (case (first parse-tree) + ;; (:SEQUENCE {}*) + ((:sequence) + (cond ((cddr parse-tree) + ;; this is essentially like + ;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE)) + ;; but we don't cons a new list + (loop for parse-tree-rest on (rest parse-tree) + while parse-tree-rest + do (setf (car parse-tree-rest) + (convert-aux (car parse-tree-rest)))) + (make-instance 'seq + :elements (rest parse-tree))) + (t (convert-aux (second parse-tree))))) + ;; (:GROUP {}*) + ;; this is a syntactical construct equivalent to :SEQUENCE + ;; intended to keep the effect of modifiers local + ((:group) + ;; make a local copy of FLAGS and shadow the global + ;; value while we descend into the enclosed regexes + (let ((flags (copy-list flags))) + (declare (special flags)) (cond ((cddr parse-tree) - ;; this is essentially like - ;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE)) - ;; but we don't cons a new list - (loop for parse-tree-rest on (rest parse-tree) - while parse-tree-rest - do (setf (car parse-tree-rest) - (convert-aux (car parse-tree-rest)))) - (make-instance 'seq - :elements (rest parse-tree))) - (t (convert-aux (second parse-tree))))) - ;; (:GROUP {}*) - ;; this is a syntactical construct equivalent to :SEQUENCE - ;; intended to keep the effect of modifiers local - ((:group) - ;; make a local copy of FLAGS and shadow the global - ;; value while we descend into the enclosed regexes - (let ((flags (copy-list flags))) - (declare (special flags)) - (cond ((cddr parse-tree) - (loop for parse-tree-rest on (rest parse-tree) - while parse-tree-rest - do (setf (car parse-tree-rest) - (convert-aux (car parse-tree-rest)))) - (make-instance 'seq - :elements (rest parse-tree))) - (t (convert-aux (second parse-tree)))))) - ;; (:ALTERNATION {}*) - ((:alternation) - ;; we must stop accumulating objects into STARTS-WITH - ;; once we reach an alternation - (setq accumulate-start-p nil) - (loop for parse-tree-rest on (rest parse-tree) - while parse-tree-rest - do (setf (car parse-tree-rest) - (convert-aux (car parse-tree-rest)))) - (make-instance 'alternation - :choices (rest parse-tree))) - ;; (:BRANCH ) - ;; must be look-ahead, look-behind or number; - ;; if is an alternation it must have one or two - ;; choices - ((:branch) - (setq accumulate-start-p nil) - (let* ((test-candidate (second parse-tree)) - (test (cond ((numberp test-candidate) - (when (zerop (the fixnum test-candidate)) - (signal-ppcre-syntax-error - "Register 0 doesn't exist: ~S" - parse-tree)) - (1- (the fixnum test-candidate))) - (t (convert-aux test-candidate)))) - (alternations (convert-aux (third parse-tree)))) - (when (and (not (numberp test)) - (not (typep test 'lookahead)) - (not (typep test 'lookbehind))) - (signal-ppcre-syntax-error - "Branch test must be look-ahead, look-behind or number: ~S" - parse-tree)) - (typecase alternations - (alternation - (case (length (choices alternations)) - ((0) - (signal-ppcre-syntax-error "No choices in branch: ~S" - parse-tree)) - ((1) - (make-instance 'branch - :test test - :then-regex (first - (choices alternations)))) - ((2) - (make-instance 'branch - :test test - :then-regex (first - (choices alternations)) - :else-regex (second - (choices alternations)))) - (otherwise - (signal-ppcre-syntax-error - "Too much choices in branch: ~S" - parse-tree)))) - (t + (loop for parse-tree-rest on (rest parse-tree) + while parse-tree-rest + do (setf (car parse-tree-rest) + (convert-aux (car parse-tree-rest)))) + (make-instance 'seq + :elements (rest parse-tree))) + (t (convert-aux (second parse-tree)))))) + ;; (:ALTERNATION {}*) + ((:alternation) + ;; we must stop accumulating objects into STARTS-WITH + ;; once we reach an alternation + (setq accumulate-start-p nil) + (loop for parse-tree-rest on (rest parse-tree) + while parse-tree-rest + do (setf (car parse-tree-rest) + (convert-aux (car parse-tree-rest)))) + (make-instance 'alternation + :choices (rest parse-tree))) + ;; (:BRANCH ) + ;; must be look-ahead, look-behind or number; + ;; if is an alternation it must have one or two + ;; choices + ((:branch) + (setq accumulate-start-p nil) + (let* ((test-candidate (second parse-tree)) + (test (cond ((numberp test-candidate) + (when (zerop (the fixnum test-candidate)) + (signal-ppcre-syntax-error + "Register 0 doesn't exist: ~S" + parse-tree)) + (1- (the fixnum test-candidate))) + (t (convert-aux test-candidate)))) + (alternations (convert-aux (third parse-tree)))) + (when (and (not (numberp test)) + (not (typep test 'lookahead)) + (not (typep test 'lookbehind))) + (signal-ppcre-syntax-error + "Branch test must be look-ahead, look-behind or number: ~S" + parse-tree)) + (typecase alternations + (alternation + (case (length (choices alternations)) + ((0) + (signal-ppcre-syntax-error "No choices in branch: ~S" + parse-tree)) + ((1) (make-instance 'branch :test test - :then-regex alternations))))) - ;; (:POSITIVE-LOOKAHEAD|:NEGATIVE-LOOKAHEAD ) - ((:positive-lookahead :negative-lookahead) - ;; keep the effect of modifiers local to the enclosed - ;; regex and stop accumulating into STARTS-WITH - (setq accumulate-start-p nil) - (let ((flags (copy-list flags))) - (declare (special flags)) - (make-instance 'lookahead - :regex (convert-aux (second parse-tree)) - :positivep (eq (first parse-tree) - :positive-lookahead)))) - ;; (:POSITIVE-LOOKBEHIND|:NEGATIVE-LOOKBEHIND ) - ((:positive-lookbehind :negative-lookbehind) - ;; keep the effect of modifiers local to the enclosed - ;; regex and stop accumulating into STARTS-WITH - (setq accumulate-start-p nil) - (let* ((flags (copy-list flags)) - (regex (convert-aux (second parse-tree))) - (len (regex-length regex))) - (declare (special flags)) - ;; lookbehind assertions must be of fixed length - (unless len - (signal-ppcre-syntax-error - "Variable length look-behind not implemented (yet): ~S" - parse-tree)) - (make-instance 'lookbehind - :regex regex - :positivep (eq (first parse-tree) - :positive-lookbehind) - :len len))) - ;; (:GREEDY-REPETITION|:NON-GREEDY-REPETITION ) - ((:greedy-repetition :non-greedy-repetition) - ;; remember the value of ACCUMULATE-START-P upon entering - (let ((local-accumulate-start-p accumulate-start-p)) - (let ((minimum (second parse-tree)) - (maximum (third parse-tree))) - (declare (type fixnum minimum)) - (declare (type (or null fixnum) maximum)) - (unless (and maximum - (= 1 minimum maximum)) - ;; set ACCUMULATE-START-P to NIL for the rest of - ;; the conversion because we can't continue to - ;; accumulate inside as well as after a proper - ;; repetition - (setq accumulate-start-p nil)) - (let* (reg-seen - (regex (convert-aux (fourth parse-tree))) - (min-len (regex-min-length regex)) - (greedyp (eq (first parse-tree) :greedy-repetition)) - (length (regex-length regex))) - ;; note that this declaration already applies to - ;; the call to CONVERT-AUX above - (declare (special reg-seen)) - (when (and local-accumulate-start-p - (not starts-with) - (zerop minimum) - (not maximum)) - ;; if this repetition is (equivalent to) ".*" - ;; and if we're at the start of the regex we - ;; remember it for ADVANCE-FN (see the SCAN - ;; function) - (setq starts-with (everythingp regex))) - (if (or (not reg-seen) - (not greedyp) - (not length) - (zerop length) - (and maximum (= minimum maximum))) - ;; the repetition doesn't enclose a register, or - ;; it's not greedy, or we can't determine it's - ;; (inner) length, or the length is zero, or the - ;; number of repetitions is fixed; in all of - ;; these cases we don't bother to optimize - (maybe-split-repetition regex - greedyp - minimum - maximum - min-len - length - reg-seen) - ;; otherwise we make a transformation that looks - ;; roughly like one of - ;; * -> (?:*)? - ;; + -> * - ;; where the trick is that as much as possible - ;; registers from are removed in - ;; - (let* (reg-seen ; new instance for REMOVE-REGISTERS - (remove-registers-p t) - (inner-regex (remove-registers regex)) - (inner-repetition - ;; this is the "" part - (maybe-split-repetition inner-regex - ;; always greedy - t - ;; reduce minimum by 1 - ;; unless it's already 0 - (if (zerop minimum) - 0 - (1- minimum)) - ;; reduce maximum by 1 - ;; unless it's NIL - (and maximum - (1- maximum)) - min-len - length - reg-seen)) - (inner-seq - ;; this is the "*" part - (make-instance 'seq - :elements (list inner-repetition - regex)))) - ;; note that this declaration already applies - ;; to the call to REMOVE-REGISTERS above - (declare (special remove-registers-p reg-seen)) - ;; wrap INNER-SEQ with a greedy - ;; {0,1}-repetition (i.e. "?") if necessary - (if (plusp minimum) - inner-seq - (maybe-split-repetition inner-seq - t - 0 - 1 - min-len - nil - t)))))))) - ;; (:REGISTER ) - ((:register) - ;; keep the effect of modifiers local to the enclosed - ;; regex; also, assign the current value of REG-NUM to - ;; the corresponding slot of the REGISTER object and - ;; increase this counter afterwards - (let ((flags (copy-list flags)) - (stored-reg-num reg-num)) - (declare (special flags reg-seen)) - (setq reg-seen t) - (incf (the fixnum reg-num)) - (make-instance 'register - :regex (convert-aux (second parse-tree)) - :num stored-reg-num))) - ;; (:FILTER &optional ) - ((:filter) - ;; stop accumulating into STARTS-WITH - (setq accumulate-start-p nil) - (make-instance 'filter - :fn (second parse-tree) - :len (third parse-tree))) - ;; (:STANDALONE ) - ((:standalone) - ;; stop accumulating into STARTS-WITH - (setq accumulate-start-p nil) - ;; keep the effect of modifiers local to the enclosed - ;; regex - (let ((flags (copy-list flags))) - (declare (special flags)) - (make-instance 'standalone - :regex (convert-aux (second parse-tree))))) - ;; (:BACK-REFERENCE ) - ((:back-reference) - (let ((backref-number (second parse-tree))) - (declare (type fixnum backref-number)) + :then-regex (first + (choices alternations)))) + ((2) + (make-instance 'branch + :test test + :then-regex (first + (choices alternations)) + :else-regex (second + (choices alternations)))) + (otherwise + (signal-ppcre-syntax-error + "Too much choices in branch: ~S" + parse-tree)))) + (t + (make-instance 'branch + :test test + :then-regex alternations))))) + ;; (:POSITIVE-LOOKAHEAD|:NEGATIVE-LOOKAHEAD ) + ((:positive-lookahead :negative-lookahead) + ;; keep the effect of modifiers local to the enclosed + ;; regex and stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (let ((flags (copy-list flags))) + (declare (special flags)) + (make-instance 'lookahead + :regex (convert-aux (second parse-tree)) + :positivep (eq (first parse-tree) + :positive-lookahead)))) + ;; (:POSITIVE-LOOKBEHIND|:NEGATIVE-LOOKBEHIND ) + ((:positive-lookbehind :negative-lookbehind) + ;; keep the effect of modifiers local to the enclosed + ;; regex and stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (let* ((flags (copy-list flags)) + (regex (convert-aux (second parse-tree))) + (len (regex-length regex))) + (declare (special flags)) + ;; lookbehind assertions must be of fixed length + (unless len + (signal-ppcre-syntax-error + "Variable length look-behind not implemented (yet): ~S" + parse-tree)) + (make-instance 'lookbehind + :regex regex + :positivep (eq (first parse-tree) + :positive-lookbehind) + :len len))) + ;; (:GREEDY-REPETITION|:NON-GREEDY-REPETITION ) + ((:greedy-repetition :non-greedy-repetition) + ;; remember the value of ACCUMULATE-START-P upon entering + (let ((local-accumulate-start-p accumulate-start-p)) + (let ((minimum (second parse-tree)) + (maximum (third parse-tree))) + (declare (type fixnum minimum)) + (declare (type (or null fixnum) maximum)) + (unless (and maximum + (= 1 minimum maximum)) + ;; set ACCUMULATE-START-P to NIL for the rest of + ;; the conversion because we can't continue to + ;; accumulate inside as well as after a proper + ;; repetition + (setq accumulate-start-p nil)) + (let* (reg-seen + (regex (convert-aux (fourth parse-tree))) + (min-len (regex-min-length regex)) + (greedyp (eq (first parse-tree) :greedy-repetition)) + (length (regex-length regex))) + ;; note that this declaration already applies to + ;; the call to CONVERT-AUX above + (declare (special reg-seen)) + (when (and local-accumulate-start-p + (not starts-with) + (zerop minimum) + (not maximum)) + ;; if this repetition is (equivalent to) ".*" + ;; and if we're at the start of the regex we + ;; remember it for ADVANCE-FN (see the SCAN + ;; function) + (setq starts-with (everythingp regex))) + (if (or (not reg-seen) + (not greedyp) + (not length) + (zerop length) + (and maximum (= minimum maximum))) + ;; the repetition doesn't enclose a register, or + ;; it's not greedy, or we can't determine it's + ;; (inner) length, or the length is zero, or the + ;; number of repetitions is fixed; in all of + ;; these cases we don't bother to optimize + (maybe-split-repetition regex + greedyp + minimum + maximum + min-len + length + reg-seen) + ;; otherwise we make a transformation that looks + ;; roughly like one of + ;; * -> (?:*)? + ;; + -> * + ;; where the trick is that as much as possible + ;; registers from are removed in + ;; + (let* (reg-seen ; new instance for REMOVE-REGISTERS + (remove-registers-p t) + (inner-regex (remove-registers regex)) + (inner-repetition + ;; this is the "" part + (maybe-split-repetition inner-regex + ;; always greedy + t + ;; reduce minimum by 1 + ;; unless it's already 0 + (if (zerop minimum) + 0 + (1- minimum)) + ;; reduce maximum by 1 + ;; unless it's NIL + (and maximum + (1- maximum)) + min-len + length + reg-seen)) + (inner-seq + ;; this is the "*" part + (make-instance 'seq + :elements (list inner-repetition + regex)))) + ;; note that this declaration already applies + ;; to the call to REMOVE-REGISTERS above + (declare (special remove-registers-p reg-seen)) + ;; wrap INNER-SEQ with a greedy + ;; {0,1}-repetition (i.e. "?") if necessary + (if (plusp minimum) + inner-seq + (maybe-split-repetition inner-seq + t + 0 + 1 + min-len + nil + t)))))))) + ;; (:REGISTER ) + ;; (:NAMED-REGISTER ) + ((:register :named-register) + ;; keep the effect of modifiers local to the enclosed + ;; regex; also, assign the current value of REG-NUM to + ;; the corresponding slot of the REGISTER object and + ;; increase this counter afterwards; for named register + ;; update REG-NAMES and set the corresponding name slot + ;; of the REGISTER object too + (let ((flags (copy-list flags)) + (stored-reg-num reg-num) + (reg-name (when (eq (first parse-tree) :named-register) + (copy-seq (second parse-tree))))) + (declare (special flags reg-seen named-reg-seen)) + (setq reg-seen t) + (when reg-name + (setq named-reg-seen t)) + (incf (the fixnum reg-num)) + (push reg-name + reg-names) + (make-instance 'register + :regex (convert-aux (if (eq (first parse-tree) :named-register) + (third parse-tree) + (second parse-tree))) + :num stored-reg-num + :name reg-name))) + ;; (:FILTER &optional ) + ((:filter) + ;; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (make-instance 'filter + :fn (second parse-tree) + :len (third parse-tree))) + ;; (:STANDALONE ) + ((:standalone) + ;; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + ;; keep the effect of modifiers local to the enclosed + ;; regex + (let ((flags (copy-list flags))) + (declare (special flags)) + (make-instance 'standalone + :regex (convert-aux (second parse-tree))))) + ;; (:BACK-REFERENCE ) + ;; (:BACK-REFERENCE ) + ((:back-reference) + (locally (declare (special reg-names reg-num)) + (let* ((backref-name (and (stringp (second parse-tree)) + (second parse-tree))) + (referred-regs + (when backref-name + ;; find which register corresponds to the given name + ;; we have to deal with case where several registers share + ;; the same name and collect their respective numbers + (loop + for name in reg-names + for reg-index from 0 + when (string= name backref-name) + ;; NOTE: REG-NAMES stores register names in reversed order + ;; REG-NUM contains number of (any) registers seen so far + ;; 1- will be done later + collect (- reg-num reg-index)))) + ;; store the register number for the simple case + (backref-number (or (first referred-regs) + (second parse-tree)))) + (declare (type (or fixnum null) backref-number)) (when (or (not (typep backref-number 'fixnum)) (<= backref-number 0)) (signal-ppcre-syntax-error @@ -539,191 +569,204 @@ Will also (setq accumulate-start-p nil max-back-ref (max (the fixnum max-back-ref) backref-number)) - (make-instance 'back-reference - ;; we start counting from 0 internally - :num (1- backref-number) - :case-insensitive-p (case-insensitive-mode-p - flags)))) - ;; (:CHAR-CLASS|:INVERTED-CHAR-CLASS {}*) - ;; where item is one of - ;; - a character - ;; - a character range: (:RANGE ) - ;; - a special char class symbol like :DIGIT-CHAR-CLASS - ((:char-class :inverted-char-class) - ;; first create the hash-table and some auxiliary values - (let* (hash - hash-keys - (count most-positive-fixnum) - (item-list (rest parse-tree)) - (invertedp (eq (first parse-tree) :inverted-char-class)) - word-char-class-p) - (cond ((every (lambda (item) (eq item :word-char-class)) - item-list) - ;; treat "[\\w]" like "\\w" - (setq word-char-class-p t)) - ((every (lambda (item) (eq item :non-word-char-class)) - item-list) - ;; treat "[\\W]" like "\\W" - (setq word-char-class-p t) - (setq invertedp (not invertedp))) - (t - (setq hash (convert-char-class-to-hash item-list) - count (hash-table-count hash)) - (when (<= count 2) - ;; collect the hash-table keys into a list if - ;; COUNT is smaller than 3 - (setq hash-keys - (loop for chr being the hash-keys of hash - collect chr))))) - (cond ((and (not invertedp) - (= count 1)) - ;; convert one-element hash table into a STR - ;; object and try to accumulate into - ;; STARTS-WITH - (let ((str (make-instance 'str - :str (string - (first hash-keys)) - :case-insensitive-p nil))) - (maybe-accumulate str) - str)) - ((and (not invertedp) - (= count 2) - (char-equal (first hash-keys) (second hash-keys))) - ;; convert two-element hash table into a - ;; case-insensitive STR object and try to - ;; accumulate into STARTS-WITH if the two - ;; characters are CHAR-EQUAL - (let ((str (make-instance 'str - :str (string - (first hash-keys)) - :case-insensitive-p t))) - (maybe-accumulate str) - str)) - (t - ;; the general case; stop accumulating into STARTS-WITH - (setq accumulate-start-p nil) - (make-instance 'char-class - :hash hash - :case-insensitive-p - (case-insensitive-mode-p flags) - :invertedp invertedp - :word-char-class-p word-char-class-p))))) - ;; (:FLAGS {}*) - ;; where flag is a modifier symbol like :CASE-INSENSITIVE-P - ((:flags) - ;; set/unset the flags corresponding to the symbols - ;; following :FLAGS - (mapc #'set-flag (rest parse-tree)) - ;; we're only interested in the side effect of - ;; setting/unsetting the flags and turn this syntactical - ;; construct into a VOID object which'll be optimized - ;; away when creating the matcher - (make-instance 'void)) - (otherwise - (signal-ppcre-syntax-error - "Unknown token ~A in parse-tree" - (first parse-tree))))) - ((or (characterp parse-tree) (stringp parse-tree)) - ;; turn characters or strings into STR objects and try to - ;; accumulate into STARTS-WITH - (let ((str (make-instance 'str - :str (string parse-tree) + (flet ((make-back-ref (backref-number) + (make-instance 'back-reference + ;; we start counting from 0 internally + :num (1- backref-number) + :case-insensitive-p (case-insensitive-mode-p flags) + ;; backref-name is NIL or string, safe to copy + :name (copy-seq backref-name)))) + (cond + ((cdr referred-regs) + ;; several registers share the same name + ;; we will try to match any of them, starting + ;; with the most recent first + ;; alternation is used to accomplish matching + (make-instance 'alternation + :choices (loop + for reg-index in referred-regs + collect (make-back-ref reg-index)))) + ;; simple case - backref corresponds to only one register + (t + (make-back-ref backref-number))))))) + ;; (:REGEX ) + ((:regex) + (let ((regex (second parse-tree))) + (convert-aux (parse-string regex)))) + ;; (:CHAR-CLASS|:INVERTED-CHAR-CLASS {}*) + ;; where item is one of + ;; - a character + ;; - a character range: (:RANGE ) + ;; - a special char class symbol like :DIGIT-CHAR-CLASS + ((:char-class :inverted-char-class) + ;; first create the charset and some auxiliary values + (let* (set set-contents + (count most-positive-fixnum) + (item-list (rest parse-tree)) + (invertedp (eq (first parse-tree) :inverted-char-class)) + word-char-class-p) + (cond ((every (lambda (item) (eq item :word-char-class)) + item-list) + ;; treat "[\\w]" like "\\w" + (setq word-char-class-p t)) + ((every (lambda (item) (eq item :non-word-char-class)) + item-list) + ;; treat "[\\W]" like "\\W" + (setq word-char-class-p t) + (setq invertedp (not invertedp))) + (t + (setq set (convert-char-class-to-charset item-list) + count (charset-count set)) + (when (<= count 2) + ;; collect the contents of SET into a list if + ;; COUNT is smaller than 3 + (setq set-contents (all-characters set))))) + (cond ((and (not invertedp) + (= count 1)) + ;; convert one-element charset into a STR object + ;; and try to accumulate into STARTS-WITH + (let ((str (make-instance 'str + :str (string (first set-contents)) + :case-insensitive-p nil))) + (maybe-accumulate str) + str)) + ((and (not invertedp) + (= count 2) + (char-equal (first set-contents) (second set-contents))) + ;; convert two-element charset into a + ;; case-insensitive STR object and try to + ;; accumulate into STARTS-WITH if the two + ;; characters are CHAR-EQUAL + (let ((str (make-instance 'str + :str (string (first set-contents)) + :case-insensitive-p t))) + (maybe-accumulate str) + str)) + (t + ;; the general case; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (make-instance 'char-class + :charset set :case-insensitive-p - (case-insensitive-mode-p flags)))) - (maybe-accumulate str) - str)) + (case-insensitive-mode-p flags) + :invertedp invertedp + :word-char-class-p word-char-class-p))))) + ;; (:FLAGS {}*) + ;; where flag is a modifier symbol like :CASE-INSENSITIVE-P + ((:flags) + ;; set/unset the flags corresponding to the symbols + ;; following :FLAGS + (mapc #'set-flag (rest parse-tree)) + ;; we're only interested in the side effect of + ;; setting/unsetting the flags and turn this syntactical + ;; construct into a VOID object which'll be optimized + ;; away when creating the matcher + (make-instance 'void)) + (otherwise + (signal-ppcre-syntax-error + "Unknown token ~A in parse-tree" + (first parse-tree))))) + ((or (characterp parse-tree) (stringp parse-tree)) + ;; turn characters or strings into STR objects and try to + ;; accumulate into STARTS-WITH + (let ((str (make-instance 'str + :str (string parse-tree) + :case-insensitive-p + (case-insensitive-mode-p flags)))) + (maybe-accumulate str) + str)) (t - ;; and now for the tokens which are symbols - (case parse-tree - ((:void) - (make-instance 'void)) - ((:word-boundary) - (make-instance 'word-boundary :negatedp nil)) - ((:non-word-boundary) - (make-instance 'word-boundary :negatedp t)) - ;; the special character classes - ((:digit-class - :non-digit-class - :word-char-class - :non-word-char-class - :whitespace-char-class - :non-whitespace-char-class) - ;; stop accumulating into STARTS-WITH - (setq accumulate-start-p nil) - (make-instance 'char-class - ;; use the constants defined in util.lisp - :hash (case parse-tree - ((:digit-class - :non-digit-class) - +digit-hash+) - ((:word-char-class - :non-word-char-class) + ;; and now for the tokens which are symbols + (case parse-tree + ((:void) + (make-instance 'void)) + ((:word-boundary) + (make-instance 'word-boundary :negatedp nil)) + ((:non-word-boundary) + (make-instance 'word-boundary :negatedp t)) + ;; the special character classes + ((:digit-class + :non-digit-class + :word-char-class + :non-word-char-class + :whitespace-char-class + :non-whitespace-char-class) + ;; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (make-instance 'char-class + ;; use the constants defined in util.lisp + :charset (case parse-tree + ((:digit-class + :non-digit-class) + +digit-set+) + ((:word-char-class + :non-word-char-class) nil) - ((:whitespace-char-class - :non-whitespace-char-class) - +whitespace-char-hash+)) - ;; this value doesn't really matter but - ;; NIL should result in slightly faster - ;; matchers - :case-insensitive-p nil - :invertedp (member parse-tree - '(:non-digit-class - :non-word-char-class - :non-whitespace-char-class) - :test #'eq) - :word-char-class-p (member parse-tree - '(:word-char-class - :non-word-char-class) - :test #'eq))) - ((:start-anchor ; Perl's "^" - :end-anchor ; Perl's "$" - :modeless-end-anchor-no-newline + ((:whitespace-char-class + :non-whitespace-char-class) + +whitespace-char-set+)) + ;; this value doesn't really matter but + ;; NIL should result in slightly faster + ;; matchers + :case-insensitive-p nil + :invertedp (member parse-tree + '(:non-digit-class + :non-word-char-class + :non-whitespace-char-class) + :test #'eq) + :word-char-class-p (member parse-tree + '(:word-char-class + :non-word-char-class) + :test #'eq))) + ((:start-anchor ; Perl's "^" + :end-anchor ; Perl's "$" + :modeless-end-anchor-no-newline ; Perl's "\z" - :modeless-start-anchor ; Perl's "\A" - :modeless-end-anchor) ; Perl's "\Z" - (make-instance 'anchor - :startp (member parse-tree - '(:start-anchor - :modeless-start-anchor) - :test #'eq) - ;; set this value according to the - ;; current settings of FLAGS (unless it's - ;; a modeless anchor) - :multi-line-p - (and (multi-line-mode-p flags) - (not (member parse-tree - '(:modeless-start-anchor - :modeless-end-anchor - :modeless-end-anchor-no-newline) - :test #'eq))) - :no-newline-p - (eq parse-tree - :modeless-end-anchor-no-newline))) - ((:everything) - ;; stop accumulating into STARTS-WITHS - (setq accumulate-start-p nil) - (make-instance 'everything - :single-line-p (single-line-mode-p flags))) - ;; special tokens corresponding to Perl's "ism" modifiers - ((:case-insensitive-p - :case-sensitive-p - :multi-line-mode-p - :not-multi-line-mode-p - :single-line-mode-p - :not-single-line-mode-p) - ;; we're only interested in the side effect of - ;; setting/unsetting the flags and turn these tokens - ;; into VOID objects which'll be optimized away when - ;; creating the matcher - (set-flag parse-tree) - (make-instance 'void)) - (otherwise - (let ((translation (and (symbolp parse-tree) - (parse-tree-synonym parse-tree)))) - (if translation - (convert-aux (copy-tree translation)) - (signal-ppcre-syntax-error "Unknown token ~A in parse-tree" - parse-tree)))))))) + :modeless-start-anchor ; Perl's "\A" + :modeless-end-anchor) ; Perl's "\Z" + (make-instance 'anchor + :startp (member parse-tree + '(:start-anchor + :modeless-start-anchor) + :test #'eq) + ;; set this value according to the + ;; current settings of FLAGS (unless it's + ;; a modeless anchor) + :multi-line-p + (and (multi-line-mode-p flags) + (not (member parse-tree + '(:modeless-start-anchor + :modeless-end-anchor + :modeless-end-anchor-no-newline) + :test #'eq))) + :no-newline-p + (eq parse-tree + :modeless-end-anchor-no-newline))) + ((:everything) + ;; stop accumulating into STARTS-WITHS + (setq accumulate-start-p nil) + (make-instance 'everything + :single-line-p (single-line-mode-p flags))) + ;; special tokens corresponding to Perl's "ism" modifiers + ((:case-insensitive-p + :case-sensitive-p + :multi-line-mode-p + :not-multi-line-mode-p + :single-line-mode-p + :not-single-line-mode-p) + ;; we're only interested in the side effect of + ;; setting/unsetting the flags and turn these tokens + ;; into VOID objects which'll be optimized away when + ;; creating the matcher + (set-flag parse-tree) + (make-instance 'void)) + (otherwise + (let ((translation (and (symbolp parse-tree) + (parse-tree-synonym parse-tree)))) + (if translation + (convert-aux (copy-tree translation)) + (signal-ppcre-syntax-error "Unknown token ~A in parse-tree" + parse-tree)))))))) (defun convert (parse-tree) (declare #.*standard-optimize-settings*) @@ -736,11 +779,14 @@ or an EVERYTHING object (if the regex starts with something like ;; and then calls CONVERT-AUX to do all the work (let* ((flags (list nil nil nil)) (reg-num 0) + reg-names + named-reg-seen (accumulate-start-p t) starts-with (max-back-ref 0) (converted-parse-tree (convert-aux parse-tree))) - (declare (special flags reg-num accumulate-start-p starts-with max-back-ref)) + (declare (special flags reg-num reg-names named-reg-seen + accumulate-start-p starts-with max-back-ref)) ;; make sure we don't reference registers which aren't there (when (> (the fixnum max-back-ref) (the fixnum reg-num)) @@ -750,4 +796,8 @@ or an EVERYTHING object (if the regex starts with something like (when (typep starts-with 'str) (setf (slot-value starts-with 'str) (coerce (slot-value starts-with 'str) 'simple-string))) - (values converted-parse-tree reg-num starts-with))) + (values converted-parse-tree reg-num starts-with + ;; we can't simply use *ALLOW-NAMED-REGISTERS* + ;; since parse-tree syntax ignores it + (when named-reg-seen + (nreverse reg-names))))) diff --git a/doc/index.html b/doc/index.html index 9f05d1e..eefdc09 100644 --- a/doc/index.html +++ b/doc/index.html @@ -3,21 +3,30 @@ - CL-PPCRE - portable Perl-compatible regular expressions for Common Lisp + CL-PPCRE - Portable Perl-compatible regular expressions for Common Lisp + -

CL-PPCRE - portable Perl-compatible regular expressions for Common Lisp

+

CL-PPCRE - Portable Perl-compatible regular expressions for Common Lisp


 

Abstract

@@ -29,24 +38,20 @@ which has the following features:
  • It is compatible with Perl. (Well - as far as you can be compatible with a language defined by its ever-changing -implementation. Currently, as of December 2002, CL-PPCRE is more -compatible with the regex semantics of Perl 5.8.0 than, say, -Perl 5.6.1 is...:) It even correctly parses and applies Jeffrey Friedl's -famous 6600-byte long RFC822 address pattern. +implementation. As of December 2002 CL-PPCRE was more +compatible with the regex semantics of Perl 5.8.0 than, say, +Perl 5.6.1 was...) It even correctly parses and +applies Jeffrey +Friedl's famous 6600-byte long RFC822 address pattern. -
  • It is fast. If compiled with CMUCL it outperforms Perl's highly optimized regex engine (written -in C) which to my knowledge is faster than most other regex engines -around. If compiled with CLISP it is still comparable -to CLISP's own regex implementation which is also written in -C. +
  • It is fast. Used with a Lisp compiler which compiles to +native code it is on par with Perl's highly +optimized regex engine (written in C) which to my knowledge is faster +than most other regex engines around.
  • It is portable, i.e. the code aims to be strictly ANSI-compliant. If -you encounter any deviations this is an error and should be +you encounter any deviations, this is an error and should be reported to the mailing list. CL-PPCRE has been successfully tested with the following Common Lisp implementations: @@ -68,7 +73,7 @@ successfully tested with the following Common Lisp implementations: -If you succeed in using CL-PPCRE on other platforms please let us know.
    @@ -117,7 +122,9 @@ license so you can basically do with it whatever you want. CL-PPCRE has been used successfully in various applications like BioLingua, BioBike, +clutu, +LoGS, CafeSpot, Eboy, or The Regex Coach. @@ -150,9 +157,10 @@ href="http://weitz.de/regex-coach/">The Regex Coach.
  • regex-replace-all
  • regex-apropos
  • regex-apropos-list -
  • *regex-char-code-limit* -
  • *use-bmh-matchers* +
  • *regex-char-code-limit* +
  • *use-bmh-matchers*
  • *allow-quoting* +
  • *allow-named-registers*
  • quote-meta-chars
  • ppcre-error
  • ppcre-invocation-error @@ -192,15 +200,14 @@ href="http://weitz.de/regex-coach/">The Regex Coach. CL-PPCRE together with this documentation can be downloaded from http://weitz.de/files/cl-ppcre.tar.gz. The -current version is 1.2.12. A CHANGELOG is available. +current version is 1.4.1.

    -If you're on Debian you should +If you're on Debian, you should probably use the cl-ppcre +href="http://packages.debian.org/cgi-bin/search_packages.pl?keywords=cl-ppcre&searchon=names&version=all&release=all">cl-ppcre Debian package which is available thanks to Peter van Eynde and Kevin Rosenberg. There's also a port -for Gentoo Linux thanks to Matthew Kennedy and a FreeBSD port thanks to Henrik Motakef. +for Gentoo Linux thanks to Matthew Kennedy and a FreeBSD port thanks to Henrik Motakef. Installation via asdf-install should as well be possible. @@ -214,7 +221,7 @@ directory start your Lisp image and evaluate the form equivalent one for asdf) which should compile and load the whole system.

    -If for some reason you don't want to use MK:DEFSYSTEM or asdf you +If for some reason you don't want to use MK:DEFSYSTEM or asdf, you can just LOAD the file load.lisp or you can also get away with something like this: @@ -239,35 +246,46 @@ cat {packages,specials,util,errors,lexer,parser,regex-class,convert,optimize,clo your platform.)

    Note that there is no public CVS repository for CL-PPCRE - the repository at common-lisp.net is out of date and not in sync with the (current) version distributed from weitz.de. +

    +Luís Oliveira maintains a darcs +repository of CL-PPCRE +at http://common-lisp.net/~loliveira/ediware/.
     

    Support and mailing lists

    For questions, bug reports, feature requests, improvements, or patches -please use the cl-ppcre-devel -mailing list. If you want to be notified about future releases -subscribe to the cl-ppcre-announce +please use +the cl-ppcre-devel +mailing list. If you want to be notified about future releases, +subscribe to +the cl-ppcre-announce mailing list. These mailing lists were made available thanks to the services of common-lisp.net. +Terrence Brannon has created a Google group for the list which is +at http://groups.google.com/group/cl-ppcre. +

    +If you want to send patches, please read this first.
     

    The CL-PPCRE dictionary

    CL-PPCRE exports the following symbols:


    [Method] -
    create-scanner (string string)&key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner +
    create-scanner (string string)&key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner, register-names


    Accepts a string which is a regular expression in Perl syntax and returns a closure which will scan strings for this -regular expression. The mode keyword arguments are equivalent to the +regular expression. The second value is only returned if *ALLOW-NAMED-REGISTERS* is true. It represents a list of strings mapping registers to their respective names - the first element stands for first register, the second element for second register, etc. You have to store this value if you want to map a register number to its name later as scanner doesn't capture any information about register names. If a register isn't named, it has NIL as its name. +

    +The mode keyword arguments are equivalent to the "imsx" modifiers in Perl. The destructive keyword will be ignored.

    The function accepts most of the regex syntax of Perl 5 as described in man +href="http://perldoc.perl.org/perlre.html">man perlre including extended features like non-greedy repetitions, positive and negative look-ahead and look-behind assertions, "standalone" subexpressions, and conditional @@ -308,6 +326,8 @@ codes), \c[ (control characters), \w, Since version 0.6.0 CL-PPCRE also supports Perl's \Q and \E - see *ALLOW-QUOTING* below. Make sure you also read the relevant section in "Bugs and problems."

    +Since version 1.3.0 CL-PPCRE offers support for AllegroCL's (?<name>"<regex>") named registers and \k<name> back-references syntax, have a look at *ALLOW-NAMED-REGISTERS* for details. +

    The keyword arguments are just for your convenience. You can always use embedded modifiers like "(?i-s)" instead.

    @@ -319,7 +339,7 @@ In this case function should be a scanner returned by anothe

  • [Method] -
    create-scanner (parse-tree t)&key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner +
    create-scanner (parse-tree t)&key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner, register-names


    This is similar to CREATE-SCANNER for regex strings above but @@ -405,7 +425,7 @@ stands for a register or a parse tree which is a look-ahead or look-behind assertion. See the entry for (?(<condition>)<yes-pattern>|<no-pattern>) in man +href="http://perldoc.perl.org/perlre.html#Extended-Patterns">man perlre for the semantics of this construct. If <parse-tree> is an alternation is must enclose exactly one or two parse trees where the second @@ -439,9 +459,19 @@ to the Perl regex string "(?:ab)??". register group. As usual, registers are counted from left to right beginning with 1. -
  • (:BACK-REFERENCE <number>) where -<number> is a positive integer is a back-reference to a -register group. +
  • (:NAMED-REGISTER <name> <parse-tree>) is a named capturing +register group. Acts as :REGISTER, but assigns <name> to a register too. This <name> can be later referred to via :BACK-REFERENCE. Names are case-sensitive and don't need to be unique. See *ALLOW-NAMED-REGISTERS* for details. + + +
  • (:BACK-REFERENCE <ref>) is a +back-reference to a register group. <ref> is +a positive integer or a string denoting a register name. If there are +several registers with the same name, the regex engine tries to +successfully match at least of them, starting with the most recently +seen register continuing to the least recently seen one, until a match +is found. See *ALLOW-NAMED-REGISTERS* +for more information.
  • (:FILTER <function> &optional <length>) where @@ -451,6 +481,11 @@ designator and <length> is a non-negative integer or NIL is a user-defined filter. +
  • (:REGEX <string>) where +<string> is an +embedded regular expression in Perl +syntax. +
  • (:CHAR-CLASS|:INVERTED-CHAR-CLASS {<item>}*) where <item> is either a character, a character range, or a symbol for a @@ -483,11 +518,11 @@ doesn't make sense if CREATE-SCANNER is applied to parse trees and will signal an error.

    If destructive is not NIL (the default is -NIL) the function is allowed to destructively modify +NIL), the function is allowed to destructively modify parse-tree while creating the scanner.

    If you want to find out how parse trees are related to Perl regex -strings you should play around with +strings, you should play around with CL-PPCRE::PARSE-STRING - a function which converts Perl regex strings to parse trees. Here are some examples: @@ -514,7 +549,7 @@ regex strings to parse trees. Here are some examples:


    [Accessor]
    parse-tree-synonym symbol => parse-tree -
    (setf (parse-tree-synonym symbol) new-parse-tree)
    +
    (setf (parse-tree-synonym symbol) new-parse-tree)


    Any symbol (unless it's a keyword with a special meaning in parse @@ -568,7 +603,7 @@ so you can write code like this:
     (define-parse-tree-synonym a-z
    -  (:char-class (:range #\a #\z) (:range #\a #\z)))
    +  (:char-class (:range #\a #\z) (:range #\A #\Z)))
     
     (define-parse-tree-synonym a-z*
       (:greedy-repetition 0 nil a-z))
    @@ -590,19 +625,21 @@ href="#scan">SCAN.
     
     
     
    -


    [Standard Generic Function] +


    [Generic Function]
    scan regex target-string &key start end => match-start, match-end, reg-starts, reg-ends


    -Searches the string target-string from -start (which defaults to 0) to +Searches the string target-string +from start (which defaults to 0) to end (which default to the length of target-string) and tries to match regex. On success returns four values - the start of the match, the end of the match, and two arrays denoting the beginnings and ends of register matches. On failure returns -NIL. target-string will be coerced to a -simple string if it isn't one already. +NIL. target-string will be coerced +to a simple string if it isn't one already. (There's another keyword +parameter real-start-pos. This one should +never be set from user code - it is only used internally.)

    SCAN acts as if the part of target-string between start @@ -691,7 +728,7 @@ and FN is a function designator (which is evaluated) denoting a function which is to be applied to the string before the result is bound to VAR. To make this even more convenient the form (FN VAR1 ...VARn) can be used as an abbreviation for -(FN VAR1) ... (FN VARn). +(FN VAR1) ... (FN VARn).

    If there is no match, the statement* forms are not executed. For each element of @@ -739,7 +776,7 @@ returns result-form if provided or NIL otherwise. An implicit block named NIL surrounds DO-SCANS; RETURN may be used to terminate the loop immediately. If regex matches -an empty string the scan is continued one position behind this match. +an empty string, the scan is continued one position behind this match.

    This is the most general macro to iterate over all matches in a target string. See the source code of result-form if provided or NIL otherwise. An implicit block named NIL surrounds DO-REGISTER-GROUPS; RETURN may be used to terminate the loop immediately. If regex matches -an empty string the scan is continued one position behind this +an empty string, the scan is continued one position behind this match. If sharedp is true, the substrings may share structure with target-string.

    Example: @@ -916,7 +953,7 @@ simply be left out, otherwise they will show up as elements returned - registers aren't counted. If limit is NIL (or 0 which is equivalent), trailing empty strings are removed from the result list. -If regex matches an empty string the scan is +If regex matches an empty string, the scan is continued one position behind this match. If sharedp is true, the substrings may share structure with target-string.

    @@ -970,12 +1007,14 @@ frob")


    [Function] -
    regex-replace regex target-string replacement &key start end preserve-case simple-calls => list +
    regex-replace regex target-string replacement &key start end preserve-case simple-calls element-type => string, matchp


    Try to match target-string between start and end against regex and replace the first match with -replacement. +replacement. Two values are returned; the modified +string, and T if regex matched or +NIL otherwise.

    replacement can be a string which may contain the special substrings "\&" for the whole @@ -1024,35 +1063,61 @@ will always be a fresh string, even if regex doesn't match.

    +element-type specifies +the array +element type of the string which is returned, the default +is LW:SIMPLE-CHAR +for LispWorks +and CHARACTER +for other Lisps. +

    Examples:

     * (cl-ppcre:regex-replace "fo+" "foo bar" "frob")
     "frob bar"
    +T
     
     * (cl-ppcre:regex-replace "fo+" "FOO bar" "frob")
     "FOO bar"
    +NIL
     
     * (cl-ppcre:regex-replace "(?i)fo+" "FOO bar" "frob")
     "frob bar"
    +T
     
     * (cl-ppcre:regex-replace "(?i)fo+" "FOO bar" "frob" :preserve-case t)
     "FROB bar"
    +T
     
     * (cl-ppcre:regex-replace "(?i)fo+" "Foo bar" "frob" :preserve-case t)
     "Frob bar"
    +T
     
     * (cl-ppcre:regex-replace "bar" "foo bar baz" "[frob (was '\\&' between '\\`' and '\\'')]")
     "foo [frob (was 'bar' between 'foo ' and ' baz')] baz"
    +T
     
     * (cl-ppcre:regex-replace "bar" "foo bar baz"
                               '("[frob (was '" :match "' between '" :before-match "' and '" :after-match "')]"))
     "foo [frob (was 'bar' between 'foo ' and ' baz')] baz"
    +T
    +
    +* (cl-ppcre:regex-replace "(be)(nev)(o)(lent)"
    +                          "benevolent: adj. generous, kind"
    +                          #'(lambda (match &rest registers)
    +                              (format nil "~A [~{~A~^.~}]" match registers))
    +                          :simple-calls t)
    +"benevolent [be.nev.o.lent]: adj. generous, kind"
    +T
     


    [Function] -
    regex-replace-all regex target-string replacement &key start end preserve-case simple-calls => list +
    regex-replace-all regex target-string replacement &key start end preserve-case simple-calls element-type => string, matchp


    Like REGEX-REPLACE but replaces all matches. @@ -1062,9 +1127,11 @@ Examples:
     * (cl-ppcre:regex-replace-all "(?i)fo+" "foo Fooo FOOOO bar" "frob" :preserve-case t)
     "frob Frob FROB bar"
    +T
     
     * (cl-ppcre:regex-replace-all "(?i)f(o+)" "foo Fooo FOOOO bar" "fr\\1b" :preserve-case t)
     "froob Frooob FROOOOB bar"
    +T
     
     * (let ((qp-regex (cl-ppcre:create-scanner "[\\x80-\\xff]")))
         (defun encode-quoted-printable (string)
    @@ -1079,6 +1146,7 @@ ENCODE-QUOTED-PRINTABLE
     
     * (encode-quoted-printable "Fête Sørensen naïve Hühner Straße")
     "F=EAte S=F8rensen na=EFve H=FChner Stra=DFe"
    +T
     
     * (let ((url-regex (cl-ppcre:create-scanner "[^a-zA-Z0-9_\\-.]")))
         (defun url-encode (string)
    @@ -1093,6 +1161,7 @@ URL-ENCODE
     
     * (url-encode "Fête Sørensen naïve Hühner Straße")
     "F%EAte%20S%F8rensen%20na%EFve%20H%FChner%20Stra%DFe"
    +T
     
     * (defun how-many (target-string start end match-start match-end reg-starts reg-ends)
         (declare (ignore start end match-start match-end))
    @@ -1104,6 +1173,7 @@ HOW-MANY
                                   "foo{...}bar{.....}{..}baz{....}frob"
                                   (list "[" 'how-many " dots]"))
     "foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"
    +T
     
     * (let ((qp-regex (cl-ppcre:create-scanner "[\\x80-\\xff]")))
         (defun encode-quoted-printable (string)
    @@ -1120,6 +1190,7 @@ ENCODE-QUOTED-PRINTABLE
     
     * (encode-quoted-printable "Fête Sørensen naïve Hühner Straße")
     "F=EAte S=F8rensen na=EFve H=FChner Stra=DFe"
    +T
     
     * (defun how-many (match first-register)
         (declare (ignore match))
    @@ -1132,6 +1203,7 @@ HOW-MANY
                                   :simple-calls t)
     
     "foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"
    +T
     


    [Function] @@ -1217,18 +1289,18 @@ Example (continued from above):


    [Special variable] -
    *regex-char-code-limit* +
    *regex-char-code-limit*


    This variable controls whether scanners take into -account all characters of your CL implementation or only those the CHAR-CODE -of which is not larger than its value. It is only relevant if the -regular expression contains certain character classes. The default is +of which is not larger than its value. The default is CHAR-CODE-LIMIT, and you might see significant speed and space improvements during scanner creation if, say, your target strings only contain ISO-8859-1 +href="http://czyborra.com/charsets/iso8859.html">ISO-8859-1 characters and you're using an implementation like AllegroCL, CLISP, LispWorks, or SBCL where CHAR-CODE-LIMIT has a value much higher than 256. The test suite will @@ -1259,8 +1331,8 @@ Allocation = 3336 bytes standard / 8338 bytes fixlen #<closure 206569DA>

    -Note: Due to the nature of LOAD-TIME-VALUE and the compiler macro for SCAN some +Note: Due to the nature of LOAD-TIME-VALUE and the compiler macro for SCAN and other functions, some scanners might be created in a null lexical environment at load time or at compile time so be careful @@ -1269,11 +1341,11 @@ time. The default value should always yield correct results unless you play dirty tricks with implementation-dependent behaviour, though.


    [Special variable] -
    *use-bmh-matchers* +
    *use-bmh-matchers*


    Usually, the scanners created by CREATE-SCANNER (or -implicitely by other functions and macros) will use fast Boyer-Moore-Horspool matchers to check for constant strings at the start or end of the regular expression. If *USE-BMH-MATCHERS* is @@ -1286,8 +1358,8 @@ href="#test">test suite will automatically set *USE-BMH-MATCHERS* to NIL while you're running the default test.

    -Note: Due to the nature of LOAD-TIME-VALUE and the compiler macro for SCAN some +Note: Due to the nature of LOAD-TIME-VALUE and the compiler macro for SCAN and other functions, some scanners might be created in a null lexical environment at load time or at compile time so be careful @@ -1298,7 +1370,7 @@ time.


    *allow-quoting*

    -If this value is true (the default is NIL) +If this value is true (the default is NIL), CL-PPCRE will support \Q and \E in regex strings to quote (disable) metacharacters. Note that this entails a slight performance penalty when creating scanners because (a copy of) the regex @@ -1312,14 +1384,15 @@ about the converted string and not about the original regex string. NIL * (let ((cl-ppcre:*allow-quoting* t)) - (cl-ppcre:scan "^\\Qa+\\E$" "a+")) + ;;we use CREATE-SCANNER because of Lisps like SBCL that don't have an interpreter + (cl-ppcre:scan (cl-ppcre:create-scanner "^\\Qa+\\E$") "a+")) 0 2 #() #() * (let ((cl-ppcre:*allow-quoting* t)) - (cl-ppcre:scan "\\Qa()\\E(?#comment\\Q)a**b" "()ab")) + (cl-ppcre:scan (cl-ppcre:create-scanner "\\Qa()\\E(?#comment\\Q)a**b") "()ab")) Quantifier '*' not allowed at position 19 in string "a\\(\\)(?#commentQ)a**b" @@ -1341,9 +1414,135 @@ understand (and Lisp-ier) if you write it like this: Make sure you also read the relevant section in "Bugs and problems." +

    +Note: Due to the nature of LOAD-TIME-VALUE and the compiler macro for SCAN and other functions, some +scanners might be created in a null +lexical environment at load time or at compile time so be careful +to which value *ALLOW-QUOTING* is bound at that +time.

  • +


    [Special variable] +
    *allow-named-registers* + +


    +If this value is true (the default is NIL), +CL-PPCRE will support (?<name>"<regex>") and \k<name> in regex +strings to provide named registers and back-references as in AllegroCL. name is has to start with a letter and can contain only alphanumeric characters or minus sign. Names of registers are matched case-sensitively. +The parse tree syntax is not affected by the *ALLOW-NAMED-REGISTERS* switch, :NAMED-REGISTER and :BACK-REFERENCE forms are always resolved as expected. There are also no restrictions on register names in this syntax except that they have to be strings. + +

    +Examples: + +

    +;; Perl compatible mode (*ALLOW-NAMED-REGISTERS* is NIL)
    +* (cl-ppcre:create-scanner "(?<reg>.*)")
    +Character 'r' may not follow '(?<' at position 3 in string "(?<reg>)"
    +
    +;; just unescapes "\\k"
    +* (cl-ppcre::parse-string "\\k<reg>")
    +"k<reg>"
    +
    + +
    +* (setq cl-ppcre:*allow-named-registers* t)
    +T
    +
    +* (cl-ppcre:create-scanner "((?<small>[a-z]*)(?<big>[A-Z]*))")
    +#<CLOSURE (LAMBDA (STRING CL-PPCRE::START CL-PPCRE::END)) {AD75BFD}>
    +(NIL "small" "big")
    +
    +;; the scanner doesn't capture any information about named groups -
    +;; you have to store the second value returned from CREATE-SCANNER yourself
    +* (cl-ppcre:scan * "aaaBBB")
    +0
    +6
    +#(0 0 3)
    +#(6 3 6)
    +
    + +
    +;; parse tree syntax
    +* (cl-ppcre::parse-string "((?<small>[a-z]*)(?<big>[A-Z]*))")
    +(:REGISTER
    + (:SEQUENCE
    +  (:NAMED-REGISTER "small"
    +   (:GREEDY-REPETITION 0 NIL (:CHAR-CLASS (:RANGE #\a #\z))))
    +  (:NAMED-REGISTER "big"
    +   (:GREEDY-REPETITION 0 NIL (:CHAR-CLASS (:RANGE #\A #\Z))))))
    +
    +* (cl-ppcre:create-scanner *)
    +#<CLOSURE (LAMBDA (STRING CL-PPCRE::START CL-PPCRE::END)) {B158E3D}>
    +(NIL "small" "big")
    +
    + +
    +;; multiple-choice back-reference
    +* (cl-ppcre:scan "^(?<reg>[ab])(?<reg>[12])\\k<reg>\\k<reg>$" "a1aa")
    +0
    +4
    +#(0 1)
    +#(1 2)
    +
    +* (cl-ppcre:scan "^(?<reg>[ab])(?<reg>[12])\\k<reg>\\k<reg>$" "a22a")
    +0
    +4
    +#(0 1)
    +#(1 2)
    +
    + + +
    +;; demonstrating most-recently-seen-register-first property of back-reference;
    +;; "greedy" regex (analogous to "aa?")
    +* (cl-ppcre:scan "^(?<reg>)(?<reg>a)(\\k<reg>)" "a")
    +0
    +1
    +#(0 0 1)
    +#(0 1 1)
    +
    +* (cl-ppcre:scan "^(?<reg>)(?<reg>a)(\\k<reg>)" "aa")
    +0
    +2
    +#(0 0 1)
    +#(0 1 2)
    +
    + +
    +;; switched groups
    +;; "lazy" regex (analogous to "aa??")
    +* (cl-ppcre:scan "^(?<reg>a)(?<reg>)(\\k<reg>)" "a")
    +0
    +1
    +#(0 1 1)
    +#(1 1 1)
    +
    +;; scanner ignores the second "a"
    +* (cl-ppcre:scan "^(?<reg>a)(?<reg>)(\\k<reg>)" "aa")
    +0
    +1
    +#(0 1 1)
    +#(1 1 1)
    +
    +;; "aa" will be matched only when forced by adding "$" at the end
    +* (cl-ppcre:scan "^(?<reg>a)(?<reg>)(\\k<reg>)$" "aa")
    +0
    +2
    +#(0 1 1)
    +#(1 1 2)
    +
    +Note: Due to the nature of LOAD-TIME-VALUE and the compiler macro for SCAN and other functions, some +scanners might be created in a null +lexical environment at load time or at compile time so be careful +to which value *ALLOW-NAMED-REGISTERS* is bound at that +time.
    + +


    [Function]
    quote-meta-chars string => string' @@ -1392,8 +1591,8 @@ string or to convert a parse tree into its internal representation. This is a direct subtype of PPCRE-ERROR with two additional slots. These denote the regex string which HTML-PPCRE was parsing and -the position within the string where the error occured. If the error -happens while CL-PPCRE is converting a parse tree both of these slots +the position within the string where the error occurred. If the error +happens while CL-PPCRE is converting a parse tree, both of these slots contain NIL. (See the next two entries on how to access these slots.)

    @@ -1420,11 +1619,11 @@ The last message we received was "Quantifier '*' not allowed".


    [Function] -
    ppcre-syntax-error-string condition => string +
    ppcre-syntax-error-string condition => string


    If condition is a condition of type PPCRE-SYNTAX-ERROR this +href="#ppcre-syntax-error">PPCRE-SYNTAX-ERROR, this function will return the string the parser was parsing when the error was encountered (or NIL if the error happened while trying to convert a parse tree). This might be particularly useful when CREATE-SCANNER function.


    [Function] -
    ppcre-syntax-error-pos condition => number +
    ppcre-syntax-error-pos condition => number


    If condition is a condition of type PPCRE-SYNTAX-ERROR this +href="#ppcre-syntax-error">PPCRE-SYNTAX-ERROR, this function will return the position within the string where the error -occured (or NIL if the error happened while trying to +occurred (or NIL if the error happened while trying to convert a parse tree).
    @@ -1454,13 +1653,10 @@ regex building blocks. Filters can only be used within parse trees, not within Perl regex strings.

    -Note that filters are currently considered an experimental feature and -their API might change in the future. -

    A filter is defined by its filter function which must be a function of one argument. During the parsing process this function might be called once or several times or it might not be called at -all. If it's called its argument is an integer pos +all. If it's called, its argument is an integer pos which is the current position within the target string. The filter can either return NIL (which means that the subexpression represented by this filter didn't match) or an integer not smaller @@ -1470,7 +1666,7 @@ wants to consume N characters should return (+ POS N).

    If you supply the optional value length and it is -not NIL then this is a promise to the regex engine that +not NIL, then this is a promise to the regex engine that your filter will always consume exactly length characters. The regex engine might use this information for optimization purposes but it is otherwise irrelevant @@ -1499,7 +1695,7 @@ value of this variable is NIL.

  • CL-PPCRE::*REG-STARTS* and CL-PPCRE::*REG-ENDS*: Two simple vectors which denote the start and end indices of registers within the regular expression. The -first register is indexed by 0. If a register hasn't matched yet +first register is indexed by 0. If a register hasn't matched yet, then its corresponding entry in CL-PPCRE::*REG-STARTS* is NIL. @@ -1568,7 +1764,7 @@ NIL * (defun my-weird-filter (pos) "Only match at this point if either pos is odd and the character - we're looking at is lowerrcase or if pos is even and the next two + we're looking at is lowercase or if pos is even and the next two characters we're looking at are uppercase. Consume these characters if there's a match." (format t "Trying at position ~A~%" pos) @@ -1653,10 +1849,10 @@ NIL * (cl-ppcre-test:test) ;; .... -;; (a list of incompatibilities with Perl) +;; (a list of incompatibilities with Perl) -(If you're not using MK:DEFSYSTEM or asdf it suffices to build +(If you're not using MK:DEFSYSTEM or asdf, it suffices to build CL-PPCRE and then compile and load the file ppcre-tests.lisp.)

    @@ -1746,7 +1942,7 @@ matching non-ASCII characters. The CL-PPCRE test suite can also be used for benchmarking purposes: If you call perltest.pl with a -command line argument it will be interpreted as the minimum number of seconds +command line argument, it will be interpreted as the minimum number of seconds each test should run. Perl will time its tests accordingly and create output which, when fed to CL-PPCRE-TEST:TEST, will result in a benchmark. Here's an example: @@ -1789,7 +1985,7 @@ NIL We gave two test cases to perltest.pl and asked it to repeat those tests often enough so that it takes at least 0.5 seconds to run each of them. In both cases, CMUCL was about twice as fast as Perl.

    -Here are some more benchmarks (done with Perl 5.6.1 and CMUCL 18d+): +Here are some more benchmarks (done with Perl 5.6.1 and CMUCL 18d+ in 2002):

    @@ -1898,8 +2094,8 @@ to SCAN, SCAN- REGEX-REPLACE, or REGEX-REPLACE-ALL is a constant form. (But see the notes for *REGEX-CHAR-CODE-LIMIT* and -*USE-BMH-MATCHERS*.) +href="#*regex-char-code-limit*">*REGEX-CHAR-CODE-LIMIT* and +*USE-BMH-MATCHERS*.)

    Here's an example of its effect @@ -2012,9 +2208,9 @@ Another thing to consider is that, for performance reasons, CL-PPCRE assumes that most of the target strings you're trying to match are simple strings and coerces non-simple strings to simple strings before -scanning them. If you plan on working with non-simple strings mostly +scanning them. If you plan on working with non-simple strings mostly, you might consider modifying the CL-PPCRE source code. This is easy: -Change all occurences of SCHAR to CHAR and +Change all occurrences of SCHAR to CHAR and redefine the macro in util.lisp where the coercion takes place - that's all. @@ -2157,7 +2353,7 @@ print 1 if '\E*\E*' =~ /(?:\Q$a\E){2}/; -If you try to do something similar in CL-PPCRE you get an error: +If you try to do something similar in CL-PPCRE, you get an error:

     * (let ((cl-ppcre:*allow-quoting* t)
    @@ -2179,7 +2375,7 @@ the scary details. It can happen in CL-PPCRE, though.
     Bummer!
     

    What gives? "\\Q...\\E" in CL-PPCRE should only -be used in literal strings. If you want to quote arbitrary strings +be used in literal strings. If you want to quote arbitrary strings, try CL-INTERPOL or use QUOTE-META-CHARS:

    @@ -2270,7 +2466,6 @@ differences (most of which probably don't matter very often):
     href="#parse-tree-synonym">parse tree synonyms and filters.
     
  • The AllegroCL engine will choke on some regular expressions involving curly braces that are accepted by Perl and CL-PPCRE's native engine.
  • The AllegroCL engine's case-folding mode switch (which is used instead of CL-PPCRE's :CASE-INSENSITIVE keyword parameter) is currently only effective for ASCII characters. -
  • CL-PPCRE's engine doesn't understand the named register groups provided by AllegroCL.
  • The AllegroCL engine doesn't support quoting of metacharacters.
  • In AllegroCL compatibility mode compiled regular expressions (as returned by CREATE-SCANNER) aren't functions but structures. @@ -2294,23 +2489,26 @@ href="http://www.cons.org/cmucl/support.html">mailing list as well as the output of Perl's use re "debug" pragma have been very helpful in optimizing the scanners created by CL-PPCRE. -

    -The asdf system definitions were kindly provided by Marco +

    The asdf system definitions were kindly provided by Marco Baringer. Hannu Koivisto provided patches to make the .system files more usable. Thanks to Kevin Rosenberg and Douglas Crosher for pointing out how to be friendly to case-sensitive ACL images. Thanks to Karsten Poeck and JP Massar for their help in making CL-PPCRE work with Corman Lisp. JP Massar and Kent M. Pitman -also helped to improve/fix the test suite and the compiler macro. +also helped to improve/fix the test suite and the compiler macro. Nikodemus Siivola provided the +fast charset implementation in charset.lisp. See the ChangeLog for several +other people who helped with bug reports or patches.

    -Thanks to the guys at "Café Olé" in Hamburg +Thanks to the guys at "Café Olé" in Hamburg where I wrote most of the code and thanks to my wife for lending me her PowerBook to test CL-PPCRE with MCL and OpenMCL.

    -$Header: /usr/local/cvsrep/cl-ppcre/doc/index.html,v 1.131 2005/11/01 09:51:02 edi Exp $ +$Header: /usr/local/cvsrep/cl-ppcre/doc/index.html,v 1.171 2008/07/03 10:06:17 edi Exp $

    BACK TO MY HOMEPAGE - \ No newline at end of file + diff --git a/errors.lisp b/errors.lisp index 398a5fa..839d284 100644 --- a/errors.lisp +++ b/errors.lisp @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.14 2005/04/01 21:29:09 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.18 2008/06/25 14:04:27 edi Exp $ -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -61,7 +61,7 @@ encountered \(or NIL if the error happened while trying to convert a parse tree).") (setf (documentation 'ppcre-syntax-error-pos 'function) - "Returns the position within the string where the error occured + "Returns the position within the string where the error occurred \(or NIL if the error happened while trying to convert a parse tree") (define-condition ppcre-invocation-error (ppcre-error) diff --git a/lexer.lisp b/lexer.lisp index 30268b5..1462ad9 100644 --- a/lexer.lisp +++ b/lexer.lisp @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.28 2008/06/25 14:04:27 edi Exp $ ;;; The lexer's responsibility is to convert the regex string into a ;;; sequence of tokens which are in turn consumed by the parser. @@ -9,7 +9,7 @@ ;;; has opened so far. (The latter is necessary for interpreting ;;; strings like "\\10" correctly.) -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -462,203 +462,255 @@ resets the lexer to its old position." (otherwise (fail lexer))))) +(defun parse-register-name-aux (lexer) + "Reads and returns the name in a named register group. It is +assumed that the starting #\< character has already been read. The +closing #\> will also be consumed." + ;; we have to look for an ending > character now + (let ((end-name (position #\> + (lexer-str lexer) + :start (lexer-pos lexer) + :test #'char=))) + (unless end-name + ;; there has to be > somewhere, syntax error otherwise + (signal-ppcre-syntax-error* + (1- (lexer-pos lexer)) + "Opening #\< in named group has no closing #\>")) + (let ((name (subseq (lexer-str lexer) + (lexer-pos lexer) + end-name))) + (unless (every #'(lambda (char) + (or (alphanumericp char) + (char= #\- char))) + name) + ;; register name can contain only alphanumeric characters or #\- + (signal-ppcre-syntax-error* + (lexer-pos lexer) + "Invalid character in named register group")) + ;; advance lexer beyond "" part + (setf (lexer-pos lexer) (1+ end-name)) + name))) + (defun get-token (lexer) (declare #.*standard-optimize-settings*) - "Returns and consumes the next token from the regex string (or NIL)." + "Returns and consumes the next token from the regex string \(or NIL)." ;; remember starting position for UNGET-TOKEN function (push (lexer-pos lexer) (lexer-last-pos lexer)) (let ((next-char (next-char lexer))) (cond (next-char - (case next-char - ;; the easy cases first - the following six characters - ;; always have a special meaning and get translated - ;; into tokens immediately - ((#\)) - :close-paren) - ((#\|) - :vertical-bar) - ((#\?) - :question-mark) - ((#\.) - :everything) - ((#\^) - :start-anchor) - ((#\$) - :end-anchor) - ((#\+ #\*) - ;; quantifiers will always be consumend by - ;; GET-QUANTIFIER, they must not appear here - (signal-ppcre-syntax-error* - (1- (lexer-pos lexer)) - "Quantifier '~A' not allowed" - next-char)) - ((#\{) - ;; left brace isn't a special character in it's own - ;; right but we must check if what follows might - ;; look like a quantifier - (let ((this-pos (lexer-pos lexer)) - (this-last-pos (lexer-last-pos lexer))) - (unget-token lexer) - (when (get-quantifier lexer) - (signal-ppcre-syntax-error* - (car this-last-pos) - "Quantifier '~A' not allowed" - (subseq (lexer-str lexer) - (car this-last-pos) - (lexer-pos lexer)))) - (setf (lexer-pos lexer) this-pos - (lexer-last-pos lexer) this-last-pos) - next-char)) - ((#\[) - ;; left bracket always starts a character class - (cons (cond ((looking-at-p lexer #\^) - (incf (lexer-pos lexer)) - :inverted-char-class) - (t - :char-class)) - (collect-char-class lexer))) - ((#\\) - ;; backslash might mean different things so we have - ;; to peek one char ahead: - (let ((next-char (next-char-non-extended lexer))) - (case next-char - ((#\A) - :modeless-start-anchor) - ((#\Z) - :modeless-end-anchor) - ((#\z) - :modeless-end-anchor-no-newline) - ((#\b) - :word-boundary) - ((#\B) - :non-word-boundary) - ((#\d #\D #\w #\W #\s #\S) - ;; these will be treated like character classes - (map-char-to-special-char-class next-char)) - ((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) - ;; uh, a digit... - (let* ((old-pos (decf (lexer-pos lexer))) - ;; ...so let's get the whole number first - (backref-number (get-number lexer))) - (declare (type fixnum backref-number)) - (cond ((and (> backref-number (lexer-reg lexer)) - (<= 10 backref-number)) - ;; \10 and higher are treated as octal - ;; character codes if we haven't - ;; opened that much register groups - ;; yet - (setf (lexer-pos lexer) old-pos) - ;; re-read the number from the old - ;; position and convert it to its - ;; corresponding character - (make-char-from-code (get-number lexer :radix 8 :max-length 3) - old-pos)) - (t - ;; otherwise this must refer to a - ;; backreference - (list :back-reference backref-number))))) - ((#\0) - ;; this always means an octal character code - ;; (at most three digits) - (let ((old-pos (decf (lexer-pos lexer)))) - (make-char-from-code (get-number lexer :radix 8 :max-length 3) - old-pos))) - (otherwise - ;; in all other cases just unescape the - ;; character - (decf (lexer-pos lexer)) - (unescape-char lexer))))) - ((#\() - ;; an open parenthesis might mean different things - ;; depending on what follows... - (cond ((looking-at-p lexer #\?) - ;; this is the case '(?' (and probably more behind) - (incf (lexer-pos lexer)) - ;; we have to check for modifiers first - ;; because a colon might follow - (let* ((flags (maybe-parse-flags lexer)) - (next-char (next-char-non-extended lexer))) - ;; modifiers are only allowed if a colon - ;; or a closing parenthesis are following - (when (and flags - (not (find next-char ":)" :test #'char=))) - (signal-ppcre-syntax-error* - (car (lexer-last-pos lexer)) - "Sequence '~A' not recognized" - (subseq (lexer-str lexer) - (car (lexer-last-pos lexer)) - (lexer-pos lexer)))) - (case next-char - ((nil) - ;; syntax error - (signal-ppcre-syntax-error - "End of string following '(?'")) - ((#\)) - ;; an empty group except for the flags - ;; (if there are any) - (or (and flags - (cons :flags flags)) - :void)) - ((#\() - ;; branch - :open-paren-paren) - ((#\>) - ;; standalone - :open-paren-greater) - ((#\=) - ;; positive look-ahead - :open-paren-equal) - ((#\!) - ;; negative look-ahead - :open-paren-exclamation) - ((#\:) - ;; non-capturing group - return flags as - ;; second value - (values :open-paren-colon flags)) - ((#\<) - ;; might be a look-behind assertion, so - ;; check next character - (let ((next-char (next-char-non-extended lexer))) - (case next-char - ((#\=) - ;; positive look-behind - :open-paren-less-equal) - ((#\!) - ;; negative look-behind - :open-paren-less-exclamation) - ((#\)) - ;; Perl allows "(?<)" and treats - ;; it like a null string - :void) - ((nil) - ;; syntax error - (signal-ppcre-syntax-error - "End of string following '(?<'")) - (t - ;; also syntax error + (case next-char + ;; the easy cases first - the following six characters + ;; always have a special meaning and get translated + ;; into tokens immediately + ((#\)) + :close-paren) + ((#\|) + :vertical-bar) + ((#\?) + :question-mark) + ((#\.) + :everything) + ((#\^) + :start-anchor) + ((#\$) + :end-anchor) + ((#\+ #\*) + ;; quantifiers will always be consumend by + ;; GET-QUANTIFIER, they must not appear here + (signal-ppcre-syntax-error* + (1- (lexer-pos lexer)) + "Quantifier '~A' not allowed" + next-char)) + ((#\{) + ;; left brace isn't a special character in it's own + ;; right but we must check if what follows might + ;; look like a quantifier + (let ((this-pos (lexer-pos lexer)) + (this-last-pos (lexer-last-pos lexer))) + (unget-token lexer) + (when (get-quantifier lexer) + (signal-ppcre-syntax-error* + (car this-last-pos) + "Quantifier '~A' not allowed" + (subseq (lexer-str lexer) + (car this-last-pos) + (lexer-pos lexer)))) + (setf (lexer-pos lexer) this-pos + (lexer-last-pos lexer) this-last-pos) + next-char)) + ((#\[) + ;; left bracket always starts a character class + (cons (cond ((looking-at-p lexer #\^) + (incf (lexer-pos lexer)) + :inverted-char-class) + (t + :char-class)) + (collect-char-class lexer))) + ((#\\) + ;; backslash might mean different things so we have + ;; to peek one char ahead: + (let ((next-char (next-char-non-extended lexer))) + (case next-char + ((#\A) + :modeless-start-anchor) + ((#\Z) + :modeless-end-anchor) + ((#\z) + :modeless-end-anchor-no-newline) + ((#\b) + :word-boundary) + ((#\B) + :non-word-boundary) + ((#\k) + (cond ((and *allow-named-registers* + (looking-at-p lexer #\<)) + ;; back-referencing a named register + (incf (lexer-pos lexer)) + (list :back-reference + (nreverse (parse-register-name-aux lexer)))) + (t + ;; false alarm, just unescape \k + #\k))) + ((#\d #\D #\w #\W #\s #\S) + ;; these will be treated like character classes + (map-char-to-special-char-class next-char)) + ((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) + ;; uh, a digit... + (let* ((old-pos (decf (lexer-pos lexer))) + ;; ...so let's get the whole number first + (backref-number (get-number lexer))) + (declare (type fixnum backref-number)) + (cond ((and (> backref-number (lexer-reg lexer)) + (<= 10 backref-number)) + ;; \10 and higher are treated as octal + ;; character codes if we haven't + ;; opened that much register groups + ;; yet + (setf (lexer-pos lexer) old-pos) + ;; re-read the number from the old + ;; position and convert it to its + ;; corresponding character + (make-char-from-code (get-number lexer :radix 8 :max-length 3) + old-pos)) + (t + ;; otherwise this must refer to a + ;; backreference + (list :back-reference backref-number))))) + ((#\0) + ;; this always means an octal character code + ;; (at most three digits) + (let ((old-pos (decf (lexer-pos lexer)))) + (make-char-from-code (get-number lexer :radix 8 :max-length 3) + old-pos))) + (otherwise + ;; in all other cases just unescape the + ;; character + (decf (lexer-pos lexer)) + (unescape-char lexer))))) + ((#\() + ;; an open parenthesis might mean different things + ;; depending on what follows... + (cond ((looking-at-p lexer #\?) + ;; this is the case '(?' (and probably more behind) + (incf (lexer-pos lexer)) + ;; we have to check for modifiers first + ;; because a colon might follow + (let* ((flags (maybe-parse-flags lexer)) + (next-char (next-char-non-extended lexer))) + ;; modifiers are only allowed if a colon + ;; or a closing parenthesis are following + (when (and flags + (not (find next-char ":)" :test #'char=))) + (signal-ppcre-syntax-error* + (car (lexer-last-pos lexer)) + "Sequence '~A' not recognized" + (subseq (lexer-str lexer) + (car (lexer-last-pos lexer)) + (lexer-pos lexer)))) + (case next-char + ((nil) + ;; syntax error + (signal-ppcre-syntax-error + "End of string following '(?'")) + ((#\)) + ;; an empty group except for the flags + ;; (if there are any) + (or (and flags + (cons :flags flags)) + :void)) + ((#\() + ;; branch + :open-paren-paren) + ((#\>) + ;; standalone + :open-paren-greater) + ((#\=) + ;; positive look-ahead + :open-paren-equal) + ((#\!) + ;; negative look-ahead + :open-paren-exclamation) + ((#\:) + ;; non-capturing group - return flags as + ;; second value + (values :open-paren-colon flags)) + ((#\<) + ;; might be a look-behind assertion or a named group, so + ;; check next character + (let ((next-char (next-char-non-extended lexer))) + (if (alpha-char-p next-char) + (progn + ;; we have encountered a named group + ;; are we supporting register naming? + (unless *allow-named-registers* (signal-ppcre-syntax-error* (1- (lexer-pos lexer)) "Character '~A' may not follow '(?<'" - next-char ))))) - (otherwise - (signal-ppcre-syntax-error* - (1- (lexer-pos lexer)) - "Character '~A' may not follow '(?'" - next-char))))) - (t - ;; if next-char was not #\? (this is within - ;; the first COND), we've just seen an opening - ;; parenthesis and leave it like that - :open-paren))) - (otherwise - ;; all other characters are their own tokens - next-char))) + next-char)) + ;; put the letter back + (decf (lexer-pos lexer)) + ;; named group + :open-paren-less-letter) + (case next-char + ((#\=) + ;; positive look-behind + :open-paren-less-equal) + ((#\!) + ;; negative look-behind + :open-paren-less-exclamation) + ((#\)) + ;; Perl allows "(?<)" and treats + ;; it like a null string + :void) + ((nil) + ;; syntax error + (signal-ppcre-syntax-error + "End of string following '(?<'")) + (t + ;; also syntax error + (signal-ppcre-syntax-error* + (1- (lexer-pos lexer)) + "Character '~A' may not follow '(?<'" + next-char )))))) + (otherwise + (signal-ppcre-syntax-error* + (1- (lexer-pos lexer)) + "Character '~A' may not follow '(?'" + next-char))))) + (t + ;; if next-char was not #\? (this is within + ;; the first COND), we've just seen an opening + ;; parenthesis and leave it like that + :open-paren))) + (otherwise + ;; all other characters are their own tokens + next-char))) ;; we didn't get a character (this if the "else" branch from ;; the first IF), so we don't return a token but NIL (t - (pop (lexer-last-pos lexer)) - nil)))) + (pop (lexer-last-pos lexer)) + nil)))) (declaim (inline unget-token)) (defun unget-token (lexer) diff --git a/lispworks-defsystem.lisp b/lispworks-defsystem.lisp index 0d84f56..58ca0c8 100644 --- a/lispworks-defsystem.lisp +++ b/lispworks-defsystem.lisp @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/lispworks-defsystem.lisp,v 1.1 2005/04/30 20:00:50 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/lispworks-defsystem.lisp,v 1.4 2008/06/25 14:04:27 edi Exp $ ;;; This system definition for LispWorks was kindly provided by Wade Humeniuk -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions diff --git a/load.lisp b/load.lisp old mode 100755 new mode 100644 index cfa7d0a..09289fb --- a/load.lisp +++ b/load.lisp @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/load.lisp,v 1.13 2005/04/01 21:29:09 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/load.lisp,v 1.16 2008/06/25 14:04:27 edi Exp $ -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions diff --git a/optimize.lisp b/optimize.lisp index e004d13..12c7256 100644 --- a/optimize.lisp +++ b/optimize.lisp @@ -1,10 +1,10 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.26 2005/04/13 15:35:57 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.31 2008/06/25 14:04:27 edi Exp $ ;;; This file contains optimizations which can be applied to converted ;;; parse trees. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -40,6 +40,7 @@ transforms # # #>> to operation on REGEX.")) (defmethod flatten ((seq seq)) + (declare #.*standard-optimize-settings*) ;; this looks more complicated than it is because we modify SEQ in ;; place to avoid unnecessary consing (let ((elements-rest (elements seq))) @@ -71,6 +72,7 @@ operation on REGEX.")) (t (make-instance 'void))))) (defmethod flatten ((alternation alternation)) + (declare #.*standard-optimize-settings*) ;; same algorithm as above (let ((choices-rest (choices alternation))) (loop @@ -98,9 +100,8 @@ operation on REGEX.")) "Encountered alternation without choices."))))) (defmethod flatten ((branch branch)) - (with-slots ((test test) - (then-regex then-regex) - (else-regex else-regex)) + (declare #.*standard-optimize-settings*) + (with-slots (test then-regex else-regex) branch (setq test (if (numberp test) @@ -111,6 +112,7 @@ operation on REGEX.")) branch)) (defmethod flatten ((regex regex)) + (declare #.*standard-optimize-settings*) (typecase regex ((or repetition register lookahead lookbehind standalone) ;; if REGEX contains exactly one inner REGEX object flatten it @@ -124,12 +126,13 @@ operation on REGEX.")) regex))) (defgeneric gather-strings (regex) - (declare #.*standard-optimize-settings*) + (declare #.*standard-optimize-settings*) (:documentation "Collects adjacent strings or characters into one string provided they have the same case mode. This is a destructive operation on REGEX.")) (defmethod gather-strings ((seq seq)) + (declare #.*standard-optimize-settings*) ;; note that GATHER-STRINGS is to be applied after FLATTEN, i.e. it ;; expects SEQ to be flattened already; in particular, SEQ cannot be ;; empty and cannot contain embedded SEQ objects @@ -246,6 +249,7 @@ operation on REGEX.")) seq)) (defmethod gather-strings ((alternation alternation)) + (declare #.*standard-optimize-settings*) ;; loop ON the choices of ALTERNATION so we can modify them directly (loop for choices-rest on (choices alternation) while choices-rest @@ -254,9 +258,8 @@ operation on REGEX.")) alternation) (defmethod gather-strings ((branch branch)) - (with-slots ((test test) - (then-regex then-regex) - (else-regex else-regex)) + (declare #.*standard-optimize-settings*) + (with-slots (test then-regex else-regex) branch (setq test (if (numberp test) @@ -267,6 +270,7 @@ operation on REGEX.")) branch)) (defmethod gather-strings ((regex regex)) + (declare #.*standard-optimize-settings*) (typecase regex ((or repetition register lookahead lookbehind standalone) ;; if REGEX contains exactly one inner REGEX object apply @@ -283,7 +287,7 @@ operation on REGEX.")) ;; Note that START-ANCHORED-P will be called after FLATTEN and GATHER-STRINGS. (defgeneric start-anchored-p (regex &optional in-seq-p) - (declare #.*standard-optimize-settings*) + (declare #.*standard-optimize-settings*) (:documentation "Returns T if REGEX starts with a \"real\" start anchor, i.e. one that's not in multi-line mode, NIL otherwise. If IN-SEQ-P is true the function will return :ZERO-LENGTH if REGEX is a @@ -302,6 +306,7 @@ zero-length assertion.")) finally (return (and anchored-p (not (eq anchored-p :zero-length)))))) (defmethod start-anchored-p ((alternation alternation) &optional in-seq-p) + (declare #.*standard-optimize-settings*) (declare (ignore in-seq-p)) ;; clearly an alternation can only be start-anchored if all of its ;; choices are start-anchored @@ -309,30 +314,36 @@ zero-length assertion.")) always (start-anchored-p choice))) (defmethod start-anchored-p ((branch branch) &optional in-seq-p) + (declare #.*standard-optimize-settings*) (declare (ignore in-seq-p)) (and (start-anchored-p (then-regex branch)) (start-anchored-p (else-regex branch)))) (defmethod start-anchored-p ((repetition repetition) &optional in-seq-p) + (declare #.*standard-optimize-settings*) (declare (ignore in-seq-p)) ;; well, this wouldn't make much sense, but anyway... (and (plusp (minimum repetition)) (start-anchored-p (regex repetition)))) (defmethod start-anchored-p ((register register) &optional in-seq-p) + (declare #.*standard-optimize-settings*) (declare (ignore in-seq-p)) (start-anchored-p (regex register))) (defmethod start-anchored-p ((standalone standalone) &optional in-seq-p) + (declare #.*standard-optimize-settings*) (declare (ignore in-seq-p)) (start-anchored-p (regex standalone))) (defmethod start-anchored-p ((anchor anchor) &optional in-seq-p) + (declare #.*standard-optimize-settings*) (declare (ignore in-seq-p)) (and (startp anchor) (not (multi-line-p anchor)))) (defmethod start-anchored-p ((regex regex) &optional in-seq-p) + (declare #.*standard-optimize-settings*) (typecase regex ((or lookahead lookbehind word-boundary void) ;; zero-length assertions @@ -352,7 +363,7 @@ zero-length assertion.")) ;; Note that END-STRING-AUX will be called after FLATTEN and GATHER-STRINGS. (defgeneric end-string-aux (regex &optional old-case-insensitive-p) - (declare #.*standard-optimize-settings*) + (declare #.*standard-optimize-settings*) (:documentation "Returns the constant string (if it exists) REGEX ends with wrapped into a STR object, otherwise NIL. OLD-CASE-INSENSITIVE-P is the CASE-INSENSITIVE-P slot of the last STR @@ -361,6 +372,7 @@ function called by END-STRIN.)")) (defmethod end-string-aux ((str str) &optional (old-case-insensitive-p :void)) + (declare #.*standard-optimize-settings*) (declare (special last-str)) (cond ((and (not (skip str)) ; avoid constituents of STARTS-WITH ;; only use STR if nothing has been collected yet or if @@ -376,6 +388,7 @@ function called by END-STRIN.)")) (defmethod end-string-aux ((seq seq) &optional (old-case-insensitive-p :void)) + (declare #.*standard-optimize-settings*) (declare (special continuep)) (let (case-insensitive-p concatenated-string @@ -444,14 +457,17 @@ function called by END-STRIN.)")) (defmethod end-string-aux ((register register) &optional (old-case-insensitive-p :void)) + (declare #.*standard-optimize-settings*) (end-string-aux (regex register) old-case-insensitive-p)) (defmethod end-string-aux ((standalone standalone) &optional (old-case-insensitive-p :void)) + (declare #.*standard-optimize-settings*) (end-string-aux (regex standalone) old-case-insensitive-p)) (defmethod end-string-aux ((regex regex) &optional (old-case-insensitive-p :void)) + (declare #.*standard-optimize-settings*) (declare (special last-str end-anchored-p continuep)) (typecase regex ((or anchor lookahead lookbehind word-boundary void) @@ -474,14 +490,11 @@ function called by END-STRIN.)")) ;; REPETITION, FILTER) nil))) -(defgeneric end-string (regex) - (declare #.*standard-optimize-settings*) - (:documentation "Returns the constant string (if it exists) REGEX ends with wrapped -into a STR object, otherwise NIL.")) - -(defmethod end-string ((regex regex)) +(defun end-string (regex) (declare (special end-string-offset)) - (declare #.*standard-optimize-settings*) + (declare #.*standard-optimize-settings*) + "Returns the constant string (if it exists) REGEX ends with wrapped +into a STR object, otherwise NIL." ;; LAST-STR points to the last STR object (seen from the end) that's ;; part of END-STRING; CONTINUEP is set to T if we stop collecting ;; in the middle of a SEQ @@ -499,53 +512,64 @@ into a STR object, otherwise NIL.")) end-string-offset (offset last-str)))))) (defgeneric compute-min-rest (regex current-min-rest) - (declare #.*standard-optimize-settings*) + (declare #.*standard-optimize-settings*) (:documentation "Returns the minimal length of REGEX plus CURRENT-MIN-REST. This is similar to REGEX-MIN-LENGTH except that it recurses down into REGEX and sets the MIN-REST slots of REPETITION objects.")) (defmethod compute-min-rest ((seq seq) current-min-rest) + (declare #.*standard-optimize-settings*) (loop for element in (reverse (elements seq)) for last-min-rest = current-min-rest then this-min-rest for this-min-rest = (compute-min-rest element last-min-rest) finally (return this-min-rest))) (defmethod compute-min-rest ((alternation alternation) current-min-rest) + (declare #.*standard-optimize-settings*) (loop for choice in (choices alternation) minimize (compute-min-rest choice current-min-rest))) (defmethod compute-min-rest ((branch branch) current-min-rest) + (declare #.*standard-optimize-settings*) (min (compute-min-rest (then-regex branch) current-min-rest) (compute-min-rest (else-regex branch) current-min-rest))) (defmethod compute-min-rest ((str str) current-min-rest) + (declare #.*standard-optimize-settings*) (+ current-min-rest (len str))) (defmethod compute-min-rest ((filter filter) current-min-rest) + (declare #.*standard-optimize-settings*) (+ current-min-rest (or (len filter) 0))) (defmethod compute-min-rest ((repetition repetition) current-min-rest) + (declare #.*standard-optimize-settings*) (setf (min-rest repetition) current-min-rest) (compute-min-rest (regex repetition) current-min-rest) (+ current-min-rest (* (minimum repetition) (min-len repetition)))) (defmethod compute-min-rest ((register register) current-min-rest) + (declare #.*standard-optimize-settings*) (compute-min-rest (regex register) current-min-rest)) (defmethod compute-min-rest ((standalone standalone) current-min-rest) + (declare #.*standard-optimize-settings*) (declare (ignore current-min-rest)) (compute-min-rest (regex standalone) 0)) (defmethod compute-min-rest ((lookahead lookahead) current-min-rest) + (declare #.*standard-optimize-settings*) (compute-min-rest (regex lookahead) 0) current-min-rest) (defmethod compute-min-rest ((lookbehind lookbehind) current-min-rest) + (declare #.*standard-optimize-settings*) (compute-min-rest (regex lookbehind) (+ current-min-rest (len lookbehind))) current-min-rest) (defmethod compute-min-rest ((regex regex) current-min-rest) + (declare #.*standard-optimize-settings*) (typecase regex ((or char-class everything) (1+ current-min-rest)) diff --git a/packages.lisp b/packages.lisp index 6c046fd..9c5e862 100644 --- a/packages.lisp +++ b/packages.lisp @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.19 2005/04/01 21:29:10 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.24 2008/06/25 14:04:27 edi Exp $ -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -53,15 +53,14 @@ #:*regex-char-code-limit* #:*use-bmh-matchers* #:*allow-quoting* + #:*allow-named-registers* #:ppcre-error #:ppcre-invocation-error #:ppcre-syntax-error #:ppcre-syntax-error-string #:ppcre-syntax-error-pos #:register-groups-bind - #:do-register-groups - #:*standard-optimize-settings* - #:*special-optimize-settings*)) + #:do-register-groups)) #+:cormanlisp (defpackage "CL-PPCRE" @@ -86,15 +85,14 @@ "*REGEX-CHAR-CODE-LIMIT*" "*USE-BMH-MATCHERS*" "*ALLOW-QUOTING*" + "*ALLOW-NAMED-REGISTERS*" "PPCRE-ERROR" "PPCRE-INVOCATION-ERROR" "PPCRE-SYNTAX-ERROR" "PPCRE-SYNTAX-ERROR-STRING" "PPCRE-SYNTAX-ERROR-POS" "REGISTER-GROUPS-BIND" - "DO-REGISTER-GROUPS" - "*STANDARD-OPTIMIZE-SETTINGS*" - "*SPECIAL-OPTIMIZE-SETTINGS*")) + "DO-REGISTER-GROUPS")) #-:cormanlisp (defpackage #:cl-ppcre-test diff --git a/parser.lisp b/parser.lisp index 62c1d79..9fd0820 100644 --- a/parser.lisp +++ b/parser.lisp @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.25 2008/06/25 14:04:28 edi Exp $ ;;; The parser will - with the help of the lexer - parse a regex ;;; string and convert it into a "parse tree" (see docs for details @@ -7,7 +7,7 @@ ;;; illegal parse trees. It is assumed that the conversion process ;;; later on will track them down. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -38,19 +38,20 @@ (defun group (lexer) (declare #.*standard-optimize-settings*) "Parses and consumes a . -The productions are: -> \"(\"\")\" - \"(?:\"\")\" - \"(?>\"\")\" - \"(?:\"\")\" - \"(?=\"\")\" - \"(?!\"\")\" - \"(?<=\"\")\" - \"(?\")\" - \"(?(\"\")\"\")\" - \"(?(\"\")\"\")\" +The productions are: -> \"\(\"\")\" + \"\(?:\"\")\" + \"\(?>\"\")\" + \"\(?:\"\")\" + \"\(?=\"\")\" + \"\(?!\"\")\" + \"\(?<=\"\")\" + \"\(?\")\" + \"\(?\(\"\")\"\")\" + \"\(?\(\"\")\"\")\" + \"\(?\"\")\" \(when *ALLOW-NAMED-REGISTERS* is T) where is parsed by the lexer function MAYBE-PARSE-FLAGS. -Will return or ( ) where +Will return or \( ) where is one of six keywords - see source for details." (multiple-value-bind (open-token flags) (get-token lexer) @@ -103,17 +104,21 @@ Will return or ( ) where :open-paren-equal :open-paren-exclamation :open-paren-less-equal - :open-paren-less-exclamation) + :open-paren-less-exclamation + :open-paren-less-letter) :test #'eq) ;; make changes to extended-mode-p local (let ((*extended-mode-p* *extended-mode-p*)) ;; we saw one of the six token representing opening ;; parentheses (let* ((open-paren-pos (car (lexer-last-pos lexer))) + (register-name (when (eq open-token :open-paren-less-letter) + (parse-register-name-aux lexer))) (reg-expr (reg-expr lexer)) (close-token (get-token lexer))) - (when (eq open-token :open-paren) - ;; if this is the "("")" production we have to + (when (or (eq open-token :open-paren) + (eq open-token :open-paren-less-letter)) + ;; if this is the "("")" or "(?"""")" production we have to ;; increment the register counter of the lexer (incf (lexer-reg lexer))) (unless (eq close-token :close-paren) @@ -126,27 +131,33 @@ Will return or ( ) where ;; if the lexer has returned a list of flags this must ;; have been the "(?:"")" production (cons :group (nconc flags (list reg-expr))) - (list (case open-token - ((:open-paren) - :register) - ((:open-paren-colon) - :group) - ((:open-paren-greater) - :standalone) - ((:open-paren-equal) - :positive-lookahead) - ((:open-paren-exclamation) - :negative-lookahead) - ((:open-paren-less-equal) - :positive-lookbehind) - ((:open-paren-less-exclamation) - :negative-lookbehind)) - reg-expr))))) + (if (eq open-token :open-paren-less-letter) + (list :named-register + ;; every string was reversed, so we have to + ;; reverse it back to get the name + (nreverse register-name) + reg-expr) + (list (case open-token + ((:open-paren) + :register) + ((:open-paren-colon) + :group) + ((:open-paren-greater) + :standalone) + ((:open-paren-equal) + :positive-lookahead) + ((:open-paren-exclamation) + :negative-lookahead) + ((:open-paren-less-equal) + :positive-lookbehind) + ((:open-paren-less-exclamation) + :negative-lookbehind)) + reg-expr)))))) (t - ;; this is the production; is - ;; any token which passes START-OF-SUBEXPR-P (otherwise - ;; parsing had already stopped in the SEQ method) - open-token)))) + ;; this is the production; is + ;; any token which passes START-OF-SUBEXPR-P (otherwise + ;; parsing had already stopped in the SEQ method) + open-token)))) (defun greedy-quant (lexer) (declare #.*standard-optimize-settings*) diff --git a/ppcre-tests.lisp b/ppcre-tests.lisp index 560beae..25cb6c1 100644 --- a/ppcre-tests.lisp +++ b/ppcre-tests.lisp @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/ppcre-tests.lisp,v 1.31 2005/08/23 12:23:13 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/ppcre-tests.lisp,v 1.36 2008/06/25 14:04:28 edi Exp $ -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -41,7 +41,8 @@ #+:ecl (si:gc t) #+:clisp (ext:gc) #+:cormanlisp (loop for i from 0 to 3 do (cormanlisp:gc i)) - #+:lispworks (hcl:mark-and-sweep 3) + #+:lispworks4 (hcl:mark-and-sweep 3) + #+:lispworks5 (hcl:gc-generation #+:lispworks-32bit 3 #+:lispworks-64bit :blocking-gen-num) #+:sbcl (sb-ext:gc :full t)) ;; warning: ugly code ahead!! @@ -52,7 +53,7 @@ multi-line-mode single-line-mode extended-mode) - (declare #.*standard-optimize-settings*) + (declare #.ppcre::*standard-optimize-settings*) "Auxiliary function used by TEST to benchmark a regex scanner against Perl timings." (declare (type string string)) @@ -73,7 +74,7 @@ against Perl timings." lispworks (and sbcl sb-thread)) (defun threaded-scan (scanner target-string &key (threads 10) (repetitions 5000)) - (declare #.*standard-optimize-settings*) + (declare #.ppcre::*standard-optimize-settings*) "Auxiliary function used by TEST to check whether SCANNER is thread-safe." (full-gc) (let ((collector (make-array threads)) @@ -133,7 +134,7 @@ against Perl timings." :defaults *cl-ppcre-test-base-directory*) file-name-provided-p) threaded) - (declare #.*standard-optimize-settings*) + (declare #.ppcre::*standard-optimize-settings*) (declare (ignorable threaded)) "Loop through all test cases in FILE-NAME and print report. Only in LispWorks and SCL: If THREADED is true, also test whether the scanners diff --git a/regex-class.lisp b/regex-class.lisp index 62c0f5b..848cd34 100644 --- a/regex-class.lisp +++ b/regex-class.lisp @@ -1,11 +1,11 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.26 2005/06/10 10:23:42 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.34 2008/07/03 07:44:06 edi Exp $ ;;; This file defines the REGEX class and some utility methods for ;;; this class. REGEX objects are used to represent the (transformed) ;;; parse trees internally -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -122,7 +122,10 @@ appear after this repetition.") :reader num :type fixnum :documentation "The number of this register, starting from 0. -This is the index into *REGS-START* and *REGS-END*.")) +This is the index into *REGS-START* and *REGS-END*.") + (name :initarg :name + :reader name + :documentation "Name of this register or NIL.")) (:documentation "REGISTER objects represent register groups.")) (defclass standalone (regex) @@ -137,18 +140,21 @@ This is the index into *REGS-START* and *REGS-END*.")) :type fixnum :documentation "The number of the register this reference refers to.") + (name :initarg :name + :accessor name + :documentation "The name of the register this +reference refers to or NIL.") (case-insensitive-p :initarg :case-insensitive-p :reader case-insensitive-p :documentation "Whether we check case-insensitively.")) (:documentation "BACK-REFERENCE objects represent backreferences.")) - (defclass char-class (regex) - ((hash :initarg :hash - :reader hash - :type (or hash-table null) - :documentation "A hash table the keys of which are the -characters; the values are always T.") + ((charset :initarg :charset + :reader charset + :type (or charset null) + :documentation "A charset denoting the characters +in the character class.") (case-insensitive-p :initarg :case-insensitive-p :reader case-insensitive-p :documentation "If the char class @@ -253,20 +259,16 @@ defined by the user.")) (defmethod initialize-instance :after ((char-class char-class) &rest init-args) (declare #.*standard-optimize-settings*) - "Make large hash tables smaller, if possible." - (let ((hash (getf init-args :hash))) - (when (and hash + "Make large charsets smaller, if possible." + (let ((set (getf init-args :charset))) + (when (and set (> *regex-char-code-limit* 256) - (> (hash-table-count hash) + (> (charset-count set) (/ *regex-char-code-limit* 2))) - (setf (slot-value char-class 'hash) - (merge-inverted-hash (make-hash-table) - hash) + (setf (slot-value char-class 'set) + (merge-set (make-charset) set) (slot-value char-class 'invertedp) - (not (slot-value char-class 'invertedp)))))) - -;;; The following four methods allow a VOID object to behave like a -;;; zero-length STR object (only readers needed) + (not (slot-value char-class 'invertedp)))))) (defmethod initialize-instance :after ((str str) &rest init-args) (declare #.*standard-optimize-settings*) @@ -277,6 +279,9 @@ defined by the user.")) (setf (slot-value str 'str) (coerce str-slot 'simple-string)))) (setf (len str) (length (str str)))) +;;; The following four methods allow a VOID object to behave like a +;;; zero-length STR object (only readers needed) + (defmethod len ((void void)) (declare #.*standard-optimize-settings*) 0) @@ -301,6 +306,7 @@ second argument if the STR has length 0. Returns NIL for REGEX objects which are not of type STR.")) (defmethod case-mode ((str str) old-case-mode) + (declare #.*standard-optimize-settings*) (cond ((zerop (len str)) old-case-mode) ((case-insensitive-p str) @@ -309,6 +315,7 @@ which are not of type STR.")) :case-sensitive))) (defmethod case-mode ((regex regex) old-case-mode) + (declare #.*standard-optimize-settings*) (declare (ignore old-case-mode)) nil) @@ -317,37 +324,45 @@ which are not of type STR.")) (:documentation "Implements a deep copy of a REGEX object.")) (defmethod copy-regex ((anchor anchor)) + (declare #.*standard-optimize-settings*) (make-instance 'anchor :startp (startp anchor) :multi-line-p (multi-line-p anchor) :no-newline-p (no-newline-p anchor))) (defmethod copy-regex ((everything everything)) + (declare #.*standard-optimize-settings*) (make-instance 'everything :single-line-p (single-line-p everything))) (defmethod copy-regex ((word-boundary word-boundary)) + (declare #.*standard-optimize-settings*) (make-instance 'word-boundary :negatedp (negatedp word-boundary))) (defmethod copy-regex ((void void)) + (declare #.*standard-optimize-settings*) (make-instance 'void)) (defmethod copy-regex ((lookahead lookahead)) + (declare #.*standard-optimize-settings*) (make-instance 'lookahead :regex (copy-regex (regex lookahead)) :positivep (positivep lookahead))) (defmethod copy-regex ((seq seq)) + (declare #.*standard-optimize-settings*) (make-instance 'seq :elements (mapcar #'copy-regex (elements seq)))) (defmethod copy-regex ((alternation alternation)) + (declare #.*standard-optimize-settings*) (make-instance 'alternation :choices (mapcar #'copy-regex (choices alternation)))) (defmethod copy-regex ((branch branch)) - (with-slots ((test test)) + (declare #.*standard-optimize-settings*) + (with-slots (test) branch (make-instance 'branch :test (if (typep test 'regex) @@ -357,12 +372,14 @@ which are not of type STR.")) :else-regex (copy-regex (else-regex branch))))) (defmethod copy-regex ((lookbehind lookbehind)) + (declare #.*standard-optimize-settings*) (make-instance 'lookbehind :regex (copy-regex (regex lookbehind)) :positivep (positivep lookbehind) :len (len lookbehind))) (defmethod copy-regex ((repetition repetition)) + (declare #.*standard-optimize-settings*) (make-instance 'repetition :regex (copy-regex (regex repetition)) :greedyp (greedyp repetition) @@ -373,32 +390,39 @@ which are not of type STR.")) :contains-register-p (contains-register-p repetition))) (defmethod copy-regex ((register register)) + (declare #.*standard-optimize-settings*) (make-instance 'register :regex (copy-regex (regex register)) - :num (num register))) + :num (num register) + :name (name register))) (defmethod copy-regex ((standalone standalone)) + (declare #.*standard-optimize-settings*) (make-instance 'standalone :regex (copy-regex (regex standalone)))) (defmethod copy-regex ((back-reference back-reference)) + (declare #.*standard-optimize-settings*) (make-instance 'back-reference :num (num back-reference) :case-insensitive-p (case-insensitive-p back-reference))) (defmethod copy-regex ((char-class char-class)) + (declare #.*standard-optimize-settings*) (make-instance 'char-class - :hash (hash char-class) + :charset (charset char-class) :case-insensitive-p (case-insensitive-p char-class) :invertedp (invertedp char-class) :word-char-class-p (word-char-class-p char-class))) (defmethod copy-regex ((str str)) + (declare #.*standard-optimize-settings*) (make-instance 'str :str (str str) :case-insensitive-p (case-insensitive-p str))) (defmethod copy-regex ((filter filter)) + (declare #.*standard-optimize-settings*) (make-instance 'filter :fn (fn filter) :len (len filter))) @@ -420,6 +444,7 @@ optionally removes embedded REGISTER objects if possible and if the special variable REMOVE-REGISTERS-P is true.")) (defmethod remove-registers ((register register)) + (declare #.*standard-optimize-settings*) (declare (special remove-registers-p reg-seen)) (cond (remove-registers-p (remove-registers (regex register))) @@ -430,6 +455,7 @@ special variable REMOVE-REGISTERS-P is true.")) (copy-regex register)))) (defmethod remove-registers ((repetition repetition)) + (declare #.*standard-optimize-settings*) (let* (reg-seen (inner-regex (remove-registers (regex repetition)))) ;; REMOVE-REGISTERS will set REG-SEEN (see method above) if @@ -445,22 +471,26 @@ special variable REMOVE-REGISTERS-P is true.")) :contains-register-p reg-seen))) (defmethod remove-registers ((standalone standalone)) + (declare #.*standard-optimize-settings*) (make-instance 'standalone :regex (remove-registers (regex standalone)))) (defmethod remove-registers ((lookahead lookahead)) + (declare #.*standard-optimize-settings*) (make-instance 'lookahead :regex (remove-registers (regex lookahead)) :positivep (positivep lookahead))) (defmethod remove-registers ((lookbehind lookbehind)) + (declare #.*standard-optimize-settings*) (make-instance 'lookbehind :regex (remove-registers (regex lookbehind)) :positivep (positivep lookbehind) :len (len lookbehind))) (defmethod remove-registers ((branch branch)) - (with-slots ((test test)) + (declare #.*standard-optimize-settings*) + (with-slots (test) branch (make-instance 'branch :test (if (typep test 'regex) @@ -470,15 +500,18 @@ special variable REMOVE-REGISTERS-P is true.")) :else-regex (remove-registers (else-regex branch))))) (defmethod remove-registers ((alternation alternation)) + (declare #.*standard-optimize-settings*) (declare (special remove-registers-p)) ;; an ALTERNATION, so we can't remove REGISTER objects further down (setq remove-registers-p nil) (copy-regex alternation)) (defmethod remove-registers ((regex regex)) + (declare #.*standard-optimize-settings*) (copy-regex regex)) (defmethod remove-registers ((seq seq)) + (declare #.*standard-optimize-settings*) (make-instance 'seq :elements (mapcar #'remove-registers (elements seq)))) @@ -489,6 +522,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (i.e. the object corresponding to \".\", for example.")) (defmethod everythingp ((seq seq)) + (declare #.*standard-optimize-settings*) ;; we might have degenerate cases like (:SEQUENCE :VOID ...) ;; due to the parsing process (let ((cleaned-elements (remove-if #'(lambda (element) @@ -498,7 +532,8 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (everythingp (first cleaned-elements))))) (defmethod everythingp ((alternation alternation)) - (with-slots ((choices choices)) + (declare #.*standard-optimize-settings*) + (with-slots (choices) alternation (and (= 1 (length choices)) ;; this is unlikely to happen for human-generated regexes, @@ -506,9 +541,8 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (everythingp (first choices))))) (defmethod everythingp ((repetition repetition)) - (with-slots ((maximum maximum) - (minimum minimum) - (regex regex)) + (declare #.*standard-optimize-settings*) + (with-slots (maximum minimum regex) repetition (and maximum (= 1 minimum maximum) @@ -516,15 +550,19 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (everythingp regex)))) (defmethod everythingp ((register register)) + (declare #.*standard-optimize-settings*) (everythingp (regex register))) (defmethod everythingp ((standalone standalone)) + (declare #.*standard-optimize-settings*) (everythingp (regex standalone))) (defmethod everythingp ((everything everything)) + (declare #.*standard-optimize-settings*) everything) (defmethod everythingp ((regex regex)) + (declare #.*standard-optimize-settings*) ;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS, ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY nil) @@ -534,6 +572,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (:documentation "Return the length of REGEX if it is fixed, NIL otherwise.")) (defmethod regex-length ((seq seq)) + (declare #.*standard-optimize-settings*) ;; simply add all inner lengths unless one of them is NIL (loop for sub-regex in (elements seq) for len = (regex-length sub-regex) @@ -541,6 +580,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true sum len)) (defmethod regex-length ((alternation alternation)) + (declare #.*standard-optimize-settings*) ;; only return a true value if all inner lengths are non-NIL and ;; mutually equal (loop for sub-regex in (choices alternation) @@ -551,6 +591,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true finally (return len))) (defmethod regex-length ((branch branch)) + (declare #.*standard-optimize-settings*) ;; only return a true value if both alternations have a length and ;; if they're equal (let ((then-length (regex-length (then-regex branch)))) @@ -559,13 +600,12 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true then-length))) (defmethod regex-length ((repetition repetition)) + (declare #.*standard-optimize-settings*) ;; we can only compute the length of a REPETITION object if the ;; number of repetitions is fixed; note that we don't call ;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is ;; always set correctly - (with-slots ((len len) - (minimum minimum) - (maximum maximum)) + (with-slots (len minimum maximum) repetition (if (and len (eql minimum maximum)) @@ -573,29 +613,37 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true nil))) (defmethod regex-length ((register register)) + (declare #.*standard-optimize-settings*) (regex-length (regex register))) (defmethod regex-length ((standalone standalone)) + (declare #.*standard-optimize-settings*) (regex-length (regex standalone))) (defmethod regex-length ((back-reference back-reference)) + (declare #.*standard-optimize-settings*) ;; with enough effort we could possibly do better here, but ;; currently we just give up and return NIL nil) (defmethod regex-length ((char-class char-class)) + (declare #.*standard-optimize-settings*) 1) (defmethod regex-length ((everything everything)) + (declare #.*standard-optimize-settings*) 1) (defmethod regex-length ((str str)) + (declare #.*standard-optimize-settings*) (len str)) (defmethod regex-length ((filter filter)) + (declare #.*standard-optimize-settings*) (len filter)) (defmethod regex-length ((regex regex)) + (declare #.*standard-optimize-settings*) ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and ;; WORD-BOUNDARY (which all have zero-length) 0) @@ -605,12 +653,14 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (:documentation "Returns the minimal length of REGEX.")) (defmethod regex-min-length ((seq seq)) + (declare #.*standard-optimize-settings*) ;; simply add all inner minimal lengths (loop for sub-regex in (elements seq) for len = (regex-min-length sub-regex) sum len)) (defmethod regex-min-length ((alternation alternation)) + (declare #.*standard-optimize-settings*) ;; minimal length of an alternation is the minimal length of the ;; "shortest" element (loop for sub-regex in (choices alternation) @@ -618,35 +668,44 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true minimize len)) (defmethod regex-min-length ((branch branch)) + (declare #.*standard-optimize-settings*) ;; minimal length of both alternations (min (regex-min-length (then-regex branch)) (regex-min-length (else-regex branch)))) (defmethod regex-min-length ((repetition repetition)) + (declare #.*standard-optimize-settings*) ;; obviously the product of the inner minimal length and the minimal ;; number of repetitions (* (minimum repetition) (min-len repetition))) (defmethod regex-min-length ((register register)) + (declare #.*standard-optimize-settings*) (regex-min-length (regex register))) (defmethod regex-min-length ((standalone standalone)) + (declare #.*standard-optimize-settings*) (regex-min-length (regex standalone))) (defmethod regex-min-length ((char-class char-class)) + (declare #.*standard-optimize-settings*) 1) (defmethod regex-min-length ((everything everything)) + (declare #.*standard-optimize-settings*) 1) (defmethod regex-min-length ((str str)) + (declare #.*standard-optimize-settings*) (len str)) (defmethod regex-min-length ((filter filter)) + (declare #.*standard-optimize-settings*) (or (len filter) 0)) (defmethod regex-min-length ((regex regex)) + (declare #.*standard-optimize-settings*) ;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD, ;; LOOKBEHIND, VOID, and WORD-BOUNDARY 0) @@ -664,6 +723,7 @@ slots of STR objects further down the tree.")) ;; into repetitions (defmethod compute-offsets ((seq seq) start-pos) + (declare #.*standard-optimize-settings*) (loop for element in (elements seq) ;; advance offset argument for next call while looping through ;; the elements @@ -673,6 +733,7 @@ slots of STR objects further down the tree.")) finally (return curr-offset))) (defmethod compute-offsets ((alternation alternation) start-pos) + (declare #.*standard-optimize-settings*) (loop for choice in (choices alternation) for old-offset = nil then curr-offset for curr-offset = (compute-offsets choice start-pos) @@ -684,6 +745,7 @@ slots of STR objects further down the tree.")) finally (return curr-offset))) (defmethod compute-offsets ((branch branch) start-pos) + (declare #.*standard-optimize-settings*) ;; only return offset if both alternations have equal value (let ((then-offset (compute-offsets (then-regex branch) start-pos))) (and then-offset @@ -691,10 +753,9 @@ slots of STR objects further down the tree.")) then-offset))) (defmethod compute-offsets ((repetition repetition) start-pos) + (declare #.*standard-optimize-settings*) ;; no need to descend into the inner regex - (with-slots ((len len) - (minimum minimum) - (maximum maximum)) + (with-slots (len minimum maximum) repetition (if (and len (eq minimum maximum)) @@ -704,34 +765,42 @@ slots of STR objects further down the tree.")) nil))) (defmethod compute-offsets ((register register) start-pos) + (declare #.*standard-optimize-settings*) (compute-offsets (regex register) start-pos)) (defmethod compute-offsets ((standalone standalone) start-pos) + (declare #.*standard-optimize-settings*) (compute-offsets (regex standalone) start-pos)) (defmethod compute-offsets ((char-class char-class) start-pos) + (declare #.*standard-optimize-settings*) (1+ start-pos)) (defmethod compute-offsets ((everything everything) start-pos) + (declare #.*standard-optimize-settings*) (1+ start-pos)) (defmethod compute-offsets ((str str) start-pos) + (declare #.*standard-optimize-settings*) (setf (offset str) start-pos) (+ start-pos (len str))) (defmethod compute-offsets ((back-reference back-reference) start-pos) + (declare #.*standard-optimize-settings*) ;; with enough effort we could possibly do better here, but ;; currently we just give up and return NIL (declare (ignore start-pos)) nil) (defmethod compute-offsets ((filter filter) start-pos) + (declare #.*standard-optimize-settings*) (let ((len (len filter))) (if len (+ start-pos len) nil))) (defmethod compute-offsets ((regex regex) start-pos) + (declare #.*standard-optimize-settings*) ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and ;; WORD-BOUNDARY (which all have zero-length) start-pos) diff --git a/repetition-closures.lisp b/repetition-closures.lisp index db1c5a1..1d5216b 100644 --- a/repetition-closures.lisp +++ b/repetition-closures.lisp @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.24 2005/04/13 15:35:58 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.29 2008/06/25 14:04:28 edi Exp $ ;;; This is actually a part of closures.lisp which we put into a ;;; separate file because it is rather complex. We only deal with @@ -7,7 +7,7 @@ ;;; rather crazy micro-optimizations which were introduced to be as ;;; competitive with Perl as possible in tight loops. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -804,45 +804,41 @@ that REPETITION has a constant number of repetitions.")) ;; utilizes all the functions and macros defined above (defmethod create-matcher-aux ((repetition repetition) next-fn) - (with-slots ((minimum minimum) - (maximum maximum) - (len len) - (min-len min-len) - (greedyp greedyp) - (contains-register-p contains-register-p)) + (declare #.*standard-optimize-settings*) + (with-slots (minimum maximum len min-len greedyp contains-register-p) repetition (cond ((and maximum (zerop maximum)) - ;; this should have been optimized away by CONVERT but just - ;; in case... - (error "Got REPETITION with MAXIMUM 0 \(should not happen)")) + ;; this should have been optimized away by CONVERT but just + ;; in case... + (error "Got REPETITION with MAXIMUM 0 \(should not happen)")) ((and maximum (= minimum maximum 1)) - ;; this should have been optimized away by CONVERT but just - ;; in case... - (error "Got REPETITION with MAXIMUM 1 and MINIMUM 1 \(should not happen)")) + ;; this should have been optimized away by CONVERT but just + ;; in case... + (error "Got REPETITION with MAXIMUM 1 and MINIMUM 1 \(should not happen)")) ((and (eql minimum maximum) len (not contains-register-p)) - (create-constant-repetition-constant-length-matcher repetition next-fn)) + (create-constant-repetition-constant-length-matcher repetition next-fn)) ((eql minimum maximum) - (create-constant-repetition-matcher repetition next-fn)) + (create-constant-repetition-matcher repetition next-fn)) ((and greedyp len (not contains-register-p)) - (create-greedy-constant-length-matcher repetition next-fn)) + (create-greedy-constant-length-matcher repetition next-fn)) ((and greedyp (or (plusp min-len) (eql maximum 1))) - (create-greedy-no-zero-matcher repetition next-fn)) + (create-greedy-no-zero-matcher repetition next-fn)) (greedyp - (create-greedy-matcher repetition next-fn)) + (create-greedy-matcher repetition next-fn)) ((and len (plusp len) (not contains-register-p)) - (create-non-greedy-constant-length-matcher repetition next-fn)) + (create-non-greedy-constant-length-matcher repetition next-fn)) ((or (plusp min-len) (eql maximum 1)) - (create-non-greedy-no-zero-matcher repetition next-fn)) + (create-non-greedy-no-zero-matcher repetition next-fn)) (t - (create-non-greedy-matcher repetition next-fn))))) + (create-non-greedy-matcher repetition next-fn))))) diff --git a/scanner.lisp b/scanner.lisp index 62b04bf..c090da1 100644 --- a/scanner.lisp +++ b/scanner.lisp @@ -1,10 +1,10 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.26 2005/07/19 23:18:15 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.29 2008/06/25 14:04:28 edi Exp $ ;;; Here the scanner for the actual regex as well as utility scanners ;;; for the constant start and end strings are created. -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions diff --git a/specials.lisp b/specials.lisp index 0536349..b19f49e 100644 --- a/specials.lisp +++ b/specials.lisp @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.21 2005/04/01 21:29:10 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.27 2008/07/03 08:13:28 edi Exp $ ;;; globally declared special variables -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -45,7 +45,7 @@ (defvar *special-optimize-settings* '(optimize speed space) - "Special optimize settings used only be a few declaration expressions.") + "Special optimize settings used only by a few declaration expressions.") ;;; special variables used by the lexer/parser combo @@ -55,6 +55,13 @@ ;;; special variables used by the SCAN function and the matchers +(defvar *regex-char-code-limit* char-code-limit + "The upper exclusive bound on the char-codes of characters which can +occur in character classes. Change this value BEFORE creating +scanners if you don't need the \(full) Unicode support of +implementations like AllegroCL, CLISP, LispWorks, or SBCL.") +(declaim (type fixnum *regex-char-code-limit*)) + (defvar *string* "" "The string which is currently scanned by SCAN. Will always be coerced to a SIMPLE-STRING.") @@ -120,11 +127,16 @@ but large) Boyer-Moore-Horspool matchers.") (defvar *allow-quoting* nil "Whether the parser should support Perl's \\Q and \\E.") +(defvar *allow-named-registers* nil + "Whether the parser should support AllegroCL's named registers +\(?\"\") and back-reference \\k syntax.") + (pushnew :cl-ppcre *features*) ;; stuff for Nikodemus Siivola's HYPERDOC ;; see ;; and +;; also used by LW-ADD-ONS (defvar *hyperdoc-base-uri* "http://weitz.de/cl-ppcre/") diff --git a/testdata b/testdata index 1d4b0b8..925d0ef 100644 --- a/testdata +++ b/testdata @@ -14285,3 +14285,4 @@ b" nil 1 0 "b" (nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil) ." nil nil nil nil "aa" nil 1 0 nil nil) (1623 "\"abcdxklqj\" =~ /ab(?=.*q)cd/" "ab(?=.*q)cd" nil nil nil nil "abcdxklqj" nil 1 0 "abcd" (nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil)) (1624 "\"ab\" =~ /a(?!.*$)b/" "a(?!.*$)b" nil nil nil nil "ab" nil 1 0 nil nil) +(1625 "\"Axi\" =~ /.{2}[a-z]/" ".{2}[a-z]" nil nil nil nil "Axi" nil 1 0 "Axi" (nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil)) diff --git a/testinput b/testinput index e04a6e8..8f1ad56 100644 --- a/testinput +++ b/testinput @@ -3942,4 +3942,7 @@ abcdxklqj /a(?!.*$)b/ - ab \ No newline at end of file + ab + +/.{2}[a-z]/ + Axi \ No newline at end of file diff --git a/util.lisp b/util.lisp index 869a263..8caf6e1 100644 --- a/util.lisp +++ b/util.lisp @@ -1,13 +1,10 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.32 2005/08/23 10:32:30 edi Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.40 2008/07/03 10:06:16 edi Exp $ -;;; Utility functions and constants dealing with the hash-tables -;;; we use to encode character classes +;;; Utility functions and constants dealing with the character sets we +;;; use to encode character classes -;;; Hash-tables are treated like sets, i.e. a character C is a member of the -;;; hash-table H iff (GETHASH C H) is true. - -;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -36,7 +33,8 @@ (in-package #:cl-ppcre) #+:lispworks -(import 'lw:with-unique-names) +(eval-when (:compile-toplevel :load-toplevel :execute) + (import 'lw:with-unique-names)) #-:lispworks (defmacro with-unique-names ((&rest bindings) &body body) @@ -103,22 +101,15 @@ are discarded \(that is, the body is an implicit PROGN)." ,,@body)))))) (eval-when (:compile-toplevel :execute :load-toplevel) - (defvar *regex-char-code-limit* char-code-limit - "The upper exclusive bound on the char-codes of characters which -can occur in character classes. Change this value BEFORE creating -scanners if you don't need the Unicode support of implementations like -AllegroCL, CLISP, LispWorks, or SBCL.") - (declaim (type fixnum *regex-char-code-limit*)) - - (defun make-char-hash (test) + (defun make-char-set (test) (declare #.*special-optimize-settings*) - "Returns a hash-table of all characters satisfying test." - (loop with hash = (make-hash-table) - for c of-type fixnum from 0 below char-code-limit - for chr = (code-char c) - if (and chr (funcall test chr)) - do (setf (gethash chr hash) t) - finally (return hash))) + "Returns a CHARSET for all characters satisfying test." + (loop with set = (make-charset) + for code of-type fixnum from 0 below char-code-limit + for char = (code-char code) + if (and char (funcall test char)) + do (add-to-charset char set) + finally (return set))) (declaim (inline word-char-p)) @@ -147,101 +138,72 @@ i.e. whether it would match [\\s] in Perl." ;; the following DEFCONSTANT statements are wrapped with ;; (UNLESS (BOUNDP ...) ...) to make SBCL happy -(unless (boundp '+digit-hash+) - (defconstant +digit-hash+ - (make-char-hash (lambda (chr) (char<= #\0 chr #\9))) - "Hash-table containing the digits from 0 to 9.")) +(unless (boundp '+digit-set+) + (defconstant +digit-set+ + (make-char-set (lambda (chr) (char<= #\0 chr #\9))) + "Character set containing the digits from 0 to 9.")) -(unless (boundp '+word-char-hash+) - (defconstant +word-char-hash+ - (make-char-hash #'word-char-p) - "Hash-table containing all \"word\" characters.")) +(unless (boundp '+word-char-set+) + (defconstant +word-char-set+ + (make-char-set #'word-char-p) + "Character set containing all \"word\" characters.")) -(unless (boundp '+whitespace-char-hash+) - (defconstant +whitespace-char-hash+ - (make-char-hash #'whitespacep) - "Hash-table containing all whitespace characters.")) +(unless (boundp '+whitespace-char-set+) + (defconstant +whitespace-char-set+ + (make-char-set #'whitespacep) + "Character set containing all whitespace characters.")) -(defun merge-hash (hash1 hash2) +(defun create-ranges-from-set (set &key downcasep) (declare #.*standard-optimize-settings*) - "Returns the \"sum\" of two hashes. This is a destructive operation -on HASH1." - (cond ((> (hash-table-count hash2) - *regex-char-code-limit*) - ;; don't walk through, e.g., the whole +WORD-CHAR-HASH+ if - ;; the user has set *REGEX-CHAR-CODE-LIMIT* to a lower value - (loop for c of-type fixnum from 0 below *regex-char-code-limit* - for chr = (code-char c) - if (and chr (gethash chr hash2)) - do (setf (gethash chr hash1) t))) - (t - (loop for chr being the hash-keys of hash2 - do (setf (gethash chr hash1) t)))) - hash1) - -(defun merge-inverted-hash (hash1 hash2) - (declare #.*standard-optimize-settings*) - "Returns the \"sum\" of HASH1 and the \"inverse\" of HASH2. This is -a destructive operation on HASH1." - (loop for c of-type fixnum from 0 below *regex-char-code-limit* - for chr = (code-char c) - if (and chr (not (gethash chr hash2))) - do (setf (gethash chr hash1) t)) - hash1) - -(defun create-ranges-from-hash (hash &key downcasep) - (declare #.*standard-optimize-settings*) - "Tries to identify up to three intervals (with respect to CHAR<) -which together comprise HASH. Returns NIL if this is not possible. -If DOWNCASEP is true it will treat the hash-table as if it represents -both the lower-case and the upper-case variants of its members and -will only return the respective lower-case intervals." - ;; discard empty hash-tables - (unless (and hash (plusp (hash-table-count hash))) - (return-from create-ranges-from-hash nil)) + "Tries to identify up to three intervals \(with respect to CHAR<) +which together comprise the charset SET. Returns NIL if this is not +possible. If DOWNCASEP is true it will treat the charset as if it +represents both the lower-case and the upper-case variants of its +members and will only return the respective lower-case intervals." + ;; discard empty charsets + (unless (and set (plusp (charset-count set))) + (return-from create-ranges-from-set nil)) (loop with min1 and min2 and min3 and max1 and max2 and max3 - ;; loop through all characters in HASH, sorted by CHAR< - for chr in (sort (the list - (loop for chr being the hash-keys of hash - collect (if downcasep - (char-downcase chr) - chr))) - #'char<) - for code = (char-code chr) + ;; loop through all characters in SET, sorted by CHAR< + ;; (actually by < on their character codes, see 13.1.6 in the + ;; ANSI standard) + for code of-type fixnum below *regex-char-code-limit* + for char = (code-char code) + when (and char (in-charset-p (if downcasep (char-downcase char) char) set)) ;; MIN1, MAX1, etc. are _exclusive_ ;; bounds of the intervals identified so far do (cond - ((not min1) - ;; this will only happen once, for the first character - (setq min1 (1- code) - max1 (1+ code))) - ((<= (the fixnum min1) code (the fixnum max1)) - ;; we're here as long as CHR fits into the first interval - (setq min1 (min (the fixnum min1) (1- code)) - max1 (max (the fixnum max1) (1+ code)))) - ((not min2) - ;; we need to open a second interval - ;; this'll also happen only once - (setq min2 (1- code) - max2 (1+ code))) - ((<= (the fixnum min2) code (the fixnum max2)) - ;; CHR fits into the second interval - (setq min2 (min (the fixnum min2) (1- code)) - max2 (max (the fixnum max2) (1+ code)))) - ((not min3) - ;; we need to open the third interval - ;; happens only once - (setq min3 (1- code) - max3 (1+ code))) - ((<= (the fixnum min3) code (the fixnum max3)) - ;; CHR fits into the third interval - (setq min3 (min (the fixnum min3) (1- code)) - max3 (max (the fixnum max3) (1+ code)))) - (t - ;; we're out of luck, CHR doesn't fit - ;; into one of the three intervals - (return nil))) + ((not min1) + ;; this will only happen once, for the first character + (setq min1 (1- code) + max1 (1+ code))) + ((<= (the fixnum min1) code (the fixnum max1)) + ;; we're here as long as CHAR fits into the first interval + (setq min1 (min (the fixnum min1) (1- code)) + max1 (max (the fixnum max1) (1+ code)))) + ((not min2) + ;; we need to open a second interval + ;; this'll also happen only once + (setq min2 (1- code) + max2 (1+ code))) + ((<= (the fixnum min2) code (the fixnum max2)) + ;; CHAR fits into the second interval + (setq min2 (min (the fixnum min2) (1- code)) + max2 (max (the fixnum max2) (1+ code)))) + ((not min3) + ;; we need to open the third interval + ;; happens only once + (setq min3 (1- code) + max3 (1+ code))) + ((<= (the fixnum min3) code (the fixnum max3)) + ;; CHAR fits into the third interval + (setq min3 (min (the fixnum min3) (1- code)) + max3 (max (the fixnum max3) (1+ code)))) + (t + ;; we're out of luck, CHAR doesn't fit + ;; into one of the three intervals + (return nil))) ;; on success return all bounds ;; make them inclusive bounds before returning finally (return (values (code-char (1+ min1))