diff --git a/CHANGELOG b/CHANGELOG index 1e62368..b1ccf2d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,124 @@ +Version 1.2.12 +2005-11-01 +REGEX-APROPOS-AUX now also uses :INHERITED +Fixed typo in parser.lisp (thanks to Derek Peschel) +Fixed value of *REGEX-CHAR-CODE-LIMIT* in docs and test (thanks to Christophe Rhodes) + +Version 1.2.11 +2005-08-01 +Added external format for SBCL in ppcre-tests.lisp (thanks to Christophe Rhodes) + +Version 1.2.10 +2005-07-20 +Fixed bug in CHAR-SEARCHER-AUX (caught by Peter Schuller) +Don't redefine what's already there (for LispWorks) + +Version 1.2.9 +2005-06-27 +Hide compiler macros from CCL (thanks to Karsten Poeck) + +Version 1.2.8 +2005-06-10 +Change EQ to EQL in REGEX-LENGTH for ANSI conformance and ABCL compatibility (thanks to Peter Graves) + +Version 1.2.7 +2005-05-16 +Added lispworks-defsystem.lisp (thanks to Wade Humeniuk) +Fixed bug in WORD-BOUNDARY-P + +Version 1.2.6 +2005-04-13 +Added some DEFGENERICs to appease SBCL (thanks to Alan Shields) +Removed wrong FTYPE declaration for STR (thanks to Alan Shields) + +Version 1.2.5 +2005-03-09 +Customizable optimize qualities (thanks to Damien Kick) + +Version 1.2.4 +2005-03-07 +Changed DEBUG optimize quality from 0 to 1 + +Version 1.2.3 +2005-02-02 +Wrapped WITH-COMPILATION-UNIT around loop in load.lisp + +Version 1.2.2 +2005-02-02 +Fixed bug in hash table optimization (introduced in 1.1.0) + +Version 1.2.1 +2005-01-25 +There was a wrong read-time conditional in api.lisp, sorry + +Version 1.2.0 +2005-01-24 +AllegroCL compatibility mode +Fixed broken load.lisp file (caught by Jim Prewett and Zach Beane) + +Version 1.1.0 +2005-01-23 +Cleaned up load.lisp and cl-ppcre.asd +Make large hash tables smaller, if possible +Correct treatment of constant regular expressions in DO-SCANS + +Version 1.0.0 +2004-12-22 +Special anniversary release... :) + +Version 0.9.4 +2004-12-18 +Fixed bug in NORMALIZE-VAR-LIST (caught by Dave Roberts) + +Version 0.9.3 +2004-12-09 +Fixed bug in CREATE-SCANNER-AUX (caught by Allan Ruttenberg and Gary Byers) + +Version 0.9.2 +2004-12-06 +More compiler macros (thanks to Allan Ruttenberg) + +Version 0.9.1 +2004-11-29 +Shortcuts for REGISTER-GROUPS-BIND and DO-REGISTER-GROUPS (suggested by Alexander Kjeldaas) + +Version 0.9.0 +2004-10-14 +Experimental support for "filters" +Bugfix for standalone regular expressions (ACCUMULATE-START-P wasn't set to NIL) + +Version 0.8.1 +2004-09-30 +Patches for Genera 8.5 (thanks to Patrick O'Donnell) + +Version 0.8.0 +2004-09-16 +Added parse tree synonyms (thanks to Patrick O'Donnell) + +Version 0.7.9 +2004-07-13 +Fixed bug in DO-SCANS (caught by Jan Rychter) + +Version 0.7.8 +2004-07-13 +New SIMPLE-CALLS keyword argument for REGEX-REPLACE(-ALL) +Added environment parameter to compiler macros (thanks to c.l.l article by Joe Marshall) +Added compiler macros for SCAN-TO-STRINGS and REGEX-REPLACE(-ALL) (they somehow got lost) + +Version 0.7.7 +2004-05-19 +Fixed bug in NEWLINE-SKIPPER (caught by RegexCoach user Thomas-Paz Hartman) +Added doc strings for PPCRE-SYNTAX-ERROR and friends (after playing with slime-apropos-package) +Added hyperdoc support + +Version 0.7.6 +2004-04-20 +The closures created by CREATE-BMH-MATCHER now cleanly cope with negative arguments (bug caught by Damien Kick) + +Version 0.7.5 +2004-04-19 +Fixed a bug with constant-length repetitions of . (dot) in single-line mode (caught by RegexCoach user Lee Gold) + Version 0.7.4 2004-02-16 Fixed wrong call to SIGNAL-PPCRE-SIGNAL-ERROR in lexer.lisp (caught by Peter Graves) @@ -6,7 +127,7 @@ Compiler macro for SPLIT Version 0.7.3 2004-01-28 -Fixed bug in CURRENT-MIN-REST for lookaheads (reported by Thomas-Paz Hartman) +Fixed bug in CURRENT-MIN-REST for lookaheads (reported by RegexCoach user Thomas-Paz Hartman) Added tests for this bug Version 0.7.2 diff --git a/README b/README index 988a8fc..1b92efd 100644 --- a/README +++ b/README @@ -1,6 +1,10 @@ Complete documentation for CL-PPCRE can be found in the 'doc' directory. +CL-PPCRE also supports Nikodemus Siivola's HYPERDOC, see + and +. + 1. Installation 1.1. Probably the easiest way is @@ -24,6 +28,9 @@ directory. 1.3. You can also use ASDF instead of MK:DEFSYSTEM in a similar way (use the .asd files instead of the .system files). +1.4. For LispWorks there's a file 'lispworks-defsystem.lisp' which includes + a system definition for LispWork's Common Defsystem. + 2. Test CL-PPCRE comes with a test suite that can be used to check its @@ -48,4 +55,8 @@ visual feedback.) It should exactly report three 'errors' (662, 790, and 1439) which are explained in the documentation. MCL might report an error for the ninth test case which is also -explained in the docs. \ No newline at end of file +explained in the docs. + +Genera notes (thanks to Patrick O'Donnell): Some more tests will fail +because characters like #\Return, #\Linefeed, or #\Tab have encodings +which differ from Perl's (and thus CL-PPCRE's) expectations. diff --git a/api.lisp b/api.lisp index c97aa2d..9afe62a 100644 --- a/api.lisp +++ b/api.lisp @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/api.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.60 2005/11/01 09:51:01 edi Exp $ ;;; The external API for creating and using scanners. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -43,17 +43,13 @@ are equivalent to the imsx modifiers in Perl. If DESTRUCTIVE is not NIL the function is allowed to destructively modify its first argument \(but only if it's a parse tree).")) +#-:use-acl-regexp2-engine (defmethod create-scanner ((regex-string string) &key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (ignore destructive)) ;; parse the string into a parse-tree and then call CREATE-SCANNER ;; again @@ -70,34 +66,26 @@ NIL the function is allowed to destructively modify its first argument :single-line-mode single-line-mode :destructive t))) +#-:use-acl-regexp2-engine (defmethod create-scanner ((scanner function) &key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (ignore destructive)) (when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode) (signal-ppcre-invocation-error "You can't use the keyword arguments to modify an existing scanner.")) scanner) +#-:use-acl-regexp2-engine (defmethod create-scanner ((parse-tree t) &key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (when extended-mode (signal-ppcre-invocation-error "Extended mode doesn't make sense in parse trees.")) @@ -188,6 +176,35 @@ NIL the function is allowed to destructively modify its first argument *zero-length-num* reg-num)))))) +#+:use-acl-regexp2-engine +(declaim (inline create-scanner)) + +#+:use-acl-regexp2-engine +(defmethod create-scanner ((scanner regexp::regular-expression) &key case-insensitive-mode + multi-line-mode + single-line-mode + extended-mode + destructive) + (declare (ignore destructive)) + (when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode) + (signal-ppcre-invocation-error + "You can't use the keyword arguments to modify an existing scanner.")) + scanner) + +#+:use-acl-regexp2-engine +(defmethod create-scanner ((parse-tree t) &key case-insensitive-mode + multi-line-mode + single-line-mode + extended-mode + destructive) + (declare (ignore destructive)) + (excl:compile-re parse-tree + :case-fold case-insensitive-mode + :ignore-whitespace extended-mode + :multiple-lines multi-line-mode + :single-line single-line-mode + :return :index)) + (defgeneric scan (regex target-string &key start end) (:documentation "Searches TARGET-STRING from START to END and tries to match REGEX. On success returns four values - the start of the @@ -197,50 +214,66 @@ string which will be parsed according to Perl syntax, a parse tree, or a pre-compiled scanner created by CREATE-SCANNER. TARGET-STRING will be coerced to a simple string if it isn't one already.")) +#-:use-acl-regexp2-engine (defmethod scan ((regex-string string) target-string &key (start 0) (end (length target-string))) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) ;; note that the scanners are optimized for simple strings so we ;; have to coerce TARGET-STRING into one if it isn't already (funcall (create-scanner regex-string) (maybe-coerce-to-simple-string target-string) start end)) +#-:use-acl-regexp2-engine (defmethod scan ((scanner function) target-string &key (start 0) (end (length target-string))) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (funcall scanner (maybe-coerce-to-simple-string target-string) start end)) +#-:use-acl-regexp2-engine (defmethod scan ((parse-tree t) target-string &key (start 0) (end (length target-string))) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (funcall (create-scanner parse-tree) (maybe-coerce-to-simple-string target-string) start end)) -(define-compiler-macro scan (&whole form regex target-string &rest rest) +#+:use-acl-regexp2-engine +(declaim (inline scan)) + +#+:use-acl-regexp2-engine +(defmethod scan ((parse-tree t) target-string + &key (start 0) + (end (length target-string))) + (when (< end start) + (return-from scan nil)) + (let ((results (multiple-value-list (excl:match-re parse-tree target-string + :start start + :end end + :return :index)))) + (declare (dynamic-extent results)) + (cond ((null (first results)) nil) + (t (let* ((no-of-regs (- (length results) 2)) + (reg-starts (make-array no-of-regs + :element-type '(or null fixnum))) + (reg-ends (make-array no-of-regs + :element-type '(or null fixnum))) + (match (second results))) + (loop for (start . end) in (cddr results) + for i from 0 + do (setf (aref reg-starts i) start + (aref reg-ends i) end)) + (values (car match) (cdr match) reg-starts reg-ends)))))) + +#-:cormanlisp +(define-compiler-macro scan (&whole form &environment env regex target-string &rest rest) "Make sure that constant forms are compiled into scanners at compile time." - (cond ((constantp regex) + (cond ((constantp regex env) `(scan (load-time-value (create-scanner ,regex)) ,target-string ,@rest)) @@ -249,12 +282,7 @@ be coerced to a simple string if it isn't one already.")) (defun scan-to-strings (regex target-string &key (start 0) (end (length target-string)) sharedp) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Like SCAN but returns substrings of TARGET-STRING instead of positions, i.e. this function returns two values on success: the whole match as a string plus an array of substrings (or NILs) corresponding @@ -276,6 +304,16 @@ structure with TARGET-STRING." reg-starts reg-ends))))) +#-:cormanlisp +(define-compiler-macro scan-to-strings + (&whole form &environment env regex target-string &rest rest) + "Make sure that constant forms are compiled into scanners at compile time." + (cond ((constantp regex env) + `(scan-to-strings (load-time-value + (create-scanner ,regex)) + ,target-string ,@rest)) + (t form))) + (defmacro register-groups-bind (var-list (regex target-string &key start end sharedp) &body body) @@ -287,7 +325,7 @@ VAR-LIST which is NIL there's no binding to the corresponding register group. The number of variables in VAR-LIST must not be greater than the number of register groups. If SHAREDP is true, the substrings may share structure with TARGET-STRING." - (rebinding (target-string) + (with-rebinding (target-string) (with-unique-names (match-start match-end reg-starts reg-ends start-index substr-fn) `(multiple-value-bind (,match-start ,match-end ,reg-starts ,reg-ends) @@ -299,24 +337,26 @@ share structure with TARGET-STRING." `(,substr-fn (if ,sharedp #'nsubseq #'subseq)) - (loop for var in var-list + (loop for (function var) in (normalize-var-list var-list) for counter from 0 when var - collect `(,var (let ((,start-index - (aref ,reg-starts ,counter))) - (if ,start-index - (funcall ,substr-fn - ,target-string - ,start-index - (aref ,reg-ends ,counter)) - nil))))) + collect `(,var (let ((,start-index + (aref ,reg-starts ,counter))) + (if ,start-index + (funcall ,function + (funcall ,substr-fn + ,target-string + ,start-index + (aref ,reg-ends ,counter))) + nil))))) ,@body)))))) (defmacro do-scans ((match-start match-end reg-starts reg-ends regex target-string &optional result-form &key start end) - &body body) + &body body + &environment env) "Iterates over TARGET-STRING and tries to match REGEX as often as possible evaluating BODY with MATCH-START, MATCH-END, REG-STARTS, and REG-ENDS bound to the four return values of each match in turn. After @@ -325,19 +365,24 @@ implicit block named NIL surrounds DO-SCANS; RETURN may be used to terminate the loop immediately. If REGEX matches an empty string the scan is continued one position behind this match. BODY may start with declarations." - (rebinding (target-string regex) - (with-unique-names (%start %end scanner loop-tag block-name) + (with-rebinding (target-string) + (with-unique-names (%start %end %regex scanner loop-tag block-name) + (declare (ignorable %regex scanner)) ;; the NIL BLOCK to enable exits via (RETURN ...) `(block nil (let* ((,%start (or ,start 0)) (*real-start-pos* ,%start) (,%end (or ,end (length ,target-string))) - ;; create a scanner unless the regex is already a - ;; function (otherwise SCAN will do this on each - ;; iteration) - (,scanner (typecase ,regex - (function ,regex) - (t (create-scanner ,regex))))) + ,@(unless (constantp regex env) + ;; leave constant regular expressions as they are - + ;; SCAN's compiler macro will take care of them; + ;; otherwise create a scanner unless the regex is + ;; already a function (otherwise SCAN will do this + ;; on each iteration) + `((,%regex ,regex) + (,scanner (typecase ,%regex + (function ,%regex) + (t (create-scanner ,%regex))))))) ;; coerce TARGET-STRING to a simple string unless it is one ;; already (otherwise SCAN will do this on each iteration) (setq ,target-string @@ -350,7 +395,9 @@ declarations." ;; provided variables (multiple-value-bind (,match-start ,match-end ,reg-starts ,reg-ends) - (scan ,scanner ,target-string :start ,%start :end ,%end) + (scan ,(cond ((constantp regex env) regex) + (t scanner)) + ,target-string :start ,%start :end ,%end) ;; declare the variables to be IGNORABLE to prevent the ;; compiler from issuing warnings (declare @@ -363,7 +410,7 @@ declarations." (locally ,@body) ;; advance by one position if we had a zero-length match - (setq ,%start (if (= ,%start ,match-end) + (setq ,%start (if (= ,match-start ,match-end) (1+ ,match-end) ,match-end))) (go ,loop-tag)))))))) @@ -405,7 +452,7 @@ terminate the loop immediately. If REGEX matches an empty string the scan is continued one position behind this match. If SHAREDP is true, the substrings may share structure with TARGET-STRING. BODY may start with declarations." - (rebinding (target-string) + (with-rebinding (target-string) (with-unique-names (match-start match-end substr-fn) `(let ((,substr-fn (if ,sharedp #'nsubseq #'subseq))) ;; simple use DO-MATCHES to extract the substrings @@ -432,7 +479,7 @@ surrounds DO-REGISTER-GROUPS; RETURN may be used to terminate the loop immediately. If REGEX matches an empty string the scan is continued one position behind this match. If SHAREDP is true, the substrings may share structure with TARGET-STRING. BODY may start with declarations." - (rebinding (target-string) + (with-rebinding (target-string) (with-unique-names (substr-fn match-start match-end reg-starts reg-ends start-index) `(let ((,substr-fn (if ,sharedp @@ -441,27 +488,24 @@ share structure with TARGET-STRING. BODY may start with declarations." (do-scans (,match-start ,match-end ,reg-starts ,reg-ends ,regex ,target-string ,result-form :start ,start :end ,end) - (let ,(loop for var in var-list + (let ,(loop for (function var) in (normalize-var-list var-list) for counter from 0 - collect `(,var (let ((,start-index - (aref ,reg-starts ,counter))) - (if ,start-index - (funcall ,substr-fn - ,target-string - ,start-index - (aref ,reg-ends ,counter)) - nil)))) + when var + collect `(,var (let ((,start-index + (aref ,reg-starts ,counter))) + (if ,start-index + (funcall ,function + (funcall ,substr-fn + ,target-string + ,start-index + (aref ,reg-ends ,counter))) + nil)))) ,@body)))))) (defun all-matches (regex target-string &key (start 0) (end (length target-string))) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns a list containing the start and end positions of all matches of REGEX against TARGET-STRING, i.e. if there are N matches the list contains (* 2 N) elements. If REGEX matches an empty string @@ -474,16 +518,21 @@ the scan is continued one position behind this match." (push match-start result-list) (push match-end result-list)))) +#-:cormanlisp +(define-compiler-macro all-matches (&whole form &environment env regex &rest rest) + "Make sure that constant forms are compiled into scanners at +compile time." + (cond ((constantp regex env) + `(all-matches (load-time-value + (create-scanner ,regex)) + ,@rest)) + (t form))) + (defun all-matches-as-strings (regex target-string &key (start 0) (end (length target-string)) sharedp) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns a list containing all substrings of TARGET-STRING which match REGEX. If REGEX matches an empty string the scan is continued one position behind this match. If SHAREDP is true, the substrings may @@ -493,6 +542,17 @@ share structure with TARGET-STRING." :start start :end end :sharedp sharedp) (push match result-list)))) +#-:cormanlisp +(define-compiler-macro all-matches-as-strings (&whole form &environment env regex &rest rest) + "Make sure that constant forms are compiled into scanners at +compile time." + (cond ((constantp regex env) + `(all-matches-as-strings + (load-time-value + (create-scanner ,regex)) + ,@rest)) + (t form))) + (defun split (regex target-string &key (start 0) (end (length target-string)) @@ -500,12 +560,7 @@ share structure with TARGET-STRING." with-registers-p omit-unmatched-p sharedp) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Matches REGEX against TARGET-STRING as often as possible and returns a list of the substrings between the matches. If WITH-REGISTERS-P is true, substrings corresponding to matched @@ -569,21 +624,17 @@ TARGET-STRING." target-string this-start this-end) nil))))) -(define-compiler-macro split (&whole form regex target-string &rest rest) +#-:cormanlisp +(define-compiler-macro split (&whole form &environment env regex target-string &rest rest) "Make sure that constant forms are compiled into scanners at compile time." - (cond ((constantp regex) + (cond ((constantp regex env) `(split (load-time-value (create-scanner ,regex)) ,target-string ,@rest)) (t form))) (defun string-case-modifier (str from to start end) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (type fixnum from to start end)) "Checks whether all words in STR between FROM and TO are upcased, downcased or capitalized and returns a function which applies a @@ -648,19 +699,18 @@ that (<= START FROM TO END)." ;; first create a scanner to identify the special parts of the ;; replacement string (eat your own dog food...) + +(defgeneric build-replacement-template (replacement-string) + (declare #.*standard-optimize-settings*) + (:documentation "Converts a replacement string for REGEX-REPLACE or +REGEX-REPLACE-ALL into a replacement template which is an +S-expression.")) + #-:cormanlisp (let* ((*use-bmh-matchers* nil) - (reg-scanner (create-scanner "\\\\(?:\\\\|{\\d+}|\\d+|&|`|')"))) + (reg-scanner (create-scanner "\\\\(?:\\\\|\\{\\d+\\}|\\d+|&|`|')"))) (defmethod build-replacement-template ((replacement-string string)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Converts a replacement string for REGEX-REPLACE or -REGEX-REPLACE-ALL into a replacement template which is an -S-expression." + (declare #.*standard-optimize-settings*) (let ((from 0) ;; COLLECTOR will hold the (reversed) template (collector '())) @@ -714,14 +764,9 @@ S-expression." ;;; Corman Lisp's methods can't be closures... :( #+:cormanlisp (let* ((*use-bmh-matchers* nil) - (reg-scanner (create-scanner "\\\\(?:\\\\|{\\d+}|\\d+|&|`|')"))) + (reg-scanner (create-scanner "\\\\(?:\\\\|\\{\\d+\\}|\\d+|&|`|')"))) (defun build-replacement-template (replacement) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (typecase replacement (string (let ((from 0) @@ -770,13 +815,9 @@ S-expression." target-string start end match-start match-end - reg-starts reg-ends) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + reg-starts reg-ends + simple-calls) + (declare #.*standard-optimize-settings*) "Accepts a replacement template and the current values from the matching process in REGEX-REPLACE or REGEX-REPLACE-ALL and returns the corresponding template." @@ -806,12 +847,22 @@ corresponding template." :start (svref reg-starts token) :end (svref reg-ends token)))) (function - (write-string (funcall token - target-string - start end - match-start match-end - reg-starts reg-ends) - s)) + (write-string + (cond (simple-calls + (apply token + (nsubseq target-string match-start match-end) + (map 'list + (lambda (reg-start reg-end) + (and reg-start + (nsubseq target-string reg-start reg-end))) + reg-starts reg-ends))) + (t + (funcall token + target-string + start end + match-start match-end + reg-starts reg-ends))) + s)) (symbol (case token ((:backslash) @@ -833,20 +884,26 @@ corresponding template." :start match-end :end end)) (otherwise - (write-string (funcall token - target-string - start end - match-start match-end - reg-starts reg-ends) - s))))))))) + (write-string + (cond (simple-calls + (apply token + (nsubseq target-string match-start match-end) + (map 'list + (lambda (reg-start reg-end) + (and reg-start + (nsubseq target-string reg-start reg-end))) + reg-starts reg-ends))) + (t + (funcall token + target-string + start end + match-start match-end + reg-starts reg-ends))) + s))))))))) -(defun replace-aux (target-string replacement pos-list reg-list start end preserve-case) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) +(defun replace-aux (target-string replacement pos-list reg-list + start end preserve-case simple-calls) + (declare #.*standard-optimize-settings*) "Auxiliary function used by REGEX-REPLACE and REGEX-REPLACE-ALL. POS-LIST contains a list with the start and end positions of all matches while REG-LIST contains a list of arrays @@ -867,7 +924,8 @@ representing the corresponding register start and end positions." target-string start end from to - reg-starts reg-ends) + reg-starts reg-ends + simple-calls) nil) while to if replace @@ -887,13 +945,9 @@ representing the corresponding register start and end positions." (defun regex-replace (regex target-string replacement &key (start 0) (end (length target-string)) - preserve-case) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + preserve-case + simple-calls) + (declare #.*standard-optimize-settings*) "Try to match TARGET-STRING between START and END against REGEX and replace the first match with REPLACEMENT. @@ -926,19 +980,25 @@ match." (replace-aux target-string replacement (list match-start match-end) (list reg-starts reg-ends) - start end preserve-case) + start end preserve-case simple-calls) (subseq target-string start end)))) +#-:cormanlisp +(define-compiler-macro regex-replace + (&whole form &environment env regex target-string replacement &rest rest) + "Make sure that constant forms are compiled into scanners at compile time." + (cond ((constantp regex env) + `(regex-replace (load-time-value + (create-scanner ,regex)) + ,target-string ,replacement ,@rest)) + (t form))) + (defun regex-replace-all (regex target-string replacement &key (start 0) (end (length target-string)) - preserve-case) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + preserve-case + simple-calls) + (declare #.*standard-optimize-settings*) "Try to match TARGET-STRING between START and END against REGEX and replace all matches with REPLACEMENT. @@ -978,9 +1038,19 @@ match." (replace-aux target-string replacement (nreverse pos-list) (nreverse reg-list) - start end preserve-case) + start end preserve-case simple-calls) (subseq target-string start end)))) +#-:cormanlisp +(define-compiler-macro regex-replace-all + (&whole form &environment env regex target-string replacement &rest rest) + "Make sure that constant forms are compiled into scanners at compile time." + (cond ((constantp regex env) + `(regex-replace-all (load-time-value + (create-scanner ,regex)) + ,target-string ,replacement ,@rest)) + (t form))) + #-:cormanlisp (defmacro regex-apropos-aux ((regex packages case-insensitive &optional return-form) &body body) @@ -989,7 +1059,7 @@ through PACKAGES and executes BODY with SYMBOL bound to each symbol which matches REGEX. Optionally evaluates and returns RETURN-FORM at the end. If CASE-INSENSITIVE is true and REGEX isn't already a scanner, a case-insensitive scanner is used." - (rebinding (regex) + (with-rebinding (regex) (with-unique-names (scanner %packages next morep) `(let* ((,scanner (create-scanner ,regex :case-insensitive-mode @@ -997,14 +1067,14 @@ scanner, a case-insensitive scanner is used." (not (functionp ,regex))))) (,%packages (or ,packages (list-all-packages)))) - (with-package-iterator (,next ,%packages :external :internal) - (loop - (multiple-value-bind (,morep symbol) - (,next) - (unless ,morep - (return ,return-form)) - (when (scan ,scanner (symbol-name symbol)) - ,@body)))))))) + (with-package-iterator (,next ,%packages :external :internal :inherited) + (loop + (multiple-value-bind (,morep symbol) + (,next) + (unless ,morep + (return ,return-form)) + (when (scan ,scanner (symbol-name symbol)) + ,@body)))))))) ;;; The following two functions were provided by Karsten Poeck @@ -1026,7 +1096,7 @@ through PACKAGES and executes BODY with SYMBOL bound to each symbol which matches REGEX. Optionally evaluates and returns RETURN-FORM at the end. If CASE-INSENSITIVE is true and REGEX isn't already a scanner, a case-insensitive scanner is used." - (rebinding (regex) + (with-rebinding (regex) (with-unique-names (scanner %packages) `(let* ((,scanner (create-scanner ,regex :case-insensitive-mode @@ -1040,12 +1110,7 @@ scanner, a case-insensitive scanner is used." ,return-form)))) (defun regex-apropos-list (regex &optional packages &key (case-insensitive t)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Similar to the standard function APROPOS-LIST but returns a list of all symbols which match the regular expression REGEX. If CASE-INSENSITIVE is true and REGEX isn't already a scanner, a @@ -1057,12 +1122,7 @@ case-insensitive scanner is used." (defun print-symbol-info (symbol) "Auxiliary function used by REGEX-APROPOS. Tries to print some meaningful information about a symbol." - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (handler-case (let ((output-list '())) (cond ((special-operator-p symbol) @@ -1107,12 +1167,7 @@ meaningful information about a symbol." symbols which match the regular expression REGEX. If CASE-INSENSITIVE is true and REGEX isn't already a scanner, a case-insensitive scanner is used." - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (regex-apropos-aux (regex packages case-insensitive) (print-symbol-info symbol)) (values)) @@ -1169,3 +1224,18 @@ end-of-line comments, i.e. those starting with #\\# and ending with comment-scanner) string #'remove-tokens)))) + +(defun parse-tree-synonym (symbol) + "Returns the parse tree the SYMBOL symbol is a synonym for. Returns +NIL is SYMBOL wasn't yet defined to be a synonym." + (get symbol 'parse-tree-synonym)) + +(defun (setf parse-tree-synonym) (new-parse-tree symbol) + "Defines SYMBOL to be a synonm for the parse tree NEW-PARSE-TREE." + (setf (get symbol 'parse-tree-synonym) new-parse-tree)) + +(defmacro define-parse-tree-synonym (name parse-tree) + "Defines the symbol NAME to be a synonym for the parse tree +PARSE-TREE. Both arguments are quoted." + `(eval-when (:compile-toplevel :load-toplevel :execute) + (setf (parse-tree-synonym ',name) ',parse-tree))) diff --git a/cl-ppcre-test.asd b/cl-ppcre-test.asd index eb178a1..271eac4 100644 --- a/cl-ppcre-test.asd +++ b/cl-ppcre-test.asd @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/cl-ppcre-test.asd,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.asd,v 1.8 2005/11/01 09:51:01 edi Exp $ ;;; This ASDF system definition was kindly provided by Marco Baringer. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -36,5 +36,6 @@ (in-package #:cl-ppcre-test.system) (defsystem #:cl-ppcre-test - :depends-on (#:cl-ppcre) - :components ((:file "ppcre-tests"))) + :version "1.2.12" + :depends-on (#:cl-ppcre) + :components ((:file "ppcre-tests"))) diff --git a/cl-ppcre-test.system b/cl-ppcre-test.system index c387d0d..8e6cea4 100644 --- a/cl-ppcre-test.system +++ b/cl-ppcre-test.system @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/cl-ppcre-test.system,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre-test.system,v 1.9 2005/04/01 21:29:09 edi Exp $ -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions diff --git a/cl-ppcre.asd b/cl-ppcre.asd index bca2edf..60a8f45 100644 --- a/cl-ppcre.asd +++ b/cl-ppcre.asd @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/cl-ppcre.asd,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.asd,v 1.12 2005/11/01 09:51:01 edi Exp $ ;;; This ASDF system definition was kindly provided by Marco Baringer. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -36,16 +36,26 @@ (in-package #:cl-ppcre.system) (defsystem #:cl-ppcre - :components ((:file "packages") - (:file "specials" :depends-on ("packages")) - (:file "util" :depends-on ("packages")) - (:file "errors" :depends-on ("util")) - (:file "lexer" :depends-on ("errors" "specials")) - (:file "parser" :depends-on ("lexer")) - (:file "regex-class" :depends-on ("parser")) - (:file "convert" :depends-on ("regex-class")) - (:file "optimize" :depends-on ("convert")) - (:file "closures" :depends-on ("optimize" "specials")) - (:file "repetition-closures" :depends-on ("closures")) - (:file "scanner" :depends-on ("repetition-closures")) - (:file "api" :depends-on ("scanner")))) + :version "1.2.12" + :serial t + :components ((:file "packages") + (:file "specials") + (:file "util") + (:file "errors") + #-:use-acl-regexp2-engine + (:file "lexer") + #-:use-acl-regexp2-engine + (:file "parser") + #-:use-acl-regexp2-engine + (:file "regex-class") + #-:use-acl-regexp2-engine + (:file "convert") + #-:use-acl-regexp2-engine + (:file "optimize") + #-:use-acl-regexp2-engine + (:file "closures") + #-:use-acl-regexp2-engine + (:file "repetition-closures") + #-:use-acl-regexp2-engine + (:file "scanner") + (:file "api"))) diff --git a/cl-ppcre.system b/cl-ppcre.system index 34e483c..3aed698 100644 --- a/cl-ppcre.system +++ b/cl-ppcre.system @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/cl-ppcre.system,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/cl-ppcre.system,v 1.11 2005/04/01 21:29:09 edi Exp $ -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -40,12 +40,20 @@ (:file "specials" :depends-on ("packages")) (:file "util" :depends-on ("packages")) (:file "errors" :depends-on ("util")) + #-:use-acl-regexp2-engine (:file "lexer" :depends-on ("errors" "specials")) + #-:use-acl-regexp2-engine (:file "parser" :depends-on ("lexer")) + #-:use-acl-regexp2-engine (:file "regex-class" :depends-on ("parser")) + #-:use-acl-regexp2-engine (:file "convert" :depends-on ("regex-class")) + #-:use-acl-regexp2-engine (:file "optimize" :depends-on ("convert")) + #-:use-acl-regexp2-engine (:file "closures" :depends-on ("optimize" "specials")) + #-:use-acl-regexp2-engine (:file "repetition-closures" :depends-on ("closures")) + #-:use-acl-regexp2-engine (:file "scanner" :depends-on ("repetition-closures")) (:file "api" :depends-on ("scanner")))) diff --git a/closures.lisp b/closures.lisp index 85435f8..3ed8354 100644 --- a/closures.lisp +++ b/closures.lisp @@ -1,10 +1,10 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/closures.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.29 2005/05/16 16:29:23 edi Exp $ ;;; Here we create the closures which together build the final ;;; scanner. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -38,12 +38,7 @@ "Like STRING=, i.e. compares the special string *STRING* from START1 to END1 with STRING2 from START2 to END2. Note that there's no boundary check - this has to be implemented by the caller." - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (type fixnum start1 end1 start2 end2)) (loop for string1-idx of-type fixnum from start1 below end1 for string2-idx of-type fixnum from start2 below end2 @@ -54,12 +49,7 @@ boundary check - this has to be implemented by the caller." "Like STRING-EQUAL, i.e. compares the special string *STRING* from START1 to END1 with STRING2 from START2 to END2. Note that there's no boundary check - this has to be implemented by the caller." - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (type fixnum start1 end1 start2 end2)) (loop for string1-idx of-type fixnum from start1 below end1 for string2-idx of-type fixnum from start2 below end2 @@ -67,12 +57,7 @@ boundary check - this has to be implemented by the caller." (schar string2 string2-idx)))) (defgeneric create-matcher-aux (regex next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Creates a closure which takes one parameter, START-POS, and tests whether REGEX can match *STRING* at START-POS such that the call to NEXT-FN after the match would succeed.")) @@ -399,14 +384,10 @@ against CHR-EXPR." (defun word-boundary-p (start-pos) "Check whether START-POS is a word-boundary within *STRING*." - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (type fixnum start-pos)) - (let ((1-start-pos (1- start-pos))) + (let ((1-start-pos (1- start-pos)) + (*start-pos* (or *real-start-pos* *start-pos*))) ;; either the character before START-POS is a word-constituent and ;; the character at START-POS isn't... (or (and (or (= start-pos *end-pos*) @@ -571,6 +552,13 @@ against CHR-EXPR." (and next-pos (funcall next-fn next-pos)))))) +(defmethod create-matcher-aux ((filter filter) next-fn) + (let ((fn (fn filter))) + (lambda (start-pos) + (let ((next-pos (funcall fn start-pos))) + (and next-pos + (funcall next-fn next-pos)))))) + (defmethod create-matcher-aux ((void void) next-fn) ;; optimize away VOIDs: don't create a closure, just return NEXT-FN next-fn) diff --git a/convert.lisp b/convert.lisp index c0bbff7..c512877 100644 --- a/convert.lisp +++ b/convert.lisp @@ -1,11 +1,11 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/convert.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.22 2005/04/01 21:29:09 edi Exp $ ;;; Here the parse tree is converted into its internal representation ;;; using REGEX objects. At the same time some optimizations are ;;; already applied. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -50,12 +50,7 @@ `(third ,flags)) (defun set-flag (token) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (special flags)) "Reads a flag token and sets or unsets the corresponding entry in the special FLAGS list." @@ -76,12 +71,7 @@ the special FLAGS list." (signal-ppcre-syntax-error "Unknown flag token ~A" token)))) (defun add-range-to-hash (hash from to) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (special flags)) "Adds all characters from character FROM to character TO (inclusive) to the char class hash HASH. Does the right thing with respect to @@ -102,12 +92,7 @@ case-(in)sensitivity as specified by the special variable FLAGS." hash)) (defun convert-char-class-to-hash (list) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Combines all items in LIST into one char class hash and returns it. Items can be single characters, character ranges like \(:RANGE #\\A #\\E), or special character classes like :DIGIT-CLASS. Does the right @@ -115,7 +100,7 @@ thing with respect to case-\(in)sensitivity as specified by the special variable FLAGS." (loop with hash = (make-hash-table :size (ceiling (expt *regex-char-code-limit* (/ 1 4))) :rehash-size (float (expt *regex-char-code-limit* (/ 1 4))) - :rehash-threshold 1.0) + :rehash-threshold #-genera 1.0 #+genera 0.99) for item in list if (characterp item) ;; treat a single character C like a range (:RANGE C C) @@ -157,12 +142,7 @@ special variable FLAGS." min-len length reg-seen) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (type fixnum minimum) (type (or fixnum null) maximum)) "Splits a REPETITION object into a constant and a varying part if @@ -230,13 +210,8 @@ the same name." ;; case if the regex starts with ".*" which implicitely anchors the ;; regex at the start (perhaps modulo #\Newline). -(defmethod maybe-accumulate ((str str)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) +(defun maybe-accumulate (str) + (declare #.*standard-optimize-settings*) (declare (special accumulate-start-p starts-with)) (declare (ftype (function (t) fixnum) len)) "Accumulate STR into the special variable STARTS-WITH if @@ -291,12 +266,7 @@ NIL or a STR object of the same case mode. Always returns NIL." nil) (defun convert-aux (parse-tree) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (special flags reg-num accumulate-start-p starts-with max-back-ref)) "Converts the parse tree PARSE-TREE into a REGEX object and returns it. @@ -538,8 +508,17 @@ Will also (make-instance 'register :regex (convert-aux (second parse-tree)) :num stored-reg-num))) + ;; (:FILTER &optional ) + ((:filter) + ;; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (make-instance 'filter + :fn (second parse-tree) + :len (third parse-tree))) ;; (:STANDALONE ) ((:standalone) + ;; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) ;; keep the effect of modifiers local to the enclosed ;; regex (let ((flags (copy-list flags))) @@ -739,16 +718,15 @@ Will also (set-flag parse-tree) (make-instance 'void)) (otherwise - (signal-ppcre-syntax-error "Unknown token ~A in parse-tree" - parse-tree)))))) + (let ((translation (and (symbolp parse-tree) + (parse-tree-synonym parse-tree)))) + (if translation + (convert-aux (copy-tree translation)) + (signal-ppcre-syntax-error "Unknown token ~A in parse-tree" + parse-tree)))))))) (defun convert (parse-tree) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Converts the parse tree PARSE-TREE into an equivalent REGEX object and returns three values: the REGEX object, the number of registers seen and an object the regex starts with which is either a STR object diff --git a/doc/index.html b/doc/index.html index 1cd98f9..9f05d1e 100644 --- a/doc/index.html +++ b/doc/index.html @@ -6,14 +6,12 @@ CL-PPCRE - portable Perl-compatible regular expressions for Common Lisp @@ -47,7 +45,7 @@ to CLISP's own regex implementation which is also written in C.
  • It is portable, i.e. the code aims to be strictly ANSI-compliant. If +href="http://www.lispworks.com/documentation/HyperSpec/Front/index.htm">ANSI-compliant. If you encounter any deviations this is an error and should be reported to the mailing list. CL-PPCRE has been @@ -55,16 +53,18 @@ successfully tested with the following Common Lisp implementations: @@ -116,14 +116,26 @@ license so you can basically do with it whatever you want. +CL-PPCRE has been used successfully in various applications like BioLingua, LoGS, CafeSpot, Eboy, or The Regex Coach. + +

    +Download shortcut: http://weitz.de/files/cl-ppcre.tar.gz. +
     

    Contents

      -
    1. How to use CL-PPCRE +
    2. Download and installation +
    3. Support and mailing lists +
    4. The CL-PPCRE dictionary
        -
      1. create-scanner (for Perl regex strings) +
      2. create-scanner (for Perl regex strings)
      3. create-scanner (for parse trees) +
      4. parse-tree-synonym +
      5. define-parse-tree-synonym
      6. scan
      7. scan-to-strings
      8. register-groups-bind @@ -148,8 +160,7 @@ license so you can basically do with it whatever you want.
      9. ppcre-syntax-error-string
      10. ppcre-syntax-error-pos
      -
    5. Download and installation -
    6. Support and mailing lists +
    7. Filters
    8. Testing CL-PPCRE
    9. Compatibility with Perl
        @@ -173,19 +184,84 @@ license so you can basically do with it whatever you want.
      1. Backslashes may confuse you...
    10. Remarks +
    11. AllegroCL compatibility mode
    12. Acknowledgements
    -
     

    How to use CL-PPCRE

    +
     

    Download and installation

    + +CL-PPCRE together with this documentation can be downloaded from http://weitz.de/files/cl-ppcre.tar.gz. The +current version is 1.2.12. A CHANGELOG is available. +

    +If you're on Debian you should +probably use the cl-ppcre +Debian package which is available thanks to Peter van Eynde and Kevin +Rosenberg. There's also a port +for Gentoo Linux thanks to Matthew Kennedy and a FreeBSD port thanks to Henrik Motakef. +Installation via asdf-install should as well +be possible. +

    +CL-PPCRE comes with simple system definitions for MK:DEFSYSTEM and asdf so you can either adapt it +to your needs or just unpack the archive and from within the CL-PPCRE +directory start your Lisp image and evaluate the form +(mk:compile-system "cl-ppcre") (or the +equivalent one for asdf) which should compile and load the whole +system. +

    +If for some reason you don't want to use MK:DEFSYSTEM or asdf you +can just LOAD the file load.lisp or you +can also get away with something like this: + +

    +(loop for name in '("packages" "specials" "util" "errors" "lexer"
    +                    "parser" "regex-class" "convert" "optimize"
    +                    "closures" "repetition-closures" "scanner" "api")
    +      do (compile-file (make-pathname :name name
    +                                      :type "lisp"))
    +         (load name))
    +
    + +Note that on CL implementations which use the Python compiler +(i.e. CMUCL, SBCL, SCL) you can concatenate the compiled object files +to create one single object file which you can load afterwards: + +
    +cat {packages,specials,util,errors,lexer,parser,regex-class,convert,optimize,closures,repetition-closures,scanner,api}.x86f > cl-ppcre.x86f
    +
    + +(Replace ".x86f" with the correct suffix for +your platform.) +

    +Note that there is no public CVS repository for CL-PPCRE - the repository at common-lisp.net is out of date and not in sync with the (current) version distributed from weitz.de. + + +
     

    Support and mailing lists

    + +For questions, bug reports, feature requests, improvements, or patches +please use the cl-ppcre-devel +mailing list. If you want to be notified about future releases +subscribe to the cl-ppcre-announce +mailing list. These mailing lists were made available thanks to +the services of common-lisp.net. + +
     

    The CL-PPCRE dictionary

    CL-PPCRE exports the following symbols: -


    [Function] -
    create-scanner string &key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner +


    [Method] +
    create-scanner (string string)&key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner


    Accepts a string which is a regular expression in Perl syntax and returns a closure which will scan strings for this -regular expression. The mode keyboard arguments are equivalent to the +regular expression. The mode keyword arguments are equivalent to the "imsx" modifiers in Perl. The destructive keyword will be ignored.

    @@ -236,12 +312,17 @@ The keyword arguments are just for your convenience. You can always use embedded modifiers like "(?i-s)" instead.

    +


    [Method] +
    create-scanner (function function)&key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner +


    +In this case function should be a scanner returned by another invocation of CREATE-SCANNER. It will be returned as is. +
    -


    [Function] -
    create-scanner parse-tree &key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner +


    [Method] +
    create-scanner (parse-tree t)&key case-insensitive-mode multi-line-mode single-line-mode extended-mode destructive => scanner


    This is similar to CREATE-SCANNER above but +href="#create-scanner">CREATE-SCANNER for regex strings above but accepts a parse tree as its first argument. A parse tree is an S-expression conforming to the following syntax: @@ -290,6 +371,11 @@ and :NOT-SINGLE-LINE-MODE-P are equivalent to Perl's kept local to the innermost enclosing grouping or clustering construct. +
  • All other symbols will signal an error of type PPCRE-SYNTAX-ERROR +unless they are defined to be parse tree synonyms. +
  • (:FLAGS {<modifier>}*) where <modifier> is one of the modifier symbols from above is used to group modifier symbols. The modifiers are applied @@ -357,6 +443,14 @@ beginning with 1. <number> is a positive integer is a back-reference to a register group. +
  • (:FILTER <function> &optional +<length>) where +<function> is a function +designator and <length> is a +non-negative integer or NIL is a user-defined filter. +
  • (:CHAR-CLASS|:INVERTED-CHAR-CLASS {<item>}*) where <item> is either a character, a character range, or a symbol for a @@ -379,10 +473,10 @@ Perl regex strings when given to CREATE-SCANNER. To circumvent this you can always use the equivalent parse tree (:GROUP <string>) instead.

    -Note that currently CREATE-SCANNER doesn't always check +Note that CREATE-SCANNER doesn't always check for the well-formedness of its first argument, i.e. you are expected -to provide correct parse trees. This will most likely change in -future releases. +to provide correct parse trees. +

    The usage of the keyword argument extended-mode obviously doesn't make sense if CREATE-SCANNER is applied to parse @@ -418,6 +512,72 @@ regex strings to parse trees. Here are some examples: (:SEQUENCE (:POSITIVE-LOOKAHEAD #\a) #\b) +


    [Accessor] +
    parse-tree-synonym symbol => parse-tree +
    (setf (parse-tree-synonym symbol) new-parse-tree)
    + +


    +Any symbol (unless it's a keyword with a special meaning in parse +trees) can be made a "synonym", i.e. an abbreviation, for another parse +tree by this accessor. PARSE-TREE-SYNONYM returns NIL if symbol isn't a synonym yet. +

    +Here's an example: + +

    * (cl-ppcre::parse-string "a*b+")
    +(:SEQUENCE (:GREEDY-REPETITION 0 NIL #\a) (:GREEDY-REPETITION 1 NIL #\b))
    +
    +* (defun my-repetition (char min)
    +    `(:greedy-repetition ,min nil ,char))
    +MY-REPETITION
    +
    +* (setf (parse-tree-synonym 'a*) (my-repetition #\a 0))
    +(:GREEDY-REPETITION 0 NIL #\a)
    +
    +* (setf (parse-tree-synonym 'b+) (my-repetition #\b 1))
    +(:GREEDY-REPETITION 1 NIL #\b)
    +
    +* (let ((scanner (create-scanner '(:sequence a* b+))))
    +    (dolist (string '("ab" "b" "aab" "a" "x"))
    +      (print (scan scanner string)))
    +    (values))
    +0
    +0
    +0
    +NIL
    +NIL
    +
    +* (parse-tree-synonym 'a*)
    +(:GREEDY-REPETITION 0 NIL #\a)
    +
    +* (parse-tree-synonym 'a+)
    +NIL
    +
    + +


    [Macro] +
    define-parse-tree-synonym name parse-tree => parse-tree + +


    +This is a convenience macro for parse tree synonyms defined as + +
    (defmacro define-parse-tree-synonym (name parse-tree)
    +  `(eval-when (:compile-toplevel :load-toplevel :execute)
    +     (setf (parse-tree-synonym ',name) ',parse-tree)))
    +
    + +so you can write code like this: + +
    +(define-parse-tree-synonym a-z
    +  (:char-class (:range #\a #\z) (:range #\a #\z)))
    +
    +(define-parse-tree-synonym a-z*
    +  (:greedy-repetition 0 nil a-z))
    +
    +(defun ascii-char-tester (string)
    +  (scan '(:sequence :start-anchor a-z* :end-anchor)
    +        string))
    +
    +


    For the rest of this section regex can always be a string (which is interpreted as a Perl regular @@ -430,7 +590,7 @@ href="#scan">SCAN. -


    [Function] +


    [Standard Generic Function]
    scan regex target-string &key start end => match-start, match-end, reg-starts, reg-ends


    @@ -525,7 +685,15 @@ Examples: Evaluates statement* with the variables in var-list bound to the corresponding register groups after target-string has been matched against regex, i.e. each variable is either -bound to a string or to NIL. If there is no match, the statement* forms are not +bound to a string or to NIL. +As a shortcut, the elements of var-list can also be lists of the form (FN VAR) where VAR is the variable symbol +and FN is a function +designator (which is evaluated) denoting a function which is to be applied to the string before the result is bound to VAR. +To make this even more convenient the form (FN VAR1 ...VARn) can be used as an abbreviation for +(FN VAR1) ... (FN VARn). +

    +If there is no match, the statement* forms are not executed. For each element of var-list which is NIL there's no binding to the corresponding register group. The number of variables in var-list must not be greater than @@ -537,15 +705,22 @@ share structure with target-string. ("((a)|(b)|(c))+" "abababc" :sharedp t) (list first second third fourth)) ("c" "a" "b" "c") + * (register-groups-bind (nil second third fourth) ;; note that we don't bind the first and fifth register group ("((a)|(b)|(c))()+" "abababc" :start 6) (list second third fourth)) (NIL NIL "c") + * (register-groups-bind (first) ("(a|b)+" "accc" :start 1) (format t "This will not be printed: ~A" first)) NIL + +* (register-groups-bind (fname lname (#'parse-integer date month year)) + ("(\\w+)\\s+(\\w+)\\s+(\\d{1,2})\\.(\\d{1,2})\\.(\\d{4})" "Frank Zappa 21.12.1940") + (list fname lname (encode-universal-time 0 0 0 date month year))) +("Frank" "Zappa" 1292882400)

    @@ -639,7 +814,7 @@ CROSSFOOT 6 -Of course, in real life you would do this with DO-MATCHES and use the start and end keyword parameters of PARSE-INTEGER. +Of course, in real life you would do this with DO-MATCHES and use the start and end keyword parameters of PARSE-INTEGER.


    [Macro]
    do-register-groups var-list (regex target-string &optional result-form &key start end sharedp) declaration* statement* => result* @@ -648,7 +823,7 @@ Of course, in real life you would do this with DO-MA Iterates over target-string and tries to match regex as often as possible evaluating statement* with the variables in var-list bound to the corresponding register groups for each match in turn, i.e. each -variable is either bound to a string or to NIL. The number of +variable is either bound to a string or to NIL. You can use the same shortcuts and abbreviations as in REGISTER-GROUPS-BIND. The number of variables in var-list must not be greater than the number of register groups. For each element of var-list which is NIL there's no binding to the corresponding register @@ -669,6 +844,14 @@ match. If sharedp is true, the substrings may share structur ("b" NIL "b" NIL) ("c" NIL NIL "c") NIL + +* (let (result) + (do-register-groups ((#'parse-integer n) (#'intern sign) whitespace) + ("(\\d+)|(\\+|-|\\*|/)|(\\s+)" "12*15 - 42/3") + (unless whitespace + (push (or n sign) result))) + (nreverse result)) +(12 * 15 - 42 / 3) @@ -787,7 +970,7 @@ frob")


    [Function] -
    regex-replace regex target-string replacement &key start end preserve-case => list +
    regex-replace regex target-string replacement &key start end preserve-case simple-calls => list


    Try to match target-string between start and end against @@ -804,7 +987,7 @@ match, "\`" for the part of Nth register where N is a positive integer.

    replacement can also be a function +href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#function_designator">function designator in which case the match will be replaced with the result of calling the function designated by replacement with the arguments @@ -816,6 +999,15 @@ result of calling the function designated by positions of matched registers (or NIL) - the meaning of the other arguments should be obvious.)

    +If simple-calls is true, a function designated by +replacement will instead be called with the +arguments match, register-1, +..., register-n where match is +the whole match as a string and register-1 to +register-n are the matched registers, also as +strings (or NIL). Note that these strings share structure with +target-string so you must not modify them. +

    Finally, replacement can be a list where each element is a string (which will be inserted verbatim), one of the symbols :match, :before-match, or @@ -829,7 +1021,7 @@ If preserve-case is true (default is NIL), the replacement will try to preserve the case (all upper case, all lower case, or capitalized) of the match. The result will always be a fresh +href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#fresh">fresh string, even if regex doesn't match.

    Examples: @@ -860,7 +1052,7 @@ Examples:


    [Function] -
    regex-replace-all regex target-string replacement &key start end preserve-case => list +
    regex-replace-all regex target-string replacement &key start end preserve-case simple-calls => list


    Like REGEX-REPLACE but replaces all matches. @@ -912,6 +1104,34 @@ HOW-MANY "foo{...}bar{.....}{..}baz{....}frob" (list "[" 'how-many " dots]")) "foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob" + +* (let ((qp-regex (cl-ppcre:create-scanner "[\\x80-\\xff]"))) + (defun encode-quoted-printable (string) + "Convert 8-bit string to quoted-printable representation. +Version using SIMPLE-CALLS keyword argument." + ;; ;; won't work for Corman Lisp because non-ASCII characters aren't 8-bit there + (flet ((convert (match) + (format nil "=~2,'0x" (char-code (char match 0))))) + (cl-ppcre:regex-replace-all qp-regex string #'convert + :simple-calls t)))) + +Converted ENCODE-QUOTED-PRINTABLE. +ENCODE-QUOTED-PRINTABLE + +* (encode-quoted-printable "Fête Sørensen naïve Hühner Straße") +"F=EAte S=F8rensen na=EFve H=FChner Stra=DFe" + +* (defun how-many (match first-register) + (declare (ignore match)) + (format nil "~A" (length first-register))) +HOW-MANY + +* (cl-ppcre:regex-replace-all "{(.+?)}" + "foo{...}bar{.....}{..}baz{....}frob" + (list "[" 'how-many " dots]") + :simple-calls t) + +"foo[3 dots]bar[5 dots][2 dots]baz[4 dots]frob"


    [Function] @@ -919,7 +1139,7 @@ HOW-MANY


    Like APROPOS +href="http://www.lispworks.com/documentation/HyperSpec/Body/f_apropo.htm">APROPOS but searches for interned symbols which match the regular expression regex. The output is implementation-dependent. If case-insensitive is true (which is the default) @@ -983,7 +1203,7 @@ FOOBOO [variable] value: 43

    Like APROPOS-LIST +href="http://www.lispworks.com/documentation/HyperSpec/Body/f_apropo.htm">APROPOS-LIST but searches for interned symbols which match the regular expression regex. If case-insensitive is true (which is the default) and regex isn't @@ -1001,18 +1221,18 @@ Example (continued from above):

    This variable controls whether scanners take into account all characters of your CL implementation or only those the CHAR-CODE +href="http://www.lispworks.com/documentation/HyperSpec/Body/f_char_c.htm#char-code">CHAR-CODE of which is not larger than its value. It is only relevant if the regular expression contains certain character classes. The default is CHAR-CODE-LIMIT, +href="http://www.lispworks.com/documentation/HyperSpec/Body/v_char_c.htm">CHAR-CODE-LIMIT, and you might see significant speed and space improvements during scanner creation if, say, your target strings only contain ISO-8859-1 characters and you're using an implementation like AllegroCL, -LispWorks, or CLISP where CHAR-CODE-LIMIT has a value -much higher than 255. The test suite will -automatically set *REGEX-CHAR-CODE-LIMIT* to 255 while +CLISP, LispWorks, or SBCL where CHAR-CODE-LIMIT has a value +much higher than 256. The test suite will +automatically set *REGEX-CHAR-CODE-LIMIT* to 256 while you're running the default test.

    Here's an example with LispWorks: @@ -1028,8 +1248,8 @@ Allocation = 546600 bytes standard / 2162611 bytes fixlen 0 Page faults #<closure 20654AF2> -CL-USER 24 > (time (let ((cl-ppcre:*regex-char-code-limit* 255)) (cl-ppcre:create-scanner "[3\\D]"))) -Timing the evaluation of (LET ((CL-PPCRE:*REGEX-CHAR-CODE-LIMIT* 255)) (CL-PPCRE:CREATE-SCANNER "[3\\D]")) +CL-USER 24 > (time (let ((cl-ppcre:*regex-char-code-limit* 256)) (cl-ppcre:create-scanner "[3\\D]"))) +Timing the evaluation of (LET ((CL-PPCRE:*REGEX-CHAR-CODE-LIMIT* 256)) (CL-PPCRE:CREATE-SCANNER "[3\\D]")) user time = 0.000 system time = 0.000 @@ -1042,7 +1262,7 @@ Allocation = 3336 bytes standard / 8338 bytes fixlen Note: Due to the nature of LOAD-TIME-VALUE and the compiler macro for SCAN some scanners might be created in a null +href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_n.htm#null_lexical_environment">null lexical environment at load time or at compile time so be careful to which value *REGEX-CHAR-CODE-LIMIT* is bound at that time. The default value should always yield correct results unless you @@ -1052,14 +1272,14 @@ play dirty tricks with implementation-dependent behaviour, though.


    *use-bmh-matchers*

    Usually, the scanners created by CREATE-SCANNER (or +href="#create-scanner">CREATE-SCANNER (or implicitely by other functions and macros) will use fast Boyer-Moore-Horspool matchers to check for constant strings at the start or end of the regular expression. If *USE-BMH-MATCHERS* is NIL (the default is T), the standard function SEARCH +href="http://www.lispworks.com/documentation/HyperSpec/Body/f_search.htm">SEARCH will be used instead. This will usually be a bit slower but can save lots of space if you're storing many scanners. The test suite will automatically set @@ -1069,7 +1289,7 @@ the default test. Note: Due to the nature of LOAD-TIME-VALUE and the compiler macro for SCAN some scanners might be created in a null +href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_n.htm#null_lexical_environment">null lexical environment at load time or at compile time so be careful to which value *USE-BMH-MATCHERS* is bound at that time.
    @@ -1134,7 +1354,7 @@ href="#*allow-quoting*">*ALLOW-QUOTING* is non-word characters (everything except ASCII characters, digits and underline) of STRING are quoted by prepending a backslash similar to Perl's quotemeta function. It always returns a fresh +href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#fresh">fresh string.
     * (cl-ppcre:quote-meta-chars "[a-z]*")
    @@ -1147,7 +1367,7 @@ string.
     

    Every error signaled by CL-PPCRE is of type PPCRE-ERROR. This is a direct subtype of SIMPLE-ERROR +href="http://www.lispworks.com/documentation/HyperSpec/Body/e_smp_er.htm">SIMPLE-ERROR without any additional slots or options.
    @@ -1210,7 +1430,7 @@ encountered (or NIL if the error happened while trying to convert a parse tree). This might be particularly useful when *ALLOW-QUOTING* is true because in this case the offending string might not be the one you gave to the CREATE-SCANNER function. +href="#create-scanner">CREATE-SCANNER function.


    [Function] @@ -1225,69 +1445,185 @@ convert a parse tree).

    -
     

    Download and installation

    +
     

    Filters

    -CL-PPCRE together with this documentation can be downloaded from http://weitz.de/files/cl-ppcre.tgz. The -current version is 0.7.4 - older versions are -available for download through URLs like -http://weitz.de/files/cl-ppcre-<version>.tgz. A CHANGELOG is available. +Because several users have asked for it, CL-PPCRE now offers +"filters" (see above for syntax) +which are basically arbitrary, user-defined functions that can act as +regex building blocks. Filters can only be used within parse trees, not within Perl regex +strings.

    -If you're on Debian you should -probably use the cl-ppcre -Debian package which is available thanks to Kevin -Rosenberg. There's also a port -for Gentoo Linux thanks to Matthew Kennedy and a FreeBSD port thanks to Henrik Motakef. -Installation via asdf-install should as well -be possible. +Note that filters are currently considered an experimental feature and +their API might change in the future.

    -CL-PPCRE comes with simple system definitions for MK:DEFSYSTEM and asdf so you can either adapt it -to your needs or just unpack the archive and from within the CL-PPCRE -directory start your Lisp image and evaluate the form -(mk:compile-system "cl-ppcre") (or the -equivalent one for asdf) which should compile and load the whole -system. +A filter is defined by its filter function which must be a +function of one argument. During the parsing process this function +might be called once or several times or it might not be called at +all. If it's called its argument is an integer pos +which is the current position within the target string. The filter can +either return NIL (which means that the subexpression +represented by this filter didn't match) or an integer not smaller +than pos for success. A zero-length assertion +should return pos itself while a filter which +wants to consume N characters should return +(+ POS N).

    -If for some reason you don't want to use MK:DEFSYSTEM or asdf you -can just LOAD the file load.lisp or you -can also get away with something like this: +If you supply the optional value length and it is +not NIL then this is a promise to the regex engine that +your filter will always consume exactly +length characters. The regex engine might use this +information for optimization purposes but it is otherwise irrelevant +to the outcome of the matching process. +

    +The filter function can access the following special variables from +its code body: +

      +
    • CL-PPCRE::*STRING*: The target (a string) of the +current matching process. + +
    • CL-PPCRE::*START-POS* and +CL-PPCRE::*END-POS*: The start and end (integers) indices +of the current matching process. These correspond to the +START and END keyword parameters of SCAN. + +
    • CL-PPCRE::*REAL-START-POS*: The initial starting +position. This is only relevant for repeated scans (as in DO-SCANS) where +CL-PPCRE::*START-POS* will be moved forward while +CL-PPCRE::*REAL-START-POS* won't. For normal scans the +value of this variable is NIL. + +
    • CL-PPCRE::*REG-STARTS* and +CL-PPCRE::*REG-ENDS*: Two simple vectors which denote the +start and end indices of registers within the regular expression. The +first register is indexed by 0. If a register hasn't matched yet +then its corresponding entry in CL-PPCRE::*REG-STARTS* is +NIL. + +
    + +These variables should be considered read-only. Do not change +these values unless you really know what you're doing! +

    +Note that the names of the variables are not exported from the +CL-PPCRE package because there's currently no guarantee +that they will be available in future releases. +

    +Here are some filter examples:

    -(loop for name in '("packages" "specials" "util" "errors" "lexer"
    -                    "parser" "regex-class" "convert" "optimize"
    -                    "closures" "repetition-closures" "scanner" "api")
    -      do (compile-file (make-pathname :name name
    -                                      :type "lisp"))
    -         (load name))
    +* (defun my-info-filter (pos)
    +    "Show some info about the matching process."
    +    (format t "Called at position ~A~%" pos)
    +    (loop with dim = (array-dimension cl-ppcre::*reg-starts* 0)
    +          for i below dim
    +          for reg-start = (aref cl-ppcre::*reg-starts* i)
    +          for reg-end = (aref cl-ppcre::*reg-ends* i)
    +          do (format t "Register ~A is currently " (1+ i))
    +          when reg-start
    +               (write-string cl-ppcre::*string* nil
    +            do (write-char #\')
    +               (write-string cl-ppcre::*string* nil
    +                     :start reg-start :end reg-end)
    +               (write-char #\')
    +          else
    +            do (write-string "unbound")
    +          do (terpri))
    +    (terpri)
    +    pos)
    +MY-INFO-FILTER
    +
    +* (scan '(:sequence
    +           (:register
    +             (:greedy-repetition 0 nil
    +                                 (:char-class (:range #\a #\z))))
    +           (:filter my-info-filter 0) "X")
    +        "bYcdeX")
    +Called at position 1
    +Register 1 is currently 'b'
    +
    +Called at position 0
    +Register 1 is currently ''
    +
    +Called at position 1
    +Register 1 is currently ''
    +
    +Called at position 5
    +Register 1 is currently 'cde'
    +
    +2
    +6
    +#(2)
    +#(5)
    +
    +* (scan '(:sequence
    +           (:register
    +             (:greedy-repetition 0 nil
    +                                 (:char-class (:range #\a #\z))))
    +           (:filter my-info-filter 0) "X")
    +        "bYcdeZ")
    +NIL
    +
    +* (defun my-weird-filter (pos)
    +    "Only match at this point if either pos is odd and the character
    +  we're looking at is lowerrcase or if pos is even and the next two
    +  characters we're looking at are uppercase. Consume these characters if
    +  there's a match."
    +    (format t "Trying at position ~A~%" pos)
    +    (cond ((and (oddp pos)
    +                (< pos cl-ppcre::*end-pos*)
    +                (lower-case-p (char cl-ppcre::*string* pos)))
    +           (1+ pos))
    +          ((and (evenp pos)
    +                (< (1+ pos) cl-ppcre::*end-pos*)
    +                (upper-case-p (char cl-ppcre::*string* pos))
    +                (upper-case-p (char cl-ppcre::*string* (1+ pos))))
    +           (+ pos 2))
    +          (t nil)))
    +MY-WEIRD-FILTER
    +
    +* (defparameter *weird-regex*
    +                `(:sequence "+" (:filter ,#'my-weird-filter) "+"))
    +*WEIRD-REGEX*
    +
    +* (scan *weird-regex* "+A++a+AA+")
    +Trying at position 1
    +Trying at position 3
    +Trying at position 4
    +Trying at position 6
    +5
    +9
    +#()
    +#()
    +
    +* (fmakunbound 'my-weird-filter)
    +MY-WEIRD-FILTER
    +
    +* (scan *weird-regex* "+A++a+AA+")
    +Trying at position 1
    +Trying at position 3
    +Trying at position 4
    +Trying at position 6
    +5
    +9
    +#()
    +#()
     
    -Note that on CL implementations which use the Python compiler -(i.e. CMUCL, SBCL, SCL) you can concatenate the compiled object files -to create one single object file which you can load afterwards: +Note that in the second call to SCAN our filter wasn't +invoked at all - it was optimized away by the regex engine because it +knew that it couldn't match. Also note that *WEIRD-REGEX* +still worked after we removed the global function definition of +MY-WEIRD-FILTER because the regular expression had +captured the original definition. -
    -cat {packages,specials,util,errors,lexer,parser,regex-class,convert,optimize,closures,repetition-closures,scanner,api}.x86f > cl-ppcre.x86f
    -
    +

    -(Replace ".x86f" with the correct suffix for -your platform.) - - -
     

    Support and mailing lists

    - -For questions, bug reports, feature requests, improvements, or patches -please use the cl-ppcre-devel -mailing list. If you want to be notified about future releases -subscribe to the cl-ppcre-announce -mailing list. These mailing lists were made available thanks to -the services of common-lisp.net. +For more ideas about what you can do with filters see this +thread on the mailing list.
     

    Testing CL-PPCRE

    @@ -1317,7 +1653,7 @@ NIL * (cl-ppcre-test:test) ;; .... -;; (a list of incompatibilities with Perl) +;; (a list of incompatibilities with Perl) (If you're not using MK:DEFSYSTEM or asdf it suffices to build @@ -1398,7 +1734,7 @@ translates "\r" to (CODE-CHAR

    What about "\w"?

    CL-PPCRE uses ALPHANUMERICP +href="http://www.lispworks.com/documentation/HyperSpec/Body/f_alphan.htm">ALPHANUMERICP to decide whether a character matches Perl's "\w", so depending on your CL implementation you might encounter differences between Perl and CL-PPCRE when @@ -1410,7 +1746,7 @@ matching non-ASCII characters. The CL-PPCRE test suite can also be used for benchmarking purposes: If you call perltest.pl with a -command line argument it will be interpreted as the number of seconds +command line argument it will be interpreted as the minimum number of seconds each test should run. Perl will time its tests accordingly and create output which, when fed to CL-PPCRE-TEST:TEST, will result in a benchmark. Here's an example: @@ -1554,13 +1890,13 @@ for you automatically.

    However, beginning with version 0.5.2, CL-PPCRE uses a compiler +href="http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_c.htm#compiler_macro">compiler macro and LOAD-TIME-VALUE +href="http://www.lispworks.com/documentation/HyperSpec/Body/s_ld_tim.htm">LOAD-TIME-VALUE to make sure that the scanner is only built once if the first argument -to SCAN, SCAN-TO-STRINGS, SPLIT, or -REGEX-REPLACE is a constant +to SCAN, SCAN-TO-STRINGS, SPLIT, +REGEX-REPLACE, or REGEX-REPLACE-ALL is a constant form. (But see the notes for *REGEX-CHAR-CODE-LIMIT* and *USE-BMH-MATCHERS*.) @@ -1674,7 +2010,7 @@ target strings.

    Another thing to consider is that, for performance reasons, CL-PPCRE assumes that most of the target strings you're trying to match are simple +href="http://www.lispworks.com/documentation/HyperSpec/Body/t_smp_st.htm">simple strings and coerces non-simple strings to simple strings before scanning them. If you plan on working with non-simple strings mostly you might consider modifying the CL-PPCRE source code. This is easy: @@ -1746,6 +2082,8 @@ TARGET With CMUCL the situation is better and worse at the same time. It will take a lot longer until CMUCL gives up but if it gives up the whole Lisp image will silently die (at least on my machine): +

    +[Note: This was true for CMUCL 18e - CMUCL 19a behaves in a much nicer way and gives you a chance to recover.]

     * (defun target (n) (concatenate 'string (make-string n :initial-element #\a) "b"))
    @@ -1900,6 +2238,50 @@ IBM Thinkpad T23 laptop (Pentium III 1.2 GHz,
     768 MB RAM) running Gentoo
     Linux 1.1a.
     
    +
     

    AllegroCL compatibility mode

    + +Since autumn 2004 AllegroCL offers +a +new regular expression API with a syntax very similar to +CL-PPCRE. Although CL-PPCRE is quite fast already, AllegroCL's engine will +most likely be even faster (but only on AllegroCL, of course). However, you might want to +stick to CL-PPCRE because you have a "legacy" application or because +you want your code to be portable to other Lisp implementations. +Therefore, beginning from version 1.2.0, CL-PPCRE offers a +"compatibility mode" where you can continue using the CL-PPCRE API as +described above but deploy the AllegroCL regex +engine under the hood. (The details are: Calls to CREATE-SCANNER and SCAN are dispatched to their AllegroCL +counterparts EXCL:COMPILE-RE +and EXCL:MATCH-RE +while everything else is left as is.) +

    +The advantage of this mode is that you'll get a much smaller image and +most likely faster code. (But note that CL-PPCRE needs to do a small amount of work to massage AllegroCL's output into the format expected by CL-PPCRE.) The downside is that your code won't be +fully compatible with CL-PPCRE anymore. Here are some of the +differences (most of which probably don't matter very often): +

    +For more details about the AllegroCL engine and possible deviations from CL-PPCRE see the documentation at the Franz Inc. website. +

    +To use the AllegroCL compatibility mode you have to +

    +(push :use-acl-regexp2-engine *features*)
    +
    +before you compile CL-PPCRE. +
     

    Acknowledgements

    Although I didn't use their code I was heavily inspired by looking at @@ -1927,7 +2309,7 @@ where I wrote most of the code and thanks to my wife for lending me her PowerBook to test CL-PPCRE with MCL and OpenMCL.

    -$Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/doc/index.html,v 1.1 2004/06/23 08:27:10 hans Exp $ +$Header: /usr/local/cvsrep/cl-ppcre/doc/index.html,v 1.131 2005/11/01 09:51:02 edi Exp $

    BACK TO MY HOMEPAGE diff --git a/errors.lisp b/errors.lisp index 67b6b35..398a5fa 100644 --- a/errors.lisp +++ b/errors.lisp @@ -1,7 +1,7 @@ -;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-LISP; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/errors.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- +;;; $Header: /usr/local/cvsrep/cl-ppcre/errors.lisp,v 1.14 2005/04/01 21:29:09 edi Exp $ -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -50,7 +50,19 @@ this type.")) (simple-condition-format-control condition) (simple-condition-format-arguments condition) (ppcre-syntax-error-pos condition) - (ppcre-syntax-error-string condition))))) + (ppcre-syntax-error-string condition)))) + (:documentation "Signaled if CL-PPCRE's parser encounters an error +when trying to parse a regex string or to convert a parse tree into +its internal representation.")) + +(setf (documentation 'ppcre-syntax-error-string 'function) + "Returns the string the parser was parsing when the error was +encountered \(or NIL if the error happened while trying to convert a +parse tree).") + +(setf (documentation 'ppcre-syntax-error-pos 'function) + "Returns the position within the string where the error occured +\(or NIL if the error happened while trying to convert a parse tree") (define-condition ppcre-invocation-error (ppcre-error) () diff --git a/lexer.lisp b/lexer.lisp index c29e46f..30268b5 100644 --- a/lexer.lisp +++ b/lexer.lisp @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/lexer.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp $ ;;; The lexer's responsibility is to convert the regex string into a ;;; sequence of tokens which are in turn consumed by the parser. @@ -9,7 +9,7 @@ ;;; has opened so far. (The latter is necessary for interpreting ;;; strings like "\\10" correctly.) -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -39,12 +39,7 @@ (declaim (inline map-char-to-special-class)) (defun map-char-to-special-char-class (chr) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Maps escaped characters like \"\\d\" to the tokens which represent their associated character classes." (case chr @@ -62,12 +57,7 @@ their associated character classes." :non-whitespace-char-class))) (locally - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (defstruct (lexer (:constructor make-lexer-internal)) "LEXER structures are used to hold the regex string which is currently lexed and to keep track of the lexer's state." @@ -86,30 +76,20 @@ currently lexed and to keep track of the lexer's state." (defun make-lexer (string) (declare (inline make-lexer-internal) - (type string string)) + #-genera (type string string)) (make-lexer-internal :str (maybe-coerce-to-simple-string string) :len (length string))) (declaim (inline end-of-string-p)) (defun end-of-string-p (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Tests whether we're at the end of the regex string." (<= (lexer-len lexer) (lexer-pos lexer))) (declaim (inline looking-at-p)) (defun looking-at-p (lexer chr) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Tests whether the next character the lexer would see is CHR. Does not respect extended mode." (and (not (end-of-string-p lexer)) @@ -118,12 +98,7 @@ Does not respect extended mode." (declaim (inline next-char-non-extended)) (defun next-char-non-extended (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns the next character which is to be examined and updates the POS slot. Does not respect extended mode." (cond ((end-of-string-p lexer) @@ -134,12 +109,7 @@ POS slot. Does not respect extended mode." (incf (lexer-pos lexer)))))) (defun next-char (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns the next character which is to be examined and updates the POS slot. Respects extended mode, i.e. whitespace, comments, and also nested comments are skipped if applicable." @@ -203,12 +173,7 @@ nested comments are skipped if applicable." (declaim (inline fail)) (defun fail (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Moves (LEXER-POS LEXER) back to the last position stored in \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack." (unless (lexer-last-pos lexer) @@ -217,12 +182,7 @@ nested comments are skipped if applicable." nil) (defun get-number (lexer &key (radix 10) max-length no-whitespace-p) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Read and consume the number the lexer is currently looking at and return it. Returns NIL if no number could be identified. RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read @@ -252,12 +212,7 @@ we don't tolerate whitespace in front of the number." (declaim (inline try-number)) (defun try-number (lexer &key (radix 10) max-length no-whitespace-p) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Like GET-NUMBER but won't consume anything if no number is seen." ;; remember current position (push (lexer-pos lexer) (lexer-last-pos lexer)) @@ -269,16 +224,11 @@ we don't tolerate whitespace in front of the number." (declaim (inline make-char-from-code)) (defun make-char-from-code (number error-pos) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Create character from char-code NUMBER. NUMBER can be NIL which is interpreted as 0. ERROR-POS is the position where the corresponding number started within the regex string." - ;; Only look at rightmost eight bits in compliance with Perl + ;; only look at rightmost eight bits in compliance with Perl (let ((code (logand #o377 (the fixnum (or number 0))))) (or (and (< code char-code-limit) (code-char code)) @@ -288,12 +238,7 @@ the corresponding number started within the regex string." number)))) (defun unescape-char (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Convert the characters(s) following a backslash into a token which is returned. This function is to be called when the backslash has already been consumed. Special character classes like \\W are @@ -351,12 +296,7 @@ handled elsewhere." chr)))) (defun collect-char-class (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Reads and consumes characters from regex string until a right bracket is seen. Assembles them into a list \(which is returned) of characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and @@ -437,12 +377,7 @@ we're inside a range or not." "Missing right bracket to close character class")))) (defun maybe-parse-flags (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Reads a sequence of modifiers \(including #\\- to reverse their meaning) and returns a corresponding list of \"flag\" tokens. The \"x\" modifier is treated specially in that it dynamically modifies @@ -478,12 +413,7 @@ the behaviour of the lexer itself via the special variable (decf (lexer-pos lexer)))) (defun get-quantifier (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns a list of two values (min max) if what the lexer is looking at can be interpreted as a quantifier. Otherwise returns NIL and resets the lexer to its old position." @@ -533,12 +463,7 @@ resets the lexer to its old position." (fail lexer))))) (defun get-token (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns and consumes the next token from the regex string (or NIL)." ;; remember starting position for UNGET-TOKEN function (push (lexer-pos lexer) @@ -737,12 +662,7 @@ resets the lexer to its old position." (declaim (inline unget-token)) (defun unget-token (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Moves the lexer back to the last position stored in the LAST-POS stack." (if (lexer-last-pos lexer) (setf (lexer-pos lexer) @@ -751,12 +671,7 @@ resets the lexer to its old position." (declaim (inline start-of-subexpr-p)) (defun start-of-subexpr-p (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Tests whether the next token can start a valid sub-expression, i.e. a stand-alone regex." (let* ((pos (lexer-pos lexer)) @@ -766,4 +681,4 @@ a stand-alone regex." (member (the character next-char) '(#\) #\|) :test #'char=) - (setf (lexer-pos lexer) pos)))))) \ No newline at end of file + (setf (lexer-pos lexer) pos)))))) diff --git a/lispworks-defsystem.lisp b/lispworks-defsystem.lisp new file mode 100644 index 0000000..0d84f56 --- /dev/null +++ b/lispworks-defsystem.lisp @@ -0,0 +1,57 @@ +;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- +;;; $Header: /usr/local/cvsrep/cl-ppcre/lispworks-defsystem.lisp,v 1.1 2005/04/30 20:00:50 edi Exp $ + +;;; This system definition for LispWorks was kindly provided by Wade Humeniuk + +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. + +;;; Redistribution and use in source and binary forms, with or without +;;; modification, are permitted provided that the following conditions +;;; are met: + +;;; * Redistributions of source code must retain the above copyright +;;; notice, this list of conditions and the following disclaimer. + +;;; * Redistributions in binary form must reproduce the above +;;; copyright notice, this list of conditions and the following +;;; disclaimer in the documentation and/or other materials +;;; provided with the distribution. + +;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED +;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +(in-package #:cl-user) + +(defparameter *cl-ppcre-base-directory* + (make-pathname :name nil :type nil :version nil + :defaults (parse-namestring *load-truename*))) + +(defsystem cl-ppcre + (:default-pathname *cl-ppcre-base-directory* + :default-type :lisp-file) + :members ("packages" + "specials" + "util" + "errors" + "lexer" + "parser" + "regex-class" + "convert" + "optimize" + "closures" + "repetition-closures" + "scanner" + "api") + :rules ((:in-order-to :compile :all + (:requires (:load :previous))) + (:in-order-to :load :all + (:requires (:load :previous))))) \ No newline at end of file diff --git a/load.lisp b/load.lisp index 5c39b0b..cfa7d0a 100755 --- a/load.lisp +++ b/load.lisp @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/load.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/load.lisp,v 1.13 2005/04/01 21:29:09 edi Exp $ -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -27,36 +27,41 @@ ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -(in-package #:cl-user) +(in-package :cl-user) -(defparameter *cl-ppcre-base-directory* - (make-pathname :name nil :type nil :version nil - :defaults (parse-namestring *load-truename*))) - -(loop for file in '("packages" +(let ((cl-ppcre-base-directory + (make-pathname :name nil :type nil :version nil + :defaults (parse-namestring *load-truename*))) + must-compile) + (with-compilation-unit () + (dolist (file '("packages" "specials" "util" "errors" - "lexer" - "parser" - "regex-class" - "convert" - "optimize" - "closures" - "repetition-closures" - "scanner" + #-:use-acl-regexp2-engine "lexer" + #-:use-acl-regexp2-engine "parser" + #-:use-acl-regexp2-engine "regex-class" + #-:use-acl-regexp2-engine "convert" + #-:use-acl-regexp2-engine "optimize" + #-:use-acl-regexp2-engine "closures" + #-:use-acl-regexp2-engine "repetition-closures" + #-:use-acl-regexp2-engine "scanner" "api" - "ppcre-tests") - do (let ((pathname (make-pathname :name file :type "lisp" :version nil - :defaults *cl-ppcre-base-directory*))) - #-:cormanlisp - (let ((compiled-pathname (compile-file-pathname pathname))) - (unless (probe-file compiled-pathname) - (compile-file pathname)) - (setq pathname compiled-pathname)) - (load pathname))) - - + "ppcre-tests")) + (let ((pathname (make-pathname :name file :type "lisp" :version nil + :defaults cl-ppcre-base-directory))) + ;; don't use COMPILE-FILE in Corman Lisp, it's broken - LOAD + ;; will yield compiled functions anyway + #-:cormanlisp + (let ((compiled-pathname (compile-file-pathname pathname))) + (unless (and (not must-compile) + (probe-file compiled-pathname) + (< (file-write-date pathname) + (file-write-date compiled-pathname))) + (setq must-compile t) + (compile-file pathname)) + (setq pathname compiled-pathname)) + (load pathname))))) diff --git a/optimize.lisp b/optimize.lisp index 09ce672..e004d13 100644 --- a/optimize.lisp +++ b/optimize.lisp @@ -1,10 +1,10 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/optimize.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.26 2005/04/13 15:35:57 edi Exp $ ;;; This file contains optimizations which can be applied to converted ;;; parse trees. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -32,37 +32,8 @@ (in-package #:cl-ppcre) -(defun string-list-to-simple-string (string-list) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Concatenates a list of strings to one simple-string." - ;; this function provided by JP Massar; note that we can't use APPLY - ;; with CONCATENATE here because of CALL-ARGUMENTS-LIMIT - (let ((total-size 0)) - (declare (type fixnum total-size)) - (dolist (string string-list) - (declare (type string string)) - (incf total-size (length string))) - (let ((result-string (make-sequence 'simple-string total-size)) - (curr-pos 0)) - (declare (type fixnum curr-pos)) - (dolist (string string-list) - (declare (type string string)) - (replace result-string string :start1 curr-pos) - (incf curr-pos (length string))) - result-string))) - (defgeneric flatten (regex) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Merges adjacent sequences and alternations, i.e. it transforms # # #>> to # # #>. This is a destructive @@ -148,17 +119,12 @@ operation on REGEX.")) regex) (t ;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING, - ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, and WORD-BOUNDARY) do - ;; nothing + ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY) + ;; do nothing regex))) (defgeneric gather-strings (regex) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Collects adjacent strings or characters into one string provided they have the same case mode. This is a destructive operation on REGEX.")) @@ -310,19 +276,14 @@ operation on REGEX.")) regex) (t ;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING, - ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, and WORD-BOUNDARY) do - ;; nothing + ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY) + ;; do nothing regex))) ;; Note that START-ANCHORED-P will be called after FLATTEN and GATHER-STRINGS. (defgeneric start-anchored-p (regex &optional in-seq-p) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Returns T if REGEX starts with a \"real\" start anchor, i.e. one that's not in multi-line mode, NIL otherwise. If IN-SEQ-P is true the function will return :ZERO-LENGTH if REGEX is a @@ -378,6 +339,12 @@ zero-length assertion.")) (if in-seq-p :zero-length nil)) + (filter + (if (and in-seq-p + (len regex) + (zerop (len regex))) + :zero-length + nil)) (t ;; BACK-REFERENCE, CHAR-CLASS, EVERYTHING, and STR nil))) @@ -385,12 +352,7 @@ zero-length assertion.")) ;; Note that END-STRING-AUX will be called after FLATTEN and GATHER-STRINGS. (defgeneric end-string-aux (regex &optional old-case-insensitive-p) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Returns the constant string (if it exists) REGEX ends with wrapped into a STR object, otherwise NIL. OLD-CASE-INSENSITIVE-P is the CASE-INSENSITIVE-P slot of the last STR @@ -509,19 +471,17 @@ function called by END-STRIN.)")) :case-insensitive-p :void)) (t ;; (ALTERNATION, BACK-REFERENCE, BRANCH, CHAR-CLASS, EVERYTHING, - ;; REPETITION) + ;; REPETITION, FILTER) nil))) +(defgeneric end-string (regex) + (declare #.*standard-optimize-settings*) + (:documentation "Returns the constant string (if it exists) REGEX ends with wrapped +into a STR object, otherwise NIL.")) + (defmethod end-string ((regex regex)) (declare (special end-string-offset)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Returns the constant string (if it exists) REGEX ends with wrapped -into a STR object, otherwise NIL." + (declare #.*standard-optimize-settings*) ;; LAST-STR points to the last STR object (seen from the end) that's ;; part of END-STRING; CONTINUEP is set to T if we stop collecting ;; in the middle of a SEQ @@ -539,12 +499,7 @@ into a STR object, otherwise NIL." end-string-offset (offset last-str)))))) (defgeneric compute-min-rest (regex current-min-rest) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Returns the minimal length of REGEX plus CURRENT-MIN-REST. This is similar to REGEX-MIN-LENGTH except that it recurses down into REGEX and sets the MIN-REST slots of REPETITION @@ -567,6 +522,9 @@ objects.")) (defmethod compute-min-rest ((str str) current-min-rest) (+ current-min-rest (len str))) +(defmethod compute-min-rest ((filter filter) current-min-rest) + (+ current-min-rest (or (len filter) 0))) + (defmethod compute-min-rest ((repetition repetition) current-min-rest) (setf (min-rest repetition) current-min-rest) (compute-min-rest (regex repetition) current-min-rest) @@ -594,4 +552,4 @@ objects.")) (t ;; zero min-len and no embedded regexes (ANCHOR, ;; BACK-REFERENCE, VOID, and WORD-BOUNDARY) - current-min-rest))) \ No newline at end of file + current-min-rest))) diff --git a/packages.lisp b/packages.lisp index e5fafe1..6c046fd 100644 --- a/packages.lisp +++ b/packages.lisp @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/packages.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/packages.lisp,v 1.19 2005/04/01 21:29:10 edi Exp $ -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -27,13 +27,16 @@ ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -(in-package #:cl-user) +(in-package :cl-user) #-:cormanlisp (defpackage #:cl-ppcre (:nicknames #:ppcre) - (:use #:cl) + #+genera (:shadowing-import-from #:common-lisp #:lambda #:simple-string #:string) + (:use #-genera #:cl #+genera #:future-common-lisp) (:export #:create-scanner + #:parse-tree-synonym + #:define-parse-tree-synonym #:scan #:scan-to-strings #:do-scans @@ -56,13 +59,17 @@ #:ppcre-syntax-error-string #:ppcre-syntax-error-pos #:register-groups-bind - #:do-register-groups)) + #:do-register-groups + #:*standard-optimize-settings* + #:*special-optimize-settings*)) #+:cormanlisp (defpackage "CL-PPCRE" (:nicknames "PPCRE") (:use "CL") (:export "CREATE-SCANNER" + "PARSE-TREE-SYNONYM" + "DEFINE-PARSE-TREE-SYNONYM" "SCAN" "SCAN-TO-STRINGS" "DO-SCANS" @@ -85,4 +92,17 @@ "PPCRE-SYNTAX-ERROR-STRING" "PPCRE-SYNTAX-ERROR-POS" "REGISTER-GROUPS-BIND" - "DO-REGISTER-GROUPS")) + "DO-REGISTER-GROUPS" + "*STANDARD-OPTIMIZE-SETTINGS*" + "*SPECIAL-OPTIMIZE-SETTINGS*")) + +#-:cormanlisp +(defpackage #:cl-ppcre-test + #+genera (:shadowing-import-from #:common-lisp #:lambda) + (:use #-genera #:cl #+genera #:future-common-lisp #:cl-ppcre) + (:export #:test)) + +#+:cormanlisp +(defpackage "CL-PPCRE-TEST" + (:use "CL" "CL-PPCRE") + (:export "TEST")) diff --git a/parser.lisp b/parser.lisp index 9fed198..62c1d79 100644 --- a/parser.lisp +++ b/parser.lisp @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/parser.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp $ ;;; The parser will - with the help of the lexer - parse a regex ;;; string and convert it into a "parse tree" (see docs for details @@ -7,7 +7,7 @@ ;;; illegal parse trees. It is assumed that the conversion process ;;; later on will track them down. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -36,16 +36,11 @@ (in-package #:cl-ppcre) (defun group (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Parses and consumes a . The productions are: -> \"(\"\")\" \"(?:\"\")\" - \"(?<\"\")\" + \"(?>\"\")\" \"(?:\"\")\" \"(?=\"\")\" \"(?!\"\")\" @@ -154,12 +149,7 @@ Will return or ( ) where open-token)))) (defun greedy-quant (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Parses and consumes a . The productions are: -> | where is parsed by the lexer function GET-QUANTIFIER. @@ -173,12 +163,7 @@ Will return or (:GREEDY-REPETITION )." group))) (defun quant (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Parses and consumes a . The productions are: -> | \"?\". Will return the returned by GREEDY-QUANT and optionally @@ -193,12 +178,7 @@ change :GREEDY-REPETITION to :NON-GREEDY-REPETITION." greedy-quant)) (defun seq (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Parses and consumes a . The productions are: -> | . Will return or (:SEQUENCE )." @@ -263,12 +243,7 @@ Will return or (:SEQUENCE )." :void))) (defun reg-expr (lexer) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Parses and consumes a , a complete regular expression. The productions are: -> | \"|\". Will return or (:ALTERNATION )." @@ -313,12 +288,7 @@ Will return or (:ALTERNATION )." seq))))))) (defun reverse-strings (parse-tree) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (cond ((stringp parse-tree) (nreverse parse-tree)) ((consp parse-tree) @@ -330,12 +300,7 @@ Will return or (:ALTERNATION )." (t parse-tree))) (defun parse-string (string) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Translate the regex string STRING into a parse tree." (let* ((lexer (make-lexer string)) (parse-tree (reverse-strings (reg-expr lexer)))) @@ -344,4 +309,4 @@ Will return or (:ALTERNATION )." parse-tree (signal-ppcre-syntax-error* (lexer-pos lexer) - "Expected end of string")))) \ No newline at end of file + "Expected end of string")))) diff --git a/ppcre-tests.lisp b/ppcre-tests.lisp index 774153b..560beae 100644 --- a/ppcre-tests.lisp +++ b/ppcre-tests.lisp @@ -1,7 +1,7 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE-TEST; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/ppcre-tests.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/ppcre-tests.lisp,v 1.31 2005/08/23 12:23:13 edi Exp $ -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -27,18 +27,6 @@ ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -(in-package #:cl-user) - -#-:cormanlisp -(defpackage #:cl-ppcre-test - (:use #:cl #:cl-ppcre) - (:export #:test)) - -#+:cormanlisp -(defpackage "CL-PPCRE-TEST" - (:use "CL" "CL-PPCRE") - (:export "TEST")) - (in-package #:cl-ppcre-test) (defparameter *cl-ppcre-test-base-directory* @@ -64,12 +52,7 @@ multi-line-mode single-line-mode extended-mode) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Auxiliary function used by TEST to benchmark a regex scanner against Perl timings." (declare (type string string)) @@ -90,12 +73,7 @@ against Perl timings." lispworks (and sbcl sb-thread)) (defun threaded-scan (scanner target-string &key (threads 10) (repetitions 5000)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Auxiliary function used by TEST to check whether SCANNER is thread-safe." (full-gc) (let ((collector (make-array threads)) @@ -155,32 +133,26 @@ against Perl timings." :defaults *cl-ppcre-test-base-directory*) file-name-provided-p) threaded) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (ignorable threaded)) "Loop through all test cases in FILE-NAME and print report. Only in LispWorks and SCL: If THREADED is true, also test whether the scanners work multi-threaded." (with-open-file (stream file-name - #+(or :allegro :clisp :scl) + #+(or :allegro :clisp :scl :sbcl) :external-format - #+(or :allegro :clisp :scl) + #+(or :allegro :clisp :scl :sbcl) (if file-name-provided-p :default - #+:allegro :iso-8859-1 - #+:clisp charset:iso-8859-1 - #+:scl :iso-8859-1)) + #+(or :allegro :scl :sbcl) :iso-8859-1 + #+:clisp charset:iso-8859-1)) (loop with testcount of-type fixnum = 0 with *regex-char-code-limit* = (if file-name-provided-p *regex-char-code-limit* ;; the standard test suite - ;; doesn't need full - ;; Unicode support - 255) + ;; doesn't need Unicode + ;; support + 256) with *allow-quoting* = (if file-name-provided-p *allow-quoting* t) diff --git a/regex-class.lisp b/regex-class.lisp index 83e163d..62c0f5b 100644 --- a/regex-class.lisp +++ b/regex-class.lisp @@ -1,11 +1,11 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/regex-class.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.26 2005/06/10 10:23:42 edi Exp $ ;;; This file defines the REGEX class and some utility methods for ;;; this class. REGEX objects are used to represent the (transformed) ;;; parse trees internally -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -33,221 +33,243 @@ (in-package #:cl-ppcre) -(locally - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - (defclass regex () - () - (:documentation "The REGEX base class. All other classes inherit from this one.")) +;; Genera need the eval-when, here, or the types created by the class +;; definitions aren't seen by the typep calls later in the file. +(eval-when (:compile-toplevel :load-toplevel :execute) + (locally + (declare #.*standard-optimize-settings*) + (defclass regex () + () + (:documentation "The REGEX base class. All other classes inherit +from this one.")) - (defclass seq (regex) - ((elements :initarg :elements - :accessor elements - :type cons - :documentation "A list of REGEX objects.")) - (:documentation "SEQ objects represents sequences of + (defclass seq (regex) + ((elements :initarg :elements + :accessor elements + :type cons + :documentation "A list of REGEX objects.")) + (:documentation "SEQ objects represents sequences of regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)")) - (defclass alternation (regex) - ((choices :initarg :choices - :accessor choices - :type cons - :documentation "A list of REGEX objects")) - (:documentation "ALTERNATION objects represent alternations of + (defclass alternation (regex) + ((choices :initarg :choices + :accessor choices + :type cons + :documentation "A list of REGEX objects")) + (:documentation "ALTERNATION objects represent alternations of regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)")) - (defclass lookahead (regex) - ((regex :initarg :regex - :accessor regex - :documentation "The REGEX object we're checking.") - (positivep :initarg :positivep - :reader positivep - :documentation "Whether this assertion is positive.")) - (:documentation "LOOKAHEAD objects represent look-ahead assertions.")) + (defclass lookahead (regex) + ((regex :initarg :regex + :accessor regex + :documentation "The REGEX object we're checking.") + (positivep :initarg :positivep + :reader positivep + :documentation "Whether this assertion is positive.")) + (:documentation "LOOKAHEAD objects represent look-ahead assertions.")) - (defclass lookbehind (regex) - ((regex :initarg :regex - :accessor regex - :documentation "The REGEX object we're checking.") - (positivep :initarg :positivep - :reader positivep - :documentation "Whether this assertion is positive.") - (len :initarg :len - :accessor len - :type fixnum - :documentation "The (fixed) length of the enclosed regex.")) - (:documentation "LOOKBEHIND objects represent look-behind assertions.")) - - (defclass repetition (regex) - ((regex :initarg :regex - :accessor regex - :documentation "The REGEX that's repeated.") - (greedyp :initarg :greedyp - :reader greedyp - :documentation "Whether the repetition is greedy.") - (minimum :initarg :minimum - :accessor minimum - :type fixnum - :documentation "The minimal number of repetitions.") - (maximum :initarg :maximum - :accessor maximum - :documentation "The maximal number of repetitions. -Can be NIL for unbounded.") - (min-len :initarg :min-len - :reader min-len - :documentation "The minimal length of the enclosed regex.") - (len :initarg :len - :reader len - :documentation "The length of the enclosed regex. NIL if unknown.") - (min-rest :initform 0 - :accessor min-rest + (defclass lookbehind (regex) + ((regex :initarg :regex + :accessor regex + :documentation "The REGEX object we're checking.") + (positivep :initarg :positivep + :reader positivep + :documentation "Whether this assertion is positive.") + (len :initarg :len + :accessor len :type fixnum - :documentation "The minimal number of characters which must + :documentation "The (fixed) length of the enclosed regex.")) + (:documentation "LOOKBEHIND objects represent look-behind assertions.")) + + (defclass repetition (regex) + ((regex :initarg :regex + :accessor regex + :documentation "The REGEX that's repeated.") + (greedyp :initarg :greedyp + :reader greedyp + :documentation "Whether the repetition is greedy.") + (minimum :initarg :minimum + :accessor minimum + :type fixnum + :documentation "The minimal number of repetitions.") + (maximum :initarg :maximum + :accessor maximum + :documentation "The maximal number of repetitions. +Can be NIL for unbounded.") + (min-len :initarg :min-len + :reader min-len + :documentation "The minimal length of the enclosed regex.") + (len :initarg :len + :reader len + :documentation "The length of the enclosed regex. NIL +if unknown.") + (min-rest :initform 0 + :accessor min-rest + :type fixnum + :documentation "The minimal number of characters which must appear after this repetition.") - (contains-register-p :initarg :contains-register-p - :reader contains-register-p - :documentation "If the regex contains a register.")) - (:documentation "REPETITION objects represent repetitions of regexes.")) + (contains-register-p :initarg :contains-register-p + :reader contains-register-p + :documentation "If the regex contains a register.")) + (:documentation "REPETITION objects represent repetitions of regexes.")) - (defclass register (regex) - ((regex :initarg :regex - :accessor regex - :documentation "The inner regex.") - (num :initarg :num - :reader num - :type fixnum - :documentation "The number of this register, starting from 0. + (defclass register (regex) + ((regex :initarg :regex + :accessor regex + :documentation "The inner regex.") + (num :initarg :num + :reader num + :type fixnum + :documentation "The number of this register, starting from 0. This is the index into *REGS-START* and *REGS-END*.")) - (:documentation "REGISTER objects represent register groups.")) + (:documentation "REGISTER objects represent register groups.")) - (defclass standalone (regex) - ((regex :initarg :regex - :accessor regex - :documentation "The inner regex.")) - (:documentation "A standalone regular expression.")) + (defclass standalone (regex) + ((regex :initarg :regex + :accessor regex + :documentation "The inner regex.")) + (:documentation "A standalone regular expression.")) - (defclass back-reference (regex) - ((num :initarg :num - :accessor num - :type fixnum - :documentation "The number of the register this reference refers to.") - (case-insensitive-p :initarg :case-insensitive-p - :reader case-insensitive-p - :documentation "Whether we check case-insensitively.")) - (:documentation "BACK-REFERENCE objects represent backreferences.")) + (defclass back-reference (regex) + ((num :initarg :num + :accessor num + :type fixnum + :documentation "The number of the register this +reference refers to.") + (case-insensitive-p :initarg :case-insensitive-p + :reader case-insensitive-p + :documentation "Whether we check +case-insensitively.")) + (:documentation "BACK-REFERENCE objects represent backreferences.")) - (defclass char-class (regex) - ((hash :initarg :hash - :reader hash - :type (or hash-table null) - :documentation "A hash table the keys of which are the characters; -the values are always T.") - (case-insensitive-p :initarg :case-insensitive-p - :reader case-insensitive-p - :documentation "If the char class case-insensitive.") - (invertedp :initarg :invertedp - :reader invertedp - :documentation "Whether we mean the inverse of the char class.") - (word-char-class-p :initarg :word-char-class-p - :reader word-char-class-p - :documentation "Whether this CHAR CLASS + (defclass char-class (regex) + ((hash :initarg :hash + :reader hash + :type (or hash-table null) + :documentation "A hash table the keys of which are the +characters; the values are always T.") + (case-insensitive-p :initarg :case-insensitive-p + :reader case-insensitive-p + :documentation "If the char class +case-insensitive.") + (invertedp :initarg :invertedp + :reader invertedp + :documentation "Whether we mean the inverse of +the char class.") + (word-char-class-p :initarg :word-char-class-p + :reader word-char-class-p + :documentation "Whether this CHAR CLASS represents the special class WORD-CHAR-CLASS.")) - (:documentation "CHAR-CLASS objects represent character classes.")) + (:documentation "CHAR-CLASS objects represent character classes.")) - (defclass str (regex) - ((str :initarg :str - :accessor str - :type string - :documentation "The actual string.") - (len :initform 0 - :accessor len - :type fixnum - :documentation "The length of the string.") - (case-insensitive-p :initarg :case-insensitive-p - :reader case-insensitive-p - :documentation "If we match case-insensitively.") - (offset :initform nil - :accessor offset - :documentation "Offset from the left of the whole parse tree. -The first regex has offset 0. -NIL if unknown, i.e. behind a variable-length regex.") - (skip :initform nil - :initarg :skip - :accessor skip - :documentation "If we can avoid testing for this string -because the SCAN function has done this already.") - (start-of-end-string-p :initform nil - :accessor start-of-end-string-p - :documentation "If this is the unique STR which -starts END-STRING (a slot of MATCHER).")) - (:documentation "STR objects represent string.")) + (defclass str (regex) + ((str :initarg :str + :accessor str + :type string + :documentation "The actual string.") + (len :initform 0 + :accessor len + :type fixnum + :documentation "The length of the string.") + (case-insensitive-p :initarg :case-insensitive-p + :reader case-insensitive-p + :documentation "If we match case-insensitively.") + (offset :initform nil + :accessor offset + :documentation "Offset from the left of the whole +parse tree. The first regex has offset 0. NIL if unknown, i.e. behind +a variable-length regex.") + (skip :initform nil + :initarg :skip + :accessor skip + :documentation "If we can avoid testing for this +string because the SCAN function has done this already.") + (start-of-end-string-p :initform nil + :accessor start-of-end-string-p + :documentation "If this is the unique +STR which starts END-STRING (a slot of MATCHER).")) + (:documentation "STR objects represent string.")) - (defclass anchor (regex) - ((startp :initarg :startp - :reader startp - :documentation "Whether this is a \"start anchor\".") - (multi-line-p :initarg :multi-line-p - :reader multi-line-p - :documentation "Whether we're in multi-line mode, + (defclass anchor (regex) + ((startp :initarg :startp + :reader startp + :documentation "Whether this is a \"start anchor\".") + (multi-line-p :initarg :multi-line-p + :reader multi-line-p + :documentation "Whether we're in multi-line mode, i.e. whether each #\\Newline is surrounded by anchors.") - (no-newline-p :initarg :no-newline-p - :reader no-newline-p - :documentation "Whether we ignore #\\Newline at the end.")) - (:documentation "ANCHOR objects represent anchors like \"^\" or \"$\".")) + (no-newline-p :initarg :no-newline-p + :reader no-newline-p + :documentation "Whether we ignore #\\Newline at the end.")) + (:documentation "ANCHOR objects represent anchors like \"^\" or \"$\".")) - (defclass everything (regex) - ((single-line-p :initarg :single-line-p - :reader single-line-p - :documentation "Whether we're in single-line mode, + (defclass everything (regex) + ((single-line-p :initarg :single-line-p + :reader single-line-p + :documentation "Whether we're in single-line mode, i.e. whether we also match #\\Newline.")) - (:documentation "EVERYTHING objects represent regexes matching + (:documentation "EVERYTHING objects represent regexes matching \"everything\", i.e. dots.")) - (defclass word-boundary (regex) - ((negatedp :initarg :negatedp - :reader negatedp - :documentation "Whether we mean the opposite, + (defclass word-boundary (regex) + ((negatedp :initarg :negatedp + :reader negatedp + :documentation "Whether we mean the opposite, i.e. no word-boundary.")) - (:documentation "WORD-BOUNDARY objects represent word-boundary assertions.")) + (:documentation "WORD-BOUNDARY objects represent word-boundary assertions.")) - (defclass branch (regex) - ((test :initarg :test - :accessor test - :documentation "The test of this branch, one of LOOKAHEAD, -LOOKBEHIND, or a number.") - (then-regex :initarg :then-regex - :accessor then-regex - :documentation "The regex that's to be matched if the + (defclass branch (regex) + ((test :initarg :test + :accessor test + :documentation "The test of this branch, one of +LOOKAHEAD, LOOKBEHIND, or a number.") + (then-regex :initarg :then-regex + :accessor then-regex + :documentation "The regex that's to be matched if the test succeeds.") - (else-regex :initarg :else-regex - :initform (make-instance 'void) - :accessor else-regex - :documentation "The regex that's to be matched if the + (else-regex :initarg :else-regex + :initform (make-instance 'void) + :accessor else-regex + :documentation "The regex that's to be matched if the test fails.")) - (:documentation "BRANCH objects represent Perl's conditional regular + (:documentation "BRANCH objects represent Perl's conditional regular expressions.")) + + (defclass filter (regex) + ((fn :initarg :fn + :accessor fn + :type (or function symbol) + :documentation "The user-defined function.") + (len :initarg :len + :reader len + :documentation "The fixed length of this filter or NIL.")) + (:documentation "FILTER objects represent arbitrary functions +defined by the user.")) - (defclass void (regex) - () - (:documentation "VOID objects represent empty regular expressions."))) + (defclass void (regex) + () + (:documentation "VOID objects represent empty regular expressions.")))) -(declaim (ftype (function (t) simple-string) str)) +(defmethod initialize-instance :after ((char-class char-class) &rest init-args) + (declare #.*standard-optimize-settings*) + "Make large hash tables smaller, if possible." + (let ((hash (getf init-args :hash))) + (when (and hash + (> *regex-char-code-limit* 256) + (> (hash-table-count hash) + (/ *regex-char-code-limit* 2))) + (setf (slot-value char-class 'hash) + (merge-inverted-hash (make-hash-table) + hash) + (slot-value char-class 'invertedp) + (not (slot-value char-class 'invertedp)))))) ;;; The following four methods allow a VOID object to behave like a ;;; zero-length STR object (only readers needed) (defmethod initialize-instance :after ((str str) &rest init-args) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (ignore init-args)) "Automatically computes the length of a STR after initialization." (let ((str-slot (slot-value str 'str))) @@ -256,48 +278,23 @@ expressions.")) (setf (len str) (length (str str)))) (defmethod len ((void void)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) 0) (defmethod str ((void void)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "") (defmethod skip ((void void)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) nil) (defmethod start-of-end-string-p ((void void)) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) nil) (defgeneric case-mode (regex old-case-mode) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Utility function used by the optimizer (see GATHER-STRINGS). Returns a keyword denoting the case-(in)sensitivity of a STR or its second argument if the STR has length 0. Returns NIL for REGEX objects @@ -316,12 +313,7 @@ which are not of type STR.")) nil) (defgeneric copy-regex (regex) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Implements a deep copy of a REGEX object.")) (defmethod copy-regex ((anchor anchor)) @@ -406,6 +398,11 @@ which are not of type STR.")) :str (str str) :case-insensitive-p (case-insensitive-p str))) +(defmethod copy-regex ((filter filter)) + (make-instance 'filter + :fn (fn filter) + :len (len filter))) + ;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been ;;; wrapped into one function. Maybe in the next release... @@ -417,12 +414,7 @@ which are not of type STR.")) ;;; and therefore we stop REGISTER removal once we see an ALTERNATION. (defgeneric remove-registers (regex) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and optionally removes embedded REGISTER objects if possible and if the special variable REMOVE-REGISTERS-P is true.")) @@ -491,12 +483,7 @@ special variable REMOVE-REGISTERS-P is true.")) :elements (mapcar #'remove-registers (elements seq)))) (defgeneric everythingp (regex) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Returns an EVERYTHING object if REGEX is equivalent to this object, otherwise NIL. So, \"(.){1}\" would return true (i.e. the object corresponding to \".\", for example.")) @@ -539,16 +526,11 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (defmethod everythingp ((regex regex)) ;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS, - ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, and WORD-BOUNDARY + ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY nil) (defgeneric regex-length (regex) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Return the length of REGEX if it is fixed, NIL otherwise.")) (defmethod regex-length ((seq seq)) @@ -586,7 +568,7 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (maximum maximum)) repetition (if (and len - (eq minimum maximum)) + (eql minimum maximum)) (* minimum len) nil))) @@ -610,18 +592,16 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (defmethod regex-length ((str str)) (len str)) +(defmethod regex-length ((filter filter)) + (len filter)) + (defmethod regex-length ((regex regex)) ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and ;; WORD-BOUNDARY (which all have zero-length) 0) (defgeneric regex-min-length (regex) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Returns the minimal length of REGEX.")) (defmethod regex-min-length ((seq seq)) @@ -662,18 +642,17 @@ to this object, otherwise NIL. So, \"(.){1}\" would return true (defmethod regex-min-length ((str str)) (len str)) +(defmethod regex-min-length ((filter filter)) + (or (len filter) + 0)) + (defmethod regex-min-length ((regex regex)) ;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD, ;; LOOKBEHIND, VOID, and WORD-BOUNDARY 0) (defgeneric compute-offsets (regex start-pos) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (:documentation "Returns the offset the following regex would have relative to START-POS or NIL if we can't compute it. Sets the OFFSET slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET @@ -746,7 +725,13 @@ slots of STR objects further down the tree.")) (declare (ignore start-pos)) nil) +(defmethod compute-offsets ((filter filter) start-pos) + (let ((len (len filter))) + (if len + (+ start-pos len) + nil))) + (defmethod compute-offsets ((regex regex) start-pos) ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and ;; WORD-BOUNDARY (which all have zero-length) - start-pos) \ No newline at end of file + start-pos) diff --git a/repetition-closures.lisp b/repetition-closures.lisp index b51fec0..db1c5a1 100644 --- a/repetition-closures.lisp +++ b/repetition-closures.lisp @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/repetition-closures.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.24 2005/04/13 15:35:58 edi Exp $ ;;; This is actually a part of closures.lisp which we put into a ;;; separate file because it is rather complex. We only deal with @@ -7,7 +7,7 @@ ;;; rather crazy micro-optimizations which were introduced to be as ;;; competitive with Perl as possible in tight loops. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -117,12 +117,7 @@ repetition matches at CURR-POS." (go backward-loop))))))) (defun create-greedy-everything-matcher (maximum min-rest next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (type fixnum min-rest) (type function next-fn)) "Creates a closure which just matches as far ahead as possible, @@ -149,18 +144,16 @@ i.e. a closure for a dot in single-line mode." (loop for curr-pos of-type fixnum from target-end-pos downto start-pos thereis (funcall next-fn curr-pos)))))) -(defmethod create-greedy-constant-length-matcher ((repetition repetition) - next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Creates a closure which tries to match REPETITION. It is assumed +(defgeneric create-greedy-constant-length-matcher (repetition next-fn) + (declare #.*standard-optimize-settings*) + (:documentation "Creates a closure which tries to match REPETITION. It is assumed that REPETITION is greedy and the minimal number of repetitions is zero. It is furthermore assumed that the inner regex of REPETITION is -of fixed length and doesn't contain registers." +of fixed length and doesn't contain registers.")) + +(defmethod create-greedy-constant-length-matcher ((repetition repetition) + next-fn) + (declare #.*standard-optimize-settings*) (let ((len (len repetition)) (maximum (maximum repetition)) (regex (regex repetition)) @@ -212,19 +205,17 @@ of fixed length and doesn't contain registers." (declare (type function inner-matcher)) (greedy-constant-length-closure (funcall inner-matcher curr-pos))))))))) - -(defmethod create-greedy-no-zero-matcher ((repetition repetition) next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Creates a closure which tries to match REPETITION. It is assumed + +(defgeneric create-greedy-no-zero-matcher (repetition next-fn) + (declare #.*standard-optimize-settings*) + (:documentation "Creates a closure which tries to match REPETITION. It is assumed that REPETITION is greedy and the minimal number of repetitions is zero. It is furthermore assumed that the inner regex of REPETITION can never match a zero-length string (or instead the maximal number of -repetitions is 1)." +repetitions is 1).")) + +(defmethod create-greedy-no-zero-matcher ((repetition repetition) next-fn) + (declare #.*standard-optimize-settings*) (let ((maximum (maximum repetition)) ;; REPEAT-MATCHER is part of the closure's environment but it ;; can only be defined after GREEDY-AUX is defined @@ -283,16 +274,14 @@ repetitions is 1)." (create-matcher-aux (regex repetition) #'greedy-aux)) #'greedy-aux))))) -(defmethod create-greedy-matcher ((repetition repetition) next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Creates a closure which tries to match REPETITION. It is assumed +(defgeneric create-greedy-matcher (repetition next-fn) + (declare #.*standard-optimize-settings*) + (:documentation "Creates a closure which tries to match REPETITION. It is assumed that REPETITION is greedy and the minimal number of repetitions is -zero." +zero.")) + +(defmethod create-greedy-matcher ((repetition repetition) next-fn) + (declare #.*standard-optimize-settings*) (let ((maximum (maximum repetition)) ;; we make a reservation for our slot in *LAST-POS-STORES* because ;; we have to watch out for endless loops as the inner regex might @@ -409,17 +398,15 @@ repetition matches at CURR-POS." while ,check-curr-pos finally (return (funcall next-fn curr-pos))))))) -(defmethod create-non-greedy-constant-length-matcher ((repetition repetition) next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Creates a closure which tries to match REPETITION. It is assumed +(defgeneric create-non-greedy-constant-length-matcher (repetition next-fn) + (declare #.*standard-optimize-settings*) + (:documentation "Creates a closure which tries to match REPETITION. It is assumed that REPETITION is non-greedy and the minimal number of repetitions is zero. It is furthermore assumed that the inner regex of REPETITION is -of fixed length and doesn't contain registers." +of fixed length and doesn't contain registers.")) + +(defmethod create-non-greedy-constant-length-matcher ((repetition repetition) next-fn) + (declare #.*standard-optimize-settings*) (let ((len (len repetition)) (maximum (maximum repetition)) (regex (regex repetition)) @@ -475,18 +462,16 @@ of fixed length and doesn't contain registers." (non-greedy-constant-length-closure (funcall inner-matcher curr-pos))))))))) -(defmethod create-non-greedy-no-zero-matcher ((repetition repetition) next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Creates a closure which tries to match REPETITION. It is assumed +(defgeneric create-non-greedy-no-zero-matcher (repetition next-fn) + (declare #.*standard-optimize-settings*) + (:documentation "Creates a closure which tries to match REPETITION. It is assumed that REPETITION is non-greedy and the minimal number of repetitions is zero. It is furthermore assumed that the inner regex of REPETITION can never match a zero-length string (or instead the maximal number of -repetitions is 1)." +repetitions is 1).")) + +(defmethod create-non-greedy-no-zero-matcher ((repetition repetition) next-fn) + (declare #.*standard-optimize-settings*) (let ((maximum (maximum repetition)) ;; REPEAT-MATCHER is part of the closure's environment but it ;; can only be defined after NON-GREEDY-AUX is defined @@ -543,16 +528,14 @@ repetitions is 1)." (create-matcher-aux (regex repetition) #'non-greedy-aux)) #'non-greedy-aux))))) -(defmethod create-non-greedy-matcher ((repetition repetition) next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Creates a closure which tries to match REPETITION. It is assumed +(defgeneric create-non-greedy-matcher (repetition next-fn) + (declare #.*standard-optimize-settings*) + (:documentation "Creates a closure which tries to match REPETITION. It is assumed that REPETITION is non-greedy and the minimal number of repetitions is -zero." +zero.")) + +(defmethod create-non-greedy-matcher ((repetition repetition) next-fn) + (declare #.*standard-optimize-settings*) ;; we make a reservation for our slot in *LAST-POS-STORES* because ;; we have to watch out for endless loops as the inner regex might ;; match zero-length strings @@ -656,18 +639,17 @@ of the repetition matches at CURR-POS." ;; finally call NEXT-FN if we made it that far (funcall next-fn target-end-pos))))) -(defmethod create-constant-repetition-constant-length-matcher - ((repetition repetition) next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Creates a closure which tries to match REPETITION. It is assumed +(defgeneric create-constant-repetition-constant-length-matcher + (repetition next-fn) + (declare #.*standard-optimize-settings*) + (:documentation "Creates a closure which tries to match REPETITION. It is assumed that REPETITION has a constant number of repetitions. It is furthermore assumed that the inner regex of REPETITION is of fixed -length and doesn't contain registers." +length and doesn't contain registers.")) + +(defmethod create-constant-repetition-constant-length-matcher + ((repetition repetition) next-fn) + (declare #.*standard-optimize-settings*) (let ((len (len repetition)) (repetitions (minimum repetition)) (regex (regex repetition))) @@ -721,8 +703,8 @@ length and doesn't contain registers." (declare (type fixnum start-pos)) (let ((next-pos (+ start-pos repetitions))) (declare (type fixnum next-pos)) - (or (<= next-pos *end-pos*) - (funcall next-fn next-pos)))) + (and (<= next-pos *end-pos*) + (funcall next-fn next-pos)))) ;; a dot which is not in single-line-mode - make sure we ;; don't match #\Newline (constant-repetition-constant-length-closure @@ -736,15 +718,13 @@ length and doesn't contain registers." (constant-repetition-constant-length-closure (funcall inner-matcher curr-pos)))))))) +(defgeneric create-constant-repetition-matcher (repetition next-fn) + (declare #.*standard-optimize-settings*) + (:documentation "Creates a closure which tries to match REPETITION. It is assumed +that REPETITION has a constant number of repetitions.")) + (defmethod create-constant-repetition-matcher ((repetition repetition) next-fn) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) - "Creates a closure which tries to match REPETITION. It is assumed -that REPETITION has a constant number of repetitions." + (declare #.*standard-optimize-settings*) (let ((repetitions (minimum repetition)) ;; we make a reservation for our slot in *REPEAT-COUNTERS* ;; because we need to keep track of the number of repetitions diff --git a/scanner.lisp b/scanner.lisp index 7d26421..62b04bf 100644 --- a/scanner.lisp +++ b/scanner.lisp @@ -1,10 +1,10 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/scanner.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.26 2005/07/19 23:18:15 edi Exp $ ;;; Here the scanner for the actual regex as well as utility scanners ;;; for the constant start and end strings are created. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -37,7 +37,8 @@ (let ((char-compare (if case-insensitive-p 'char-equal 'char=))) `(lambda (start-pos) (declare (type fixnum start-pos)) - (if (> (the fixnum (+ start-pos m)) *end-pos*) + (if (or (minusp start-pos) + (> (the fixnum (+ start-pos m)) *end-pos*)) nil (loop named bmh-matcher for k of-type fixnum = (+ start-pos m -1) @@ -52,12 +53,7 @@ (return-from bmh-matcher (1+ i))))))))) (defun create-bmh-matcher (pattern case-insensitive-p) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns a Boyer-Moore-Horspool matcher which searches the (special) simple-string *STRING* for the first occurence of the substring PATTERN. The search starts at the position START-POS within *STRING* @@ -72,11 +68,12 @@ instead. (BMH matchers are faster but need much more space.)" (return-from create-bmh-matcher (lambda (start-pos) (declare (type fixnum start-pos)) - (search pattern - *string* - :start2 start-pos - :end2 *end-pos* - :test test))))) + (and (not (minusp start-pos)) + (search pattern + *string* + :start2 start-pos + :end2 *end-pos* + :test test)))))) (let* ((m (length pattern)) (skip (make-array *regex-char-code-limit* :element-type 'fixnum @@ -97,16 +94,12 @@ instead. (BMH matchers are faster but need much more space.)" (let ((char-compare (if case-insensitive-p 'char-equal 'char=))) `(lambda (start-pos) (declare (type fixnum start-pos)) - (loop for i of-type fixnum from start-pos below *end-pos* - thereis (and (,char-compare (schar *string* i) chr) i))))) + (and (not (minusp start-pos)) + (loop for i of-type fixnum from start-pos below *end-pos* + thereis (and (,char-compare (schar *string* i) chr) i)))))) (defun create-char-searcher (chr case-insensitive-p) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns a function which searches the (special) simple-string *STRING* for the first occurence of the character CHR. The search starts at the position START-POS within *STRING* and stops before @@ -119,17 +112,16 @@ case-insensitive or not." (declaim (inline newline-skipper)) (defun newline-skipper (start-pos) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (type fixnum start-pos)) "Find the next occurence of a character in *STRING* which is behind a #\Newline." - (loop for i of-type fixnum from start-pos below *end-pos* - thereis (and (char= (schar *string* i) #\Newline) + ;; we can start with (1- START-POS) without testing for (PLUSP + ;; START-POS) because we know we'll never call NEWLINE-SKIPPER on + ;; the first iteration + (loop for i of-type fixnum from (1- start-pos) below *end-pos* + thereis (and (char= (schar *string* i) + #\Newline) (1+ i)))) (defmacro insert-advance-fn (advance-fn) @@ -198,6 +190,7 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX." (unless (setq *end-string-pos* (funcall end-string-test end-test-pos)) (when (and (= 1 (the fixnum end-anchored-p)) + (> *end-pos* scan-start-pos) (char= #\Newline (schar *string* (1- *end-pos*)))) ;; if we didn't find an end string candidate from ;; END-TEST-POS and if a #\Newline at the end is @@ -328,12 +321,7 @@ ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX." rep-num zero-length-num reg-num) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) (declare (type fixnum min-len zero-length-num rep-num reg-num)) "Auxiliary function to create and return a scanner \(which is actually a closure). Used by CREATE-SCANNER." @@ -516,4 +504,4 @@ actually a closure). Used by CREATE-SCANNER." ;; expression to optimize so we just return POS (insert-advance-fn (advance-fn (pos) - pos)))))) \ No newline at end of file + pos)))))) diff --git a/specials.lisp b/specials.lisp index f6f2447..0536349 100644 --- a/specials.lisp +++ b/specials.lisp @@ -1,9 +1,9 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/specials.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/specials.lisp,v 1.21 2005/04/01 21:29:10 edi Exp $ ;;; globally declared special variables -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -31,6 +31,22 @@ (in-package #:cl-ppcre) +;;; special variables used to effect declarations + +(defvar *standard-optimize-settings* + '(optimize + speed + (safety 0) + (space 0) + (debug 1) + (compilation-speed 0) + #+:lispworks (hcl:fixnum-safety 0)) + "The standard optimize settings used by most declaration expressions.") + +(defvar *special-optimize-settings* + '(optimize speed space) + "Special optimize settings used only be a few declaration expressions.") + ;;; special variables used by the lexer/parser combo (defvar *extended-mode-p* nil @@ -104,4 +120,23 @@ but large) Boyer-Moore-Horspool matchers.") (defvar *allow-quoting* nil "Whether the parser should support Perl's \\Q and \\E.") -(pushnew :cl-ppcre *features*) \ No newline at end of file +(pushnew :cl-ppcre *features*) + +;; stuff for Nikodemus Siivola's HYPERDOC +;; see +;; and + +(defvar *hyperdoc-base-uri* "http://weitz.de/cl-ppcre/") + +(let ((exported-symbols-alist + (loop for symbol being the external-symbols of :cl-ppcre + collect (cons symbol + (concatenate 'string + "#" + (string-downcase symbol)))))) + (defun hyperdoc-lookup (symbol type) + (declare (ignore type)) + (cdr (assoc symbol + exported-symbols-alist + :test #'eq)))) + diff --git a/util.lisp b/util.lisp index 519b4aa..869a263 100644 --- a/util.lisp +++ b/util.lisp @@ -1,5 +1,5 @@ ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- -;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/util.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ +;;; $Header: /usr/local/cvsrep/cl-ppcre/util.lisp,v 1.32 2005/08/23 10:32:30 edi Exp $ ;;; Utility functions and constants dealing with the hash-tables ;;; we use to encode character classes @@ -7,7 +7,7 @@ ;;; Hash-tables are treated like sets, i.e. a character C is a member of the ;;; hash-table H iff (GETHASH C H) is true. -;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. +;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions @@ -35,6 +35,10 @@ (in-package #:cl-ppcre) +#+:lispworks +(import 'lw:with-unique-names) + +#-:lispworks (defmacro with-unique-names ((&rest bindings) &body body) "Syntax: WITH-UNIQUE-NAMES ( { var | (var x) }* ) declaration* form* @@ -65,8 +69,14 @@ are discarded \(that is, the body is an implicit PROGN)." bindings) ,@body)) -(defmacro rebinding (bindings &body body) - "REBINDING ( { var | (var prefix) }* ) form* +#+:lispworks +(eval-when (:compile-toplevel :load-toplevel :execute) + (setf (macro-function 'with-rebinding) + (macro-function 'lw:rebinding))) + +#-:lispworks +(defmacro with-rebinding (bindings &body body) + "WITH-REBINDING ( { var | (var prefix) }* ) form* Evaluates a series of forms in the lexical environment that is formed by adding the binding of each VAR to a fresh, uninterned @@ -94,14 +104,14 @@ are discarded \(that is, the body is an implicit PROGN)." (eval-when (:compile-toplevel :execute :load-toplevel) (defvar *regex-char-code-limit* char-code-limit - "The upper exclusive bound on the char-codes of characters -which can occur in character classes. -Change this value BEFORE creating scanners if you don't need -the full Unicode support of LW, ACL, or CLISP.") + "The upper exclusive bound on the char-codes of characters which +can occur in character classes. Change this value BEFORE creating +scanners if you don't need the Unicode support of implementations like +AllegroCL, CLISP, LispWorks, or SBCL.") (declaim (type fixnum *regex-char-code-limit*)) (defun make-char-hash (test) - (declare (optimize speed space)) + (declare #.*special-optimize-settings*) "Returns a hash-table of all characters satisfying test." (loop with hash = (make-hash-table) for c of-type fixnum from 0 below char-code-limit @@ -113,12 +123,7 @@ the full Unicode support of LW, ACL, or CLISP.") (declaim (inline word-char-p)) (defun word-char-p (chr) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Tests whether a character is a \"word\" character. In the ASCII charset this is equivalent to a-z, A-Z, 0-9, or _, i.e. the same as Perl's [\\w]." @@ -134,7 +139,7 @@ i.e. the same as Perl's [\\w]." Same as Perl's [\\s].")) (defun whitespacep (chr) - (declare (optimize speed space)) + (declare #.*special-optimize-settings*) "Tests whether a character is whitespace, i.e. whether it would match [\\s] in Perl." (find chr +whitespace-char-string+ :test #'char=))) @@ -158,12 +163,7 @@ i.e. whether it would match [\\s] in Perl." "Hash-table containing all whitespace characters.")) (defun merge-hash (hash1 hash2) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns the \"sum\" of two hashes. This is a destructive operation on HASH1." (cond ((> (hash-table-count hash2) @@ -180,12 +180,7 @@ on HASH1." hash1) (defun merge-inverted-hash (hash1 hash2) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Returns the \"sum\" of HASH1 and the \"inverse\" of HASH2. This is a destructive operation on HASH1." (loop for c of-type fixnum from 0 below *regex-char-code-limit* @@ -195,12 +190,7 @@ a destructive operation on HASH1." hash1) (defun create-ranges-from-hash (hash &key downcasep) - (declare (optimize speed - (safety 0) - (space 0) - (debug 0) - (compilation-speed 0) - #+:lispworks (hcl:fixnum-safety 0))) + (declare #.*standard-optimize-settings*) "Tries to identify up to three intervals (with respect to CHAR<) which together comprise HASH. Returns NIL if this is not possible. If DOWNCASEP is true it will treat the hash-table as if it represents @@ -276,3 +266,33 @@ will only return the respective lower-case intervals." :element-type (array-element-type sequence) :displaced-to sequence :displaced-index-offset start)) + +(defun normalize-var-list (var-list) + "Utility function for REGISTER-GROUPS-BIND and +DO-REGISTER-GROUPS. Creates the long form \(a list of \(FUNCTION VAR) +entries) out of the short form of VAR-LIST." + (loop for element in var-list + if (consp element) + nconc (loop for var in (rest element) + collect (list (first element) var)) + else + collect (list '(function identity) element))) + +(defun string-list-to-simple-string (string-list) + (declare #.*standard-optimize-settings*) + "Concatenates a list of strings to one simple-string." + ;; this function provided by JP Massar; note that we can't use APPLY + ;; with CONCATENATE here because of CALL-ARGUMENTS-LIMIT + (let ((total-size 0)) + (declare (type fixnum total-size)) + (dolist (string string-list) + #-genera (declare (type string string)) + (incf total-size (length string))) + (let ((result-string (make-sequence 'simple-string total-size)) + (curr-pos 0)) + (declare (type fixnum curr-pos)) + (dolist (string string-list) + #-genera (declare (type string string)) + (replace result-string string :start1 curr-pos) + (incf curr-pos (length string))) + result-string)))