From 7d64a90203a7987a15a2ea7386367c741fb5766d Mon Sep 17 00:00:00 2001 From: Stas Boukarev Date: Wed, 13 Feb 2013 21:29:23 +0400 Subject: [PATCH 1/3] Rewrite SEQ without using recursion. Allows for very long regexes to be processed. Doesn't require the result to be reversed afterwards. --- lexer.lisp | 5 +-- parser.lisp | 105 +++++++++++++++++++--------------------------------- 2 files changed, 40 insertions(+), 70 deletions(-) diff --git a/lexer.lisp b/lexer.lisp index 71c2642..032b5ca 100644 --- a/lexer.lisp +++ b/lexer.lisp @@ -290,8 +290,7 @@ handled elsewhere." (return)) (write-char char out)))))) (list (if (char= first-char #\p) :property :inverted-property) - ;; we must reverse here because of what PARSE-STRING does - (nreverse name)))) + name))) (defun collect-char-class (lexer) "Reads and consumes characters from regex string until a right @@ -571,7 +570,7 @@ closing #\> will also be consumed." ;; back-referencing a named register (incf (lexer-pos lexer)) (list :back-reference - (nreverse (parse-register-name-aux lexer)))) + (parse-register-name-aux lexer))) (t ;; false alarm, just unescape \k #\k))) diff --git a/parser.lisp b/parser.lisp index f6d9a3d..72e39e9 100644 --- a/parser.lisp +++ b/parser.lisp @@ -128,10 +128,7 @@ Will return or \( ) where ;; have been the "(?:"")" production (cons :group (nconc flags (list reg-expr))) (if (eq open-token :open-paren-less-letter) - (list :named-register - ;; every string was reversed, so we have to - ;; reverse it back to get the name - (nreverse register-name) + (list :named-register register-name reg-expr) (list (case open-token ((:open-paren) @@ -201,54 +198,42 @@ Will return or (:SEQUENCE )." ;; to parse a or in order to catch empty regular ;; expressions (if (start-of-subexpr-p lexer) - (let ((quant (quant lexer))) - (if (start-of-subexpr-p lexer) - (let* ((seq (seq lexer)) - (quant-is-char-p (characterp quant)) - (seq-is-sequence-p (and (consp seq) - (eq (first seq) :sequence)))) - (cond ((and quant-is-char-p - (characterp seq)) - (make-array-from-two-chars seq quant)) - ((and quant-is-char-p - (stringp seq)) - (vector-push-extend quant seq) - seq) - ((and quant-is-char-p - seq-is-sequence-p - (characterp (second seq))) - (cond ((cddr seq) - (setf (cdr seq) - (cons - (make-array-from-two-chars (second seq) - quant) - (cddr seq))) - seq) - (t (make-array-from-two-chars (second seq) quant)))) - ((and quant-is-char-p - seq-is-sequence-p - (stringp (second seq))) - (cond ((cddr seq) - (setf (cdr seq) - (cons - (progn - (vector-push-extend quant (second seq)) - (second seq)) - (cddr seq))) - seq) - (t - (vector-push-extend quant (second seq)) - (second seq)))) - (seq-is-sequence-p - ;; if is also a :SEQUENCE parse tree we merge - ;; both lists into one to avoid unnecessary consing - (setf (cdr seq) - (cons quant (cdr seq))) - seq) - (t (list :sequence quant seq)))) - quant)) - :void))) - + (loop with seq-is-sequence-p = nil + with last-cdr + for quant = (quant lexer) + for quant-is-char-p = (characterp quant) + for seq = quant + then + (cond ((and quant-is-char-p (characterp seq)) + (make-array-from-two-chars seq quant)) + ((and quant-is-char-p (stringp seq)) + (vector-push-extend quant seq) + seq) + ((not seq-is-sequence-p) + (setf last-cdr (list quant) + seq-is-sequence-p t) + (list* :sequence seq last-cdr)) + ((and quant-is-char-p + (characterp (car last-cdr))) + (setf (car last-cdr) + (make-array-from-two-chars (car last-cdr) + quant)) + seq) + ((and quant-is-char-p + (stringp (car last-cdr))) + (vector-push-extend quant (car last-cdr)) + seq) + (t + ;; if is also a :SEQUENCE parse tree we merge + ;; both lists into one + (let ((cons (list quant))) + (psetf last-cdr cons + (cdr last-cdr) cons)) + seq)) + while (start-of-subexpr-p lexer) + finally (return seq)) + :void))) + (defun reg-expr (lexer) "Parses and consumes a , a complete regular expression. The productions are: -> | \"|\". @@ -294,25 +279,11 @@ Will return or (:ALTERNATION )." (setf (lexer-pos lexer) pos) seq))))))) -(defun reverse-strings (parse-tree) - "Recursively walks through PARSE-TREE and destructively reverses all -strings in it." - (declare #.*standard-optimize-settings*) - (cond ((stringp parse-tree) - (nreverse parse-tree)) - ((consp parse-tree) - (loop for parse-tree-rest on parse-tree - while parse-tree-rest - do (setf (car parse-tree-rest) - (reverse-strings (car parse-tree-rest)))) - parse-tree) - (t parse-tree))) - (defun parse-string (string) "Translate the regex string STRING into a parse tree." (declare #.*standard-optimize-settings*) (let* ((lexer (make-lexer string)) - (parse-tree (reverse-strings (reg-expr lexer)))) + (parse-tree (reg-expr lexer))) ;; check whether we've consumed the whole regex string (if (end-of-string-p lexer) parse-tree From b7706d3a1017135cf320a6d0fad0f4ce7753ecbc Mon Sep 17 00:00:00 2001 From: Stas Boukarev Date: Thu, 11 Apr 2013 14:06:37 +0400 Subject: [PATCH 2/3] Add a test-case which exercises a branch of end-string-aux. --- test/simple | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/simple b/test/simple index 69c3fe7..35f052f 100644 --- a/test/simple +++ b/test/simple @@ -347,3 +347,19 @@ characters if there's a match." for s = (create-scanner "(([a-c])+)x") always (equalp (multiple-value-list (scan s "abcxy")) (list 0 4 #(0 2) #(3 3)))) + +(labels ((char-code-odd-p (char) + (oddp (char-code char))) + (char-code-even-p (char) + (evenp (char-code char))) + (resolver (name) + (cond ((string= name "odd") #'char-code-odd-p) + ((string= name "even") #'char-code-even-p) + ((string= name "true") (constantly t)) + (t (error "Can't resolve ~S." name))))) + (equalp + (let ((*property-resolver* #'resolver)) + (list (regex-replace-all (create-scanner "\\p{odd}") "abcd" "+") + (regex-replace-all (create-scanner "\\p{even}") "abcd" "+") + (regex-replace-all (create-scanner "\\p{true}") "abcd" "+"))) + '("+b+d" "a+c+" "++++"))) From 792c82fc39a5013fbf9c25436b369e0519ae1d10 Mon Sep 17 00:00:00 2001 From: Stas Boukarev Date: Thu, 11 Apr 2013 14:08:37 +0400 Subject: [PATCH 3/3] Add print-object methods for some regex classes. --- optimize.lisp | 2 +- regex-class.lisp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/optimize.lisp b/optimize.lisp index 16b4b36..d538733 100644 --- a/optimize.lisp +++ b/optimize.lisp @@ -367,7 +367,7 @@ zero-length assertion.")) ends with wrapped into a STR object, otherwise NIL. OLD-CASE-INSENSITIVE-P is the CASE-INSENSITIVE-P slot of the last STR collected or :VOID if no STR has been collected yet. (This is a helper -function called by END-STRIN.)")) +function called by END-STRING.)")) (defmethod end-string-aux ((str str) &optional (old-case-insensitive-p :void)) diff --git a/regex-class.lisp b/regex-class.lisp index fb89c25..43c36e5 100644 --- a/regex-class.lisp +++ b/regex-class.lisp @@ -108,6 +108,10 @@ must appear after this repetition.") register.")) (:documentation "REPETITION objects represent repetitions of regexes.")) +(defmethod print-object ((repetition repetition) stream) + (print-unreadable-object (repetition stream :type t :identity t) + (princ (regex repetition) stream))) + (defclass register (regex) ((regex :initarg :regex :accessor regex @@ -122,6 +126,10 @@ This is the index into *REGS-START* and *REGS-END*.") :documentation "Name of this register or NIL.")) (:documentation "REGISTER objects represent register groups.")) +(defmethod print-object ((register register) stream) + (print-unreadable-object (register stream :type t :identity t) + (princ (regex register) stream))) + (defclass standalone (regex) ((regex :initarg :regex :accessor regex @@ -181,6 +189,10 @@ string because the SCAN function has done this already.") STR which starts END-STRING (a slot of MATCHER).")) (:documentation "STR objects represent string.")) +(defmethod print-object ((str str) stream) + (print-unreadable-object (str stream :type t :identity t) + (princ (str str) stream))) + (defclass anchor (regex) ((startp :initarg :startp :reader startp