git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@3581 4281704c-cde7-0310-8518-8e2dc76b1ff0
738 lines
33 KiB
Common Lisp
738 lines
33 KiB
Common Lisp
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
|
|
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.34 2008/07/06 22:36:30 edi Exp $
|
|
|
|
;;; The lexer's responsibility is to convert the regex string into a
|
|
;;; sequence of tokens which are in turn consumed by the parser.
|
|
;;;
|
|
;;; The lexer is aware of Perl's 'extended mode' and it also 'knows'
|
|
;;; (with a little help from the parser) how many register groups it
|
|
;;; has opened so far. (The latter is necessary for interpreting
|
|
;;; strings like "\\10" correctly.)
|
|
|
|
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
|
|
|
|
;;; Redistribution and use in source and binary forms, with or without
|
|
;;; modification, are permitted provided that the following conditions
|
|
;;; are met:
|
|
|
|
;;; * Redistributions of source code must retain the above copyright
|
|
;;; notice, this list of conditions and the following disclaimer.
|
|
|
|
;;; * Redistributions in binary form must reproduce the above
|
|
;;; copyright notice, this list of conditions and the following
|
|
;;; disclaimer in the documentation and/or other materials
|
|
;;; provided with the distribution.
|
|
|
|
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
|
|
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
(in-package :cl-ppcre)
|
|
|
|
(declaim (inline map-char-to-special-class))
|
|
(defun map-char-to-special-char-class (chr)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Maps escaped characters like \"\\d\" to the tokens which represent
|
|
their associated character classes."
|
|
(case chr
|
|
((#\d)
|
|
:digit-class)
|
|
((#\D)
|
|
:non-digit-class)
|
|
((#\w)
|
|
:word-char-class)
|
|
((#\W)
|
|
:non-word-char-class)
|
|
((#\s)
|
|
:whitespace-char-class)
|
|
((#\S)
|
|
:non-whitespace-char-class)))
|
|
|
|
(defstruct (lexer (:constructor make-lexer-internal))
|
|
"LEXER structures are used to hold the regex string which is
|
|
currently lexed and to keep track of the lexer's state."
|
|
(str "" :type string :read-only t)
|
|
(len 0 :type fixnum :read-only t)
|
|
(reg 0 :type fixnum)
|
|
(pos 0 :type fixnum)
|
|
(last-pos nil :type list))
|
|
|
|
(defun make-lexer (string)
|
|
(declare (inline make-lexer-internal)
|
|
#-:genera (string string))
|
|
(make-lexer-internal :str (maybe-coerce-to-simple-string string)
|
|
:len (length string)))
|
|
|
|
(declaim (inline end-of-string-p))
|
|
(defun end-of-string-p (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Tests whether we're at the end of the regex string."
|
|
(<= (lexer-len lexer)
|
|
(lexer-pos lexer)))
|
|
|
|
(declaim (inline looking-at-p))
|
|
(defun looking-at-p (lexer chr)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Tests whether the next character the lexer would see is CHR.
|
|
Does not respect extended mode."
|
|
(and (not (end-of-string-p lexer))
|
|
(char= (schar (lexer-str lexer) (lexer-pos lexer))
|
|
chr)))
|
|
|
|
(declaim (inline next-char-non-extended))
|
|
(defun next-char-non-extended (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Returns the next character which is to be examined and updates the
|
|
POS slot. Does not respect extended mode."
|
|
(cond ((end-of-string-p lexer) nil)
|
|
(t (prog1
|
|
(schar (lexer-str lexer) (lexer-pos lexer))
|
|
(incf (lexer-pos lexer))))))
|
|
|
|
(defun next-char (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Returns the next character which is to be examined and updates the
|
|
POS slot. Respects extended mode, i.e. whitespace, comments, and also
|
|
nested comments are skipped if applicable."
|
|
(let ((next-char (next-char-non-extended lexer))
|
|
last-loop-pos)
|
|
(loop
|
|
;; remember where we started
|
|
(setq last-loop-pos (lexer-pos lexer))
|
|
;; first we look for nested comments like (?#foo)
|
|
(when (and next-char
|
|
(char= next-char #\()
|
|
(looking-at-p lexer #\?))
|
|
(incf (lexer-pos lexer))
|
|
(cond ((looking-at-p lexer #\#)
|
|
;; must be a nested comment - so we have to search for
|
|
;; the closing parenthesis
|
|
(let ((error-pos (- (lexer-pos lexer) 2)))
|
|
(unless
|
|
;; loop 'til ')' or end of regex string and
|
|
;; return NIL if ')' wasn't encountered
|
|
(loop for skip-char = next-char
|
|
then (next-char-non-extended lexer)
|
|
while (and skip-char
|
|
(char/= skip-char #\)))
|
|
finally (return skip-char))
|
|
(signal-syntax-error* error-pos "Comment group not closed.")))
|
|
(setq next-char (next-char-non-extended lexer)))
|
|
(t
|
|
;; undo effect of previous INCF if we didn't see a #
|
|
(decf (lexer-pos lexer)))))
|
|
(when *extended-mode-p*
|
|
;; now - if we're in extended mode - we skip whitespace and
|
|
;; comments; repeat the following loop while we look at
|
|
;; whitespace or #\#
|
|
(loop while (and next-char
|
|
(or (char= next-char #\#)
|
|
(whitespacep next-char)))
|
|
do (setq next-char
|
|
(if (char= next-char #\#)
|
|
;; if we saw a comment marker skip until
|
|
;; we're behind #\Newline...
|
|
(loop for skip-char = next-char
|
|
then (next-char-non-extended lexer)
|
|
while (and skip-char
|
|
(char/= skip-char #\Newline))
|
|
finally (return (next-char-non-extended lexer)))
|
|
;; ...otherwise (whitespace) skip until we
|
|
;; see the next non-whitespace character
|
|
(loop for skip-char = next-char
|
|
then (next-char-non-extended lexer)
|
|
while (and skip-char
|
|
(whitespacep skip-char))
|
|
finally (return skip-char))))))
|
|
;; if the position has moved we have to repeat our tests
|
|
;; because of cases like /^a (?#xxx) (?#yyy) {3}c/x which
|
|
;; would be equivalent to /^a{3}c/ in Perl
|
|
(unless (> (lexer-pos lexer) last-loop-pos)
|
|
(return next-char)))))
|
|
|
|
(declaim (inline fail))
|
|
(defun fail (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Moves (LEXER-POS LEXER) back to the last position stored in
|
|
\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
|
|
(unless (lexer-last-pos lexer)
|
|
(signal-syntax-error "LAST-POS stack of LEXER ~A is empty." lexer))
|
|
(setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
|
|
nil)
|
|
|
|
(defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Read and consume the number the lexer is currently looking at and
|
|
return it. Returns NIL if no number could be identified.
|
|
RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
|
|
at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
|
|
we don't tolerate whitespace in front of the number."
|
|
(when (or (end-of-string-p lexer)
|
|
(and no-whitespace-p
|
|
(whitespacep (schar (lexer-str lexer) (lexer-pos lexer)))))
|
|
(return-from get-number nil))
|
|
(multiple-value-bind (integer new-pos)
|
|
(parse-integer (lexer-str lexer)
|
|
:start (lexer-pos lexer)
|
|
:end (if max-length
|
|
(let ((end-pos (+ (lexer-pos lexer)
|
|
(the fixnum max-length)))
|
|
(lexer-len (lexer-len lexer)))
|
|
(if (< end-pos lexer-len)
|
|
end-pos
|
|
lexer-len))
|
|
(lexer-len lexer))
|
|
:radix radix
|
|
:junk-allowed t)
|
|
(cond ((and integer (>= (the fixnum integer) 0))
|
|
(setf (lexer-pos lexer) new-pos)
|
|
integer)
|
|
(t nil))))
|
|
|
|
(declaim (inline try-number))
|
|
(defun try-number (lexer &key (radix 10) max-length no-whitespace-p)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Like GET-NUMBER but won't consume anything if no number is seen."
|
|
;; remember current position
|
|
(push (lexer-pos lexer) (lexer-last-pos lexer))
|
|
(let ((number (get-number lexer
|
|
:radix radix
|
|
:max-length max-length
|
|
:no-whitespace-p no-whitespace-p)))
|
|
(or number (fail lexer))))
|
|
|
|
(declaim (inline make-char-from-code))
|
|
(defun make-char-from-code (number error-pos)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Create character from char-code NUMBER. NUMBER can be NIL
|
|
which is interpreted as 0. ERROR-POS is the position where
|
|
the corresponding number started within the regex string."
|
|
;; only look at rightmost eight bits in compliance with Perl
|
|
(let ((code (logand #o377 (the fixnum (or number 0)))))
|
|
(or (and (< code char-code-limit)
|
|
(code-char code))
|
|
(signal-syntax-error* error-pos "No character for hex-code ~X." number))))
|
|
|
|
(defun unescape-char (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Convert the characters\(s) following a backslash into a token
|
|
which is returned. This function is to be called when the backslash
|
|
has already been consumed. Special character classes like \\W are
|
|
handled elsewhere."
|
|
(when (end-of-string-p lexer)
|
|
(signal-syntax-error "String ends with backslash."))
|
|
(let ((chr (next-char-non-extended lexer)))
|
|
(case chr
|
|
((#\E)
|
|
;; if \Q quoting is on this is ignored, otherwise it's just an
|
|
;; #\E
|
|
(if *allow-quoting*
|
|
:void
|
|
#\E))
|
|
((#\c)
|
|
;; \cx means control-x in Perl
|
|
(let ((next-char (next-char-non-extended lexer)))
|
|
(unless next-char
|
|
(signal-syntax-error* (lexer-pos lexer) "Character missing after '\\c' at position ~A."))
|
|
(code-char (logxor #x40 (char-code (char-upcase next-char))))))
|
|
((#\x)
|
|
;; \x should be followed by a hexadecimal char code,
|
|
;; two digits or less
|
|
(let* ((error-pos (lexer-pos lexer))
|
|
(number (get-number lexer :radix 16 :max-length 2 :no-whitespace-p t)))
|
|
;; note that it is OK if \x is followed by zero digits
|
|
(make-char-from-code number error-pos)))
|
|
((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
|
|
;; \x should be followed by an octal char code,
|
|
;; three digits or less
|
|
(let* ((error-pos (decf (lexer-pos lexer)))
|
|
(number (get-number lexer :radix 8 :max-length 3)))
|
|
(make-char-from-code number error-pos)))
|
|
;; the following five character names are 'semi-standard'
|
|
;; according to the CLHS but I'm not aware of any implementation
|
|
;; that doesn't implement them
|
|
((#\t)
|
|
#\Tab)
|
|
((#\n)
|
|
#\Newline)
|
|
((#\r)
|
|
#\Return)
|
|
((#\f)
|
|
#\Page)
|
|
((#\b)
|
|
#\Backspace)
|
|
((#\a)
|
|
(code-char 7)) ; ASCII bell
|
|
((#\e)
|
|
(code-char 27)) ; ASCII escape
|
|
(otherwise
|
|
;; all other characters aren't affected by a backslash
|
|
chr))))
|
|
|
|
(defun read-char-property (lexer first-char)
|
|
(declare #.*standard-optimize-settings*)
|
|
(unless (eql (next-char-non-extended lexer) #\{)
|
|
(signal-syntax-error* (lexer-pos lexer) "Expected left brace after \\~A." first-char))
|
|
(let ((name (with-output-to-string (out nil :element-type
|
|
#+:lispworks 'lw:simple-char #-:lispworks 'character)
|
|
(loop
|
|
(let ((char (or (next-char-non-extended lexer)
|
|
(signal-syntax-error "Unexpected EOF after \\~A{." first-char))))
|
|
(when (char= char #\})
|
|
(return))
|
|
(write-char char out))))))
|
|
(list (if (char= first-char #\p) :property :inverted-property)
|
|
;; we must reverse here because of what PARSE-STRING does
|
|
(nreverse name))))
|
|
|
|
(defun collect-char-class (lexer)
|
|
"Reads and consumes characters from regex string until a right
|
|
bracket is seen. Assembles them into a list \(which is returned) of
|
|
characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
|
|
tokens representing special character classes."
|
|
(declare #.*standard-optimize-settings*)
|
|
(let ((start-pos (lexer-pos lexer)) ; remember start for error message
|
|
hyphen-seen
|
|
last-char
|
|
list)
|
|
(flet ((handle-char (c)
|
|
"Do the right thing with character C depending on whether
|
|
we're inside a range or not."
|
|
(cond ((and hyphen-seen last-char)
|
|
(setf (car list) (list :range last-char c)
|
|
last-char nil))
|
|
(t
|
|
(push c list)
|
|
(setq last-char c)))
|
|
(setq hyphen-seen nil)))
|
|
(loop for first = t then nil
|
|
for c = (next-char-non-extended lexer)
|
|
;; leave loop if at end of string
|
|
while c
|
|
do (cond
|
|
((char= c #\\)
|
|
;; we've seen a backslash
|
|
(let ((next-char (next-char-non-extended lexer)))
|
|
(case next-char
|
|
((#\d #\D #\w #\W #\s #\S)
|
|
;; a special character class
|
|
(push (map-char-to-special-char-class next-char) list)
|
|
;; if the last character was a hyphen
|
|
;; just collect it literally
|
|
(when hyphen-seen
|
|
(push #\- list))
|
|
;; if the next character is a hyphen do the same
|
|
(when (looking-at-p lexer #\-)
|
|
(push #\- list)
|
|
(incf (lexer-pos lexer)))
|
|
(setq hyphen-seen nil))
|
|
((#\P #\p)
|
|
;; maybe a character property
|
|
(cond ((null *property-resolver*)
|
|
(handle-char next-char))
|
|
(t
|
|
(push (read-char-property lexer next-char) list)
|
|
;; if the last character was a hyphen
|
|
;; just collect it literally
|
|
(when hyphen-seen
|
|
(push #\- list))
|
|
;; if the next character is a hyphen do the same
|
|
(when (looking-at-p lexer #\-)
|
|
(push #\- list)
|
|
(incf (lexer-pos lexer)))
|
|
(setq hyphen-seen nil))))
|
|
((#\E)
|
|
;; if \Q quoting is on we ignore \E,
|
|
;; otherwise it's just a plain #\E
|
|
(unless *allow-quoting*
|
|
(handle-char #\E)))
|
|
(otherwise
|
|
;; otherwise unescape the following character(s)
|
|
(decf (lexer-pos lexer))
|
|
(handle-char (unescape-char lexer))))))
|
|
(first
|
|
;; the first character must not be a right bracket
|
|
;; and isn't treated specially if it's a hyphen
|
|
(handle-char c))
|
|
((char= c #\])
|
|
;; end of character class
|
|
;; make sure we collect a pending hyphen
|
|
(when hyphen-seen
|
|
(setq hyphen-seen nil)
|
|
(handle-char #\-))
|
|
;; reverse the list to preserve the order intended
|
|
;; by the author of the regex string
|
|
(return-from collect-char-class (nreverse list)))
|
|
((and (char= c #\-)
|
|
last-char
|
|
(not hyphen-seen))
|
|
;; if the last character was 'just a character'
|
|
;; we expect to be in the middle of a range
|
|
(setq hyphen-seen t))
|
|
((char= c #\-)
|
|
;; otherwise this is just an ordinary hyphen
|
|
(handle-char #\-))
|
|
(t
|
|
;; default case - just collect the character
|
|
(handle-char c))))
|
|
;; we can only exit the loop normally if we've reached the end
|
|
;; of the regex string without seeing a right bracket
|
|
(signal-syntax-error* start-pos "Missing right bracket to close character class."))))
|
|
|
|
(defun maybe-parse-flags (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Reads a sequence of modifiers \(including #\\- to reverse their
|
|
meaning) and returns a corresponding list of \"flag\" tokens. The
|
|
\"x\" modifier is treated specially in that it dynamically modifies
|
|
the behaviour of the lexer itself via the special variable
|
|
*EXTENDED-MODE-P*."
|
|
(prog1
|
|
(loop with set = t
|
|
for chr = (next-char-non-extended lexer)
|
|
unless chr
|
|
do (signal-syntax-error "Unexpected end of string.")
|
|
while (find chr "-imsx" :test #'char=)
|
|
;; the first #\- will invert the meaning of all modifiers
|
|
;; following it
|
|
if (char= chr #\-)
|
|
do (setq set nil)
|
|
else if (char= chr #\x)
|
|
do (setq *extended-mode-p* set)
|
|
else collect (if set
|
|
(case chr
|
|
((#\i)
|
|
:case-insensitive-p)
|
|
((#\m)
|
|
:multi-line-mode-p)
|
|
((#\s)
|
|
:single-line-mode-p))
|
|
(case chr
|
|
((#\i)
|
|
:case-sensitive-p)
|
|
((#\m)
|
|
:not-multi-line-mode-p)
|
|
((#\s)
|
|
:not-single-line-mode-p))))
|
|
(decf (lexer-pos lexer))))
|
|
|
|
(defun get-quantifier (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Returns a list of two values (min max) if what the lexer is looking
|
|
at can be interpreted as a quantifier. Otherwise returns NIL and
|
|
resets the lexer to its old position."
|
|
;; remember starting position for FAIL and UNGET-TOKEN functions
|
|
(push (lexer-pos lexer) (lexer-last-pos lexer))
|
|
(let ((next-char (next-char lexer)))
|
|
(case next-char
|
|
((#\*)
|
|
;; * (Kleene star): match 0 or more times
|
|
'(0 nil))
|
|
((#\+)
|
|
;; +: match 1 or more times
|
|
'(1 nil))
|
|
((#\?)
|
|
;; ?: match 0 or 1 times
|
|
'(0 1))
|
|
((#\{)
|
|
;; one of
|
|
;; {n}: match exactly n times
|
|
;; {n,}: match at least n times
|
|
;; {n,m}: match at least n but not more than m times
|
|
;; note that anything not matching one of these patterns will
|
|
;; be interpreted literally - even whitespace isn't allowed
|
|
(let ((num1 (get-number lexer :no-whitespace-p t)))
|
|
(if num1
|
|
(let ((next-char (next-char-non-extended lexer)))
|
|
(case next-char
|
|
((#\,)
|
|
(let* ((num2 (get-number lexer :no-whitespace-p t))
|
|
(next-char (next-char-non-extended lexer)))
|
|
(case next-char
|
|
((#\})
|
|
;; this is the case {n,} (NUM2 is NIL) or {n,m}
|
|
(list num1 num2))
|
|
(otherwise
|
|
(fail lexer)))))
|
|
((#\})
|
|
;; this is the case {n}
|
|
(list num1 num1))
|
|
(otherwise
|
|
(fail lexer))))
|
|
;; no number following left curly brace, so we treat it
|
|
;; like a normal character
|
|
(fail lexer))))
|
|
;; cannot be a quantifier
|
|
(otherwise
|
|
(fail lexer)))))
|
|
|
|
(defun parse-register-name-aux (lexer)
|
|
"Reads and returns the name in a named register group. It is
|
|
assumed that the starting #\< character has already been read. The
|
|
closing #\> will also be consumed."
|
|
;; we have to look for an ending > character now
|
|
(let ((end-name (position #\>
|
|
(lexer-str lexer)
|
|
:start (lexer-pos lexer)
|
|
:test #'char=)))
|
|
(unless end-name
|
|
;; there has to be > somewhere, syntax error otherwise
|
|
(signal-syntax-error* (1- (lexer-pos lexer)) "Opening #\< in named group has no closing #\>."))
|
|
(let ((name (subseq (lexer-str lexer)
|
|
(lexer-pos lexer)
|
|
end-name)))
|
|
(unless (every #'(lambda (char)
|
|
(or (alphanumericp char)
|
|
(char= #\- char)))
|
|
name)
|
|
;; register name can contain only alphanumeric characters or #\-
|
|
(signal-syntax-error* (lexer-pos lexer) "Invalid character in named register group."))
|
|
;; advance lexer beyond "<name>" part
|
|
(setf (lexer-pos lexer) (1+ end-name))
|
|
name)))
|
|
|
|
(defun get-token (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Returns and consumes the next token from the regex string \(or NIL)."
|
|
;; remember starting position for UNGET-TOKEN function
|
|
(push (lexer-pos lexer)
|
|
(lexer-last-pos lexer))
|
|
(let ((next-char (next-char lexer)))
|
|
(cond (next-char
|
|
(case next-char
|
|
;; the easy cases first - the following six characters
|
|
;; always have a special meaning and get translated
|
|
;; into tokens immediately
|
|
((#\))
|
|
:close-paren)
|
|
((#\|)
|
|
:vertical-bar)
|
|
((#\?)
|
|
:question-mark)
|
|
((#\.)
|
|
:everything)
|
|
((#\^)
|
|
:start-anchor)
|
|
((#\$)
|
|
:end-anchor)
|
|
((#\+ #\*)
|
|
;; quantifiers will always be consumend by
|
|
;; GET-QUANTIFIER, they must not appear here
|
|
(signal-syntax-error* (1- (lexer-pos lexer)) "Quantifier '~A' not allowed." next-char))
|
|
((#\{)
|
|
;; left brace isn't a special character in it's own
|
|
;; right but we must check if what follows might
|
|
;; look like a quantifier
|
|
(let ((this-pos (lexer-pos lexer))
|
|
(this-last-pos (lexer-last-pos lexer)))
|
|
(unget-token lexer)
|
|
(when (get-quantifier lexer)
|
|
(signal-syntax-error* (car this-last-pos)
|
|
"Quantifier '~A' not allowed."
|
|
(subseq (lexer-str lexer)
|
|
(car this-last-pos)
|
|
(lexer-pos lexer))))
|
|
(setf (lexer-pos lexer) this-pos
|
|
(lexer-last-pos lexer) this-last-pos)
|
|
next-char))
|
|
((#\[)
|
|
;; left bracket always starts a character class
|
|
(cons (cond ((looking-at-p lexer #\^)
|
|
(incf (lexer-pos lexer))
|
|
:inverted-char-class)
|
|
(t
|
|
:char-class))
|
|
(collect-char-class lexer)))
|
|
((#\\)
|
|
;; backslash might mean different things so we have
|
|
;; to peek one char ahead:
|
|
(let ((next-char (next-char-non-extended lexer)))
|
|
(case next-char
|
|
((#\A)
|
|
:modeless-start-anchor)
|
|
((#\Z)
|
|
:modeless-end-anchor)
|
|
((#\z)
|
|
:modeless-end-anchor-no-newline)
|
|
((#\b)
|
|
:word-boundary)
|
|
((#\B)
|
|
:non-word-boundary)
|
|
((#\k)
|
|
(cond ((and *allow-named-registers*
|
|
(looking-at-p lexer #\<))
|
|
;; back-referencing a named register
|
|
(incf (lexer-pos lexer))
|
|
(list :back-reference
|
|
(nreverse (parse-register-name-aux lexer))))
|
|
(t
|
|
;; false alarm, just unescape \k
|
|
#\k)))
|
|
((#\d #\D #\w #\W #\s #\S)
|
|
;; these will be treated like character classes
|
|
(map-char-to-special-char-class next-char))
|
|
((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
|
|
;; uh, a digit...
|
|
(let* ((old-pos (decf (lexer-pos lexer)))
|
|
;; ...so let's get the whole number first
|
|
(backref-number (get-number lexer)))
|
|
(declare (fixnum backref-number))
|
|
(cond ((and (> backref-number (lexer-reg lexer))
|
|
(<= 10 backref-number))
|
|
;; \10 and higher are treated as octal
|
|
;; character codes if we haven't
|
|
;; opened that much register groups
|
|
;; yet
|
|
(setf (lexer-pos lexer) old-pos)
|
|
;; re-read the number from the old
|
|
;; position and convert it to its
|
|
;; corresponding character
|
|
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
|
|
old-pos))
|
|
(t
|
|
;; otherwise this must refer to a
|
|
;; backreference
|
|
(list :back-reference backref-number)))))
|
|
((#\0)
|
|
;; this always means an octal character code
|
|
;; (at most three digits)
|
|
(let ((old-pos (decf (lexer-pos lexer))))
|
|
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
|
|
old-pos)))
|
|
((#\P #\p)
|
|
;; might be a named property
|
|
(cond (*property-resolver* (read-char-property lexer next-char))
|
|
(t next-char)))
|
|
(otherwise
|
|
;; in all other cases just unescape the
|
|
;; character
|
|
(decf (lexer-pos lexer))
|
|
(unescape-char lexer)))))
|
|
((#\()
|
|
;; an open parenthesis might mean different things
|
|
;; depending on what follows...
|
|
(cond ((looking-at-p lexer #\?)
|
|
;; this is the case '(?' (and probably more behind)
|
|
(incf (lexer-pos lexer))
|
|
;; we have to check for modifiers first
|
|
;; because a colon might follow
|
|
(let* ((flags (maybe-parse-flags lexer))
|
|
(next-char (next-char-non-extended lexer)))
|
|
;; modifiers are only allowed if a colon
|
|
;; or a closing parenthesis are following
|
|
(when (and flags
|
|
(not (find next-char ":)" :test #'char=)))
|
|
(signal-syntax-error* (car (lexer-last-pos lexer))
|
|
"Sequence '~A' not recognized."
|
|
(subseq (lexer-str lexer)
|
|
(car (lexer-last-pos lexer))
|
|
(lexer-pos lexer))))
|
|
(case next-char
|
|
((nil)
|
|
;; syntax error
|
|
(signal-syntax-error "End of string following '(?'."))
|
|
((#\))
|
|
;; an empty group except for the flags
|
|
;; (if there are any)
|
|
(or (and flags
|
|
(cons :flags flags))
|
|
:void))
|
|
((#\()
|
|
;; branch
|
|
:open-paren-paren)
|
|
((#\>)
|
|
;; standalone
|
|
:open-paren-greater)
|
|
((#\=)
|
|
;; positive look-ahead
|
|
:open-paren-equal)
|
|
((#\!)
|
|
;; negative look-ahead
|
|
:open-paren-exclamation)
|
|
((#\:)
|
|
;; non-capturing group - return flags as
|
|
;; second value
|
|
(values :open-paren-colon flags))
|
|
((#\<)
|
|
;; might be a look-behind assertion or a named group, so
|
|
;; check next character
|
|
(let ((next-char (next-char-non-extended lexer)))
|
|
(if (alpha-char-p next-char)
|
|
(progn
|
|
;; we have encountered a named group
|
|
;; are we supporting register naming?
|
|
(unless *allow-named-registers*
|
|
(signal-syntax-error* (1- (lexer-pos lexer))
|
|
"Character '~A' may not follow '(?<'."
|
|
next-char))
|
|
;; put the letter back
|
|
(decf (lexer-pos lexer))
|
|
;; named group
|
|
:open-paren-less-letter)
|
|
(case next-char
|
|
((#\=)
|
|
;; positive look-behind
|
|
:open-paren-less-equal)
|
|
((#\!)
|
|
;; negative look-behind
|
|
:open-paren-less-exclamation)
|
|
((#\))
|
|
;; Perl allows "(?<)" and treats
|
|
;; it like a null string
|
|
:void)
|
|
((nil)
|
|
;; syntax error
|
|
(signal-syntax-error "End of string following '(?<'."))
|
|
(t
|
|
;; also syntax error
|
|
(signal-syntax-error* (1- (lexer-pos lexer))
|
|
"Character '~A' may not follow '(?<'."
|
|
next-char ))))))
|
|
(otherwise
|
|
(signal-syntax-error* (1- (lexer-pos lexer))
|
|
"Character '~A' may not follow '(?'."
|
|
next-char)))))
|
|
(t
|
|
;; if next-char was not #\? (this is within
|
|
;; the first COND), we've just seen an opening
|
|
;; parenthesis and leave it like that
|
|
:open-paren)))
|
|
(otherwise
|
|
;; all other characters are their own tokens
|
|
next-char)))
|
|
;; we didn't get a character (this if the "else" branch from
|
|
;; the first IF), so we don't return a token but NIL
|
|
(t
|
|
(pop (lexer-last-pos lexer))
|
|
nil))))
|
|
|
|
(declaim (inline unget-token))
|
|
(defun unget-token (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Moves the lexer back to the last position stored in the LAST-POS stack."
|
|
(if (lexer-last-pos lexer)
|
|
(setf (lexer-pos lexer)
|
|
(pop (lexer-last-pos lexer)))
|
|
(error "No token to unget \(this should not happen)")))
|
|
|
|
(declaim (inline start-of-subexpr-p))
|
|
(defun start-of-subexpr-p (lexer)
|
|
(declare #.*standard-optimize-settings*)
|
|
"Tests whether the next token can start a valid sub-expression, i.e.
|
|
a stand-alone regex."
|
|
(let* ((pos (lexer-pos lexer))
|
|
(next-char (next-char lexer)))
|
|
(not (or (null next-char)
|
|
(prog1
|
|
(member (the character next-char)
|
|
'(#\) #\|)
|
|
:test #'char=)
|
|
(setf (lexer-pos lexer) pos))))))
|