diff --git a/src/main/url.cl b/src/main/url.cl index 7ff2fb7..e95df90 100644 --- a/src/main/url.cl +++ b/src/main/url.cl @@ -10,15 +10,19 @@ ;;;; %File Description: ;;;; -;;;; This file provides all the facilities for the parsing of URLs -;;;; into their component parts, as well as the mapping mechanism from -;;;; URLs to resources. +;;;; This file provides all the facilities for parsing and unparsing +;;;; of URIs into/from their component parts, as well as features to +;;;; manipulate and handle parsed URIs. This file tries to conform to +;;;; RFC 2396, which differs in important aspects from earlier RFCs +;;;; that defined URLs, URNs and URIs, like RFC 1738 and RFC 1808. +;;;; All in all the new definition seems much more consistent and +;;;; useful, and is therefore preferred. ;;;; ;;; Conditions (define-condition url-error (clash-error) ((url-string :initarg :url-string :reader url-error-url-string)) - (:default-initargs :code +HTTP-Code-Not-Found+) + (:default-initargs :code +HTTP-Code-Bad-Request+) (:report (lambda (condition stream) (report-clash-error condition stream "Unspecified URL parsing error on URL ~A" @@ -33,7 +37,34 @@ (url-unknown-scheme-error-scheme condition) (url-error-url-string condition)))))) -;;; Constants +(define-condition url-syntax-error (url-error) + () + (:report (lambda (condition stream) + (report-clash-error condition stream + "URL syntax error while parsing URL ~A" + (list (url-error-url-string condition)))))) + +(define-condition url-illegal-escape-error (url-syntax-error) + ((escape-seq :initarg :escape-seq + :reader url-illegal-escape-error-escape-seq)) + (:report (lambda (condition stream) + (report-clash-error condition stream + "URL syntax error (Illegal URL escape sequence ~A) while parsing URL ~A" + (list + (url-illegal-escape-error-escape-seq + condition) + (url-error-url-string condition)))))) + +(define-condition url-simple-syntax-error (url-syntax-error) + ((reason :initarg :reason :reader url-simple-syntax-error-reason)) + (:report (lambda (condition stream) + (report-clash-error condition stream + "URL syntax error (~A) while parsing URL ~A" + (list + (url-simple-syntax-error-reason condition) + (url-error-url-string condition)))))) + +;;; Constants and predicates on URI component characters (defconstant +url-scheme-separator+ #\: "Separator Character for URL schemes.") @@ -41,49 +72,305 @@ (defconstant +url-fragment-separator+ #\# "Separator Character for URL fragments, which are not part of URLs.") +(defconstant +url-escape-character+ #\% + "Character that indicates an escaped character.") + (defconstant +url-host-port-separator+ #\: "Host:Port Separator Character for some URL schemes.") (defconstant +url-path-separator+ #\/ "Path Separator Character for some URL schemes.") +(defconstant +url-param-separator+ #\; + "Parameter Separator Character for some URL schemes.") + (defconstant +url-query-separator+ #\? "Query Separator Character for some URL schemes.") -(defconstant +url-port-host-indicator+ "//" +(defconstant +url-query-key-separator+ #\= + "Separator of query keywords from query values.") + +(defconstant +url-query-pair-separator+ #\& + "Separator of query pairs.") + +(defconstant +url-authority-indicator+ "//" "String that when present at the beginning of some scheme-specific parts -indicates the presence of host and/or port information.") +indicates the presence of authority information.") + +;;; URI character class predicates + +(defmacro def-uri-char-p (name char &body body) + "Declares an inlined predicate function on characters that is named +`URI-name-CHAR-P', has a lambda list of (char) and the body body." + (let ((realname (intern (format nil "URI-~A-CHAR-P" (symbol-name name)) + (symbol-package name)))) + `(progn + (declaim (inline ,realname)) + (defun ,realname (,char) + (declare (optimize (speed 3)) (type character ,char)) + ,@body)))) + +(def-uri-char-p lowalpha char + (char<= #\a char #\z)) + +(def-uri-char-p upalpha char + (char<= #\A char #\Z)) + +(def-uri-char-p alpha char + (or (uri-lowalpha-char-p char) + (uri-upalpha-char-p char))) + +(def-uri-char-p digit char + (char<= #\0 char #\9)) + +(def-uri-char-p alphanum char + (or (uri-alpha-char-p char) + (uri-digit-char-p char))) + +(def-uri-char-p mark char + (or (char= char #\-) + (char= char #\_) + (char= char #\.) + (char= char #\!) + (char= char #\~) + (char= char #\*) + (char= char #\') + (char= char #\() + (char= char #\)))) + +(def-uri-char-p unreserved char + (or (uri-alphanum-char-p char) + (uri-mark-char-p char))) + +(def-uri-char-p reserved char + (or (char= char #\;) + (char= char #\/) + (char= char #\?) + (char= char #\:) + (char= char #\@) + (char= char #\&) + (char= char #\=) + (char= char #\+) + (char= char #\$) + (char= char #\,))) + +(def-uri-char-p scheme char + (or (uri-alphanum-char-p char) + (char= char #\+) + (char= char #\-) + (char= char #\.))) + +(def-uri-char-p host char + (or (uri-alphanum-char-p char) + (char= char #\-) + (char= char #\.))) + +(def-uri-char-p port char + (uri-digit-char-p char)) ;;; URLs (defclass url () ((source-string :initarg :source-string :reader url-source-string - :initform nil))) + :initform nil) + (scheme :initarg :scheme :reader url-scheme))) (defgeneric url-string (url) (:documentation "Renders the given URL object into a new string and returns that.")) -(defgeneric url-scheme (url) - (:documentation "Return a string with the URLs canonical scheme.")) - -(defgeneric url-specific-part (url) +(defgeneric url-specific-string (url) (:documentation "Return a string with the URLs scheme-specific part.")) -(defgeneric url-significant-string (url) +(defgeneric url-significant-string (url)) + +(defgeneric merge-urls (url-a url-b) (:documentation - "Return a string which contains those parts of the URL, that are necessary -to identify a resource for the HTTP server to look-up the provider of that -resource. For example with HTTP-URLs this excludes the search-key part.")) + "Merges two URLs into a new URL which is returned. Merging is done +in a component-wise fashion, with url-a taking precedence over url-b. +Merging is only defined on usefully similar URLs, i.e. URLs that are +part of the same scheme.")) (defmethod url-string ((url url)) - (format nil "~A:~A" + (format nil "~@[~A:~]~A" (url-scheme url) - (url-specific-part url))) + (url-specific-string url))) (defmethod print-object ((obj url) stream) (print-unreadable-object (obj stream :type t :identity nil) - (format stream "~S" (url-specific-part obj)))) + (write-string (url-string obj) stream))) + +;;; Schemes and Parsers + +(defvar *scheme-parser-map* nil + "Mapping from schemes to parsers.") + +(defun register-scheme (scheme parser-function) + "Registers a new URL scheme." + (let ((scheme (string-downcase scheme))) + (let ((present (assoc scheme *scheme-parser-map* :test #'string=))) + (if present + (setf (cdr present) parser-function) + (push (cons scheme parser-function) *scheme-parser-map*))) + scheme)) + +(defun unregister-scheme (scheme) + "Unregisters an URL scheme." + (let ((scheme (string-downcase scheme))) + (let ((present (assoc scheme *scheme-parser-map* :test #'string=))) + (if present + (setf *scheme-parser-map* + (delete present *scheme-parser-map*)) + (error "URL scheme ~A is not defined." scheme)) + present))) + +(defun lookup-scheme-parser (string) + (cdr + (assoc string *scheme-parser-map* :test #'string-equal))) + +(defun reset-schemes () + "Clear the list of recognized schemes." + (setq *scheme-parser-map* nil)) + +;;; Parsing of URLs + +(defparameter *parse-url-default-base* "http" + "This parameter supplies the default value to the `base' parameter +of `parse-url-from-string'. Set this to nil to prevent parsing of +relative URIs.") + +(defun parse-url-from-string (string &key (base *parse-url-default-base*) + (clear-fragment t)) + "Parse a URI from the simple string `string' and return the URL +object that represents the parsed URI. If `base' is supplied, it must +either be a string indicating an url-scheme or a valid Absolute-URI. +If a string is passed, then a relative URI for this scheme will be +parsed and returned. If a URI is passed, then a relative URI for the +absolute URI's scheme will be parsed and merged with this base +URL. Otherwise trying to parse a relative URI will result in an error. + +If `clear-fragment' is true, then any fragment present in the URI +string will be cleared before parsing begins and returned as a second +value. Otherwise it will be handled by the scheme-specific parsing +routine (usually by including it in the relevant scheme-specific part +of the URL, e.g. the path or query component). + +This function will raise conditions of type `url-error' if parsing +fails. The raised condition will be of type `url-syntax-error' if the +failure results from a syntactic error in the URI string passed." + (let* ((base-url (and (not (stringp base)) base)) + (base-scheme (and base (if (stringp base) base (url-scheme base)))) + (end (or (and clear-fragment + (position +url-fragment-separator+ + string :from-end t)) + (length string))) + (fragment (and clear-fragment + (< end (length string)) + (subseq string (1+ end))))) + (let ((scheme-end (position-if-not #'uri-scheme-char-p string :end end))) + (cond + ;; Absolute URI, i.e. scheme is included + ((and scheme-end + (uri-alpha-char-p (schar string 0)) + (char= +url-scheme-separator+ (schar string scheme-end))) + (let* ((scheme (nstring-downcase (subseq string 0 scheme-end))) + (parser (lookup-scheme-parser scheme))) + (unless parser + (error 'url-unknown-scheme-error + :scheme scheme :url-string string)) + (values (funcall parser scheme string (1+ scheme-end) end nil) + fragment))) + (t + ;; A relative URI, check if we are allowed to parse relative-URIs + (unless base-scheme + (error 'url-error :url-string string)) + (let ((parser (lookup-scheme-parser base-scheme))) + (unless parser + (error 'url-unknown-scheme-error + :scheme base-scheme :url-string string)) + (values (funcall parser nil string 0 end base-url) + fragment))))))) + +(defun escape-url-string (string &optional (allowed "")) + (declare (type simple-string string allowed) + (optimize (speed 3))) + (flet ((escape-p (char) + (declare (type character char)) + (not + (or + (char<= #\0 char #\9) + (char<= #\A char #\Z) + (char<= #\a char #\z) + (char= char #\-) + (char= char #\_) + (char= char #\.) + (char= char #\!) + (char= char #\~) + (char= char #\*) + (char= char #\') + (find char allowed)))) + (escape-char (char string pos) + (declare (type character char) (type simple-string string) + (type (string-inner-index 3) pos)) + (setf (schar string pos) #\%) + (let* ((value (char-code char)) + (msn-char (digit-char (ldb (byte 4 4) value) 16)) + (lsn-char (digit-char (ldb (byte 4 0) value) 16))) + (setf (schar string (1+ pos)) msn-char + (schar string (+ pos 2)) lsn-char)))) + (let ((escape-count (count-if #'escape-p string))) + (when (zerop escape-count) + (return-from escape-url-string string)) + (do* ((length (length string)) + (result (make-string (+ length (* escape-count 2)))) + (result-pos 0 (1+ result-pos)) + (source-pos 0 (1+ source-pos))) + ((= source-pos length) result) + (declare (type (string-inner-index 3) result-pos source-pos)) + (let ((char (schar string source-pos))) + (cond + ((escape-p char) + (escape-char char result result-pos) + (incf result-pos 2)) + (t + (setf (schar result result-pos) char)))))))) + +(defun unescape-url-subseq (string start end) + "Copies a subsequence of a string, unescaping escaped characters on the way." + (declare (type simple-string string) + (type string-index start end) + (optimize (speed 3))) + (let ((first-escaped-pos (position +url-escape-character+ string + :start start :end end))) + (unless first-escaped-pos + (return-from unescape-url-subseq (subseq string start end))) + (let ((escape-end (- end 3))) + (flet ((unescaped-char (pos) + (unless (<= pos escape-end) + (error 'url-illegal-escape-error + :escape-seq (subseq string pos) + :url-string string)) + (let ((msn-value (digit-char-p (schar string (1+ pos)) 16)) + (lsn-value (digit-char-p (schar string (+ pos 2)) 16))) + (unless (and msn-value lsn-value) + (error 'url-illegal-escape-error + :escape-seq (subseq string pos (+ pos 3)) + :url-string string)) + (code-char (+ (* 16 msn-value) lsn-value))))) + (do* ((escape-count (count +url-escape-character+ string + :start start :end end)) + (result-string (subseq string start (- end (* 2 escape-count)))) + (result-pos (- first-escaped-pos start) (1+ result-pos)) + (source-pos first-escaped-pos (1+ source-pos))) + ((>= source-pos end) result-string) + (declare (type string-index result-pos source-pos) + (type simple-string result-string)) + (when (char= +url-escape-character+ + (setf (schar result-string result-pos) + (schar string source-pos))) + (setf (schar result-string result-pos) + (unescaped-char source-pos)) + (incf source-pos 2))))))) ;;; The following are some commonly used mix-ins for URLs. @@ -91,95 +378,253 @@ resource. For example with HTTP-URLs this excludes the search-key part.")) ((host :initarg :host :initform nil :reader url-host) (port :initarg :port :initform nil :reader url-port))) +(defgeneric url-authority-p (url) + (:documentation + "Returns true if the given url supplies an authority component.") + (:method (url) (declare (ignorable url)) nil)) + +(defmethod url-authority-p ((url url-host-port-mixin)) + (or (url-host url) (url-port url))) + +(defgeneric url-authority-string (url) + (:documentation "Return the url authority part as a formatted string.")) + +(defmethod url-authority-string ((url url-host-port-mixin)) + (format nil "~:[~;//~@[~A~]~@[:~D~]~]" + (url-authority-p url) + (let ((host (url-host url))) + (if host (escape-url-string host) nil)) + (url-port url))) + +(defun parse-host-port-part (url start end) + (let* ((host-end (loop for index from start below end + for char = (schar url index) + while (uri-host-char-p char) + finally + (return index))) + (port-end (if (and (< host-end end) + (char= (schar url host-end) + +url-host-port-separator+)) + (loop for index from (1+ host-end) below end + for char = (schar url index) + while (uri-port-char-p char) + finally (return index)) + host-end))) + (values + (subseq url start host-end) + (and (> port-end host-end) + (parse-integer url :start (1+ host-end) :end port-end)) + port-end))) + (defclass url-path-mixin () - ((path :initarg :path :initform nil :reader url-path))) + ((directory :initarg :directory :initform nil :reader url-directory) + (params :initarg :params :initform nil :reader url-params) + (name :initarg :name :initform nil :reader url-name) + (name-param :initarg :name-param :initform nil :reader url-name-param))) -(defclass url-parameter-mixin () - ((parameters :initarg :parameters :initform nil :reader url-parameters))) +(defgeneric url-path-p (url) + (:documentation + "Returns true if the given url supplies a path component.") + (:method (url) (declare (ignorable url)) nil)) -(defclass url-search-mixin () - ((searchpart :initarg :searchpart :initform nil :reader url-searchpart))) +(defmethod url-path-p ((url url-path-mixin)) + (or (url-directory url) (url-name url))) -;;; Schemes and Parsers +(defgeneric url-path-string (url) + (:documentation "Return the url path part as a formatted string.")) -(defvar *default-url-scheme* "http" - "This specifies the default scheme to use for parsing of URLs that -don't provide a scheme by themselves.") +(defmethod url-path-string ((url url-path-mixin)) + (let ((directory (url-directory url)) + (name (url-name url)) + (params (url-params url)) + (name-param (url-name-param url))) + (cond + ((or params name-param) + ;; Handle Parameters correctly + (error "URL's with path parameters not yet implemented")) + (t + (format nil "~:[~;~:[/~;~]~{~A/~}~@[~A~]~]" + (or directory name) + (eq (first directory) :relative) + (mapcar #'escape-url-string (rest directory)) + (and name (escape-url-string name))))))) -(defvar *scheme-parser-map* (make-hash-table :test #'equal) - "Mapping from schemes to parsers.") +(defun parse-path-part (url start end &optional allow-params) + (cond + ((zerop (- end start)) + ;; No path supplied + (values nil nil nil)) + (allow-params + (loop with absolute-p = (char= (schar url start) +url-path-separator+) + with real-start = (if absolute-p (1+ start) start) + with name = nil + with name-param = nil + for segment-start = real-start then (min end (1+ segment-end)) + for segment-end = (or (position +url-path-separator+ url + :start segment-start :end end) + end) + for param-start = (or (position +url-param-separator+ url + :start segment-start + :end segment-end) + segment-end) + while (< segment-start end) + if (= segment-end end) + do + (setq name + (unescape-url-subseq url segment-start param-start) + name-param + (unescape-url-subseq url (1+ param-start) segment-end)) + else + collect + (unescape-url-subseq url segment-start param-start) + into directory and + collect + (when (< param-start segment-end) + (unescape-url-subseq url (1+ param-start) segment-end)) + into params + end + finally + (return (values (cons (if absolute-p :absolute :relative) directory) + name + (cons nil params) + name-param)))) + (t + (loop with absolute-p = (char= (schar url start) +url-path-separator+) + with real-start = (if absolute-p (1+ start) start) + with name = nil + for segment-start = real-start then (min end (1+ segment-end)) + for segment-end = (or (position +url-path-separator+ url + :start segment-start :end end) + end) + while (< segment-start end) + if (= segment-end end) + do + (setq name + (unescape-url-subseq url segment-start segment-end)) + else + collect + (unescape-url-subseq url segment-start segment-end) + into directory + end + finally + (return (values (cons (if absolute-p :absolute :relative) directory) + name)))))) -(defun register-scheme (scheme parser-function) - "Registers a new URL scheme." - (let ((scheme (string-downcase scheme))) - (setf (gethash scheme *scheme-parser-map*) parser-function))) +(defclass url-query-mixin () + ((query-arguments :initarg :query-arguments :initform nil + :reader url-query-arguments))) -(defun unregister-scheme (scheme) - "Unregisters an URL scheme." - (let ((scheme (string-downcase scheme))) - (remhash scheme *scheme-parser-map*))) +(defgeneric url-query-string (url)) -(defun reset-schemes () - "Clear the list of recognized schemes." - (clrhash *scheme-parser-map*)) +(defmethod url-query-string ((url url-query-mixin)) + (format nil "~@[?~{~A=~A~^&~}~]" + (loop for (key . value) in (url-query-arguments url) + collect (escape-url-string key) + collect (escape-url-string value)))) -;;; Parsing of URLs - -(defun parse-url-from-string (string &optional clear-fragment) - (let* ((string (if (and clear-fragment - (position +url-fragment-separator+ - string)) - (subseq string 0 - (position +url-fragment-separator+ - string)) - string)) - (scheme-end (position +url-scheme-separator+ string)) - (scheme (if scheme-end - (string-downcase - (subseq string 0 scheme-end)) - *default-url-scheme*)) - (parser (gethash scheme *scheme-parser-map* nil))) - (unless parser - (error 'url-unknown-scheme-error :scheme scheme :url-string string)) - (funcall parser scheme string (if scheme-end (1+ scheme-end) 0)))) +(defun parse-query-part (url start end) + (loop for key-start = start then (min end (1+ pair-end)) + for pair-end = (or (position +url-query-pair-separator+ url + :start key-start :end end) + end) + for key-end = (position +url-query-key-separator+ url + :start key-start :end pair-end) + while (< key-start end) + do + (unless key-end + (error 'url-simple-syntax-error :url-string url + :reason "Malformed query key-value pair")) + collect + (cons (unescape-url-subseq url key-start key-end) + (unescape-url-subseq url (1+ key-end) pair-end)))) ;;; The specific URL schemes: -(defclass url-http (url url-host-port-mixin url-path-mixin url-search-mixin) - ((scheme :allocation :class :initform "http" :reader url-scheme))) +(defclass url-http (url url-host-port-mixin url-path-mixin url-query-mixin) + ((scheme :initform "http"))) -(defmethod url-specific-part ((url url-http)) - (format nil "~A~@[?~A~]" - (url-significant-string url) - (url-searchpart url))) +(defmethod url-specific-string ((url url-http)) + (concatenate 'string + (url-authority-string url) + (url-path-string url) + (url-query-string url))) (defmethod url-significant-string ((url url-http)) - (format nil "~:[~2*~;//~A~@[:~A~]/~]~@[~A~]" - (or (url-host url) (url-port url)) - (url-host url) - (url-port url) - (url-path url))) + (concatenate 'string + (url-authority-string url) + (url-path-string url))) -(defun url-http-parser (scheme url start) - (declare (ignore scheme)) - (let ((rest (subseq url start)) - host port path searchpart) - (when (and (> (length rest) 1) (equal "//" (subseq rest 0 2))) - (let* ((rest-pos (position +url-path-separator+ rest :start 2)) - (port-pos (position +url-host-port-separator+ rest :start 2 - :end rest-pos))) - (setq host (subseq rest 2 (or port-pos rest-pos))) - (when port-pos - (setq port (subseq rest (1+ port-pos) rest-pos))) - (setq rest (subseq rest (1+ rest-pos))))) - (let ((searchpart-pos (position +url-query-separator+ rest))) - (if searchpart-pos - (setq searchpart (subseq rest (1+ searchpart-pos)) - path (subseq rest 0 searchpart-pos)) - (setq path rest))) - (make-instance 'url-http - :source-string url - :host host :port port :path path - :searchpart searchpart))) +(defun url-http-parser (scheme url start end base-url) + (let ((host nil) + (port nil) + (directory nil) + (name nil) + (query nil)) + ;; Check for relpath here, because later on we can't reliably + ;; discriminate a valid relpath from a mal-formed abs-path + (when (and (char/= (schar url start) +url-path-separator+) + scheme) + (error 'url-simple-syntax-error :url-string url + :reason "Relative path component in Absolute-URI")) + ;; Authority + (when (and (> (- end start) 2) + (string= url +url-authority-indicator+ + :start1 start :end1 (+ start 2))) + (multiple-value-setq (host port start) + (parse-host-port-part url (+ start 2) end))) + ;; Query + (let ((query-pos (position +url-query-separator+ url + :start start :end end))) + (when query-pos + (setq query (parse-query-part url (1+ query-pos) end) + end query-pos))) + ;; Path is the rest. + (multiple-value-setq (directory name) + (parse-path-part url start end)) + ;; Create URL object, possibly merge with base-url + (let ((result + (make-instance 'url-http + :source-string url + :scheme scheme + :host host :port port + :directory directory + :name name + :query-arguments query))) + (if base-url + (merge-urls result base-url) + result)))) (register-scheme "http" #'url-http-parser) + +(defmethod merge-urls ((a url-http) (b url-http)) + (let* ((new-scheme (or (url-scheme a) (url-scheme b))) + (authority-p (url-authority-p a)) + (new-host (if authority-p (url-host a) (url-host b))) + (new-port (if authority-p (url-port a) (url-port b))) + (directory-a (url-directory a)) + (directory-b (url-directory b)) + (new-name (or (url-name a) (url-name b))) + (new-directory + (cond + ((null directory-a) + (copy-list directory-b)) + ((eq (first directory-a) :absolute) + (copy-list directory-a)) + (t + (loop with new-directory = (reverse (rest directory-b)) + for segment in (rest directory-a) + do + (cond + ((string= segment "..") + (pop new-directory)) + ((string= segment ".") + t) + (t + (push segment new-directory))) + finally + (return (cons :absolute (nreverse new-directory)))))))) + (make-instance 'url-http :source-string nil + :scheme new-scheme + :host new-host :port new-port + :directory new-directory + :name new-name)))