;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CXML; readtable: runes; Encoding: utf-8; -*- ;;; --------------------------------------------------------------------------- ;;; Title: Unparse XML ;;; Title: (including support for canonic XML according to J.Clark) ;;; Created: 1999-09-09 ;;; Author: Gilbert Baumann ;;; Author: David Lichteblau ;;; License: LGPL (See file COPYING for details). ;;; --------------------------------------------------------------------------- ;;; © copyright 1999 by Gilbert Baumann ;;; This library is free software; you can redistribute it and/or ;;; modify it under the terms of the GNU Library General Public ;;; License as published by the Free Software Foundation; either ;;; version 2 of the License, or (at your option) any later version. ;;; ;;; This library is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;; Library General Public License for more details. ;;; ;;; You should have received a copy of the GNU Library General Public ;;; License along with this library; if not, write to the ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;;; Boston, MA 02111-1307 USA. (in-package :cxml) ;; ;; | Canonical XML ;; | ============= ;; | ;; | This document defines a subset of XML called canonical XML. The ;; | intended use of canonical XML is in testing XML processors, as a ;; | representation of the result of parsing an XML document. ;; | ;; | Every well-formed XML document has a unique structurally equivalent ;; | canonical XML document. Two structurally equivalent XML documents have ;; | a byte-for-byte identical canonical XML document. Canonicalizing an ;; | XML document requires only information that an XML processor is ;; | required to make available to an application. ;; | ;; | A canonical XML document conforms to the following grammar: ;; | ;; | CanonXML ::= Pi* element Pi* ;; | element ::= Stag (Datachar | Pi | element)* Etag ;; | Stag ::= '<' Name Atts '>' ;; | Etag ::= '' ;; | Pi ::= '' Char*)) '?>' ;; | Atts ::= (' ' Name '=' '"' Datachar* '"')* ;; | Datachar ::= '&' | '<' | '>' | '"' ;; | | ' '| ' '| ' ' ;; | | (Char - ('&' | '<' | '>' | '"' | #x9 | #xA | #xD)) ;; | Name ::= (see XML spec) ;; | Char ::= (see XML spec) ;; | S ::= (see XML spec) ;; | ;; | Attributes are in lexicographical order (in Unicode bit order). ;; | ;; | A canonical XML document is encoded in UTF-8. ;; | ;; | Ignorable white space is considered significant and is treated ;; | equivalently to data. ;; ;; -- James Clark (jjc@jclark.com) ;;;; SINK: a rune output "stream" (defclass sink () ((high-surrogate :initform nil) (column :initform 0 :accessor column) (width :initform 79 :initarg :width :accessor width) (canonical :initform t :initarg :canonical :accessor canonical) (indentation :initform nil :initarg :indentation :accessor indentation) (current-indentation :initform 0 :accessor current-indentation) (notations :initform (make-buffer :element-type t) :accessor notations) (name-for-dtd :accessor name-for-dtd) (previous-notation :initform nil :accessor previous-notation) (stack :initform nil :accessor stack))) (defmethod initialize-instance :after ((instance sink) &key) (when (eq (canonical instance) t) (setf (canonical instance) 1)) (unless (member (canonical instance) '(nil 1 2)) (error "Invalid canonical form: ~A" (canonical instance))) (when (and (canonical instance) (indentation instance)) (error "Cannot indent XML in canonical mode"))) ;; WRITE-OCTET als generisch zu machen ist vielleicht nicht die schnellste ;; Loesung, aber die einfachste. (defgeneric write-octet (octet sink)) (defun make-buffer (&key (element-type '(unsigned-byte 8))) (make-array 1 :element-type element-type :adjustable t :fill-pointer 0)) (defmethod write-octet :after (octet sink) (with-slots (column) sink (setf column (if (eql octet 10) 0 (1+ column))))) ;; vector (octet) sinks (defclass vector-sink (sink) ((target-vector :initform (make-buffer)))) (defun make-octet-vector-sink (&rest initargs) (apply #'make-instance 'vector-sink initargs)) (defmethod write-octet (octet (sink vector-sink)) (let ((target-vector (slot-value sink 'target-vector))) (vector-push-extend octet target-vector (length target-vector)))) (defmethod sax:end-document ((sink vector-sink)) (slot-value sink 'target-vector)) ;; character stream sinks (defclass character-stream-sink (sink) ((target-stream :initarg :target-stream))) (defun make-character-stream-sink (character-stream &rest initargs) (apply #'make-instance 'character-stream-sink :target-stream character-stream initargs)) (defmethod write-octet (octet (sink character-stream-sink)) (write-char (code-char octet) (slot-value sink 'target-stream))) (defmethod sax:end-document ((sink character-stream-sink)) (slot-value sink 'target-stream)) ;; octet stream sinks (defclass octet-stream-sink (sink) ((target-stream :initarg :target-stream))) (defun make-octet-stream-sink (octet-stream &rest initargs) (apply #'make-instance 'octet-stream-sink :target-stream octet-stream initargs)) (defmethod write-octet (octet (sink octet-stream-sink)) (write-byte octet (slot-value sink 'target-stream))) (defmethod sax:end-document ((sink octet-stream-sink)) (slot-value sink 'target-stream)) ;;;; doctype and notations (defmethod sax:start-dtd ((sink sink) name public-id system-id) (declare (ignore public-id system-id)) (setf (name-for-dtd sink) name)) (defmethod sax:notation-declaration ((sink sink) name public-id system-id) (when (and (canonical sink) (>= (canonical sink) 2)) (let ((prev (previous-notation sink))) (cond (prev (unless (rod< prev name) (error "misordered notations; cannot unparse canonically"))) (t ;; need a doctype declaration (write-rod #" sink) (write-rune #/U+000A sink))) (defmethod sax:end-dtd ((sink sink)) (when (previous-notation sink) (write-rod #"]>" sink) (write-rune #/U+000A sink))) ;;;; elements (defun sink-fresh-line (sink) (unless (zerop (column sink)) (write-rune-0 10 sink) (indent sink))) (defmethod sax:start-element ((sink sink) namespace-uri local-name qname attributes) (declare (ignore namespace-uri local-name)) (when (stack sink) (incf (cdr (first (stack sink))))) (push (cons qname 0) (stack sink)) (when (indentation sink) (sink-fresh-line sink) (start-indentation-block sink)) (write-rune #/< sink) (write-rod qname sink) (let ((atts (sort (copy-list attributes) #'rod< :key #'sax:attribute-qname))) (dolist (a atts) (write-rune #/space sink) (write-rod (sax:attribute-qname a) sink) (write-rune #/= sink) (write-rune #/\" sink) (map nil (lambda (c) (unparse-datachar c sink)) (sax:attribute-value a)) (write-rune #/\" sink))) (write-rod '#.(string-rod ">") sink)) (defmethod sax:end-element ((sink sink) namespace-uri local-name qname) (declare (ignore namespace-uri local-name)) (let ((cons (pop (stack sink)))) (unless (consp cons) (error "output does not nest: not in an element")) (unless (rod= (car cons) qname) (error "output does not nest: expected ~A but got ~A" (rod qname) (rod (car cons)))) (when (indentation sink) (end-indentation-block sink) (unless (zerop (cdr cons)) (sink-fresh-line sink)))) (write-rod '#.(string-rod "") sink)) (defmethod sax:processing-instruction ((sink sink) target data) (unless (rod-equal target '#.(string-rod "xml")) (write-rod '#.(string-rod "") sink))) (defmethod sax:start-cdata ((sink sink)) (push :cdata (stack sink))) (defmethod sax:characters ((sink sink) data) (cond ((and (eq (car (stack sink)) :cdata) (not (canonical sink)) (not (search #"]]" data))) (when (indentation sink) (sink-fresh-line sink)) (write-rod #"" sink)) (t (if (indentation sink) (unparse-indented-text data sink) (map nil (if (canonical sink) (lambda (c) (unparse-datachar c sink)) (lambda (c) (unparse-datachar-readable c sink))) data))))) (defmethod sax:end-cdata ((sink sink)) (unless (eq (pop (stack sink)) :cdata) (error "output does not nest: not in a cdata section"))) (defun indent (sink) (dotimes (x (current-indentation sink)) (write-rune-0 32 sink))) (defun start-indentation-block (sink) (incf (current-indentation sink) (indentation sink))) (defun end-indentation-block (sink) (decf (current-indentation sink) (indentation sink))) (defun unparse-indented-text (data sink) (flet ((whitespacep (x) (or (rune= x #/U+000A) (rune= x #/U+0020)))) (let* ((n (length data)) (pos (position-if-not #'whitespacep data)) (need-whitespace-p nil)) (cond ((zerop n)) (pos (sink-fresh-line sink) (while (< pos n) (let* ((w (or (position-if #'whitespacep data :start (1+ pos)) n)) (next (or (position-if-not #'whitespacep data :start w) n))) (when need-whitespace-p (if (< (+ (column sink) w (- pos)) (width sink)) (write-rune-0 32 sink) (sink-fresh-line sink))) (loop for i from pos below w do (unparse-datachar-readable (elt data i) sink)) (setf need-whitespace-p (< w n)) (setf pos next)))) (t (write-rune-0 32 sink)))))) (defun unparse-datachar (c sink) (cond ((rune= c #/&) (write-rod '#.(string-rod "&") sink)) ((rune= c #/<) (write-rod '#.(string-rod "<") sink)) ((rune= c #/>) (write-rod '#.(string-rod ">") sink)) ((rune= c #/\") (write-rod '#.(string-rod """) sink)) ((rune= c #/U+0009) (write-rod '#.(string-rod " ") sink)) ((rune= c #/U+000A) (write-rod '#.(string-rod " ") sink)) ((rune= c #/U+000D) (write-rod '#.(string-rod " ") sink)) (t (write-rune c sink)))) (defun unparse-datachar-readable (c sink) (cond ((rune= c #/&) (write-rod '#.(string-rod "&") sink)) ((rune= c #/<) (write-rod '#.(string-rod "<") sink)) ((rune= c #/>) (write-rod '#.(string-rod ">") sink)) ((rune= c #/\") (write-rod '#.(string-rod """) sink)) (t (write-rune c sink)))) ;;;; UTF-8 output for SINKs (defun write-rod (rod sink) (map nil (lambda (c) (write-rune c sink)) rod)) (defun write-rune (rune sink) (let ((code (rune-code rune))) (with-slots (high-surrogate) sink (cond ((<= #xD800 code #xDBFF) (setf high-surrogate code)) ((<= #xDC00 code #xDFFF) (let ((q (logior (ash (- high-surrogate #xD7C0) 10) (- code #xDC00)))) (write-rune-0 q sink)) (setf high-surrogate nil)) (t (write-rune-0 code sink)))))) (defun write-rune-0 (code sink) (labels ((wr (x) (write-octet x sink))) (cond ((<= #x00000000 code #x0000007F) (wr code)) ((<= #x00000080 code #x000007FF) (wr (logior #b11000000 (ldb (byte 5 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))) ((<= #x00000800 code #x0000FFFF) (wr (logior #b11100000 (ldb (byte 4 12) code))) (wr (logior #b10000000 (ldb (byte 6 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))) ((<= #x00010000 code #x001FFFFF) (wr (logior #b11110000 (ldb (byte 3 18) code))) (wr (logior #b10000000 (ldb (byte 6 12) code))) (wr (logior #b10000000 (ldb (byte 6 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))) ((<= #x00200000 code #x03FFFFFF) (wr (logior #b11111000 (ldb (byte 2 24) code))) (wr (logior #b10000000 (ldb (byte 6 18) code))) (wr (logior #b10000000 (ldb (byte 6 12) code))) (wr (logior #b10000000 (ldb (byte 6 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))) ((<= #x04000000 code #x7FFFFFFF) (wr (logior #b11111100 (ldb (byte 1 30) code))) (wr (logior #b10000000 (ldb (byte 6 24) code))) (wr (logior #b10000000 (ldb (byte 6 18) code))) (wr (logior #b10000000 (ldb (byte 6 12) code))) (wr (logior #b10000000 (ldb (byte 6 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code))))))) ;;;; convenience functions for DOMless XML serialization (defvar *current-element*) (defvar *sink*) (defmacro with-xml-output (sink &body body) `(invoke-with-xml-output (lambda () ,@body) ,sink)) (defun invoke-with-xml-output (fn sink) (let ((*sink* sink) (*current-element* nil)) (sax:start-document *sink*) (funcall fn) (sax:end-document *sink*))) (defmacro with-element (qname &body body) ;; XXX Statt qname soll man in zukunft auch mal (lname uri) angeben koennen. ;; Hat aber Zeit bis DOM 2. (when (listp qname) (destructuring-bind (n) qname (setf qname n))) `(invoke-with-element (lambda () ,@body) ,qname)) (defun maybe-emit-start-tag () (when *current-element* ;; starting child node, need to emit opening tag of parent first: (destructuring-bind (qname &rest attributes) *current-element* (sax:start-element *sink* nil nil qname (reverse attributes))) (setf *current-element* nil))) (defun invoke-with-element (fn qname) (maybe-emit-start-tag) (let ((*current-element* (list qname))) (multiple-value-prog1 (funcall fn) (maybe-emit-start-tag) (sax:end-element *sink* nil nil qname)))) (defun attribute (name value) (push (sax:make-attribute :qname name :value value) (cdr *current-element*)) value) (defun cdata (data) (sax:start-cdata *sink*) (sax:characters *sink* data) (sax:end-cdata *sink*) data) (defun text (data) (maybe-emit-start-tag) (sax:characters *sink* data) data)