;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: XML; readtable: runes; Encoding: utf-8; -*- ;;; --------------------------------------------------------------------------- ;;; Title: Dump canonic XML according to J.Clark ;;; Created: 1999-09-09 ;;; Author: Gilbert Baumann ;;; License: LGPL (See file COPYING for details). ;;; --------------------------------------------------------------------------- ;;; © copyright 1999 by Gilbert Baumann ;;; This library is free software; you can redistribute it and/or ;;; modify it under the terms of the GNU Library General Public ;;; License as published by the Free Software Foundation; either ;;; version 2 of the License, or (at your option) any later version. ;;; ;;; This library is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;; Library General Public License for more details. ;;; ;;; You should have received a copy of the GNU Library General Public ;;; License along with this library; if not, write to the ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;;; Boston, MA 02111-1307 USA. (in-package :xml) ;; ;; | Canonical XML ;; | ============= ;; | ;; | This document defines a subset of XML called canonical XML. The ;; | intended use of canonical XML is in testing XML processors, as a ;; | representation of the result of parsing an XML document. ;; | ;; | Every well-formed XML document has a unique structurally equivalent ;; | canonical XML document. Two structurally equivalent XML documents have ;; | a byte-for-byte identical canonical XML document. Canonicalizing an ;; | XML document requires only information that an XML processor is ;; | required to make available to an application. ;; | ;; | A canonical XML document conforms to the following grammar: ;; | ;; | CanonXML ::= Pi* element Pi* ;; | element ::= Stag (Datachar | Pi | element)* Etag ;; | Stag ::= '<' Name Atts '>' ;; | Etag ::= '' ;; | Pi ::= '' Char*)) '?>' ;; | Atts ::= (' ' Name '=' '"' Datachar* '"')* ;; | Datachar ::= '&' | '<' | '>' | '"' ;; | | ' '| ' '| ' ' ;; | | (Char - ('&' | '<' | '>' | '"' | #x9 | #xA | #xD)) ;; | Name ::= (see XML spec) ;; | Char ::= (see XML spec) ;; | S ::= (see XML spec) ;; | ;; | Attributes are in lexicographical order (in Unicode bit order). ;; | ;; | A canonical XML document is encoded in UTF-8. ;; | ;; | Ignorable white space is considered significant and is treated ;; | equivalently to data. ;; ;; -- James Clark (jjc@jclark.com) (defvar *quux*) ;!!!BIG HACK!!! (defun unparse-document (doc sink) (map nil (rcurry #'unparse-node sink) (dom:child-nodes doc))) (defun unparse-node (node sink) (cond ((dom:element-p node) (write-rune #/< sink) (write-rod (dom:tag-name node) sink) ;; atts (let ((atts (sort (copy-list (dom:items (dom:attributes node))) #'rod< :key #'dom:name))) (dolist (a atts) (write-rune #/space sink) (write-rod (dom:name a) sink) (write-rune #/= sink) (write-rune #/\" sink) (let ((*quux* nil)) (map nil (lambda (c) (unparse-datachar c sink)) (dom:value a))) (write-rune #/\" sink))) (write-rod '#.(string-rod ">") sink) (dom:do-node-list (k (dom:child-nodes node)) (unparse-node k sink)) (write-rod '#.(string-rod "") sink)) ((dom:processing-instruction-p node) (unless (rod-equal (dom:target node) '#.(string-rod "xml")) (write-rod '#.(string-rod "") sink) )) ((dom:text-node-p node) (let ((*quux* nil)) (map nil (lambda (c) (unparse-datachar c sink)) (dom:data node)))) ((dom:comment-p node)) (t (error "Oops in unparse: ~S." node)))) (defun unparse-datachar (c sink) (cond ((rune= c #/&) (write-rod '#.(string-rod "&") sink)) ((rune= c #/<) (write-rod '#.(string-rod "<") sink)) ((rune= c #/>) (write-rod '#.(string-rod ">") sink)) ((rune= c #/\") (write-rod '#.(string-rod """) sink)) ((rune= c #/U+0009) (write-rod '#.(string-rod " ") sink)) ((rune= c #/U+000A) (write-rod '#.(string-rod " ") sink)) ((rune= c #/U+000D) (write-rod '#.(string-rod " ") sink)) (t (write-rune c sink)))) (defun write-rod (rod sink) (let ((*quux* nil)) (map nil (lambda (c) (write-rune c sink)) rod))) (defun write-rune (rune sink) (let ((code (rune-code rune))) (cond ((<= #xD800 code #xDBFF) (setf *quux* code)) ((<= #xDC00 code #xDFFF) (let ((q (logior (ash (- *quux* #xD7C0) 10) (- code #xDC00)))) (write-rune-0 q sink)) (setf *quux* nil)) (t (write-rune-0 code sink))))) (defun write-rune-0 (code sink) (labels ((wr (x) (write-char (code-char x) sink))) (cond ((<= #x00000000 code #x0000007F) (wr code)) ((<= #x00000080 code #x000007FF) (wr (logior #b11000000 (ldb (byte 5 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))) ((<= #x00000800 code #x0000FFFF) (wr (logior #b11100000 (ldb (byte 4 12) code))) (wr (logior #b10000000 (ldb (byte 6 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))) ((<= #x00010000 code #x001FFFFF) (wr (logior #b11110000 (ldb (byte 3 18) code))) (wr (logior #b10000000 (ldb (byte 6 12) code))) (wr (logior #b10000000 (ldb (byte 6 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))) ((<= #x00200000 code #x03FFFFFF) (wr (logior #b11111000 (ldb (byte 2 24) code))) (wr (logior #b10000000 (ldb (byte 6 18) code))) (wr (logior #b10000000 (ldb (byte 6 12) code))) (wr (logior #b10000000 (ldb (byte 6 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))) ((<= #x04000000 code #x7FFFFFFF) (wr (logior #b11111100 (ldb (byte 1 30) code))) (wr (logior #b10000000 (ldb (byte 6 24) code))) (wr (logior #b10000000 (ldb (byte 6 18) code))) (wr (logior #b10000000 (ldb (byte 6 12) code))) (wr (logior #b10000000 (ldb (byte 6 6) code))) (wr (logior #b10000000 (ldb (byte 6 0) code)))))))