wstools/c14n.py

   1 #! /usr/bin/env python
   2 '''XML Canonicalization
   3
   4 Patches Applied to xml.dom.ext.c14n:
   5     http://sourceforge.net/projects/pyxml/
   6
   7     [ 1444526 ] c14n.py: http://www.w3.org/TR/xml-exc-c14n/ fix
   8         -- includes [ 829905 ] c14n.py fix for bug #825115,
   9            Date Submitted: 2003-10-24 23:43
  10         -- include dependent namespace declarations declared in ancestor nodes
  11            (checking attributes and tags),
  12         -- handle InclusiveNamespaces PrefixList parameter
  13
  14 This module generates canonical XML of a document or element.
  15     http://www.w3.org/TR/2001/REC-xml-c14n-20010315
  16 and includes a prototype of exclusive canonicalization
  17     http://www.w3.org/Signature/Drafts/xml-exc-c14n
  18
  19 Requires PyXML 0.7.0 or later.
  20
  21 Known issues if using Ft.Lib.pDomlette:
  22     1. Unicode
  23     2. does not white space normalize attributes of type NMTOKEN and ID?
  24     3. seems to include "\n" after importing external entities?
  25
  26 Note, this version processes a DOM tree, and consequently it processes
  27 namespace nodes as attributes, not from a node's namespace axis. This
  28 permits simple document and element canonicalization without
  29 XPath. When XPath is used, the XPath result node list is passed and used to
  30 determine if the node is in the XPath result list, but little else.
  31
  32 Authors:
  33     "Joseph M. Reagle Jr." <reagle@w3.org>
  34     "Rich Salz" <rsalz@zolera.com>
  35
  36 $Date$ by $Author$
  37 '''
  38
  39 _copyright = '''Copyright 2001, Zolera Systems Inc.  All Rights Reserved.
  40 Copyright 2001, MIT. All Rights Reserved.
  41
  42 Distributed under the terms of:
  43   Python 2.0 License or later.
  44   http://www.python.org/2.0.1/license.html
  45 or
  46   W3C Software License
  47   http://www.w3.org/Consortium/Legal/copyright-software-19980720
  48 '''
  49
  50 import string
  51 from xml.dom import Node
  52 try:
  53     from xml.ns import XMLNS
  54 except:
  55     class XMLNS:
  56         BASE = "http://www.w3.org/2000/xmlns/"
  57         XML = "http://www.w3.org/XML/1998/namespace"
  58 try:
  59     import cStringIO
  60     StringIO = cStringIO
  61 except ImportError:
  62     import StringIO
  63
  64 _attrs = lambda E: (E.attributes and E.attributes.values()) or []
  65 _children = lambda E: E.childNodes or []
  66 _IN_XML_NS = lambda n: n.name.startswith("xmlns")
  67 _inclusive = lambda n: n.unsuppressedPrefixes == None
  68
  69
  70 # Does a document/PI has lesser/greater document order than the
  71 # first element?
  72 _LesserElement, _Element, _GreaterElement = range(3)
  73
  74 def _sorter(n1,n2):
  75     '''_sorter(n1,n2) -> int
  76     Sorting predicate for non-NS attributes.'''
  77
  78     i = cmp(n1.namespaceURI, n2.namespaceURI)
  79     if i: return i
  80     return cmp(n1.localName, n2.localName)
  81
  82
  83 def _sorter_ns(n1,n2):
  84     '''_sorter_ns((n,v),(n,v)) -> int
  85     "(an empty namespace URI is lexicographically least)."'''
  86
  87     if n1[0] == 'xmlns': return -1
  88     if n2[0] == 'xmlns': return 1
  89     return cmp(n1[0], n2[0])
  90
  91 def _utilized(n, node, other_attrs, unsuppressedPrefixes):
  92     '''_utilized(n, node, other_attrs, unsuppressedPrefixes) -> boolean
  93     Return true if that nodespace is utilized within the node'''
  94     if n.startswith('xmlns:'):
  95         n = n[6:]
  96     elif n.startswith('xmlns'):
  97         n = n[5:]
  98     if (n=="" and node.prefix in ["#default", None]) or \
  99         n == node.prefix or n in unsuppressedPrefixes:
 100             return 1
 101     for attr in other_attrs:
 102         if n == attr.prefix: return 1
 103     # For exclusive need to look at attributes
 104     if unsuppressedPrefixes is not None:
 105         for attr in _attrs(node):
 106             if n == attr.prefix: return 1
 107
 108     return 0
 109
 110
 111 def _inclusiveNamespacePrefixes(node, context, unsuppressedPrefixes):
 112     '''http://www.w3.org/TR/xml-exc-c14n/
 113     InclusiveNamespaces PrefixList parameter, which lists namespace prefixes that
 114     are handled in the manner described by the Canonical XML Recommendation'''
 115     inclusive = []
 116     if node.prefix:
 117         usedPrefixes = ['xmlns:%s' %node.prefix]
 118     else:
 119         usedPrefixes = ['xmlns']
 120
 121     for a in _attrs(node):
 122         if a.nodeName.startswith('xmlns') or not a.prefix: continue
 123         usedPrefixes.append('xmlns:%s' %a.prefix)
 124
 125     unused_namespace_dict = {}
 126     for attr in context:
 127         n = attr.nodeName
 128         if n in unsuppressedPrefixes:
 129             inclusive.append(attr)
 130         elif n.startswith('xmlns:') and n[6:] in unsuppressedPrefixes:
 131             inclusive.append(attr)
 132         elif n.startswith('xmlns') and n[5:] in unsuppressedPrefixes:
 133             inclusive.append(attr)
 134         elif attr.nodeName in usedPrefixes:
 135             inclusive.append(attr)
 136         elif n.startswith('xmlns:'):
 137             unused_namespace_dict[n] = attr.value
 138
 139     return inclusive, unused_namespace_dict
 140
 141 #_in_subset = lambda subset, node: not subset or node in subset
 142 _in_subset = lambda subset, node: subset is None or node in subset # rich's tweak
 143
 144
 145 class _implementation:
 146     '''Implementation class for C14N. This accompanies a node during it's
 147     processing and includes the parameters and processing state.'''
 148
 149     # Handler for each node type; populated during module instantiation.
 150     handlers = {}
 151
 152     def __init__(self, node, write, **kw):
 153         '''Create and run the implementation.'''
 154         self.write = write
 155         self.subset = kw.get('subset')
 156         self.comments = kw.get('comments', 0)
 157         self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes')
 158         nsdict = kw.get('nsdict', { 'xml': XMLNS.XML, 'xmlns': XMLNS.BASE })
 159
 160         # Processing state.
 161         self.state = (nsdict, {'xml':''}, {}, {}) #0422
 162
 163         if node.nodeType == Node.DOCUMENT_NODE:
 164             self._do_document(node)
 165         elif node.nodeType == Node.ELEMENT_NODE:
 166             self.documentOrder = _Element        # At document element
 167             if not _inclusive(self):
 168                 inherited,unused = _inclusiveNamespacePrefixes(node, self._inherit_context(node),
 169                                 self.unsuppressedPrefixes)
 170                 self._do_element(node, inherited, unused=unused)
 171             else:
 172                 inherited = self._inherit_context(node)
 173                 self._do_element(node, inherited)
 174         elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
 175             pass
 176         else:
 177             raise TypeError, str(node)
 178
 179
 180     def _inherit_context(self, node):
 181         '''_inherit_context(self, node) -> list
 182         Scan ancestors of attribute and namespace context.  Used only
 183         for single element node canonicalization, not for subset
 184         canonicalization.'''
 185
 186         # Collect the initial list of xml:foo attributes.
 187         xmlattrs = filter(_IN_XML_NS, _attrs(node))
 188
 189         # Walk up and get all xml:XXX attributes we inherit.
 190         inherited, parent = [], node.parentNode
 191         while parent and parent.nodeType == Node.ELEMENT_NODE:
 192             for a in filter(_IN_XML_NS, _attrs(parent)):
 193                 n = a.localName
 194                 if n not in xmlattrs:
 195                     xmlattrs.append(n)
 196                     inherited.append(a)
 197             parent = parent.parentNode
 198         return inherited
 199
 200
 201     def _do_document(self, node):
 202         '''_do_document(self, node) -> None
 203         Process a document node. documentOrder holds whether the document
 204         element has been encountered such that PIs/comments can be written
 205         as specified.'''
 206
 207         self.documentOrder = _LesserElement
 208         for child in node.childNodes:
 209             if child.nodeType == Node.ELEMENT_NODE:
 210                 self.documentOrder = _Element        # At document element
 211                 self._do_element(child)
 212                 self.documentOrder = _GreaterElement # After document element
 213             elif child.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
 214                 self._do_pi(child)
 215             elif child.nodeType == Node.COMMENT_NODE:
 216                 self._do_comment(child)
 217             elif child.nodeType == Node.DOCUMENT_TYPE_NODE:
 218                 pass
 219             else:
 220                 raise TypeError, str(child)
 221     handlers[Node.DOCUMENT_NODE] = _do_document
 222
 223
 224     def _do_text(self, node):
 225         '''_do_text(self, node) -> None
 226         Process a text or CDATA node.  Render various special characters
 227         as their C14N entity representations.'''
 228         if not _in_subset(self.subset, node): return
 229         s = string.replace(node.data, "&", "&amp;")
 230         s = string.replace(s, "<", "&lt;")
 231         s = string.replace(s, ">", "&gt;")
 232         s = string.replace(s, "\015", "&#xD;")
 233         if s: self.write(s)
 234     handlers[Node.TEXT_NODE] = _do_text
 235     handlers[Node.CDATA_SECTION_NODE] = _do_text
 236
 237
 238     def _do_pi(self, node):
 239         '''_do_pi(self, node) -> None
 240         Process a PI node. Render a leading or trailing #xA if the
 241         document order of the PI is greater or lesser (respectively)
 242         than the document element.
 243         '''
 244         if not _in_subset(self.subset, node): return
 245         W = self.write
 246         if self.documentOrder == _GreaterElement: W('\n')
 247         W('<?')
 248         W(node.nodeName)
 249         s = node.data
 250         if s:
 251             W(' ')
 252             W(s)
 253         W('?>')
 254         if self.documentOrder == _LesserElement: W('\n')
 255     handlers[Node.PROCESSING_INSTRUCTION_NODE] = _do_pi
 256
 257
 258     def _do_comment(self, node):
 259         '''_do_comment(self, node) -> None
 260         Process a comment node. Render a leading or trailing #xA if the
 261         document order of the comment is greater or lesser (respectively)
 262         than the document element.
 263         '''
 264         if not _in_subset(self.subset, node): return
 265         if self.comments:
 266             W = self.write
 267             if self.documentOrder == _GreaterElement: W('\n')
 268             W('<!--')
 269             W(node.data)
 270             W('-->')
 271             if self.documentOrder == _LesserElement: W('\n')
 272     handlers[Node.COMMENT_NODE] = _do_comment
 273
 274
 275     def _do_attr(self, n, value):
 276         ''''_do_attr(self, node) -> None
 277         Process an attribute.'''
 278
 279         W = self.write
 280         W(' ')
 281         W(n)
 282         W('="')
 283         s = string.replace(value, "&", "&amp;")
 284         s = string.replace(s, "<", "&lt;")
 285         s = string.replace(s, '"', '&quot;')
 286         s = string.replace(s, '\011', '&#x9')
 287         s = string.replace(s, '\012', '&#xA')
 288         s = string.replace(s, '\015', '&#xD')
 289         W(s)
 290         W('"')
 291
 292
 293     def _do_element(self, node, initial_other_attrs = [], unused = None):
 294         '''_do_element(self, node, initial_other_attrs = [], unused = {}) -> None
 295         Process an element (and its children).'''
 296
 297         # Get state (from the stack) make local copies.
 298         #   ns_parent -- NS declarations in parent
 299         #   ns_rendered -- NS nodes rendered by ancestors
 300         #        ns_local -- NS declarations relevant to this element
 301         #   xml_attrs -- Attributes in XML namespace from parent
 302         #       xml_attrs_local -- Local attributes in XML namespace.
 303         #   ns_unused_inherited -- not rendered namespaces, used for exclusive
 304         ns_parent, ns_rendered, xml_attrs = \
 305                 self.state[0], self.state[1].copy(), self.state[2].copy() #0422
 306
 307         ns_unused_inherited = unused
 308         if unused is None:
 309             ns_unused_inherited = self.state[3].copy()
 310
 311         ns_local = ns_parent.copy()
 312         inclusive = _inclusive(self)
 313         xml_attrs_local = {}
 314
 315         # Divide attributes into NS, XML, and others.
 316         other_attrs = []
 317         in_subset = _in_subset(self.subset, node)
 318         for a in initial_other_attrs + _attrs(node):
 319             if a.namespaceURI == XMLNS.BASE:
 320                 n = a.nodeName
 321                 if n == "xmlns:": n = "xmlns"        # DOM bug workaround
 322                 ns_local[n] = a.nodeValue
 323             elif a.namespaceURI == XMLNS.XML:
 324                 if inclusive or (in_subset and  _in_subset(self.subset, a)): #020925 Test to see if attribute node in subset
 325                     xml_attrs_local[a.nodeName] = a #0426
 326             else:
 327                 if  _in_subset(self.subset, a):     #020925 Test to see if attribute node in subset
 328                     other_attrs.append(a)
 329
 330 #                # TODO: exclusive, might need to define xmlns:prefix here
 331 #                if not inclusive and a.prefix is not None and not ns_rendered.has_key('xmlns:%s' %a.prefix):
 332 #                    ns_local['xmlns:%s' %a.prefix] = ??
 333
 334             #add local xml:foo attributes to ancestor's xml:foo attributes
 335             xml_attrs.update(xml_attrs_local)
 336
 337         # Render the node
 338         W, name = self.write, None
 339         if in_subset:
 340             name = node.nodeName
 341             if not inclusive:
 342                 if node.prefix is not None:
 343                     prefix = 'xmlns:%s' %node.prefix
 344                 else:
 345                     prefix = 'xmlns'
 346
 347                 if not ns_rendered.has_key(prefix) and not ns_local.has_key(prefix):
 348                     if not ns_unused_inherited.has_key(prefix):
 349                         raise RuntimeError,\
 350                             'For exclusive c14n, unable to map prefix "%s" in %s' %(
 351                             prefix, node)
 352
 353                     ns_local[prefix] = ns_unused_inherited[prefix]
 354                     del ns_unused_inherited[prefix]
 355
 356             W('<')
 357             W(name)
 358
 359             # Create list of NS attributes to render.
 360             ns_to_render = []
 361             for n,v in ns_local.items():
 362
 363                 # If default namespace is XMLNS.BASE or empty,
 364                 # and if an ancestor was the same
 365                 if n == "xmlns" and v in [ XMLNS.BASE, '' ] \
 366                 and ns_rendered.get('xmlns') in [ XMLNS.BASE, '', None ]:
 367                     continue
 368
 369                 # "omit namespace node with local name xml, which defines
 370                 # the xml prefix, if its string value is
 371                 # http://www.w3.org/XML/1998/namespace."
 372                 if n in ["xmlns:xml", "xml"] \
 373                 and v in [ 'http://www.w3.org/XML/1998/namespace' ]:
 374                     continue
 375
 376
 377                 # If not previously rendered
 378                 # and it's inclusive  or utilized
 379                 if (n,v) not in ns_rendered.items():
 380                     if inclusive or _utilized(n, node, other_attrs, self.unsuppressedPrefixes):
 381                         ns_to_render.append((n, v))
 382                     elif not inclusive:
 383                         ns_unused_inherited[n] = v
 384
 385             # Sort and render the ns, marking what was rendered.
 386             ns_to_render.sort(_sorter_ns)
 387             for n,v in ns_to_render:
 388                 self._do_attr(n, v)
 389                 ns_rendered[n]=v    #0417
 390
 391             # If exclusive or the parent is in the subset, add the local xml attributes
 392             # Else, add all local and ancestor xml attributes
 393             # Sort and render the attributes.
 394             if not inclusive or _in_subset(self.subset,node.parentNode):  #0426
 395                 other_attrs.extend(xml_attrs_local.values())
 396             else:
 397                 other_attrs.extend(xml_attrs.values())
 398             other_attrs.sort(_sorter)
 399             for a in other_attrs:
 400                 self._do_attr(a.nodeName, a.value)
 401             W('>')
 402
 403         # Push state, recurse, pop state.
 404         state, self.state = self.state, (ns_local, ns_rendered, xml_attrs, ns_unused_inherited)
 405         for c in _children(node):
 406             _implementation.handlers[c.nodeType](self, c)
 407         self.state = state
 408
 409         if name: W('</%s>' % name)
 410     handlers[Node.ELEMENT_NODE] = _do_element
 411
 412
 413 def Canonicalize(node, output=None, **kw):
 414     '''Canonicalize(node, output=None, **kw) -> UTF-8
 415
 416     Canonicalize a DOM document/element node and all descendents.
 417     Return the text; if output is specified then output.write will
 418     be called to output the text and None will be returned
 419     Keyword parameters:
 420         nsdict: a dictionary of prefix:uri namespace entries
 421                 assumed to exist in the surrounding context
 422         comments: keep comments if non-zero (default is 0)
 423         subset: Canonical XML subsetting resulting from XPath
 424                 (default is [])
 425         unsuppressedPrefixes: do exclusive C14N, and this specifies the
 426                 prefixes that should be inherited.
 427     '''
 428     if output:
 429         apply(_implementation, (node, output.write), kw)
 430     else:
 431         s = StringIO.StringIO()
 432         apply(_implementation, (node, s.write), kw)
 433         return s.getvalue()