1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19  """Extension to libxml2 for XMPP stream and stanza processing""" 
 20   
 21  __revision__="$Id: xmlextra.py,v 1.15 2004/10/11 18:33:51 jajcus Exp $" 
 22  __docformat__="restructuredtext en" 
 23   
 24  import sys 
 25  import libxml2 
 26  import threading 
 27  import re 
 28   
 29  from pyxmpp.exceptions import StreamParseError 
 30   
# Module-level libxml2 document holding a single `root` element bound to the
# COMMON_NS namespace; it is built once, when the module is imported.
# NOTE(review): nothing visible in this file uses it -- presumably it is a
# shared home for nodes created elsewhere in the package; confirm against
# callers.
common_doc = libxml2.newDoc("1.0")
common_root = common_doc.newChild(None,"root",None)
COMMON_NS = "http://pyxmpp.jajcus.net/xmlns/common"
# The namespace is declared on the root with prefix `None` (no prefix) ...
common_ns = common_root.newNs(COMMON_NS, None)
# ... and then assigned to the root element itself.
common_root.setNs(common_ns)
common_doc.setRootElement(common_root)
 37   
 39      """Base class for stream handler.""" 
 42   
 47   
 49          """Process stream end.""" 
 50          doc=libxml2.xmlDoc(_doc) 
 51          self.stream_end(doc) 
  52   
 54          """Process complete stanza.""" 
 55          doc=libxml2.xmlDoc(_doc) 
 56          node=libxml2.xmlNode(_node) 
 57          self.stanza(doc,node) 
  58   
 60          """Called when the start tag of root element is encountered 
 61          in the stream. 
 62   
 63          :Parameters: 
 64              - `doc`: the document being parsed. 
 65          :Types: 
 66              - `doc`: `libxml2.xmlDoc`""" 
 67          print >>sys.stderr,"Unhandled stream start:",`doc.serialize()` 
  68   
 70          """Called when the end tag of root element is encountered 
 71          in the stream. 
 72   
 73          :Parameters: 
 74              - `doc`: the document being parsed. 
 75          :Types: 
 76              - `doc`: `libxml2.xmlDoc`""" 
 77          print >>sys.stderr,"Unhandled stream end",`doc.serialize()` 
  78   
 79 -    def stanza(self, _unused, node): 
  80          """Called when the end tag of a direct child of the root 
 81          element is encountered in the stream. 
 82   
 83          Please note, that node will be removed from the document 
 84          and freed after this method returns. If it is needed after 
 85          that a copy must be made before the method returns. 
 86   
 87          :Parameters: 
 88              - `_unused`: the document being parsed. 
 89              - `node`: the (complete) element being processed 
 90          :Types: 
 91              - `_unused`: `libxml2.xmlDoc` 
 92              - `node`: `libxml2.xmlNode`""" 
 93          print >>sys.stderr,"Unhandled stanza",`node.serialize()` 
  94   
 96          """Called when an error is encountered in the stream. 
 97   
 98          :Parameters: 
 99              - `descr`: description of the error 
100          :Types: 
101              - `descr`: `str`""" 
102          raise StreamParseError,descr 
  103   
104  try: 
105   
106   
107   
108      from pyxmpp import _xmlextra 
109      from pyxmpp._xmlextra import error 
110   
111      _create_reader = _xmlextra.sax_reader_new 
112   
114          """Replace namespaces in a whole subtree. 
115   
116          The old namespace declaration will be removed if present on the `node`. 
117   
118          :Parameters: 
119             - `node`: the root of the subtree where namespaces should be replaced. 
120             - `old_ns`: the namespace to replace. 
121             - `new_ns`: the namespace to be used instead of old_ns. 
122          :Types: 
123              - `node`: `libxml2.xmlNode` 
124              - `old_ns`: `libxml2.xmlNs` 
125              - `new_ns`: `libxml2.xmlNs` 
126   
127          Both old_ns and new_ns may be None meaning no namespace set.""" 
128          if old_ns is None: 
129              old_ns__o = None 
130          else: 
131              old_ns__o = old_ns._o 
132          if new_ns is None: 
133              new_ns__o = None 
134          else: 
135              new_ns__o = new_ns._o 
136          if node is None: 
137              node__o = None 
138          else: 
139              node__o = node._o 
140          _xmlextra.replace_ns(node__o, old_ns__o, new_ns__o) 
141          if old_ns__o: 
142              _xmlextra.remove_ns(node__o, old_ns__o) 
 143   
144      pure_python = False 
145   
146  except ImportError: 
147   
148   
149   
151          """Exception raised on a stream parse error.""" 
152          pass 
 153   
155          """Escape data for XML""" 
156          data=data.replace("&","&") 
157          data=data.replace("<","<") 
158          data=data.replace(">",">") 
159          data=data.replace("'","'") 
160          data=data.replace('"',""") 
161          return data 
 162   
164          """SAX events handler for the python-only stream parser.""" 
166              """Initialize the SAX handler. 
167   
168              :Parameters: 
169                  - `handler`: Object to handle stream start, end and stanzas. 
170              :Types: 
171                  - `handler`: `StreamHandler` 
172              """ 
173              self._handler = handler 
174              self._head = "" 
175              self._tail = "" 
176              self._current = "" 
177              self._level = 0 
178              self._doc = None 
179              self._root = None 
 180   
182              "" 
183              if self._level>1: 
184                  self._current += _escape(data) 
 185   
187              "" 
188              if self._level>1: 
189                  self._current += _escape(data) 
 190   
194   
198   
200              "" 
201              self._current+="</%s>" % (tag,) 
202              self._level -= 1 
203              if self._level > 1: 
204                  return 
205              if self._level==1: 
206                  xml=self._head+self._current+self._tail 
207                  doc=libxml2.parseDoc(xml) 
208                  try: 
209                      node = doc.getRootElement().children 
210                      try: 
211                          node1 = node.docCopyNode(self._doc, 1) 
212                          try: 
213                              self._root.addChild(node1) 
214                              self._handler.stanza(self._doc, node1) 
215                          except: 
216                              node1.unlinkNode() 
217                              node1.freeNode() 
218                              del node1 
219                      finally: 
220                          del node 
221                  finally: 
222                      doc.freeDoc() 
223              else: 
224                  xml=self._head+self._tail 
225                  doc=libxml2.parseDoc(xml) 
226                  try: 
227                      self._handler.stream_end(self._doc) 
228                      self._doc.freeDoc() 
229                      self._doc = None 
230                      self._root = None 
231                  finally: 
232                      doc.freeDoc() 
 233   
235              "" 
236              self._handler.error(msg) 
 237   
238          fatalError = error 
239   
240          ignorableWhitespace = characters 
241   
243              "" 
244              self._current += "&" + name + ";" 
 245   
249   
251              "" 
252              s = "<"+tag 
253              if attrs: 
254                  for a,v in attrs.items(): 
255                      s+=" %s='%s'" % (a,_escape(v)) 
256              s += ">" 
257              if self._level == 0: 
258                  self._head = s 
259                  self._tail = "</%s>" % (tag,) 
260                  xml=self._head+self._tail 
261                  self._doc = libxml2.parseDoc(xml) 
262                  self._handler.stream_start(self._doc) 
263                  self._root = self._doc.getRootElement() 
264              elif self._level == 1: 
265                  self._current = s 
266              else: 
267                  self._current += s 
268              self._level += 1 
 269   
 273   
275          """Python-only stream reader.""" 
277              """Initialize the reader. 
278   
279              :Parameters: 
280                  - `handler`: Object to handle stream start, end and stanzas. 
281              :Types: 
282                  - `handler`: `StreamHandler` 
283              """ 
284              self.handler = handler 
285              self.sax = _SAXCallback(handler) 
286              self.parser = libxml2.createPushParser(self.sax, '', 0, 'stream') 
 287   
288 -        def feed(self, data): 
 289              """Feed the parser with a chunk of data. Apropriate methods 
290              of `self.handler` will be called whenever something interesting is 
291              found. 
292   
293              :Parameters: 
294                  - `data`: the chunk of data to parse. 
295              :Types: 
296                  - `data`: `str`""" 
297              return self.parser.parseChunk(data, len(data), 0) 
  298   
299      _create_reader = _PythonReader 
300   
302          """Get namespace of node. 
303   
304          :return: the namespace object or `None` if the node has no namespace 
305          assigned. 
306          :returntype: `libxml2.xmlNs`""" 
307          try: 
308              return node.ns() 
309          except libxml2.treeError: 
310              return None 
 311   
313          """Replace namespaces in a whole subtree. 
314   
315          :Parameters: 
316             - `node`: the root of the subtree where namespaces should be replaced. 
317             - `old_ns`: the namespace to replace. 
318             - `new_ns`: the namespace to be used instead of old_ns. 
319          :Types: 
320              - `node`: `libxml2.xmlNode` 
321              - `old_ns`: `libxml2.xmlNs` 
322              - `new_ns`: `libxml2.xmlNs` 
323   
324          Both old_ns and new_ns may be None meaning no namespace set.""" 
325   
326          if old_ns is not None: 
327              old_ns_uri = old_ns.content 
328              old_ns_prefix = old_ns.name 
329          else: 
330              old_ns_uri = None 
331              old_ns_prefix = None 
332   
333          ns = _get_ns(node) 
334          if ns is None and old_ns is None: 
335              node.setNs(new_ns) 
336          elif ns and ns.content == old_ns_uri and ns.name == old_ns_prefix: 
337              node.setNs(new_ns) 
338   
339          p = node.properties 
340          while p: 
341              ns = _get_ns(p) 
342              if ns is None and old_ns is None: 
343                  p.setNs(new_ns) 
344              if ns and ns.content == old_ns_uri and ns.name == old_ns_prefix: 
345                  p.setNs(new_ns) 
346              p = p.next 
347   
348          n = node.children 
349          while n: 
350              if n.type == 'element': 
351                  skip_element = False 
352                  try: 
353                      nsd = n.nsDefs() 
354                  except libxml2.treeError: 
355                      nsd = None 
356                  while nsd: 
357                      if nsd.name == old_ns_prefix: 
358                          skip_element = True 
359                          break 
360                      nsd = nsd.next 
361                  if not skip_element: 
362                      replace_ns(n, old_ns, new_ns) 
363              n = n.next 
 364   
365      pure_python = True 
366   
367   
368   
369   
370   
372      """Namespace of an XML node. 
373   
374      :Parameters: 
375          - `xmlnode`: the XML node to query. 
376      :Types: 
377          - `xmlnode`: `libxml2.xmlNode` 
378   
379      :return: namespace of the node or `None` 
380      :returntype: `libxml2.xmlNs`""" 
381      try: 
382          return xmlnode.ns() 
383      except libxml2.treeError: 
384          return None 
 385   
387      """Return namespace URI of an XML node. 
388   
389      :Parameters: 
390          - `xmlnode`: the XML node to query. 
391      :Types: 
392          - `xmlnode`: `libxml2.xmlNode` 
393   
394      :return: namespace URI of the node or `None` 
395      :returntype: `unicode`""" 
396      ns=get_node_ns(xmlnode) 
397      if ns: 
398          return unicode(ns.getContent(),"utf-8") 
399      else: 
400          return None 
 401   
403      """Iterate over sibling XML nodes. All types of nodes will be returned 
404      (not only the elements). 
405   
406      Usually used to iterate over node's children like this::
407   
408          xml_node_iter(node.children) 
409   
410      :Parameters: 
411          - `nodelist`: start node of the list. 
412      :Types: 
413          - `nodelist`: `libxml2.xmlNode` 
414      """ 
415      node = nodelist 
416      while node: 
417          yield node 
418          node = node.next 
 419   
421      """Iterate over sibling XML elements. Non-element nodes will be skipped. 
422   
423      Usually used to iterate over node's children like this::
424   
425          xml_node_iter(node.children) 
426   
427      :Parameters: 
428          - `nodelist`: start node of the list. 
429      :Types: 
430          - `nodelist`: `libxml2.xmlNode` 
431      """ 
432      node = nodelist 
433      while node: 
434          if node.type == "element": 
435              yield node 
436          node = node.next 
 437   
439      """Iterate over sibling XML elements. Only elements in the given namespace will be returned. 
440   
441      Usually used to iterate over node's children like this::
442   
443          xml_node_iter(node.children) 
444   
445      :Parameters: 
446          - `nodelist`: start node of the list. 
447      :Types: 
448          - `nodelist`: `libxml2.xmlNode` 
449      """ 
450      node = nodelist 
451      while node: 
452          if node.type == "element" and get_node_ns_uri(node)==ns_uri: 
453              yield node 
454          node = node.next 
 455   
# Matches the control characters \000-\037 except tab (\011), line feed
# (\012) and carriage return (\015).
evil_characters_re=re.compile(r"[\000-\010\013\014\016-\037]",re.UNICODE)
# UTF-8 encoded form of U+FFFD (REPLACEMENT CHARACTER).
utf8_replacement_char=u"\ufffd".encode("utf-8")
458   
465   
# Matches a default namespace declaration (xmlns="..." or xmlns='...') inside
# the first tag of a string.  Group 1 captures everything that precedes the
# declaration, so substituting "\\1" for the match drops the declaration.
bad_nsdef_replace_re=re.compile(r"^([^<]*\<[^><]*\s+)(xmlns=((\"[^\"]*\")|(\'[^\']*\')))")
467   
469      """Serialize an XML element making sure the result is sane. 
470   
471      Remove control characters and invalid namespace declarations from the 
472      result string. 
473   
474      :Parameters: 
475          - `xmlnode`: the XML element to serialize. 
476      :Types: 
477          - `xmlnode`: `libxml2.xmlNode` 
478   
479      :return: UTF-8 encoded serialized and sanitized element. 
480      :returntype: `string`""" 
481      try: 
482          ns = xmlnode.ns() 
483      except libxml2.treeError: 
484          ns = None 
485      try: 
486          nsdef = xmlnode.nsDefs() 
487      except libxml2.treeError: 
488          nsdef = None 
489      s=xmlnode.serialize(encoding="UTF-8") 
490      while nsdef: 
491          if nsdef.name is None and (not ns or (nsdef.name, nsdef.content)!=(ns.name, ns.content)): 
492              s = bad_nsdef_replace_re.sub("\\1",s,1) 
493              break 
494          nsdef = nsdef.next 
495      s=remove_evil_characters(s) 
496      return s 
 497   
499      """A simple push-parser interface for XML streams.""" 
501          """Initialize `StreamReader` object. 
502   
503          :Parameters: 
504              - `handler`: handler object for the stream content 
505          :Types: 
506              - `handler`: `StreamHandler` derived class 
507          """ 
508          self.reader=_create_reader(handler) 
509          self.lock=threading.RLock() 
510          self.in_use=0 
 512          """Get the document being parsed. 
513   
514          :return: the document. 
515          :returntype: `libxml2.xmlDoc`"""
516          ret=self.reader.doc() 
517          if ret: 
518              return libxml2.xmlDoc(ret) 
519          else: 
520              return None 
 522          """Pass a string to the stream parser. 
523   
524          :Parameters:
525              - `s`: string to parse.
526          :Types:
527              - `s`: `str` 
528   
529          :return: `None` on EOF, `False` when whole input was parsed and `True` 
530              if there is something still left in the buffer.""" 
531          self.lock.acquire() 
532          if self.in_use: 
533              self.lock.release() 
534              raise StreamParseError,"StreamReader.feed() is not reentrant!" 
535          self.in_use=1 
536          try: 
537              return self.reader.feed(s) 
538          finally: 
539              self.in_use=0 
540              self.lock.release() 
  541   
542   
543   
544