Extensions/EpozPostTidy.py

   1 ###
   2 # EpozPostTidy.py
   3 #
   4 # This is just an example for transforming
   5 # absolute urls to relative urls with Epoz.
   6 #
   7 # Use it at your own risk or improve it!
   8 ###
   9
  10 from HTMLParser import HTMLParser
  11 import re
  12
  13 # These tags will get a newline after the closing tag
  14 blocktags = ['p', 'pre', 'div',
  15              'table', 'tr', 'th', 'td', 'thead', 'tbody', 'tfoot',
  16              'ul','ol','li',
  17              'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
  18
  19 # Just a simple htmlparser
  20 class aHTMLParser(HTMLParser):
  21     res = ""
  22
  23     def handle_starttag(self, tag, attrs):
  24         attributes=""
  25         for (key,value) in attrs:
  26             # Internal Link?
  27             if (tag=="a" and key=="href") or (tag=="img" and key=="src"):
  28                 value = self.getRelativeUrl(self.pageurl, value)
  29             attributes += ' %s="%s"' % (key,value)
  30         self.res += "<%s%s>" % (tag, attributes)
  31
  32     def handle_endtag(self, tag):
  33         self.res += "</%s>" % (tag,)
  34         # Some pretty-nice-printing for block-elements
  35         if tag in blocktags:
  36             self.res += "\n"
  37
  38     def handle_startendtag(self, tag, attrs):
  39         attributes=""
  40         for (key,value) in attrs:
  41             # Image?
  42             if tag=="img" and key=="src":
  43                 value = self.getRelativeUrl(self.pageurl, value)
  44             attributes += ' %s="%s"' % (key,value)
  45         self.res += "<%s%s />" % (tag, attributes)
  46
  47     def handle_data(self, data):
  48         self.res += data
  49
  50     def handle_charref(self, data):
  51         self.res += "&%s;" % data
  52
  53     def handle_entityref(self, data):
  54         self.res += "&%s;" % data
  55
  56     def handle_comment(self, data):
  57         self.res += "<!-- %s -->"
  58
  59
  60 def EpozPostTidy(self, html, pageurl):
  61
  62     # Create a parser
  63     parser = aHTMLParser()
  64
  65     # Give the parser the global method for relative urls
  66     parser.getRelativeUrl = self.EpozGetRelativeUrl
  67
  68     # Submit the pageurl as base-url for calculating urls
  69     parser.pageurl = pageurl
  70
  71     # And now lets turn the wheels
  72     parser.feed(html)
  73     parser.close()
  74
  75     # Get & return postprocessed html from parser
  76     html = parser.res
  77
  78     # Just some cleanups to remove useless whitespace
  79     html = re.sub("[ ]+"," ",html)
  80     html = re.sub("[\n]+","\n", html)
  81
  82     return html