Paste #17

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import logging
import re
from xml.sax.saxutils import unescape

class SanitizeError(Exception):
  """Error raised by Sanitizer.handle_error when raise_on_invalid is set."""

# Attribute names whose values carry a URI (going by the variable name;
# nothing in the visible code reads this list).
attr_val_is_uri = [
  'href',
  'src',
  'cite',
  'action',
  'longdesc',
  'xlink:href',
  'xml:base',
]

# Scheme prefixes that Sanitizer._check_attr tolerates at the start of an
# attribute value; any other "scheme:" prefix is reported as invalid.
allowed_protocols = [
  'ed2k', 'ftp', 'http', 'https',
  'irc', 'mailto', 'news', 'gopher',
  'nntp', 'telnet', 'webcal', 'xmpp',
  'callto', 'feed', 'urn', 'aim',
  'rsync', 'tag', 'ssh', 'sftp',
  'rtsp', 'afs',
]

def get_child_contents(soup):
  """Concatenate the text found underneath ``soup``.

  Every child of ``soup`` contributes its ``string`` attribute when that
  attribute is truthy; otherwise the child is descended into recursively
  and the text gathered from its own children is used instead.
  """
  pieces = []
  for child in soup:
    own_text = child.string
    if own_text:
      pieces.append(own_text)
    else:
      pieces.append(get_child_contents(child))
  return ''.join(pieces)

def get_plain_text(value):
  """Return the text of the markup in ``value`` with no tags.

  The markup is parsed with BeautifulSoup, comment nodes and <style>
  elements are removed from the tree, and whatever text remains is joined
  by get_child_contents.
  """
  from BeautifulSoup import BeautifulSoup, Comment

  def is_comment(node):
    return isinstance(node, Comment)

  tree = BeautifulSoup(value)
  # Comments go first, then style elements, both detached from the tree.
  for unwanted in tree.findAll(text=is_comment):
    unwanted.extract()
  for unwanted in tree.findAll(name='style'):
    unwanted.extract()
  return get_child_contents(tree)


class Sanitizer(object):
  """Base class for sanitizers that filter markup by tag and attribute name.

  The policy itself lives in subclasses, which override check_tag() and
  check_attr(). This class adds the scheme check on attribute values, the
  error reporting, and the document walk in sanitize().
  """

  # Characters removed from an attribute value before a scheme is looked
  # for: the backtick, code points 0x00-0x20 and 0x7f-0xa0, and whitespace.
  _IGNORED_CHARS_RE = re.compile(r"[`\x00-\x20\x7f-\xa0\s]+")

  # A leading "scheme:" prefix on the normalised (lower-cased) value.
  _SCHEME_RE = re.compile(r"^[a-z0-9][-+.a-z0-9]*:")

  def __init__(self, valid_tags=None, valid_attrs=None,
               invalid_tags=None, invalid_attrs=None,
               raise_on_invalid=False):
    """Store the name lists and the error mode.

    Args:
      valid_tags: tag names stored as self.valid_tags (default: empty list).
      valid_attrs: attribute names stored as self.valid_attrs.
      invalid_tags: tag names stored as self.invalid_tags.
      invalid_attrs: attribute names stored as self.invalid_attrs.
      raise_on_invalid: when true, handle_error raises SanitizeError
        instead of only logging.
    """
    self.valid_tags = valid_tags or []
    self.valid_attrs = valid_attrs or []
    self.invalid_tags = invalid_tags or []
    self.invalid_attrs = invalid_attrs or []
    self.raise_on_invalid = raise_on_invalid

  def check_tag(self, tag):
    """Policy hook: return a true value when the tag name is acceptable."""
    raise NotImplementedError("A child class must override check_tag")

  def check_attr(self, attr):
    """Policy hook: return a true value when the attribute name is acceptable."""
    raise NotImplementedError("A child class must override check_attr")

  def _check_tag(self, tag):
    """Return True if check_tag accepts ``tag``; otherwise report and return False.

    Raises:
      SanitizeError: if the tag is rejected and raise_on_invalid is set.
    """
    if self.check_tag(tag):
      return True
    self.handle_error("Tag: %s is invalid." % tag)
    return False

  def _check_attr(self, attr, val):
    """Return True if the attribute may be kept, False otherwise.

    The value is unescaped, stripped of ignorable characters and
    lower-cased; if it then starts with a "scheme:" prefix that is not in
    allowed_protocols the attribute is rejected. Otherwise the decision is
    delegated to check_attr(attr).

    NOTE(review): the scheme test is applied to every attribute, not just
    URI-valued ones, so values such as "color:red" are rejected as an
    unknown protocol -- confirm this is intended.

    Raises:
      SanitizeError: if the attribute is rejected and raise_on_invalid is set.
    """
    normalized = self._IGNORED_CHARS_RE.sub('', unescape(val)).lower()
    # remove replacement characters from unescaped characters
    normalized = normalized.replace(u"\ufffd", "")
    if (self._SCHEME_RE.match(normalized) and
        normalized.split(':')[0] not in allowed_protocols):
      self.handle_error("Val: %s is unacceptable protocol." % val)
      return False
    if self.check_attr(attr):
      return True
    self.handle_error("Attr: %s and Val: %s combination is invalid." %
                      (attr, val))
    return False

  def handle_error(self, error_message):
    """Log ``error_message`` at debug level; raise it if raise_on_invalid is set.

    Raises:
      SanitizeError: when self.raise_on_invalid is true.
    """
    logging.debug(error_message)
    if self.raise_on_invalid:
      raise SanitizeError(error_message)

  def sanitize(self, value):
    """Parse ``value``, apply the tag/attribute policy and return the result.

    Comment nodes are removed. Each tag rejected by _check_tag is marked
    hidden (presumably BeautifulSoup then omits the tag's own markup while
    still rendering its children -- confirm), and every tag keeps only the
    attributes accepted by _check_attr.

    Returns:
      The rendered document, decoded from UTF-8.

    Raises:
      SanitizeError: on the first rejected tag or attribute when
        raise_on_invalid is set.
    """
    from BeautifulSoup import BeautifulSoup, Comment
    soup = BeautifulSoup(value)
    for comment in soup.findAll(
      text=lambda text: isinstance(text, Comment)):
      comment.extract()
    for tag in soup.findAll(True):
      if not self._check_tag(tag.name):
        tag.hidden = True
      tag.attrs = [(attr, val) for attr, val in tag.attrs
                   if self._check_attr(attr, val)]
    return soup.renderContents().decode('utf8')


class WhitelistSanitizer(Sanitizer):
  """Sanitizer that accepts only names listed in valid_tags / valid_attrs."""

  def check_tag(self, tag):
    """Return True if ``tag`` is in self.valid_tags, else False."""
    return tag in self.valid_tags

  def check_attr(self, attr):
    """Return True if ``attr`` is in self.valid_attrs, else False."""
    return attr in self.valid_attrs


class BlacklistSanitizer(Sanitizer):
  """Sanitizer that accepts every name not listed in invalid_tags / invalid_attrs."""

  def check_tag(self, tag):
    """Return True unless ``tag`` is in self.invalid_tags."""
    return tag not in self.invalid_tags

  def check_attr(self, attr):
    """Return True unless ``attr`` is in self.invalid_attrs."""
    return attr not in self.invalid_attrs


def get_default_sanitizer():
  """Build a WhitelistSanitizer loaded with the default tag and attribute names."""
  tag_names = ('p i strong b u a h1 h2 h3 h4 h5 h6 pre br img'
               ' font span ol ul li div sup sub hr')
  attr_names = ('href src alt width style size align class target height'
                ' title')
  return WhitelistSanitizer(valid_tags=tag_names.split(),
                            valid_attrs=attr_names.split())