Paste #15

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import logging

class SanitizeError(Exception):
  """Error raised when markup contains a rejected tag or attribute.

  Sanitizer.handle_error raises it, carrying the rejection message, when the
  sanitizer was created with raise_on_invalid set.
  """


def get_child_contents(soup):
  """Concatenate the text found beneath ``soup``.

  Children are visited in iteration order. A child whose ``string`` attribute
  is truthy contributes that value directly; any other child is walked
  recursively and contributes the text of its own children.

  Returns:
    The joined text, or '' when ``soup`` yields no children.
  """
  pieces = []
  for child in soup:
    own_text = child.string
    if own_text:
      pieces.append(own_text)
    else:
      # No direct string on this node: descend and collect from its children.
      pieces.append(get_child_contents(child))
  return ''.join(pieces)

def get_plain_text(value):
  """Return the text content of the markup in ``value``.

  The markup is parsed with BeautifulSoup, every comment node and every
  <style> element is removed from the tree, and the remaining text is
  collected with get_child_contents.
  """
  from BeautifulSoup import BeautifulSoup, Comment

  def is_comment(node):
    # Matches only text nodes that are comments.
    return isinstance(node, Comment)

  tree = BeautifulSoup(value)
  for unwanted in tree.findAll(text=is_comment):
    unwanted.extract()
  for unwanted in tree.findAll(name='style'):
    unwanted.extract()
  return get_child_contents(tree)


class Sanitizer(object):
  """Base class for markup sanitizers driven by tag and attribute checks.

  Subclasses provide the policy by overriding check_tag and check_attr. This
  class stores the configured name lists, reports rejected names through
  handle_error, and applies the policy to markup in sanitize.
  """

  def __init__(self, valid_tags=None, valid_attrs=None,
               invalid_tags=None, invalid_attrs=None,
               raise_on_invalid=False):
    """Store the policy lists and the error mode.

    Args:
      valid_tags: stored as self.valid_tags; an empty list when falsy.
      valid_attrs: stored as self.valid_attrs; an empty list when falsy.
      invalid_tags: stored as self.invalid_tags; an empty list when falsy.
      invalid_attrs: stored as self.invalid_attrs; an empty list when falsy.
      raise_on_invalid: when true, a rejected name raises SanitizeError
        instead of only being logged.
    """
    self.valid_tags = valid_tags or []
    self.valid_attrs = valid_attrs or []
    self.invalid_tags = invalid_tags or []
    self.invalid_attrs = invalid_attrs or []
    self.raise_on_invalid = raise_on_invalid

  def check_tag(self, tag):
    """Return a truthy value if the tag name is acceptable.

    Raises:
      NotImplementedError: always, in this base class.
    """
    raise NotImplementedError("A child class must override check_tag")

  def check_attr(self, attr):
    """Return a truthy value if the attribute name is acceptable.

    Raises:
      NotImplementedError: always, in this base class.
    """
    raise NotImplementedError("A child class must override check_attr")

  def _check_tag(self, tag):
    """Apply check_tag and report a rejection through handle_error.

    Returns:
      True when the tag is accepted, False when it is rejected and
      handle_error did not raise.
    """
    if self.check_tag(tag):
      return True
    self.handle_error("Tag: %s is invalid." % tag)
    return False

  def _check_attr(self, attr):
    """Apply check_attr and report a rejection through handle_error.

    Returns:
      True when the attribute is accepted, False when it is rejected and
      handle_error did not raise.
    """
    if self.check_attr(attr):
      return True
    self.handle_error("Attr: %s is invalid." % attr)
    return False

  def handle_error(self, error_message):
    """Log the message at debug level and optionally raise.

    Raises:
      SanitizeError: if self.raise_on_invalid is true.
    """
    logging.debug(error_message)
    if self.raise_on_invalid:
      raise SanitizeError(error_message)

  def sanitize(self, value):
    """Return the markup in ``value`` with rejected content stripped.

    Comment nodes are removed, every tag that fails _check_tag is marked
    hidden, and every tag keeps only the attributes that pass _check_attr.

    Returns:
      The rendered tree, decoded from UTF-8.

    Raises:
      SanitizeError: on the first rejected tag or attribute, if
        self.raise_on_invalid is true.
    """
    from BeautifulSoup import BeautifulSoup, Comment
    soup = BeautifulSoup(value)
    for comment in soup.findAll(
      text=lambda text: isinstance(text, Comment)):
      comment.extract()
    for tag in soup.findAll(True):
      if not self._check_tag(tag.name):
        # NOTE(review): presumably a hidden tag drops only its own markup and
        # its children are still rendered - confirm this is acceptable for
        # rejected tags such as <script>.
        tag.hidden = True
      # Attributes are filtered on every tag, including hidden ones.
      tag.attrs = [(attr, val) for attr, val in tag.attrs
                   if self._check_attr(attr)]
    return soup.renderContents().decode('utf8')


class WhitelistSanitizer(Sanitizer):
  """Sanitizer that accepts only names listed as valid."""

  def check_tag(self, tag):
    """Return True if ``tag`` is in self.valid_tags, otherwise False."""
    return tag in self.valid_tags

  def check_attr(self, attr):
    """Return True if ``attr`` is in self.valid_attrs, otherwise False."""
    return attr in self.valid_attrs


class BlacklistSanitizer(Sanitizer):
  """Sanitizer that accepts every name except those listed as invalid."""

  def check_tag(self, tag):
    """Return False if ``tag`` is in self.invalid_tags, otherwise True."""
    return tag not in self.invalid_tags

  def check_attr(self, attr):
    """Return False if ``attr`` is in self.invalid_attrs, otherwise True."""
    return attr not in self.invalid_attrs


def get_default_sanitizer():
  """Build a WhitelistSanitizer with the default tag and attribute lists.

  The names are written as space-separated strings and split into lists
  before being passed as valid_tags and valid_attrs.
  """
  allowed_tags = ('p i strong b u a h1 h2 h3 h4 h5 h6 pre br img'
                  ' font span ol ul li div sup sub hr').split()
  allowed_attrs = ('href src alt width style size align class target height'
                   ' title').split()
  return WhitelistSanitizer(valid_tags=allowed_tags, valid_attrs=allowed_attrs)