1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96 | import logging
class SanitizeError(Exception):
    """Error raised when sanitizing encounters an invalid tag or attribute.

    Within this module it is raised by ``Sanitizer.handle_error`` when the
    sanitizer was configured with ``raise_on_invalid`` set.
    """
def get_child_contents(soup):
    """Concatenate the text found under every node yielded by ``soup``.

    For each child, its ``string`` attribute is used when that is truthy;
    otherwise the child itself is iterated recursively and the text
    gathered from it is used instead.

    :param soup: an iterable of nodes that each expose a ``string``
        attribute and are themselves iterable.
    :returns: all collected pieces joined into a single string.
    """
    pieces = []
    for node in soup:
        direct_text = node.string
        if direct_text:
            pieces.append(direct_text)
        else:
            # No usable direct string: descend into the node's children.
            pieces.append(get_child_contents(node))
    return ''.join(pieces)
def get_plain_text(value):
    """Return the text of the markup in ``value`` without comments or styles.

    ``value`` is parsed with BeautifulSoup, every comment node and every
    ``style`` element is extracted from the tree, and the text of what
    remains is collected with ``get_child_contents``.

    :param value: markup to reduce to text.
    :returns: the concatenated text of the remaining nodes.
    """
    from BeautifulSoup import BeautifulSoup, Comment

    def is_comment(node):
        return isinstance(node, Comment)

    document = BeautifulSoup(value)

    # Each query is only evaluated after the previous one's matches have
    # been extracted: comments go first, then <style> elements.
    for query in ({'text': is_comment}, {'name': 'style'}):
        for unwanted in document.findAll(**query):
            unwanted.extract()

    return get_child_contents(document)
class Sanitizer(object):
    """Base class for sanitizers that filter markup by tag and attribute name.

    Subclasses supply the policy by overriding ``check_tag`` and
    ``check_attr``; this class provides the document traversal
    (``sanitize``) and the reporting of rejected names (``handle_error``).
    """

    def __init__(self, valid_tags=None, valid_attrs=None,
                 invalid_tags=None, invalid_attrs=None,
                 raise_on_invalid=False):
        """Store the name collections and the error policy on the instance.

        :param valid_tags: tag names kept as ``self.valid_tags``.
        :param valid_attrs: attribute names kept as ``self.valid_attrs``.
        :param invalid_tags: tag names kept as ``self.invalid_tags``.
        :param invalid_attrs: attribute names kept as ``self.invalid_attrs``.
        :param raise_on_invalid: when true, ``handle_error`` raises
            ``SanitizeError`` instead of only logging.

        Any falsy collection (including ``None``) is replaced by a new
        empty list.
        """
        self.valid_tags = valid_tags or []
        self.valid_attrs = valid_attrs or []
        self.invalid_tags = invalid_tags or []
        self.invalid_attrs = invalid_attrs or []
        self.raise_on_invalid = raise_on_invalid

    def check_tag(self, tag):
        """Return a truthy value if ``tag`` is acceptable; subclasses override.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("A child class must override check_tag")

    def check_attr(self, attr):
        """Return a truthy value if ``attr`` is acceptable; subclasses override.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("A child class must override check_attr")

    def _check_tag(self, tag):
        """Apply ``check_tag`` and report a rejected tag via ``handle_error``.

        :returns: ``True`` if the tag is accepted, ``False`` otherwise.
        :raises SanitizeError: if the tag is rejected and
            ``raise_on_invalid`` is set.
        """
        if self.check_tag(tag):
            return True
        # One-element tuple so the formatting cannot be broken by a tuple value.
        self.handle_error("Tag: %s is invalid." % (tag,))
        return False

    def _check_attr(self, attr):
        """Apply ``check_attr`` and report a rejected attribute name.

        :returns: ``True`` if the attribute is accepted, ``False`` otherwise.
        :raises SanitizeError: if the attribute is rejected and
            ``raise_on_invalid`` is set.
        """
        if self.check_attr(attr):
            return True
        self.handle_error("Attr: %s is invalid." % (attr,))
        return False

    def handle_error(self, error_message):
        """Log ``error_message`` at debug level and optionally raise.

        :raises SanitizeError: when ``self.raise_on_invalid`` is true.
        """
        logging.debug(error_message)
        if self.raise_on_invalid:
            raise SanitizeError(error_message)

    def sanitize(self, value):
        """Parse ``value`` and return it with rejected names filtered out.

        Comment nodes are extracted from the tree, every tag whose name
        fails ``_check_tag`` is marked hidden, and on every tag only the
        attributes whose name passes ``_check_attr`` are kept.

        :param value: markup to sanitize.
        :returns: the rendered contents, decoded from UTF-8.
        :raises SanitizeError: on the first rejected tag or attribute when
            ``raise_on_invalid`` is set.
        """
        from BeautifulSoup import BeautifulSoup, Comment

        soup = BeautifulSoup(value)
        for comment in soup.findAll(
                text=lambda text: isinstance(text, Comment)):
            comment.extract()
        for tag in soup.findAll(True):
            if not self._check_tag(tag.name):
                # Presumably BeautifulSoup omits a hidden tag's own markup
                # while still rendering its children -- TODO confirm.
                tag.hidden = True
            # NOTE(review): only attribute *names* are filtered; attribute
            # values (e.g. the URL in href/src, or style content) are passed
            # through unchanged. Attributes are filtered on hidden tags too.
            tag.attrs = [(attr, val) for attr, val in tag.attrs
                         if self._check_attr(attr)]
        return soup.renderContents().decode('utf8')
class WhitelistSanitizer(Sanitizer):
    """Sanitizer that accepts only names present in the ``valid_*`` lists."""

    def check_tag(self, tag):
        """Return ``True`` if ``tag`` is in ``self.valid_tags``, else ``None``."""
        if tag not in self.valid_tags:
            return None
        return True

    def check_attr(self, attr):
        """Return ``True`` if ``attr`` is in ``self.valid_attrs``, else ``None``."""
        if attr not in self.valid_attrs:
            return None
        return True
class BlacklistSanitizer(Sanitizer):
    """Sanitizer that accepts every name not present in the ``invalid_*`` lists."""

    def check_tag(self, tag):
        """Return ``None`` if ``tag`` is in ``self.invalid_tags``, else ``True``."""
        if tag in self.invalid_tags:
            return None
        return True

    def check_attr(self, attr):
        """Return ``None`` if ``attr`` is in ``self.invalid_attrs``, else ``True``."""
        if attr in self.invalid_attrs:
            return None
        return True
def get_default_sanitizer():
    """Build a ``WhitelistSanitizer`` configured with the default name lists.

    The tag and attribute names are written as space-separated strings and
    split into lists before being handed to the sanitizer.

    :returns: a new ``WhitelistSanitizer`` instance.
    """
    tag_names = ('p i strong b u a h1 h2 h3 h4 h5 h6 pre br img'
                 ' font span ol ul li div sup sub hr')
    attr_names = ('href src alt width style size align class target height'
                  ' title')
    return WhitelistSanitizer(
        valid_tags=tag_names.split(),
        valid_attrs=attr_names.split(),
    )
|