From 1236c4d7f040fed1c2a60063715ab76d9ab2de2d Mon Sep 17 00:00:00 2001
From: Pierre-Yves Chibon <pingou@pingoured.fr>
Date: Jul 20 2016 21:20:45 +0000
Subject: Add a clean_input module in hubs.widgets


This allows cleaning the input provided by users to ensure there is
nothing malicious in it.

---

diff --git a/hubs/widgets/clean_input.py b/hubs/widgets/clean_input.py
new file mode 100644
index 0000000..46bccbc
--- /dev/null
+++ b/hubs/widgets/clean_input.py
@@ -0,0 +1,68 @@
+import urlparse
+
+import bleach
+
+
+def filter_img_src(name, value):
+    """Decide whether an attribute of an ``<img>`` tag may be kept.
+
+    Used as the bleach attribute filter for ``img`` tags: presentational
+    attributes are always kept, ``src`` is kept only when the image is
+    relative to, or hosted on, this application (``APP_URL``), and any
+    other attribute is dropped.
+
+    :arg name: name of the attribute being checked.
+    :arg value: value of that attribute, as provided by the user.
+    :return: True if the attribute is kept, False if it is removed.
+    """
+    # Imported here rather than at module level, presumably to avoid a
+    # circular import with hubs.app -- TODO confirm.
+    import hubs.app
+    if name in ('alt', 'height', 'width', 'class'):
+        return True
+    if name == 'src':
+        # Browsers strip leading/trailing control characters and spaces,
+        # drop tabs and newlines, and read '\' as '/' when resolving a URL,
+        # so '/\evil.com/x.png' or ' //evil.com/x.png' would load a remote
+        # image while urlparse reports no netloc. Normalise the value the
+        # same way before checking its host.
+        url = value.strip(''.join(map(chr, range(0x21))))
+        for char in ('\t', '\r', '\n'):
+            url = url.replace(char, '')
+        p = urlparse.urlparse(url.replace('\\', '/'))
+        return (not p.netloc) or p.netloc == urlparse.urlparse(
+            hubs.app.app.config['APP_URL']).netloc
+    return False
+
+
+def clean(text, ignore=None):
+    """ For a given html text, escape everything we do not want to support
+    to avoid potential security breach.
+
+    :arg text: the html text to clean.
+    :kwarg ignore: a tag name, or a tuple/set/list of tag names, to drop
+        from the supported tags.
+    :return: the result of ``bleach.clean`` applied to ``text``.
+    """
+    if ignore and not isinstance(ignore, (tuple, set, list)):
+        ignore = [ignore]
+
+    # Work on a copy: bleach.ALLOWED_ATTRIBUTES is a module-level dict, so
+    # adding the img filter to it directly would alter bleach's defaults
+    # for the whole process.
+    attrs = dict(bleach.ALLOWED_ATTRIBUTES)
+    if not ignore or 'img' not in ignore:
+        attrs['img'] = filter_img_src
+
+    # list() keeps this working whatever collection type bleach uses for
+    # ALLOWED_TAGS.
+    tags = list(bleach.ALLOWED_TAGS) + [
+        'p', 'br', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+        'table', 'td', 'tr', 'th',
+        'col', 'tbody', 'pre', 'img', 'hr', 'dl', 'dt', 'dd', 'span',
+        'kbd', 'var',
+    ]
+    if ignore:
+        tags = [tag for tag in tags if tag not in ignore]
+
+    return bleach.clean(text, tags=tags, attributes=attrs)
diff --git a/requirements.txt b/requirements.txt
index 01e9cc9..c2dadcf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 arrow
+bleach
 datanommer.models
 dogpile.cache
 fedmsg