#235 Sanitize html returned
Merged 8 years ago by . Opened 8 years ago by pingou.

file modified
+2
@@ -18,6 +18,7 @@ 

  

  BuildRequires:  python-alembic

  BuildRequires:  python-arrow

+ BuildRequires:  python-bleach

  BuildRequires:  python-blinker

  BuildRequires:  python-chardet

  BuildRequires:  python-docutils
@@ -47,6 +48,7 @@ 

  

  Requires:  python-alembic

  Requires:  python-arrow

+ Requires:  python-bleach

  Requires:  python-blinker

  Requires:  python-chardet

  Requires:  python-docutils

file modified
+24 -4
@@ -10,9 +10,11 @@ 

  

  import datetime

  import textwrap

+ import urlparse

  

- import flask

  import arrow

+ import bleach

+ import flask

  import markdown

  

  from pygments import highlight
@@ -300,14 +302,32 @@ 

      return output

  

  

+ def filter_img_src(name, value):

+     ''' Filter in img html tags images coming from a different domain. '''

+     if name in ('alt', 'height', 'width', 'class'):

+         return True

+     if name == 'src':

+         p = urlparse.urlparse(value)

+         return (not p.netloc) \

+             or p.netloc == urlparse.urlparse(APP.config['APP_URL']).netloc

+     return False

+ 

+ 

  @APP.template_filter('noJS')

  def no_js(content):

      """ Template filter replacing <script by &lt;script and </script> by

      &lt;/script&gt;

      """

-     content = content.replace('<script', '&lt;script')

-     content = content.replace('</script>', '&lt;/script&gt;')

-     return content

+     attrs = bleach.ALLOWED_ATTRIBUTES

+     attrs['img'] = filter_img_src

+     return bleach.clean(

+         content,

+         tags=bleach.ALLOWED_TAGS + [

+             'p', 'br', 'div', 'h1', 'h2', 'h3', 'table', 'td', 'tr', 'th',

+             'col', 'tbody', 'pre', 'img',

+         ],

+         attributes=attrs,

+     )

  

  

  @APP.template_filter('toRGB')

file modified
+1
@@ -2,6 +2,7 @@ 

  # Use this file by running "$ pip install -r requirements.txt"

  alembic

  arrow

+ bleach

  blinker

  chardet

  docutils

no initial comment

+1, but it might be better to nuke the no_js method entirely and ensure that this is done globally -- anywhere that markdown is rendered. I'm worried about someone forgetting to call no_js and leaving open an XSS attack vector. I think this should be handled automatically.

The markdown filter calls the no_js() method, so everywhere we put markdown it will run bleach, but keeping the no_js filter also allows to have other places where we can strip down potential JS (doc or README for example).

But I'm seeing some issue with the bleach library, need to check out more carefully how to use it.

Ok with the latest commit at least pagure's README displays about correctly

Jenkins told me there were still some problems but this last commit should resolve them.
Let see what it says but otherwise, second review welcome :)

Still looks good to me. :earth_africa: 頑張って