#2288 Improve mimetype guess
Closed 6 years ago by pingou. Opened 6 years ago by zhsj.
zhsj/pagure improve-mimetype  into  master

file modified
+11 -17
@@ -20,6 +20,7 @@ 

  import pagure.doc_utils

  import pagure.exceptions

  import pagure.lib

+ import pagure.lib.mimetype

  import pagure.forms

  

  # Create the application.
@@ -102,28 +103,29 @@ 

          repo_obj, commit.tree, path)

  

      if blob_or_tree is None:

-         return (tree_obj, None, False, extended)

+         return (tree_obj, None, None)

  

      if not repo_obj[blob_or_tree.oid]:

          # Not tested and no idea how to test it, but better safe than sorry

          flask.abort(404, 'File not found')

  

      if isinstance(blob_or_tree, pygit2.TreeEntry):  # Returned a file

-         ext = os.path.splitext(blob_or_tree.name)[1]

+         filename = blob_or_tree.name

+         name, ext = os.path.splitext(filename)

          blob_obj = repo_obj[blob_or_tree.oid]

          if not is_binary_string(blob_obj.data):

              try:

                  content, safe = pagure.doc_utils.convert_readme(

                      blob_obj.data, ext)

+                 if safe:

+                     filename = name + '.html'

              except pagure.exceptions.PagureEncodingException:

-                 safe = False

                  content = blob_obj.data

          else:

-             safe = True

              content = blob_obj.data

  

      tree = sorted(tree_obj, key=lambda x: x.filemode)

-     return (tree, content, safe, extended)

+     return (tree, content, filename)

  

  

  @APP.route('/<repo>/')
@@ -170,7 +172,6 @@ 

  

      content = None

      tree = None

-     safe = False

      if not filename:

          path = ['']

      else:
@@ -178,24 +179,14 @@ 

  

      if commit:

          try:

-             (tree, content, safe, extended) = __get_tree_and_content(

+             (tree, content, filename) = __get_tree_and_content(

                  repo_obj, commit, path)

-             if extended:

-                 filename += '/'

          except pagure.exceptions.FileNotFoundException as err:

              flask.flash(err.message, 'error')

          except Exception as err:

              _log.exception(err)

              flask.abort(500, 'Unkown error encountered and reported')

  

-     mimetype = None

-     if not filename:

-         pass

-     elif filename.endswith('.css'):

-         mimetype = 'text/css'

-     elif filename.endswith('.js'):

-         mimetype = 'application/javascript'

- 

      if not content:

          if not tree or not len(tree):

              flask.abort(404, 'No content found is the repository')
@@ -208,5 +199,8 @@ 

              html += '<ul><a href="{0}">{1}</a></ul>'.format(name, name)

          html += '</li>'

          content = TMPL_HTML.format(content=html)

+         mimetype = 'text/html'

+     else:

+         mimetype, _ = pagure.lib.mimetype.guess_type(filename, content)

  

      return flask.Response(content, mimetype=mimetype)

@@ -0,0 +1,63 @@ 

+ # -*- coding: utf-8 -*-

+ import logging

+ import mimetypes

+ import kitchen.text.converters as ktc

+ import pagure.lib.encoding_utils

+ 

+ 

+ _log = logging.getLogger(__name__)

+ 

+ 

+ def guess_type(filename, data):

+     '''

+     Guess the type of a file based on its filename and data.

+ 

+     Return value is a tuple (type, encoding) where type or encoding is None

+     if it can't be guessed.

+ 

+     :param filename: file name string

+     :param data: file data string

+     '''

+     mimetype = None

+     encoding = None

+     if filename:

+         mimetype, encoding = mimetypes.guess_type(filename)

+     if data:

+         if not mimetype:

+             if '\0' in data:

+                 mimetype = 'application/octet-stream'

+             else:

+                 mimetype = 'text/plain'

+ 

+         if mimetype.startswith('text/') and not encoding:

+             try:

+                 encoding = pagure.lib.encoding_utils.guess_encoding(

+                     ktc.to_bytes(data))

+             except pagure.exceptions.PagureException:  # pragma: no cover

+                 # We cannot decode the file, so bail but warn the admins

+                 _log.exception('File could not be decoded')

+ 

+     return mimetype, encoding

+ 

+ 

+ def get_type_headers(filename, data):

+     '''

+     Get the HTTP headers used for downloading or previewing the file.

+ 

+     If the file is html, it will return headers which make browser start

+     downloading.

+ 

+     :param filename: file name string

+     :param data: file data string

+     '''

+     mimetype, encoding = guess_type(filename, data)

+     if not mimetype:

+         return None

+     headers = {'X-Content-Type-Options': 'nosniff'}

+     if 'html' in mimetype or 'javascript' in mimetype:

+         mimetype = 'application/octet-stream'

+         headers['Content-Disposition'] = 'attachment'

+     if encoding:

+         mimetype += '; charset={encoding}'.format(encoding=encoding)

+     headers['Content-Type'] = mimetype

+     return headers

file modified
+2 -32
@@ -27,13 +27,10 @@ 

  from sqlalchemy.exc import SQLAlchemyError

  from binaryornot.helpers import is_binary_string

  

- import kitchen.text.converters as ktc

- import mimetypes

- 

  import pagure.doc_utils

  import pagure.exceptions

  import pagure.lib

- import pagure.lib.encoding_utils

+ import pagure.lib.mimetype

  import pagure.forms

  from pagure import (APP, SESSION, __get_file_in_tree,

                      login_required, authenticated, urlpattern)
@@ -1298,8 +1295,6 @@ 

      if not repo.settings.get('issue_tracker', True):

          flask.abort(404, 'No issue tracker found for this project')

  

-     mimetype, encoding = mimetypes.guess_type(filename)

- 

      attachdir = os.path.join(APP.config['ATTACHMENTS_FOLDER'], repo.fullname)

      attachpath = os.path.join(attachdir, filename)

      if not os.path.exists(attachpath):
@@ -1352,32 +1347,7 @@ 

              form=pagure.forms.ConfirmationForm(),

          )

  

-     if not mimetype and data[:2] == '#!':

-         mimetype = 'text/plain'

- 

-     headers = {}

-     if not mimetype:

-         if '\0' in data:

-             mimetype = 'application/octet-stream'

-         else:

-             mimetype = 'text/plain'

-     elif 'html' in mimetype:

-         mimetype = 'application/octet-stream'

-         headers['Content-Disposition'] = 'attachment'

- 

-     if mimetype.startswith('text/') and not encoding:

-         try:

-             encoding = pagure.lib.encoding_utils.guess_encoding(

-                 ktc.to_bytes(data))

-         except pagure.exceptions.PagureException:

-             # We cannot decode the file, so bail but warn the admins

-             _log.exception('File could not be decoded')

- 

-     if encoding:

-         mimetype += '; charset={encoding}'.format(encoding=encoding)

-     headers['Content-Type'] = mimetype

- 

-     return (data, 200, headers)

+     return (data, 200, pagure.lib.mimetype.get_type_headers(filename, data))

  

  

  @APP.route('/<repo>/issue/<int:issueid>/comment/<int:commentid>/edit',

file modified
+2 -30
@@ -39,13 +39,12 @@ 

  from pygments.filters import VisibleWhitespaceFilter

  from sqlalchemy.exc import SQLAlchemyError

  

- import mimetypes

- 

  from binaryornot.helpers import is_binary_string

  

  import pagure.exceptions

  import pagure.lib

  import pagure.lib.git

+ import pagure.lib.mimetype

  import pagure.lib.plugins

  import pagure.forms

  import pagure
@@ -605,8 +604,6 @@ 

      if isinstance(commit, pygit2.Tag):

          commit = commit.get_object()

  

-     mimetype = None

-     encoding = None

      if filename:

          if isinstance(commit, pygit2.Blob):

              content = commit
@@ -616,7 +613,6 @@ 

          if not content or isinstance(content, pygit2.Tree):

              flask.abort(404, 'File not found')

  

-         mimetype, encoding = mimetypes.guess_type(filename)

          data = repo_obj[content.oid].data

      else:

          if commit.parents:
@@ -635,31 +631,7 @@ 

      if not data:

          flask.abort(404, 'No content found')

  

-     if not mimetype and data[:2] == '#!':

-         mimetype = 'text/plain'

- 

-     headers = {}

-     if not mimetype:

-         if '\0' in data:

-             mimetype = 'application/octet-stream'

-         else:

-             mimetype = 'text/plain'

-     elif 'html' in mimetype:

-         mimetype = 'application/octet-stream'

-         headers['Content-Disposition'] = 'attachment'

- 

-     if mimetype.startswith('text/') and not encoding:

-         try:

-             encoding = encoding_utils.guess_encoding(ktc.to_bytes(data))

-         except pagure.exceptions.PagureException:

-             # We cannot decode the file, so bail but warn the admins

-             _log.exception('File could not be decoded')

- 

-     if encoding:

-         mimetype += '; charset={encoding}'.format(encoding=encoding)

-     headers['Content-Type'] = mimetype

- 

-     return (data, 200, headers)

+     return (data, 200, pagure.lib.mimetype.get_type_headers(filename, data))

  

  

  @APP.route('/<repo>/blame/<path:filename>')

@@ -0,0 +1,61 @@ 

+ # -*- coding: utf-8 -*-

+ """

+ Tests for :module:`pagure.lib.mimetype`.

+ """

+ 

+ import os

+ import unittest

+ import sys

+ 

+ sys.path.insert(0, os.path.join(os.path.dirname(

+     os.path.abspath(__file__)), '..'))

+ 

+ from pagure.lib import mimetype

+ from pagure import exceptions

+ 

+ 

+ class TestMIMEType(unittest.TestCase):

+     def test_guess_type(self):

+         dataset = [

+             ('hello.html', None, 'text/html', None),

+             ('hello.html', '#!', 'text/html', 'ascii'),

+             ('hello', '#!', 'text/plain', 'ascii'),

+             ('hello.jpg', None, 'image/jpeg', None),

+             ('hello.jpg', '#!', 'image/jpeg', None),

+             ('hello.jpg', '\0', 'image/jpeg', None),

+             (None, '😋', 'text/plain', 'utf-8'),

+             ('hello', '\0', 'application/octet-stream', None),

+             ('hello', None, None, None)

+         ]

+         for data in dataset:

+             result = mimetype.guess_type(data[0], data[1])

+             self.assertEqual((data[2], data[3]), result)

+ 

+     def test_get_html_file_headers(self):

+         result = mimetype.get_type_headers('hello.html', None)

+         expected = {

+             'Content-Type': 'application/octet-stream',

+             'Content-Disposition': 'attachment',

+             'X-Content-Type-Options': 'nosniff'

+         }

+         self.assertEqual(result, expected)

+ 

+     def test_get_normal_headers(self):

+         dataset = [

+             ('hello', '#!', 'text/plain; charset=ascii'),

+             ('hello.jpg', None, 'image/jpeg'),

+             ('hello.jpg', '#!', 'image/jpeg'),

+             ('hello.jpg', '\0', 'image/jpeg'),

+             (None, '😋', 'text/plain; charset=utf-8'),

+             ('hello', '\0', 'application/octet-stream')

+         ]

+         for data in dataset:

+             result = mimetype.get_type_headers(data[0], data[1])

+             self.assertEqual(result['Content-Type'], data[2])

+ 

+     def test_get_none_header(self):

+         self.assertIsNone(mimetype.get_type_headers('hello', None))

+ 

+ 

+ if __name__ == '__main__':

+     unittest.main(verbosity=2)

refactor the origin code to pagure.lib.mimetype, so we can reuse it.
guess the mimetype based on filename and content in docs server, this also fixes https://pagure.io/pagure/issue/496

rebased

6 years ago

I recommend a docblock here that explains the function's purpose, return value, and return type, and documents its arguments and their types.

It looks like there's a trailing character on the end of this line.

It looks like there's a trailing character on the end of this line.

rebased

6 years ago

@bowlofeggs Thanks for reviewing, updated

I recommend tests, but LGTM otherwise.

1 new commit added

  • Add tests for pagure.lib.mimetype
6 years ago

Added some tests, no sure whether the test cases are enough :P

Please also add the X-Content-Type-Options header with value "nosniff".

How about we drop the manual hackery?
I'd say just get rid of this case.

I'd say we should add the Content-Disposition header to application/ types as well, to prevent application/javascript etc

so you mean to remove all the codes that guess the mimetype based on the file content?

The manual one we do here by detecting #!, yes. Not removing the mimetypes call.

rebased

6 years ago

all comments are addressed, and rebased to 2 commits

rebased

6 years ago

Sorry I dropped the ball on this PR, are you still interesting it getting it merged? Or would you prefer someone else to take it over?

Please go head to take it over. I haven't tracked the new changes in pagure's code base for a while.

@cverna finished this PR in https://pagure.io/pagure/pull-request/2778

Thanks for working on this @zhsj & @cverna it's nice to finally have it in! :)

Pull-Request has been closed by pingou

6 years ago