#2778 Improve mimetype guess - old PR 2288
Merged 7 years ago by pingou. Opened 7 years ago by cverna.
cverna/pagure pr/2288  into  master

file modified
+11 -17
@@ -20,6 +20,7 @@ 

  import pagure.doc_utils

  import pagure.exceptions

  import pagure.lib

+ import pagure.lib.mimetype

  import pagure.forms

  

  # Create the application.
@@ -102,28 +103,29 @@ 

          repo_obj, commit.tree, path)

  

      if blob_or_tree is None:

-         return (tree_obj, None, False, extended)

+         return (tree_obj, None, None)

  

      if not repo_obj[blob_or_tree.oid]:

          # Not tested and no idea how to test it, but better safe than sorry

          flask.abort(404, 'File not found')

  

      if isinstance(blob_or_tree, pygit2.TreeEntry):  # Returned a file

-         ext = os.path.splitext(blob_or_tree.name)[1]

+         filename = blob_or_tree.name

+         name, ext = os.path.splitext(filename)

          blob_obj = repo_obj[blob_or_tree.oid]

          if not is_binary_string(blob_obj.data):

              try:

                  content, safe = pagure.doc_utils.convert_readme(

                      blob_obj.data, ext)

+                 if safe:

+                     filename = name + '.html'

              except pagure.exceptions.PagureEncodingException:

-                 safe = False

                  content = blob_obj.data

          else:

-             safe = True

              content = blob_obj.data

  

      tree = sorted(tree_obj, key=lambda x: x.filemode)

-     return (tree, content, safe, extended)

+     return (tree, content, filename)

  

  

  @APP.route('/<repo>/')
@@ -170,7 +172,6 @@ 

  

      content = None

      tree = None

-     safe = False

      if not filename:

          path = ['']

      else:
@@ -178,24 +179,14 @@ 

  

      if commit:

          try:

-             (tree, content, safe, extended) = __get_tree_and_content(

+             (tree, content, filename) = __get_tree_and_content(

                  repo_obj, commit, path)

-             if extended:

-                 filename += '/'

          except pagure.exceptions.FileNotFoundException as err:

              flask.flash(err.message, 'error')

          except Exception as err:

              _log.exception(err)

              flask.abort(500, 'Unkown error encountered and reported')

  

-     mimetype = None

-     if not filename:

-         pass

-     elif filename.endswith('.css'):

-         mimetype = 'text/css'

-     elif filename.endswith('.js'):

-         mimetype = 'application/javascript'

- 

      if not content:

          if not tree or not len(tree):

              flask.abort(404, 'No content found is the repository')
@@ -208,5 +199,8 @@ 

              html += '<ul><a href="{0}">{1}</a></ul>'.format(name, name)

          html += '</li>'

          content = TMPL_HTML.format(content=html)

+         mimetype = 'text/html'

+     else:

+         mimetype, _ = pagure.lib.mimetype.guess_type(filename, content)

  

      return flask.Response(content, mimetype=mimetype)

@@ -0,0 +1,63 @@ 

+ # -*- coding: utf-8 -*-

+ import logging

+ import mimetypes

+ import kitchen.text.converters as ktc

+ import pagure.lib.encoding_utils

+ 

+ 

+ _log = logging.getLogger(__name__)

+ 

+ 

+ def guess_type(filename, data):

+     '''

+     Guess the type of a file based on its filename and data.

+ 

+     Return value is a tuple (type, encoding) where type or encoding is None

+     if it can't be guessed.

+ 

+     :param filename: file name string

+     :param data: file data string

+     '''

+     mimetype = None

+     encoding = None

+     if filename:

+         mimetype, encoding = mimetypes.guess_type(filename)

+     if data:

+         if not mimetype:

+             if '\0' in data:

+                 mimetype = 'application/octet-stream'

+             else:

+                 mimetype = 'text/plain'

+ 

+         if mimetype.startswith('text/') and not encoding:

+             try:

+                 encoding = pagure.lib.encoding_utils.guess_encoding(

+                     ktc.to_bytes(data))

+             except pagure.exceptions.PagureException:  # pragma: no cover

+                 # We cannot decode the file, so bail but warn the admins

+                 _log.exception('File could not be decoded')

+ 

+     return mimetype, encoding

+ 

+ 

+ def get_type_headers(filename, data):

+     '''

+     Get the HTTP headers used for downloading or previewing the file.

+ 

+     If the file is html, it will return headers which make browser start

+     downloading.

+ 

+     :param filename: file name string

+     :param data: file data string

+     '''

+     mimetype, encoding = guess_type(filename, data)

+     if not mimetype:

+         return None

+     headers = {'X-Content-Type-Options': 'nosniff'}

+     if 'html' in mimetype or 'javascript' in mimetype:

+         mimetype = 'application/octet-stream'

+         headers['Content-Disposition'] = 'attachment'

+     if encoding:

+         mimetype += '; charset={encoding}'.format(encoding=encoding)

+     headers['Content-Type'] = mimetype

+     return headers

file modified
+4 -31
@@ -28,13 +28,10 @@ 

  from sqlalchemy.exc import SQLAlchemyError

  from binaryornot.helpers import is_binary_string

  

- import kitchen.text.converters as ktc

- import mimetypes

- 

  import pagure.doc_utils

  import pagure.exceptions

  import pagure.lib

- import pagure.lib.encoding_utils

+ import pagure.lib.mimetype

  import pagure.forms

  from pagure import (APP, SESSION, __get_file_in_tree,

                      login_required, authenticated, urlpattern)
@@ -1343,7 +1340,8 @@ 

  

      repo = flask.g.repo

  

-     mimetype, encoding = mimetypes.guess_type(filename)

+     if not repo.settings.get('issue_tracker', True):

+         flask.abort(404, 'No issue tracker found for this project')

  

      attachdir = os.path.join(APP.config['ATTACHMENTS_FOLDER'], repo.fullname)

      attachpath = os.path.join(attachdir, filename)
@@ -1397,32 +1395,7 @@ 

              form=pagure.forms.ConfirmationForm(),

          )

  

-     if not mimetype and data[:2] == '#!':

-         mimetype = 'text/plain'

- 

-     headers = {}

-     if not mimetype:

-         if '\0' in data:

-             mimetype = 'application/octet-stream'

-         else:

-             mimetype = 'text/plain'

-     elif 'html' in mimetype:

-         mimetype = 'application/octet-stream'

-         headers['Content-Disposition'] = 'attachment'

- 

-     if mimetype.startswith('text/') and not encoding:

-         try:

-             encoding = pagure.lib.encoding_utils.guess_encoding(

-                 ktc.to_bytes(data))

-         except pagure.exceptions.PagureException:

-             # We cannot decode the file, so bail but warn the admins

-             _log.exception('File could not be decoded')

- 

-     if encoding:

-         mimetype += '; charset={encoding}'.format(encoding=encoding)

-     headers['Content-Type'] = mimetype

- 

-     return (data, 200, headers)

+     return (data, 200, pagure.lib.mimetype.get_type_headers(filename, data))

  

  

  @APP.route('/<repo>/issue/<int:issueid>/comment/<int:commentid>/edit',

file modified
+2 -30
@@ -38,13 +38,12 @@ 

  from pygments.filters import VisibleWhitespaceFilter

  from sqlalchemy.exc import SQLAlchemyError

  

- import mimetypes

- 

  from binaryornot.helpers import is_binary_string

  

  import pagure.exceptions

  import pagure.lib

  import pagure.lib.git

+ import pagure.lib.mimetype

  import pagure.lib.plugins

  import pagure.lib.tasks

  import pagure.forms
@@ -614,8 +613,6 @@ 

      if isinstance(commit, pygit2.Tag):

          commit = commit.get_object()

  

-     mimetype = None

-     encoding = None

      if filename:

          if isinstance(commit, pygit2.Blob):

              content = commit
@@ -625,7 +622,6 @@ 

          if not content or isinstance(content, pygit2.Tree):

              flask.abort(404, 'File not found')

  

-         mimetype, encoding = mimetypes.guess_type(filename)

          data = repo_obj[content.oid].data

      else:

          if commit.parents:
@@ -644,31 +640,7 @@ 

      if not data:

          flask.abort(404, 'No content found')

  

-     if not mimetype and data[:2] == '#!':

-         mimetype = 'text/plain'

- 

-     headers = {}

-     if not mimetype:

-         if '\0' in data:

-             mimetype = 'application/octet-stream'

-         else:

-             mimetype = 'text/plain'

-     elif 'html' in mimetype:

-         mimetype = 'application/octet-stream'

-         headers['Content-Disposition'] = 'attachment'

- 

-     if mimetype.startswith('text/') and not encoding:

-         try:

-             encoding = encoding_utils.guess_encoding(ktc.to_bytes(data))

-         except pagure.exceptions.PagureException:

-             # We cannot decode the file, so bail but warn the admins

-             _log.exception('File could not be decoded')

- 

-     if encoding:

-         mimetype += '; charset={encoding}'.format(encoding=encoding)

-     headers['Content-Type'] = mimetype

- 

-     return (data, 200, headers)

+     return (data, 200, pagure.lib.mimetype.get_type_headers(filename, data))

  

  

  @APP.route('/<repo>/blame/<path:filename>')

@@ -0,0 +1,60 @@ 

+ # -*- coding: utf-8 -*-

+ """

+ Tests for :module:`pagure.lib.mimetype`.

+ """

+ 

+ import os

+ import unittest

+ import sys

+ 

+ from pagure.lib import mimetype

+ 

+ sys.path.insert(0, os.path.join(os.path.dirname(

+     os.path.abspath(__file__)), '..'))

+ 

+ 

+ class TestMIMEType(unittest.TestCase):

+     def test_guess_type(self):

+         dataset = [

+             ('hello.html', None, 'text/html', None),

+             ('hello.html', '#!', 'text/html', 'ascii'),

+             ('hello', '#!', 'text/plain', 'ascii'),

+             ('hello.jpg', None, 'image/jpeg', None),

+             ('hello.jpg', '#!', 'image/jpeg', None),

+             ('hello.jpg', '\0', 'image/jpeg', None),

+             (None, '😋', 'text/plain', 'utf-8'),

+             ('hello', '\0', 'application/octet-stream', None),

+             ('hello', None, None, None)

+         ]

+         for data in dataset:

+             result = mimetype.guess_type(data[0], data[1])

+             self.assertEqual((data[2], data[3]), result)

+ 

+     def test_get_html_file_headers(self):

+         result = mimetype.get_type_headers('hello.html', None)

+         expected = {

+             'Content-Type': 'application/octet-stream',

+             'Content-Disposition': 'attachment',

+             'X-Content-Type-Options': 'nosniff'

+         }

+         self.assertEqual(result, expected)

+ 

+     def test_get_normal_headers(self):

+         dataset = [

+             ('hello', '#!', 'text/plain; charset=ascii'),

+             ('hello.jpg', None, 'image/jpeg'),

+             ('hello.jpg', '#!', 'image/jpeg'),

+             ('hello.jpg', '\0', 'image/jpeg'),

+             (None, '😋', 'text/plain; charset=utf-8'),

+             ('hello', '\0', 'application/octet-stream')

+         ]

+         for data in dataset:

+             result = mimetype.get_type_headers(data[0], data[1])

+             self.assertEqual(result['Content-Type'], data[2])

+ 

+     def test_get_none_header(self):

+         self.assertIsNone(mimetype.get_type_headers('hello', None))

+ 

+ 

+ if __name__ == '__main__':

+     unittest.main(verbosity=2)

This is a rebase of @zhsj's PR #2288. All comments have been addressed, and I am currently running the tests.

1 new commit added

  • Fixing pep8 in mimetype test
7 years ago

Tests looked good on my machine; I just had a few errors due to missing fedmsg.

rebased onto c28196f

7 years ago

In local testing this fixes a few issues in the doc server, and the unit tests are passing so far.

If the tests all pass, will merge, thanks! :)

Pull-Request has been merged by pingou

7 years ago