#4991 Add support for cchardet
Merged 3 years ago by pingou. Opened 3 years ago by pingou.

@@ -15,6 +15,7 @@ 

      python3-black \

      python3-flake8 \

      python3-pytest-xdist \

+     python3-cchardet \

      redis \

      which \

      git

file modified
+10 -2
@@ -15,7 +15,12 @@ 

  from collections import namedtuple

  import logging

  

- from chardet import universaldetector, __version__ as ch_version

+ try:

+     import cchardet

+     from cchardet import __version__ as ch_version

+ except ImportError:

+     cchardet = None

+     from chardet import universaldetector, __version__ as ch_version

  

  from pagure.exceptions import PagureEncodingException

  
@@ -44,7 +49,10 @@ 

  

      # We can't use ``chardet.detect`` because we want to dig in the internals

      # of the detector to bias the utf-8 result.

-     detector = universaldetector.UniversalDetector()

+     if cchardet is not None:

+         detector = cchardet.UniversalDetector()

+     else:

+         detector = universaldetector.UniversalDetector()

      detector.reset()

      detector.feed(data)

      result = detector.close()

@@ -20,6 +20,12 @@ 

  import time

  import os

  

+ cchardet = None

+ try:

+     import cchardet

+ except ImportError:

+     pass

+ 

  import pygit2

  import six

  from mock import ANY, patch, MagicMock
@@ -2763,9 +2769,16 @@ 

          output = self.app.get("/test/raw/master")

          self.assertEqual(output.status_code, 200)

          output_text = output.get_data(as_text=True)

-         self.assertEqual(

-             output.headers["Content-Type"].lower(), "text/plain; charset=ascii"

-         )

+         if cchardet is not None:

+             self.assertEqual(

+                 output.headers["Content-Type"].lower(),

+                 "text/plain; charset=utf-8",

+             )

+         else:

+             self.assertEqual(

+                 output.headers["Content-Type"].lower(),

+                 "text/plain; charset=ascii",

+             )

          self.assertIn(":Author: Pierre-Yves Chibon", output_text)

  

          # Add some more content to the repo
@@ -2784,9 +2797,16 @@ 

  

          # View in a branch

          output = self.app.get("/test/raw/master/f/sources")

-         self.assertEqual(

-             output.headers["Content-Type"].lower(), "text/plain; charset=ascii"

-         )

+         if cchardet is not None:

+             self.assertEqual(

+                 output.headers["Content-Type"].lower(),

+                 "text/plain; charset=utf-8",

+             )

+         else:

+             self.assertEqual(

+                 output.headers["Content-Type"].lower(),

+                 "text/plain; charset=ascii",

+             )

          self.assertEqual(output.status_code, 200)

          output_text = output.get_data(as_text=True)

          self.assertIn("foo\n bar", output_text)
@@ -2837,9 +2857,16 @@ 

          output = self.app.get("/test/raw/master")

          self.assertEqual(output.status_code, 200)

          output_text = output.get_data(as_text=True)

-         self.assertEqual(

-             output.headers["Content-Type"].lower(), "text/plain; charset=ascii"

-         )

+         if cchardet is not None:

+             self.assertEqual(

+                 output.headers["Content-Type"].lower(),

+                 "text/plain; charset=utf-8",

+             )

+         else:

+             self.assertEqual(

+                 output.headers["Content-Type"].lower(),

+                 "text/plain; charset=ascii",

+             )

          self.assertTrue(

              output_text.startswith("diff --git a/test_binary b/test_binary\n")

          )
@@ -2877,9 +2904,16 @@ 

          output = self.app.get("/fork/pingou/test3/raw/master/f/sources")

          self.assertEqual(output.status_code, 200)

          output_text = output.get_data(as_text=True)

-         self.assertEqual(

-             output.headers["Content-Type"].lower(), "text/plain; charset=ascii"

-         )

+         if cchardet is not None:

+             self.assertEqual(

+                 output.headers["Content-Type"].lower(),

+                 "text/plain; charset=utf-8",

+             )

+         else:

+             self.assertEqual(

+                 output.headers["Content-Type"].lower(),

+                 "text/plain; charset=ascii",

+             )

          self.assertIn("foo\n bar", output_text)

  

      def test_view_commit(self):

@@ -5,11 +5,18 @@ 

  

  from __future__ import unicode_literals, absolute_import

  

- import chardet

  import os

  import unittest

  import sys

  

+ cchardet = None

+ try:

+     import cchardet

+ except ImportError:

+     pass

+ 

+ import chardet

+ 

  sys.path.insert(

      0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")

  )
@@ -24,7 +31,10 @@ 

          """

          data = "Twas bryllyg, and the slythy toves did gyre and gymble"

          result = encoding_utils.guess_encoding(data.encode("ascii"))

-         self.assertEqual(result, "ascii")

+         if cchardet is not None:

+             self.assertEqual(result, "utf-8")

+         else:

+             self.assertEqual(result, "ascii")

  

      def test_guess_encoding_favor_utf_8(self):

          """
@@ -56,17 +66,24 @@ 

          chardet_result = chardet.detect(data)

          if chardet.__version__[0] == "3":

              # The first three have different confidence values

+             if cchardet is not None:

+                 expexted_list = ["utf-8"]

+                 # The last one in the list (which apparently has only one entry)

+                 self.assertEqual(result[-1].encoding, "utf-8")

+             else:

+                 expexted_list = ["utf-8", "ISO-8859-9", "ISO-8859-1"]

+                 # This is the one with the least confidence

+                 self.assertEqual(result[-1].encoding, "windows-1255")

              self.assertListEqual(

-                 [encoding.encoding for encoding in result][:3],

-                 ["utf-8", "ISO-8859-9", "ISO-8859-1"],

+                 [encoding.encoding for encoding in result][:3], expexted_list

              )

-             # This is the one with the least confidence

-             self.assertEqual(result[-1].encoding, "windows-1255")

+ 

              # The values in the middle of the list all have the same confidence

              # value and can't be sorted reliably: use sets.

-             self.assertEqual(

-                 set([encoding.encoding for encoding in result]),

-                 set(

+             if cchardet is not None:

+                 expected_list = sorted(["utf-8"])

+             else:

+                 expected_list = sorted(

                      [

                          "utf-8",

                          "ISO-8859-9",
@@ -89,7 +106,10 @@ 

                          "windows-1251",

                          "windows-1255",

                      ]

-                 ),

+                 )

+             self.assertListEqual(

+                 sorted(set([encoding.encoding for encoding in result])),

+                 expected_list,

              )

              self.assertEqual(chardet_result["encoding"], "ISO-8859-9")

          else:

@@ -9,6 +9,12 @@ 

  import unittest

  import sys

  

+ cchardet = None

+ try:

+     import cchardet

+ except ImportError:

+     pass

+ 

  from pagure.lib import mimetype

  

  sys.path.insert(
@@ -20,8 +26,18 @@ 

      def test_guess_type(self):

          dataset = [

              ("hello.html", None, "text/html", None),

-             ("hello.html", b"#!", "text/html", "ascii"),

-             ("hello", b"#!", "text/plain", "ascii"),

+             (

+                 "hello.html",

+                 b"#!",

+                 "text/html",

+                 "ascii" if cchardet is None else "utf-8",

+             ),

+             (

+                 "hello",

+                 b"#!",

+                 "text/plain",

+                 "ascii" if cchardet is None else "utf-8",

+             ),

              ("hello.jpg", None, "image/jpeg", None),

              ("hello.jpg", b"#!", "image/jpeg", None),

              ("hello.jpg", b"\0", "image/jpeg", None),
@@ -49,7 +65,13 @@ 

  

      def test_get_normal_headers(self):

          dataset = [

-             ("hello", b"#!", "text/plain; charset=ascii"),

+             (

+                 "hello",

+                 b"#!",

+                 "text/plain; charset=ascii"

+                 if cchardet is None

+                 else "text/plain; charset=utf-8",

+             ),

              ("hello.jpg", None, "image/jpeg"),

              ("hello.jpg", b"#!", "image/jpeg"),

              ("hello.jpg", b"\0", "image/jpeg"),

no initial comment

pretty please pagure-ci rebuild

3 years ago

2 new commits added

  • Make the F31/RPM testing environment use cchardet
  • Add support for using cchardet to detect files' encoding
3 years ago

2 new commits added

  • Make the F31/RPM testing environment use cchardet
  • Add support for using cchardet to detect files' encoding
3 years ago

2 new commits added

  • Make the F31/RPM testing environment use cchardet
  • Add support for using cchardet to detect files' encoding
3 years ago

rebased onto 1163ed9

3 years ago

Thanks for the reviews!

Pull-Request has been merged by pingou

3 years ago