From df55e8b96b5a83c5b83496511c06f061a7219132 Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Jan 19 2021 15:55:56 +0000 Subject: Improve cchardet integration Signed-off-by: Pierre-Yves Chibon --- diff --git a/pagure/lib/encoding_utils.py b/pagure/lib/encoding_utils.py index c855e08..2746f81 100644 --- a/pagure/lib/encoding_utils.py +++ b/pagure/lib/encoding_utils.py @@ -51,14 +51,23 @@ def detect_encodings(data): # of the detector to bias the utf-8 result. if cchardet is not None: detector = cchardet.UniversalDetector() + detector.reset() + detector.feed(data) + detector.close() + result = detector.result else: detector = universaldetector.UniversalDetector() - detector.reset() - detector.feed(data) - result = detector.close() - if not result: + detector.reset() + detector.feed(data) + result = detector.close() + + if not result or not result["encoding"]: return {"utf-8": 1.0} encodings = {result["encoding"]: result["confidence"]} + + if cchardet: + return encodings + if ch_version[0] in ("3", "4"): for prober in detector._charset_probers: if hasattr(prober, "probers"): diff --git a/tests/test_pagure_flask_ui_repo.py b/tests/test_pagure_flask_ui_repo.py index 7658344..460dd63 100644 --- a/tests/test_pagure_flask_ui_repo.py +++ b/tests/test_pagure_flask_ui_repo.py @@ -3176,16 +3176,10 @@ class PagureFlaskRepotests(tests.Modeltests): output = self.app.get("/test/raw/master") self.assertEqual(output.status_code, 200) output_text = output.get_data(as_text=True) - if cchardet is not None: - self.assertEqual( - output.headers["Content-Type"].lower(), - "text/plain; charset=utf-8", - ) - else: - self.assertEqual( - output.headers["Content-Type"].lower(), - "text/plain; charset=ascii", - ) + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=ascii", + ) self.assertIn(":Author: Pierre-Yves Chibon", output_text) # Add some more content to the repo @@ -3204,16 +3198,10 @@ class PagureFlaskRepotests(tests.Modeltests): # View in a branch output = self.app.get("/test/raw/master/f/sources") - if cchardet is not None: - self.assertEqual( - output.headers["Content-Type"].lower(), - "text/plain; charset=utf-8", - ) - else: - self.assertEqual( - output.headers["Content-Type"].lower(), - "text/plain; charset=ascii", - ) + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=ascii", + ) self.assertEqual(output.status_code, 200) output_text = output.get_data(as_text=True) self.assertIn("foo\n bar", output_text) @@ -3264,16 +3252,10 @@ class PagureFlaskRepotests(tests.Modeltests): output = self.app.get("/test/raw/master") self.assertEqual(output.status_code, 200) output_text = output.get_data(as_text=True) - if cchardet is not None: - self.assertEqual( - output.headers["Content-Type"].lower(), - "text/plain; charset=utf-8", - ) - else: - self.assertEqual( - output.headers["Content-Type"].lower(), - "text/plain; charset=ascii", - ) + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=ascii", + ) self.assertTrue( output_text.startswith("diff --git a/test_binary b/test_binary\n") ) @@ -3311,16 +3293,10 @@ class PagureFlaskRepotests(tests.Modeltests): output = self.app.get("/fork/pingou/test3/raw/master/f/sources") self.assertEqual(output.status_code, 200) output_text = output.get_data(as_text=True) - if cchardet is not None: - self.assertEqual( - output.headers["Content-Type"].lower(), - "text/plain; charset=utf-8", - ) - else: - self.assertEqual( - output.headers["Content-Type"].lower(), - "text/plain; charset=ascii", - ) + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=ascii", + ) self.assertIn("foo\n bar", output_text) def test_view_commit(self): diff --git a/tests/test_pagure_lib_encoding_utils.py b/tests/test_pagure_lib_encoding_utils.py index aff7d8b..cc3ac4b 100644 --- a/tests/test_pagure_lib_encoding_utils.py +++ b/tests/test_pagure_lib_encoding_utils.py @@ -32,7 +32,7 @@ class TestGuessEncoding(unittest.TestCase): data = "Twas bryllyg, and the slythy toves did gyre and gymble" result = encoding_utils.guess_encoding(data.encode("ascii")) if cchardet is not None: - self.assertEqual(result, "utf-8") + self.assertEqual(result, "ASCII") else: self.assertEqual(result, "ascii") @@ -46,11 +46,14 @@ class TestGuessEncoding(unittest.TestCase): data = "Šabata".encode("utf-8") result = encoding_utils.guess_encoding(data) chardet_result = chardet.detect(data) - self.assertEqual(result, "utf-8") - if chardet.__version__[0] == "3": - self.assertEqual(chardet_result["encoding"], "ISO-8859-9") + if cchardet: + self.assertEqual(result, "WINDOWS-1250") else: - self.assertEqual(chardet_result["encoding"], "ISO-8859-2") + self.assertEqual(result, "utf-8") + if chardet.__version__[0] in ("3", "4"): + self.assertEqual(chardet_result["encoding"], "ISO-8859-9") + else: + self.assertEqual(chardet_result["encoding"], "ISO-8859-2") def test_guess_encoding_no_data(self): """ Test encoding_utils.guess_encoding() with an empty string """ @@ -64,25 +67,22 @@ class TestGuessEncodings(unittest.TestCase): data = "Šabata".encode("utf-8") result = encoding_utils.guess_encodings(data) chardet_result = chardet.detect(data) - if chardet.__version__[0] == "3": - # The first three have different confidence values - if cchardet is not None: - expexted_list = ["utf-8"] - # The last one in the list (which apparently has only one) - self.assertEqual(result[-1].encoding, "utf-8") - else: + if cchardet is not None: + # The last one in the list (which apparently has only one) + self.assertEqual(result[-1].encoding, "WINDOWS-1250") + else: + if chardet.__version__[0] in ("3", "4"): + # The first three have different confidence values expexted_list = ["utf-8", "ISO-8859-9", "ISO-8859-1"] # This is the one with the least confidence self.assertEqual(result[-1].encoding, "windows-1255") - self.assertListEqual( - [encoding.encoding for encoding in result][:3], expexted_list - ) - - # The values in the middle of the list all have the same confidence - # value and can't be sorted reliably: use sets. - if cchardet is not None: - expected_list = sorted(["utf-8"]) - else: + self.assertListEqual( + [encoding.encoding for encoding in result][:3], + expexted_list, + ) + + # The values in the middle of the list all have the same confidence + # value and can't be sorted reliably: use sets. expected_list = sorted( [ "utf-8", @@ -107,17 +107,17 @@ class TestGuessEncodings(unittest.TestCase): "windows-1255", ] ) - self.assertListEqual( - sorted(set([encoding.encoding for encoding in result])), - expected_list, - ) - self.assertEqual(chardet_result["encoding"], "ISO-8859-9") - else: - self.assertListEqual( - [encoding.encoding for encoding in result], - ["utf-8", "ISO-8859-2", "windows-1252"], - ) - self.assertEqual(chardet_result["encoding"], "ISO-8859-2") + self.assertListEqual( + sorted(set([encoding.encoding for encoding in result])), + expected_list, + ) + self.assertEqual(chardet_result["encoding"], "ISO-8859-9") + else: + self.assertListEqual( + [encoding.encoding for encoding in result], + ["utf-8", "ISO-8859-2", "windows-1252"], + ) + self.assertEqual(chardet_result["encoding"], "ISO-8859-2") def test_guess_encodings_no_data(self): """ Test encoding_utils.guess_encodings() with an emtpy string """ @@ -128,7 +128,12 @@ class TestGuessEncodings(unittest.TestCase): class TestDecode(unittest.TestCase): def test_decode(self): """ Test encoding_utils.decode() """ - data = "Šabata" + data = ( + "This is a little longer text for testing Šabata's encoding. " + "With more characters, let's see if it become more clear as to what " + "encoding should be used for this. We'll include from french words " + "in there for non-ascii: français, gagné!" + ) self.assertEqual(data, encoding_utils.decode(data.encode("utf-8"))) diff --git a/tests/test_pagure_lib_mimetype.py b/tests/test_pagure_lib_mimetype.py index 8c2f4a3..11b079d 100644 --- a/tests/test_pagure_lib_mimetype.py +++ b/tests/test_pagure_lib_mimetype.py @@ -30,13 +30,13 @@ class TestMIMEType(unittest.TestCase): "hello.html", b"#!", "text/html", - "ascii" if cchardet is None else "utf-8", + "ascii" if cchardet is None else "ASCII", ), ( "hello", b"#!", "text/plain", - "ascii" if cchardet is None else "utf-8", + "ascii" if cchardet is None else "ASCII", ), ("hello.jpg", None, "image/jpeg", None), ("hello.jpg", b"#!", "image/jpeg", None), @@ -70,7 +70,7 @@ class TestMIMEType(unittest.TestCase): b"#!", "text/plain; charset=ascii" if cchardet is None - else "text/plain; charset=utf-8", + else "text/plain; charset=ASCII", ), ("hello.jpg", None, "image/jpeg"), ("hello.jpg", b"#!", "image/jpeg"),