#403 fixEncoding for changelogs
Merged 6 years ago by mikem. Opened 6 years ago by tkopecek.
tkopecek/koji issue349  into  master

file modified
+3 -2
@@ -9376,7 +9376,8 @@ 

              else:

                  results.append({'date': cldate, 'date_ts': cltime, 'author': clname, 'text': cltext})

  

-         return _applyQueryOpts(results, queryOpts)

+         results = _applyQueryOpts(results, queryOpts)

+         return koji.fixEncodingRecurse(results, remove_nonprintable=True)

  

      def cancelBuild(self, buildID):

          """Cancel the build with the given buildID
@@ -9884,7 +9885,7 @@ 

          headers = koji.get_header_fields(rpm_path, headers)

          for key, value in headers.items():

              if isinstance(value, basestring):

-                 headers[key] = koji.fixEncoding(value)

+                 headers[key] = koji.fixEncoding(value, remove_nonprintable=True)

          return headers

  

      queryRPMSigs = staticmethod(query_rpm_sigs)

file modified
+30 -13
@@ -2894,7 +2894,13 @@ 

      else:

          return '%s (%s)' % (method, arch)

  

- def fixEncoding(value, fallback='iso8859-15'):

+ CONTROL_CHARS = [chr(i) for i in range(32)]

+ NONPRINTABLE_CHARS = ''.join([c for c in CONTROL_CHARS if c not in '\r\n\t'])

+ def removeNonprintable(value):

+     # expects raw-encoded string, not unicode

+     return value.translate(None, NONPRINTABLE_CHARS)

+ 

+ def fixEncoding(value, fallback='iso8859-15', remove_nonprintable=False):

      """

      Convert value to a 'str' object encoded as UTF-8.

      If value is not valid UTF-8 to begin with, assume it is
@@ -2906,43 +2912,54 @@ 

      if isinstance(value, unicode):

          # value is already unicode, so just convert it

          # to a utf8-encoded str

-         return value.encode('utf8')

+         s = value.encode('utf8')

      else:

          # value is a str, but may be encoded in utf8 or some

          # other non-ascii charset.  Try to verify it's utf8, and if not,

          # decode it using the fallback encoding.

          try:

-             return value.decode('utf8').encode('utf8')

+             s = value.decode('utf8').encode('utf8')

          except UnicodeDecodeError:

-             return value.decode(fallback).encode('utf8')

+             s = value.decode(fallback).encode('utf8')

+     if remove_nonprintable:

+         return removeNonprintable(s)

+     else:

+         return s

  

  

- def fixEncodingRecurse(value, fallback='iso8859-15'):

+ def fixEncodingRecurse(value, fallback='iso8859-15', remove_nonprintable=False):

      """Recursively fix string encoding in an object

  

      Similar behavior to fixEncoding, but recursive

      """

      if isinstance(value, tuple):

-         return tuple([fixEncodingRecurse(x) for x in value])

+         return tuple([fixEncodingRecurse(x, fallback=fallback, remove_nonprintable=remove_nonprintable) for x in value])

      elif isinstance(value, list):

-         return list([fixEncodingRecurse(x) for x in value])

+         return list([fixEncodingRecurse(x, fallback=fallback, remove_nonprintable=remove_nonprintable) for x in value])

      elif isinstance(value, dict):

          ret = {}

          for k in value:

-             v = fixEncodingRecurse(value[k])

-             k = fixEncodingRecurse(k)

+             v = fixEncodingRecurse(value[k], fallback=fallback, remove_nonprintable=remove_nonprintable)

+             k = fixEncodingRecurse(k, fallback=fallback, remove_nonprintable=remove_nonprintable)

              ret[k] = v

          return ret

      elif isinstance(value, unicode):

-         return value.encode('utf8')

+         if remove_nonprintable:

+             return removeNonprintable(value.encode('utf8'))

+         else:

+             return value.encode('utf8')

      elif isinstance(value, str):

          # value is a str, but may be encoded in utf8 or some

          # other non-ascii charset.  Try to verify it's utf8, and if not,

          # decode it using the fallback encoding.

          try:

-             return value.decode('utf8').encode('utf8')

-         except UnicodeDecodeError, err:

-             return value.decode(fallback).encode('utf8')

+             s = value.decode('utf8').encode('utf8')

+         except UnicodeDecodeError:

+             s = value.decode(fallback).encode('utf8')

+         if remove_nonprintable:

+             return removeNonprintable(s)

+         else:

+             return s

      else:

          return value

  

Some real-world changelogs contains non-printable characters or invalid
unicode ones. xmlrpc fails on such strings, so we sanitize changelog
strings before passing it to client.

Related: https://pagure.io/koji/issue/349

typo here:

result['text'] = koji.fixEncoding(result['author'])

If I correct author to text the error returns. Using fixEncoding isn't wrong, but it's not going to remove control characters.

With the correct field, fixEncoding doesn't actually change the result of the handler (before encoding).

Hmm, typo was the reason, why it helped in my case...

rebased

6 years ago

except for \r\n\t, ascii characters below \x20 are not legal in xmlrpc. Luckily these never appear in multibyte utf8 representations. I think we can just filter them out.

ctrl = [chr(i) for i in range(32)]
filter = ''.join([c for c in ctrl if c not in '\r\n\t'])
good = bad.translate(None, filter)

1 new commit added

  • less restrictive filter
6 years ago

If we're going to apply this that broadly, we really need to check all the uses.

I'm really not sure that silently stripping these characters in all situations is the right thing. We're doing this to work around an xmlrpc limitation, but fixEncoding is used for other purposes.

So, what about setting remove_nonpritable to False as default and for now use it only in getChangelogEntries?

1 new commit added

  • don't remove nonprintable characters by default
6 years ago

looks like remove_nonprintable is not propagated through the recursion (or the fallback value for that matter)

1 new commit added

  • propagate parametere recursively
6 years ago

Added also fallback encoding to recursion.

4 new commits added

  • propagate parameters recursively
  • don't remove nonprintable characters by default
  • less restrictive filter
  • remove non-printable characters in fixEncoding
6 years ago

This patch is working fine in our production instance and fixes #349.

You need to also add it to the getRPMHeaders call end.

What is #349? its marked as private apparently and I can not access it

rebased

6 years ago

Commit a094176 fixes this pull-request

Pull-Request has been merged by mikem@redhat.com

6 years ago