#36 Correctly decode using the original document charset and force re-encoding via UTF-8 BOM only when needed.
Merged 5 years ago by quidam. Opened 5 years ago by gioma1.
gioma1/librejs fix/charset  into  master

file modified
+40 -17
@@ -25,6 +25,9 @@ 

    to parse textual data through a decoder.

  */

  

+ const BOM = [0xEF, 0xBB, 0xBF];

+ const DECODER_PARAMS = {stream: true};

+ 

  class ResponseMetaData {

    constructor(request) {

      let {responseHeaders} = request;
@@ -37,7 +40,7 @@ 

          this.headers[propertyName] = h;

        }

      }

-     this.forcedUTF8 = false;

+     this.computedCharset = "";

    }

  

    get charset() {
@@ -49,34 +52,54 @@ 

        }

      }

      Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true });

-     return charset;

+     return this.computedCharset = charset;

    }

  

-   get isUTF8() {

-     return /^utf-?8$/i.test(this.charset);

-   }

+   decode(data) {

+     let charset = this.charset;

+     let decoder = this.createDecoder();

+     let text = decoder.decode(data, DECODER_PARAMS);

+     if (!charset && /html/i.test(this.contentType)) {

+       // missing HTTP charset, sniffing in content...

  

-   forceUTF8() {

-     if (!(this.forcedUTF8 || this.isUTF8)) {

-       let h = this.headers.contentType;

-       if (h) {

-         h.value = h.value.replace(/;\s*charset\s*=.*|$/, "; charset=utf8");

-         this.forcedUTF8 = true;

-       } // if the header doesn't exist the browser should default to UTF-8 anyway

+       if (data[0] === BOM[0] && data[1] === BOM[1] && data[2] === BOM[2]) {

+         // forced UTF-8, nothing to do

+         return text;

+       }

+ 

+       // let's try figuring out the charset from <meta> tags

+       let parser = new DOMParser();

+       let doc = parser.parseFromString(text, "text/html");

+       let meta = doc.querySelectorAll('meta[charset], meta[http-equiv="content-type"], meta[content*="charset"]');

+       for (let m of meta) {

+         charset = m.getAttribute("charset");

+         if (!charset) {

+           let match = m.getAttribute("content").match(/;\s*charset\s*=\s*([\w-]+)/i)

+           if (match) charset = match[1];

+         }

+         if (charset) {

+           decoder = this.createDecoder(charset, null);

+           if (decoder) {

+             this.computedCharset = charset;

+             return decoder.decode(data, DECODER_PARAMS);

+           }

+         }

+       }

      }

-     return this.forcedUTF8;

+     return text;

    }

  

-   createDecoder() {

-     if (this.charset) {

+   createDecoder(charset = this.charset, def = "latin1") {

+     if (charset) {

        try {

-         return new TextDecoder(this.charset);

+         return new TextDecoder(charset);

        } catch (e) {

          console.error(e);

        }

      }

-     return new TextDecoder("utf-8");

+     return def ? new TextDecoder(def) : null;

    }

  };

+ ResponseMetaData.UTF8BOM = new Uint8Array(BOM);

  

  module.exports = { ResponseMetaData };

file modified
+11 -13
@@ -90,8 +90,6 @@ 

      };

  

      filter.onstop = async event => {

- 

-       let params = {stream: true};

        // concatenate chunks

        let size = buffer.reduce((sum, chunk, n) => sum + chunk.byteLength, 0)

        let allBytes = new Uint8Array(size);
@@ -108,10 +106,10 @@ 

            response.text = await (await fetch(request.url, {cache: "reload", credentials: "include"})).text();

          } else {

            console.debug("It's a %s, trying to decode it as UTF-16.", request.type);

-           response.text = new TextDecoder("utf-16be").decode(allBytes);

+           response.text = new TextDecoder("utf-16be").decode(allBytes, {stream: true});

          }

        } else {

-         response.text = metaData.createDecoder().decode(allBytes, {stream: true});

+         response.text = metaData.decode(allBytes);

        }

        let editedText = null;

        try {
@@ -119,19 +117,19 @@ 

        } catch(e) {

          console.error(e);

        }

-       if (editedText !== null &&

-         (metaData.forcedUTF8 && request.type !== "script" ||

-           response.text !== editedText)) {

-         // if we changed the charset, the text or both, let's re-encode

-         filter.write(new TextEncoder().encode(editedText));

-       } else {

-         // ... otherwise pass all the raw bytes through

-         filter.write(allBytes);

+       if (editedText !== null) {

+         // we changed the content, let's re-encode

+         let encoded = new TextEncoder().encode(editedText);

+         // pre-pending the UTF-8 BOM will force the charset per HTML 5 specs

+         allBytes = new Uint8Array(encoded.byteLength + 3);

+         allBytes.set(ResponseMetaData.UTF8BOM, 0); // UTF-8 BOM

+         allBytes.set(encoded, 3);

        }

+       filter.write(allBytes);

        filter.close();

      }

  

-     return metaData.forceUTF8() ? {responseHeaders} : ResponseProcessor.ACCEPT;;

+     return ResponseProcessor.ACCEPT;

    }

  }

  

  1. Simplified the ResponseMetaData class by removing all the forced-UTF-8 signaling machinery
  2. Made ResponseMetaData standards-compliant by defaulting to "latin1" for decoding when no charset information is provided by the server.
  3. Using the BOM, rather than HTTP headers, as a more robust and timely means to enforce UTF-8 re-encoding when (and only when) we actually had to modify the response content.
  4. Added sniffing of in-content charset information (<meta> tags and BOM) when it's not delivered by HTTP headers.

All this should fix a whole bunch of decoding and encoding bugs, including https://lists.gnu.org/archive/html/bug-librejs/2019-02/msg00004.html and http://savannah.gnu.org/bugs/?54857

Pull-Request has been merged by quidam

5 years ago