#36 Correctly decode using the original document charset and force re-encoding via UTF-8 BOM only when needed.
Merged 5 years ago by quidam. Opened 5 years ago by gioma1.
gioma1/librejs fix/charset  into  master

file modified
+40 -17
@@ -25,6 +25,9 @@ 

    to parse textual data through a decoder.

  */

  

+ const BOM = [0xEF, 0xBB, 0xBF];

+ const DECODER_PARAMS = {stream: true};

+ 

  class ResponseMetaData {

    constructor(request) {

      let {responseHeaders} = request;
@@ -37,7 +40,7 @@ 

          this.headers[propertyName] = h;

        }

      }

-     this.forcedUTF8 = false;

+     this.computedCharset = "";

    }

  

    get charset() {
@@ -49,34 +52,54 @@ 

        }

      }

      Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true });

-     return charset;

+     return this.computedCharset = charset;

    }

  

-   get isUTF8() {

-     return /^utf-?8$/i.test(this.charset);

-   }

+   decode(data) {

+     let charset = this.charset;

+     let decoder = this.createDecoder();

+     let text = decoder.decode(data, DECODER_PARAMS);

+     if (!charset && /html/i.test(this.contentType)) {

+       // missing HTTP charset, sniffing in content...

  

-   forceUTF8() {

-     if (!(this.forcedUTF8 || this.isUTF8)) {

-       let h = this.headers.contentType;

-       if (h) {

-         h.value = h.value.replace(/;\s*charset\s*=.*|$/, "; charset=utf8");

-         this.forcedUTF8 = true;

-       } // if the header doesn't exist the browser should default to UTF-8 anyway

+       if (data[0] === BOM[0] && data[1] === BOM[1] && data[2] === BOM[2]) {

+         // forced UTF-8, nothing to do

+         return text;

+       }

+ 

+       // let's try figuring out the charset from <meta> tags

+       let parser = new DOMParser();

+       let doc = parser.parseFromString(text, "text/html");

+       let meta = doc.querySelectorAll('meta[charset], meta[http-equiv="content-type"], meta[content*="charset"]');

+       for (let m of meta) {

+         charset = m.getAttribute("charset");

+         if (!charset) {

+           let match = m.getAttribute("content").match(/;\s*charset\s*=\s*([\w-]+)/i)

+           if (match) charset = match[1];

+         }

+         if (charset) {

+           decoder = this.createDecoder(charset, null);

+           if (decoder) {

+             this.computedCharset = charset;

+             return decoder.decode(data, DECODER_PARAMS);

+           }

+         }

+       }

      }

-     return this.forcedUTF8;

+     return text;

    }

  

-   createDecoder() {

-     if (this.charset) {

+   createDecoder(charset = this.charset, def = "latin1") {

+     if (charset) {

        try {

-         return new TextDecoder(this.charset);

+         return new TextDecoder(charset);

        } catch (e) {

          console.error(e);

        }

      }

-     return new TextDecoder("utf-8");

+     return def ? new TextDecoder(def) : null;

    }

  };

+ ResponseMetaData.UTF8BOM = new Uint8Array(BOM);

  

  module.exports = { ResponseMetaData };

file modified
+11 -13
@@ -90,8 +90,6 @@ 

      };

  

      filter.onstop = async event => {

- 

-       let params = {stream: true};

        // concatenate chunks

        let size = buffer.reduce((sum, chunk, n) => sum + chunk.byteLength, 0)

        let allBytes = new Uint8Array(size);
@@ -108,10 +106,10 @@ 

            response.text = await (await fetch(request.url, {cache: "reload", credentials: "include"})).text();

          } else {

            console.debug("It's a %s, trying to decode it as UTF-16.", request.type);

-           response.text = new TextDecoder("utf-16be").decode(allBytes);

+           response.text = new TextDecoder("utf-16be").decode(allBytes, {stream: true});

          }

        } else {

-         response.text = metaData.createDecoder().decode(allBytes, {stream: true});

+         response.text = metaData.decode(allBytes);

        }

        let editedText = null;

        try {
@@ -119,19 +117,19 @@ 

        } catch(e) {

          console.error(e);

        }

-       if (editedText !== null &&

-         (metaData.forcedUTF8 && request.type !== "script" ||

-           response.text !== editedText)) {

-         // if we changed the charset, the text or both, let's re-encode

-         filter.write(new TextEncoder().encode(editedText));

-       } else {

-         // ... otherwise pass all the raw bytes through

-         filter.write(allBytes);

+       if (editedText !== null) {

+         // we changed the content, let's re-encode

+         let encoded = new TextEncoder().encode(editedText);

+         // pre-pending the UTF-8 BOM will force the charset per HTML 5 specs

+         allBytes = new Uint8Array(encoded.byteLength + 3);

+         allBytes.set(ResponseMetaData.UTF8BOM, 0); // UTF-8 BOM

+         allBytes.set(encoded, 3);

        }

+       filter.write(allBytes);

        filter.close();

      }

  

-     return metaData.forceUTF8() ? {responseHeaders} : ResponseProcessor.ACCEPT;;

+     return ResponseProcessor.ACCEPT;

    }

  }

  

  1. Simplified the ResponseMetaData class by removing all the forced-UTF-8 signaling machinery
  2. Made ResponseMetaData standards-compliant by defaulting to "latin1" for decoding when no charset information is provided by the server.
  3. Using the BOM, rather than HTTP headers, as a more robust and timely means to enforce UTF-8 re-encoding when (and only when) we actually had to modify the response content.
  4. Added sniffing of in-content charset information (<meta> tags and BOM) when it's not delivered by HTTP headers.

All this should fix a whole bunch of decoding and encoding bugs, including https://lists.gnu.org/archive/html/bug-librejs/2019-02/msg00004.html and http://savannah.gnu.org/bugs/?54857

Pull-Request has been merged by quidam

5 years ago