#7 Refactoring and fixing the loading mechanism for documents, subdocuments and scripts
Merged 5 years ago by quidam. Opened 5 years ago by gioma1.
gioma1/librejs refactoring/loading into master

file added: bg/ResponseMetaData.js
@@ -0,0 +1,82 @@
+ /**
+ * GNU LibreJS - A browser add-on to block nonfree nontrivial JavaScript.
+ *
+ * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
+ *
+ * This file is part of GNU LibreJS.
+ *
+ * GNU LibreJS is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNU LibreJS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU LibreJS.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+ /**
+   This class parses HTTP response headers to extract both the
+   MIME Content-type and the character set to be used, if specified,
+   to parse textual data through a decoder.
+ */
+
+ class ResponseMetaData {
+   constructor(request) {
+     let {responseHeaders} = request;
+     this.headers = {};
+     for (let h of responseHeaders) {
+       if (/^\s*Content-(Type|Disposition)\s*$/i.test(h.name)) {
+         let propertyName = h.name.split("-")[1].trim();
+         propertyName = `content${propertyName.charAt(0).toUpperCase()}${propertyName.substring(1).toLowerCase()}`;
+         this[propertyName] = h.value;
+         this.headers[propertyName] = h;
+       }
+     }
+     this.forcedUTF8 = false;
+   }
+
+   get charset() {
+     let charset = "";
+     if (this.contentType) {
+       let m = this.contentType.match(/;\s*charset\s*=\s*(\S+)/);
+       if (m) {
+         charset = m[1];
+       }
+     }
+     Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true });
+     return charset;
+   }
+
+   get isUTF8() {
+     return /^utf-8$/i.test(this.charset);
+   }
+
+   forceUTF8() {
+     if (!(this.forcedUTF8 || this.isUTF8)) {
+       let h = this.headers.contentType;
+       if (h) {
+         h.value = h.value.replace(/;\s*charset\s*=.*|$/, "; charset=utf8");
+         this.forcedUTF8 = true;
+       } // if the header doesn't exist the browser should default to UTF-8 anyway
+     }
+     return this.forcedUTF8;
+   }
+
+   createDecoder() {
+     if (this.charset) {
+       try {
+         return new TextDecoder(this.charset);
+       } catch (e) {
+         console.error(e);
+       }
+     }
+     return new TextDecoder("utf-8");
+   }
+ };
+
+ module.exports = { ResponseMetaData };

file added: bg/ResponseProcessor.js
@@ -0,0 +1,110 @@
+ /**
+ * GNU LibreJS - A browser add-on to block nonfree nontrivial JavaScript.
+ *
+ * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
+ *
+ * This file is part of GNU LibreJS.
+ *
+ * GNU LibreJS is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNU LibreJS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU LibreJS.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+ /**
+   An abstraction layer over the StreamFilter API, allowing its clients to process
+   only the "interesting" HTML and script requests and leaving the others alone
+ */
+
+ let {ResponseMetaData} = require("./ResponseMetaData.js");
+
+ let listeners = new WeakMap();
+ let webRequestEvent = browser.webRequest.onHeadersReceived;
+
+ class ResponseProcessor {
+
+   static install(handler, types = ["main_frame", "sub_frame", "script"]) {
+     if (listeners.has(handler)) return false;
+     let listener =
+       request => new ResponseTextFilter(request).process(handler);
+     listeners.set(handler, listener);
+     webRequestEvent.addListener(
+   		listener,
+   		{urls: ["<all_urls>"], types},
+   		["blocking", "responseHeaders"]
+   	);
+     return true;
+   }
+
+   static uninstall(handler) {
+     let listener = listeners.get(handler);
+     if (listener) {
+       webRequestEvent.removeListener(listener);
+     }
+   }
+ }
+
+ class ResponseTextFilter {
+   constructor(request) {
+     this.request = request;
+     let {type, statusCode} = request;
+     let md = this.metaData = new ResponseMetaData(request);
+     this.canProcess = // we want to process html documents and scripts only
+       (statusCode < 300 || statusCode >= 400) && // skip redirections
+       !md.disposition && // skip forced downloads
+       (type === "script" || /\bhtml\b/i.test(md.contentType));
+   }
+
+   process(handler) {
+     if (!this.canProcess) return {};
+     let metaData = this.metaData;
+     let {requestId, responseHeaders} = this.request;
+     let filter = browser.webRequest.filterResponseData(requestId);
+     let buffer = [];
+
+     filter.ondata = event => {
+       buffer.push(event.data);
+     };
+
+     filter.onstop = async event => {
+       let decoder = metaData.createDecoder();
+       let params = {stream: true};
+       let text = this.text = buffer.map(
+         chunk => decoder.decode(chunk, params))
+         .join('');
+       let editedText = null;
+       try {
+         let response = {
+           request: this.request,
+           metaData,
+           text,
+         };
+         editedText = await handler(response);
+       } catch(e) {
+         console.error(e);
+       }
+       if (metaData.forcedUTF8 ||
+         editedText !== null && text !== editedText) {
+         // if we changed the charset, the text or both, let's re-encode
+         filter.write(new TextEncoder().encode(editedText));
+       } else {
+         // ... otherwise pass all the raw bytes through
+         for (let chunk of buffer) filter.write(chunk);
+       }
+
+       filter.disconnect();
+     }
+
+     return metaData.forceUTF8() ? {responseHeaders} : {};
+   }
+ }
+
+ module.exports = { ResponseProcessor };

file modified: main_background.js
+42 -100
@@ -25,6 +25,7 @@
  var jssha = require('jssha');
  var walk = require("acorn/dist/walk");
  var legacy_license_lib = require("./legacy_license_check.js");
+ var {ResponseProcessor} = require("./bg/ResponseProcessor");

  console.log("main_background.js");
  /**
@@ -853,7 +854,7 @@
  *		reason text
  *	]
  */
- function license_read(script_src, name){
+ function license_read(script_src, name, external = false){
  	
  	var reason_text = "";

@@ -970,7 +971,7 @@
  			}
  		edited = [true,response,"Page is whitelisted in preferences"];
  		}else{
- 			edited = license_read(response,scriptname);
+ 			edited = license_read(response,scriptname,index == -2);
  		}
  		var src_hash = hash(response);
  		var verdict = edited[0];
@@ -1066,35 +1067,28 @@
  	else return {};
  }

+
+
  /**
- *	This is a callback trigged from requests caused by script tags with the src="" attribute.
+ *	This listener gets called as soon as we've got all the HTTP headers, can guess
+ * content type and encoding, and therefore correctly parse HTML documents
+ * and external script inclusion in search of non-free JavaScript
  */

- function read_script(a){
- 	var GA = test_GA(a);
- 	if(GA !== false){
- 		return GA;
- 	}

- 	var filter = webex.webRequest.filterResponseData(a.requestId);
- 	var decoder = new TextDecoder("utf-8");
- 	var encoder = new TextEncoder();
- 	var str = "";
-
- 	filter.onstop = event => {
- 		dbg_print("read_script "+a.url);
- 		var res = test_url_whitelisted(a.url);
- 		res.then(function(whitelisted){
- 			var edit_script = get_script(str,a.url,a["tabId"],whitelisted,-1);
- 			edit_script.then(function(edited){
- 				filter.write(encoder.encode(edited));
- 				filter.disconnect();
- 			});
- 		});
- 	}
-         filter.ondata = event => {
-                 str += decoder.decode(event.data, {stream: true});
-         }
- 	return {};
+ async function responseHandler(response) {
+ 	let {url, type} = response.request;
+ 	let whitelisted = await test_url_whitelisted(url);
+ 	let handle_it = type === "script" ? handle_script : handle_html;
+ 	return await handle_it(response, whitelisted);
+ }
+
+ /**
+ * Here we handle external script requests
+ */
+ async function handle_script(response, whitelisted){
+ 	let {text, request} = response;
+ 	let {url, tabId} = request;
+   return await get_script(text, url, tabId, whitelisted, -2);
  }

  /**
@@ -1260,61 +1254,21 @@
  }

  /**
- * Callback for main frame requests
- *
+ * Here we handle html document responses
  */
- function read_document(a){
- 	var GA = test_GA(a);
- 	if(GA != false){
- 		return GA;
- 	}
- 	var str = "";
- 	var filter = webex.webRequest.filterResponseData(a.requestId);
- 	var decoder = new TextDecoder("utf-8");
- 	var encoder = new TextEncoder();
- 	filter.onerror = event => {
- 		dbg_print("%c Error in getting document","color:red");
- 	}
- 	filter.onstop = event => {
- 		time = Date.now();
- 		delete unused_data[a["tabId"]];
- 		webex.browserAction.setBadgeText({
- 			text: "✓",
- 			tabId: a["tabId"]
- 		});
- 		webex.browserAction.setBadgeBackgroundColor({
- 			color: "green",
- 			tabId: a["tabId"]
- 		});
- 		var test = new ArrayBuffer();
- 		var res = test_url_whitelisted(a.url);
- 		res.then(function(whitelisted){
- 			var edit_page;
- 			// TODO Fix this ugly HACK!
- 			if(! str.includes("<html")){
- 				dbg_print("not html");
- 				filter.write(encoder.encode(str));
- 				filter.disconnect();
- 				return {};
- 			}
- 			if(whitelisted == true){
- 				dbg_print("WHITELISTED");
- 				// Doesn't matter if this is accepted or blocked, it will still be whitelisted
- 				filter.write(encoder.encode(str));
- 				filter.disconnect();
- 			} else{
- 				edit_page = edit_html(str,a.url,a["tabId"],false);
- 				edit_page.then(function(edited){
- 					filter.write(encoder.encode(edited));
- 					filter.disconnect();
- 				});
- 			}
- 		});
- 	}
- 	filter.ondata = event => {
- 		str += decoder.decode(event.data, {stream: true});
- 	}
- 	return {};
+ async function handle_html(response, whitelisted) {
+ 	let {text, request} = response;
+ 	let {url, tabId} = request;
+ 	delete unused_data[tabId];
+ 	browser.browserAction.setBadgeText({
+ 		text: "✓",
+ 		tabId
+ 	});
+ 	browser.browserAction.setBadgeBackgroundColor({
+ 		color: "green",
+ 		tabId
+ 	});
+ 	return await edit_html(text, url, tabId, false);
  }

  /**
@@ -1329,32 +1283,20 @@
  	webex.tabs.onRemoved.addListener(delete_removed_tab_info);

  	// Prevents Google Analytics from being loaded from Google servers
- 	var all_types = [
+ 	let all_types = [
  		"beacon", "csp_report", "font", "image", "imageset", "main_frame", "media",
  		"object", "object_subrequest", "ping", "script", "stylesheet", "sub_frame",
  		"web_manifest", "websocket", "xbl", "xml_dtd", "xmlhttprequest", "xslt",
  		"other"
- 	]
- 	// Analyzes remote scripts
+ 	];
  	webex.webRequest.onBeforeRequest.addListener(
  		block_ga,
- 		{urls:["<all_urls>"], types:all_types},
- 		["blocking"]
- 	);
-
- 	// Analyzes remote scripts
- 	webex.webRequest.onBeforeRequest.addListener(
- 		read_script,
- 		{urls:["<all_urls>"], types:["script"]},
- 		["blocking"]
- 	);
-
- 	// Analyzes the scripts inside of HTML
- 	webex.webRequest.onBeforeRequest.addListener(
- 		read_document,
- 		{urls:["<all_urls>"], types:["main_frame"]},
+ 		{urls: ["<all_urls>"], types: all_types},
  		["blocking"]
  	);
+ 	
+ 	// Analyzes all the html documents and external scripts as they're loaded
+ 	ResponseProcessor.install(responseHandler);

  	legacy_license_lib.init();
  }
file modified: manifest.json
+2 -2
@@ -2,13 +2,13 @@
    "manifest_version": 2,
    "name": "GNU LibreJS [webExtensions]",
    "short_name": "LibreJS [experimental]",
-   "version": "7.14.1",
+   "version": "7.14.2",
    "author": "various",
    "description": "Only allows free and/or trivial Javascript to run.",
    "applications": {
      "gecko": {
  	  "id": "jid1-KtlZuoiikVfFew@jetpack",
- 	  "strict_min_version": "57.0"
+ 	  "strict_min_version": "60.0"
      }
    },
    "icons":{

This branch extracts the response processing machinery (except for the script analysis/editing details, which are left to a client callback) into a ResponseProcessor module which:

  • shields clients from the webRequest and StreamFilter API details;
  • automatically selects HTML documents and scripts for processing, skipping other kinds of resources loaded as main_frame or sub_frame and therefore reducing the risk of breaking them;
  • handles character sets as gracefully as possible, by properly configuring the TextDecoder for the declared encoding and by forcing UTF-8 on output if necessary*

* The native TextEncoder Web API, in its current specification, has dropped support for every encoding except UTF-8; therefore, when we use it to edit the response payload as text, we need to rewrite the "Content-Type" header as well, forcing "charset=utf-8" to remove any encoding mismatch. Another option would be a non-native alternative API supporting as many encodings on output as TextDecoder does on input.
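To make the mismatch concrete, here is a minimal sketch of the round-trip involved (the sample bytes and header value are invented for illustration; the header rewrite mirrors the replacement performed by forceUTF8() in the diff above):

  // Decode the payload with the charset declared by the server...
  let bytes = new Uint8Array([0xE8]);              // "è" encoded as ISO-8859-1
  let text = new TextDecoder("ISO-8859-1").decode(bytes);
  // ...but TextEncoder can only emit UTF-8, so the re-encoded payload
  // no longer matches the originally declared charset:
  let reEncoded = new TextEncoder().encode(text);  // Uint8Array [0xC3, 0xA8]
  // Hence the Content-Type header must be rewritten to match the new encoding:
  let contentType = "text/html; charset=ISO-8859-1";
  contentType = contentType.replace(/;\s*charset\s*=.*|$/, "; charset=utf-8");
  // contentType is now "text/html; charset=utf-8"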

We also added a build-time option to minify the browserify output, since this and further upcoming refactorings and bug fixes unavoidably increase the number of source files, multiplying comments and license boilerplate sections, which in turn may make bundle.js uncomfortably large.

if this 'uglifyify' dependency is to be used there should be another guard above such as:

  which uglifyify > /dev/null || (echo "can not find uglifyify" && false) || exit

can you explain more about what particular problem these patches solve or which features they are adding - or are they all purely refactoring?

i agree that this program is in dire need of refactoring; but patches that solve a problem, patches that add a feature, and patches that merely refactor should all be separated from each other

i would particularly like to ask why minification is desirable; but this PR seems to address multiple concerns

can you explain more about what particular problem these patches solve or which features they are adding - or are they all purely refactoring?

The specific concern addressed by this PR is "Fixing the loading mechanism", which had multiple bugs:

  • it didn't discriminate among different types of non-HTML resources loaded in windows/frames, e.g. images, potentially breaking their rendering
  • it did not handle sub frames
  • it did not handle charsets other than UTF-8

The fix is implemented through a refactoring which extracts the HTTP response processing machinery into a separate module (ResponseProcessor): this preemptively skips responses we don't want to process and provides content-type and charset management.
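For reference, this is roughly what a client of the new module looks like, based on the code in this PR (the handler name and body below are placeholders, not the actual LibreJS analysis code):

  let {ResponseProcessor} = require("./bg/ResponseProcessor");

  // The handler is invoked once the whole response body has been buffered and
  // decoded with the declared charset; the string it returns becomes the new
  // response body, re-encoded as UTF-8 whenever it differs from the original.
  async function exampleHandler(response) {
    let {request, text} = response;  // response.metaData is also available
    console.debug(`processing ${request.type} from ${request.url}`);
    return text;  // a real client would return the edited text instead
  }

  // Installs a blocking onHeadersReceived listener restricted by default to
  // main_frame, sub_frame and script requests; other types are left alone.
  ResponseProcessor.install(exampleHandler);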

The minification support was added because this and other work to come in this phase (e.g. to fix blacklisting/whitelisting) is adding source files whose growth could make it desirable; but I agree, it might be better placed in a different PR.
However, regarding the guard you suggest adding for uglifyify: since it's used as a browserify transform (a local npm module rather than an executable on the PATH), "which" will generally fail even when it is installed. Nevertheless, we'd better exit on any browserify error (whether triggered by a missing dependency or by another problem) before packaging the XPI. Browserify is verbose enough about its errors, so this should suffice:

browserify $OPTS main_background.js -o bundle.js || exit

On Sun, 15 Jul 2018 23:06:19 +0000 (UTC) Giorgio wrote:

The fix is implemented through a refactoring

that is where i must object - refactoring by definition should never change the behavior of the program - it really helps future readers of the commit history to separate those concerns - it should always be possible to do the refactoring first without changing behavior, and then make the behavior-changing fix in another commit on top of that - they could probably be included in the same PR; but separate commits would make the diffs much clearer semantically

On Sun, 15 Jul 2018 23:06:19 +0000 (UTC) Giorgio wrote:

The minification support was added because this and other work to come in this phase is adding source files whose growth could make it desirable; but I agree, it might be better placed in a different PR.

it should go in a separate PR because it is a separate feature - but more so because i would contend that minification is undesirable - it is a fairly minor issue either way; but the point can be argued that there is little value in it - this is a tiny program (137K according to mozilla) so there is little savings to be gained in terms of network bandwidth, drive space, memory usage, or anything else - leaving all the code in human-readable form allows users to read the code they already have in the .xpi without downloading the upstream source

On Sun, 15 Jul 2018 23:06:19 +0000 (UTC) Giorgio wrote:

However, regarding the guard you suggest adding for uglifyify,
we'd better exit on any browserify error

i think it is better engineering to be as clear and pedantic as possible in such cases - that extra LOC hurts nothing; but it makes it clear that the error was not a program bug but a user error

regarding refactoring - i would also invite you to notice some of the "feature" branches that exist in this repo such as:

  • 'comments' - for more thorough commenting of the source
  • 'housekeeping' - for anything else that does not change behavior (refactoring)
  • 'useless-files' - just because there were a lot of them

those were intended, as i described above, to separate different concerns - it is usually best never to target the master branch in a merge request; but to always target one such feature-specific development branch - feel free to add more branches if there are other general concerns or features

I would also prefer not to minify the code; the savings will likely be small since the XPI is compressed anyway, and shipping readable code is more desirable than the size saved, in particular given the topic at hand.

1 new commit added

  • Revert "Support for optional minification ("mini" build.sh argument)."
5 years ago

2 new commits added

  • Merge branch 'refactoring/loading' of ssh://pagure.io/forks/gioma1/librejs into refactoring/loading
  • Renamings to avoid use of the "content" word, per RMS request.
5 years ago

I would also prefer not to minify the code; the savings will likely be small since the XPI is compressed anyway, and shipping readable code is more desirable than the size saved, in particular given the topic at hand.

Fair. I reverted that commit and added the renamings requested by RMS to avoid dubious usages of the word "content".

I've tested the new loading mechanism and it is a huge improvement! I have a small patch which corrects the logic applied to external scripts; I'll send it your way after this set is merged.

Things to improve on the PR:

  • On the two new files (bg/*) I think only your copyright line is necessary as they don't derive from code by Nathan or me.
  • There are a lot of changes removing trailing spaces in the same commit as functional changes; they should be separate.
  • commit https://pagure.io/fork/gioma1/librejs/c/a25ff071bb9c828954019be8c18a291639845654 has changes to build.sh that you later revert; it would make more sense to correct that initial commit.

Notes for future tasks (no need to do them as part of this PR):

  • It would be good to unify the indentation styles (there is a mix of tabs and spaces) as a future task.
  • I think the scripts that are to be part of the bundle should all be put together, instead of having bg, evaluation_hash, nontrivial_utility, and scripts in the root dir; it makes it confusing to know where to look for things.

rebased onto eb929a0

5 years ago

I've tested the new loading mechanism and it is a huge improvement! I have a small patch which corrects the logic applied to external scripts; I'll send it your way after this set is merged.
Things to improve on the PR:
[...]

I've implemented your suggestions and rebased my patch, also merging yours about external scripts, as agreed on IRC.

Pull-Request has been merged by quidam

5 years ago