From 38289f25ad201775e5135367886844f391110f04 Mon Sep 17 00:00:00 2001 From: Jesse Trana Date: Wed, 4 Nov 2020 23:05:11 -0600 Subject: [PATCH 01/10] Making http-response more robust to demonstrate real-world usage --- http-response/README.md | 13 ++- http-response/background.js | 157 +++++++++++++++++++++++++++++++++--- http-response/manifest.json | 2 +- 3 files changed, 156 insertions(+), 16 deletions(-) diff --git a/http-response/README.md b/http-response/README.md index 40c23380..25d3f453 100755 --- a/http-response/README.md +++ b/http-response/README.md @@ -2,10 +2,19 @@ ## What it does -Listens to HTTP Responses from example.com and changes the body of the response as it comes through. So that the word "Example" on https://example.com becomes "WebExtension Example". +Listens to HTTP Responses from example.com and w3.org and changes "Example" to "WebExtension Example" and +"Test" to "WebExtension Test" in the web pages contents. ## What it shows -How to use the response parser on bytes. +A real-world example of WebRequest that shows three important details not always found in beginning examples: + - The accumulation of data through multiple calls to .ondata + - The decoding of binary data to text in a streaming fashion. + - Text decoding that tries to respect the page's reported encoding via Content-Type. + +The domain w3.org is included in the list of domains to allow for testing against [this suite of standardized tests](https://www.w3.org/2006/11/mwbp-tests/index.xhtml) +regarding text encoding. Tests #1-8 pass, test #9 currently fails. + +## Credits Icon is from: https://www.iconfinder.com/icons/763339/draw_edit_editor_pen_pencil_tool_write_icon#size=128 diff --git a/http-response/background.js b/http-response/background.js index b7a78800..2693e02b 100755 --- a/http-response/background.js +++ b/http-response/background.js @@ -1,22 +1,153 @@ function listener(details) { + // The received data is a stream of bytes. 
In order to do text-based + // modifications, it is necessary to decode the bytes into a string + // using the proper character encoding, do any modifications, then + // encode back into a stream of bytes. + // Historically, detecting character encoding has been a tricky task + // taken on by the browser. Here, a simplified approach is taken + // and the complexity is hidden in a helper method. + let decoder, encoder; + [decoder, encoder] = detectCharsetAndSetupDecoderEncoder(details); let filter = browser.webRequest.filterResponseData(details.requestId); - let decoder = new TextDecoder("utf-8"); - let encoder = new TextEncoder(); + let fullStr = ''; + + filter.ondata = e => { + // Note that the event's data may break in the middle of an encoded + // character - the stream parameter is critical for success as this + // method gets called multiple times. + let str = decoder.decode(e.data, {stream: true}); + fullStr += str; + } + + filter.onstop = async e => { + // Just change any instance of Example or Test in the HTTP response + // to WebExtension Example or WebExtension Test. + let mutatedStr = fullStr.replace(/Example/g, 'WebExtension Example'); + mutatedStr = mutatedStr.replace(/Test/g, 'WebExtension Test'); + filter.write(encoder.encode(mutatedStr)); + filter.close(); + } - filter.ondata = event => { - let str = decoder.decode(event.data, {stream: true}); - // Just change any instance of Example in the HTTP response - // to WebExtension Example. 
- str = str.replace(/Example/g, 'WebExtension Example'); - filter.write(encoder.encode(str)); - filter.disconnect(); + filter.onerror = e => { + try { + filter.close(); + console.log('Filter error: '+e+', '+ex); + } catch(ex) { + console.log('Filter error while closing: '+e+', '+ex); + } } - return {}; + // Because details response headers have been mutated, return it + return details; } -browser.webRequest.onBeforeRequest.addListener( +browser.webRequest.onHeadersReceived.addListener( listener, - {urls: ["https://example.com/*"], types: ["main_frame"]}, - ["blocking"] + { + urls: ["https://example.com/*", "https://www.w3.org/*"], // Include W3 for testing charset detection. + types: ["main_frame"] + }, + ["blocking","responseHeaders"] ); + +// This helper method does a few things regarding character encoding: +// 1) Detects the charset for the TextDecoder so that bytes are properly turned into strings +// 2) Ensures the output Content-Type is UTF-8 because that is what TextEncoder supports +// 3) Returns the decoder/encoder pair +function detectCharsetAndSetupDecoderEncoder(details) { + let contentType = ''; + let headerIndex = -1; + for(let i=0; i Date: Wed, 4 Nov 2020 23:11:07 -0600 Subject: [PATCH 02/10] Small typos --- http-response/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/http-response/README.md b/http-response/README.md index 25d3f453..f5b040ca 100755 --- a/http-response/README.md +++ b/http-response/README.md @@ -3,12 +3,12 @@ ## What it does Listens to HTTP Responses from example.com and w3.org and changes "Example" to "WebExtension Example" and -"Test" to "WebExtension Test" in the web pages contents. +"Test" to "WebExtension Test" in the web pages' contents. 
## What it shows A real-world example of WebRequest that shows three important details not always found in beginning examples: - - The accumulation of data through multiple calls to .ondata + - The accumulation of data through multiple calls to `.ondata` - The decoding of binary data to text in a streaming fashion. - Text decoding that tries to respect the page's reported encoding via Content-Type. From 334cb109a5788963b1d2c8ebd01f594fa9a93872 Mon Sep 17 00:00:00 2001 From: Jesse Trana Date: Wed, 4 Nov 2020 23:40:34 -0600 Subject: [PATCH 03/10] Ensure TextDecoder flushes at the end --- http-response/background.js | 1 + 1 file changed, 1 insertion(+) diff --git a/http-response/background.js b/http-response/background.js index 2693e02b..ecf29396 100755 --- a/http-response/background.js +++ b/http-response/background.js @@ -20,6 +20,7 @@ function listener(details) { } filter.onstop = async e => { + fullStr += decoder.decode(); //Flush the buffer // Just change any instance of Example or Test in the HTTP response // to WebExtension Example or WebExtension Test. 
let mutatedStr = fullStr.replace(/Example/g, 'WebExtension Example'); From 8f5f7001b982668978c9d04388c9159b02c8a5d7 Mon Sep 17 00:00:00 2001 From: Jesse Trana Date: Thu, 5 Nov 2020 08:34:50 -0600 Subject: [PATCH 04/10] Travis is complaining about the escaping of quote being useless --- http-response/background.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/http-response/background.js b/http-response/background.js index ecf29396..96c0fa1a 100755 --- a/http-response/background.js +++ b/http-response/background.js @@ -149,6 +149,6 @@ function detectCharset(contentType) { return undefined; } let charsetMaybeQuoted = contentType.substr(foundIndex+charsetMarker.length).trim(); - let charset = charsetMaybeQuoted.replace(/\"/g, ''); + let charset = charsetMaybeQuoted.replace(/"/g, ''); return charset; } \ No newline at end of file From dea141bf2e34ea50119ac00dae3ad28eecb4f5f4 Mon Sep 17 00:00:00 2001 From: Jesse Trana Date: Wed, 11 Nov 2020 21:08:31 -0600 Subject: [PATCH 05/10] Removing Example/example.com --- http-response/README.md | 3 +-- http-response/background.js | 8 +++----- http-response/manifest.json | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/http-response/README.md b/http-response/README.md index f5b040ca..c8098932 100755 --- a/http-response/README.md +++ b/http-response/README.md @@ -2,8 +2,7 @@ ## What it does -Listens to HTTP Responses from example.com and w3.org and changes "Example" to "WebExtension Example" and -"Test" to "WebExtension Test" in the web pages' contents. +Listens to HTTP Responses from example.com and w3.org and changes "Test" to "WebExtension Test" in the web pages' contents. 
## What it shows diff --git a/http-response/background.js b/http-response/background.js index 96c0fa1a..193a7cc8 100755 --- a/http-response/background.js +++ b/http-response/background.js @@ -21,10 +21,8 @@ function listener(details) { filter.onstop = async e => { fullStr += decoder.decode(); //Flush the buffer - // Just change any instance of Example or Test in the HTTP response - // to WebExtension Example or WebExtension Test. - let mutatedStr = fullStr.replace(/Example/g, 'WebExtension Example'); - mutatedStr = mutatedStr.replace(/Test/g, 'WebExtension Test'); + // Just change any instance of Test in the HTTP response to WebExtension Test. + let mutatedStr = mutatedStr.replace(/Test/g, 'WebExtension Test'); filter.write(encoder.encode(mutatedStr)); filter.close(); } @@ -45,7 +43,7 @@ function listener(details) { browser.webRequest.onHeadersReceived.addListener( listener, { - urls: ["https://example.com/*", "https://www.w3.org/*"], // Include W3 for testing charset detection. + urls: ["https://www.w3.org/*"], // Include W3 for testing charset detection. 
types: ["main_frame"] }, ["blocking","responseHeaders"] diff --git a/http-response/manifest.json b/http-response/manifest.json index bac4038b..4bc6ba5e 100755 --- a/http-response/manifest.json +++ b/http-response/manifest.json @@ -10,7 +10,7 @@ }, "permissions": [ - "webRequest", "webRequestBlocking", "https://example.com/*", "https://www.w3.org/*" + "webRequest", "webRequestBlocking", "https://www.w3.org/*" ], "background": { From 74be63d97d9d51a0b73bf20fcafdbb8e0ac41369 Mon Sep 17 00:00:00 2001 From: Jesse Trana Date: Wed, 11 Nov 2020 21:10:19 -0600 Subject: [PATCH 06/10] Removing unhelpful error handling --- http-response/background.js | 9 --------- 1 file changed, 9 deletions(-) diff --git a/http-response/background.js b/http-response/background.js index 193a7cc8..4e4b21a7 100755 --- a/http-response/background.js +++ b/http-response/background.js @@ -27,15 +27,6 @@ function listener(details) { filter.close(); } - filter.onerror = e => { - try { - filter.close(); - console.log('Filter error: '+e+', '+ex); - } catch(ex) { - console.log('Filter error while closing: '+e+', '+ex); - } - } - // Because details response headers have been mutated, return it return details; } From 5b58552b61f47970ed7adabafc0274224380941b Mon Sep 17 00:00:00 2001 From: Jesse Trana Date: Wed, 11 Nov 2020 22:34:53 -0600 Subject: [PATCH 07/10] HTTP response code check and fixing mutatedStr regression --- http-response/background.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/http-response/background.js b/http-response/background.js index 4e4b21a7..b7367bc3 100755 --- a/http-response/background.js +++ b/http-response/background.js @@ -1,4 +1,8 @@ function listener(details) { + // If the HTTP response code is not OK, just let it flow through normally. + if (details.statusCode < 200 || 300 <= details.statusCode) { + return details; + } // The received data is a stream of bytes. 
In order to do text-based // modifications, it is necessary to decode the bytes into a string // using the proper character encoding, do any modifications, then @@ -22,7 +26,7 @@ function listener(details) { filter.onstop = async e => { fullStr += decoder.decode(); //Flush the buffer // Just change any instance of Test in the HTTP response to WebExtension Test. - let mutatedStr = mutatedStr.replace(/Test/g, 'WebExtension Test'); + let mutatedStr = fullStr.replace(/Test/g, 'WebExtension Test'); filter.write(encoder.encode(mutatedStr)); filter.close(); } From 6e374f5e34b7a753e2a6fe061cc78d49a132eb44 Mon Sep 17 00:00:00 2001 From: Jesse Trana Date: Thu, 12 Nov 2020 19:48:47 -0600 Subject: [PATCH 08/10] Trying to clean up charset detection code --- http-response/background.js | 178 ++++++++++++++---------------------- 1 file changed, 68 insertions(+), 110 deletions(-) diff --git a/http-response/background.js b/http-response/background.js index b7367bc3..e2b1d4e4 100755 --- a/http-response/background.js +++ b/http-response/background.js @@ -1,17 +1,64 @@ function listener(details) { // If the HTTP response code is not OK, just let it flow through normally. if (details.statusCode < 200 || 300 <= details.statusCode) { - return details; + console.log('HTTP Status Code was '+details.statusCode+' not 2XX for '+details.url+', skipping filtering.'); + return; } + // The received data is a stream of bytes. In order to do text-based // modifications, it is necessary to decode the bytes into a string // using the proper character encoding, do any modifications, then // encode back into a stream of bytes. - // Historically, detecting character encoding has been a tricky task - // taken on by the browser. Here, a simplified approach is taken - // and the complexity is hidden in a helper method. - let decoder, encoder; - [decoder, encoder] = detectCharsetAndSetupDecoderEncoder(details); + // + // In order to use the correct decoding, one needs to detect the charset. 
+ // Please note that there are many complex rules to detect the charset, + // and no approach with scanning only the response headers will be + // fully accurate. The simplified approach here is to find the + // Content-Type and extract the charset if found. + + let {responseHeaders} = details; + + // Find the last Content-Type header. + let contentTypeHeader = responseHeaders + .slice().reverse() + .find(h => h.name.toLowerCase() == "content-type"); + + // If Content-Type header is not set, the browser is going to do content-sniffing, + // and we should also return to avoid trouble (e.g. breaking downloads, PDFs, videos, ...). + if (contentTypeHeader === undefined) { + console.log('Content-Type header not found for '+details.url+', skipping filtering'); + return; + } + + // If it is not a supported content type, we will return rather than guess. + let baseType; + let contentType = contentTypeHeader.value.trim(); + if(contentType.startsWith('text/html')) { + baseType = 'text/html'; + } else if (contentType.startsWith('application/xhtml+xml')) { + baseType = 'application/xhtml+xml'; + } else { + console.log('Content type '+contentType+' not supported for '+details.url+', skipping filtering.'); + return; + } + + // Set up TextDecoder + console.log('Initial checks passed, beginning charset detection for '+details.url); + let charset = detectCharset(contentType) || 'utf-8'; + let decoder = new TextDecoder(charset); + console.log('The detected charset was '+charset+' for '+details.url); + + // While TextDecoder supports most charset encodings, TextEncoder does NOT support + // other than 'utf-8', so it is necessary to change the Content-Type on the header + // to UTF-8. If modifying this block of code, ensure that the tests at + // https://www.w3.org/2006/11/mwbp-tests/index.xhtml + // pass - current implementation only fails on #9 but this detection ensures + // tests #3,4,5, and 8 pass. 
+ let encoder = new TextEncoder(); + contentTypeHeader.value = baseType+';charset=utf-8'; + + + // Now the actual filtering can begin! let filter = browser.webRequest.filterResponseData(details.requestId); let fullStr = ''; @@ -31,10 +78,23 @@ function listener(details) { filter.close(); } - // Because details response headers have been mutated, return it + // Because details response headers have been mutated, return them return details; } +// This code tries to snag the last charset indicated +// but is still not robust to poorly formed inputs. +function detectCharset(contentType) { + let charsetMarker = "charset="; + let foundIndex = contentType.lastIndexOf(charsetMarker); + if (foundIndex == -1) { + return undefined; + } + let charsetMaybeQuoted = contentType.substr(foundIndex+charsetMarker.length).trim().toLowerCase(); + let charset = charsetMaybeQuoted.replace(/"/g, ''); + return charset; +} + browser.webRequest.onHeadersReceived.addListener( listener, { @@ -42,106 +102,4 @@ browser.webRequest.onHeadersReceived.addListener( types: ["main_frame"] }, ["blocking","responseHeaders"] -); - -// This helper method does a few things regarding character encoding: -// 1) Detects the charset for the TextDecoder so that bytes are properly turned into strings -// 2) Ensures the output Content-Type is UTF-8 because that is what TextEncoder supports -// 3) Returns the decoder/encoder pair -function detectCharsetAndSetupDecoderEncoder(details) { - let contentType = ''; - let headerIndex = -1; - for(let i=0; i Date: Sun, 15 Nov 2020 16:05:27 -0600 Subject: [PATCH 09/10] Streaming replacement improvements and a bit of cleanup --- http-response/README.md | 13 +++++++++---- http-response/background.js | 31 ++++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/http-response/README.md b/http-response/README.md index c8098932..be4252bc 100755 --- a/http-response/README.md +++ b/http-response/README.md @@ -2,16 +2,21 @@ ## What it does 
-Listens to HTTP Responses from example.com and w3.org and changes "Test" to "WebExtension Test" in the web pages' contents. +Listens to HTTP Responses from w3.org and changes "Test" to "WebExtension Check" in the web pages' contents. ## What it shows -A real-world example of WebRequest that shows three important details not always found in beginning examples: - - The accumulation of data through multiple calls to `.ondata` +A real-world example of WebRequest that shows four important details not always found in beginning examples: + - The accumulation of data through multiple calls to `.ondata`. - The decoding of binary data to text in a streaming fashion. - Text decoding that tries to respect the page's reported encoding via Content-Type. + - The encoding of replaced data back to `filter.write` in a streaming fashion. -The domain w3.org is included in the list of domains to allow for testing against [this suite of standardized tests](https://www.w3.org/2006/11/mwbp-tests/index.xhtml) + Note that both correctly detecting the character encoding and performing streaming replacements are deeper subjects + than can be fully covered in a small example but that this code provides a starting point for solving these problems + in your own solution. + +The domain w3.org is included in the list of domains to allow testing against [this suite of standardized tests](https://www.w3.org/2006/11/mwbp-tests/index.xhtml) regarding text encoding. Tests #1-8 pass, test #9 currently fails. ## Credits diff --git a/http-response/background.js b/http-response/background.js index e2b1d4e4..87a3ce89 100755 --- a/http-response/background.js +++ b/http-response/background.js @@ -53,28 +53,40 @@ function listener(details) { // to UTF-8. If modifying this block of code, ensure that the tests at // https://www.w3.org/2006/11/mwbp-tests/index.xhtml // pass - current implementation only fails on #9 but this detection ensures - // tests #3,4,5, and 8 pass. + // tests #3, 4, 5, and 8 pass. 
let encoder = new TextEncoder(); contentTypeHeader.value = baseType+';charset=utf-8'; // Now the actual filtering can begin! let filter = browser.webRequest.filterResponseData(details.requestId); - let fullStr = ''; + let unprocessedStr = ''; + let searchString = 'Test'; + let leaveUnprocessedLength = searchString.length - 1; filter.ondata = e => { // Note that the event's data may break in the middle of an encoded // character - the stream parameter is critical for success as this // method gets called multiple times. - let str = decoder.decode(e.data, {stream: true}); - fullStr += str; + unprocessedStr += decoder.decode(e.data, {stream: true}); + // Process the received data as far as possible. + // Note this replacement is rather naive but demonstrates the idea + // If the search string was contained in the replacement string, + // for instance, the repeated replacement like this could be bad. + unprocessedStr = unprocessedStr.replace(/Test/g, 'WebExtension Check'); + if(unprocessedStr.length > leaveUnprocessedLength) { + let processedStr = unprocessedStr.substr(0, leaveUnprocessedLength); + unprocessedStr = unprocessedStr.substr(leaveUnprocessedLength); + filter.write(encoder.encode(processedStr)); + } } - filter.onstop = async e => { - fullStr += decoder.decode(); //Flush the buffer - // Just change any instance of Test in the HTTP response to WebExtension Test. 
- let mutatedStr = fullStr.replace(/Test/g, 'WebExtension Test'); - filter.write(encoder.encode(mutatedStr)); + filter.onstop = async _ => { + // Flush the decoding buffer + unprocessedStr += decoder.decode(); + // Flush our replacement buffer + let processedStr = unprocessedStr.replace(/Test/g, 'WebExtension Check'); + filter.write(encoder.encode(processedStr)); filter.close(); } @@ -95,6 +107,7 @@ function detectCharset(contentType) { return charset; } +// Set up the actual webRequest hook browser.webRequest.onHeadersReceived.addListener( listener, { From 6a5b6da73c81742b110f9d6abff0c8546b1af9b4 Mon Sep 17 00:00:00 2001 From: Jesse Trana Date: Sun, 15 Nov 2020 16:16:36 -0600 Subject: [PATCH 10/10] Adding Rob W's example as a link in the README --- http-response/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/http-response/README.md b/http-response/README.md index be4252bc..f3c76764 100755 --- a/http-response/README.md +++ b/http-response/README.md @@ -19,6 +19,9 @@ A real-world example of WebRequest that shows four important details not always The domain w3.org is included in the list of domains to allow testing against [this suite of standardized tests](https://www.w3.org/2006/11/mwbp-tests/index.xhtml) regarding text encoding. Tests #1-8 pass, test #9 currently fails. +For inspiration about how to make the charset detection more robust, see: +https://github.com/Rob--W/open-in-browser/commit/a6b926ea9522b35298632e5e6a2c89ddb456c5d9 + ## Credits Icon is from: https://www.iconfinder.com/icons/763339/draw_edit_editor_pen_pencil_tool_write_icon#size=128