diff --git a/http-response/README.md b/http-response/README.md
index 40c23380..f3c76764 100755
--- a/http-response/README.md
+++ b/http-response/README.md
@@ -2,10 +2,26 @@
 
 ## What it does
 
-Listens to HTTP Responses from example.com and changes the body of the response as it comes through. So that the word "Example" on https://example.com becomes "WebExtension Example".
+Listens to HTTP Responses from w3.org and changes "Test" to "WebExtension Check" in the web pages' contents.
 
 ## What it shows
 
-How to use the response parser on bytes.
+A real-world example of WebRequest that shows four important details not always found in beginning examples:
+  - The accumulation of data through multiple calls to `.ondata`.
+  - The decoding of binary data to text in a streaming fashion.
+  - Text decoding that tries to respect the page's reported encoding via Content-Type.
+  - The encoding of replaced data back to `filter.write` in a streaming fashion.
+
+  Note that both correctly detecting the character encoding and performing streaming replacements are deeper subjects
+  than can be fully covered in a small example but that this code provides a starting point for solving these problems
+  in your own solution.
+
+The domain w3.org is included in the list of domains to allow testing against [this suite of standardized tests](https://www.w3.org/2006/11/mwbp-tests/index.xhtml)
+regarding text encoding. Tests #1-8 pass, test #9 currently fails.
+
+For inspiration about how to make the charset detection more robust, see:
+https://github.com/Rob--W/open-in-browser/commit/a6b926ea9522b35298632e5e6a2c89ddb456c5d9
+
+## Credits
 
 Icon is from: https://www.iconfinder.com/icons/763339/draw_edit_editor_pen_pencil_tool_write_icon#size=128
diff --git a/http-response/background.js b/http-response/background.js
index b7a78800..87a3ce89 100755
--- a/http-response/background.js
+++ b/http-response/background.js
@@ -1,22 +1,140 @@
 function listener(details) {
+  // If the HTTP response code is not OK, just let it flow through normally.
+  if (details.statusCode < 200 || 300 <= details.statusCode) {
+    console.log('HTTP Status Code was '+details.statusCode+' not 2XX for '+details.url+', skipping filtering.');
+    return;
+  }
+
+  // The received data is a stream of bytes. In order to do text-based
+  // modifications, it is necessary to decode the bytes into a string
+  // using the proper character encoding, do any modifications, then
+  // encode back into a stream of bytes.
+  //
+  // In order to use the correct decoding, one needs to detect the charset.
+  // Please note that there are many complex rules to detect the charset,
+  // and no approach with scanning only the response headers will be
+  // fully accurate. The simplified approach here is to find the
+  // Content-Type and extract the charset if found.
+
+  let {responseHeaders} = details;
+
+  // Find the last Content-Type header.
+  let contentTypeHeader = responseHeaders
+    .slice().reverse()
+    .find(h => h.name.toLowerCase() === "content-type");
+
+  // If Content-Type header is not set, the browser is going to do content-sniffing,
+  // and we should also return to avoid trouble (e.g. breaking downloads, PDFs, videos, ...).
+  if (contentTypeHeader === undefined) {
+    console.log('Content-Type header not found for '+details.url+', skipping filtering');
+    return;
+  }
+
+  // If it is not a supported content type, we will return rather than guess.
+  // Media type names are case-insensitive, so compare a lowercased copy.
+  let baseType;
+  let contentType = contentTypeHeader.value.trim();
+  let normalizedType = contentType.toLowerCase();
+  if (normalizedType.startsWith('text/html')) {
+    baseType = 'text/html';
+  } else if (normalizedType.startsWith('application/xhtml+xml')) {
+    baseType = 'application/xhtml+xml';
+  } else {
+    console.log('Content type '+contentType+' not supported for '+details.url+', skipping filtering.');
+    return;
+  }
+
+  // Set up TextDecoder. The constructor throws a RangeError for a label it
+  // does not recognise; in that case leave the response untouched.
+  console.log('Initial checks passed, beginning charset detection for '+details.url);
+  let charset = detectCharset(contentType) || 'utf-8';
+  let decoder;
+  try {
+    decoder = new TextDecoder(charset);
+  } catch (e) {
+    console.log('Unsupported charset '+charset+' for '+details.url+', skipping filtering.');
+    return;
+  }
+  console.log('The detected charset was '+charset+' for '+details.url);
+
+  // While TextDecoder supports most charset encodings, TextEncoder can only
+  // produce 'utf-8', so it is necessary to change the Content-Type on the header
+  // to UTF-8. If modifying this block of code, ensure that the tests at
+  // https://www.w3.org/2006/11/mwbp-tests/index.xhtml
+  // pass - current implementation only fails on #9 but this detection ensures
+  // tests #3, 4, 5, and 8 pass.
+  let encoder = new TextEncoder();
+  contentTypeHeader.value = baseType+';charset=utf-8';
+
+
+  // Now the actual filtering can begin!
   let filter = browser.webRequest.filterResponseData(details.requestId);
-  let decoder = new TextDecoder("utf-8");
-  let encoder = new TextEncoder();
-
-  filter.ondata = event => {
-    let str = decoder.decode(event.data, {stream: true});
-    // Just change any instance of Example in the HTTP response
-    // to WebExtension Example.
-    str = str.replace(/Example/g, 'WebExtension Example');
-    filter.write(encoder.encode(str));
-    filter.disconnect();
+  let unprocessedStr = '';
+  let searchString = 'Test';
+  let replacementString = 'WebExtension Check';
+  // A match may straddle two chunks, so the last searchString.length - 1
+  // characters are held back until more data (or the end) arrives.
+  let leaveUnprocessedLength = searchString.length - 1;
+
+  filter.ondata = e => {
+    // Note that the event's data may break in the middle of an encoded
+    // character - the stream parameter is critical for success as this
+    // method gets called multiple times.
+    unprocessedStr += decoder.decode(e.data, {stream: true});
+    // Process the received data as far as possible.
+    // Note this replacement is rather naive but demonstrates the idea
+    // If the search string was contained in the replacement string,
+    // for instance, the repeated replacement like this could be bad.
+    unprocessedStr = unprocessedStr.split(searchString).join(replacementString);
+    // Write out everything except the held-back tail. Back up one more
+    // code unit if the cut would fall inside a surrogate pair, because
+    // TextEncoder encodes a lone surrogate as U+FFFD.
+    let writeLength = unprocessedStr.length - leaveUnprocessedLength;
+    let lastUnit = unprocessedStr.charCodeAt(writeLength - 1);
+    if (0xD800 <= lastUnit && lastUnit <= 0xDBFF) {
+      writeLength--;
+    }
+    if (writeLength > 0) {
+      filter.write(encoder.encode(unprocessedStr.slice(0, writeLength)));
+      unprocessedStr = unprocessedStr.slice(writeLength);
+    }
+  }
+
+  filter.onstop = _ => {
+    // Flush the decoding buffer
+    unprocessedStr += decoder.decode();
+    // Flush our replacement buffer
+    let processedStr = unprocessedStr.split(searchString).join(replacementString);
+    filter.write(encoder.encode(processedStr));
+    filter.close();
   }
 
-  return {};
+  // The Content-Type entry was mutated in place, so hand the headers back.
+  return {responseHeaders};
+}
+
+// This code tries to snag the last charset indicated
+// but is still not robust to poorly formed inputs.
+function detectCharset(contentType) {
+  let charsetMarker = "charset=";
+  // Parameter names are case-insensitive, so search a lowercased copy.
+  let lowered = contentType.toLowerCase();
+  let foundIndex = lowered.lastIndexOf(charsetMarker);
+  if (foundIndex === -1) {
+    return undefined;
+  }
+  // Drop any parameters that follow the charset, then quotes and padding.
+  let charsetMaybeQuoted = lowered.slice(foundIndex+charsetMarker.length).split(';')[0];
+  let charset = charsetMaybeQuoted.replace(/"/g, '').trim();
+  return charset || undefined;
 }
 
-browser.webRequest.onBeforeRequest.addListener(
+// Set up the actual webRequest hook
+browser.webRequest.onHeadersReceived.addListener(
   listener,
-  {urls: ["https://example.com/*"], types: ["main_frame"]},
-  ["blocking"]
-);
+  {
+    urls: ["https://www.w3.org/*"], // Include W3 for testing charset detection.
+    types: ["main_frame"]
+  },
+  ["blocking","responseHeaders"]
+);
\ No newline at end of file
diff --git a/http-response/manifest.json b/http-response/manifest.json
index 3e3d75cd..4bc6ba5e 100755
--- a/http-response/manifest.json
+++ b/http-response/manifest.json
@@ -10,7 +10,7 @@
   },
 
   "permissions": [
-    "webRequest", "webRequestBlocking", "https://example.com/*"
+    "webRequest", "webRequestBlocking", "https://www.w3.org/*"
   ],
 
   "background": {