Skip to content

Commit

Permalink
more indexing compatibility fixes: (#77)
Browse files Browse the repository at this point in the history
- additional fixes to make SURT conversion and indexing closer to the
original Python implementation in cdxj-indexer and surt.
- sort query args by splitting on '&' instead of parsing via
URLSearchParams, so that empty query args are preserved
- SURT contains non-urlencoded query args when possible, including the POST
request body
- WARC record: parse URL to ensure a trailing slash is included in http/https
URLs
- fixes #70
  • Loading branch information
ikreymer authored Aug 28, 2024
1 parent 9ce5b44 commit 1d36144
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 29 deletions.
25 changes: 4 additions & 21 deletions src/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,26 +45,9 @@ export function getSurt(url: string) {
surt += ")";
surt += urlObj.pathname;
if (urlObj.search) {
urlObj.searchParams.sort();
surt += urlObj.search;
for (const [key, value] of urlObj.searchParams.entries()) {
if (!value) {
// if no value set, by default the surt contains 'key='
// however, for compatibility, only want to add a trailing '='
// if original URL has it.
const keyEncoded = encodeURIComponent(key);
const rx = new RegExp(`(?<=[&?])${rxEscape(key)}=(?=&|$)`);
// if original URL does *not* have trailing '=', attempt to remove it below
if (!rx.exec(urlLower)) {
// use URI encoded version to match the query arg if key is %-encoded
const rxEncoded =
key === keyEncoded
? rx
: new RegExp(`(?<=[&?])${rxEscape(keyEncoded)}=(?=&|$)`);
surt = surt.replace(rxEncoded, keyEncoded);
}
}
}
const args = urlObj.search.slice(1).split("&");
args.sort();
surt += "?" + args.join("&");
}
return surt;
} catch (_e) {
Expand Down Expand Up @@ -125,7 +108,7 @@ export function postToGetUrl(request: Request) {
}

if (query != null) {
request.url = appendRequestQuery(request.url, query, request.method);
request.url = appendRequestQuery(request.url, decodeURI(query), request.method);
request.method = "GET";
request.requestBody = query;
return true;
Expand Down
6 changes: 5 additions & 1 deletion src/lib/warcrecord.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,11 @@ export class WARCRecord extends BaseAsyncIterReader {
warcHeaders["WARC-Filename"] = filename;
}
} else if (url) {
warcHeaders["WARC-Target-URI"] = url;
try {
warcHeaders["WARC-Target-URI"] = new URL(url).href;
} catch (_e) {
warcHeaders["WARC-Target-URI"] = url;
}
}

warcHeaders["WARC-Date"] = date;
Expand Down
8 changes: 4 additions & 4 deletions test/testIndexer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,8 @@ com,example)/ 20170306040348 http://example.com/ warc/revisit 200 G7HRM7BGOKSKMS
["cdx-index", get_warc_path("data/post-test.warc.gz")],
`\
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc"}
org,httpbin)/post?__wb_method=post&a=1&b=%5B%5D&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3"}
org,httpbin)/post?__wb_method=post&data=%5E&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^"}
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3"}
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^"}
`,
);
});
Expand All @@ -167,9 +167,9 @@ org,httpbin)/post?__wb_method=post&data=%5E&foo=bar 20140610001255 {"url":"http:
await index(
["cdx-index", get_warc_path("data/post-test-more.warc")],
`\
org,httpbin)/post?__wb_method=post&another=more%5Edata&test=some+data 20200809195334 {"url":"https://httpbin.org/post","mime":"application/json","status":"200","digest":"7AWVEIPQMCA4KTCNDXWSZ465FITB7LSK","length":"688","offset":"0","filename":"post-test-more.warc","method":"POST","requestBody":"test=some+data&another=more%5Edata"}
org,httpbin)/post?__wb_method=post&another=more^data&test=some+data 20200809195334 {"url":"https://httpbin.org/post","mime":"application/json","status":"200","digest":"7AWVEIPQMCA4KTCNDXWSZ465FITB7LSK","length":"688","offset":"0","filename":"post-test-more.warc","method":"POST","requestBody":"test=some+data&another=more%5Edata"}
org,httpbin)/post?__wb_method=post&a=json-data 20200809195334 {"url":"https://httpbin.org/post","mime":"application/json","status":"200","digest":"BYOQWRSQFW3A5SNUBDSASHFLXGL4FNGB","length":"655","offset":"1227","filename":"post-test-more.warc","method":"POST","requestBody":"a=json-data"}
org,httpbin)/post?__wb_method=post&__wb_post_data=na0kc29tzq0kza0ky2h1bmstzw5jb2rlza0kna0kzgf0yq0kma0kdqo%3D 20200810055049 {"url":"https://httpbin.org/post","mime":"application/json","status":"200","digest":"34LEADQD3MOBQ42FCO2WA5TUSEL5QOKP","length":"628","offset":"2338","filename":"post-test-more.warc","method":"POST","requestBody":"__wb_post_data=NA0Kc29tZQ0KZA0KY2h1bmstZW5jb2RlZA0KNA0KZGF0YQ0KMA0KDQo="}
org,httpbin)/post?__wb_method=post&__wb_post_data=na0kc29tzq0kza0ky2h1bmstzw5jb2rlza0kna0kzgf0yq0kma0kdqo= 20200810055049 {"url":"https://httpbin.org/post","mime":"application/json","status":"200","digest":"34LEADQD3MOBQ42FCO2WA5TUSEL5QOKP","length":"628","offset":"2338","filename":"post-test-more.warc","method":"POST","requestBody":"__wb_post_data=NA0Kc29tZQ0KZA0KY2h1bmstZW5jb2RlZA0KNA0KZGF0YQ0KMA0KDQo="}
`,
);
});
Expand Down
3 changes: 2 additions & 1 deletion test/testSerializer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ text\r\n\r\n',
yield encoder.encode("text");
}

const url = "http://example.com/";
// missing trailing slash, will be added automatically
const url = "http://example.com";
const date = "2000-01-01T00:00:00Z";
const type = "response";
const warcHeaders = {
Expand Down
11 changes: 9 additions & 2 deletions test/testUtils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,19 @@ describe("utils", () => {

test("surt with %-encoded query, trailing = param", () => {
expect(getSurt("https://www.example.com/some/path?a=b&c=d&e^=&z")).toBe(
"com,example)/some/path?a=b&c=d&e%5E=&z",
"com,example)/some/path?a=b&c=d&e^=&z",
);
});

test("surt with %-encoded query, no trailing = param", () => {
expect(getSurt("https://www.example.com/some/path?a=b&c=d&e^&z")).toBe(
"com,example)/some/path?a=b&c=d&e%5E&z",
"com,example)/some/path?a=b&c=d&e^&z",
);
});

test("surt with space", () => {
expect(getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d ")).toBe(
"com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h="
);
});
});

0 comments on commit 1d36144

Please sign in to comment.