-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathcaa-info.ts
45 lines (37 loc) · 1.93 KB
/
caa-info.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import pRetry from 'p-retry';
import { getItemMetadata } from '@lib/IA/archive-metadata';
import { LOGGER } from '@lib/logging/logger';
import { parseDOM, qs, qsa } from '@lib/util/dom';
import { memoize } from '@lib/util/functions';
import { request } from '@lib/util/request';
import type { FileInfo } from './image-info';
/**
 * Fetches the Internet Archive metadata of an item, retrying failed attempts.
 *
 * The fetch is wrapped in `memoize` so that a single page can reuse the
 * metadata it already requested for an item. The metadata is deliberately not
 * cached across page loads, since it might change in between.
 */
const fetchIAMetadata = memoize((itemId: string) => {
    const attemptFetch = () => getItemMetadata(itemId);

    return pRetry(attemptFetch, {
        retries: 5,
        // Only log the failure here; the retry itself is handled by `pRetry`.
        onFailedAttempt: /* istanbul ignore next: Difficult to cover */ (error) => {
            LOGGER.warn(`Failed to retrieve IA metadata: ${error.cause}. Retrying…`);
        },
    });
});
/**
 * Looks up file information for an image in an Internet Archive item.
 *
 * The image is located in the item's file list by matching file names of the
 * form `mbid-<36 hex/dash characters>-<imageId>.<extension>`.
 *
 * @param itemId - Identifier of the IA item whose metadata is fetched.
 * @param imageId - ID of the image, as it appears in the file name.
 * @returns The file's format, its size parsed as a base-10 integer, and, for
 *   files whose format ends in `PDF`, the page count (otherwise `undefined`).
 * @throws Error when no file in the item's metadata matches `imageId`.
 */
export async function getCAAInfo(itemId: string, imageId: string): Promise<FileInfo> {
    const iaManifest = await fetchIAMetadata(itemId);

    // Escape regex metacharacters so the ID is always matched literally and
    // can never alter (or break) the pattern it is interpolated into.
    const escapedImageId = imageId.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const fileNameRegex = new RegExp(`mbid-[a-f0-9-]{36}-${escapedImageId}\\.\\w+$`);
    const imageMetadata = iaManifest.files.find((fileMeta) => fileNameRegex.test(fileMeta.name));
    if (imageMetadata === undefined) {
        throw new Error(`Could not find image "${imageId}" in IA manifest`);
    }

    // Only PDFs have a page count; other formats skip the extra request.
    const pageCount = imageMetadata.format.endsWith('PDF')
        ? await tryGetPDFPageCount(itemId, imageId)
        : undefined;

    return {
        fileType: imageMetadata.format,
        // Explicit radix: the size is a decimal string and must never be
        // interpreted as hexadecimal (e.g. on a `0x` prefix).
        size: Number.parseInt(imageMetadata.size, 10),
        pageCount,
    };
}
/**
 * Derives the page count of a PDF from the listing page of the item's
 * `<itemId>-<imageId>_jp2.zip` archive on archive.org.
 *
 * @param itemId - Identifier of the IA item containing the PDF.
 * @param imageId - ID of the image (PDF) within the item.
 * @returns The number of table rows in the listing, minus the two rows that
 *   are not pages.
 */
async function tryGetPDFPageCount(itemId: string, imageId: string): Promise<number> {
    const listingUrl = `https://archive.org/download/${itemId}/${itemId}-${imageId}_jp2.zip/`;
    const { text: listingHtml } = await request.get(listingUrl);
    const listingDocument = parseDOM(listingHtml, listingUrl);

    // Look up the tbody on its own via `qs`, so that an error is thrown in
    // case it cannot be found.
    const tableBody = qs('tbody', listingDocument);
    const rows = qsa('tr', tableBody);

    // Two of the rows are not pages: the header and the directory entry.
    return rows.length - 2;
}