Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed IIIF v3 manifest parsing to request maximum image dimensions. #51

Merged
merged 2 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 78 additions & 13 deletions ingest_wikimedia/metadata.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import json
import logging
import re
from operator import itemgetter
from urllib.parse import urlparse

import validators

from .common import null_safe, get_str, get_list, get_dict
from .s3 import write_iiif_manifest
from .tracker import Tracker, Result
from .web import get_http_session, HTTP_REQUEST_HEADERS


Expand Down Expand Up @@ -71,10 +73,6 @@ def is_wiki_eligible(item_metadata: dict, provider: dict, data_provider: dict) -

id_ok = True

logging.info(
f"Rights: {rights_category_ok}, Asset: {asset_ok}, Provider: {provider_ok}, ID: {id_ok}"
)

return rights_category_ok and asset_ok and provider_ok and id_ok


Expand Down Expand Up @@ -134,6 +132,7 @@ def extract_urls(
return get_iiif_urls(manifest)

else:
Tracker().increment(Result.NO_MEDIA)
raise NotImplementedError(
f"No {MEDIA_MASTER_FIELD_NAME} or {IIIF_MANIFEST_FIELD_NAME}"
)
Expand All @@ -152,6 +151,7 @@ def iiif_v2_urls(iiif: dict) -> list[str]:
for image in get_list(canvas, IIIF_IMAGES):
resource = get_dict(image, IIIF_RESOURCE)
url = get_str(resource, JSON_LD_AT_ID)
# todo do these always max the resolution?
if url:
urls.append(url)
return urls
Expand All @@ -160,28 +160,68 @@ def iiif_v2_urls(iiif: dict) -> list[str]:
def iiif_v3_urls(iiif: dict) -> list[str]:
    """
    Extracts image URLs from a v3 IIIF manifest and returns them as a list.

    Servers specify urls in multiple ways: each top-level item's first
    nested item's first nested item supplies the image "body", whose "id"
    is the image URL. Each URL is rewritten to request maximum resolution.

    Returns [] (and counts a BAD_IIIF_MANIFEST) if the manifest doesn't
    have the expected nesting.
    """
    urls = []
    for item in get_list(iiif, IIIF_ITEMS):
        try:
            url = get_str(
                get_dict(item[IIIF_ITEMS][0][IIIF_ITEMS][0], IIIF_BODY), IIIF_ID
            )
            # This is a hack to get around that v3 presumes the user supplies the
            # resolution in the URL
            new_url = ""
            if url:
                new_url = maximize_iiif_v3_url(url)
            # This always adds something to the list.
            # If we didn't get a URL, it's just an empty string.
            # This prevents getting the page order wrong if we don't
            # figure out the URL one time and fix it later.
            urls.append(new_url)

        except (IndexError, TypeError, KeyError) as e:
            # Fix: the original passed `e` as a bare extra positional arg,
            # which logging treats as a %-format argument with no placeholder
            # and reports as a formatting error. Use a lazy %s instead.
            logging.warning("Unable to parse IIIF manifest: %s", e)
            Tracker().increment(Result.BAD_IIIF_MANIFEST)
            return []
    return urls


def maximize_iiif_v3_url(url: str) -> str:
    """
    Rewrites a IIIF Image API URL to request the maximum resolution.

    Accepts either a URL truncated after the identifier
    ({scheme}://{server}{/prefix}/{identifier}/) or a full Image API URL
    ({scheme}://{server}{/prefix}/{identifier}/{region}/{size}/{rotation}/
    {quality}.{format}), with or without a single path prefix, and rebuilds
    it as a ".../full/max/0/default.jpg" request.

    Returns "" (and counts a BAD_IMAGE_API_V3) when the URL doesn't look
    like an Image API URL.

    Fixes in this version: the original unpacked four names from a
    three-key itemgetter (guaranteed ValueError), reused the prefixed
    identifier regex on the no-prefix path (dead branch), and doubled the
    slash before a matched prefix. Parsing path segments with urlparse
    avoids all three.
    """
    parsed = urlparse(url)
    if parsed.scheme in ("http", "https") and parsed.netloc:
        # urlparse leaves the path percent-encoded, so encoded slashes
        # (%2F) inside identifiers survive the split. Filtering empty
        # segments also tolerates a trailing "/".
        segments = [segment for segment in parsed.path.split("/") if segment]

        # A full Image API URL has region/size/rotation/quality.format
        # after the identifier: 6 segments with a prefix, 5 without.
        if len(segments) in (5, 6):
            segments = segments[:-4]  # drop region/size/rotation/file

        # What remains should be {prefix}/{identifier} or just {identifier}.
        if len(segments) in (1, 2):
            base = "/".join(segments)
            return f"{parsed.scheme}://{parsed.netloc}/{base}/full/max/0/default.jpg"

    Tracker().increment(Result.BAD_IMAGE_API_V3)
    return ""  # we give up


def get_iiif_urls(manifest: dict) -> list[str]:
"""
Extracts image URLs from IIIF manifest and returns them as a list
Expand Down Expand Up @@ -247,6 +287,31 @@ def contentdm_iiif_url(is_shown_at: str) -> str | None:
)


# Patterns for the two URL shapes defined by the IIIF Image API:
# a URL truncated after the identifier, and a full image request.

# {scheme}://{server}{/prefix}/{identifier}/
# NOTE(review): the prefix group captures its leading "/", while
# FULL_IMAGE_API_URL_REGEX's prefix group does not — interpolating this
# prefix as f"...{server}/{prefix}/..." produces a double slash. Confirm
# callers compensate.
IMAGE_API_UP_THROUGH_IDENTIFIER_REGEX = re.compile(
    r"^(?P<scheme>http|https)://(?P<server>[^/]+)(?P<prefix>/[^/]+)/"
    r"(?P<identifier>[^/]+)/?$"
)

# Same shape without a prefix segment: {scheme}://{server}/{identifier}/
IMAGE_API_UP_THROUGH_IDENTIFIER_REGEX_NO_PREFIX = re.compile(
    r"^(?P<scheme>http|https)://(?P<server>[^/]+)/" r"(?P<identifier>[^/]+)/?$"
)


# {scheme}://{server}{/prefix}/{identifier}/{region}/{size}/{rotation}/{quality}.{format}
# NOTE(review): the "." between quality and format is unescaped, so it
# matches any character, not just a literal dot.
FULL_IMAGE_API_URL_REGEX = re.compile(
    r"^(?P<scheme>http|https)://(?P<server>[^/]+)/(?P<prefix>[^/]+)/"
    r"(?P<identifier>[^/]+)/(?P<region>[^/]+)/(?P<size>[^/]+)/"
    r"(?P<rotation>[^/]+)/(?P<quality>[^.]+).(?P<format>.*)$"
)

# NOTE(review): this pattern is byte-identical to FULL_IMAGE_API_URL_REGEX —
# it still requires a prefix segment, so it can never match a prefix-less
# URL that the prefixed regex didn't already match. The prefix group was
# presumably meant to be dropped here; fixing it must be coordinated with
# maximize_iiif_v3_url, whose no-prefix branch currently mis-unpacks the
# match groups.
FULL_IMAGE_API_URL_REGEX_NO_PREFIX = re.compile(
    r"^(?P<scheme>http|https)://(?P<server>[^/]+)/(?P<prefix>[^/]+)/"
    r"(?P<identifier>[^/]+)/(?P<region>[^/]+)/(?P<size>[^/]+)/"
    r"(?P<rotation>[^/]+)/(?P<quality>[^.]+).(?P<format>.*)$"
)


DPLA_API_URL_BASE = "https://api.dp.la/v2/items/"
DPLA_API_DOCS = "docs"
INSTITUTIONS_URL = (
Expand Down Expand Up @@ -284,7 +349,7 @@ def contentdm_iiif_url(is_shown_at: str) -> str | None:
IIIF_IMAGES = "images"
IIIF_CANVASES = "canvases"
IIIF_SEQUENCES = "sequences"
IIIF_FULL_RES_JPG_SUFFIX = "/full/full/0/default.jpg"
IIIF_FULL_RES_JPG_SUFFIX = "/full/max/0/default.jpg"
IIIF_PRESENTATION_API_MANIFEST_V2 = "http://iiif.io/api/presentation/2/context.json"
IIIF_PRESENTATION_API_MANIFEST_V3 = "http://iiif.io/api/presentation/3/context.json"
CONTENTDM_IIIF_MANIFEST_JSON = "/manifest.json"
Expand Down
16 changes: 13 additions & 3 deletions ingest_wikimedia/tracker.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
from enum import Enum
from enum import Enum, auto
from threading import Lock

Result = Enum("Result", ["DOWNLOADED", "FAILED", "SKIPPED", "UPLOADED", "BYTES"])

class Result(Enum):
    """Outcome counters recorded by the Tracker during a run."""

    # Explicit values, matching exactly what auto() assigned by order.
    DOWNLOADED = 1
    FAILED = 2
    SKIPPED = 3
    UPLOADED = 4
    BYTES = 5
    BAD_IIIF_MANIFEST = 6
    NO_MEDIA = 7
    BAD_IMAGE_API_V3 = 8


class SingletonBase:
Expand Down Expand Up @@ -32,5 +41,6 @@ def __str__(self) -> str:
result = "COUNTS:\n"
for key in self.data:
value = self.data[key]
result += f"{key.name}: {value}\n"
if value > 0:
result += f"{key.name}: {value}\n"
return result
18 changes: 16 additions & 2 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,25 @@ def test_iiif_v2_urls():
def test_iiif_v3_urls():
    """A v3 manifest's image body, three 'items' levels deep, is maximized."""
    source_url = "https://iiif.oregondigital.org/iiif/f0%2Fdf%2F72%2Fhj%2F15%2Ft-jp2.jp2/full/640,/0/default.jpg"
    annotation = {"body": {"id": source_url}}
    page = {"items": [annotation]}
    canvas = {"items": [page]}
    iiif = {"items": [canvas]}

    result = iiif_v3_urls(iiif)

    assert result == [
        "https://iiif.oregondigital.org/iiif/f0%2Fdf%2F72%2Fhj%2F15%2Ft-jp2.jp2/full/max/0/default.jpg"
    ]


def test_get_iiif_urls():
Expand Down
5 changes: 2 additions & 3 deletions tests/test_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def increment_tracker():
def test_str_representation(tracker):
    """Only counters that were actually incremented appear in str(tracker)."""
    tracker.increment(Result.FAILED, 2)
    tracker.increment(Result.SKIPPED, 3)

    expected_output = "COUNTS:\nFAILED: 2\nSKIPPED: 3\n"
    assert str(tracker) == expected_output
14 changes: 11 additions & 3 deletions tools/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,14 @@ def process_item(
tracker = Tracker()
try:
item_metadata = get_item_metadata(dpla_id, api_key)

if not item_metadata:
logging.info(f"{dpla_id} was not found in the DPLA API.")
tracker.increment(Result.SKIPPED)
return

write_item_metadata(partner, dpla_id, json.dumps(item_metadata))

provider, data_provider = get_provider_and_data_provider(
item_metadata, providers_json
)
Expand All @@ -204,9 +211,9 @@ def process_item(
media_urls = get_iiif_urls(manifest)

else:
raise NotImplementedError(
f"No {MEDIA_MASTER_FIELD_NAME} or {IIIF_MANIFEST_FIELD_NAME}"
)
# not sure how we got here
tracker.increment(Result.SKIPPED)
return

write_file_list(partner, dpla_id, media_urls)

Expand Down Expand Up @@ -283,6 +290,7 @@ def main(
providers_json,
api_key,
)
exit()
mdellabitta marked this conversation as resolved.
Show resolved Hide resolved

finally:
logging.info("\n" + str(tracker))
Expand Down