Skip to content

Commit

Permalink
Fixed IIIF v3 manifest parsing to request maximum image dimensions.
Browse files Browse the repository at this point in the history
  • Loading branch information
mdellabitta committed Nov 29, 2024
1 parent da8f695 commit 35469ab
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 24 deletions.
91 changes: 78 additions & 13 deletions ingest_wikimedia/metadata.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import json
import logging
import re
from operator import itemgetter
from urllib.parse import urlparse

import validators

from .common import null_safe, get_str, get_list, get_dict
from .s3 import write_iiif_manifest
from .tracker import Tracker, Result
from .web import get_http_session, HTTP_REQUEST_HEADERS


Expand Down Expand Up @@ -71,10 +73,6 @@ def is_wiki_eligible(item_metadata: dict, provider: dict, data_provider: dict) -

id_ok = True

logging.info(
f"Rights: {rights_category_ok}, Asset: {asset_ok}, Provider: {provider_ok}, ID: {id_ok}"
)

return rights_category_ok and asset_ok and provider_ok and id_ok


Expand Down Expand Up @@ -134,6 +132,7 @@ def extract_urls(
return get_iiif_urls(manifest)

else:
Tracker().increment(Result.NO_MEDIA)
raise NotImplementedError(
f"No {MEDIA_MASTER_FIELD_NAME} or {IIIF_MANIFEST_FIELD_NAME}"
)
Expand All @@ -152,6 +151,7 @@ def iiif_v2_urls(iiif: dict) -> list[str]:
for image in get_list(canvas, IIIF_IMAGES):
resource = get_dict(image, IIIF_RESOURCE)
url = get_str(resource, JSON_LD_AT_ID)
# todo do these always max the resolution?
if url:
urls.append(url)
return urls
Expand All @@ -160,28 +160,68 @@ def iiif_v2_urls(iiif: dict) -> list[str]:
def iiif_v3_urls(iiif: dict) -> list[str]:
    """
    Extracts image URLs from a v3 IIIF manifest and returns them as a list.

    Servers specify URLs in multiple ways; maximize_iiif_v3_url normalizes
    each one to request the maximum resolution.
    Returns [] when the manifest structure cannot be parsed.
    """
    urls = []
    for item in get_list(iiif, IIIF_ITEMS):
        try:
            url = get_str(
                get_dict(item[IIIF_ITEMS][0][IIIF_ITEMS][0], IIIF_BODY), IIIF_ID
            )
            # IIIF v3 presumes the client supplies the resolution in the URL,
            # so rewrite whatever the manifest gave us to ask for max size.
            new_url = maximize_iiif_v3_url(url) if url else ""
            # Always append something, even an empty string: the placeholder
            # preserves page order so a URL we failed to work out can be
            # fixed later without reshuffling the sequence.
            urls.append(new_url)

        except (IndexError, TypeError, KeyError) as e:
            # Lazy %-formatting: passing the exception as a bare extra
            # positional argument (the previous form) is invalid logging
            # usage because the message has no placeholder for it.
            logging.warning("Unable to parse IIIF manifest: %s", e)
            Tracker().increment(Result.BAD_IIIF_MANIFEST)
            return []
    return urls


def maximize_iiif_v3_url(url: str) -> str:
    """
    Rewrites a IIIF Image API URL to request the maximum resolution.

    Handles both full Image API URLs
    ({scheme}://{server}[/{prefix}]/{identifier}/{region}/{size}/{rotation}/{quality}.{format})
    and URLs that stop at the identifier, each with or without a prefix path
    segment. Returns an empty string (and counts BAD_IMAGE_API_V3) when the
    URL matches no known shape.
    """
    # First try the shapes that include a prefix path segment.
    m = None
    if match := FULL_IMAGE_API_URL_REGEX.match(url):
        m = match.groupdict()
    elif match := IMAGE_API_UP_THROUGH_IDENTIFIER_REGEX.match(url):
        m = match.groupdict()

    if m is not None:
        # lstrip guards against a prefix captured with its leading slash,
        # which previously produced "server//prefix" URLs.
        prefix = m["prefix"].lstrip("/")
        return (
            f"{m['scheme']}://{m['server']}/{prefix}/{m['identifier']}"
            "/full/max/0/default.jpg"
        )

    # Fall back to the prefix-less shapes. (The previous version reused the
    # prefixed identifier regex here, leaving this branch unreachable, and
    # unpacked four names from a three-key itemgetter, which would have
    # raised ValueError had it ever run.)
    if match := FULL_IMAGE_API_URL_REGEX_NO_PREFIX.match(url):
        m = match.groupdict()
    elif match := IMAGE_API_UP_THROUGH_IDENTIFIER_REGEX_NO_PREFIX.match(url):
        m = match.groupdict()

    if m is not None:
        return (
            f"{m['scheme']}://{m['server']}/{m['identifier']}"
            "/full/max/0/default.jpg"
        )

    Tracker().increment(Result.BAD_IMAGE_API_V3)
    return ""  # we give up


def get_iiif_urls(manifest: dict) -> list[str]:
"""
Extracts image URLs from IIIF manifest and returns them as a list
Expand Down Expand Up @@ -247,6 +287,31 @@ def contentdm_iiif_url(is_shown_at: str) -> str | None:
)


# IIIF Image API URL shapes. Named groups let maximize_iiif_v3_url rebuild
# the URL with a "/full/max/0/default.jpg" (maximum resolution) request.

# {scheme}://{server}/{prefix}/{identifier}
# NOTE: the leading slash is kept OUTSIDE the prefix group so reassembled
# URLs don't end up with a doubled slash after the server name.
IMAGE_API_UP_THROUGH_IDENTIFIER_REGEX = re.compile(
    r"^(?P<scheme>http|https)://(?P<server>[^/]+)/(?P<prefix>[^/]+)/"
    r"(?P<identifier>[^/]+)/?$"
)

# {scheme}://{server}/{identifier}
IMAGE_API_UP_THROUGH_IDENTIFIER_REGEX_NO_PREFIX = re.compile(
    r"^(?P<scheme>http|https)://(?P<server>[^/]+)/(?P<identifier>[^/]+)/?$"
)


# {scheme}://{server}/{prefix}/{identifier}/{region}/{size}/{rotation}/{quality}.{format}
FULL_IMAGE_API_URL_REGEX = re.compile(
    r"^(?P<scheme>http|https)://(?P<server>[^/]+)/(?P<prefix>[^/]+)/"
    r"(?P<identifier>[^/]+)/(?P<region>[^/]+)/(?P<size>[^/]+)/"
    r"(?P<rotation>[^/]+)/(?P<quality>[^.]+)\.(?P<format>.*)$"
)

# Same shape but without a prefix path segment. (The previous definition was
# a copy of the prefixed pattern, so genuinely prefix-less URLs never matched.)
FULL_IMAGE_API_URL_REGEX_NO_PREFIX = re.compile(
    r"^(?P<scheme>http|https)://(?P<server>[^/]+)/"
    r"(?P<identifier>[^/]+)/(?P<region>[^/]+)/(?P<size>[^/]+)/"
    r"(?P<rotation>[^/]+)/(?P<quality>[^.]+)\.(?P<format>.*)$"
)


DPLA_API_URL_BASE = "https://api.dp.la/v2/items/"
DPLA_API_DOCS = "docs"
INSTITUTIONS_URL = (
Expand Down Expand Up @@ -284,7 +349,7 @@ def contentdm_iiif_url(is_shown_at: str) -> str | None:
IIIF_IMAGES = "images"
IIIF_CANVASES = "canvases"
IIIF_SEQUENCES = "sequences"
IIIF_FULL_RES_JPG_SUFFIX = "/full/full/0/default.jpg"
IIIF_FULL_RES_JPG_SUFFIX = "/full/max/0/default.jpg"
IIIF_PRESENTATION_API_MANIFEST_V2 = "http://iiif.io/api/presentation/2/context.json"
IIIF_PRESENTATION_API_MANIFEST_V3 = "http://iiif.io/api/presentation/3/context.json"
CONTENTDM_IIIF_MANIFEST_JSON = "/manifest.json"
Expand Down
16 changes: 13 additions & 3 deletions ingest_wikimedia/tracker.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
from enum import Enum
from enum import Enum, auto
from threading import Lock

Result = Enum("Result", ["DOWNLOADED", "FAILED", "SKIPPED", "UPLOADED", "BYTES"])

class Result(Enum):
    """Outcome categories tallied by the Tracker singleton."""

    DOWNLOADED = 1
    FAILED = 2
    SKIPPED = 3
    UPLOADED = 4
    BYTES = 5
    BAD_IIIF_MANIFEST = 6
    NO_MEDIA = 7
    BAD_IMAGE_API_V3 = 8


class SingletonBase:
Expand Down Expand Up @@ -32,5 +41,6 @@ def __str__(self) -> str:
result = "COUNTS:\n"
for key in self.data:
value = self.data[key]
result += f"{key.name}: {value}\n"
if value > 0:
result += f"{key.name}: {value}\n"
return result
18 changes: 16 additions & 2 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,25 @@ def test_iiif_v2_urls():
def test_iiif_v3_urls():
    """A v3 manifest body URL is rewritten to the max-resolution form."""
    source = (
        "https://iiif.oregondigital.org/iiif/"
        "f0%2Fdf%2F72%2Fhj%2F15%2Ft-jp2.jp2/full/640,/0/default.jpg"
    )
    expected = (
        "https://iiif.oregondigital.org/iiif/"
        "f0%2Fdf%2F72%2Fhj%2F15%2Ft-jp2.jp2/full/max/0/default.jpg"
    )
    canvas = {"items": [{"items": [{"body": {"id": source}}]}]}
    manifest = {"items": [canvas]}

    assert iiif_v3_urls(manifest) == [expected]


def test_get_iiif_urls():
Expand Down
5 changes: 2 additions & 3 deletions tests/test_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def increment_tracker():
def test_str_representation(tracker):
    """Only counters that were incremented appear in the string form."""
    for result, count in ((Result.FAILED, 2), (Result.SKIPPED, 3)):
        tracker.increment(result, count)

    assert str(tracker) == "COUNTS:\nFAILED: 2\nSKIPPED: 3\n"
14 changes: 11 additions & 3 deletions tools/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,14 @@ def process_item(
tracker = Tracker()
try:
item_metadata = get_item_metadata(dpla_id, api_key)

if not item_metadata:
logging.info(f"{dpla_id} was not found in the DPLA API.")
tracker.increment(Result.SKIPPED)
return

write_item_metadata(partner, dpla_id, json.dumps(item_metadata))

provider, data_provider = get_provider_and_data_provider(
item_metadata, providers_json
)
Expand All @@ -204,9 +211,9 @@ def process_item(
media_urls = get_iiif_urls(manifest)

else:
raise NotImplementedError(
f"No {MEDIA_MASTER_FIELD_NAME} or {IIIF_MANIFEST_FIELD_NAME}"
)
# not sure how we got here
tracker.increment(Result.SKIPPED)
return

write_file_list(partner, dpla_id, media_urls)

Expand Down Expand Up @@ -283,6 +290,7 @@ def main(
providers_json,
api_key,
)
exit()

finally:
logging.info("\n" + str(tracker))
Expand Down

0 comments on commit 35469ab

Please sign in to comment.