diff --git a/common.py b/common.py index f29815f..1aeec7a 100644 --- a/common.py +++ b/common.py @@ -1,65 +1,5 @@ import csv -import logging -import os -import re -import sys -import tempfile -from datetime import datetime -from enum import Enum from typing import IO -from urllib.parse import urlparse -from tqdm import tqdm - -import boto3 -import requests -import validators -from botocore.config import Config -from mypy_boto3_s3.service_resource import S3ServiceResource -from requests.adapters import HTTPAdapter -from urllib3.util import Retry -from botocore.exceptions import ClientError - -from constants import ( - AUTHORIZATION_HEADER, - CONTENT_DM_ISSHOWNAT_REGEX, - CONTENTDM_IIIF_INFO, - CONTENTDM_IIIF_MANIFEST_JSON, - DATA_PROVIDER_FIELD_NAME, - DPLA_API_DOCS, - DPLA_API_URL_BASE, - DPLA_PARTNERS, - EDM_AGENT_NAME, - HTTP_REQUEST_HEADERS, - IIIF_BODY, - IIIF_CANVASES, - IIIF_DEFAULT_JPG_SUFFIX, - IIIF_FULL_RES_JPG_SUFFIX, - IIIF_ID, - IIIF_IMAGES, - IIIF_ITEMS, - IIIF_MANIFEST_FIELD_NAME, - IIIF_PRESENTATION_API_MANIFEST_V2, - IIIF_PRESENTATION_API_MANIFEST_V3, - IIIF_RESOURCE, - IIIF_SEQUENCES, - INSTITUTIONS_FIELD_NAME, - INSTITUTIONS_URL, - JSON_LD_AT_CONTEXT, - JSON_LD_AT_ID, - LOGS_DIR_BASE, - MEDIA_MASTER_FIELD_NAME, - PROVIDER_FIELD_NAME, - RIGHTS_CATEGORY_FIELD_NAME, - S3_RETRIES, - UNLIMITED_RE_USE, - UPLOAD_FIELD_NAME, - WIKIDATA_FIELD_NAME, - S3_BUCKET, - EDM_IS_SHOWN_AT, -) - -__http_session: requests.Session | None = None -__temp_dir: tempfile.TemporaryDirectory | None = None def load_ids(ids_file: IO) -> list[str]: @@ -70,30 +10,6 @@ def load_ids(ids_file: IO) -> list[str]: return dpla_ids -def get_http_session() -> requests.Session: - global __http_session - if __http_session is not None: - return __http_session - retry_strategy = Retry( - connect=3, - read=3, - redirect=5, - status=5, - other=5, - backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], - allowed_methods=["HEAD", "GET", "OPTIONS"], - respect_retry_after_header=True, - raise_on_status=True, - raise_on_redirect=True, - ) - adapter = HTTPAdapter(max_retries=retry_strategy) - __http_session = requests.Session() - __http_session.mount("https://", adapter) - __http_session.mount("http://", adapter) - return __http_session - - def null_safe[T](data: dict, field_name: str, identity_element: T) -> T: if data is not None: return data.get(field_name, identity_element) @@ -114,319 +30,3 @@ def get_str(data: dict, field_name: str) -> str: def get_dict(data: dict, field_name: str) -> dict: """Null safe shortcut for getting a dict from a dict.""" return null_safe(data, field_name, {}) - - -def check_partner(partner: str) -> None: - if partner not in DPLA_PARTNERS: - sys.exit("Unrecognized partner.") - - -def get_item_metadata(dpla_id: str, api_key: str) -> dict: - url = DPLA_API_URL_BASE + dpla_id - headers = {AUTHORIZATION_HEADER: api_key} - response = get_http_session().get(url, headers=headers) - response_json = response.json() - return response_json.get(DPLA_API_DOCS)[0] - - -def extract_urls(item_metadata: dict) -> list[str]: - if MEDIA_MASTER_FIELD_NAME in item_metadata: - return get_list(item_metadata, MEDIA_MASTER_FIELD_NAME) - - elif IIIF_MANIFEST_FIELD_NAME in item_metadata: - return get_iiif_urls(get_str(item_metadata, IIIF_MANIFEST_FIELD_NAME)) - - else: - raise NotImplementedError( - f"No {MEDIA_MASTER_FIELD_NAME} or {IIIF_MANIFEST_FIELD_NAME}" - ) - - -def iiif_v2_urls(iiif: dict) -> list[str]: - """ - Extracts image URLs from a v2 IIIF manifest and returns them as a list - """ - urls = [] - 
sequences = get_list(iiif, IIIF_SEQUENCES) - sequence = sequences[0:1] if len(sequences) == 1 else None - canvases = get_list(sequence[0], IIIF_CANVASES) - - for canvas in canvases: - for image in get_list(canvas, IIIF_IMAGES): - resource = get_dict(image, IIIF_RESOURCE) - url = get_str(resource, JSON_LD_AT_ID) - if url: - urls.append(url) - return urls - - -def iiif_v3_urls(iiif: dict) -> list[str]: - """ - Extracts image URLs from a v3 IIIF manifest and returns them as a list - """ - urls = [] - for item in get_list(iiif, IIIF_ITEMS): - try: - url = get_str( - get_dict(item[IIIF_ITEMS][0][IIIF_ITEMS][0], IIIF_BODY), IIIF_ID - ) - # This is a hack to get around that v3 presumes the user supplies the - # resolution in the URL - if url: - # This condition may not be necessary but I'm leaving it in for now - # TODO does this end up giving us smaller resources than we want? - if url.endswith(IIIF_DEFAULT_JPG_SUFFIX): - urls.append(url) - else: - urls.append(url + IIIF_FULL_RES_JPG_SUFFIX) - except (IndexError, TypeError, KeyError) as e: - logging.warning("Unable to parse IIIF manifest.", e) - return [] - return urls - - -def get_iiif_urls(iiif_presentation_api_url: str) -> list[str]: - """ - Extracts image URLs from IIIF manifest and returns them as a list - Currently only supports IIIF v2 and v3 - """ - manifest = _get_iiif_manifest(iiif_presentation_api_url) - # v2 or v3? - if get_str(manifest, JSON_LD_AT_CONTEXT) == IIIF_PRESENTATION_API_MANIFEST_V3: - return iiif_v3_urls(manifest) - elif get_str(manifest, JSON_LD_AT_CONTEXT) == IIIF_PRESENTATION_API_MANIFEST_V2: - return iiif_v2_urls(manifest) - else: - raise Exception("Unimplemented IIIF version") - - -def _get_iiif_manifest(url: str) -> dict: - """ - :return: parsed JSON - """ - if not validators.url(url): - raise Exception(f"Invalid url {url}") - try: - request = get_http_session().get(url, headers=HTTP_REQUEST_HEADERS) - request.raise_for_status() - return request.json() - - except Exception as ex: - # todo maybe this should return None? - raise Exception(f"Error getting IIIF manifest at {url}") from ex - - -def contentdm_iiif_url(is_shown_at: str) -> str | None: - """ - Creates a IIIF presentation API manifest URL from the - link to the object in ContentDM - - We want to go from - http://www.ohiomemory.org/cdm/ref/collection/p16007coll33/id/126923 - to - http://www.ohiomemory.org/iiif/info/p16007coll33/126923/manifest.json - - """ - parsed_url = urlparse(is_shown_at) - match_result = re.match(CONTENT_DM_ISSHOWNAT_REGEX, parsed_url.path) - if not match_result: - return None - else: - return ( - parsed_url.scheme - + "://" - + parsed_url.netloc - + CONTENTDM_IIIF_INFO - + match_result.group(1) - + "/" - + match_result.group(2) - + CONTENTDM_IIIF_MANIFEST_JSON - ) - - -def get_s3_path(dpla_id: str, ordinal: int, partner: str) -> str: - return ( - f"{partner}/images/{dpla_id[0]}/{dpla_id[1]}/" - f"{dpla_id[2]}/{dpla_id[3]}/{dpla_id}/{ordinal}_{dpla_id}" - ).strip() - - -def s3_file_exists(path: str, s3: S3ServiceResource): - try: - s3.Object(S3_BUCKET, path).load() - return True - except ClientError as e: - if e.response["Error"]["Code"] == "404": - # The object does not exist. - return False - else: - # Something else has gone wrong. 
- raise - - -def setup_temp_dir() -> None: - global __temp_dir - if __temp_dir is None: - __temp_dir = tempfile.TemporaryDirectory( - "tmp", "wiki", dir=".", ignore_cleanup_errors=True, delete=False - ) - - -def cleanup_temp_dir() -> None: - global __temp_dir - if __temp_dir is not None: - __temp_dir.cleanup() - - -def get_temp_file(): - global __temp_dir - if __temp_dir is None: - raise Exception("Temp dir not initialized.") - return tempfile.NamedTemporaryFile(delete=False, dir=__temp_dir.name) - - -def clean_up_tmp_file(temp_file) -> None: - try: - if temp_file: - os.unlink(temp_file.name) - except Exception as e: - logging.warning("Temp file unlink failed.", exc_info=e) - - -def get_s3() -> S3ServiceResource: - config = Config( - signature_version="s3v4", - max_pool_connections=25, - retries={"max_attempts": S3_RETRIES}, - ) - - return boto3.resource("s3", config=config) - - -class TqdmLoggingHandler(logging.Handler): - def __init__(self, level=logging.NOTSET): - super().__init__(level) - - def emit(self, record): - try: - msg = self.format(record) - tqdm.write(msg) - self.flush() - except Exception: - self.handleError(record) - - -def setup_logging(partner: str, event_type: str, level: int = logging.INFO) -> None: - os.makedirs(LOGS_DIR_BASE, exist_ok=True) - time_str = datetime.now().strftime("%Y%m%d-%H%M%S") - log_file_name = f"{time_str}-{partner}-{event_type}.log" - filename = f"{LOGS_DIR_BASE}/{log_file_name}" - logging.basicConfig( - level=level, - datefmt="%H:%M:%S", - handlers=[ - TqdmLoggingHandler(), - logging.FileHandler(filename=filename, mode="w"), - ], - format="[%(levelname)s] " "%(asctime)s: " "%(message)s", - ) - logging.info(f"Logging to {filename}.") - for d in logging.Logger.manager.loggerDict: - if d.startswith("pywiki"): - logging.getLogger(d).setLevel(logging.ERROR) - - -Result = Enum("Result", ["DOWNLOADED", "FAILED", "SKIPPED", "UPLOADED", "BYTES"]) - - -class Tracker: - def __init__(self): - self.data = {} - - def increment(self, status: Result, amount=1) -> None: - if status not in self.data: - self.data[status] = 0 - self.data[status] = self.data[status] + amount - - def count(self, status: Result) -> int: - if status not in self.data: - return 0 - else: - return self.data[status] - - def __str__(self) -> str: - result = "COUNTS:\n" - for key in self.data: - value = self.data[key] - result += f"{key.name}: {value}\n" - return result - - -def is_wiki_eligible(item_metadata: dict, provider: dict, data_provider: dict) -> bool: - provider_ok = null_safe(provider, UPLOAD_FIELD_NAME, False) or null_safe( - data_provider, UPLOAD_FIELD_NAME, False - ) - - rights_category_ok = ( - get_str(item_metadata, RIGHTS_CATEGORY_FIELD_NAME) == UNLIMITED_RE_USE - ) - - is_shown_at = get_str(item_metadata, EDM_IS_SHOWN_AT) - media_master = len(get_list(item_metadata, MEDIA_MASTER_FIELD_NAME)) > 0 - iiif_manifest = null_safe(item_metadata, IIIF_MANIFEST_FIELD_NAME, False) - - if not iiif_manifest and not media_master: - iiif_url = contentdm_iiif_url(is_shown_at) - if iiif_url is not None: - response = get_http_session().head(iiif_url, allow_redirects=True) - if response.status_code < 400: - item_metadata[IIIF_MANIFEST_FIELD_NAME] = iiif_url - iiif_manifest = True - - asset_ok = media_master or iiif_manifest - - # todo create banlist. item based? sha based? local id based? all three? 
- # todo don't reupload if deleted - - id_ok = True - - logging.info( - f"Rights: {rights_category_ok}, Asset: {asset_ok}, Provider: {provider_ok}, ID: {id_ok}" - ) - - return rights_category_ok and asset_ok and provider_ok and id_ok - - -def get_provider_and_data_provider( - item_metadata: dict, providers_json: dict -) -> tuple[dict, dict]: - """ - Loads metadata about the provider and data provider from the providers json file. - """ - - provider_name = get_str( - get_dict(item_metadata, PROVIDER_FIELD_NAME), EDM_AGENT_NAME - ) - data_provider_name = get_str( - get_dict(item_metadata, DATA_PROVIDER_FIELD_NAME), EDM_AGENT_NAME - ) - provider = get_dict(providers_json, provider_name) - data_provider = get_dict( - get_dict(provider, INSTITUTIONS_FIELD_NAME), data_provider_name - ) - return provider, data_provider - - -def get_providers_data() -> dict: - """Loads the institutions file from ingestion3 in GitHub.""" - return get_http_session().get(INSTITUTIONS_URL).json() - - -def provider_str(provider: dict) -> str: - if provider is None: - return "Provider: None" - else: - return ( - f"Provider: {provider.get(WIKIDATA_FIELD_NAME, "")}, " - f"{provider.get(UPLOAD_FIELD_NAME, "")}" - ) diff --git a/constants.py b/constants.py deleted file mode 100644 index 49efe33..0000000 --- a/constants.py +++ /dev/null @@ -1,167 +0,0 @@ -DPLA_PARTNERS = [ - "bpl", - "georgia", - "il", - "indiana", - "nara", - "northwest-heritage", - "ohio", - "p2p", - "pa", - "texas", - "minnesota", -] - -LOGS_DIR_BASE = "./logs" - -# For temporarily storing local downloads. -TMP_DIR_BASE = "./tmp" - -# Wikimedia constants -WIKIDATA_URL_BASE = "http://www.wikidata.org/entity/" -COMMONS_URL_PREFIX = "https://commons.wikimedia.org/wiki/File:" -ERROR_FILEEXISTS = "fileexists-shared-forbidden" -ERROR_MIME = "filetype-badmime" -ERROR_BANNED = "filetype-banned" -ERROR_DUPLICATE = "duplicate" -ERROR_NOCHANGE = "no-change" -COMMONS_SITE_NAME = "commons" -WMC_UPLOAD_CHUNK_SIZE = 20_000_000 # 20 MB -VALUE_JOIN_DELIMITER = "; " -RESERVED_WIKITEXT_STRINGS = ["|", "=", "[[", "]]", "{{", "}}", "''"] - - -# This list exists mainly to exclude 'duplicate' records/images from being uploaded -# Full list of warnings: -# https://doc.wikimedia.org/pywikibot/master/_modules/pywikibot/site/_upload.html -IGNORE_WIKIMEDIA_WARNINGS = [ - # Target filename has a bad prefix {msg}. - "bad-prefix", - # Target filename is invalid. - "badfilename", - # The file is a duplicate of a deleted file {msg}. - "duplicate-archive", - # The upload is an exact duplicate of older version(s) of this file - "duplicate-version", - # File {msg} is empty. - "empty-file", - # File [Page] {msg} already exists - "exists", - # File exists with different extension as {msg}. - "exists-normalized", - # File {msg} type is unwanted type. - "filetype-unwanted-type", - # Target filename exists but with a different file {msg} - "page-exists", - # The file {msg} was previously deleted. 
- "was-deleted", - # Not ignored: - # Uploaded file is a duplicate of {msg} - # 'duplicate', - # The upload is an exact duplicate of the current version of this file - # 'no-change', -] -INVALID_CONTENT_TYPES = [ - "text/html", - "application/json", - "application/xml", - "text/plain", -] - -# API documentation: https://www.mediawiki.org/wiki/API:Allimages -FIND_BY_HASH_URL_PREFIX: str = ( - "https://commons.wikimedia.org/w/api.php?action=query&format=json" - "&list=allimages&aisha1=" -) - -FIND_BY_HASH_QUERY_FIELD_NAME = "query" -FIND_BY_HASH_ALLIMAGES_FIELD_NAME = "allimages" - -# API documentation: https://www.mediawiki.org/wiki/API:Imageinfo -FIND_BY_TITLE_URL_PREFIX: str = ( - "https://commons.wikimedia.org/w/api.php?action=query&format=json&prop=imageinfo" - "&iiprop=sha1&titles=" -) - -# rights statements -RIGHTS_STATEMENTS_URL_BASE = "http://rightsstatements.org" -CC_URL_BASE = "http://creativecommons.org" -CC_URL_REGEX = "^http://creativecommons.org/licenses/(.*)" -RS_NKC_URL_BASE = RIGHTS_STATEMENTS_URL_BASE + "/vocab/NKC/" -RS_NKC_TEMPLATE = "NKC" -RS_NOC_URL_BASE = RIGHTS_STATEMENTS_URL_BASE + "/vocab/NoC-US/" -NOC_US_TEMPLATE = "NoC-US" -CC_PD_URL_BASE = CC_URL_BASE + "/publicdomain/mark/" -PD_US_TEMPLATE = "PD-US" -CC_ZERO_URL_BASE = CC_URL_BASE + "/publicdomain/zero/" -CC_ZERO_TEMPLATE = "cc-zero" -CC_BY_URL_BASE = CC_URL_BASE + "/licenses/by/" -CC_BY_SA_URL_BASE = CC_URL_BASE + "/licenses/by-sa/" - - -# DPLA API -DPLA_API_URL_BASE = "https://api.dp.la/v2/items/" -DPLA_API_DOCS = "docs" - -# DPLA MAP field names -SOURCE_RESOURCE_FIELD_NAME = "sourceResource" -MEDIA_MASTER_FIELD_NAME = "mediaMaster" -IIIF_MANIFEST_FIELD_NAME = "iiifManifest" -PROVIDER_FIELD_NAME = "provider" -DATA_PROVIDER_FIELD_NAME = "dataProvider" -EXACT_MATCH_FIELD_NAME = "exactMatch" -EDM_AGENT_NAME = "name" -EDM_IS_SHOWN_AT = "isShownAt" -RIGHTS_CATEGORY_FIELD_NAME = "rightsCategory" -EDM_RIGHTS_FIELD_NAME = "rights" -EDM_TIMESPAN_PREF_LABEL = "prefLabel" -UNLIMITED_RE_USE = "Unlimited Re-Use" -DC_CREATOR_FIELD_NAME = "creator" -DC_DATE_FIELD_NAME = "date" -DC_DESCRIPTION_FIELD_NAME = "description" -DC_TITLE_FIELD_NAME = "title" -DC_IDENTIFIER_FIELD_NAME = "identifier" - -# Institutions file constants -INSTITUTIONS_URL = ( - "https://raw.githubusercontent.com/dpla/ingestion3" - "/refs/heads/develop/src/main/resources/wiki/institutions_v2.json" -) -UPLOAD_FIELD_NAME = "upload" -INSTITUTIONS_FIELD_NAME = "institutions" -WIKIDATA_FIELD_NAME = "Wikidata" - -# AWS constants -S3_RETRIES = 3 -S3_BUCKET = "dpla-mdpdb" # TODO change for prod -# we use sha1 because that's what commons uses for identifying files -S3_KEY_CHECKSUM = "sha1" -S3_KEY_METADATA = "Metadata" -S3_KEY_CONTENT_TYPE = "ContentType" - - -# http -HTTP_REQUEST_HEADERS = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \ - (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36" -} -AUTHORIZATION_HEADER = "Authorization" - -# IIIF -JSON_LD_AT_CONTEXT = "@context" -JSON_LD_AT_ID = "@id" -IIIF_DEFAULT_JPG_SUFFIX = "default.jpg" -IIIF_ID = "id" -IIIF_BODY = "body" -IIIF_ITEMS = "items" -IIIF_RESOURCE = "resource" -IIIF_IMAGES = "images" -IIIF_CANVASES = "canvases" -IIIF_SEQUENCES = "sequences" -IIIF_FULL_RES_JPG_SUFFIX = "/full/full/0/default.jpg" -IIIF_PRESENTATION_API_MANIFEST_V2 = "http://iiif.io/api/presentation/2/context.json" -IIIF_PRESENTATION_API_MANIFEST_V3 = "http://iiif.io/api/presentation/3/context.json" - -CONTENTDM_IIIF_MANIFEST_JSON = "/manifest.json" -CONTENTDM_IIIF_INFO = "/iiif/info/" 
-CONTENT_DM_ISSHOWNAT_REGEX = r"^/cdm/ref/collection/(.*)/id/(.*)$" # todo diff --git a/downloader.py b/downloader.py index 9530456..c8932d8 100644 --- a/downloader.py +++ b/downloader.py @@ -2,6 +2,7 @@ import logging import os import time +from web import get_http_session from typing import IO import click @@ -11,32 +12,30 @@ from tqdm import tqdm from common import ( - Result, - Tracker, + load_ids, +) +from wikimedia import INVALID_CONTENT_TYPES +from dpla import ( check_partner, - cleanup_temp_dir, - extract_urls, - get_http_session, get_item_metadata, get_provider_and_data_provider, get_providers_data, - get_s3, - get_s3_path, - get_temp_file, is_wiki_eligible, provider_str, - s3_file_exists, - setup_logging, - setup_temp_dir, - load_ids, + extract_urls, ) -from constants import ( - INVALID_CONTENT_TYPES, +from logs import setup_logging +from s3 import ( + get_s3, + get_s3_path, + s3_file_exists, S3_BUCKET, S3_KEY_CHECKSUM, - S3_KEY_CONTENT_TYPE, S3_KEY_METADATA, + S3_KEY_CONTENT_TYPE, ) +from temp import cleanup_temp_dir, get_temp_file, setup_temp_dir +from tracker import Result, Tracker def download_media( @@ -101,6 +100,7 @@ def upload_temp_file( unit="B", unit_divisor=1024, unit_scale=True, + delay=2, ) as t: obj.upload_fileobj( Fileobj=file, @@ -140,6 +140,7 @@ def download_file_to_temp_path(media_url: str): unit="B", unit_divisor=1024, unit_scale=True, + delay=2, ) as t: with open(temp_file.name, "wb") as f: for chunk in response.iter_content(None): @@ -184,7 +185,7 @@ def main( dpla_ids = load_ids(ids_file) - for dpla_id in tqdm(dpla_ids, desc="Downloading Items", unit=" Items"): + for dpla_id in tqdm(dpla_ids, desc="Downloading Items", unit="Item"): logging.info(f"DPLA ID: {dpla_id}") try: item_metadata = get_item_metadata(dpla_id, api_key) @@ -212,7 +213,7 @@ def main( logging.info(f"Data Provider: {provider_str(data_provider)}") for media_url in tqdm( - media_urls, desc="Downloading Files", leave=False, unit=" Files" + media_urls, desc="Downloading Files", leave=False, unit="File" ): count += 1 # hack to fix bad nara data diff --git a/dpla.py b/dpla.py new file mode 100644 index 0000000..4b08712 --- /dev/null +++ b/dpla.py @@ -0,0 +1,265 @@ +import logging +import re +import sys +from urllib.parse import urlparse + +import validators + +from common import null_safe, get_str, get_list, get_dict +from web import get_http_session, HTTP_REQUEST_HEADERS + + +def check_partner(partner: str) -> None: + if partner not in DPLA_PARTNERS: + sys.exit("Unrecognized partner.") + + +def get_item_metadata(dpla_id: str, api_key: str) -> dict: + url = DPLA_API_URL_BASE + dpla_id + headers = {AUTHORIZATION_HEADER: api_key} + response = get_http_session().get(url, headers=headers) + response_json = response.json() + return response_json.get(DPLA_API_DOCS)[0] + + +def is_wiki_eligible(item_metadata: dict, provider: dict, data_provider: dict) -> bool: + provider_ok = null_safe(provider, UPLOAD_FIELD_NAME, False) or null_safe( + data_provider, UPLOAD_FIELD_NAME, False + ) + + rights_category_ok = ( + get_str(item_metadata, RIGHTS_CATEGORY_FIELD_NAME) == UNLIMITED_RE_USE + ) + + is_shown_at = get_str(item_metadata, EDM_IS_SHOWN_AT) + media_master = len(get_list(item_metadata, MEDIA_MASTER_FIELD_NAME)) > 0 + iiif_manifest = null_safe(item_metadata, IIIF_MANIFEST_FIELD_NAME, False) + + if not iiif_manifest and not media_master: + iiif_url = contentdm_iiif_url(is_shown_at) + if iiif_url is not None: + response = get_http_session().head(iiif_url, allow_redirects=True) + if response.status_code < 400: 
+                item_metadata[IIIF_MANIFEST_FIELD_NAME] = iiif_url
+                iiif_manifest = True
+
+    asset_ok = media_master or iiif_manifest
+
+    # todo create banlist. item based? sha based? local id based? all three?
+    # todo don't reupload if deleted
+
+    id_ok = True
+
+    logging.info(
+        f"Rights: {rights_category_ok}, Asset: {asset_ok}, Provider: {provider_ok}, ID: {id_ok}"
+    )
+
+    return rights_category_ok and asset_ok and provider_ok and id_ok
+
+
+def get_provider_and_data_provider(
+    item_metadata: dict, providers_json: dict
+) -> tuple[dict, dict]:
+    """
+    Loads metadata about the provider and data provider from the providers json file.
+    """
+
+    provider_name = get_str(
+        get_dict(item_metadata, PROVIDER_FIELD_NAME), EDM_AGENT_NAME
+    )
+    data_provider_name = get_str(
+        get_dict(item_metadata, DATA_PROVIDER_FIELD_NAME), EDM_AGENT_NAME
+    )
+    provider = get_dict(providers_json, provider_name)
+    data_provider = get_dict(
+        get_dict(provider, INSTITUTIONS_FIELD_NAME), data_provider_name
+    )
+    return provider, data_provider
+
+
+def get_providers_data() -> dict:
+    """Loads the institutions file from ingestion3 in GitHub."""
+    return get_http_session().get(INSTITUTIONS_URL).json()
+
+
+def provider_str(provider: dict | None) -> str:
+    if provider is None:
+        return "Provider: None"
+    else:
+        return (
+            f"Provider: {provider.get(WIKIDATA_FIELD_NAME, "")}, "
+            f"{provider.get(UPLOAD_FIELD_NAME, "")}"
+        )
+
+
+def extract_urls(item_metadata: dict) -> list[str]:
+    if MEDIA_MASTER_FIELD_NAME in item_metadata:
+        return get_list(item_metadata, MEDIA_MASTER_FIELD_NAME)
+
+    elif IIIF_MANIFEST_FIELD_NAME in item_metadata:
+        return get_iiif_urls(get_str(item_metadata, IIIF_MANIFEST_FIELD_NAME))
+
+    else:
+        raise NotImplementedError(
+            f"No {MEDIA_MASTER_FIELD_NAME} or {IIIF_MANIFEST_FIELD_NAME}"
+        )
+
+
+def iiif_v2_urls(iiif: dict) -> list[str]:
+    """
+    Extracts image URLs from a v2 IIIF manifest and returns them as a list
+    """
+    urls = []
+    sequences = get_list(iiif, IIIF_SEQUENCES)
+    # Use the first sequence, if any; a manifest with no sequences yields no URLs.
+    canvases = get_list(sequences[0], IIIF_CANVASES) if sequences else []
+
+    for canvas in canvases:
+        for image in get_list(canvas, IIIF_IMAGES):
+            resource = get_dict(image, IIIF_RESOURCE)
+            url = get_str(resource, JSON_LD_AT_ID)
+            if url:
+                urls.append(url)
+    return urls
+
+
+def iiif_v3_urls(iiif: dict) -> list[str]:
+    """
+    Extracts image URLs from a v3 IIIF manifest and returns them as a list
+    """
+    urls = []
+    for item in get_list(iiif, IIIF_ITEMS):
+        try:
+            url = get_str(
+                get_dict(item[IIIF_ITEMS][0][IIIF_ITEMS][0], IIIF_BODY), IIIF_ID
+            )
+            # Work around the v3 API's presumption that the caller supplies the
+            # resolution in the URL
+            if url:
+                # This condition may not be necessary but I'm leaving it in for now
+                # TODO does this end up giving us smaller resources than we want?
+                if url.endswith(IIIF_DEFAULT_JPG_SUFFIX):
+                    urls.append(url)
+                else:
+                    urls.append(url + IIIF_FULL_RES_JPG_SUFFIX)
+        except (IndexError, TypeError, KeyError) as e:
+            logging.warning("Unable to parse IIIF manifest.", exc_info=e)
+            return []
+    return urls
+
+
+def get_iiif_urls(iiif_presentation_api_url: str) -> list[str]:
+    """
+    Extracts image URLs from an IIIF manifest and returns them as a list
+    Currently only supports IIIF v2 and v3
+    """
+    manifest = _get_iiif_manifest(iiif_presentation_api_url)
+    # v2 or v3?
+ if get_str(manifest, JSON_LD_AT_CONTEXT) == IIIF_PRESENTATION_API_MANIFEST_V3: + return iiif_v3_urls(manifest) + elif get_str(manifest, JSON_LD_AT_CONTEXT) == IIIF_PRESENTATION_API_MANIFEST_V2: + return iiif_v2_urls(manifest) + else: + raise Exception("Unimplemented IIIF version") + + +def _get_iiif_manifest(url: str) -> dict: + """ + :return: parsed JSON + """ + if not validators.url(url): + raise Exception(f"Invalid url {url}") + try: + request = get_http_session().get(url, headers=HTTP_REQUEST_HEADERS) + request.raise_for_status() + return request.json() + + except Exception as ex: + # todo maybe this should return None? + raise Exception(f"Error getting IIIF manifest at {url}") from ex + + +def contentdm_iiif_url(is_shown_at: str) -> str | None: + """ + Creates a IIIF presentation API manifest URL from the + link to the object in ContentDM + + We want to go from + http://www.ohiomemory.org/cdm/ref/collection/p16007coll33/id/126923 + to + http://www.ohiomemory.org/iiif/info/p16007coll33/126923/manifest.json + + """ + parsed_url = urlparse(is_shown_at) + match_result = re.match(CONTENT_DM_ISSHOWNAT_REGEX, parsed_url.path) + if not match_result: + return None + else: + return ( + parsed_url.scheme + + "://" + + parsed_url.netloc + + CONTENTDM_IIIF_INFO + + match_result.group(1) + + "/" + + match_result.group(2) + + CONTENTDM_IIIF_MANIFEST_JSON + ) + + +DPLA_API_URL_BASE = "https://api.dp.la/v2/items/" +DPLA_API_DOCS = "docs" +INSTITUTIONS_URL = ( + "https://raw.githubusercontent.com/dpla/ingestion3" + "/refs/heads/develop/src/main/resources/wiki/institutions_v2.json" +) +UPLOAD_FIELD_NAME = "upload" +INSTITUTIONS_FIELD_NAME = "institutions" +SOURCE_RESOURCE_FIELD_NAME = "sourceResource" +MEDIA_MASTER_FIELD_NAME = "mediaMaster" +IIIF_MANIFEST_FIELD_NAME = "iiifManifest" +PROVIDER_FIELD_NAME = "provider" +DATA_PROVIDER_FIELD_NAME = "dataProvider" +EXACT_MATCH_FIELD_NAME = "exactMatch" +EDM_AGENT_NAME = "name" +EDM_IS_SHOWN_AT = "isShownAt" +RIGHTS_CATEGORY_FIELD_NAME = "rightsCategory" +EDM_RIGHTS_FIELD_NAME = "rights" +EDM_TIMESPAN_PREF_LABEL = "prefLabel" +UNLIMITED_RE_USE = "Unlimited Re-Use" +DC_CREATOR_FIELD_NAME = "creator" +DC_DATE_FIELD_NAME = "date" +DC_DESCRIPTION_FIELD_NAME = "description" +DC_TITLE_FIELD_NAME = "title" +DC_IDENTIFIER_FIELD_NAME = "identifier" +WIKIDATA_FIELD_NAME = "Wikidata" +AUTHORIZATION_HEADER = "Authorization" +JSON_LD_AT_CONTEXT = "@context" +JSON_LD_AT_ID = "@id" +IIIF_DEFAULT_JPG_SUFFIX = "default.jpg" +IIIF_ID = "id" +IIIF_BODY = "body" +IIIF_ITEMS = "items" +IIIF_RESOURCE = "resource" +IIIF_IMAGES = "images" +IIIF_CANVASES = "canvases" +IIIF_SEQUENCES = "sequences" +IIIF_FULL_RES_JPG_SUFFIX = "/full/full/0/default.jpg" +IIIF_PRESENTATION_API_MANIFEST_V2 = "http://iiif.io/api/presentation/2/context.json" +IIIF_PRESENTATION_API_MANIFEST_V3 = "http://iiif.io/api/presentation/3/context.json" +CONTENTDM_IIIF_MANIFEST_JSON = "/manifest.json" +CONTENTDM_IIIF_INFO = "/iiif/info/" +CONTENT_DM_ISSHOWNAT_REGEX = r"^/cdm/ref/collection/(.*)/id/(.*)$" # todo +DPLA_PARTNERS = [ + "bpl", + "georgia", + "il", + "indiana", + "nara", + "northwest-heritage", + "ohio", + "p2p", + "pa", + "texas", + "minnesota", +] diff --git a/logs.py b/logs.py new file mode 100644 index 0000000..55b15c0 --- /dev/null +++ b/logs.py @@ -0,0 +1,46 @@ +import os +import logging +from datetime import datetime + +from tqdm import tqdm + + +class TqdmLoggingHandler(logging.Handler): + """ + This class redirects logging's console output through tqdm so the progress + bars don't get mangled. 
+    """
+
+    def __init__(self, level=logging.NOTSET):
+        super().__init__(level)
+
+    def emit(self, record):
+        try:
+            msg = self.format(record)
+            tqdm.write(msg)
+            self.flush()
+        except Exception:
+            self.handleError(record)
+
+
+def setup_logging(partner: str, event_type: str, level: int = logging.INFO) -> None:
+    os.makedirs(LOGS_DIR_BASE, exist_ok=True)
+    time_str = datetime.now().strftime("%Y%m%d-%H%M%S")
+    log_file_name = f"{time_str}-{partner}-{event_type}.log"
+    filename = f"{LOGS_DIR_BASE}/{log_file_name}"
+    logging.basicConfig(
+        level=level,
+        datefmt="%H:%M:%S",
+        handlers=[
+            TqdmLoggingHandler(),
+            logging.FileHandler(filename=filename, mode="w"),
+        ],
+        format="[%(levelname)s] " "%(asctime)s: " "%(message)s",
+    )
+    logging.info(f"Logging to {filename}.")
+    for d in logging.Logger.manager.loggerDict:
+        if d.startswith("pywiki"):
+            logging.getLogger(d).setLevel(logging.ERROR)
+
+
+LOGS_DIR_BASE = "logs"
diff --git a/s3.py b/s3.py
new file mode 100644
index 0000000..3a7dd2a
--- /dev/null
+++ b/s3.py
@@ -0,0 +1,41 @@
+import boto3
+from botocore.config import Config
+from botocore.exceptions import ClientError
+from mypy_boto3_s3 import S3ServiceResource
+
+
+def get_s3_path(dpla_id: str, ordinal: int, partner: str) -> str:
+    return (
+        f"{partner}/images/{dpla_id[0]}/{dpla_id[1]}/"
+        f"{dpla_id[2]}/{dpla_id[3]}/{dpla_id}/{ordinal}_{dpla_id}"
+    ).strip()
+
+
+def s3_file_exists(path: str, s3: S3ServiceResource):
+    try:
+        s3.Object(S3_BUCKET, path).load()
+        return True
+    except ClientError as e:
+        if e.response["Error"]["Code"] == "404":
+            # The object does not exist.
+            return False
+        else:
+            # Something else has gone wrong.
+            raise
+
+
+def get_s3() -> S3ServiceResource:
+    config = Config(
+        signature_version="s3v4",
+        max_pool_connections=25,
+        retries={"max_attempts": S3_RETRIES},
+    )
+
+    return boto3.resource("s3", config=config)
+
+
+S3_RETRIES = 3
+S3_BUCKET = "dpla-mdpdb"  # TODO change for prod
+S3_KEY_CHECKSUM = "sha1"
+S3_KEY_METADATA = "Metadata"
+S3_KEY_CONTENT_TYPE = "ContentType"
diff --git a/temp.py b/temp.py
new file mode 100644
index 0000000..e8bb92a
--- /dev/null
+++ b/temp.py
@@ -0,0 +1,34 @@
+import logging
+import os
+import tempfile
+
+__temp_dir: tempfile.TemporaryDirectory | None = None
+
+
+def setup_temp_dir() -> None:
+    global __temp_dir
+    if __temp_dir is None:
+        __temp_dir = tempfile.TemporaryDirectory(
+            "tmp", "wiki", dir="", ignore_cleanup_errors=True, delete=False
+        )
+
+
+def cleanup_temp_dir() -> None:
+    global __temp_dir
+    if __temp_dir is not None:
+        __temp_dir.cleanup()
+
+
+def get_temp_file():
+    global __temp_dir
+    if __temp_dir is None:
+        raise Exception("Temp dir not initialized.")
+    return tempfile.NamedTemporaryFile(delete=False, dir=__temp_dir.name)
+
+
+def clean_up_tmp_file(temp_file) -> None:
+    try:
+        if temp_file:
+            os.unlink(temp_file.name)
+    except Exception as e:
+        logging.warning("Temp file unlink failed.", exc_info=e)
diff --git a/tracker.py b/tracker.py
new file mode 100644
index 0000000..9676b70
--- /dev/null
+++ b/tracker.py
@@ -0,0 +1,26 @@
+from enum import Enum
+
+Result = Enum("Result", ["DOWNLOADED", "FAILED", "SKIPPED", "UPLOADED", "BYTES"])
+
+
+class Tracker:
+    def __init__(self):
+        self.data = {}
+
+    def increment(self, status: Result, amount=1) -> None:
+        if status not in self.data:
+            self.data[status] = 0
+        self.data[status] = self.data[status] + amount
+
+    def count(self, status: Result) -> int:
+        if status not in self.data:
+            return 0
+        else:
+            return self.data[status]
+
+    def __str__(self) -> str:
+        result = "COUNTS:\n"
+        for key
in self.data: + value = self.data[key] + result += f"{key.name}: {value}\n" + return result diff --git a/uploader.py b/uploader.py index 33c2601..0e6bc70 100644 --- a/uploader.py +++ b/uploader.py @@ -12,70 +12,69 @@ from pywikibot.tools.chars import replace_invisible from common import ( - get_item_metadata, - extract_urls, - get_s3_path, - get_temp_file, - setup_temp_dir, - cleanup_temp_dir, - get_s3, - setup_logging, - clean_up_tmp_file, - Tracker, - Result, - is_wiki_eligible, - get_provider_and_data_provider, - get_providers_data, - check_partner, - provider_str, get_str, get_list, get_dict, - get_http_session, load_ids, ) -from constants import ( - COMMONS_SITE_NAME, - WMC_UPLOAD_CHUNK_SIZE, - IGNORE_WIKIMEDIA_WARNINGS, - S3_BUCKET, - S3_KEY_CHECKSUM, - INVALID_CONTENT_TYPES, - WIKIDATA_FIELD_NAME, - EDM_RIGHTS_FIELD_NAME, - RESERVED_WIKITEXT_STRINGS, +from logs import setup_logging +from s3 import get_s3_path, get_s3, S3_BUCKET, S3_KEY_CHECKSUM +from tracker import Result, Tracker +from temp import setup_temp_dir, cleanup_temp_dir, get_temp_file, clean_up_tmp_file +from dpla import ( + check_partner, + get_item_metadata, + is_wiki_eligible, + get_provider_and_data_provider, + get_providers_data, + provider_str, SOURCE_RESOURCE_FIELD_NAME, - VALUE_JOIN_DELIMITER, + EDM_IS_SHOWN_AT, + EDM_RIGHTS_FIELD_NAME, + EDM_TIMESPAN_PREF_LABEL, DC_CREATOR_FIELD_NAME, - DC_TITLE_FIELD_NAME, - DC_DESCRIPTION_FIELD_NAME, DC_DATE_FIELD_NAME, - EDM_TIMESPAN_PREF_LABEL, - EDM_IS_SHOWN_AT, + DC_DESCRIPTION_FIELD_NAME, + DC_TITLE_FIELD_NAME, DC_IDENTIFIER_FIELD_NAME, - CC_URL_REGEX, - CC_BY_SA_URL_BASE, - CC_BY_URL_BASE, - CC_ZERO_URL_BASE, - CC_PD_URL_BASE, - RS_NOC_URL_BASE, - RS_NKC_URL_BASE, - RS_NKC_TEMPLATE, - NOC_US_TEMPLATE, - PD_US_TEMPLATE, - CC_ZERO_TEMPLATE, - RIGHTS_STATEMENTS_URL_BASE, + WIKIDATA_FIELD_NAME, + extract_urls, +) +from web import get_http_session +from wikimedia import ( + INVALID_CONTENT_TYPES, COMMONS_URL_PREFIX, - FIND_BY_HASH_URL_PREFIX, - FIND_BY_HASH_QUERY_FIELD_NAME, - FIND_BY_HASH_ALLIMAGES_FIELD_NAME, ERROR_FILEEXISTS, ERROR_MIME, ERROR_BANNED, ERROR_DUPLICATE, ERROR_NOCHANGE, + COMMONS_SITE_NAME, + WMC_UPLOAD_CHUNK_SIZE, + VALUE_JOIN_DELIMITER, + RESERVED_WIKITEXT_STRINGS, + IGNORE_WIKIMEDIA_WARNINGS, + FIND_BY_HASH_URL_PREFIX, + FIND_BY_HASH_QUERY_FIELD_NAME, + FIND_BY_HASH_ALLIMAGES_FIELD_NAME, ) +CC_URL_REGEX = "^http://creativecommons.org/licenses/(.*)" + +RIGHTS_STATEMENTS_URL_BASE = "http://rightsstatements.org" +RS_NKC_URL_BASE = RIGHTS_STATEMENTS_URL_BASE + "/vocab/NKC/" +RS_NOC_URL_BASE = RIGHTS_STATEMENTS_URL_BASE + "/vocab/NoC-US/" +CC_URL_BASE = "http://creativecommons.org" +CC_PD_URL_BASE = CC_URL_BASE + "/publicdomain/mark/" +CC_ZERO_URL_BASE = CC_URL_BASE + "/publicdomain/zero/" +CC_BY_URL_BASE = CC_URL_BASE + "/licenses/by/" +CC_BY_SA_URL_BASE = CC_URL_BASE + "/licenses/by-sa/" + +CC_ZERO_TEMPLATE = "cc-zero" +RS_NKC_TEMPLATE = "NKC" +NOC_US_TEMPLATE = "NoC-US" +PD_US_TEMPLATE = "PD-US" + def get_page(site: pywikibot.Site, title: str) -> FilePage: """ @@ -288,7 +287,7 @@ def main(ids_file, partner: str, api_key: str, dry_run: bool, verbose: bool) -> dpla_ids = load_ids(ids_file) - for dpla_id in tqdm(dpla_ids, desc="Uploading Items", unit=" Items"): + for dpla_id in tqdm(dpla_ids, desc="Uploading Items", unit="Item"): logging.info(f"DPLA ID: {dpla_id}") item_metadata = get_item_metadata(dpla_id, api_key) @@ -313,7 +312,7 @@ def main(ids_file, partner: str, api_key: str, dry_run: bool, verbose: bool) -> # todo manifest of files? 
files = extract_urls(item_metadata) - for file in tqdm(files, desc="Uploading Files", leave=False, unit=" Files"): + for file in tqdm(files, desc="Uploading Files", leave=False, unit="File"): ordinal += 1 # todo if we're walking s3, this comes from the name logging.info(f"Page {ordinal}") # one-pagers don't have page numbers in their titles @@ -385,6 +384,7 @@ def main(ids_file, partner: str, api_key: str, dry_run: bool, verbose: bool) -> unit="B", unit_scale=1024, unit_divisor=True, + delay=2, ) as t: s3_object.download_file( temp_file.name, diff --git a/web.py b/web.py new file mode 100644 index 0000000..ace8804 --- /dev/null +++ b/web.py @@ -0,0 +1,35 @@ +import requests +from requests.adapters import HTTPAdapter +from urllib3 import Retry + +__http_session: requests.Session | None = None + + +def get_http_session() -> requests.Session: + global __http_session + if __http_session is not None: + return __http_session + retry_strategy = Retry( + connect=3, + read=3, + redirect=5, + status=5, + other=5, + backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["HEAD", "GET", "OPTIONS"], + respect_retry_after_header=True, + raise_on_status=True, + raise_on_redirect=True, + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + __http_session = requests.Session() + __http_session.mount("https://", adapter) + __http_session.mount("http://", adapter) + return __http_session + + +HTTP_REQUEST_HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \ + (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36" +} diff --git a/wikimedia.py b/wikimedia.py new file mode 100644 index 0000000..b0e72d5 --- /dev/null +++ b/wikimedia.py @@ -0,0 +1,49 @@ +INVALID_CONTENT_TYPES = [ + "text/html", + "application/json", + "application/xml", + "text/plain", +] +COMMONS_URL_PREFIX = "https://commons.wikimedia.org/wiki/File:" +ERROR_FILEEXISTS = "fileexists-shared-forbidden" +ERROR_MIME = "filetype-badmime" +ERROR_BANNED = "filetype-banned" +ERROR_DUPLICATE = "duplicate" +ERROR_NOCHANGE = "no-change" +COMMONS_SITE_NAME = "commons" +WMC_UPLOAD_CHUNK_SIZE = 20_000_000 # 20 MB +VALUE_JOIN_DELIMITER = "; " +RESERVED_WIKITEXT_STRINGS = ["|", "=", "[[", "]]", "{{", "}}", "''"] +IGNORE_WIKIMEDIA_WARNINGS = [ + # Target filename has a bad prefix {msg}. + "bad-prefix", + # Target filename is invalid. + "badfilename", + # The file is a duplicate of a deleted file {msg}. + "duplicate-archive", + # The upload is an exact duplicate of older version(s) of this file + "duplicate-version", + # File {msg} is empty. + "empty-file", + # File [Page] {msg} already exists + "exists", + # File exists with different extension as {msg}. + "exists-normalized", + # File {msg} type is unwanted type. + "filetype-unwanted-type", + # Target filename exists but with a different file {msg} + "page-exists", + # The file {msg} was previously deleted. + "was-deleted", + # Not ignored: + # Uploaded file is a duplicate of {msg} + # 'duplicate', + # The upload is an exact duplicate of the current version of this file + # 'no-change', +] +FIND_BY_HASH_URL_PREFIX: str = ( + "https://commons.wikimedia.org/w/api.php?action=query&format=json" + "&list=allimages&aisha1=" +) +FIND_BY_HASH_QUERY_FIELD_NAME = "query" +FIND_BY_HASH_ALLIMAGES_FIELD_NAME = "allimages"
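The refactor keeps the public call pattern intact; roughly, a caller wires the relocated pieces together as sketched below. This is an illustrative sketch only (not part of the diff), mirroring the sequence already used in downloader.py; the API key and DPLA ID are hypothetical placeholders, and "ohio" is one of the keys in dpla.DPLA_PARTNERS.

# Illustrative usage of the new modules; placeholder credentials and IDs.
from dpla import (
    check_partner,
    extract_urls,
    get_item_metadata,
    get_provider_and_data_provider,
    get_providers_data,
    is_wiki_eligible,
)
from logs import setup_logging
from tracker import Result, Tracker

partner = "ohio"
check_partner(partner)              # exits if the partner is unrecognized
setup_logging(partner, "download")  # tqdm-aware console handler plus a log file

tracker = Tracker()
item = get_item_metadata("00000000000000000000000000000000", "DPLA_API_KEY")
provider, data_provider = get_provider_and_data_provider(item, get_providers_data())

if is_wiki_eligible(item, provider, data_provider):
    for url in extract_urls(item):  # mediaMaster URLs or IIIF manifest image URLs
        tracker.increment(Result.DOWNLOADED)
print(tracker)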