Skip to content

Commit

Permalink
Lint
Browse files Browse the repository at this point in the history
  • Loading branch information
mdellabitta committed Jun 24, 2024
1 parent 2f86030 commit b838132
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 36 deletions.
34 changes: 17 additions & 17 deletions wikimedia/entries/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@

import logging

from entries.entry import Entry
from executors.uploader import Uploader
from utilities.exceptions import UploadException, UploadWarning
from utilities.helpers import S3Helper, Text, InputHelper
from utilities.tracker import Result, Tracker
from entry import Entry
from wikimedia.executors.uploader import Uploader
from wikimedia.utilities.exceptions import UploadException, UploadWarning
from wikimedia.utilities.helpers import S3Helper, Text, InputHelper
from wikimedia.utilities.tracker import Result, Tracker


class UploadEntry(Entry):
Expand Down Expand Up @@ -42,17 +42,17 @@ def execute(self, **kwargs):
# Get the most recent parquet file from the input path
bucket, key = s3_helper.get_bucket_key(input_partner)
recent_key = s3_helper.most_recent(bucket=bucket, key=key, type="object")
input = f"s3://{bucket}/{recent_key}"
input_path = f"s3://{bucket}/{recent_key}"
# Read in most recent parquet file
df = Entry.load_data(data_in=input, columns=self.READ_COLUMNS).rename(
df = Entry.load_data(data_in=input_path, columns=self.READ_COLUMNS).rename(
columns=self.READ_COLUMNS
)
unique_ids = self.uploader._unique_ids(df)
# Set the total number DPLA records and intended uploads
self.tracker.set_dpla_count(len(unique_ids))
self.tracker.set_total(len(df))
# Summary of input parameters
self.log.info(f"Input............{input}")
self.log.info(f"Input............{input_path}")
self.log.info(f"Images...........{len(df)}")
self.log.info(f"DPLA records.....{self.tracker.item_cnt}")

Expand Down Expand Up @@ -82,14 +82,14 @@ def execute(self, **kwargs):
page_title = self.uploader.get_page_title(
title=title, dpla_identifier=dpla_id, suffix=ext, page=page
)
except UploadException as exec:
self.log.error(f"{str(exec)}")
except UploadException as error:
self.log.error(f"{str(error)}")
self.tracker.increment(Result.FAILED)
continue
try:
wikimedia_page = self.uploader.get_page(title=page_title)
except UploadException as exec:
self.log.error(f"{str(exec)}")
except UploadException as error:
self.log.error(f"{str(error)}")
self.tracker.increment(Result.FAILED)
continue
if wikimedia_page is None:
Expand All @@ -109,15 +109,15 @@ def execute(self, **kwargs):
)
self.tracker.increment(Result.UPLOADED, size=size)

except UploadWarning as _:
except UploadWarning:
self.log.info(f"Exists {Text.wikimedia_url(page_title)}")
self.tracker.increment(Result.SKIPPED)
continue
except UploadException as exec:
self.log.error(f"{str(exec)} -- {Text.wikimedia_url(page_title)}")
except UploadException as error:
self.log.error(f"{str(error)} -- {Text.wikimedia_url(page_title)}")
self.tracker.increment(Result.FAILED)
continue
except Exception as exception:
self.log.error(f"{str(exception)} -- {Text.wikimedia_url(page_title)}")
except Exception as error:
self.log.error(f"{str(error)} -- {Text.wikimedia_url(page_title)}")
self.tracker.increment(Result.FAILED)
continue
42 changes: 24 additions & 18 deletions wikimedia/executors/uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
import botocore
import numpy as np
import pywikibot
from utilities.exceptions import UploadException, UploadWarning
from utilities.helpers import S3Helper
from utilities.helpers import Text
from wikimedia.utilities.exceptions import UploadException, UploadWarning
from wikimedia.utilities.helpers import S3Helper
from wikimedia.utilities.helpers import Text


class Uploader:
Expand All @@ -29,7 +29,8 @@ class Uploader:
"bad-prefix", # Target filename has a bad prefix {msg}.
"badfilename", # Target filename is invalid.
"duplicate-archive", # The file is a duplicate of a deleted file {msg}.
"duplicate-version", # The upload is an exact duplicate of older version(s) of this file
"duplicate-version", # The upload is an exact duplicate of older version(s)
# of this file
"empty-file", # File {msg} is empty.
"exists", # File [Page] {msg} already exists
"exists-normalized", # File exists with different extension as {msg}.
Expand All @@ -38,7 +39,8 @@ class Uploader:
"was-deleted", # The file {msg} was previously deleted.
#
# 'duplicate', # Uploaded file is a duplicate of {msg}
# 'no-change', # The upload is an exact duplicate of the current version of this file
# 'no-change', # The upload is an exact duplicate of the current version
# of this file
]

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -84,19 +86,20 @@ def download(self, bucket, key, destination):
to {destination.name}: {str(cex)}"
) from cex

def _unique_ids(self, df):
@staticmethod
def _unique_ids(df):
"""
Return a dictionary of unique dpla_ids and their counts"""
unique, counts = np.unique(df["dpla_id"], return_counts=True)
return dict(zip(unique, counts))

def upload(self, wiki_file_page, dpla_identifier, text, file, page_title):
"""
:param wiki_file_page:
:param dpla_identifier:
:param text:
:param file:
:param page_title:
:return:
"""
comment = f'Uploading DPLA ID "[[dpla:{dpla_identifier}|{dpla_identifier}]]".'
Expand Down Expand Up @@ -126,23 +129,26 @@ def upload(self, wiki_file_page, dpla_identifier, text, file, page_title):
# FIXME this is dumb and should be better, it either raises an exception
# or returns True; kinda worthless?
return True
except Exception as exec:
error_string = str(exec)
except Exception as error:
error_string = str(error)
# TODO what does this error message actually mean? Page name?
if "fileexists-shared-forbidden:" in error_string:
raise UploadWarning("File already uploaded") from exec
raise UploadWarning("File already uploaded") from error
if "filetype-badmime" in error_string:
raise UploadException("Invalid MIME type") from exec
raise UploadException("Invalid MIME type") from error
if "filetype-banned" in error_string:
raise UploadException("Banned file type") from exec
raise UploadException("Banned file type") from error
# TODO what does this error message actually mean? MD5 hash collision?
if "duplicate" in error_string:
raise UploadWarning(f"File already exists, {error_string}") from exec
raise UploadWarning(f"File already exists, {error_string}") from error
if "no-change" in error_string:
raise UploadWarning(f"File exists, no change, {error_string}") from exec
raise UploadException(f"Failed: {error_string}") from exec
raise UploadWarning(
f"File exists, no change, {error_string}"
) from error
raise UploadException(f"Failed: {error_string}") from error

def get_page_title(self, title, dpla_identifier, suffix, page=None):
@staticmethod
def get_page_title(title, dpla_identifier, suffix, page=None):
"""
Makes a proper Wikimedia page title from the DPLA identifier and
the title of the image.
Expand Down Expand Up @@ -208,5 +214,5 @@ def get_extension(self, path):
else:
mime = mimetypes.guess_type(path)[0]
return mimetypes.guess_extension(mime)
except Exception as exec:
raise UploadException(f"No extension {path}: {str(exec)}") from exec
except Exception as error:
raise UploadException(f"No extension {path}: {str(error)}") from error
2 changes: 1 addition & 1 deletion wikimedia/utilities/emailer.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def body_html(self):


# Taken from Amazon example code:
# > https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/ses/ses_email.py
# https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/ses/ses_email.py
class SesMailSender:
"""Encapsulates functions to send emails with Amazon SES."""

Expand Down

0 comments on commit b838132

Please sign in to comment.