Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for uploading to s3 #15

Merged
merged 1 commit into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion download_assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,18 @@ def file_sha1_hexdigest(filepath: Path):
h.update(b)
return h.hexdigest()

def file_md5_hexdigest(filepath: Path):
    """Return the hex-encoded md5 digest of the file at *filepath*.

    The file is read in 64 KiB chunks so arbitrarily large files can be
    hashed without being loaded fully into memory.
    """
    digest = hashlib.md5()
    with filepath.open("rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF)
        for chunk in iter(lambda: f.read(2**16), b""):
            digest.update(chunk)
    return digest.hexdigest()

IAItem = namedtuple("IAItem", ["identifier", "filename", "sha1"])


def list_all_ia_items(identifier: str) -> List[IAItem]:
ia_items = []
resp = req_session.get(
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ internetarchive==3.0.2
lxml==4.9.1
maxminddb==2.2.0
requests==2.31.0
boto3==1.24.91

# Indirect
black==24.3.0
boto3==1.24.91
botocore==1.27.91
certifi==2023.7.22
charset-normalizer==2.1.1
Expand Down
76 changes: 64 additions & 12 deletions upload_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@
import time
from pathlib import Path
from itertools import chain
from download_assets import list_all_ia_items, file_sha1_hexdigest
from download_assets import list_all_ia_items, file_sha1_hexdigest, file_md5_hexdigest

import boto3
import internetarchive as ia


def iter_outputs(outputs_dir: Path):
    """Yield the output files under *outputs_dir* that should be published:
    every gzipped mmdb database, then the AS organisation map JSON."""
    for pattern in ("*.mmdb.gz", "all_as_org_map.json"):
        yield from outputs_dir.glob(pattern)

def upload_to_ia(identifier: str, filepath: Path, access_key: str, secret_key: str):
print(f" uploading {filepath.name}")
files = {filepath.name: filepath.open("rb")}
Expand All @@ -18,15 +25,13 @@ def upload_to_ia(identifier: str, filepath: Path, access_key: str, secret_key: s
time.sleep(backoff)


def upload_missing(outputs_dir: Path, secret_key: str, access_key: str):
def upload_missing_ia(outputs_dir: Path, secret_key: str, access_key: str):
identifier = "ip2country-as"
existing_items = {}
for itm in list_all_ia_items(identifier=identifier):
existing_items[itm.filename] = itm

for fp in chain(
outputs_dir.glob("*.mmdb.gz"), outputs_dir.glob("all_as_org_map.json")
):
for fp in iter_outputs(outputs_dir):
if (
fp.name in existing_items
and file_sha1_hexdigest(fp) == existing_items[fp.name].sha1
Expand All @@ -39,17 +44,64 @@ def upload_missing(outputs_dir: Path, secret_key: str, access_key: str):
secret_key=secret_key,
)

def upload_missing_s3(
    outputs_dir: Path,
    access_key: str,
    secret_key: str,
    bucket_name: str = "ooni-data-eu-fra",
    prefix: str = "ip2country-as/",
):
    """Upload output files to S3, skipping files whose md5 already matches.

    :param outputs_dir: directory containing the generated output files
    :param access_key: AWS access key id
    :param secret_key: AWS secret access key
    :param bucket_name: destination S3 bucket (defaults to the OONI bucket)
    :param prefix: key prefix under which files are stored; must end in "/"
    """
    session = boto3.Session(
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
    )
    s3_client = session.client("s3")
    s3 = session.resource("s3")

    # Map of filename -> ETag for objects already present under the prefix.
    existing_items = {}
    for obj in s3.Bucket(bucket_name).objects.filter(Prefix=prefix):
        if obj.size == 0:
            # Zero-byte keys are directory placeholders, not real files
            continue
        filename = obj.key.split("/")[-1]
        # NOTE: the ETag equals the md5 of the content only for single-part
        # (non-multipart) uploads. For example it never does for the .json
        # files, so those may be re-uploaded even when unchanged. Maybe we
        # should switch to something different for uploads in s3.
        md5_sum = obj.e_tag.replace('"', "")
        existing_items[filename] = md5_sum

    for fp in iter_outputs(outputs_dir):
        # Skip files already present with an identical md5/ETag.
        if (
            fp.name in existing_items
            and file_md5_hexdigest(fp) == existing_items[fp.name]
        ):
            continue

        with fp.open("rb") as in_file:
            s3_client.upload_fileobj(in_file, bucket_name, f"{prefix}{fp.name}")

def main():
    """Upload output files to the Internet Archive and/or S3.

    Credentials are read from the environment (IA_ACCESS_KEY/IA_SECRET_KEY
    and S3_ACCESS_KEY/S3_SECRET_KEY). Each destination is skipped with a
    warning when its credentials are missing; if neither upload runs, the
    process exits with status 1.
    """
    # NOTE(review): `sys` does not appear among the module-level imports in
    # the visible diff, so the original `sys.exit(1)` would raise NameError.
    # Imported locally here to be safe either way — TODO confirm at top of file.
    import sys

    outputs_dir = Path("outputs")

    ia_access_key = os.environ.get("IA_ACCESS_KEY", "")
    ia_secret_key = os.environ.get("IA_SECRET_KEY", "")

    s3_access_key = os.environ.get("S3_ACCESS_KEY", "")
    s3_secret_key = os.environ.get("S3_SECRET_KEY", "")

    did_upload = False
    if ia_access_key == "" or ia_secret_key == "":
        print("WARNING IA_ACCESS_KEY or IA_SECRET_KEY are not set. Skipping internet archive upload")
    else:
        upload_missing_ia(
            outputs_dir=outputs_dir, access_key=ia_access_key, secret_key=ia_secret_key
        )
        did_upload = True

    if s3_access_key == "" or s3_secret_key == "":
        print("WARNING S3_ACCESS_KEY or S3_SECRET_KEY are not set. Skipping s3 upload")
    else:
        upload_missing_s3(
            outputs_dir=outputs_dir, access_key=s3_access_key, secret_key=s3_secret_key
        )
        did_upload = True

    if not did_upload:
        print("No upload performed!")
        sys.exit(1)

# Script entry point: only run uploads when executed directly, not on import.
if __name__ == "__main__":
    main()