
Commit

Add support for uploading to s3
hellais committed Aug 19, 2024
1 parent 5523ee5 commit ffea0e0
Showing 3 changed files with 74 additions and 14 deletions.
10 changes: 9 additions & 1 deletion download_assets.py
@@ -32,10 +32,18 @@ def file_sha1_hexdigest(filepath: Path):
             h.update(b)
     return h.hexdigest()
 
+def file_md5_hexdigest(filepath: Path):
+    h = hashlib.md5()
+    with filepath.open("rb") as in_file:
+        while True:
+            b = in_file.read(2**16)
+            if not b:
+                break
+            h.update(b)
+    return h.hexdigest()
 
 IAItem = namedtuple("IAItem", ["identifier", "filename", "sha1"])
 
 
 def list_all_ia_items(identifier: str) -> List[IAItem]:
     ia_items = []
     resp = req_session.get(
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,10 +3,10 @@ internetarchive==3.0.2
 lxml==4.9.1
 maxminddb==2.2.0
 requests==2.31.0
+boto3==1.24.91
 
 # Indirect
 black==24.3.0
-boto3==1.24.91
 botocore==1.27.91
 certifi==2023.7.22
 charset-normalizer==2.1.1
76 changes: 64 additions & 12 deletions upload_outputs.py
@@ -2,11 +2,18 @@
 import time
+import sys  # needed for the sys.exit(1) call in main() below
 from pathlib import Path
 from itertools import chain
-from download_assets import list_all_ia_items, file_sha1_hexdigest
+from download_assets import list_all_ia_items, file_sha1_hexdigest, file_md5_hexdigest
 
+import boto3
 import internetarchive as ia
 
 
+def iter_outputs(outputs_dir: Path):
+    # The outputs we publish: the gzipped mmdb databases and the AS org map.
+    for fp in chain(
+        outputs_dir.glob("*.mmdb.gz"), outputs_dir.glob("all_as_org_map.json")
+    ):
+        yield fp
+
 def upload_to_ia(identifier: str, filepath: Path, access_key: str, secret_key: str):
     print(f" uploading {filepath.name}")
     files = {filepath.name: filepath.open("rb")}
@@ -18,15 +25,13 @@ def upload_to_ia(identifier: str, filepath: Path, access_key: str, secret_key: str):
         time.sleep(backoff)
 
 
-def upload_missing(outputs_dir: Path, secret_key: str, access_key: str):
+def upload_missing_ia(outputs_dir: Path, secret_key: str, access_key: str):
     identifier = "ip2country-as"
     existing_items = {}
     for itm in list_all_ia_items(identifier=identifier):
         existing_items[itm.filename] = itm
 
-    for fp in chain(
-        outputs_dir.glob("*.mmdb.gz"), outputs_dir.glob("all_as_org_map.json")
-    ):
+    for fp in iter_outputs(outputs_dir):
         if (
             fp.name in existing_items
             and file_sha1_hexdigest(fp) == existing_items[fp.name].sha1
@@ -39,17 +44,64 @@ def upload_missing(outputs_dir: Path, secret_key: str, access_key: str):
             secret_key=secret_key,
         )
 
+def upload_missing_s3(outputs_dir: Path, access_key: str, secret_key: str):
+    session = boto3.Session(
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+    )
+    s3_client = session.client('s3')
+    s3 = session.resource('s3')
+
+    existing_items = {}
+    for obj in s3.Bucket('ooni-data-eu-fra').objects.filter(Prefix='ip2country-as/'):
+        if obj.size == 0:
+            # Skip directories
+            continue
+        filename = obj.key.split("/")[-1]
+        # Note: the ETag is not actually always the md5sum; for example, it
+        # never is for the .json files. Maybe we should switch to something
+        # different for uploads to S3 (one option is sketched after this diff).
+        md5_sum = obj.e_tag.replace('"', '')
+        existing_items[filename] = md5_sum
+
+    for fp in iter_outputs(outputs_dir):
+        if (
+            fp.name in existing_items
+            and file_md5_hexdigest(fp) == existing_items[fp.name]
+        ):
+            continue
+
+        with fp.open("rb") as in_file:
+            s3_client.upload_fileobj(in_file, 'ooni-data-eu-fra', f'ip2country-as/{fp.name}')
+
 def main():
     outputs_dir = Path("outputs")
-    access_key = os.environ.get("IA_ACCESS_KEY", "")
-    secret_key = os.environ.get("IA_SECRET_KEY", "")
-    if access_key == "" or secret_key == "":
-        print("WARNING IA_ACCESS_KEY or IA_SECRET_KEY are not set. Upload will fail")
-    upload_missing(
-        outputs_dir=outputs_dir, access_key=access_key, secret_key=secret_key
-    )
+    ia_access_key = os.environ.get("IA_ACCESS_KEY", "")
+    ia_secret_key = os.environ.get("IA_SECRET_KEY", "")
+
+    s3_access_key = os.environ.get("S3_ACCESS_KEY", "")
+    s3_secret_key = os.environ.get("S3_SECRET_KEY", "")
+
+    did_upload = False
+    if ia_access_key == "" or ia_secret_key == "":
+        print("WARNING: IA_ACCESS_KEY or IA_SECRET_KEY is not set. Skipping Internet Archive upload")
+    else:
+        upload_missing_ia(
+            outputs_dir=outputs_dir, access_key=ia_access_key, secret_key=ia_secret_key
+        )
+        did_upload = True
+
+    if s3_access_key == "" or s3_secret_key == "":
+        print("WARNING: S3_ACCESS_KEY or S3_SECRET_KEY is not set. Skipping S3 upload")
+    else:
+        upload_missing_s3(
+            outputs_dir=outputs_dir, access_key=s3_access_key, secret_key=s3_secret_key
+        )
+        did_upload = True
+
+    if not did_upload:
+        print("No upload performed!")
+        sys.exit(1)
 
 if __name__ == "__main__":
     main()
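
A note on the ETag caveat flagged in upload_missing_s3: S3's ETag equals the object's MD5 only for single-part, unencrypted uploads; multipart uploads (which upload_fileobj performs for files above its size threshold) get an MD5-of-part-MD5s with a "-N" suffix, so the MD5 comparison above will keep re-uploading those objects. One option, as the comment suggests, is to store a checksum we control as user-defined object metadata and compare against that. Below is a minimal sketch under those assumptions, reusing the bucket and prefix from the diff; the helper names and the "sha1" metadata key are hypothetical, not part of this commit.

from pathlib import Path

import boto3
from botocore.exceptions import ClientError

from download_assets import file_sha1_hexdigest

BUCKET = "ooni-data-eu-fra"
PREFIX = "ip2country-as/"

def upload_with_checksum(s3_client, filepath: Path):
    # Attach our own sha1 as user metadata (sent as the x-amz-meta-sha1 header).
    with filepath.open("rb") as in_file:
        s3_client.upload_fileobj(
            in_file,
            BUCKET,
            f"{PREFIX}{filepath.name}",
            ExtraArgs={"Metadata": {"sha1": file_sha1_hexdigest(filepath)}},
        )

def remote_sha1(s3_client, filename: str):
    # head_object returns user metadata with the x-amz-meta- prefix stripped.
    try:
        head = s3_client.head_object(Bucket=BUCKET, Key=f"{PREFIX}{filename}")
    except ClientError:
        return None  # object does not exist yet
    return head["Metadata"].get("sha1")

With this, the skip check in upload_missing_s3 would become file_sha1_hexdigest(fp) == remote_sha1(s3_client, fp.name), which holds regardless of how the object was uploaded.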
