
63 google dataset search support #136

Draft · wants to merge 12 commits into base: main
10 changes: 10 additions & 0 deletions backend/app/main.py

@@ -116,6 +116,16 @@
tags=["extractors"],
dependencies=[Depends(get_current_username)],
)
api_router.include_router(
extractors.router,
@longshuicy (Member) · Oct 20, 2022

Your newly added routes are in routers/datasets.py, but this new prefix is importing routes from extractors.
So I can actually access your routes under /dataset/{id}/summary.jsonld, but not here.

I'd recommend you separate them out into a new router file (e.g. summary.py) and include them here as summary.router.
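A minimal sketch of what that split might look like, assuming a new routers/summary.py that reuses this PR's dataset lookup pattern (the file name and paths follow the reviewer's suggestion; the body is illustrative, not code from this PR):

# backend/app/routers/summary.py (hypothetical sketch)
from bson import ObjectId
from fastapi import APIRouter, Depends, HTTPException
from pymongo import MongoClient

from app import dependencies
from app.models.datasets import DatasetOut

router = APIRouter()


@router.get("/{dataset_id}/summary.jsonld")
async def get_dataset_jsonld(
    dataset_id: str, db: MongoClient = Depends(dependencies.get_db)
):
    # Return the dataset's JSON-LD summary, or 404 if it does not exist
    if (
        dataset := await db["datasets"].find_one({"_id": ObjectId(dataset_id)})
    ) is not None:
        # the JSON-LD conversion from this PR (datasetout2jsonld) would go here
        return DatasetOut.from_mongo(dataset)
    raise HTTPException(status_code=404, detail=f"Dataset {dataset_id} not found")


# main.py would then include it, e.g.:
#   from app.routers import summary
#   api_router.include_router(summary.router, prefix="/datasets", tags=["jsonld"])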

Contributor Author

That was left over, thanks. I'll skip the tags for now and take your other advice to put those routes in another file.

prefix="/summary.jsonld",
Member

prefix is for the prefix :-p

e.g. prefix = "/datasets" means the routes will all start with /datasets --> /datasets/{id}, /datasets/{id}/files, etc.
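For reference, a standalone sketch of how include_router's prefix composes with the routes inside the router (router and route names here are illustrative, not from this PR):

from fastapi import APIRouter, FastAPI

app = FastAPI()
datasets_router = APIRouter()


@datasets_router.get("/{dataset_id}")
async def get_dataset(dataset_id: str):
    # Served at /datasets/{dataset_id} because of the prefix below
    return {"id": dataset_id}


@datasets_router.get("/{dataset_id}/files")
async def get_dataset_files(dataset_id: str):
    # Served at /datasets/{dataset_id}/files
    return []


# Every route in datasets_router is mounted under /datasets
app.include_router(datasets_router, prefix="/datasets", tags=["datasets"])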

tags=["jsonld"],
)
api_router.include_router(
extractors.router,
prefix="/sitemap.xml",
tags=["sitemap"],
)
api_router.include_router(keycloak.router, prefix="/auth", tags=["auth"])
app.include_router(api_router, prefix=settings.API_V2_STR)

43 changes: 42 additions & 1 deletion backend/app/routers/datasets.py

@@ -8,6 +8,7 @@
from collections.abc import Mapping, Iterable
from typing import List, Optional, Union

import pymongo
import pika
from bson import ObjectId
from bson import json_util
@@ -28,6 +29,11 @@

from app import dependencies
from app import keycloak_auth
from app.search.connect import (
connect_elasticsearch,
insert_record,
delete_document_by_id,
)
from app.config import settings
from app.keycloak_auth import get_user, get_current_user
from app.models.datasets import (
@@ -183,11 +189,30 @@ async def save_dataset(
user=Depends(keycloak_auth.get_current_user),
db: MongoClient = Depends(dependencies.get_db),
):
# Make connection to elasticsearch
es = connect_elasticsearch()

# Check all connections and abort if any one of them is not available
if db is None or es is None:
raise HTTPException(status_code=503, detail="Service not available")

result = dataset_in.dict()
dataset_db = DatasetDB(**dataset_in.dict(), author=user)
new_dataset = await db["datasets"].insert_one(dataset_db.to_mongo())
found = await db["datasets"].find_one({"_id": new_dataset.inserted_id})
dataset_out = DatasetOut.from_mongo(found)

# Add an entry to the dataset index
doc = {
"name": dataset_out.name,
"description": dataset_out.description,
"author": dataset_out.author.email,
"created": dataset_out.created,
"modified": dataset_out.modified,
"download": dataset_out.downloads,
}
insert_record(es, "dataset", doc, dataset_out.id)
return dataset_out


@@ -204,14 +229,20 @@ async def get_datasets(
for doc in (
await db["datasets"]
.find({"author.email": user_id})
.sort([("created", pymongo.DESCENDING)])
.skip(skip)
.limit(limit)
.to_list(length=limit)
):
datasets.append(DatasetOut.from_mongo(doc))
else:
for doc in (
await db["datasets"].find().skip(skip).limit(limit).to_list(length=limit)
await db["datasets"]
.find()
.sort([("created", pymongo.DESCENDING)])
.skip(skip)
.limit(limit)
.to_list(length=limit)
):
datasets.append(DatasetOut.from_mongo(doc))
return datasets
@@ -323,7 +354,17 @@ async def delete_dataset(
db: MongoClient = Depends(dependencies.get_db),
fs: Minio = Depends(dependencies.get_fs),
):
# Make connection to elasticsearch
es = connect_elasticsearch()

# Check all connections and abort if any one of them is not available
if db is None or fs is None or es is None:
raise HTTPException(status_code=503, detail="Service not available")

if (await db["datasets"].find_one({"_id": ObjectId(dataset_id)})) is not None:
# delete from elasticsearch
delete_document_by_id(es, "dataset", dataset_id)
# delete dataset first to minimize files/folders being uploaded to a deleted dataset

await db["datasets"].delete_one({"_id": ObjectId(dataset_id)})
190 changes: 190 additions & 0 deletions backend/app/routers/sitemap.py

@@ -0,0 +1,190 @@
import datetime
import hashlib
import io
import os
import shutil
import tempfile
import zipfile
from collections.abc import Mapping, Iterable
from typing import List, Optional, Union

import pika
import pymongo
from bson import ObjectId
from bson import json_util
from fastapi import (
APIRouter,
HTTPException,
Depends,
File,
UploadFile,
Response,
Request,
)
from minio import Minio
from pika.adapters.blocking_connection import BlockingChannel
from pymongo import MongoClient
from rocrate.model.person import Person
from rocrate.rocrate import ROCrate

from app import dependencies
from app import keycloak_auth
from app.config import settings
from app.keycloak_auth import get_user, get_current_user
from app.models.datasets import (
DatasetBase,
DatasetIn,
DatasetDB,
DatasetOut,
DatasetPatch,
)
from app.models.files import FileOut, FileDB
from app.models.folders import FolderOut, FolderIn, FolderDB
from app.models.pyobjectid import PyObjectId
from app.models.users import UserOut
from app.routers.files import add_file_entry, remove_file_entry

router = APIRouter()

clowder_bucket = os.getenv("MINIO_BUCKET_NAME", "clowder")


def is_str(v):
"string predicate"
return isinstance(v, str)


def is_dict(v):
"dict predicate"
return isinstance(v, dict)


schemaOrg_mapping = {
"id": "identifier",
"first_name": "givenName",
"last_name": "familyName",
"created": "dateCreated",
"modified": "dateModified",
"views": "interactionStatistic",
"downloads": "DataDownload",
}


def datasetout_str2jsonld(jstr):
"map json-string keys to schema.org"
if not is_str(jstr):
jt = type(jstr)
print(f"str2jsonld:{jstr},wrong type:{jt}")
return None
global schemaOrg_mapping
for k, v in schemaOrg_mapping.items():
if k in jstr:
ks = f'"{k}":'
vs = f'"{v}":'
print(f"replace:{ks} with:{vs}")
jstr = jstr.replace(ks, vs)
print(f"==jstr:{jstr}")
jstr = jstr.replace("{", '{"@context": {"@vocab": "https://schema.org/"},', 1)
print(f"==jstr:{jstr}")
return jstr


serializable_keys = ["name", "description", "status", "views", "downloads"]


def datasetout2jsonld(dso):
"dataset attributes as jsonld"
dt = type(dso)
print(f"datasetout2jsonld:{dso},type:{dt}")
if is_dict(dso):
import json

dso2 = {}
for k, v in dso.items():
if k in serializable_keys:
dso2[k] = dso[k]
print(f"dso2:{dso2}")
jstr = json.dumps(dso2)
elif isinstance(dso, DatasetOut):
dt = type(dso)
print(f".json for:{dt}")
jstr = dso.json()
else:
jstr = ""
if len(jstr) > 9:
return datasetout_str2jsonld(jstr)
else:
return ""


def datasetout2jsonld_script(dso):
"dataset attributes in scrapable jsonld"
jld = datasetout2jsonld(dso)
print(f'<script type="application/ld+json">{jld}</script>')


def datasets2sitemap(datasets):
"given an array of datasetObjs put out sitemap.xml"
top = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""
sm = "sitemap.xml" # could write to string and ret it all
outstr = ""
if datasets and len(datasets) > 0:
outstr += top # put_txtfile(sm, top, "w")
URLb = settings.frontend_url
for ds in datasets:
objid = getattr(ds, "id")
if objid:
id = str(objid)
# put_txtfile(sm,f'<url><loc>{URLb}/datasets/{id}</loc></url> ')
# put_txtfile( sm, f"<url><loc>{URLb}/datasets/{id}/summary.jsonld</loc></url> ")
outstr += f"<url><loc>{URLb}/datasets/{id}/summary.jsonld</loc></url> "
# put_txtfile(sm, "</urlset>")
outstr += "</urlset>"
return outstr


# get_datasets was ("", response_model=List[DatasetOut])
# @router.get("/sitemap.xml", response_model=String)
@router.get("/sitemap.xml")
async def sitemap(
user_id=Depends(get_user),
db: MongoClient = Depends(dependencies.get_db),
skip: int = 0,
limit: int = 100,
mine: bool = False,
):
datasets = []
for doc in (
await db["datasets"]
.find()
.sort([("created", pymongo.DESCENDING)])
.skip(skip)
.limit(limit)
.to_list(length=limit)
):
datasets.append(DatasetOut.from_mongo(doc))
s = datasets2sitemap(datasets)
print(f"sitemap={s}")
return s


# get_dataset was ("/{dataset_id}", response_model=DatasetOut)
# @router.get("/{dataset_id}/summary.jsonld", response_model=String)
@router.get("/{dataset_id}/summary.jsonld")
async def get_dataset_jsonld(
dataset_id: str, db: MongoClient = Depends(dependencies.get_db)
):
if (
dataset := await db["datasets"].find_one({"_id": ObjectId(dataset_id)})
) is not None:
# now can return the ld+json script
dso = DatasetOut.from_mongo(dataset)
dt = type(dso)
print(f"= =dataset of:{dt}")
# jlds = datasetout2jsonld_script(dso) #could do this in /summary_jsonld_script but now just the jsonld
jlds = datasetout2jsonld(dso)
print(f"get_dataset_jsonld:{jlds}")
# return DatasetOut.from_mongo(dataset)
return jlds
raise HTTPException(status_code=404, detail=f"Dataset {dataset_id} not found")