63 google dataset search support #136
base: main
Changes from 10 commits
70836a3
df16803
6ec7ee6
cbda9a7
4c70af8
0c949c2
fb35ac1
7bab100
8be33be
d299cdd
854142c
0bd73cd
@@ -116,6 +116,16 @@
    tags=["extractors"],
    dependencies=[Depends(get_current_username)],
)
api_router.include_router(
    extractors.router,
    prefix="/summary.jsonld",
Review comment: prefix is for the prefix :-p e.g. prefix = "/datasets" means the routes will all start with /datasets --> /datasets/{id}, /datasets/{id}/files, etc.
    tags=["jsonld"],
)
api_router.include_router(
    extractors.router,
    prefix="/sitemap.xml",
    tags=["sitemap"],
)
api_router.include_router(keycloak.router, prefix="/auth", tags=["auth"])
app.include_router(api_router, prefix=settings.API_V2_STR)
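To illustrate the reviewer's point about prefix, here is a minimal, self-contained sketch (not part of this PR) of how FastAPI's include_router prefix works; the router, paths, and handlers are invented for illustration:

```python
from fastapi import APIRouter, FastAPI

router = APIRouter()


@router.get("/{dataset_id}")
async def get_dataset(dataset_id: str):
    return {"id": dataset_id}


@router.get("/{dataset_id}/files")
async def get_dataset_files(dataset_id: str):
    return []


app = FastAPI()
# prefix is prepended to every route in the included router, so these
# handlers are served at /datasets/{dataset_id} and /datasets/{dataset_id}/files.
app.include_router(router, prefix="/datasets", tags=["datasets"])
```

In other words, a prefix is a path segment such as "/datasets", not a document name such as "/summary.jsonld" or "/sitemap.xml".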
@@ -191,6 +191,50 @@ async def save_dataset(
    return dataset_out


def is_str(v):
    return type(v) is str


schemaOrg_mapping = {
    "id": "identifier",
    "first_name": "givenName",
    "last_name": "familyName",
    "created": "dateCreated",
    "modified": "dateModified",
    "views": "interactionStatistic",
    "downloads": "DataDownload",
}


def datasetout_str2jsonld(jstr):
    "remap to schema.org key in a json str"
    if not is_str(jstr):
        jt = type(jstr)
        print(f"str2jsonld:{jstr},wrong type:{jt}")
        return None
    global schemaOrg_mapping
    for k, v in schemaOrg_mapping.items():
        if k in jstr:
            ks = f'"{k}":'
            vs = f'"{v}":'
            # print(f'replace:{ks} with:{vs}')
            jstr = jstr.replace(ks, vs)
    jstr = jstr.replace("{", '{"@context": {"@vocab": "https://schema.org/"},', 1)
    return jstr


def datasetout2jsonld(dso):
    "dataset attributes as jsonld"
    jstr = dso.json()
    return datasetout_str2jsonld(jstr)


def datasetout2jsonld_script(dso):
    "dataset attributes in scrapable ld+json script"
    jld = datasetout2jsonld(dso)
    return f'<script type="application/ld+json">{jld}</script>'


@router.get("", response_model=List[DatasetOut])
async def get_datasets(
    user_id=Depends(get_user),
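As a usage sketch (not part of the PR), this is roughly what the replace-based remapping above produces; the dataset fields and values are invented, and a plain dict plus json.dumps stands in for DatasetOut.json():

```python
import json

# Same mapping as schemaOrg_mapping above.
schemaorg_mapping = {
    "id": "identifier",
    "first_name": "givenName",
    "last_name": "familyName",
    "created": "dateCreated",
    "modified": "dateModified",
    "views": "interactionStatistic",
    "downloads": "DataDownload",
}

# Stand-in for DatasetOut.json(): a JSON string with invented fields.
jstr = json.dumps(
    {"id": "abc123", "name": "Soil moisture 2022", "created": "2022-05-01"}
)

# The same style of key rewrite as datasetout_str2jsonld.
for k, v in schemaorg_mapping.items():
    jstr = jstr.replace(f'"{k}":', f'"{v}":')
jstr = jstr.replace("{", '{"@context": {"@vocab": "https://schema.org/"},', 1)

print(jstr)
# {"@context": {"@vocab": "https://schema.org/"},"identifier": "abc123",
# "name": "Soil moisture 2022", "dateCreated": "2022-05-01"}
```

A dict-based rename after json.loads would avoid accidentally rewriting matching substrings inside values, but the string approach above mirrors what the PR does.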
@@ -226,6 +270,56 @@ async def get_dataset(dataset_id: str, db: MongoClient = Depends(dependencies.ge
    raise HTTPException(status_code=404, detail=f"Dataset {dataset_id} not found")


@router.get("/{dataset_id}/summary.jsonld", response_model=DatasetOut)
Review comment: during a cleanup I made sure it is a DatasetOut, so it is safe to call .json() on it now.
async def get_dataset_jsonld(
    dataset_id: str, db: MongoClient = Depends(dependencies.get_db)
):
    "get ld+json script for inside the dataset page, for scraping"
    dso = get_dataset(dataset_id, db)
    jlds = datasetout2jsonld_script(dso)
    return jlds


def put_txtfile(fn, s, wa="a"):
    with open(fn, wa) as f:
        return f.write(s)


def datasets2sitemap(datasets):
    "given an array of datasetObjs put out sitemap.xml"
    top = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""
    sm = "sitemap.xml"  # could write to string and ret it all
    if datasets and len(datasets) > 0:
        put_txtfile(sm, top, "w")
        URLb = settings.frontend_url
        for ds in datasets:
            objid = getattr(ds, "id")
            if objid:
                id = str(objid)
                put_txtfile(sm, f"<url><loc>{URLb}/datasets/{id}</loc></url> ")
        put_txtfile(sm, "</urlset>")


# now the route


def get_txtfile(fn):
    "ret str from file"
    with open(fn, "r") as f:
        return f.read()


@router.get("/sitemap.xml")
Review comment: this does not work when I try it; if you separate these out into a separate file and give a different prefix than dataset, it might work.
Reply: separated out into a sitemap.py file, and will make another pass soon to see if it will open up the route.
Reply: different file now, with no overlapping routes.
async def sitemap() -> str:
    datasets = get_datasets()
    # could compare len(datasets) w/len of sitemap-file to see if could use cached one
    datasets2sitemap(datasets)  # creates the sitemap.xml file, in case want to cache it
    s = get_txtfile("sitemap.xml")
    return s


@router.get("/{dataset_id}/files")
async def get_dataset_files(
    dataset_id: str,
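The thread above suggests moving the sitemap route out of the datasets router. One likely reason the route is unreachable as written is that Starlette matches routes in registration order and a path parameter captures a whole segment, so a request for sitemap.xml under the datasets prefix is caught by the earlier /{dataset_id} route as dataset_id="sitemap.xml". A hypothetical sitemap.py along the lines the reviewers suggest is sketched below; the hardcoded ids stand in for a Mongo query and example.org stands in for settings.frontend_url:

```python
from fastapi import APIRouter
from fastapi.responses import Response

router = APIRouter()


@router.get("/sitemap.xml")
async def sitemap() -> Response:
    # Stand-in data: in the real router these ids would come from the database.
    dataset_ids = ["64a1b2c3d4e5f60718293a4b", "64a1b2c3d4e5f60718293a4c"]
    urls = "".join(
        f"<url><loc>https://example.org/datasets/{i}</loc></url>" for i in dataset_ids
    )
    xml = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        f"{urls}"
        "</urlset>"
    )
    # Serve as XML rather than a JSON-encoded string.
    return Response(content=xml, media_type="application/xml")
```

Included from main.py with no prefix that collides with the datasets routes (e.g. api_router.include_router(sitemap.router, tags=["sitemap"])), the route would answer at settings.API_V2_STR + "/sitemap.xml".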
Review comment: Your newly added routes are in routers/datasets.py, but this new prefix is importing routes from extractors. So I can actually access your routes under /dataset/{id}/summary.jsonld, but not here. I'd recommend you separate them out into a new router file (e.g. summary.py) and include them here: summary.router
Reply: that was left over, thanks; will skip the tags for now, and take your other advice to put those routes in another file.
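Following the recommendation above, a hypothetical summary.py could hold the JSON-LD route and be included from main.py as summary.router. The import paths below are assumptions about the project layout; get_dataset and datasetout2jsonld_script are the helpers touched in this PR:

```python
from fastapi import APIRouter, Depends
from fastapi.responses import HTMLResponse
from pymongo import MongoClient

from app import dependencies  # assumed module path
from app.routers.datasets import datasetout2jsonld_script, get_dataset  # assumed path

router = APIRouter()


@router.get("/{dataset_id}/summary.jsonld", response_class=HTMLResponse)
async def get_dataset_jsonld(
    dataset_id: str, db: MongoClient = Depends(dependencies.get_db)
):
    "scrapable ld+json script for the dataset page"
    # get_dataset is an async handler in this PR, so it must be awaited
    # when called directly.
    dso = await get_dataset(dataset_id, db)
    return datasetout2jsonld_script(dso)
```

response_class=HTMLResponse is a choice made here so the script tag is returned verbatim rather than serialized as a JSON string; the PR currently declares response_model=DatasetOut on this route. In main.py the router could then be included alongside the others, e.g. api_router.include_router(summary.router, prefix="/datasets", tags=["jsonld"]) if the datasets routes live under /datasets (the exact prefix is not shown in this diff).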