Skip to content

Commit

Permalink
Merge pull request #62 from sul-dlss-labs/merge-pubs-follow-up
Browse files (browse the repository at this point in the history)
Cleanup on merge_pubs
  • Loading branch information
edsu authored Jun 27, 2024
2 parents 791de22 + aa6bf97 commit a66ad03
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 7 deletions.
4 changes: 1 addition & 3 deletions rialto_airflow/dags/harvest.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,7 @@ def merge_publications(sul_pub, openalex_pubs, dimensions_pubs, snapshot_dir):
"""
Merge the OpenAlex, Dimensions and sul_pub data.
"""
output = (
Path(snapshot_dir) / "publications.parquet"
) # TODO: update file extension to actual format used
output = Path(snapshot_dir) / "publications.parquet"
merge_pubs.merge(sul_pub, openalex_pubs, dimensions_pubs, output)
return str(output)

Expand Down
1 change: 1 addition & 0 deletions rialto_airflow/harvest/merge_pubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,5 +86,6 @@ def sulpub_df(sul_pub):
"""
df = pl.scan_csv(sul_pub)
df = df.drop_nulls("doi")
df = df.with_columns(pl.col("doi").str.replace("https://doi.org/", ""))
df = df.rename(lambda column_name: "sul_pub_" + column_name)
return df
17 changes: 13 additions & 4 deletions test/harvest/test_merge_pubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ def sul_pubs_csv(tmp_path):
"2024",
]
)
writer.writerow(
[
"[]",
"A Published Research Article",
"2024",
"https://doi.org/10.0000/dddd",
]
)
return fixture_file


Expand All @@ -150,22 +158,23 @@ def test_sulpub_df(sul_pubs_csv):
lazy_df = merge_pubs.sulpub_df(sul_pubs_csv)
assert type(lazy_df) == pl.lazyframe.frame.LazyFrame
df = lazy_df.collect()
assert df.shape[0] == 1, "Row without a doi has been dropped"
assert df.shape[0] == 2, "Row without a doi has been dropped"
assert df.columns == [
"sul_pub_authorship",
"sul_pub_title",
"sul_pub_year",
"sul_pub_doi",
]
assert df["sul_pub_doi"].to_list() == ["10.0000/cccc"]
assert df["sul_pub_doi"].to_list() == ["10.0000/cccc", "10.0000/dddd"]


def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv):
output = tmp_path / "merged_pubs.parquet"
merge_pubs.merge(sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv, output)
assert output.is_file(), "output file has been created"
df = pl.read_parquet(output)
assert df.shape[0] == 3
assert df.shape[0] == 4
assert df.shape[1] == 25
assert set(df["doi"].to_list()) == set(
["10.0000/aaaa", "10.0000/1234", "10.0000/cccc"]
["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd"]
)

0 comments on commit a66ad03

Please sign in to comment.