Skip to content

Commit

Permalink
HYC-1951 - Refine Dimensions Query (#1116)
Browse files Browse the repository at this point in the history
* prioritize unc author affiliations

* update query string to catch all variants of UNC and eliminate false positives

* author to hash tests

* slightly changed wording

* update test expectations

* update test expectations
  • Loading branch information
davidcam-src authored Aug 16, 2024
1 parent a81755a commit ef05a82
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 3 deletions.
14 changes: 13 additions & 1 deletion app/services/tasks/dimensions_ingest_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ module Tasks
class DimensionsIngestService
include Tasks::IngestHelper
attr_reader :admin_set, :depositor
UNC_GRID_ID = 'grid.410711.2'

def initialize(config)
@config = config
Expand Down Expand Up @@ -104,11 +105,22 @@ def author_to_hash(author, index)
}
# Populate other_affiliation, preferring a UNC affiliation over the author's first listed affiliation
if author['affiliations'].present?
hash['other_affiliation'] = author['affiliations'][0]['raw_affiliation']
hash['other_affiliation'] = retrieve_author_affiliation(author['affiliations'])
end
hash
end

# Chooses which affiliation's 'raw_affiliation' value to record for an author.
#
# An affiliation whose 'id' equals UNC_GRID_ID takes priority; if several match,
# the first match wins. When none match, the first affiliation in the list is used.
#
# @param affiliations [Array<Hash>, nil] affiliation hashes; only the 'id' and
#   'raw_affiliation' keys are read
# @return [Object, nil] the chosen affiliation's 'raw_affiliation' value, or nil
#   when no affiliations are supplied
def retrieve_author_affiliation(affiliations)
  # Nothing to choose from: return nil rather than raising NoMethodError on nil
  return if affiliations.nil? || affiliations.empty?

  # find stops at the first UNC match instead of filtering the whole list
  preferred = affiliations.find { |affiliation| affiliation['id'] == UNC_GRID_ID } || affiliations.first
  preferred['raw_affiliation']
end


def format_publication_identifiers(publication)
[
publication['id'].present? ? "Dimensions ID: #{publication['id']}" : nil,
Expand Down
3 changes: 2 additions & 1 deletion app/services/tasks/dimensions_query_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,11 @@ def solr_query_builder(pub)
def generate_query_string(start_date, end_date, page_size, cursor)
search_clauses = ['where type = "article"', "date >= \"#{start_date}\"", "date < \"#{end_date}\""].join(' and ')
return_fields = ['basics', 'extras', 'abstract', 'issn', 'publisher', 'journal_title_raw', 'linkout', 'concepts'].join(' + ')
unc_affiliation_variants = ['"UNC-CH"', '"University of North Carolina at Chapel Hill"', '"UNC-Chapel Hill"', '"University of North Carolina-Chapel Hill"', '"University of North Carolina, Chapel Hill"'].join(' OR ')
<<~QUERY
search publications #{search_clauses} in raw_affiliations
for """
"University of North Carolina, Chapel Hill" OR "UNC"
#{unc_affiliation_variants}
"""
return publications[#{return_fields}]
limit #{page_size}
Expand Down
4 changes: 3 additions & 1 deletion spec/fixtures/files/dimensions_ingest_test_fixture.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@
"current_organization_id": "grid.10698.36",
"first_name": "Susan L",
"last_name": "Hogan",
"orcid": null,
"orcid": [
"0000-0000-0000-0000"
],
"raw_affiliation": [
"UNC Kidney Center, Division of Nephrology and Hypertension, University of North Carolina, Chapel Hill."
],
Expand Down
55 changes: 55 additions & 0 deletions spec/services/tasks/dimensions_ingest_service_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -305,4 +305,59 @@
expect(article.keyword).to eq([])
end
end

# Exercises how #author_to_hash fills 'other_affiliation' when an author has
# more than one affiliation: a UNC affiliation is preferred, otherwise the
# first affiliation in the list is used.
describe '#author_to_hash' do
  let(:unc_grid_id) { 'grid.410711.2' }
  # Affiliation with a non-UNC grid id, used to pad or reorder an author's affiliations
  let(:non_unc_affiliation) do
    {
      'city' => 'Test City',
      'city_id' => 5318313,
      'country' => 'United States',
      'country_code' => 'US',
      'id' => 'grid.134563.6',
      'name' => 'Test University',
      'raw_affiliation' => 'Test Raw Affiliation',
      'state' => 'Test-State',
      'state_code' => 'US-AZ'
    }
  end

  context 'when an author has multiple affiliations' do
    it 'uses their first affiliation to populate the author hash if no UNC affiliation exists' do
      # NOTE(review): this compares the author's own 'id' with the grid id rather than
      # inspecting the ids of the author's affiliations - confirm it selects the intended author
      non_unc_affiliated_author = test_publications.first['authors'].find { |author| author['id'] != unc_grid_id }
      # Ensure the author has multiple non-unc affiliations
      non_unc_affiliated_author['affiliations'].append(non_unc_affiliation) unless non_unc_affiliated_author['affiliations'].size > 1
      author_hash = service.author_to_hash(non_unc_affiliated_author, 0)
      # Check that the author hash contains the expected metadata from the first affiliation
      expect(author_hash).to eq(
        {
          'name' => 'Thorpe, Carolyn T',
          'other_affiliation' => 'Eshelman School of Pharmacy, University of North Carolina, Chapel Hill.',
          'orcid' => 'https://orcid.org/0000-0002-7662-7497',
          'index' => '1'
        }
      )
    end

    it 'prioritizes retrieval of the UNC affiliation even if it is not the first one' do
      first_publication_authors = test_publications.first['authors']
      # Retrieve the first author with at least one affiliation carrying the UNC grid id
      unc_affiliated_author = first_publication_authors.find do |author|
        author['affiliations'].any? { |affiliation| affiliation['id'] == unc_grid_id }
      end
      # Ensure the author's first affiliation is not the UNC affiliation
      unc_affiliated_author['affiliations'].unshift(non_unc_affiliation)
      author_hash = service.author_to_hash(unc_affiliated_author, 0)
      expect(author_hash).to eq(
        {
          'name' => 'Hogan, Susan L',
          'other_affiliation' => 'UNC Kidney Center, Division of Nephrology and Hypertension, University of North Carolina, Chapel Hill.',
          'orcid' => 'https://orcid.org/0000-0000-0000-0000',
          'index' => '1'
        }
      )
    end
  end
end
end

0 comments on commit ef05a82

Please sign in to comment.