From ef05a822d0113bae2cdf3e6dfb33301bce1a65c5 Mon Sep 17 00:00:00 2001 From: David Campbell <102170536+davidcam-src@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:44:41 -0400 Subject: [PATCH] HYC-1951 - Refine Dimensions Query (#1116) * prioritize unc author affiliations * update query string to catch all variants of UNC and eliminate false positives * author to hash tests * slightly changed wording * update test expectations * update test expectations --- .../tasks/dimensions_ingest_service.rb | 14 ++++- .../tasks/dimensions_query_service.rb | 3 +- .../files/dimensions_ingest_test_fixture.json | 4 +- .../tasks/dimensions_ingest_service_spec.rb | 55 +++++++++++++++++++ 4 files changed, 73 insertions(+), 3 deletions(-) diff --git a/app/services/tasks/dimensions_ingest_service.rb b/app/services/tasks/dimensions_ingest_service.rb index deadf57ef..98cdc781d 100644 --- a/app/services/tasks/dimensions_ingest_service.rb +++ b/app/services/tasks/dimensions_ingest_service.rb @@ -4,6 +4,7 @@ module Tasks class DimensionsIngestService include Tasks::IngestHelper attr_reader :admin_set, :depositor + UNC_GRID_ID = 'grid.410711.2' def initialize(config) @config = config @@ -104,11 +105,22 @@ def author_to_hash(author, index) } # Add first author affiliation to other affiliation array if author['affiliations'].present? - hash['other_affiliation'] = author['affiliations'][0]['raw_affiliation'] + hash['other_affiliation'] = retrieve_author_affiliation(author['affiliations']) end hash end + def retrieve_author_affiliation(affiliations) + unc_affiliations = affiliations.select { |affiliation| affiliation['id'] == UNC_GRID_ID } + if !unc_affiliations.empty? + # Prioritize UNC affiliations, only retrieving the first one + return unc_affiliations[0]['raw_affiliation'] + end + # Otherwise, retrieve the first affiliation + return affiliations[0]['raw_affiliation'] + end + + def format_publication_identifiers(publication) [ publication['id'].present? ? 
"Dimensions ID: #{publication['id']}" : nil, diff --git a/app/services/tasks/dimensions_query_service.rb b/app/services/tasks/dimensions_query_service.rb index a844fce47..22c72915c 100644 --- a/app/services/tasks/dimensions_query_service.rb +++ b/app/services/tasks/dimensions_query_service.rb @@ -197,10 +197,11 @@ def solr_query_builder(pub) def generate_query_string(start_date, end_date, page_size, cursor) search_clauses = ['where type = "article"', "date >= \"#{start_date}\"", "date < \"#{end_date}\""].join(' and ') return_fields = ['basics', 'extras', 'abstract', 'issn', 'publisher', 'journal_title_raw', 'linkout', 'concepts'].join(' + ') + unc_affiliation_variants = ['"UNC-CH"', '"University of North Carolina at Chapel Hill"', '"UNC-Chapel Hill"', '"University of North Carolina-Chapel Hill"', '"University of North Carolina, Chapel Hill"'].join(' OR ') <<~QUERY search publications #{search_clauses} in raw_affiliations for """ - "University of North Carolina, Chapel Hill" OR "UNC" + #{unc_affiliation_variants} """ return publications[#{return_fields}] limit #{page_size} diff --git a/spec/fixtures/files/dimensions_ingest_test_fixture.json b/spec/fixtures/files/dimensions_ingest_test_fixture.json index da5dd5206..14c69f821 100644 --- a/spec/fixtures/files/dimensions_ingest_test_fixture.json +++ b/spec/fixtures/files/dimensions_ingest_test_fixture.json @@ -98,7 +98,9 @@ "current_organization_id": "grid.10698.36", "first_name": "Susan L", "last_name": "Hogan", - "orcid": null, + "orcid": [ + "0000-0000-0000-0000" + ], "raw_affiliation": [ "UNC Kidney Center, Division of Nephrology and Hypertension, University of North Carolina, Chapel Hill." 
], diff --git a/spec/services/tasks/dimensions_ingest_service_spec.rb b/spec/services/tasks/dimensions_ingest_service_spec.rb index a3021e85f..826aa73ad 100644 --- a/spec/services/tasks/dimensions_ingest_service_spec.rb +++ b/spec/services/tasks/dimensions_ingest_service_spec.rb @@ -305,4 +305,59 @@ expect(article.keyword).to eq([]) end end + + describe '#author_to_hash' do + let (:unc_grid_id) { 'grid.410711.2' } + let (:non_unc_affiliation) { + { + 'city' => 'Test City', + 'city_id' => 5318313, + 'country' => 'United States', + 'country_code' => 'US', + 'id' => 'grid.134563.6', + 'name' => 'Test University', + 'raw_affiliation' => 'Test Raw Affiliation', + 'state' => 'Test-State', + 'state_code' => 'US-AZ' + } + } + + context 'when an author has multiple affiliations' do + it 'uses their first affiliation to populate the author hash if no UNC affiliation exists' do + non_unc_affiliated_author = test_publications.first['authors'].find { |author| author['id'] != unc_grid_id } + # Ensure the author has multiple non-unc affiliations + non_unc_affiliated_author['affiliations'].append(non_unc_affiliation) unless non_unc_affiliated_author['affiliations'].size > 1 + author_hash = service.author_to_hash(non_unc_affiliated_author, 0) + # Check that the author hash contains the expected metadata from the first affiliation + expect(author_hash).to eq( + { + 'name' => 'Thorpe, Carolyn T', + 'other_affiliation' => 'Eshelman School of Pharmacy, University of North Carolina, Chapel Hill.', + 'orcid' => 'https://orcid.org/0000-0002-7662-7497', + 'index' => '1' + } + ) + end + + it 'prioritizes retrieval of the UNC affiliation even if it is not the first one' do + first_publication_authors = test_publications.first['authors'] + # Retrieve the first UNC-affiliated author and their first UNC-affiliation + unc_affiliated_author = first_publication_authors.find do |author| + author['affiliations'].any? 
{ |affiliation| affiliation['id'] == unc_grid_id } + end + first_unc_affiliation = unc_affiliated_author['affiliations'].find { |affiliation| affiliation['id'] == unc_grid_id } + # Ensure the author's first affiliation is not the UNC affiliation + unc_affiliated_author['affiliations'].unshift(non_unc_affiliation) + author_hash = service.author_to_hash(unc_affiliated_author, 0) + expect(author_hash).to eq( + { + 'name' => 'Hogan, Susan L', + 'other_affiliation' => 'UNC Kidney Center, Division of Nephrology and Hypertension, University of North Carolina, Chapel Hill.', + 'orcid' => 'https://orcid.org/0000-0000-0000-0000', + 'index' => '1' + } + ) + end + end + end end