From 1ce8dd4fad585821a499929659c9118c8aa7addf Mon Sep 17 00:00:00 2001 From: kyokukou Date: Tue, 10 Sep 2024 11:32:42 -0700 Subject: [PATCH] get all categories from string also returns groups, lists for the physics group also get purged (supporting catchup) --- arxiv/integration/fastly/purge.py | 15 ++++++--- arxiv/integration/tests/test_fastly.py | 8 ++--- arxiv/taxonomy/category.py | 9 ++--- arxiv/taxonomy/taxonomy_test.py | 46 +++++++++++++++----------- 4 files changed, 45 insertions(+), 33 deletions(-) diff --git a/arxiv/integration/fastly/purge.py b/arxiv/integration/fastly/purge.py index 92be9d7d..2c0e49d0 100644 --- a/arxiv/integration/fastly/purge.py +++ b/arxiv/integration/fastly/purge.py @@ -12,7 +12,7 @@ from arxiv.db import Session from arxiv.db.models import Metadata, Updates from arxiv.identifier import Identifier, IdentifierException -from arxiv.taxonomy.definitions import CATEGORIES +from arxiv.taxonomy.definitions import GROUPS from arxiv.taxonomy.category import get_all_cats_from_string @@ -70,9 +70,10 @@ def _get_category_and_date(arxiv_id:Identifier)-> Tuple[str, date]: def _purge_category_change(arxiv_id:Identifier, old_cats:Optional[str]=None )-> List[str]: """determines all list and year pages required for a category change to a paper returns list of all keys to purge - does not includepaths for the paper itself - assumes categories will be provided as string like from abs_categories feild, but could be imporved if categories could be specified in a list + does not include paths for the paper itself + assumes categories will be provided as string like from abs_categories feild, but could be improved if categories could be specified in a list """ + grp_physics=GROUPS['grp_physics'] new_cats, recent_date= _get_category_and_date(arxiv_id) #get time period affected @@ -84,16 +85,20 @@ def _purge_category_change(arxiv_id:Identifier, old_cats:Optional[str]=None )-> if today - timedelta(days=7) <= recent_date: recent=True - archives, cats = 
get_all_cats_from_string(new_cats, True) + groups, archives, cats = get_all_cats_from_string(new_cats, True) new_archive_ids={arch.id for arch in archives} list_pages={cat.id for cat in cats} | new_archive_ids + if grp_physics in groups: #grp_physics is a group for catchup + list_pages.add(grp_physics.id) year_pages=[] if old_cats: #clear any pages this paper may have been removed from or added to - old_archives, old_categories = get_all_cats_from_string(old_cats, True) + old_groups, old_archives, old_categories = get_all_cats_from_string(old_cats, True) old_cat_ids= {cat.id for cat in old_categories} old_archive_ids={arch.id for arch in old_archives} list_pages= list_pages | old_cat_ids | old_archive_ids + if grp_physics in old_groups: #grp_physics is a group for catchup + list_pages.add(grp_physics.id) year_pages= old_archive_ids.symmetric_difference(new_archive_ids) #collect all relevant keys diff --git a/arxiv/integration/tests/test_fastly.py b/arxiv/integration/tests/test_fastly.py index 84baa3bc..2c5836d0 100644 --- a/arxiv/integration/tests/test_fastly.py +++ b/arxiv/integration/tests/test_fastly.py @@ -146,14 +146,14 @@ def test_purge_category_change_alias_cats(mockToday,mockDBQuery): mockToday.today.return_value=date(2024,1,1) mockDBQuery.return_value=("solv-int cs.SY", date(2010,1,1)) result=_purge_category_change(Identifier('1001.5678')) - expected=["list-2010-01-nlin.SI", "list-2010-nlin.SI", "list-2010-01-nlin", "list-2010-nlin","list-2010-01-eess.SY", "list-2010-eess.SY", "list-2010-01-eess", "list-2010-eess", "list-2010-01-cs", "list-2010-cs" ] + expected=["list-2010-01-nlin.SI", "list-2010-nlin.SI", "list-2010-01-nlin", "list-2010-nlin","list-2010-01-eess.SY", "list-2010-eess.SY", "list-2010-01-eess", "list-2010-eess", "list-2010-01-cs", "list-2010-cs", "list-2010-01-grp_physics", "list-2010-grp_physics" ] assert sorted(result)==sorted(expected) #find archives of unlisted noncanonical categories mockToday.today.return_value=date(2024,1,1) 
mockDBQuery.return_value=("nlin.SI eess.SY", date(2010,1,1)) result=_purge_category_change(Identifier('1001.5678')) - expected=["list-2010-01-nlin.SI", "list-2010-nlin.SI", "list-2010-01-nlin", "list-2010-nlin","list-2010-01-eess.SY", "list-2010-eess.SY", "list-2010-01-eess", "list-2010-eess", "list-2010-01-cs", "list-2010-cs" ] + expected=["list-2010-01-nlin.SI", "list-2010-nlin.SI", "list-2010-01-nlin", "list-2010-nlin","list-2010-01-eess.SY", "list-2010-eess.SY", "list-2010-01-eess", "list-2010-eess", "list-2010-01-cs", "list-2010-cs", "list-2010-01-grp_physics", "list-2010-grp_physics" ] assert sorted(result)==sorted(expected) @patch('arxiv.integration.fastly.purge._get_category_and_date') @@ -170,7 +170,7 @@ def test_purge_category_change_remove_cats(mockToday,mockDBQuery): mockToday.today.return_value=date(2024,1,1) mockDBQuery.return_value=("cs.LG", date(2010,1,1)) result=_purge_category_change(Identifier('1001.5678'),"cs.LG hep-lat") - expected=["list-2010-01-cs.LG", "list-2010-cs.LG", "list-2010-01-cs", "list-2010-cs", "list-2010-01-hep-lat", "list-2010-hep-lat", "year-hep-lat-2010"] + expected=["list-2010-01-cs.LG", "list-2010-cs.LG", "list-2010-01-cs", "list-2010-cs", "list-2010-01-hep-lat", "list-2010-hep-lat", "year-hep-lat-2010", "list-2010-01-grp_physics", "list-2010-grp_physics"] assert sorted(result)==sorted(expected) #remove archive of alias @@ -194,7 +194,7 @@ def test_purge_category_change_add_cats(mockToday,mockDBQuery): mockToday.today.return_value=date(2024,1,1) mockDBQuery.return_value=("cs.LG hep-lat", date(2010,1,1)) result=_purge_category_change(Identifier('1001.5678'),"cs.LG") - expected=["list-2010-01-cs.LG", "list-2010-cs.LG", "list-2010-01-cs", "list-2010-cs", "list-2010-01-hep-lat", "list-2010-hep-lat", "year-hep-lat-2010"] + expected=["list-2010-01-cs.LG", "list-2010-cs.LG", "list-2010-01-cs", "list-2010-cs", "list-2010-01-hep-lat", "list-2010-hep-lat", "year-hep-lat-2010", "list-2010-01-grp_physics", "list-2010-grp_physics"] assert 
sorted(result)==sorted(expected) #add archive of alias diff --git a/arxiv/taxonomy/category.py b/arxiv/taxonomy/category.py index a77f41f8..44a00135 100644 --- a/arxiv/taxonomy/category.py +++ b/arxiv/taxonomy/category.py @@ -139,8 +139,8 @@ def create_bad_category(name:str) -> Category: is_general=False ) -def get_all_cats_from_string(cat_string:str, only_cannonical:Optional[bool]=False)->Tuple[List[Archive],List[Category]]: - """returns all possble lists and archives the categories and article would appear in based on the category string. +def get_all_cats_from_string(cat_string:str, only_cannonical:Optional[bool]=False)->Tuple[List[Group], List[Archive],List[Category]]: + """returns all possble groups, archives and category lists an article would appear in based on the category string. This is needed because alternate names are not always recorded in the strings setting only_cannonical to true will not return non canonical categories, but will stil return the archives from non-cannonical alias pairs This is meant to be used on catgory strings from the database, not user input @@ -151,11 +151,12 @@ def get_all_cats_from_string(cat_string:str, only_cannonical:Optional[bool]=Fals archives_canon={cat.get_archive() for cat in cats} archives_noncanon = {CATEGORIES[cat.alt_name].get_archive() for cat in cats if (cat.alt_name and cat.alt_name in CATEGORY_ALIASES)} #papers also belong in the archives of their non-cannon alias categories archives= archives_noncanon | archives_canon - + if not only_cannonical: noncannon_cats = {CATEGORIES[cat.alt_name] for cat in cats if cat.alt_name is not None and cat.alt_name in CATEGORIES} cats= cats | noncannon_cats noncannon_archives={cat.get_archive() for cat in noncannon_cats} archives= archives | noncannon_archives - return list(archives), list(cats) \ No newline at end of file + groups={arch.get_group() for arch in archives} + return list(groups), list(archives), list(cats) \ No newline at end of file diff --git 
a/arxiv/taxonomy/taxonomy_test.py b/arxiv/taxonomy/taxonomy_test.py index 40bca8eb..69ec9d88 100644 --- a/arxiv/taxonomy/taxonomy_test.py +++ b/arxiv/taxonomy/taxonomy_test.py @@ -172,55 +172,61 @@ def test_bad_objects(self): def test_all_cats_from_string(self): #empty string - self.assertEqual(get_all_cats_from_string(""),([],[]), "empty string doesn't cause error") - self.assertEqual(get_all_cats_from_string(" "),([],[]), "white space doesn't cause error") + self.assertEqual(get_all_cats_from_string(""),([],[],[]), "empty string doesn't cause error") + self.assertEqual(get_all_cats_from_string(" "),([],[],[]), "white space doesn't cause error") #basic - expected=([ARCHIVES["hep-lat"]], [CATEGORIES["hep-lat"]]) - self.assertEqual(get_all_cats_from_string("hep-lat"), expected, "returns both archive and category") - expected=([ARCHIVES["math"]], [CATEGORIES["math.SP"]]) - self.assertEqual(get_all_cats_from_string("math.SP"), expected, "returns both archive and category") + expected=([GROUPS['grp_physics']], [ARCHIVES["hep-lat"]], [CATEGORIES["hep-lat"]]) + self.assertEqual(get_all_cats_from_string("hep-lat"), expected, "returns group, archive and category") + expected=([GROUPS['grp_math']], [ARCHIVES["math"]], [CATEGORIES["math.SP"]]) + self.assertEqual(get_all_cats_from_string("math.SP"), expected, "returns group, archive and category") #multiple per archive - archs, cats= get_all_cats_from_string("math.GN math.SP") + grps, archs, cats= get_all_cats_from_string("math.GN math.SP") + self.assertCountEqual(grps, [GROUPS['grp_math']], "only one copy of the group") self.assertCountEqual(archs, [ARCHIVES["math"]], "only one copy of the archive") expected_cats=[CATEGORIES["math.SP"], CATEGORIES["math.GN"]] self.assertCountEqual(cats, expected_cats, "both categories present") #different archives - archs, cats= get_all_cats_from_string("math.GN math.SP cs.OS") + grps, archs, cats= get_all_cats_from_string("math.GN math.SP cs.OS") + self.assertCountEqual(grps, 
[GROUPS['grp_math'], GROUPS['grp_cs']], "one copy of each group") self.assertCountEqual(archs, [ARCHIVES["math"], ARCHIVES["cs"]], "one copy of each archive") expected_cats=[CATEGORIES["math.SP"], CATEGORIES["math.GN"], CATEGORIES["cs.OS"]] self.assertCountEqual(cats, expected_cats, "all categories present") #alliases all - archs, cats= get_all_cats_from_string("cs.SY") - self.assertEqual((archs,cats),get_all_cats_from_string("eess.SY"), "either verison of a category name should return the same thing") - self.assertEqual((archs,cats),get_all_cats_from_string("cs.SY eess.SY"), "one part of pair should have the same result as both") + grps, archs, cats= get_all_cats_from_string("cs.SY") + self.assertEqual((grps, archs,cats),get_all_cats_from_string("eess.SY"), "either verison of a category name should return the same thing") + self.assertEqual((grps, archs,cats),get_all_cats_from_string("cs.SY eess.SY"), "one part of pair should have the same result as both") + self.assertCountEqual(grps, [GROUPS['grp_eess'], GROUPS['grp_cs']], "part of two different groups") self.assertCountEqual(archs, [ARCHIVES["cs"], ARCHIVES["eess"]], "part of two different archives") expected_cats=[CATEGORIES["eess.SY"], CATEGORIES["cs.SY"]] self.assertCountEqual(cats, expected_cats, "all versions present") #alliases only cannonical - archs, cats= get_all_cats_from_string("cs.SY", True) - self.assertEqual((archs,cats),get_all_cats_from_string("eess.SY", True), "either verison of a category name should return the same thing") - self.assertEqual((archs,cats),get_all_cats_from_string("cs.SY eess.SY", True), "one part of pair should have the same result as both") + grps, archs, cats= get_all_cats_from_string("cs.SY", True) + self.assertEqual((grps, archs,cats),get_all_cats_from_string("eess.SY", True), "either verison of a category name should return the same thing") + self.assertEqual((grps, archs,cats),get_all_cats_from_string("cs.SY eess.SY", True), "one part of pair should have the same result 
as both") + self.assertCountEqual(grps, [GROUPS['grp_eess'], GROUPS['grp_cs']], "part of two different groups") self.assertCountEqual(archs, [ARCHIVES["cs"], ARCHIVES["eess"]], "part of two different archives") expected_cats=[CATEGORIES["eess.SY"]] self.assertCountEqual(cats, expected_cats, "only canonical present") #subsumed only canonical - archs, cats= get_all_cats_from_string("solv-int", True) - self.assertEqual((archs,cats),get_all_cats_from_string("nlin.SI", True), "either verison of a category name should return the same thing") - self.assertEqual((archs,cats),get_all_cats_from_string("solv-int nlin.SI", True), "one part of pair should have the same result as both") + grps, archs, cats= get_all_cats_from_string("solv-int", True) + self.assertEqual((grps, archs,cats),get_all_cats_from_string("nlin.SI", True), "either verison of a category name should return the same thing") + self.assertEqual((grps, archs,cats),get_all_cats_from_string("solv-int nlin.SI", True), "one part of pair should have the same result as both") self.assertCountEqual(archs, [ ARCHIVES["nlin"]], "don't include subsumed archive") + self.assertCountEqual(grps, [GROUPS['grp_physics']], "one copy of group") expected_cats=[CATEGORIES["nlin.SI"]] self.assertCountEqual(cats, expected_cats, "only canonical category returned") #subsumed all - archs, cats= get_all_cats_from_string("solv-int") - self.assertEqual((archs,cats),get_all_cats_from_string("nlin.SI"), "either verison of a category name should return the same thing") - self.assertEqual((archs,cats),get_all_cats_from_string("solv-int nlin.SI"), "one part of pair should have the same result as both") + grps, archs, cats= get_all_cats_from_string("solv-int") + self.assertEqual((grps, archs,cats),get_all_cats_from_string("nlin.SI"), "either verison of a category name should return the same thing") + self.assertEqual((grps, archs,cats),get_all_cats_from_string("solv-int nlin.SI"), "one part of pair should have the same result as both") + 
self.assertCountEqual(grps, [GROUPS['grp_physics']], "one copy of group") self.assertCountEqual(archs, [ARCHIVES["solv-int"], ARCHIVES["nlin"]], "include subsumed archive") expected_cats=[CATEGORIES["nlin.SI"], CATEGORIES["solv-int"]] self.assertCountEqual(cats, expected_cats, "all versions present") \ No newline at end of file