Skip to content

Commit

Permalink
Merge pull request #311 from arXiv/ARXIVCE-2576-support-purging-grp-p…
Browse files Browse the repository at this point in the history
…hysics

get all categories from string also returns groups, lists for the ph…
  • Loading branch information
kyokukou authored Sep 10, 2024
2 parents ca21eaa + 1ce8dd4 commit 25952ff
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 33 deletions.
15 changes: 10 additions & 5 deletions arxiv/integration/fastly/purge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from arxiv.db import Session
from arxiv.db.models import Metadata, Updates
from arxiv.identifier import Identifier, IdentifierException
from arxiv.taxonomy.definitions import CATEGORIES
from arxiv.taxonomy.definitions import GROUPS
from arxiv.taxonomy.category import get_all_cats_from_string


Expand Down Expand Up @@ -70,9 +70,10 @@ def _get_category_and_date(arxiv_id:Identifier)-> Tuple[str, date]:
def _purge_category_change(arxiv_id:Identifier, old_cats:Optional[str]=None )-> List[str]:
"""determines all list and year pages required for a category change to a paper
returns list of all keys to purge
does not includepaths for the paper itself
assumes categories will be provided as string like from abs_categories feild, but could be imporved if categories could be specified in a list
does not include paths for the paper itself
assumes categories will be provided as string like from abs_categories field, but could be improved if categories could be specified in a list
"""
grp_physics=GROUPS['grp_physics']
new_cats, recent_date= _get_category_and_date(arxiv_id)

#get time period affected
Expand All @@ -84,16 +85,20 @@ def _purge_category_change(arxiv_id:Identifier, old_cats:Optional[str]=None )->
if today - timedelta(days=7) <= recent_date:
recent=True

archives, cats = get_all_cats_from_string(new_cats, True)
groups, archives, cats = get_all_cats_from_string(new_cats, True)
new_archive_ids={arch.id for arch in archives}
list_pages={cat.id for cat in cats} | new_archive_ids
if grp_physics in groups: #grp_physics is a group for catchup
list_pages.add(grp_physics.id)

year_pages=[]
if old_cats: #clear any pages this paper may have been removed from or added to
old_archives, old_categories = get_all_cats_from_string(old_cats, True)
old_groups, old_archives, old_categories = get_all_cats_from_string(old_cats, True)
old_cat_ids= {cat.id for cat in old_categories}
old_archive_ids={arch.id for arch in old_archives}
list_pages= list_pages | old_cat_ids | old_archive_ids
if grp_physics in old_groups: #grp_physics is a group for catchup
list_pages.add(grp_physics.id)
year_pages= old_archive_ids.symmetric_difference(new_archive_ids)

#collect all relevant keys
Expand Down
8 changes: 4 additions & 4 deletions arxiv/integration/tests/test_fastly.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,14 @@ def test_purge_category_change_alias_cats(mockToday,mockDBQuery):
mockToday.today.return_value=date(2024,1,1)
mockDBQuery.return_value=("solv-int cs.SY", date(2010,1,1))
result=_purge_category_change(Identifier('1001.5678'))
expected=["list-2010-01-nlin.SI", "list-2010-nlin.SI", "list-2010-01-nlin", "list-2010-nlin","list-2010-01-eess.SY", "list-2010-eess.SY", "list-2010-01-eess", "list-2010-eess", "list-2010-01-cs", "list-2010-cs" ]
expected=["list-2010-01-nlin.SI", "list-2010-nlin.SI", "list-2010-01-nlin", "list-2010-nlin","list-2010-01-eess.SY", "list-2010-eess.SY", "list-2010-01-eess", "list-2010-eess", "list-2010-01-cs", "list-2010-cs", "list-2010-01-grp_physics", "list-2010-grp_physics" ]
assert sorted(result)==sorted(expected)

#find archives of unlisted noncanonical categories
mockToday.today.return_value=date(2024,1,1)
mockDBQuery.return_value=("nlin.SI eess.SY", date(2010,1,1))
result=_purge_category_change(Identifier('1001.5678'))
expected=["list-2010-01-nlin.SI", "list-2010-nlin.SI", "list-2010-01-nlin", "list-2010-nlin","list-2010-01-eess.SY", "list-2010-eess.SY", "list-2010-01-eess", "list-2010-eess", "list-2010-01-cs", "list-2010-cs" ]
expected=["list-2010-01-nlin.SI", "list-2010-nlin.SI", "list-2010-01-nlin", "list-2010-nlin","list-2010-01-eess.SY", "list-2010-eess.SY", "list-2010-01-eess", "list-2010-eess", "list-2010-01-cs", "list-2010-cs", "list-2010-01-grp_physics", "list-2010-grp_physics" ]
assert sorted(result)==sorted(expected)

@patch('arxiv.integration.fastly.purge._get_category_and_date')
Expand All @@ -170,7 +170,7 @@ def test_purge_category_change_remove_cats(mockToday,mockDBQuery):
mockToday.today.return_value=date(2024,1,1)
mockDBQuery.return_value=("cs.LG", date(2010,1,1))
result=_purge_category_change(Identifier('1001.5678'),"cs.LG hep-lat")
expected=["list-2010-01-cs.LG", "list-2010-cs.LG", "list-2010-01-cs", "list-2010-cs", "list-2010-01-hep-lat", "list-2010-hep-lat", "year-hep-lat-2010"]
expected=["list-2010-01-cs.LG", "list-2010-cs.LG", "list-2010-01-cs", "list-2010-cs", "list-2010-01-hep-lat", "list-2010-hep-lat", "year-hep-lat-2010", "list-2010-01-grp_physics", "list-2010-grp_physics"]
assert sorted(result)==sorted(expected)

#remove archive of alias
Expand All @@ -194,7 +194,7 @@ def test_purge_category_change_add_cats(mockToday,mockDBQuery):
mockToday.today.return_value=date(2024,1,1)
mockDBQuery.return_value=("cs.LG hep-lat", date(2010,1,1))
result=_purge_category_change(Identifier('1001.5678'),"cs.LG")
expected=["list-2010-01-cs.LG", "list-2010-cs.LG", "list-2010-01-cs", "list-2010-cs", "list-2010-01-hep-lat", "list-2010-hep-lat", "year-hep-lat-2010"]
expected=["list-2010-01-cs.LG", "list-2010-cs.LG", "list-2010-01-cs", "list-2010-cs", "list-2010-01-hep-lat", "list-2010-hep-lat", "year-hep-lat-2010", "list-2010-01-grp_physics", "list-2010-grp_physics"]
assert sorted(result)==sorted(expected)

#add archive of alias
Expand Down
9 changes: 5 additions & 4 deletions arxiv/taxonomy/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ def create_bad_category(name:str) -> Category:
is_general=False
)

def get_all_cats_from_string(cat_string:str, only_cannonical:Optional[bool]=False)->Tuple[List[Archive],List[Category]]:
"""returns all possble lists and archives the categories and article would appear in based on the category string.
def get_all_cats_from_string(cat_string:str, only_cannonical:Optional[bool]=False)->Tuple[List[Group], List[Archive],List[Category]]:
"""returns all possble groups, archives and category lists an article would appear in based on the category string.
This is needed because alternate names are not always recorded in the strings
setting only_cannonical to true will not return non-canonical categories, but will still return the archives from non-canonical alias pairs
This is meant to be used on category strings from the database, not user input
Expand All @@ -151,11 +151,12 @@ def get_all_cats_from_string(cat_string:str, only_cannonical:Optional[bool]=Fals
archives_canon={cat.get_archive() for cat in cats}
archives_noncanon = {CATEGORIES[cat.alt_name].get_archive() for cat in cats if (cat.alt_name and cat.alt_name in CATEGORY_ALIASES)} #papers also belong in the archives of their non-cannon alias categories
archives= archives_noncanon | archives_canon

if not only_cannonical:
noncannon_cats = {CATEGORIES[cat.alt_name] for cat in cats if cat.alt_name is not None and cat.alt_name in CATEGORIES}
cats= cats | noncannon_cats
noncannon_archives={cat.get_archive() for cat in noncannon_cats}
archives= archives | noncannon_archives

return list(archives), list(cats)
groups={arch.get_group() for arch in archives}
return list(groups), list(archives), list(cats)
46 changes: 26 additions & 20 deletions arxiv/taxonomy/taxonomy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,55 +172,61 @@ def test_bad_objects(self):

def test_all_cats_from_string(self):
#empty string
self.assertEqual(get_all_cats_from_string(""),([],[]), "empty string doesn't cause error")
self.assertEqual(get_all_cats_from_string(" "),([],[]), "white space doesn't cause error")
self.assertEqual(get_all_cats_from_string(""),([],[],[]), "empty string doesn't cause error")
self.assertEqual(get_all_cats_from_string(" "),([],[],[]), "white space doesn't cause error")

#basic
expected=([ARCHIVES["hep-lat"]], [CATEGORIES["hep-lat"]])
self.assertEqual(get_all_cats_from_string("hep-lat"), expected, "returns both archive and category")
expected=([ARCHIVES["math"]], [CATEGORIES["math.SP"]])
self.assertEqual(get_all_cats_from_string("math.SP"), expected, "returns both archive and category")
expected=([GROUPS['grp_physics']], [ARCHIVES["hep-lat"]], [CATEGORIES["hep-lat"]])
self.assertEqual(get_all_cats_from_string("hep-lat"), expected, "returns group, archive and category")
expected=([GROUPS['grp_math']], [ARCHIVES["math"]], [CATEGORIES["math.SP"]])
self.assertEqual(get_all_cats_from_string("math.SP"), expected, "returns group, archive and category")

#multiple per archive
archs, cats= get_all_cats_from_string("math.GN math.SP")
grps, archs, cats= get_all_cats_from_string("math.GN math.SP")
self.assertCountEqual(grps, [GROUPS['grp_math']], "only one copy of the group")
self.assertCountEqual(archs, [ARCHIVES["math"]], "only one copy of the archive")
expected_cats=[CATEGORIES["math.SP"], CATEGORIES["math.GN"]]
self.assertCountEqual(cats, expected_cats, "both categories present")

#different archives
archs, cats= get_all_cats_from_string("math.GN math.SP cs.OS")
grps, archs, cats= get_all_cats_from_string("math.GN math.SP cs.OS")
self.assertCountEqual(grps, [GROUPS['grp_math'], GROUPS['grp_cs']], "one copy of each group")
self.assertCountEqual(archs, [ARCHIVES["math"], ARCHIVES["cs"]], "one copy of each archive")
expected_cats=[CATEGORIES["math.SP"], CATEGORIES["math.GN"], CATEGORIES["cs.OS"]]
self.assertCountEqual(cats, expected_cats, "all categories present")

#alliases all
archs, cats= get_all_cats_from_string("cs.SY")
self.assertEqual((archs,cats),get_all_cats_from_string("eess.SY"), "either verison of a category name should return the same thing")
self.assertEqual((archs,cats),get_all_cats_from_string("cs.SY eess.SY"), "one part of pair should have the same result as both")
grps, archs, cats= get_all_cats_from_string("cs.SY")
self.assertEqual((grps, archs,cats),get_all_cats_from_string("eess.SY"), "either verison of a category name should return the same thing")
self.assertEqual((grps, archs,cats),get_all_cats_from_string("cs.SY eess.SY"), "one part of pair should have the same result as both")
self.assertCountEqual(grps, [GROUPS['grp_eess'], GROUPS['grp_cs']], "part of two different groups")
self.assertCountEqual(archs, [ARCHIVES["cs"], ARCHIVES["eess"]], "part of two different archives")
expected_cats=[CATEGORIES["eess.SY"], CATEGORIES["cs.SY"]]
self.assertCountEqual(cats, expected_cats, "all versions present")

#alliases only cannonical
archs, cats= get_all_cats_from_string("cs.SY", True)
self.assertEqual((archs,cats),get_all_cats_from_string("eess.SY", True), "either verison of a category name should return the same thing")
self.assertEqual((archs,cats),get_all_cats_from_string("cs.SY eess.SY", True), "one part of pair should have the same result as both")
grps, archs, cats= get_all_cats_from_string("cs.SY", True)
self.assertEqual((grps, archs,cats),get_all_cats_from_string("eess.SY", True), "either verison of a category name should return the same thing")
self.assertEqual((grps, archs,cats),get_all_cats_from_string("cs.SY eess.SY", True), "one part of pair should have the same result as both")
self.assertCountEqual(grps, [GROUPS['grp_eess'], GROUPS['grp_cs']], "part of two different groups")
self.assertCountEqual(archs, [ARCHIVES["cs"], ARCHIVES["eess"]], "part of two different archives")
expected_cats=[CATEGORIES["eess.SY"]]
self.assertCountEqual(cats, expected_cats, "only canonical present")

#subsumed only canonical
archs, cats= get_all_cats_from_string("solv-int", True)
self.assertEqual((archs,cats),get_all_cats_from_string("nlin.SI", True), "either verison of a category name should return the same thing")
self.assertEqual((archs,cats),get_all_cats_from_string("solv-int nlin.SI", True), "one part of pair should have the same result as both")
grps, archs, cats= get_all_cats_from_string("solv-int", True)
self.assertEqual((grps, archs,cats),get_all_cats_from_string("nlin.SI", True), "either verison of a category name should return the same thing")
self.assertEqual((grps, archs,cats),get_all_cats_from_string("solv-int nlin.SI", True), "one part of pair should have the same result as both")
self.assertCountEqual(archs, [ ARCHIVES["nlin"]], "don't include subsumed archive")
self.assertCountEqual(grps, [GROUPS['grp_physics']], "one copy of group")
expected_cats=[CATEGORIES["nlin.SI"]]
self.assertCountEqual(cats, expected_cats, "only canonical category returned")

#subsumed all
archs, cats= get_all_cats_from_string("solv-int")
self.assertEqual((archs,cats),get_all_cats_from_string("nlin.SI"), "either verison of a category name should return the same thing")
self.assertEqual((archs,cats),get_all_cats_from_string("solv-int nlin.SI"), "one part of pair should have the same result as both")
grps, archs, cats= get_all_cats_from_string("solv-int")
self.assertEqual((grps, archs,cats),get_all_cats_from_string("nlin.SI"), "either verison of a category name should return the same thing")
self.assertEqual((grps, archs,cats),get_all_cats_from_string("solv-int nlin.SI"), "one part of pair should have the same result as both")
self.assertCountEqual(grps, [GROUPS['grp_physics']], "one copy of group")
self.assertCountEqual(archs, [ARCHIVES["solv-int"], ARCHIVES["nlin"]], "include subsumed archive")
expected_cats=[CATEGORIES["nlin.SI"], CATEGORIES["solv-int"]]
self.assertCountEqual(cats, expected_cats, "all versions present")

0 comments on commit 25952ff

Please sign in to comment.