Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-6284] render tsv/csv #834

Merged
merged 28 commits
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 4 additions & 16 deletions share/search/index_strategy/_trovesearch_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,15 @@
)
from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri
from trove.vocab.namespaces import (
DCTERMS,
FOAF,
OSFMAP,
OWL,
RDF,
RDFS,
SKOS,
TROVE,
XSD,
)
from trove.vocab.osfmap import is_date_property
from trove.vocab.osfmap import (
is_date_property,
SKIPPABLE_PROPERTIES,
)


_logger = logging.getLogger(__name__)
Expand All @@ -38,16 +36,6 @@
###
# constants

SKIPPABLE_PROPERTIES = (
OSFMAP.contains, # too much, not helpful
OWL.sameAs, # handled special
)

TITLE_PROPERTIES = (DCTERMS.title,)
NAME_PROPERTIES = (FOAF.name, OSFMAP.fileName)
LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel)
NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)

KEYWORD_LENGTH_MAX = 8191 # skip keyword terms that might exceed lucene's internal limit
# (see https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html)
KEYWORD_MAPPING = {'type': 'keyword', 'ignore_above': KEYWORD_LENGTH_MAX}
Expand Down
16 changes: 6 additions & 10 deletions share/search/index_strategy/trove_indexcard_flats.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,11 @@
PropertypathUsage,
)
from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword
from trove.vocab.osfmap import is_date_property
from trove.vocab import osfmap
from trove.vocab.namespaces import RDF, OWL
from ._trovesearch_util import (
latest_rdf_for_indexcard_pks,
GraphWalk,
TITLE_PROPERTIES,
NAME_PROPERTIES,
LABEL_PROPERTIES,
NAMELIKE_PROPERTIES,
KEYWORD_LENGTH_MAX,
)

Expand Down Expand Up @@ -322,7 +318,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear

def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
_cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor)
_is_date_search = is_date_property(valuesearch_params.valuesearch_propertypath[-1])
_is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1])
_search_kwargs = dict(
query=self._cardsearch_query(
valuesearch_params.cardsearch_filter_set,
Expand Down Expand Up @@ -833,7 +829,7 @@ def _inner_hits(self, *, highlight_query=None) -> dict:

def _should_skip_card(indexcard_rdf, rdfdoc):
# skip cards without some value for name/title/label
return not any(rdfdoc.q(indexcard_rdf.focus_iri, NAMELIKE_PROPERTIES))
return not any(rdfdoc.q(indexcard_rdf.focus_iri, osfmap.NAMELIKE_PROPERTIES))


def _bucketlist(agg_result: dict) -> list[str]:
Expand Down Expand Up @@ -911,17 +907,17 @@ def for_iri_at_path(cls, path: tuple[str, ...], iri: str, rdfdoc):
# TODO: don't discard language for name/title/label
name_text=frozenset(
_text.unicode_value
for _text in rdfdoc.q(iri, NAME_PROPERTIES)
for _text in rdfdoc.q(iri, osfmap.NAME_PROPERTIES)
if isinstance(_text, primitive_rdf.Literal)
),
title_text=frozenset(
_text.unicode_value
for _text in rdfdoc.q(iri, TITLE_PROPERTIES)
for _text in rdfdoc.q(iri, osfmap.TITLE_PROPERTIES)
if isinstance(_text, primitive_rdf.Literal)
),
label_text=frozenset(
_text.unicode_value
for _text in rdfdoc.q(iri, LABEL_PROPERTIES)
for _text in rdfdoc.q(iri, osfmap.LABEL_PROPERTIES)
if isinstance(_text, primitive_rdf.Literal)
),
)
Expand Down
12 changes: 6 additions & 6 deletions share/search/index_strategy/trovesearch_denorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
ValuesearchResponse,
ValuesearchResult,
)
from trove.vocab.osfmap import is_date_property
from trove.vocab import osfmap
from trove.vocab.namespaces import OWL, RDF
from . import _trovesearch_util as ts

Expand Down Expand Up @@ -230,7 +230,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
_path = valuesearch_params.valuesearch_propertypath
_cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor)
_is_date_search = is_date_property(_path[-1])
_is_date_search = osfmap.is_date_property(_path[-1])
_query = (
_build_date_valuesearch(valuesearch_params)
if _is_date_search
Expand Down Expand Up @@ -275,7 +275,7 @@ def should_skip(self) -> bool:
# skip cards that belong to an obsolete suid with a later duplicate
_suid.has_forecompat_replacement()
# ...or that are without some value for name/title/label
or not any(self.rdfdoc.q(self.focus_iri, ts.NAMELIKE_PROPERTIES))
or not any(self.rdfdoc.q(self.focus_iri, osfmap.NAMELIKE_PROPERTIES))
)

def build_docs(self) -> Iterator[tuple[str, dict]]:
Expand Down Expand Up @@ -319,9 +319,9 @@ def _iri_value_subdoc(self, iri: str) -> dict:
_shortwalk = self._fullwalk.shortwalk_from(iri)
return {
**self._paths_and_values(_shortwalk),
'value_name': list(self._texts_at_properties(_shortwalk, ts.NAME_PROPERTIES)),
'value_title': list(self._texts_at_properties(_shortwalk, ts.TITLE_PROPERTIES)),
'value_label': list(self._texts_at_properties(_shortwalk, ts.LABEL_PROPERTIES)),
'value_name': list(self._texts_at_properties(_shortwalk, osfmap.NAME_PROPERTIES)),
'value_title': list(self._texts_at_properties(_shortwalk, osfmap.TITLE_PROPERTIES)),
'value_label': list(self._texts_at_properties(_shortwalk, osfmap.LABEL_PROPERTIES)),
'at_card_propertypaths': [
ts.propertypath_as_keyword(_path)
for _path in self._fullwalk.paths_by_iri[iri]
Expand Down
56 changes: 56 additions & 0 deletions tests/trove/_input_output_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import abc
from unittest import TestCase
import typing


class BasicInputOutputTestCase(TestCase):
    '''base for tests that have a simple/repetitive input/output pattern

    concrete subclasses define `inputs`, `expected_outputs`, and
    `compute_output` -- a single `test_inputs_match_outputs` test method is
    then added automatically, running one subtest per key in `inputs`.
    '''
    maxDiff = None  # usually want the full diff for these tests, tho can override if you prefer

    # expected on subclasses:
    inputs: typing.ClassVar[
        dict[str, typing.Any]
    ]
    expected_outputs: typing.ClassVar[
        # keys should match `inputs` keys (enforce with types? maybe someday)
        dict[str, typing.Any]
    ]

    # required in subclasses
    @abc.abstractmethod
    def compute_output(self, given_input: typing.Any) -> typing.Any:
        '''compute the actual output for one input value (implement per subclass)'''
        raise NotImplementedError

    # (optional override, for when equality isn't so easy)
    def assert_outputs_equal(self, expected_output: typing.Any, actual_output: typing.Any) -> None:
        self.assertEqual(expected_output, actual_output)

    # (optional override, for when logic is more complicated)
    def run_input_output_test(self, given_input, expected_output):
        _actual_output = self.compute_output(given_input)
        self.assert_outputs_equal(expected_output, _actual_output)

    ###
    # private details

    def __init_subclass__(cls, **kwargs):
        # forward to super so cooperative subclassing keeps working
        super().__init_subclass__(**kwargs)
        # HACK: assign `test_*` method only on concrete subclasses,
        # so the test runner doesn't try instantiating a base class
        if getattr(cls, 'inputs', None) and getattr(cls, 'expected_outputs', None):
            cls.test_inputs_match_outputs = cls._test_inputs_match_outputs  # type: ignore[attr-defined]

    # the only actual test method -- assigned to concrete subclasses in __init_subclass__
    def _test_inputs_match_outputs(self):
        for _name, _input, _expected_output in self._iter_cases():
            with self.subTest(name=_name):
                self.run_input_output_test(_input, _expected_output)

    def _iter_cases(self) -> typing.Iterator[tuple[str, typing.Any, typing.Any]]:
        # yields (name, input, expected_output) tuples
        for _name, _input in self.inputs.items():
            try:
                _expected_output = self.expected_outputs[_name]
            except KeyError:
                # missing expectation: point at exactly what the subclass must add
                raise NotImplementedError(f'{self.__class__.__qualname__}.expected_outputs["{_name}"]')
            yield (_name, _input, _expected_output)
74 changes: 19 additions & 55 deletions tests/trove/derive/_base.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,35 @@
import datetime
from unittest import mock, TestCase
import typing
from unittest import mock

from primitive_metadata import primitive_rdf as rdf

from trove.derive._base import IndexcardDeriver
from tests.trove._input_output_tests import BasicInputOutputTestCase
from ._inputs import DERIVER_TEST_DOCS, DeriverTestDoc


SHOULD_SKIP = object() # for deriver inputs that should be skipped


class BaseIndexcardDeriverTest(TestCase):
maxDiff = None
class BaseIndexcardDeriverTest(BasicInputOutputTestCase):
inputs = DERIVER_TEST_DOCS # (leave this one alone)

#######
# implement these things:
# required on subclasses: `deriver_class` and `expected_outputs`
deriver_class: type[IndexcardDeriver]
# expected_outputs: dict[str, typing.Any]
# ^ (from BasicInputOutputTestCase) must have the same keys as
# `DERIVER_TEST_DOCS` and values that are either `SHOULD_SKIP`
# (when `deriver.should_skip()` should return true) or a value
# that can be compared against `deriver.derive_card_as_text()`

# a subclass of IndexcardDeriver
deriver_class: type
def compute_output(self, given_input):
return self._get_deriver(given_input).derive_card_as_text()

# dictionary with the same keys as `DERIVER_TEST_DOCS` and values that
# are either `SHOULD_SKIP` (above) or strings that will be passed as
# `expected_text` to `derived_texts_equal`
expected_outputs: dict

# (optional override, for when equality isn't so easy)
def assert_derived_texts_equal(self, expected_text: str, actual_text: str) -> None:
self.assertEqual(expected_text, actual_text)

#######
# don't override anything else

test_should_skip: typing.Callable[['BaseIndexcardDeriverTest'], None]
test_derive_card_as_text: typing.Callable[['BaseIndexcardDeriverTest'], None]

def __init_subclass__(cls):
# add test methods on subclasses (but not the base class!)
cls.test_should_skip = _test_should_skip
cls.test_derive_card_as_text = _test_derive_card_as_text

def setUp(self):
_patcher = mock.patch('share.util.IDObfuscator.encode', new=lambda x: x.id)
_patcher.start()
self.addCleanup(_patcher.stop)
def run_input_output_test(self, given_input, expected_output):
if expected_output is SHOULD_SKIP:
self.assertTrue(self._get_deriver(given_input).should_skip())
else:
super().run_input_output_test(given_input, expected_output)

def _get_deriver(self, input_doc: DeriverTestDoc):
_mock_suid = mock.Mock()
Expand All @@ -62,26 +49,3 @@ def _get_deriver(self, input_doc: DeriverTestDoc):
_mock_indexcard_rdf.indexcard.id = '--indexcard-id--'
_mock_indexcard_rdf.indexcard.source_record_suid = _mock_suid
return self.deriver_class(_mock_indexcard_rdf)

def _iter_test_cases(self):
for _input_key, _input_doc in DERIVER_TEST_DOCS.items():
_expected_output = self.expected_outputs.get(_input_key)
if _expected_output is None:
raise NotImplementedError(f'{self.__class__.__qualname__}.expected_outputs["{_input_key}"]')
with self.subTest(input_key=_input_key):
yield (_input_key, self._get_deriver(_input_doc), _expected_output)


def _test_should_skip(self: BaseIndexcardDeriverTest) -> None:
for _input_key, _deriver, _expected_output in self._iter_test_cases():
self.assertEqual(
bool(_expected_output is SHOULD_SKIP),
_deriver.should_skip(),
)


def _test_derive_card_as_text(self: BaseIndexcardDeriverTest) -> None:
for _input_key, _deriver, _expected_output in self._iter_test_cases():
if _expected_output is not SHOULD_SKIP:
_output = _deriver.derive_card_as_text()
self.assert_derived_texts_equal(_expected_output, _output)
2 changes: 1 addition & 1 deletion tests/trove/derive/test_osfmap_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class TestOsfmapJsonDeriver(BaseIndexcardDeriverTest):
deriver_class = OsfmapJsonDeriver

def assert_derived_texts_equal(self, expected, actual):
def assert_outputs_equal(self, expected, actual):
self.assertEqual(expected, json.loads(actual))

expected_outputs = {
Expand Down
9 changes: 8 additions & 1 deletion tests/trove/derive/test_sharev2_elastic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from unittest import mock

from trove.derive.sharev2_elastic import ShareV2ElasticDeriver

Expand All @@ -8,7 +9,13 @@
class TestShareV2ElasticDeriver(BaseIndexcardDeriverTest):
deriver_class = ShareV2ElasticDeriver

def assert_derived_texts_equal(self, expected, actual):
def setUp(self):
# un-obfuscated ids, please
_patcher = mock.patch('share.util.IDObfuscator.encode', new=lambda x: x.id)
_patcher.start()
self.addCleanup(_patcher.stop)

def assert_outputs_equal(self, expected, actual):
self.assertEqual(expected, json.loads(actual))

expected_outputs = {
Expand Down
Empty file added tests/trove/render/__init__.py
Empty file.
48 changes: 48 additions & 0 deletions tests/trove/render/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import json

from trove.render._base import BaseRenderer
from trove.render._rendering import ProtoRendering
from tests.trove._input_output_tests import BasicInputOutputTestCase
from ._inputs import UNRENDERED_RDF, UNRENDERED_SEARCH_RDF, RdfCase


class TroveRendererTests(BasicInputOutputTestCase):
    '''base for renderer tests: render each rdf case, compare against expected renderings'''
    inputs = UNRENDERED_RDF

    # subclasses must set `renderer_class`
    renderer_class: type[BaseRenderer]
    # (and `expected_outputs: dict[str, typing.Any]`, from BasicInputOutputTestCase)

    def compute_output(self, given_input: RdfCase):
        '''render the given rdf case with this test's `renderer_class`'''
        return self.renderer_class(
            response_focus_iri=given_input.focus,
            response_tripledict=given_input.tripledict,
        ).render_document()

    def assert_outputs_equal(self, expected_output, actual_output) -> None:
        if expected_output is ...:
            # placeholder expectation: show what was actually rendered,
            # so the real expected value can be filled in
            print(repr(actual_output))
            raise NotImplementedError
        self.assertEqual(expected_output.mediatype, actual_output.mediatype)
        _expected_content = self._get_rendered_output(expected_output)
        _actual_content = self._get_rendered_output(actual_output)
        self.assertEqual(_expected_content, _actual_content)

    def _get_rendered_output(self, rendering: ProtoRendering):
        # for now, they always iter strings (update if/when bytes are in play)
        return ''.join(rendering.iter_content())  # type: ignore[arg-type]


class TrovesearchRendererTests(TroveRendererTests):
    # same renderer tests, but run against search-response rdf inputs
    inputs = UNRENDERED_SEARCH_RDF


class TroveJsonRendererTests(TroveRendererTests):
    '''renderer tests for json mediatypes: compare parsed json, not raw text'''
    def _get_rendered_output(self, rendering: ProtoRendering):
        _rendered_text = super()._get_rendered_output(rendering)
        return json.loads(_rendered_text)


class TrovesearchJsonRendererTests(TroveJsonRendererTests, TrovesearchRendererTests):
    # combines json-parsing comparison with search-response inputs (mixin-only; no new behavior)
    pass
Loading
Loading