[ENG-6284] render tsv/csv (#834)
allow rendering search responses as lines of tab-separated or comma-separated values

main point:

- add simple_tsv and simple_csv renderers in trove.render
  - can be seen with query param acceptMediatype=text/tab-separated-values or acceptMediatype=text/csv (see the request sketch after this list)
  - get default columns from static DEFAULT_TABULAR_SEARCH_COLUMN_PATHS in trove.vocab.osfmap
- allow "download" responses -- add withFileName=foo query param to get a response with Content-Disposition: attachment and a filename based on "foo"
- allow absurd page sizes
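For a concrete picture, here is a minimal request sketch. Only the acceptMediatype and withFileName query params come from this commit; the base URL and endpoint path are assumptions for illustration.

```python
# hedged sketch -- the endpoint URL is assumed, not part of this commit
import requests

_resp = requests.get(
    'https://share.osf.io/trove/index-card-search',  # assumed endpoint
    params={
        'acceptMediatype': 'text/csv',  # or 'text/tab-separated-values'
        'withFileName': 'my-results',   # requests Content-Disposition: attachment
    },
)
print(_resp.headers.get('Content-Disposition'))  # e.g. attachment; filename=...
print(_resp.text.splitlines()[0])  # header row from the default column paths
```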

changes made along the way:

- introduce ProtoRendering as renderer output type, to better decouple rendering from view logic
  - include StreamableRendering for responses that might be streamed, like csv/tsv (though it's not currently handled any differently from SimpleRendering)
  - reshape BaseRenderer (and each existing renderer) to have a consistent call signature (and return ProtoRendering)
    - replace trove.render.get_renderer with trove.render.get_renderer_type -- instantiate the renderer with response data
- add trove.views._responder with common logic for building a django HttpResponse for a ProtoRendering (sketched after this list)
  - consistently handles withFileName/Content-Disposition
- move some osf-specific constants to trove.vocab.osfmap for easier reuse
- pull out some abstractable logic:
  - from existing trove.render.simple_json into trove.render._simple_trovesearch (for renderers that include only the list of search results)
  - from existing tests.trove.derive._base into tests.trove._input_output_tests (for tests following the same simple input/output pattern as deriver and renderer tests)
- add tests.trove.render to cover the new renderers simple_tsv and simple_csv, as well as the existing renderers jsonapi, simple_json, jsonld, and turtle
  - minimally update existing renderers to create consistent output
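A rough sketch of the responder logic described above: ProtoRendering, trove.views._responder, and the withFileName/Content-Disposition behavior are named in this commit, but every field, signature, and the extension mapping below are assumptions about the shape, not the actual implementation.

```python
# hedged sketch of a _responder-style helper -- all details assumed
import dataclasses
from django.http import HttpResponse

_EXTENSION_BY_MEDIATYPE = {  # assumed mapping for filename suffixes
    'text/csv': '.csv',
    'text/tab-separated-values': '.tsv',
}

@dataclasses.dataclass
class ProtoRendering:  # name from the commit; fields are guesses
    mediatype: str
    rendered_content: str

def make_http_response(rendering: ProtoRendering, with_file_name: str | None = None) -> HttpResponse:
    _response = HttpResponse(rendering.rendered_content, content_type=rendering.mediatype)
    if with_file_name:  # the withFileName query param, when present
        _filename = with_file_name + _EXTENSION_BY_MEDIATYPE.get(rendering.mediatype, '')
        _response['Content-Disposition'] = f'attachment; filename="{_filename}"'
    return _response
```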
aaxelb authored Dec 23, 2024
1 parent 24bc70a commit 75ab046
Showing 44 changed files with 2,366 additions and 636 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -43,4 +43,4 @@ xmltodict==0.12.0 # MIT
 # Allows custom-rendered IDs, hiding null values, and including data in error responses
 git+https://github.com/cos-forks/[email protected]+cos0
 
-git+https://github.com/aaxelb/[email protected]
+git+https://github.com/aaxelb/[email protected]
10 changes: 5 additions & 5 deletions share/search/index_strategy/_base.py
@@ -12,9 +12,9 @@
     CardsearchParams,
     ValuesearchParams,
 )
-from trove.trovesearch.search_response import (
-    CardsearchResponse,
-    ValuesearchResponse,
+from trove.trovesearch.search_handle import (
+    CardsearchHandle,
+    ValuesearchHandle,
 )
 
 
@@ -219,10 +219,10 @@ def pls_stop_keeping_live(self):
     def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict:
         raise NotImplementedError(f'{self.__class__.__name__} does not implement pls_handle_search__sharev2_backcompat (either implement it or don\'t use this strategy for backcompat)')
 
-    def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse:
+    def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle:
         raise NotImplementedError
 
-    def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
+    def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle:
         raise NotImplementedError
 
     def pls_get_mappings(self) -> dict:
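Downstream, view logic can pair these renamed hooks with the renderer plumbing from the commit message (get_renderer_type instantiating a renderer with response data). A hedged flow sketch -- pls_handle_cardsearch, CardsearchHandle, and get_renderer_type are names from this commit; the glue function and the render_document method are invented for illustration:

```python
# hypothetical glue -- only the pls_handle_cardsearch call and the
# renderer-instantiated-with-response-data pattern come from this commit
def handle_cardsearch_request(index_strategy, cardsearch_params, renderer_type):
    _handle = index_strategy.pls_handle_cardsearch(cardsearch_params)  # a CardsearchHandle
    _renderer = renderer_type(_handle)  # instantiate the renderer with response data
    return _renderer.render_document()  # assumed method returning a ProtoRendering
```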
22 changes: 5 additions & 17 deletions share/search/index_strategy/_trovesearch_util.py
@@ -19,17 +19,15 @@
 )
 from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri
 from trove.vocab.namespaces import (
-    DCTERMS,
-    FOAF,
-    OSFMAP,
     OWL,
     RDF,
-    RDFS,
-    SKOS,
     TROVE,
     XSD,
 )
-from trove.vocab.osfmap import is_date_property
+from trove.vocab.osfmap import (
+    is_date_property,
+    SKIPPABLE_PROPERTIES,
+)
 
 
 _logger = logging.getLogger(__name__)
@@ -38,16 +36,6 @@
 ###
 # constants
 
-SKIPPABLE_PROPERTIES = (
-    OSFMAP.contains,  # too much, not helpful
-    OWL.sameAs,  # handled special
-)
-
-TITLE_PROPERTIES = (DCTERMS.title,)
-NAME_PROPERTIES = (FOAF.name, OSFMAP.fileName)
-LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel)
-NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)
-
 KEYWORD_LENGTH_MAX = 8191  # skip keyword terms that might exceed lucene's internal limit
 # (see https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html)
 KEYWORD_MAPPING = {'type': 'keyword', 'ignore_above': KEYWORD_LENGTH_MAX}
@@ -160,7 +148,7 @@ def __post_init__(self):
                 self.integer_values[_walk_path].add(_walk_obj)
             elif isinstance(_walk_obj, rdf.Literal):
                 if XSD.integer in _walk_obj.datatype_iris:
-                    self.integer_values[_walk_path].add(_walk_obj)
+                    self.integer_values[_walk_path].add(int(_walk_obj.unicode_value))
                 if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris):
                     self.text_values[_walk_path].add(_walk_obj)
                 # try for date in a date property, regardless of the above
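Two things worth noting in this file's diff: the int(_walk_obj.unicode_value) change stores plain python ints (parsed from the literal's text) in integer_values instead of rdf.Literal objects, and the osf-specific constants now live in trove.vocab.osfmap, where the later hunks import them. A small sketch of the new import surface (the tuple contents in comments are copied from the deleted lines above):

```python
# constants relocated to trove.vocab.osfmap, per the commit message
from trove.vocab import osfmap

osfmap.SKIPPABLE_PROPERTIES   # (OSFMAP.contains, OWL.sameAs)
osfmap.NAMELIKE_PROPERTIES    # (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)
```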
54 changes: 26 additions & 28 deletions share/search/index_strategy/trove_indexcard_flats.py
@@ -31,24 +31,20 @@
     SortParam,
     GLOB_PATHSTEP,
 )
-from trove.trovesearch.search_response import (
-    CardsearchResponse,
-    ValuesearchResponse,
+from trove.trovesearch.search_handle import (
+    CardsearchHandle,
+    ValuesearchHandle,
     TextMatchEvidence,
     CardsearchResult,
     ValuesearchResult,
     PropertypathUsage,
 )
 from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword
-from trove.vocab.osfmap import is_date_property
+from trove.vocab import osfmap
 from trove.vocab.namespaces import RDF, OWL
 from ._trovesearch_util import (
     latest_rdf_for_indexcard_pks,
     GraphWalk,
-    TITLE_PROPERTIES,
-    NAME_PROPERTIES,
-    LABEL_PROPERTIES,
-    NAMELIKE_PROPERTIES,
     KEYWORD_LENGTH_MAX,
 )

@@ -288,7 +284,7 @@ def pls_handle_search__sharev2_backcompat(self, request_body=None, request_query
             params=(request_queryparams or {}),
         )
 
-    def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse:
+    def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle:
         _cursor = self._cardsearch_cursor(cardsearch_params)
         _sort = self._cardsearch_sort(cardsearch_params.sort_list)
         _query = self._cardsearch_query(
@@ -306,7 +302,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
             aggs=self._cardsearch_aggs(cardsearch_params),
             sort=_sort,
             from_=_from_offset,
-            size=_cursor.page_size,
+            size=_cursor.bounded_page_size,
             source=False,  # no need to get _source; _id is enough
         )
         if settings.DEBUG:
@@ -318,11 +314,11 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
             )
         except elasticsearch8.TransportError as error:
             raise exceptions.IndexStrategyError() from error  # TODO: error messaging
-        return self._cardsearch_response(cardsearch_params, _es8_response, _cursor)
+        return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor)
 
-    def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
+    def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle:
         _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor)
-        _is_date_search = is_date_property(valuesearch_params.valuesearch_propertypath[-1])
+        _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1])
         _search_kwargs = dict(
             query=self._cardsearch_query(
                 valuesearch_params.cardsearch_filter_set,
@@ -347,7 +343,7 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value
             )
         except elasticsearch8.TransportError as error:
             raise exceptions.IndexStrategyError() from error  # TODO: error messaging
-        return self._valuesearch_response(valuesearch_params, _es8_response, _cursor)
+        return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor)
 
     ###
     # query implementation
@@ -449,7 +445,7 @@ def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: O
         _nested_terms_agg = {
             'field': 'nested_iri.iri_value',
             # WARNING: terribly inefficient pagination (part one)
-            'size': cursor.start_offset + cursor.page_size + 1,
+            'size': cursor.start_offset + cursor.bounded_page_size + 1,
         }
         _iris = list(valuesearch_params.valuesearch_iris())
         if _iris:
@@ -526,7 +522,7 @@ def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams):
         }
         return _aggs
 
-    def _valuesearch_response(
+    def _valuesearch_handle(
         self,
         valuesearch_params: ValuesearchParams,
         es8_response: dict,
@@ -537,31 +533,33 @@ def _valuesearch_response(
             _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets']
             _bucket_count = len(_buckets)
             # WARNING: terribly inefficient pagination (part two)
-            _page_end_index = cursor.start_offset + cursor.page_size
+            _page_end_index = cursor.start_offset + cursor.bounded_page_size
             _bucket_page = _buckets[cursor.start_offset:_page_end_index]  # discard prior pages
             cursor.total_count = (
                 MANY_MORE
                 if (_bucket_count > _page_end_index)  # agg includes one more, if there
                 else _bucket_count
             )
-            return ValuesearchResponse(
+            return ValuesearchHandle(
                 cursor=cursor,
                 search_result_page=[
                     self._valuesearch_iri_result(_iri_bucket)
                     for _iri_bucket in _bucket_page
                 ],
+                search_params=valuesearch_params,
             )
         else:  # assume date
             _year_buckets = (
                 es8_response['aggregations']['in_nested_date']
                 ['value_at_propertypath']['count_by_year']['buckets']
             )
-            return ValuesearchResponse(
+            return ValuesearchHandle(
                 cursor=PageCursor(len(_year_buckets)),
                 search_result_page=[
                     self._valuesearch_date_result(_year_bucket)
                     for _year_bucket in _year_buckets
                 ],
+                search_params=valuesearch_params,
             )
 
     def _valuesearch_iri_result(self, iri_bucket):
@@ -664,7 +662,7 @@ def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]:
         else:
             raise ValueError(f'invalid date filter operator (got {search_filter.operator})')
 
-    def _cardsearch_sort(self, sort_list: tuple[SortParam]):
+    def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]):
         if not sort_list:
             return None
         return [
@@ -683,12 +681,12 @@ def _cardsearch_sort(self, sort_list: tuple[SortParam]):
             for _sortparam in sort_list
         ]
 
-    def _cardsearch_response(
+    def _cardsearch_handle(
         self,
         cardsearch_params: CardsearchParams,
         es8_response: dict,
         cursor: OffsetCursor,
-    ) -> CardsearchResponse:
+    ) -> CardsearchHandle:
         _es8_total = es8_response['hits']['total']
         if _es8_total['relation'] != 'eq':
             cursor.total_count = MANY_MORE
@@ -717,11 +715,11 @@ def _cardsearch_response(
         for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']:
             _path = tuple(json.loads(_bucket['key']))
             _relatedproperty_by_path[_path].usage_count += _bucket['doc_count']
-        return CardsearchResponse(
+        return CardsearchHandle(
             cursor=cursor,
             search_result_page=_results,
             related_propertypath_results=_relatedproperty_list,
-            cardsearch_params=cardsearch_params,
+            search_params=cardsearch_params,
         )
 
     def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]:
@@ -833,7 +831,7 @@ def _inner_hits(self, *, highlight_query=None) -> dict:
 
 def _should_skip_card(indexcard_rdf, rdfdoc):
     # skip cards without some value for name/title/label
-    return not any(rdfdoc.q(indexcard_rdf.focus_iri, NAMELIKE_PROPERTIES))
+    return not any(rdfdoc.q(indexcard_rdf.focus_iri, osfmap.NAMELIKE_PROPERTIES))
 
 
 def _bucketlist(agg_result: dict) -> list[str]:
@@ -911,17 +909,17 @@ def for_iri_at_path(cls, path: tuple[str, ...], iri: str, rdfdoc):
             # TODO: don't discard language for name/title/label
             name_text=frozenset(
                 _text.unicode_value
-                for _text in rdfdoc.q(iri, NAME_PROPERTIES)
+                for _text in rdfdoc.q(iri, osfmap.NAME_PROPERTIES)
                 if isinstance(_text, primitive_rdf.Literal)
             ),
             title_text=frozenset(
                 _text.unicode_value
-                for _text in rdfdoc.q(iri, TITLE_PROPERTIES)
+                for _text in rdfdoc.q(iri, osfmap.TITLE_PROPERTIES)
                 if isinstance(_text, primitive_rdf.Literal)
             ),
             label_text=frozenset(
                 _text.unicode_value
-                for _text in rdfdoc.q(iri, LABEL_PROPERTIES)
+                for _text in rdfdoc.q(iri, osfmap.LABEL_PROPERTIES)
                 if isinstance(_text, primitive_rdf.Literal)
             ),
         )
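The page_size -> bounded_page_size switch throughout this file pairs with the "allow absurd page sizes" bullet: a cursor may carry an arbitrarily large requested page size while queries send elasticsearch a bounded one. A guess at the cursor shape -- OffsetCursor, start_offset, page_size, and bounded_page_size appear in these diffs; the concrete bound and defaults are assumptions:

```python
import dataclasses

MAX_OFFSETTED_PAGE_SIZE = 10_000  # assumed bound (elasticsearch caps from+size by default)

@dataclasses.dataclass
class OffsetCursor:  # sketch only; the real class lives elsewhere in trove
    start_offset: int = 0
    page_size: float = 13  # may be absurdly large, per this commit

    @property
    def bounded_page_size(self) -> int:
        # what actually gets sent as the es8 `size` (and agg sizes)
        return int(min(self.page_size, MAX_OFFSETTED_PAGE_SIZE))
```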