[wip/analysis][ENG-6075] trove-search performance analysis #824

Closed
wants to merge 21 commits
171 changes: 171 additions & 0 deletions _TODO.txt
@@ -0,0 +1,171 @@
using trove for a dashboard of metrics
======================================

on frontend...

a dashboard has a consistent `cardSearchFilter` set

for each metadata property (or property-path) of interest,
make a request to `/trove/index-value-search` with that `valueSearchPropertyPath`
and the dashboard's `cardSearchFilter` set
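
a rough sketch of one such request (python with `requests`; the base url, property
paths, and filter values here are made up for illustration):

    import requests

    # hypothetical dashboard-wide filter: only cards affiliated with one institution
    DASHBOARD_CARD_SEARCH_FILTER = {
        'cardSearchFilter[affiliation]': 'https://ror.org/05d5mza29',  # made-up example iri
    }

    def fetch_value_counts(property_path: str) -> dict:
        '''ask trove which iri values appear at `property_path` among matching cards'''
        _response = requests.get(
            'https://share.example.org/trove/index-value-search',  # assumed deployment url
            params={
                'valueSearchPropertyPath': property_path,
                **DASHBOARD_CARD_SEARCH_FILTER,
            },
        )
        _response.raise_for_status()
        return _response.json()

    # one request per metadata property (or property-path) of interest:
    for _path in ('creator', 'subject', 'rights'):
        print(_path, fetch_value_counts(_path))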



denormalized IndexStrategy
==========================

current mappings:
simple:
indexcard_uuid
focus_iri
suffuniq_focus_iri
source_record_identifier
source_config_label
iri_paths_present
iri_paths_present_suffuniq
flattened:
flat_iri_values
flat_iri_values_suffuniq
  nested: (THE PROBLEM: each nested value is indexed as its own hidden doc)
nested_iri...
nested_date...
nested_text...


to denormalize for performance (removing (most) `nested` mappings)
while supporting existing api used by osf-search...

edges to consider:
- `cardSearchText[property.path]`
- dynamic template for text values per property-path (...to limited depth?)
- `valueSearchFilter[resourceType]`
- dynamic template for iri values per resource-type?
- `valueSearchText`
- ...new index for value-search?
- ...maybe can use the same dynamic fields added for `cardSearchText[property.path]`?
...but how to keep the text associated with the iri value...
- ...could keep the old `nested` garbage around, but only use it when `valueSearchText`?
- `cardSearchFilter[sameAs][iri-prefix]=https://orcid.org/`
- new filter operator
- `cardSearchText[*.*.*]`, `cardSearchFilter[*.*.*]`
- dynamic templates for values by depth?
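
for the "dynamic template for text values per property-path" edge above, a rough
elasticsearch mapping sketch (a python dict for the index's `mappings` body; the
`dynamics.text_by_propertypath` field name is a placeholder, not settled):

    TEXT_BY_PROPERTYPATH_MAPPINGS = {
        'dynamic_templates': [{
            'text_by_propertypath': {
                # matches any field under the dynamics.text_by_propertypath object,
                # e.g. dynamics.text_by_propertypath.dcterms_title
                'path_match': 'dynamics.text_by_propertypath.*',
                'mapping': {'type': 'text', 'index_options': 'offsets'},
            },
        }],
        'properties': {
            'dynamics': {'type': 'object', 'dynamic': True},
        },
    }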


possible future card-index mappings:
simple:
indexcard_uuid
suid.source_config_label
suid.source_record_identifier
focus_iri.exact
focus_iri.suffuniq
propertypaths_present
flattened:
iri_by_propertypath.exact.*
iri_by_propertypath.suffuniq.*
iri_by_propertypath_length.exact.*
iri_by_propertypath_length.suffuniq.*
dynamic: (used instead of the old nested fields for most queries)
dynamics.text_by_propertypath.*
dynamics.text_by_propertypath_length.*
dynamics.date_by_propertypath.*
(maybe) dynamics.number_by_propertypath.*
nested: (ONLY for index-value-search with `valueSearchText` or `valueSearchFilter[resourceType]`)
iri_usage
iri.exact
iri.suffuniq
propertypath
propertypath_length
type_iri.exact
type_iri.suffuniq
name_text
title_text
label_text
      namelike_text (combines the three above)
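
a rough sketch of how a value-search with `valueSearchText` might use that nested
field (elasticsearch query body as a python dict; field names follow the listing
above, but none of this is settled):

    def value_search_query(card_search_filter_clauses: list, value_search_text: str) -> dict:
        '''match cards by the usual filters, then aggregate iri values by name-like text'''
        return {
            'size': 0,  # only aggregations matter here
            'query': {'bool': {'filter': card_search_filter_clauses}},
            'aggs': {
                'in_nested_iri_usage': {
                    'nested': {'path': 'iri_usage'},
                    'aggs': {
                        'matching_usages': {
                            'filter': {'match': {'iri_usage.namelike_text': value_search_text}},
                            'aggs': {
                                'value_iris': {'terms': {'field': 'iri_usage.iri.exact'}},
                            },
                        },
                    },
                },
            },
        }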


multiple strategies?
====================
after reluctantly accepting `nested` for certain value-searches... how about multiple index strategies?

select suitable index-strategy based on query

most queries go to a more constrained index-strategy with a smaller, faster,
completely non-nested index (calling it "trovesearch_indexcard")

queries that need the extra complexity go to a more complex index-strategy
with larger, slower index (calling it "trovesearch_excessive")

however... even simple value-searches need to get metadata about each iri value
(at least `rdf:type` and something name-like (`dcterms:title`, `foaf:name`, `rdfs:label`...))
-- without the `nested` mapping, there's not a good way (that i see) to do that in a single query

so how about a third index strategy just for looking up iri-value metadata?
(calling it "trovesearch_irivalues")
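
a minimal sketch of that query-based routing (one possible reading of the above;
strategy names are the working names, and query params are treated as a flat dict
for brevity):

    def choose_strategy_name(queryparams: dict) -> str:
        '''pick an index strategy based on what the query actually needs'''
        if 'valueSearchText' in queryparams or 'valueSearchFilter[resourceType]' in queryparams:
            return 'trovesearch_excessive'  # needs the nested iri-value fields
        if 'valueSearchPropertyPath' in queryparams:
            return 'trovesearch_irivalues'  # simple value-search: iri-value metadata lookup
        return 'trovesearch_indexcard'  # common case: smaller, fully non-nested index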


trovesearch_indexcard (one per indexcard):
simple:
indexcard_iri
indexcard_pk
suid.source_config_label
suid.source_record_identifier
focus_iri.exact
focus_iri.suffuniq
propertypaths_present
flattened:
iri_by_propertypath.*
iri_by_depth.*
dynamic:
dynamics.text_by_propertypath.*
dynamics.text_by_depth.*
dynamics.date_by_propertypath.*
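
for example, a `cardSearchFilter[creator][]=<iri>` clause could become a single term
query against the flattened field (sketch only; how propertypaths get encoded as
flattened-field keys isn't settled):

    def card_filter_clause(propertypath_key: str, value_iri: str) -> dict:
        '''one filter clause against the flattened iri_by_propertypath field'''
        # flattened fields are queried by sub-key, e.g. iri_by_propertypath.creator
        return {'term': {f'iri_by_propertypath.{propertypath_key}': value_iri}}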


trovesearch_irivalues (one per (indexcard, iri) pair):
simple:
iri.exact (includes sameAs synonyms)
iri.suffuniq (includes sameAs synonyms)
indexcard_iri
indexcard_pk
propertypath_from_focus
depth_from_focus
flattened:
iri_by_relative_propertypath.*
iri_by_relative_depth.*
dynamic:
dynamics.text_by_relative_propertypath.*
dynamics.text_by_relative_depth.*
dynamics.date_by_relative_propertypath.*


trovesearch_excessive:
(all fields from trovesearch_indexcard, plus a nested field with
fields from (or similar to) trovesearch_irivalues)


...ok maybe, but revisiting "trovesearch_irivalues (one per (indexcard, iri) pair)",
that's a looot of documents, awfully wasteful for the common case of commonly used iris,
and trickier to remove docs for iri values no longer used

returning to an old idea discarded from the first "index-card-search" implementation...
how about an index with (only) one doc per referenced iri? would need to:
- use IDENTIFIER_USAGE/BACKFILL_IDENTIFIER_USAGE messages
emit after non-backfill indexcard indexing, perhaps deduped within each message chunk
- index strategy should, for each identifier message:
query for indexcards that include that identifier,
aggregate metadata included in those indexcards about that identifier,
store document describing that identifier and its usage

important to account for erroneous sameAs assertions (make it easy to undo)
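
a rough sketch of that per-identifier flow (the helper names here are stand-ins, not
actual share/trove code):

    def handle_identifier_usage_message(identifier_iri: str) -> None:
        '''hypothetical handler for one IDENTIFIER_USAGE message'''
        # 1. find indexcards that mention this identifier (however that lookup works)
        _indexcards = query_indexcards_mentioning(identifier_iri)
        if not _indexcards:
            # no longer used anywhere -- remove the doc instead of leaving it stale
            delete_iri_value_doc(doc_id=identifier_iri)
            return
        # 2. aggregate what those indexcards say about the identifier
        _doc = {
            'iri': identifier_iri,
            'used_at_propertypath': sorted({
                _path
                for _card in _indexcards
                for _path in _card.propertypaths_to(identifier_iri)
            }),
            # ...plus iri/text/date fields keyed by relative propertypath,
            # per the revised mapping below
        }
        # 3. store (or overwrite) the single doc describing this identifier and its usage
        index_iri_value_doc(doc_id=identifier_iri, doc=_doc)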

revised trovesearch_irivalues (one per iri)
simple:
iri
used_at_propertypath
flattened:
iri_by_relative_propertypath.*
iri_by_relative_depth.*
dynamic:
dynamics.text_by_relative_propertypath.*
dynamics.text_by_relative_depth.*
dynamics.date_by_relative_propertypath.*
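
a (hypothetical) example of one such per-iri doc, aggregated from every indexcard
that mentions a given identifier (the identifier and the propertypath key-encoding
here are made up):

    EXAMPLE_IRIVALUE_DOC = {
        'iri': 'https://orcid.org/0000-0002-1825-0097',
        'used_at_propertypath': ['creator', 'contributor'],
        'iri_by_relative_propertypath': {'rdf_type': ['http://xmlns.com/foaf/0.1/Person']},
        'dynamics': {
            'text_by_relative_propertypath': {'foaf_name': ['Josiah Carberry']},
        },
    }
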
4 changes: 2 additions & 2 deletions api/search/views.py
@@ -8,7 +8,7 @@

from api import authentication
from share.search import exceptions
from share.search.index_strategy import IndexStrategy
from share.search import index_strategy


class Sharev2ElasticSearchView(views.APIView):
@@ -32,7 +32,7 @@ def _handle_request(self, request):
if 'scroll' in queryparams:
return http.HttpResponseForbidden(reason='Scroll is not supported.')
try:
specific_index = IndexStrategy.get_for_sharev2_search(requested_index_strategy)
specific_index = index_strategy.get_index_for_sharev2_search(requested_index_strategy)
except exceptions.IndexStrategyError as error:
raise http.Http404(str(error))
try:
8 changes: 5 additions & 3 deletions api/views/feeds.py
@@ -10,7 +10,7 @@
import pendulum
import sentry_sdk

from share.search import IndexStrategy
from share.search import index_strategy
from share.search.exceptions import IndexStrategyError
from share.util.xml import strip_illegal_xml_chars

@@ -34,14 +34,16 @@ class MetadataRecordsRSS(Feed):
description = 'Updates to the SHARE open dataset'
author_name = 'SHARE'

_search_index: index_strategy.IndexStrategy.SpecificIndex

def title(self, obj):
query = json.dumps(obj.get('query', 'All'))
return prepare_string('SHARE: Atom feed for query: {}'.format(query))

def get_object(self, request):
self._order = request.GET.get('order')
elastic_query = request.GET.get('elasticQuery')
self._index_strategy = IndexStrategy.get_for_sharev2_search(request.GET.get('indexStrategy'))
self._search_index = index_strategy.get_index_for_sharev2_search(request.GET.get('indexStrategy'))

if self._order not in {'date_modified', 'date_updated', 'date_created', 'date_published'}:
self._order = 'date_modified'
@@ -62,7 +64,7 @@ def get_object(self, request):

def items(self, obj):
try:
json_response = self._index_strategy.pls_handle_search__sharev2_backcompat(
json_response = self._search_index.pls_handle_search__sharev2_backcompat(
request_body=obj,
)
except IndexStrategyError:
45 changes: 4 additions & 41 deletions project/settings.py
@@ -314,52 +314,15 @@ def split(string, delim):
'TIMEOUT': int(os.environ.get('ELASTICSEARCH_TIMEOUT', '45')),
'CHUNK_SIZE': int(os.environ.get('ELASTICSEARCH_CHUNK_SIZE', 2000)),
'MAX_RETRIES': int(os.environ.get('ELASTICSEARCH_MAX_RETRIES', 7)),
'INDEX_STRATEGIES': {}, # populated below based on environment
}
ELASTICSEARCH5_URL = (
os.environ.get('ELASTICSEARCH5_URL')
or os.environ.get('ELASTICSEARCH_URL')
or os.environ.get('ELASTICSEARCH_URL') # backcompat
)
if ELASTICSEARCH5_URL:
ELASTICSEARCH['INDEX_STRATEGIES']['sharev2_elastic5'] = {
'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic5.Sharev2Elastic5IndexStrategy',
'CLUSTER_SETTINGS': {
'URL': ELASTICSEARCH5_URL,
},
}
ELASTICSEARCH8_URL = os.environ.get('ELASTICSEARCH8_URL')
if ELASTICSEARCH8_URL:
ELASTICSEARCH8_CERT_PATH = os.environ.get('ELASTICSEARCH8_CERT_PATH')
ELASTICSEARCH8_USERNAME = os.environ.get('ELASTICSEARCH8_USERNAME', 'elastic')
ELASTICSEARCH8_SECRET = os.environ.get('ELASTICSEARCH8_SECRET')
ELASTICSEARCH8_CLUSTER_SETTINGS = {
'URL': ELASTICSEARCH8_URL,
'AUTH': (
(ELASTICSEARCH8_USERNAME, ELASTICSEARCH8_SECRET)
if ELASTICSEARCH8_SECRET is not None
else None
),
'CERT_PATH': ELASTICSEARCH8_CERT_PATH,
}
ELASTICSEARCH['INDEX_STRATEGIES'].update({
'sharev2_elastic8': {
'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic8.Sharev2Elastic8IndexStrategy',
'CLUSTER_SETTINGS': ELASTICSEARCH8_CLUSTER_SETTINGS,
},
'trove_indexcard_flats': {
'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy',
'CLUSTER_SETTINGS': ELASTICSEARCH8_CLUSTER_SETTINGS,
},
})
DEFAULT_INDEX_STRATEGY_FOR_LEGACY_SEARCH = (
'sharev2_elastic5'
if ELASTICSEARCH5_URL
else (
'sharev2_elastic8'
if ELASTICSEARCH8_URL
else None
)
)
ELASTICSEARCH8_CERT_PATH = os.environ.get('ELASTICSEARCH8_CERT_PATH')
ELASTICSEARCH8_USERNAME = os.environ.get('ELASTICSEARCH8_USERNAME', 'elastic')
ELASTICSEARCH8_SECRET = os.environ.get('ELASTICSEARCH8_SECRET')

# Seconds, not an actual celery settings
CELERY_RETRY_BACKOFF_BASE = int(os.environ.get('CELERY_RETRY_BACKOFF_BASE', 2 if DEBUG else 10))
7 changes: 6 additions & 1 deletion share/admin/__init__.py
@@ -17,7 +17,7 @@
from share.admin.celery import CeleryTaskResultAdmin
from share.admin.jobs import HarvestJobAdmin
from share.admin.readonly import ReadOnlyAdmin
from share.admin.search import search_indexes_view
from share.admin.search import search_indexes_view, search_index_mappings_view
from share.admin.util import TimeLimitedPaginator, linked_fk, linked_many, SourceConfigFilter
from share.harvest.scheduler import HarvestScheduler
from share.models import (
@@ -49,6 +49,11 @@ def get_urls(self):
self.admin_view(search_indexes_view),
name='search-indexes',
),
path(
'search-index-mappings/<index_name>',
self.admin_view(search_index_mappings_view),
name='search-index-mappings',
),
*super().get_urls(),
]

Expand Down