opensearch-project · opensearch-trigger-bot · Jan 6, 2025
@@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Allow method parameter override for training based indices (#2290)[https://github.com/opensearch-project/k-NN/pull/2290]
 - Optimizes lucene query execution to prevent unnecessary rewrites (#2305)[https://github.com/opensearch-project/k-NN/pull/2305]
 - Add check to directly use ANN Search when filters match all docs. (#2320)[https://github.com/opensearch-project/k-NN/pull/2320]
+- Use one formula to calculate cosine similarity (#2357)[https://github.com/opensearch-project/k-NN/pull/2357]
 ### Bug Fixes
 * Fixing the bug when a segment has no vector field present for disk based vector search (#2282)[https://github.com/opensearch-project/k-NN/pull/2282]
 * Fix for NPE while merging segments after all the vector fields docs are deleted (#2365)[https://github.com/opensearch-project/k-NN/pull/2365]

@@ -60,9 +60,21 @@ public float scoreToDistanceTranslation(float score) {
         }
     },
     COSINESIMIL("cosinesimil") {
+        /**
+         * Cosine similarity has a range of [-1, 1], where -1 means the vectors point in diametrically opposite directions and 1
+         * means they are identical in direction and perfectly similar. In Lucene, scores have to be in the range [0, Float.MAX_VALUE].
+         * Hence, to move the range from [-1, 1] into [0, Float.MAX_VALUE], we convert using the following formula, which is the one
+         * adopted by Lucene, as mentioned here:
+         * https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
+         * We expect raw score = 1 - cosine(x, y). If the underlying library returns a different range or a raw score in another form,
+         * this method should be overridden to either provide a valid range or convert the raw score to 1 - cosine and then call this method.
+         *
+         * @param rawScore score returned from underlying library
+         * @return Lucene scaled score
+         */
         @Override
         public float scoreTranslation(float rawScore) {
-            return 1 / (1 + rawScore);
+            return Math.max((2.0F - rawScore) / 2.0F, 0.0F);
         }
 
         @Override

@@ -5,6 +5,7 @@
 
 package org.opensearch.knn.index.mapper;
 
+import org.opensearch.Version;
 import org.opensearch.knn.index.engine.KNNMethodContext;
 import org.opensearch.knn.index.engine.qframe.QuantizationConfig;
 
@@ -62,4 +63,12 @@ default QuantizationConfig getQuantizationConfig() {
      * @return the dimension of the index; for model based indices, it will be null
      */
     int getDimension();
+
+    /**
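+     * Returns the version in which the index was created.
+     * @return the index-created version; {@link Version#CURRENT} unless an implementation overrides it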
+     */
+    default Version getIndexCreatedVersion() {
+        return Version.CURRENT;
+    }
 }
@@ -17,6 +17,7 @@
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.KnnByteVectorField;
 import org.apache.lucene.document.KnnFloatVectorField;
+import org.opensearch.Version;
 import org.opensearch.common.Explicit;
 import org.opensearch.knn.index.KNNVectorSimilarityFunction;
 import org.opensearch.knn.index.VectorDataType;
@@ -73,6 +74,11 @@ public Mode getMode() {
                 public CompressionLevel getCompressionLevel() {
                     return knnMethodConfigContext.getCompressionLevel();
                 }
+
+                @Override
+                public Version getIndexCreatedVersion() {
+                    return knnMethodConfigContext.getVersionCreated();
+                }
             }
         );
 

@@ -8,6 +8,7 @@
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.VectorEncoding;
+import org.opensearch.Version;
 import org.opensearch.common.Explicit;
 import org.opensearch.common.xcontent.XContentFactory;
 import org.opensearch.knn.index.SpaceType;
@@ -86,6 +87,11 @@ public CompressionLevel getCompressionLevel() {
                 public QuantizationConfig getQuantizationConfig() {
                     return quantizationConfig;
                 }
+
+                @Override
+                public Version getIndexCreatedVersion() {
+                    return knnMethodConfigContext.getVersionCreated();
+                }
             }
         );
         return new MethodFieldMapper(

@@ -107,6 +107,11 @@ public QuantizationConfig getQuantizationConfig() {
                 return quantizationConfig;
             }
 
+            @Override
+            public Version getIndexCreatedVersion() {
+                return indexCreatedVersion;
+            }
+
             // ModelMetadata relies on cluster state which may not be available during field mapper creation. Thus,
             // we lazily initialize it.
             private void initFromModelMetadata() {

@@ -8,6 +8,7 @@
 import lombok.Getter;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.IndexSearcher;
+import org.opensearch.Version;
 import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.knn.index.SpaceType;
 import org.opensearch.knn.index.VectorDataType;
@@ -69,7 +70,7 @@ public KNNFieldSpace(
         ) {
             KNNVectorFieldType knnVectorFieldType = toKNNVectorFieldType(fieldType, spaceName, supportingVectorDataTypes);
             this.processedQuery = getProcessedQuery(query, knnVectorFieldType);
-            this.scoringMethod = getScoringMethod(this.processedQuery);
+            this.scoringMethod = getScoringMethod(this.processedQuery, knnVectorFieldType.getKnnMappingConfig().getIndexCreatedVersion());
         }
 
         public ScoreScript getScoreScript(
@@ -122,6 +123,10 @@ protected float[] getProcessedQuery(final Object query, final KNNVectorFieldType
 
         protected abstract BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery);
 
+        protected BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery, Version indexCreatedVersion) {
+            return getScoringMethod(processedQuery);
+        }
+
     }
 
     class L2 extends KNNFieldSpace {
@@ -141,9 +146,29 @@ public CosineSimilarity(Object query, MappedFieldType fieldType) {
         }
 
         @Override
-        protected BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery) {
+        protected BiFunction<float[], float[], Float> getScoringMethod(float[] processedQuery) {
+            return getScoringMethod(processedQuery, Version.CURRENT);
+        }
+
+        @Override
+        protected BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery, Version indexCreatedVersion) {
             SpaceType.COSINESIMIL.validateVector(processedQuery);
             float qVectorSquaredMagnitude = getVectorMagnitudeSquared(processedQuery);
+            if (indexCreatedVersion.onOrAfter(Version.V_2_19_0)) {
+                // To be consistent, for indices created on or after 2.19.0 we use the same formula as Lucene:
+                // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
+                // Indices created before 2.19.0 keep the previous formula (1 + cos θ) returned below.
+                //
+                // OS Score = ( 2 − cosineSimil ) / 2
+                // Here cosineSimil = 1 − cos θ; substituting this into the formula above gives
+                // OS Score = ( 2 − ( 1 − cos θ ) ) / 2
+                // which simplifies to
+                // OS Score = ( 1 + cos θ ) / 2
+                return (float[] q, float[] v) -> Math.max(
+                    ((1 + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude)) / 2.0F),
+                    0
+                );
+            }
             return (float[] q, float[] v) -> 1 + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude);
         }
     }

@@ -195,6 +195,61 @@ public void testEndToEnd() throws Exception {
         fail("Graphs are not getting evicted");
     }
 
+    public void testEndToEnd_withApproxAndExactSearch_inSameIndex_ForCosineSpaceType() throws Exception {
+        String indexName = "test-index-1";
+        String fieldName = "test-field-1";
+        SpaceType spaceType = SpaceType.COSINESIMIL;
+        Integer dimension = testData.indexData.vectors[0].length;
+
+        // Create an index
+        XContentBuilder builder = XContentFactory.jsonBuilder()
+            .startObject()
+            .startObject("properties")
+            .startObject(fieldName)
+            .field("type", "knn_vector")
+            .field("dimension", dimension)
+            .field(KNNConstants.METHOD_PARAMETER_SPACE_TYPE, spaceType.getValue())
+            .startObject(KNNConstants.KNN_METHOD)
+            .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW)
+            .field(KNNConstants.KNN_ENGINE, KNNEngine.NMSLIB.getName())
+            .endObject()
+            .endObject()
+            .endObject()
+            .endObject();
+
+        Map<String, Object> mappingMap = xContentBuilderToMap(builder);
+        String mapping = builder.toString();
+
+        createKnnIndex(indexName, buildKNNIndexSettings(0), mapping);
+
+        // Index one document
+        addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray());
+
+        // Assert we have the right number of documents in the index
+        refreshAllIndices();
+        assertEquals(1, getDocCount(indexName));
+        // update threshold setting to skip building graph
+        updateIndexSettings(indexName, Settings.builder().put(KNNSettings.INDEX_KNN_ADVANCED_APPROXIMATE_THRESHOLD, -1));
+        // add duplicate document with different id
+        addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray());
+        assertEquals(2, getDocCount(indexName));
+        final int k = 2;
+        // search index
+        Response response = searchKNNIndex(
+            indexName,
+            KNNQueryBuilder.builder().fieldName(fieldName).vector(testData.queries[0]).k(k).build(),
+            k
+        );
+        String responseBody = EntityUtils.toString(response.getEntity());
+        List<KNNResult> knnResults = parseSearchResponse(responseBody, fieldName);
+        assertEquals(k, knnResults.size());
+
+        List<Float> actualScores = parseSearchResponseScore(responseBody, fieldName);
+
+        // both documents should have identical scores
+        assertEquals(actualScores.get(0), actualScores.get(1), 0.001);
+    }
+
     @SneakyThrows
     private void validateSearch(
         final String indexName,

@@ -10,6 +10,7 @@
 import java.util.Locale;
 
 import lombok.SneakyThrows;
+import org.apache.lucene.index.VectorSimilarityFunction;
 import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.knn.KNNTestCase;
 import org.opensearch.knn.index.engine.KNNMethodContext;
@@ -86,7 +87,11 @@ public void testCosineSimilarity_whenValid_thenSucceed() {
             getMappingConfigForMethodMapping(knnMethodContext, 3)
         );
         KNNScoringSpace.CosineSimilarity cosineSimilarity = new KNNScoringSpace.CosineSimilarity(arrayListQueryObject, fieldType);
-        assertEquals(2F, cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), 0.1F);
+        assertEquals(
+            VectorSimilarityFunction.COSINE.compare(arrayFloat2, arrayFloat),
+            cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat),
+            0.1F
+        );
 
         // invalid zero vector
         final List<Float> queryZeroVector = List.of(0.0f, 0.0f, 0.0f);