Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Use one formula to calculate cosine similarity #2375

Open
wants to merge 1 commit into
base: 2.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Allow method parameter override for training based indices (#2290)[https://github.com/opensearch-project/k-NN/pull/2290]
- Optimizes lucene query execution to prevent unnecessary rewrites (#2305)[https://github.com/opensearch-project/k-NN/pull/2305]
- Add check to directly use ANN Search when filters match all docs. (#2320)[https://github.com/opensearch-project/k-NN/pull/2320]
- Use one formula to calculate cosine similarity (#2357)[https://github.com/opensearch-project/k-NN/pull/2357]
### Bug Fixes
* Fixing the bug when a segment has no vector field present for disk based vector search (#2282)[https://github.com/opensearch-project/k-NN/pull/2282]
* Fix for NPE while merging segments after all the vector fields docs are deleted (#2365)[https://github.com/opensearch-project/k-NN/pull/2365]
Expand Down
14 changes: 13 additions & 1 deletion src/main/java/org/opensearch/knn/index/SpaceType.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,21 @@ public float scoreToDistanceTranslation(float score) {
}
},
COSINESIMIL("cosinesimil") {
/**
 * Translates the raw cosine score returned by the underlying library into a Lucene compatible score.
 * <p>
 * Cosine similarity has a range of [-1, 1], where -1 means the vectors point in diametrically opposite directions
 * and 1 means they are identical in direction and perfectly similar. In Lucene, scores have to be in the range of
 * [0, Float.MAX_VALUE]. Hence, to move the range from [-1, 1] into [0, Float.MAX_VALUE], we convert using the
 * formula adopted by Lucene, as mentioned here:
 * https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
 * <p>
 * We expect raw score = 1 - cosine(x,y). If the underlying library returns a different range, or something other
 * than the expected raw score, it should override this method to either provide a valid range or convert its raw
 * score to the 1 - cosine(x,y) form and then call this method.
 *
 * @param rawScore score returned from underlying library, expected to be 1 - cosine(x,y)
 * @return Lucene scaled score
 */
@Override
public float scoreTranslation(float rawScore) {
return 1 / (1 + rawScore);
return Math.max((2.0F - rawScore) / 2.0F, 0.0F);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

package org.opensearch.knn.index.mapper;

import org.opensearch.Version;
import org.opensearch.knn.index.engine.KNNMethodContext;
import org.opensearch.knn.index.engine.qframe.QuantizationConfig;

Expand Down Expand Up @@ -62,4 +63,12 @@ default QuantizationConfig getQuantizationConfig() {
* @return the dimension of the index; for model based indices, it will be null
*/
int getDimension();

/**
 * Returns the version the index was created with.
 * <p>
 * This default implementation returns {@link Version#CURRENT}; implementations that know the actual
 * creation version can override it.
 *
 * @return index created Version
 */
default Version getIndexCreatedVersion() {
return Version.CURRENT;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnByteVectorField;
import org.apache.lucene.document.KnnFloatVectorField;
import org.opensearch.Version;
import org.opensearch.common.Explicit;
import org.opensearch.knn.index.KNNVectorSimilarityFunction;
import org.opensearch.knn.index.VectorDataType;
Expand Down Expand Up @@ -73,6 +74,11 @@ public Mode getMode() {
public CompressionLevel getCompressionLevel() {
return knnMethodConfigContext.getCompressionLevel();
}

// Reports the created version carried by knnMethodConfigContext instead of the interface default.
@Override
public Version getIndexCreatedVersion() {
return knnMethodConfigContext.getVersionCreated();
}
}
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.VectorEncoding;
import org.opensearch.Version;
import org.opensearch.common.Explicit;
import org.opensearch.common.xcontent.XContentFactory;
import org.opensearch.knn.index.SpaceType;
Expand Down Expand Up @@ -86,6 +87,11 @@ public CompressionLevel getCompressionLevel() {
public QuantizationConfig getQuantizationConfig() {
return quantizationConfig;
}

// Reports the created version carried by knnMethodConfigContext instead of the interface default.
@Override
public Version getIndexCreatedVersion() {
return knnMethodConfigContext.getVersionCreated();
}
}
);
return new MethodFieldMapper(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ public QuantizationConfig getQuantizationConfig() {
return quantizationConfig;
}

// Reports indexCreatedVersion as-is instead of the interface default.
@Override
public Version getIndexCreatedVersion() {
return indexCreatedVersion;
}

// ModelMetadata relies on cluster state which may not be available during field mapper creation. Thus,
// we lazily initialize it.
private void initFromModelMetadata() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import lombok.Getter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.opensearch.Version;
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.knn.index.SpaceType;
import org.opensearch.knn.index.VectorDataType;
Expand Down Expand Up @@ -69,7 +70,7 @@ public KNNFieldSpace(
) {
KNNVectorFieldType knnVectorFieldType = toKNNVectorFieldType(fieldType, spaceName, supportingVectorDataTypes);
this.processedQuery = getProcessedQuery(query, knnVectorFieldType);
this.scoringMethod = getScoringMethod(this.processedQuery);
this.scoringMethod = getScoringMethod(this.processedQuery, knnVectorFieldType.getKnnMappingConfig().getIndexCreatedVersion());
}

public ScoreScript getScoreScript(
Expand Down Expand Up @@ -122,6 +123,10 @@ protected float[] getProcessedQuery(final Object query, final KNNVectorFieldType

/**
 * Builds the scoring function used for the given processed query.
 *
 * @param processedQuery query vector after processing
 * @return function that is applied to two float vectors and yields a score
 */
protected abstract BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery);

/**
 * Version-aware variant of {@link #getScoringMethod(float[])}.
 * <p>
 * This default implementation ignores {@code indexCreatedVersion} and delegates to the single-argument
 * overload. Subclasses whose scoring depends on the index creation version can override it.
 *
 * @param processedQuery query vector after processing
 * @param indexCreatedVersion version the index was created with; unused by this default implementation
 * @return function that is applied to two float vectors and yields a score
 */
protected BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery, Version indexCreatedVersion) {
return getScoringMethod(processedQuery);
}

}

class L2 extends KNNFieldSpace {
Expand All @@ -141,9 +146,29 @@ public CosineSimilarity(Object query, MappedFieldType fieldType) {
}

@Override
protected BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery) {
protected BiFunction<float[], float[], Float> getScoringMethod(float[] processedQuery) {
return getScoringMethod(processedQuery, Version.CURRENT);
}

/**
 * Builds the cosine scoring function, choosing the formula from the index creation version.
 * <p>
 * The query vector is validated against {@code SpaceType.COSINESIMIL} and its squared magnitude is computed
 * once up front so that every invocation of the returned function can reuse it.
 *
 * @param processedQuery query vector after processing
 * @param indexCreatedVersion version the index was created with
 * @return function that is applied to two float vectors and yields the cosine based score
 */
@Override
protected BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery, Version indexCreatedVersion) {
    SpaceType.COSINESIMIL.validateVector(processedQuery);
    final float querySquaredMagnitude = getVectorMagnitudeSquared(processedQuery);

    final boolean usesLuceneFormula = indexCreatedVersion.onOrAfter(Version.V_2_19_0);
    if (!usesLuceneFormula) {
        // Indices created before 2.19.0 keep the previous score: 1 + cos θ
        return (float[] q, float[] v) -> 1 + KNNScoringUtil.cosinesimilOptimized(q, v, querySquaredMagnitude);
    }

    // Indices created on or after 2.19.0 use the same formula as Lucene, see
    // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
    //
    // OS Score = ( 2 − cosineSimil ) / 2, where cosineSimil = 1 − cos θ
    //          = ( 2 − ( 1 − cos θ ) ) / 2
    //          = ( 1 + cos θ ) / 2
    return (float[] q, float[] v) -> {
        final float cosine = KNNScoringUtil.cosinesimilOptimized(q, v, querySquaredMagnitude);
        return Math.max((1 + cosine) / 2.0F, 0);
    };
}
}
Expand Down
55 changes: 55 additions & 0 deletions src/test/java/org/opensearch/knn/index/NmslibIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,61 @@ public void testEndToEnd() throws Exception {
fail("Graphs are not getting evicted");
}

/**
 * Checks that two identical vectors stored in the same NMSLIB cosine index get the same score from one
 * k-NN query.
 * <p>
 * The first document is indexed with the index's initial settings. The approximate threshold setting is then
 * changed to -1 to skip building the graph before the duplicate is indexed. As the test name indicates, the
 * query is therefore expected to cover both the approximate and the exact search path, and both must produce
 * the same score for the same vector.
 */
public void testEndToEnd_withApproxAndExactSearch_inSameIndex_ForCosineSpaceType() throws Exception {
    String indexName = "test-index-1";
    String fieldName = "test-field-1";
    SpaceType spaceType = SpaceType.COSINESIMIL;
    Integer dimension = testData.indexData.vectors[0].length;

    // Create an index
    XContentBuilder builder = XContentFactory.jsonBuilder()
        .startObject()
        .startObject("properties")
        .startObject(fieldName)
        .field("type", "knn_vector")
        .field("dimension", dimension)
        .field(KNNConstants.METHOD_PARAMETER_SPACE_TYPE, spaceType.getValue())
        .startObject(KNNConstants.KNN_METHOD)
        .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW)
        .field(KNNConstants.KNN_ENGINE, KNNEngine.NMSLIB.getName())
        .endObject()
        .endObject()
        .endObject()
        .endObject();

    String mapping = builder.toString();

    createKnnIndex(indexName, buildKNNIndexSettings(0), mapping);

    // Index one document
    addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray());

    // Assert we have the right number of documents in the index
    refreshAllIndices();
    assertEquals(1, getDocCount(indexName));
    // update threshold setting to skip building graph
    updateIndexSettings(indexName, Settings.builder().put(KNNSettings.INDEX_KNN_ADVANCED_APPROXIMATE_THRESHOLD, -1));
    // add duplicate document with different id
    addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray());
    assertEquals(2, getDocCount(indexName));
    final int k = 2;
    // search index
    Response response = searchKNNIndex(
        indexName,
        KNNQueryBuilder.builder().fieldName(fieldName).vector(testData.queries[0]).k(k).build(),
        k
    );
    String responseBody = EntityUtils.toString(response.getEntity());
    List<KNNResult> knnResults = parseSearchResponse(responseBody, fieldName);
    assertEquals(k, knnResults.size());

    List<Float> actualScores = parseSearchResponseScore(responseBody, fieldName);

    // both documents hold the same vector, so they should have identical scores
    assertEquals("duplicate vectors should receive identical scores", actualScores.get(0), actualScores.get(1), 0.001);
}

@SneakyThrows
private void validateSearch(
final String indexName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.util.Locale;

import lombok.SneakyThrows;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.knn.KNNTestCase;
import org.opensearch.knn.index.engine.KNNMethodContext;
Expand Down Expand Up @@ -86,7 +87,11 @@ public void testCosineSimilarity_whenValid_thenSucceed() {
getMappingConfigForMethodMapping(knnMethodContext, 3)
);
KNNScoringSpace.CosineSimilarity cosineSimilarity = new KNNScoringSpace.CosineSimilarity(arrayListQueryObject, fieldType);
assertEquals(2F, cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), 0.1F);
assertEquals(
VectorSimilarityFunction.COSINE.compare(arrayFloat2, arrayFloat),
cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat),
0.1F
);

// invalid zero vector
final List<Float> queryZeroVector = List.of(0.0f, 0.0f, 0.0f);
Expand Down
Loading