Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bib 809 machine translation scoring #620

Open
wants to merge 34 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
25a9a74
Add StringSimilarityUtilities and Unit tests
kaseywright Dec 18, 2024
a01fc1b
Add ScoreResourceContentVersionSimilarityMessage
kaseywright Dec 18, 2024
ae7a53d
Add ScoreResourceContentVersionSimilarityMessagePublisher
kaseywright Dec 18, 2024
a298b51
Add GenerateResourceContentSimilarityScore Queue
kaseywright Dec 18, 2024
58cf542
Add ScoreResourceContentVersionSimilarityMessagePublisher to DI for API
kaseywright Dec 18, 2024
da6a4d9
Add enums for determining similarity scoring type and status
kaseywright Dec 18, 2024
5c29b64
WIP Add ScoreResourceContentVersionSimilarityMessageSubscriber
kaseywright Dec 18, 2024
1c1493b
Drop custom statuses for ResourceContentStatus
kaseywright Dec 18, 2024
6215477
Remove status
kaseywright Dec 19, 2024
979d8f0
Add database writes for similarity scores
kaseywright Dec 20, 2024
8c5c670
Apply similarity scoring to Publish endpoint
kaseywright Dec 20, 2024
df2311d
Move Similarity Scoring Unit tests under `Utilities`
kaseywright Dec 20, 2024
3efd08a
Rename subscriber fn to match new naming convention
kaseywright Dec 20, 2024
6caa071
Remove stopwatch from StringSimilarityUtilities
kaseywright Dec 20, 2024
401957e
Improve logging information for scoring resources
kaseywright Dec 20, 2024
a9990aa
Remove extra lines in subscriber
kaseywright Dec 20, 2024
28baf03
Updated machineId variable name for more clarity.
kaseywright Dec 20, 2024
bd6243e
Update log message when no machine translation is found for published…
kaseywright Dec 20, 2024
3a9c089
Make ResourceContentVersionSimilarityComparisonTypes singular
kaseywright Dec 20, 2024
68c83df
Refactor to simplify RCVSimilarityMessageSubscriber
kaseywright Jan 2, 2025
6a92781
Merge branch 'master' into bib-809-machine-translation-scoring
kaseywright Jan 7, 2025
e202477
Correct entity name to ResourceContentVersionSimilarityScoreEntity
kaseywright Jan 7, 2025
6b033df
Apply changes to make logged id searchable
kaseywright Jan 7, 2025
8ae563b
Refactor to allow MachineTranslation lookup on queue
kaseywright Jan 8, 2025
b244f9b
Refactor StringSimilarityUtilitiesTests.cs
kaseywright Jan 8, 2025
ff988c0
Add OrderBy for LastOrDefaultAsync
kaseywright Jan 8, 2025
6a8ff10
Merge branch 'master' into bib-809-machine-translation-scoring
kaseywright Jan 8, 2025
c0f3f92
Add QueueMessagePublisherServices
kaseywright Jan 8, 2025
306306e
Merge branch 'master' into bib-809-machine-translation-scoring
kaseywright Jan 8, 2025
adb9ab2
Update ResourceContentVersionSimilarityMessageSubscriber according to…
kaseywright Jan 10, 2025
c5bdb02
Replace FirstOrDefaultAsync with SingleOrDefaultAsync
kaseywright Jan 10, 2025
786e9a7
Update logging variable names for readability
kaseywright Jan 10, 2025
ef2f339
Reimplement LINQ query for GetMachineTranslationForSnapshot
kaseywright Jan 10, 2025
0aa2cb1
Update LINQ query to use object references
kaseywright Jan 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/Aquifer.API/Endpoints/Resources/Content/Publish/Endpoint.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using Aquifer.API.Common;
using Aquifer.API.Services;
using Aquifer.Common.Messages.Models;
using Aquifer.Common.Messages.Publishers;
using Aquifer.Data;
using Aquifer.Data.Entities;
Expand All @@ -9,7 +10,13 @@

namespace Aquifer.API.Endpoints.Resources.Content.Publish;

public class Endpoint(AquiferDbContext dbContext, IUserService userService, ITranslationMessagePublisher translationMessagePublisher, IResourceHistoryService historyService) : Endpoint<Request>
public class Endpoint(
AquiferDbContext dbContext,
IUserService userService,
ITranslationMessagePublisher translationMessagePublisher,
IResourceHistoryService historyService,
IScoreResourceContentVersionSimilarityMessagePublisher scoreResourceContentVersionSimilarityMessagePublisher,
ILogger<Endpoint> logger) : Endpoint<Request>
{
public override void Configure()
{
Expand All @@ -24,6 +31,7 @@ public override async Task HandleAsync(Request request, CancellationToken ct)
ThrowError(Helpers.InvalidUserIdResponse);
}

List<ScoreResourceContentVersionSimilarityMessage> similarityScoreMessages = [];
var contentIds = request.ContentId is not null ? [(int)request.ContentId] : request.ContentIds!;

foreach (var contentId in contentIds)
Expand Down Expand Up @@ -86,10 +94,35 @@ await Helpers.CreateNewDraft(dbContext,

await historyService.AddStatusHistoryAsync(mostRecentContentVersion, ResourceContentStatus.Complete, user.Id, ct);
}

var machineTranslation = await dbContext
kaseywright marked this conversation as resolved.
Show resolved Hide resolved
.ResourceContentVersionMachineTranslations
.AsTracking()
.FirstOrDefaultAsync(x => x.Id == mostRecentContentVersion.Id, ct);

if (machineTranslation is not null)
{
similarityScoreMessages.Add(
new ScoreResourceContentVersionSimilarityMessage(
machineTranslation.Id,
mostRecentContentVersion.Id,
ResourceContentVersionSimilarityComparisonTypes.MachineTranslationToResourceContentVersion)
);
}
else
{
logger.LogInformation($"No machine translation found for published content version {mostRecentContentVersion.Id}.");
kaseywright marked this conversation as resolved.
Show resolved Hide resolved
}
}

await dbContext.SaveChangesAsync(ct);

foreach (var similarityScoreMessage in similarityScoreMessages)
{
await scoreResourceContentVersionSimilarityMessagePublisher
.PublishScoreResourceContentVersionSimilarityMessageAsync(similarityScoreMessage, ct);
kaseywright marked this conversation as resolved.
Show resolved Hide resolved
}

await SendNoContentAsync(ct);
}
}
1 change: 1 addition & 0 deletions src/Aquifer.API/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
.AddSingleton<IResourceContentRequestTrackingMessagePublisher, ResourceContentRequestTrackingMessagePublisher>()
.AddSingleton<INotificationMessagePublisher, NotificationMessagePublisher>()
.AddSingleton<ITranslationMessagePublisher, TranslationMessagePublisher>()
.AddSingleton<IScoreResourceContentVersionSimilarityMessagePublisher, ScoreResourceContentVersionSimilarityMessagePublisher>()
.AddAzureClient(builder.Environment.IsDevelopment())
.AddFastEndpoints()
.AddResponseCaching()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
namespace Aquifer.Common.Messages.Models;

/// <summary>
/// Names the pair of sources being compared by a <c>ScoreResourceContentVersionSimilarityMessage</c>
/// (carried in its <c>ComparisonType</c> field).
/// </summary>
/// <remarks>
/// Numeric values are pinned explicitly so that reordering or inserting members cannot silently change
/// the value of an existing member. NOTE(review): presumably these values end up in serialized queue
/// messages; confirm against the queue serializer before ever renumbering.
/// </remarks>
public enum ResourceContentVersionSimilarityComparisonTypes
{
    /// <summary>
    /// Machine translation compared with a resource content version. The publish endpoint sends this
    /// with the machine translation id as the base id and the content version id as the compare id.
    /// </summary>
    MachineTranslationToResourceContentVersion = 0,

    /// <summary>Machine translation compared with a snapshot (per the member name; no usage visible here).</summary>
    MachineTranslationToSnapshot = 1,

    /// <summary>Resource content version compared with a snapshot (per the member name; no usage visible here).</summary>
    ResourceContentVersionToSnapshot = 2,

    /// <summary>Snapshot compared with another snapshot (per the member name; no usage visible here).</summary>
    SnapshotToSnapshot = 3,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
namespace Aquifer.Common.Messages.Models;

/// <summary>
/// Message describing one similarity-scoring request: two ids to compare and the kind of comparison.
/// </summary>
/// <param name="BaseVersionId">
/// Id of the base side of the comparison. What kind of record the id refers to depends on
/// <paramref name="ComparisonType"/>; the publish endpoint passes a machine translation id here for
/// <c>MachineTranslationToResourceContentVersion</c>.
/// </param>
/// <param name="CompareVersionId">
/// Id of the side compared against the base. The publish endpoint passes the published content version id.
/// </param>
/// <param name="ComparisonType">Which pair of sources the two ids refer to.</param>
public sealed record ScoreResourceContentVersionSimilarityMessage(
int BaseVersionId,
int CompareVersionId,
ResourceContentVersionSimilarityComparisonTypes ComparisonType
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using Aquifer.Common.Messages.Models;

namespace Aquifer.Common.Messages.Publishers;

/// <summary>
/// Abstraction for publishing <see cref="ScoreResourceContentVersionSimilarityMessage"/> instances.
/// </summary>
public interface IScoreResourceContentVersionSimilarityMessagePublisher
{
/// <summary>
/// Publishes a single similarity-scoring message.
/// </summary>
/// <param name="message">The scoring request to publish.</param>
/// <param name="ct">Token used to cancel the publish operation.</param>
/// <returns>A task that completes when the publish operation has finished.</returns>
Task PublishScoreResourceContentVersionSimilarityMessageAsync(ScoreResourceContentVersionSimilarityMessage message, CancellationToken ct);
}

/// <summary>
/// Queue-backed publisher that sends each similarity-scoring message to the
/// <see cref="Queues.GenerateResourceContentSimilarityScore"/> queue.
/// </summary>
public sealed class ScoreResourceContentVersionSimilarityMessagePublisher(IQueueClientFactory _queueClientFactory)
    : IScoreResourceContentVersionSimilarityMessagePublisher
{
    /// <summary>
    /// Resolves the similarity-score queue client from the factory and sends <paramref name="message"/> to it.
    /// </summary>
    /// <param name="message">The scoring request to enqueue.</param>
    /// <param name="ct">Token forwarded to both the client lookup and the send call.</param>
    public async Task PublishScoreResourceContentVersionSimilarityMessageAsync(
        ScoreResourceContentVersionSimilarityMessage message,
        CancellationToken ct)
    {
        var similarityScoreQueue = await _queueClientFactory.GetQueueClientAsync(
            Queues.GenerateResourceContentSimilarityScore,
            ct);

        await similarityScoreQueue.SendMessageAsync(message, cancellationToken: ct);
    }
}
2 changes: 2 additions & 0 deletions src/Aquifer.Common/Messages/Queues.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ public static class Queues
public const string TranslateLanguageResources = "translate-language-resources";
public const string TranslateProjectResources = "translate-project-resources";
public const string TranslateResource = "translate-resource";

public const string GenerateResourceContentSimilarityScore = "generate-resource-content-similarity-score";

public static string GetPoisonQueueName(string queueName)
{
Expand Down
117 changes: 117 additions & 0 deletions src/Aquifer.Common/Utilities/StringSimilarityUtilities.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
using System.Diagnostics;

namespace Aquifer.Common.Utilities;

/// <summary>
/// Helpers for estimating how similar two strings are, based on Levenshtein edit distance.
/// </summary>
public static class StringSimilarityUtilities
{
    /// <summary>
    /// Computes a normalized similarity score for two texts and reports how long the computation took.
    /// </summary>
    /// <remarks>
    /// Each text is split into chunks of at most <paramref name="limit"/> characters (see
    /// <see cref="GetSubstrings"/>) and the chunks are compared pairwise by position, so for texts longer
    /// than <paramref name="limit"/> the result is an approximation of the true Levenshtein similarity.
    /// </remarks>
    /// <param name="textA">First text to compare.</param>
    /// <param name="textB">Second text to compare.</param>
    /// <param name="limit">Maximum chunk length, in characters. Must be greater than zero.</param>
    /// <returns>
    /// <c>similarity</c>: a value in [0, 1], where 1 means the chunked distance is zero (two empty texts
    /// also score 1). <c>executionTime</c>: elapsed wall-clock time in milliseconds.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either text is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="limit"/> is zero or negative.</exception>
    public static (double similarity, double executionTime) ComputeLevenshteinSimilarity(string textA, string textB, int limit = 15000)
    {
        ArgumentNullException.ThrowIfNull(textA);
        ArgumentNullException.ThrowIfNull(textB);
        ThrowIfLimitNotPositive(limit);

        var stopwatch = Stopwatch.StartNew();

        var longestLength = Math.Max(textA.Length, textB.Length);
        double similarity;

        if (longestLength == 0)
        {
            // Two empty texts are identical; without this guard the division below is 0/0 (NaN).
            similarity = 1.0;
        }
        else
        {
            var distance = ComputeTotalStringDistance(GetSubstrings(textA, limit), GetSubstrings(textB, limit));

            // Chunks are compared by position, so misaligned chunk boundaries can push the summed distance
            // past the longer text's length. Floor at zero to keep the score a valid ratio.
            similarity = Math.Max(0.0, 1 - ((double)distance / longestLength));
        }

        stopwatch.Stop();

        return (similarity, stopwatch.Elapsed.TotalMilliseconds);
    }

    /// <summary>
    /// Computes the Levenshtein (edit) distance between two strings, comparing them <see cref="char"/> by
    /// <see cref="char"/>, using a single rolling row instead of a full matrix.
    /// </summary>
    /// <param name="textA">First string.</param>
    /// <param name="textB">Second string.</param>
    /// <returns>The minimum number of single-character insertions, deletions and substitutions.</returns>
    /// <exception cref="ArgumentNullException">Either string is <c>null</c>.</exception>
    public static int LevenshteinDistance(string textA, string textB)
    {
        ArgumentNullException.ThrowIfNull(textA);
        ArgumentNullException.ThrowIfNull(textB);

        // Distance to or from an empty string is simply the other string's length.
        if (textA.Length == 0)
        {
            return textB.Length;
        }

        if (textB.Length == 0)
        {
            return textA.Length;
        }

        var row = new int[textB.Length + 1];

        for (var j = 0; j <= textB.Length; j++)
        {
            row[j] = j;
        }

        for (var i = 1; i <= textA.Length; i++)
        {
            // 'diagonal' holds the previous row's value at column j - 1 while the row is overwritten in place.
            var diagonal = row[0];
            row[0] = i;

            for (var j = 1; j <= textB.Length; j++)
            {
                var above = row[j];
                var substitutionCost = textA[i - 1] == textB[j - 1] ? 0 : 1;

                row[j] = Math.Min(
                    Math.Min(diagonal + substitutionCost, above + 1),
                    row[j - 1] + 1);

                diagonal = above;
            }
        }

        return row[textB.Length];
    }

    /// <summary>
    /// Splits <paramref name="text"/> into consecutive chunks of at most <paramref name="limit"/> characters.
    /// </summary>
    /// <remarks>
    /// While the remaining text is longer than <paramref name="limit"/>, the chunk ends at the last space
    /// found at or before index <paramref name="limit"/>; that space is dropped. When no such space exists
    /// the chunk is cut at exactly <paramref name="limit"/> characters. The remainder is always added as the
    /// final chunk, so the result contains at least one element.
    /// </remarks>
    /// <param name="text">Text to split.</param>
    /// <param name="limit">Maximum chunk length, in characters. Must be greater than zero.</param>
    /// <returns>The chunks in their original order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="limit"/> is zero or negative.</exception>
    public static List<string> GetSubstrings(string text, int limit)
    {
        ArgumentNullException.ThrowIfNull(text);
        ThrowIfLimitNotPositive(limit);

        var result = new List<string>();

        while (text.Length > limit)
        {
            // Search once per chunk; the result drives both the chunk and the remainder.
            var lastSpaceIndex = text.LastIndexOf(' ', limit);

            if (lastSpaceIndex == -1)
            {
                result.Add(text[..limit]);
                text = text[limit..];
            }
            else
            {
                result.Add(text[..lastSpaceIndex]);
                text = text[(lastSpaceIndex + 1)..];
            }
        }

        result.Add(text);

        return result;
    }

    /// <summary>
    /// Sums the Levenshtein distances of two chunk lists compared pairwise by position.
    /// </summary>
    /// <remarks>
    /// When the lists differ in length, each unpaired chunk of the longer list contributes its full
    /// character count, which is its distance to an empty string (e.g. "123" vs "" is 3).
    /// </remarks>
    /// <param name="listA">Chunks of the first text.</param>
    /// <param name="listB">Chunks of the second text.</param>
    /// <returns>The total distance across all chunk positions.</returns>
    /// <exception cref="ArgumentNullException">Either list is <c>null</c>.</exception>
    public static int ComputeTotalStringDistance(List<string> listA, List<string> listB)
    {
        ArgumentNullException.ThrowIfNull(listA);
        ArgumentNullException.ThrowIfNull(listB);

        var (longestList, shortestList) = listA.Count > listB.Count ? (listA, listB) : (listB, listA);
        var totalDistance = 0;

        for (var i = 0; i < longestList.Count; i++)
        {
            totalDistance += i < shortestList.Count
                ? LevenshteinDistance(longestList[i], shortestList[i])
                : longestList[i].Length;
        }

        return totalDistance;
    }

    // Rejects chunk limits that would make GetSubstrings loop forever (0) or index out of range (negative).
    private static void ThrowIfLimitNotPositive(int limit)
    {
        if (limit <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(limit), limit, "The chunk limit must be greater than zero.");
        }
    }
}
Loading
Loading