From 5a2603ff5c4315a77c7b03d88698eadbcf648d56 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 10 Jul 2024 21:48:23 +0800 Subject: [PATCH] Handle invalid utf8 sequence from Whisper for Dart API. (#1106) Fixes #1104 --- CHANGELOG.md | 4 +++ .../lib/src/offline_recognizer.dart | 3 ++- .../lib/src/online_recognizer.dart | 3 ++- flutter/sherpa_onnx/lib/src/utils.dart | 25 +++++++++++++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 flutter/sherpa_onnx/lib/src/utils.dart diff --git a/CHANGELOG.md b/CHANGELOG.md index 6672c41be..5b8908fc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.10.14 (to-be-released) + +* Fix invalid utf8 sequence from Whisper for Dart API. + ## 1.10.13 * Update onnxruntime from 1.17.1 to 1.18.0 diff --git a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart index b5619e3e0..ca2a4dbf0 100644 --- a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart +++ b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart @@ -7,6 +7,7 @@ import 'package:ffi/ffi.dart'; import './feature_config.dart'; import './offline_stream.dart'; import './sherpa_onnx_bindings.dart'; +import './utils.dart'; class OfflineTransducerModelConfig { const OfflineTransducerModelConfig({ @@ -287,7 +288,7 @@ class OfflineRecognizer { return OfflineRecognizerResult(text: '', tokens: [], timestamps: []); } - final parsedJson = jsonDecode(json.toDartString()); + final parsedJson = jsonDecode(toDartString(json)); SherpaOnnxBindings.destroyOfflineStreamResultJson?.call(json); diff --git a/flutter/sherpa_onnx/lib/src/online_recognizer.dart b/flutter/sherpa_onnx/lib/src/online_recognizer.dart index 49ca3d2e8..5975d68bd 100644 --- a/flutter/sherpa_onnx/lib/src/online_recognizer.dart +++ b/flutter/sherpa_onnx/lib/src/online_recognizer.dart @@ -7,6 +7,7 @@ import 'package:ffi/ffi.dart'; import './feature_config.dart'; import './online_stream.dart'; import './sherpa_onnx_bindings.dart'; 
+import './utils.dart'; class OnlineTransducerModelConfig { const OnlineTransducerModelConfig({ @@ -268,7 +269,7 @@ class OnlineRecognizer { return OnlineRecognizerResult(text: '', tokens: [], timestamps: []); } - final parsedJson = jsonDecode(json.toDartString()); + final parsedJson = jsonDecode(toDartString(json)); SherpaOnnxBindings.destroyOnlineStreamResultJson?.call(json); diff --git a/flutter/sherpa_onnx/lib/src/utils.dart b/flutter/sherpa_onnx/lib/src/utils.dart new file mode 100644 index 000000000..f4ecad2c9 --- /dev/null +++ b/flutter/sherpa_onnx/lib/src/utils.dart @@ -0,0 +1,25 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:convert'; +import 'dart:ffi'; +import 'dart:typed_data'; + +import 'package:ffi/ffi.dart'; + +int _strLen(Pointer<Uint8> codeUnits) { + // this function is copied from + // https://github.com/dart-archive/ffi/blob/main/lib/src/utf8.dart#L52 + var length = 0; + while (codeUnits[length] != 0) { + length++; + } + return length; +} + +// This function is modified from +// https://github.com/dart-archive/ffi/blob/main/lib/src/utf8.dart#L41 +// It ignores invalid utf8 sequence +String toDartString(Pointer<Utf8> s) { + final codeUnits = s.cast<Uint8>(); + final length = _strLen(codeUnits); + return utf8.decode(codeUnits.asTypedList(length), allowMalformed: true); +}