From dcb9718f7ff37fedb5e41d48fa794a1d82803863 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 13:08:58 -0400 Subject: [PATCH 01/13] src: avoid allocation in the Size method for BASE64URL and BASE64 --- src/string_bytes.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 3e2b29005a2012..f0d1eed11997a1 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -493,16 +493,12 @@ Maybe StringBytes::Size(Isolate* isolate, case UCS2: return Just(str->Length() * sizeof(uint16_t)); - case BASE64URL: { - String::Value value(isolate, str); - return Just(simdutf::base64_length_from_binary(value.length(), + case BASE64URL: + return Just(simdutf::base64_length_from_binary(str->Length(), simdutf::base64_url)); - } - case BASE64: { - String::Value value(isolate, str); - return Just(simdutf::base64_length_from_binary(value.length())); - } + case BASE64: + return Just(simdutf::base64_length_from_binary(str->Length())); case HEX: return Just(str->Length() / 2); From a69442c9faffd305c67632230816bf581ae6a742 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 14:34:37 -0400 Subject: [PATCH 02/13] simplify the function --- src/string_bytes.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index f0d1eed11997a1..14ea8039d87295 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -449,12 +449,8 @@ Maybe StringBytes::StorageSize(Isolate* isolate, break; case BASE64URL: - data_size = simdutf::base64_length_from_binary(str->Length(), - simdutf::base64_url); - break; - case BASE64: - data_size = simdutf::base64_length_from_binary(str->Length()); + data_size = str->Length() % 4 <= 1 ? str->Length() / 4 * 3 : str->Length() / 4 * 3 + (str->Length() % 4) - 1; break; case HEX: From cf1d6b944d441b231e438a2e74dbe56470e2eb2d Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 14:35:57 -0400 Subject: [PATCH 03/13] lint --- src/string_bytes.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 14ea8039d87295..7e08b93aacfeba 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -450,7 +450,9 @@ Maybe StringBytes::StorageSize(Isolate* isolate, case BASE64URL: case BASE64: - data_size = str->Length() % 4 <= 1 ? str->Length() / 4 * 3 : str->Length() / 4 * 3 + (str->Length() % 4) - 1; + data_size = str->Length() % 4 <= 1 + ? str->Length() / 4 * 3 + : str->Length() / 4 * 3 + (str->Length() % 4) - 1; break; case HEX: From fb76dbde187143c011baaf68e32010ccd1f2f671 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 14:42:18 -0400 Subject: [PATCH 04/13] more simplification --- src/string_bytes.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 7e08b93aacfeba..57f4cbff5a5332 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -492,11 +492,10 @@ Maybe StringBytes::Size(Isolate* isolate, return Just(str->Length() * sizeof(uint16_t)); case BASE64URL: - return Just(simdutf::base64_length_from_binary(str->Length(), - simdutf::base64_url)); - case BASE64: - return Just(simdutf::base64_length_from_binary(str->Length())); + return Just(str->Length() % 4 <= 1 + ? str->Length() / 4 * 3 + : str->Length() / 4 * 3 + (str->Length() % 4) - 1;); case HEX: return Just(str->Length() / 2); From 2588eb936604bfbc4cf3f46b461181df3e3a35bc Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 14:59:55 -0400 Subject: [PATCH 05/13] fix typo --- src/string_bytes.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 57f4cbff5a5332..3c986de2862380 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -495,7 +495,7 @@ Maybe StringBytes::Size(Isolate* isolate, case BASE64: return Just(str->Length() % 4 <= 1 ? str->Length() / 4 * 3 - : str->Length() / 4 * 3 + (str->Length() % 4) - 1;); + : str->Length() / 4 * 3 + (str->Length() % 4) - 1); case HEX: return Just(str->Length() / 2); From b6a694d070f34e74266c67a7c9940a6d687142e8 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 15:16:46 -0400 Subject: [PATCH 06/13] explicit cast --- src/string_bytes.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 3c986de2862380..02242d9916b4bc 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -493,7 +493,7 @@ Maybe StringBytes::Size(Isolate* isolate, case BASE64URL: case BASE64: - return Just(str->Length() % 4 <= 1 + return Just(str->Length() % 4 <= 1 ? str->Length() / 4 * 3 : str->Length() / 4 * 3 + (str->Length() % 4) - 1); From 9ae78a4e7a8619bac9c6db1a5321f0a273885edd Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 15:38:25 -0400 Subject: [PATCH 07/13] lint --- src/string_bytes.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 02242d9916b4bc..8088256c72f8d1 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -493,9 +493,9 @@ Maybe StringBytes::Size(Isolate* isolate, case BASE64URL: case BASE64: - return Just(str->Length() % 4 <= 1 - ? str->Length() / 4 * 3 - : str->Length() / 4 * 3 + (str->Length() % 4) - 1); + return Just(str->Length() % 4 <= 1 ? str->Length() / 4 * 3 + : str->Length() / 4 * 3 + + (str->Length() % 4) - 1); case HEX: return Just(str->Length() / 2); From 87a47a7fd1f75445dfa9f8f8fcf15cbb08d7a39a Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 20:04:26 -0400 Subject: [PATCH 08/13] adjust the output size more precisely, for the simple cases --- src/string_bytes.cc | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 8088256c72f8d1..c56f60d9c49c5d 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -453,6 +453,23 @@ Maybe StringBytes::StorageSize(Isolate* isolate, data_size = str->Length() % 4 <= 1 ? str->Length() / 4 * 3 : str->Length() / 4 * 3 + (str->Length() % 4) - 1; + // When the string is external, we can check if it ends with one or two + // padding characters and adjust the size accordingly. Note that the + // input can contain non-base64 characters, so, at best, we can provide + // an upper bound. A correct size would requires scanning the entire + // input. We try to keep the function as fast as possible, so we only + // check the last two characters when the string is one-byte external. + if (str->IsExternalOneByte()) { + auto ext = str->GetExternalOneByteStringResource(); + if(ext->length() > 1) { + if(ext->data()[ext->length() - 1] == '=') { + data_size--; + if(ext->data()[ext->length() - 2] == '=') { + data_size--; + } + } + } + } break; case HEX: @@ -492,10 +509,29 @@ Maybe StringBytes::Size(Isolate* isolate, return Just(str->Length() * sizeof(uint16_t)); case BASE64URL: - case BASE64: - return Just(str->Length() % 4 <= 1 ? str->Length() / 4 * 3 + case BASE64: { + size_t data_size = str->Length() % 4 <= 1 ? str->Length() / 4 * 3 : str->Length() / 4 * 3 + - (str->Length() % 4) - 1); + (str->Length() % 4) - 1; + // When the string is external, we can check if it ends with one or two + // padding characters and adjust the size accordingly. Note that the + // input can contain non-base64 characters, so, at best, we can provide + // an upper bound. A correct size would requires scanning the entire + // input. We try to keep the function as fast as possible, so we only + // check the last two characters when the string is one-byte external. + if (str->IsExternalOneByte()) { + auto ext = str->GetExternalOneByteStringResource(); + if(ext->length() > 1) { + if(ext->data()[ext->length() - 1] == '=') { + data_size--; + if(ext->data()[ext->length() - 2] == '=') { + data_size--; + } + } + } + } + return Just(data_size); + } case HEX: return Just(str->Length() / 2); From e6981185d58c12b85951147ea6e788bb90d6910b Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 22 Jun 2024 20:23:27 -0400 Subject: [PATCH 09/13] lint --- src/string_bytes.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index c56f60d9c49c5d..ddd96095f33ae6 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -461,10 +461,10 @@ Maybe StringBytes::StorageSize(Isolate* isolate, // check the last two characters when the string is one-byte external. if (str->IsExternalOneByte()) { auto ext = str->GetExternalOneByteStringResource(); - if(ext->length() > 1) { - if(ext->data()[ext->length() - 1] == '=') { + if (ext->length() > 1) { + if (ext->data()[ext->length() - 1] == '=') { data_size--; - if(ext->data()[ext->length() - 2] == '=') { + if (ext->data()[ext->length() - 2] == '=') { data_size--; } } @@ -510,9 +510,9 @@ Maybe StringBytes::Size(Isolate* isolate, case BASE64URL: case BASE64: { - size_t data_size = str->Length() % 4 <= 1 ? str->Length() / 4 * 3 - : str->Length() / 4 * 3 + - (str->Length() % 4) - 1; + size_t data_size = str->Length() % 4 <= 1 + ? str->Length() / 4 * 3 + : str->Length() / 4 * 3 + (str->Length() % 4) - 1; // When the string is external, we can check if it ends with one or two // padding characters and adjust the size accordingly. Note that the // input can contain non-base64 characters, so, at best, we can provide @@ -521,10 +521,10 @@ Maybe StringBytes::Size(Isolate* isolate, // check the last two characters when the string is one-byte external. if (str->IsExternalOneByte()) { auto ext = str->GetExternalOneByteStringResource(); - if(ext->length() > 1) { - if(ext->data()[ext->length() - 1] == '=') { + if (ext->length() > 1) { + if (ext->data()[ext->length() - 1] == '=') { data_size--; - if(ext->data()[ext->length() - 2] == '=') { + if (ext->data()[ext->length() - 2] == '=') { data_size--; } } From 3b177fcc82fdbe4ab51011e86a5434de5106155c Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 31 Aug 2024 17:44:35 -0400 Subject: [PATCH 10/13] using string view --- src/string_bytes.cc | 57 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index ddd96095f33ae6..06fb02bae794b6 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -449,27 +449,27 @@ Maybe StringBytes::StorageSize(Isolate* isolate, break; case BASE64URL: - case BASE64: - data_size = str->Length() % 4 <= 1 - ? str->Length() / 4 * 3 - : str->Length() / 4 * 3 + (str->Length() % 4) - 1; - // When the string is external, we can check if it ends with one or two - // padding characters and adjust the size accordingly. Note that the - // input can contain non-base64 characters, so, at best, we can provide - // an upper bound. A correct size would requires scanning the entire - // input. We try to keep the function as fast as possible, so we only - // check the last two characters when the string is one-byte external. - if (str->IsExternalOneByte()) { - auto ext = str->GetExternalOneByteStringResource(); - if (ext->length() > 1) { - if (ext->data()[ext->length() - 1] == '=') { + case BASE64: { + String::ValueView view(isolate, str); + size_t data_size = view.length() % 4 <= 1 + ? view.length() / 4 * 3 + : view.length() / 4 * 3 + (view.length() % 4) - 1; + // Check if the string ends with one or two padding characters and adjust the + // size accordingly. Note that the input can contain non-base64 characters, so, + // at best, we can provide an upper bound. A correct size would requires + // scanning the entire input. We try to keep the function as fast as possible, + // so we only check the last two characters when the string is one-byte. + if (view.is_one_byte()) { + if (view.length() > 1) { + if (view.data8()[view.length() - 1] == '=') { data_size--; - if (ext->data()[ext->length() - 2] == '=') { + if (view.data8()[view.length() - 2] == '=') { data_size--; } } } } + } break; case HEX: @@ -510,21 +510,20 @@ Maybe StringBytes::Size(Isolate* isolate, case BASE64URL: case BASE64: { - size_t data_size = str->Length() % 4 <= 1 - ? str->Length() / 4 * 3 - : str->Length() / 4 * 3 + (str->Length() % 4) - 1; - // When the string is external, we can check if it ends with one or two - // padding characters and adjust the size accordingly. Note that the - // input can contain non-base64 characters, so, at best, we can provide - // an upper bound. A correct size would requires scanning the entire - // input. We try to keep the function as fast as possible, so we only - // check the last two characters when the string is one-byte external. - if (str->IsExternalOneByte()) { - auto ext = str->GetExternalOneByteStringResource(); - if (ext->length() > 1) { - if (ext->data()[ext->length() - 1] == '=') { + String::ValueView view(isolate, str); + size_t data_size = view.length() % 4 <= 1 + ? view.length() / 4 * 3 + : view.length() / 4 * 3 + (view.length() % 4) - 1; + // Check if the string ends with one or two padding characters and adjust the + // size accordingly. Note that the input can contain non-base64 characters, so, + // at best, we can provide an upper bound. A correct size would requires + // scanning the entire input. We try to keep the function as fast as possible, + // so we only check the last two characters when the string is one-byte. + if (view.is_one_byte()) { + if (view.length() > 1) { + if (view.data8()[view.length() - 1] == '=') { data_size--; - if (ext->data()[ext->length() - 2] == '=') { + if (view.data8()[view.length() - 2] == '=') { data_size--; } } From de7c0298df8824e0095780ac643493394feffc59 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 31 Aug 2024 17:44:50 -0400 Subject: [PATCH 11/13] lint --- src/string_bytes.cc | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 06fb02bae794b6..b4c8c892966de7 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -452,13 +452,14 @@ Maybe StringBytes::StorageSize(Isolate* isolate, case BASE64: { String::ValueView view(isolate, str); size_t data_size = view.length() % 4 <= 1 - ? view.length() / 4 * 3 - : view.length() / 4 * 3 + (view.length() % 4) - 1; - // Check if the string ends with one or two padding characters and adjust the - // size accordingly. Note that the input can contain non-base64 characters, so, - // at best, we can provide an upper bound. A correct size would requires - // scanning the entire input. We try to keep the function as fast as possible, - // so we only check the last two characters when the string is one-byte. + ? view.length() / 4 * 3 + : view.length() / 4 * 3 + (view.length() % 4) - 1; + // Check if the string ends with one or two padding characters and adjust + // the size accordingly. Note that the input can contain non-base64 + // characters, so, at best, we can provide an upper bound. A correct size + // would requires scanning the entire input. We try to keep the function + // as fast as possible, so we only check the last two characters when the + // string is one-byte. if (view.is_one_byte()) { if (view.length() > 1) { if (view.data8()[view.length() - 1] == '=') { @@ -469,8 +470,7 @@ Maybe StringBytes::StorageSize(Isolate* isolate, } } } - } - break; + } break; case HEX: CHECK(str->Length() % 2 == 0 && "invalid hex string length"); @@ -512,13 +512,14 @@ Maybe StringBytes::Size(Isolate* isolate, case BASE64: { String::ValueView view(isolate, str); size_t data_size = view.length() % 4 <= 1 - ? view.length() / 4 * 3 - : view.length() / 4 * 3 + (view.length() % 4) - 1; - // Check if the string ends with one or two padding characters and adjust the - // size accordingly. Note that the input can contain non-base64 characters, so, - // at best, we can provide an upper bound. A correct size would requires - // scanning the entire input. We try to keep the function as fast as possible, - // so we only check the last two characters when the string is one-byte. + ? view.length() / 4 * 3 + : view.length() / 4 * 3 + (view.length() % 4) - 1; + // Check if the string ends with one or two padding characters and adjust + // the size accordingly. Note that the input can contain non-base64 + // characters, so, at best, we can provide an upper bound. A correct size + // would requires scanning the entire input. We try to keep the function + // as fast as possible, so we only check the last two characters when the + // string is one-byte. if (view.is_one_byte()) { if (view.length() > 1) { if (view.data8()[view.length() - 1] == '=') { From f6ba6112f62ed51f893f9e557548aba6e7bbcd2d Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sat, 31 Aug 2024 20:28:53 -0400 Subject: [PATCH 12/13] typo --- src/string_bytes.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index b4c8c892966de7..4716f538de3896 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -451,7 +451,7 @@ Maybe StringBytes::StorageSize(Isolate* isolate, case BASE64URL: case BASE64: { String::ValueView view(isolate, str); - size_t data_size = view.length() % 4 <= 1 + data_size = view.length() % 4 <= 1 ? view.length() / 4 * 3 : view.length() / 4 * 3 + (view.length() % 4) - 1; // Check if the string ends with one or two padding characters and adjust From 486ec52f8a82c0c65181201b5ca01bc7ea95bfa7 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sun, 1 Sep 2024 17:44:31 -0400 Subject: [PATCH 13/13] lint --- src/string_bytes.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 4716f538de3896..34efdd7d295089 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -452,8 +452,8 @@ Maybe StringBytes::StorageSize(Isolate* isolate, case BASE64: { String::ValueView view(isolate, str); data_size = view.length() % 4 <= 1 - ? view.length() / 4 * 3 - : view.length() / 4 * 3 + (view.length() % 4) - 1; + ? view.length() / 4 * 3 + : view.length() / 4 * 3 + (view.length() % 4) - 1; // Check if the string ends with one or two padding characters and adjust // the size accordingly. Note that the input can contain non-base64 // characters, so, at best, we can provide an upper bound. A correct size