diff --git a/AK/String.cpp b/AK/String.cpp
index 7ec9c041888..a344f46c093 100644
--- a/AK/String.cpp
+++ b/AK/String.cpp
@@ -61,6 +61,40 @@ ErrorOr<String> String::from_utf8(StringView view)
     return result;
 }
 
+ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
+{
+    if (!validate_utf16_le(bytes))
+        return Error::from_string_literal("String::from_utf16_le: Input was not valid UTF-16LE");
+    if (bytes.is_empty())
+        return String {};
+    char16_t const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
+    size_t utf16_length = bytes.size() / 2;
+    // Size the buffer with the byte-order-specific helper: the unsuffixed utf8_length_from_utf16()
+    // reads code units in host byte order and can undersize the buffer for foreign-endian input.
+    size_t max_utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);
+    Vector<u8> buffer;
+    buffer.resize(max_utf8_length);
+    auto utf8_length = simdutf::convert_utf16le_to_utf8(utf16_data, utf16_length, reinterpret_cast<char*>(buffer.data()));
+    return String::from_utf8_without_validation(ReadonlyBytes { buffer.data(), utf8_length });
+}
+
+ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
+{
+    if (!validate_utf16_be(bytes))
+        return Error::from_string_literal("String::from_utf16_be: Input was not valid UTF-16BE");
+    if (bytes.is_empty())
+        return String {};
+    char16_t const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
+    size_t utf16_length = bytes.size() / 2;
+    // Size the buffer with the byte-order-specific helper: the unsuffixed utf8_length_from_utf16()
+    // reads code units in host byte order and can undersize the buffer for foreign-endian input.
+    size_t max_utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);
+    Vector<u8> buffer;
+    buffer.resize(max_utf8_length);
+    auto utf8_length = simdutf::convert_utf16be_to_utf8(utf16_data, utf16_length, reinterpret_cast<char*>(buffer.data()));
+    return String::from_utf8_without_validation(ReadonlyBytes { buffer.data(), utf8_length });
+}
+
 ErrorOr<String> String::from_utf16(Utf16View const& utf16)
 {
     if (!utf16.validate())
diff --git a/AK/String.h b/AK/String.h
index a1fb182219c..f0ba2a58783 100644
--- a/AK/String.h
+++ b/AK/String.h
@@ -70,6 +70,12 @@ public:
 
     // Creates a new String from a sequence of UTF-16 encoded code points.
     static ErrorOr<String> from_utf16(Utf16View const&);
+
+    // Creates a new String from raw bytes containing UTF-16LE or UTF-16BE encoded code units.
+    // Fails if the input is not valid UTF-16 in that byte order. Only bytes.size() / 2 code units
+    // are examined, so a trailing odd byte is ignored.
+    static ErrorOr<String> from_utf16_le(ReadonlyBytes);
+    static ErrorOr<String> from_utf16_be(ReadonlyBytes);
 
     // Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
     static ErrorOr<String> from_stream(Stream&, size_t byte_count);
diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp
index 449a1b0b88f..3ae3c50a5f4 100644
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@@ -421,4 +421,14 @@ size_t Utf16CodePointIterator::length_in_code_units() const
     return *(*this) < first_supplementary_plane_code_point ? 1 : 2;
 }
 
+bool validate_utf16_le(ReadonlyBytes bytes)
+{
+    return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
+}
+
+bool validate_utf16_be(ReadonlyBytes bytes)
+{
+    return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
+}
+
 }
diff --git a/AK/Utf16View.h b/AK/Utf16View.h
index e993bf40521..176c67fbd8e 100644
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@@ -26,6 +26,10 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host)
 ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
 ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
 
+// Returns whether the first bytes.size() / 2 code units are valid UTF-16 in the named byte order.
+[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
+[[nodiscard]] bool validate_utf16_be(ReadonlyBytes);
+
 size_t utf16_code_unit_length_from_utf8(StringView);
 
 class Utf16View;
diff --git a/Libraries/LibTextCodec/Decoder.cpp b/Libraries/LibTextCodec/Decoder.cpp
index ecab58df42f..0056586477a 100644
--- a/Libraries/LibTextCodec/Decoder.cpp
+++ b/Libraries/LibTextCodec/Decoder.cpp
@@ -369,25 +369,9 @@ ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
     return String::from_utf8_with_replacement_character(input);
 }
 
-static Utf16View as_utf16(StringView view, AK::Endianness endianness)
-{
-    return Utf16View {
-        { reinterpret_cast<u16 const*>(view.bytes().data()), view.length() / 2 },
-        endianness
-    };
-}
-
-ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
-{
-    for (auto code_point : as_utf16(input, AK::Endianness::Big))
-        TRY(on_code_point(code_point));
-
-    return {};
-}
-
 bool UTF16BEDecoder::validate(StringView input)
 {
-    return as_utf16(input, AK::Endianness::Big).validate();
+    return AK::validate_utf16_be(input.bytes());
 }
 
 ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
@@ -396,20 +380,12 @@ ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
     if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
         input = input.substring_view(2);
 
-    return String::from_utf16(as_utf16(input, AK::Endianness::Big));
-}
-
-ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
-{
-    for (auto code_point : as_utf16(input, AK::Endianness::Little))
-        TRY(on_code_point(code_point));
-
-    return {};
+    return String::from_utf16_be(input.bytes());
 }
 
 bool UTF16LEDecoder::validate(StringView input)
 {
-    return as_utf16(input, AK::Endianness::Little).validate();
+    return AK::validate_utf16_le(input.bytes());
 }
 
 ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
@@ -418,7 +394,7 @@ ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
     if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
         input = input.substring_view(2);
 
-    return String::from_utf16(as_utf16(input, AK::Endianness::Little));
+    return String::from_utf16_le(input.bytes());
 }
 
 ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
diff --git a/Libraries/LibTextCodec/Decoder.h b/Libraries/LibTextCodec/Decoder.h
index cbb8c2467b2..16a961d5b97 100644
--- a/Libraries/LibTextCodec/Decoder.h
+++ b/Libraries/LibTextCodec/Decoder.h
@@ -17,12 +17,12 @@ namespace TextCodec {
 
 class Decoder {
 public:
-    virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
     virtual bool validate(StringView);
     virtual ErrorOr<String> to_utf8(StringView);
 
 protected:
     virtual ~Decoder() = default;
+    virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
 };
 
 class UTF8Decoder final : public Decoder {
@@ -34,16 +34,20 @@ public:
 
 class UTF16BEDecoder final : public Decoder {
 public:
-    virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
     virtual bool validate(StringView) override;
     virtual ErrorOr<String> to_utf8(StringView) override;
+
+private:
+    virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)>) override { VERIFY_NOT_REACHED(); }
 };
 
 class UTF16LEDecoder final : public Decoder {
 public:
-    virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
     virtual bool validate(StringView) override;
     virtual ErrorOr<String> to_utf8(StringView) override;
+
+private:
+    virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)>) override { VERIFY_NOT_REACHED(); }
 };
 
 template<Integral ArrayType = u32>
diff --git a/Tests/LibTextCodec/TestTextDecoders.cpp b/Tests/LibTextCodec/TestTextDecoders.cpp
index 98b20bc6bd0..f19ce8ba258 100644
--- a/Tests/LibTextCodec/TestTextDecoders.cpp
+++ b/Tests/LibTextCodec/TestTextDecoders.cpp
@@ -34,17 +34,6 @@ TEST_CASE(test_utf16be_decode)
 
     auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
     EXPECT(decoder.validate(test_string));
-
-    Vector<u32> processed_code_points;
-    MUST(decoder.process(test_string, [&](u32 code_point) {
-        return processed_code_points.try_append(code_point);
-    }));
-    EXPECT(processed_code_points.size() == 4);
-    EXPECT(processed_code_points[0] == 0x73);
-    EXPECT(processed_code_points[1] == 0xE4);
-    EXPECT(processed_code_points[2] == 0x6B);
-    EXPECT(processed_code_points[3] == 0x1F600);
-
     auto utf8 = MUST(decoder.to_utf8(test_string));
     EXPECT_EQ(utf8, "säk😀"sv);
 }
@@ -56,17 +45,6 @@ TEST_CASE(test_utf16le_decode)
 
     auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
     EXPECT(decoder.validate(test_string));
-
-    Vector<u32> processed_code_points;
-    MUST(decoder.process(test_string, [&](u32 code_point) {
-        return processed_code_points.try_append(code_point);
-    }));
-    EXPECT(processed_code_points.size() == 4);
-    EXPECT(processed_code_points[0] == 0x73);
-    EXPECT(processed_code_points[1] == 0xE4);
-    EXPECT(processed_code_points[2] == 0x6B);
-    EXPECT(processed_code_points[3] == 0x1F600);
-
     auto utf8 = MUST(decoder.to_utf8(test_string));
     EXPECT_EQ(utf8, "säk😀"sv);
 }