mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-06-08 05:27:14 +09:00
AK+LibTextCodec: Stop using Utf16View endianness override
This is preparation for removing the endianness override, since it was only used by a single client: LibTextCodec. While here, add helpers and make use of simdutf for fast conversion.
This commit is contained in:
parent
96f1f15ad6
commit
0e9480b944
Notes:
github-actions[bot]
2025-04-16 08:06:08 +00:00
Author: https://github.com/awesomekling
Commit: 0e9480b944
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/4370
7 changed files with 56 additions and 53 deletions
|
@ -61,6 +61,36 @@ ErrorOr<String> String::from_utf8(StringView view)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Creates a new String from a sequence of UTF-16LE encoded bytes.
//
// Returns an error if the input does not validate as UTF-16LE. Empty input yields an empty String.
// The number of code units is bytes.size() / 2, so a trailing odd byte is ignored rather than rejected.
ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
{
    if (!validate_utf16_le(bytes))
        return Error::from_string_literal("String::from_utf16_le: Input was not valid UTF-16LE");
    if (bytes.is_empty())
        return String {};

    // NOTE(review): The byte pointer is reinterpreted as char16_t; presumably callers pass suitably aligned
    //               data (or simdutf tolerates unaligned input) - confirm.
    auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
    size_t const utf16_length = bytes.size() / 2;

    // Size the output with the explicitly little-endian helper. simdutf::utf8_length_from_utf16() reads the
    // code units in host byte order, which yields the wrong size for UTF-16LE input on a big-endian host.
    size_t const utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);

    Vector<u8> buffer;
    buffer.resize(utf8_length);

    auto const bytes_written = simdutf::convert_utf16le_to_utf8(utf16_data, utf16_length, reinterpret_cast<char*>(buffer.data()));
    return String::from_utf8_without_validation(ReadonlyBytes { buffer.data(), bytes_written });
}
|
||||||
|
|
||||||
|
// Creates a new String from a sequence of UTF-16BE encoded bytes.
//
// Returns an error if the input does not validate as UTF-16BE. Empty input yields an empty String.
// The number of code units is bytes.size() / 2, so a trailing odd byte is ignored rather than rejected.
ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
{
    if (!validate_utf16_be(bytes))
        return Error::from_string_literal("String::from_utf16_be: Input was not valid UTF-16BE");
    if (bytes.is_empty())
        return String {};

    // NOTE(review): The byte pointer is reinterpreted as char16_t; presumably callers pass suitably aligned
    //               data (or simdutf tolerates unaligned input) - confirm.
    auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
    size_t const utf16_length = bytes.size() / 2;

    // Size the output with the explicitly big-endian helper. simdutf::utf8_length_from_utf16() reads the
    // code units in host byte order; on a little-endian host it would count byte-swapped code units
    // (e.g. U+4E00 seen as U+004E), which can under-size the buffer and let the conversion below write
    // past its end.
    size_t const utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);

    Vector<u8> buffer;
    buffer.resize(utf8_length);

    auto const bytes_written = simdutf::convert_utf16be_to_utf8(utf16_data, utf16_length, reinterpret_cast<char*>(buffer.data()));
    return String::from_utf8_without_validation(ReadonlyBytes { buffer.data(), bytes_written });
}
|
||||||
|
|
||||||
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
||||||
{
|
{
|
||||||
if (!utf16.validate())
|
if (!utf16.validate())
|
||||||
|
|
|
@ -70,6 +70,8 @@ public:
|
||||||
|
|
||||||
// Creates a new String from a sequence of UTF-16 encoded code points.
|
// Creates a new String from a sequence of UTF-16 encoded code points.
|
||||||
static ErrorOr<String> from_utf16(Utf16View const&);
|
static ErrorOr<String> from_utf16(Utf16View const&);
|
||||||
|
static ErrorOr<String> from_utf16_le(ReadonlyBytes);
|
||||||
|
static ErrorOr<String> from_utf16_be(ReadonlyBytes);
|
||||||
|
|
||||||
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
|
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
|
||||||
static ErrorOr<String> from_stream(Stream&, size_t byte_count);
|
static ErrorOr<String> from_stream(Stream&, size_t byte_count);
|
||||||
|
|
|
@ -421,4 +421,14 @@ size_t Utf16CodePointIterator::length_in_code_units() const
|
||||||
return *(*this) < first_supplementary_plane_code_point ? 1 : 2;
|
return *(*this) < first_supplementary_plane_code_point ? 1 : 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether the given bytes are valid UTF-16LE according to simdutf.
// Only bytes.size() / 2 whole code units are handed to simdutf; a trailing odd byte is not inspected.
bool validate_utf16_le(ReadonlyBytes bytes)
{
    auto const* code_units = reinterpret_cast<char16_t const*>(bytes.data());
    auto const code_unit_count = bytes.size() / 2;
    return simdutf::validate_utf16le(code_units, code_unit_count);
}
|
||||||
|
|
||||||
|
// Reports whether the given bytes are valid UTF-16BE according to simdutf.
// Only bytes.size() / 2 whole code units are handed to simdutf; a trailing odd byte is not inspected.
bool validate_utf16_be(ReadonlyBytes bytes)
{
    auto const* code_units = reinterpret_cast<char16_t const*>(bytes.data());
    auto const code_unit_count = bytes.size() / 2;
    return simdutf::validate_utf16be(code_units, code_unit_count);
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,6 +26,9 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host)
|
||||||
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
|
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
|
||||||
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
|
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
|
||||||
|
|
||||||
|
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
|
||||||
|
[[nodiscard]] bool validate_utf16_be(ReadonlyBytes);
|
||||||
|
|
||||||
size_t utf16_code_unit_length_from_utf8(StringView);
|
size_t utf16_code_unit_length_from_utf8(StringView);
|
||||||
|
|
||||||
class Utf16View;
|
class Utf16View;
|
||||||
|
|
|
@ -369,25 +369,9 @@ ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
|
||||||
return String::from_utf8_with_replacement_character(input);
|
return String::from_utf8_with_replacement_character(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wraps the bytes of a StringView in a Utf16View with the requested endianness, without copying.
// The view covers view.length() / 2 code units, so a trailing odd byte is left out.
static Utf16View as_utf16(StringView view, AK::Endianness endianness)
{
    auto const* code_units = reinterpret_cast<u16 const*>(view.bytes().data());
    auto const code_unit_count = view.length() / 2;
    return Utf16View { { code_units, code_unit_count }, endianness };
}
|
|
||||||
|
|
||||||
// Walks the input as big-endian UTF-16 and passes every code point to on_code_point, in order.
// The first error returned by the callback ends the walk and is propagated to the caller.
ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
{
    auto utf16_view = as_utf16(input, AK::Endianness::Big);
    auto const last = utf16_view.end();

    for (auto it = utf16_view.begin(); it != last; ++it)
        TRY(on_code_point(*it));

    return {};
}
|
|
||||||
|
|
||||||
bool UTF16BEDecoder::validate(StringView input)
|
bool UTF16BEDecoder::validate(StringView input)
|
||||||
{
|
{
|
||||||
return as_utf16(input, AK::Endianness::Big).validate();
|
return AK::validate_utf16_be(input.bytes());
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
|
ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
|
||||||
|
@ -396,20 +380,12 @@ ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
|
||||||
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
|
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
|
||||||
input = input.substring_view(2);
|
input = input.substring_view(2);
|
||||||
|
|
||||||
return String::from_utf16(as_utf16(input, AK::Endianness::Big));
|
return String::from_utf16_be(input.bytes());
|
||||||
}
|
|
||||||
|
|
||||||
// Walks the input as little-endian UTF-16 and passes every code point to on_code_point, in order.
// The first error returned by the callback ends the walk and is propagated to the caller.
ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
{
    auto utf16_view = as_utf16(input, AK::Endianness::Little);
    auto const last = utf16_view.end();

    for (auto it = utf16_view.begin(); it != last; ++it)
        TRY(on_code_point(*it));

    return {};
}
|
}
|
||||||
|
|
||||||
bool UTF16LEDecoder::validate(StringView input)
|
bool UTF16LEDecoder::validate(StringView input)
|
||||||
{
|
{
|
||||||
return as_utf16(input, AK::Endianness::Little).validate();
|
return AK::validate_utf16_le(input.bytes());
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
|
ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
|
||||||
|
@ -418,7 +394,7 @@ ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
|
||||||
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
|
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
|
||||||
input = input.substring_view(2);
|
input = input.substring_view(2);
|
||||||
|
|
||||||
return String::from_utf16(as_utf16(input, AK::Endianness::Little));
|
return String::from_utf16_le(input.bytes());
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||||
|
|
|
@ -17,12 +17,12 @@ namespace TextCodec {
|
||||||
|
|
||||||
class Decoder {
|
class Decoder {
|
||||||
public:
|
public:
|
||||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
|
|
||||||
virtual bool validate(StringView);
|
virtual bool validate(StringView);
|
||||||
virtual ErrorOr<String> to_utf8(StringView);
|
virtual ErrorOr<String> to_utf8(StringView);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual ~Decoder() = default;
|
virtual ~Decoder() = default;
|
||||||
|
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
class UTF8Decoder final : public Decoder {
|
class UTF8Decoder final : public Decoder {
|
||||||
|
@ -34,16 +34,20 @@ public:
|
||||||
|
|
||||||
class UTF16BEDecoder final : public Decoder {
|
class UTF16BEDecoder final : public Decoder {
|
||||||
public:
|
public:
|
||||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
|
|
||||||
virtual bool validate(StringView) override;
|
virtual bool validate(StringView) override;
|
||||||
virtual ErrorOr<String> to_utf8(StringView) override;
|
virtual ErrorOr<String> to_utf8(StringView) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)>) override { VERIFY_NOT_REACHED(); }
|
||||||
};
|
};
|
||||||
|
|
||||||
class UTF16LEDecoder final : public Decoder {
|
class UTF16LEDecoder final : public Decoder {
|
||||||
public:
|
public:
|
||||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
|
|
||||||
virtual bool validate(StringView) override;
|
virtual bool validate(StringView) override;
|
||||||
virtual ErrorOr<String> to_utf8(StringView) override;
|
virtual ErrorOr<String> to_utf8(StringView) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)>) override { VERIFY_NOT_REACHED(); }
|
||||||
};
|
};
|
||||||
|
|
||||||
template<Integral ArrayType = u32>
|
template<Integral ArrayType = u32>
|
||||||
|
|
|
@ -34,17 +34,6 @@ TEST_CASE(test_utf16be_decode)
|
||||||
auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
|
auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
|
||||||
|
|
||||||
EXPECT(decoder.validate(test_string));
|
EXPECT(decoder.validate(test_string));
|
||||||
|
|
||||||
Vector<u32> processed_code_points;
|
|
||||||
MUST(decoder.process(test_string, [&](u32 code_point) {
|
|
||||||
return processed_code_points.try_append(code_point);
|
|
||||||
}));
|
|
||||||
EXPECT(processed_code_points.size() == 4);
|
|
||||||
EXPECT(processed_code_points[0] == 0x73);
|
|
||||||
EXPECT(processed_code_points[1] == 0xE4);
|
|
||||||
EXPECT(processed_code_points[2] == 0x6B);
|
|
||||||
EXPECT(processed_code_points[3] == 0x1F600);
|
|
||||||
|
|
||||||
auto utf8 = MUST(decoder.to_utf8(test_string));
|
auto utf8 = MUST(decoder.to_utf8(test_string));
|
||||||
EXPECT_EQ(utf8, "säk😀"sv);
|
EXPECT_EQ(utf8, "säk😀"sv);
|
||||||
}
|
}
|
||||||
|
@ -56,17 +45,6 @@ TEST_CASE(test_utf16le_decode)
|
||||||
auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
|
auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
|
||||||
|
|
||||||
EXPECT(decoder.validate(test_string));
|
EXPECT(decoder.validate(test_string));
|
||||||
|
|
||||||
Vector<u32> processed_code_points;
|
|
||||||
MUST(decoder.process(test_string, [&](u32 code_point) {
|
|
||||||
return processed_code_points.try_append(code_point);
|
|
||||||
}));
|
|
||||||
EXPECT(processed_code_points.size() == 4);
|
|
||||||
EXPECT(processed_code_points[0] == 0x73);
|
|
||||||
EXPECT(processed_code_points[1] == 0xE4);
|
|
||||||
EXPECT(processed_code_points[2] == 0x6B);
|
|
||||||
EXPECT(processed_code_points[3] == 0x1F600);
|
|
||||||
|
|
||||||
auto utf8 = MUST(decoder.to_utf8(test_string));
|
auto utf8 = MUST(decoder.to_utf8(test_string));
|
||||||
EXPECT_EQ(utf8, "säk😀"sv);
|
EXPECT_EQ(utf8, "säk😀"sv);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue