
There are two changes happening here: a correctness fix, and an optimization. In theory they are unrelated, but the optimization actually paves the way for the correctness fix.

Before this commit, the HTML tokenizer would attempt to look for named character references by checking from after the `&` until the end of m_decoded_input, which meant that it was unable to recognize named character references that are inserted via `document.write` one byte at a time. For example, if `&notin;` was written one-byte-at-a-time with `document.write`, the tokenizer would only check against `n` (since that is all that would exist at the time of the check) and would therefore erroneously conclude that it was an invalid named character reference.

This commit changes the approach taken for named character reference matching to use a trie-like structure (specifically, a deterministic acyclic finite state automaton, or DAFSA), which allows for efficient matching one character at a time, and therefore makes it possible to pick up matching where it left off after each code point is consumed.

Note: Because it's possible for a partial match to never develop into a full match (e.g. `&notindo`, which could still lead to `&notindot;`), some backtracking is performed after the fact in order to only consume the code points within the longest match found (e.g. `&notindo` would backtrack to `&not`).

With this new approach, `document.write` being called one-byte-at-a-time is handled correctly, which allows more WPT tests to pass, the most directly relevant being `/html/syntax/parsing/html5lib_entities01.html` and `/html/syntax/parsing/html5lib_entities02.html` when run with `?run_type=write_single`. Additionally, the implementation now conforms more closely to the language of the spec (and resolves a FIXME), because exactly the matched characters are consumed and nothing more, so SWITCH_TO can be used as the spec says instead of RECONSUME_IN.

The new approach is also an optimization:

- Instead of a linear search using `starts_with`, the DAFSA is always aware of which characters can lead to a match at any given point, and bails out as soon as a match is no longer possible.
- The DAFSA takes advantage of the note in the spec section `13.5 Named character references` that says "This list is static and will not be expanded or changed in the future." and tailors its Node struct accordingly, tightly packing each node's data into 32 bits. Together with the inherent DAFSA property of deduplicating redundant nodes, this minimizes the amount of data stored for named character reference matching.

In my testing:

- A benchmark tokenizing an arbitrary set of HTML test files was about 1.23x faster (2070ms to 1682ms).
- A benchmark tokenizing a file with tens of thousands of named character references, mixed in with truncated named character references and arbitrary ASCII characters/ampersands, ran about 8x faster (758ms to 93ms).
- The size of `liblagom-web.so` was reduced by 94.96 KiB.

Some technical details: a DAFSA (deterministic acyclic finite state automaton) is essentially a trie flattened into an array, but one that also uses techniques to minimize redundant nodes. This provides fast lookups while minimizing the required data size, but normally does not allow for associating data with each word.

However, by adding to each node a count of the number of possible words from that node, it becomes possible to also use the DAFSA as a minimal perfect hash for the set of words (allowing both word -> unique index and unique index -> word). This lets us store a second array of data so that the DAFSA can be used as a lookup for e.g. the associated code points (see the sketch below).

For the Swift implementation, the new NamedCharacterReferenceMatcher was used to satisfy the previous API, and the tokenizer was otherwise left alone. In the future, the Swift implementation should be updated to use the same implementation for its NamedCharacterReference state as the updated C++ implementation.
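To illustrate how the generated tables fit together, here is a rough sketch of a matcher that consumes one code point at a time and supports backtracking to the longest match found. The sketch is illustrative only: the class shape and member names are hypothetical, not the actual tokenizer's NamedCharacterReferenceMatcher, but the four named_character_reference_* functions are exactly the ones declared and generated below.

// Hypothetical usage sketch, not the actual tokenizer code.
class NamedCharacterReferenceMatcher {
public:
    // Returns true if `c` can continue a named character reference from the
    // current DAFSA node, advancing to the matching child node if so.
    bool try_consume_ascii_char(u8 c)
    {
        auto first_child_index = Web::HTML::named_character_reference_child_index(m_node_index);
        auto node_index = Web::HTML::named_character_reference_find_sibling_and_update_unique_index(first_child_index, c, m_pending_unique_index);
        if (!node_index.has_value())
            return false;
        m_node_index = node_index.value();
        m_overconsumed_code_points++;
        if (Web::HTML::named_character_reference_is_end_of_word(m_node_index)) {
            // A complete reference ends here, so nothing consumed so far needs backtracking.
            m_last_matched_unique_index = m_pending_unique_index;
            m_overconsumed_code_points = 0;
        }
        return true;
    }

    // How many code points were consumed past the longest match, i.e. how far the
    // tokenizer needs to backtrack (e.g. "&notindo" backtracks to "&not").
    u8 overconsumed_code_points() const { return m_overconsumed_code_points; }

    // The code point(s) of the longest match found so far, if any. The unique index
    // doubles as a minimal perfect hash into the generated codepoints lookup array.
    Optional<Web::HTML::NamedCharacterReferenceCodepoints> matched_code_points() const
    {
        return Web::HTML::named_character_reference_codepoints_from_unique_index(m_last_matched_unique_index);
    }

private:
    u16 m_node_index { 0 }; // Start at the root sentinel node { 0, 0, false, true, 1 }.
    u16 m_pending_unique_index { 0 };
    u16 m_last_matched_unique_index { 0 };
    u8 m_overconsumed_code_points { 0 };
};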
/*
 * Copyright (c) 2024, the SerenityOS developers.
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include "GeneratorUtil.h"
#include <AK/Array.h>
#include <AK/FixedArray.h>
#include <AK/SourceGenerator.h>
#include <AK/StringBuilder.h>
#include <LibCore/ArgsParser.h>
#include <LibMain/Main.h>

ErrorOr<void> generate_header_file(Core::File& file);
ErrorOr<void> generate_implementation_file(JsonObject& named_character_reference_data, Core::File& file);

ErrorOr<int> serenity_main(Main::Arguments arguments)
{
    StringView generated_header_path;
    StringView generated_implementation_path;
    StringView json_path;

    Core::ArgsParser args_parser;
    args_parser.add_option(generated_header_path, "Path to the Entities header file to generate", "generated-header-path", 'h', "generated-header-path");
    args_parser.add_option(generated_implementation_path, "Path to the Entities implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
    args_parser.add_option(json_path, "Path to the JSON file to read from", "json-path", 'j', "json-path");
    args_parser.parse(arguments);

    auto json = TRY(read_entire_file_as_json(json_path));
    VERIFY(json.is_object());
    auto named_character_reference_data = json.as_object();

    auto generated_header_file = TRY(Core::File::open(generated_header_path, Core::File::OpenMode::Write));
    auto generated_implementation_file = TRY(Core::File::open(generated_implementation_path, Core::File::OpenMode::Write));

    TRY(generate_header_file(*generated_header_file));
    TRY(generate_implementation_file(named_character_reference_data, *generated_implementation_file));

    return 0;
}

struct Codepoints {
    u32 first;
    u32 second;
};

inline static StringView get_second_codepoint_enum_name(u32 codepoint)
{
    switch (codepoint) {
    case 0x0338:
        return "CombiningLongSolidusOverlay"sv;
    case 0x20D2:
        return "CombiningLongVerticalLineOverlay"sv;
    case 0x200A:
        return "HairSpace"sv;
    case 0x0333:
        return "CombiningDoubleLowLine"sv;
    case 0x20E5:
        return "CombiningReverseSolidusOverlay"sv;
    case 0xFE00:
        return "VariationSelector1"sv;
    case 0x006A:
        return "LatinSmallLetterJ"sv;
    case 0x0331:
        return "CombiningMacronBelow"sv;
    default:
        return "None"sv;
    }
}

ErrorOr<void> generate_header_file(Core::File& file)
{
    StringBuilder builder;
    SourceGenerator generator { builder };
    generator.append(R"~~~(
#pragma once

#include <AK/Optional.h>
#include <AK/Types.h>

namespace Web::HTML {

enum class NamedCharacterReferenceSecondCodepoint {
    None,
    CombiningLongSolidusOverlay, // U+0338
    CombiningLongVerticalLineOverlay, // U+20D2
    HairSpace, // U+200A
    CombiningDoubleLowLine, // U+0333
    CombiningReverseSolidusOverlay, // U+20E5
    VariationSelector1, // U+FE00
    LatinSmallLetterJ, // U+006A
    CombiningMacronBelow, // U+0331
};

inline Optional<u16> named_character_reference_second_codepoint_value(NamedCharacterReferenceSecondCodepoint codepoint)
{
    switch (codepoint) {
    case NamedCharacterReferenceSecondCodepoint::None:
        return {};
    case NamedCharacterReferenceSecondCodepoint::CombiningLongSolidusOverlay:
        return 0x0338;
    case NamedCharacterReferenceSecondCodepoint::CombiningLongVerticalLineOverlay:
        return 0x20D2;
    case NamedCharacterReferenceSecondCodepoint::HairSpace:
        return 0x200A;
    case NamedCharacterReferenceSecondCodepoint::CombiningDoubleLowLine:
        return 0x0333;
    case NamedCharacterReferenceSecondCodepoint::CombiningReverseSolidusOverlay:
        return 0x20E5;
    case NamedCharacterReferenceSecondCodepoint::VariationSelector1:
        return 0xFE00;
    case NamedCharacterReferenceSecondCodepoint::LatinSmallLetterJ:
        return 0x006A;
    case NamedCharacterReferenceSecondCodepoint::CombiningMacronBelow:
        return 0x0331;
    default:
        VERIFY_NOT_REACHED();
    }
}

// Note: The first codepoint could fit in 17 bits, and the second could fit in 4 (if unsigned).
// However, to get any benefit from minimizing the struct size, it would need to be accompanied by
// bit-packing the g_named_character_reference_codepoints_lookup array, and then either
// using 5 bits for the second field (since enum bitfields are signed), or using a 4-bit wide
// unsigned integer type.
struct NamedCharacterReferenceCodepoints {
    u32 first : 24; // Largest value is U+1D56B
    NamedCharacterReferenceSecondCodepoint second : 8;
};
static_assert(sizeof(NamedCharacterReferenceCodepoints) == 4);

u16 named_character_reference_child_index(u16 node_index);
bool named_character_reference_is_end_of_word(u16 node_index);
Optional<NamedCharacterReferenceCodepoints> named_character_reference_codepoints_from_unique_index(u16 unique_index);
Optional<u16> named_character_reference_find_sibling_and_update_unique_index(u16 first_child_index, u8 character, u16& unique_index);

} // namespace Web::HTML

)~~~");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}

class Node final : public RefCounted<Node> {
private:
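    // Two nodes are considered equal if they are terminal in the same way and all 128
    // child slots point to the same nodes. Hashing and comparing by that shape is what
    // allows identical suffix subtrees to be deduplicated during minimization.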
    struct NonnullRefPtrNodeTraits {
        static unsigned hash(NonnullRefPtr<Node> const& node)
        {
            u32 hash = 0;
            for (int i = 0; i < 128; i++) {
                hash ^= ptr_hash(node->m_children[i].ptr());
            }
            hash ^= int_hash(static_cast<u32>(node->m_is_terminal));
            return hash;
        }
        static bool equals(NonnullRefPtr<Node> const& a, NonnullRefPtr<Node> const& b)
        {
            if (a->m_is_terminal != b->m_is_terminal)
                return false;
            for (int i = 0; i < 128; i++) {
                if (a->m_children[i] != b->m_children[i])
                    return false;
            }
            return true;
        }
    };

public:
    static NonnullRefPtr<Node> create()
    {
        return adopt_ref(*new (nothrow) Node());
    }

    using NodeTableType = HashTable<NonnullRefPtr<Node>, NonnullRefPtrNodeTraits, false>;

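    // Numbers each node with the count of words accepted by the automaton starting from
    // that node (a terminal node also counts itself). These counts are what later make
    // the minimal perfect hashing possible (see DafsaBuilder::get_unique_index).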
    void calc_numbers()
    {
        m_number = static_cast<u16>(m_is_terminal);
        for (int i = 0; i < 128; i++) {
            if (m_children[i] == nullptr)
                continue;
            m_children[i]->calc_numbers();
            m_number += m_children[i]->m_number;
        }
    }

    u8 num_direct_children()
    {
        u8 num = 0;
        for (int i = 0; i < 128; i++) {
            if (m_children[i] != nullptr)
                num += 1;
        }
        return num;
    }

    Array<RefPtr<Node>, 128>& children() { return m_children; }

    void set_as_terminal() { m_is_terminal = true; }

    bool is_terminal() const { return m_is_terminal; }

    u16 number() const { return m_number; }

private:
    Node() = default;

    Array<RefPtr<Node>, 128> m_children { 0 };
    bool m_is_terminal { false };
    u16 m_number { 0 };
};

struct UncheckedNode {
    RefPtr<Node> parent;
    char character;
    RefPtr<Node> child;
};

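// Incrementally builds a DAFSA from words inserted in sorted order, in the style of
// Daciuk et al.'s "Incremental Construction of Minimal Acyclic Finite-State Automata":
// nodes along the most recently inserted word stay "unchecked" until a new word
// diverges from them, at which point the no-longer-shared suffix is minimized by
// replacing each unchecked node with an equivalent already-minimized node when one exists.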
class DafsaBuilder {
    AK_MAKE_NONCOPYABLE(DafsaBuilder);

public:
    using MappingType = HashMap<StringView, String>;

    DafsaBuilder()
        : m_root(Node::create())
    {
    }

    void insert(StringView str)
    {
        // Must be inserted in sorted order
        VERIFY(str > m_previous_word);

        size_t common_prefix_len = 0;
        for (size_t i = 0; i < min(str.length(), m_previous_word.length()); i++) {
            if (str[i] != m_previous_word[i])
                break;
            common_prefix_len++;
        }

        minimize(common_prefix_len);

        RefPtr<Node> node;
        if (m_unchecked_nodes.size() == 0)
            node = m_root;
        else
            node = m_unchecked_nodes.last().child;

        auto remaining = str.substring_view(common_prefix_len);
        for (char const c : remaining) {
            VERIFY(node->children().at(c) == nullptr);

            auto child = Node::create();
            node->children().at(c) = child;
            m_unchecked_nodes.append(UncheckedNode { node, c, child });
            node = child;
        }
        node->set_as_terminal();

        bool fits = str.copy_characters_to_buffer(m_previous_word_buf, sizeof(m_previous_word_buf));
        // It's guaranteed that m_previous_word_buf is large enough to hold the longest named character reference
        VERIFY(fits);
        m_previous_word = StringView(m_previous_word_buf, str.length());
    }

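    // Minimizes the unchecked-node stack down to `down_to` entries. Each popped child
    // is replaced by an equivalent node that has already been minimized if one exists;
    // otherwise the child itself is recorded as a newly minimized node.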
    void minimize(size_t down_to)
    {
        if (m_unchecked_nodes.size() == 0)
            return;
        while (m_unchecked_nodes.size() > down_to) {
            auto unchecked_node = m_unchecked_nodes.take_last();
            auto child = unchecked_node.child.release_nonnull();
            auto it = m_minimized_nodes.find(child);
            if (it != m_minimized_nodes.end()) {
                unchecked_node.parent->children().at(unchecked_node.character) = *it;
            } else {
                m_minimized_nodes.set(child);
            }
        }
    }

    void calc_numbers()
    {
        m_root->calc_numbers();
    }

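    // Maps a word to its unique 1-based index using the node numbering: walk the word's
    // path through the trie, adding the number of every sibling with a smaller character
    // at each step, plus 1 for every terminal node passed through. For example, with the
    // word set { "GT", "gt", "lt" }, looking up "lt" skips past "GT" and "gt" (1 word
    // each) before ending on a terminal node, giving unique index 3.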
    Optional<size_t> get_unique_index(StringView str)
    {
        size_t index = 0;
        Node* node = m_root.ptr();

        for (char const c : str) {
            if (node->children().at(c) == nullptr)
                return {};
            for (int sibling_c = 0; sibling_c < 128; sibling_c++) {
                if (node->children().at(sibling_c) == nullptr)
                    continue;
                if (sibling_c < c) {
                    index += node->children().at(sibling_c)->number();
                }
            }
            node = node->children().at(c);
            if (node->is_terminal())
                index += 1;
        }

        return index;
    }

    NonnullRefPtr<Node> root()
    {
        return m_root;
    }

private:
    NonnullRefPtr<Node> m_root;
    Node::NodeTableType m_minimized_nodes;
    Vector<UncheckedNode> m_unchecked_nodes;
    char m_previous_word_buf[64];
    StringView m_previous_word = { m_previous_word_buf, 0 };
};

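// Writes out the DAFSA array entries for the direct children of `node`, so that (with the
// breadth-first traversal below) every node's children end up as one contiguous sibling
// list (terminated via end_of_list) and child_index can simply point at the first child.
// Returns the next available index after reserving index ranges for the newly queued
// nodes' own children.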
static u16 write_children(NonnullRefPtr<Node> node, SourceGenerator& generator, Vector<NonnullRefPtr<Node>>& queue, HashMap<Node*, u16>& child_indexes, u16 first_available_index)
{
    auto current_available_index = first_available_index;
    auto num_children = node->num_direct_children();
    u16 child_i = 0;
    for (u8 c = 0; c < 128; c++) {
        if (node->children().at(c) == nullptr)
            continue;
        auto child = node->children().at(c).release_nonnull();
        auto is_last_child = child_i == num_children - 1;

        if (!child_indexes.contains(child.ptr())) {
            auto child_num_children = child->num_direct_children();
            if (child_num_children > 0) {
                child_indexes.set(child, current_available_index);
                current_available_index += child_num_children;
            }
            queue.append(child);
        }

        auto member_generator = generator.fork();
        member_generator.set("char", StringView(&c, 1));
        member_generator.set("number", String::number(child->number()));
        member_generator.set("end_of_word", MUST(String::formatted("{}", child->is_terminal())));
        member_generator.set("end_of_list", MUST(String::formatted("{}", is_last_child)));
        auto child_index = child_indexes.get(child).value_or(0);
        member_generator.set("child_index", String::number(child_index));
        member_generator.append(R"~~~(    { '@char@', @number@, @end_of_word@, @end_of_list@, @child_index@ },
)~~~");

        child_i++;
    }
    return current_available_index;
}

ErrorOr<void> generate_implementation_file(JsonObject& named_character_reference_data, Core::File& file)
{
    StringBuilder builder;
    SourceGenerator generator { builder };
    DafsaBuilder dafsa_builder;

    named_character_reference_data.for_each_member([&](auto& key, auto&) {
        dafsa_builder.insert(key.bytes_as_string_view().substring_view(1));
    });
    dafsa_builder.minimize(0);
    dafsa_builder.calc_numbers();

    // As a sanity check, confirm that the minimal perfect hashing doesn't
    // have any collisions
    {
        HashTable<size_t> index_set;

        named_character_reference_data.for_each_member([&](auto& key, auto&) {
            auto index = dafsa_builder.get_unique_index(key.bytes_as_string_view().substring_view(1)).value();
            VERIFY(!index_set.contains(index));
            index_set.set(index);
        });
        VERIFY(named_character_reference_data.size() == index_set.size());
    }

    auto index_to_codepoints = MUST(FixedArray<Codepoints>::create(named_character_reference_data.size()));

    named_character_reference_data.for_each_member([&](auto& key, auto& value) {
        auto codepoints = value.as_object().get_array("codepoints"sv).value();
        auto unique_index = dafsa_builder.get_unique_index(key.bytes_as_string_view().substring_view(1)).value();
        auto array_index = unique_index - 1;
        u32 second_codepoint = 0;
        if (codepoints.size() == 2) {
            second_codepoint = codepoints[1].template as_integer<u32>();
        }
        index_to_codepoints[array_index] = Codepoints { codepoints[0].template as_integer<u32>(), second_codepoint };
    });

    generator.append(R"~~~(
#include <LibWeb/HTML/Parser/Entities.h>

namespace Web::HTML {

static NamedCharacterReferenceCodepoints g_named_character_reference_codepoints_lookup[] = {
)~~~");

    for (auto codepoints : index_to_codepoints) {
        auto member_generator = generator.fork();
        member_generator.set("first_codepoint", MUST(String::formatted("0x{:X}", codepoints.first)));
        member_generator.set("second_codepoint_name", get_second_codepoint_enum_name(codepoints.second));
        member_generator.append(R"~~~(    {@first_codepoint@, NamedCharacterReferenceSecondCodepoint::@second_codepoint_name@},
)~~~");
    }

    generator.append(R"~~~(};

struct DafsaNode {
    // The actual alphabet of characters used in the list of named character references only
    // includes 61 unique characters ('1'...'8', ';', 'a'...'z', 'A'...'Z'), but we have
    // bits to spare and encoding this as a `u8` allows us to avoid the need for converting
    // between an `enum(u6)` containing only the alphabet and the actual `u8` character value.
    u8 character;
    // Nodes are numbered with "an integer which gives the number of words that
    // would be accepted by the automaton starting from that state." This numbering
    // allows calculating "a one-to-one correspondence between the integers 1 to L
    // (L is the number of words accepted by the automaton) and the words themselves."
    //
    // Essentially, this allows us to have a minimal perfect hashing scheme such that
    // it's possible to store & lookup the codepoint transformations of each named character
    // reference using a separate array.
    //
    // Empirically, the largest number in our DAFSA is 168, so all number values fit in a u8.
    u8 number;
    // If true, this node is the end of a valid named character reference.
    // Note: This does not necessarily mean that this node does not have child nodes.
    bool end_of_word : 1;
    // If true, this node is the end of a sibling list.
    // If false, then (index + 1) will contain the next sibling.
    bool end_of_list : 1;
    // Index of the first child of this node.
    // There are 3872 nodes in our DAFSA, so all indexes could fit in a u12.
    u16 child_index : 14;
};
static_assert(sizeof(DafsaNode) == 4);

static DafsaNode g_named_character_reference_dafsa[] = {
    { 0, 0, false, true, 1 },
)~~~");

    Vector<NonnullRefPtr<Node>> queue;
    HashMap<Node*, u16> child_indexes;

    u16 first_available_index = dafsa_builder.root()->num_direct_children() + 1;

    NonnullRefPtr<Node> node = dafsa_builder.root();
    while (true) {
        first_available_index = write_children(node, generator, queue, child_indexes, first_available_index);

        if (queue.size() == 0)
            break;
        node = queue.take_first();
    }

    generator.append(R"~~~(};

u16 named_character_reference_child_index(u16 node_index) {
    return g_named_character_reference_dafsa[node_index].child_index;
}

bool named_character_reference_is_end_of_word(u16 node_index) {
    return g_named_character_reference_dafsa[node_index].end_of_word;
}

// Note: The unique index is 1-based.
Optional<NamedCharacterReferenceCodepoints> named_character_reference_codepoints_from_unique_index(u16 unique_index) {
    if (unique_index == 0) return {};
    return g_named_character_reference_codepoints_lookup[unique_index - 1];
}

// Search `first_child_index` and siblings of `first_child_index` for a node with the value `character`.
// If found, returns the index of the node within the `dafsa` array. Otherwise, returns an empty Optional.
// Updates `unique_index` as the array is traversed.
Optional<u16> named_character_reference_find_sibling_and_update_unique_index(u16 first_child_index, u8 character, u16& unique_index) {
    auto index = first_child_index;
    while (true) {
        if (g_named_character_reference_dafsa[index].character < character) {
            unique_index += g_named_character_reference_dafsa[index].number;
        }
        if (g_named_character_reference_dafsa[index].character == character) {
            if (g_named_character_reference_dafsa[index].end_of_word) unique_index++;
            return index;
        }
        if (g_named_character_reference_dafsa[index].end_of_list) return {};
        index += 1;
    }
    VERIFY_NOT_REACHED();
}

} // namespace Web::HTML
)~~~");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}