1
0
Fork 0
mirror of https://github.com/LadybirdBrowser/ladybird.git synced 2025-06-08 05:27:14 +09:00
ladybird/Libraries/LibURL/Pattern/String.cpp
Timothy Flynn ee6b2db009 AK+LibURL+LibWeb: Use simdutf to validate ASCII strings
simdutf provides a vectorized ASCII validator, so let's use that instead
of looping over strings manually.
2025-04-06 11:05:58 -04:00

310 lines
12 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/StringBuilder.h>
#include <LibURL/Pattern/String.h>
#include <LibURL/Pattern/Tokenizer.h>
namespace URL::Pattern {
// https://urlpattern.spec.whatwg.org/#escape-a-pattern-string
// Returns a copy of `input` in which every character that has special meaning in URL pattern
// syntax is preceded by a backslash. `input` must be ASCII.
String escape_a_pattern_string(String const& input)
{
    // 1. Assert: input is an ASCII string.
    VERIFY(input.is_ascii());

    // 2. Let result be the empty string.
    StringBuilder escaped;

    // 3. Let index be 0.
    // 4. While index is less than input's length:
    //    (Iterating the bytes directly covers "let c be input[index]" and "increment index by 1".)
    for (auto code_unit : input.bytes_as_string_view()) {
        // 3. If c is one of U+002B (+), U+002A (*), U+003F (?), U+003A (:), U+007B ({), U+007D (}),
        //    U+0028 ((), U+0029 ()), or U+005C (\), then append U+005C (\) to the end of result.
        switch (code_unit) {
        case '+':
        case '*':
        case '?':
        case ':':
        case '{':
        case '}':
        case '(':
        case ')':
        case '\\':
            escaped.append('\\');
            break;
        default:
            break;
        }

        // 4. Append c to the end of result.
        escaped.append(code_unit);
    }

    // 5. Return result.
    return escaped.to_string_without_validation();
}
// https://urlpattern.spec.whatwg.org/#escape-a-regexp-string
// Returns a copy of `input` in which every regular expression metacharacter is preceded by a
// backslash. `input` must be ASCII.
String escape_a_regexp_string(String const& input)
{
    // 1. Assert: input is an ASCII string.
    VERIFY(input.is_ascii());

    // 2. Let result be the empty string.
    StringBuilder escaped;

    // 3. Let index be 0.
    // 4. While index is less than input's length:
    //    (Iterating the bytes directly covers "let c be input[index]" and "increment index by 1".)
    for (auto code_unit : input.bytes_as_string_view()) {
        // 3. If c is one of U+002E (.), U+002B (+), U+002A (*), U+003F (?), U+005E (^), U+0024 ($),
        //    U+007B ({), U+007D (}), U+0028 ((), U+0029 ()), U+005B ([), U+005D (]), U+007C (|),
        //    U+002F (/), or U+005C (\), then append "\" to the end of result.
        switch (code_unit) {
        case '.':
        case '+':
        case '*':
        case '?':
        case '^':
        case '$':
        case '{':
        case '}':
        case '(':
        case ')':
        case '[':
        case ']':
        case '|':
        case '/':
        case '\\':
            escaped.append('\\');
            break;
        default:
            break;
        }

        // 4. Append c to the end of result.
        escaped.append(code_unit);
    }

    // 5. Return result.
    return escaped.to_string_without_validation();
}
// https://urlpattern.spec.whatwg.org/#generate-a-segment-wildcard-regexp
// Builds the regular expression source "[^<delimiter>]+?", where <delimiter> is the regexp-escaped
// delimiter code point from `options`. When no delimiter is set, nothing is placed inside the class.
String generate_a_segment_wildcard_regexp(Options const& options)
{
    StringBuilder regexp;

    // 1. Let result be "[^".
    regexp.append("[^"sv);

    // 2. Append the result of running escape a regexp string given options's delimiter code point to the end of result.
    if (auto const& delimiter = options.delimiter_code_point; delimiter.has_value()) {
        auto const escaped_delimiter = escape_a_regexp_string(String::from_code_point(*delimiter));
        regexp.append(escaped_delimiter);
    }

    // 3. Append "]+?" to the end of result.
    regexp.append("]+?"sv);

    // 4. Return result.
    return regexp.to_string_without_validation();
}
// https://urlpattern.spec.whatwg.org/#generate-a-pattern-string
// Serializes `part_list` back into URL pattern syntax, following the spec algorithm step by step.
// `options` supplies the prefix and delimiter code points that decide when a part must be wrapped
// in an explicit "{...}" group.
String generate_a_pattern_string(ReadonlySpan<Part> part_list, Options const& options)
{
    // 1. Let result be the empty string.
    StringBuilder result;

    // 2. Let index list be the result of getting the indices for part list.
    // 3. For each index of index list:
    for (size_t index = 0; index < part_list.size(); ++index) {
        // 1. Let part be part list[index].
        auto const& part = part_list[index];

        // 2. Let previous part be part list[index - 1] if index is greater than 0, otherwise let it be null.
        Part const* previous_part = index > 0 ? &part_list[index - 1] : nullptr;

        // 3. Let next part be part list[index + 1] if index is less than index list's size - 1, otherwise let it be null.
        Part const* next_part = index + 1 < part_list.size() ? &part_list[index + 1] : nullptr;

        // 4. If part's type is "fixed-text" then:
        if (part.type == Part::Type::FixedText) {
            // 1. If part's modifier is "none" then:
            if (part.modifier == Part::Modifier::None) {
                // 1. Append the result of running escape a pattern string given part's value to the end of result.
                result.append(escape_a_pattern_string(part.value));

                // 2. Continue.
                continue;
            }

            // 2. Append "{" to the end of result.
            result.append('{');

            // 3. Append the result of running escape a pattern string given part's value to the end of result.
            result.append(escape_a_pattern_string(part.value));

            // 4. Append "}" to the end of result.
            result.append('}');

            // 5. Append the result of running convert a modifier to a string given part's modifier to the end of result.
            result.append(Part::convert_modifier_to_string(part.modifier));

            // 6. Continue.
            continue;
        }

        // 5. Let custom name be true if part's name[0] is not an ASCII digit; otherwise false.
        bool custom_name = !is_ascii_digit(part.name.bytes()[0]);

        // 6. Let needs grouping be true if at least one of the following are true, otherwise let it be false:
        //    * part's suffix is not the empty string.
        //    * part's prefix is not the empty string and is not options's prefix code point.
        // NOTE: When options has no prefix code point, any non-empty prefix is by definition "not
        //       options's prefix code point", so grouping is required in that case as well.
        bool needs_grouping = !part.suffix.is_empty()
            || (!part.prefix.is_empty()
                && (!options.prefix_code_point.has_value() || part.prefix != String::from_code_point(*options.prefix_code_point)));

        // 7. If all of the following are true:
        //    * needs grouping is false; and
        //    * custom name is true; and
        //    * part's type is "segment-wildcard"; and
        //    * part's modifier is "none"; and
        //    * next part is not null; and
        //    * next part's prefix is the empty string; and
        //    * next part's suffix is the empty string
        //    then:
        if (!needs_grouping
            && custom_name
            && part.type == Part::Type::SegmentWildcard
            && part.modifier == Part::Modifier::None
            && next_part != nullptr
            && next_part->prefix.is_empty()
            && next_part->suffix.is_empty()) {
            // 1. If next part's type is "fixed-text":
            if (next_part->type == Part::Type::FixedText) {
                // 1. Set needs grouping to true if the result of running is a valid name code point given next part's
                //    value's first code point and the boolean false is true.
                // FIXME: Raise spec bug, the language here is weird.
                // NOTE: needs grouping is known to be false here, so a plain assignment is equivalent.
                needs_grouping = Tokenizer::is_a_valid_name_code_point(*next_part->value.code_points().begin(), false);
            }
            // 2. Otherwise:
            else {
                // 1. Set needs grouping to true if next part's name[0] is an ASCII digit.
                needs_grouping = is_ascii_digit(*next_part->name.code_points().begin());
            }
        }

        // 8. If all of the following are true:
        //    * needs grouping is false; and
        //    * part's prefix is the empty string; and
        //    * previous part is not null; and
        //    * previous part's type is "fixed-text"; and
        //    * previous part's value's last code point is options's prefix code point.
        //    then set needs grouping to true.
        // NOTE: Only the last code point of the previous value is compared, not the whole value. The
        //       "empty value and no prefix code point" case is kept as the degenerate match of two empty strings.
        if (!needs_grouping
            && part.prefix.is_empty()
            && previous_part != nullptr
            && previous_part->type == Part::Type::FixedText
            && ((previous_part->value.is_empty() && !options.prefix_code_point.has_value())
                || (options.prefix_code_point.has_value() && previous_part->value.ends_with(*options.prefix_code_point)))) {
            needs_grouping = true;
        }

        // 9. Assert: part's name is not the empty string or null.
        VERIFY(!part.name.is_empty());

        // 10. If needs grouping is true, then append "{" to the end of result.
        if (needs_grouping)
            result.append('{');

        // 11. Append the result of running escape a pattern string given part's prefix to the end of result.
        result.append(escape_a_pattern_string(part.prefix));

        // 12. If custom name is true:
        if (custom_name) {
            // 1. Append ":" to the end of result.
            result.append(':');

            // 2. Append part's name to the end of result.
            result.append(part.name);
        }

        // 13. If part's type is "regexp" then:
        if (part.type == Part::Type::Regexp) {
            // 1. Append "(" to the end of result.
            result.append('(');

            // 2. Append part's value to the end of result.
            result.append(part.value);

            // 3. Append ")" to the end of result.
            result.append(')');
        }
        // 14. Otherwise if part's type is "segment-wildcard" and custom name is false:
        else if (part.type == Part::Type::SegmentWildcard && !custom_name) {
            // 1. Append "(" to the end of result.
            result.append('(');

            // 2. Append the result of running generate a segment wildcard regexp given options to the end of result.
            result.append(generate_a_segment_wildcard_regexp(options));

            // 3. Append ")" to the end of result.
            result.append(')');
        }
        // 15. Otherwise if part's type is "full-wildcard":
        else if (part.type == Part::Type::FullWildcard) {
            // 1. If custom name is false and one of the following is true:
            //    * previous part is null; or
            //    * previous part's type is "fixed-text"; or
            //    * previous part's modifier is not "none"; or
            //    * needs grouping is true; or
            //    * part's prefix is not the empty string
            //    then append "*" to the end of result.
            if (!custom_name
                && (previous_part == nullptr
                    || previous_part->type == Part::Type::FixedText
                    || previous_part->modifier != Part::Modifier::None
                    || needs_grouping
                    || !part.prefix.is_empty())) {
                result.append('*');
            }
            // 2. Otherwise:
            else {
                // 1. Append "(" to the end of result.
                result.append('(');

                // 2. Append full wildcard regexp value to the end of result.
                result.append(full_wildcard_regexp_value);

                // 3. Append ")" to the end of result.
                result.append(')');
            }
        }

        // 16. If all of the following are true:
        //     * part's type is "segment-wildcard"; and
        //     * custom name is true; and
        //     * part's suffix is not the empty string; and
        //     * The result of running is a valid name code point given part's suffix's first code point and the boolean false is true
        //     then append U+005C (\) to the end of result.
        if (part.type == Part::Type::SegmentWildcard
            && custom_name
            && !part.suffix.is_empty()
            && Tokenizer::is_a_valid_name_code_point(*part.suffix.code_points().begin(), false)) {
            result.append('\\');
        }

        // 17. Append the result of running escape a pattern string given part's suffix to the end of result.
        result.append(escape_a_pattern_string(part.suffix));

        // 18. If needs grouping is true, then append "}" to the end of result.
        if (needs_grouping)
            result.append('}');

        // 19. Append the result of running convert a modifier to a string given part's modifier to the end of result.
        result.append(Part::convert_modifier_to_string(part.modifier));
    }

    // 4. Return result.
    return result.to_string_without_validation();
}
}