diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index 490caec9d94..8e6d33ce086 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -13,9 +13,6 @@ set(UCD_ZIP_PATH "${UCD_PATH}/UCD.zip") set(UNICODE_DATA_SOURCE "UnicodeData.txt") set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}") -set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt") -set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}") - set(PROP_VALUE_ALIAS_SOURCE "PropertyValueAliases.txt") set(PROP_VALUE_ALIAS_PATH "${UCD_PATH}/${PROP_VALUE_ALIAS_SOURCE}") @@ -39,7 +36,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) if (ENABLE_NETWORK_DOWNLOADS) download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_VALUE_ALIAS_SOURCE}" "${PROP_VALUE_ALIAS_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPTS_SOURCE}" "${SCRIPTS_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}") @@ -67,7 +63,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) "${UCD_VERSION_FILE}" "${UNICODE_DATA_HEADER}" "${UNICODE_DATA_IMPLEMENTATION}" - arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" + arguments -u "${UNICODE_DATA_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" ) invoke_generator( "EmojiData" diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 327e53b29db..d1024bb9537 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -60,17 +60,12 @@ struct CodePointBidiClass { struct UnicodeData { Vector code_point_data; - // https://www.unicode.org/reports/tr44/#General_Category_Values - PropList general_categories; - Vector general_category_aliases; - PropList script_list { { "Unknown"sv, {} }, }; Vector script_aliases; PropList script_extensions; - CodePointTables general_category_tables; CodePointTables script_tables; CodePointTables script_extension_tables; @@ -290,7 +285,6 @@ enum class @name@ : @underlying@ {)~~~"); namespace Unicode { )~~~"); - generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values()); @@ -405,7 +399,6 @@ static constexpr Array<@type@, @size@> @name@ { { return {}; }; - TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table)); TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table)); TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables, append_property_table)); @@ -489,9 +482,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) return {}; }; - TRY(append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv)); - TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases)); - TRY(append_prop_search("Script"sv, "script"sv, "s_scripts"sv)); TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv)); TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases)); @@ -561,29 +551,6 @@ static void sort_and_merge_code_point_ranges(Vector& co code_points = form_code_point_ranges(all_code_points); } -static void populate_general_category_unions(PropList& general_categories) -{ - // The Unicode standard defines General Category values which are not in any UCD file. These - // values are simply unions of other values. - // https://www.unicode.org/reports/tr44/#GC_Values_Table - auto populate_union = [&](auto alias, auto categories) { - auto& code_points = general_categories.ensure(alias); - for (auto const& category : categories) - code_points.extend(general_categories.find(category)->value); - - sort_and_merge_code_point_ranges(code_points); - }; - - populate_union("LC"sv, Array { "Ll"sv, "Lu"sv, "Lt"sv }); - populate_union("L"sv, Array { "Lu"sv, "Ll"sv, "Lt"sv, "Lm"sv, "Lo"sv }); - populate_union("M"sv, Array { "Mn"sv, "Mc"sv, "Me"sv }); - populate_union("N"sv, Array { "Nd"sv, "Nl"sv, "No"sv }); - populate_union("P"sv, Array { "Pc"sv, "Pd"sv, "Ps"sv, "Pe"sv, "Pi"sv, "Pf"sv, "Po"sv }); - populate_union("S"sv, Array { "Sm"sv, "Sc"sv, "Sk"sv, "So"sv }); - populate_union("Z"sv, Array { "Zs"sv, "Zl"sv, "Zp"sv }); - populate_union("C"sv, Array { "Cc"sv, "Cf"sv, "Cs"sv, "Co"sv, "Cn"sv }); -} - static ErrorOr normalize_script_extensions(PropList& script_extensions, PropList const& script_list, Vector const& script_aliases) { // The ScriptExtensions UCD file lays out its code point ranges rather uniquely compared to @@ -755,12 +722,10 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) return {}; }; - auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories)); auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list)); auto script_extension_metadata = TRY(PropertyMetadata::create(unicode_data.script_extensions)); for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) { - TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata)); TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata)); TRY(update_property_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata)); } @@ -773,7 +738,6 @@ ErrorOr serenity_main(Main::Arguments arguments) StringView generated_header_path; StringView generated_implementation_path; StringView unicode_data_path; - StringView derived_general_category_path; StringView prop_value_alias_path; StringView scripts_path; StringView script_extensions_path; @@ -782,7 +746,6 @@ ErrorOr serenity_main(Main::Arguments arguments) args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); - args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path"); args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path"); args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path"); args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path"); @@ -791,19 +754,15 @@ ErrorOr serenity_main(Main::Arguments arguments) auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write)); auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read)); - auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::File::OpenMode::Read)); auto prop_value_alias_file = TRY(open_file(prop_value_alias_path, Core::File::OpenMode::Read)); auto scripts_file = TRY(open_file(scripts_path, Core::File::OpenMode::Read)); auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read)); UnicodeData unicode_data {}; - TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories)); TRY(parse_prop_list(*scripts_file, unicode_data.script_list)); TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true)); - populate_general_category_unions(unicode_data.general_categories); TRY(parse_unicode_data(*unicode_data_file, unicode_data)); - TRY(parse_value_alias_list(*prop_value_alias_file, "gc"sv, unicode_data.general_categories.keys(), unicode_data.general_category_aliases)); TRY(parse_value_alias_list(*prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), unicode_data.script_aliases, false)); TRY(normalize_script_extensions(unicode_data.script_extensions, unicode_data.script_list, unicode_data.script_aliases)); diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp index 1c7a91e8dcd..41de0ea0934 100644 --- a/Userland/Libraries/LibJS/Lexer.cpp +++ b/Userland/Libraries/LibJS/Lexer.cpp @@ -421,11 +421,7 @@ bool Lexer::is_whitespace() const auto code_point = current_code_point(); if (code_point == NO_BREAK_SPACE || code_point == ZERO_WIDTH_NO_BREAK_SPACE) return true; - - static auto space_separator_category = Unicode::general_category_from_string("Space_Separator"sv); - if (space_separator_category.has_value()) - return Unicode::code_point_has_general_category(code_point, *space_separator_category); - return false; + return Unicode::code_point_has_space_separator_general_category(code_point); } // UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index dcceb3ee9fd..36d55d502a0 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -780,15 +780,11 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp bool OpCode_Compare::matches_character_class(CharClass character_class, u32 ch, bool insensitive) { constexpr auto is_space_or_line_terminator = [](u32 code_point) { - static auto space_separator = Unicode::general_category_from_string("Space_Separator"sv); - if (!space_separator.has_value()) - return is_ascii_space(code_point); - if ((code_point == 0x0a) || (code_point == 0x0d) || (code_point == 0x2028) || (code_point == 0x2029)) return true; if ((code_point == 0x09) || (code_point == 0x0b) || (code_point == 0x0c) || (code_point == 0xfeff)) return true; - return Unicode::code_point_has_general_category(code_point, *space_separator); + return Unicode::code_point_has_space_separator_general_category(code_point); }; switch (character_class) { diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index e51bca2f2f7..f620f27846f 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1647,7 +1647,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() }); }, [&](Unicode::GeneralCategory general_category) { - compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category }); + compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() }); }, [&](Script script) { if (script.is_extension) @@ -1998,7 +1998,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& if (atom.is_property) ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property.value()) }); else if (atom.is_general_category) - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) }); + ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category.value()) }); else if (atom.is_script) ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) }); else if (atom.is_script_extension) @@ -2335,7 +2335,7 @@ bool ECMA262Parser::parse_class_set_operand(Vector& c compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() }); }, [&](Unicode::GeneralCategory general_category) { - compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category }); + compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() }); }, [&](Script script) { if (script.is_extension) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 5771ff33d02..514f2e5087b 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -37,8 +37,84 @@ struct AK::Traits> { namespace Unicode { -Optional __attribute__((weak)) general_category_from_string(StringView) { return {}; } -bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; } +static constexpr GeneralCategory GENERAL_CATEGORY_CASED_LETTER = U_CHAR_CATEGORY_COUNT + 1; +static constexpr GeneralCategory GENERAL_CATEGORY_LETTER = U_CHAR_CATEGORY_COUNT + 2; +static constexpr GeneralCategory GENERAL_CATEGORY_MARK = U_CHAR_CATEGORY_COUNT + 3; +static constexpr GeneralCategory GENERAL_CATEGORY_NUMBER = U_CHAR_CATEGORY_COUNT + 4; +static constexpr GeneralCategory GENERAL_CATEGORY_PUNCTUATION = U_CHAR_CATEGORY_COUNT + 5; +static constexpr GeneralCategory GENERAL_CATEGORY_SYMBOL = U_CHAR_CATEGORY_COUNT + 6; +static constexpr GeneralCategory GENERAL_CATEGORY_SEPARATOR = U_CHAR_CATEGORY_COUNT + 7; +static constexpr GeneralCategory GENERAL_CATEGORY_OTHER = U_CHAR_CATEGORY_COUNT + 8; +static constexpr GeneralCategory GENERAL_CATEGORY_LIMIT = U_CHAR_CATEGORY_COUNT + 9; + +Optional general_category_from_string(StringView general_category) +{ + static auto general_category_names = []() { + Array, GENERAL_CATEGORY_LIMIT.value()> names; + + auto set_names = [&](auto property, auto index, auto general_category) { + if (char const* name = u_getPropertyValueName(property, general_category, U_LONG_PROPERTY_NAME)) + names[index.value()].long_name = StringView { name, strlen(name) }; + if (char const* name = u_getPropertyValueName(property, general_category, U_SHORT_PROPERTY_NAME)) + names[index.value()].short_name = StringView { name, strlen(name) }; + if (char const* name = u_getPropertyValueName(property, general_category, ADDITIONAL_NAME)) + names[index.value()].additional_name = StringView { name, strlen(name) }; + }; + + for (GeneralCategory general_category = 0; general_category < U_CHAR_CATEGORY_COUNT; ++general_category) + set_names(UCHAR_GENERAL_CATEGORY, general_category, static_cast(general_category.value())); + + set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_CASED_LETTER, U_GC_LC_MASK); + set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_LETTER, U_GC_L_MASK); + set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_MARK, U_GC_M_MASK); + set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_NUMBER, U_GC_N_MASK); + set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_PUNCTUATION, U_GC_P_MASK); + set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_SYMBOL, U_GC_S_MASK); + set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_SEPARATOR, U_GC_Z_MASK); + set_names(UCHAR_GENERAL_CATEGORY_MASK, GENERAL_CATEGORY_OTHER, U_GC_C_MASK); + + return names; + }(); + + if (auto index = find_index(general_category_names.begin(), general_category_names.end(), general_category); index != general_category_names.size()) + return static_cast(index); + return {}; +} + +bool code_point_has_general_category(u32 code_point, GeneralCategory general_category) +{ + auto icu_code_point = static_cast(code_point); + auto icu_general_category = static_cast(general_category.value()); + + if (general_category == GENERAL_CATEGORY_CASED_LETTER) + return (U_GET_GC_MASK(icu_code_point) & U_GC_LC_MASK) != 0; + if (general_category == GENERAL_CATEGORY_LETTER) + return (U_GET_GC_MASK(icu_code_point) & U_GC_L_MASK) != 0; + if (general_category == GENERAL_CATEGORY_MARK) + return (U_GET_GC_MASK(icu_code_point) & U_GC_M_MASK) != 0; + if (general_category == GENERAL_CATEGORY_NUMBER) + return (U_GET_GC_MASK(icu_code_point) & U_GC_N_MASK) != 0; + if (general_category == GENERAL_CATEGORY_PUNCTUATION) + return (U_GET_GC_MASK(icu_code_point) & U_GC_P_MASK) != 0; + if (general_category == GENERAL_CATEGORY_SYMBOL) + return (U_GET_GC_MASK(icu_code_point) & U_GC_S_MASK) != 0; + if (general_category == GENERAL_CATEGORY_SEPARATOR) + return (U_GET_GC_MASK(icu_code_point) & U_GC_Z_MASK) != 0; + if (general_category == GENERAL_CATEGORY_OTHER) + return (U_GET_GC_MASK(icu_code_point) & U_GC_C_MASK) != 0; + + return u_charType(icu_code_point) == icu_general_category; +} + +bool code_point_has_control_general_category(u32 code_point) +{ + return code_point_has_general_category(code_point, U_CONTROL_CHAR); +} + +bool code_point_has_space_separator_general_category(u32 code_point) +{ + return code_point_has_general_category(code_point, U_SPACE_SEPARATOR); +} static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1; static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2; diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index e343c30108e..0674bd4cbfa 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -29,6 +29,9 @@ struct CodePointRangeComparator { Optional general_category_from_string(StringView); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); +bool code_point_has_control_general_category(u32 code_point); +bool code_point_has_space_separator_general_category(u32 code_point); + Optional property_from_string(StringView); bool code_point_has_property(u32 code_point, Property property); diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h index 8b6d65bc9b0..749d09c2c8b 100644 --- a/Userland/Libraries/LibUnicode/Forward.h +++ b/Userland/Libraries/LibUnicode/Forward.h @@ -13,12 +13,12 @@ namespace Unicode { enum class BidirectionalClass : u8; enum class EmojiGroup : u8; -enum class GeneralCategory : u8; enum class Script : u8; struct CurrencyCode; struct Emoji; +AK_TYPEDEF_DISTINCT_NUMERIC_GENERAL(u32, GeneralCategory, CastToUnderlying, Comparison, Increment); AK_TYPEDEF_DISTINCT_NUMERIC_GENERAL(u32, Property, CastToUnderlying, Comparison, Increment); } diff --git a/Userland/Libraries/LibWeb/UIEvents/KeyboardEvent.cpp b/Userland/Libraries/LibWeb/UIEvents/KeyboardEvent.cpp index 5cd8357df2f..67a48a8451c 100644 --- a/Userland/Libraries/LibWeb/UIEvents/KeyboardEvent.cpp +++ b/Userland/Libraries/LibWeb/UIEvents/KeyboardEvent.cpp @@ -246,11 +246,7 @@ static ErrorOr> get_event_key_string(u32 code_point) auto is_non_control_character = [&]() { // A non-control character is any valid Unicode character except those that are part of the "Other, Control" // ("Cc") General Category. - static auto control_general_category = Unicode::general_category_from_string("Cc"sv); - if (!control_general_category.has_value()) - return true; - - return !Unicode::code_point_has_general_category(code_point, *control_general_category); + return !Unicode::code_point_has_control_general_category(code_point); }; // A key string is a string containing a 0 or 1 non-control characters ("base" characters) followed by 0 or more