From 0a6363d3e93ac880e00a4d53d0e24b8c6e07fded Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Wed, 20 Jul 2022 13:52:36 -0400
Subject: [PATCH] LibUnicode: Implement the range pattern processing algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This algorithm is to inject spacing around the range separator under
certain conditions. For example, in en-US, the range [3, 5] should be
formatted as "3–5" if unitless, but as "$3 – $5" for currency.
---
 .../Libraries/LibUnicode/NumberFormat.cpp     | 54 +++++++++++++++----
 Userland/Libraries/LibUnicode/NumberFormat.h  |  1 +
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/Userland/Libraries/LibUnicode/NumberFormat.cpp b/Userland/Libraries/LibUnicode/NumberFormat.cpp
index b15322fd78a..38adf51c329 100644
--- a/Userland/Libraries/LibUnicode/NumberFormat.cpp
+++ b/Userland/Libraries/LibUnicode/NumberFormat.cpp
@@ -50,6 +50,17 @@ String replace_digits_for_number_system(StringView system, StringView number)
     return builder.build();
 }
 
+static u32 last_code_point(StringView string)
+{
+    Utf8View utf8_string { string };
+    u32 code_point = 0;
+
+    for (auto it = utf8_string.begin(); it != utf8_string.end(); ++it)
+        code_point = *it;
+
+    return code_point;
+}
+
 // https://www.unicode.org/reports/tr35/tr35-numbers.html#Currencies
 Optional<String> augment_currency_format_pattern([[maybe_unused]] StringView currency_display, [[maybe_unused]] StringView base_pattern)
 {
@@ -67,16 +78,6 @@ Optional<String> augment_currency_format_pattern([[maybe_unused]] StringView cur
     Utf8View utf8_currency_display { currency_display };
     Optional<String> currency_key_with_spacing;
 
-    auto last_code_point = [](StringView string) {
-        Utf8View utf8_string { string };
-        u32 code_point = 0;
-
-        for (auto it = utf8_string.begin(); it != utf8_string.end(); ++it)
-            code_point = *it;
-
-        return code_point;
-    };
-
     if (*number_index < *currency_index) {
         u32 last_pattern_code_point = last_code_point(base_pattern.substring_view(0, *currency_index));
 
@@ -104,4 +105,37 @@ Optional<String> augment_currency_format_pattern([[maybe_unused]] StringView cur
     return {};
 }
 
+// https://unicode.org/reports/tr35/tr35-numbers.html#83-range-pattern-processing
+Optional<String> augment_range_pattern(StringView range_separator, StringView lower, StringView upper)
+{
+#if ENABLE_UNICODE_DATA
+    auto range_pattern_with_spacing = [&]() {
+        return String::formatted(" {} ", range_separator);
+    };
+
+    Utf8View utf8_range_separator { range_separator };
+    Utf8View utf8_upper { upper };
+
+    // NOTE: Our implementation does the prescribed checks backwards for simplicity.
+
+    // To determine whether to add spacing, the currently recommended heuristic is:
+    // 2. If the range pattern does not contain a character having the White_Space binary Unicode property after the {0} or before the {1} placeholders.
+    for (auto it = utf8_range_separator.begin(); it != utf8_range_separator.end(); ++it) {
+        if (code_point_has_property(*it, Property::White_Space))
+            return {};
+    }
+
+    // 1. If the lower string ends with a character other than a digit, or if the upper string begins with a character other than a digit.
+    if (auto it = utf8_upper.begin(); it != utf8_upper.end()) {
+        if (!code_point_has_general_category(*it, GeneralCategory::Decimal_Number))
+            return range_pattern_with_spacing();
+    }
+
+    if (!code_point_has_general_category(last_code_point(lower), GeneralCategory::Decimal_Number))
+        return range_pattern_with_spacing();
+#endif
+
+    return {};
+}
+
 }
diff --git a/Userland/Libraries/LibUnicode/NumberFormat.h b/Userland/Libraries/LibUnicode/NumberFormat.h
index 38bb279d03d..315416977d1 100644
--- a/Userland/Libraries/LibUnicode/NumberFormat.h
+++ b/Userland/Libraries/LibUnicode/NumberFormat.h
@@ -71,5 +71,6 @@ Vector<NumberFormat> get_compact_number_system_formats(StringView locale, String
 Vector<NumberFormat> get_unit_formats(StringView locale, StringView unit, Style style);
 
 Optional<String> augment_currency_format_pattern(StringView currency_display, StringView base_pattern);
+Optional<String> augment_range_pattern(StringView range_separator, StringView lower, StringView upper);
 
 }