mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-06-10 18:10:56 +09:00
LibUnicode: Implement grapheme segmentation
This commit is contained in:
parent
44e8c05c67
commit
58b0eed6a7
Notes:
sideshowbarker
2024-07-17 19:55:09 +09:00
Author: https://github.com/IdanHo
Commit: 58b0eed6a7
Pull-request: https://github.com/SerenityOS/serenity/pull/12229
Reviewed-by: https://github.com/trflynn89 ✅
2 changed files with 95 additions and 0 deletions
|
@ -8,6 +8,7 @@
|
|||
#include <AK/Platform.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/Locale.h>
|
||||
|
@ -357,4 +358,96 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB
|
|||
// Weakly-linked definition that ignores both arguments and always answers "no".
// NOTE(review): presumably replaced by a strong definition when generated Unicode data is linked in - confirm.
bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty)
{
    return false;
}
|
||||
// Weakly-linked definition that ignores both arguments and always answers "no".
// NOTE(review): presumably replaced by a strong definition when generated Unicode data is linked in - confirm.
bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty)
{
    return false;
}
|
||||
|
||||
// Computes the extended grapheme cluster boundaries of `view` following the rules of UAX #29.
//
// The returned offsets are expressed in UTF-16 code units. For a non-empty view the result always
// starts with 0 (GB1) and ends with view.length_in_code_units() (GB2); every offset in between is
// a position at which a grapheme cluster break is allowed. An empty view yields an empty vector,
// as does any input when ENABLE_UNICODE_DATA is not set.
Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
{
#if ENABLE_UNICODE_DATA
    using GBP = GraphemeBreakProperty;
    Vector<size_t> boundaries;

    // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    auto const code_point_count = view.length_in_code_points();
    if (code_point_count == 0)
        return boundaries;

    auto has_any_gbp = [](u32 code_point, auto&&... properties) {
        return (code_point_has_grapheme_break_property(code_point, properties) || ...);
    };

    // GB1
    boundaries.append(0);

    if (code_point_count > 1) {
        auto it = view.begin();
        auto code_point = *it;
        u32 next_code_point = 0;

        // Number of consecutive Regional_Indicator code points ending at (and including) code_point.
        auto current_ri_chain = 0;
        // True while the code points up to (and including) code_point end with `ExtPict Extend*`.
        auto in_emoji_sequence = false;

        // Each iteration decides whether a break is allowed between code_point and next_code_point.
        for (++it; it != view.end(); ++it, code_point = next_code_point) {
            next_code_point = *it;

            // The look-behind state used by GB11 and GB12/GB13 has to observe every code point
            // exactly once. It is therefore advanced here, before any rule gets a chance to
            // `continue`; otherwise code points consumed by an earlier rule (e.g. a line break
            // between two flags) would leave stale state behind.
            auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);

            // GB11 needs `ExtPict Extend* ZWJ` directly before the break, so the ZWJ must follow
            // the pictographic prefix as it was *before* code_point is folded into the state.
            auto zwj_terminates_emoji_prefix = in_emoji_sequence && code_point_is_zwj;

            if (code_point_has_property(code_point, Property::Extended_Pictographic))
                in_emoji_sequence = true; // Starts (or restarts, after a ZWJ) a pictographic prefix.
            else if (!has_any_gbp(code_point, GBP::Extend))
                in_emoji_sequence = false; // Anything but Extend (including ZWJ) ends the prefix.

            auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
            current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;

            auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
            auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);

            // GB3
            if (code_point_is_cr && next_code_point_is_lf)
                continue;
            // GB4, GB5
            if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
                boundaries.append(view.code_unit_offset_of(it));
                continue;
            }

            auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
            auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);

            // GB6
            if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
                continue;
            // GB7
            if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
                continue;
            // GB8
            if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
                continue;

            // GB9
            if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
                continue;
            // GB9a
            if (has_any_gbp(next_code_point, GBP::SpacingMark))
                continue;
            // GB9b
            if (has_any_gbp(code_point, GBP::Prepend))
                continue;

            // GB11
            if (zwj_terminates_emoji_prefix && code_point_has_property(next_code_point, Property::Extended_Pictographic))
                continue;

            // GB12, GB13
            // An odd chain length means code_point is the first half of a flag pair.
            if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
                continue;

            // GB999
            boundaries.append(view.code_unit_offset_of(it));
        }
    }

    // GB2
    boundaries.append(view.length_in_code_units());
    return boundaries;
#else
    return {};
#endif
}
|
||||
|
||||
}
|
||||
|
|
|
@ -44,4 +44,6 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert
|
|||
bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
|
||||
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
|
||||
|
||||
// Returns the grapheme cluster boundaries of the given view as UTF-16 code unit offsets.
// For a non-empty view the result begins with 0 and ends with the view's length in code units.
// The result is empty for an empty view, or when the library is built without ENABLE_UNICODE_DATA.
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue