From 8ba273a2f38320e432fb7bfae80729f6a02d0019 Mon Sep 17 00:00:00 2001 From: AnotherTest Date: Thu, 19 Nov 2020 01:50:00 +0330 Subject: [PATCH] LibJS: Hook up Regex to RegExpObject and implement `test()' This makes RegExpObject compile and store a Regex, adds all flag-related properties, and implements `RegExpPrototype.test()` (complete with 'lastIndex' support) :^) It should be noted that this only implements `test()' using the builtin `exec()'. --- Libraries/LibC/regex.h | 1 + Libraries/LibJS/CMakeLists.txt | 2 +- Libraries/LibJS/Runtime/CommonPropertyNames.h | 10 ++ Libraries/LibJS/Runtime/ErrorTypes.h | 3 + Libraries/LibJS/Runtime/RegExpObject.cpp | 118 ++++++++++++++++ Libraries/LibJS/Runtime/RegExpObject.h | 15 ++ Libraries/LibJS/Runtime/RegExpPrototype.cpp | 129 ++++++++++++++++++ Libraries/LibJS/Runtime/RegExpPrototype.h | 12 ++ .../builtins/RegExp/RegExp.prototype.test.js | 58 ++++++++ Libraries/LibRegex/RegexMatch.h | 1 + Libraries/LibRegex/RegexMatcher.cpp | 45 ++++-- Libraries/LibRegex/RegexMatcher.h | 7 + Libraries/LibRegex/RegexOptions.h | 7 +- 13 files changed, 396 insertions(+), 12 deletions(-) create mode 100644 Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.test.js diff --git a/Libraries/LibC/regex.h b/Libraries/LibC/regex.h index 3fbf8fb4c3c..3fbcadecd75 100644 --- a/Libraries/LibC/regex.h +++ b/Libraries/LibC/regex.h @@ -97,6 +97,7 @@ enum __RegexAllFlags { __Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended. __Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one. __Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results. + __Regex_Internal_Stateful = __Regex_Global << 14, // Internal flag; enables stateful matches. __Regex_Last = __Regex_SkipTrimEmptyMatches }; diff --git a/Libraries/LibJS/CMakeLists.txt b/Libraries/LibJS/CMakeLists.txt index 200202e214e..03323a6b70b 100644 --- a/Libraries/LibJS/CMakeLists.txt +++ b/Libraries/LibJS/CMakeLists.txt @@ -79,4 +79,4 @@ set(SOURCES ) serenity_lib(LibJS js) -target_link_libraries(LibJS LibM LibCore LibCrypto) +target_link_libraries(LibJS LibM LibCore LibCrypto LibRegex) diff --git a/Libraries/LibJS/Runtime/CommonPropertyNames.h b/Libraries/LibJS/Runtime/CommonPropertyNames.h index b474726054d..5b6ed4b613f 100644 --- a/Libraries/LibJS/Runtime/CommonPropertyNames.h +++ b/Libraries/LibJS/Runtime/CommonPropertyNames.h @@ -86,6 +86,7 @@ namespace JS { P(deleteProperty) \ P(description) \ P(done) \ + P(dotAll) \ P(entries) \ P(enumerable) \ P(error) \ @@ -96,6 +97,7 @@ namespace JS { P(filter) \ P(find) \ P(findIndex) \ + P(flags) \ P(floor) \ P(forEach) \ P(from) \ @@ -122,9 +124,11 @@ namespace JS { P(getUTCMinutes) \ P(getUTCMonth) \ P(getUTCSeconds) \ + P(global) \ P(globalThis) \ P(has) \ P(hasOwnProperty) \ + P(ignoreCase) \ P(includes) \ P(indexOf) \ P(info) \ @@ -138,6 +142,7 @@ namespace JS { P(join) \ P(keyFor) \ P(keys) \ + P(lastIndex) \ P(lastIndexOf) \ P(length) \ P(log) \ @@ -146,6 +151,7 @@ namespace JS { P(max) \ P(message) \ P(min) \ + P(multiline) \ P(name) \ P(next) \ P(now) \ @@ -174,12 +180,15 @@ namespace JS { P(sin) \ P(slice) \ P(some) \ + P(source) \ P(splice) \ P(sqrt) \ P(startsWith) \ P(stringify) \ + P(sticky) \ P(substring) \ P(tan) \ + P(test) \ P(toDateString) \ P(toISOString) \ P(toJSON) \ @@ -196,6 +205,7 @@ namespace JS { P(trimStart) \ P(trunc) \ P(undefined) \ + P(unicode) \ P(unshift) \ P(value) \ P(valueOf) \ diff --git a/Libraries/LibJS/Runtime/ErrorTypes.h b/Libraries/LibJS/Runtime/ErrorTypes.h index 7192c800612..edad984c653 100644 --- a/Libraries/LibJS/Runtime/ErrorTypes.h +++ b/Libraries/LibJS/Runtime/ErrorTypes.h @@ -146,6 +146,9 @@ M(ReflectBadArgumentsList, "Arguments list must be an object") \ M(ReflectBadNewTarget, "Optional third argument of Reflect.construct() must be a constructor") \ M(ReflectBadDescriptorArgument, "Descriptor argument is not an object") \ + M(RegExpCompileError, "RegExp compile error: '{}'") \ + M(RegExpObjectBadFlag, "Invalid RegExp flag '{}'") \ + M(RegExpObjectRepeatedFlag, "Repeated RegExp flag '{}'") \ M(StringRawCannotConvert, "Cannot convert property 'raw' to object from {}") \ M(StringRepeatCountMustBe, "repeat count must be a {} number") \ M(ThisHasNotBeenInitialized, "|this| has not been initialized") \ diff --git a/Libraries/LibJS/Runtime/RegExpObject.cpp b/Libraries/LibJS/Runtime/RegExpObject.cpp index 6959034f525..da72f819ffe 100644 --- a/Libraries/LibJS/Runtime/RegExpObject.cpp +++ b/Libraries/LibJS/Runtime/RegExpObject.cpp @@ -24,6 +24,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include #include #include #include @@ -33,6 +34,73 @@ namespace JS { +static Flags options_from(const String& flags, VM& vm, GlobalObject& global_object) +{ + bool g = false, i = false, m = false, s = false, u = false, y = false; + Flags options { + { (regex::ECMAScriptFlags)regex::AllFlags::Global }, // JS regexps are all 'global' by default as per our definition, but the "global" flag enables "stateful". + {}, + }; + + for (auto ch : flags) { + switch (ch) { + case 'g': + if (g) + vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + g = true; + options.effective_flags |= regex::ECMAScriptFlags::Global; + options.declared_flags |= regex::ECMAScriptFlags::Global; + break; + case 'i': + if (i) + vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + i = true; + options.effective_flags |= regex::ECMAScriptFlags::Insensitive; + options.declared_flags |= regex::ECMAScriptFlags::Insensitive; + break; + case 'm': + if (m) + vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + m = true; + options.effective_flags |= regex::ECMAScriptFlags::Multiline; + options.declared_flags |= regex::ECMAScriptFlags::Multiline; + break; + case 's': + if (s) + vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + s = true; + options.effective_flags |= regex::ECMAScriptFlags::SingleLine; + options.declared_flags |= regex::ECMAScriptFlags::SingleLine; + break; + case 'u': + if (u) + vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + u = true; + options.effective_flags |= regex::ECMAScriptFlags::Unicode; + options.declared_flags |= regex::ECMAScriptFlags::Unicode; + break; + case 'y': + if (y) + vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + y = true; + // Now for the more interesting flag, 'sticky' actually unsets 'global', part of which is the default. + options.effective_flags.reset_flag(regex::ECMAScriptFlags::Global); + // "What's the difference between sticky and global, then", that's simple. + // all the other flags imply 'global', and the "global" flag implies 'stateful'; + // however, the "sticky" flag does *not* imply 'global', only 'stateful'. + options.effective_flags |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful; + options.effective_flags |= regex::ECMAScriptFlags::Sticky; + options.declared_flags |= regex::ECMAScriptFlags::Sticky; + break; + default: + vm.throw_exception(global_object, ErrorType::RegExpObjectBadFlag, ch); + return options; + } + } + + return options; +} + RegExpObject* RegExpObject::create(GlobalObject& global_object, String pattern, String flags) { return global_object.heap().allocate(global_object, pattern, flags, *global_object.regexp_prototype()); @@ -42,11 +110,61 @@ RegExpObject::RegExpObject(String pattern, String flags, Object& prototype) : Object(prototype) , m_pattern(pattern) , m_flags(flags) + , m_active_flags(options_from(m_flags, this->vm(), this->global_object())) + , m_regex(pattern, m_active_flags.effective_flags) { + if (m_regex.parser_result.error != regex::Error::NoError) { + vm().throw_exception(global_object(), ErrorType::RegExpCompileError, m_regex.error_string()); + } +} + +void RegExpObject::initialize(GlobalObject& global_object) +{ + auto& vm = this->vm(); + Object::initialize(global_object); + + define_native_property(vm.names.lastIndex, last_index, set_last_index, Attribute::Writable); } RegExpObject::~RegExpObject() { } +static RegExpObject* regexp_object_from(VM& vm, GlobalObject& global_object) +{ + auto* this_object = vm.this_value(global_object).to_object(global_object); + if (!this_object) + return nullptr; + if (!this_object->is_regexp_object()) { + vm.throw_exception(global_object, ErrorType::NotA, "RegExp"); + return nullptr; + } + return static_cast(this_object); +} + +JS_DEFINE_NATIVE_GETTER(RegExpObject::last_index) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + return Value((unsigned)regexp_object->regex().start_offset); +} + +JS_DEFINE_NATIVE_SETTER(RegExpObject::set_last_index) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return; + + auto index = value.to_i32(global_object); + if (vm.exception()) + return; + + if (index < 0) + index = 0; + + regexp_object->regex().start_offset = index; +} + } diff --git a/Libraries/LibJS/Runtime/RegExpObject.h b/Libraries/LibJS/Runtime/RegExpObject.h index 341b8acf0a0..ef1651298ea 100644 --- a/Libraries/LibJS/Runtime/RegExpObject.h +++ b/Libraries/LibJS/Runtime/RegExpObject.h @@ -28,6 +28,12 @@ #include #include +#include + +struct Flags { + regex::RegexOptions effective_flags; + regex::RegexOptions declared_flags; +}; namespace JS { @@ -38,16 +44,25 @@ public: static RegExpObject* create(GlobalObject&, String pattern, String flags); RegExpObject(String pattern, String flags, Object& prototype); + virtual void initialize(GlobalObject&) override; virtual ~RegExpObject() override; const String& pattern() const { return m_pattern; } const String& flags() const { return m_flags; } + const regex::RegexOptions& declared_options() { return m_active_flags.declared_flags; } + const Regex& regex() { return m_regex; } + const Regex& regex() const { return m_regex; } private: virtual bool is_regexp_object() const override { return true; } + JS_DECLARE_NATIVE_GETTER(last_index); + JS_DECLARE_NATIVE_SETTER(set_last_index); + String m_pattern; String m_flags; + Flags m_active_flags; + Regex m_regex; }; } diff --git a/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Libraries/LibJS/Runtime/RegExpPrototype.cpp index 83f7c9fb8be..bbd21ed53cc 100644 --- a/Libraries/LibJS/Runtime/RegExpPrototype.cpp +++ b/Libraries/LibJS/Runtime/RegExpPrototype.cpp @@ -43,6 +43,17 @@ void RegExpPrototype::initialize(GlobalObject& global_object) Object::initialize(global_object); u8 attr = Attribute::Writable | Attribute::Configurable; define_native_function(vm.names.toString, to_string, 0, attr); + define_native_function(vm.names.test, test, 1, attr); + + u8 readable_attr = Attribute::Configurable; + define_native_property(vm.names.dotAll, dot_all, nullptr, readable_attr); + define_native_property(vm.names.flags, flags, nullptr, readable_attr); + define_native_property(vm.names.global, global, nullptr, readable_attr); + define_native_property(vm.names.ignoreCase, ignore_case, nullptr, readable_attr); + define_native_property(vm.names.multiline, multiline, nullptr, readable_attr); + define_native_property(vm.names.source, source, nullptr, readable_attr); + define_native_property(vm.names.sticky, sticky, nullptr, readable_attr); + define_native_property(vm.names.unicode, unicode, nullptr, readable_attr); } RegExpPrototype::~RegExpPrototype() @@ -61,6 +72,124 @@ static RegExpObject* regexp_object_from(VM& vm, GlobalObject& global_object) return static_cast(this_object); } +JS_DEFINE_NATIVE_GETTER(RegExpPrototype::dot_all) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + return Value(regexp_object->declared_options().has_flag_set(ECMAScriptFlags::SingleLine)); +} + +JS_DEFINE_NATIVE_GETTER(RegExpPrototype::flags) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + auto flags = regexp_object->declared_options(); + StringBuilder builder(8); + + if (flags.has_flag_set(ECMAScriptFlags::Global)) + builder.append('g'); + if (flags.has_flag_set(ECMAScriptFlags::Insensitive)) + builder.append('i'); + if (flags.has_flag_set(ECMAScriptFlags::Multiline)) + builder.append('m'); + if (flags.has_flag_set(ECMAScriptFlags::SingleLine)) + builder.append('s'); + if (flags.has_flag_set(ECMAScriptFlags::Unicode)) + builder.append('u'); + if (flags.has_flag_set(ECMAScriptFlags::Sticky)) + builder.append('y'); + + return js_string(vm, builder.to_string()); +} + +JS_DEFINE_NATIVE_GETTER(RegExpPrototype::global) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + return Value(regexp_object->declared_options().has_flag_set(ECMAScriptFlags::Global)); // Note that this "Global" is actually "Global | Stateful" +} + +JS_DEFINE_NATIVE_GETTER(RegExpPrototype::ignore_case) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + return Value(regexp_object->declared_options().has_flag_set(ECMAScriptFlags::Insensitive)); +} + +JS_DEFINE_NATIVE_GETTER(RegExpPrototype::multiline) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + return Value(regexp_object->declared_options().has_flag_set(ECMAScriptFlags::Multiline)); +} + +JS_DEFINE_NATIVE_GETTER(RegExpPrototype::source) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + return js_string(vm, regexp_object->pattern()); +} + +JS_DEFINE_NATIVE_GETTER(RegExpPrototype::sticky) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + return Value(regexp_object->declared_options().has_flag_set(ECMAScriptFlags::Sticky)); +} + +JS_DEFINE_NATIVE_GETTER(RegExpPrototype::unicode) +{ + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + return Value(regexp_object->declared_options().has_flag_set(ECMAScriptFlags::Unicode)); +} + +RegexResult RegExpPrototype::do_match(const Regex& re, const StringView& subject) +{ + auto result = re.match(subject); + // The 'lastIndex' property is reset on failing tests (if 'global') + if (!result.success && re.options().has_flag_set(ECMAScriptFlags::Global)) + re.start_offset = 0; + + return result; +} + +JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::test) +{ + // FIXME: This should try using dynamic properties for 'exec' first, + // before falling back to builtin_exec. + auto regexp_object = regexp_object_from(vm, global_object); + if (!regexp_object) + return {}; + + auto str = vm.argument(0).to_string(global_object); + if (vm.exception()) + return {}; + + // RegExps without "global" and "sticky" always start at offset 0. + if (!regexp_object->regex().options().has_flag_set((ECMAScriptFlags)regex::AllFlags::Internal_Stateful)) + regexp_object->regex().start_offset = 0; + + auto result = do_match(regexp_object->regex(), str); + return Value(result.success); +} + JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::to_string) { auto* regexp_object = regexp_object_from(vm, global_object); diff --git a/Libraries/LibJS/Runtime/RegExpPrototype.h b/Libraries/LibJS/Runtime/RegExpPrototype.h index 1949dea6b67..fd721fd06d5 100644 --- a/Libraries/LibJS/Runtime/RegExpPrototype.h +++ b/Libraries/LibJS/Runtime/RegExpPrototype.h @@ -39,6 +39,18 @@ public: virtual ~RegExpPrototype() override; private: + static RegexResult do_match(const Regex&, const StringView&); + + JS_DECLARE_NATIVE_GETTER(dot_all); + JS_DECLARE_NATIVE_GETTER(flags); + JS_DECLARE_NATIVE_GETTER(global); + JS_DECLARE_NATIVE_GETTER(ignore_case); + JS_DECLARE_NATIVE_GETTER(multiline); + JS_DECLARE_NATIVE_GETTER(source); + JS_DECLARE_NATIVE_GETTER(sticky); + JS_DECLARE_NATIVE_GETTER(unicode); + + JS_DECLARE_NATIVE_FUNCTION(test); JS_DECLARE_NATIVE_FUNCTION(to_string); }; diff --git a/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.test.js b/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.test.js new file mode 100644 index 00000000000..df3e8b9771d --- /dev/null +++ b/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.test.js @@ -0,0 +1,58 @@ +test("basic functionality", () => { + expect(RegExp.prototype.test).toHaveLength(1); +}); + +test("simple test", () => { + let re = /test/; + expect(re.test("test")).toBe(true); + expect(re.test("test")).toBe(true); +}); + +test("simple global test", () => { + let re = /test/g; + expect(re.test("testtest")).toBe(true); + expect(re.test("testtest")).toBe(true); + expect(re.test("testtest")).toBe(false); + expect(re.test("testtest")).toBe(true); + expect(re.test("testtest")).toBe(true); +}); + +test("global test with offset lastIndex", () => { + let re = /test/g; + re.lastIndex = 2; + expect(re.test("testtest")).toBe(true); + expect(re.test("testtest")).toBe(false); + expect(re.test("testtest")).toBe(true); + expect(re.test("testtest")).toBe(true); + expect(re.test("testtest")).toBe(false); +}); + +test("sticky test with offset lastIndex", () => { + let re = /test/y; + re.lastIndex = 2; + expect(re.test("aatest")).toBe(true); + expect(re.test("aatest")).toBe(false); + expect(re.test("aatest")).toBe(false); +}); + +test("flag and options", () => { + expect(/foo/gi.flags).toBe("gi"); + expect(/foo/mu.flags).toBe("mu"); + expect(/foo/gimsuy.flags).toBe("gimsuy"); + + let re = /foo/gim; + expect(re.dotAll).toBe(false); + expect(re.global).toBe(true); + expect(re.ignoreCase).toBe(true); + expect(re.multiline).toBe(true); + expect(re.sticky).toBe(false); + expect(re.unicode).toBe(false); + + expect(() => { + /foo/gg; + }).toThrowWithMessage(SyntaxError, "Repeated RegExp flag 'g'"); + + expect(() => { + /foo/x; + }).toThrowWithMessage(SyntaxError, "Invalid RegExp flag 'x'"); +}); diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h index 6f2dfacd5f3..26f31d7eebc 100644 --- a/Libraries/LibRegex/RegexMatch.h +++ b/Libraries/LibRegex/RegexMatch.h @@ -261,6 +261,7 @@ public: struct MatchInput { RegexStringView view { nullptr }; AllOptions regex_options {}; + size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset. size_t match_index { 0 }; size_t line { 0 }; diff --git a/Libraries/LibRegex/RegexMatcher.cpp b/Libraries/LibRegex/RegexMatcher.cpp index 9f1ef25709c..33753299848 100644 --- a/Libraries/LibRegex/RegexMatcher.cpp +++ b/Libraries/LibRegex/RegexMatcher.cpp @@ -52,6 +52,15 @@ Regex::Regex(StringView pattern, typename ParserTraits::OptionsT } } +template +typename ParserTraits::OptionsType Regex::options() const +{ + if (parser_result.error != Error::NoError) + return {}; + + return matcher->options(); +} + template String Regex::error_string(Optional message) const { @@ -81,6 +90,10 @@ RegexResult Matcher::match(const RegexStringView& view, Optional RegexResult Matcher::match(const Vector views, Optional::OptionsType> regex_options) const { + // If the pattern *itself* isn't stateful, reset any changes to start_offset. + if (!((AllFlags)m_regex_options.value() & AllFlags::Internal_Stateful)) + m_pattern.start_offset = 0; + size_t match_count { 0 }; MatchInput input; @@ -88,8 +101,12 @@ RegexResult Matcher::match(const Vector views, Optional MatchOutput output; input.regex_options = m_regex_options | regex_options.value_or({}).value(); + input.start_offset = m_pattern.start_offset; output.operations = 0; + if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) + ASSERT(views.size() == 1); + if (c_match_preallocation_count) { output.matches.ensure_capacity(c_match_preallocation_count); output.capture_group_matches.ensure_capacity(c_match_preallocation_count); @@ -115,7 +132,7 @@ RegexResult Matcher::match(const Vector views, Optional output.matches.empend(); ASSERT(start_position + state.string_position - start_position <= input.view.length()); - if (input.regex_options & AllFlags::StringCopyMatches) { + if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) { output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position }; } else { // let the view point to the original string ... output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position }; @@ -126,7 +143,9 @@ RegexResult Matcher::match(const Vector views, Optional s_regex_dbg.print_header(); #endif - bool continue_search = (input.regex_options & AllFlags::Global) || (input.regex_options & AllFlags::Multiline); + bool continue_search = input.regex_options.has_flag_set(AllFlags::Global) || input.regex_options.has_flag_set(AllFlags::Multiline); + if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) + continue_search = false; for (auto& view : views) { input.view = view; @@ -135,7 +154,9 @@ RegexResult Matcher::match(const Vector views, Optional #endif auto view_length = view.length(); - for (size_t view_index = 0; view_index < view_length; ++view_index) { + size_t view_index = m_pattern.start_offset; + state.string_position = view_index; + for (; view_index < view_length; ++view_index) { auto& match_length_minimum = m_pattern.parser_result.match_length_minimum; // FIXME: More performant would be to know the remaining minimum string // length needed to match from the current position onwards within @@ -158,12 +179,12 @@ RegexResult Matcher::match(const Vector views, Optional if (success.value()) { - if ((input.regex_options & AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) { + if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) { if (!continue_search) break; continue; } - if ((input.regex_options & AllFlags::MatchNotBeginOfLine) && view_index == 0) { + if (input.regex_options.has_flag_set(AllFlags::MatchNotBeginOfLine) && view_index == 0) { if (!continue_search) break; continue; @@ -182,26 +203,34 @@ RegexResult Matcher::match(const Vector views, Optional view_index = state.string_position - (has_zero_length ? 0 : 1); continue; - } else if (!continue_search && state.string_position < view_length) + } else if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { + append_match(input, state, output, view_index); + break; + + } else if (state.string_position < view_length) { return { false, 0, {}, {}, {}, output.operations }; + } append_match(input, state, output, view_index); break; } - if (!continue_search) + if (!continue_search && !input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) break; } ++input.line; input.global_offset += view.length() + 1; // +1 includes the line break character + + if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) + m_pattern.start_offset = state.string_position; } MatchOutput output_copy; if (match_count) { auto capture_groups_count = min(output.capture_group_matches.size(), output.matches.size()); for (size_t i = 0; i < capture_groups_count; ++i) { - if (input.regex_options & AllFlags::SkipTrimEmptyMatches) { + if (input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) { output_copy.capture_group_matches.append(output.capture_group_matches.at(i)); } else { Vector capture_group_matches; diff --git a/Libraries/LibRegex/RegexMatcher.h b/Libraries/LibRegex/RegexMatcher.h index 7597361f459..158eab716fc 100644 --- a/Libraries/LibRegex/RegexMatcher.h +++ b/Libraries/LibRegex/RegexMatcher.h @@ -73,6 +73,11 @@ public: RegexResult match(const RegexStringView&, Optional::OptionsType> = {}) const; RegexResult match(const Vector, Optional::OptionsType> = {}) const; + typename ParserTraits::OptionsType options() const + { + return m_regex_options; + } + private: Optional execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const; ALWAYS_INLINE Optional execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector states, size_t recursion_level) const; @@ -87,10 +92,12 @@ public: String pattern_value; regex::Parser::Result parser_result; OwnPtr> matcher { nullptr }; + mutable size_t start_offset { 0 }; explicit Regex(StringView pattern, typename ParserTraits::OptionsType regex_options = {}); ~Regex() = default; + typename ParserTraits::OptionsType options() const; void print_bytecode(FILE* f = stdout) const; String error_string(Optional message = {}) const; diff --git a/Libraries/LibRegex/RegexOptions.h b/Libraries/LibRegex/RegexOptions.h index cbebe82ea7a..5e860215fd2 100644 --- a/Libraries/LibRegex/RegexOptions.h +++ b/Libraries/LibRegex/RegexOptions.h @@ -53,7 +53,8 @@ enum class AllFlags { Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended. Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one. SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results. - Last = SkipTrimEmptyMatches + Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off. + Last = Internal_Stateful, }; enum class PosixFlags : FlagsUnderlyingType { @@ -72,7 +73,7 @@ enum class PosixFlags : FlagsUnderlyingType { }; enum class ECMAScriptFlags : FlagsUnderlyingType { - Global = (FlagsUnderlyingType)AllFlags::Global, + Global = (FlagsUnderlyingType)AllFlags::Global | (FlagsUnderlyingType)AllFlags::Internal_Stateful, // Note: ECMAScript "Global" creates a stateful regex. Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive, Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy, Unicode = (FlagsUnderlyingType)AllFlags::Unicode, @@ -123,7 +124,7 @@ public: void reset_flags() { m_flags = (T)0; } void reset_flag(T flag) { m_flags = (T)((FlagsUnderlyingType)m_flags & ~(FlagsUnderlyingType)flag); } void set_flag(T flag) { *this |= flag; } - bool has_flag_set(T flag) const { return *this & flag; } + bool has_flag_set(T flag) const { return (FlagsUnderlyingType)flag == ((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag); } T value() const { return m_flags; } private: