diff --git a/Makefile.am b/Makefile.am index 8aaa0cc5b..1468b4df4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -766,6 +766,8 @@ EXTRA_DIST += \ testdata/testinput25 \ testdata/testinput26 \ testdata/testinput27 \ + testdata/testinput28 \ + testdata/testinput29 \ testdata/testinputEBC \ testdata/testinputheap \ testdata/testoutput1 \ @@ -811,6 +813,8 @@ EXTRA_DIST += \ testdata/testoutput25 \ testdata/testoutput26 \ testdata/testoutput27 \ + testdata/testoutput28 \ + testdata/testoutput29 \ testdata/testoutputEBC \ testdata/testoutputheap-16 \ testdata/testoutputheap-32 \ diff --git a/RunTest b/RunTest index dafef3e23..f4214c8f4 100755 --- a/RunTest +++ b/RunTest @@ -90,7 +90,9 @@ title24="Test 24: Non-UTF pattern conversion tests" title25="Test 25: UTF pattern conversion tests" title26="Test 26: Unicode property tests (compatible with Perl >= 5.38)" title27="Test 27: Auto-generated unicode property tests" -maxtest=27 +title28="Test 28: Pattern rewriter tests without UTF" +title29="Test 29: Pattern rewriter tests with UTF" +maxtest=29 titleheap="Test 'heap': Environment-specific heap tests" if [ $# -eq 1 -a "$1" = "list" ]; then @@ -122,6 +124,8 @@ if [ $# -eq 1 -a "$1" = "list" ]; then echo $title25 echo $title26 echo $title27 + echo $title28 + echo $title29 echo "" echo $titleheap echo "" @@ -247,6 +251,8 @@ do24=no do25=no do26=no do27=no +do28=no +do29=no doheap=no doebcdic=no @@ -280,6 +286,8 @@ while [ $# -gt 0 ] ; do 25) do25=yes;; 26) do26=yes;; 27) do27=yes;; + 28) do28=yes;; + 29) do29=yes;; heap) doheap=yes;; ebcdic) doebcdic=yes;; -8) arg8=yes;; @@ -433,7 +441,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \ $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \ $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \ $do24 = no -a $do25 = no -a $do26 = no -a $do27 = no -a \ - $doheap = no -a $doebcdic = no \ + $do28 = no -a $do29 = no -a $doheap = no -a $doebcdic = no \ ]; then do0=yes do1=yes @@ -463,6 +471,8 @@ if [ $do0 = no -a 
$do1 = no -a $do2 = no -a $do3 = no -a \ do25=yes do26=yes do27=yes + do28=yes + do29=yes fi # Handle any explicit skips at this stage, so that an argument list may consist @@ -912,6 +922,26 @@ for bmode in "$test8" "$test16" "$test32"; do fi fi + # Pattern rewriter tests (without UTF) + + if [ $do28 = yes ] ; then + echo $title28 + $sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinput28 testtry + checkresult $? 28 "" + fi + + # Pattern rewriter tests (with UTF) + + if [ $do29 = yes ] ; then + echo $title29 + if [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + $sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinput29 testtry + checkresult $? 29 "" + fi + fi + # Manually selected heap tests - output may vary in different environments, # which is why that are not automatically run. diff --git a/RunTest.bat b/RunTest.bat index 67e520200..65a54010d 100644 --- a/RunTest.bat +++ b/RunTest.bat @@ -13,7 +13,7 @@ @rem line. Added argument validation and added error reporting. @rem @rem Sheri Pierce added logic to skip feature dependent tests -@rem tests 4 5 7 10 12 14 19 22 25 and 26 require Unicode support +@rem tests 4 5 7 10 12 14 19 22 25 26 27 and 29 require Unicode support @rem 8 requires Unicode and link size 2 @rem 16 requires absence of jit support @rem 17 requires presence of jit support @@ -114,18 +114,20 @@ set do24=no set do25=no set do26=no set do27=no +set do28=no +set do29=no set all=yes for %%a in (%*) do ( set valid=no - for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27) do if %%v == %%a set valid=yes + for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29) do if %%v == %%a set valid=yes if "!valid!" == "yes" ( set do%%a=yes set all=no ) else ( echo Invalid test number - %%a! echo Usage %0 [ test_number ] ... - echo Where test_number is one or more optional test numbers 1 through 27, default is all tests. 
+ echo Where test_number is one or more optional test numbers 1 through 29, default is all tests. exit /b 1 ) ) @@ -159,6 +161,8 @@ if "%all%" == "yes" ( set do25=yes set do26=yes set do27=yes + set do28=yes + set do29=yes ) @echo RunTest.bat's pcre2test output is written to newly created subfolders @@ -214,6 +218,8 @@ if "%do24%" == "yes" call :do24 if "%do25%" == "yes" call :do25 if "%do26%" == "yes" call :do26 if "%do27%" == "yes" call :do27 +if "%do28%" == "yes" call :do28 +if "%do29%" == "yes" call :do29 :modeSkip if "%mode%" == "" ( set mode=-16 @@ -540,6 +546,19 @@ if %unicode% EQU 0 ( if %jit% EQU 1 call :runsub 27 testoutjit "Test with JIT Override" -q -jit goto :eof +:do28 +call :runsub 28 testout "Pattern rewriter tests without UTF" -q +goto :eof + +:do29 +if %unicode% EQU 0 ( + echo Test 29 Skipped due to absence of Unicode support. + goto :eof +) + call :runsub 29 testout "Pattern rewriter tests with UTF" -q + if %jit% EQU 1 call :runsub 29 testoutjit "Test with JIT Override" -q -jit +goto :eof + :conferror @echo. @echo Either your build is incomplete or you have a configuration error. diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index 079cf176d..e392c41ca 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -1014,6 +1014,20 @@

pcre2api man page

Dotstar anchor optimization is automatically disabled for .* if it is inside an atomic group or a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or (*SKIP). +
+  PCRE2_PATTERN_REWRITE
+  PCRE2_PATTERN_REWRITE_OFF
+
+Enable/disable optimizations which occur during the pattern rewriting +phase (after parsing but before compilation). Pattern rewriting may remove +redundant items, coalesce items, adjust group structure, or replace some +constructs with an equivalent construct. Pattern rewriting will never affect +which strings are and are not matched, or what substrings are captured by +capture groups. However, since it may change the structure of a pattern, +if you are tracing the matching process, you might prefer PCRE2 to use the +original pattern without rewriting. Disabling rewriting may also be useful +for testing. Pattern rewriting is disabled if the compile option +PCRE2_AUTO_CALLOUT is set.
   PCRE2_START_OPTIMIZE
   PCRE2_START_OPTIMIZE_OFF
diff --git a/doc/html/pcre2callout.html b/doc/html/pcre2callout.html
index cdb65ad63..583db5f62 100644
--- a/doc/html/pcre2callout.html
+++ b/doc/html/pcre2callout.html
@@ -98,7 +98,9 @@ 

pcre2callout man page

program has a pattern qualifier (/auto_callout) that sets automatic callouts. When any callouts are present, the output from pcre2test indicates how the pattern is being matched. This is useful information when you are trying to -optimize the performance of a particular pattern. +optimize the performance of a particular pattern. However, note that some +optimizations which adjust the structure of the pattern are disabled when +automatic callouts are enabled.


MISSING CALLOUTS

diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html index 46da3d71f..f38c14704 100644 --- a/doc/html/pcre2syntax.html +++ b/doc/html/pcre2syntax.html @@ -487,6 +487,7 @@

pcre2syntax man page

(*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) (*NO_JIT) disable JIT optimization + (*NO_REWRITE) disable pattern rewriting phase of compilation (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) (*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching (*UTF) set appropriate UTF mode for the library in use diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html index db9073f0e..95d538d4a 100644 --- a/doc/html/pcre2test.html +++ b/doc/html/pcre2test.html @@ -705,6 +705,8 @@

pcre2test man page

auto_possess_off don't auto-possessify variable quantifiers dotstar_anchor anchor patterns starting with .* dotstar_anchor_off don't anchor patterns starting with .* + pattern_rewrite rewrite some slow constructs + pattern_rewrite_off don't rewrite slow constructs start_optimize enable pre-scan of subject string start_optimize_off disable pre-scan of subject string
diff --git a/doc/pcre2.txt b/doc/pcre2.txt index 7e402b29e..77366c24c 100644 --- a/doc/pcre2.txt +++ b/doc/pcre2.txt @@ -1040,6 +1040,20 @@ PCRE2 CONTEXTS inside an atomic group or a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or (*SKIP). + PCRE2_PATTERN_REWRITE + PCRE2_PATTERN_REWRITE_OFF + + Enable/disable optimizations which occur during the pattern rewriting + phase (after parsing but before compilation). Pattern rewriting may re- + move redundant items, coalesce items, adjust group structure, or re- + place some constructs with an equivalent construct. Pattern rewriting + will never affect which strings are and are not matched, or what sub- + strings are captured by capture groups. However, since it may change + the structure of a pattern, if you are tracing the matching process, + you might prefer PCRE2 to use the original pattern without rewriting. + Disabling rewriting may also be useful for testing. Pattern rewriting + is disabled if the compile option PCRE2_AUTO_CALLOUT is set. + PCRE2_START_OPTIMIZE PCRE2_START_OPTIMIZE_OFF @@ -5005,7 +5019,9 @@ DESCRIPTION automatic callouts. When any callouts are present, the output from pcre2test indicates how the pattern is being matched. This is useful information when you are trying to optimize the performance of a par- - ticular pattern. + ticular pattern. However, note that some optimizations which adjust the + structure of the pattern are disabled when automatic callouts are en- + abled. MISSING CALLOUTS diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 01b6db3a6..f1369ec47 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -939,6 +939,20 @@ this can cause callouts to be skipped. Dotstar anchor optimization is automatically disabled for .* if it is inside an atomic group or a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or (*SKIP). 
+.sp + PCRE2_PATTERN_REWRITE + PCRE2_PATTERN_REWRITE_OFF +.sp +Enable/disable optimizations which occur during the pattern rewriting +phase (after parsing but before compilation). Pattern rewriting may remove +redundant items, coalesce items, adjust group structure, or replace some +constructs with an equivalent construct. Pattern rewriting will never affect +which strings are and are not matched, or what substrings are captured by +capture groups. However, since it may change the structure of a pattern, +if you are tracing the matching process, you might prefer PCRE2 to use the +original pattern without rewriting. Disabling rewriting may also be useful +for testing. Pattern rewriting is disabled if the compile option +PCRE2_AUTO_CALLOUT is set. .sp PCRE2_START_OPTIMIZE PCRE2_START_OPTIMIZE_OFF diff --git a/doc/pcre2callout.3 b/doc/pcre2callout.3 index 7e62dc102..1aa3fc397 100644 --- a/doc/pcre2callout.3 +++ b/doc/pcre2callout.3 @@ -83,7 +83,9 @@ Callouts can be useful for tracking the progress of pattern matching. The program has a pattern qualifier (/auto_callout) that sets automatic callouts. When any callouts are present, the output from \fBpcre2test\fP indicates how the pattern is being matched. This is useful information when you are trying to -optimize the performance of a particular pattern. +optimize the performance of a particular pattern. However, note that some +optimizations which adjust the structure of the pattern are disabled when +automatic callouts are enabled. . . .SH "MISSING CALLOUTS" diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3 index 3d5b5a832..b79200893 100644 --- a/doc/pcre2syntax.3 +++ b/doc/pcre2syntax.3 @@ -459,6 +459,7 @@ of them may appear. For the first three, d is a decimal number. 
(*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) (*NO_JIT) disable JIT optimization + (*NO_REWRITE) disable pattern rewriting phase of compilation (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) (*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching (*UTF) set appropriate UTF mode for the library in use diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index e2325b4d8..7cdd479c4 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -659,6 +659,8 @@ calling \fBpcre2_set_optimize()\fP before invoking the regex compiler. auto_possess_off don't auto-possessify variable quantifiers dotstar_anchor anchor patterns starting with .* dotstar_anchor_off don't anchor patterns starting with .* + pattern_rewrite rewrite some slow constructs + pattern_rewrite_off don't rewrite slow constructs start_optimize enable pre-scan of subject string start_optimize_off disable pre-scan of subject string .sp diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt index 4e229148c..745b1f093 100644 --- a/doc/pcre2test.txt +++ b/doc/pcre2test.txt @@ -639,6 +639,8 @@ PATTERN MODIFIERS auto_possess_off don't auto-possessify variable quantifiers dotstar_anchor anchor patterns starting with .* dotstar_anchor_off don't anchor patterns starting with .* + pattern_rewrite rewrite some slow constructs + pattern_rewrite_off don't rewrite slow constructs start_optimize enable pre-scan of subject string start_optimize_off disable pre-scan of subject string diff --git a/maint/manifest-tarball b/maint/manifest-tarball index 7cdc99554..73667bd3c 100644 --- a/maint/manifest-tarball +++ b/maint/manifest-tarball @@ -401,6 +401,8 @@ drwxr-xr-x tarball-dir/pcre2-SNAPSHOT/testdata -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput25 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput26 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput27 +-rw-r--r-- 
tarball-dir/pcre2-SNAPSHOT/testdata/testinput28 +-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput29 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput3 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput4 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testinput5 @@ -436,6 +438,8 @@ drwxr-xr-x tarball-dir/pcre2-SNAPSHOT/testdata -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput25 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput26 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput27 +-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput28 +-rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput29 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput3 -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput3A -rw-r--r-- tarball-dir/pcre2-SNAPSHOT/testdata/testoutput3B diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic index 68639bef5..30aae1e9a 100644 --- a/src/pcre2.h.generic +++ b/src/pcre2.h.generic @@ -497,6 +497,8 @@ For binary compatibility, only add to this list; do not renumber. */ #define PCRE2_DOTSTAR_ANCHOR_OFF 67 #define PCRE2_START_OPTIMIZE 68 #define PCRE2_START_OPTIMIZE_OFF 69 +#define PCRE2_PATTERN_REWRITE 70 +#define PCRE2_PATTERN_REWRITE_OFF 71 /* Types used in pcre2_set_substitute_case_callout(). diff --git a/src/pcre2.h.in b/src/pcre2.h.in index ca3f0b413..5559dfbeb 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -497,6 +497,8 @@ For binary compatibility, only add to this list; do not renumber. */ #define PCRE2_DOTSTAR_ANCHOR_OFF 67 #define PCRE2_START_OPTIMIZE 68 #define PCRE2_START_OPTIMIZE_OFF 69 +#define PCRE2_PATTERN_REWRITE 70 +#define PCRE2_PATTERN_REWRITE_OFF 71 /* Types used in pcre2_set_substitute_case_callout(). diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 0ffac8939..539a47002 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -94,6 +94,7 @@ them will be able to (i.e. assume a 64-bit world). 
*/ #define GETPLUSOFFSET(s,p) s = *(++p) #define READPLUSOFFSET(s,p) s = p[1] #define SKIPOFFSET(p) p++ +#define READOFFSET(p) *(p) #define SIZEOFFSET 1 #else #define PUTOFFSET(s,p) \ @@ -105,6 +106,7 @@ them will be able to (i.e. assume a 64-bit world). */ #define READPLUSOFFSET(s,p) \ { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; } #define SKIPOFFSET(p) p += 2 +#define READOFFSET(p) (((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]) #define SIZEOFFSET 2 #endif @@ -713,6 +715,7 @@ static const pso pso_list[] = { { STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS }, { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR }, { STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, + { STRING_NO_REWRITE_RIGHTPAR, 11, PSO_OPTMZ, PCRE2_OPTIM_PATTERN_REWRITE }, { STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE }, { STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT }, { STRING_TURKISH_CASING_RIGHTPAR, 15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING }, @@ -2819,6 +2822,10 @@ days. */ if ((options & PCRE2_AUTO_CALLOUT) != 0) parsed_size_needed += (ptrend - ptr) * 4; +/* All patterns are wrapped in a pair of non-capturing parentheses, to make + * recursive traversal of the pattern easier. 
*/ +parsed_size_needed += 2; + return parsed_size_needed; } @@ -2934,7 +2941,7 @@ uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */ uint32_t *verbstartptr = NULL; uint32_t *previous_callout = NULL; uint32_t *parsed_pattern = cb->parsed_pattern; -uint32_t *parsed_pattern_end = cb->parsed_pattern_end; +uint32_t *parsed_pattern_limit = cb->parsed_pattern_limit; uint32_t *this_parsed_item = NULL; uint32_t *prev_parsed_item = NULL; uint32_t meta_quantifier = 0; @@ -2970,6 +2977,10 @@ PCRE2_SPTR ptr_check; PCRE2_ASSERT(parsed_pattern != NULL); +/* All patterns are wrapped in non-capturing parentheses; this avoids the need for + * special-casing the top level when recursing over groups in the parsed pattern */ +*parsed_pattern++ = META_NOCAPTURE; + /* Insert leading items for word and line matching (features provided for the benefit of pcre2grep). */ @@ -2996,7 +3007,7 @@ if ((options & PCRE2_LITERAL) != 0) { while (ptr < ptrend) { - if (parsed_pattern >= parsed_pattern_end) + if (parsed_pattern >= parsed_pattern_limit) { PCRE2_DEBUG_UNREACHABLE(); errorcode = ERR63; /* Internal error (parsed pattern overflow) */ @@ -3067,7 +3078,7 @@ while (ptr < ptrend) ptr_check = ptr; #endif - if (parsed_pattern >= parsed_pattern_end) + if (parsed_pattern >= parsed_pattern_limit) { /* Weak pre-write check; only ensures parsed_pattern[0] is writeable (but the code below can write many chars). Better than nothing. */ @@ -5723,14 +5734,16 @@ else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0) /* Terminate the parsed pattern, then return success if all groups are closed. Otherwise we have unclosed parentheses. 
*/ -if (parsed_pattern >= parsed_pattern_end) +if (parsed_pattern >= parsed_pattern_limit) { PCRE2_DEBUG_UNREACHABLE(); errorcode = ERR63; /* Internal error (parsed pattern overflow) */ goto FAILED; } -*parsed_pattern = META_END; +*parsed_pattern++ = META_KET; +*parsed_pattern++ = META_END; +cb->parsed_pattern_end = parsed_pattern; if (nest_depth == 0) return 0; UNCLOSED_PARENTHESIS: @@ -5755,7 +5768,814 @@ errorcode = ERR79; goto FAILED; } +/************************************************* +* Rewrite parsed pattern (to optimize) * +*************************************************/ + +/* First type of rewrite: Common prefixes in alternation branches + * are pulled out from the alternation. + * + * For example: (ab|ac|ad) ⇒ (a(?:b|c|d)) + * + * Care is needed with this transformation, or we might change behavior. + * We cannot pull out any item which is quantified with * or +; for + * example, this transformation would be incorrect: + * + * (a*b|a*c) ⇒ (a*(?:b|c)) (✗ BAD!) + * + * Also, while it is usually safe to pull out non-quantified items from + * a non-capturing group, we cannot do this: + * + * (?:ab|ac)? ⇒ a(?:b|c)? (✗ BAD!) + * + * Further, certain constructs are never safe to pull out from alternation, + * notably: callouts and certain backtracking control verbs. Also, we can + * never pull out a common prefix from a 'conditional' group. + * + * Second type of rewrite: Alternations which only contain single, literal + * characters are rewritten to character classes. 
+ * + * For example: (?:a|b|c) ⇒ [a-c] + */ + +static inline BOOL is_lookahead(uint32_t code) +{ + return code == META_LOOKAHEAD || code == META_LOOKAHEADNOT || code == META_LOOKAHEAD_NA; +} + +static inline BOOL is_lookbehind(uint32_t code) +{ + return code == META_LOOKBEHIND || code == META_LOOKBEHIND_NA || code == META_LOOKBEHINDNOT; +} + +static inline BOOL is_condition(uint32_t code) +{ + return code >= META_COND_ASSERT && code <= META_COND_VERSION; +} + +static inline BOOL is_substring_scan(uint32_t code) +{ + return code == META_SCS; +} + +static inline BOOL is_script_run(uint32_t code) +{ + return code == META_SCRIPT_RUN; +} + +/* Does this item (from within a parsed_pattern) start a grouping construct? */ +static inline BOOL is_group_starter(uint32_t code) +{ + return code == META_ATOMIC || is_lookahead(code) || is_lookbehind(code) || code == META_NOCAPTURE || code == META_CAPTURE || is_condition(code) || is_substring_scan(code) || is_script_run(code); +} + +static inline BOOL is_group_ender(uint32_t code) +{ + return code == META_KET; +} + +static inline BOOL is_class_starter(uint32_t code) +{ + return code == META_CLASS || code == META_CLASS_NOT; +} + +static inline BOOL is_quantifier(uint32_t item) +{ + return item >= META_ASTERISK && item <= META_QUERY_QUERY; +} + +static inline BOOL is_possessive(uint32_t item) +{ + return item == META_ASTERISK_PLUS || item == META_PLUS_PLUS || item == META_QUERY_PLUS || item == META_MINMAX_PLUS; +} + +static inline BOOL is_minmax(uint32_t item) +{ + return item == META_MINMAX || item == META_MINMAX_PLUS || item == META_MINMAX_QUERY; +} + +static inline BOOL is_callout(uint32_t item) +{ + return item == META_CALLOUT_NUMBER || item == META_CALLOUT_STRING; +} + +static inline BOOL is_backtrack_control(uint32_t item) +{ + return item >= META_COMMIT && item <= META_THEN_ARG; +} + +static inline BOOL specific_repeat_count(uint32_t *item) +{ + /* `item` points to a quantifier; is it one with a specific, fixed count like 
{2}? + * Or is it something like *, +, or {1,2}? */ + if (!is_minmax(*item)) + return FALSE; + return item[1] == item[2]; +} + +static inline unsigned int number_of_dataitems(uint32_t item, uint32_t *p) +{ + uint32_t data; + + PCRE2_ASSERT(item >= META_END); + + switch (META_CODE(item)) + { + case META_ESCAPE: + data = META_DATA(item); + if (data == ESC_P || data == ESC_p) + return 1; + else if (data == ESC_g || data == ESC_k) + return 2; + else + return 0; + + case META_BACKREF: + return (META_DATA(item) >= 10) ? SIZEOFFSET : 0; + + case META_MARK: + case META_COMMIT_ARG: + case META_PRUNE_ARG: + case META_SKIP_ARG: + case META_THEN_ARG: + /* Data for this item is variable-length; the next value is the number + * of data items */ + return *(p+1) + 1; + + default: + return meta_extra_lengths[(META_CODE(item) >> 16) & 0x7fff]; + } +} + +static uint32_t* find_group_end(uint32_t *start, uint32_t *limit) +{ + unsigned int nest_level = 1; + uint32_t item, code; + + PCRE2_ASSERT(start < limit); + PCRE2_ASSERT(is_group_starter(META_CODE(*start))); + start++; + + while (start < limit) + { + item = *start; + if (item >= META_END) + { + code = META_CODE(item); + if (is_group_starter(code)) + { + nest_level++; + } + else if (is_group_ender(code)) + { + if (--nest_level == 0) + return start+1; + } + start += number_of_dataitems(item, start); + } + start++; + } + PCRE2_UNREACHABLE(); /* Regexp improperly formed; should have been caught during parsing */ + return limit; /* To satisfy compilers for which we haven't implemented PCRE2_UNREACHABLE */ +} + +static uint32_t* find_class_end(uint32_t *start, uint32_t *limit) +{ + unsigned int nest_level = 1; + uint32_t item; + + PCRE2_ASSERT(start < limit); + PCRE2_ASSERT(is_class_starter(META_CODE(*start))); + + start++; + while (start < limit) + { + item = *start++; + if (item == META_CLASS_END) + { + if (--nest_level == 0) + return start; + } + else if (is_class_starter(META_CODE(item))) + { + nest_level++; + } + } + 
PCRE2_UNREACHABLE(); /* Regexp improperly formed; should have been caught during parsing */ + return limit; /* To satisfy compilers for which we haven't implemented PCRE2_UNREACHABLE */ +} + +static uint32_t* scan_item(uint32_t *start, uint32_t *limit, uint32_t **quantifier) +{ + uint32_t item = *start; + uint32_t code = META_CODE(item); + uint32_t *result = NULL; + + PCRE2_ASSERT(start < limit); + + if (is_group_starter(code)) + result = find_group_end(start, limit); + else if (is_class_starter(code)) + result = find_class_end(start, limit); + else if (code < META_END) + result = start+1; + else + result = start+number_of_dataitems(item, start)+1; + + /* Check for quantifier suffix */ + item = *result; + if (is_quantifier(item)) + { + if (quantifier != NULL) + *quantifier = result; + result++; + } + else if (is_minmax(item)) + { + if (quantifier != NULL) + *quantifier = result; + result += 3; + } + return result; +} + +static BOOL group_has_no_backtrack_points(uint32_t *start, uint32_t *end) +{ + uint32_t meta = META_CODE(*start); + uint32_t *p, *item_end, *item_quant, code; + + if (meta == META_ATOMIC || meta == META_LOOKAHEAD || meta == META_LOOKAHEADNOT || meta == META_LOOKBEHIND || meta == META_LOOKBEHINDNOT) + return TRUE; + + p = ++start; + if (is_lookbehind(meta)) + p += 2; + + while (p < end) + { + item_quant = NULL; + item_end = scan_item(p, end, &item_quant); + code = META_CODE(*p); + if (code == META_ALT) + return FALSE; + if (item_quant != NULL && !specific_repeat_count(item_quant) && !is_possessive(*item_quant)) + return FALSE; + if (is_group_starter(code) && !group_has_no_backtrack_points(p, item_end)) + return FALSE; + p = item_end; + } + + return TRUE; +} + +static BOOL group_has_no_callouts_or_backtrack_control(uint32_t *start, uint32_t *end) +{ + uint32_t meta = META_CODE(*start); + uint32_t *p = ++start; + uint32_t *item_end, code; + + if (is_lookbehind(meta)) + p += 2; + + while (p < end) + { + item_end = scan_item(p, end, NULL); + code = 
META_CODE(*p); + if (is_callout(code) || is_backtrack_control(code)) + return FALSE; + if (is_group_starter(code) && !group_has_no_callouts_or_backtrack_control(p, item_end)) + return FALSE; + p = item_end; + } + + return TRUE; +} + +/* This function assumes there is exactly ONE item in each alternation branch */ +static BOOL all_branches_are_literals(uint32_t *first, unsigned int alt_index, uint32_t **alt_positions) +{ + if (*first >= META_END) + return FALSE; + PCRE2_ASSERT(first+1 == alt_positions[0]); + for (unsigned int i = 0; i < alt_index; i++) + { + if (*(alt_positions[i]+1) >= META_END) + return FALSE; + } + return TRUE; +} + +static BOOL all_branches_end_with_literal(unsigned int alt_index, uint32_t **alt_positions, uint32_t last_item) +{ + if (last_item >= META_END) + return FALSE; + for (unsigned int i = 0; i < alt_index; i++) + { + if (*(alt_positions[i]-1) >= META_END) + return FALSE; + } + return TRUE; +} +typedef struct { + uint32_t *p; + uint32_t *start; + uint32_t *limit; +} rewrite_buf; + +#define REWRITE_BUF_BASE_SIZE 16 +#define rewrite_buf_size(buf) ((buf)->limit - (buf)->start) +#define rewrite_buf_offset(buf) ((buf)->p - (buf)->start) +#define rewrite_buf_space(buf) (size_t)((buf)->limit - (buf)->p) + +static inline void rewrite_buf_init(rewrite_buf *buf) +{ + buf->start = buf->p = buf->limit = NULL; +} + +/* To increase performance when required size of buffer is known ahead of time */ +static inline void rewrite_buf_prealloc(rewrite_buf *buf, size_t size, pcre2_memctl *memctl) +{ + PCRE2_ASSERT(buf->start == NULL); + buf->start = buf->p = memctl->malloc(size * sizeof(uint32_t), memctl->memory_data); + buf->limit = buf->start + size; +} + +static inline void rewrite_buf_realloc(rewrite_buf *buf, size_t new_size, pcre2_memctl *memctl) +{ + uint32_t *expanded; + size_t current_size; + + PCRE2_ASSERT(buf->start != NULL); + PCRE2_ASSERT(buf->p >= buf->start); + + expanded = memctl->malloc(new_size * sizeof(uint32_t), memctl->memory_data); + 
current_size = buf->p - buf->start; + memcpy(expanded, buf->start, current_size * sizeof(uint32_t)); + memctl->free(buf->start, memctl->memory_data); + buf->start = expanded; + buf->p = expanded + current_size; + buf->limit = expanded + new_size; +} + +static inline void rewrite_buf_ensure(rewrite_buf *buf, size_t needed, pcre2_memctl *memctl) +{ + size_t grow_size; + + if (buf->start == NULL) + { + if (REWRITE_BUF_BASE_SIZE > needed) + needed = REWRITE_BUF_BASE_SIZE; + rewrite_buf_prealloc(buf, needed, memctl); + } + else if (rewrite_buf_space(buf) < needed) + { + needed += rewrite_buf_offset(buf); + grow_size = (size_t)rewrite_buf_size(buf) * 2; + if (grow_size > needed) + needed = grow_size; + rewrite_buf_realloc(buf, needed, memctl); + } +} + +static inline void rewrite_buf_append(rewrite_buf *buf, uint32_t item, pcre2_memctl *memctl) +{ + rewrite_buf_ensure(buf, 1, memctl); + *(buf->p)++ = item; +} + +static inline void rewrite_buf_copy(rewrite_buf *buf, uint32_t *src, size_t count, pcre2_memctl *memctl) +{ + rewrite_buf_ensure(buf, count, memctl); + memcpy(buf->p, src, count * sizeof(uint32_t)); + buf->p += count; +} + +static inline void rewrite_finish(rewrite_buf *buf, compile_block *cb, pcre2_memctl *memctl, BOOL heap_parsed_pattern) +{ + /* Was the regex actually modified? 
If so, update `cb` accordingly */
+  if (buf->start != NULL)
+  {
+    rewrite_buf_append(buf, META_END, memctl);
+
+    if (heap_parsed_pattern)
+      memctl->free(cb->parsed_pattern_buf, memctl->memory_data);
+
+    cb->parsed_pattern_buf = cb->parsed_pattern = buf->start;
+    cb->parsed_pattern_end = buf->p;
+    cb->parsed_pattern_limit = buf->limit;
+    rewrite_buf_init(buf);
+  }
+}
+
+/* Walk one group of the parsed pattern and, where it is safe, rewrite its
+ * alternation: either factor a common prefix out of all branches, or turn
+ * branches which are single literals into a character class. Subgroups are
+ * visited recursively, whether or not this group itself is rewritten.
+ *
+ * Output is written to `buf` lazily: `buf->start` stays NULL until the first
+ * rewrite is actually performed, at which point everything in `pattern` which
+ * precedes the group being rewritten is copied into `buf` first. Once a rewrite
+ * is under way, every item visited is copied (or rewritten) into `buf`.
+ *
+ * Arguments:
+ *   start       the group's opening item in the parsed pattern
+ *   end         one past the group's last element; the size arithmetic below
+ *               treats the element just before `end` (or just before
+ *               `quantifier`, when there is one) as the closing META_KET
+ *   quantifier  position of the quantifier applied to this group, or NULL
+ *   buf         rewrite buffer, shared by all recursive calls
+ *   pattern     start of the whole parsed pattern (source for the lazy copy)
+ *   patstring   original pattern string; used to compare the names in by-name
+ *               subroutine calls
+ *   memctl      memory allocation functions
+ *
+ * If the list of alternation positions cannot be grown because the allocator
+ * returns NULL, this group's alternation is simply left as it is. */
+static void rewrite_alternation(uint32_t *start, uint32_t *end, uint32_t *quantifier, rewrite_buf *buf, uint32_t *pattern, PCRE2_SPTR patstring, pcre2_memctl *memctl)
+{
+  uint32_t *alt_position_buf[16];
+  uint32_t **alt_positions = alt_position_buf;
+  uint32_t alt_limit = (sizeof(alt_position_buf) / sizeof(uint32_t*));
+  unsigned int alt_index = 0;
+
+  BOOL copy_opening_paren;
+  uint32_t *p, *item, *item_end, *item_quant, *first_branch_end, *extract_up_to, *compare_with;
+  uint32_t code, meta, following, last_item;
+  size_t branch_size, skip, offset, prefix_size, compare_len, compare_offset, name_len, name_offset;
+  PCRE2_SPTR name_ptr, compare_ptr;
+
+  /* Skip over the opening paren */
+  uint32_t *first = start + 1;
+
+  /* We can't rewrite alternation in a conditional group, since it has a special meaning
+   * (the first branch is taken if the condition is true, second branch if false)
+   *
+   * We don't attempt to rewrite alternation in a lookbehind assertion, for a different reason:
+   * In many cases, doing so would convert fixed-length lookbehind to variable lookbehind,
+   * and PCRE2 handles fixed-length lookbehind far more efficiently
+   * Further, in some cases, rewriting alternation in lookbehind assertions could even cause
+   * compilation to fail with a "maximum variable lookbehind length exceeded" error
+   *
+   * However, in either case, the group might still contain subgroups which can be rewritten */
+  if (is_condition(*start) || is_lookbehind(META_CODE(*start)))
+    goto DONT_REWRITE;
+
+  /* Sometimes this function can just pull out some common prefix from a group, like (?:abc|abd) ⇒ ab(?:c|d)
+   * In other cases, we need to copy over the opening paren when rewriting, like (abc|abd) ⇒ (ab(?:c|d)) */
+  copy_opening_paren = (*start != META_NOCAPTURE) || quantifier != NULL;
+  if (*first == META_OPTIONS)
+  {
+    copy_opening_paren = TRUE;
+    first += 3;
+  }
+
+  /* First, pass over the group and find alternation branches */
+  p = first;
+  while (p < end)
+  {
+    item_end = scan_item(p, end, NULL);
+    code = META_CODE(*p);
+    if (code == META_ALT)
+    {
+      if (alt_index == alt_limit)
+      {
+        unsigned int new_limit = alt_limit * 3;
+        uint32_t **new_alt_positions = memctl->malloc(new_limit * sizeof(uint32_t*), memctl->memory_data);
+        if (new_alt_positions == NULL)
+        {
+          /* Out of memory. Rewriting is only an optimization, so leave this
+           * group's alternation alone instead of dereferencing NULL below.
+           * The DONT_REWRITE path never looks at alt_positions, so release
+           * any heap copy before going there. */
+          if (alt_positions != alt_position_buf)
+            memctl->free(alt_positions, memctl->memory_data);
+          alt_positions = alt_position_buf;
+          goto DONT_REWRITE;
+        }
+        memcpy(new_alt_positions, alt_positions, alt_limit * sizeof(uint32_t*));
+        if (alt_positions != alt_position_buf)
+          memctl->free(alt_positions, memctl->memory_data);
+        alt_positions = new_alt_positions;
+        alt_limit = new_limit;
+      }
+      alt_positions[alt_index++] = p;
+    }
+    else if (code == META_KET)
+    {
+      break;
+    }
+    p = item_end;
+  }
+
+  /* See if we can pull out any common prefix */
+  if (alt_index > 0)
+  {
+    size_t smallest_branch = alt_positions[0] - first;
+    size_t largest_branch = smallest_branch;
+    for (unsigned int i = 1; i < alt_index; i++)
+    {
+      branch_size = (alt_positions[i] - alt_positions[i-1]) - 1;
+      if (branch_size < smallest_branch) smallest_branch = branch_size;
+      if (branch_size > largest_branch) largest_branch = branch_size;
+    }
+    branch_size = ((quantifier != NULL ? quantifier : end) - alt_positions[alt_index-1]) - 2;
+    if (branch_size < smallest_branch) smallest_branch = branch_size;
+    if (branch_size > largest_branch) largest_branch = branch_size;
+
+    /* We can't pull a common prefix out if there is an empty alternation branch */
+    if (smallest_branch > 0)
+    {
+      /* First check if we have an alternation like (a|b|c) which can be rewritten
+       * as a character class (i.e. [abc]) */
+      if (largest_branch == 1 && all_branches_are_literals(first, alt_index, alt_positions))
+      {
+        /* First copy some prefix of overall pattern if needed */
+        if (buf->start == NULL)
+        {
+          rewrite_buf_prealloc(buf, end - pattern, memctl);
+          rewrite_buf_copy(buf, pattern, start - pattern, memctl);
+        }
+        if (copy_opening_paren)
+        {
+          rewrite_buf_copy(buf, start, first - start, memctl);
+        }
+        /* Rewrite alternation into character class */
+        rewrite_buf_append(buf, META_CLASS, memctl);
+        rewrite_buf_append(buf, *first, memctl);
+        for (unsigned int i = 0; i < alt_index; i++)
+        {
+          rewrite_buf_append(buf, *(alt_positions[i]+1), memctl);
+        }
+        rewrite_buf_append(buf, META_CLASS_END, memctl);
+        goto FINISH_REWRITE;
+      }
+
+      /* Find longest common prefix, if any, in all the alternation branches */
+      first_branch_end = alt_positions[0];
+      extract_up_to = first;
+      while (extract_up_to < first_branch_end)
+      {
+        item = extract_up_to;
+        item_quant = NULL;
+        item_end = scan_item(item, first_branch_end, &item_quant);
+        if (item_quant != NULL && !specific_repeat_count(item_quant) && !is_possessive(*item_quant))
+        {
+          /* We can't pull out an item with a quantifier like * or + */
+          goto FOUND_LONGEST_PREFIX;
+        }
+        if (is_backtrack_control(*item))
+        {
+          /* Pulling out (*PRUNE), (*COMMIT), (*SKIP), or (*THEN) would change behavior.
+           * These verbs take effect if there is a matching failure which causes
+           * backtracking to reach them, which might not happen if they are pulled
+           * out from an alternation */
+          goto FOUND_LONGEST_PREFIX;
+        }
+        if (is_callout(*item))
+        {
+          /* Pulling out a callout from each alternation branch would change
+           * observable behavior; instead of being called at the beginning of
+           * each branch, the callout function would be called just once */
+          goto FOUND_LONGEST_PREFIX;
+        }
+        PCRE2_ASSERT(item_end > item); /* Don't get stuck in an infinite loop! */
+        if ((size_t)(item_end - first) > smallest_branch)
+        {
+          goto FOUND_LONGEST_PREFIX;
+        }
+        meta = META_CODE(*item);
+        if (is_group_starter(meta))
+        {
+          if (!group_has_no_backtrack_points(item, item_end))
+          {
+            /* We can't pull out a group which the regex engine might backtrack into;
+             * doing so could change what the regex matches
+             * (It would never change a match failure into success, but if there is
+             * more than one substring in the target string which could possibly
+             * match the regex, it might change which one is actually returned) */
+            goto FOUND_LONGEST_PREFIX;
+          }
+          if (!group_has_no_callouts_or_backtrack_control(item, item_end))
+          {
+            /* We can't pull out groups which contain callouts or certain
+             * backtracking control verbs, for the same reasons explained above */
+            goto FOUND_LONGEST_PREFIX;
+          }
+        }
+        /* Check if the corresponding item in all subsequent alternation branches
+         * match the item in the first branch */
+        offset = item - first;
+        compare_len = item_end - item;
+        if (!is_lookbehind(meta) && meta != META_RECURSE && meta != META_RECURSE_BYNAME)
+        {
+          for (unsigned int i = 0; i < alt_index; i++)
+          {
+            compare_with = alt_positions[i]+offset+1;
+            if (memcmp(item, compare_with, compare_len * sizeof(uint32_t)) != 0)
+            {
+              goto FOUND_LONGEST_PREFIX;
+            }
+            following = *(compare_with + compare_len);
+            if (is_quantifier(following) || is_minmax(following))
+            {
+              /* There is an 'identical' item in the first alternation branch and
+               * the one we are just checking... but the latter one is quantified
+               * while the first one was not, so they don't really match */
+              goto FOUND_LONGEST_PREFIX;
+            }
+          }
+        }
+        else
+        {
+          /* For each lookbehind assertion and by-number subroutine call, we
+           * have an offset which points to the location where the construct
+           * occurred in the original pattern string.
+           * Those are used only for error messages, and will obviously be different
+           * even if the constructs are otherwise the same, so don't compare them */
+          skip = SIZEOFFSET+1;
+          if (meta == META_RECURSE_BYNAME)
+            skip++;
+          compare_len -= skip;
+          for (unsigned int i = 0; i < alt_index; i++)
+          {
+            compare_with = alt_positions[i]+offset+1;
+            if (*compare_with != *item)
+              goto FOUND_LONGEST_PREFIX;
+            if (memcmp(item+skip, compare_with+skip, compare_len*sizeof(uint32_t)) != 0)
+              goto FOUND_LONGEST_PREFIX;
+            following = *(compare_with + compare_len + skip);
+            if (is_quantifier(following) || is_minmax(following))
+              goto FOUND_LONGEST_PREFIX;
+          }
+          if (meta == META_RECURSE_BYNAME)
+          {
+            /* All alternation branches have a by-name subroutine call in the same place
+             * Confirm if they are calling the same named group */
+            name_len = item[1];
+            name_offset = READOFFSET(&item[2]);
+            name_ptr = &patstring[name_offset];
+            for (unsigned int i = 0; i < alt_index; i++)
+            {
+              compare_with = alt_positions[i]+offset+1;
+              compare_offset = READOFFSET(&compare_with[2]);
+              if (compare_offset == name_offset)
+                continue;
+              compare_ptr = &patstring[compare_offset];
+              if (name_len != compare_with[1] || memcmp((const char*)name_ptr, (const char*)compare_ptr, name_len * (PCRE2_CODE_UNIT_WIDTH >> 3)) != 0)
+                goto FOUND_LONGEST_PREFIX;
+            }
+          }
+        }
+        extract_up_to = item_end;
+      }
+
+      FOUND_LONGEST_PREFIX:
+      if (extract_up_to != first)
+      {
+        /* Rewrite alternation
+         * Do we need to copy over some prefix of the pattern up until here? */
+        if (buf->start == NULL)
+        {
+          rewrite_buf_prealloc(buf, end - pattern, memctl);
+          rewrite_buf_copy(buf, pattern, start - pattern, memctl);
+        }
+
+        /* Do we need an opening paren? */
+        if (copy_opening_paren)
+          rewrite_buf_copy(buf, start, first - start, memctl);
+
+        /* Copy the common prefix
+         * If it is necessary to grow the rewrite_buf, try to do it just once for
+         * performance. The estimate of buffer space needed for the entire rewritten
+         * group is a bit obscure; the '3' is for an added BRA if needed, plus 2× KET
+         * The '+ 1' is to make space for all instances of | (META_ALT) */
+        rewrite_buf_ensure(buf, 3 + largest_branch + (alt_index * (largest_branch + 1 - (extract_up_to - first))), memctl);
+        item = first;
+        while (item < extract_up_to)
+        {
+          item_quant = NULL;
+          item_end = scan_item(item, extract_up_to, &item_quant);
+          meta = META_CODE(*item);
+          if (is_group_starter(meta))
+          {
+            /* We may need to rewrite groups within the common prefix */
+            rewrite_alternation(item, item_end, item_quant, buf, pattern, patstring, memctl);
+          }
+          else
+          {
+            rewrite_buf_copy(buf, item, item_end - item, memctl);
+          }
+          item = item_end;
+        }
+
+        if (extract_up_to != first_branch_end || smallest_branch != largest_branch)
+        {
+          if (smallest_branch == largest_branch && (first_branch_end - extract_up_to) == 1)
+          {
+            last_item = (quantifier != NULL) ? *(quantifier-2) : *(end-2);
+            if (all_branches_end_with_literal(alt_index, alt_positions, last_item))
+            {
+              /* Create character class for the last literal in each branch */
+              rewrite_buf_append(buf, META_CLASS, memctl);
+              for (unsigned int i = 0; i < alt_index; i++)
+                rewrite_buf_append(buf, *(alt_positions[i]-1), memctl);
+              rewrite_buf_append(buf, last_item, memctl);
+              rewrite_buf_append(buf, META_CLASS_END, memctl);
+              goto FINISH_REWRITE;
+            }
+          }
+
+          /* Add non-capturing paren */
+          rewrite_buf_append(buf, META_NOCAPTURE, memctl);
+
+          /* Copy the part AFTER the common prefix for each branch, separated by META_ALT
+           * When copying each part, allow subgroups to be rewritten */
+          prefix_size = extract_up_to - first;
+          rewrite_buf_copy(buf, extract_up_to, first_branch_end - extract_up_to + 1, memctl);
+          for (unsigned int i = 0; i < alt_index; i++)
+          {
+            uint32_t *copy_from = alt_positions[i] + prefix_size + 1;
+            uint32_t *copy_to = (i+1 < alt_index) ? alt_positions[i+1]+1 : end;
+            while (copy_from < copy_to)
+            {
+              item_quant = NULL;
+              item_end = scan_item(copy_from, copy_to, &item_quant);
+              meta = META_CODE(*copy_from);
+              if (is_group_starter(meta))
+              {
+                rewrite_alternation(copy_from, item_end, item_quant, buf, pattern, patstring, memctl);
+              }
+              else if (meta == META_KET)
+              {
+                /* Finish last part of group */
+                rewrite_buf_append(buf, META_KET, memctl);
+                goto FINISH_REWRITE;
+              }
+              else
+              {
+                rewrite_buf_copy(buf, copy_from, item_end - copy_from, memctl);
+              }
+              copy_from = item_end;
+            }
+          }
+
+          PCRE2_UNREACHABLE();
+        }
+
+        FINISH_REWRITE:
+        if (quantifier != NULL)
+          rewrite_buf_copy(buf, quantifier-1, end-quantifier+1, memctl);
+        else if (copy_opening_paren)
+          rewrite_buf_append(buf, META_KET, memctl);
+
+        if (alt_positions != alt_position_buf)
+          memctl->free(alt_positions, memctl->memory_data);
+        return;
+      }
+    }
+
+    if (alt_positions != alt_position_buf)
+      memctl->free(alt_positions, memctl->memory_data);
+  }
+
+  /* We didn't rewrite this group
+   * Even so, a rewrite of some subgroups might be needed */
+  DONT_REWRITE:
+  item = first;
+  if (buf->start != NULL)
+  {
+    /* It has already been decided that some part of the overall pattern does need to be rewritten */
+    rewrite_buf_copy(buf, start, first - start, memctl);
+    while (item < end)
+    {
+      item_quant = NULL;
+      item_end = scan_item(item, end, &item_quant);
+      meta = META_CODE(*item);
+      if (is_group_starter(meta))
+        rewrite_alternation(item, item_end, item_quant, buf, pattern, patstring, memctl);
+      else
+        rewrite_buf_copy(buf, item, item_end - item, memctl);
+      item = item_end;
+      REWRITE_UNDER_WAY: ;
+    }
+  }
+  else
+  {
+    /* It hasn't yet been decided that we need to rewrite some part of the overall pattern...
+     * But if we find a subgroup which needs rewrite, then we can still initiate that process */
+    while (item < end)
+    {
+      item_quant = NULL;
+      item_end = scan_item(item, end, &item_quant);
+      meta = META_CODE(*item);
+      if (is_group_starter(meta))
+      {
+        rewrite_alternation(item, item_end, item_quant, buf, pattern, patstring, memctl);
+        if (buf->start != NULL)
+        {
+          /* A rewrite has now started, so go to the other loop */
+          item = item_end;
+          goto REWRITE_UNDER_WAY;
+        }
+      }
+      item = item_end;
+    }
+  }
+}
+
+static void rewrite_regex(compile_block *cb, pcre2_compile_context *ccontext, BOOL heap_parsed_pattern)
+{
+  rewrite_buf buf;
+  rewrite_buf_init(&buf);
+
+  rewrite_alternation(cb->parsed_pattern, cb->parsed_pattern_end-1, NULL, &buf, cb->parsed_pattern, cb->start_pattern, &ccontext->memctl);
+  rewrite_finish(&buf, cb, &ccontext->memctl, heap_parsed_pattern);
+}
+
+/* Strip non-capturing parentheses which are wrapping the entire regex.
+ * (During the parsing phase, an extra BRA/KET pair is inserted around the
+ * entire regex to make it easier to traverse recursively during the pattern
+ * rewrite phase.)
*/ +static void strip_enclosing_nocaptures(compile_block *cb) +{ + uint32_t *group_end; + + while (cb->parsed_pattern_end > cb->parsed_pattern && META_CODE(cb->parsed_pattern[0]) == META_NOCAPTURE) + { + group_end = find_group_end(cb->parsed_pattern, cb->parsed_pattern_end); + if (group_end != cb->parsed_pattern_end-1) + break; + + cb->parsed_pattern++; + cb->parsed_pattern_end--; + } +} /************************************************* * Find first significant opcode * @@ -10239,7 +11059,9 @@ cb.named_groups = named_groups; cb.named_group_list_size = NAMED_GROUP_LIST_SIZE; cb.names_found = 0; cb.parens_depth = 0; -cb.parsed_pattern = stack_parsed_pattern; +cb.parsed_pattern_buf = cb.parsed_pattern = stack_parsed_pattern; +cb.parsed_pattern_end = NULL; +cb.parsed_pattern_limit = cb.parsed_pattern + PARSED_PATTERN_DEFAULT_SIZE; cb.req_varyopt = 0; cb.start_code = cworkspace; cb.start_pattern = pattern; @@ -10519,7 +11341,6 @@ parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options); /* Allow for 2x uint32_t at the start and 2 at the end, for PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). 
*/ - if ((ccontext->extra_options & (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0) parsed_size_needed += 4; @@ -10533,14 +11354,14 @@ parsed_size_needed += 1; /* For the final META_END */ if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE) { - uint32_t *heap_parsed_pattern = ccontext->memctl.malloc( - parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data); - if (heap_parsed_pattern == NULL) + cb.parsed_pattern_buf = ccontext->memctl.malloc(parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data); + if (cb.parsed_pattern_buf == NULL) { *errorptr = ERR21; goto EXIT; } - cb.parsed_pattern = heap_parsed_pattern; + cb.parsed_pattern = cb.parsed_pattern_buf; + cb.parsed_pattern_limit = cb.parsed_pattern + parsed_size_needed + 1; } cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed; @@ -10549,6 +11370,12 @@ cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed; errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb); if (errorcode != 0) goto HAD_CB_ERROR; +if ((cb.external_options & PCRE2_AUTO_CALLOUT) == 0 && + (optim_flags & PCRE2_OPTIM_PATTERN_REWRITE) != 0) + rewrite_regex(&cb, ccontext, cb.parsed_pattern_buf != stack_parsed_pattern); + +strip_enclosing_nocaptures(&cb); + /* If there are any lookbehinds, scan the parsed pattern to figure out their lengths. Workspace is needed to remember whether numbered groups are or are not of limited length, and if limited, what the minimum and maximum lengths are. @@ -10581,7 +11408,7 @@ if (has_lookbehind) /* For debugging, there is a function that shows the parsed pattern vector. 
*/ #ifdef DEBUG_SHOW_PARSED -fprintf(stderr, "+++ Pre-scan complete:\n"); +fprintf(stderr, "+++ Pre-scan (and rewrite phase) complete:\n"); show_parsed(&cb); #endif @@ -11048,8 +11875,8 @@ PCRE2_ASSERT(cb.cranges == NULL); #ifdef SUPPORT_VALGRIND if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1)); #endif -if (cb.parsed_pattern != stack_parsed_pattern) - ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data); +if (cb.parsed_pattern_buf != stack_parsed_pattern) + ccontext->memctl.free(cb.parsed_pattern_buf, ccontext->memctl.memory_data); if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE) ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data); if (cb.groupinfo != stack_groupinfo) diff --git a/src/pcre2_context.c b/src/pcre2_context.c index 2345145d3..cdf6a7a2f 100644 --- a/src/pcre2_context.c +++ b/src/pcre2_context.c @@ -429,7 +429,7 @@ switch (directive) break; default: - if (directive >= PCRE2_AUTO_POSSESS && directive <= PCRE2_START_OPTIMIZE_OFF) + if (directive >= PCRE2_AUTO_POSSESS && directive <= PCRE2_PATTERN_REWRITE_OFF) { /* Even directive numbers starting from 64 switch a bit on; * Odd directive numbers starting from 65 switch a bit off */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 6e0a5e05d..8d1c8b01c 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -630,8 +630,9 @@ total length of the tables. */ #define PCRE2_OPTIM_AUTO_POSSESS 0x00000001u #define PCRE2_OPTIM_DOTSTAR_ANCHOR 0x00000002u #define PCRE2_OPTIM_START_OPTIMIZE 0x00000004u +#define PCRE2_OPTIM_PATTERN_REWRITE 0x00000008u -#define PCRE2_OPTIMIZATION_ALL 0x00000007u +#define PCRE2_OPTIMIZATION_ALL 0x0000000Fu /* -------------------- Character and string names ------------------------ */ @@ -987,6 +988,7 @@ a positive value. 
*/ #define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)" #define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR "NO_DOTSTAR_ANCHOR)" #define STRING_NO_JIT_RIGHTPAR "NO_JIT)" +#define STRING_NO_REWRITE_RIGHTPAR "NO_REWRITE)" #define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" #define STRING_NOTEMPTY_RIGHTPAR "NOTEMPTY)" #define STRING_NOTEMPTY_ATSTART_RIGHTPAR "NOTEMPTY_ATSTART)" @@ -1292,6 +1294,7 @@ only. */ #define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS #define STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_D STR_O STR_T STR_S STR_T STR_A STR_R STR_UNDERSCORE STR_A STR_N STR_C STR_H STR_O STR_R STR_RIGHT_PARENTHESIS #define STRING_NO_JIT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_J STR_I STR_T STR_RIGHT_PARENTHESIS +#define STRING_NO_REWRITE_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_R STR_E STR_W STR_R STR_I STR_T STR_E STR_RIGHT_PARENTHESIS #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS #define STRING_NOTEMPTY_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_RIGHT_PARENTHESIS #define STRING_NOTEMPTY_ATSTART_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_UNDERSCORE STR_A STR_T STR_S STR_T STR_A STR_R STR_T STR_RIGHT_PARENTHESIS diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 6b858139f..9be7527fa 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -772,8 +772,10 @@ typedef struct compile_block { uint32_t external_flags; /* External flag bits to be set */ uint32_t bracount; /* Count of capturing parentheses */ uint32_t lastcapture; /* Last capture encountered */ - uint32_t *parsed_pattern; /* Parsed pattern buffer */ - uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */ + uint32_t *parsed_pattern_buf; /* Parsed pattern buffer */ + uint32_t *parsed_pattern; /* 
First used position in parsed pattern buffer */ + uint32_t *parsed_pattern_end; /* Last used position in parsed pattern buffer + 1 */ + uint32_t *parsed_pattern_limit; /* Parsed pattern should not reach here */ uint32_t *groupinfo; /* Group info vector */ uint32_t top_backref; /* Maximum back reference */ uint32_t backref_map; /* Bitmap of low back refs */ diff --git a/src/pcre2test.c b/src/pcre2test.c index 80ab4f809..a88da93a2 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -760,6 +760,8 @@ static modstruct modlist[] = { { "parens_nest_limit", MOD_CTC, MOD_INT, 0, CO(parens_nest_limit) }, { "partial_hard", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, { "partial_soft", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, + { "pattern_rewrite", MOD_CTC, MOD_OPTMZ, PCRE2_PATTERN_REWRITE, 0 }, + { "pattern_rewrite_off", MOD_CTC, MOD_OPTMZ, PCRE2_PATTERN_REWRITE_OFF, 0 }, { "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, { "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) }, { "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) }, @@ -4464,13 +4466,15 @@ static void show_optimize_flags(uint32_t flags, const char *before, const char *after) { if (flags == 0) fprintf(outfile, "%s%s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s%s", before, ((flags & PCRE2_OPTIM_AUTO_POSSESS) != 0) ? "auto_possess" : "", ((flags & PCRE2_OPTIM_AUTO_POSSESS) != 0 && (flags >> 1) != 0) ? "," : "", ((flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0) ? "dotstar_anchor" : "", ((flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0 && (flags >> 2) != 0) ? "," : "", ((flags & PCRE2_OPTIM_START_OPTIMIZE) != 0) ? "start_optimize" : "", + ((flags & PCRE2_OPTIM_START_OPTIMIZE) != 0 && (flags >> 3) != 0) ? "," : "", + ((flags & PCRE2_OPTIM_PATTERN_REWRITE) != 0) ? 
"pattern_rewrite" : "", after); } diff --git a/testdata/testinput17 b/testdata/testinput17 index 7dd2d8ea9..62dd4500a 100644 --- a/testdata/testinput17 +++ b/testdata/testinput17 @@ -282,7 +282,11 @@ /[axm]{7}/ -/(.|.)*?bx/ +/(.|.)*?bx/pattern_rewrite_off +\= Expect limit exceeded + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabax\=match_limit=10000000 + +/(a|.)*?bx/ \= Expect limit exceeded aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabax\=match_limit=10000000 diff --git a/testdata/testinput21 b/testdata/testinput21 index 5904af3de..642791e81 100644 --- a/testdata/testinput21 +++ b/testdata/testinput21 @@ -15,4 +15,12 @@ /[\C]/ +# Pattern Rewrite + +/\Ca|\Cb/B + \xffa + \xffb +\= Expect no match + \xff\xfe + # End of testinput21 diff --git a/testdata/testinput22 b/testdata/testinput22 index 5e01fdcab..5e96b437e 100644 --- a/testdata/testinput22 +++ b/testdata/testinput22 @@ -104,4 +104,11 @@ /\C[^\d]+\x80/utf [AΏBŀC] +# Pattern Rewrite + +# \C escape (single code unit) can be pulled out +# ⇒ \C[ab] + +/\C\Ca|\C\Cb/B,utf + # End of testinput22 diff --git a/testdata/testinput28 b/testdata/testinput28 new file mode 100644 index 000000000..45f1f062f --- /dev/null +++ b/testdata/testinput28 @@ -0,0 +1,453 @@ +# Tests for the pattern rewriter which precedes regex compilation (without UTF) + +#forbid_utf + +# Pulling out common prefixes from alternation: + +# Simplest case: +# ⇒ a[b-d] +/ab|ac|ad/B + +# ⇒ a(?:b|c|(d)) +/ab|ac|a(d)/B + +# ⇒ ab[bc] +/abb|abc/B + +# Inside a group is also OK: +# ⇒ (a[b-d]) +/(ab|ac|ad)/B + +# Can recurse into deeply nested groups: +# ⇒ (((a[b-d]))) +/(((ab|ac|ad)))/B + +# Another variant +# ⇒ (a((a[b-d]))b) +/(a((ab|ac|ad))b)/B + +# Common prefix covers all of one alternation branch +# ⇒ a(?:|b) +/a|ab/B + +# Multiple identical alternation branches +# (We don't detect this and prune the redundant ones) +/a|ab|a/B + +# Can't be rewritten +/.+a|.+b/B + aaab + +# Can't be rewritten +/.*b|.*c/B + bbbc + +# Can't be rewritten +/.?b|.?c/B + bc + +# However, a quantifier 
which matches a specific number of times is OK: +# ⇒ .{2}[bc] +/.{2}b|.{2}c/B + +# Can't be rewritten +# NOTE: During regex compilation, a{1,2} is converted to aa{0,1} +# If this conversion was done earlier, we could pull out a common prefix +# 🤷 +/a{1,2}b|a{1,2}c/B + +# Possessive quantifiers are OK: +# ⇒ .++[ab] +/.++a|.++b/B + +# ⇒ .*+[ab] +/.*+a|.*+b/B + +# ⇒ .?+[ab] +/.?+a|.?+b/B + +# ⇒ .{2,4}+[ab] +/.{2,4}+a|.{2,4}+b/B + +# Non-capturing groups can be pulled out ONLY if they don't contain anything +# which the regex engine can backtrack into +# This includes: alternation, non-possessive quantifiers +# ⇒ (?:aa)[bc] +/(?:aa)b|(?:aa)c/B + +# Can't be rewritten +/(?:\s|\d)b|(?:\s|\d)c/B + +# Will be rewritten into character class +# ⇒ [ab]b|[ab]c +/(?:a|b)b|(?:a|b)c/B + +# Can't be rewritten +/(?:.*)b|(?:.*)c/B + bbbc + +# Capturing groups can't, because it would change the capture numbers +/(\d)b|(\d)c/B + 1b + 1c + +# Will be rewritten into character class +# ⇒ ([ab])b|([ab])c +/(a|b)b|(a|b)c/B + ab + ac + +# Non-capturing group quantified with *, +, ? +# Can't be pulled out from alternation... but (?:a|b) will be converted to (?:[ab]) +/(?:a|b)*b|(?:a|b)*c/B + +# Can't be pulled out, but can be converted to use [ab] +/(?:a|b)+b|(?:a|b)+c/B + +# Can't be pulled out, but can be converted to use [ab] +/(?:a|b)?b|(?:a|b)?c/B + +# Lookahead can be pulled out +# ⇒ (?=a)a[AB] +/(?=a)aA|(?=a)aB/B + +# Lookbehind can be pulled out +# ⇒ (?<=a)[bc] +/(?<=a)b|(?<=a)c/B + +# Non-matching lookbehind can't be pulled out (of course) +/(?<=a)b|(?<=b)c/B + +# Negative lookahead can be pulled out +# ⇒ (?!a)[bc] +/(?!a)b|(?!a)c/B + +# Negative lookbehind can be pulled out +# ⇒ (?aa)a[bc] +/(?>aa)ab|(?>aa)ac/B + +# Sometimes items are pulled out FROM an atomic group... 
+# ⇒ (?>a[bc]) +/(?>ab|ac)/B + +# Character classes can be pulled out +# ⇒ [a-z][bc] +/[a-z]b|[a-z]c/B + +# Semantically identical char classes, written in a different way +# NOTE: If char classes were canonicalized before pattern rewriting, +# we could pull out a common prefix here +# 🤷 Too bad +/[abc]b|[a-c]c/B + +# Character types +# ⇒ \d\s\v\w[ab] +/\d\s\v\wa|\d\s\v\wb/B + +# Start-of-string anchor +/^a|^b/B + +# Rewriting still works with options at beginning of regex +/(*NOTEMPTY)(*NO_AUTO_POSSESS)ab|ac/B + +/(*NO_START_OPT)ab|ac/B + +/(*NO_DOTSTAR_ANCHOR)ab|ac/B + +/(*LIMIT_HEAP=10000)ab|ac/B + +/(*LIMIT_MATCH=10)ab|ac/B + +/(*LIMIT_DEPTH=10)ab|ac/B + +/(*CR)ab|ac/B + +# Dot +/.a|.b/B + +# Extended regex (with embedded whitespace) +# The whitespace doesn't interfere with rewriting +/(?x) a b | a a b/B + +# \Q..\E literal sequences +/\Q$\Ea|\Q$\Eb/B + +# Hex escapes +# 'A' and '\x41' are recognized as equivalent and rewritten +/Aa|\x41b/B + +# Backreferences can be pulled out +/(\w)(?:\1a|\1b)/B + +# Relative backreferences can be pulled out +/(\w)(?:\g{-1}a|\g{-1}b)/B + +# Match reset escape can be pulled out +/a\Kb|a\Kc/B + +# Group with duplicate capture numbers +# ⇒ (a)[bc] +/(?|(a)b|(a)c)/B + ab + ac + +# Although capture numbers are the same, capturing groups can't be pulled out +# if they contain anything which the regex engine can backtrack into +/(?|(a*)b|(a*)c)/B + +/(?|(\d|\s)b|(\d|\s)c)/B + +# Non-capturing group with option letters +/(?i:ab|ac)/B + +# Named capture groups +# This can't be rewritten, because although the capture group +# names are the same, their numbers are different +/(?J)(?:(?a)b|(?a)c)\k/B + +# Named capture groups with identical capture group numbers +# These can be rewritten +/(?|(?a)b|(?a)c)\k/B + +# Capture group condition +# Can't be rewritten +/(a)(?(1)bc|bd)/B + +# Capture group condition (by group name) +# Can't be rewritten +/(?a)(?()bc|bd)/B + +# Version number condition +# Can't be rewritten 
+/(?(VERSION>=10.4)ab|ac)/B + +# Lookahead assertion condition +# Can't be rewritten +/(?(?=a)ab|ac)/B + +# However, subgroups of a conditional group can be rewritten +/(?(?=a)a(?:bb|bc)|ac)/B + +# Lookbehind assertion condition +# Can't be rewritten +/(?(?<=a)ab|ac)/B + +# Recursion condition +# Can't be rewritten +/(?(R)ab|ac)/B + +# Recursion condition with explicit number +# Can't be rewritten +/(a)(?(R1)ab|ac)/B + +# Recursion condition by name +# Can't be rewritten +/(?a)(?(R&n)ab|ac)/B + +# Define +/(?(DEFINE) a)b|(?(DEFINE) a)\w/B + +# Subroutine call by number +# ⇒ (?|(a)|(b))(?1)[bc] +/(?|(a)|(b))(?:(?1)b|(?1)c)/B + +# Subroutine call by number, but with non-matching number +/(?:(a)|(b))(?:(?1)b|(?2)c)/B + +# Subroutine call by name +# ⇒ (?a)(?&n)[ab] +/(?a)(?:(?&n)a|(?&n)b)/B + +# Subroutine call by name, but with non-matching name +/(?a)(?b)(?:(?&n)a|(?&m)b)/B + +/(?a)(?b)(?:(?&abce)a|(?&abcd)b)/B + +# Callouts are never pulled out +/(?C0)a|(?C0)b/B + +/(?C{ab})a|(?C{ab})b/B + +# Callouts are still not pulled out if they are inside a sub-group +/(?:(?C0))a|(?:(?C0))b/B + +/(?:(?C{ab}))a|(?:(?C{ab}))b/B + +/(?>(?C0))a|(?>(?C0))b/B + +/(?=(?C0))a|(?=(?C0))b/B + +/(?<=(?C0))a|(?<=(?C0))b/B + +# (*ACCEPT) can be pulled out +/(*ACCEPT)ab|(*ACCEPT)ac/B + +# (*ACCEPT:name) can be pulled out +/(*ACCEPT:hello)ab|(*ACCEPT:hello)ac/B + +# (*ACCEPT:name) but with non-matching names +/(*ACCEPT:hello)ab|(*ACCEPT:goodbye)ac/B + +/(*ACCEPT:a)ab|(*ACCEPT:b)ac/B + +# (*FAIL) can be pulled out +/(*FAIL)ab|(*FAIL)ac/B + +# (*FAIL:name) can be pulled out +/(*FAIL:hello)ab|(*FAIL:hello)ac/B + +# (*FAIL:name) but with non-matching names +/(*FAIL:hello)ab|(*FAIL:goodbye)ac/B + +# (*MARK:name) can be pulled out +/(*MARK:hello)ab|(*MARK:hello)ac/B + +# (*MARK:name) but with non-matching names +/(*MARK:hello)ab|(*MARK:goodbye)ac/B + +# (*PRUNE) is never pulled out +/(*PRUNE)a|(*PRUNE)b/B + +/(?:(*PRUNE))a|(?:(*PRUNE))b/B + +# (*PRUNE:name) is never pulled out 
+/(*PRUNE:abc)a|(*PRUNE:abc)b/B + +/(?:(*PRUNE:abc))a|(?:(*PRUNE:abc))b/B + +# (*COMMIT) is never pulled out +/(*COMMIT)a|(*COMMIT)b/B + +/(?:(*COMMIT))a|(?:(*COMMIT))b/B + +# (*COMMIT:name) is never pulled out +/(*COMMIT:abc)a|(*COMMIT:abc)b/B + +/(?:(*COMMIT:abc))a|(?:(*COMMIT:abc))b/B + +# (*SKIP) is never pulled out +/(*SKIP)a|(*SKIP)b/B + +/(?:(*SKIP))a|(?:(*SKIP))b/B + +# (*SKIP:name) is never pulled out +/(*SKIP:abc)a|(*SKIP:abc)b/B + +/(?:(*SKIP:abc))a|(?:(*SKIP:abc))b/B + +# (*THEN) is never pulled out +/(*THEN)a|(*THEN)b/B + +/(?:(*THEN))a|(?:(*THEN))b/B + +# (*THEN:name) is never pulled out +/(*THEN:abc)a|(*THEN:abc)b/B + +/(?:(*THEN:abc))a|(?:(*THEN:abc))b/B + +# Common prefix which itself has a common prefix +# ⇒ (?>a[bc])[de] +/(?:(?>ab|ac)d|(?>ab|ac)e)/B + +# Rewriting common prefix causes parent group to have a common prefix +# (We don't detect this case) +# ⇒ (?:a[bc]d|a[bc]e) +/(?:a(?:b|c)d|(?:ab|ac)e)/B + +# Another case: +# ⇒ (?>a[bc]d|a[bc]e) +/(?>(?:ab|ac)d|(?:ab|ac)e)/B + +# When rewriting groups which are themselves quantified, +# the extracted items have to stay inside the quantified group +# ⇒ (?:a[bc])? +/(?:ab|ac)?/B + +/(?:ab|ac)*/B + +/(?:ab|ac)+/B + +/(?:ab|ac)?+/B + +/(?:ab|ac)*+/B + +/(?:ab|ac)++/B + +/(?:ab|ac){2}/B + +/(?:ab|ac){2,4}/B + +/(?:ab|ac){2,4}+/B + +# Regression test: +# Pattern rewriter must properly handle assert conditions which contain alternation +# This should not be rewritten: +/b(?(?!)|b)/B + +# Regression test: +# For a conditional group which uses an assertion condition, that assertion condition +# cannot be pulled out +# This should not be rewritten (and the subject string should not match): +/(?(? 
Overall options: no_auto_possess -Optimizations: dotstar_anchor,start_optimize +Optimizations: dotstar_anchor,start_optimize,pattern_rewrite Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z Subject length lower bound = 1 diff --git a/testdata/testoutput17 b/testdata/testoutput17 index 95f395971..46f378a10 100644 --- a/testdata/testoutput17 +++ b/testdata/testoutput17 @@ -523,7 +523,12 @@ Failed: error -46: JIT stack limit reached /[axm]{7}/ -/(.|.)*?bx/ +/(.|.)*?bx/pattern_rewrite_off +\= Expect limit exceeded + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabax\=match_limit=10000000 +Failed: error -47: match limit exceeded + +/(a|.)*?bx/ \= Expect limit exceeded aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabax\=match_limit=10000000 Failed: error -47: match limit exceeded diff --git a/testdata/testoutput2 b/testdata/testoutput2 index de4752e2b..a336912ba 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -1127,7 +1127,6 @@ Subject length lower bound = 1 /(?s:.*X|^B)/IB ------------------------------------------------------------------ - Bra Bra AllAny* X @@ -1135,7 +1134,6 @@ Subject length lower bound = 1 ^ B Ket - Ket End ------------------------------------------------------------------ Capture group count = 0 @@ -1371,6 +1369,7 @@ Subject length lower bound = 2 /(abc|ab[cd])/I Capture group count = 1 First code unit = 'a' +Last code unit = 'b' Subject length lower bound = 3 /(a|.)/I @@ -3028,7 +3027,7 @@ Subject length lower bound = 1 End ------------------------------------------------------------------ Capture group count = 0 -Optimizations: dotstar_anchor,start_optimize +Optimizations: dotstar_anchor,start_optimize,pattern_rewrite First code unit = 'x' Subject length lower bound = 1 @@ -4173,9 +4172,7 @@ Subject length lower bound = 2 Bra a CBra 1 - b - Alt - c + [bc] Ket d CBra 2 @@ -4723,9 +4720,7 @@ Subject length lower bound = 1 Bra Brazero CBra 1 - a - Alt - b + [ab] 
KetRmax Any? c @@ -7984,7 +7979,6 @@ Failed: error 142 at offset 4: syntax error in subpattern name (missing terminat /(?|(abc)|(xyz))/B ------------------------------------------------------------------ - Bra Bra CBra 1 abc @@ -7994,7 +7988,6 @@ Failed: error 142 at offset 4: syntax error in subpattern name (missing terminat xyz Ket Ket - Ket End ------------------------------------------------------------------ >abc< @@ -13706,11 +13699,11 @@ Subject length lower bound = 4 /abcd/I,no_start_optimize Capture group count = 0 Options: no_start_optimize -Optimizations: auto_possess,dotstar_anchor +Optimizations: auto_possess,dotstar_anchor,pattern_rewrite /abcd/I,start_optimize_off Capture group count = 0 -Optimizations: auto_possess,dotstar_anchor +Optimizations: auto_possess,dotstar_anchor,pattern_rewrite /abcd/I,optimization_none Capture group count = 0 @@ -13730,7 +13723,7 @@ Subject length lower bound = 1 /(|ab)*?d/I,no_start_optimize Capture group count = 1 Options: no_start_optimize -Optimizations: auto_possess,dotstar_anchor +Optimizations: auto_possess,dotstar_anchor,pattern_rewrite abd 0: abd 1: ab @@ -14189,7 +14182,7 @@ Subject length lower bound = 3 Capture group count = 0 Compile options: no_dotstar_anchor Overall options: anchored no_dotstar_anchor -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite First code unit = 'a' Subject length lower bound = 3 @@ -14197,7 +14190,7 @@ Subject length lower bound = 3 Capture group count = 0 Compile options: Overall options: anchored -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite First code unit = 'a' Subject length lower bound = 3 @@ -14224,7 +14217,7 @@ Subject length lower bound = 3 End ------------------------------------------------------------------ Capture group count = 0 -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite Last code unit = 'c' Subject 
length lower bound = 3 @@ -14237,7 +14230,7 @@ Subject length lower bound = 3 End ------------------------------------------------------------------ Capture group count = 0 -Optimizations: auto_possess,dotstar_anchor +Optimizations: auto_possess,dotstar_anchor,pattern_rewrite /.*abc/BI,optimization_none ------------------------------------------------------------------ @@ -14260,7 +14253,7 @@ Optimizations: ------------------------------------------------------------------ Capture group count = 0 Options: no_dotstar_anchor -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite Last code unit = 'c' Subject length lower bound = 3 @@ -14282,7 +14275,7 @@ No match /.*\d/info,no_dotstar_anchor,auto_callout Capture group count = 0 Options: auto_callout no_dotstar_anchor -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite Subject length lower bound = 1 \= Expect no match aaa @@ -14310,14 +14303,14 @@ Subject length lower bound = 1 /.*\d/dotall,no_dotstar_anchor,info Capture group count = 0 Options: dotall no_dotstar_anchor -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite Subject length lower bound = 1 /(*NO_DOTSTAR_ANCHOR)(?s).*\d/info Capture group count = 0 Compile options: Overall options: no_dotstar_anchor -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite Subject length lower bound = 1 '^(?:(a)|b)(?(1)A|B)' @@ -15483,7 +15476,6 @@ Subject length lower bound = 65535 /(?|()+|(a)+)/BI ------------------------------------------------------------------ - Bra Bra SCBra 1 KetRmax @@ -15492,7 +15484,6 @@ Subject length lower bound = 65535 a KetRmax Ket - Ket End ------------------------------------------------------------------ Capture group count = 1 @@ -15501,7 +15492,6 @@ Subject length lower bound = 0 /(?|(a)+|()+)/BI 
------------------------------------------------------------------ - Bra Bra CBra 1 a @@ -15510,7 +15500,6 @@ Subject length lower bound = 0 SCBra 1 KetRmax Ket - Ket End ------------------------------------------------------------------ Capture group count = 1 @@ -15519,7 +15508,6 @@ Subject length lower bound = 0 /(?|()|(a))/BI ------------------------------------------------------------------ - Bra Bra CBra 1 Ket @@ -15528,7 +15516,6 @@ Subject length lower bound = 0 a Ket Ket - Ket End ------------------------------------------------------------------ Capture group count = 1 @@ -15537,7 +15524,6 @@ Subject length lower bound = 0 /(?|(a)|())/BI ------------------------------------------------------------------ - Bra Bra CBra 1 a @@ -15546,7 +15532,6 @@ Subject length lower bound = 0 CBra 1 Ket Ket - Ket End ------------------------------------------------------------------ Capture group count = 1 @@ -16861,7 +16846,7 @@ No match /(cat)|dog/I,literal,auto_possess_off Capture group count = 0 Options: literal -Optimizations: dotstar_anchor,start_optimize +Optimizations: dotstar_anchor,start_optimize,pattern_rewrite First code unit = '(' Last code unit = 'g' Subject length lower bound = 9 @@ -16874,7 +16859,7 @@ No match /(cat)|dog/I,literal,dotstar_anchor_off Capture group count = 0 Options: literal -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite First code unit = '(' Last code unit = 'g' Subject length lower bound = 9 @@ -17842,7 +17827,8 @@ Subject length lower bound = 1 Capture group count = 0 May match empty string First code unit = 'a' -Subject length lower bound = 1 +Last code unit = 'b' +Subject length lower bound = 2 /(*napla:a|(.)(*ACCEPT)zz)\1../ abcd @@ -18086,7 +18072,7 @@ Subject length lower bound = 5 /(?:c|C)abcd/I Capture group count = 0 -First code unit = 'C' (caseless) +First code unit = 'c' (caseless) Last code unit = 'd' Subject length lower bound = 5 @@ -18626,14 +18612,14 @@ Subject 
length lower bound = 1 /a?(?=b(*COMMIT)c|)d/I,no_start_optimize Capture group count = 0 Options: no_start_optimize -Optimizations: auto_possess,dotstar_anchor +Optimizations: auto_possess,dotstar_anchor,pattern_rewrite bd No match /(?=b(*COMMIT)c|)d/I,no_start_optimize Capture group count = 0 Options: no_start_optimize -Optimizations: auto_possess,dotstar_anchor +Optimizations: auto_possess,dotstar_anchor,pattern_rewrite bd No match @@ -19697,7 +19683,7 @@ Subject length lower bound = 3 /abc/I,no_auto_possess,auto_possess Capture group count = 0 Options: no_auto_possess -Optimizations: dotstar_anchor,start_optimize +Optimizations: dotstar_anchor,start_optimize,pattern_rewrite First code unit = 'a' Last code unit = 'c' Subject length lower bound = 3 @@ -19705,7 +19691,7 @@ Subject length lower bound = 3 /abc/I,no_dotstar_anchor,dotstar_anchor Capture group count = 0 Options: no_dotstar_anchor -Optimizations: auto_possess,start_optimize +Optimizations: auto_possess,start_optimize,pattern_rewrite First code unit = 'a' Last code unit = 'c' Subject length lower bound = 3 @@ -19713,7 +19699,7 @@ Subject length lower bound = 3 /abc/I,no_start_optimize,start_optimize Capture group count = 0 Options: no_start_optimize -Optimizations: auto_possess,dotstar_anchor +Optimizations: auto_possess,dotstar_anchor,pattern_rewrite # -------------- @@ -20264,11 +20250,9 @@ No match /(?xx:[ ^ a[ ^ b] ])/B,alt_extended_class ------------------------------------------------------------------ - Bra Bra [b] Ket - Ket End ------------------------------------------------------------------ b diff --git a/testdata/testoutput21 b/testdata/testoutput21 index 3ded7ed28..4c0cc27ad 100644 --- a/testdata/testoutput21 +++ b/testdata/testoutput21 @@ -94,4 +94,22 @@ Subject length lower bound = 5 /[\C]/ Failed: error 107 at offset 2: escape sequence is invalid in character class +# Pattern Rewrite + +/\Ca|\Cb/B +------------------------------------------------------------------ + Bra + AllAny + [ab] + 
Ket + End +------------------------------------------------------------------ + \xffa + 0: \xffa + \xffb + 0: \xffb +\= Expect no match + \xff\xfe +No match + # End of testinput21 diff --git a/testdata/testoutput22-16 b/testdata/testoutput22-16 index 542185405..577ab44a6 100644 --- a/testdata/testoutput22-16 +++ b/testdata/testoutput22-16 @@ -179,4 +179,19 @@ No match [AΏBŀC] No match +# Pattern Rewrite + +# \C escape (single code unit) can be pulled out +# ⇒ \C[ab] + +/\C\Ca|\C\Cb/B,utf +------------------------------------------------------------------ + Bra + Anybyte + Anybyte + [ab] + Ket + End +------------------------------------------------------------------ + # End of testinput22 diff --git a/testdata/testoutput22-32 b/testdata/testoutput22-32 index e96696a9c..c754592ee 100644 --- a/testdata/testoutput22-32 +++ b/testdata/testoutput22-32 @@ -177,4 +177,19 @@ No match [AΏBŀC] No match +# Pattern Rewrite + +# \C escape (single code unit) can be pulled out +# ⇒ \C[ab] + +/\C\Ca|\C\Cb/B,utf +------------------------------------------------------------------ + Bra + AllAny + AllAny + [ab] + Ket + End +------------------------------------------------------------------ + # End of testinput22 diff --git a/testdata/testoutput22-8 b/testdata/testoutput22-8 index eab410eb7..906feb675 100644 --- a/testdata/testoutput22-8 +++ b/testdata/testoutput22-8 @@ -181,4 +181,19 @@ No match [AΏBŀC] No match +# Pattern Rewrite + +# \C escape (single code unit) can be pulled out +# ⇒ \C[ab] + +/\C\Ca|\C\Cb/B,utf +------------------------------------------------------------------ + Bra + Anybyte + Anybyte + [ab] + Ket + End +------------------------------------------------------------------ + # End of testinput22 diff --git a/testdata/testoutput28 b/testdata/testoutput28 new file mode 100644 index 000000000..e986e2538 --- /dev/null +++ b/testdata/testoutput28 @@ -0,0 +1,1974 @@ +# Tests for the pattern rewriter which precedes regex compilation (without UTF) + +#forbid_utf + +# 
Pulling out common prefixes from alternation: + +# Simplest case: +# ⇒ a[b-d] +/ab|ac|ad/B +------------------------------------------------------------------ + Bra + a + [b-d] + Ket + End +------------------------------------------------------------------ + +# ⇒ a(?:b|c|(d)) +/ab|ac|a(d)/B +------------------------------------------------------------------ + Bra + a + Bra + b + Alt + c + Alt + CBra 1 + d + Ket + Ket + Ket + End +------------------------------------------------------------------ + +# ⇒ ab[bc] +/abb|abc/B +------------------------------------------------------------------ + Bra + ab + [bc] + Ket + End +------------------------------------------------------------------ + +# Inside a group is also OK: +# ⇒ (a[b-d]) +/(ab|ac|ad)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + [b-d] + Ket + Ket + End +------------------------------------------------------------------ + +# Can recurse into deeply nested groups: +# ⇒ (((a[b-d]))) +/(((ab|ac|ad)))/B +------------------------------------------------------------------ + Bra + CBra 1 + CBra 2 + CBra 3 + a + [b-d] + Ket + Ket + Ket + Ket + End +------------------------------------------------------------------ + +# Another variant +# ⇒ (a((a[b-d]))b) +/(a((ab|ac|ad))b)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + CBra 2 + CBra 3 + a + [b-d] + Ket + Ket + b + Ket + Ket + End +------------------------------------------------------------------ + +# Common prefix covers all of one alternation branch +# ⇒ a(?:|b) +/a|ab/B +------------------------------------------------------------------ + Bra + a + Bra + Alt + b + Ket + Ket + End +------------------------------------------------------------------ + +# Multiple identical alternation branches +# (We don't detect this and prune the redundant ones) +/a|ab|a/B +------------------------------------------------------------------ + Bra + a + Bra + Alt + b + Alt + Ket + Ket + End 
+------------------------------------------------------------------ + +# Can't be rewritten +/.+a|.+b/B +------------------------------------------------------------------ + Bra + Any+ + a + Alt + Any+ + b + Ket + End +------------------------------------------------------------------ + aaab + 0: aaa + +# Can't be rewritten +/.*b|.*c/B +------------------------------------------------------------------ + Bra + Any* + b + Alt + Any* + c + Ket + End +------------------------------------------------------------------ + bbbc + 0: bbb + +# Can't be rewritten +/.?b|.?c/B +------------------------------------------------------------------ + Bra + Any? + b + Alt + Any? + c + Ket + End +------------------------------------------------------------------ + bc + 0: b + +# However, a quantifier which matches a specific number of times is OK: +# ⇒ .{2}[bc] +/.{2}b|.{2}c/B +------------------------------------------------------------------ + Bra + Any{2} + [bc] + Ket + End +------------------------------------------------------------------ + +# Can't be rewritten +# NOTE: During regex compilation, a{1,2} is converted to aa{0,1} +# If this conversion was done earlier, we could pull out a common prefix +# 🤷 +/a{1,2}b|a{1,2}c/B +------------------------------------------------------------------ + Bra + a + a{0,1}+ + b + Alt + a + a{0,1}+ + c + Ket + End +------------------------------------------------------------------ + +# Possessive quantifiers are OK: +# ⇒ .++[ab] +/.++a|.++b/B +------------------------------------------------------------------ + Bra + Any++ + [ab] + Ket + End +------------------------------------------------------------------ + +# ⇒ .*+[ab] +/.*+a|.*+b/B +------------------------------------------------------------------ + Bra + Any*+ + [ab] + Ket + End +------------------------------------------------------------------ + +# ⇒ .?+[ab] +/.?+a|.?+b/B +------------------------------------------------------------------ + Bra + Any?+ + [ab] + Ket + End 
+------------------------------------------------------------------ + +# ⇒ .{2,4}+[ab] +/.{2,4}+a|.{2,4}+b/B +------------------------------------------------------------------ + Bra + Any{2} + Any{0,2}+ + [ab] + Ket + End +------------------------------------------------------------------ + +# Non-capturing groups can be pulled out ONLY if they don't contain anything +# which the regex engine can backtrack into +# This includes: alternation, non-possessive quantifiers +# ⇒ (?:aa)[bc] +/(?:aa)b|(?:aa)c/B +------------------------------------------------------------------ + Bra + Bra + aa + Ket + [bc] + Ket + End +------------------------------------------------------------------ + +# Can't be rewritten +/(?:\s|\d)b|(?:\s|\d)c/B +------------------------------------------------------------------ + Bra + Bra + \s + Alt + \d + Ket + b + Alt + Bra + \s + Alt + \d + Ket + c + Ket + End +------------------------------------------------------------------ + +# Will be rewritten into character class +# ⇒ [ab]b|[ab]c +/(?:a|b)b|(?:a|b)c/B +------------------------------------------------------------------ + Bra + [ab] + b + Alt + [ab] + c + Ket + End +------------------------------------------------------------------ + +# Can't be rewritten +/(?:.*)b|(?:.*)c/B +------------------------------------------------------------------ + Bra + Bra + Any* + Ket + b + Alt + Bra + Any* + Ket + c + Ket + End +------------------------------------------------------------------ + bbbc + 0: bbb + +# Capturing groups can't, because it would change the capture numbers +/(\d)b|(\d)c/B +------------------------------------------------------------------ + Bra + CBra 1 + \d + Ket + b + Alt + CBra 2 + \d + Ket + c + Ket + End +------------------------------------------------------------------ + 1b + 0: 1b + 1: 1 + 1c + 0: 1c + 1: + 2: 1 + +# Will be rewritten into character class +# ⇒ ([ab])b|([ab])c +/(a|b)b|(a|b)c/B +------------------------------------------------------------------ + Bra + CBra 
1 + [ab] + Ket + b + Alt + CBra 2 + [ab] + Ket + c + Ket + End +------------------------------------------------------------------ + ab + 0: ab + 1: a + ac + 0: ac + 1: + 2: a + +# Non-capturing group quantified with *, +, ? +# Can't be pulled out from alternation... but (?:a|b) will be converted to (?:[ab]) +/(?:a|b)*b|(?:a|b)*c/B +------------------------------------------------------------------ + Bra + Brazero + Bra + [ab] + KetRmax + b + Alt + Brazero + Bra + [ab] + KetRmax + c + Ket + End +------------------------------------------------------------------ + +# Can't be pulled out, but can be converted to use [ab] +/(?:a|b)+b|(?:a|b)+c/B +------------------------------------------------------------------ + Bra + Bra + [ab] + KetRmax + b + Alt + Bra + [ab] + KetRmax + c + Ket + End +------------------------------------------------------------------ + +# Can't be pulled out, but can be converted to use [ab] +/(?:a|b)?b|(?:a|b)?c/B +------------------------------------------------------------------ + Bra + Brazero + Bra + [ab] + Ket + b + Alt + Brazero + Bra + [ab] + Ket + c + Ket + End +------------------------------------------------------------------ + +# Lookahead can be pulled out +# ⇒ (?=a)a[AB] +/(?=a)aA|(?=a)aB/B +------------------------------------------------------------------ + Bra + Assert + a + Ket + a + [AB] + Ket + End +------------------------------------------------------------------ + +# Lookbehind can be pulled out +# ⇒ (?<=a)[bc] +/(?<=a)b|(?<=a)c/B +------------------------------------------------------------------ + Bra + Assert back + Reverse + a + Ket + [bc] + Ket + End +------------------------------------------------------------------ + +# Non-matching lookbehind can't be pulled out (of course) +/(?<=a)b|(?<=b)c/B +------------------------------------------------------------------ + Bra + Assert back + Reverse + a + Ket + b + Alt + Assert back + Reverse + b + Ket + c + Ket + End 
+------------------------------------------------------------------ + +# Negative lookahead can be pulled out +# ⇒ (?!a)[bc] +/(?!a)b|(?!a)c/B +------------------------------------------------------------------ + Bra + Assert not + a + Ket + [bc] + Ket + End +------------------------------------------------------------------ + +# Negative lookbehind can be pulled out +# ⇒ (?aa)a[bc] +/(?>aa)ab|(?>aa)ac/B +------------------------------------------------------------------ + Bra + Once + aa + Ket + a + [bc] + Ket + End +------------------------------------------------------------------ + +# Sometimes items are pulled out FROM an atomic group... +# ⇒ (?>a[bc]) +/(?>ab|ac)/B +------------------------------------------------------------------ + Bra + Once + a + [bc] + Ket + Ket + End +------------------------------------------------------------------ + +# Character classes can be pulled out +# ⇒ [a-z][bc] +/[a-z]b|[a-z]c/B +------------------------------------------------------------------ + Bra + [a-z] + [bc] + Ket + End +------------------------------------------------------------------ + +# Semantically identical char classes, written in a different way +# NOTE: If char classes were canonicalized before pattern rewriting, +# we could pull out a common prefix here +# 🤷 Too bad +/[abc]b|[a-c]c/B +------------------------------------------------------------------ + Bra + [a-c] + b + Alt + [a-c] + c + Ket + End +------------------------------------------------------------------ + +# Character types +# ⇒ \d\s\v\w[ab] +/\d\s\v\wa|\d\s\v\wb/B +------------------------------------------------------------------ + Bra + \d + \s + \v + \w + [ab] + Ket + End +------------------------------------------------------------------ + +# Start-of-string anchor +/^a|^b/B +------------------------------------------------------------------ + Bra + ^ + [ab] + Ket + End +------------------------------------------------------------------ + +# Rewriting still works with options at beginning of 
regex +/(*NOTEMPTY)(*NO_AUTO_POSSESS)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*NO_START_OPT)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*NO_DOTSTAR_ANCHOR)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*LIMIT_HEAP=10000)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*LIMIT_MATCH=10)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*LIMIT_DEPTH=10)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +/(*CR)ab|ac/B +------------------------------------------------------------------ + Bra + a + [bc] + Ket + End +------------------------------------------------------------------ + +# Dot +/.a|.b/B +------------------------------------------------------------------ + Bra + Any + [ab] + Ket + End +------------------------------------------------------------------ + +# Extended regex (with embedded whitespace) +# The whitespace doesn't interfere with rewriting +/(?x) a b | a a b/B +------------------------------------------------------------------ + Bra + a + Bra + b + Alt + ab + Ket + Ket + End +------------------------------------------------------------------ + +# \Q..\E literal sequences +/\Q$\Ea|\Q$\Eb/B +------------------------------------------------------------------ + Bra + $ + [ab] + Ket + End 
+------------------------------------------------------------------ + +# Hex escapes +# 'A' and '\x41' are recognized as equivalent and rewritten +/Aa|\x41b/B +------------------------------------------------------------------ + Bra + A + [ab] + Ket + End +------------------------------------------------------------------ + +# Backreferences can be pulled out +/(\w)(?:\1a|\1b)/B +------------------------------------------------------------------ + Bra + CBra 1 + \w + Ket + \1 + [ab] + Ket + End +------------------------------------------------------------------ + +# Relative backreferences can be pulled out +/(\w)(?:\g{-1}a|\g{-1}b)/B +------------------------------------------------------------------ + Bra + CBra 1 + \w + Ket + \1 + [ab] + Ket + End +------------------------------------------------------------------ + +# Match reset escape can be pulled out +/a\Kb|a\Kc/B +------------------------------------------------------------------ + Bra + a + \K + [bc] + Ket + End +------------------------------------------------------------------ + +# Group with duplicate capture numbers +# ⇒ (a)[bc] +/(?|(a)b|(a)c)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + [bc] + Ket + End +------------------------------------------------------------------ + ab + 0: ab + 1: a + ac + 0: ac + 1: a + +# Although capture numbers are the same, capturing groups can't be pulled out +# if they contain anything which the regex engine can backtrack into +/(?|(a*)b|(a*)c)/B +------------------------------------------------------------------ + Bra + CBra 1 + a*+ + Ket + b + Alt + CBra 1 + a*+ + Ket + c + Ket + End +------------------------------------------------------------------ + +/(?|(\d|\s)b|(\d|\s)c)/B +------------------------------------------------------------------ + Bra + CBra 1 + \d + Alt + \s + Ket + b + Alt + CBra 1 + \d + Alt + \s + Ket + c + Ket + End +------------------------------------------------------------------ + +# 
Non-capturing group with option letters +/(?i:ab|ac)/B +------------------------------------------------------------------ + Bra + /i a + [BCbc] + Ket + End +------------------------------------------------------------------ + +# Named capture groups +# This can't be rewritten, because although the capture group +# names are the same, their numbers are different +/(?J)(?:(?a)b|(?a)c)\k/B +------------------------------------------------------------------ + Bra + Bra + CBra 1 + a + Ket + b + Alt + CBra 2 + a + Ket + c + Ket + \k2 + Ket + End +------------------------------------------------------------------ + +# Named capture groups with identical capture group numbers +# These can be rewritten +/(?|(?a)b|(?a)c)\k/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + [bc] + \1 + Ket + End +------------------------------------------------------------------ + +# Capture group condition +# Can't be rewritten +/(a)(?(1)bc|bd)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Cond + 1 Capture ref + bc + Alt + bd + Ket + Ket + End +------------------------------------------------------------------ + +# Capture group condition (by group name) +# Can't be rewritten +/(?a)(?()bc|bd)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Cond + 1 Capture ref + bc + Alt + bd + Ket + Ket + End +------------------------------------------------------------------ + +# Version number condition +# Can't be rewritten +/(?(VERSION>=10.4)ab|ac)/B +------------------------------------------------------------------ + Bra + Cond + Cond true + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Lookahead assertion condition +# Can't be rewritten +/(?(?=a)ab|ac)/B +------------------------------------------------------------------ + Bra + Cond + Assert + a + Ket + ab + Alt + ac + Ket + Ket + End 
+------------------------------------------------------------------ + +# However, subgroups of a conditional group can be rewritten +/(?(?=a)a(?:bb|bc)|ac)/B +------------------------------------------------------------------ + Bra + Cond + Assert + a + Ket + ab + [bc] + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Lookbehind assertion condition +# Can't be rewritten +/(?(?<=a)ab|ac)/B +------------------------------------------------------------------ + Bra + Cond + Assert back + Reverse + a + Ket + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Recursion condition +# Can't be rewritten +/(?(R)ab|ac)/B +------------------------------------------------------------------ + Bra + Cond + Cond recurse any + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Recursion condition with explicit number +# Can't be rewritten +/(a)(?(R1)ab|ac)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Cond + Cond recurse 1 + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Recursion condition by name +# Can't be rewritten +/(?a)(?(R&n)ab|ac)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Cond + Cond recurse 1 + ab + Alt + ac + Ket + Ket + End +------------------------------------------------------------------ + +# Define +/(?(DEFINE) a)b|(?(DEFINE) a)\w/B +------------------------------------------------------------------ + Bra + Cond + Cond false + a + Ket + b + Alt + Cond + Cond false + a + Ket + \w + Ket + End +------------------------------------------------------------------ + +# Subroutine call by number +# ⇒ (?|(a)|(b))(?1)[bc] +/(?|(a)|(b))(?:(?1)b|(?1)c)/B +------------------------------------------------------------------ + Bra + Bra + CBra 1 + a + Ket + Alt + 
CBra 1 + b + Ket + Ket + Recurse + [bc] + Ket + End +------------------------------------------------------------------ + +# Subroutine call by number, but with non-matching number +/(?:(a)|(b))(?:(?1)b|(?2)c)/B +------------------------------------------------------------------ + Bra + Bra + CBra 1 + a + Ket + Alt + CBra 2 + b + Ket + Ket + Bra + Recurse + b + Alt + Recurse + c + Ket + Ket + End +------------------------------------------------------------------ + +# Subroutine call by name +# ⇒ (?a)(?&n)[ab] +/(?a)(?:(?&n)a|(?&n)b)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + Recurse + [ab] + Ket + End +------------------------------------------------------------------ + +# Subroutine call by name, but with non-matching name +/(?a)(?b)(?:(?&n)a|(?&m)b)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + CBra 2 + b + Ket + Bra + Recurse + a + Alt + Recurse + b + Ket + Ket + End +------------------------------------------------------------------ + +/(?a)(?b)(?:(?&abce)a|(?&abcd)b)/B +------------------------------------------------------------------ + Bra + CBra 1 + a + Ket + CBra 2 + b + Ket + Bra + Recurse + a + Alt + Recurse + b + Ket + Ket + End +------------------------------------------------------------------ + +# Callouts are never pulled out +/(?C0)a|(?C0)b/B +------------------------------------------------------------------ + Bra + Callout 0 5 1 + a + Alt + Callout 0 12 1 + b + Ket + End +------------------------------------------------------------------ + +/(?C{ab})a|(?C{ab})b/B +------------------------------------------------------------------ + Bra + CalloutStr {ab} 4 8 1 + a + Alt + CalloutStr {ab} 14 18 1 + b + Ket + End +------------------------------------------------------------------ + +# Callouts are still not pulled out if they are inside a sub-group +/(?:(?C0))a|(?:(?C0))b/B +------------------------------------------------------------------ + 
Bra + Bra + Callout 0 8 1 + Ket + a + Alt + Bra + Callout 0 19 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +/(?:(?C{ab}))a|(?:(?C{ab}))b/B +------------------------------------------------------------------ + Bra + Bra + CalloutStr {ab} 7 11 1 + Ket + a + Alt + Bra + CalloutStr {ab} 21 25 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +/(?>(?C0))a|(?>(?C0))b/B +------------------------------------------------------------------ + Bra + Once + Callout 0 8 1 + Ket + a + Alt + Once + Callout 0 19 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +/(?=(?C0))a|(?=(?C0))b/B +------------------------------------------------------------------ + Bra + Assert + Callout 0 8 1 + Ket + a + Alt + Assert + Callout 0 19 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +/(?<=(?C0))a|(?<=(?C0))b/B +------------------------------------------------------------------ + Bra + Assert back + Callout 0 9 1 + Ket + a + Alt + Assert back + Callout 0 21 1 + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*ACCEPT) can be pulled out +/(*ACCEPT)ab|(*ACCEPT)ac/B +------------------------------------------------------------------ + Bra + *ACCEPT + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*ACCEPT:name) can be pulled out +/(*ACCEPT:hello)ab|(*ACCEPT:hello)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + *ACCEPT + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*ACCEPT:name) but with non-matching names +/(*ACCEPT:hello)ab|(*ACCEPT:goodbye)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + *ACCEPT + ab + Alt + *MARK goodbye + *ACCEPT + ac + Ket + End 
+------------------------------------------------------------------ + +/(*ACCEPT:a)ab|(*ACCEPT:b)ac/B +------------------------------------------------------------------ + Bra + *MARK a + *ACCEPT + ab + Alt + *MARK b + *ACCEPT + ac + Ket + End +------------------------------------------------------------------ + +# (*FAIL) can be pulled out +/(*FAIL)ab|(*FAIL)ac/B +------------------------------------------------------------------ + Bra + *FAIL + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*FAIL:name) can be pulled out +/(*FAIL:hello)ab|(*FAIL:hello)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + *FAIL + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*FAIL:name) but with non-matching names +/(*FAIL:hello)ab|(*FAIL:goodbye)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + *FAIL + ab + Alt + *MARK goodbye + *FAIL + ac + Ket + End +------------------------------------------------------------------ + +# (*MARK:name) can be pulled out +/(*MARK:hello)ab|(*MARK:hello)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + a + [bc] + Ket + End +------------------------------------------------------------------ + +# (*MARK:name) but with non-matching names +/(*MARK:hello)ab|(*MARK:goodbye)ac/B +------------------------------------------------------------------ + Bra + *MARK hello + ab + Alt + *MARK goodbye + ac + Ket + End +------------------------------------------------------------------ + +# (*PRUNE) is never pulled out +/(*PRUNE)a|(*PRUNE)b/B +------------------------------------------------------------------ + Bra + *PRUNE + a + Alt + *PRUNE + b + Ket + End +------------------------------------------------------------------ + +/(?:(*PRUNE))a|(?:(*PRUNE))b/B +------------------------------------------------------------------ + Bra + Bra + 
*PRUNE + Ket + a + Alt + Bra + *PRUNE + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*PRUNE:name) is never pulled out +/(*PRUNE:abc)a|(*PRUNE:abc)b/B +------------------------------------------------------------------ + Bra + *PRUNE abc + a + Alt + *PRUNE abc + b + Ket + End +------------------------------------------------------------------ + +/(?:(*PRUNE:abc))a|(?:(*PRUNE:abc))b/B +------------------------------------------------------------------ + Bra + Bra + *PRUNE abc + Ket + a + Alt + Bra + *PRUNE abc + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*COMMIT) is never pulled out +/(*COMMIT)a|(*COMMIT)b/B +------------------------------------------------------------------ + Bra + *COMMIT + a + Alt + *COMMIT + b + Ket + End +------------------------------------------------------------------ + +/(?:(*COMMIT))a|(?:(*COMMIT))b/B +------------------------------------------------------------------ + Bra + Bra + *COMMIT + Ket + a + Alt + Bra + *COMMIT + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*COMMIT:name) is never pulled out +/(*COMMIT:abc)a|(*COMMIT:abc)b/B +------------------------------------------------------------------ + Bra + *COMMIT abc + a + Alt + *COMMIT abc + b + Ket + End +------------------------------------------------------------------ + +/(?:(*COMMIT:abc))a|(?:(*COMMIT:abc))b/B +------------------------------------------------------------------ + Bra + Bra + *COMMIT abc + Ket + a + Alt + Bra + *COMMIT abc + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*SKIP) is never pulled out +/(*SKIP)a|(*SKIP)b/B +------------------------------------------------------------------ + Bra + *SKIP + a + Alt + *SKIP + b + Ket + End +------------------------------------------------------------------ + +/(?:(*SKIP))a|(?:(*SKIP))b/B 
+------------------------------------------------------------------ + Bra + Bra + *SKIP + Ket + a + Alt + Bra + *SKIP + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*SKIP:name) is never pulled out +/(*SKIP:abc)a|(*SKIP:abc)b/B +------------------------------------------------------------------ + Bra + *SKIP abc + a + Alt + *SKIP abc + b + Ket + End +------------------------------------------------------------------ + +/(?:(*SKIP:abc))a|(?:(*SKIP:abc))b/B +------------------------------------------------------------------ + Bra + Bra + *SKIP abc + Ket + a + Alt + Bra + *SKIP abc + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*THEN) is never pulled out +/(*THEN)a|(*THEN)b/B +------------------------------------------------------------------ + Bra + *THEN + a + Alt + *THEN + b + Ket + End +------------------------------------------------------------------ + +/(?:(*THEN))a|(?:(*THEN))b/B +------------------------------------------------------------------ + Bra + Bra + *THEN + Ket + a + Alt + Bra + *THEN + Ket + b + Ket + End +------------------------------------------------------------------ + +# (*THEN:name) is never pulled out +/(*THEN:abc)a|(*THEN:abc)b/B +------------------------------------------------------------------ + Bra + *THEN abc + a + Alt + *THEN abc + b + Ket + End +------------------------------------------------------------------ + +/(?:(*THEN:abc))a|(?:(*THEN:abc))b/B +------------------------------------------------------------------ + Bra + Bra + *THEN abc + Ket + a + Alt + Bra + *THEN abc + Ket + b + Ket + End +------------------------------------------------------------------ + +# Common prefix which itself has a common prefix +# ⇒ (?>a[bc])[de] +/(?:(?>ab|ac)d|(?>ab|ac)e)/B +------------------------------------------------------------------ + Bra + Once + a + [bc] + Ket + [de] + Ket + End 
+------------------------------------------------------------------ + +# Rewriting common prefix causes parent group to have a common prefix +# (We don't detect this case) +# ⇒ (?:a[bc]d|a[bc]e) +/(?:a(?:b|c)d|(?:ab|ac)e)/B +------------------------------------------------------------------ + Bra + a + [bc] + d + Alt + a + [bc] + e + Ket + End +------------------------------------------------------------------ + +# Another case: +# ⇒ (?>a[bc]d|a[bc]e) +/(?>(?:ab|ac)d|(?:ab|ac)e)/B +------------------------------------------------------------------ + Bra + Once + a + [bc] + d + Alt + a + [bc] + e + Ket + Ket + End +------------------------------------------------------------------ + +# When rewriting groups which are themselves quantified, +# the extracted items have to stay inside the quantified group +# ⇒ (?:a[bc])? +/(?:ab|ac)?/B +------------------------------------------------------------------ + Bra + Brazero + Bra + a + [bc] + Ket + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)*/B +------------------------------------------------------------------ + Bra + Brazero + Bra + a + [bc] + KetRmax + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)+/B +------------------------------------------------------------------ + Bra + Bra + a + [bc] + KetRmax + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)?+/B +------------------------------------------------------------------ + Bra + Once + Brazero + Bra + a + [bc] + Ket + Ket + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)*+/B +------------------------------------------------------------------ + Bra + Braposzero + BraPos + a + [bc] + KetRpos + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac)++/B +------------------------------------------------------------------ + Bra + BraPos + a + [bc] + KetRpos + Ket + 
End +------------------------------------------------------------------ + +/(?:ab|ac){2}/B +------------------------------------------------------------------ + Bra + Bra + a + [bc] + Ket + Bra + a + [bc] + Ket + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac){2,4}/B +------------------------------------------------------------------ + Bra + Bra + a + [bc] + Ket + Bra + a + [bc] + Ket + Brazero + Bra + Bra + a + [bc] + Ket + Brazero + Bra + a + [bc] + Ket + Ket + Ket + End +------------------------------------------------------------------ + +/(?:ab|ac){2,4}+/B +------------------------------------------------------------------ + Bra + Once + Bra + a + [bc] + Ket + Bra + a + [bc] + Ket + Brazero + Bra + Bra + a + [bc] + Ket + Brazero + Bra + a + [bc] + Ket + Ket + Ket + Ket + End +------------------------------------------------------------------ + +# Regression test: +# Pattern rewriter must properly handle assert conditions which contain alternation +# This should not be rewritten: +/b(?(?!)|b)/B +------------------------------------------------------------------ + Bra + b + Cond + *FAIL + Alt + b + Ket + Ket + End +------------------------------------------------------------------ + +# Regression test: +# For a conditional group which uses an assertion condition, that assertion condition +# cannot be pulled out +# This should not be rewritten (and the subject string should not match): +/(?(?b|c)d(?Pe)/ -Memory allocation - code size : 54 +Memory allocation - code size : 76 Memory allocation - data size : 52 ------------------------------------------------------------------ - 0 24 Bra + 0 35 Bra 2 a - 4 5 CBra 1 - 7 b - 9 4 Alt - 11 c - 13 9 Ket - 15 d - 17 5 CBra 2 - 20 e - 22 5 Ket - 24 24 Ket - 26 End + 4 20 CBra 1 + 7 [bc] + 24 20 Ket + 26 d + 28 5 CBra 2 + 31 e + 33 5 Ket + 35 35 Ket + 37 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory 
allocation - code size : 14 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 60 Bra + 0 82 Bra 2 abc - 8 5 CBra 1 - 11 d - 13 4 Alt - 15 e - 17 9 Ket - 19 *THEN - 20 x - 22 12 CBra 2 - 25 123 - 31 *THEN - 32 4 - 34 24 Alt - 36 567 - 42 5 CBra 3 - 45 b - 47 4 Alt - 49 q - 51 9 Ket - 53 *THEN - 54 xx - 58 36 Ket - 60 60 Ket - 62 End + 8 20 CBra 1 + 11 [de] + 28 20 Ket + 30 *THEN + 31 x + 33 12 CBra 2 + 36 123 + 42 *THEN + 43 4 + 45 35 Alt + 47 567 + 53 20 CBra 3 + 56 [bq] + 73 20 Ket + 75 *THEN + 76 xx + 80 47 Ket + 82 82 Ket + 84 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-16-3 b/testdata/testoutput8-16-3 index d46ceba47..caeeca3f4 100644 --- a/testdata/testoutput8-16-3 +++ b/testdata/testoutput8-16-3 @@ -36,18 +36,16 @@ Memory allocation - code size : 48 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 46 +Memory allocation - code size : 34 ------------------------------------------------------------------ - 0 19 Bra - 3 7 Bra - 6 AllAny* - 8 X - 10 6 Alt - 13 ^ - 14 B - 16 13 Ket - 19 19 Ket - 22 End + 0 7 Bra + 3 AllAny* + 5 X + 7 6 Alt + 10 ^ + 11 B + 13 13 Ket + 16 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 54 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 68 +Memory allocation - code size : 88 Memory allocation - data size : 52 ------------------------------------------------------------------ - 0 30 Bra + 0 40 Bra 3 a - 5 6 CBra 1 - 9 b - 11 5 Alt - 14 c - 16 11 Ket - 19 d - 21 6 CBra 2 - 25 e - 27 6 Ket - 30 30 Ket - 33 End + 5 21 CBra 1 + 9 [bc] + 26 21 Ket + 29 d + 31 6 CBra 2 + 35 e + 37 6 Ket + 40 40 Ket + 43 End 
------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 18 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 70 Bra + 0 90 Bra 3 abc - 9 6 CBra 1 - 13 d - 15 5 Alt - 18 e - 20 11 Ket - 23 *THEN - 24 x - 26 13 CBra 2 - 30 123 - 36 *THEN - 37 4 - 39 28 Alt - 42 567 - 48 6 CBra 3 - 52 b - 54 5 Alt - 57 q - 59 11 Ket - 62 *THEN - 63 xx - 67 41 Ket - 70 70 Ket - 73 End + 9 21 CBra 1 + 13 [de] + 30 21 Ket + 33 *THEN + 34 x + 36 13 CBra 2 + 40 123 + 46 *THEN + 47 4 + 49 38 Alt + 52 567 + 58 21 CBra 3 + 62 [bq] + 79 21 Ket + 82 *THEN + 83 xx + 87 51 Ket + 90 90 Ket + 93 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-16-4 b/testdata/testoutput8-16-4 index d46ceba47..caeeca3f4 100644 --- a/testdata/testoutput8-16-4 +++ b/testdata/testoutput8-16-4 @@ -36,18 +36,16 @@ Memory allocation - code size : 48 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 46 +Memory allocation - code size : 34 ------------------------------------------------------------------ - 0 19 Bra - 3 7 Bra - 6 AllAny* - 8 X - 10 6 Alt - 13 ^ - 14 B - 16 13 Ket - 19 19 Ket - 22 End + 0 7 Bra + 3 AllAny* + 5 X + 7 6 Alt + 10 ^ + 11 B + 13 13 Ket + 16 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 54 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 68 +Memory allocation - code size : 88 Memory allocation - data size : 52 ------------------------------------------------------------------ - 0 30 Bra + 0 40 Bra 3 a - 5 6 CBra 1 - 9 b - 11 5 Alt - 14 c - 16 11 Ket - 19 d - 21 6 CBra 2 - 25 e - 27 6 Ket - 30 30 Ket - 33 End + 5 21 CBra 1 + 9 [bc] + 26 21 
Ket + 29 d + 31 6 CBra 2 + 35 e + 37 6 Ket + 40 40 Ket + 43 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 18 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 70 Bra + 0 90 Bra 3 abc - 9 6 CBra 1 - 13 d - 15 5 Alt - 18 e - 20 11 Ket - 23 *THEN - 24 x - 26 13 CBra 2 - 30 123 - 36 *THEN - 37 4 - 39 28 Alt - 42 567 - 48 6 CBra 3 - 52 b - 54 5 Alt - 57 q - 59 11 Ket - 62 *THEN - 63 xx - 67 41 Ket - 70 70 Ket - 73 End + 9 21 CBra 1 + 13 [de] + 30 21 Ket + 33 *THEN + 34 x + 36 13 CBra 2 + 40 123 + 46 *THEN + 47 4 + 49 38 Alt + 52 567 + 58 21 CBra 3 + 62 [bq] + 79 21 Ket + 82 *THEN + 83 xx + 87 51 Ket + 90 90 Ket + 93 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-32-2 b/testdata/testoutput8-32-2 index e9865c85a..efa53a63c 100644 --- a/testdata/testoutput8-32-2 +++ b/testdata/testoutput8-32-2 @@ -36,18 +36,16 @@ Memory allocation - code size : 76 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 72 +Memory allocation - code size : 56 ------------------------------------------------------------------ - 0 15 Bra - 2 6 Bra - 4 AllAny* - 6 X - 8 5 Alt - 10 ^ - 11 B - 13 11 Ket - 15 15 Ket - 17 End + 0 6 Bra + 2 AllAny* + 4 X + 6 5 Alt + 8 ^ + 9 B + 11 11 Ket + 13 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 80 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 108 +Memory allocation - code size : 120 Memory allocation - data size : 104 ------------------------------------------------------------------ - 0 24 Bra + 0 27 Bra 2 a - 4 5 CBra 1 - 7 b - 9 4 Alt - 11 c - 13 9 Ket - 15 d - 17 5 CBra 2 - 20 e - 
22 5 Ket - 24 24 Ket - 26 End + 4 12 CBra 1 + 7 [bc] + 16 12 Ket + 18 d + 20 5 CBra 2 + 23 e + 25 5 Ket + 27 27 Ket + 29 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 28 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 60 Bra + 0 66 Bra 2 abc - 8 5 CBra 1 - 11 d - 13 4 Alt - 15 e - 17 9 Ket - 19 *THEN - 20 x - 22 12 CBra 2 - 25 123 - 31 *THEN - 32 4 - 34 24 Alt - 36 567 - 42 5 CBra 3 - 45 b - 47 4 Alt - 49 q - 51 9 Ket - 53 *THEN - 54 xx - 58 36 Ket - 60 60 Ket - 62 End + 8 12 CBra 1 + 11 [de] + 20 12 Ket + 22 *THEN + 23 x + 25 12 CBra 2 + 28 123 + 34 *THEN + 35 4 + 37 27 Alt + 39 567 + 45 12 CBra 3 + 48 [bq] + 57 12 Ket + 59 *THEN + 60 xx + 64 39 Ket + 66 66 Ket + 68 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-32-3 b/testdata/testoutput8-32-3 index e9865c85a..efa53a63c 100644 --- a/testdata/testoutput8-32-3 +++ b/testdata/testoutput8-32-3 @@ -36,18 +36,16 @@ Memory allocation - code size : 76 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 72 +Memory allocation - code size : 56 ------------------------------------------------------------------ - 0 15 Bra - 2 6 Bra - 4 AllAny* - 6 X - 8 5 Alt - 10 ^ - 11 B - 13 11 Ket - 15 15 Ket - 17 End + 0 6 Bra + 2 AllAny* + 4 X + 6 5 Alt + 8 ^ + 9 B + 11 11 Ket + 13 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 80 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 108 +Memory allocation - code size : 120 Memory allocation - data size : 104 ------------------------------------------------------------------ - 0 24 Bra + 0 27 Bra 2 a - 4 5 CBra 1 - 7 b 
- 9 4 Alt - 11 c - 13 9 Ket - 15 d - 17 5 CBra 2 - 20 e - 22 5 Ket - 24 24 Ket - 26 End + 4 12 CBra 1 + 7 [bc] + 16 12 Ket + 18 d + 20 5 CBra 2 + 23 e + 25 5 Ket + 27 27 Ket + 29 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 28 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 60 Bra + 0 66 Bra 2 abc - 8 5 CBra 1 - 11 d - 13 4 Alt - 15 e - 17 9 Ket - 19 *THEN - 20 x - 22 12 CBra 2 - 25 123 - 31 *THEN - 32 4 - 34 24 Alt - 36 567 - 42 5 CBra 3 - 45 b - 47 4 Alt - 49 q - 51 9 Ket - 53 *THEN - 54 xx - 58 36 Ket - 60 60 Ket - 62 End + 8 12 CBra 1 + 11 [de] + 20 12 Ket + 22 *THEN + 23 x + 25 12 CBra 2 + 28 123 + 34 *THEN + 35 4 + 37 27 Alt + 39 567 + 45 12 CBra 3 + 48 [bq] + 57 12 Ket + 59 *THEN + 60 xx + 64 39 Ket + 66 66 Ket + 68 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-32-4 b/testdata/testoutput8-32-4 index e9865c85a..efa53a63c 100644 --- a/testdata/testoutput8-32-4 +++ b/testdata/testoutput8-32-4 @@ -36,18 +36,16 @@ Memory allocation - code size : 76 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 72 +Memory allocation - code size : 56 ------------------------------------------------------------------ - 0 15 Bra - 2 6 Bra - 4 AllAny* - 6 X - 8 5 Alt - 10 ^ - 11 B - 13 11 Ket - 15 15 Ket - 17 End + 0 6 Bra + 2 AllAny* + 4 X + 6 5 Alt + 8 ^ + 9 B + 11 11 Ket + 13 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 80 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 108 +Memory allocation - code size : 120 Memory allocation - data size : 104 
------------------------------------------------------------------ - 0 24 Bra + 0 27 Bra 2 a - 4 5 CBra 1 - 7 b - 9 4 Alt - 11 c - 13 9 Ket - 15 d - 17 5 CBra 2 - 20 e - 22 5 Ket - 24 24 Ket - 26 End + 4 12 CBra 1 + 7 [bc] + 16 12 Ket + 18 d + 20 5 CBra 2 + 23 e + 25 5 Ket + 27 27 Ket + 29 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 28 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 60 Bra + 0 66 Bra 2 abc - 8 5 CBra 1 - 11 d - 13 4 Alt - 15 e - 17 9 Ket - 19 *THEN - 20 x - 22 12 CBra 2 - 25 123 - 31 *THEN - 32 4 - 34 24 Alt - 36 567 - 42 5 CBra 3 - 45 b - 47 4 Alt - 49 q - 51 9 Ket - 53 *THEN - 54 xx - 58 36 Ket - 60 60 Ket - 62 End + 8 12 CBra 1 + 11 [de] + 20 12 Ket + 22 *THEN + 23 x + 25 12 CBra 2 + 28 123 + 34 *THEN + 35 4 + 37 27 Alt + 39 567 + 45 12 CBra 3 + 48 [bq] + 57 12 Ket + 59 *THEN + 60 xx + 64 39 Ket + 66 66 Ket + 68 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-8-2 b/testdata/testoutput8-8-2 index 0ebc4d015..292bf8744 100644 --- a/testdata/testoutput8-8-2 +++ b/testdata/testoutput8-8-2 @@ -36,18 +36,16 @@ Memory allocation - code size : 25 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 23 +Memory allocation - code size : 17 ------------------------------------------------------------------ - 0 19 Bra - 3 7 Bra - 6 AllAny* - 8 X - 10 6 Alt - 13 ^ - 14 B - 16 13 Ket - 19 19 Ket - 22 End + 0 7 Bra + 3 AllAny* + 5 X + 7 6 Alt + 10 ^ + 11 B + 13 13 Ket + 16 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 28 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 36 +Memory 
allocation - code size : 62 Memory allocation - data size : 28 ------------------------------------------------------------------ - 0 32 Bra + 0 58 Bra 3 a - 5 7 CBra 1 - 10 b - 12 5 Alt - 15 c - 17 12 Ket - 20 d - 22 7 CBra 2 - 27 e - 29 7 Ket - 32 32 Ket - 35 End + 5 38 CBra 1 + 10 [bc] + 43 38 Ket + 46 d + 48 7 CBra 2 + 53 e + 55 7 Ket + 58 58 Ket + 61 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 10 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 73 Bra + 0 125 Bra 3 abc - 9 7 CBra 1 - 14 d - 16 5 Alt - 19 e - 21 12 Ket - 24 *THEN - 25 x - 27 14 CBra 2 - 32 123 - 38 *THEN - 39 4 - 41 29 Alt - 44 567 - 50 7 CBra 3 - 55 b - 57 5 Alt - 60 q - 62 12 Ket - 65 *THEN - 66 xx - 70 43 Ket - 73 73 Ket - 76 End + 9 38 CBra 1 + 14 [de] + 47 38 Ket + 50 *THEN + 51 x + 53 14 CBra 2 + 58 123 + 64 *THEN + 65 4 + 67 55 Alt + 70 567 + 76 38 CBra 3 + 81 [bq] +114 38 Ket +117 *THEN +118 xx +122 69 Ket +125 125 Ket +128 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-8-3 b/testdata/testoutput8-8-3 index 04b3a3e6b..2136ae479 100644 --- a/testdata/testoutput8-8-3 +++ b/testdata/testoutput8-8-3 @@ -36,18 +36,16 @@ Memory allocation - code size : 30 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 28 +Memory allocation - code size : 20 ------------------------------------------------------------------ - 0 23 Bra - 4 8 Bra - 8 AllAny* - 10 X - 12 7 Alt - 16 ^ - 17 B - 19 15 Ket - 23 23 Ket - 27 End + 0 8 Bra + 4 AllAny* + 6 X + 8 7 Alt + 12 ^ + 13 B + 15 15 Ket + 19 End ------------------------------------------------------------------ /^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 35 
------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 43 +Memory allocation - code size : 68 Memory allocation - data size : 28 ------------------------------------------------------------------ - 0 38 Bra + 0 63 Bra 4 a - 6 8 CBra 1 - 12 b - 14 6 Alt - 18 c - 20 14 Ket - 24 d - 26 8 CBra 2 - 32 e - 34 8 Ket - 38 38 Ket - 42 End + 6 39 CBra 1 + 12 [bc] + 45 39 Ket + 49 d + 51 8 CBra 2 + 57 e + 59 8 Ket + 63 63 Ket + 67 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 12 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 83 Bra + 0 133 Bra 4 abc - 10 8 CBra 1 - 16 d - 18 6 Alt - 22 e - 24 14 Ket - 28 *THEN - 29 x - 31 15 CBra 2 - 37 123 - 43 *THEN - 44 4 - 46 33 Alt - 50 567 - 56 8 CBra 3 - 62 b - 64 6 Alt - 68 q - 70 14 Ket - 74 *THEN - 75 xx - 79 48 Ket - 83 83 Ket - 87 End + 10 39 CBra 1 + 16 [de] + 49 39 Ket + 53 *THEN + 54 x + 56 15 CBra 2 + 62 123 + 68 *THEN + 69 4 + 71 58 Alt + 75 567 + 81 39 CBra 3 + 87 [bq] +120 39 Ket +124 *THEN +125 xx +129 73 Ket +133 133 Ket +137 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/ diff --git a/testdata/testoutput8-8-4 b/testdata/testoutput8-8-4 index 42119f783..56a964f9f 100644 --- a/testdata/testoutput8-8-4 +++ b/testdata/testoutput8-8-4 @@ -36,18 +36,16 @@ Memory allocation - code size : 35 ------------------------------------------------------------------ /(?s:.*X|^B)/ -Memory allocation - code size : 33 +Memory allocation - code size : 23 ------------------------------------------------------------------ - 0 27 Bra - 5 9 Bra - 10 AllAny* - 12 X - 14 8 Alt - 19 ^ - 20 B - 22 17 Ket - 27 27 Ket - 32 End + 0 9 Bra + 5 AllAny* + 7 X + 9 8 Alt + 14 ^ + 15 B + 17 17 Ket + 22 End ------------------------------------------------------------------ 
/^[[:alnum:]]/ @@ -197,22 +195,20 @@ Memory allocation - code size : 42 ------------------------------------------------------------------ /a(?Pb|c)d(?Pe)/ -Memory allocation - code size : 50 +Memory allocation - code size : 74 Memory allocation - data size : 28 ------------------------------------------------------------------ - 0 44 Bra + 0 68 Bra 5 a - 7 9 CBra 1 - 14 b - 16 7 Alt - 21 c - 23 16 Ket - 28 d - 30 9 CBra 2 - 37 e - 39 9 Ket - 44 44 Ket - 49 End + 7 40 CBra 1 + 14 [bc] + 47 40 Ket + 52 d + 54 9 CBra 2 + 61 e + 63 9 Ket + 68 68 Ket + 73 End ------------------------------------------------------------------ /(?:a(?Pc(?Pd)))(?Pa)/ @@ -694,31 +690,27 @@ Memory allocation - code size : 14 /abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/ ------------------------------------------------------------------ - 0 93 Bra + 0 141 Bra 5 abc - 11 9 CBra 1 - 18 d - 20 7 Alt - 25 e - 27 16 Ket - 32 *THEN - 33 x - 35 16 CBra 2 - 42 123 - 48 *THEN - 49 4 - 51 37 Alt - 56 567 - 62 9 CBra 3 - 69 b - 71 7 Alt - 76 q - 78 16 Ket - 83 *THEN - 84 xx - 88 53 Ket - 93 93 Ket - 98 End + 11 40 CBra 1 + 18 [de] + 51 40 Ket + 56 *THEN + 57 x + 59 16 CBra 2 + 66 123 + 72 *THEN + 73 4 + 75 61 Alt + 80 567 + 86 40 CBra 3 + 93 [bq] +126 40 Ket +131 *THEN +132 xx +136 77 Ket +141 141 Ket +146 End ------------------------------------------------------------------ /(((a\2)|(a*)\g<-1>))*a?/