Skip to content

Commit

Permalink
Fix OP_REFI for caseless_restrict (#516)
Browse files Browse the repository at this point in the history
  • Loading branch information
NWilson authored Oct 8, 2024
1 parent 440f5d1 commit 2239414
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 24 deletions.
9 changes: 7 additions & 2 deletions HACKING
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,10 @@ Changeable options
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
some others may be changed in the middle of patterns by items such as (?i).
Their processing is handled entirely at compile time by generating different
opcodes for the different settings. The runtime functions do not need to keep
track of an option's state.
opcodes for the different settings. Where an opcode's behaviour depends on a
further option, that option's state is copied into the opcode's data; for
example, OP_REFI depends on the (?r) (PCRE2_EXTRA_CASELESS_RESTRICT) option.
As a result, the runtime functions do not need to keep track of an option's
state.

PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
are tracked and processed during the parsing pre-pass. The others are handled
Expand Down Expand Up @@ -639,6 +641,9 @@ generates OP_DNREF or OP_DNREFI. These are followed by two counts: the index
required name, followed by the number of groups with the same name. The
matching code can then search for the first one that is set.

OP_REFI and OP_DNREFI are additionally followed by one code unit that holds
the case-insensitivity flags (the REFI_FLAG_xxx bits, such as
REFI_FLAG_CASELESS_RESTRICT).


Repeating character classes and back references
-----------------------------------------------
Expand Down
6 changes: 6 additions & 0 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7188,6 +7188,9 @@ for (;; pptr++)
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
PUT2INC(code, 0, index);
PUT2INC(code, 0, count);
if ((options & PCRE2_CASELESS) != 0)
*code++ = ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
REFI_FLAG_CASELESS_RESTRICT : 0;
}
break;

Expand Down Expand Up @@ -8142,6 +8145,9 @@ for (;; pptr++)
if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
PUT2INC(code, 0, meta_arg);
if ((options & PCRE2_CASELESS) != 0)
*code++ = ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
REFI_FLAG_CASELESS_RESTRICT : 0;

/* Update the map of back references, and keep the highest one. We
could do this in parse_regex() for numerical back references, but not
Expand Down
8 changes: 6 additions & 2 deletions src/pcre2_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1780,9 +1780,9 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1+(32/sizeof(PCRE2_UCHAR)), /* NCLASS */ \
0, /* XCLASS - variable length */ \
1+IMM2_SIZE, /* REF */ \
1+IMM2_SIZE, /* REFI */ \
1+IMM2_SIZE+1, /* REFI */ \
1+2*IMM2_SIZE, /* DNREF */ \
1+2*IMM2_SIZE, /* DNREFI */ \
1+2*IMM2_SIZE+1, /* DNREFI */ \
1+LINK_SIZE, /* RECURSE */ \
1+2*LINK_SIZE+1, /* CALLOUT */ \
0, /* CALLOUT_STR - variable length */ \
Expand Down Expand Up @@ -1829,6 +1829,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */

#define RREF_ANY 0xffff

/* Flag bits used by OP_REFI and OP_DNREFI to control matching behaviour. They
are stored in the extra code unit that follows the opcode's other data.
REFI_FLAG_CASELESS_RESTRICT is set at compile time when
PCRE2_EXTRA_CASELESS_RESTRICT is in force for the back reference. */

#define REFI_FLAG_CASELESS_RESTRICT 0x1


/* ---------- Private structures that are mode-independent. ---------- */

Expand Down
41 changes: 32 additions & 9 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -1149,7 +1149,7 @@ while (cc < ccend)
/* Fall through. */
case OP_REF:
common->optimized_cbracket[GET2(cc, 1)] = 0;
cc += 1 + IMM2_SIZE;
cc += PRIV(OP_lengths)[*cc];
break;

case OP_ASSERT_NA:
Expand Down Expand Up @@ -1181,8 +1181,16 @@ while (cc < ccend)
cc += 1 + IMM2_SIZE;
break;

case OP_DNREF:
case OP_DNREFI:
#ifdef SUPPORT_UNICODE
if (common->iref_ptr == 0)
{
common->iref_ptr = common->ovector_start;
common->ovector_start += 3 * sizeof(sljit_sw);
}
#endif /* SUPPORT_UNICODE */
/* Fall through */
case OP_DNREF:
case OP_DNCREF:
count = GET2(cc, 1 + IMM2_SIZE);
slot = common->name_table + GET2(cc, 1) * common->name_entry_size;
Expand All @@ -1191,7 +1199,7 @@ while (cc < ccend)
common->optimized_cbracket[GET2(slot, 0)] = 0;
slot += common->name_entry_size;
}
cc += 1 + 2 * IMM2_SIZE;
cc += PRIV(OP_lengths)[*cc];
break;

case OP_RECURSE:
Expand Down Expand Up @@ -9424,6 +9432,10 @@ jump_list *no_match = NULL;
int source_reg = COUNT_MATCH;
int source_end_reg = ARGUMENTS;
int char1_reg = STACK_LIMIT;
PCRE2_UCHAR refi_flag = 0;

if (*cc == OP_REFI || *cc == OP_DNREFI)
refi_flag = cc[PRIV(OP_lengths)[*cc] - 1];
#endif /* SUPPORT_UNICODE */

if (ref)
Expand All @@ -9438,7 +9450,7 @@ else
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0);

#if defined SUPPORT_UNICODE
if (common->utf && *cc == OP_REFI)
if (common->utf && (*cc == OP_REFI || *cc == OP_DNREFI))
{
SLJIT_ASSERT(common->iref_ptr != 0);

Expand Down Expand Up @@ -9491,6 +9503,8 @@ if (common->utf && *cc == OP_REFI)
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);

if (refi_flag & REFI_FLAG_CASELESS_RESTRICT)
add_jump(compiler, &no_match, CMP(SLJIT_LESS, char1_reg, 0, SLJIT_IMM, 128));
add_jump(compiler, &no_match, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0));
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_caseless_sets));
Expand Down Expand Up @@ -9594,6 +9608,9 @@ if (ref)
offset = GET2(cc, 1) << 1;
else
cc += IMM2_SIZE;

if (*ccbegin == OP_REFI || *ccbegin == OP_DNREFI)
cc += 1;
type = cc[1 + IMM2_SIZE];

SLJIT_COMPILE_ASSERT((OP_CRSTAR & 0x1) == 0, crstar_opcode_must_be_even);
Expand Down Expand Up @@ -12687,25 +12704,31 @@ while (cc < ccend)

case OP_REF:
case OP_REFI:
if (cc[1 + IMM2_SIZE] >= OP_CRSTAR && cc[1 + IMM2_SIZE] <= OP_CRPOSRANGE)
{
int op_len = PRIV(OP_lengths)[*cc];
if (cc[op_len] >= OP_CRSTAR && cc[op_len] <= OP_CRPOSRANGE)
cc = compile_ref_iterator_matchingpath(common, cc, parent);
else
{
compile_ref_matchingpath(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks, TRUE, FALSE);
cc += 1 + IMM2_SIZE;
cc += op_len;
}
}
break;

case OP_DNREF:
case OP_DNREFI:
if (cc[1 + 2 * IMM2_SIZE] >= OP_CRSTAR && cc[1 + 2 * IMM2_SIZE] <= OP_CRPOSRANGE)
{
int op_len = PRIV(OP_lengths)[*cc];
if (cc[op_len] >= OP_CRSTAR && cc[op_len] <= OP_CRPOSRANGE)
cc = compile_ref_iterator_matchingpath(common, cc, parent);
else
{
compile_dnref_search(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks);
compile_ref_matchingpath(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks, TRUE, FALSE);
cc += 1 + 2 * IMM2_SIZE;
cc += op_len;
}
}
break;

case OP_RECURSE:
Expand Down Expand Up @@ -12992,7 +13015,7 @@ PCRE2_SPTR cc = current->cc;
BOOL ref = (*cc == OP_REF || *cc == OP_REFI);
PCRE2_UCHAR type;

type = cc[ref ? 1 + IMM2_SIZE : 1 + 2 * IMM2_SIZE];
type = cc[PRIV(OP_lengths)[*cc]];

if ((type & 0x1) == 0)
{
Expand Down
28 changes: 19 additions & 9 deletions src/pcre2_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ seems unlikely.)
Arguments:
offset index into the offset vector
caseless TRUE if caseless
caseopts bitmask of REFI_FLAG_XYZ values
F the current backtracking frame pointer
mb points to match block
lengthptr pointer for returning the length matched
Expand All @@ -358,8 +359,8 @@ Returns: = 0 successful match; number of code units matched is set
*/

static int
match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
PCRE2_SIZE *lengthptr)
match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F,
match_block *mb, PCRE2_SIZE *lengthptr)
{
PCRE2_SPTR p;
PCRE2_SIZE length;
Expand Down Expand Up @@ -389,6 +390,7 @@ if (caseless)
{
#if defined SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0;

if (utf || (mb->poptions & PCRE2_UCP) != 0)
{
Expand Down Expand Up @@ -424,6 +426,11 @@ if (caseless)
if (c != d && c != (uint32_t)((int)d + ur->other_case))
{
const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;

/* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets
that start with an ASCII character. */
if (caseless_restrict && *pp < 128) return -1; /* No match */

for (;;)
{
if (c < *pp) return -1; /* No match */
Expand Down Expand Up @@ -5006,16 +5013,18 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
#define Lmin F->temp_32[0]
#define Lmax F->temp_32[1]
#define Lcaseless F->temp_32[2]
#define Lcaseopts F->temp_32[3]
#define Lstart F->temp_sptr[0]
#define Loffset F->temp_size

case OP_DNREF:
case OP_DNREFI:
Lcaseless = (Fop == OP_DNREFI);
Lcaseopts = (Fop == OP_DNREFI)? Fecode[1 + 2*IMM2_SIZE] : 0;
{
int count = GET2(Fecode, 1+IMM2_SIZE);
PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
Fecode += 1 + 2*IMM2_SIZE;
Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 1 : 0);

while (count-- > 0)
{
Expand All @@ -5029,8 +5038,9 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
case OP_REF:
case OP_REFI:
Lcaseless = (Fop == OP_REFI);
Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0;
Loffset = (GET2(Fecode, 1) << 1) - 2;
Fecode += 1 + IMM2_SIZE;
Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0);

/* Set up for repetition, or handle the non-repeated case. The maximum and
minimum must be in the heap frame, but as they are short-term values, we
Expand Down Expand Up @@ -5062,7 +5072,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,

default: /* No repeat follows */
{
rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length);
if (rrc != 0)
{
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
Expand Down Expand Up @@ -5096,7 +5106,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
for (i = 1; i <= Lmin; i++)
{
PCRE2_SIZE slength;
rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
if (rrc != 0)
{
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
Expand All @@ -5120,7 +5130,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
RMATCH(Fecode, RM20);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
if (rrc != 0)
{
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
Expand All @@ -5145,7 +5155,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
for (i = Lmin; i < Lmax; i++)
{
PCRE2_SIZE slength;
rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
if (rrc != 0)
{
/* Can't use CHECK_PARTIAL because we don't want to update Feptr in
Expand Down Expand Up @@ -5196,7 +5206,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
for (i = Lmin; i < Lmax; i++)
{
PCRE2_SIZE slength;
(void)match_ref(Loffset, Lcaseless, F, mb, &slength);
(void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
Feptr += slength;
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/pcre2_printint.c
Original file line number Diff line number Diff line change
Expand Up @@ -633,21 +633,25 @@ for(;;)

case OP_REFI:
flag = "/i";
extra = code[1 + IMM2_SIZE];
/* Fall through */
case OP_REF:
fprintf(f, " %s \\%d", flag, GET2(code,1));
if (extra != 0) fprintf(f, " 0x%02x", extra);
ccode = code + OP_lengths[*code];
goto CLASS_REF_REPEAT;

case OP_DNREFI:
flag = "/i";
extra = code[1 + 2*IMM2_SIZE];
/* Fall through */
case OP_DNREF:
{
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
fprintf(f, " %s \\k<", flag);
print_custring(f, entry);
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
if (extra != 0) fprintf(f, " 0x%02x", extra);
}
ccode = code + OP_lengths[*code];
goto CLASS_REF_REPEAT;
Expand Down
4 changes: 2 additions & 2 deletions src/pcre2_study.c
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ for (;;)
}
}
else d = 0;
cc += 1 + 2*IMM2_SIZE;
cc += PRIV(OP_lengths)[*cc];
goto REPEAT_BACK_REFERENCE;

/* Single back reference by number. References by name are converted to by
Expand Down Expand Up @@ -593,7 +593,7 @@ for (;;)
backref_cache[0] = recno;
}

cc += 1 + IMM2_SIZE;
cc += PRIV(OP_lengths)[*cc];

/* Handle repeated back references */

Expand Down
46 changes: 46 additions & 0 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -2289,6 +2289,52 @@
s\x{212a}s
K\x{17f}K

/(.) \1/i,utf,caseless_restrict
s S
k K
\= Expect no match
s \x{17f}
k \x{212a}

/(.) (?r:\1)/i,utf
s S
k K
\= Expect no match
s \x{17f}
k \x{212a}

/(.) \1/i,utf
s S
k K
s \x{17f}
k \x{212a}

/(?:(?<A>ss)|(?<A>kk)) \k<A>/i,utf,dupnames,caseless_restrict
sS Ss
kK Kk
\= Expect no match
sS \x{17f}s
kK \x{212a}k

/(?:(?<A>ss)|(?<A>kk)) \k<A>/i,utf,dupnames
sS Ss
kK Kk
sS \x{17f}s
kK \x{212a}k

/(?:(?<A>s)|(?<A>k)) \k<A>{3,}!/i,utf,dupnames,caseless_restrict
s SsSs!
k KkKk!
\= Expect no match
s \x{17f}sSs\x{17f}!
k \x{212a}kKk\x{212a}!

/(?:(?<A>s)|(?<A>k)) \k<A>{3,}!/i,utf,dupnames
s SsSs!
k KkKk!
s \x{17f}sSs\x{17f}!
k \x{212a}kKk\x{212a}!

# End caseless restrict tests

# TESTS for PCRE2_EXTRA_ASCII_xxx - again, tests with and without.
Expand Down
Loading

0 comments on commit 2239414

Please sign in to comment.