diff options
Diffstat (limited to 'plugins/Pcre16/src/pcre_compile.c')
-rw-r--r-- | plugins/Pcre16/src/pcre_compile.c | 348 |
1 files changed, 230 insertions, 118 deletions
diff --git a/plugins/Pcre16/src/pcre_compile.c b/plugins/Pcre16/src/pcre_compile.c index 8a5b723347..0efad2645d 100644 --- a/plugins/Pcre16/src/pcre_compile.c +++ b/plugins/Pcre16/src/pcre_compile.c @@ -47,8 +47,8 @@ supporting internal functions that are not used by other modules. */ #endif #define NLBLOCK cd /* Block containing newline information */ -#define PSSTART start_pattern /* Field containing processed string start */ -#define PSEND end_pattern /* Field containing processed string end */ +#define PSSTART start_pattern /* Field containing pattern start */ +#define PSEND end_pattern /* Field containing pattern end */ #include "pcre_internal.h" @@ -549,6 +549,7 @@ static const char error_texts[] = "group name must start with a non-digit\0" /* 85 */ "parentheses are too deeply nested (stack check)\0" + "digits missing in \\x{} or \\o{}\0" ; /* Table to identify digits and hex digits. This is used when compiling @@ -1259,6 +1260,7 @@ else case CHAR_o: if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else + if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else { ptr += 2; c = 0; @@ -1328,6 +1330,11 @@ else if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { ptr += 2; + if (*ptr == CHAR_RIGHT_CURLY_BRACKET) + { + *errorcodeptr = ERR86; + break; + } c = 0; overflow = FALSE; while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) @@ -1583,29 +1590,29 @@ read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr) int min = 0; int max = -1; -/* Read the minimum value and do a paranoid check: a negative value indicates -an integer overflow. */ - -while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0); -if (min < 0 || min > 65535) +while (IS_DIGIT(*p)) { - *errorcodeptr = ERR5; - return p; + min = min * 10 + (int)(*p++ - CHAR_0); + if (min > 65535) + { + *errorcodeptr = ERR5; + return p; + } } -/* Read the maximum value if there is one, and again do a paranoid on its size. -Also, max must not be less than min. */ - if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else { if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) { max = 0; - while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0); - if (max < 0 || max > 65535) + while(IS_DIGIT(*p)) { - *errorcodeptr = ERR5; - return p; + max = max * 10 + (int)(*p++ - CHAR_0); + if (max > 65535) + { + *errorcodeptr = ERR5; + return p; + } } if (max < min) { @@ -1615,9 +1622,6 @@ if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else } } -/* Fill in the required variables, and pass back the pointer to the terminating -'}'. */ - *minp = min; *maxp = max; return p; @@ -1700,6 +1704,7 @@ Arguments: utf TRUE in UTF-8 / UTF-16 / UTF-32 mode atend TRUE if called when the pattern is complete cd the "compile data" structure + recurses chain of recurse_check to catch mutual recursion Returns: the fixed length, or -1 if there is no fixed length, @@ -1709,10 +1714,11 @@ Returns: the fixed length, */ static int -find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd) +find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd, + recurse_check *recurses) { int length = -1; - +recurse_check this_recurse; register int branchlength = 0; register pcre_uchar *cc = code + 1 + LINK_SIZE; @@ -1737,7 +1743,8 @@ for (;;) case OP_ONCE: case OP_ONCE_NC: case OP_COND: - d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd); + d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd, + recurses); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); @@ -1771,7 +1778,15 @@ for (;;) cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */ do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ if (cc > cs && cc < ce) return -1; /* Recursion */ - d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd); + else /* Check for mutual recursion */ + { + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; + if (r != NULL) return -1; /* Mutual recursion */ + } + this_recurse.prev = recurses; + this_recurse.group = cs; + d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse); if (d < 0) return d; branchlength += d; cc += 1 + LINK_SIZE; @@ -2125,32 +2140,60 @@ for (;;) { case OP_CHAR: case OP_CHARI: + case OP_NOT: + case OP_NOTI: case OP_EXACT: case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: case OP_UPTO: case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: case OP_MINUPTO: case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: case OP_POSUPTO: case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: case OP_STAR: case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: case OP_MINSTAR: case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: case OP_POSSTAR: case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: case OP_PLUS: case OP_PLUSI: + case OP_NOTPLUS: + case OP_NOTPLUSI: case OP_MINPLUS: case OP_MINPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: case OP_POSPLUS: case OP_POSPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: case OP_QUERY: case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: case OP_MINQUERY: case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); break; } @@ -2330,11 +2373,6 @@ Arguments: Returns: TRUE if what is matched could be empty */ -typedef struct recurse_check { - struct recurse_check *prev; - const pcre_uchar *group; -} recurse_check; - static BOOL could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, BOOL utf, compile_data *cd, recurse_check *recurses) @@ -2370,6 +2408,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (c == OP_RECURSE) { const pcre_uchar *scode = cd->start_code + GET(code, 1); + const pcre_uchar *endgroup = scode; BOOL empty_branch; /* Test for forward reference or uncompleted reference. This is disabled @@ -2384,20 +2423,16 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ } - /* If we are scanning a completed pattern, there are no forward references - and all groups are complete. We need to detect whether this is a recursive - call, as otherwise there will be an infinite loop. If it is a recursion, - just skip over it. Simple recursions are easily detected. For mutual - recursions we keep a chain on the stack. */ + /* If the reference is to a completed group, we need to detect whether this + is a recursive call, as otherwise there will be an infinite loop. If it is + a recursion, just skip over it. Simple recursions are easily detected. For + mutual recursions we keep a chain on the stack. */ + do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); + if (code >= scode && code <= endgroup) continue; /* Simple recursion */ else { recurse_check *r = recurses; - const pcre_uchar *endgroup = scode; - - do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); - if (code >= scode && code <= endgroup) continue; /* Simple recursion */ - for (r = recurses; r != NULL; r = r->prev) if (r->group == scode) break; if (r != NULL) continue; /* Mutual recursion */ @@ -2468,8 +2503,8 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); empty_branch = FALSE; do { - if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL)) - empty_branch = TRUE; + if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, + recurses)) empty_branch = TRUE; code += GET(code, 1); } while (*code == OP_ALT); @@ -3038,7 +3073,7 @@ switch(c) end += 1 + 2 * IMM2_SIZE; break; } - list[2] = end - code; + list[2] = (pcre_uint32)(end - code); return end; } return NULL; /* Opcode not accepted */ @@ -3064,7 +3099,7 @@ Returns: TRUE if the auto-possessification is possible static BOOL compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd, - const pcre_uint32 *base_list, const pcre_uchar *base_end) + const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit) { pcre_uchar c; pcre_uint32 list[8]; @@ -3079,6 +3114,10 @@ const pcre_uint8 *class_bitset; const pcre_uint8 *set1, *set2, *set_end; pcre_uint32 chr; BOOL accepted, invert_bits; +BOOL entered_a_group = FALSE; + +if (*rec_limit == 0) return FALSE; +--(*rec_limit); /* Note: the base_list[1] contains whether the current opcode has greedy (represented by a non-zero value) quantifier. This is a different from @@ -3132,8 +3171,10 @@ for(;;) case OP_ONCE: case OP_ONCE_NC: /* Atomic sub-patterns and assertions can always auto-possessify their - last iterator. */ - return TRUE; + last iterator. However, if the group was entered as a result of checking + a previous iterator, this is not possible. */ + + return !entered_a_group; } code += PRIV(OP_lengths)[c]; @@ -3148,10 +3189,13 @@ for(;;) while (*next_code == OP_ALT) { - if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE; + if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit)) + return FALSE; code = next_code + 1 + LINK_SIZE; next_code += GET(next_code, 1); } + + entered_a_group = TRUE; continue; case OP_BRAZERO: @@ -3166,11 +3210,14 @@ for(;;) /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ next_code += 1 + LINK_SIZE; - if (!compare_opcodes(next_code, utf, cd, base_list, base_end)) + if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit)) return FALSE; code += PRIV(OP_lengths)[c]; continue; + + default: + break; } /* Check for a supported opcode, and load its properties. */ @@ -3409,8 +3456,7 @@ for(;;) rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; - if (!accepted) - return FALSE; + if (!accepted) return FALSE; if (list[1] == 0) return TRUE; /* Might be an empty repeat. */ @@ -3597,11 +3643,20 @@ register pcre_uchar c; const pcre_uchar *end; pcre_uchar *repeat_opcode; pcre_uint32 list[8]; +int rec_limit; for (;;) { c = *code; + /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK, + it may compile without complaining, but may get into a loop here if the code + pointer points to a bad value. This is, of course a documentated possibility, + when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and + just give up on this optimization. */ + + if (c >= OP_TABLE_LENGTH) return; + if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) { c -= get_repeat_base(c) - OP_STAR; @@ -3609,7 +3664,8 @@ for (;;) get_chr_property_list(code, utf, cd->fcc, list) : NULL; list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; - if (end != NULL && compare_opcodes(end, utf, cd, list, end)) + rec_limit = 1000; + if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit)) { switch(c) { @@ -3665,7 +3721,8 @@ for (;;) list[1] = (c & 1) == 0; - if (compare_opcodes(end, utf, cd, list, end)) + rec_limit = 1000; + if (compare_opcodes(end, utf, cd, list, end, &rec_limit)) { switch (c) { @@ -3939,14 +3996,14 @@ Arguments: adjust the amount by which the group is to be moved utf TRUE in UTF-8 / UTF-16 / UTF-32 mode cd contains pointers to tables etc. - save_hwm the hwm forward reference pointer at the start of the group + save_hwm_offset the hwm forward reference offset at the start of the group Returns: nothing */ static void adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd, - pcre_uchar *save_hwm) + size_t save_hwm_offset) { pcre_uchar *ptr = group; @@ -3958,7 +4015,8 @@ while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) /* See if this recursion is on the forward reference list. If so, adjust the reference. */ - for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) + for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm; + hc += LINK_SIZE) { offset = (int)GET(hc, 0); if (cd->start_code + offset == ptr + 1) @@ -4163,7 +4221,11 @@ if ((options & PCRE_CASELESS) != 0) range. Otherwise, use a recursive call to add the additional range. */ else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ - else if (od > end && oc <= end + 1) end = od; /* Extend upwards */ + else if (od > end && oc <= end + 1) + { + end = od; /* Extend upwards */ + if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); + } else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od); } } @@ -4403,7 +4465,7 @@ const pcre_uchar *tempptr; const pcre_uchar *nestptr = NULL; pcre_uchar *previous = NULL; pcre_uchar *previous_callout = NULL; -pcre_uchar *save_hwm = NULL; +size_t save_hwm_offset = 0; pcre_uint8 classbits[32]; /* We can fish out the UTF-8 setting once and for all into a BOOL, but we @@ -4683,7 +4745,8 @@ for (;; ptr++) previous = NULL; if ((options & PCRE_MULTILINE) != 0) { - if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; + if (firstcharflags == REQ_UNSET) + zerofirstcharflags = firstcharflags = REQ_NONE; *code++ = OP_CIRCM; } else *code++ = OP_CIRC; @@ -4863,7 +4926,7 @@ for (;; ptr++) if (lengthptr != NULL && class_uchardata > class_uchardata_base) { xclass = TRUE; - *lengthptr += class_uchardata - class_uchardata_base; + *lengthptr += (int)(class_uchardata - class_uchardata_base); class_uchardata = class_uchardata_base; } #endif @@ -5313,7 +5376,7 @@ for (;; ptr++) whatever repeat count may follow. In the case of reqchar, save the previous value for reinstating. */ - if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) { ptr++; zeroreqchar = reqchar; @@ -5461,6 +5524,12 @@ for (;; ptr++) PUT(previous, 1, (int)(code - previous)); break; /* End of class handling */ } + + /* Even though any XCLASS list is now discarded, we must allow for + its memory. */ + + if (lengthptr != NULL) + *lengthptr += (int)(class_uchardata - class_uchardata_base); #endif /* If there are no characters > 255, or they are all to be included or @@ -5861,6 +5930,7 @@ for (;; ptr++) { register int i; int len = (int)(code - previous); + size_t base_hwm_offset = save_hwm_offset; pcre_uchar *bralink = NULL; pcre_uchar *brazeroptr = NULL; @@ -5915,7 +5985,7 @@ for (;; ptr++) if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; - adjust_recurse(previous, 1, utf, cd, save_hwm); + adjust_recurse(previous, 1, utf, cd, save_hwm_offset); memmove(previous + 1, previous, IN_UCHARS(len)); code++; if (repeat_max == 0) @@ -5939,7 +6009,7 @@ for (;; ptr++) { int offset; *code = OP_END; - adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm); + adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm_offset); memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len)); code += 2 + LINK_SIZE; *previous++ = OP_BRAZERO + repeat_type; @@ -6002,26 +6072,25 @@ for (;; ptr++) for (i = 1; i < repeat_min; i++) { pcre_uchar *hc; - pcre_uchar *this_hwm = cd->hwm; + size_t this_hwm_offset = cd->hwm - cd->start_workspace; memcpy(code, previous, IN_UCHARS(len)); while (cd->hwm > cd->start_workspace + cd->workspace_size - - WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm)) + WORK_SIZE_SAFETY_MARGIN - + (this_hwm_offset - base_hwm_offset)) { - int save_offset = save_hwm - cd->start_workspace; - int this_offset = this_hwm - cd->start_workspace; *errorcodeptr = expand_workspace(cd); if (*errorcodeptr != 0) goto FAILED; - save_hwm = (pcre_uchar *)cd->start_workspace + save_offset; - this_hwm = (pcre_uchar *)cd->start_workspace + this_offset; } - for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) + for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset; + hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset; + hc += LINK_SIZE) { PUT(cd->hwm, 0, GET(hc, 0) + len); cd->hwm += LINK_SIZE; } - save_hwm = this_hwm; + base_hwm_offset = this_hwm_offset; code += len; } } @@ -6066,7 +6135,7 @@ for (;; ptr++) else for (i = repeat_max - 1; i >= 0; i--) { pcre_uchar *hc; - pcre_uchar *this_hwm = cd->hwm; + size_t this_hwm_offset = cd->hwm - cd->start_workspace; *code++ = OP_BRAZERO + repeat_type; @@ -6088,22 +6157,21 @@ for (;; ptr++) copying them. */ while (cd->hwm > cd->start_workspace + cd->workspace_size - - WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm)) + WORK_SIZE_SAFETY_MARGIN - + (this_hwm_offset - base_hwm_offset)) { - int save_offset = save_hwm - cd->start_workspace; - int this_offset = this_hwm - cd->start_workspace; *errorcodeptr = expand_workspace(cd); if (*errorcodeptr != 0) goto FAILED; - save_hwm = (pcre_uchar *)cd->start_workspace + save_offset; - this_hwm = (pcre_uchar *)cd->start_workspace + this_offset; } - for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) + for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset; + hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset; + hc += LINK_SIZE) { PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); cd->hwm += LINK_SIZE; } - save_hwm = this_hwm; + base_hwm_offset = this_hwm_offset; code += len; } @@ -6199,7 +6267,7 @@ for (;; ptr++) { int nlen = (int)(code - bracode); *code = OP_END; - adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm); + adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm_offset); memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen)); code += 1 + LINK_SIZE; nlen += 1 + LINK_SIZE; @@ -6333,7 +6401,7 @@ for (;; ptr++) else { *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm); + adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset); memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; @@ -6382,7 +6450,7 @@ for (;; ptr++) default: *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm); + adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset); memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; @@ -6411,15 +6479,25 @@ for (;; ptr++) parenthesis forms. */ case CHAR_LEFT_PARENTHESIS: - newoptions = options; - skipbytes = 0; - bravalue = OP_CBRA; - save_hwm = cd->hwm; - reset_bracount = FALSE; + ptr++; - /* First deal with various "verbs" that can be introduced by '*'. */ + /* First deal with comments. Putting this code right at the start ensures + that comments have no bad side effects. */ + + if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN) + { + ptr += 2; + while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; + if (*ptr == CHAR_NULL) + { + *errorcodeptr = ERR18; + goto FAILED; + } + continue; + } + + /* Now deal with various "verbs" that can be introduced by '*'. */ - ptr++; if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':' || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0)))) { @@ -6540,10 +6618,18 @@ for (;; ptr++) goto FAILED; } + /* Initialize for "real" parentheses */ + + newoptions = options; + skipbytes = 0; + bravalue = OP_CBRA; + save_hwm_offset = cd->hwm - cd->start_workspace; + reset_bracount = FALSE; + /* Deal with the extended parentheses; all are introduced by '?', and the appearance of any of them means that this is not a capturing group. */ - else if (*ptr == CHAR_QUESTION_MARK) + if (*ptr == CHAR_QUESTION_MARK) { int i, set, unset, namelen; int *optset; @@ -6552,17 +6638,6 @@ for (;; ptr++) switch (*(++ptr)) { - case CHAR_NUMBER_SIGN: /* Comment; skip to ket */ - ptr++; - while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; - if (*ptr == CHAR_NULL) - { - *errorcodeptr = ERR18; - goto FAILED; - } - continue; - - /* ------------------------------------------------------------ */ case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ reset_bracount = TRUE; @@ -6611,8 +6686,13 @@ for (;; ptr++) if (tempptr[1] == CHAR_QUESTION_MARK && (tempptr[2] == CHAR_EQUALS_SIGN || tempptr[2] == CHAR_EXCLAMATION_MARK || - tempptr[2] == CHAR_LESS_THAN_SIGN)) + (tempptr[2] == CHAR_LESS_THAN_SIGN && + (tempptr[3] == CHAR_EQUALS_SIGN || + tempptr[3] == CHAR_EXCLAMATION_MARK)))) + { + cd->iscondassert = TRUE; break; + } /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all need to skip at least 1+IMM2_SIZE bytes at the start of the group. */ @@ -6725,6 +6805,7 @@ for (;; ptr++) goto FAILED; } PUT2(code, 2+LINK_SIZE, recno); + if (recno > cd->top_backref) cd->top_backref = recno; break; } @@ -6747,12 +6828,15 @@ for (;; ptr++) int offset = i++; int count = 1; recno = GET2(slot, 0); /* Number from first found */ + if (recno > cd->top_backref) cd->top_backref = recno; for (; i < cd->names_found; i++) { slot += cd->name_entry_size; - if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break; + if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 || + (slot+IMM2_SIZE)[namelen] != 0) break; count++; } + if (count > 1) { PUT2(code, 2+LINK_SIZE, offset); @@ -7101,6 +7185,12 @@ for (;; ptr++) /* Count named back references. */ if (!is_recurse) cd->namedrefcount++; + + /* We have to allow for a named reference to a duplicated name (this + cannot be determined until the second pass). This needs an extra + 16-bit data item. */ + + *lengthptr += IMM2_SIZE; } /* In the real compile, search the name table. We check the name @@ -7147,6 +7237,8 @@ for (;; ptr++) for (i++; i < cd->names_found; i++) { if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break; + + count++; cslot += cd->name_entry_size; } @@ -7455,12 +7547,22 @@ for (;; ptr++) goto FAILED; } - /* Assertions used not to be repeatable, but this was changed for Perl - compatibility, so all kinds can now be repeated. We copy code into a + /* All assertions used not to be repeatable, but this was changed for Perl + compatibility. All kinds can now be repeated except for assertions that are + conditions (Perl also forbids these to be repeated). We copy code into a non-register variable (tempcode) in order to be able to pass its address - because some compilers complain otherwise. */ + because some compilers complain otherwise. At the start of a conditional + group whose condition is an assertion, cd->iscondassert is set. We unset it + here so as to allow assertions later in the group to be quantified. */ + + if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT && + cd->iscondassert) + { + previous = NULL; + cd->iscondassert = FALSE; + } + else previous = code; - previous = code; /* For handling repetition */ *code = bravalue; tempcode = code; tempreqvary = cd->req_varyopt; /* Save value before bracket */ @@ -7707,7 +7809,7 @@ for (;; ptr++) const pcre_uchar *p; pcre_uint32 cf; - save_hwm = cd->hwm; /* Normally this is set when '(' is read */ + save_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */ terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; @@ -8034,6 +8136,7 @@ int length; unsigned int orig_bracount; unsigned int max_bracount; branch_chain bc; +size_t save_hwm_offset; /* If set, call the external function that checks for stack availability. */ @@ -8051,6 +8154,8 @@ bc.current_branch = code; firstchar = reqchar = 0; firstcharflags = reqcharflags = REQ_UNSET; +save_hwm_offset = cd->hwm - cd->start_workspace; + /* Accumulate the length for use in the pre-compile phase. Start with the length of the BRA and KET and any extra bytes that are required at the beginning. We accumulate in a local variable to save frequent testing of @@ -8192,7 +8297,7 @@ for (;;) int fixed_length; *code = OP_END; fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != 0, - FALSE, cd); + FALSE, cd, NULL); DPRINTF(("fixed length = %d\n", fixed_length)); if (fixed_length == -3) { @@ -8244,12 +8349,16 @@ for (;;) /* If it was a capturing subpattern, check to see if it contained any recursive back references. If so, we must wrap it in atomic brackets. - In any event, remove the block from the chain. */ + Because we are moving code along, we must ensure that any pending recursive + references are updated. In any event, remove the block from the chain. */ if (capnumber > 0) { if (cd->open_caps->flag) { + *code = OP_END; + adjust_recurse(start_bracket, 1 + LINK_SIZE, + (options & PCRE_UTF8) != 0, cd, save_hwm_offset); memmove(start_bracket + 1 + LINK_SIZE, start_bracket, IN_UCHARS(code - start_bracket)); *start_bracket = OP_ONCE; @@ -8473,6 +8582,7 @@ do { case OP_RREF: case OP_DNRREF: case OP_DEF: + case OP_FAIL: return FALSE; default: /* Assertion */ @@ -9057,6 +9167,7 @@ cd->dupnames = FALSE; cd->namedrefcount = 0; cd->start_code = cworkspace; cd->hwm = cworkspace; +cd->iscondassert = FALSE; cd->start_workspace = cworkspace; cd->workspace_size = COMPILE_WORK_SIZE; cd->named_groups = named_groups; @@ -9094,13 +9205,6 @@ if (length > MAX_PATTERN_SIZE) goto PCRE_EARLY_ERROR_RETURN; } -/* If there are groups with duplicate names and there are also references by -name, we must allow for the possibility of named references to duplicated -groups. These require an extra data item each. */ - -if (cd->dupnames && cd->namedrefcount > 0) - length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar); - /* Compute the size of the data block for storing the compiled pattern. Integer overflow should no longer be possible because nowadays we limit the maximum value of cd->names_found and cd->name_entry_size. */ @@ -9159,6 +9263,7 @@ cd->name_table = (pcre_uchar *)re + re->name_table_offset; codestart = cd->name_table + re->name_entry_size * re->name_count; cd->start_code = codestart; cd->hwm = (pcre_uchar *)(cd->start_workspace); +cd->iscondassert = FALSE; cd->req_varyopt = 0; cd->had_accept = FALSE; cd->had_pruneorskip = FALSE; @@ -9254,11 +9359,18 @@ subpattern. */ if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; -/* Unless disabled, check whether single character iterators can be -auto-possessified. The function overwrites the appropriate opcode values. */ +/* Unless disabled, check whether any single character iterators can be +auto-possessified. The function overwrites the appropriate opcode values, so +the type of the pointer must be cast. NOTE: the intermediate variable "temp" is +used in this code because at least one compiler gives a warning about loss of +"const" attribute if the cast (pcre_uchar *)codestart is used directly in the +function call. */ if ((options & PCRE_NO_AUTO_POSSESS) == 0) - auto_possessify((pcre_uchar *)codestart, utf, cd); + { + pcre_uchar *temp = (pcre_uchar *)codestart; + auto_possessify(temp, utf, cd); + } /* If there were any lookbehind assertions that contained OP_RECURSE (recursions or subroutine calls), a flag is set for them to be checked here, @@ -9288,7 +9400,7 @@ if (cd->check_lookbehind) int end_op = *be; *be = OP_END; fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE, - cd); + cd, NULL); *be = end_op; DPRINTF(("fixed length = %d\n", fixed_length)); if (fixed_length < 0) |