diff options
author | Matthias Clasen <mclasen@redhat.com> | 2011-01-22 00:01:54 -0500 |
---|---|---|
committer | Matthias Clasen <mclasen@redhat.com> | 2011-01-22 00:01:54 -0500 |
commit | fb2809ec996e9e12d06f4bc7239a98718f5f06d7 (patch) | |
tree | 24a472665ff8dacbb97bd748da9588d7b935a1c4 | |
parent | 3f059a6a123dd62257f224b9af7701078783060e (diff) |
Forgotten files
-rw-r--r-- | glib/pcre/pcre.h | 81 | ||||
-rw-r--r-- | glib/pcre/pcre_chartables.c | 2 | ||||
-rw-r--r-- | glib/pcre/pcre_compile.c | 899 | ||||
-rw-r--r-- | glib/pcre/pcre_dfa_exec.c | 257 | ||||
-rw-r--r-- | glib/pcre/pcre_exec.c | 1155 | ||||
-rw-r--r-- | glib/pcre/pcre_internal.h | 286 | ||||
-rw-r--r-- | glib/pcre/pcre_study.c | 277 | ||||
-rw-r--r-- | glib/pcre/pcre_tables.c | 271 | ||||
-rw-r--r-- | glib/pcre/pcre_xclass.c | 37 | ||||
-rw-r--r-- | glib/pcre/ucp.h | 5 |
10 files changed, 2271 insertions, 999 deletions
diff --git a/glib/pcre/pcre.h b/glib/pcre/pcre.h index 4864bd099..7c4c04011 100644 --- a/glib/pcre/pcre.h +++ b/glib/pcre/pcre.h @@ -5,7 +5,7 @@ /* This is the public header file for the PCRE library, to be #included by applications that call the PCRE functions. - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2010 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE. /* The current PCRE version information. */ #define PCRE_MAJOR 8 -#define PCRE_MINOR 02 +#define PCRE_MINOR 12 #define PCRE_PRERELEASE -#define PCRE_DATE 2010-03-19 +#define PCRE_DATE 2011-01-15 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. When building PCRE, the appropriate @@ -96,41 +96,44 @@ extern "C" { #endif /* Options. Some are compile-time only, some are run-time only, and some are -both, so we keep them all distinct. */ - -#define PCRE_CASELESS 0x00000001 -#define PCRE_MULTILINE 0x00000002 -#define PCRE_DOTALL 0x00000004 -#define PCRE_EXTENDED 0x00000008 -#define PCRE_ANCHORED 0x00000010 -#define PCRE_DOLLAR_ENDONLY 0x00000020 -#define PCRE_EXTRA 0x00000040 -#define PCRE_NOTBOL 0x00000080 -#define PCRE_NOTEOL 0x00000100 -#define PCRE_UNGREEDY 0x00000200 -#define PCRE_NOTEMPTY 0x00000400 -#define PCRE_UTF8 0x00000800 -#define PCRE_NO_AUTO_CAPTURE 0x00001000 -#define PCRE_NO_UTF8_CHECK 0x00002000 -#define PCRE_AUTO_CALLOUT 0x00004000 -#define PCRE_PARTIAL_SOFT 0x00008000 +both, so we keep them all distinct. However, almost all the bits in the options +word are now used. In the long run, we may have to re-use some of the +compile-time only bits for runtime options, or vice versa. */ + +#define PCRE_CASELESS 0x00000001 /* Compile */ +#define PCRE_MULTILINE 0x00000002 /* Compile */ +#define PCRE_DOTALL 0x00000004 /* Compile */ +#define PCRE_EXTENDED 0x00000008 /* Compile */ +#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */ +#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */ +#define PCRE_EXTRA 0x00000040 /* Compile */ +#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */ +#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */ +#define PCRE_UNGREEDY 0x00000200 /* Compile */ +#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */ +#define PCRE_UTF8 0x00000800 /* Compile */ +#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */ +#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */ +#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */ +#define PCRE_PARTIAL_SOFT 0x00008000 /* Exec, DFA exec */ #define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */ -#define PCRE_DFA_SHORTEST 0x00010000 -#define PCRE_DFA_RESTART 0x00020000 -#define PCRE_FIRSTLINE 0x00040000 -#define PCRE_DUPNAMES 0x00080000 -#define PCRE_NEWLINE_CR 0x00100000 -#define PCRE_NEWLINE_LF 0x00200000 -#define PCRE_NEWLINE_CRLF 0x00300000 -#define PCRE_NEWLINE_ANY 0x00400000 -#define PCRE_NEWLINE_ANYCRLF 0x00500000 -#define PCRE_BSR_ANYCRLF 0x00800000 -#define PCRE_BSR_UNICODE 0x01000000 -#define PCRE_JAVASCRIPT_COMPAT 0x02000000 -#define PCRE_NO_START_OPTIMIZE 0x04000000 -#define PCRE_NO_START_OPTIMISE 0x04000000 -#define PCRE_PARTIAL_HARD 0x08000000 -#define PCRE_NOTEMPTY_ATSTART 0x10000000 +#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */ +#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */ +#define PCRE_FIRSTLINE 0x00040000 /* Compile */ +#define PCRE_DUPNAMES 0x00080000 /* Compile */ +#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */ +#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */ +#define PCRE_NEWLINE_CRLF 0x00300000 /* Compile, exec, DFA exec */ +#define PCRE_NEWLINE_ANY 0x00400000 /* Compile, exec, DFA exec */ +#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */ +#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */ +#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */ +#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */ +#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */ +#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */ +#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */ +#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */ +#define PCRE_UCP 0x20000000 /* Compile */ /* Exec-time and get/set-time error codes */ @@ -158,6 +161,8 @@ both, so we keep them all distinct. */ #define PCRE_ERROR_RECURSIONLIMIT (-21) #define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */ #define PCRE_ERROR_BADNEWLINE (-23) +#define PCRE_ERROR_BADOFFSET (-24) +#define PCRE_ERROR_SHORTUTF8 (-25) /* Request types for pcre_fullinfo() */ @@ -200,6 +205,7 @@ these bits, just add new ones on the end, in order to remain compatible. */ #define PCRE_EXTRA_CALLOUT_DATA 0x0004 #define PCRE_EXTRA_TABLES 0x0008 #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010 +#define PCRE_EXTRA_MARK 0x0020 /* Types */ @@ -225,6 +231,7 @@ typedef struct pcre_extra { void *callout_data; /* Data passed back in callouts */ const unsigned char *tables; /* Pointer to character tables */ unsigned long int match_limit_recursion; /* Max recursive calls to match() */ + unsigned char **mark; /* For passing back a mark pointer */ } pcre_extra; /* The structure for passing out data via the pcre_callout_function. We use a diff --git a/glib/pcre/pcre_chartables.c b/glib/pcre/pcre_chartables.c index ae45db0ca..9117ae3c7 100644 --- a/glib/pcre/pcre_chartables.c +++ b/glib/pcre/pcre_chartables.c @@ -14,7 +14,7 @@ example ISO-8859-1. When dftables is run, it creates these tables in the current locale. If PCRE is configured with --enable-rebuild-chartables, this happens automatically. -The following #includes are present because without the gcc 4.x may remove the +The following #includes are present because without them gcc 4.x may remove the array definition from the final binary if PCRE is built into a static library and dead code stripping is activated. This leads to link errors. Pulling in the header ensures that the array gets flagged as "someone outside this compilation diff --git a/glib/pcre/pcre_compile.c b/glib/pcre/pcre_compile.c index a00a99017..f0bae53ee 100644 --- a/glib/pcre/pcre_compile.c +++ b/glib/pcre/pcre_compile.c @@ -124,7 +124,7 @@ static const short int escapes[] = { -ESC_H, 0, 0, -ESC_K, 0, 0, - 0, 0, + -ESC_N, 0, -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, @@ -171,7 +171,7 @@ static const short int escapes[] = { /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, -/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P, +/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P, /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, @@ -188,11 +188,14 @@ string is built from string macros so that it works in UTF-8 mode on EBCDIC platforms. */ typedef struct verbitem { - int len; - int op; + int len; /* Length of verb name */ + int op; /* Op when no arg, or -1 if arg mandatory */ + int op_arg; /* Op when arg present, or -1 if not allowed */ } verbitem; static const char verbnames[] = + "\0" /* Empty name is a shorthand for MARK */ + STRING_MARK0 STRING_ACCEPT0 STRING_COMMIT0 STRING_F0 @@ -202,13 +205,15 @@ static const char verbnames[] = STRING_THEN; static const verbitem verbs[] = { - { 6, OP_ACCEPT }, - { 6, OP_COMMIT }, - { 1, OP_FAIL }, - { 4, OP_FAIL }, - { 5, OP_PRUNE }, - { 4, OP_SKIP }, - { 4, OP_THEN } + { 0, -1, OP_MARK }, + { 4, -1, OP_MARK }, + { 6, OP_ACCEPT, -1 }, + { 6, OP_COMMIT, -1 }, + { 1, OP_FAIL, -1 }, + { 4, OP_FAIL, -1 }, + { 5, OP_PRUNE, OP_PRUNE_ARG }, + { 4, OP_SKIP, OP_SKIP_ARG }, + { 4, OP_THEN, OP_THEN_ARG } }; static const int verbcount = sizeof(verbs)/sizeof(verbitem); @@ -256,6 +261,53 @@ static const int posix_class_maps[] = { cbit_xdigit,-1, 0 /* xdigit */ }; +/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class +substitutes must be in the order of the names, defined above, and there are +both positive and negative cases. NULL means no substitute. */ + +#ifdef SUPPORT_UCP +static const uschar *substitutes[] = { + (uschar *)"\\P{Nd}", /* \D */ + (uschar *)"\\p{Nd}", /* \d */ + (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */ + (uschar *)"\\p{Xsp}", /* \s */ + (uschar *)"\\P{Xwd}", /* \W */ + (uschar *)"\\p{Xwd}" /* \w */ +}; + +static const uschar *posix_substitutes[] = { + (uschar *)"\\p{L}", /* alpha */ + (uschar *)"\\p{Ll}", /* lower */ + (uschar *)"\\p{Lu}", /* upper */ + (uschar *)"\\p{Xan}", /* alnum */ + NULL, /* ascii */ + (uschar *)"\\h", /* blank */ + NULL, /* cntrl */ + (uschar *)"\\p{Nd}", /* digit */ + NULL, /* graph */ + NULL, /* print */ + NULL, /* punct */ + (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */ + (uschar *)"\\p{Xwd}", /* word */ + NULL, /* xdigit */ + /* Negated cases */ + (uschar *)"\\P{L}", /* ^alpha */ + (uschar *)"\\P{Ll}", /* ^lower */ + (uschar *)"\\P{Lu}", /* ^upper */ + (uschar *)"\\P{Xan}", /* ^alnum */ + NULL, /* ^ascii */ + (uschar *)"\\H", /* ^blank */ + NULL, /* ^cntrl */ + (uschar *)"\\P{Nd}", /* ^digit */ + NULL, /* ^graph */ + NULL, /* ^print */ + NULL, /* ^punct */ + (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */ + (uschar *)"\\P{Xwd}", /* ^word */ + NULL /* ^xdigit */ +}; +#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *)) +#endif #define STRING(a) # a #define XSTRING(s) STRING(s) @@ -319,7 +371,7 @@ static const char error_texts[] = /* 35 */ "invalid condition (?(0)\0" "\\C not allowed in lookbehind assertion\0" - "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" + "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0" "number after (?C is > 255\0" "closing ) for (?C expected\0" /* 40 */ @@ -345,7 +397,7 @@ static const char error_texts[] = "inconsistent NEWLINE options\0" "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" "a numbered reference must not be zero\0" - "(*VERB) with an argument is not supported\0" + "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0" /* 60 */ "(*VERB) not recognized\0" "number is too big\0" @@ -353,7 +405,11 @@ static const char error_texts[] = "digit expected after (?+\0" "] is an invalid data character in JavaScript compatibility mode\0" /* 65 */ - "different names for subpatterns of the same number are not allowed\0"; + "different names for subpatterns of the same number are not allowed\0" + "(*MARK) must have an argument\0" + "this version of PCRE is not compiled with PCRE_UCP support\0" + "\\c must be followed by an ASCII character\0" + ; /* Definition to allow mutual recursion */ @@ -456,7 +512,6 @@ else case CHAR_l: case CHAR_L: - case CHAR_N: case CHAR_u: case CHAR_U: *errorcodeptr = ERR37; @@ -657,7 +712,8 @@ else break; /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. - This coding is ASCII-specific, but then the whole concept of \cx is + An error is given if the byte following \c is not an ASCII character. This + coding is ASCII-specific, but then the whole concept of \cx is ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ case CHAR_c: @@ -667,11 +723,15 @@ else *errorcodeptr = ERR2; break; } - -#ifndef EBCDIC /* ASCII/UTF-8 coding */ +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (c > 127) /* Excludes all non-ASCII in either mode */ + { + *errorcodeptr = ERR68; + break; + } if (c >= CHAR_a && c <= CHAR_z) c -= 32; c ^= 0x40; -#else /* EBCDIC coding */ +#else /* EBCDIC coding */ if (c >= CHAR_a && c <= CHAR_z) c += 64; c ^= 0xC0; #endif @@ -694,6 +754,19 @@ else } } +/* Perl supports \N{name} for character names, as well as plain \N for "not +newline". PCRE does not support \N{name}. */ + +if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET) + *errorcodeptr = ERR37; + +/* If PCRE_UCP is set, we change the values for \d etc. */ + +if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w) + c -= (ESC_DU - ESC_D); + +/* Set the pointer to the final character before returning. */ + *ptrptr = ptr; return c; } @@ -902,10 +975,21 @@ top-level call starts at the beginning of the pattern. All other calls must start at a parenthesis. It scans along a pattern's text looking for capturing subpatterns, and counting them. If it finds a named pattern that matches the name it is given, it returns its number. Alternatively, if the name is NULL, it -returns when it reaches a given numbered subpattern. We know that if (?P< is -encountered, the name will be terminated by '>' because that is checked in the -first pass. Recursion is used to keep track of subpatterns that reset the -capturing group numbers - the (?| feature. +returns when it reaches a given numbered subpattern. Recursion is used to keep +track of subpatterns that reset the capturing group numbers - the (?| feature. + +This function was originally called only from the second pass, in which we know +that if (?< or (?' or (?P< is encountered, the name will be correctly +terminated because that is checked in the first pass. There is now one call to +this function in the first pass, to check for a recursive back reference by +name (so that we can make the whole group atomic). In this case, we need check +only up to the current position in the pattern, and that is still OK because +and previous occurrences will have been checked. To make this work, the test +for "end of pattern" is a check against cd->end_pattern in the main loop, +instead of looking for a binary zero. This means that the special first-pass +call can adjust cd->end_pattern temporarily. (Checks for binary zero while +processing items within the loop are OK, because afterwards the main loop will +terminate.) Arguments: ptrptr address of the current character pointer (updated) @@ -913,6 +997,7 @@ Arguments: name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode + utf8 TRUE if we are in UTF-8 mode count pointer to the current capturing subpattern number (updated) Returns: the number of the named subpattern, or -1 if not found @@ -920,7 +1005,7 @@ Returns: the number of the named subpattern, or -1 if not found static int find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn, - BOOL xmode, int *count) + BOOL xmode, BOOL utf8, int *count) { uschar *ptr = *ptrptr; int start_count = *count; @@ -932,25 +1017,39 @@ dealing with. The very first call may not start with a parenthesis. */ if (ptr[0] == CHAR_LEFT_PARENTHESIS) { - if (ptr[1] == CHAR_QUESTION_MARK && - ptr[2] == CHAR_VERTICAL_LINE) + /* Handle specials such as (*SKIP) or (*UTF8) etc. */ + + if (ptr[1] == CHAR_ASTERISK) ptr += 2; + + /* Handle a normal, unnamed capturing parenthesis. */ + + else if (ptr[1] != CHAR_QUESTION_MARK) + { + *count += 1; + if (name == NULL && *count == lorn) return *count; + ptr++; + } + + /* All cases now have (? at the start. Remember when we are in a group + where the parenthesis numbers are duplicated. */ + + else if (ptr[2] == CHAR_VERTICAL_LINE) { ptr += 3; dup_parens = TRUE; } - /* Handle a normal, unnamed capturing parenthesis */ + /* Handle comments; all characters are allowed until a ket is reached. */ - else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK) + else if (ptr[2] == CHAR_NUMBER_SIGN) { - *count += 1; - if (name == NULL && *count == lorn) return *count; - ptr++; + for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break; + goto FAIL_EXIT; } /* Handle a condition. If it is an assertion, just carry on so that it is processed as normal. If not, skip to the closing parenthesis of the - condition (there can't be any nested parens. */ + condition (there can't be any nested parens). */ else if (ptr[2] == CHAR_LEFT_PARENTHESIS) { @@ -962,7 +1061,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS) } } - /* We have either (? or (* and not a condition */ + /* Start with (? but not a condition. */ else { @@ -991,9 +1090,11 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS) } /* Past any initial parenthesis handling, scan for parentheses or vertical -bars. */ +bars. Stop if we get to cd->end_pattern. Note that this is important for the +first-pass call when this value is temporarily adjusted to stop at the current +position. So DO NOT change this to a test for binary zero. */ -for (; *ptr != 0; ptr++) +for (; ptr < cd->end_pattern; ptr++) { /* Skip over backslashed characters and also entire \Q...\E */ @@ -1067,7 +1168,15 @@ for (; *ptr != 0; ptr++) if (xmode && *ptr == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0 && *ptr != CHAR_NL) {}; + ptr++; + while (*ptr != 0) + { + if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } + ptr++; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#endif + } if (*ptr == 0) goto FAIL_EXIT; continue; } @@ -1076,7 +1185,7 @@ for (; *ptr != 0; ptr++) if (*ptr == CHAR_LEFT_PARENTHESIS) { - int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count); + int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count); if (rc > 0) return rc; if (*ptr == 0) goto FAIL_EXIT; } @@ -1084,8 +1193,7 @@ for (; *ptr != 0; ptr++) else if (*ptr == CHAR_RIGHT_PARENTHESIS) { if (dup_parens && *count < hwm_count) *count = hwm_count; - *ptrptr = ptr; - return -1; + goto FAIL_EXIT; } else if (*ptr == CHAR_VERTICAL_LINE && dup_parens) @@ -1123,12 +1231,14 @@ Arguments: name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode + utf8 TRUE if we are in UTF-8 mode Returns: the number of the found subpattern, or -1 if not found */ static int -find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode) +find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode, + BOOL utf8) { uschar *ptr = (uschar *)cd->start_pattern; int count = 0; @@ -1141,7 +1251,7 @@ matching closing parens. That is why we have to have a loop. */ for (;;) { - rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count); + rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count); if (rc > 0 || *ptr++ == 0) break; } @@ -1485,7 +1595,8 @@ for (;;) /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters. */ + two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we + must add in its length. */ else { @@ -1509,6 +1620,16 @@ for (;;) case OP_TYPEPOSUPTO: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + code += code[1]; + break; + + case OP_THEN_ARG: + code += code[1+LINK_SIZE]; + break; } /* Add in the fixed length from the table */ @@ -1580,7 +1701,8 @@ for (;;) /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters. */ + two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we + must add in its length. */ else { @@ -1604,6 +1726,16 @@ for (;;) case OP_TYPEEXACT: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + code += code[1]; + break; + + case OP_THEN_ARG: + code += code[1+LINK_SIZE]; + break; } /* Add in the fixed length from the table */ @@ -1873,6 +2005,19 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE break; #endif + /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument + string. */ + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + code += code[1]; + break; + + case OP_THEN_ARG: + code += code[1+LINK_SIZE]; + break; + /* None of the remaining opcodes are required to match a character. */ default: @@ -2093,8 +2238,8 @@ auto_callout(uschar *code, const uschar *ptr, compile_data *cd) { *code++ = OP_CALLOUT; *code++ = 255; -PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ -PUT(code, LINK_SIZE, 0); /* Default length */ +PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */ +PUT(code, LINK_SIZE, 0); /* Default length */ return code + 2*LINK_SIZE; } @@ -2119,7 +2264,7 @@ Returns: nothing static void complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) { -int length = ptr - cd->start_pattern - GET(previous_callout, 2); +int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2)); PUT(previous_callout, 2 + LINK_SIZE, length); } @@ -2169,6 +2314,69 @@ for (++c; c <= d; c++) return TRUE; } + + + +/************************************************* +* Check a character and a property * +*************************************************/ + +/* This function is called by check_auto_possessive() when a property item +is adjacent to a fixed character. + +Arguments: + c the character + ptype the property type + pdata the data for the type + negated TRUE if it's a negated property (\P or \p{^) + +Returns: TRUE if auto-possessifying is OK +*/ + +static BOOL +check_char_prop(int c, int ptype, int pdata, BOOL negated) +{ +int chartype = UCD_CHARTYPE(c); +switch(ptype) + { + case PT_LAMP: + return (chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == negated; + + case PT_GC: + return (pdata == _pcre_ucp_gentype[chartype]) == negated; + + case PT_PC: + return (pdata == chartype) == negated; + + case PT_SC: + return (pdata == UCD_SCRIPT(c)) == negated; + + /* These are specials */ + + case PT_ALNUM: + return (_pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N) == negated; + + case PT_SPACE: /* Perl space */ + return (_pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == negated; + + case PT_PXSPACE: /* POSIX space */ + return (_pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) + == negated; + + case PT_WORD: + return (_pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N || + c == CHAR_UNDERSCORE) == negated; + } +return FALSE; +} #endif /* SUPPORT_UCP */ @@ -2182,10 +2390,8 @@ whether the next thing could possibly match the repeated item. If not, it makes sense to automatically possessify the repeated item. Arguments: - op_code the repeated op code - this data for this item, depends on the opcode + previous pointer to the repeated opcode utf8 TRUE in UTF-8 mode - utf8_char used for utf8 character bytes, NULL if not relevant ptr next character in pattern options options bits cd contains pointers to tables etc. @@ -2194,10 +2400,11 @@ Returns: TRUE if possessifying is wanted */ static BOOL -check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, - const uschar *ptr, int options, compile_data *cd) +check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr, + int options, compile_data *cd) { -int next; +int c, next; +int op_code = *previous++; /* Skip whitespace and comments in extended mode */ @@ -2208,8 +2415,15 @@ if ((options & PCRE_EXTENDED) != 0) while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0) + ptr++; + while (*ptr != 0) + { if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } + ptr++; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#endif + } } else break; } @@ -2245,8 +2459,15 @@ if ((options & PCRE_EXTENDED) != 0) while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0) + ptr++; + while (*ptr != 0) + { if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } + ptr++; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#endif + } } else break; } @@ -2258,23 +2479,18 @@ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) return FALSE; -/* Now compare the next item with the previous opcode. If the previous is a -positive single character match, "item" either contains the character or, if -"item" is greater than 127 in utf8 mode, the character's bytes are in -utf8_char. */ - - -/* Handle cases when the next item is a character. */ +/* Now compare the next item with the previous opcode. First, handle cases when +the next item is a character. */ if (next >= 0) switch(op_code) { case OP_CHAR: #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + GETCHARTEST(c, previous); #else - (void)(utf8_char); /* Keep compiler happy by referencing function argument */ + c = *previous; #endif - return item != next; + return c != next; /* For CHARNC (caseless character) we must check the other case. If we have Unicode property support, we can use it to test the other case of @@ -2282,9 +2498,11 @@ if (next >= 0) switch(op_code) case OP_CHARNC: #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + GETCHARTEST(c, previous); +#else + c = *previous; #endif - if (item == next) return FALSE; + if (c == next) return FALSE; #ifdef SUPPORT_UTF8 if (utf8) { @@ -2295,16 +2513,16 @@ if (next >= 0) switch(op_code) #else othercase = NOTACHAR; #endif - return (unsigned int)item != othercase; + return (unsigned int)c != othercase; } else #endif /* SUPPORT_UTF8 */ - return (item != cd->fcc[next]); /* Non-UTF-8 mode */ + return (c != cd->fcc[next]); /* Non-UTF-8 mode */ - /* For OP_NOT, "item" must be a single-byte character. */ + /* For OP_NOT, its data is always a single-byte character. */ case OP_NOT: - if (item == next) return TRUE; + if ((c = *previous) == next) return TRUE; if ((options & PCRE_CASELESS) == 0) return FALSE; #ifdef SUPPORT_UTF8 if (utf8) @@ -2316,11 +2534,14 @@ if (next >= 0) switch(op_code) #else othercase = NOTACHAR; #endif - return (unsigned int)item == othercase; + return (unsigned int)c == othercase; } else #endif /* SUPPORT_UTF8 */ - return (item == cd->fcc[next]); /* Non-UTF-8 mode */ + return (c == cd->fcc[next]); /* Non-UTF-8 mode */ + + /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. + When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ case OP_DIGIT: return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; @@ -2363,11 +2584,12 @@ if (next >= 0) switch(op_code) case 0x202f: case 0x205f: case 0x3000: - return op_code != OP_HSPACE; + return op_code == OP_NOT_HSPACE; default: - return op_code == OP_HSPACE; + return op_code != OP_NOT_HSPACE; } + case OP_ANYNL: case OP_VSPACE: case OP_NOT_VSPACE: switch(next) @@ -2379,48 +2601,62 @@ if (next >= 0) switch(op_code) case 0x85: case 0x2028: case 0x2029: - return op_code != OP_VSPACE; + return op_code == OP_NOT_VSPACE; default: - return op_code == OP_VSPACE; + return op_code != OP_NOT_VSPACE; } +#ifdef SUPPORT_UCP + case OP_PROP: + return check_char_prop(next, previous[0], previous[1], FALSE); + + case OP_NOTPROP: + return check_char_prop(next, previous[0], previous[1], TRUE); +#endif + default: return FALSE; } -/* Handle the case when the next item is \d, \s, etc. */ +/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP +is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are +generated only when PCRE_UCP is *not* set, that is, when only ASCII +characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are +replaced by OP_PROP codes when PCRE_UCP is set. */ switch(op_code) { case OP_CHAR: case OP_CHARNC: #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + GETCHARTEST(c, previous); +#else + c = *previous; #endif switch(-next) { case ESC_d: - return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; + return c > 127 || (cd->ctypes[c] & ctype_digit) == 0; case ESC_D: - return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; + return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0; case ESC_s: - return item > 127 || (cd->ctypes[item] & ctype_space) == 0; + return c > 127 || (cd->ctypes[c] & ctype_space) == 0; case ESC_S: - return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; + return c <= 127 && (cd->ctypes[c] & ctype_space) != 0; case ESC_w: - return item > 127 || (cd->ctypes[item] & ctype_word) == 0; + return c > 127 || (cd->ctypes[c] & ctype_word) == 0; case ESC_W: - return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; + return c <= 127 && (cd->ctypes[c] & ctype_word) != 0; case ESC_h: case ESC_H: - switch(item) + switch(c) { case 0x09: case 0x20: @@ -2448,7 +2684,7 @@ switch(op_code) case ESC_v: case ESC_V: - switch(item) + switch(c) { case 0x0a: case 0x0b: @@ -2462,38 +2698,92 @@ switch(op_code) return -next == ESC_v; } + /* When PCRE_UCP is set, these values get generated for \d etc. Find + their substitutions and process them. The result will always be either + -ESC_p or -ESC_P. Then fall through to process those values. */ + +#ifdef SUPPORT_UCP + case ESC_du: + case ESC_DU: + case ESC_wu: + case ESC_WU: + case ESC_su: + case ESC_SU: + { + int temperrorcode = 0; + ptr = substitutes[-next - ESC_DU]; + next = check_escape(&ptr, &temperrorcode, 0, options, FALSE); + if (temperrorcode != 0) return FALSE; + ptr++; /* For compatibility */ + } + /* Fall through */ + + case ESC_p: + case ESC_P: + { + int ptype, pdata, errorcodeptr; + BOOL negated; + + ptr--; /* Make ptr point at the p or P */ + ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr); + if (ptype < 0) return FALSE; + ptr++; /* Point past the final curly ket */ + + /* If the property item is optional, we have to give up. (When generated + from \d etc by PCRE_UCP, this test will have been applied much earlier, + to the original \d etc. At this point, ptr will point to a zero byte. */ + + if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || + strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) + return FALSE; + + /* Do the property check. */ + + return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated); + } +#endif + default: return FALSE; } + /* In principle, support for Unicode properties should be integrated here as + well. It means re-organizing the above code so as to get hold of the property + values before switching on the op-code. However, I wonder how many patterns + combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set, + these op-codes are never generated.) */ + case OP_DIGIT: return next == -ESC_D || next == -ESC_s || next == -ESC_W || - next == -ESC_h || next == -ESC_v; + next == -ESC_h || next == -ESC_v || next == -ESC_R; case OP_NOT_DIGIT: return next == -ESC_d; case OP_WHITESPACE: - return next == -ESC_S || next == -ESC_d || next == -ESC_w; + return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R; case OP_NOT_WHITESPACE: return next == -ESC_s || next == -ESC_h || next == -ESC_v; case OP_HSPACE: - return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w; + return next == -ESC_S || next == -ESC_H || next == -ESC_d || + next == -ESC_w || next == -ESC_v || next == -ESC_R; case OP_NOT_HSPACE: return next == -ESC_h; /* Can't have \S in here because VT matches \S (Perl anomaly) */ + case OP_ANYNL: case OP_VSPACE: return next == -ESC_V || next == -ESC_d || next == -ESC_w; case OP_NOT_VSPACE: - return next == -ESC_v; + return next == -ESC_v || next == -ESC_R; case OP_WORDCHAR: - return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v; + return next == -ESC_W || next == -ESC_s || next == -ESC_h || + next == -ESC_v || next == -ESC_R; case OP_NOT_WORDCHAR: return next == -ESC_w || next == -ESC_d; @@ -2557,6 +2847,7 @@ BOOL inescq = FALSE; BOOL groupsetfirstbyte = FALSE; const uschar *ptr = *ptrptr; const uschar *tempptr; +const uschar *nestptr = NULL; uschar *previous = NULL; uschar *previous_callout = NULL; uschar *save_hwm = NULL; @@ -2627,6 +2918,16 @@ for (;; ptr++) c = *ptr; + /* If we are at the end of a nested substitution, revert to the outer level + string. Nesting only happens one level deep. */ + + if (c == 0 && nestptr != NULL) + { + ptr = nestptr; + nestptr = NULL; + c = *ptr; + } + /* If we are in the pre-compile phase, accumulate the length used for the previous cycle of this loop. */ @@ -2657,7 +2958,7 @@ for (;; ptr++) goto FAILED; } - *lengthptr += code - last_code; + *lengthptr += (int)(code - last_code); DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); /* If "previous" is set and it is not at the start of the work space, move @@ -2739,9 +3040,14 @@ for (;; ptr++) if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0) + ptr++; + while (*ptr != 0) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } + ptr++; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#endif } if (*ptr != 0) continue; @@ -2775,7 +3081,7 @@ for (;; ptr++) *errorcodeptr = ERR20; goto FAILED; } - *lengthptr += code - last_code; /* To include callout length */ + *lengthptr += (int)(code - last_code); /* To include callout length */ DPRINTF((">> end branch\n")); } return TRUE; @@ -2980,7 +3286,7 @@ for (;; ptr++) ptr++; } - posix_class = check_posix_name(ptr, tempptr - ptr); + posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); if (posix_class < 0) { *errorcodeptr = ERR30; @@ -2994,10 +3300,25 @@ for (;; ptr++) if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) posix_class = 0; - /* We build the bit map for the POSIX class in a chunk of local store - because we may be adding and subtracting from it, and we don't want to - subtract bits that may be in the main map already. At the end we or the - result into the bit map that is being built. */ + /* When PCRE_UCP is set, some of the POSIX classes are converted to + different escape sequences that use Unicode properties. */ + +#ifdef SUPPORT_UCP + if ((options & PCRE_UCP) != 0) + { + int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0); + if (posix_substitutes[pc] != NULL) + { + nestptr = tempptr + 1; + ptr = posix_substitutes[pc] - 1; + continue; + } + } +#endif + /* In the non-UCP case, we build the bit map for the POSIX class in a + chunk of local store because we may be adding and subtracting from it, + and we don't want to subtract bits that may be in the main map already. + At the end we or the result into the bit map that is being built. */ posix_class *= 3; @@ -3041,19 +3362,18 @@ for (;; ptr++) /* Backslash may introduce a single character, or it may introduce one of the specials, which just set a flag. The sequence \b is a special - case. Inside a class (and only there) it is treated as backspace. - Elsewhere it marks a word boundary. Other escapes have preset maps ready - to 'or' into the one we are building. We assume they have more than one - character in them, so set class_charcount bigger than one. */ + case. Inside a class (and only there) it is treated as backspace. We + assume that other escapes have more than one character in them, so set + class_charcount bigger than one. Unrecognized escapes fall through and + are either treated as literal characters (by default), or are faulted if + PCRE_EXTRA is set. */ if (c == CHAR_BACKSLASH) { c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ - else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */ - else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */ + if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ else if (-c == ESC_Q) /* Handle start of quoted string */ { if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) @@ -3070,10 +3390,20 @@ for (;; ptr++) register const uschar *cbits = cd->cbits; class_charcount += 2; /* Greater than 1 is what matters */ - /* Save time by not doing this in the pre-compile phase. */ - - if (lengthptr == NULL) switch (-c) + switch (-c) { +#ifdef SUPPORT_UCP + case ESC_du: /* These are the values given for \d etc */ + case ESC_DU: /* when PCRE_UCP is set. We replace the */ + case ESC_wu: /* escape sequence with an appropriate \p */ + case ESC_WU: /* or \P to test Unicode properties instead */ + case ESC_su: /* of the default ASCII testing. */ + case ESC_SU: + nestptr = ptr; + ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */ + class_charcount -= 2; /* Undo! */ + continue; +#endif case ESC_d: for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; continue; @@ -3092,9 +3422,14 @@ for (;; ptr++) for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; continue; + /* Perl 5.004 onwards omits VT from \s, but we must preserve it + if it was previously set by something earlier in the character + class. */ + case ESC_s: - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; - classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ + classbits[0] |= cbits[cbit_space]; + classbits[1] |= cbits[cbit_space+1] & ~0x08; + for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; continue; case ESC_S: @@ -3103,20 +3438,7 @@ for (;; ptr++) classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ continue; - default: /* Not recognized; fall through */ - break; /* Need "default" setting to stop compiler warning. */ - } - - /* In the pre-compile phase, just do the recognition. */ - - else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || - c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; - - /* We need to deal with \H, \h, \V, and \v in both phases because - they use extra memory. */ - - if (-c == ESC_h) - { + case ESC_h: SETBIT(classbits, 0x09); /* VT */ SETBIT(classbits, 0x20); /* SPACE */ SETBIT(classbits, 0xa0); /* NSBP */ @@ -3140,10 +3462,8 @@ for (;; ptr++) } #endif continue; - } - if (-c == ESC_H) - { + case ESC_H: for (c = 0; c < 32; c++) { int x = 0xff; @@ -3185,10 +3505,8 @@ for (;; ptr++) } #endif continue; - } - if (-c == ESC_v) - { + case ESC_v: SETBIT(classbits, 0x0a); /* LF */ SETBIT(classbits, 0x0b); /* VT */ SETBIT(classbits, 0x0c); /* FF */ @@ -3204,10 +3522,8 @@ for (;; ptr++) } #endif continue; - } - if (-c == ESC_V) - { + case ESC_V: for (c = 0; c < 32; c++) { int x = 0xff; @@ -3237,38 +3553,38 @@ for (;; ptr++) } #endif continue; - } - - /* We need to deal with \P and \p in both phases. */ #ifdef SUPPORT_UCP - if (-c == ESC_p || -c == ESC_P) - { - BOOL negated; - int pdata; - int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); - if (ptype < 0) goto FAILED; - class_utf8 = TRUE; - *class_utf8data++ = ((-c == ESC_p) != negated)? - XCL_PROP : XCL_NOTPROP; - *class_utf8data++ = ptype; - *class_utf8data++ = pdata; - class_charcount -= 2; /* Not a < 256 character */ - continue; - } + case ESC_p: + case ESC_P: + { + BOOL negated; + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); + if (ptype < 0) goto FAILED; + class_utf8 = TRUE; + *class_utf8data++ = ((-c == ESC_p) != negated)? + XCL_PROP : XCL_NOTPROP; + *class_utf8data++ = ptype; + *class_utf8data++ = pdata; + class_charcount -= 2; /* Not a < 256 character */ + continue; + } #endif - /* Unrecognized escapes are faulted if PCRE is running in its - strict mode. By default, for compatibility with Perl, they are - treated as literals. */ + /* Unrecognized escapes are faulted if PCRE is running in its + strict mode. By default, for compatibility with Perl, they are + treated as literals. */ - if ((options & PCRE_EXTRA) != 0) - { - *errorcodeptr = ERR7; - goto FAILED; + default: + if ((options & PCRE_EXTRA) != 0) + { + *errorcodeptr = ERR7; + goto FAILED; + } + class_charcount -= 2; /* Undo the default count from above */ + c = *ptr; /* Get the final character and fall through */ + break; } - - class_charcount -= 2; /* Undo the default count from above */ - c = *ptr; /* Get the final character and fall through */ } /* Fall through if we have a single character (c >= 0). This may be @@ -3338,14 +3654,11 @@ for (;; ptr++) d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - /* \b is backspace; \X is literal X; \R is literal R; any other - special means the '-' was literal */ + /* \b is backspace; any other special means the '-' was literal */ if (d < 0) { - if (d == -ESC_b) d = CHAR_BS; - else if (d == -ESC_X) d = CHAR_X; - else if (d == -ESC_R) d = CHAR_R; else + if (d == -ESC_b) d = CHAR_BS; else { ptr = oldptr; goto LONE_SINGLE_CHARACTER; /* A few lines below */ @@ -3511,35 +3824,23 @@ for (;; ptr++) } } - /* Loop until ']' reached. This "while" is the end of the "do" above. */ + /* Loop until ']' reached. This "while" is the end of the "do" far above. + If we are at the end of an internal nested string, revert to the outer + string. */ - while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); + while (((c = *(++ptr)) != 0 || + (nestptr != NULL && + (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) && + (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); - if (c == 0) /* Missing terminating ']' */ + /* Check for missing terminating ']' */ + + if (c == 0) { *errorcodeptr = ERR6; goto FAILED; } - -/* This code has been disabled because it would mean that \s counts as -an explicit \r or \n reference, and that's not really what is wanted. Now -we set the flag only if there is a literal "\r" or "\n" in the class. */ - -#if 0 - /* Remember whether \r or \n are in this class */ - - if (negate_class) - { - if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF; - } - else - { - if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF; - } -#endif - - /* If class_charcount is 1, we saw precisely one character whose value is less than 256. As long as there were no characters >= 128 and there was no use of \p or \P, in other words, no use of any XCLASS features, we can @@ -3603,13 +3904,14 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* If there are characters with values > 255, we have to compile an extended class, with its own opcode, unless there was a negated special - such as \S in the class, because in that case all characters > 255 are in - the class, so any that were explicitly given as well can be ignored. If - (when there are explicit characters > 255 that must be listed) there are no - characters < 256, we can omit the bitmap in the actual compiled code. */ + such as \S in the class, and PCRE_UCP is not set, because in that case all + characters > 255 are in the class, so any that were explicitly given as + well can be ignored. If (when there are explicit characters > 255 that must + be listed) there are no characters < 256, we can omit the bitmap in the + actual compiled code. */ #ifdef SUPPORT_UTF8 - if (class_utf8 && !should_flip_negation) + if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0)) { *class_utf8data++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; @@ -3635,10 +3937,11 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } #endif - /* If there are no characters > 255, set the opcode to OP_CLASS or - OP_NCLASS, depending on whether the whole class was negated and whether - there were negative specials such as \S in the class. Then copy the 32-byte - map into the code vector, negating it if necessary. */ + /* If there are no characters > 255, or they are all to be included or + excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the + whole class was negated and whether there were negative specials such as \S + (non-UCP) in the class. Then copy the 32-byte map into the code vector, + negating it if necessary. */ *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; if (negate_class) @@ -3762,8 +4065,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, - options, cd)) + check_auto_possessive(previous, utf8, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -3784,7 +4086,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ c = previous[1]; if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) + check_auto_possessive(previous, utf8, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -3808,7 +4110,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) + check_auto_possessive(previous, utf8, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -4018,7 +4320,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { register int i; int ketoffset = 0; - int len = code - previous; + int len = (int)(code - previous); uschar *bralink = NULL; /* Repeating a DEFINE group is pointless */ @@ -4039,7 +4341,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { register uschar *ket = previous; do ket += GET(ket, 1); while (*ket != OP_KET); - ketoffset = code - ket; + ketoffset = (int)(code - ket); } /* The case of a zero minimum is special because of the need to stick @@ -4107,7 +4409,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* We chain together the bracket offset fields that have to be filled in later when the ends of the brackets are reached. */ - offset = (bralink == NULL)? 0 : previous - bralink; + offset = (bralink == NULL)? 0 : (int)(previous - bralink); bralink = previous; PUTINC(previous, 0, offset); } @@ -4216,7 +4518,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ { int offset; *code++ = OP_BRA; - offset = (bralink == NULL)? 0 : code - bralink; + offset = (bralink == NULL)? 0 : (int)(code - bralink); bralink = code; PUTINC(code, 0, offset); } @@ -4237,7 +4539,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ while (bralink != NULL) { int oldlinkoffset; - int offset = code - bralink + 1; + int offset = (int)(code - bralink + 1); uschar *bra = code - offset; oldlinkoffset = GET(bra, 1); bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; @@ -4325,7 +4627,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ #endif } - len = code - tempcode; + len = (int)(code - tempcode); if (len > 0) switch (*tempcode) { case OP_STAR: *tempcode = OP_POSSTAR; break; @@ -4384,24 +4686,34 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* First deal with various "verbs" that can be introduced by '*'. */ - if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0) + if (*(++ptr) == CHAR_ASTERISK && + ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':')) { int i, namelen; + int arglen = 0; const char *vn = verbnames; - const uschar *name = ++ptr; + const uschar *name = ptr + 1; + const uschar *arg = NULL; previous = NULL; while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; + namelen = (int)(ptr - name); + if (*ptr == CHAR_COLON) { - *errorcodeptr = ERR59; /* Not supported */ - goto FAILED; + arg = ++ptr; + while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0 + || *ptr == '_') ptr++; + arglen = (int)(ptr - arg); } + if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR60; goto FAILED; } - namelen = ptr - name; + + /* Scan the table of verb names */ + for (i = 0; i < verbcount; i++) { if (namelen == verbs[i].len && @@ -4419,13 +4731,51 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ PUT2INC(code, 0, oc->number); } } - *code++ = verbs[i].op; - break; + + /* Handle the cases with/without an argument */ + + if (arglen == 0) + { + if (verbs[i].op < 0) /* Argument is mandatory */ + { + *errorcodeptr = ERR66; + goto FAILED; + } + *code = verbs[i].op; + if (*code++ == OP_THEN) + { + PUT(code, 0, code - bcptr->current_branch - 1); + code += LINK_SIZE; + } + } + + else + { + if (verbs[i].op_arg < 0) /* Argument is forbidden */ + { + *errorcodeptr = ERR59; + goto FAILED; + } + *code = verbs[i].op_arg; + if (*code++ == OP_THEN_ARG) + { + PUT(code, 0, code - bcptr->current_branch - 1); + code += LINK_SIZE; + } + *code++ = arglen; + memcpy(code, arg, arglen); + code += arglen; + *code++ = 0; + } + + break; /* Found verb, exit loop */ } + vn += verbs[i].len + 1; } - if (i < verbcount) continue; - *errorcodeptr = ERR60; + + if (i < verbcount) continue; /* Successfully handled a verb */ + *errorcodeptr = ERR60; /* Verb not recognized */ goto FAILED; } @@ -4544,7 +4894,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ recno * 10 + *ptr - CHAR_0 : -1; ptr++; } - namelen = ptr - name; + namelen = (int)(ptr - name); if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != CHAR_RIGHT_PARENTHESIS) @@ -4605,7 +4955,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Search the pattern for a forward reference */ else if ((i = find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0)) > 0) + (options & PCRE_EXTENDED) != 0, utf8)) > 0) { PUT2(code, 2+LINK_SIZE, i); code[1+LINK_SIZE]++; @@ -4740,8 +5090,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ goto FAILED; } *code++ = n; - PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */ - PUT(code, LINK_SIZE, 0); /* Default length */ + PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */ + PUT(code, LINK_SIZE, 0); /* Default length */ code += 2 * LINK_SIZE; } previous = NULL; @@ -4774,7 +5124,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ name = ++ptr; while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; - namelen = ptr - name; + namelen = (int)(ptr - name); /* In the pre-compile phase, just do a syntax check. */ @@ -4904,13 +5254,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ NAMED_REF_OR_RECURSE: name = ++ptr; while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; - namelen = ptr - name; + namelen = (int)(ptr - name); - /* In the pre-compile phase, do a syntax check and set a dummy - reference number. */ + /* In the pre-compile phase, do a syntax check. We used to just set + a dummy reference number, because it was not used in the first pass. + However, with the change of recursive back references to be atomic, + we have to look for the number so that this state can be identified, as + otherwise the incorrect length is computed. If it's not a backwards + reference, the dummy number will do. */ if (lengthptr != NULL) { + const uschar *temp; + if (namelen == 0) { *errorcodeptr = ERR62; @@ -4926,7 +5282,22 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ *errorcodeptr = ERR48; goto FAILED; } - recno = 0; + + /* The name table does not exist in the first pass, so we cannot + do a simple search as in the code below. Instead, we have to scan the + pattern to find the number. It is important that we scan it only as + far as we have got because the syntax of named subpatterns has not + been checked for the rest of the pattern, and find_parens() assumes + correct syntax. In any case, it's a waste of resources to scan + further. We stop the scan at the current point by temporarily + adjusting the value of cd->endpattern. */ + + temp = cd->end_pattern; + cd->end_pattern = ptr; + recno = find_parens(cd, name, namelen, + (options & PCRE_EXTENDED) != 0, utf8); + cd->end_pattern = temp; + if (recno < 0) recno = 0; /* Forward ref; set dummy number */ } /* In the real compile, seek the name in the table. We check the name @@ -4951,7 +5322,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } else if ((recno = /* Forward back reference */ find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0)) <= 0) + (options & PCRE_EXTENDED) != 0, utf8)) <= 0) { *errorcodeptr = ERR15; goto FAILED; @@ -5062,7 +5433,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (called == NULL) { if (find_parens(cd, NULL, recno, - (options & PCRE_EXTENDED) != 0) < 0) + (options & PCRE_EXTENDED) != 0, utf8) < 0) { *errorcodeptr = ERR15; goto FAILED; @@ -5073,7 +5444,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ of the group. */ called = cd->start_code + recno; - PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); + PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code)); } /* If not a forward reference, and the subpattern is still open, @@ -5097,7 +5468,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ code += 1 + LINK_SIZE; *code = OP_RECURSE; - PUT(code, 1, called - cd->start_code); + PUT(code, 1, (int)(called - cd->start_code)); code += 1 + LINK_SIZE; *code = OP_KET; @@ -5208,8 +5579,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } /* End of switch for character following (? */ } /* End of (? handling */ - /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, - all unadorned brackets become non-capturing and behave like (?:...) + /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE + is set, all unadorned brackets become non-capturing and behave like (?:...) brackets. */ else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) @@ -5401,11 +5772,12 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* ===================================================================*/ /* Handle metasequences introduced by \. For ones like \d, the ESC_ values - are arranged to be the negation of the corresponding OP_values. For the - back references, the values are ESC_REF plus the reference number. Only - back references and those types that consume a character may be repeated. - We can test for values between ESC_b and ESC_Z for the latter; this may - have to change if any new ones are ever created. */ + are arranged to be the negation of the corresponding OP_values in the + default case when PCRE_UCP is not set. For the back references, the values + are ESC_REF plus the reference number. Only back references and those types + that consume a character may be repeated. We can test for values between + ESC_b and ESC_Z for the latter; this may have to change if any new ones are + ever created. */ case CHAR_BACKSLASH: tempptr = ptr; @@ -5565,12 +5937,24 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ #endif /* For the rest (including \X when Unicode properties are supported), we - can obtain the OP value by negating the escape value. */ + can obtain the OP value by negating the escape value in the default + situation when PCRE_UCP is not set. When it *is* set, we substitute + Unicode property tests. */ else { - previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; - *code++ = -c; +#ifdef SUPPORT_UCP + if (-c >= ESC_DU && -c <= ESC_wu) + { + nestptr = ptr + 1; /* Where to resume */ + ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */ + } + else +#endif + { + previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; + *code++ = -c; + } } continue; } @@ -5902,7 +6286,7 @@ for (;;) { if (lengthptr == NULL) { - int branch_length = code - last_branch; + int branch_length = (int)(code - last_branch); do { int prev_length = GET(last_branch, 1); @@ -5916,7 +6300,7 @@ for (;;) /* Fill in the ket */ *code = OP_KET; - PUT(code, 1, code - start_bracket); + PUT(code, 1, (int)(code - start_bracket)); code += 1 + LINK_SIZE; /* If it was a capturing subpattern, check to see if it contained any @@ -5931,9 +6315,9 @@ for (;;) code - start_bracket); *start_bracket = OP_ONCE; code += 1 + LINK_SIZE; - PUT(start_bracket, 1, code - start_bracket); + PUT(start_bracket, 1, (int)(code - start_bracket)); *code = OP_KET; - PUT(code, 1, code - start_bracket); + PUT(code, 1, (int)(code - start_bracket)); code += 1 + LINK_SIZE; length += 2 + 2*LINK_SIZE; } @@ -5988,7 +6372,7 @@ for (;;) else { *code = OP_ALT; - PUT(code, 1, code - last_branch); + PUT(code, 1, (int)(code - last_branch)); bc.current_branch = last_branch = code; code += 1 + LINK_SIZE; } @@ -6290,8 +6674,6 @@ Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set */ -#ifdef NOT_USED_IN_GLIB - PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) @@ -6299,7 +6681,6 @@ pcre_compile(const char *pattern, int options, const char **errorptr, return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); } -#endif PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION pcre_compile2(const char *pattern, int options, int *errorcodeptr, @@ -6310,7 +6691,7 @@ int length = 1; /* For final END opcode */ int firstbyte, reqbyte, newline; int errorcode = 0; int skipatstart = 0; -BOOL utf8 = (options & PCRE_UTF8) != 0; +BOOL utf8; size_t size; uschar *code; const uschar *codestart; @@ -6380,6 +6761,10 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0) { skipatstart += 7; options |= PCRE_UTF8; continue; } + else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0) + { skipatstart += 6; options |= PCRE_UCP; continue; } + else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0) + { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; } if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } @@ -6404,6 +6789,8 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && else break; } +utf8 = (options & PCRE_UTF8) != 0; + /* Can't support UTF8 unless PCRE has been compiled to include the code. */ #ifdef SUPPORT_UTF8 @@ -6421,6 +6808,16 @@ if (utf8) } #endif +/* Can't support UCP unless PCRE has been compiled to include the code. */ + +#ifndef SUPPORT_UCP +if ((options & PCRE_UCP) != 0) + { + errorcode = ERR67; + goto PCRE_EARLY_ERROR_RETURN; + } +#endif + /* Check validity of \R options. */ switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) @@ -6549,7 +6946,7 @@ regex compiled on a system with 4-byte pointers is run on another with 8-byte pointers. */ re->magic_number = MAGIC_NUMBER; -re->size = size; +re->size = (int)size; re->options = cd->external_options; re->flags = cd->external_flags; re->dummy1 = 0; @@ -6620,7 +7017,7 @@ while (errorcode == 0 && cd->hwm > cworkspace) recno = GET(codestart, offset); groupptr = _pcre_find_bracket(codestart, utf8, recno); if (groupptr == NULL) errorcode = ERR53; - else PUT(((uschar *)codestart), offset, groupptr - codestart); + else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart)); } /* Give an error if there's back reference to a non-existent capturing @@ -6675,7 +7072,7 @@ if (errorcode != 0) { (pcre_free)(re); PCRE_EARLY_ERROR_RETURN: - *erroroffset = ptr - (const uschar *)pattern; + *erroroffset = (int)(ptr - (const uschar *)pattern); PCRE_EARLY_ERROR_RETURN2: *errorptr = find_error_text(errorcode); if (errorcodeptr != NULL) *errorcodeptr = errorcode; diff --git a/glib/pcre/pcre_dfa_exec.c b/glib/pcre/pcre_dfa_exec.c index c241f5b05..4d61a325d 100644 --- a/glib/pcre/pcre_dfa_exec.c +++ b/glib/pcre/pcre_dfa_exec.c @@ -106,7 +106,7 @@ never stored, so we push them well clear of the normal opcodes. */ /* This table identifies those opcodes that are followed immediately by a -character that is to be tested in some way. This makes is possible to +character that is to be tested in some way. This makes it possible to centralize the loading of these characters. In the case of Type * etc, the "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a small value. Non-zero values in the table are the offsets from the opcode where @@ -161,8 +161,9 @@ static const uschar coptable[] = { 0, 0, /* RREF, NRREF */ 0, /* DEF */ 0, 0, /* BRAZERO, BRAMINZERO */ - 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ - 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ + 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */ + 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */ + 0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */ }; /* This table identifies those opcodes that inspect a character. It is used to @@ -218,8 +219,9 @@ static const uschar poptable[] = { 0, 0, /* RREF, NRREF */ 0, /* DEF */ 0, 0, /* BRAZERO, BRAMINZERO */ - 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ - 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ + 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */ + 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */ + 0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */ }; /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, @@ -473,7 +475,7 @@ if (*first_op == OP_REVERSE) { gone_back = (current_subject - max_back < start_subject)? - current_subject - start_subject : max_back; + (int)(current_subject - start_subject) : max_back; current_subject -= gone_back; } @@ -490,7 +492,7 @@ if (*first_op == OP_REVERSE) int back = GET(end_code, 2+LINK_SIZE); if (back <= gone_back) { - int bstate = end_code - start_code + 2 + 2*LINK_SIZE; + int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE); ADD_NEW_DATA(-bstate, 0, gone_back - back); } end_code += GET(end_code, 1); @@ -526,7 +528,7 @@ else ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); do { - ADD_NEW(end_code - start_code + length, 0); + ADD_NEW((int)(end_code - start_code + length), 0); end_code += GET(end_code, 1); length = 1 + LINK_SIZE; } @@ -753,8 +755,8 @@ for (;;) if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); if (offsetcount >= 2) { - offsets[0] = current_subject - start_subject; - offsets[1] = ptr - start_subject; + offsets[0] = (int)(current_subject - start_subject); + offsets[1] = (int)(ptr - start_subject); DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, offsets[1] - offsets[0], current_subject)); } @@ -776,7 +778,7 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_ALT: do { code += GET(code, 1); } while (*code == OP_ALT); - ADD_ACTIVE(code - start_code, 0); + ADD_ACTIVE((int)(code - start_code), 0); break; /*-----------------------------------------------------------------*/ @@ -784,7 +786,7 @@ for (;;) case OP_SBRA: do { - ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); code += GET(code, 1); } while (*code == OP_ALT); @@ -793,11 +795,11 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_CBRA: case OP_SCBRA: - ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0); code += GET(code, 1); while (*code == OP_ALT) { - ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); code += GET(code, 1); } break; @@ -808,14 +810,14 @@ for (;;) ADD_ACTIVE(state_offset + 1, 0); code += 1 + GET(code, 2); while (*code == OP_ALT) code += GET(code, 1); - ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); break; /*-----------------------------------------------------------------*/ case OP_SKIPZERO: code += 1 + GET(code, 2); while (*code == OP_ALT) code += GET(code, 1); - ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); + ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); break; /*-----------------------------------------------------------------*/ @@ -829,7 +831,12 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_EOD: - if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); } + if (ptr >= end_subject) + { + if ((md->moptions & PCRE_PARTIAL_HARD) != 0) + could_continue = TRUE; + else { ADD_ACTIVE(state_offset + 1, 0); } + } break; /*-----------------------------------------------------------------*/ @@ -869,7 +876,9 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_EODN: - if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) + if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) + could_continue = TRUE; + else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) { ADD_ACTIVE(state_offset + 1, 0); } break; @@ -877,7 +886,9 @@ for (;;) case OP_DOLL: if ((md->moptions & PCRE_NOTEOL) == 0) { - if (clen == 0 || + if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) + could_continue = TRUE; + else if (clen == 0 || ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) )) @@ -920,13 +931,37 @@ for (;;) if (utf8) BACKCHAR(temp); #endif GETCHARTEST(d, temp); +#ifdef SUPPORT_UCP + if ((md->poptions & PCRE_UCP) != 0) + { + if (d == '_') left_word = TRUE; else + { + int cat = UCD_CATEGORY(d); + left_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif left_word = d < 256 && (ctypes[d] & ctype_word) != 0; } - else left_word = 0; + else left_word = FALSE; if (clen > 0) + { +#ifdef SUPPORT_UCP + if ((md->poptions & PCRE_UCP) != 0) + { + if (c == '_') right_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + right_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif right_word = c < 256 && (ctypes[c] & ctype_word) != 0; - else right_word = 0; + } + else right_word = FALSE; if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) { ADD_ACTIVE(state_offset + 1, 0); } @@ -953,7 +988,8 @@ for (;;) break; case PT_LAMP: - OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + OK = chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt; break; case PT_GC: @@ -968,6 +1004,30 @@ for (;;) OK = UCD_SCRIPT(c) == code[2]; break; + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = _pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ + OK = _pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ + OK = _pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: + OK = _pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1122,7 +1182,8 @@ for (;;) break; case PT_LAMP: - OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + OK = chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt; break; case PT_GC: @@ -1137,6 +1198,30 @@ for (;;) OK = UCD_SCRIPT(c) == code[3]; break; + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = _pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ + OK = _pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ + OK = _pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: + OK = _pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1344,7 +1429,8 @@ for (;;) break; case PT_LAMP: - OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + OK = chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt; break; case PT_GC: @@ -1359,6 +1445,30 @@ for (;;) OK = UCD_SCRIPT(c) == code[3]; break; + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = _pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ + OK = _pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ + OK = _pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: + OK = _pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -1591,7 +1701,8 @@ for (;;) break; case PT_LAMP: - OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + OK = chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt; break; case PT_GC: @@ -1606,6 +1717,30 @@ for (;;) OK = UCD_SCRIPT(c) == code[5]; break; + /* These are specials for combination cases. */ + + case PT_ALNUM: + OK = _pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ + OK = _pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ + OK = _pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: + OK = _pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + /* Should never occur, but keep compilers from grumbling. */ default: @@ -2233,7 +2368,7 @@ for (;;) points to the byte after the end of the class. If there is a quantifier, this is where it will be. */ - next_state_offset = ecode - start_code; + next_state_offset = (int)(ecode - start_code); switch (*ecode) { @@ -2304,7 +2439,7 @@ for (;;) md, /* static match data */ code, /* this subexpression's code */ ptr, /* where we currently are */ - ptr - start_subject, /* start offset */ + (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ @@ -2315,7 +2450,7 @@ for (;;) if (rc == PCRE_ERROR_DFA_UITEM) return rc; if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) - { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } + { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } } break; @@ -2342,9 +2477,9 @@ for (;;) cb.callout_number = code[LINK_SIZE+2]; cb.offset_vector = offsets; cb.subject = (PCRE_SPTR)start_subject; - cb.subject_length = end_subject - start_subject; - cb.start_match = current_subject - start_subject; - cb.current_position = ptr - start_subject; + cb.subject_length = (int)(end_subject - start_subject); + cb.start_match = (int)(current_subject - start_subject); + cb.current_position = (int)(ptr - start_subject); cb.pattern_position = GET(code, LINK_SIZE + 3); cb.next_item_length = GET(code, 3 + 2*LINK_SIZE); cb.capture_top = 1; @@ -2395,7 +2530,7 @@ for (;;) md, /* fixed match data */ asscode, /* this subexpression's code */ ptr, /* where we currently are */ - ptr - start_subject, /* start offset */ + (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ @@ -2407,7 +2542,7 @@ for (;;) if (rc == PCRE_ERROR_DFA_UITEM) return rc; if ((rc >= 0) == (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) - { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } + { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } } @@ -2428,7 +2563,7 @@ for (;;) md, /* fixed match data */ start_code + GET(code, 1), /* this subexpression's code */ ptr, /* where we currently are */ - ptr - start_subject, /* start offset */ + (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ @@ -2480,7 +2615,7 @@ for (;;) md, /* fixed match data */ code, /* this subexpression's code */ ptr, /* where we currently are */ - ptr - start_subject, /* start offset */ + (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ @@ -2497,7 +2632,8 @@ for (;;) do { end_subpattern += GET(end_subpattern, 1); } while (*end_subpattern == OP_ALT); - next_state_offset = end_subpattern - start_code + LINK_SIZE + 1; + next_state_offset = + (int)(end_subpattern - start_code + LINK_SIZE + 1); /* If the end of this subpattern is KETRMAX or KETRMIN, we must arrange for the repeat state also to be added to the relevant list. @@ -2505,7 +2641,7 @@ for (;;) repeat_state_offset = (*end_subpattern == OP_KETRMAX || *end_subpattern == OP_KETRMIN)? - end_subpattern - start_code - GET(end_subpattern, 1) : -1; + (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; /* If we have matched an empty string, add the next state at the current character pointer. This is important so that the duplicate @@ -2569,9 +2705,9 @@ for (;;) cb.callout_number = code[1]; cb.offset_vector = offsets; cb.subject = (PCRE_SPTR)start_subject; - cb.subject_length = end_subject - start_subject; - cb.start_match = current_subject - start_subject; - cb.current_position = ptr - start_subject; + cb.subject_length = (int)(end_subject - start_subject); + cb.start_match = (int)(current_subject - start_subject); + cb.current_position = (int)(ptr - start_subject); cb.pattern_position = GET(code, 2); cb.next_item_length = GET(code, 2 + LINK_SIZE); cb.capture_top = 1; @@ -2617,13 +2753,13 @@ for (;;) ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */ match_count < 0) /* no matches */ ) && /* And... */ - ptr >= end_subject && /* Reached end of subject */ - ptr > current_subject) /* Matched non-empty string */ + ptr >= end_subject && /* Reached end of subject */ + ptr > md->start_used_ptr) /* Inspected non-empty string */ { if (offsetcount >= 2) { - offsets[0] = md->start_used_ptr - start_subject; - offsets[1] = end_subject - start_subject; + offsets[0] = (int)(md->start_used_ptr - start_subject); + offsets[1] = (int)(end_subject - start_subject); } match_count = PCRE_ERROR_PARTIAL; } @@ -2708,6 +2844,7 @@ if (re == NULL || subject == NULL || workspace == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; +if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; /* We need to find the pointer to any study data before we test for byte flipping, so we scan the extra_data block first. This may set two fields in the @@ -2826,16 +2963,14 @@ back the character offset. */ #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { - if (_pcre_valid_utf8((uschar *)subject, length) >= 0) - return PCRE_ERROR_BADUTF8; + int tb; + if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0) + return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)? + PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; if (start_offset > 0 && start_offset < length) { - int tb = ((uschar *)subject)[start_offset]; - if (tb > 127) - { - tb &= 0xc0; - if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; - } + tb = ((USPTR)subject)[start_offset] & 0xc0; + if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET; } } #endif @@ -2922,9 +3057,11 @@ for (;;) /* There are some optimizations that avoid running the match if a known starting point is not found. However, there is an option that disables - these, for testing and for ensuring that all callouts do actually occur. */ + these, for testing and for ensuring that all callouts do actually occur. + The option can be set in the regex by (*NO_START_OPT) or passed in + match-time options. */ - if ((options & PCRE_NO_START_OPTIMIZE) == 0) + if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) { /* Advance to a known first byte. */ @@ -2982,8 +3119,16 @@ for (;;) while (current_subject < end_subject) { register unsigned int c = *current_subject; - if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) + { + current_subject++; +#ifdef SUPPORT_UTF8 + if (utf8) + while(current_subject < end_subject && + (*current_subject & 0xc0) == 0x80) current_subject++; +#endif + } + else break; } } } diff --git a/glib/pcre/pcre_exec.c b/glib/pcre/pcre_exec.c index 0a44fcced..569207cc3 100644 --- a/glib/pcre/pcre_exec.c +++ b/glib/pcre/pcre_exec.c @@ -71,10 +71,20 @@ defined PCRE_ERROR_xxx codes, which are all negative. */ /* Special internal returns from the match() function. Make them sufficiently negative to avoid the external error codes. */ -#define MATCH_COMMIT (-999) -#define MATCH_PRUNE (-998) -#define MATCH_SKIP (-997) -#define MATCH_THEN (-996) +#define MATCH_ACCEPT (-999) +#define MATCH_COMMIT (-998) +#define MATCH_PRUNE (-997) +#define MATCH_SKIP (-996) +#define MATCH_SKIP_ARG (-995) +#define MATCH_THEN (-994) + +/* This is a convenience macro for code that occurs many times. */ + +#define MRRETURN(ra) \ + { \ + md->mark = markptr; \ + RRETURN(ra); \ + } /* Maximum number of ints of offset to save on the stack for recursive calls. If the offset vector is bigger, malloc is used. This should be a multiple of 3, @@ -245,7 +255,8 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, - RM51, RM52, RM53, RM54 }; + RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, + RM61, RM62 }; /* These versions of the macros use the stack, as normal. There are debugging versions and production versions. Note that the "rw" argument of RMATCH isn't @@ -283,7 +294,8 @@ argument of match(), which never changes. */ #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\ {\ - heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\ + heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\ + if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ frame->Xwhere = rw; \ newframe->Xeptr = ra;\ newframe->Xecode = rb;\ @@ -304,9 +316,9 @@ argument of match(), which never changes. */ #define RRETURN(ra)\ {\ - heapframe *newframe = frame;\ - frame = newframe->Xprevframe;\ - (pcre_stack_free)(newframe);\ + heapframe *oldframe = frame;\ + frame = oldframe->Xprevframe;\ + (pcre_stack_free)(oldframe);\ if (frame != NULL)\ {\ rrc = ra;\ @@ -410,17 +422,18 @@ immediately. The second one is used when we already know we are past the end of the subject. */ #define CHECK_PARTIAL()\ - if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\ - {\ - md->hitend = TRUE;\ - if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\ + if (md->partial != 0 && eptr >= md->end_subject && \ + eptr > md->start_used_ptr) \ + { \ + md->hitend = TRUE; \ + if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \ } #define SCHECK_PARTIAL()\ - if (md->partial != 0 && eptr > mstart)\ - {\ - md->hitend = TRUE;\ - if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\ + if (md->partial != 0 && eptr > md->start_used_ptr) \ + { \ + md->hitend = TRUE; \ + if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \ } @@ -448,13 +461,14 @@ Arguments: Returns: MATCH_MATCH if matched ) these values are >= 0 MATCH_NOMATCH if failed to match ) + a negative MATCH_xxx value for PRUNE, SKIP, etc a negative PCRE_ERROR_xxx value if aborted by an error condition (e.g. stopped by repeated call or recursion limit) */ static int -match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR - markptr, int offset_top, match_data *md, unsigned long int ims, +match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, + const uschar *markptr, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth) { /* These variables do not need to be preserved over recursion in this function, @@ -475,7 +489,8 @@ heap storage. Set up the top-level frame here; others are obtained from the heap whenever RMATCH() does a "recursion". See the macro definitions above. */ #ifdef NO_RECURSE -heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); +heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe)); +if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY); frame->Xprevframe = NULL; /* Marks the top level */ /* Copy in the original argument variables */ @@ -671,32 +686,99 @@ for (;;) switch(op) { + case OP_MARK: + markptr = ecode + 2; + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + ims, eptrb, flags, RM55); + + /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an + argument, and we must check whether that argument matches this MARK's + argument. It is passed back in md->start_match_ptr (an overloading of that + variable). If it does match, we reset that variable to the current subject + position and return MATCH_SKIP. Otherwise, pass back the return code + unaltered. */ + + if (rrc == MATCH_SKIP_ARG && + strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0) + { + md->start_match_ptr = eptr; + RRETURN(MATCH_SKIP); + } + + if (md->mark == NULL) md->mark = markptr; + RRETURN(rrc); + case OP_FAIL: - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); + + /* COMMIT overrides PRUNE, SKIP, and THEN */ + + case OP_COMMIT: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM52); + if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && + rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG && + rrc != MATCH_THEN) + RRETURN(rrc); + MRRETURN(MATCH_COMMIT); + + /* PRUNE overrides THEN */ case OP_PRUNE: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM51); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + MRRETURN(MATCH_PRUNE); + + case OP_PRUNE_ARG: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + ims, eptrb, flags, RM56); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + md->mark = ecode + 2; RRETURN(MATCH_PRUNE); - case OP_COMMIT: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, - ims, eptrb, flags, RM52); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RRETURN(MATCH_COMMIT); + /* SKIP overrides PRUNE and THEN */ case OP_SKIP: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM53); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) + RRETURN(rrc); md->start_match_ptr = eptr; /* Pass back current position */ - RRETURN(MATCH_SKIP); + MRRETURN(MATCH_SKIP); + + case OP_SKIP_ARG: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + ims, eptrb, flags, RM57); + if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) + RRETURN(rrc); + + /* Pass back the current skip name by overloading md->start_match_ptr and + returning the special MATCH_SKIP_ARG return code. This will either be + caught by a matching MARK, or get to the top, where it is treated the same + as PRUNE. */ + + md->start_match_ptr = ecode + 2; + RRETURN(MATCH_SKIP_ARG); + + /* For THEN (and THEN_ARG) we pass back the address of the bracket or + the alt that is at the start of the current branch. This makes it possible + to skip back past alternatives that precede the THEN within the current + branch. */ case OP_THEN: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM54); if (rrc != MATCH_NOMATCH) RRETURN(rrc); + md->start_match_ptr = ecode - GET(ecode, 1); + MRRETURN(MATCH_THEN); + + case OP_THEN_ARG: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE], + offset_top, md, ims, eptrb, flags, RM58); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + md->start_match_ptr = ecode - GET(ecode, 1); + md->mark = ecode + LINK_SIZE + 2; RRETURN(MATCH_THEN); /* Handle a capturing bracket. If there is space in the offset vector, save @@ -733,14 +815,17 @@ for (;;) save_capture_last = md->capture_last; DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); - md->offset_vector[md->offset_end - number] = eptr - md->start_subject; + md->offset_vector[md->offset_end - number] = + (int)(eptr - md->start_subject); flags = (op == OP_SCBRA)? match_cbegroup : 0; do { RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM1); - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && + (rrc != MATCH_THEN || md->start_match_ptr != ecode)) + RRETURN(rrc); md->capture_last = save_capture_last; ecode += GET(ecode, 1); } @@ -752,6 +837,7 @@ for (;;) md->offset_vector[offset+1] = save_offset2; md->offset_vector[md->offset_end - number] = save_offset3; + if (rrc != MATCH_THEN) md->mark = markptr; RRETURN(MATCH_NOMATCH); } @@ -791,6 +877,7 @@ for (;;) RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM48); + if (rrc == MATCH_NOMATCH) md->mark = markptr; RRETURN(rrc); } @@ -799,7 +886,9 @@ for (;;) RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM2); - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && + (rrc != MATCH_THEN || md->start_match_ptr != ecode)) + RRETURN(rrc); ecode += GET(ecode, 1); } /* Control never reaches here. */ @@ -826,15 +915,15 @@ for (;;) cb.callout_number = ecode[LINK_SIZE+2]; cb.offset_vector = md->offset_vector; cb.subject = (PCRE_SPTR)md->start_subject; - cb.subject_length = md->end_subject - md->start_subject; - cb.start_match = mstart - md->start_subject; - cb.current_position = eptr - md->start_subject; + cb.subject_length = (int)(md->end_subject - md->start_subject); + cb.start_match = (int)(mstart - md->start_subject); + cb.current_position = (int)(eptr - md->start_subject); cb.pattern_position = GET(ecode, LINK_SIZE + 3); cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); cb.capture_top = offset_top/2; cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; - if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } ecode += _pcre_OP_lengths[OP_CALLOUT]; @@ -1000,7 +1089,8 @@ for (;;) ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); while (*ecode == OP_ALT) ecode += GET(ecode, 1); } - else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) + else if (rrc != MATCH_NOMATCH && + (rrc != MATCH_THEN || md->start_match_ptr != ecode)) { RRETURN(rrc); /* Need braces because of following else */ } @@ -1054,7 +1144,7 @@ for (;;) { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; - md->offset_vector[offset+1] = eptr - md->start_subject; + md->offset_vector[offset+1] = (int)(eptr - md->start_subject); if (offset_top <= offset) offset_top = offset + 2; } ecode += 3; @@ -1089,14 +1179,19 @@ for (;;) (md->notempty || (md->notempty_atstart && mstart == md->start_subject + md->start_offset))) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); /* Otherwise, we have a match. */ md->end_match_ptr = eptr; /* Record where we ended */ md->end_offset_top = offset_top; /* and how many extracts were taken */ md->start_match_ptr = mstart; /* and the start (\K can modify) */ - RRETURN(MATCH_MATCH); + + /* For some reason, the macros don't work properly if an expression is + given as the argument to MRRETURN when the heap is in use. */ + + rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; + MRRETURN(rrc); /* Change option settings */ @@ -1118,16 +1213,18 @@ for (;;) { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM4); - if (rrc == MATCH_MATCH) + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { mstart = md->start_match_ptr; /* In case \K reset it */ break; } - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && + (rrc != MATCH_THEN || md->start_match_ptr != ecode)) + RRETURN(rrc); ecode += GET(ecode, 1); } while (*ecode == OP_ALT); - if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); + if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH); /* If checking an assertion for a condition, return MATCH_MATCH. */ @@ -1151,13 +1248,15 @@ for (;;) { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM5); - if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH); if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) { do ecode += GET(ecode,1); while (*ecode == OP_ALT); break; } - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && + (rrc != MATCH_THEN || md->start_match_ptr != ecode)) + RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); @@ -1180,7 +1279,7 @@ for (;;) while (i-- > 0) { eptr--; - if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); + if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH); BACKCHAR(eptr); } } @@ -1191,7 +1290,7 @@ for (;;) { eptr -= GET(ecode, 1); - if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); + if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH); } /* Save the earliest consulted character, then skip to next op code */ @@ -1212,15 +1311,15 @@ for (;;) cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; cb.subject = (PCRE_SPTR)md->start_subject; - cb.subject_length = md->end_subject - md->start_subject; - cb.start_match = mstart - md->start_subject; - cb.current_position = eptr - md->start_subject; + cb.subject_length = (int)(md->end_subject - md->start_subject); + cb.start_match = (int)(mstart - md->start_subject); + cb.current_position = (int)(eptr - md->start_subject); cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; - if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } ecode += 2 + 2*LINK_SIZE; @@ -1286,15 +1385,16 @@ for (;;) { RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, md, ims, eptrb, flags, RM6); - if (rrc == MATCH_MATCH) + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { DPRINTF(("Recursion matched\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); - RRETURN(MATCH_MATCH); + MRRETURN(MATCH_MATCH); } - else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) + else if (rrc != MATCH_NOMATCH && + (rrc != MATCH_THEN || md->start_match_ptr != ecode)) { DPRINTF(("Recursion gave error %d\n", rrc)); if (new_recursive.offset_save != stacksave) @@ -1313,7 +1413,7 @@ for (;;) md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never reaches here */ @@ -1332,12 +1432,14 @@ for (;;) do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); - if (rrc == MATCH_MATCH) + if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ { mstart = md->start_match_ptr; break; } - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && + (rrc != MATCH_THEN || md->start_match_ptr != ecode)) + RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); @@ -1467,7 +1569,7 @@ for (;;) md->end_match_ptr = eptr; /* For ONCE */ md->end_offset_top = offset_top; md->start_match_ptr = mstart; - RRETURN(MATCH_MATCH); + MRRETURN(MATCH_MATCH); } /* For capturing groups we have to check the group number back at the start @@ -1491,7 +1593,7 @@ for (;;) { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; - md->offset_vector[offset+1] = eptr - md->start_subject; + md->offset_vector[offset+1] = (int)(eptr - md->start_subject); if (offset_top <= offset) offset_top = offset + 2; } @@ -1562,12 +1664,12 @@ for (;;) /* Start of subject unless notbol, or after internal newline if multiline */ case OP_CIRC: - if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); + if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH); if ((ims & PCRE_MULTILINE) != 0) { if (eptr != md->start_subject && (eptr == md->end_subject || !WAS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; } @@ -1576,14 +1678,14 @@ for (;;) /* Start of subject assertion */ case OP_SOD: - if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); + if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH); ecode++; break; /* Start of match assertion */ case OP_SOM: - if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); + if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1601,39 +1703,42 @@ for (;;) if ((ims & PCRE_MULTILINE) != 0) { if (eptr < md->end_subject) - { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } + { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); } else - { if (md->noteol) RRETURN(MATCH_NOMATCH); } + { + if (md->noteol) MRRETURN(MATCH_NOMATCH); + SCHECK_PARTIAL(); + } ecode++; break; } - else + else /* Not multiline */ { - if (md->noteol) RRETURN(MATCH_NOMATCH); - if (!md->endonly) - { - if (eptr != md->end_subject && - (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - } + if (md->noteol) MRRETURN(MATCH_NOMATCH); + if (!md->endonly) goto ASSERT_NL_OR_EOS; } + /* ... else fall through for endonly */ /* End of subject assertion (\z) */ case OP_EOD: - if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); + if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH); + SCHECK_PARTIAL(); ecode++; break; /* End of subject or ending \n assertion (\Z) */ case OP_EODN: - if (eptr != md->end_subject && + ASSERT_NL_OR_EOS: + if (eptr < md->end_subject && (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); + + /* Either at end of string or \n before end. */ + + SCHECK_PARTIAL(); ecode++; break; @@ -1651,14 +1756,30 @@ for (;;) #ifdef SUPPORT_UTF8 if (utf8) { + /* Get status of previous character */ + if (eptr == md->start_subject) prev_is_word = FALSE; else { USPTR lastptr = eptr - 1; while((*lastptr & 0xc0) == 0x80) lastptr--; if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; GETCHAR(c, lastptr); +#ifdef SUPPORT_UCP + if (md->use_ucp) + { + if (c == '_') prev_is_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + prev_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } + + /* Get status of next character */ + if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -1667,47 +1788,89 @@ for (;;) else { GETCHAR(c, eptr); +#ifdef SUPPORT_UCP + if (md->use_ucp) + { + if (c == '_') cur_is_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + cur_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } } else #endif - /* Not in UTF-8 mode */ + /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for + consistency with the behaviour of \w we do use it in this case. */ { + /* Get status of previous character */ + if (eptr == md->start_subject) prev_is_word = FALSE; else { if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; +#ifdef SUPPORT_UCP + if (md->use_ucp) + { + c = eptr[-1]; + if (c == '_') prev_is_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + prev_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0); } + + /* Get status of next character */ + if (eptr >= md->end_subject) { SCHECK_PARTIAL(); cur_is_word = FALSE; } - else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0); + else +#ifdef SUPPORT_UCP + if (md->use_ucp) + { + c = *eptr; + if (c == '_') cur_is_word = TRUE; else + { + int cat = UCD_CATEGORY(c); + cur_is_word = (cat == ucp_L || cat == ucp_N); + } + } + else +#endif + cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0); } /* Now see if the situation is what we want */ if ((*ecode++ == OP_WORD_BOUNDARY)? cur_is_word == prev_is_word : cur_is_word != prev_is_word) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; /* Match a single character type; inline for speed */ case OP_ANY: - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); /* Fall through */ case OP_ALLANY: if (eptr++ >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; ecode++; @@ -1720,7 +1883,7 @@ for (;;) if (eptr++ >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } ecode++; break; @@ -1729,7 +1892,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1738,7 +1901,7 @@ for (;;) #endif (md->ctypes[c] & ctype_digit) != 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1746,7 +1909,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1755,7 +1918,7 @@ for (;;) #endif (md->ctypes[c] & ctype_digit) == 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1763,7 +1926,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1772,7 +1935,7 @@ for (;;) #endif (md->ctypes[c] & ctype_space) != 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1780,7 +1943,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1789,7 +1952,7 @@ for (;;) #endif (md->ctypes[c] & ctype_space) == 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1797,7 +1960,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1806,7 +1969,7 @@ for (;;) #endif (md->ctypes[c] & ctype_word) != 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1814,7 +1977,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); if ( @@ -1823,7 +1986,7 @@ for (;;) #endif (md->ctypes[c] & ctype_word) == 0 ) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); ecode++; break; @@ -1831,12 +1994,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -1849,7 +2012,7 @@ for (;;) case 0x0085: case 0x2028: case 0x2029: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } ecode++; @@ -1859,7 +2022,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) @@ -1884,7 +2047,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } ecode++; break; @@ -1893,12 +2056,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -1927,7 +2090,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) @@ -1940,7 +2103,7 @@ for (;;) case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } ecode++; break; @@ -1949,12 +2112,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -1976,39 +2139,72 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); { int chartype = UCD_CHARTYPE(c); + switch(ecode[1]) { case PT_ANY: - if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH); break; case PT_LAMP: if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; + MRRETURN(MATCH_NOMATCH); + break; case PT_GC: if ((ecode[2] != _pcre_ucp_gentype[chartype]) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case PT_PC: if ((ecode[2] != chartype) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case PT_SC: if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); + break; + + /* These are specials */ + + case PT_ALNUM: + if ((_pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N) == (op == OP_NOTPROP)) + MRRETURN(MATCH_NOMATCH); break; + case PT_SPACE: /* Perl space */ + if ((_pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == (op == OP_NOTPROP)) + MRRETURN(MATCH_NOMATCH); + break; + + case PT_PXSPACE: /* POSIX space */ + if ((_pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) + == (op == OP_NOTPROP)) + MRRETURN(MATCH_NOMATCH); + break; + + case PT_WORD: + if ((_pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N || + c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) + MRRETURN(MATCH_NOMATCH); + break; + + /* This should never occur */ + default: RRETURN(PCRE_ERROR_INTERNAL); } @@ -2024,12 +2220,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); { int category = UCD_CATEGORY(c); - if (category == ucp_M) RRETURN(MATCH_NOMATCH); + if (category == ucp_M) MRRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; @@ -2074,7 +2270,7 @@ for (;;) referenced subpattern. */ if (offset >= offset_top || md->offset_vector[offset] < 0) - length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1; + length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1); else length = md->offset_vector[offset+1] - md->offset_vector[offset]; @@ -2108,7 +2304,7 @@ for (;;) if (!match_ref(offset, eptr, length, md, ims)) { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += length; continue; /* With the main loop */ @@ -2128,7 +2324,7 @@ for (;;) if (!match_ref(offset, eptr, length, md, ims)) { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += length; } @@ -2146,11 +2342,11 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (!match_ref(offset, eptr, length, md, ims)) { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += length; } @@ -2177,7 +2373,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr -= length; } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2239,16 +2435,16 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); if (c > 255) { - if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); + if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); } else { - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } } @@ -2261,10 +2457,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } @@ -2286,20 +2482,20 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); if (c > 255) { - if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); + if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); } else { - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } } @@ -2311,14 +2507,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2384,7 +2580,7 @@ for (;;) } } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2436,10 +2632,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); + if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH); } /* If max == min we can continue with the main loop without the @@ -2456,14 +2652,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); + if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2492,7 +2688,7 @@ for (;;) if (eptr-- == pp) break; /* Stop if tried at original pos */ if (utf8) BACKCHAR(eptr); } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -2511,9 +2707,9 @@ for (;;) if (length > md->end_subject - eptr) { CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); + while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH); } else #endif @@ -2523,9 +2719,9 @@ for (;;) if (md->end_subject - eptr < 1) { SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); + if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH); ecode += 2; } break; @@ -2543,7 +2739,7 @@ for (;;) if (length > md->end_subject - eptr) { CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* If the pattern character's value is < 128, we have only one byte, and @@ -2551,7 +2747,7 @@ for (;;) if (fc < 128) { - if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } /* Otherwise we must pick up the subject character */ @@ -2570,7 +2766,7 @@ for (;;) #ifdef SUPPORT_UCP if (dc != UCD_OTHERCASE(fc)) #endif - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } } @@ -2582,9 +2778,9 @@ for (;;) if (md->end_subject - eptr < 1) { SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); ecode += 2; } break; @@ -2678,7 +2874,7 @@ for (;;) else { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } @@ -2690,7 +2886,7 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr <= md->end_subject - length && memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP @@ -2701,7 +2897,7 @@ for (;;) else { CHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2732,7 +2928,7 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr == pp) { RRETURN(MATCH_NOMATCH); } + if (eptr == pp) { MRRETURN(MATCH_NOMATCH); } #ifdef SUPPORT_UCP eptr--; BACKCHAR(eptr); @@ -2775,9 +2971,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } if (min == max) continue; if (minimize) @@ -2786,13 +2982,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2818,7 +3014,7 @@ for (;;) eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2832,9 +3028,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (fc != *eptr++) MRRETURN(MATCH_NOMATCH); } if (min == max) continue; @@ -2845,13 +3041,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc != *eptr++) RRETURN(MATCH_NOMATCH); + if (fc != *eptr++) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2876,7 +3072,7 @@ for (;;) eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2888,7 +3084,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } ecode++; GETCHARINCTEST(c, eptr); @@ -2898,11 +3094,11 @@ for (;;) if (c < 256) #endif c = md->lcc[c]; - if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); + if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH); } else { - if (*ecode++ == c) RRETURN(MATCH_NOMATCH); + if (*ecode++ == c) MRRETURN(MATCH_NOMATCH); } break; @@ -2996,11 +3192,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d) MRRETURN(MATCH_NOMATCH); } } else @@ -3013,9 +3209,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } } @@ -3032,15 +3228,15 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d) MRRETURN(MATCH_NOMATCH); } } else @@ -3051,13 +3247,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -3119,7 +3315,7 @@ for (;;) } } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -3138,10 +3334,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d) MRRETURN(MATCH_NOMATCH); } } else @@ -3153,9 +3349,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc == *eptr++) RRETURN(MATCH_NOMATCH); + if (fc == *eptr++) MRRETURN(MATCH_NOMATCH); } } @@ -3172,14 +3368,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d) MRRETURN(MATCH_NOMATCH); } } else @@ -3190,13 +3386,13 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (fc == *eptr++) RRETURN(MATCH_NOMATCH); + if (fc == *eptr++) MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -3257,7 +3453,7 @@ for (;;) } } - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -3351,13 +3547,13 @@ for (;;) switch(prop_type) { case PT_ANY: - if (prop_fail_result) RRETURN(MATCH_NOMATCH); + if (prop_fail_result) MRRETURN(MATCH_NOMATCH); for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); } @@ -3369,14 +3565,14 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3386,12 +3582,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3401,12 +3597,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3416,15 +3612,84 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); + } + break; + + case PT_ALNUM: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + break; + + case PT_SPACE: /* Perl space */ + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + break; + + case PT_PXSPACE: /* POSIX space */ + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + break; + + case PT_WORD: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N || + c == CHAR_UNDERSCORE) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); } break; + /* This should not occur */ + default: RRETURN(PCRE_ERROR_INTERNAL); } @@ -3440,11 +3705,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); - if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); + if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; @@ -3471,9 +3736,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -3485,7 +3750,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; @@ -3493,7 +3758,7 @@ for (;;) break; case OP_ANYBYTE: - if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); + if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH); eptr += min; break; @@ -3503,12 +3768,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -3521,7 +3786,7 @@ for (;;) case 0x0085: case 0x2028: case 0x2029: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } } @@ -3533,7 +3798,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) @@ -3558,7 +3823,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } break; @@ -3569,12 +3834,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -3605,7 +3870,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) @@ -3618,7 +3883,7 @@ for (;;) case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } break; @@ -3629,12 +3894,12 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -3653,11 +3918,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINC(c, eptr); if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3667,10 +3932,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -3681,10 +3946,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } break; @@ -3695,10 +3960,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -3709,10 +3974,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } break; @@ -3723,10 +3988,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -3749,9 +4014,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); eptr++; } break; @@ -3760,7 +4025,7 @@ for (;;) if (eptr > md->end_subject - min) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += min; break; @@ -3769,7 +4034,7 @@ for (;;) if (eptr > md->end_subject - min) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } eptr += min; break; @@ -3780,11 +4045,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -3794,7 +4059,7 @@ for (;;) case 0x000b: case 0x000c: case 0x0085: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } } @@ -3806,7 +4071,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { @@ -3814,7 +4079,7 @@ for (;;) case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } break; @@ -3825,11 +4090,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -3844,7 +4109,7 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { @@ -3854,7 +4119,7 @@ for (;;) case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } } break; @@ -3865,11 +4130,11 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } switch(*eptr++) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -3886,9 +4151,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH); } break; @@ -3898,9 +4163,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH); } break; @@ -3910,9 +4175,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH); } break; @@ -3922,9 +4187,9 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH); } break; @@ -3934,10 +4199,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if ((md->ctypes[*eptr++] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3947,10 +4212,10 @@ for (;;) if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if ((md->ctypes[*eptr++] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; @@ -3979,14 +4244,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); - if (prop_fail_result) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if (prop_fail_result) MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -3995,18 +4260,18 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -4015,16 +4280,16 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -4033,16 +4298,16 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -4051,19 +4316,101 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ + case PT_ALNUM: + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_SPACE: /* Perl space */ + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_PXSPACE: /* POSIX space */ + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_WORD: + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || + prop_category == ucp_N || + c == CHAR_UNDERSCORE) + == prop_fail_result) + MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + /* This should never occur */ + default: RRETURN(PCRE_ERROR_INTERNAL); } @@ -4078,15 +4425,15 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); - if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); + if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; @@ -4110,14 +4457,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (ctype == OP_ANY && IS_NEWLINE(eptr)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(ctype) { @@ -4129,7 +4476,7 @@ for (;;) case OP_ANYNL: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -4141,7 +4488,7 @@ for (;;) case 0x0085: case 0x2028: case 0x2029: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } break; @@ -4169,14 +4516,14 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -4211,14 +4558,14 @@ for (;;) case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -4232,32 +4579,32 @@ for (;;) case OP_NOT_DIGIT: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_DIGIT: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); break; default: @@ -4273,14 +4620,14 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); + if (fi >= max) MRRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) { SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } if (ctype == OP_ANY && IS_NEWLINE(eptr)) - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); c = *eptr++; switch(ctype) { @@ -4292,7 +4639,7 @@ for (;;) case OP_ANYNL: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; @@ -4303,7 +4650,7 @@ for (;;) case 0x000b: case 0x000c: case 0x0085: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); + if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); break; } break; @@ -4315,14 +4662,14 @@ for (;;) case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ @@ -4339,14 +4686,14 @@ for (;;) case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { - default: RRETURN(MATCH_NOMATCH); + default: MRRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ @@ -4357,27 +4704,27 @@ for (;;) break; case OP_NOT_DIGIT: - if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH); break; case OP_DIGIT: - if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: - if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: - if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: - if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: - if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); + if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH); break; default: @@ -4410,7 +4757,7 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); if (prop_fail_result) break; eptr+= len; } @@ -4425,7 +4772,7 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || @@ -4444,7 +4791,7 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) break; @@ -4461,7 +4808,7 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) break; @@ -4478,13 +4825,90 @@ for (;;) SCHECK_PARTIAL(); break; } - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) break; eptr+= len; } break; + + case PT_ALNUM: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N) + == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_SPACE: /* Perl space */ + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_PXSPACE: /* POSIX space */ + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || + c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) + == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_WORD: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + prop_category = UCD_CATEGORY(c); + if ((prop_category == ucp_L || prop_category == ucp_N || + c == CHAR_UNDERSCORE) == prop_fail_result) + break; + eptr+= len; + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); } /* eptr is now past the end of the maximum run */ @@ -5037,7 +5461,7 @@ for (;;) /* Get here if we can't make it match with any permitted repetitions */ - RRETURN(MATCH_NOMATCH); + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -5070,12 +5494,13 @@ switch (frame->Xwhere) LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) - LBL(53) LBL(54) + LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) #ifdef SUPPORT_UTF8 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) + LBL(59) LBL(60) LBL(61) LBL(62) #endif /* SUPPORT_UCP */ #endif /* SUPPORT_UTF8 */ default: @@ -5204,11 +5629,11 @@ const real_pcre *external_re = (const real_pcre *)argument_re; const real_pcre *re = external_re; /* Plausibility checks */ - if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; +if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; /* This information is for finding all the numbers associated with a given name, for condition testing. */ @@ -5279,6 +5704,7 @@ end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; +md->use_ucp = (re->options & PCRE_UCP) != 0; md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; @@ -5288,6 +5714,7 @@ md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; md->hitend = FALSE; +md->mark = NULL; /* In case never set */ md->recursive = NULL; /* No recursion at top level */ @@ -5373,16 +5800,14 @@ back the character offset. */ #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { - if (_pcre_valid_utf8((USPTR)subject, length) >= 0) - return PCRE_ERROR_BADUTF8; + int tb; + if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0) + return (tb == length && md->partial > 1)? + PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; if (start_offset > 0 && start_offset < length) { - int tb = ((USPTR)subject)[start_offset]; - if (tb > 127) - { - tb &= 0xc0; - if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; - } + tb = ((USPTR)subject)[start_offset] & 0xc0; + if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET; } } #endif @@ -5510,9 +5935,10 @@ for(;;) /* There are some optimizations that avoid running the match if a known starting point is not found, or if a known later character is not present. However, there is an option that disables these, for testing and for ensuring - that all callouts do actually occur. */ + that all callouts do actually occur. The option can be set in the regex by + (*NO_START_OPT) or passed in match-time options. */ - if ((options & PCRE_NO_START_OPTIMIZE) == 0) + if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) { /* Advance to a unique first byte if there is one. */ @@ -5566,8 +5992,16 @@ for(;;) while (start_match < end_subject) { register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) + { + start_match++; +#ifdef SUPPORT_UTF8 + if (utf8) + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; +#endif + } + else break; } } } /* Starting optimizations */ @@ -5668,6 +6102,23 @@ for(;;) switch(rc) { + /* SKIP passes back the next starting point explicitly, but if it is the + same as the match we have just done, treat it as NOMATCH. */ + + case MATCH_SKIP: + if (md->start_match_ptr != start_match) + { + new_start_match = md->start_match_ptr; + break; + } + /* Fall through */ + + /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched + the SKIP's arg was not found. We also treat this as NOMATCH. */ + + case MATCH_SKIP_ARG: + /* Fall through */ + /* NOMATCH and PRUNE advance by one character. THEN at this level acts exactly like PRUNE. */ @@ -5682,12 +6133,6 @@ for(;;) #endif break; - /* SKIP passes back the next starting point explicitly. */ - - case MATCH_SKIP: - new_start_match = md->start_match_ptr; - break; - /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ case MATCH_COMMIT: @@ -5733,7 +6178,8 @@ for(;;) md->nllen == 2)) start_match++; - } /* End of for(;;) "bumpalong" loop */ + md->mark = NULL; /* Reset for start of next match attempt */ + } /* End of for(;;) "bumpalong" loop */ /* ==========================================================================*/ @@ -5757,7 +6203,7 @@ capturing parentheses than vector slots. */ ENDLOOP: -if (rc == MATCH_MATCH) +if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) { if (using_temporary_offsets) { @@ -5783,12 +6229,12 @@ if (rc == MATCH_MATCH) if (offsetcount < 2) rc = 0; else { - offsets[0] = md->start_match_ptr - md->start_subject; - offsets[1] = md->end_match_ptr - md->start_subject; + offsets[0] = (int)(md->start_match_ptr - md->start_subject); + offsets[1] = (int)(md->end_match_ptr - md->start_subject); } DPRINTF((">>>> returning %d\n", rc)); - return rc; + goto RETURN_MARK; } /* Control gets here if there has been an error, or if the overall match @@ -5800,26 +6246,43 @@ if (using_temporary_offsets) (pcre_free)(md->offset_vector); } +/* For anything other than nomatch or partial match, just return the code. */ + if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) { DPRINTF((">>>> error: returning %d\n", rc)); return rc; } -else if (start_partial != NULL) + +/* Handle partial matches - disable any mark data */ + +if (start_partial != NULL) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); + md->mark = NULL; if (offsetcount > 1) { - offsets[0] = start_partial - (USPTR)subject; - offsets[1] = end_subject - (USPTR)subject; + offsets[0] = (int)(start_partial - (USPTR)subject); + offsets[1] = (int)(end_subject - (USPTR)subject); } - return PCRE_ERROR_PARTIAL; + rc = PCRE_ERROR_PARTIAL; } + +/* This is the classic nomatch case */ + else { DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); - return PCRE_ERROR_NOMATCH; + rc = PCRE_ERROR_NOMATCH; } + +/* Return the MARK data if it has been requested. */ + +RETURN_MARK: + +if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) + *(extra_data->mark) = (unsigned char *)(md->mark); +return rc; } /* End of pcre_exec.c */ diff --git a/glib/pcre/pcre_internal.h b/glib/pcre/pcre_internal.h index 7c7412f32..9cff71d27 100644 --- a/glib/pcre/pcre_internal.h +++ b/glib/pcre/pcre_internal.h @@ -408,9 +408,10 @@ capturing parenthesis numbers in back references. */ /* When UTF-8 encoding is being used, a character is no longer just a single byte. The macros for character handling generate simple sequences when used in -byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should -never be called in byte mode. To make sure it can never even appear when UTF-8 -support is omitted, we don't even define it. */ +byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is +not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should +never be called in byte mode. To make sure they can never even appear when +UTF-8 support is omitted, we don't even define them. */ #ifndef SUPPORT_UTF8 #define GETCHAR(c, eptr) c = *eptr; @@ -418,43 +419,83 @@ support is omitted, we don't even define it. */ #define GETCHARINC(c, eptr) c = *eptr++; #define GETCHARINCTEST(c, eptr) c = *eptr++; #define GETCHARLEN(c, eptr, len) c = *eptr; +/* #define GETCHARLENTEST(c, eptr, len) */ /* #define BACKCHAR(eptr) */ #else /* SUPPORT_UTF8 */ +/* These macros were originally written in the form of loops that used data +from the tables whose names start with _pcre_utf8_table. They were rewritten by +a user so as not to use loops, because in some environments this gives a +significant performance advantage, and it seems never to do any harm. */ + +/* Base macro to pick up the remaining bytes of a UTF-8 character, not +advancing the pointer. */ + +#define GETUTF8(c, eptr) \ + { \ + if ((c & 0x20) == 0) \ + c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \ + else if ((c & 0x10) == 0) \ + c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ + else if ((c & 0x08) == 0) \ + c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \ + ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \ + else if ((c & 0x04) == 0) \ + c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \ + ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \ + (eptr[4] & 0x3f); \ + else \ + c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \ + ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \ + ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \ + } + /* Get the next UTF-8 character, not advancing the pointer. This is called when we know we are in UTF-8 mode. */ #define GETCHAR(c, eptr) \ c = *eptr; \ - if (c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - for (gcii = 1; gcii <= gcaa; gcii++) \ - { \ - gcss -= 6; \ - c |= (eptr[gcii] & 0x3f) << gcss; \ - } \ - } + if (c >= 0xc0) GETUTF8(c, eptr); /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the pointer. */ #define GETCHARTEST(c, eptr) \ c = *eptr; \ - if (utf8 && c >= 0xc0) \ + if (utf8 && c >= 0xc0) GETUTF8(c, eptr); + +/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing +the pointer. */ + +#define GETUTF8INC(c, eptr) \ { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - for (gcii = 1; gcii <= gcaa; gcii++) \ + if ((c & 0x20) == 0) \ + c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \ + else if ((c & 0x10) == 0) \ { \ - gcss -= 6; \ - c |= (eptr[gcii] & 0x3f) << gcss; \ + c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \ + eptr += 2; \ + } \ + else if ((c & 0x08) == 0) \ + { \ + c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \ + ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ + eptr += 3; \ + } \ + else if ((c & 0x04) == 0) \ + { \ + c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \ + ((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \ + (eptr[3] & 0x3f); \ + eptr += 4; \ + } \ + else \ + { \ + c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \ + ((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \ + ((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \ + eptr += 5; \ } \ } @@ -463,31 +504,49 @@ know we are in UTF-8 mode. */ #define GETCHARINC(c, eptr) \ c = *eptr++; \ - if (c >= 0xc0) \ - { \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - while (gcaa-- > 0) \ - { \ - gcss -= 6; \ - c |= (*eptr++ & 0x3f) << gcss; \ - } \ - } + if (c >= 0xc0) GETUTF8INC(c, eptr); -/* Get the next character, testing for UTF-8 mode, and advancing the pointer */ +/* Get the next character, testing for UTF-8 mode, and advancing the pointer. +This is called when we don't know if we are in UTF-8 mode. */ #define GETCHARINCTEST(c, eptr) \ c = *eptr++; \ - if (utf8 && c >= 0xc0) \ + if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr); + +/* Base macro to pick up the remaining bytes of a UTF-8 character, not +advancing the pointer, incrementing the length. */ + +#define GETUTF8LEN(c, eptr, len) \ { \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - while (gcaa-- > 0) \ + if ((c & 0x20) == 0) \ + { \ + c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \ + len++; \ + } \ + else if ((c & 0x10) == 0) \ { \ - gcss -= 6; \ - c |= (*eptr++ & 0x3f) << gcss; \ + c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ + len += 2; \ + } \ + else if ((c & 0x08) == 0) \ + {\ + c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \ + ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \ + len += 3; \ + } \ + else if ((c & 0x04) == 0) \ + { \ + c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \ + ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \ + (eptr[4] & 0x3f); \ + len += 4; \ + } \ + else \ + {\ + c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \ + ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \ + ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \ + len += 5; \ } \ } @@ -496,39 +555,15 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */ #define GETCHARLEN(c, eptr, len) \ c = *eptr; \ - if (c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - for (gcii = 1; gcii <= gcaa; gcii++) \ - { \ - gcss -= 6; \ - c |= (eptr[gcii] & 0x3f) << gcss; \ - } \ - len += gcaa; \ - } + if (c >= 0xc0) GETUTF8LEN(c, eptr, len); /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the pointer, incrementing length if there are extra bytes. This is called when we -know we are in UTF-8 mode. */ +do not know if we are in UTF-8 mode. */ #define GETCHARLENTEST(c, eptr, len) \ c = *eptr; \ - if (utf8 && c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - for (gcii = 1; gcii <= gcaa; gcii++) \ - { \ - gcss -= 6; \ - c |= (eptr[gcii] & 0x3f) << gcss; \ - } \ - len += gcaa; \ - } + if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len); /* If the pointer is not at the start of a character, move it back until it is. This is called only in UTF-8 mode - we don't put a test within the macro @@ -536,7 +571,7 @@ because almost all calls are already within a block of UTF-8 only code. */ #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- -#endif +#endif /* SUPPORT_UTF8 */ /* In case there is no definition of offsetof() provided - though any proper @@ -580,7 +615,7 @@ time, run time, or study time, respectively. */ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ - PCRE_JAVASCRIPT_COMPAT) + PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE) #define PUBLIC_EXEC_OPTIONS \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \ @@ -620,7 +655,7 @@ variable-length repeat, or a anything other than literal characters. */ environments where these macros are defined elsewhere. Unfortunately, there is no way to do the same for the typedef. */ -typedef gboolean BOOL; +typedef gboolean BOOL; /* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal character constants like '*' because the compiler would emit their EBCDIC code, @@ -870,6 +905,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ #define STRING_COMMIT0 "COMMIT\0" #define STRING_F0 "F\0" #define STRING_FAIL0 "FAIL\0" +#define STRING_MARK0 "MARK\0" #define STRING_PRUNE0 "PRUNE\0" #define STRING_SKIP0 "SKIP\0" #define STRING_THEN "THEN" @@ -891,14 +927,16 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ #define STRING_DEFINE "DEFINE" -#define STRING_CR_RIGHTPAR "CR)" -#define STRING_LF_RIGHTPAR "LF)" -#define STRING_CRLF_RIGHTPAR "CRLF)" -#define STRING_ANY_RIGHTPAR "ANY)" -#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" -#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" -#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" -#define STRING_UTF8_RIGHTPAR "UTF8)" +#define STRING_CR_RIGHTPAR "CR)" +#define STRING_LF_RIGHTPAR "LF)" +#define STRING_CRLF_RIGHTPAR "CRLF)" +#define STRING_ANY_RIGHTPAR "ANY)" +#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" +#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" +#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" +#define STRING_UTF8_RIGHTPAR "UTF8)" +#define STRING_UCP_RIGHTPAR "UCP)" +#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" #else /* SUPPORT_UTF8 */ @@ -1122,6 +1160,7 @@ only. */ #define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0" #define STRING_F0 STR_F "\0" #define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0" +#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0" #define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0" #define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0" #define STRING_THEN STR_T STR_H STR_E STR_N @@ -1143,14 +1182,16 @@ only. */ #define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E -#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS -#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS -#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS -#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS +#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS +#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS +#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS +#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS +#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS +#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS #endif /* SUPPORT_UTF8 */ @@ -1183,9 +1224,13 @@ only. */ #define PT_ANY 0 /* Any property - matches all chars */ #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ -#define PT_GC 2 /* General characteristic (e.g. L) */ -#define PT_PC 3 /* Particular characteristic (e.g. Lu) */ +#define PT_GC 2 /* Specified general characteristic (e.g. L) */ +#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */ #define PT_SC 4 /* Script (e.g. Han) */ +#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */ +#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */ +#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */ +#define PT_WORD 8 /* Word - L plus N plus underscore */ /* Flag bits and data types for the extended class (OP_XCLASS) for classes that contain UTF-8 characters with values greater than 255. */ @@ -1202,9 +1247,15 @@ contain UTF-8 characters with values greater than 255. */ /* These are escaped items that aren't just an encoding of a particular data value such as \n. They must have non-zero values, as check_escape() returns their negation. Also, they must appear in the same order as in the opcode -definitions below, up to ESC_z. There's a dummy for OP_ANY because it -corresponds to "." rather than an escape sequence, and another for OP_ALLANY -(which is used for [^] in JavaScript compatibility mode). +definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it +corresponds to "." in DOTALL mode rather than an escape sequence. It is also +used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves +like \N. + +The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. +when PCRE_UCP is set, when replacement of \d etc by \p sequences is required. +They must be contiguous, and remain in order so that the replacements can be +looked up from a table. The final escape must be ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). There are two tests in the code for an escape @@ -1214,11 +1265,12 @@ put in between that don't consume a character, that code will have to change. */ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, - ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, - ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k, + ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, + ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, + ESC_E, ESC_Q, ESC_g, ESC_k, + ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu, ESC_REF }; - /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in order to the list of escapes immediately above. @@ -1242,8 +1294,8 @@ enum { OP_WHITESPACE, /* 9 \s */ OP_NOT_WORDCHAR, /* 10 \W */ OP_WORDCHAR, /* 11 \w */ - OP_ANY, /* 12 Match any character (subject to DOTALL) */ - OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */ + OP_ANY, /* 12 Match any character except newline */ + OP_ALLANY, /* 13 Match any character */ OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ OP_NOTPROP, /* 15 \P (not Unicode property) */ OP_PROP, /* 16 \p (Unicode property) */ @@ -1373,20 +1425,24 @@ enum { /* These are backtracking control verbs */ - OP_PRUNE, /* 107 */ - OP_SKIP, /* 108 */ - OP_THEN, /* 109 */ - OP_COMMIT, /* 110 */ + OP_MARK, /* 107 always has an argument */ + OP_PRUNE, /* 108 */ + OP_PRUNE_ARG, /* 109 same, but with argument */ + OP_SKIP, /* 110 */ + OP_SKIP_ARG, /* 111 same, but with argument */ + OP_THEN, /* 112 */ + OP_THEN_ARG, /* 113 same, but with argument */ + OP_COMMIT, /* 114 */ /* These are forced failure and success verbs */ - OP_FAIL, /* 111 */ - OP_ACCEPT, /* 112 */ - OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */ + OP_FAIL, /* 115 */ + OP_ACCEPT, /* 116 */ + OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO, /* 114 */ + OP_SKIPZERO, /* 118 */ /* This is not an opcode, but is used to check that tables indexed by opcode are the correct length, in order to catch updating errors - there have been @@ -1397,7 +1453,7 @@ enum { /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro definitions that follow must also be updated to match. There are also tables -called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */ +called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */ /* This macro defines textual names for all the opcodes. These are used only @@ -1422,7 +1478,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */ "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \ "Brazero", "Braminzero", \ - "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ + "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ + "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ "Close", "Skip zero" @@ -1488,8 +1545,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 3, 3, /* RREF, NRREF */ \ 1, /* DEF */ \ 1, 1, /* BRAZERO, BRAMINZERO */ \ - 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ - 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ + 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \ + 1, 3, /* SKIP, SKIP_ARG */ \ + 1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \ + 1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */ /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion" @@ -1507,7 +1566,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, - ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT }; + ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, + ERRCOUNT }; /* The real format of the start of the pcre block; the index of names and the code vector run on as long as necessary after the end. We store an explicit @@ -1650,6 +1710,7 @@ typedef struct match_data { BOOL noteol; /* NOTEOL flag */ BOOL utf8; /* UTF8 flag */ BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ + BOOL use_ucp; /* PCRE_UCP flag */ BOOL endonly; /* Dollar not before final \n */ BOOL notempty; /* Empty string match not wanted */ BOOL notempty_atstart; /* Empty string match at start not wanted */ @@ -1669,6 +1730,7 @@ typedef struct match_data { int eptrn; /* Next free eptrblock */ recursion_info *recursive; /* Linked list of recursion data */ void *callout_data; /* To pass back to callouts */ + const uschar *mark; /* Mark pointer to pass back */ } match_data; /* A similar structure is used for the same purpose by the DFA matching @@ -1764,7 +1826,7 @@ extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL); extern int _pcre_ord2utf8(int, uschar *); extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, const pcre_study_data *, pcre_study_data *); -#define _pcre_valid_utf8(u, i) TRUE +#define _pcre_valid_utf8(USPTR, int) TRUE extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL); extern BOOL _pcre_xclass(int, const uschar *); diff --git a/glib/pcre/pcre_study.c b/glib/pcre/pcre_study.c index bd00a53a6..be321fa25 100644 --- a/glib/pcre/pcre_study.c +++ b/glib/pcre/pcre_study.c @@ -48,6 +48,7 @@ supporting functions. */ #include "pcre_internal.h" +#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7)) /* Returns from set_start_bits() */ @@ -413,6 +414,18 @@ for (;;) #endif break; + /* Skip these, but we need to add in the name length. */ + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + cc += _pcre_OP_lengths[op] + cc[1]; + break; + + case OP_THEN_ARG: + cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE]; + break; + /* For the record, these are the opcodes that are matched by "default": OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP, OP_THEN. */ @@ -431,25 +444,121 @@ for (;;) * Set a bit and maybe its alternate case * *************************************************/ -/* Given a character, set its bit in the table, and also the bit for the other -version of a letter if we are caseless. +/* Given a character, set its first byte's bit in the table, and also the +corresponding bit for the other version of a letter if we are caseless. In +UTF-8 mode, for characters greater than 127, we can only do the caseless thing +when Unicode property support is available. Arguments: start_bits points to the bit map - c is the character + p points to the character caseless the caseless flag cd the block with char table pointers + utf8 TRUE for UTF-8 mode -Returns: nothing +Returns: pointer after the character +*/ + +static const uschar * +set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless, + compile_data *cd, BOOL utf8) +{ +unsigned int c = *p; + +SET_BIT(c); + +#ifdef SUPPORT_UTF8 +if (utf8 && c > 127) + { + GETCHARINC(c, p); +#ifdef SUPPORT_UCP + if (caseless) + { + uschar buff[8]; + c = UCD_OTHERCASE(c); + (void)_pcre_ord2utf8(c, buff); + SET_BIT(buff[0]); + } +#endif + return p; + } +#endif + +/* Not UTF-8 mode, or character is less than 127. */ + +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); +return p + 1; +} + + + +/************************************************* +* Set bits for a positive character type * +*************************************************/ + +/* This function sets starting bits for a character type. In UTF-8 mode, we can +only do a direct setting for bytes less than 128, as otherwise there can be +confusion with bytes in the middle of UTF-8 characters. In a "traditional" +environment, the tables will only recognize ASCII characters anyway, but in at +least one Windows environment, some higher bytes bits were set in the tables. +So we deal with that case by considering the UTF-8 encoding. + +Arguments: + start_bits the starting bitmap + cbit type the type of character wanted + table_limit 32 for non-UTF-8; 16 for UTF-8 + cd the block with char table pointers + +Returns: nothing */ static void -set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless, +set_type_bits(uschar *start_bits, int cbit_type, int table_limit, compile_data *cd) { -start_bits[c/8] |= (1 << (c&7)); -if (caseless && (cd->ctypes[c] & ctype_letter) != 0) - start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); +register int c; +for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; +if (table_limit == 32) return; +for (c = 128; c < 256; c++) + { + if ((cd->cbits[c/8] & (1 << (c&7))) != 0) + { + uschar buff[8]; + (void)_pcre_ord2utf8(c, buff); + SET_BIT(buff[0]); + } + } +} + + +/************************************************* +* Set bits for a negative character type * +*************************************************/ + +/* This function sets starting bits for a negative character type such as \D. +In UTF-8 mode, we can only do a direct setting for bytes less than 128, as +otherwise there can be confusion with bytes in the middle of UTF-8 characters. +Unlike in the positive case, where we can set appropriate starting bits for +specific high-valued UTF-8 characters, in this case we have to set the bits for +all high-valued characters. The lowest is 0xc2, but we overkill by starting at +0xc0 (192) for simplicity. + +Arguments: + start_bits the starting bitmap + cbit type the type of character wanted + table_limit 32 for non-UTF-8; 16 for UTF-8 + cd the block with char table pointers + +Returns: nothing +*/ + +static void +set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit, + compile_data *cd) +{ +register int c; +for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; +if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; } @@ -484,6 +593,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, { register int c; int yield = SSB_DONE; +int table_limit = utf8? 16:32; #if 0 /* ========================================================================= */ @@ -607,12 +717,7 @@ do case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: - set_table_bit(start_bits, tcode[1], caseless, cd); - tcode += 2; -#ifdef SUPPORT_UTF8 - if (utf8 && tcode[-1] >= 0xc0) - tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; -#endif + tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8); break; /* Single-char upto sets the bit and tries the next */ @@ -620,12 +725,7 @@ do case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: - set_table_bit(start_bits, tcode[3], caseless, cd); - tcode += 4; -#ifdef SUPPORT_UTF8 - if (utf8 && tcode[-1] >= 0xc0) - tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; -#endif + tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8); break; /* At least one single char sets the bit and stops */ @@ -638,59 +738,86 @@ do case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: - set_table_bit(start_bits, tcode[1], caseless, cd); + (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8); + try_next = FALSE; + break; + + /* Special spacing and line-terminating items. These recognize specific + lists of characters. The difference between VSPACE and ANYNL is that the + latter can match the two-character CRLF sequence, but that is not + relevant for finding the first character, so their code here is + identical. */ + + case OP_HSPACE: + SET_BIT(0x09); + SET_BIT(0x20); + if (utf8) + { + SET_BIT(0xC2); /* For U+00A0 */ + SET_BIT(0xE1); /* For U+1680, U+180E */ + SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ + SET_BIT(0xE3); /* For U+3000 */ + } + else SET_BIT(0xA0); + try_next = FALSE; + break; + + case OP_ANYNL: + case OP_VSPACE: + SET_BIT(0x0A); + SET_BIT(0x0B); + SET_BIT(0x0C); + SET_BIT(0x0D); + if (utf8) + { + SET_BIT(0xC2); /* For U+0085 */ + SET_BIT(0xE2); /* For U+2028, U+2029 */ + } + else SET_BIT(0x85); try_next = FALSE; break; - /* Single character type sets the bits and stops */ + /* Single character types set the bits and stop. Note that if PCRE_UCP + is set, we do not see these op codes because \d etc are converted to + properties. Therefore, these apply in the case when only characters less + than 256 are recognized to match the types. */ case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_digit]; + set_nottype_bits(start_bits, cbit_digit, table_limit, cd); try_next = FALSE; break; case OP_DIGIT: - for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_digit]; + set_type_bits(start_bits, cbit_digit, table_limit, cd); try_next = FALSE; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + ensure it is set as not whitespace. */ case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= ~d; - } + set_nottype_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] |= 0x08; try_next = FALSE; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + not set it from the table. */ case OP_WHITESPACE: - for (c = 0; c < 32; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= d; - } + c = start_bits[1]; /* Save in case it was already set */ + set_type_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] = (start_bits[1] & ~0x08) | c; try_next = FALSE; break; case OP_NOT_WORDCHAR: - for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_word]; + set_nottype_bits(start_bits, cbit_word, table_limit, cd); try_next = FALSE; break; case OP_WORDCHAR: - for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_word]; + set_type_bits(start_bits, cbit_word, table_limit, cd); try_next = FALSE; break; @@ -699,6 +826,7 @@ do case OP_TYPEPLUS: case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: tcode++; break; @@ -722,52 +850,69 @@ do case OP_TYPEPOSQUERY: switch(tcode[1]) { + default: case OP_ANY: case OP_ALLANY: return SSB_FAIL; + case OP_HSPACE: + SET_BIT(0x09); + SET_BIT(0x20); + if (utf8) + { + SET_BIT(0xC2); /* For U+00A0 */ + SET_BIT(0xE1); /* For U+1680, U+180E */ + SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ + SET_BIT(0xE3); /* For U+3000 */ + } + else SET_BIT(0xA0); + break; + + case OP_ANYNL: + case OP_VSPACE: + SET_BIT(0x0A); + SET_BIT(0x0B); + SET_BIT(0x0C); + SET_BIT(0x0D); + if (utf8) + { + SET_BIT(0xC2); /* For U+0085 */ + SET_BIT(0xE2); /* For U+2028, U+2029 */ + } + else SET_BIT(0x85); + break; + case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_digit]; + set_nottype_bits(start_bits, cbit_digit, table_limit, cd); break; case OP_DIGIT: - for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_digit]; + set_type_bits(start_bits, cbit_digit, table_limit, cd); break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + ensure it gets set as not whitespace. */ case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= ~d; - } + set_nottype_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] |= 0x08; break; /* The cbit_space table has vertical tab as whitespace; we have to - discard it. */ + avoid setting it. */ case OP_WHITESPACE: - for (c = 0; c < 32; c++) - { - int d = cd->cbits[c+cbit_space]; - if (c == 1) d &= ~0x08; - start_bits[c] |= d; - } + c = start_bits[1]; /* Save in case it was already set */ + set_type_bits(start_bits, cbit_space, table_limit, cd); + start_bits[1] = (start_bits[1] & ~0x08) | c; break; case OP_NOT_WORDCHAR: - for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_word]; + set_nottype_bits(start_bits, cbit_word, table_limit, cd); break; case OP_WORDCHAR: - for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_word]; + set_type_bits(start_bits, cbit_word, table_limit, cd); break; } diff --git a/glib/pcre/pcre_tables.c b/glib/pcre/pcre_tables.c index b7f7ba5d1..8cc4eb309 100644 --- a/glib/pcre/pcre_tables.c +++ b/glib/pcre/pcre_tables.c @@ -123,8 +123,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */ #define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0" #define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0" #define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0" +#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0" #define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0" #define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0" +#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0" #define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0" #define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0" #define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0" @@ -184,6 +186,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */ #define STRING_Lu0 STR_L STR_u "\0" #define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0" #define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0" +#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0" #define STRING_M0 STR_M "\0" #define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0" #define STRING_Mc0 STR_M STR_c "\0" @@ -243,6 +246,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */ #define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0" #define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0" #define STRING_Vai0 STR_V STR_a STR_i "\0" +#define STRING_Xan0 STR_X STR_a STR_n "\0" +#define STRING_Xps0 STR_X STR_p STR_s "\0" +#define STRING_Xsp0 STR_X STR_s STR_p "\0" +#define STRING_Xwd0 STR_X STR_w STR_d "\0" #define STRING_Yi0 STR_Y STR_i "\0" #define STRING_Z0 STR_Z "\0" #define STRING_Zl0 STR_Z STR_l "\0" @@ -256,8 +263,10 @@ const char _pcre_utt_names[] = STRING_Avestan0 STRING_Balinese0 STRING_Bamum0 + STRING_Batak0 STRING_Bengali0 STRING_Bopomofo0 + STRING_Brahmi0 STRING_Braille0 STRING_Buginese0 STRING_Buhid0 @@ -319,6 +328,7 @@ const char _pcre_utt_names[] = STRING_Lydian0 STRING_M0 STRING_Malayalam0 + STRING_Mandaic0 STRING_Mc0 STRING_Me0 STRING_Meetei_Mayek0 @@ -376,6 +386,10 @@ const char _pcre_utt_names[] = STRING_Tifinagh0 STRING_Ugaritic0 STRING_Vai0 + STRING_Xan0 + STRING_Xps0 + STRING_Xsp0 + STRING_Xwd0 STRING_Yi0 STRING_Z0 STRING_Zl0 @@ -389,131 +403,138 @@ const ucp_type_table _pcre_utt[] = { { 20, PT_SC, ucp_Avestan }, { 28, PT_SC, ucp_Balinese }, { 37, PT_SC, ucp_Bamum }, - { 43, PT_SC, ucp_Bengali }, - { 51, PT_SC, ucp_Bopomofo }, - { 60, PT_SC, ucp_Braille }, - { 68, PT_SC, ucp_Buginese }, - { 77, PT_SC, ucp_Buhid }, - { 83, PT_GC, ucp_C }, - { 85, PT_SC, ucp_Canadian_Aboriginal }, - { 105, PT_SC, ucp_Carian }, - { 112, PT_PC, ucp_Cc }, - { 115, PT_PC, ucp_Cf }, - { 118, PT_SC, ucp_Cham }, - { 123, PT_SC, ucp_Cherokee }, - { 132, PT_PC, ucp_Cn }, - { 135, PT_PC, ucp_Co }, - { 138, PT_SC, ucp_Common }, - { 145, PT_SC, ucp_Coptic }, - { 152, PT_PC, ucp_Cs }, - { 155, PT_SC, ucp_Cuneiform }, - { 165, PT_SC, ucp_Cypriot }, - { 173, PT_SC, ucp_Cyrillic }, - { 182, PT_SC, ucp_Deseret }, - { 190, PT_SC, ucp_Devanagari }, - { 201, PT_SC, ucp_Egyptian_Hieroglyphs }, - { 222, PT_SC, ucp_Ethiopic }, - { 231, PT_SC, ucp_Georgian }, - { 240, PT_SC, ucp_Glagolitic }, - { 251, PT_SC, ucp_Gothic }, - { 258, PT_SC, ucp_Greek }, - { 264, PT_SC, ucp_Gujarati }, - { 273, PT_SC, ucp_Gurmukhi }, - { 282, PT_SC, ucp_Han }, - { 286, PT_SC, ucp_Hangul }, - { 293, PT_SC, ucp_Hanunoo }, - { 301, PT_SC, ucp_Hebrew }, - { 308, PT_SC, ucp_Hiragana }, - { 317, PT_SC, ucp_Imperial_Aramaic }, - { 334, PT_SC, ucp_Inherited }, - { 344, PT_SC, ucp_Inscriptional_Pahlavi }, - { 366, PT_SC, ucp_Inscriptional_Parthian }, - { 389, PT_SC, ucp_Javanese }, - { 398, PT_SC, ucp_Kaithi }, - { 405, PT_SC, ucp_Kannada }, - { 413, PT_SC, ucp_Katakana }, - { 422, PT_SC, ucp_Kayah_Li }, - { 431, PT_SC, ucp_Kharoshthi }, - { 442, PT_SC, ucp_Khmer }, - { 448, PT_GC, ucp_L }, - { 450, PT_LAMP, 0 }, - { 453, PT_SC, ucp_Lao }, - { 457, PT_SC, ucp_Latin }, - { 463, PT_SC, ucp_Lepcha }, - { 470, PT_SC, ucp_Limbu }, - { 476, PT_SC, ucp_Linear_B }, - { 485, PT_SC, ucp_Lisu }, - { 490, PT_PC, ucp_Ll }, - { 493, PT_PC, ucp_Lm }, - { 496, PT_PC, ucp_Lo }, - { 499, PT_PC, ucp_Lt }, - { 502, PT_PC, ucp_Lu }, - { 505, PT_SC, ucp_Lycian }, - { 512, PT_SC, ucp_Lydian }, - { 519, PT_GC, ucp_M }, - { 521, PT_SC, ucp_Malayalam }, - { 531, PT_PC, ucp_Mc }, - { 534, PT_PC, ucp_Me }, - { 537, PT_SC, ucp_Meetei_Mayek }, - { 550, PT_PC, ucp_Mn }, - { 553, PT_SC, ucp_Mongolian }, - { 563, PT_SC, ucp_Myanmar }, - { 571, PT_GC, ucp_N }, - { 573, PT_PC, ucp_Nd }, - { 576, PT_SC, ucp_New_Tai_Lue }, - { 588, PT_SC, ucp_Nko }, - { 592, PT_PC, ucp_Nl }, - { 595, PT_PC, ucp_No }, - { 598, PT_SC, ucp_Ogham }, - { 604, PT_SC, ucp_Ol_Chiki }, - { 613, PT_SC, ucp_Old_Italic }, - { 624, PT_SC, ucp_Old_Persian }, - { 636, PT_SC, ucp_Old_South_Arabian }, - { 654, PT_SC, ucp_Old_Turkic }, - { 665, PT_SC, ucp_Oriya }, - { 671, PT_SC, ucp_Osmanya }, - { 679, PT_GC, ucp_P }, - { 681, PT_PC, ucp_Pc }, - { 684, PT_PC, ucp_Pd }, - { 687, PT_PC, ucp_Pe }, - { 690, PT_PC, ucp_Pf }, - { 693, PT_SC, ucp_Phags_Pa }, - { 702, PT_SC, ucp_Phoenician }, - { 713, PT_PC, ucp_Pi }, - { 716, PT_PC, ucp_Po }, - { 719, PT_PC, ucp_Ps }, - { 722, PT_SC, ucp_Rejang }, - { 729, PT_SC, ucp_Runic }, - { 735, PT_GC, ucp_S }, - { 737, PT_SC, ucp_Samaritan }, - { 747, PT_SC, ucp_Saurashtra }, - { 758, PT_PC, ucp_Sc }, - { 761, PT_SC, ucp_Shavian }, - { 769, PT_SC, ucp_Sinhala }, - { 777, PT_PC, ucp_Sk }, - { 780, PT_PC, ucp_Sm }, - { 783, PT_PC, ucp_So }, - { 786, PT_SC, ucp_Sundanese }, - { 796, PT_SC, ucp_Syloti_Nagri }, - { 809, PT_SC, ucp_Syriac }, - { 816, PT_SC, ucp_Tagalog }, - { 824, PT_SC, ucp_Tagbanwa }, - { 833, PT_SC, ucp_Tai_Le }, - { 840, PT_SC, ucp_Tai_Tham }, - { 849, PT_SC, ucp_Tai_Viet }, - { 858, PT_SC, ucp_Tamil }, - { 864, PT_SC, ucp_Telugu }, - { 871, PT_SC, ucp_Thaana }, - { 878, PT_SC, ucp_Thai }, - { 883, PT_SC, ucp_Tibetan }, - { 891, PT_SC, ucp_Tifinagh }, - { 900, PT_SC, ucp_Ugaritic }, - { 909, PT_SC, ucp_Vai }, - { 913, PT_SC, ucp_Yi }, - { 916, PT_GC, ucp_Z }, - { 918, PT_PC, ucp_Zl }, - { 921, PT_PC, ucp_Zp }, - { 924, PT_PC, ucp_Zs } + { 43, PT_SC, ucp_Batak }, + { 49, PT_SC, ucp_Bengali }, + { 57, PT_SC, ucp_Bopomofo }, + { 66, PT_SC, ucp_Brahmi }, + { 73, PT_SC, ucp_Braille }, + { 81, PT_SC, ucp_Buginese }, + { 90, PT_SC, ucp_Buhid }, + { 96, PT_GC, ucp_C }, + { 98, PT_SC, ucp_Canadian_Aboriginal }, + { 118, PT_SC, ucp_Carian }, + { 125, PT_PC, ucp_Cc }, + { 128, PT_PC, ucp_Cf }, + { 131, PT_SC, ucp_Cham }, + { 136, PT_SC, ucp_Cherokee }, + { 145, PT_PC, ucp_Cn }, + { 148, PT_PC, ucp_Co }, + { 151, PT_SC, ucp_Common }, + { 158, PT_SC, ucp_Coptic }, + { 165, PT_PC, ucp_Cs }, + { 168, PT_SC, ucp_Cuneiform }, + { 178, PT_SC, ucp_Cypriot }, + { 186, PT_SC, ucp_Cyrillic }, + { 195, PT_SC, ucp_Deseret }, + { 203, PT_SC, ucp_Devanagari }, + { 214, PT_SC, ucp_Egyptian_Hieroglyphs }, + { 235, PT_SC, ucp_Ethiopic }, + { 244, PT_SC, ucp_Georgian }, + { 253, PT_SC, ucp_Glagolitic }, + { 264, PT_SC, ucp_Gothic }, + { 271, PT_SC, ucp_Greek }, + { 277, PT_SC, ucp_Gujarati }, + { 286, PT_SC, ucp_Gurmukhi }, + { 295, PT_SC, ucp_Han }, + { 299, PT_SC, ucp_Hangul }, + { 306, PT_SC, ucp_Hanunoo }, + { 314, PT_SC, ucp_Hebrew }, + { 321, PT_SC, ucp_Hiragana }, + { 330, PT_SC, ucp_Imperial_Aramaic }, + { 347, PT_SC, ucp_Inherited }, + { 357, PT_SC, ucp_Inscriptional_Pahlavi }, + { 379, PT_SC, ucp_Inscriptional_Parthian }, + { 402, PT_SC, ucp_Javanese }, + { 411, PT_SC, ucp_Kaithi }, + { 418, PT_SC, ucp_Kannada }, + { 426, PT_SC, ucp_Katakana }, + { 435, PT_SC, ucp_Kayah_Li }, + { 444, PT_SC, ucp_Kharoshthi }, + { 455, PT_SC, ucp_Khmer }, + { 461, PT_GC, ucp_L }, + { 463, PT_LAMP, 0 }, + { 466, PT_SC, ucp_Lao }, + { 470, PT_SC, ucp_Latin }, + { 476, PT_SC, ucp_Lepcha }, + { 483, PT_SC, ucp_Limbu }, + { 489, PT_SC, ucp_Linear_B }, + { 498, PT_SC, ucp_Lisu }, + { 503, PT_PC, ucp_Ll }, + { 506, PT_PC, ucp_Lm }, + { 509, PT_PC, ucp_Lo }, + { 512, PT_PC, ucp_Lt }, + { 515, PT_PC, ucp_Lu }, + { 518, PT_SC, ucp_Lycian }, + { 525, PT_SC, ucp_Lydian }, + { 532, PT_GC, ucp_M }, + { 534, PT_SC, ucp_Malayalam }, + { 544, PT_SC, ucp_Mandaic }, + { 552, PT_PC, ucp_Mc }, + { 555, PT_PC, ucp_Me }, + { 558, PT_SC, ucp_Meetei_Mayek }, + { 571, PT_PC, ucp_Mn }, + { 574, PT_SC, ucp_Mongolian }, + { 584, PT_SC, ucp_Myanmar }, + { 592, PT_GC, ucp_N }, + { 594, PT_PC, ucp_Nd }, + { 597, PT_SC, ucp_New_Tai_Lue }, + { 609, PT_SC, ucp_Nko }, + { 613, PT_PC, ucp_Nl }, + { 616, PT_PC, ucp_No }, + { 619, PT_SC, ucp_Ogham }, + { 625, PT_SC, ucp_Ol_Chiki }, + { 634, PT_SC, ucp_Old_Italic }, + { 645, PT_SC, ucp_Old_Persian }, + { 657, PT_SC, ucp_Old_South_Arabian }, + { 675, PT_SC, ucp_Old_Turkic }, + { 686, PT_SC, ucp_Oriya }, + { 692, PT_SC, ucp_Osmanya }, + { 700, PT_GC, ucp_P }, + { 702, PT_PC, ucp_Pc }, + { 705, PT_PC, ucp_Pd }, + { 708, PT_PC, ucp_Pe }, + { 711, PT_PC, ucp_Pf }, + { 714, PT_SC, ucp_Phags_Pa }, + { 723, PT_SC, ucp_Phoenician }, + { 734, PT_PC, ucp_Pi }, + { 737, PT_PC, ucp_Po }, + { 740, PT_PC, ucp_Ps }, + { 743, PT_SC, ucp_Rejang }, + { 750, PT_SC, ucp_Runic }, + { 756, PT_GC, ucp_S }, + { 758, PT_SC, ucp_Samaritan }, + { 768, PT_SC, ucp_Saurashtra }, + { 779, PT_PC, ucp_Sc }, + { 782, PT_SC, ucp_Shavian }, + { 790, PT_SC, ucp_Sinhala }, + { 798, PT_PC, ucp_Sk }, + { 801, PT_PC, ucp_Sm }, + { 804, PT_PC, ucp_So }, + { 807, PT_SC, ucp_Sundanese }, + { 817, PT_SC, ucp_Syloti_Nagri }, + { 830, PT_SC, ucp_Syriac }, + { 837, PT_SC, ucp_Tagalog }, + { 845, PT_SC, ucp_Tagbanwa }, + { 854, PT_SC, ucp_Tai_Le }, + { 861, PT_SC, ucp_Tai_Tham }, + { 870, PT_SC, ucp_Tai_Viet }, + { 879, PT_SC, ucp_Tamil }, + { 885, PT_SC, ucp_Telugu }, + { 892, PT_SC, ucp_Thaana }, + { 899, PT_SC, ucp_Thai }, + { 904, PT_SC, ucp_Tibetan }, + { 912, PT_SC, ucp_Tifinagh }, + { 921, PT_SC, ucp_Ugaritic }, + { 930, PT_SC, ucp_Vai }, + { 934, PT_ALNUM, 0 }, + { 938, PT_PXSPACE, 0 }, + { 942, PT_SPACE, 0 }, + { 946, PT_WORD, 0 }, + { 950, PT_SC, ucp_Yi }, + { 953, PT_GC, ucp_Z }, + { 955, PT_PC, ucp_Zl }, + { 958, PT_PC, ucp_Zp }, + { 961, PT_PC, ucp_Zs } }; const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); diff --git a/glib/pcre/pcre_xclass.c b/glib/pcre/pcre_xclass.c index c25ecdc75..5b1b6f4ff 100644 --- a/glib/pcre/pcre_xclass.c +++ b/glib/pcre/pcre_xclass.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2010 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -104,6 +104,7 @@ while ((t = *data++) != XCL_END) else /* XCL_PROP & XCL_NOTPROP */ { int chartype = UCD_CHARTYPE(c); + switch(*data) { case PT_ANY: @@ -111,12 +112,13 @@ while ((t = *data++) != XCL_END) break; case PT_LAMP: - if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) == - (t == XCL_PROP)) return !negated; + if ((chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: - if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP)) return !negated; + if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP)) + return !negated; break; case PT_PC: @@ -127,6 +129,33 @@ while ((t = *data++) != XCL_END) if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated; break; + case PT_ALNUM: + if ((_pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N) == (t == XCL_PROP)) + return !negated; + break; + + case PT_SPACE: /* Perl space */ + if ((_pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == (t == XCL_PROP)) + return !negated; + break; + + case PT_PXSPACE: /* POSIX space */ + if ((_pcre_ucp_gentype[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) + return !negated; + break; + + case PT_WORD: + if ((_pcre_ucp_gentype[chartype] == ucp_L || + _pcre_ucp_gentype[chartype] == ucp_N || c == CHAR_UNDERSCORE) + == (t == XCL_PROP)) + return !negated; + break; + /* This should never occur, but compilers may mutter if there is no default. */ diff --git a/glib/pcre/ucp.h b/glib/pcre/ucp.h index f1b68b0c2..dcaa827ef 100644 --- a/glib/pcre/ucp.h +++ b/glib/pcre/ucp.h @@ -150,7 +150,10 @@ enum { ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC, ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN, ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM, - ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET + ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET, + ucp_Batak = G_UNICODE_SCRIPT_BATAK, + ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI, + ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC }; #endif |