summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthias Clasen <mclasen@redhat.com>2011-01-22 00:01:54 -0500
committerMatthias Clasen <mclasen@redhat.com>2011-01-22 00:01:54 -0500
commitfb2809ec996e9e12d06f4bc7239a98718f5f06d7 (patch)
tree24a472665ff8dacbb97bd748da9588d7b935a1c4
parent3f059a6a123dd62257f224b9af7701078783060e (diff)
Forgotten files
-rw-r--r--glib/pcre/pcre.h81
-rw-r--r--glib/pcre/pcre_chartables.c2
-rw-r--r--glib/pcre/pcre_compile.c899
-rw-r--r--glib/pcre/pcre_dfa_exec.c257
-rw-r--r--glib/pcre/pcre_exec.c1155
-rw-r--r--glib/pcre/pcre_internal.h286
-rw-r--r--glib/pcre/pcre_study.c277
-rw-r--r--glib/pcre/pcre_tables.c271
-rw-r--r--glib/pcre/pcre_xclass.c37
-rw-r--r--glib/pcre/ucp.h5
10 files changed, 2271 insertions, 999 deletions
diff --git a/glib/pcre/pcre.h b/glib/pcre/pcre.h
index 4864bd099..7c4c04011 100644
--- a/glib/pcre/pcre.h
+++ b/glib/pcre/pcre.h
@@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, to be #included by
applications that call the PCRE functions.
- Copyright (c) 1997-2009 University of Cambridge
+ Copyright (c) 1997-2010 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */
#define PCRE_MAJOR 8
-#define PCRE_MINOR 02
+#define PCRE_MINOR 12
#define PCRE_PRERELEASE
-#define PCRE_DATE 2010-03-19
+#define PCRE_DATE 2011-01-15
/* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE, the appropriate
@@ -96,41 +96,44 @@ extern "C" {
#endif
/* Options. Some are compile-time only, some are run-time only, and some are
-both, so we keep them all distinct. */
-
-#define PCRE_CASELESS 0x00000001
-#define PCRE_MULTILINE 0x00000002
-#define PCRE_DOTALL 0x00000004
-#define PCRE_EXTENDED 0x00000008
-#define PCRE_ANCHORED 0x00000010
-#define PCRE_DOLLAR_ENDONLY 0x00000020
-#define PCRE_EXTRA 0x00000040
-#define PCRE_NOTBOL 0x00000080
-#define PCRE_NOTEOL 0x00000100
-#define PCRE_UNGREEDY 0x00000200
-#define PCRE_NOTEMPTY 0x00000400
-#define PCRE_UTF8 0x00000800
-#define PCRE_NO_AUTO_CAPTURE 0x00001000
-#define PCRE_NO_UTF8_CHECK 0x00002000
-#define PCRE_AUTO_CALLOUT 0x00004000
-#define PCRE_PARTIAL_SOFT 0x00008000
+both, so we keep them all distinct. However, almost all the bits in the options
+word are now used. In the long run, we may have to re-use some of the
+compile-time only bits for runtime options, or vice versa. */
+
+#define PCRE_CASELESS 0x00000001 /* Compile */
+#define PCRE_MULTILINE 0x00000002 /* Compile */
+#define PCRE_DOTALL 0x00000004 /* Compile */
+#define PCRE_EXTENDED 0x00000008 /* Compile */
+#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */
+#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */
+#define PCRE_EXTRA 0x00000040 /* Compile */
+#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */
+#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
+#define PCRE_UNGREEDY 0x00000200 /* Compile */
+#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
+#define PCRE_UTF8 0x00000800 /* Compile */
+#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
+#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
+#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
+#define PCRE_PARTIAL_SOFT 0x00008000 /* Exec, DFA exec */
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
-#define PCRE_DFA_SHORTEST 0x00010000
-#define PCRE_DFA_RESTART 0x00020000
-#define PCRE_FIRSTLINE 0x00040000
-#define PCRE_DUPNAMES 0x00080000
-#define PCRE_NEWLINE_CR 0x00100000
-#define PCRE_NEWLINE_LF 0x00200000
-#define PCRE_NEWLINE_CRLF 0x00300000
-#define PCRE_NEWLINE_ANY 0x00400000
-#define PCRE_NEWLINE_ANYCRLF 0x00500000
-#define PCRE_BSR_ANYCRLF 0x00800000
-#define PCRE_BSR_UNICODE 0x01000000
-#define PCRE_JAVASCRIPT_COMPAT 0x02000000
-#define PCRE_NO_START_OPTIMIZE 0x04000000
-#define PCRE_NO_START_OPTIMISE 0x04000000
-#define PCRE_PARTIAL_HARD 0x08000000
-#define PCRE_NOTEMPTY_ATSTART 0x10000000
+#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */
+#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */
+#define PCRE_FIRSTLINE 0x00040000 /* Compile */
+#define PCRE_DUPNAMES 0x00080000 /* Compile */
+#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_CRLF 0x00300000 /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_ANY 0x00400000 /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */
+#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */
+#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */
+#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */
+#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */
+#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */
+#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */
+#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */
+#define PCRE_UCP 0x20000000 /* Compile */
/* Exec-time and get/set-time error codes */
@@ -158,6 +161,8 @@ both, so we keep them all distinct. */
#define PCRE_ERROR_RECURSIONLIMIT (-21)
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
#define PCRE_ERROR_BADNEWLINE (-23)
+#define PCRE_ERROR_BADOFFSET (-24)
+#define PCRE_ERROR_SHORTUTF8 (-25)
/* Request types for pcre_fullinfo() */
@@ -200,6 +205,7 @@ these bits, just add new ones on the end, in order to remain compatible. */
#define PCRE_EXTRA_CALLOUT_DATA 0x0004
#define PCRE_EXTRA_TABLES 0x0008
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
+#define PCRE_EXTRA_MARK 0x0020
/* Types */
@@ -225,6 +231,7 @@ typedef struct pcre_extra {
void *callout_data; /* Data passed back in callouts */
const unsigned char *tables; /* Pointer to character tables */
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
+ unsigned char **mark; /* For passing back a mark pointer */
} pcre_extra;
/* The structure for passing out data via the pcre_callout_function. We use a
diff --git a/glib/pcre/pcre_chartables.c b/glib/pcre/pcre_chartables.c
index ae45db0ca..9117ae3c7 100644
--- a/glib/pcre/pcre_chartables.c
+++ b/glib/pcre/pcre_chartables.c
@@ -14,7 +14,7 @@ example ISO-8859-1. When dftables is run, it creates these tables in the
current locale. If PCRE is configured with --enable-rebuild-chartables, this
happens automatically.
-The following #includes are present because without the gcc 4.x may remove the
+The following #includes are present because without them gcc 4.x may remove the
array definition from the final binary if PCRE is built into a static library
and dead code stripping is activated. This leads to link errors. Pulling in the
header ensures that the array gets flagged as "someone outside this compilation
diff --git a/glib/pcre/pcre_compile.c b/glib/pcre/pcre_compile.c
index a00a99017..f0bae53ee 100644
--- a/glib/pcre/pcre_compile.c
+++ b/glib/pcre/pcre_compile.c
@@ -124,7 +124,7 @@ static const short int escapes[] = {
-ESC_H, 0,
0, -ESC_K,
0, 0,
- 0, 0,
+ -ESC_N, 0,
-ESC_P, -ESC_Q,
-ESC_R, -ESC_S,
0, 0,
@@ -171,7 +171,7 @@ static const short int escapes[] = {
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
-/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
+/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
@@ -188,11 +188,14 @@ string is built from string macros so that it works in UTF-8 mode on EBCDIC
platforms. */
typedef struct verbitem {
- int len;
- int op;
+ int len; /* Length of verb name */
+ int op; /* Op when no arg, or -1 if arg mandatory */
+ int op_arg; /* Op when arg present, or -1 if not allowed */
} verbitem;
static const char verbnames[] =
+ "\0" /* Empty name is a shorthand for MARK */
+ STRING_MARK0
STRING_ACCEPT0
STRING_COMMIT0
STRING_F0
@@ -202,13 +205,15 @@ static const char verbnames[] =
STRING_THEN;
static const verbitem verbs[] = {
- { 6, OP_ACCEPT },
- { 6, OP_COMMIT },
- { 1, OP_FAIL },
- { 4, OP_FAIL },
- { 5, OP_PRUNE },
- { 4, OP_SKIP },
- { 4, OP_THEN }
+ { 0, -1, OP_MARK },
+ { 4, -1, OP_MARK },
+ { 6, OP_ACCEPT, -1 },
+ { 6, OP_COMMIT, -1 },
+ { 1, OP_FAIL, -1 },
+ { 4, OP_FAIL, -1 },
+ { 5, OP_PRUNE, OP_PRUNE_ARG },
+ { 4, OP_SKIP, OP_SKIP_ARG },
+ { 4, OP_THEN, OP_THEN_ARG }
};
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
@@ -256,6 +261,53 @@ static const int posix_class_maps[] = {
cbit_xdigit,-1, 0 /* xdigit */
};
+/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
+substitutes must be in the order of the names, defined above, and there are
+both positive and negative cases. NULL means no substitute. */
+
+#ifdef SUPPORT_UCP
+static const uschar *substitutes[] = { /* Indexed by (-escape) - ESC_DU in check_auto_possessive() */
+ (uschar *)"\\P{Nd}", /* \D */
+ (uschar *)"\\p{Nd}", /* \d */
+ (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
+ (uschar *)"\\p{Xsp}", /* \s */
+ (uschar *)"\\P{Xwd}", /* \W */
+ (uschar *)"\\p{Xwd}" /* \w */
+};
+
+static const uschar *posix_substitutes[] = { /* 14 positive entries, then 14 negated entries in the same order */
+ (uschar *)"\\p{L}", /* alpha */
+ (uschar *)"\\p{Ll}", /* lower */
+ (uschar *)"\\p{Lu}", /* upper */
+ (uschar *)"\\p{Xan}", /* alnum */
+ NULL, /* ascii */
+ (uschar *)"\\h", /* blank */
+ NULL, /* cntrl */
+ (uschar *)"\\p{Nd}", /* digit */
+ NULL, /* graph */
+ NULL, /* print */
+ NULL, /* punct */
+ (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
+ (uschar *)"\\p{Xwd}", /* word */
+ NULL, /* xdigit */
+ /* Negated cases: each entry is the \P (or \H) counterpart of the positive entry at the same offset */
+ (uschar *)"\\P{L}", /* ^alpha */
+ (uschar *)"\\P{Ll}", /* ^lower */
+ (uschar *)"\\P{Lu}", /* ^upper */
+ (uschar *)"\\P{Xan}", /* ^alnum */
+ NULL, /* ^ascii */
+ (uschar *)"\\H", /* ^blank */
+ NULL, /* ^cntrl */
+ (uschar *)"\\P{Nd}", /* ^digit */
+ NULL, /* ^graph */
+ NULL, /* ^print */
+ NULL, /* ^punct */
+ (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
+ (uschar *)"\\P{Xwd}", /* ^word */
+ NULL /* ^xdigit */
+};
+#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
+#endif
#define STRING(a) # a
#define XSTRING(s) STRING(s)
@@ -319,7 +371,7 @@ static const char error_texts[] =
/* 35 */
"invalid condition (?(0)\0"
"\\C not allowed in lookbehind assertion\0"
- "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
+ "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
"number after (?C is > 255\0"
"closing ) for (?C expected\0"
/* 40 */
@@ -345,7 +397,7 @@ static const char error_texts[] =
"inconsistent NEWLINE options\0"
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
"a numbered reference must not be zero\0"
- "(*VERB) with an argument is not supported\0"
+ "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
/* 60 */
"(*VERB) not recognized\0"
"number is too big\0"
@@ -353,7 +405,11 @@ static const char error_texts[] =
"digit expected after (?+\0"
"] is an invalid data character in JavaScript compatibility mode\0"
/* 65 */
- "different names for subpatterns of the same number are not allowed\0";
+ "different names for subpatterns of the same number are not allowed\0"
+ "(*MARK) must have an argument\0"
+ "this version of PCRE is not compiled with PCRE_UCP support\0"
+ "\\c must be followed by an ASCII character\0"
+ ;
/* Definition to allow mutual recursion */
@@ -456,7 +512,6 @@ else
case CHAR_l:
case CHAR_L:
- case CHAR_N:
case CHAR_u:
case CHAR_U:
*errorcodeptr = ERR37;
@@ -657,7 +712,8 @@ else
break;
/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
- This coding is ASCII-specific, but then the whole concept of \cx is
+ An error is given if the byte following \c is not an ASCII character. This
+ coding is ASCII-specific, but then the whole concept of \cx is
ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
case CHAR_c:
@@ -667,11 +723,15 @@ else
*errorcodeptr = ERR2;
break;
}
-
-#ifndef EBCDIC /* ASCII/UTF-8 coding */
+#ifndef EBCDIC /* ASCII/UTF-8 coding */
+ if (c > 127) /* Excludes all non-ASCII in either mode */
+ {
+ *errorcodeptr = ERR68;
+ break;
+ }
if (c >= CHAR_a && c <= CHAR_z) c -= 32;
c ^= 0x40;
-#else /* EBCDIC coding */
+#else /* EBCDIC coding */
if (c >= CHAR_a && c <= CHAR_z) c += 64;
c ^= 0xC0;
#endif
@@ -694,6 +754,19 @@ else
}
}
+/* Perl supports \N{name} for character names, as well as plain \N for "not
+newline". PCRE does not support \N{name}. */
+
+if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
+ *errorcodeptr = ERR37;
+
+/* If PCRE_UCP is set, we change the values for \d etc. */
+
+if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
+ c -= (ESC_DU - ESC_D);
+
+/* Set the pointer to the final character before returning. */
+
*ptrptr = ptr;
return c;
}
@@ -902,10 +975,21 @@ top-level call starts at the beginning of the pattern. All other calls must
start at a parenthesis. It scans along a pattern's text looking for capturing
subpatterns, and counting them. If it finds a named pattern that matches the
name it is given, it returns its number. Alternatively, if the name is NULL, it
-returns when it reaches a given numbered subpattern. We know that if (?P< is
-encountered, the name will be terminated by '>' because that is checked in the
-first pass. Recursion is used to keep track of subpatterns that reset the
-capturing group numbers - the (?| feature.
+returns when it reaches a given numbered subpattern. Recursion is used to keep
+track of subpatterns that reset the capturing group numbers - the (?| feature.
+
+This function was originally called only from the second pass, in which we know
+that if (?< or (?' or (?P< is encountered, the name will be correctly
+terminated because that is checked in the first pass. There is now one call to
+this function in the first pass, to check for a recursive back reference by
+name (so that we can make the whole group atomic). In this case, we need check
+only up to the current position in the pattern, and that is still OK because
+any previous occurrences will have been checked. To make this work, the test
+for "end of pattern" is a check against cd->end_pattern in the main loop,
+instead of looking for a binary zero. This means that the special first-pass
+call can adjust cd->end_pattern temporarily. (Checks for binary zero while
+processing items within the loop are OK, because afterwards the main loop will
+terminate.)
Arguments:
ptrptr address of the current character pointer (updated)
@@ -913,6 +997,7 @@ Arguments:
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
count pointer to the current capturing subpattern number (updated)
Returns: the number of the named subpattern, or -1 if not found
@@ -920,7 +1005,7 @@ Returns: the number of the named subpattern, or -1 if not found
static int
find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
- BOOL xmode, int *count)
+ BOOL xmode, BOOL utf8, int *count)
{
uschar *ptr = *ptrptr;
int start_count = *count;
@@ -932,25 +1017,39 @@ dealing with. The very first call may not start with a parenthesis. */
if (ptr[0] == CHAR_LEFT_PARENTHESIS)
{
- if (ptr[1] == CHAR_QUESTION_MARK &&
- ptr[2] == CHAR_VERTICAL_LINE)
+ /* Handle specials such as (*SKIP) or (*UTF8) etc. */
+
+ if (ptr[1] == CHAR_ASTERISK) ptr += 2;
+
+ /* Handle a normal, unnamed capturing parenthesis. */
+
+ else if (ptr[1] != CHAR_QUESTION_MARK)
+ {
+ *count += 1;
+ if (name == NULL && *count == lorn) return *count;
+ ptr++;
+ }
+
+ /* All cases now have (? at the start. Remember when we are in a group
+ where the parenthesis numbers are duplicated. */
+
+ else if (ptr[2] == CHAR_VERTICAL_LINE)
{
ptr += 3;
dup_parens = TRUE;
}
- /* Handle a normal, unnamed capturing parenthesis */
+ /* Handle comments; all characters are allowed until a ket is reached. */
- else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
+ else if (ptr[2] == CHAR_NUMBER_SIGN)
{
- *count += 1;
- if (name == NULL && *count == lorn) return *count;
- ptr++;
+ for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
+ goto FAIL_EXIT;
}
/* Handle a condition. If it is an assertion, just carry on so that it
is processed as normal. If not, skip to the closing parenthesis of the
- condition (there can't be any nested parens. */
+ condition (there can't be any nested parens). */
else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
{
@@ -962,7 +1061,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS)
}
}
- /* We have either (? or (* and not a condition */
+ /* Start with (? but not a condition. */
else
{
@@ -991,9 +1090,11 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS)
}
/* Past any initial parenthesis handling, scan for parentheses or vertical
-bars. */
+bars. Stop if we get to cd->end_pattern. Note that this is important for the
+first-pass call when this value is temporarily adjusted to stop at the current
+position. So DO NOT change this to a test for binary zero. */
-for (; *ptr != 0; ptr++)
+for (; ptr < cd->end_pattern; ptr++)
{
/* Skip over backslashed characters and also entire \Q...\E */
@@ -1067,7 +1168,15 @@ for (; *ptr != 0; ptr++)
if (xmode && *ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+ ptr++;
+ while (*ptr != 0)
+ {
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
if (*ptr == 0) goto FAIL_EXIT;
continue;
}
@@ -1076,7 +1185,7 @@ for (; *ptr != 0; ptr++)
if (*ptr == CHAR_LEFT_PARENTHESIS)
{
- int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+ int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
if (rc > 0) return rc;
if (*ptr == 0) goto FAIL_EXIT;
}
@@ -1084,8 +1193,7 @@ for (; *ptr != 0; ptr++)
else if (*ptr == CHAR_RIGHT_PARENTHESIS)
{
if (dup_parens && *count < hwm_count) *count = hwm_count;
- *ptrptr = ptr;
- return -1;
+ goto FAIL_EXIT;
}
else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
@@ -1123,12 +1231,14 @@ Arguments:
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
Returns: the number of the found subpattern, or -1 if not found
*/
static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
+ BOOL utf8)
{
uschar *ptr = (uschar *)cd->start_pattern;
int count = 0;
@@ -1141,7 +1251,7 @@ matching closing parens. That is why we have to have a loop. */
for (;;)
{
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+ rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
if (rc > 0 || *ptr++ == 0) break;
}
@@ -1485,7 +1595,8 @@ for (;;)
/* Otherwise, we can get the item's length from the table, except that for
repeated character types, we have to test for \p and \P, which have an extra
- two bytes of parameters. */
+ two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
+ must add in its length. */
else
{
@@ -1509,6 +1620,16 @@ for (;;)
case OP_TYPEPOSUPTO:
if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
break;
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ code += code[1];
+ break;
+
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
}
/* Add in the fixed length from the table */
@@ -1580,7 +1701,8 @@ for (;;)
/* Otherwise, we can get the item's length from the table, except that for
repeated character types, we have to test for \p and \P, which have an extra
- two bytes of parameters. */
+ two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
+ must add in its length. */
else
{
@@ -1604,6 +1726,16 @@ for (;;)
case OP_TYPEEXACT:
if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
break;
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ code += code[1];
+ break;
+
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
}
/* Add in the fixed length from the table */
@@ -1873,6 +2005,19 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
break;
#endif
+ /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
+ string. */
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ code += code[1];
+ break;
+
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
+
/* None of the remaining opcodes are required to match a character. */
default:
@@ -2093,8 +2238,8 @@ auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
{
*code++ = OP_CALLOUT;
*code++ = 255;
-PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
-PUT(code, LINK_SIZE, 0); /* Default length */
+PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
+PUT(code, LINK_SIZE, 0); /* Default length */
return code + 2*LINK_SIZE;
}
@@ -2119,7 +2264,7 @@ Returns: nothing
static void
complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
{
-int length = ptr - cd->start_pattern - GET(previous_callout, 2);
+int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
PUT(previous_callout, 2 + LINK_SIZE, length);
}
@@ -2169,6 +2314,69 @@ for (++c; c <= d; c++)
return TRUE;
}
+
+
+
+/*************************************************
+* Check a character and a property *
+*************************************************/
+
+/* This function is called by check_auto_possessive() when a property item
+is adjacent to a fixed character.
+
+Arguments:
+ c the character
+ ptype the property type
+ pdata the data for the type
+ negated TRUE if it's a negated property (\P or \p{^)
+
+Returns: TRUE if auto-possessifying is OK
+*/
+
+static BOOL
+check_char_prop(int c, int ptype, int pdata, BOOL negated)
+{ /* Decides whether character c is excluded by a property item, so that a repeat can be possessified */
+int chartype = UCD_CHARTYPE(c); /* Looked up once; most cases below test it */
+switch(ptype)
+ { /* Every case returns (c has the property) == negated */
+ case PT_LAMP: /* c has type Lu, Ll, or Lt */
+ return (chartype == ucp_Lu ||
+ chartype == ucp_Ll ||
+ chartype == ucp_Lt) == negated;
+
+ case PT_GC: /* pdata is matched against _pcre_ucp_gentype[chartype] */
+ return (pdata == _pcre_ucp_gentype[chartype]) == negated;
+
+ case PT_PC: /* pdata is matched against chartype itself */
+ return (pdata == chartype) == negated;
+
+ case PT_SC: /* pdata is matched against UCD_SCRIPT(c) */
+ return (pdata == UCD_SCRIPT(c)) == negated;
+
+ /* These are specials: they ignore pdata and test gentypes and/or explicit characters */
+
+ case PT_ALNUM: /* gentype L or N */
+ return (_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N) == negated;
+
+ case PT_SPACE: /* Perl space: gentype Z, or HT, NL, FF, CR (no VT) */
+ return (_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+ == negated;
+
+ case PT_PXSPACE: /* POSIX space: as Perl space, plus CHAR_VT */
+ return (_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR)
+ == negated;
+
+ case PT_WORD: /* gentype L or N, or CHAR_UNDERSCORE */
+ return (_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE) == negated;
+ }
+return FALSE; /* ptype not handled above: report that possessifying is not OK */
+}
#endif /* SUPPORT_UCP */
@@ -2182,10 +2390,8 @@ whether the next thing could possibly match the repeated item. If not, it makes
sense to automatically possessify the repeated item.
Arguments:
- op_code the repeated op code
- this data for this item, depends on the opcode
+ previous pointer to the repeated opcode
utf8 TRUE in UTF-8 mode
- utf8_char used for utf8 character bytes, NULL if not relevant
ptr next character in pattern
options options bits
cd contains pointers to tables etc.
@@ -2194,10 +2400,11 @@ Returns: TRUE if possessifying is wanted
*/
static BOOL
-check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
- const uschar *ptr, int options, compile_data *cd)
+check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
+ int options, compile_data *cd)
{
-int next;
+int c, next;
+int op_code = *previous++;
/* Skip whitespace and comments in extended mode */
@@ -2208,8 +2415,15 @@ if ((options & PCRE_EXTENDED) != 0)
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
@@ -2245,8 +2459,15 @@ if ((options & PCRE_EXTENDED) != 0)
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
@@ -2258,23 +2479,18 @@ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
return FALSE;
-/* Now compare the next item with the previous opcode. If the previous is a
-positive single character match, "item" either contains the character or, if
-"item" is greater than 127 in utf8 mode, the character's bytes are in
-utf8_char. */
-
-
-/* Handle cases when the next item is a character. */
+/* Now compare the next item with the previous opcode. First, handle cases when
+the next item is a character. */
if (next >= 0) switch(op_code)
{
case OP_CHAR:
#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+ GETCHARTEST(c, previous);
#else
- (void)(utf8_char); /* Keep compiler happy by referencing function argument */
+ c = *previous;
#endif
- return item != next;
+ return c != next;
/* For CHARNC (caseless character) we must check the other case. If we have
Unicode property support, we can use it to test the other case of
@@ -2282,9 +2498,11 @@ if (next >= 0) switch(op_code)
case OP_CHARNC:
#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+ GETCHARTEST(c, previous);
+#else
+ c = *previous;
#endif
- if (item == next) return FALSE;
+ if (c == next) return FALSE;
#ifdef SUPPORT_UTF8
if (utf8)
{
@@ -2295,16 +2513,16 @@ if (next >= 0) switch(op_code)
#else
othercase = NOTACHAR;
#endif
- return (unsigned int)item != othercase;
+ return (unsigned int)c != othercase;
}
else
#endif /* SUPPORT_UTF8 */
- return (item != cd->fcc[next]); /* Non-UTF-8 mode */
+ return (c != cd->fcc[next]); /* Non-UTF-8 mode */
- /* For OP_NOT, "item" must be a single-byte character. */
+ /* For OP_NOT, its data is always a single-byte character. */
case OP_NOT:
- if (item == next) return TRUE;
+ if ((c = *previous) == next) return TRUE;
if ((options & PCRE_CASELESS) == 0) return FALSE;
#ifdef SUPPORT_UTF8
if (utf8)
@@ -2316,11 +2534,14 @@ if (next >= 0) switch(op_code)
#else
othercase = NOTACHAR;
#endif
- return (unsigned int)item == othercase;
+ return (unsigned int)c == othercase;
}
else
#endif /* SUPPORT_UTF8 */
- return (item == cd->fcc[next]); /* Non-UTF-8 mode */
+ return (c == cd->fcc[next]); /* Non-UTF-8 mode */
+
+ /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
+ When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
case OP_DIGIT:
return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
@@ -2363,11 +2584,12 @@ if (next >= 0) switch(op_code)
case 0x202f:
case 0x205f:
case 0x3000:
- return op_code != OP_HSPACE;
+ return op_code == OP_NOT_HSPACE;
default:
- return op_code == OP_HSPACE;
+ return op_code != OP_NOT_HSPACE;
}
+ case OP_ANYNL:
case OP_VSPACE:
case OP_NOT_VSPACE:
switch(next)
@@ -2379,48 +2601,62 @@ if (next >= 0) switch(op_code)
case 0x85:
case 0x2028:
case 0x2029:
- return op_code != OP_VSPACE;
+ return op_code == OP_NOT_VSPACE;
default:
- return op_code == OP_VSPACE;
+ return op_code != OP_NOT_VSPACE;
}
+#ifdef SUPPORT_UCP
+ case OP_PROP:
+ return check_char_prop(next, previous[0], previous[1], FALSE);
+
+ case OP_NOTPROP:
+ return check_char_prop(next, previous[0], previous[1], TRUE);
+#endif
+
default:
return FALSE;
}
-/* Handle the case when the next item is \d, \s, etc. */
+/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
+is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
+generated only when PCRE_UCP is *not* set, that is, when only ASCII
+characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
+replaced by OP_PROP codes when PCRE_UCP is set. */
switch(op_code)
{
case OP_CHAR:
case OP_CHARNC:
#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+ GETCHARTEST(c, previous);
+#else
+ c = *previous;
#endif
switch(-next)
{
case ESC_d:
- return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
case ESC_D:
- return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
case ESC_s:
- return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
case ESC_S:
- return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
case ESC_w:
- return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
case ESC_W:
- return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
case ESC_h:
case ESC_H:
- switch(item)
+ switch(c)
{
case 0x09:
case 0x20:
@@ -2448,7 +2684,7 @@ switch(op_code)
case ESC_v:
case ESC_V:
- switch(item)
+ switch(c)
{
case 0x0a:
case 0x0b:
@@ -2462,38 +2698,92 @@ switch(op_code)
return -next == ESC_v;
}
+ /* When PCRE_UCP is set, these values get generated for \d etc. Find
+ their substitutions and process them. The result will always be either
+ -ESC_p or -ESC_P. Then fall through to process those values. */
+
+#ifdef SUPPORT_UCP
+ case ESC_du:
+ case ESC_DU:
+ case ESC_wu:
+ case ESC_WU:
+ case ESC_su:
+ case ESC_SU:
+ {
+ int temperrorcode = 0;
+ ptr = substitutes[-next - ESC_DU];
+ next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
+ if (temperrorcode != 0) return FALSE;
+ ptr++; /* For compatibility */
+ }
+ /* Fall through */
+
+ case ESC_p:
+ case ESC_P:
+ {
+ int ptype, pdata, errorcodeptr;
+ BOOL negated;
+
+ ptr--; /* Make ptr point at the p or P */
+ ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
+ if (ptype < 0) return FALSE;
+ ptr++; /* Point past the final curly ket */
+
+ /* If the property item is optional, we have to give up. (When generated
+ from \d etc by PCRE_UCP, this test will have been applied much earlier,
+ to the original \d etc. At this point, ptr will point to a zero byte.) */
+
+ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
+ strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
+ return FALSE;
+
+ /* Do the property check. */
+
+ return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
+ }
+#endif
+
default:
return FALSE;
}
+ /* In principle, support for Unicode properties should be integrated here as
+ well. It means re-organizing the above code so as to get hold of the property
+ values before switching on the op-code. However, I wonder how many patterns
+ combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
+ these op-codes are never generated.) */
+
case OP_DIGIT:
return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
- next == -ESC_h || next == -ESC_v;
+ next == -ESC_h || next == -ESC_v || next == -ESC_R;
case OP_NOT_DIGIT:
return next == -ESC_d;
case OP_WHITESPACE:
- return next == -ESC_S || next == -ESC_d || next == -ESC_w;
+ return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
case OP_NOT_WHITESPACE:
return next == -ESC_s || next == -ESC_h || next == -ESC_v;
case OP_HSPACE:
- return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
+ return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
+ next == -ESC_w || next == -ESC_v || next == -ESC_R;
case OP_NOT_HSPACE:
return next == -ESC_h;
/* Can't have \S in here because VT matches \S (Perl anomaly) */
+ case OP_ANYNL:
case OP_VSPACE:
return next == -ESC_V || next == -ESC_d || next == -ESC_w;
case OP_NOT_VSPACE:
- return next == -ESC_v;
+ return next == -ESC_v || next == -ESC_R;
case OP_WORDCHAR:
- return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
+ return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
+ next == -ESC_v || next == -ESC_R;
case OP_NOT_WORDCHAR:
return next == -ESC_w || next == -ESC_d;
@@ -2557,6 +2847,7 @@ BOOL inescq = FALSE;
BOOL groupsetfirstbyte = FALSE;
const uschar *ptr = *ptrptr;
const uschar *tempptr;
+const uschar *nestptr = NULL;
uschar *previous = NULL;
uschar *previous_callout = NULL;
uschar *save_hwm = NULL;
@@ -2627,6 +2918,16 @@ for (;; ptr++)
c = *ptr;
+ /* If we are at the end of a nested substitution, revert to the outer level
+ string. Nesting only happens one level deep. */
+
+ if (c == 0 && nestptr != NULL)
+ {
+ ptr = nestptr;
+ nestptr = NULL;
+ c = *ptr;
+ }
+
/* If we are in the pre-compile phase, accumulate the length used for the
previous cycle of this loop. */
@@ -2657,7 +2958,7 @@ for (;; ptr++)
goto FAILED;
}
- *lengthptr += code - last_code;
+ *lengthptr += (int)(code - last_code);
DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
/* If "previous" is set and it is not at the start of the work space, move
@@ -2739,9 +3040,14 @@ for (;; ptr++)
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
}
if (*ptr != 0) continue;
@@ -2775,7 +3081,7 @@ for (;; ptr++)
*errorcodeptr = ERR20;
goto FAILED;
}
- *lengthptr += code - last_code; /* To include callout length */
+ *lengthptr += (int)(code - last_code); /* To include callout length */
DPRINTF((">> end branch\n"));
}
return TRUE;
@@ -2980,7 +3286,7 @@ for (;; ptr++)
ptr++;
}
- posix_class = check_posix_name(ptr, tempptr - ptr);
+ posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
if (posix_class < 0)
{
*errorcodeptr = ERR30;
@@ -2994,10 +3300,25 @@ for (;; ptr++)
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
posix_class = 0;
- /* We build the bit map for the POSIX class in a chunk of local store
- because we may be adding and subtracting from it, and we don't want to
- subtract bits that may be in the main map already. At the end we or the
- result into the bit map that is being built. */
+ /* When PCRE_UCP is set, some of the POSIX classes are converted to
+ different escape sequences that use Unicode properties. */
+
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_UCP) != 0)
+ {
+ int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
+ if (posix_substitutes[pc] != NULL)
+ {
+ nestptr = tempptr + 1;
+ ptr = posix_substitutes[pc] - 1;
+ continue;
+ }
+ }
+#endif
+ /* In the non-UCP case, we build the bit map for the POSIX class in a
+ chunk of local store because we may be adding and subtracting from it,
+ and we don't want to subtract bits that may be in the main map already.
+ At the end we or the result into the bit map that is being built. */
posix_class *= 3;
@@ -3041,19 +3362,18 @@ for (;; ptr++)
/* Backslash may introduce a single character, or it may introduce one
of the specials, which just set a flag. The sequence \b is a special
- case. Inside a class (and only there) it is treated as backspace.
- Elsewhere it marks a word boundary. Other escapes have preset maps ready
- to 'or' into the one we are building. We assume they have more than one
- character in them, so set class_charcount bigger than one. */
+ case. Inside a class (and only there) it is treated as backspace. We
+ assume that other escapes have more than one character in them, so set
+ class_charcount bigger than one. Unrecognized escapes fall through and
+ are either treated as literal characters (by default), or are faulted if
+ PCRE_EXTRA is set. */
if (c == CHAR_BACKSLASH)
{
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
- if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
- else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
- else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
+ if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
else if (-c == ESC_Q) /* Handle start of quoted string */
{
if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
@@ -3070,10 +3390,20 @@ for (;; ptr++)
register const uschar *cbits = cd->cbits;
class_charcount += 2; /* Greater than 1 is what matters */
- /* Save time by not doing this in the pre-compile phase. */
-
- if (lengthptr == NULL) switch (-c)
+ switch (-c)
{
+#ifdef SUPPORT_UCP
+ case ESC_du: /* These are the values given for \d etc */
+ case ESC_DU: /* when PCRE_UCP is set. We replace the */
+ case ESC_wu: /* escape sequence with an appropriate \p */
+ case ESC_WU: /* or \P to test Unicode properties instead */
+ case ESC_su: /* of the default ASCII testing. */
+ case ESC_SU:
+ nestptr = ptr;
+ ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
+ class_charcount -= 2; /* Undo! */
+ continue;
+#endif
case ESC_d:
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
continue;
@@ -3092,9 +3422,14 @@ for (;; ptr++)
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
continue;
+ /* Perl 5.004 onwards omits VT from \s, but we must preserve it
+ if it was previously set by something earlier in the character
+ class. */
+
case ESC_s:
- for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
- classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
+ classbits[0] |= cbits[cbit_space];
+ classbits[1] |= cbits[cbit_space+1] & ~0x08;
+ for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
continue;
case ESC_S:
@@ -3103,20 +3438,7 @@ for (;; ptr++)
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
continue;
- default: /* Not recognized; fall through */
- break; /* Need "default" setting to stop compiler warning. */
- }
-
- /* In the pre-compile phase, just do the recognition. */
-
- else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
- c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
-
- /* We need to deal with \H, \h, \V, and \v in both phases because
- they use extra memory. */
-
- if (-c == ESC_h)
- {
+ case ESC_h:
SETBIT(classbits, 0x09); /* VT */
SETBIT(classbits, 0x20); /* SPACE */
SETBIT(classbits, 0xa0); /* NSBP */
@@ -3140,10 +3462,8 @@ for (;; ptr++)
}
#endif
continue;
- }
- if (-c == ESC_H)
- {
+ case ESC_H:
for (c = 0; c < 32; c++)
{
int x = 0xff;
@@ -3185,10 +3505,8 @@ for (;; ptr++)
}
#endif
continue;
- }
- if (-c == ESC_v)
- {
+ case ESC_v:
SETBIT(classbits, 0x0a); /* LF */
SETBIT(classbits, 0x0b); /* VT */
SETBIT(classbits, 0x0c); /* FF */
@@ -3204,10 +3522,8 @@ for (;; ptr++)
}
#endif
continue;
- }
- if (-c == ESC_V)
- {
+ case ESC_V:
for (c = 0; c < 32; c++)
{
int x = 0xff;
@@ -3237,38 +3553,38 @@ for (;; ptr++)
}
#endif
continue;
- }
-
- /* We need to deal with \P and \p in both phases. */
#ifdef SUPPORT_UCP
- if (-c == ESC_p || -c == ESC_P)
- {
- BOOL negated;
- int pdata;
- int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
- if (ptype < 0) goto FAILED;
- class_utf8 = TRUE;
- *class_utf8data++ = ((-c == ESC_p) != negated)?
- XCL_PROP : XCL_NOTPROP;
- *class_utf8data++ = ptype;
- *class_utf8data++ = pdata;
- class_charcount -= 2; /* Not a < 256 character */
- continue;
- }
+ case ESC_p:
+ case ESC_P:
+ {
+ BOOL negated;
+ int pdata;
+ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
+ if (ptype < 0) goto FAILED;
+ class_utf8 = TRUE;
+ *class_utf8data++ = ((-c == ESC_p) != negated)?
+ XCL_PROP : XCL_NOTPROP;
+ *class_utf8data++ = ptype;
+ *class_utf8data++ = pdata;
+ class_charcount -= 2; /* Not a < 256 character */
+ continue;
+ }
#endif
- /* Unrecognized escapes are faulted if PCRE is running in its
- strict mode. By default, for compatibility with Perl, they are
- treated as literals. */
+ /* Unrecognized escapes are faulted if PCRE is running in its
+ strict mode. By default, for compatibility with Perl, they are
+ treated as literals. */
- if ((options & PCRE_EXTRA) != 0)
- {
- *errorcodeptr = ERR7;
- goto FAILED;
+ default:
+ if ((options & PCRE_EXTRA) != 0)
+ {
+ *errorcodeptr = ERR7;
+ goto FAILED;
+ }
+ class_charcount -= 2; /* Undo the default count from above */
+ c = *ptr; /* Get the final character and fall through */
+ break;
}
-
- class_charcount -= 2; /* Undo the default count from above */
- c = *ptr; /* Get the final character and fall through */
}
/* Fall through if we have a single character (c >= 0). This may be
@@ -3338,14 +3654,11 @@ for (;; ptr++)
d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
- /* \b is backspace; \X is literal X; \R is literal R; any other
- special means the '-' was literal */
+ /* \b is backspace; any other special means the '-' was literal */
if (d < 0)
{
- if (d == -ESC_b) d = CHAR_BS;
- else if (d == -ESC_X) d = CHAR_X;
- else if (d == -ESC_R) d = CHAR_R; else
+ if (d == -ESC_b) d = CHAR_BS; else
{
ptr = oldptr;
goto LONE_SINGLE_CHARACTER; /* A few lines below */
@@ -3511,35 +3824,23 @@ for (;; ptr++)
}
}
- /* Loop until ']' reached. This "while" is the end of the "do" above. */
+ /* Loop until ']' reached. This "while" is the end of the "do" far above.
+ If we are at the end of an internal nested string, revert to the outer
+ string. */
- while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
+ while (((c = *(++ptr)) != 0 ||
+ (nestptr != NULL &&
+ (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
+ (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
- if (c == 0) /* Missing terminating ']' */
+ /* Check for missing terminating ']' */
+
+ if (c == 0)
{
*errorcodeptr = ERR6;
goto FAILED;
}
-
-/* This code has been disabled because it would mean that \s counts as
-an explicit \r or \n reference, and that's not really what is wanted. Now
-we set the flag only if there is a literal "\r" or "\n" in the class. */
-
-#if 0
- /* Remember whether \r or \n are in this class */
-
- if (negate_class)
- {
- if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
- }
- else
- {
- if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
- }
-#endif
-
-
/* If class_charcount is 1, we saw precisely one character whose value is
less than 256. As long as there were no characters >= 128 and there was no
use of \p or \P, in other words, no use of any XCLASS features, we can
@@ -3603,13 +3904,14 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
/* If there are characters with values > 255, we have to compile an
extended class, with its own opcode, unless there was a negated special
- such as \S in the class, because in that case all characters > 255 are in
- the class, so any that were explicitly given as well can be ignored. If
- (when there are explicit characters > 255 that must be listed) there are no
- characters < 256, we can omit the bitmap in the actual compiled code. */
+ such as \S in the class, and PCRE_UCP is not set, because in that case all
+ characters > 255 are in the class, so any that were explicitly given as
+ well can be ignored. If (when there are explicit characters > 255 that must
+ be listed) there are no characters < 256, we can omit the bitmap in the
+ actual compiled code. */
#ifdef SUPPORT_UTF8
- if (class_utf8 && !should_flip_negation)
+ if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
{
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
@@ -3635,10 +3937,11 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
}
#endif
- /* If there are no characters > 255, set the opcode to OP_CLASS or
- OP_NCLASS, depending on whether the whole class was negated and whether
- there were negative specials such as \S in the class. Then copy the 32-byte
- map into the code vector, negating it if necessary. */
+ /* If there are no characters > 255, or they are all to be included or
+ excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
+ whole class was negated and whether there were negative specials such as \S
+ (non-UCP) in the class. Then copy the 32-byte map into the code vector,
+ negating it if necessary. */
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
if (negate_class)
@@ -3762,8 +4065,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
- options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
@@ -3784,7 +4086,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
c = previous[1];
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
@@ -3808,7 +4110,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
@@ -4018,7 +4320,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
{
register int i;
int ketoffset = 0;
- int len = code - previous;
+ int len = (int)(code - previous);
uschar *bralink = NULL;
/* Repeating a DEFINE group is pointless */
@@ -4039,7 +4341,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
{
register uschar *ket = previous;
do ket += GET(ket, 1); while (*ket != OP_KET);
- ketoffset = code - ket;
+ ketoffset = (int)(code - ket);
}
/* The case of a zero minimum is special because of the need to stick
@@ -4107,7 +4409,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
/* We chain together the bracket offset fields that have to be
filled in later when the ends of the brackets are reached. */
- offset = (bralink == NULL)? 0 : previous - bralink;
+ offset = (bralink == NULL)? 0 : (int)(previous - bralink);
bralink = previous;
PUTINC(previous, 0, offset);
}
@@ -4216,7 +4518,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
{
int offset;
*code++ = OP_BRA;
- offset = (bralink == NULL)? 0 : code - bralink;
+ offset = (bralink == NULL)? 0 : (int)(code - bralink);
bralink = code;
PUTINC(code, 0, offset);
}
@@ -4237,7 +4539,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
while (bralink != NULL)
{
int oldlinkoffset;
- int offset = code - bralink + 1;
+ int offset = (int)(code - bralink + 1);
uschar *bra = code - offset;
oldlinkoffset = GET(bra, 1);
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
@@ -4325,7 +4627,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
#endif
}
- len = code - tempcode;
+ len = (int)(code - tempcode);
if (len > 0) switch (*tempcode)
{
case OP_STAR: *tempcode = OP_POSSTAR; break;
@@ -4384,24 +4686,34 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
/* First deal with various "verbs" that can be introduced by '*'. */
- if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
+ if (*(++ptr) == CHAR_ASTERISK &&
+ ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
{
int i, namelen;
+ int arglen = 0;
const char *vn = verbnames;
- const uschar *name = ++ptr;
+ const uschar *name = ptr + 1;
+ const uschar *arg = NULL;
previous = NULL;
while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
+ namelen = (int)(ptr - name);
+
if (*ptr == CHAR_COLON)
{
- *errorcodeptr = ERR59; /* Not supported */
- goto FAILED;
+ arg = ++ptr;
+ while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
+ || *ptr == '_') ptr++;
+ arglen = (int)(ptr - arg);
}
+
if (*ptr != CHAR_RIGHT_PARENTHESIS)
{
*errorcodeptr = ERR60;
goto FAILED;
}
- namelen = ptr - name;
+
+ /* Scan the table of verb names */
+
for (i = 0; i < verbcount; i++)
{
if (namelen == verbs[i].len &&
@@ -4419,13 +4731,51 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
PUT2INC(code, 0, oc->number);
}
}
- *code++ = verbs[i].op;
- break;
+
+ /* Handle the cases with/without an argument */
+
+ if (arglen == 0)
+ {
+ if (verbs[i].op < 0) /* Argument is mandatory */
+ {
+ *errorcodeptr = ERR66;
+ goto FAILED;
+ }
+ *code = verbs[i].op;
+ if (*code++ == OP_THEN)
+ {
+ PUT(code, 0, code - bcptr->current_branch - 1);
+ code += LINK_SIZE;
+ }
+ }
+
+ else
+ {
+ if (verbs[i].op_arg < 0) /* Argument is forbidden */
+ {
+ *errorcodeptr = ERR59;
+ goto FAILED;
+ }
+ *code = verbs[i].op_arg;
+ if (*code++ == OP_THEN_ARG)
+ {
+ PUT(code, 0, code - bcptr->current_branch - 1);
+ code += LINK_SIZE;
+ }
+ *code++ = arglen;
+ memcpy(code, arg, arglen);
+ code += arglen;
+ *code++ = 0;
+ }
+
+ break; /* Found verb, exit loop */
}
+
vn += verbs[i].len + 1;
}
- if (i < verbcount) continue;
- *errorcodeptr = ERR60;
+
+ if (i < verbcount) continue; /* Successfully handled a verb */
+ *errorcodeptr = ERR60; /* Verb not recognized */
goto FAILED;
}
@@ -4544,7 +4894,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
recno * 10 + *ptr - CHAR_0 : -1;
ptr++;
}
- namelen = ptr - name;
+ namelen = (int)(ptr - name);
if ((terminator > 0 && *ptr++ != terminator) ||
*ptr++ != CHAR_RIGHT_PARENTHESIS)
@@ -4605,7 +4955,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
/* Search the pattern for a forward reference */
else if ((i = find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) > 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) > 0)
{
PUT2(code, 2+LINK_SIZE, i);
code[1+LINK_SIZE]++;
@@ -4740,8 +5090,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
goto FAILED;
}
*code++ = n;
- PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
- PUT(code, LINK_SIZE, 0); /* Default length */
+ PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
+ PUT(code, LINK_SIZE, 0); /* Default length */
code += 2 * LINK_SIZE;
}
previous = NULL;
@@ -4774,7 +5124,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
name = ++ptr;
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
- namelen = ptr - name;
+ namelen = (int)(ptr - name);
/* In the pre-compile phase, just do a syntax check. */
@@ -4904,13 +5254,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
NAMED_REF_OR_RECURSE:
name = ++ptr;
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
- namelen = ptr - name;
+ namelen = (int)(ptr - name);
- /* In the pre-compile phase, do a syntax check and set a dummy
- reference number. */
+ /* In the pre-compile phase, do a syntax check. We used to just set
+ a dummy reference number, because it was not used in the first pass.
+ However, with the change of recursive back references to be atomic,
+ we have to look for the number so that this state can be identified, as
+ otherwise the incorrect length is computed. If it's not a backwards
+ reference, the dummy number will do. */
if (lengthptr != NULL)
{
+ const uschar *temp;
+
if (namelen == 0)
{
*errorcodeptr = ERR62;
@@ -4926,7 +5282,22 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
*errorcodeptr = ERR48;
goto FAILED;
}
- recno = 0;
+
+ /* The name table does not exist in the first pass, so we cannot
+ do a simple search as in the code below. Instead, we have to scan the
+ pattern to find the number. It is important that we scan it only as
+ far as we have got because the syntax of named subpatterns has not
+ been checked for the rest of the pattern, and find_parens() assumes
+ correct syntax. In any case, it's a waste of resources to scan
+ further. We stop the scan at the current point by temporarily
+ adjusting the value of cd->end_pattern. */
+
+ temp = cd->end_pattern;
+ cd->end_pattern = ptr;
+ recno = find_parens(cd, name, namelen,
+ (options & PCRE_EXTENDED) != 0, utf8);
+ cd->end_pattern = temp;
+ if (recno < 0) recno = 0; /* Forward ref; set dummy number */
}
/* In the real compile, seek the name in the table. We check the name
@@ -4951,7 +5322,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
}
else if ((recno = /* Forward back reference */
find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) <= 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
@@ -5062,7 +5433,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (called == NULL)
{
if (find_parens(cd, NULL, recno,
- (options & PCRE_EXTENDED) != 0) < 0)
+ (options & PCRE_EXTENDED) != 0, utf8) < 0)
{
*errorcodeptr = ERR15;
goto FAILED;
@@ -5073,7 +5444,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
of the group. */
called = cd->start_code + recno;
- PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
+ PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
}
/* If not a forward reference, and the subpattern is still open,
@@ -5097,7 +5468,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
code += 1 + LINK_SIZE;
*code = OP_RECURSE;
- PUT(code, 1, called - cd->start_code);
+ PUT(code, 1, (int)(called - cd->start_code));
code += 1 + LINK_SIZE;
*code = OP_KET;
@@ -5208,8 +5579,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
} /* End of switch for character following (? */
} /* End of (? handling */
- /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
- all unadorned brackets become non-capturing and behave like (?:...)
+ /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
+ is set, all unadorned brackets become non-capturing and behave like (?:...)
brackets. */
else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
@@ -5401,11 +5772,12 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
/* ===================================================================*/
/* Handle metasequences introduced by \. For ones like \d, the ESC_ values
- are arranged to be the negation of the corresponding OP_values. For the
- back references, the values are ESC_REF plus the reference number. Only
- back references and those types that consume a character may be repeated.
- We can test for values between ESC_b and ESC_Z for the latter; this may
- have to change if any new ones are ever created. */
+ are arranged to be the negation of the corresponding OP_values in the
+ default case when PCRE_UCP is not set. For the back references, the values
+ are ESC_REF plus the reference number. Only back references and those types
+ that consume a character may be repeated. We can test for values between
+ ESC_b and ESC_Z for the latter; this may have to change if any new ones are
+ ever created. */
case CHAR_BACKSLASH:
tempptr = ptr;
@@ -5565,12 +5937,24 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
#endif
/* For the rest (including \X when Unicode properties are supported), we
- can obtain the OP value by negating the escape value. */
+ can obtain the OP value by negating the escape value in the default
+ situation when PCRE_UCP is not set. When it *is* set, we substitute
+ Unicode property tests. */
else
{
- previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
- *code++ = -c;
+#ifdef SUPPORT_UCP
+ if (-c >= ESC_DU && -c <= ESC_wu)
+ {
+ nestptr = ptr + 1; /* Where to resume */
+ ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
+ }
+ else
+#endif
+ {
+ previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
+ *code++ = -c;
+ }
}
continue;
}
@@ -5902,7 +6286,7 @@ for (;;)
{
if (lengthptr == NULL)
{
- int branch_length = code - last_branch;
+ int branch_length = (int)(code - last_branch);
do
{
int prev_length = GET(last_branch, 1);
@@ -5916,7 +6300,7 @@ for (;;)
/* Fill in the ket */
*code = OP_KET;
- PUT(code, 1, code - start_bracket);
+ PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE;
/* If it was a capturing subpattern, check to see if it contained any
@@ -5931,9 +6315,9 @@ for (;;)
code - start_bracket);
*start_bracket = OP_ONCE;
code += 1 + LINK_SIZE;
- PUT(start_bracket, 1, code - start_bracket);
+ PUT(start_bracket, 1, (int)(code - start_bracket));
*code = OP_KET;
- PUT(code, 1, code - start_bracket);
+ PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE;
length += 2 + 2*LINK_SIZE;
}
@@ -5988,7 +6372,7 @@ for (;;)
else
{
*code = OP_ALT;
- PUT(code, 1, code - last_branch);
+ PUT(code, 1, (int)(code - last_branch));
bc.current_branch = last_branch = code;
code += 1 + LINK_SIZE;
}
@@ -6290,8 +6674,6 @@ Returns: pointer to compiled data block, or NULL on error,
with errorptr and erroroffset set
*/
-#ifdef NOT_USED_IN_GLIB
-
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
int *erroroffset, const unsigned char *tables)
@@ -6299,7 +6681,6 @@ pcre_compile(const char *pattern, int options, const char **errorptr,
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
-#endif
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
@@ -6310,7 +6691,7 @@ int length = 1; /* For final END opcode */
int firstbyte, reqbyte, newline;
int errorcode = 0;
int skipatstart = 0;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+BOOL utf8;
size_t size;
uschar *code;
const uschar *codestart;
@@ -6380,6 +6761,10 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
{ skipatstart += 7; options |= PCRE_UTF8; continue; }
+ else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
+ { skipatstart += 6; options |= PCRE_UCP; continue; }
+ else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
+ { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
{ skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
@@ -6404,6 +6789,8 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
else break;
}
+utf8 = (options & PCRE_UTF8) != 0;
+
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
#ifdef SUPPORT_UTF8
@@ -6421,6 +6808,16 @@ if (utf8)
}
#endif
+/* Can't support UCP unless PCRE has been compiled to include the code. */
+
+#ifndef SUPPORT_UCP
+if ((options & PCRE_UCP) != 0)
+ {
+ errorcode = ERR67;
+ goto PCRE_EARLY_ERROR_RETURN;
+ }
+#endif
+
/* Check validity of \R options. */
switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
@@ -6549,7 +6946,7 @@ regex compiled on a system with 4-byte pointers is run on another with 8-byte
pointers. */
re->magic_number = MAGIC_NUMBER;
-re->size = size;
+re->size = (int)size;
re->options = cd->external_options;
re->flags = cd->external_flags;
re->dummy1 = 0;
@@ -6620,7 +7017,7 @@ while (errorcode == 0 && cd->hwm > cworkspace)
recno = GET(codestart, offset);
groupptr = _pcre_find_bracket(codestart, utf8, recno);
if (groupptr == NULL) errorcode = ERR53;
- else PUT(((uschar *)codestart), offset, groupptr - codestart);
+ else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
}
/* Give an error if there's back reference to a non-existent capturing
@@ -6675,7 +7072,7 @@ if (errorcode != 0)
{
(pcre_free)(re);
PCRE_EARLY_ERROR_RETURN:
- *erroroffset = ptr - (const uschar *)pattern;
+ *erroroffset = (int)(ptr - (const uschar *)pattern);
PCRE_EARLY_ERROR_RETURN2:
*errorptr = find_error_text(errorcode);
if (errorcodeptr != NULL) *errorcodeptr = errorcode;
diff --git a/glib/pcre/pcre_dfa_exec.c b/glib/pcre/pcre_dfa_exec.c
index c241f5b05..4d61a325d 100644
--- a/glib/pcre/pcre_dfa_exec.c
+++ b/glib/pcre/pcre_dfa_exec.c
@@ -106,7 +106,7 @@ never stored, so we push them well clear of the normal opcodes. */
/* This table identifies those opcodes that are followed immediately by a
-character that is to be tested in some way. This makes is possible to
+character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
@@ -161,8 +161,9 @@ static const uschar coptable[] = {
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
- 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
- 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
+ 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
+ 0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
};
/* This table identifies those opcodes that inspect a character. It is used to
@@ -218,8 +219,9 @@ static const uschar poptable[] = {
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
- 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
- 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
+ 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
+ 0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
};
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@@ -473,7 +475,7 @@ if (*first_op == OP_REVERSE)
{
gone_back = (current_subject - max_back < start_subject)?
- current_subject - start_subject : max_back;
+ (int)(current_subject - start_subject) : max_back;
current_subject -= gone_back;
}
@@ -490,7 +492,7 @@ if (*first_op == OP_REVERSE)
int back = GET(end_code, 2+LINK_SIZE);
if (back <= gone_back)
{
- int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
+ int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
ADD_NEW_DATA(-bstate, 0, gone_back - back);
}
end_code += GET(end_code, 1);
@@ -526,7 +528,7 @@ else
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
do
{
- ADD_NEW(end_code - start_code + length, 0);
+ ADD_NEW((int)(end_code - start_code + length), 0);
end_code += GET(end_code, 1);
length = 1 + LINK_SIZE;
}
@@ -753,8 +755,8 @@ for (;;)
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
if (offsetcount >= 2)
{
- offsets[0] = current_subject - start_subject;
- offsets[1] = ptr - start_subject;
+ offsets[0] = (int)(current_subject - start_subject);
+ offsets[1] = (int)(ptr - start_subject);
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
offsets[1] - offsets[0], current_subject));
}
@@ -776,7 +778,7 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_ALT:
do { code += GET(code, 1); } while (*code == OP_ALT);
- ADD_ACTIVE(code - start_code, 0);
+ ADD_ACTIVE((int)(code - start_code), 0);
break;
/*-----------------------------------------------------------------*/
@@ -784,7 +786,7 @@ for (;;)
case OP_SBRA:
do
{
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
code += GET(code, 1);
}
while (*code == OP_ALT);
@@ -793,11 +795,11 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_CBRA:
case OP_SCBRA:
- ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
code += GET(code, 1);
while (*code == OP_ALT)
{
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
code += GET(code, 1);
}
break;
@@ -808,14 +810,14 @@ for (;;)
ADD_ACTIVE(state_offset + 1, 0);
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
break;
/*-----------------------------------------------------------------*/
case OP_SKIPZERO:
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
break;
/*-----------------------------------------------------------------*/
@@ -829,7 +831,12 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_EOD:
- if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
+ if (ptr >= end_subject)
+ {
+ if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else { ADD_ACTIVE(state_offset + 1, 0); }
+ }
break;
/*-----------------------------------------------------------------*/
@@ -869,7 +876,9 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_EODN:
- if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
+ if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
@@ -877,7 +886,9 @@ for (;;)
case OP_DOLL:
if ((md->moptions & PCRE_NOTEOL) == 0)
{
- if (clen == 0 ||
+ if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+ could_continue = TRUE;
+ else if (clen == 0 ||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
))
@@ -920,13 +931,37 @@ for (;;)
if (utf8) BACKCHAR(temp);
#endif
GETCHARTEST(d, temp);
+#ifdef SUPPORT_UCP
+ if ((md->poptions & PCRE_UCP) != 0)
+ {
+ if (d == '_') left_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(d);
+ left_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
}
- else left_word = 0;
+ else left_word = FALSE;
if (clen > 0)
+ {
+#ifdef SUPPORT_UCP
+ if ((md->poptions & PCRE_UCP) != 0)
+ {
+ if (c == '_') right_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ right_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
- else right_word = 0;
+ }
+ else right_word = FALSE;
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
{ ADD_ACTIVE(state_offset + 1, 0); }
@@ -953,7 +988,8 @@ for (;;)
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
@@ -968,6 +1004,30 @@ for (;;)
OK = UCD_SCRIPT(c) == code[2];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -1122,7 +1182,8 @@ for (;;)
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
@@ -1137,6 +1198,30 @@ for (;;)
OK = UCD_SCRIPT(c) == code[3];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -1344,7 +1429,8 @@ for (;;)
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
@@ -1359,6 +1445,30 @@ for (;;)
OK = UCD_SCRIPT(c) == code[3];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -1591,7 +1701,8 @@ for (;;)
break;
case PT_LAMP:
- OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
@@ -1606,6 +1717,30 @@ for (;;)
OK = UCD_SCRIPT(c) == code[5];
break;
+ /* These are specials for combination cases. */
+
+ case PT_ALNUM:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR;
+ break;
+
+ case PT_WORD:
+ OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -2233,7 +2368,7 @@ for (;;)
points to the byte after the end of the class. If there is a
quantifier, this is where it will be. */
- next_state_offset = ecode - start_code;
+ next_state_offset = (int)(ecode - start_code);
switch (*ecode)
{
@@ -2304,7 +2439,7 @@ for (;;)
md, /* static match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
@@ -2315,7 +2450,7 @@ for (;;)
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
- { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+ { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
}
break;
@@ -2342,9 +2477,9 @@ for (;;)
cb.callout_number = code[LINK_SIZE+2];
cb.offset_vector = offsets;
cb.subject = (PCRE_SPTR)start_subject;
- cb.subject_length = end_subject - start_subject;
- cb.start_match = current_subject - start_subject;
- cb.current_position = ptr - start_subject;
+ cb.subject_length = (int)(end_subject - start_subject);
+ cb.start_match = (int)(current_subject - start_subject);
+ cb.current_position = (int)(ptr - start_subject);
cb.pattern_position = GET(code, LINK_SIZE + 3);
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
cb.capture_top = 1;
@@ -2395,7 +2530,7 @@ for (;;)
md, /* fixed match data */
asscode, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
@@ -2407,7 +2542,7 @@ for (;;)
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) ==
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
- { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+ { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
else
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
}
@@ -2428,7 +2563,7 @@ for (;;)
md, /* fixed match data */
start_code + GET(code, 1), /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
@@ -2480,7 +2615,7 @@ for (;;)
md, /* fixed match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
+ (int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
@@ -2497,7 +2632,8 @@ for (;;)
do { end_subpattern += GET(end_subpattern, 1); }
while (*end_subpattern == OP_ALT);
- next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
+ next_state_offset =
+ (int)(end_subpattern - start_code + LINK_SIZE + 1);
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
arrange for the repeat state also to be added to the relevant list.
@@ -2505,7 +2641,7 @@ for (;;)
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
*end_subpattern == OP_KETRMIN)?
- end_subpattern - start_code - GET(end_subpattern, 1) : -1;
+ (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
/* If we have matched an empty string, add the next state at the
current character pointer. This is important so that the duplicate
@@ -2569,9 +2705,9 @@ for (;;)
cb.callout_number = code[1];
cb.offset_vector = offsets;
cb.subject = (PCRE_SPTR)start_subject;
- cb.subject_length = end_subject - start_subject;
- cb.start_match = current_subject - start_subject;
- cb.current_position = ptr - start_subject;
+ cb.subject_length = (int)(end_subject - start_subject);
+ cb.start_match = (int)(current_subject - start_subject);
+ cb.current_position = (int)(ptr - start_subject);
cb.pattern_position = GET(code, 2);
cb.next_item_length = GET(code, 2 + LINK_SIZE);
cb.capture_top = 1;
@@ -2617,13 +2753,13 @@ for (;;)
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
match_count < 0) /* no matches */
) && /* And... */
- ptr >= end_subject && /* Reached end of subject */
- ptr > current_subject) /* Matched non-empty string */
+ ptr >= end_subject && /* Reached end of subject */
+ ptr > md->start_used_ptr) /* Inspected non-empty string */
{
if (offsetcount >= 2)
{
- offsets[0] = md->start_used_ptr - start_subject;
- offsets[1] = end_subject - start_subject;
+ offsets[0] = (int)(md->start_used_ptr - start_subject);
+ offsets[1] = (int)(end_subject - start_subject);
}
match_count = PCRE_ERROR_PARTIAL;
}
@@ -2708,6 +2844,7 @@ if (re == NULL || subject == NULL || workspace == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
/* We need to find the pointer to any study data before we test for byte
flipping, so we scan the extra_data block first. This may set two fields in the
@@ -2826,16 +2963,14 @@ back the character offset. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
+ int tb;
+ if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
+ return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
+ PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
- int tb = ((uschar *)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
+ tb = ((USPTR)subject)[start_offset] & 0xc0;
+ if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
#endif
@@ -2922,9 +3057,11 @@ for (;;)
/* There are some optimizations that avoid running the match if a known
starting point is not found. However, there is an option that disables
- these, for testing and for ensuring that all callouts do actually occur. */
+ these, for testing and for ensuring that all callouts do actually occur.
+ The option can be set in the regex by (*NO_START_OPT) or passed in
+ match-time options. */
- if ((options & PCRE_NO_START_OPTIMIZE) == 0)
+ if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
{
/* Advance to a known first byte. */
@@ -2982,8 +3119,16 @@ for (;;)
while (current_subject < end_subject)
{
register unsigned int c = *current_subject;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ current_subject++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ while(current_subject < end_subject &&
+ (*current_subject & 0xc0) == 0x80) current_subject++;
+#endif
+ }
+ else break;
}
}
}
diff --git a/glib/pcre/pcre_exec.c b/glib/pcre/pcre_exec.c
index 0a44fcced..569207cc3 100644
--- a/glib/pcre/pcre_exec.c
+++ b/glib/pcre/pcre_exec.c
@@ -71,10 +71,20 @@ defined PCRE_ERROR_xxx codes, which are all negative. */
/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */
-#define MATCH_COMMIT (-999)
-#define MATCH_PRUNE (-998)
-#define MATCH_SKIP (-997)
-#define MATCH_THEN (-996)
+#define MATCH_ACCEPT (-999)
+#define MATCH_COMMIT (-998)
+#define MATCH_PRUNE (-997)
+#define MATCH_SKIP (-996)
+#define MATCH_SKIP_ARG (-995)
+#define MATCH_THEN (-994)
+
+/* Convenience macro: store markptr in md->mark, then leave via RRETURN(ra). */
+
+#define MRRETURN(ra) \
+ { \
+ md->mark = markptr; \
+ RRETURN(ra); \
+ }
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
@@ -245,7 +255,8 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
- RM51, RM52, RM53, RM54 };
+ RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
+ RM61, RM62 };
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
@@ -283,7 +294,8 @@ argument of match(), which never changes. */
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
{\
- heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
+ heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
+ if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
frame->Xwhere = rw; \
newframe->Xeptr = ra;\
newframe->Xecode = rb;\
@@ -304,9 +316,9 @@ argument of match(), which never changes. */
#define RRETURN(ra)\
{\
- heapframe *newframe = frame;\
- frame = newframe->Xprevframe;\
- (pcre_stack_free)(newframe);\
+ heapframe *oldframe = frame;\
+ frame = oldframe->Xprevframe;\
+ (pcre_stack_free)(oldframe);\
if (frame != NULL)\
{\
rrc = ra;\
@@ -410,17 +422,18 @@ immediately. The second one is used when we already know we are past the end of
the subject. */
#define CHECK_PARTIAL()\
- if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
- {\
- md->hitend = TRUE;\
- if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
+ if (md->partial != 0 && eptr >= md->end_subject && \
+ eptr > md->start_used_ptr) \
+ { \
+ md->hitend = TRUE; \
+ if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
}
#define SCHECK_PARTIAL()\
- if (md->partial != 0 && eptr > mstart)\
- {\
- md->hitend = TRUE;\
- if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
+ if (md->partial != 0 && eptr > md->start_used_ptr) \
+ { \
+ md->hitend = TRUE; \
+ if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
}
@@ -448,13 +461,14 @@ Arguments:
Returns: MATCH_MATCH if matched ) these values are >= 0
MATCH_NOMATCH if failed to match )
+ a negative MATCH_xxx value for PRUNE, SKIP, etc
a negative PCRE_ERROR_xxx value if aborted by an error condition
(e.g. stopped by repeated call or recursion limit)
*/
static int
-match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR
- markptr, int offset_top, match_data *md, unsigned long int ims,
+match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
+ const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
eptrblock *eptrb, int flags, unsigned int rdepth)
{
/* These variables do not need to be preserved over recursion in this function,
@@ -475,7 +489,8 @@ heap storage. Set up the top-level frame here; others are obtained from the
heap whenever RMATCH() does a "recursion". See the macro definitions above. */
#ifdef NO_RECURSE
-heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
+heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
+if (frame == NULL) return PCRE_ERROR_NOMEMORY; /* RRETURN would deref NULL frame */
frame->Xprevframe = NULL; /* Marks the top level */
/* Copy in the original argument variables */
@@ -671,32 +686,99 @@ for (;;)
switch(op)
{
+ case OP_MARK:
+ markptr = ecode + 2;
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+ ims, eptrb, flags, RM55);
+
+ /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
+ argument, and we must check whether that argument matches this MARK's
+ argument. It is passed back in md->start_match_ptr (an overloading of that
+ variable). If it does match, we reset that variable to the current subject
+ position and return MATCH_SKIP. Otherwise, pass back the return code
+ unaltered. */
+
+ if (rrc == MATCH_SKIP_ARG &&
+ strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
+ {
+ md->start_match_ptr = eptr;
+ RRETURN(MATCH_SKIP);
+ }
+
+ if (md->mark == NULL) md->mark = markptr;
+ RRETURN(rrc);
+
case OP_FAIL:
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
+
+ /* COMMIT overrides PRUNE, SKIP, and THEN */
+
+ case OP_COMMIT:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+ ims, eptrb, flags, RM52);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
+ rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
+ rrc != MATCH_THEN)
+ RRETURN(rrc);
+ MRRETURN(MATCH_COMMIT);
+
+ /* PRUNE overrides THEN */
case OP_PRUNE:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM51);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ MRRETURN(MATCH_PRUNE);
+
+ case OP_PRUNE_ARG:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+ ims, eptrb, flags, RM56);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ md->mark = ecode + 2;
RRETURN(MATCH_PRUNE);
- case OP_COMMIT:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM52);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_COMMIT);
+ /* SKIP overrides PRUNE and THEN */
case OP_SKIP:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM53);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
+ RRETURN(rrc);
md->start_match_ptr = eptr; /* Pass back current position */
- RRETURN(MATCH_SKIP);
+ MRRETURN(MATCH_SKIP);
+
+ case OP_SKIP_ARG:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
+ ims, eptrb, flags, RM57);
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
+ RRETURN(rrc);
+
+ /* Pass back the current skip name by overloading md->start_match_ptr and
+ returning the special MATCH_SKIP_ARG return code. This will either be
+ caught by a matching MARK, or get to the top, where it is treated the same
+ as PRUNE. */
+
+ md->start_match_ptr = ecode + 2;
+ RRETURN(MATCH_SKIP_ARG);
+
+ /* For THEN (and THEN_ARG) we pass back the address of the bracket or
+ the alt that is at the start of the current branch. This makes it possible
+ to skip back past alternatives that precede the THEN within the current
+ branch. */
case OP_THEN:
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM54);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ md->start_match_ptr = ecode - GET(ecode, 1);
+ MRRETURN(MATCH_THEN);
+
+ case OP_THEN_ARG:
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
+ offset_top, md, ims, eptrb, flags, RM58);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ md->start_match_ptr = ecode - GET(ecode, 1);
+ md->mark = ecode + LINK_SIZE + 2;
RRETURN(MATCH_THEN);
/* Handle a capturing bracket. If there is space in the offset vector, save
@@ -733,14 +815,17 @@ for (;;)
save_capture_last = md->capture_last;
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
- md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
+ md->offset_vector[md->offset_end - number] =
+ (int)(eptr - md->start_subject);
flags = (op == OP_SCBRA)? match_cbegroup : 0;
do
{
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
ims, eptrb, flags, RM1);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
}
@@ -752,6 +837,7 @@ for (;;)
md->offset_vector[offset+1] = save_offset2;
md->offset_vector[md->offset_end - number] = save_offset3;
+ if (rrc != MATCH_THEN) md->mark = markptr;
RRETURN(MATCH_NOMATCH);
}
@@ -791,6 +877,7 @@ for (;;)
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
eptrb, flags, RM48);
+ if (rrc == MATCH_NOMATCH) md->mark = markptr;
RRETURN(rrc);
}
@@ -799,7 +886,9 @@ for (;;)
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
eptrb, flags, RM2);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
ecode += GET(ecode, 1);
}
/* Control never reaches here. */
@@ -826,15 +915,15 @@ for (;;)
cb.callout_number = ecode[LINK_SIZE+2];
cb.offset_vector = md->offset_vector;
cb.subject = (PCRE_SPTR)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = mstart - md->start_subject;
- cb.current_position = eptr - md->start_subject;
+ cb.subject_length = (int)(md->end_subject - md->start_subject);
+ cb.start_match = (int)(mstart - md->start_subject);
+ cb.current_position = (int)(eptr - md->start_subject);
cb.pattern_position = GET(ecode, LINK_SIZE + 3);
cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
cb.capture_top = offset_top/2;
cb.capture_last = md->capture_last;
cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+ if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
ecode += _pcre_OP_lengths[OP_CALLOUT];
@@ -1000,7 +1089,8 @@ for (;;)
ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
}
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
+ else if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
{
RRETURN(rrc); /* Need braces because of following else */
}
@@ -1054,7 +1144,7 @@ for (;;)
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
+ md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
if (offset_top <= offset) offset_top = offset + 2;
}
ecode += 3;
@@ -1089,14 +1179,19 @@ for (;;)
(md->notempty ||
(md->notempty_atstart &&
mstart == md->start_subject + md->start_offset)))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
/* Otherwise, we have a match. */
md->end_match_ptr = eptr; /* Record where we ended */
md->end_offset_top = offset_top; /* and how many extracts were taken */
md->start_match_ptr = mstart; /* and the start (\K can modify) */
- RRETURN(MATCH_MATCH);
+
+ /* For some reason, the macros don't work properly if an expression is
+ given as the argument to MRRETURN when the heap is in use. */
+
+ rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
+ MRRETURN(rrc);
/* Change option settings */
@@ -1118,16 +1213,18 @@ for (;;)
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
RM4);
- if (rrc == MATCH_MATCH)
+ if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
mstart = md->start_match_ptr; /* In case \K reset it */
break;
}
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
ecode += GET(ecode, 1);
}
while (*ecode == OP_ALT);
- if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
+ if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
/* If checking an assertion for a condition, return MATCH_MATCH. */
@@ -1151,13 +1248,15 @@ for (;;)
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
RM5);
- if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
+ if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
{
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
break;
}
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
@@ -1180,7 +1279,7 @@ for (;;)
while (i-- > 0)
{
eptr--;
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+ if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
BACKCHAR(eptr);
}
}
@@ -1191,7 +1290,7 @@ for (;;)
{
eptr -= GET(ecode, 1);
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+ if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
}
/* Save the earliest consulted character, then skip to next op code */
@@ -1212,15 +1311,15 @@ for (;;)
cb.callout_number = ecode[1];
cb.offset_vector = md->offset_vector;
cb.subject = (PCRE_SPTR)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = mstart - md->start_subject;
- cb.current_position = eptr - md->start_subject;
+ cb.subject_length = (int)(md->end_subject - md->start_subject);
+ cb.start_match = (int)(mstart - md->start_subject);
+ cb.current_position = (int)(eptr - md->start_subject);
cb.pattern_position = GET(ecode, 2);
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
cb.capture_last = md->capture_last;
cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+ if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
ecode += 2 + 2*LINK_SIZE;
@@ -1286,15 +1385,16 @@ for (;;)
{
RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
md, ims, eptrb, flags, RM6);
- if (rrc == MATCH_MATCH)
+ if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
DPRINTF(("Recursion matched\n"));
md->recursive = new_recursive.prevrec;
if (new_recursive.offset_save != stacksave)
(pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_MATCH);
+ MRRETURN(MATCH_MATCH);
}
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
+ else if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
{
DPRINTF(("Recursion gave error %d\n", rrc));
if (new_recursive.offset_save != stacksave)
@@ -1313,7 +1413,7 @@ for (;;)
md->recursive = new_recursive.prevrec;
if (new_recursive.offset_save != stacksave)
(pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never reaches here */
@@ -1332,12 +1432,14 @@ for (;;)
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
- if (rrc == MATCH_MATCH)
+ if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
{
mstart = md->start_match_ptr;
break;
}
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH &&
+ (rrc != MATCH_THEN || md->start_match_ptr != ecode))
+ RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
@@ -1467,7 +1569,7 @@ for (;;)
md->end_match_ptr = eptr; /* For ONCE */
md->end_offset_top = offset_top;
md->start_match_ptr = mstart;
- RRETURN(MATCH_MATCH);
+ MRRETURN(MATCH_MATCH);
}
/* For capturing groups we have to check the group number back at the start
@@ -1491,7 +1593,7 @@ for (;;)
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
+ md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
if (offset_top <= offset) offset_top = offset + 2;
}
@@ -1562,12 +1664,12 @@ for (;;)
/* Start of subject unless notbol, or after internal newline if multiline */
case OP_CIRC:
- if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
+ if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr != md->start_subject &&
(eptr == md->end_subject || !WAS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
}
@@ -1576,14 +1678,14 @@ for (;;)
/* Start of subject assertion */
case OP_SOD:
- if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
+ if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
ecode++;
break;
/* Start of match assertion */
case OP_SOM:
- if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
+ if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
ecode++;
break;
@@ -1601,39 +1703,42 @@ for (;;)
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr < md->end_subject)
- { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
+ { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
else
- { if (md->noteol) RRETURN(MATCH_NOMATCH); }
+ {
+ if (md->noteol) MRRETURN(MATCH_NOMATCH);
+ SCHECK_PARTIAL();
+ }
ecode++;
break;
}
- else
+ else /* Not multiline */
{
- if (md->noteol) RRETURN(MATCH_NOMATCH);
- if (!md->endonly)
- {
- if (eptr != md->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- }
+ if (md->noteol) MRRETURN(MATCH_NOMATCH);
+ if (!md->endonly) goto ASSERT_NL_OR_EOS;
}
+
/* ... else fall through for endonly */
/* End of subject assertion (\z) */
case OP_EOD:
- if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
+ if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
+ SCHECK_PARTIAL();
ecode++;
break;
/* End of subject or ending \n assertion (\Z) */
case OP_EODN:
- if (eptr != md->end_subject &&
+ ASSERT_NL_OR_EOS:
+ if (eptr < md->end_subject &&
(!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
+
+ /* Either at end of string or \n before end. */
+
+ SCHECK_PARTIAL();
ecode++;
break;
@@ -1651,14 +1756,30 @@ for (;;)
#ifdef SUPPORT_UTF8
if (utf8)
{
+ /* Get status of previous character */
+
if (eptr == md->start_subject) prev_is_word = FALSE; else
{
USPTR lastptr = eptr - 1;
while((*lastptr & 0xc0) == 0x80) lastptr--;
if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
GETCHAR(c, lastptr);
+#ifdef SUPPORT_UCP
+ if (md->use_ucp)
+ {
+ if (c == '_') prev_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ prev_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
}
+
+ /* Get status of next character */
+
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
@@ -1667,47 +1788,89 @@ for (;;)
else
{
GETCHAR(c, eptr);
+#ifdef SUPPORT_UCP
+ if (md->use_ucp)
+ {
+ if (c == '_') cur_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ cur_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
}
}
else
#endif
- /* Not in UTF-8 mode */
+ /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
+ consistency with the behaviour of \w we do use it in this case. */
{
+ /* Get status of previous character */
+
if (eptr == md->start_subject) prev_is_word = FALSE; else
{
if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
+#ifdef SUPPORT_UCP
+ if (md->use_ucp)
+ {
+ c = eptr[-1];
+ if (c == '_') prev_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ prev_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
}
+
+ /* Get status of next character */
+
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
cur_is_word = FALSE;
}
- else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
+ else
+#ifdef SUPPORT_UCP
+ if (md->use_ucp)
+ {
+ c = *eptr;
+ if (c == '_') cur_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(c);
+ cur_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif
+ cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
}
/* Now see if the situation is what we want */
if ((*ecode++ == OP_WORD_BOUNDARY)?
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
/* Match a single character type; inline for speed */
case OP_ANY:
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
/* Fall through */
case OP_ALLANY:
if (eptr++ >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
ecode++;
@@ -1720,7 +1883,7 @@ for (;;)
if (eptr++ >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
ecode++;
break;
@@ -1729,7 +1892,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
@@ -1738,7 +1901,7 @@ for (;;)
#endif
(md->ctypes[c] & ctype_digit) != 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
@@ -1746,7 +1909,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
@@ -1755,7 +1918,7 @@ for (;;)
#endif
(md->ctypes[c] & ctype_digit) == 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
@@ -1763,7 +1926,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
@@ -1772,7 +1935,7 @@ for (;;)
#endif
(md->ctypes[c] & ctype_space) != 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
@@ -1780,7 +1943,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
@@ -1789,7 +1952,7 @@ for (;;)
#endif
(md->ctypes[c] & ctype_space) == 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
@@ -1797,7 +1960,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
@@ -1806,7 +1969,7 @@ for (;;)
#endif
(md->ctypes[c] & ctype_word) != 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
@@ -1814,7 +1977,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
if (
@@ -1823,7 +1986,7 @@ for (;;)
#endif
(md->ctypes[c] & ctype_word) == 0
)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
ecode++;
break;
@@ -1831,12 +1994,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
@@ -1849,7 +2012,7 @@ for (;;)
case 0x0085:
case 0x2028:
case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
ecode++;
@@ -1859,7 +2022,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
@@ -1884,7 +2047,7 @@ for (;;)
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
ecode++;
break;
@@ -1893,12 +2056,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
@@ -1927,7 +2090,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
@@ -1940,7 +2103,7 @@ for (;;)
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
ecode++;
break;
@@ -1949,12 +2112,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
@@ -1976,39 +2139,72 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
{
int chartype = UCD_CHARTYPE(c);
+
switch(ecode[1])
{
case PT_ANY:
- if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
break;
case PT_LAMP:
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
chartype == ucp_Lt) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
+ MRRETURN(MATCH_NOMATCH);
+ break;
case PT_GC:
if ((ecode[2] != _pcre_ucp_gentype[chartype]) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case PT_PC:
if ((ecode[2] != chartype) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case PT_SC:
if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
+ break;
+
+ /* These are specials */
+
+ case PT_ALNUM:
+ if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N) == (op == OP_NOTPROP))
+ MRRETURN(MATCH_NOMATCH);
break;
+ case PT_SPACE: /* Perl space */
+ if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+ == (op == OP_NOTPROP))
+ MRRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR)
+ == (op == OP_NOTPROP))
+ MRRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_WORD:
+ if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
+ MRRETURN(MATCH_NOMATCH);
+ break;
+
+ /* This should never occur */
+
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
@@ -2024,12 +2220,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
{
int category = UCD_CATEGORY(c);
- if (category == ucp_M) RRETURN(MATCH_NOMATCH);
+ if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
@@ -2074,7 +2270,7 @@ for (;;)
referenced subpattern. */
if (offset >= offset_top || md->offset_vector[offset] < 0)
- length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
+ length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
else
length = md->offset_vector[offset+1] - md->offset_vector[offset];
@@ -2108,7 +2304,7 @@ for (;;)
if (!match_ref(offset, eptr, length, md, ims))
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += length;
continue; /* With the main loop */
@@ -2128,7 +2324,7 @@ for (;;)
if (!match_ref(offset, eptr, length, md, ims))
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += length;
}
@@ -2146,11 +2342,11 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (!match_ref(offset, eptr, length, md, ims))
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += length;
}
@@ -2177,7 +2373,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr -= length;
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -2239,16 +2435,16 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
if (c > 255)
{
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
}
else
{
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
}
@@ -2261,10 +2457,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
@@ -2286,20 +2482,20 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
if (c > 255)
{
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
}
else
{
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
}
@@ -2311,14 +2507,14 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -2384,7 +2580,7 @@ for (;;)
}
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -2436,10 +2632,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+ if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
}
/* If max == min we can continue with the main loop without the
@@ -2456,14 +2652,14 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+ if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
@@ -2492,7 +2688,7 @@ for (;;)
if (eptr-- == pp) break; /* Stop if tried at original pos */
if (utf8) BACKCHAR(eptr);
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -2511,9 +2707,9 @@ for (;;)
if (length > md->end_subject - eptr)
{
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+ while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
}
else
#endif
@@ -2523,9 +2719,9 @@ for (;;)
if (md->end_subject - eptr < 1)
{
SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
ecode += 2;
}
break;
@@ -2543,7 +2739,7 @@ for (;;)
if (length > md->end_subject - eptr)
{
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* If the pattern character's value is < 128, we have only one byte, and
@@ -2551,7 +2747,7 @@ for (;;)
if (fc < 128)
{
- if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
/* Otherwise we must pick up the subject character */
@@ -2570,7 +2766,7 @@ for (;;)
#ifdef SUPPORT_UCP
if (dc != UCD_OTHERCASE(fc))
#endif
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
}
@@ -2582,9 +2778,9 @@ for (;;)
if (md->end_subject - eptr < 1)
{
SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
ecode += 2;
}
break;
@@ -2678,7 +2874,7 @@ for (;;)
else
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
@@ -2690,7 +2886,7 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr <= md->end_subject - length &&
memcmp(eptr, charptr, length) == 0) eptr += length;
#ifdef SUPPORT_UCP
@@ -2701,7 +2897,7 @@ for (;;)
else
{
CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -2732,7 +2928,7 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
+ if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
#ifdef SUPPORT_UCP
eptr--;
BACKCHAR(eptr);
@@ -2775,9 +2971,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
if (min == max) continue;
if (minimize)
@@ -2786,13 +2982,13 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
@@ -2818,7 +3014,7 @@ for (;;)
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
@@ -2832,9 +3028,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
}
if (min == max) continue;
@@ -2845,13 +3041,13 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
@@ -2876,7 +3072,7 @@ for (;;)
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -2888,7 +3084,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
ecode++;
GETCHARINCTEST(c, eptr);
@@ -2898,11 +3094,11 @@ for (;;)
if (c < 256)
#endif
c = md->lcc[c];
- if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
}
else
{
- if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
+ if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
}
break;
@@ -2996,11 +3192,11 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(d, eptr);
if (d < 256) d = md->lcc[d];
- if (fc == d) RRETURN(MATCH_NOMATCH);
+ if (fc == d) MRRETURN(MATCH_NOMATCH);
}
}
else
@@ -3013,9 +3209,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
}
@@ -3032,15 +3228,15 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(d, eptr);
if (d < 256) d = md->lcc[d];
- if (fc == d) RRETURN(MATCH_NOMATCH);
+ if (fc == d) MRRETURN(MATCH_NOMATCH);
}
}
else
@@ -3051,13 +3247,13 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -3119,7 +3315,7 @@ for (;;)
}
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
@@ -3138,10 +3334,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
+ if (fc == d) MRRETURN(MATCH_NOMATCH);
}
}
else
@@ -3153,9 +3349,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
}
}
@@ -3172,14 +3368,14 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
+ if (fc == d) MRRETURN(MATCH_NOMATCH);
}
}
else
@@ -3190,13 +3386,13 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -3257,7 +3453,7 @@ for (;;)
}
}
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -3351,13 +3547,13 @@ for (;;)
switch(prop_type)
{
case PT_ANY:
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
}
@@ -3369,14 +3565,14 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3386,12 +3582,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3401,12 +3597,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3416,15 +3612,84 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_ALNUM:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_SPACE: /* Perl space */
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_WORD:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N ||
+ c == CHAR_UNDERSCORE)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
}
break;
+ /* This should not occur */
+
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
@@ -3440,11 +3705,11 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_category = UCD_CATEGORY(c);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
@@ -3471,9 +3736,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
@@ -3485,7 +3750,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
@@ -3493,7 +3758,7 @@ for (;;)
break;
case OP_ANYBYTE:
- if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
+ if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
eptr += min;
break;
@@ -3503,12 +3768,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
@@ -3521,7 +3786,7 @@ for (;;)
case 0x0085:
case 0x2028:
case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
}
@@ -3533,7 +3798,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
@@ -3558,7 +3823,7 @@ for (;;)
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
break;
@@ -3569,12 +3834,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
@@ -3605,7 +3870,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
@@ -3618,7 +3883,7 @@ for (;;)
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
break;
@@ -3629,12 +3894,12 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
@@ -3653,11 +3918,11 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINC(c, eptr);
if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3667,10 +3932,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
@@ -3681,10 +3946,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
}
break;
@@ -3695,10 +3960,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
@@ -3709,10 +3974,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
}
break;
@@ -3723,10 +3988,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
/* No need to skip more bytes - we know it's a 1-byte character */
}
break;
@@ -3749,9 +4014,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
eptr++;
}
break;
@@ -3760,7 +4025,7 @@ for (;;)
if (eptr > md->end_subject - min)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += min;
break;
@@ -3769,7 +4034,7 @@ for (;;)
if (eptr > md->end_subject - min)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
eptr += min;
break;
@@ -3780,11 +4045,11 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
@@ -3794,7 +4059,7 @@ for (;;)
case 0x000b:
case 0x000c:
case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
}
@@ -3806,7 +4071,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
@@ -3814,7 +4079,7 @@ for (;;)
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
break;
@@ -3825,11 +4090,11 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
@@ -3844,7 +4109,7 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
@@ -3854,7 +4119,7 @@ for (;;)
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
}
break;
@@ -3865,11 +4130,11 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
switch(*eptr++)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
@@ -3886,9 +4151,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3898,9 +4163,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3910,9 +4175,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3922,9 +4187,9 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3934,10 +4199,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if ((md->ctypes[*eptr++] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3947,10 +4212,10 @@ for (;;)
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if ((md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
@@ -3979,14 +4244,14 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -3995,18 +4260,18 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -4015,16 +4280,16 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -4033,16 +4298,16 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -4051,19 +4316,101 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
+ case PT_ALNUM:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_SPACE: /* Perl space */
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_PXSPACE: /* POSIX space */
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_WORD:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L ||
+ prop_category == ucp_N ||
+ c == CHAR_UNDERSCORE)
+ == prop_fail_result)
+ MRRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ /* This should never occur */
+
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
@@ -4078,15 +4425,15 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
prop_category = UCD_CATEGORY(c);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
int len = 1;
@@ -4110,14 +4457,14 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (ctype == OP_ANY && IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(ctype)
{
@@ -4129,7 +4476,7 @@ for (;;)
case OP_ANYNL:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
@@ -4141,7 +4488,7 @@ for (;;)
case 0x0085:
case 0x2028:
case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
break;
@@ -4169,14 +4516,14 @@ for (;;)
case 0x202f: /* NARROW NO-BREAK SPACE */
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
case OP_HSPACE:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
@@ -4211,14 +4558,14 @@ for (;;)
case 0x85: /* NEL */
case 0x2028: /* LINE SEPARATOR */
case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
case OP_VSPACE:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
@@ -4232,32 +4579,32 @@ for (;;)
case OP_NOT_DIGIT:
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
break;
default:
@@ -4273,14 +4620,14 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
if (ctype == OP_ANY && IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
c = *eptr++;
switch(ctype)
{
@@ -4292,7 +4639,7 @@ for (;;)
case OP_ANYNL:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x000d:
if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
break;
@@ -4303,7 +4650,7 @@ for (;;)
case 0x000b:
case 0x000c:
case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
+ if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
break;
}
break;
@@ -4315,14 +4662,14 @@ for (;;)
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
case OP_HSPACE:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
@@ -4339,14 +4686,14 @@ for (;;)
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
break;
case OP_VSPACE:
switch(c)
{
- default: RRETURN(MATCH_NOMATCH);
+ default: MRRETURN(MATCH_NOMATCH);
case 0x0a: /* LF */
case 0x0b: /* VT */
case 0x0c: /* FF */
@@ -4357,27 +4704,27 @@ for (;;)
break;
case OP_NOT_DIGIT:
- if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
- if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
+ if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
break;
default:
@@ -4410,7 +4757,7 @@ for (;;)
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
if (prop_fail_result) break;
eptr+= len;
}
@@ -4425,7 +4772,7 @@ for (;;)
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
@@ -4444,7 +4791,7 @@ for (;;)
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
break;
@@ -4461,7 +4808,7 @@ for (;;)
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
break;
@@ -4478,13 +4825,90 @@ for (;;)
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
break;
eptr+= len;
}
break;
+
+ case PT_ALNUM:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N)
+ == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_SPACE: /* Perl space */
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
+ c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
+ == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ case PT_WORD:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ prop_category = UCD_CATEGORY(c);
+ if ((prop_category == ucp_L || prop_category == ucp_N ||
+ c == CHAR_UNDERSCORE) == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
}
/* eptr is now past the end of the maximum run */
@@ -5037,7 +5461,7 @@ for (;;)
/* Get here if we can't make it match with any permitted repetitions */
- RRETURN(MATCH_NOMATCH);
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -5070,12 +5494,13 @@ switch (frame->Xwhere)
LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
- LBL(53) LBL(54)
+ LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
#ifdef SUPPORT_UTF8
LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
#ifdef SUPPORT_UCP
LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
+ LBL(59) LBL(60) LBL(61) LBL(62)
#endif /* SUPPORT_UCP */
#endif /* SUPPORT_UTF8 */
default:
@@ -5204,11 +5629,11 @@ const real_pcre *external_re = (const real_pcre *)argument_re;
const real_pcre *re = external_re;
/* Plausibility checks */
-
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
if (re == NULL || subject == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
/* This information is for finding all the numbers associated with a given
name, for condition testing. */
@@ -5279,6 +5704,7 @@ end_subject = md->end_subject;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+md->use_ucp = (re->options & PCRE_UCP) != 0;
md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
md->notbol = (options & PCRE_NOTBOL) != 0;
@@ -5288,6 +5714,7 @@ md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
md->hitend = FALSE;
+md->mark = NULL; /* In case never set */
md->recursive = NULL; /* No recursion at top level */
@@ -5373,16 +5800,14 @@ back the character offset. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
+ int tb;
+ if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
+ return (tb == length && md->partial > 1)?
+ PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
- int tb = ((USPTR)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
+ tb = ((USPTR)subject)[start_offset] & 0xc0;
+ if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
#endif
@@ -5510,9 +5935,10 @@ for(;;)
/* There are some optimizations that avoid running the match if a known
starting point is not found, or if a known later character is not present.
However, there is an option that disables these, for testing and for ensuring
- that all callouts do actually occur. */
+ that all callouts do actually occur. The option can be set in the regex by
+ (*NO_START_OPT) or passed in match-time options. */
- if ((options & PCRE_NO_START_OPTIMIZE) == 0)
+ if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
{
/* Advance to a unique first byte if there is one. */
@@ -5566,8 +5992,16 @@ for(;;)
while (start_match < end_subject)
{
register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ start_match++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+#endif
+ }
+ else break;
}
}
} /* Starting optimizations */
@@ -5668,6 +6102,23 @@ for(;;)
switch(rc)
{
+ /* SKIP passes back the next starting point explicitly, but if it is the
+ same as the match we have just done, treat it as NOMATCH. */
+
+ case MATCH_SKIP:
+ if (md->start_match_ptr != start_match)
+ {
+ new_start_match = md->start_match_ptr;
+ break;
+ }
+ /* Fall through */
+
+ /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
+ the SKIP's arg was not found. We also treat this as NOMATCH. */
+
+ case MATCH_SKIP_ARG:
+ /* Fall through */
+
/* NOMATCH and PRUNE advance by one character. THEN at this level acts
exactly like PRUNE. */
@@ -5682,12 +6133,6 @@ for(;;)
#endif
break;
- /* SKIP passes back the next starting point explicitly. */
-
- case MATCH_SKIP:
- new_start_match = md->start_match_ptr;
- break;
-
/* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
case MATCH_COMMIT:
@@ -5733,7 +6178,8 @@ for(;;)
md->nllen == 2))
start_match++;
- } /* End of for(;;) "bumpalong" loop */
+ md->mark = NULL; /* Reset for start of next match attempt */
+ } /* End of for(;;) "bumpalong" loop */
/* ==========================================================================*/
@@ -5757,7 +6203,7 @@ capturing parentheses than vector slots. */
ENDLOOP:
-if (rc == MATCH_MATCH)
+if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
{
if (using_temporary_offsets)
{
@@ -5783,12 +6229,12 @@ if (rc == MATCH_MATCH)
if (offsetcount < 2) rc = 0; else
{
- offsets[0] = md->start_match_ptr - md->start_subject;
- offsets[1] = md->end_match_ptr - md->start_subject;
+ offsets[0] = (int)(md->start_match_ptr - md->start_subject);
+ offsets[1] = (int)(md->end_match_ptr - md->start_subject);
}
DPRINTF((">>>> returning %d\n", rc));
- return rc;
+ goto RETURN_MARK;
}
/* Control gets here if there has been an error, or if the overall match
@@ -5800,26 +6246,43 @@ if (using_temporary_offsets)
(pcre_free)(md->offset_vector);
}
+/* For anything other than nomatch or partial match, just return the code. */
+
if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
{
DPRINTF((">>>> error: returning %d\n", rc));
return rc;
}
-else if (start_partial != NULL)
+
+/* Handle partial matches - disable any mark data */
+
+if (start_partial != NULL)
{
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
+ md->mark = NULL;
if (offsetcount > 1)
{
- offsets[0] = start_partial - (USPTR)subject;
- offsets[1] = end_subject - (USPTR)subject;
+ offsets[0] = (int)(start_partial - (USPTR)subject);
+ offsets[1] = (int)(end_subject - (USPTR)subject);
}
- return PCRE_ERROR_PARTIAL;
+ rc = PCRE_ERROR_PARTIAL;
}
+
+/* This is the classic nomatch case */
+
else
{
DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
- return PCRE_ERROR_NOMATCH;
+ rc = PCRE_ERROR_NOMATCH;
}
+
+/* Return the MARK data if it has been requested. */
+
+RETURN_MARK:
+
+if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
+ *(extra_data->mark) = (unsigned char *)(md->mark);
+return rc;
}
/* End of pcre_exec.c */
diff --git a/glib/pcre/pcre_internal.h b/glib/pcre/pcre_internal.h
index 7c7412f32..9cff71d27 100644
--- a/glib/pcre/pcre_internal.h
+++ b/glib/pcre/pcre_internal.h
@@ -408,9 +408,10 @@ capturing parenthesis numbers in back references. */
/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
-byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
-never be called in byte mode. To make sure it can never even appear when UTF-8
-support is omitted, we don't even define it. */
+byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
+not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
+never be called in byte mode. To make sure they can never even appear when
+UTF-8 support is omitted, we don't even define them. */
#ifndef SUPPORT_UTF8
#define GETCHAR(c, eptr) c = *eptr;
@@ -418,43 +419,83 @@ support is omitted, we don't even define it. */
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
+/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
#else /* SUPPORT_UTF8 */
+/* These macros were originally written in the form of loops that used data
+from the tables whose names start with _pcre_utf8_table. They were rewritten by
+a user so as not to use loops, because in some environments this gives a
+significant performance advantage, and it seems never to do any harm. */
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, not
+advancing the pointer. */
+
+#define GETUTF8(c, eptr) \
+ { \
+ if ((c & 0x20) == 0) \
+ c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
+ else if ((c & 0x10) == 0) \
+ c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+ else if ((c & 0x08) == 0) \
+ c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
+ ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
+ else if ((c & 0x04) == 0) \
+ c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
+ ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
+ (eptr[4] & 0x3f); \
+ else \
+ c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
+ ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
+ ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
+ }
+
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
#define GETCHAR(c, eptr) \
c = *eptr; \
- if (c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- }
+ if (c >= 0xc0) GETUTF8(c, eptr);
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */
#define GETCHARTEST(c, eptr) \
c = *eptr; \
- if (utf8 && c >= 0xc0) \
+ if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
+the pointer. */
+
+#define GETUTF8INC(c, eptr) \
{ \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
+ if ((c & 0x20) == 0) \
+ c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \
+ else if ((c & 0x10) == 0) \
{ \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
+ c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \
+ eptr += 2; \
+ } \
+ else if ((c & 0x08) == 0) \
+ { \
+ c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \
+ ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+ eptr += 3; \
+ } \
+ else if ((c & 0x04) == 0) \
+ { \
+ c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \
+ ((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \
+ (eptr[3] & 0x3f); \
+ eptr += 4; \
+ } \
+ else \
+ { \
+ c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \
+ ((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \
+ ((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \
+ eptr += 5; \
} \
}
@@ -463,31 +504,49 @@ know we are in UTF-8 mode. */
#define GETCHARINC(c, eptr) \
c = *eptr++; \
- if (c >= 0xc0) \
- { \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- while (gcaa-- > 0) \
- { \
- gcss -= 6; \
- c |= (*eptr++ & 0x3f) << gcss; \
- } \
- }
+ if (c >= 0xc0) GETUTF8INC(c, eptr);
-/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
+/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
+This is called when we don't know if we are in UTF-8 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
- if (utf8 && c >= 0xc0) \
+ if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, not
+advancing the pointer, incrementing the length. */
+
+#define GETUTF8LEN(c, eptr, len) \
{ \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- while (gcaa-- > 0) \
+ if ((c & 0x20) == 0) \
+ { \
+ c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
+ len++; \
+ } \
+ else if ((c & 0x10) == 0) \
{ \
- gcss -= 6; \
- c |= (*eptr++ & 0x3f) << gcss; \
+ c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+ len += 2; \
+ } \
+ else if ((c & 0x08) == 0) \
+ {\
+ c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
+ ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
+ len += 3; \
+ } \
+ else if ((c & 0x04) == 0) \
+ { \
+ c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
+ ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
+ (eptr[4] & 0x3f); \
+ len += 4; \
+ } \
+ else \
+ {\
+ c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
+ ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
+ ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
+ len += 5; \
} \
}
@@ -496,39 +555,15 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
#define GETCHARLEN(c, eptr, len) \
c = *eptr; \
- if (c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- len += gcaa; \
- }
+ if (c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
-know we are in UTF-8 mode. */
+do not know if we are in UTF-8 mode. */
#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
- if (utf8 && c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- len += gcaa; \
- }
+ if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
@@ -536,7 +571,7 @@ because almost all calls are already within a block of UTF-8 only code. */
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
-#endif
+#endif /* SUPPORT_UTF8 */
/* In case there is no definition of offsetof() provided - though any proper
@@ -580,7 +615,7 @@ time, run time, or study time, respectively. */
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
- PCRE_JAVASCRIPT_COMPAT)
+ PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
@@ -620,7 +655,7 @@ variable-length repeat, or a anything other than literal characters. */
environments where these macros are defined elsewhere. Unfortunately, there
is no way to do the same for the typedef. */
-typedef gboolean BOOL;
+typedef gboolean BOOL;
/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
character constants like '*' because the compiler would emit their EBCDIC code,
@@ -870,6 +905,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
#define STRING_COMMIT0 "COMMIT\0"
#define STRING_F0 "F\0"
#define STRING_FAIL0 "FAIL\0"
+#define STRING_MARK0 "MARK\0"
#define STRING_PRUNE0 "PRUNE\0"
#define STRING_SKIP0 "SKIP\0"
#define STRING_THEN "THEN"
@@ -891,14 +927,16 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
#define STRING_DEFINE "DEFINE"
-#define STRING_CR_RIGHTPAR "CR)"
-#define STRING_LF_RIGHTPAR "LF)"
-#define STRING_CRLF_RIGHTPAR "CRLF)"
-#define STRING_ANY_RIGHTPAR "ANY)"
-#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
-#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
-#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
-#define STRING_UTF8_RIGHTPAR "UTF8)"
+#define STRING_CR_RIGHTPAR "CR)"
+#define STRING_LF_RIGHTPAR "LF)"
+#define STRING_CRLF_RIGHTPAR "CRLF)"
+#define STRING_ANY_RIGHTPAR "ANY)"
+#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
+#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
+#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
+#define STRING_UTF8_RIGHTPAR "UTF8)"
+#define STRING_UCP_RIGHTPAR "UCP)"
+#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
#else /* SUPPORT_UTF8 */
@@ -1122,6 +1160,7 @@ only. */
#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
#define STRING_F0 STR_F "\0"
#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
+#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0"
#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
#define STRING_THEN STR_T STR_H STR_E STR_N
@@ -1143,14 +1182,16 @@ only. */
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
-#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
-#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
-#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
-#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
+#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
+#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
+#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
+#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
#endif /* SUPPORT_UTF8 */
@@ -1183,9 +1224,13 @@ only. */
#define PT_ANY 0 /* Any property - matches all chars */
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
-#define PT_GC 2 /* General characteristic (e.g. L) */
-#define PT_PC 3 /* Particular characteristic (e.g. Lu) */
+#define PT_GC 2 /* Specified general characteristic (e.g. L) */
+#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
#define PT_SC 4 /* Script (e.g. Han) */
+#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
+#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
+#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
+#define PT_WORD 8 /* Word - L plus N plus underscore */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain UTF-8 characters with values greater than 255. */
@@ -1202,9 +1247,15 @@ contain UTF-8 characters with values greater than 255. */
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
-definitions below, up to ESC_z. There's a dummy for OP_ANY because it
-corresponds to "." rather than an escape sequence, and another for OP_ALLANY
-(which is used for [^] in JavaScript compatibility mode).
+definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
+corresponds to "." in DOTALL mode rather than an escape sequence. It is also
+used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
+like \N.
+
+The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
+when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
+They must be contiguous, and remain in order so that the replacements can be
+looked up from a table.
The final escape must be ESC_REF as subsequent values are used for
backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
@@ -1214,11 +1265,12 @@ put in between that don't consume a character, that code will have to change.
*/
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
- ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
- ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
+ ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
+ ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
+ ESC_E, ESC_Q, ESC_g, ESC_k,
+ ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu,
ESC_REF };
-
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
OP_EOD must correspond in order to the list of escapes immediately above.
@@ -1242,8 +1294,8 @@ enum {
OP_WHITESPACE, /* 9 \s */
OP_NOT_WORDCHAR, /* 10 \W */
OP_WORDCHAR, /* 11 \w */
- OP_ANY, /* 12 Match any character (subject to DOTALL) */
- OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
+ OP_ANY, /* 12 Match any character except newline */
+ OP_ALLANY, /* 13 Match any character */
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_NOTPROP, /* 15 \P (not Unicode property) */
OP_PROP, /* 16 \p (Unicode property) */
@@ -1373,20 +1425,24 @@ enum {
/* These are backtracking control verbs */
- OP_PRUNE, /* 107 */
- OP_SKIP, /* 108 */
- OP_THEN, /* 109 */
- OP_COMMIT, /* 110 */
+ OP_MARK, /* 107 always has an argument */
+ OP_PRUNE, /* 108 */
+ OP_PRUNE_ARG, /* 109 same, but with argument */
+ OP_SKIP, /* 110 */
+ OP_SKIP_ARG, /* 111 same, but with argument */
+ OP_THEN, /* 112 */
+ OP_THEN_ARG, /* 113 same, but with argument */
+ OP_COMMIT, /* 114 */
/* These are forced failure and success verbs */
- OP_FAIL, /* 111 */
- OP_ACCEPT, /* 112 */
- OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */
+ OP_FAIL, /* 115 */
+ OP_ACCEPT, /* 116 */
+ OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
- OP_SKIPZERO, /* 114 */
+ OP_SKIPZERO, /* 118 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
@@ -1397,7 +1453,7 @@ enum {
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
-called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */
+called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only
@@ -1422,7 +1478,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
"Brazero", "Braminzero", \
- "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
+ "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
+ "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
"Close", "Skip zero"
@@ -1488,8 +1545,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */
3, 3, /* RREF, NRREF */ \
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
- 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
- 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
+ 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
+ 1, 3, /* SKIP, SKIP_ARG */ \
+ 1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
+ 1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
@@ -1507,7 +1566,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
- ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
+ ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
+ ERRCOUNT };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
@@ -1650,6 +1710,7 @@ typedef struct match_data {
BOOL noteol; /* NOTEOL flag */
BOOL utf8; /* UTF8 flag */
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
+ BOOL use_ucp; /* PCRE_UCP flag */
BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */
BOOL notempty_atstart; /* Empty string match at start not wanted */
@@ -1669,6 +1730,7 @@ typedef struct match_data {
int eptrn; /* Next free eptrblock */
recursion_info *recursive; /* Linked list of recursion data */
void *callout_data; /* To pass back to callouts */
+ const uschar *mark; /* Mark pointer to pass back */
} match_data;
/* A similar structure is used for the same purpose by the DFA matching
@@ -1764,7 +1826,7 @@ extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
-#define _pcre_valid_utf8(u, i) TRUE
+#define _pcre_valid_utf8(USPTR, int) TRUE
extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
extern BOOL _pcre_xclass(int, const uschar *);
diff --git a/glib/pcre/pcre_study.c b/glib/pcre/pcre_study.c
index bd00a53a6..be321fa25 100644
--- a/glib/pcre/pcre_study.c
+++ b/glib/pcre/pcre_study.c
@@ -48,6 +48,7 @@ supporting functions. */
#include "pcre_internal.h"
+#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
/* Returns from set_start_bits() */
@@ -413,6 +414,18 @@ for (;;)
#endif
break;
+ /* Skip these, but we need to add in the name length. */
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ cc += _pcre_OP_lengths[op] + cc[1];
+ break;
+
+ case OP_THEN_ARG:
+ cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
+ break;
+
/* For the record, these are the opcodes that are matched by "default":
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
OP_THEN. */
@@ -431,25 +444,121 @@ for (;;)
* Set a bit and maybe its alternate case *
*************************************************/
-/* Given a character, set its bit in the table, and also the bit for the other
-version of a letter if we are caseless.
+/* Given a character, set its first byte's bit in the table, and also the
+corresponding bit for the other version of a letter if we are caseless. In
+UTF-8 mode, for characters greater than 127, we can only do the caseless thing
+when Unicode property support is available.
Arguments:
start_bits points to the bit map
- c is the character
+ p points to the character
caseless the caseless flag
cd the block with char table pointers
+ utf8 TRUE for UTF-8 mode
-Returns: nothing
+Returns: pointer after the character
+*/
+
+static const uschar *
+set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
+ compile_data *cd, BOOL utf8)
+{
+unsigned int c = *p;
+
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF8
+if (utf8 && c > 127)
+ {
+ GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+ if (caseless)
+ {
+ uschar buff[8];
+ c = UCD_OTHERCASE(c);
+ (void)_pcre_ord2utf8(c, buff);
+ SET_BIT(buff[0]);
+ }
+#endif
+ return p;
+ }
+#endif
+
+/* Not UTF-8 mode, or character is less than 128. */
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+}
+
+
+
+/*************************************************
+* Set bits for a positive character type *
+*************************************************/
+
+/* This function sets starting bits for a character type. In UTF-8 mode, we can
+only do a direct setting for bytes less than 128, as otherwise there can be
+confusion with bytes in the middle of UTF-8 characters. In a "traditional"
+environment, the tables will only recognize ASCII characters anyway, but in at
+least one Windows environment, some high-byte bits were set in the tables.
+So we deal with that case by considering the UTF-8 encoding.
+
+Arguments:
+ start_bits the starting bitmap
+ cbit_type the type of character wanted
+ table_limit 32 for non-UTF-8; 16 for UTF-8
+ cd the block with char table pointers
+
+Returns: nothing
*/
static void
-set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
+set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
compile_data *cd)
{
-start_bits[c/8] |= (1 << (c&7));
-if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
- start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+if (table_limit == 32) return;
+for (c = 128; c < 256; c++)
+ {
+ if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
+ {
+ uschar buff[8];
+ (void)_pcre_ord2utf8(c, buff);
+ SET_BIT(buff[0]);
+ }
+ }
+}
+
+
+/*************************************************
+* Set bits for a negative character type *
+*************************************************/
+
+/* This function sets starting bits for a negative character type such as \D.
+In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
+otherwise there can be confusion with bytes in the middle of UTF-8 characters.
+Unlike in the positive case, where we can set appropriate starting bits for
+specific high-valued UTF-8 characters, in this case we have to set the bits for
+all high-valued characters. The lowest is 0xc2, but we overkill by starting at
+0xc0 (192) for simplicity.
+
+Arguments:
+ start_bits the starting bitmap
+ cbit_type the type of character wanted
+ table_limit 32 for non-UTF-8; 16 for UTF-8
+ cd the block with char table pointers
+
+Returns: nothing
+*/
+
+static void
+set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+ compile_data *cd)
+{
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
}
@@ -484,6 +593,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
{
register int c;
int yield = SSB_DONE;
+int table_limit = utf8? 16:32;
#if 0
/* ========================================================================= */
@@ -607,12 +717,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- set_table_bit(start_bits, tcode[1], caseless, cd);
- tcode += 2;
-#ifdef SUPPORT_UTF8
- if (utf8 && tcode[-1] >= 0xc0)
- tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+ tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
break;
/* Single-char upto sets the bit and tries the next */
@@ -620,12 +725,7 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- set_table_bit(start_bits, tcode[3], caseless, cd);
- tcode += 4;
-#ifdef SUPPORT_UTF8
- if (utf8 && tcode[-1] >= 0xc0)
- tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+ tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
break;
/* At least one single char sets the bit and stops */
@@ -638,59 +738,86 @@ do
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- set_table_bit(start_bits, tcode[1], caseless, cd);
+ (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
+ try_next = FALSE;
+ break;
+
+ /* Special spacing and line-terminating items. These recognize specific
+ lists of characters. The difference between VSPACE and ANYNL is that the
+ latter can match the two-character CRLF sequence, but that is not
+ relevant for finding the first character, so their code here is
+ identical. */
+
+ case OP_HSPACE:
+ SET_BIT(0x09);
+ SET_BIT(0x20);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+00A0 */
+ SET_BIT(0xE1); /* For U+1680, U+180E */
+ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
+ SET_BIT(0xE3); /* For U+3000 */
+ }
+ else SET_BIT(0xA0);
+ try_next = FALSE;
+ break;
+
+ case OP_ANYNL:
+ case OP_VSPACE:
+ SET_BIT(0x0A);
+ SET_BIT(0x0B);
+ SET_BIT(0x0C);
+ SET_BIT(0x0D);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
try_next = FALSE;
break;
- /* Single character type sets the bits and stops */
+ /* Single character types set the bits and stop. Note that if PCRE_UCP
+ is set, we do not see these op codes because \d etc are converted to
+ properties. Therefore, these apply in the case when only characters less
+ than 256 are recognized to match the types. */
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_digit];
+ set_type_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ ensure it is set as not whitespace. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= ~d;
- }
+ set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] |= 0x08;
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ not set it from the table. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= d;
- }
+ c = start_bits[1]; /* Save in case it was already set */
+ set_type_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] = (start_bits[1] & ~0x08) | c;
try_next = FALSE;
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_word];
+ set_nottype_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_word];
+ set_type_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
@@ -699,6 +826,7 @@ do
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
+ case OP_TYPEPOSPLUS:
tcode++;
break;
@@ -722,52 +850,69 @@ do
case OP_TYPEPOSQUERY:
switch(tcode[1])
{
+ default:
case OP_ANY:
case OP_ALLANY:
return SSB_FAIL;
+ case OP_HSPACE:
+ SET_BIT(0x09);
+ SET_BIT(0x20);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+00A0 */
+ SET_BIT(0xE1); /* For U+1680, U+180E */
+ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
+ SET_BIT(0xE3); /* For U+3000 */
+ }
+ else SET_BIT(0xA0);
+ break;
+
+ case OP_ANYNL:
+ case OP_VSPACE:
+ SET_BIT(0x0A);
+ SET_BIT(0x0B);
+ SET_BIT(0x0C);
+ SET_BIT(0x0D);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
+ break;
+
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_digit];
+ set_type_bits(start_bits, cbit_digit, table_limit, cd);
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ ensure it gets set as not whitespace. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= ~d;
- }
+ set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] |= 0x08;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ avoid setting it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= d;
- }
+ c = start_bits[1]; /* Save in case it was already set */
+ set_type_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] = (start_bits[1] & ~0x08) | c;
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_word];
+ set_nottype_bits(start_bits, cbit_word, table_limit, cd);
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_word];
+ set_type_bits(start_bits, cbit_word, table_limit, cd);
break;
}
diff --git a/glib/pcre/pcre_tables.c b/glib/pcre/pcre_tables.c
index b7f7ba5d1..8cc4eb309 100644
--- a/glib/pcre/pcre_tables.c
+++ b/glib/pcre/pcre_tables.c
@@ -123,8 +123,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
+#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
+#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
@@ -184,6 +186,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Lu0 STR_L STR_u "\0"
#define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
+#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
#define STRING_M0 STR_M "\0"
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
#define STRING_Mc0 STR_M STR_c "\0"
@@ -243,6 +246,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
#define STRING_Vai0 STR_V STR_a STR_i "\0"
+#define STRING_Xan0 STR_X STR_a STR_n "\0"
+#define STRING_Xps0 STR_X STR_p STR_s "\0"
+#define STRING_Xsp0 STR_X STR_s STR_p "\0"
+#define STRING_Xwd0 STR_X STR_w STR_d "\0"
#define STRING_Yi0 STR_Y STR_i "\0"
#define STRING_Z0 STR_Z "\0"
#define STRING_Zl0 STR_Z STR_l "\0"
@@ -256,8 +263,10 @@ const char _pcre_utt_names[] =
STRING_Avestan0
STRING_Balinese0
STRING_Bamum0
+ STRING_Batak0
STRING_Bengali0
STRING_Bopomofo0
+ STRING_Brahmi0
STRING_Braille0
STRING_Buginese0
STRING_Buhid0
@@ -319,6 +328,7 @@ const char _pcre_utt_names[] =
STRING_Lydian0
STRING_M0
STRING_Malayalam0
+ STRING_Mandaic0
STRING_Mc0
STRING_Me0
STRING_Meetei_Mayek0
@@ -376,6 +386,10 @@ const char _pcre_utt_names[] =
STRING_Tifinagh0
STRING_Ugaritic0
STRING_Vai0
+ STRING_Xan0
+ STRING_Xps0
+ STRING_Xsp0
+ STRING_Xwd0
STRING_Yi0
STRING_Z0
STRING_Zl0
@@ -389,131 +403,138 @@ const ucp_type_table _pcre_utt[] = {
{ 20, PT_SC, ucp_Avestan },
{ 28, PT_SC, ucp_Balinese },
{ 37, PT_SC, ucp_Bamum },
- { 43, PT_SC, ucp_Bengali },
- { 51, PT_SC, ucp_Bopomofo },
- { 60, PT_SC, ucp_Braille },
- { 68, PT_SC, ucp_Buginese },
- { 77, PT_SC, ucp_Buhid },
- { 83, PT_GC, ucp_C },
- { 85, PT_SC, ucp_Canadian_Aboriginal },
- { 105, PT_SC, ucp_Carian },
- { 112, PT_PC, ucp_Cc },
- { 115, PT_PC, ucp_Cf },
- { 118, PT_SC, ucp_Cham },
- { 123, PT_SC, ucp_Cherokee },
- { 132, PT_PC, ucp_Cn },
- { 135, PT_PC, ucp_Co },
- { 138, PT_SC, ucp_Common },
- { 145, PT_SC, ucp_Coptic },
- { 152, PT_PC, ucp_Cs },
- { 155, PT_SC, ucp_Cuneiform },
- { 165, PT_SC, ucp_Cypriot },
- { 173, PT_SC, ucp_Cyrillic },
- { 182, PT_SC, ucp_Deseret },
- { 190, PT_SC, ucp_Devanagari },
- { 201, PT_SC, ucp_Egyptian_Hieroglyphs },
- { 222, PT_SC, ucp_Ethiopic },
- { 231, PT_SC, ucp_Georgian },
- { 240, PT_SC, ucp_Glagolitic },
- { 251, PT_SC, ucp_Gothic },
- { 258, PT_SC, ucp_Greek },
- { 264, PT_SC, ucp_Gujarati },
- { 273, PT_SC, ucp_Gurmukhi },
- { 282, PT_SC, ucp_Han },
- { 286, PT_SC, ucp_Hangul },
- { 293, PT_SC, ucp_Hanunoo },
- { 301, PT_SC, ucp_Hebrew },
- { 308, PT_SC, ucp_Hiragana },
- { 317, PT_SC, ucp_Imperial_Aramaic },
- { 334, PT_SC, ucp_Inherited },
- { 344, PT_SC, ucp_Inscriptional_Pahlavi },
- { 366, PT_SC, ucp_Inscriptional_Parthian },
- { 389, PT_SC, ucp_Javanese },
- { 398, PT_SC, ucp_Kaithi },
- { 405, PT_SC, ucp_Kannada },
- { 413, PT_SC, ucp_Katakana },
- { 422, PT_SC, ucp_Kayah_Li },
- { 431, PT_SC, ucp_Kharoshthi },
- { 442, PT_SC, ucp_Khmer },
- { 448, PT_GC, ucp_L },
- { 450, PT_LAMP, 0 },
- { 453, PT_SC, ucp_Lao },
- { 457, PT_SC, ucp_Latin },
- { 463, PT_SC, ucp_Lepcha },
- { 470, PT_SC, ucp_Limbu },
- { 476, PT_SC, ucp_Linear_B },
- { 485, PT_SC, ucp_Lisu },
- { 490, PT_PC, ucp_Ll },
- { 493, PT_PC, ucp_Lm },
- { 496, PT_PC, ucp_Lo },
- { 499, PT_PC, ucp_Lt },
- { 502, PT_PC, ucp_Lu },
- { 505, PT_SC, ucp_Lycian },
- { 512, PT_SC, ucp_Lydian },
- { 519, PT_GC, ucp_M },
- { 521, PT_SC, ucp_Malayalam },
- { 531, PT_PC, ucp_Mc },
- { 534, PT_PC, ucp_Me },
- { 537, PT_SC, ucp_Meetei_Mayek },
- { 550, PT_PC, ucp_Mn },
- { 553, PT_SC, ucp_Mongolian },
- { 563, PT_SC, ucp_Myanmar },
- { 571, PT_GC, ucp_N },
- { 573, PT_PC, ucp_Nd },
- { 576, PT_SC, ucp_New_Tai_Lue },
- { 588, PT_SC, ucp_Nko },
- { 592, PT_PC, ucp_Nl },
- { 595, PT_PC, ucp_No },
- { 598, PT_SC, ucp_Ogham },
- { 604, PT_SC, ucp_Ol_Chiki },
- { 613, PT_SC, ucp_Old_Italic },
- { 624, PT_SC, ucp_Old_Persian },
- { 636, PT_SC, ucp_Old_South_Arabian },
- { 654, PT_SC, ucp_Old_Turkic },
- { 665, PT_SC, ucp_Oriya },
- { 671, PT_SC, ucp_Osmanya },
- { 679, PT_GC, ucp_P },
- { 681, PT_PC, ucp_Pc },
- { 684, PT_PC, ucp_Pd },
- { 687, PT_PC, ucp_Pe },
- { 690, PT_PC, ucp_Pf },
- { 693, PT_SC, ucp_Phags_Pa },
- { 702, PT_SC, ucp_Phoenician },
- { 713, PT_PC, ucp_Pi },
- { 716, PT_PC, ucp_Po },
- { 719, PT_PC, ucp_Ps },
- { 722, PT_SC, ucp_Rejang },
- { 729, PT_SC, ucp_Runic },
- { 735, PT_GC, ucp_S },
- { 737, PT_SC, ucp_Samaritan },
- { 747, PT_SC, ucp_Saurashtra },
- { 758, PT_PC, ucp_Sc },
- { 761, PT_SC, ucp_Shavian },
- { 769, PT_SC, ucp_Sinhala },
- { 777, PT_PC, ucp_Sk },
- { 780, PT_PC, ucp_Sm },
- { 783, PT_PC, ucp_So },
- { 786, PT_SC, ucp_Sundanese },
- { 796, PT_SC, ucp_Syloti_Nagri },
- { 809, PT_SC, ucp_Syriac },
- { 816, PT_SC, ucp_Tagalog },
- { 824, PT_SC, ucp_Tagbanwa },
- { 833, PT_SC, ucp_Tai_Le },
- { 840, PT_SC, ucp_Tai_Tham },
- { 849, PT_SC, ucp_Tai_Viet },
- { 858, PT_SC, ucp_Tamil },
- { 864, PT_SC, ucp_Telugu },
- { 871, PT_SC, ucp_Thaana },
- { 878, PT_SC, ucp_Thai },
- { 883, PT_SC, ucp_Tibetan },
- { 891, PT_SC, ucp_Tifinagh },
- { 900, PT_SC, ucp_Ugaritic },
- { 909, PT_SC, ucp_Vai },
- { 913, PT_SC, ucp_Yi },
- { 916, PT_GC, ucp_Z },
- { 918, PT_PC, ucp_Zl },
- { 921, PT_PC, ucp_Zp },
- { 924, PT_PC, ucp_Zs }
+ { 43, PT_SC, ucp_Batak },
+ { 49, PT_SC, ucp_Bengali },
+ { 57, PT_SC, ucp_Bopomofo },
+ { 66, PT_SC, ucp_Brahmi },
+ { 73, PT_SC, ucp_Braille },
+ { 81, PT_SC, ucp_Buginese },
+ { 90, PT_SC, ucp_Buhid },
+ { 96, PT_GC, ucp_C },
+ { 98, PT_SC, ucp_Canadian_Aboriginal },
+ { 118, PT_SC, ucp_Carian },
+ { 125, PT_PC, ucp_Cc },
+ { 128, PT_PC, ucp_Cf },
+ { 131, PT_SC, ucp_Cham },
+ { 136, PT_SC, ucp_Cherokee },
+ { 145, PT_PC, ucp_Cn },
+ { 148, PT_PC, ucp_Co },
+ { 151, PT_SC, ucp_Common },
+ { 158, PT_SC, ucp_Coptic },
+ { 165, PT_PC, ucp_Cs },
+ { 168, PT_SC, ucp_Cuneiform },
+ { 178, PT_SC, ucp_Cypriot },
+ { 186, PT_SC, ucp_Cyrillic },
+ { 195, PT_SC, ucp_Deseret },
+ { 203, PT_SC, ucp_Devanagari },
+ { 214, PT_SC, ucp_Egyptian_Hieroglyphs },
+ { 235, PT_SC, ucp_Ethiopic },
+ { 244, PT_SC, ucp_Georgian },
+ { 253, PT_SC, ucp_Glagolitic },
+ { 264, PT_SC, ucp_Gothic },
+ { 271, PT_SC, ucp_Greek },
+ { 277, PT_SC, ucp_Gujarati },
+ { 286, PT_SC, ucp_Gurmukhi },
+ { 295, PT_SC, ucp_Han },
+ { 299, PT_SC, ucp_Hangul },
+ { 306, PT_SC, ucp_Hanunoo },
+ { 314, PT_SC, ucp_Hebrew },
+ { 321, PT_SC, ucp_Hiragana },
+ { 330, PT_SC, ucp_Imperial_Aramaic },
+ { 347, PT_SC, ucp_Inherited },
+ { 357, PT_SC, ucp_Inscriptional_Pahlavi },
+ { 379, PT_SC, ucp_Inscriptional_Parthian },
+ { 402, PT_SC, ucp_Javanese },
+ { 411, PT_SC, ucp_Kaithi },
+ { 418, PT_SC, ucp_Kannada },
+ { 426, PT_SC, ucp_Katakana },
+ { 435, PT_SC, ucp_Kayah_Li },
+ { 444, PT_SC, ucp_Kharoshthi },
+ { 455, PT_SC, ucp_Khmer },
+ { 461, PT_GC, ucp_L },
+ { 463, PT_LAMP, 0 },
+ { 466, PT_SC, ucp_Lao },
+ { 470, PT_SC, ucp_Latin },
+ { 476, PT_SC, ucp_Lepcha },
+ { 483, PT_SC, ucp_Limbu },
+ { 489, PT_SC, ucp_Linear_B },
+ { 498, PT_SC, ucp_Lisu },
+ { 503, PT_PC, ucp_Ll },
+ { 506, PT_PC, ucp_Lm },
+ { 509, PT_PC, ucp_Lo },
+ { 512, PT_PC, ucp_Lt },
+ { 515, PT_PC, ucp_Lu },
+ { 518, PT_SC, ucp_Lycian },
+ { 525, PT_SC, ucp_Lydian },
+ { 532, PT_GC, ucp_M },
+ { 534, PT_SC, ucp_Malayalam },
+ { 544, PT_SC, ucp_Mandaic },
+ { 552, PT_PC, ucp_Mc },
+ { 555, PT_PC, ucp_Me },
+ { 558, PT_SC, ucp_Meetei_Mayek },
+ { 571, PT_PC, ucp_Mn },
+ { 574, PT_SC, ucp_Mongolian },
+ { 584, PT_SC, ucp_Myanmar },
+ { 592, PT_GC, ucp_N },
+ { 594, PT_PC, ucp_Nd },
+ { 597, PT_SC, ucp_New_Tai_Lue },
+ { 609, PT_SC, ucp_Nko },
+ { 613, PT_PC, ucp_Nl },
+ { 616, PT_PC, ucp_No },
+ { 619, PT_SC, ucp_Ogham },
+ { 625, PT_SC, ucp_Ol_Chiki },
+ { 634, PT_SC, ucp_Old_Italic },
+ { 645, PT_SC, ucp_Old_Persian },
+ { 657, PT_SC, ucp_Old_South_Arabian },
+ { 675, PT_SC, ucp_Old_Turkic },
+ { 686, PT_SC, ucp_Oriya },
+ { 692, PT_SC, ucp_Osmanya },
+ { 700, PT_GC, ucp_P },
+ { 702, PT_PC, ucp_Pc },
+ { 705, PT_PC, ucp_Pd },
+ { 708, PT_PC, ucp_Pe },
+ { 711, PT_PC, ucp_Pf },
+ { 714, PT_SC, ucp_Phags_Pa },
+ { 723, PT_SC, ucp_Phoenician },
+ { 734, PT_PC, ucp_Pi },
+ { 737, PT_PC, ucp_Po },
+ { 740, PT_PC, ucp_Ps },
+ { 743, PT_SC, ucp_Rejang },
+ { 750, PT_SC, ucp_Runic },
+ { 756, PT_GC, ucp_S },
+ { 758, PT_SC, ucp_Samaritan },
+ { 768, PT_SC, ucp_Saurashtra },
+ { 779, PT_PC, ucp_Sc },
+ { 782, PT_SC, ucp_Shavian },
+ { 790, PT_SC, ucp_Sinhala },
+ { 798, PT_PC, ucp_Sk },
+ { 801, PT_PC, ucp_Sm },
+ { 804, PT_PC, ucp_So },
+ { 807, PT_SC, ucp_Sundanese },
+ { 817, PT_SC, ucp_Syloti_Nagri },
+ { 830, PT_SC, ucp_Syriac },
+ { 837, PT_SC, ucp_Tagalog },
+ { 845, PT_SC, ucp_Tagbanwa },
+ { 854, PT_SC, ucp_Tai_Le },
+ { 861, PT_SC, ucp_Tai_Tham },
+ { 870, PT_SC, ucp_Tai_Viet },
+ { 879, PT_SC, ucp_Tamil },
+ { 885, PT_SC, ucp_Telugu },
+ { 892, PT_SC, ucp_Thaana },
+ { 899, PT_SC, ucp_Thai },
+ { 904, PT_SC, ucp_Tibetan },
+ { 912, PT_SC, ucp_Tifinagh },
+ { 921, PT_SC, ucp_Ugaritic },
+ { 930, PT_SC, ucp_Vai },
+ { 934, PT_ALNUM, 0 },
+ { 938, PT_PXSPACE, 0 },
+ { 942, PT_SPACE, 0 },
+ { 946, PT_WORD, 0 },
+ { 950, PT_SC, ucp_Yi },
+ { 953, PT_GC, ucp_Z },
+ { 955, PT_PC, ucp_Zl },
+ { 958, PT_PC, ucp_Zp },
+ { 961, PT_PC, ucp_Zs }
};
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
diff --git a/glib/pcre/pcre_xclass.c b/glib/pcre/pcre_xclass.c
index c25ecdc75..5b1b6f4ff 100644
--- a/glib/pcre/pcre_xclass.c
+++ b/glib/pcre/pcre_xclass.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
+ Copyright (c) 1997-2010 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -104,6 +104,7 @@ while ((t = *data++) != XCL_END)
else /* XCL_PROP & XCL_NOTPROP */
{
int chartype = UCD_CHARTYPE(c);
+
switch(*data)
{
case PT_ANY:
@@ -111,12 +112,13 @@ while ((t = *data++) != XCL_END)
break;
case PT_LAMP:
- if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) ==
- (t == XCL_PROP)) return !negated;
+ if ((chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
break;
case PT_GC:
- if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP)) return !negated;
+ if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP))
+ return !negated;
break;
case PT_PC:
@@ -127,6 +129,33 @@ while ((t = *data++) != XCL_END)
if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated;
break;
+ case PT_ALNUM:
+ if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N) == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+ == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_WORD:
+ if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+ _pcre_ucp_gentype[chartype] == ucp_N || c == CHAR_UNDERSCORE)
+ == (t == XCL_PROP))
+ return !negated;
+ break;
+
/* This should never occur, but compilers may mutter if there is no
default. */
diff --git a/glib/pcre/ucp.h b/glib/pcre/ucp.h
index f1b68b0c2..dcaa827ef 100644
--- a/glib/pcre/ucp.h
+++ b/glib/pcre/ucp.h
@@ -150,7 +150,10 @@ enum {
ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC,
ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
- ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET
+ ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET,
+ ucp_Batak = G_UNICODE_SCRIPT_BATAK,
+ ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI,
+ ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC
};
#endif