-
-#include "pcre_internal.h"
-
-#define DFTABLES /* pcre_maketables.c notices this */
-#include "pcre_maketables.c"
-
-
-int main(int argc, char **argv)
-{
-FILE *f;
-int i = 1;
-const unsigned char *tables;
-const unsigned char *base_of_tables;
-
-/* By default, the default C locale is used rather than what the building user
-happens to have set. However, if the -L option is given, set the locale from
-the LC_xxx environment variables. */
-
-if (argc > 1 && strcmp(argv[1], "-L") == 0)
- {
- setlocale(LC_ALL, ""); /* Set from environment variables */
- i++;
- }
-
-if (argc < i + 1)
- {
- fprintf(stderr, "dftables: one filename argument is required\n");
- return 1;
- }
-
-tables = pcre_maketables();
-base_of_tables = tables;
-
-f = fopen(argv[i], "wb");
-if (f == NULL)
- {
- fprintf(stderr, "dftables: failed to open %s for writing\n", argv[1]);
- return 1;
- }
-
-/* There are several fprintf() calls here, because gcc in pedantic mode
-complains about the very long string otherwise. */
-
-fprintf(f,
- "/*************************************************\n"
- "* Perl-Compatible Regular Expressions *\n"
- "*************************************************/\n\n"
- "/* This file was automatically written by the dftables auxiliary\n"
- "program. It contains character tables that are used when no external\n"
- "tables are passed to PCRE by the application that calls it. The tables\n"
- "are used only for characters whose code values are less than 256.\n\n");
-fprintf(f,
- "The following #includes are present because without them gcc 4.x may remove\n"
- "the array definition from the final binary if PCRE is built into a static\n"
- "library and dead code stripping is activated. This leads to link errors.\n"
- "Pulling in the header ensures that the array gets flagged as \"someone\n"
- "outside this compilation unit might reference this\" and so it will always\n"
- "be supplied to the linker. */\n\n"
- "#ifdef HAVE_CONFIG_H\n"
- "#include \"config.h\"\n"
- "#endif\n\n"
- "#include \"pcre_internal.h\"\n\n");
-fprintf(f,
- "const unsigned char _pcre_default_tables[] = {\n\n"
- "/* This table is a lower casing table. */\n\n");
-
-fprintf(f, " ");
-for (i = 0; i < 256; i++)
- {
- if ((i & 7) == 0 && i != 0) fprintf(f, "\n ");
- fprintf(f, "%3d", *tables++);
- if (i != 255) fprintf(f, ",");
- }
-fprintf(f, ",\n\n");
-
-fprintf(f, "/* This table is a case flipping table. */\n\n");
-
-fprintf(f, " ");
-for (i = 0; i < 256; i++)
- {
- if ((i & 7) == 0 && i != 0) fprintf(f, "\n ");
- fprintf(f, "%3d", *tables++);
- if (i != 255) fprintf(f, ",");
- }
-fprintf(f, ",\n\n");
-
-fprintf(f,
- "/* This table contains bit maps for various character classes.\n"
- "Each map is 32 bytes long and the bits run from the least\n"
- "significant end of each byte. The classes that have their own\n"
- "maps are: space, xdigit, digit, upper, lower, word, graph\n"
- "print, punct, and cntrl. Other classes are built from combinations. */\n\n");
-
-fprintf(f, " ");
-for (i = 0; i < cbit_length; i++)
- {
- if ((i & 7) == 0 && i != 0)
- {
- if ((i & 31) == 0) fprintf(f, "\n");
- fprintf(f, "\n ");
- }
- fprintf(f, "0x%02x", *tables++);
- if (i != cbit_length - 1) fprintf(f, ",");
- }
-fprintf(f, ",\n\n");
-
-fprintf(f,
- "/* This table identifies various classes of character by individual bits:\n"
- " 0x%02x white space character\n"
- " 0x%02x letter\n"
- " 0x%02x decimal digit\n"
- " 0x%02x hexadecimal digit\n"
- " 0x%02x alphanumeric or '_'\n"
- " 0x%02x regular expression metacharacter or binary zero\n*/\n\n",
- ctype_space, ctype_letter, ctype_digit, ctype_xdigit, ctype_word,
- ctype_meta);
-
-fprintf(f, " ");
-for (i = 0; i < 256; i++)
- {
- if ((i & 7) == 0 && i != 0)
- {
- fprintf(f, " /* ");
- if (isprint(i-8)) fprintf(f, " %c -", i-8);
- else fprintf(f, "%3d-", i-8);
- if (isprint(i-1)) fprintf(f, " %c ", i-1);
- else fprintf(f, "%3d", i-1);
- fprintf(f, " */\n ");
- }
- fprintf(f, "0x%02x", *tables++);
- if (i != 255) fprintf(f, ",");
- }
-
-fprintf(f, "};/* ");
-if (isprint(i-8)) fprintf(f, " %c -", i-8);
- else fprintf(f, "%3d-", i-8);
-if (isprint(i-1)) fprintf(f, " %c ", i-1);
- else fprintf(f, "%3d", i-1);
-fprintf(f, " */\n\n/* End of pcre_chartables.c */\n");
-
-fclose(f);
-free((void *)base_of_tables);
-return 0;
-}
-
-/* End of dftables.c */
diff --git a/libs/pcre/doc/Tech.Notes b/libs/pcre/doc/Tech.Notes
deleted file mode 100644
index 21dbe1f9b5..0000000000
--- a/libs/pcre/doc/Tech.Notes
+++ /dev/null
@@ -1,348 +0,0 @@
-Technical Notes about PCRE
---------------------------
-
-These are very rough technical notes that record potentially useful information
-about PCRE internals.
-
-Historical note 1
------------------
-
-Many years ago I implemented some regular expression functions to an algorithm
-suggested by Martin Richards. These were not Unix-like in form, and were quite
-restricted in what they could do by comparison with Perl. The interesting part
-about the algorithm was that the amount of space required to hold the compiled
-form of an expression was known in advance. The code to apply an expression did
-not operate by backtracking, as the original Henry Spencer code and current
-Perl code does, but instead checked all possibilities simultaneously by keeping
-a list of current states and checking all of them as it advanced through the
-subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
-algorithm". When the pattern was all used up, all remaining states were
-possible matches, and the one matching the longest subset of the subject string
-was chosen. This did not necessarily maximize the individual wild portions of
-the pattern, as is expected in Unix and Perl-style regular expressions.
-
-Historical note 2
------------------
-
-By contrast, the code originally written by Henry Spencer (which was
-subsequently heavily modified for Perl) compiles the expression twice: once in
-a dummy mode in order to find out how much store will be needed, and then for
-real. (The Perl version probably doesn't do this any more; I'm talking about
-the original library.) The execution function operates by backtracking and
-maximizing (or, optionally, minimizing in Perl) the amount of the subject that
-matches individual wild portions of the pattern. This is an "NFA algorithm" in
-Friedl's terminology.
-
-OK, here's the real stuff
--------------------------
-
-For the set of functions that form the "basic" PCRE library (which are
-unrelated to those mentioned above), I tried at first to invent an algorithm
-that used an amount of store bounded by a multiple of the number of characters
-in the pattern, to save on compiling time. However, because of the greater
-complexity in Perl regular expressions, I couldn't do this. In any case, a
-first pass through the pattern is needed, for a number of reasons. PCRE works
-by running a very degenerate first pass to calculate a maximum store size, and
-then a second pass to do the real compile - which may use a bit less than the
-predicted amount of store. The idea is that this is going to turn out faster
-because the first pass is degenerate and the second pass can just store stuff
-straight into the vector, which it knows is big enough. It does make the
-compiling functions bigger, of course, but they have become quite big anyway to
-handle all the Perl stuff.
-
-Traditional matching function
------------------------------
-
-The "traditional", and original, matching function is called pcre_exec(), and
-it implements an NFA algorithm, similar to the original Henry Spencer algorithm
-and the way that Perl works. Not surprising, since it is intended to be as
-compatible with Perl as possible. This is the function most users of PCRE will
-use most of the time.
-
-Supplementary matching function
--------------------------------
-
-From PCRE 6.0, there is also a supplementary matching function called
-pcre_dfa_exec(). This implements a DFA matching algorithm that searches
-simultaneously for all possible matches that start at one point in the subject
-string. (Going back to my roots: see Historical Note 1 above.) This function
-intreprets the same compiled pattern data as pcre_exec(); however, not all the
-facilities are available, and those that are do not always work in quite the
-same way. See the user documentation for details.
-
-Format of compiled patterns
----------------------------
-
-The compiled form of a pattern is a vector of bytes, containing items of
-variable length. The first byte in an item is an opcode, and the length of the
-item is either implicit in the opcode or contained in the data bytes that
-follow it.
-
-In many cases below "two-byte" data values are specified. This is in fact just
-a default. PCRE can be compiled to use 3-byte or 4-byte values (impairing the
-performance). This is necessary only when patterns whose compiled length is
-greater than 64K are going to be processed. In this description, we assume the
-"normal" compilation options.
-
-A list of all the opcodes follows:
-
-Opcodes with no following data
-------------------------------
-
-These items are all just one byte long
-
- OP_END end of pattern
- OP_ANY match any character
- OP_ANYBYTE match any single byte, even in UTF-8 mode
- OP_SOD match start of data: \A
- OP_SOM, start of match (subject + offset): \G
- OP_CIRC ^ (start of data, or after \n in multiline)
- OP_NOT_WORD_BOUNDARY \W
- OP_WORD_BOUNDARY \w
- OP_NOT_DIGIT \D
- OP_DIGIT \d
- OP_NOT_WHITESPACE \S
- OP_WHITESPACE \s
- OP_NOT_WORDCHAR \W
- OP_WORDCHAR \w
- OP_EODN match end of data or \n at end: \Z
- OP_EOD match end of data: \z
- OP_DOLL $ (end of data, or before \n in multiline)
- OP_EXTUNI match an extended Unicode character
-
-
-Repeating single characters
----------------------------
-
-The common repeats (*, +, ?) when applied to a single character use the
-following opcodes:
-
- OP_STAR
- OP_MINSTAR
- OP_PLUS
- OP_MINPLUS
- OP_QUERY
- OP_MINQUERY
-
-In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
-Those with "MIN" in their name are the minimizing versions. Each is followed by
-the character that is to be repeated. Other repeats make use of
-
- OP_UPTO
- OP_MINUPTO
- OP_EXACT
-
-which are followed by a two-byte count (most significant first) and the
-repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
-non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
-OP_UPTO (or OP_MINUPTO).
-
-
-Repeating character types
--------------------------
-
-Repeats of things like \d are done exactly as for single characters, except
-that instead of a character, the opcode for the type is stored in the data
-byte. The opcodes are:
-
- OP_TYPESTAR
- OP_TYPEMINSTAR
- OP_TYPEPLUS
- OP_TYPEMINPLUS
- OP_TYPEQUERY
- OP_TYPEMINQUERY
- OP_TYPEUPTO
- OP_TYPEMINUPTO
- OP_TYPEEXACT
-
-
-Match by Unicode property
--------------------------
-
-OP_PROP and OP_NOTPROP are used for positive and negative matches of a
-character by testing its Unicode property (the \p and \P escape sequences).
-Each is followed by two bytes that encode the desired property as a type and a
-value.
-
-Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
-three bytes: OP_PROP or OP_NOTPROP and then the desired property type and
-value.
-
-
-Matching literal characters
----------------------------
-
-The OP_CHAR opcode is followed by a single character that is to be matched
-casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
-character may be more than one byte long. (Earlier versions of PCRE used
-multi-character strings, but this was changed to allow some new features to be
-added.)
-
-
-Character classes
------------------
-
-If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
-class, and OP_NOT for a negative one (that is, for something like [^a]).
-However, in UTF-8 mode, the use of OP_NOT applies only to characters with
-values < 128, because OP_NOT is confined to single bytes.
-
-Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
-negated, single-character class. The normal ones (OP_STAR etc.) are used for a
-repeated positive single-character class.
-
-When there's more than one character in a class and all the characters are less
-than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
-one. In either case, the opcode is followed by a 32-byte bit map containing a 1
-bit for every character that is acceptable. The bits are counted from the least
-significant end of each byte.
-
-The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
-subject characters with values greater than 256 can be handled correctly. For
-OP_CLASS they don't match, whereas for OP_NCLASS they do.
-
-For classes containing characters with values > 255, OP_XCLASS is used. It
-optionally uses a bit map (if any characters lie within it), followed by a list
-of pairs and single characters. There is a flag character than indicates
-whether it's a positive or a negative class.
-
-
-Back references
----------------
-
-OP_REF is followed by two bytes containing the reference number.
-
-
-Repeating character classes and back references
------------------------------------------------
-
-Single-character classes are handled specially (see above). This applies to
-OP_CLASS and OP_REF. In both cases, the repeat information follows the base
-item. The matching code looks at the following opcode to see if it is one of
-
- OP_CRSTAR
- OP_CRMINSTAR
- OP_CRPLUS
- OP_CRMINPLUS
- OP_CRQUERY
- OP_CRMINQUERY
- OP_CRRANGE
- OP_CRMINRANGE
-
-All but the last two are just single-byte items. The others are followed by
-four bytes of data, comprising the minimum and maximum repeat counts.
-
-
-Brackets and alternation
-------------------------
-
-A pair of non-capturing (round) brackets is wrapped round each expression at
-compile time, so alternation always happens in the context of brackets.
-
-Non-capturing brackets use the opcode OP_BRA, while capturing brackets use
-OP_BRA+1, OP_BRA+2, etc. [Note for North Americans: "bracket" to some English
-speakers, including myself, can be round, square, curly, or pointy. Hence this
-usage.]
-
-Originally PCRE was limited to 99 capturing brackets (so as not to use up all
-the opcodes). From release 3.5, there is no limit. What happens is that the
-first ones, up to EXTRACT_BASIC_MAX are handled with separate opcodes, as
-above. If there are more, the opcode is set to EXTRACT_BASIC_MAX+1, and the
-first operation in the bracket is OP_BRANUMBER, followed by a 2-byte bracket
-number. This opcode is ignored while matching, but is fished out when handling
-the bracket itself. (They could have all been done like this, but I was making
-minimal changes.)
-
-A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
-next alternative OP_ALT or, if there aren't any branches, to the matching
-OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
-the next one, or to the OP_KET opcode.
-
-OP_KET is used for subpatterns that do not repeat indefinitely, while
-OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
-maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
-positive number) the offset back to the matching OP_BRA opcode.
-
-If a subpattern is quantified such that it is permitted to match zero times, it
-is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
-opcodes which tell the matcher that skipping this subpattern entirely is a
-valid branch.
-
-A subpattern with an indefinite maximum repetition is replicated in the
-compiled data its minimum number of times (or once with OP_BRAZERO if the
-minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX
-as appropriate.
-
-A subpattern with a bounded maximum repetition is replicated in a nested
-fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
-before each replication after the minimum, so that, for example, (abc){2,5} is
-compiled as (abc)(abc)((abc)((abc)(abc)?)?)?.
-
-
-Assertions
-----------
-
-Forward assertions are just like other subpatterns, but starting with one of
-the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
-OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
-is OP_REVERSE, followed by a two byte count of the number of characters to move
-back the pointer in the subject string. When operating in UTF-8 mode, the count
-is a character count rather than a byte count. A separate count is present in
-each alternative of a lookbehind assertion, allowing them to have different
-fixed lengths.
-
-
-Once-only subpatterns
----------------------
-
-These are also just like other subpatterns, but they start with the opcode
-OP_ONCE.
-
-
-Conditional subpatterns
------------------------
-
-These are like other subpatterns, but they start with the opcode OP_COND. If
-the condition is a back reference, this is stored at the start of the
-subpattern using the opcode OP_CREF followed by two bytes containing the
-reference number. If the condition is "in recursion" (coded as "(?(R)"), the
-same scheme is used, with a "reference number" of 0xffff. Otherwise, a
-conditional subpattern always starts with one of the assertions.
-
-
-Recursion
----------
-
-Recursion either matches the current regex, or some subexpression. The opcode
-OP_RECURSE is followed by an value which is the offset to the starting bracket
-from the start of the whole pattern. From release 6.5, OP_RECURSE is
-automatically wrapped inside OP_ONCE brackets (because otherwise some patterns
-broke it). OP_RECURSE is also used for "subroutine" calls, even though they
-are not strictly a recursion.
-
-
-Callout
--------
-
-OP_CALLOUT is followed by one byte of data that holds a callout number in the
-range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
-cases there follows a two-byte value giving the offset in the pattern to the
-start of the following item, and another two-byte item giving the length of the
-next item.
-
-
-Changing options
-----------------
-
-If any of the /i, /m, or /s options are changed within a pattern, an OP_OPT
-opcode is compiled, followed by one byte containing the new settings of these
-flags. If there are several alternatives, there is an occurrence of OP_OPT at
-the start of all those following the first options change, to set appropriate
-options for the start of the alternative. Immediately after the end of the
-group there is another such item to reset the flags to their previous values. A
-change of flag right at the very start of the pattern can be handled entirely
-at compile time, and so does not cause anything to be put into the compiled
-data.
-
-Philip Hazel
-June 2006
diff --git a/libs/pcre/doc/html/index.html b/libs/pcre/doc/html/index.html
deleted file mode 100644
index 8a7174e6ca..0000000000
--- a/libs/pcre/doc/html/index.html
+++ /dev/null
@@ -1,140 +0,0 @@
-
-
-
-PCRE specification
-
-
-Perl-compatible Regular Expressions (PCRE)
-
-The HTML documentation for PCRE comprises the following pages:
-
-
-
-pcre |
- Introductory page |
-
-pcre-config |
- Information about the installation configuration |
-
-pcreapi |
- PCRE's native API |
-
-pcrebuild |
- Options for building PCRE |
-
-pcrecallout |
- The callout facility |
-
-pcrecompat |
- Compability with Perl |
-
-pcrecpp |
- The C++ wrapper for the PCRE library |
-
-pcregrep |
- The pcregrep command |
-
-pcrematching |
- Discussion of the two matching algorithms |
-
-pcrepartial |
- Using PCRE for partial matching |
-
-pcrepattern |
- Specification of the regular expressions supported by PCRE |
-
-pcreperform |
- Some comments on performance |
-
-pcreposix |
- The POSIX API to the PCRE library |
-
-pcreprecompile |
- How to save and re-use compiled patterns |
-
-pcresample |
- Description of the sample program |
-
-pcrestack |
- Discussion of PCRE's stack usage |
-
-pcresyntax |
- Syntax quick-reference summary |
-
-pcretest |
- The pcretest command for testing PCRE |
-
-
-
-There are also individual pages that summarize the interface for each function
-in the library:
-
-
-
-
-
diff --git a/libs/pcre/doc/html/pcre-config.html b/libs/pcre/doc/html/pcre-config.html
deleted file mode 100644
index 09877456fd..0000000000
--- a/libs/pcre/doc/html/pcre-config.html
+++ /dev/null
@@ -1,88 +0,0 @@
-
-
-pcre-config specification
-
-
-pcre-config man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
SYNOPSIS
-
-pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
-[--libs-posix] [--cflags] [--cflags-posix]
-
-
DESCRIPTION
-
-pcre-config returns the configuration of the installed PCRE
-libraries and the options required to compile a program to use them.
-
-
OPTIONS
-
---prefix
-Writes the directory prefix used in the PCRE installation for architecture
-independent files (/usr on many systems, /usr/local on some
-systems) to the standard output.
-
-
---exec-prefix
-Writes the directory prefix used in the PCRE installation for architecture
-dependent files (normally the same as --prefix) to the standard output.
-
-
---version
-Writes the version number of the installed PCRE libraries to the standard
-output.
-
-
---libs
-Writes to the standard output the command line options required to link
-with PCRE (-lpcre on many systems).
-
-
---libs-posix
-Writes to the standard output the command line options required to link with
-the PCRE posix emulation library (-lpcreposix -lpcre on many
-systems).
-
-
---cflags
-Writes to the standard output the command line options required to compile
-files that use PCRE (this may include some -I options, but is blank on
-many systems).
-
-
---cflags-posix
-Writes to the standard output the command line options required to compile
-files that use the PCRE posix emulation library (this may include some -I
-options, but is blank on many systems).
-
-
SEE ALSO
-
-pcre(3)
-
-
AUTHOR
-
-This manual page was originally written by Mark Baker for the Debian GNU/Linux
-system. It has been slightly revised as a generic PCRE man page.
-
-
REVISION
-
-Last updated: 18 April 2007
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre.html b/libs/pcre/doc/html/pcre.html
deleted file mode 100644
index 5e2a036397..0000000000
--- a/libs/pcre/doc/html/pcre.html
+++ /dev/null
@@ -1,306 +0,0 @@
-
-
-pcre specification
-
-
-pcre man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
INTRODUCTION
-
-The PCRE library is a set of functions that implement regular expression
-pattern matching using the same syntax and semantics as Perl, with just a few
-differences. Certain features that appeared in Python and PCRE before they
-appeared in Perl are also available using the Python syntax. There is also some
-support for certain .NET and Oniguruma syntax items, and there is an option for
-requesting some minor changes that give better JavaScript compatibility.
-
-
-The current implementation of PCRE (release 7.x) corresponds approximately with
-Perl 5.10, including support for UTF-8 encoded strings and Unicode general
-category properties. However, UTF-8 and Unicode support has to be explicitly
-enabled; it is not the default. The Unicode tables correspond to Unicode
-release 5.1.
-
-
-In addition to the Perl-compatible matching function, PCRE contains an
-alternative matching function that matches the same compiled patterns in a
-different way. In certain circumstances, the alternative function has some
-advantages. For a discussion of the two matching algorithms, see the
-pcrematching
-page.
-
-
-PCRE is written in C and released as a C library. A number of people have
-written wrappers and interfaces of various kinds. In particular, Google Inc.
-have provided a comprehensive C++ wrapper. This is now included as part of the
-PCRE distribution. The
-pcrecpp
-page has details of this interface. Other people's contributions can be found
-in the Contrib directory at the primary FTP site, which is:
-ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
-
-
-Details of exactly which Perl regular expression features are and are not
-supported by PCRE are given in separate documents. See the
-pcrepattern
-and
-pcrecompat
-pages. There is a syntax summary in the
-pcresyntax
-page.
-
-
-Some features of PCRE can be included, excluded, or changed when the library is
-built. The
-pcre_config()
-function makes it possible for a client to discover which features are
-available. The features themselves are described in the
-pcrebuild
-page. Documentation about building PCRE for various operating systems can be
-found in the README file in the source distribution.
-
-
-The library contains a number of undocumented internal functions and data
-tables that are used by more than one of the exported external functions, but
-which are not intended for use by external callers. Their names all begin with
-"_pcre_", which hopefully will not provoke any name clashes. In some
-environments, it is possible to control which external symbols are exported
-when a shared library is built, and in these cases the undocumented symbols are
-not exported.
-
-
USER DOCUMENTATION
-
-The user documentation for PCRE comprises a number of different sections. In
-the "man" format, each of these is a separate "man page". In the HTML format,
-each is a separate page, linked from the index page. In the plain text format,
-all the sections are concatenated, for ease of searching. The sections are as
-follows:
-
- pcre this document
- pcre-config show PCRE installation configuration information
- pcreapi details of PCRE's native C API
- pcrebuild options for building PCRE
- pcrecallout details of the callout feature
- pcrecompat discussion of Perl compatibility
- pcrecpp details of the C++ wrapper
- pcregrep description of the pcregrep command
- pcrematching discussion of the two matching algorithms
- pcrepartial details of the partial matching facility
- pcrepattern syntax and semantics of supported regular expressions
- pcresyntax quick syntax reference
- pcreperform discussion of performance issues
- pcreposix the POSIX-compatible C API
- pcreprecompile details of saving and re-using precompiled patterns
- pcresample discussion of the sample program
- pcrestack discussion of stack usage
- pcretest description of the pcretest testing command
-
-In addition, in the "man" and HTML formats, there is a short page for each
-C library function, listing its arguments and results.
-
-
LIMITATIONS
-
-There are some size limitations in PCRE but it is hoped that they will never in
-practice be relevant.
-
-
-The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
-compiled with the default internal linkage size of 2. If you want to process
-regular expressions that are truly enormous, you can compile PCRE with an
-internal linkage size of 3 or 4 (see the README file in the source
-distribution and the
-pcrebuild
-documentation for details). In these cases the limit is substantially larger.
-However, the speed of execution is slower.
-
-
-All values in repeating quantifiers must be less than 65536.
-
-
-There is no limit to the number of parenthesized subpatterns, but there can be
-no more than 65535 capturing subpatterns.
-
-
-The maximum length of name for a named subpattern is 32 characters, and the
-maximum number of named subpatterns is 10000.
-
-
-The maximum length of a subject string is the largest positive number that an
-integer variable can hold. However, when using the traditional matching
-function, PCRE uses recursion to handle subpatterns and indefinite repetition.
-This means that the available stack space may limit the size of a subject
-string that can be processed by certain patterns. For a discussion of stack
-issues, see the
-pcrestack
-documentation.
-
-
UTF-8 AND UNICODE PROPERTY SUPPORT
-
-From release 3.3, PCRE has had some support for character strings encoded in
-the UTF-8 format. For release 4.0 this was greatly extended to cover most
-common requirements, and in release 5.0 additional support for Unicode general
-category properties was added.
-
-
-In order process UTF-8 strings, you must build PCRE to include UTF-8 support in
-the code, and, in addition, you must call
-pcre_compile()
-with the PCRE_UTF8 option flag, or the pattern must start with the sequence
-(*UTF8). When either of these is the case, both the pattern and any subject
-strings that are matched against it are treated as UTF-8 strings instead of
-just strings of bytes.
-
-
-If you compile PCRE with UTF-8 support, but do not use it at run time, the
-library will be a bit bigger, but the additional run time overhead is limited
-to testing the PCRE_UTF8 flag occasionally, so should not be very big.
-
-
-If PCRE is built with Unicode character property support (which implies UTF-8
-support), the escape sequences \p{..}, \P{..}, and \X are supported.
-The available properties that can be tested are limited to the general
-category properties such as Lu for an upper case letter or Nd for a decimal
-number, the Unicode script names such as Arabic or Han, and the derived
-properties Any and L&. A full list is given in the
-pcrepattern
-documentation. Only the short names for properties are supported. For example,
-\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE does not support this.
-
-
-Validity of UTF-8 strings
-
-
-When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
-are (by default) checked for validity on entry to the relevant functions. From
-release 7.3 of PCRE, the check is according the rules of RFC 3629, which are
-themselves derived from the Unicode specification. Earlier releases of PCRE
-followed the rules of RFC 2279, which allows the full range of 31-bit values (0
-to 0x7FFFFFFF). The current check allows only values in the range U+0 to
-U+10FFFF, excluding U+D800 to U+DFFF.
-
-
-The excluded code points are the "Low Surrogate Area" of Unicode, of which the
-Unicode Standard says this: "The Low Surrogate Area does not contain any
-character assignments, consequently no character code charts or namelists are
-provided for this area. Surrogates are reserved for use with UTF-16 and then
-must be used in pairs." The code points that are encoded by UTF-16 pairs are
-available as independent code points in the UTF-8 encoding. (In other words,
-the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
-UTF-8.)
-
-
-If an invalid UTF-8 string is passed to PCRE, an error return
-(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
-your strings are valid, and therefore want to skip these checks in order to
-improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
-at run time, PCRE assumes that the pattern or subject it is given
-(respectively) contains only valid UTF-8 codes. In this case, it does not
-diagnose an invalid UTF-8 string.
-
-
-If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
-happens depends on why the string is invalid. If the string conforms to the
-"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
-in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
-test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
-rules of RFC 2279. However, if the string does not even conform to RFC 2279,
-the result is undefined. Your program may crash.
-
-
-If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
-encoded in a UTF-8-like manner as per the old RFC, you can set
-PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
-situation, you will have to apply your own validity check.
-
-
-General comments about UTF-8 mode
-
-
-1. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
-UTF-8 character if the value is greater than 127.
-
-
-2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
-characters for values greater than \177.
-
-
-3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
-bytes, for example: \x{100}{3}.
-
-
-4. The dot metacharacter matches one UTF-8 character instead of a single byte.
-
-
-5. The escape sequence \C can be used to match a single byte in UTF-8 mode,
-but its use can lead to some strange effects. This facility is not available in
-the alternative matching function, pcre_dfa_exec().
-
-
-6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
-test characters of any code value, but the characters that PCRE recognizes as
-digits, spaces, or word characters remain the same set as before, all with
-values less than 256. This remains true even when PCRE includes Unicode
-property support, because to do otherwise would slow down PCRE in many common
-cases. If you really want to test for a wider sense of, say, "digit", you
-must use Unicode property tests such as \p{Nd}. Note that this also applies to
-\b, because it is defined in terms of \w and \W.
-
-
-7. Similarly, characters that match the POSIX named character classes are all
-low-valued characters.
-
-
-8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
-(\h, \H, \v, and \V) do match all the appropriate Unicode characters.
-
-
-9. Case-insensitive matching applies only to characters whose values are less
-than 128, unless PCRE is built with Unicode property support. Even when Unicode
-property support is available, PCRE still uses its own character tables when
-checking the case of low-valued characters, so as not to degrade performance.
-The Unicode property information is used only for characters with higher
-values. Even when Unicode property support is available, PCRE supports
-case-insensitive matching only when there is a one-to-one mapping between a
-letter's cases. There are a small number of many-to-one mappings in Unicode;
-these are not supported by PCRE.
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
-Putting an actual email address here seems to have been a spam magnet, so I've
-taken it away. If you want to email me, use my two initials, followed by the
-two digits 10, at the domain cam.ac.uk.
-
-
REVISION
-
-Last updated: 11 April 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_compile.html b/libs/pcre/doc/html/pcre_compile.html
deleted file mode 100644
index 396a5fbdb4..0000000000
--- a/libs/pcre/doc/html/pcre_compile.html
+++ /dev/null
@@ -1,89 +0,0 @@
-
-
-pcre_compile specification
-
-
-pcre_compile man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-pcre *pcre_compile(const char *pattern, int options,
-const char **errptr, int *erroffset,
-const unsigned char *tableptr);
-
-
-DESCRIPTION
-
-
-This function compiles a regular expression into an internal form. It is the
-same as pcre_compile2(), except for the absence of the errorcodeptr
-argument. Its arguments are:
-
- pattern A zero-terminated string containing the
- regular expression to be compiled
- options Zero or more option bits
- errptr Where to put an error message
- erroffset Offset in pattern where error was found
- tableptr Pointer to character tables, or NULL to
- use the built-in default
-
-The option bits are:
-
- PCRE_ANCHORED Force pattern anchoring
- PCRE_AUTO_CALLOUT Compile automatic callouts
- PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE \R matches all Unicode line endings
- PCRE_CASELESS Do caseless matching
- PCRE_DOLLAR_ENDONLY $ not to match newline at end
- PCRE_DOTALL . matches anything including NL
- PCRE_DUPNAMES Allow duplicate names for subpatterns
- PCRE_EXTENDED Ignore whitespace and # comments
- PCRE_EXTRA PCRE extra features
- (not much use currently)
- PCRE_FIRSTLINE Force matching to be before newline
- PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
- PCRE_MULTILINE ^ and $ match newlines within data
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
- sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
- theses (named ones available)
- PCRE_UNGREEDY Invert greediness of quantifiers
- PCRE_UTF8 Run in UTF-8 mode
- PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
- validity (only relevant if
- PCRE_UTF8 is set)
-
-PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
-PCRE_NO_UTF8_CHECK.
-
-
-The yield of the function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected. Note that
-compiling regular expressions with one version of PCRE for use with a different
-version is not guaranteed to work and may cause crashes.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_compile2.html b/libs/pcre/doc/html/pcre_compile2.html
deleted file mode 100644
index 8d743c10e5..0000000000
--- a/libs/pcre/doc/html/pcre_compile2.html
+++ /dev/null
@@ -1,89 +0,0 @@
-
-
-pcre_compile2 specification
-
-
-pcre_compile2 man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-pcre *pcre_compile2(const char *pattern, int options,
-int *errorcodeptr,
-const char **errptr, int *erroffset,
-const unsigned char *tableptr);
-
-
-DESCRIPTION
-
-
-This function compiles a regular expression into an internal form. It is the
-same as pcre_compile(), except for the addition of the errorcodeptr
-argument. The arguments are:
-
-
-
- pattern A zero-terminated string containing the
- regular expression to be compiled
- options Zero or more option bits
- errorcodeptr Where to put an error code
- errptr Where to put an error message
- erroffset Offset in pattern where error was found
- tableptr Pointer to character tables, or NULL to
- use the built-in default
-
-The option bits are:
-
- PCRE_ANCHORED Force pattern anchoring
- PCRE_AUTO_CALLOUT Compile automatic callouts
- PCRE_CASELESS Do caseless matching
- PCRE_DOLLAR_ENDONLY $ not to match newline at end
- PCRE_DOTALL . matches anything including NL
- PCRE_DUPNAMES Allow duplicate names for subpatterns
- PCRE_EXTENDED Ignore whitespace and # comments
- PCRE_EXTRA PCRE extra features
- (not much use currently)
- PCRE_FIRSTLINE Force matching to be before newline
- PCRE_MULTILINE ^ and $ match newlines within data
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
- theses (named ones available)
- PCRE_UNGREEDY Invert greediness of quantifiers
- PCRE_UTF8 Run in UTF-8 mode
- PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
- validity (only relevant if
- PCRE_UTF8 is set)
-
-PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
-PCRE_NO_UTF8_CHECK.
-
-
-The yield of the function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected. Note that
-compiling regular expressions with one version of PCRE for use with a different
-version is not guaranteed to work and may cause crashes.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_config.html b/libs/pcre/doc/html/pcre_config.html
deleted file mode 100644
index 40dee37db7..0000000000
--- a/libs/pcre/doc/html/pcre_config.html
+++ /dev/null
@@ -1,70 +0,0 @@
-
-
-pcre_config specification
-
-
-pcre_config man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_config(int what, void *where);
-
-
-DESCRIPTION
-
-
-This function makes it possible for a client program to find out which optional
-features are available in the version of the PCRE library it is using. Its
-arguments are as follows:
-
- what A code specifying what information is required
- where Points to where to put the data
-
-The available codes are:
-
- PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
- PCRE_CONFIG_MATCH_LIMIT Internal resource limit
- PCRE_CONFIG_MATCH_LIMIT_RECURSION
- Internal recursion depth limit
- PCRE_CONFIG_NEWLINE Value of the default newline sequence:
- 13 (0x000d) for CR
- 10 (0x000a) for LF
- 3338 (0x0d0a) for CRLF
- -2 for ANYCRLF
- -1 for ANY
- PCRE_CONFIG_BSR Indicates what \R matches by default:
- 0 all Unicode line endings
- 1 CR, LF, or CRLF only
- PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
- Threshold of return slots, above
- which malloc() is used by
- the POSIX API
- PCRE_CONFIG_STACKRECURSE Recursion implementation (1=stack 0=heap)
- PCRE_CONFIG_UTF8 Availability of UTF-8 support (1=yes 0=no)
- PCRE_CONFIG_UNICODE_PROPERTIES
- Availability of Unicode property support
- (1=yes 0=no)
-
-The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_copy_named_substring.html b/libs/pcre/doc/html/pcre_copy_named_substring.html
deleted file mode 100644
index 2185518c8a..0000000000
--- a/libs/pcre/doc/html/pcre_copy_named_substring.html
+++ /dev/null
@@ -1,53 +0,0 @@
-
-
-pcre_copy_named_substring specification
-
-
-pcre_copy_named_substring man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_copy_named_substring(const pcre *code,
-const char *subject, int *ovector,
-int stringcount, const char *stringname,
-char *buffer, int buffersize);
-
-
-DESCRIPTION
-
-
-This is a convenience function for extracting a captured substring, identified
-by name, into a given buffer. The arguments are:
-
- code Pattern that was successfully matched
- subject Subject that has been successfully matched
- ovector Offset vector that pcre_exec() used
- stringcount Value returned by pcre_exec()
- stringname Name of the required substring
- buffer Buffer to receive the string
- buffersize Size of buffer
-
-The yield is the length of the substring, PCRE_ERROR_NOMEMORY if the buffer was
-too small, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_copy_substring.html b/libs/pcre/doc/html/pcre_copy_substring.html
deleted file mode 100644
index b7d23417d7..0000000000
--- a/libs/pcre/doc/html/pcre_copy_substring.html
+++ /dev/null
@@ -1,51 +0,0 @@
-
-
-pcre_copy_substring specification
-
-
-pcre_copy_substring man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_copy_substring(const char *subject, int *ovector,
-int stringcount, int stringnumber, char *buffer,
-int buffersize);
-
-
-DESCRIPTION
-
-
-This is a convenience function for extracting a captured substring into a given
-buffer. The arguments are:
-
- subject Subject that has been successfully matched
- ovector Offset vector that pcre_exec() used
- stringcount Value returned by pcre_exec()
- stringnumber Number of the required substring
- buffer Buffer to receive the string
- buffersize Size of buffer
-
-The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
-too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_dfa_exec.html b/libs/pcre/doc/html/pcre_dfa_exec.html
deleted file mode 100644
index a243ee8175..0000000000
--- a/libs/pcre/doc/html/pcre_dfa_exec.html
+++ /dev/null
@@ -1,98 +0,0 @@
-
-
-pcre_dfa_exec specification
-
-
-pcre_dfa_exec man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
-const char *subject, int length, int startoffset,
-int options, int *ovector, int ovecsize,
-int *workspace, int wscount);
-
-
-DESCRIPTION
-
-
-This function matches a compiled regular expression against a given subject
-string, using an alternative matching algorithm that scans the subject string
-just once (not Perl-compatible). Note that the main, Perl-compatible,
-matching function is pcre_exec(). The arguments for this function are:
-
- code Points to the compiled pattern
- extra Points to an associated pcre_extra structure,
- or is NULL
- subject Points to the subject string
- length Length of the subject string, in bytes
- startoffset Offset in bytes in the subject at which to
- start matching
- options Option bits
- ovector Points to a vector of ints for result offsets
- ovecsize Number of elements in the vector
- workspace Points to a vector of ints used as working space
- wscount Number of elements in the vector
-
-The options are:
-
- PCRE_ANCHORED Match only at the first position
- PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE \R matches all Unicode line endings
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NOTBOL Subject is not the beginning of a line
- PCRE_NOTEOL Subject is not the end of a line
- PCRE_NOTEMPTY An empty string is not a valid match
- PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
- PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
- validity (only relevant if PCRE_UTF8
- was set at compile time)
- PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
- PCRE_DFA_SHORTEST Return only the shortest match
- PCRE_DFA_RESTART This is a restart after a partial match
-
-There are restrictions on what may appear in a pattern when using this matching
-function. Details are given in the
-pcrematching
-documentation.
-
-
-A pcre_extra structure contains the following fields:
-
- flags Bits indicating which fields are set
- study_data Opaque data from pcre_study()
- match_limit Limit on internal resource use
- match_limit_recursion Limit on internal recursion depth
- callout_data Opaque data passed back to callouts
- tables Points to character tables or is NULL
-
-The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
-PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
-PCRE_EXTRA_TABLES. For this matching function, the match_limit and
-match_limit_recursion fields are not used, and must not be set.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_exec.html b/libs/pcre/doc/html/pcre_exec.html
deleted file mode 100644
index ef43830dfc..0000000000
--- a/libs/pcre/doc/html/pcre_exec.html
+++ /dev/null
@@ -1,91 +0,0 @@
-
-
-pcre_exec specification
-
-
-pcre_exec man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_exec(const pcre *code, const pcre_extra *extra,
-const char *subject, int length, int startoffset,
-int options, int *ovector, int ovecsize);
-
-
-DESCRIPTION
-
-
-This function matches a compiled regular expression against a given subject
-string, using a matching algorithm that is similar to Perl's. It returns
-offsets to captured substrings. Its arguments are:
-
- code Points to the compiled pattern
- extra Points to an associated pcre_extra structure,
- or is NULL
- subject Points to the subject string
- length Length of the subject string, in bytes
- startoffset Offset in bytes in the subject at which to
- start matching
- options Option bits
- ovector Points to a vector of ints for result offsets
- ovecsize Number of elements in the vector (a multiple of 3)
-
-The options are:
-
- PCRE_ANCHORED Match only at the first position
- PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE \R matches all Unicode line endings
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NOTBOL Subject is not the beginning of a line
- PCRE_NOTEOL Subject is not the end of a line
- PCRE_NOTEMPTY An empty string is not a valid match
- PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
- PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
- validity (only relevant if PCRE_UTF8
- was set at compile time)
- PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
-
-There are restrictions on what may appear in a pattern when partial matching is
-requested. For details, see the
-pcrepartial
-page.
-
-
-A pcre_extra structure contains the following fields:
-
- flags Bits indicating which fields are set
- study_data Opaque data from pcre_study()
- match_limit Limit on internal resource use
- match_limit_recursion Limit on internal recursion depth
- callout_data Opaque data passed back to callouts
- tables Points to character tables or is NULL
-
-The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
-PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
-PCRE_EXTRA_TABLES.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_free_substring.html b/libs/pcre/doc/html/pcre_free_substring.html
deleted file mode 100644
index fe6261474c..0000000000
--- a/libs/pcre/doc/html/pcre_free_substring.html
+++ /dev/null
@@ -1,40 +0,0 @@
-
-
-pcre_free_substring specification
-
-
-pcre_free_substring man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-void pcre_free_substring(const char *stringptr);
-
-
-DESCRIPTION
-
-
-This is a convenience function for freeing the store obtained by a previous
-call to pcre_get_substring() or pcre_get_named_substring(). Its
-only argument is a pointer to the string.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_free_substring_list.html b/libs/pcre/doc/html/pcre_free_substring_list.html
deleted file mode 100644
index a92c9603f0..0000000000
--- a/libs/pcre/doc/html/pcre_free_substring_list.html
+++ /dev/null
@@ -1,40 +0,0 @@
-
-
-pcre_free_substring_list specification
-
-
-pcre_free_substring_list man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-void pcre_free_substring_list(const char **stringptr);
-
-
-DESCRIPTION
-
-
-This is a convenience function for freeing the store obtained by a previous
-call to pcre_get_substring_list(). Its only argument is a pointer to the
-list of string pointers.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_fullinfo.html b/libs/pcre/doc/html/pcre_fullinfo.html
deleted file mode 100644
index 48fddf5c2b..0000000000
--- a/libs/pcre/doc/html/pcre_fullinfo.html
+++ /dev/null
@@ -1,72 +0,0 @@
-
-
-pcre_fullinfo specification
-
-
-pcre_fullinfo man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
-int what, void *where);
-
-
-DESCRIPTION
-
-
-This function returns information about a compiled pattern. Its arguments are:
-
- code Compiled regular expression
- extra Result of pcre_study() or NULL
- what What information is required
- where Where to put the information
-
-The following information is available:
-
- PCRE_INFO_BACKREFMAX Number of highest back reference
- PCRE_INFO_CAPTURECOUNT Number of capturing subpatterns
- PCRE_INFO_DEFAULT_TABLES Pointer to default tables
- PCRE_INFO_FIRSTBYTE Fixed first byte for a match, or
- -1 for start of string
- or after newline, or
- -2 otherwise
- PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
- PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
- PCRE_INFO_LASTLITERAL Literal last byte required
- PCRE_INFO_NAMECOUNT Number of named subpatterns
- PCRE_INFO_NAMEENTRYSIZE Size of name table entry
- PCRE_INFO_NAMETABLE Pointer to name table
- PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
- PCRE_INFO_OPTIONS Option bits used for compilation
- PCRE_INFO_SIZE Size of compiled pattern
- PCRE_INFO_STUDYSIZE Size of study data
-
-The yield of the function is zero on success or:
-
- PCRE_ERROR_NULL the argument code was NULL
- the argument where was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADOPTION the value of what was invalid
-
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_get_named_substring.html b/libs/pcre/doc/html/pcre_get_named_substring.html
deleted file mode 100644
index 24dc058260..0000000000
--- a/libs/pcre/doc/html/pcre_get_named_substring.html
+++ /dev/null
@@ -1,55 +0,0 @@
-
-
-pcre_get_named_substring specification
-
-
-pcre_get_named_substring man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_get_named_substring(const pcre *code,
-const char *subject, int *ovector,
-int stringcount, const char *stringname,
-const char **stringptr);
-
-
-DESCRIPTION
-
-
-This is a convenience function for extracting a captured substring by name. The
-arguments are:
-
- code Compiled pattern
- subject Subject that has been successfully matched
- ovector Offset vector that pcre_exec() used
- stringcount Value returned by pcre_exec()
- stringname Name of the required substring
- stringptr Where to put the string pointer
-
-The memory in which the substring is placed is obtained by calling
-pcre_malloc(). The convenience function pcre_free_substring() can
-be used to free it when it is no longer needed. The yield of the function is
-the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
-could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_get_stringnumber.html b/libs/pcre/doc/html/pcre_get_stringnumber.html
deleted file mode 100644
index 43af3aaed8..0000000000
--- a/libs/pcre/doc/html/pcre_get_stringnumber.html
+++ /dev/null
@@ -1,49 +0,0 @@
-
-
-pcre_get_stringnumber specification
-
-
-pcre_get_stringnumber man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_get_stringnumber(const pcre *code,
-const char *name);
-
-
-DESCRIPTION
-
-
-This convenience function finds the number of a named substring capturing
-parenthesis in a compiled pattern. Its arguments are:
-
- code Compiled regular expression
- name Name whose number is required
-
-The yield of the function is the number of the parenthesis if the name is
-found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
-(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
-pcre_get_stringnumber(). You can obtain the complete list by calling
-pcre_get_stringtable_entries().
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_get_stringtable_entries.html b/libs/pcre/doc/html/pcre_get_stringtable_entries.html
deleted file mode 100644
index dc20ffd22c..0000000000
--- a/libs/pcre/doc/html/pcre_get_stringtable_entries.html
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-pcre_get_stringtable_entries specification
-
-
-pcre_get_stringtable_entries man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_get_stringtable_entries(const pcre *code,
-const char *name, char **first, char **last);
-
-
-DESCRIPTION
-
-
-This convenience function finds, for a compiled pattern, the first and last
-entries for a given name in the table that translates capturing parenthesis
-names into numbers. When names are required to be unique (PCRE_DUPNAMES is
-not set), it is usually easier to use pcre_get_stringnumber()
-instead.
-
- code Compiled regular expression
- name Name whose entries required
- first Where to return a pointer to the first entry
- last Where to return a pointer to the last entry
-
-The yield of the function is the length of each entry, or
-PCRE_ERROR_NOSUBSTRING if none are found.
-
-
-There is a complete description of the PCRE native API, including the format of
-the table entries, in the
-pcreapi
-page, and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_get_substring.html b/libs/pcre/doc/html/pcre_get_substring.html
deleted file mode 100644
index 9b40e4dfed..0000000000
--- a/libs/pcre/doc/html/pcre_get_substring.html
+++ /dev/null
@@ -1,53 +0,0 @@
-
-
-pcre_get_substring specification
-
-
-pcre_get_substring man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_get_substring(const char *subject, int *ovector,
-int stringcount, int stringnumber,
-const char **stringptr);
-
-
-DESCRIPTION
-
-
-This is a convenience function for extracting a captured substring. The
-arguments are:
-
- subject Subject that has been successfully matched
- ovector Offset vector that pcre_exec() used
- stringcount Value returned by pcre_exec()
- stringnumber Number of the required substring
- stringptr Where to put the string pointer
-
-The memory in which the substring is placed is obtained by calling
-pcre_malloc(). The convenience function pcre_free_substring() can
-be used to free it when it is no longer needed. The yield of the function is
-the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
-be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_get_substring_list.html b/libs/pcre/doc/html/pcre_get_substring_list.html
deleted file mode 100644
index 617a3151d3..0000000000
--- a/libs/pcre/doc/html/pcre_get_substring_list.html
+++ /dev/null
@@ -1,53 +0,0 @@
-
-
-pcre_get_substring_list specification
-
-
-pcre_get_substring_list man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_get_substring_list(const char *subject,
-int *ovector, int stringcount, const char ***listptr);
-
-
-DESCRIPTION
-
-
-This is a convenience function for extracting a list of all the captured
-substrings. The arguments are:
-
- subject Subject that has been successfully matched
- ovector Offset vector that pcre_exec used
- stringcount Value returned by pcre_exec
- listptr Where to put a pointer to the list
-
-The memory in which the substrings and the list are placed is obtained by
-calling pcre_malloc(). The convenience function
-pcre_free_substring_list() can be used to free it when it is no longer
-needed. A pointer to a list of pointers is put in the variable whose address is
-in listptr. The list is terminated by a NULL pointer. The yield of the
-function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
-not be obtained.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_info.html b/libs/pcre/doc/html/pcre_info.html
deleted file mode 100644
index 6693ffee6c..0000000000
--- a/libs/pcre/doc/html/pcre_info.html
+++ /dev/null
@@ -1,39 +0,0 @@
-
-
-pcre_info specification
-
-
-pcre_info man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_info(const pcre *code, int *optptr, int
-*firstcharptr);
-
-
-DESCRIPTION
-
-
-This function is obsolete. You should be using pcre_fullinfo() instead.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_maketables.html b/libs/pcre/doc/html/pcre_maketables.html
deleted file mode 100644
index cf8d69ecfd..0000000000
--- a/libs/pcre/doc/html/pcre_maketables.html
+++ /dev/null
@@ -1,42 +0,0 @@
-
-
-pcre_maketables specification
-
-
-pcre_maketables man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-const unsigned char *pcre_maketables(void);
-
-
-DESCRIPTION
-
-
-This function builds a set of character tables for character values less than
-256. These can be passed to pcre_compile() to override PCRE's internal,
-built-in tables (which were made by pcre_maketables() when PCRE was
-compiled). You might want to do this if you are using a non-standard locale.
-The function yields a pointer to the tables.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_refcount.html b/libs/pcre/doc/html/pcre_refcount.html
deleted file mode 100644
index b748df2f1d..0000000000
--- a/libs/pcre/doc/html/pcre_refcount.html
+++ /dev/null
@@ -1,45 +0,0 @@
-
-
-pcre_refcount specification
-
-
-pcre_refcount man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-int pcre_refcount(pcre *code, int adjust);
-
-
-DESCRIPTION
-
-
-This function is used to maintain a reference count inside a data block that
-contains a compiled pattern. Its arguments are:
-
- code Compiled regular expression
- adjust Adjustment to reference value
-
-The yield of the function is the adjusted reference value, which is constrained
-to lie between 0 and 65535.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_study.html b/libs/pcre/doc/html/pcre_study.html
deleted file mode 100644
index d290420e54..0000000000
--- a/libs/pcre/doc/html/pcre_study.html
+++ /dev/null
@@ -1,56 +0,0 @@
-
-
-pcre_study specification
-
-
-pcre_study man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-pcre_extra *pcre_study(const pcre *code, int options,
-const char **errptr);
-
-
-DESCRIPTION
-
-
-This function studies a compiled pattern, to see if additional information can
-be extracted that might speed up matching. Its arguments are:
-
- code A compiled regular expression
- options Options for pcre_study()
- errptr Where to put an error message
-
-If the function succeeds, it returns a value that can be passed to
-pcre_exec() via its extra argument.
-
-
-If the function returns NULL, either it could not find any additional
-information, or there was an error. You can tell the difference by looking at
-the error value. It is NULL in first case.
-
-
-There are currently no options defined; the value of the second argument should
-always be zero.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcre_version.html b/libs/pcre/doc/html/pcre_version.html
deleted file mode 100644
index 7bc8f8653e..0000000000
--- a/libs/pcre/doc/html/pcre_version.html
+++ /dev/null
@@ -1,39 +0,0 @@
-
-
-pcre_version specification
-
-
-pcre_version man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-SYNOPSIS
-
-
-#include <pcre.h>
-
-
-char *pcre_version(void);
-
-
-DESCRIPTION
-
-
-This function returns a character string that gives the version number of the
-PCRE library and the date of its release.
-
-
-There is a complete description of the PCRE native API in the
-pcreapi
-page and a description of the POSIX API in the
-pcreposix
-page.
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcreapi.html b/libs/pcre/doc/html/pcreapi.html
deleted file mode 100644
index 91273de2e2..0000000000
--- a/libs/pcre/doc/html/pcreapi.html
+++ /dev/null
@@ -1,2005 +0,0 @@
-
-
-pcreapi specification
-
-
-pcreapi man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
PCRE NATIVE API
-
-#include <pcre.h>
-
-
-pcre *pcre_compile(const char *pattern, int options,
-const char **errptr, int *erroffset,
-const unsigned char *tableptr);
-
-
-pcre *pcre_compile2(const char *pattern, int options,
-int *errorcodeptr,
-const char **errptr, int *erroffset,
-const unsigned char *tableptr);
-
-
-pcre_extra *pcre_study(const pcre *code, int options,
-const char **errptr);
-
-
-int pcre_exec(const pcre *code, const pcre_extra *extra,
-const char *subject, int length, int startoffset,
-int options, int *ovector, int ovecsize);
-
-
-int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
-const char *subject, int length, int startoffset,
-int options, int *ovector, int ovecsize,
-int *workspace, int wscount);
-
-
-int pcre_copy_named_substring(const pcre *code,
-const char *subject, int *ovector,
-int stringcount, const char *stringname,
-char *buffer, int buffersize);
-
-
-int pcre_copy_substring(const char *subject, int *ovector,
-int stringcount, int stringnumber, char *buffer,
-int buffersize);
-
-
-int pcre_get_named_substring(const pcre *code,
-const char *subject, int *ovector,
-int stringcount, const char *stringname,
-const char **stringptr);
-
-
-int pcre_get_stringnumber(const pcre *code,
-const char *name);
-
-
-int pcre_get_stringtable_entries(const pcre *code,
-const char *name, char **first, char **last);
-
-
-int pcre_get_substring(const char *subject, int *ovector,
-int stringcount, int stringnumber,
-const char **stringptr);
-
-
-int pcre_get_substring_list(const char *subject,
-int *ovector, int stringcount, const char ***listptr);
-
-
-void pcre_free_substring(const char *stringptr);
-
-
-void pcre_free_substring_list(const char **stringptr);
-
-
-const unsigned char *pcre_maketables(void);
-
-
-int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
-int what, void *where);
-
-
-int pcre_info(const pcre *code, int *optptr, int
-*firstcharptr);
-
-
-int pcre_refcount(pcre *code, int adjust);
-
-
-int pcre_config(int what, void *where);
-
-
-char *pcre_version(void);
-
-
-void *(*pcre_malloc)(size_t);
-
-
-void (*pcre_free)(void *);
-
-
-void *(*pcre_stack_malloc)(size_t);
-
-
-void (*pcre_stack_free)(void *);
-
-
-int (*pcre_callout)(pcre_callout_block *);
-
-
PCRE API OVERVIEW
-
-PCRE has its own native API, which is described in this document. There are
-also some wrapper functions that correspond to the POSIX regular expression
-API. These are described in the
-pcreposix
-documentation. Both of these APIs define a set of C function calls. A C++
-wrapper is distributed with PCRE. It is documented in the
-pcrecpp
-page.
-
-
-The native API C function prototypes are defined in the header file
-pcre.h, and on Unix systems the library itself is called libpcre.
-It can normally be accessed by adding -lpcre to the command for linking
-an application that uses PCRE. The header file defines the macros PCRE_MAJOR
-and PCRE_MINOR to contain the major and minor release numbers for the library.
-Applications can use these to include support for different releases of PCRE.
-
-
-The functions pcre_compile(), pcre_compile2(), pcre_study(),
-and pcre_exec() are used for compiling and matching regular expressions
-in a Perl-compatible manner. A sample program that demonstrates the simplest
-way of using them is provided in the file called pcredemo.c in the source
-distribution. The
-pcresample
-documentation describes how to compile and run it.
-
-
-A second matching function, pcre_dfa_exec(), which is not
-Perl-compatible, is also provided. This uses a different algorithm for the
-matching. The alternative algorithm finds all possible matches (at a given
-point in the subject), and scans the subject just once. However, this algorithm
-does not return captured substrings. A description of the two matching
-algorithms and their advantages and disadvantages is given in the
-pcrematching
-documentation.
-
-
-In addition to the main compiling and matching functions, there are convenience
-functions for extracting captured substrings from a subject string that is
-matched by pcre_exec(). They are:
-
- pcre_copy_substring()
- pcre_copy_named_substring()
- pcre_get_substring()
- pcre_get_named_substring()
- pcre_get_substring_list()
- pcre_get_stringnumber()
- pcre_get_stringtable_entries()
-
-pcre_free_substring() and pcre_free_substring_list() are also
-provided, to free the memory used for extracted strings.
-
-
-The function pcre_maketables() is used to build a set of character tables
-in the current locale for passing to pcre_compile(), pcre_exec(),
-or pcre_dfa_exec(). This is an optional facility that is provided for
-specialist use. Most commonly, no special tables are passed, in which case
-internal tables that are generated when PCRE is built are used.
-
-
-The function pcre_fullinfo() is used to find out information about a
-compiled pattern; pcre_info() is an obsolete version that returns only
-some of the available information, but is retained for backwards compatibility.
-The function pcre_version() returns a pointer to a string containing the
-version of PCRE and its date of release.
-
-
-The function pcre_refcount() maintains a reference count in a data block
-containing a compiled pattern. This is provided for the benefit of
-object-oriented applications.
-
-
-The global variables pcre_malloc and pcre_free initially contain
-the entry points of the standard malloc() and free() functions,
-respectively. PCRE calls the memory management functions via these variables,
-so a calling program can replace them if it wishes to intercept the calls. This
-should be done before calling any PCRE functions.
-
-
-The global variables pcre_stack_malloc and pcre_stack_free are also
-indirections to memory management functions. These special functions are used
-only when PCRE is compiled to use the heap for remembering data, instead of
-recursive function calls, when running the pcre_exec() function. See the
-pcrebuild
-documentation for details of how to do this. It is a non-standard way of
-building PCRE, for use in environments that have limited stacks. Because of the
-greater use of memory management, it runs more slowly. Separate functions are
-provided so that special-purpose external code can be used for this case. When
-used, these functions are always called in a stack-like manner (last obtained,
-first freed), and always for memory blocks of the same size. There is a
-discussion about PCRE's stack usage in the
-pcrestack
-documentation.
-
-
-The global variable pcre_callout initially contains NULL. It can be set
-by the caller to a "callout" function, which PCRE will then call at specified
-points during a matching operation. Details are given in the
-pcrecallout
-documentation.
-
-
NEWLINES
-
-PCRE supports five different conventions for indicating line breaks in
-strings: a single CR (carriage return) character, a single LF (linefeed)
-character, the two-character sequence CRLF, any of the three preceding, or any
-Unicode newline sequence. The Unicode newline sequences are the three just
-mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
-U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
-(paragraph separator, U+2029).
-
-
-Each of the first three conventions is used by at least one operating system as
-its standard newline sequence. When PCRE is built, a default can be specified.
-The default default is LF, which is the Unix standard. When PCRE is run, the
-default can be overridden, either when a pattern is compiled, or when it is
-matched.
-
-
-At compile time, the newline convention can be specified by the options
-argument of pcre_compile(), or it can be specified by special text at the
-start of the pattern itself; this overrides any other settings. See the
-pcrepattern
-page for details of the special character sequences.
-
-
-In the PCRE documentation the word "newline" is used to mean "the character or
-pair of characters that indicate a line break". The choice of newline
-convention affects the handling of the dot, circumflex, and dollar
-metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
-recognized line ending sequence, the match position advancement for a
-non-anchored pattern. There is more detail about this in the
-section on pcre_exec() options
-below.
-
-
-The choice of newline convention does not affect the interpretation of
-the \n or \r escape sequences, nor does it affect what \R matches, which is
-controlled in a similar way, but by separate options.
-
-
MULTITHREADING
-
-The PCRE functions can be used in multi-threading applications, with the
-proviso that the memory management functions pointed to by pcre_malloc,
-pcre_free, pcre_stack_malloc, and pcre_stack_free, and the
-callout function pointed to by pcre_callout, are shared by all threads.
-
-
-The compiled form of a regular expression is not altered during matching, so
-the same compiled pattern can safely be used by several threads at once.
-
-
SAVING PRECOMPILED PATTERNS FOR LATER USE
-
-The compiled form of a regular expression can be saved and re-used at a later
-time, possibly by a different program, and even on a host other than the one on
-which it was compiled. Details are given in the
-pcreprecompile
-documentation. However, compiling a regular expression with one version of PCRE
-for use with a different version is not guaranteed to work and may cause
-crashes.
-
-
CHECKING BUILD-TIME OPTIONS
-
-int pcre_config(int what, void *where);
-
-
-The function pcre_config() makes it possible for a PCRE client to
-discover which optional features have been compiled into the PCRE library. The
-pcrebuild
-documentation has more details about these optional features.
-
-
-The first argument for pcre_config() is an integer, specifying which
-information is required; the second argument is a pointer to a variable into
-which the information is placed. The following information is available:
-
- PCRE_CONFIG_UTF8
-
-The output is an integer that is set to one if UTF-8 support is available;
-otherwise it is set to zero.
-
- PCRE_CONFIG_UNICODE_PROPERTIES
-
-The output is an integer that is set to one if support for Unicode character
-properties is available; otherwise it is set to zero.
-
- PCRE_CONFIG_NEWLINE
-
-The output is an integer whose value specifies the default character sequence
-that is recognized as meaning "newline". The four values that are supported
-are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY.
-Though they are derived from ASCII, the same values are returned in EBCDIC
-environments. The default should normally correspond to the standard sequence
-for your operating system.
-
- PCRE_CONFIG_BSR
-
-The output is an integer whose value indicates what character sequences the \R
-escape sequence matches by default. A value of 0 means that \R matches any
-Unicode line ending sequence; a value of 1 means that \R matches only CR, LF,
-or CRLF. The default can be overridden when a pattern is compiled or matched.
-
- PCRE_CONFIG_LINK_SIZE
-
-The output is an integer that contains the number of bytes used for internal
-linkage in compiled regular expressions. The value is 2, 3, or 4. Larger values
-allow larger regular expressions to be compiled, at the expense of slower
-matching. The default value of 2 is sufficient for all but the most massive
-patterns, since it allows the compiled pattern to be up to 64K in size.
-
- PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
-
-The output is an integer that contains the threshold above which the POSIX
-interface uses malloc() for output vectors. Further details are given in
-the
-pcreposix
-documentation.
-
- PCRE_CONFIG_MATCH_LIMIT
-
-The output is a long integer that gives the default limit for the number of
-internal matching function calls in a pcre_exec() execution. Further
-details are given with pcre_exec() below.
-
- PCRE_CONFIG_MATCH_LIMIT_RECURSION
-
-The output is a long integer that gives the default limit for the depth of
-recursion when calling the internal matching function in a pcre_exec()
-execution. Further details are given with pcre_exec() below.
-
- PCRE_CONFIG_STACKRECURSE
-
-The output is an integer that is set to one if internal recursion when running
-pcre_exec() is implemented by recursive function calls that use the stack
-to remember their state. This is the usual way that PCRE is compiled. The
-output is zero if PCRE was compiled to use blocks of data on the heap instead
-of recursive function calls. In this case, pcre_stack_malloc and
-pcre_stack_free are called to manage memory blocks on the heap, thus
-avoiding the use of the stack.
-
-
COMPILING A PATTERN
-
-pcre *pcre_compile(const char *pattern, int options,
-const char **errptr, int *erroffset,
-const unsigned char *tableptr);
-pcre *pcre_compile2(const char *pattern, int options,
-int *errorcodeptr,
-const char **errptr, int *erroffset,
-const unsigned char *tableptr);
-
-
-Either of the functions pcre_compile() or pcre_compile2() can be
-called to compile a pattern into an internal form. The only difference between
-the two interfaces is that pcre_compile2() has an additional argument,
-errorcodeptr, via which a numerical error code can be returned.
-
-
-The pattern is a C string terminated by a binary zero, and is passed in the
-pattern argument. A pointer to a single block of memory that is obtained
-via pcre_malloc is returned. This contains the compiled code and related
-data. The pcre type is defined for the returned block; this is a typedef
-for a structure whose contents are not externally defined. It is up to the
-caller to free the memory (via pcre_free) when it is no longer required.
-
-
-Although the compiled code of a PCRE regex is relocatable, that is, it does not
-depend on memory location, the complete pcre data block is not
-fully relocatable, because it may contain a copy of the tableptr
-argument, which is an address (see below).
-
-
-The options argument contains various bit settings that affect the
-compilation. It should be zero if no options are required. The available
-options are described below. Some of them (in particular, those that are
-compatible with Perl, but also some others) can also be set and unset from
-within the pattern (see the detailed description in the
-pcrepattern
-documentation). For those options that can be different in different parts of
-the pattern, the contents of the options argument specifies their initial
-settings at the start of compilation and execution. The PCRE_ANCHORED and
-PCRE_NEWLINE_xxx options can be set at the time of matching as well as at
-compile time.
-
-
-If errptr is NULL, pcre_compile() returns NULL immediately.
-Otherwise, if compilation of a pattern fails, pcre_compile() returns
-NULL, and sets the variable pointed to by errptr to point to a textual
-error message. This is a static string that is part of the library. You must
-not try to free it. The offset from the start of the pattern to the character
-where the error was discovered is placed in the variable pointed to by
-erroffset, which must not be NULL. If it is, an immediate error is given.
-
-
-If pcre_compile2() is used instead of pcre_compile(), and the
-errorcodeptr argument is not NULL, a non-zero error code number is
-returned via this argument in the event of an error. This is in addition to the
-textual error message. Error codes and messages are listed below.
-
-
-If the final argument, tableptr, is NULL, PCRE uses a default set of
-character tables that are built when PCRE is compiled, using the default C
-locale. Otherwise, tableptr must be an address that is the result of a
-call to pcre_maketables(). This value is stored with the compiled
-pattern, and used again by pcre_exec(), unless another table pointer is
-passed to it. For more discussion, see the section on locale support below.
-
-
-This code fragment shows a typical straightforward call to pcre_compile():
-
- pcre *re;
- const char *error;
- int erroffset;
- re = pcre_compile(
- "^A.*Z", /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-
-The following names for option bits are defined in the pcre.h header
-file:
-
- PCRE_ANCHORED
-
-If this bit is set, the pattern is forced to be "anchored", that is, it is
-constrained to match only at the first matching point in the string that is
-being searched (the "subject string"). This effect can also be achieved by
-appropriate constructs in the pattern itself, which is the only way to do it in
-Perl.
-
- PCRE_AUTO_CALLOUT
-
-If this bit is set, pcre_compile() automatically inserts callout items,
-all with number 255, before each pattern item. For discussion of the callout
-facility, see the
-pcrecallout
-documentation.
-
- PCRE_BSR_ANYCRLF
- PCRE_BSR_UNICODE
-
-These options (which are mutually exclusive) control what the \R escape
-sequence matches. The choice is either to match only CR, LF, or CRLF, or to
-match any Unicode newline sequence. The default is specified when PCRE is
-built. It can be overridden from within the pattern, or by setting an option
-when a compiled pattern is matched.
-
- PCRE_CASELESS
-
-If this bit is set, letters in the pattern match both upper and lower case
-letters. It is equivalent to Perl's /i option, and it can be changed within a
-pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the
-concept of case for characters whose values are less than 128, so caseless
-matching is always possible. For characters with higher values, the concept of
-case is supported if PCRE is compiled with Unicode property support, but not
-otherwise. If you want to use caseless matching for characters 128 and above,
-you must ensure that PCRE is compiled with Unicode property support as well as
-with UTF-8 support.
-
- PCRE_DOLLAR_ENDONLY
-
-If this bit is set, a dollar metacharacter in the pattern matches only at the
-end of the subject string. Without this option, a dollar also matches
-immediately before a newline at the end of the string (but not before any other
-newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
-There is no equivalent to this option in Perl, and no way to set it within a
-pattern.
-
- PCRE_DOTALL
-
-If this bit is set, a dot metacharater in the pattern matches all characters,
-including those that indicate newline. Without it, a dot does not match when
-the current position is at a newline. This option is equivalent to Perl's /s
-option, and it can be changed within a pattern by a (?s) option setting. A
-negative class such as [^a] always matches newline characters, independent of
-the setting of this option.
-
- PCRE_DUPNAMES
-
-If this bit is set, names used to identify capturing subpatterns need not be
-unique. This can be helpful for certain types of pattern when it is known that
-only one instance of the named subpattern can ever be matched. There are more
-details of named subpatterns below; see also the
-pcrepattern
-documentation.
-
- PCRE_EXTENDED
-
-If this bit is set, whitespace data characters in the pattern are totally
-ignored except when escaped or inside a character class. Whitespace does not
-include the VT character (code 11). In addition, characters between an
-unescaped # outside a character class and the next newline, inclusive, are also
-ignored. This is equivalent to Perl's /x option, and it can be changed within a
-pattern by a (?x) option setting.
-
-
-This option makes it possible to include comments inside complicated patterns.
-Note, however, that this applies only to data characters. Whitespace characters
-may never appear within special character sequences in a pattern, for example
-within the sequence (?( which introduces a conditional subpattern.
-
- PCRE_EXTRA
-
-This option was invented in order to turn on additional functionality of PCRE
-that is incompatible with Perl, but it is currently of very little use. When
-set, any backslash in a pattern that is followed by a letter that has no
-special meaning causes an error, thus reserving these combinations for future
-expansion. By default, as in Perl, a backslash followed by a letter with no
-special meaning is treated as a literal. (Perl can, however, be persuaded to
-give a warning for this.) There are at present no other features controlled by
-this option. It can also be set by a (?X) option setting within a pattern.
-
- PCRE_FIRSTLINE
-
-If this option is set, an unanchored pattern is required to match before or at
-the first newline in the subject string, though the matched text may continue
-over the newline.
-
- PCRE_JAVASCRIPT_COMPAT
-
-If this option is set, PCRE's behaviour is changed in some ways so that it is
-compatible with JavaScript rather than Perl. The changes are as follows:
-
-
-(1) A lone closing square bracket in a pattern causes a compile-time error,
-because this is illegal in JavaScript (by default it is treated as a data
-character). Thus, the pattern AB]CD becomes illegal when this option is set.
-
-
-(2) At run time, a back reference to an unset subpattern group matches an empty
-string (by default this causes the current matching alternative to fail). A
-pattern such as (\1)(a) succeeds when this option is set (assuming it can find
-an "a" in the subject), whereas it fails by default, for Perl compatibility.
-
- PCRE_MULTILINE
-
-By default, PCRE treats the subject string as consisting of a single line of
-characters (even if it actually contains newlines). The "start of line"
-metacharacter (^) matches only at the start of the string, while the "end of
-line" metacharacter ($) matches only at the end of the string, or before a
-terminating newline (unless PCRE_DOLLAR_ENDONLY is set). This is the same as
-Perl.
-
-
-When PCRE_MULTILINE it is set, the "start of line" and "end of line" constructs
-match immediately following or immediately before internal newlines in the
-subject string, respectively, as well as at the very start and end. This is
-equivalent to Perl's /m option, and it can be changed within a pattern by a
-(?m) option setting. If there are no newlines in a subject string, or no
-occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
-
- PCRE_NEWLINE_CR
- PCRE_NEWLINE_LF
- PCRE_NEWLINE_CRLF
- PCRE_NEWLINE_ANYCRLF
- PCRE_NEWLINE_ANY
-
-These options override the default newline definition that was chosen when PCRE
-was built. Setting the first or the second specifies that a newline is
-indicated by a single character (CR or LF, respectively). Setting
-PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
-CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
-preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
-that any Unicode newline sequence should be recognized. The Unicode newline
-sequences are the three just mentioned, plus the single characters VT (vertical
-tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
-separator, U+2028), and PS (paragraph separator, U+2029). The last two are
-recognized only in UTF-8 mode.
-
-
-The newline setting in the options word uses three bits that are treated
-as a number, giving eight possibilities. Currently only six are used (default
-plus the five values above). This means that if you set more than one newline
-option, the combination may or may not be sensible. For example,
-PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
-other combinations may yield unused numbers and cause an error.
-
-
-The only time that a line break is specially recognized when compiling a
-pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
-class is encountered. This indicates a comment that lasts until after the next
-line break sequence. In other circumstances, line break sequences are treated
-as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated
-as whitespace characters and are therefore ignored.
-
-
-The newline option that is set at compile time becomes the default that is used
-for pcre_exec() and pcre_dfa_exec(), but it can be overridden.
-
- PCRE_NO_AUTO_CAPTURE
-
-If this option is set, it disables the use of numbered capturing parentheses in
-the pattern. Any opening parenthesis that is not followed by ? behaves as if it
-were followed by ?: but named parentheses can still be used for capturing (and
-they acquire numbers in the usual way). There is no equivalent of this option
-in Perl.
-
- PCRE_UNGREEDY
-
-This option inverts the "greediness" of the quantifiers so that they are not
-greedy by default, but become greedy if followed by "?". It is not compatible
-with Perl. It can also be set by a (?U) option setting within the pattern.
-
- PCRE_UTF8
-
-This option causes PCRE to regard both the pattern and the subject as strings
-of UTF-8 characters instead of single-byte character strings. However, it is
-available only when PCRE is built to include UTF-8 support. If not, the use
-of this option provokes an error. Details of how this option changes the
-behaviour of PCRE are given in the
-section on UTF-8 support
-in the main
-pcre
-page.
-
- PCRE_NO_UTF8_CHECK
-
-When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
-automatically checked. There is a discussion about the
-validity of UTF-8 strings
-in the main
-pcre
-page. If an invalid UTF-8 sequence of bytes is found, pcre_compile()
-returns an error. If you already know that your pattern is valid, and you want
-to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK
-option. When it is set, the effect of passing an invalid UTF-8 string as a
-pattern is undefined. It may cause your program to crash. Note that this option
-can also be passed to pcre_exec() and pcre_dfa_exec(), to suppress
-the UTF-8 validity checking of subject strings.
-
-
COMPILATION ERROR CODES
-
-The following table lists the error codes than may be returned by
-pcre_compile2(), along with the error messages that may be returned by
-both compiling functions. As PCRE has developed, some error codes have fallen
-out of use. To avoid confusion, they have not been re-used.
-
- 0 no error
- 1 \ at end of pattern
- 2 \c at end of pattern
- 3 unrecognized character follows \
- 4 numbers out of order in {} quantifier
- 5 number too big in {} quantifier
- 6 missing terminating ] for character class
- 7 invalid escape sequence in character class
- 8 range out of order in character class
- 9 nothing to repeat
- 10 [this code is not in use]
- 11 internal error: unexpected repeat
- 12 unrecognized character after (? or (?-
- 13 POSIX named classes are supported only within a class
- 14 missing )
- 15 reference to non-existent subpattern
- 16 erroffset passed as NULL
- 17 unknown option bit(s) set
- 18 missing ) after comment
- 19 [this code is not in use]
- 20 regular expression is too large
- 21 failed to get memory
- 22 unmatched parentheses
- 23 internal error: code overflow
- 24 unrecognized character after (?<
- 25 lookbehind assertion is not fixed length
- 26 malformed number or name after (?(
- 27 conditional group contains more than two branches
- 28 assertion expected after (?(
- 29 (?R or (?[+-]digits must be followed by )
- 30 unknown POSIX class name
- 31 POSIX collating elements are not supported
- 32 this version of PCRE is not compiled with PCRE_UTF8 support
- 33 [this code is not in use]
- 34 character value in \x{...} sequence is too large
- 35 invalid condition (?(0)
- 36 \C not allowed in lookbehind assertion
- 37 PCRE does not support \L, \l, \N, \U, or \u
- 38 number after (?C is > 255
- 39 closing ) for (?C expected
- 40 recursive call could loop indefinitely
- 41 unrecognized character after (?P
- 42 syntax error in subpattern name (missing terminator)
- 43 two named subpatterns have the same name
- 44 invalid UTF-8 string
- 45 support for \P, \p, and \X has not been compiled
- 46 malformed \P or \p sequence
- 47 unknown property name after \P or \p
- 48 subpattern name is too long (maximum 32 characters)
- 49 too many named subpatterns (maximum 10000)
- 50 [this code is not in use]
- 51 octal value is greater than \377 (not in UTF-8 mode)
- 52 internal error: overran compiling workspace
- 53 internal error: previously-checked referenced subpattern not found
- 54 DEFINE group contains more than one branch
- 55 repeating a DEFINE group is not allowed
- 56 inconsistent NEWLINE options
- 57 \g is not followed by a braced, angle-bracketed, or quoted
- name/number or by a plain number
- 58 a numbered reference must not be zero
- 59 (*VERB) with an argument is not supported
- 60 (*VERB) not recognized
- 61 number is too big
- 62 subpattern name expected
- 63 digit expected after (?+
- 64 ] is an invalid data character in JavaScript compatibility mode
-
-The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
-be used if the limits were changed when PCRE was built.
-
-
STUDYING A PATTERN
-
-pcre_extra *pcre_study(const pcre *code, int options
-const char **errptr);
-
-
-If a compiled pattern is going to be used several times, it is worth spending
-more time analyzing it in order to speed up the time taken for matching. The
-function pcre_study() takes a pointer to a compiled pattern as its first
-argument. If studying the pattern produces additional information that will
-help speed up matching, pcre_study() returns a pointer to a
-pcre_extra block, in which the study_data field points to the
-results of the study.
-
-
-The returned value from pcre_study() can be passed directly to
-pcre_exec(). However, a pcre_extra block also contains other
-fields that can be set by the caller before the block is passed; these are
-described
-below
-in the section on matching a pattern.
-
-
-If studying the pattern does not produce any additional information
-pcre_study() returns NULL. In that circumstance, if the calling program
-wants to pass any of the other fields to pcre_exec(), it must set up its
-own pcre_extra block.
-
-
-The second argument of pcre_study() contains option bits. At present, no
-options are defined, and this argument should always be zero.
-
-
-The third argument for pcre_study() is a pointer for an error message. If
-studying succeeds (even if no data is returned), the variable it points to is
-set to NULL. Otherwise it is set to point to a textual error message. This is a
-static string that is part of the library. You must not try to free it. You
-should test the error pointer for NULL after calling pcre_study(), to be
-sure that it has run successfully.
-
-
-This is a typical call to pcre_study():
-
- pcre_extra *pe;
- pe = pcre_study(
- re, /* result of pcre_compile() */
- 0, /* no options exist */
- &error); /* set to NULL or points to a message */
-
-At present, studying a pattern is useful only for non-anchored patterns that do
-not have a single fixed starting character. A bitmap of possible starting
-bytes is created.
-
-
LOCALE SUPPORT
-
-PCRE handles caseless matching, and determines whether characters are letters,
-digits, or whatever, by reference to a set of tables, indexed by character
-value. When running in UTF-8 mode, this applies only to characters with codes
-less than 128. Higher-valued codes never match escapes such as \w or \d, but
-can be tested with \p if PCRE is built with Unicode character property
-support. The use of locales with Unicode is discouraged. If you are handling
-characters with codes greater than 128, you should either use UTF-8 and
-Unicode, or use locales, but not try to mix the two.
-
-
-PCRE contains an internal set of tables that are used when the final argument
-of pcre_compile() is NULL. These are sufficient for many applications.
-Normally, the internal tables recognize only ASCII characters. However, when
-PCRE is built, it is possible to cause the internal tables to be rebuilt in the
-default "C" locale of the local system, which may cause them to be different.
-
-
-The internal tables can always be overridden by tables supplied by the
-application that calls PCRE. These may be created in a different locale from
-the default. As more and more applications change to using Unicode, the need
-for this locale support is expected to die away.
-
-
-External tables are built by calling the pcre_maketables() function,
-which has no arguments, in the relevant locale. The result can then be passed
-to pcre_compile() or pcre_exec() as often as necessary. For
-example, to build and use tables that are appropriate for the French locale
-(where accented characters with values greater than 128 are treated as letters),
-the following code could be used:
-
- setlocale(LC_CTYPE, "fr_FR");
- tables = pcre_maketables();
- re = pcre_compile(..., tables);
-
-The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
-are using Windows, the name for the French locale is "french".
-
-
-When pcre_maketables() runs, the tables are built in memory that is
-obtained via pcre_malloc. It is the caller's responsibility to ensure
-that the memory containing the tables remains available for as long as it is
-needed.
-
-
-The pointer that is passed to pcre_compile() is saved with the compiled
-pattern, and the same tables are used via this pointer by pcre_study()
-and normally also by pcre_exec(). Thus, by default, for any single
-pattern, compilation, studying and matching all happen in the same locale, but
-different patterns can be compiled in different locales.
-
-
-It is possible to pass a table pointer or NULL (indicating the use of the
-internal tables) to pcre_exec(). Although not intended for this purpose,
-this facility could be used to match a pattern in a different locale from the
-one in which it was compiled. Passing table pointers at run time is discussed
-below in the section on matching a pattern.
-
-
INFORMATION ABOUT A PATTERN
-
-int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
-int what, void *where);
-
-
-The pcre_fullinfo() function returns information about a compiled
-pattern. It replaces the obsolete pcre_info() function, which is
-nevertheless retained for backwards compability (and is documented below).
-
-
-The first argument for pcre_fullinfo() is a pointer to the compiled
-pattern. The second argument is the result of pcre_study(), or NULL if
-the pattern was not studied. The third argument specifies which piece of
-information is required, and the fourth argument is a pointer to a variable
-to receive the data. The yield of the function is zero for success, or one of
-the following negative numbers:
-
- PCRE_ERROR_NULL the argument code was NULL
- the argument where was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADOPTION the value of what was invalid
-
-The "magic number" is placed at the start of each compiled pattern as an simple
-check against passing an arbitrary memory pointer. Here is a typical call of
-pcre_fullinfo(), to obtain the length of the compiled pattern:
-
- int rc;
- size_t length;
- rc = pcre_fullinfo(
- re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
- PCRE_INFO_SIZE, /* what is required */
- &length); /* where to put the data */
-
-The possible values for the third argument are defined in pcre.h, and are
-as follows:
-
- PCRE_INFO_BACKREFMAX
-
-Return the number of the highest back reference in the pattern. The fourth
-argument should point to an int variable. Zero is returned if there are
-no back references.
-
- PCRE_INFO_CAPTURECOUNT
-
-Return the number of capturing subpatterns in the pattern. The fourth argument
-should point to an int variable.
-
- PCRE_INFO_DEFAULT_TABLES
-
-Return a pointer to the internal default character tables within PCRE. The
-fourth argument should point to an unsigned char * variable. This
-information call is provided for internal use by the pcre_study()
-function. External callers can cause PCRE to use its internal tables by passing
-a NULL table pointer.
-
- PCRE_INFO_FIRSTBYTE
-
-Return information about the first byte of any matched string, for a
-non-anchored pattern. The fourth argument should point to an int
-variable. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name is
-still recognized for backwards compatibility.)
-
-
-If there is a fixed first byte, for example, from a pattern such as
-(cat|cow|coyote), its value is returned. Otherwise, if either
-
-
-(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
-starts with "^", or
-
-
-(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
-(if it were set, the pattern would be anchored),
-
-
--1 is returned, indicating that the pattern matches only at the start of a
-subject string or after any newline within the string. Otherwise -2 is
-returned. For anchored patterns, -2 is returned.
-
- PCRE_INFO_FIRSTTABLE
-
-If the pattern was studied, and this resulted in the construction of a 256-bit
-table indicating a fixed set of bytes for the first byte in any matching
-string, a pointer to the table is returned. Otherwise NULL is returned. The
-fourth argument should point to an unsigned char * variable.
-
- PCRE_INFO_HASCRORLF
-
-Return 1 if the pattern contains any explicit matches for CR or LF characters,
-otherwise 0. The fourth argument should point to an int variable. An
-explicit match is either a literal CR or LF character, or \r or \n.
-
- PCRE_INFO_JCHANGED
-
-Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
-0. The fourth argument should point to an int variable. (?J) and
-(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
-
- PCRE_INFO_LASTLITERAL
-
-Return the value of the rightmost literal byte that must exist in any matched
-string, other than at its start, if such a byte has been recorded. The fourth
-argument should point to an int variable. If there is no such byte, -1 is
-returned. For anchored patterns, a last literal byte is recorded only if it
-follows something of variable length. For example, for the pattern
-/^a\d+z\d+/ the returned value is "z", but for /^a\dz\d/ the returned value
-is -1.
-
- PCRE_INFO_NAMECOUNT
- PCRE_INFO_NAMEENTRYSIZE
- PCRE_INFO_NAMETABLE
-
-PCRE supports the use of named as well as numbered capturing parentheses. The
-names are just an additional way of identifying the parentheses, which still
-acquire numbers. Several convenience functions such as
-pcre_get_named_substring() are provided for extracting captured
-substrings by name. It is also possible to extract the data directly, by first
-converting the name to a number in order to access the correct pointers in the
-output vector (described with pcre_exec() below). To do the conversion,
-you need to use the name-to-number map, which is described by these three
-values.
-
-
-The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT gives
-the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size of each
-entry; both of these return an int value. The entry size depends on the
-length of the longest name. PCRE_INFO_NAMETABLE returns a pointer to the first
-entry of the table (a pointer to char). The first two bytes of each entry
-are the number of the capturing parenthesis, most significant byte first. The
-rest of the entry is the corresponding name, zero terminated. The names are in
-alphabetical order. When PCRE_DUPNAMES is set, duplicate names are in order of
-their parentheses numbers. For example, consider the following pattern (assume
-PCRE_EXTENDED is set, so white space - including newlines - is ignored):
-
- (?<date> (?<year>(\d\d)?\d\d) - (?<month>\d\d) - (?<day>\d\d) )
-
-There are four named subpatterns, so the table has four entries, and each entry
-in the table is eight bytes long. The table is as follows, with non-printing
-bytes shows in hexadecimal, and undefined bytes shown as ??:
-
- 00 01 d a t e 00 ??
- 00 05 d a y 00 ?? ??
- 00 04 m o n t h 00
- 00 02 y e a r 00 ??
-
-When writing code to extract data from named subpatterns using the
-name-to-number map, remember that the length of the entries is likely to be
-different for each compiled pattern.
-
- PCRE_INFO_OKPARTIAL
-
-Return 1 if the pattern can be used for partial matching, otherwise 0. The
-fourth argument should point to an int variable. The
-pcrepartial
-documentation lists the restrictions that apply to patterns when partial
-matching is used.
-
- PCRE_INFO_OPTIONS
-
-Return a copy of the options with which the pattern was compiled. The fourth
-argument should point to an unsigned long int variable. These option bits
-are those specified in the call to pcre_compile(), modified by any
-top-level option settings at the start of the pattern itself. In other words,
-they are the options that will be in force when matching starts. For example,
-if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
-result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
-
-
-A pattern is automatically anchored by PCRE if all of its top-level
-alternatives begin with one of the following:
-
- ^ unless PCRE_MULTILINE is set
- \A always
- \G always
- .* if PCRE_DOTALL is set and there are no back references to the subpattern in which .* appears
-
-For such patterns, the PCRE_ANCHORED bit is set in the options returned by
-pcre_fullinfo().
-
- PCRE_INFO_SIZE
-
-Return the size of the compiled pattern, that is, the value that was passed as
-the argument to pcre_malloc() when PCRE was getting memory in which to
-place the compiled data. The fourth argument should point to a size_t
-variable.
-
- PCRE_INFO_STUDYSIZE
-
-Return the size of the data block pointed to by the study_data field in
-a pcre_extra block. That is, it is the value that was passed to
-pcre_malloc() when PCRE was getting memory into which to place the data
-created by pcre_study(). The fourth argument should point to a
-size_t variable.
-
-
OBSOLETE INFO FUNCTION
-
-int pcre_info(const pcre *code, int *optptr, int
-*firstcharptr);
-
-
-The pcre_info() function is now obsolete because its interface is too
-restrictive to return all the available data about a compiled pattern. New
-programs should use pcre_fullinfo() instead. The yield of
-pcre_info() is the number of capturing subpatterns, or one of the
-following negative numbers:
-
- PCRE_ERROR_NULL the argument code was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
-
-If the optptr argument is not NULL, a copy of the options with which the
-pattern was compiled is placed in the integer it points to (see
-PCRE_INFO_OPTIONS above).
-
-
-If the pattern is not anchored and the firstcharptr argument is not NULL,
-it is used to pass back information about the first character of any matched
-string (see PCRE_INFO_FIRSTBYTE above).
-
-
REFERENCE COUNTS
-
-int pcre_refcount(pcre *code, int adjust);
-
-
-The pcre_refcount() function is used to maintain a reference count in the
-data block that contains a compiled pattern. It is provided for the benefit of
-applications that operate in an object-oriented manner, where different parts
-of the application may be using the same compiled pattern, but you want to free
-the block when they are all done.
-
-
-When a pattern is compiled, the reference count field is initialized to zero.
-It is changed only by calling this function, whose action is to add the
-adjust value (which may be positive or negative) to it. The yield of the
-function is the new value. However, the value of the count is constrained to
-lie between 0 and 65535, inclusive. If the new value is outside these limits,
-it is forced to the appropriate limit value.
-
-
-Except when it is zero, the reference count is not correctly preserved if a
-pattern is compiled on one host and then transferred to a host whose byte-order
-is different. (This seems a highly unlikely scenario.)
-
-
MATCHING A PATTERN: THE TRADITIONAL FUNCTION
-
-int pcre_exec(const pcre *code, const pcre_extra *extra,
-const char *subject, int length, int startoffset,
-int options, int *ovector, int ovecsize);
-
-
-The function pcre_exec() is called to match a subject string against a
-compiled pattern, which is passed in the code argument. If the
-pattern has been studied, the result of the study should be passed in the
-extra argument. This function is the main matching facility of the
-library, and it operates in a Perl-like manner. For specialist use there is
-also an alternative matching function, which is described
-below
-in the section about the pcre_dfa_exec() function.
-
-
-In most applications, the pattern will have been compiled (and optionally
-studied) in the same process that calls pcre_exec(). However, it is
-possible to save compiled patterns and study data, and then use them later
-in different processes, possibly even on different hosts. For a discussion
-about this, see the
-pcreprecompile
-documentation.
-
-
-Here is an example of a simple call to pcre_exec():
-
- int rc;
- int ovector[30];
- rc = pcre_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector of integers for substring information */
- 30); /* number of elements (NOT size in bytes) */
-
-
-
-Extra data for pcre_exec()
-
-
-If the extra argument is not NULL, it must point to a pcre_extra
-data block. The pcre_study() function returns such a block (when it
-doesn't return NULL), but you can also create one for yourself, and pass
-additional information in it. The pcre_extra block contains the following
-fields (not necessarily in this order):
-
- unsigned long int flags;
- void *study_data;
- unsigned long int match_limit;
- unsigned long int match_limit_recursion;
- void *callout_data;
- const unsigned char *tables;
-
-The flags field is a bitmap that specifies which of the other fields
-are set. The flag bits are:
-
- PCRE_EXTRA_STUDY_DATA
- PCRE_EXTRA_MATCH_LIMIT
- PCRE_EXTRA_MATCH_LIMIT_RECURSION
- PCRE_EXTRA_CALLOUT_DATA
- PCRE_EXTRA_TABLES
-
-Other flag bits should be set to zero. The study_data field is set in the
-pcre_extra block that is returned by pcre_study(), together with
-the appropriate flag bit. You should not set this yourself, but you may add to
-the block by setting the other fields and their corresponding flag bits.
-
-
-The match_limit field provides a means of preventing PCRE from using up a
-vast amount of resources when running patterns that are not going to match,
-but which have a very large number of possibilities in their search trees. The
-classic example is the use of nested unlimited repeats.
-
-
-Internally, PCRE uses a function called match() which it calls repeatedly
-(sometimes recursively). The limit set by match_limit is imposed on the
-number of times this function is called during a match, which has the effect of
-limiting the amount of backtracking that can take place. For patterns that are
-not anchored, the count restarts from zero for each position in the subject
-string.
-
-
-The default value for the limit can be set when PCRE is built; the default
-default is 10 million, which handles all but the most extreme cases. You can
-override the default by suppling pcre_exec() with a pcre_extra
-block in which match_limit is set, and PCRE_EXTRA_MATCH_LIMIT is set in
-the flags field. If the limit is exceeded, pcre_exec() returns
-PCRE_ERROR_MATCHLIMIT.
-
-
-The match_limit_recursion field is similar to match_limit, but
-instead of limiting the total number of times that match() is called, it
-limits the depth of recursion. The recursion depth is a smaller number than the
-total number of calls, because not all calls to match() are recursive.
-This limit is of use only if it is set smaller than match_limit.
-
-
-Limiting the recursion depth limits the amount of stack that can be used, or,
-when PCRE has been compiled to use memory on the heap instead of the stack, the
-amount of heap memory that can be used.
-
-
-The default value for match_limit_recursion can be set when PCRE is
-built; the default default is the same value as the default for
-match_limit. You can override the default by suppling pcre_exec()
-with a pcre_extra block in which match_limit_recursion is set, and
-PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the flags field. If the limit
-is exceeded, pcre_exec() returns PCRE_ERROR_RECURSIONLIMIT.
-
-
-The pcre_callout field is used in conjunction with the "callout" feature,
-which is described in the
-pcrecallout
-documentation.
-
-
-The tables field is used to pass a character tables pointer to
-pcre_exec(); this overrides the value that is stored with the compiled
-pattern. A non-NULL value is stored with the compiled pattern only if custom
-tables were supplied to pcre_compile() via its tableptr argument.
-If NULL is passed to pcre_exec() using this mechanism, it forces PCRE's
-internal tables to be used. This facility is helpful when re-using patterns
-that have been saved after compiling with an external set of tables, because
-the external tables might be at a different address when pcre_exec() is
-called. See the
-pcreprecompile
-documentation for a discussion of saving compiled patterns for later use.
-
-
-Option bits for pcre_exec()
-
-
-The unused bits of the options argument for pcre_exec() must be
-zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx,
-PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE,
-PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
-
- PCRE_ANCHORED
-
-The PCRE_ANCHORED option limits pcre_exec() to matching at the first
-matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
-to be anchored by virtue of its contents, it cannot be made unachored at
-matching time.
-
- PCRE_BSR_ANYCRLF
- PCRE_BSR_UNICODE
-
-These options (which are mutually exclusive) control what the \R escape
-sequence matches. The choice is either to match only CR, LF, or CRLF, or to
-match any Unicode newline sequence. These options override the choice that was
-made or defaulted when the pattern was compiled.
-
- PCRE_NEWLINE_CR
- PCRE_NEWLINE_LF
- PCRE_NEWLINE_CRLF
- PCRE_NEWLINE_ANYCRLF
- PCRE_NEWLINE_ANY
-
-These options override the newline definition that was chosen or defaulted when
-the pattern was compiled. For details, see the description of
-pcre_compile() above. During matching, the newline choice affects the
-behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
-the way the match position is advanced after a match failure for an unanchored
-pattern.
-
-
-When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
-match attempt for an unanchored pattern fails when the current position is at a
-CRLF sequence, and the pattern contains no explicit matches for CR or LF
-characters, the match position is advanced by two characters instead of one, in
-other words, to after the CRLF.
-
-
-The above rule is a compromise that makes the most common cases work as
-expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
-set), it does not match the string "\r\nA" because, after failing at the
-start, it skips both the CR and the LF before retrying. However, the pattern
-[\r\n]A does match that string, because it contains an explicit CR or LF
-reference, and so advances only by one character after the first failure.
-
-
-An explicit match for CR of LF is either a literal appearance of one of those
-characters, or one of the \r or \n escape sequences. Implicit matches such as
-[^X] do not count, nor does \s (which includes CR and LF in the characters
-that it matches).
-
-
-Notwithstanding the above, anomalous effects may still occur when CRLF is a
-valid newline sequence and explicit \r or \n escapes appear in the pattern.
-
- PCRE_NOTBOL
-
-This option specifies that first character of the subject string is not the
-beginning of a line, so the circumflex metacharacter should not match before
-it. Setting this without PCRE_MULTILINE (at compile time) causes circumflex
-never to match. This option affects only the behaviour of the circumflex
-metacharacter. It does not affect \A.
-
- PCRE_NOTEOL
-
-This option specifies that the end of the subject string is not the end of a
-line, so the dollar metacharacter should not match it nor (except in multiline
-mode) a newline immediately before it. Setting this without PCRE_MULTILINE (at
-compile time) causes dollar never to match. This option affects only the
-behaviour of the dollar metacharacter. It does not affect \Z or \z.
-
- PCRE_NOTEMPTY
-
-An empty string is not considered to be a valid match if this option is set. If
-there are alternatives in the pattern, they are tried. If all the alternatives
-match the empty string, the entire match fails. For example, if the pattern
-
- a?b?
-
-is applied to a string not beginning with "a" or "b", it matches the empty
-string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
-valid, so PCRE searches further into the string for occurrences of "a" or "b".
-
-
-Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case
-of a pattern match of the empty string within its split() function, and
-when using the /g modifier. It is possible to emulate Perl's behaviour after
-matching a null string by first trying the match again at the same offset with
-PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the
-starting offset (see below) and trying an ordinary match again. There is some
-code that demonstrates how to do this in the pcredemo.c sample program.
-
- PCRE_NO_START_OPTIMIZE
-
-There are a number of optimizations that pcre_exec() uses at the start of
-a match, in order to speed up the process. For example, if it is known that a
-match must start with a specific character, it searches the subject for that
-character, and fails immediately if it cannot find it, without actually running
-the main matching function. When callouts are in use, these optimizations can
-cause them to be skipped. This option disables the "start-up" optimizations,
-causing performance to suffer, but ensuring that the callouts do occur.
-
- PCRE_NO_UTF8_CHECK
-
-When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
-string is automatically checked when pcre_exec() is subsequently called.
-The value of startoffset is also checked to ensure that it points to the
-start of a UTF-8 character. There is a discussion about the validity of UTF-8
-strings in the
-section on UTF-8 support
-in the main
-pcre
-page. If an invalid UTF-8 sequence of bytes is found, pcre_exec() returns
-the error PCRE_ERROR_BADUTF8. If startoffset contains an invalid value,
-PCRE_ERROR_BADUTF8_OFFSET is returned.
-
-
-If you already know that your subject is valid, and you want to skip these
-checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
-calling pcre_exec(). You might want to do this for the second and
-subsequent calls to pcre_exec() if you are making repeated calls to find
-all the matches in a single subject string. However, you should be sure that
-the value of startoffset points to the start of a UTF-8 character. When
-PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8 string as a
-subject, or a value of startoffset that does not point to the start of a
-UTF-8 character, is undefined. Your program may crash.
-
- PCRE_PARTIAL
-
-This option turns on the partial matching feature. If the subject string fails
-to match the pattern, but at some point during the matching process the end of
-the subject was reached (that is, the subject partially matches the pattern and
-the failure to match occurred only because there were not enough subject
-characters), pcre_exec() returns PCRE_ERROR_PARTIAL instead of
-PCRE_ERROR_NOMATCH. When PCRE_PARTIAL is used, there are restrictions on what
-may appear in the pattern. These are discussed in the
-pcrepartial
-documentation.
-
-
-The string to be matched by pcre_exec()
-
-
-The subject string is passed to pcre_exec() as a pointer in
-subject, a length (in bytes) in length, and a starting byte offset
-in startoffset. In UTF-8 mode, the byte offset must point to the start of
-a UTF-8 character. Unlike the pattern string, the subject may contain binary
-zero bytes. When the starting offset is zero, the search for a match starts at
-the beginning of the subject, and this is by far the most common case.
-
-
-A non-zero starting offset is useful when searching for another match in the
-same subject by calling pcre_exec() again after a previous success.
-Setting startoffset differs from just passing over a shortened string and
-setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
-lookbehind. For example, consider the pattern
-
- \Biss\B
-
-which finds occurrences of "iss" in the middle of words. (\B matches only if
-the current position in the subject is not a word boundary.) When applied to
-the string "Mississipi" the first call to pcre_exec() finds the first
-occurrence. If pcre_exec() is called again with just the remainder of the
-subject, namely "issipi", it does not match, because \B is always false at the
-start of the subject, which is deemed to be a word boundary. However, if
-pcre_exec() is passed the entire string again, but with startoffset
-set to 4, it finds the second occurrence of "iss" because it is able to look
-behind the starting point to discover that it is preceded by a letter.
-
-
-If a non-zero starting offset is passed when the pattern is anchored, one
-attempt to match at the given offset is made. This can only succeed if the
-pattern does not require the match to be at the start of the subject.
-
-
-How pcre_exec() returns captured substrings
-
-
-In general, a pattern matches a certain portion of the subject, and in
-addition, further substrings from the subject may be picked out by parts of the
-pattern. Following the usage in Jeffrey Friedl's book, this is called
-"capturing" in what follows, and the phrase "capturing subpattern" is used for
-a fragment of a pattern that picks out a substring. PCRE supports several other
-kinds of parenthesized subpattern that do not cause substrings to be captured.
-
-
-Captured substrings are returned to the caller via a vector of integers whose
-address is passed in ovector. The number of elements in the vector is
-passed in ovecsize, which must be a non-negative number. Note: this
-argument is NOT the size of ovector in bytes.
-
-
-The first two-thirds of the vector is used to pass back captured substrings,
-each substring using a pair of integers. The remaining third of the vector is
-used as workspace by pcre_exec() while matching capturing subpatterns,
-and is not available for passing back information. The number passed in
-ovecsize should always be a multiple of three. If it is not, it is
-rounded down.
-
-
-When a match is successful, information about captured substrings is returned
-in pairs of integers, starting at the beginning of ovector, and
-continuing up to two-thirds of its length at the most. The first element of
-each pair is set to the byte offset of the first character in a substring, and
-the second is set to the byte offset of the first character after the end of a
-substring. Note: these values are always byte offsets, even in UTF-8
-mode. They are not character counts.
-
-
-The first pair of integers, ovector[0] and ovector[1], identify the
-portion of the subject string matched by the entire pattern. The next pair is
-used for the first capturing subpattern, and so on. The value returned by
-pcre_exec() is one more than the highest numbered pair that has been set.
-For example, if two substrings have been captured, the returned value is 3. If
-there are no capturing subpatterns, the return value from a successful match is
-1, indicating that just the first pair of offsets has been set.
-
-
-If a capturing subpattern is matched repeatedly, it is the last portion of the
-string that it matched that is returned.
-
-
-If the vector is too small to hold all the captured substring offsets, it is
-used as far as possible (up to two-thirds of its length), and the function
-returns a value of zero. If the substring offsets are not of interest,
-pcre_exec() may be called with ovector passed as NULL and
-ovecsize as zero. However, if the pattern contains back references and
-the ovector is not big enough to remember the related substrings, PCRE
-has to get additional memory for use during matching. Thus it is usually
-advisable to supply an ovector.
-
-
-The pcre_info() function can be used to find out how many capturing
-subpatterns there are in a compiled pattern. The smallest size for
-ovector that will allow for n captured substrings, in addition to
-the offsets of the substring matched by the whole pattern, is (n+1)*3.
-
-
-It is possible for capturing subpattern number n+1 to match some part of
-the subject when subpattern n has not been used at all. For example, if
-the string "abc" is matched against the pattern (a|(z))(bc) the return from the
-function is 4, and subpatterns 1 and 3 are matched, but 2 is not. When this
-happens, both values in the offset pairs corresponding to unused subpatterns
-are set to -1.
-
-
-Offset values that correspond to unused subpatterns at the end of the
-expression are also set to -1. For example, if the string "abc" is matched
-against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched. The
-return from the function is 2, because the highest used capturing subpattern
-number is 1. However, you can refer to the offsets for the second and third
-capturing subpatterns if you wish (assuming the vector is large enough, of
-course).
-
-
-Some convenience functions are provided for extracting the captured substrings
-as separate strings. These are described below.
-
-
-Error return values from pcre_exec()
-
-
-If pcre_exec() fails, it returns a negative number. The following are
-defined in the header file:
-
- PCRE_ERROR_NOMATCH (-1)
-
-The subject string did not match the pattern.
-
- PCRE_ERROR_NULL (-2)
-
-Either code or subject was passed as NULL, or ovector was
-NULL and ovecsize was not zero.
-
- PCRE_ERROR_BADOPTION (-3)
-
-An unrecognized bit was set in the options argument.
-
- PCRE_ERROR_BADMAGIC (-4)
-
-PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
-the case when it is passed a junk pointer and to detect when a pattern that was
-compiled in an environment of one endianness is run in an environment with the
-other endianness. This is the error that PCRE gives when the magic number is
-not present.
-
- PCRE_ERROR_UNKNOWN_OPCODE (-5)
-
-While running the pattern match, an unknown item was encountered in the
-compiled pattern. This error could be caused by a bug in PCRE or by overwriting
-of the compiled pattern.
-
- PCRE_ERROR_NOMEMORY (-6)
-
-If a pattern contains back references, but the ovector that is passed to
-pcre_exec() is not big enough to remember the referenced substrings, PCRE
-gets a block of memory at the start of matching to use for this purpose. If the
-call via pcre_malloc() fails, this error is given. The memory is
-automatically freed at the end of matching.
-
- PCRE_ERROR_NOSUBSTRING (-7)
-
-This error is used by the pcre_copy_substring(),
-pcre_get_substring(), and pcre_get_substring_list() functions (see
-below). It is never returned by pcre_exec().
-
- PCRE_ERROR_MATCHLIMIT (-8)
-
-The backtracking limit, as specified by the match_limit field in a
-pcre_extra structure (or defaulted) was reached. See the description
-above.
-
- PCRE_ERROR_CALLOUT (-9)
-
-This error is never generated by pcre_exec() itself. It is provided for
-use by callout functions that want to yield a distinctive error code. See the
-pcrecallout
-documentation for details.
-
- PCRE_ERROR_BADUTF8 (-10)
-
-A string that contains an invalid UTF-8 byte sequence was passed as a subject.
-
- PCRE_ERROR_BADUTF8_OFFSET (-11)
-
-The UTF-8 byte sequence that was passed as a subject was valid, but the value
-of startoffset did not point to the beginning of a UTF-8 character.
-
- PCRE_ERROR_PARTIAL (-12)
-
-The subject string did not match, but it did match partially. See the
-pcrepartial
-documentation for details of partial matching.
-
- PCRE_ERROR_BADPARTIAL (-13)
-
-The PCRE_PARTIAL option was used with a compiled pattern containing items that
-are not supported for partial matching. See the
-pcrepartial
-documentation for details of partial matching.
-
- PCRE_ERROR_INTERNAL (-14)
-
-An unexpected internal error has occurred. This error could be caused by a bug
-in PCRE or by overwriting of the compiled pattern.
-
- PCRE_ERROR_BADCOUNT (-15)
-
-This error is given if the value of the ovecsize argument is negative.
-
- PCRE_ERROR_RECURSIONLIMIT (-21)
-
-The internal recursion limit, as specified by the match_limit_recursion
-field in a pcre_extra structure (or defaulted) was reached. See the
-description above.
-
- PCRE_ERROR_BADNEWLINE (-23)
-
-An invalid combination of PCRE_NEWLINE_xxx options was given.
-
-
-Error numbers -16 to -20 and -22 are not used by pcre_exec().
-
-
EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
-
-int pcre_copy_substring(const char *subject, int *ovector,
-int stringcount, int stringnumber, char *buffer,
-int buffersize);
-
-
-int pcre_get_substring(const char *subject, int *ovector,
-int stringcount, int stringnumber,
-const char **stringptr);
-
-
-int pcre_get_substring_list(const char *subject,
-int *ovector, int stringcount, const char ***listptr);
-
-
-Captured substrings can be accessed directly by using the offsets returned by
-pcre_exec() in ovector. For convenience, the functions
-pcre_copy_substring(), pcre_get_substring(), and
-pcre_get_substring_list() are provided for extracting captured substrings
-as new, separate, zero-terminated strings. These functions identify substrings
-by number. The next section describes functions for extracting named
-substrings.
-
-
-A substring that contains a binary zero is correctly extracted and has a
-further zero added on the end, but the result is not, of course, a C string.
-However, you can process such a string by referring to the length that is
-returned by pcre_copy_substring() and pcre_get_substring().
-Unfortunately, the interface to pcre_get_substring_list() is not adequate
-for handling strings containing binary zeros, because the end of the final
-string is not independently indicated.
-
-
-The first three arguments are the same for all three of these functions:
-subject is the subject string that has just been successfully matched,
-ovector is a pointer to the vector of integer offsets that was passed to
-pcre_exec(), and stringcount is the number of substrings that were
-captured by the match, including the substring that matched the entire regular
-expression. This is the value returned by pcre_exec() if it is greater
-than zero. If pcre_exec() returned zero, indicating that it ran out of
-space in ovector, the value passed as stringcount should be the
-number of elements in the vector divided by three.
-
-
-The functions pcre_copy_substring() and pcre_get_substring()
-extract a single substring, whose number is given as stringnumber. A
-value of zero extracts the substring that matched the entire pattern, whereas
-higher values extract the captured substrings. For pcre_copy_substring(),
-the string is placed in buffer, whose length is given by
-buffersize, while for pcre_get_substring() a new block of memory is
-obtained via pcre_malloc, and its address is returned via
-stringptr. The yield of the function is the length of the string, not
-including the terminating zero, or one of these error codes:
-
- PCRE_ERROR_NOMEMORY (-6)
-
-The buffer was too small for pcre_copy_substring(), or the attempt to get
-memory failed for pcre_get_substring().
-
- PCRE_ERROR_NOSUBSTRING (-7)
-
-There is no substring whose number is stringnumber.
-
-
-The pcre_get_substring_list() function extracts all available substrings
-and builds a list of pointers to them. All this is done in a single block of
-memory that is obtained via pcre_malloc. The address of the memory block
-is returned via listptr, which is also the start of the list of string
-pointers. The end of the list is marked by a NULL pointer. The yield of the
-function is zero if all went well, or the error code
-
- PCRE_ERROR_NOMEMORY (-6)
-
-if the attempt to get the memory block failed.
-
-
-When any of these functions encounter a substring that is unset, which can
-happen when capturing subpattern number n+1 matches some part of the
-subject, but subpattern n has not been used at all, they return an empty
-string. This can be distinguished from a genuine zero-length substring by
-inspecting the appropriate offset in ovector, which is negative for unset
-substrings.
-
-
-The two convenience functions pcre_free_substring() and
-pcre_free_substring_list() can be used to free the memory returned by
-a previous call of pcre_get_substring() or
-pcre_get_substring_list(), respectively. They do nothing more than call
-the function pointed to by pcre_free, which of course could be called
-directly from a C program. However, PCRE is used in some situations where it is
-linked via a special interface to another programming language that cannot use
-pcre_free directly; it is for these cases that the functions are
-provided.
-
-
EXTRACTING CAPTURED SUBSTRINGS BY NAME
-
-int pcre_get_stringnumber(const pcre *code,
-const char *name);
-
-
-int pcre_copy_named_substring(const pcre *code,
-const char *subject, int *ovector,
-int stringcount, const char *stringname,
-char *buffer, int buffersize);
-
-
-int pcre_get_named_substring(const pcre *code,
-const char *subject, int *ovector,
-int stringcount, const char *stringname,
-const char **stringptr);
-
-
-To extract a substring by name, you first have to find associated number.
-For example, for this pattern
-
- (a+)b(?<xxx>\d+)...
-
-the number of the subpattern called "xxx" is 2. If the name is known to be
-unique (PCRE_DUPNAMES was not set), you can find the number from the name by
-calling pcre_get_stringnumber(). The first argument is the compiled
-pattern, and the second is the name. The yield of the function is the
-subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no subpattern of
-that name.
-
-
-Given the number, you can extract the substring directly, or use one of the
-functions described in the previous section. For convenience, there are also
-two functions that do the whole job.
-
-
-Most of the arguments of pcre_copy_named_substring() and
-pcre_get_named_substring() are the same as those for the similarly named
-functions that extract by number. As these are described in the previous
-section, they are not re-described here. There are just two differences:
-
-
-First, instead of a substring number, a substring name is given. Second, there
-is an extra argument, given at the start, which is a pointer to the compiled
-pattern. This is needed in order to gain access to the name-to-number
-translation table.
-
-
-These functions call pcre_get_stringnumber(), and if it succeeds, they
-then call pcre_copy_substring() or pcre_get_substring(), as
-appropriate. NOTE: If PCRE_DUPNAMES is set and there are duplicate names,
-the behaviour may not be what you want (see the next section).
-
-
-Warning: If the pattern uses the "(?|" feature to set up multiple
-subpatterns with the same number, you cannot use names to distinguish them,
-because names are not included in the compiled code. The matching process uses
-only numbers.
-
-
DUPLICATE SUBPATTERN NAMES
-
-int pcre_get_stringtable_entries(const pcre *code,
-const char *name, char **first, char **last);
-
-
-When a pattern is compiled with the PCRE_DUPNAMES option, names for subpatterns
-are not required to be unique. Normally, patterns with duplicate names are such
-that in any one match, only one of the named subpatterns participates. An
-example is shown in the
-pcrepattern
-documentation.
-
-
-When duplicates are present, pcre_copy_named_substring() and
-pcre_get_named_substring() return the first substring corresponding to
-the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
-returned; no data is returned. The pcre_get_stringnumber() function
-returns one of the numbers that are associated with the name, but it is not
-defined which it is.
-
-
-If you want to get full details of all captured substrings for a given name,
-you must use the pcre_get_stringtable_entries() function. The first
-argument is the compiled pattern, and the second is the name. The third and
-fourth are pointers to variables which are updated by the function. After it
-has run, they point to the first and last entries in the name-to-number table
-for the given name. The function itself returns the length of each entry, or
-PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
-described above in the section entitled Information about a pattern.
-Given all the relevant entries for the name, you can extract each of their
-numbers, and hence the captured data, if any.
-
-
FINDING ALL POSSIBLE MATCHES
-
-The traditional matching function uses a similar algorithm to Perl, which stops
-when it finds the first match, starting at a given point in the subject. If you
-want to find all possible matches, or the longest possible match, consider
-using the alternative matching function (see below) instead. If you cannot use
-the alternative function, but still need to find all possible matches, you
-can kludge it up by making use of the callout facility, which is described in
-the
-pcrecallout
-documentation.
-
-
-What you have to do is to insert a callout right at the end of the pattern.
-When your callout function is called, extract and save the current matched
-substring. Then return 1, which forces pcre_exec() to backtrack and try
-other alternatives. Ultimately, when it runs out of matches, pcre_exec()
-will yield PCRE_ERROR_NOMATCH.
-
-
MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
-
-int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
-const char *subject, int length, int startoffset,
-int options, int *ovector, int ovecsize,
-int *workspace, int wscount);
-
-
-The function pcre_dfa_exec() is called to match a subject string against
-a compiled pattern, using a matching algorithm that scans the subject string
-just once, and does not backtrack. This has different characteristics to the
-normal algorithm, and is not compatible with Perl. Some of the features of PCRE
-patterns are not supported. Nevertheless, there are times when this kind of
-matching can be useful. For a discussion of the two matching algorithms, see
-the
-pcrematching
-documentation.
-
-
-The arguments for the pcre_dfa_exec() function are the same as for
-pcre_exec(), plus two extras. The ovector argument is used in a
-different way, and this is described below. The other common arguments are used
-in the same way as for pcre_exec(), so their description is not repeated
-here.
-
-
-The two additional arguments provide workspace for the function. The workspace
-vector should contain at least 20 elements. It is used for keeping track of
-multiple paths through the pattern tree. More workspace will be needed for
-patterns and subjects where there are a lot of potential matches.
-
-
-Here is an example of a simple call to pcre_dfa_exec():
-
- int rc;
- int ovector[10];
- int wspace[20];
- rc = pcre_dfa_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector of integers for substring information */
- 10, /* number of elements (NOT size in bytes) */
- wspace, /* working space vector */
- 20); /* number of elements (NOT size in bytes) */
-
-
-
-Option bits for pcre_dfa_exec()
-
-
-The unused bits of the options argument for pcre_dfa_exec() must be
-zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx,
-PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL,
-PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last three of these are
-the same as for pcre_exec(), so their description is not repeated here.
-
- PCRE_PARTIAL
-
-This has the same general effect as it does for pcre_exec(), but the
-details are slightly different. When PCRE_PARTIAL is set for
-pcre_dfa_exec(), the return code PCRE_ERROR_NOMATCH is converted into
-PCRE_ERROR_PARTIAL if the end of the subject is reached, there have been no
-complete matches, but there is still at least one matching possibility. The
-portion of the string that provided the partial match is set as the first
-matching string.
-
- PCRE_DFA_SHORTEST
-
-Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
-soon as it has found one match. Because of the way the alternative algorithm
-works, this is necessarily the shortest possible match at the first possible
-matching point in the subject string.
-
- PCRE_DFA_RESTART
-
-When pcre_dfa_exec() is called with the PCRE_PARTIAL option, and returns
-a partial match, it is possible to call it again, with additional subject
-characters, and have it continue with the same match. The PCRE_DFA_RESTART
-option requests this action; when it is set, the workspace and
-wscount options must reference the same vector as before because data
-about the match so far is left in them after a partial match. There is more
-discussion of this facility in the
-pcrepartial
-documentation.
-
-
-Successful returns from pcre_dfa_exec()
-
-
-When pcre_dfa_exec() succeeds, it may have matched more than one
-substring in the subject. Note, however, that all the matches from one run of
-the function start at the same point in the subject. The shorter matches are
-all initial substrings of the longer matches. For example, if the pattern
-
- <.*>
-
-is matched against the string
-
- This is <something> <something else> <something further> no more
-
-the three matched strings are
-
- <something>
- <something> <something else>
- <something> <something else> <something further>
-
-On success, the yield of the function is a number greater than zero, which is
-the number of matched substrings. The substrings themselves are returned in
-ovector. Each string uses two elements; the first is the offset to the
-start, and the second is the offset to the end. In fact, all the strings have
-the same start offset. (Space could have been saved by giving this only once,
-but it was decided to retain some compatibility with the way pcre_exec()
-returns data, even though the meaning of the strings is different.)
-
-
-The strings are returned in reverse order of length; that is, the longest
-matching string is given first. If there were too many matches to fit into
-ovector, the yield of the function is zero, and the vector is filled with
-the longest matches.
-
-
-Error returns from pcre_dfa_exec()
-
-
-The pcre_dfa_exec() function returns a negative number when it fails.
-Many of the errors are the same as for pcre_exec(), and these are
-described
-above.
-There are in addition the following errors that are specific to
-pcre_dfa_exec():
-
- PCRE_ERROR_DFA_UITEM (-16)
-
-This return is given if pcre_dfa_exec() encounters an item in the pattern
-that it does not support, for instance, the use of \C or a back reference.
-
- PCRE_ERROR_DFA_UCOND (-17)
-
-This return is given if pcre_dfa_exec() encounters a condition item that
-uses a back reference for the condition, or a test for recursion in a specific
-group. These are not supported.
-
- PCRE_ERROR_DFA_UMLIMIT (-18)
-
-This return is given if pcre_dfa_exec() is called with an extra
-block that contains a setting of the match_limit field. This is not
-supported (it is meaningless).
-
- PCRE_ERROR_DFA_WSSIZE (-19)
-
-This return is given if pcre_dfa_exec() runs out of space in the
-workspace vector.
-
- PCRE_ERROR_DFA_RECURSE (-20)
-
-When a recursive subpattern is processed, the matching function calls itself
-recursively, using private vectors for ovector and workspace. This
-error is given if the output vector is not large enough. This should be
-extremely rare, as a vector of size 1000 is used.
-
-
SEE ALSO
-
-pcrebuild(3), pcrecallout(3), pcrecpp(3)(3),
-pcrematching(3), pcrepartial(3), pcreposix(3),
-pcreprecompile(3), pcresample(3), pcrestack(3).
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 11 April 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcrebuild.html b/libs/pcre/doc/html/pcrebuild.html
deleted file mode 100644
index 7f06250660..0000000000
--- a/libs/pcre/doc/html/pcrebuild.html
+++ /dev/null
@@ -1,348 +0,0 @@
-
-
-pcrebuild specification
-
-
-pcrebuild man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
PCRE BUILD-TIME OPTIONS
-
-This document describes the optional features of PCRE that can be selected when
-the library is compiled. It assumes use of the configure script, where
-the optional features are selected or deselected by providing options to
-configure before running the make command. However, the same
-options can be selected in both Unix-like and non-Unix-like environments using
-the GUI facility of CMakeSetup if you are using CMake instead of
-configure to build PCRE.
-
-
-The complete list of options for configure (which includes the standard
-ones such as the selection of the installation directory) can be obtained by
-running
-
- ./configure --help
-
-The following sections include descriptions of options whose names begin with
---enable or --disable. These settings specify changes to the defaults for the
-configure command. Because of the way that configure works,
---enable and --disable always come in pairs, so the complementary option always
-exists as well, but as it specifies the default, it is not described.
-
-
C++ SUPPORT
-
-By default, the configure script will search for a C++ compiler and C++
-header files. If it finds them, it automatically builds the C++ wrapper library
-for PCRE. You can disable this by adding
-
- --disable-cpp
-
-to the configure command.
-
-
UTF-8 SUPPORT
-
-To build PCRE with support for UTF-8 Unicode character strings, add
-
- --enable-utf8
-
-to the configure command. Of itself, this does not make PCRE treat
-strings as UTF-8. As well as compiling PCRE with this option, you also have
-have to set the PCRE_UTF8 option when you call the pcre_compile()
-function.
-
-
-If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE expects
-its input to be either ASCII or UTF-8 (depending on the runtime option). It is
-not possible to support both EBCDIC and UTF-8 codes in the same version of the
-library. Consequently, --enable-utf8 and --enable-ebcdic are mutually
-exclusive.
-
-
UNICODE CHARACTER PROPERTY SUPPORT
-
-UTF-8 support allows PCRE to process character values greater than 255 in the
-strings that it handles. On its own, however, it does not provide any
-facilities for accessing the properties of such characters. If you want to be
-able to use the pattern escapes \P, \p, and \X, which refer to Unicode
-character properties, you must add
-
- --enable-unicode-properties
-
-to the configure command. This implies UTF-8 support, even if you have
-not explicitly requested it.
-
-
-Including Unicode property support adds around 30K of tables to the PCRE
-library. Only the general category properties such as Lu and Nd are
-supported. Details are given in the
-pcrepattern
-documentation.
-
-
CODE VALUE OF NEWLINE
-
-By default, PCRE interprets the linefeed (LF) character as indicating the end
-of a line. This is the normal newline character on Unix-like systems. You can
-compile PCRE to use carriage return (CR) instead, by adding
-
- --enable-newline-is-cr
-
-to the configure command. There is also a --enable-newline-is-lf option,
-which explicitly specifies linefeed as the newline character.
-
-
-Alternatively, you can specify that line endings are to be indicated by the two
-character sequence CRLF. If you want this, add
-
- --enable-newline-is-crlf
-
-to the configure command. There is a fourth option, specified by
-
- --enable-newline-is-anycrlf
-
-which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
-indicating a line ending. Finally, a fifth option, specified by
-
- --enable-newline-is-any
-
-causes PCRE to recognize any Unicode newline sequence.
-
-
-Whatever line ending convention is selected when PCRE is built can be
-overridden when the library functions are called. At build time it is
-conventional to use the standard for your operating system.
-
-
WHAT \R MATCHES
-
-By default, the sequence \R in a pattern matches any Unicode newline sequence,
-whatever has been selected as the line ending sequence. If you specify
-
- --enable-bsr-anycrlf
-
-the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
-selected when PCRE is built can be overridden when the library functions are
-called.
-
-
BUILDING SHARED AND STATIC LIBRARIES
-
-The PCRE building process uses libtool to build both shared and static
-Unix libraries by default. You can suppress one of these by adding one of
-
- --disable-shared
- --disable-static
-
-to the configure command, as required.
-
-
POSIX MALLOC USAGE
-
-When PCRE is called through the POSIX interface (see the
-pcreposix
-documentation), additional working storage is required for holding the pointers
-to capturing substrings, because PCRE requires three integers per substring,
-whereas the POSIX interface provides only two. If the number of expected
-substrings is small, the wrapper function uses space on the stack, because this
-is faster than using malloc() for each call. The default threshold above
-which the stack is no longer used is 10; it can be changed by adding a setting
-such as
-
- --with-posix-malloc-threshold=20
-
-to the configure command.
-
-
HANDLING VERY LARGE PATTERNS
-
-Within a compiled pattern, offset values are used to point from one part to
-another (for example, from an opening parenthesis to an alternation
-metacharacter). By default, two-byte values are used for these offsets, leading
-to a maximum size for a compiled pattern of around 64K. This is sufficient to
-handle all but the most gigantic patterns. Nevertheless, some people do want to
-process enormous patterns, so it is possible to compile PCRE to use three-byte
-or four-byte offsets by adding a setting such as
-
- --with-link-size=3
-
-to the configure command. The value given must be 2, 3, or 4. Using
-longer offsets slows down the operation of PCRE because it has to load
-additional bytes when handling them.
-
-
AVOIDING EXCESSIVE STACK USAGE
-
-When matching with the pcre_exec() function, PCRE implements backtracking
-by making recursive calls to an internal function called match(). In
-environments where the size of the stack is limited, this can severely limit
-PCRE's operation. (The Unix environment does not usually suffer from this
-problem, but it may sometimes be necessary to increase the maximum stack size.
-There is a discussion in the
-pcrestack
-documentation.) An alternative approach to recursion that uses memory from the
-heap to remember data, instead of using recursive function calls, has been
-implemented to work round the problem of limited stack size. If you want to
-build a version of PCRE that works this way, add
-
- --disable-stack-for-recursion
-
-to the configure command. With this configuration, PCRE will use the
-pcre_stack_malloc and pcre_stack_free variables to call memory
-management functions. By default these point to malloc() and
-free(), but you can replace the pointers so that your own functions are
-used.
-
-
-Separate functions are provided rather than using pcre_malloc and
-pcre_free because the usage is very predictable: the block sizes
-requested are always the same, and the blocks are always freed in reverse
-order. A calling program might be able to implement optimized functions that
-perform better than malloc() and free(). PCRE runs noticeably more
-slowly when built in this way. This option affects only the pcre_exec()
-function; it is not relevant for the the pcre_dfa_exec() function.
-
-
LIMITING PCRE RESOURCE USAGE
-
-Internally, PCRE has a function called match(), which it calls repeatedly
-(sometimes recursively) when matching a pattern with the pcre_exec()
-function. By controlling the maximum number of times this function may be
-called during a single matching operation, a limit can be placed on the
-resources used by a single call to pcre_exec(). The limit can be changed
-at run time, as described in the
-pcreapi
-documentation. The default is 10 million, but this can be changed by adding a
-setting such as
-
- --with-match-limit=500000
-
-to the configure command. This setting has no effect on the
-pcre_dfa_exec() matching function.
-
-
-In some environments it is desirable to limit the depth of recursive calls of
-match() more strictly than the total number of calls, in order to
-restrict the maximum amount of stack (or heap, if --disable-stack-for-recursion
-is specified) that is used. A second limit controls this; it defaults to the
-value that is set for --with-match-limit, which imposes no additional
-constraints. However, you can set a lower limit by adding, for example,
-
- --with-match-limit-recursion=10000
-
-to the configure command. This value can also be overridden at run time.
-
-
CREATING CHARACTER TABLES AT BUILD TIME
-
-PCRE uses fixed tables for processing characters whose code values are less
-than 256. By default, PCRE is built with a set of tables that are distributed
-in the file pcre_chartables.c.dist. These tables are for ASCII codes
-only. If you add
-
- --enable-rebuild-chartables
-
-to the configure command, the distributed tables are no longer used.
-Instead, a program called dftables is compiled and run. This outputs the
-source for new set of tables, created in the default locale of your C runtime
-system. (This method of replacing the tables does not work if you are cross
-compiling, because dftables is run on the local host. If you need to
-create alternative tables when cross compiling, you will have to do so "by
-hand".)
-
-
USING EBCDIC CODE
-
-PCRE assumes by default that it will run in an environment where the character
-code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
-most computer operating systems. PCRE can, however, be compiled to run in an
-EBCDIC environment by adding
-
- --enable-ebcdic
-
-to the configure command. This setting implies
---enable-rebuild-chartables. You should only use it if you know that you are in
-an EBCDIC environment (for example, an IBM mainframe operating system). The
---enable-ebcdic option is incompatible with --enable-utf8.
-
-
PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT
-
-By default, pcregrep reads all files as plain text. You can build it so
-that it recognizes files whose names end in .gz or .bz2, and reads
-them with libz or libbz2, respectively, by adding one or both of
-
- --enable-pcregrep-libz
- --enable-pcregrep-libbz2
-
-to the configure command. These options naturally require that the
-relevant libraries are installed on your system. Configuration will fail if
-they are not.
-
-
PCRETEST OPTION FOR LIBREADLINE SUPPORT
-
-If you add
-
- --enable-pcretest-libreadline
-
-to the configure command, pcretest is linked with the
-libreadline library, and when its input is from a terminal, it reads it
-using the readline() function. This provides line-editing and history
-facilities. Note that libreadline is GPL-licenced, so if you distribute a
-binary of pcretest linked in this way, there may be licensing issues.
-
-
-Setting this option causes the -lreadline option to be added to the
-pcretest build. In many operating environments with a sytem-installed
-libreadline this is sufficient. However, in some environments (e.g.
-if an unmodified distribution version of readline is in use), some extra
-configuration may be necessary. The INSTALL file for libreadline says
-this:
-
- "Readline uses the termcap functions, but does not link with the
- termcap or curses library itself, allowing applications which link
- with readline the to choose an appropriate library."
-
-If your environment has not been set up so that an appropriate library is
-automatically included, you may need to add something like
-
- LIBS="-ncurses"
-
-immediately before the configure command.
-
-
SEE ALSO
-
-pcreapi(3), pcre_config(3).
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 17 March 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcrecallout.html b/libs/pcre/doc/html/pcrecallout.html
deleted file mode 100644
index 217764e7b5..0000000000
--- a/libs/pcre/doc/html/pcrecallout.html
+++ /dev/null
@@ -1,208 +0,0 @@
-
-
-pcrecallout specification
-
-
-pcrecallout man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
PCRE CALLOUTS
-
-int (*pcre_callout)(pcre_callout_block *);
-
-
-PCRE provides a feature called "callout", which is a means of temporarily
-passing control to the caller of PCRE in the middle of pattern matching. The
-caller of PCRE provides an external function by putting its entry point in the
-global variable pcre_callout. By default, this variable contains NULL,
-which disables all calling out.
-
-
-Within a regular expression, (?C) indicates the points at which the external
-function is to be called. Different callout points can be identified by putting
-a number less than 256 after the letter C. The default value is zero.
-For example, this pattern has two callout points:
-
- (?C1)abc(?C2)def
-
-If the PCRE_AUTO_CALLOUT option bit is set when pcre_compile() is called,
-PCRE automatically inserts callouts, all with number 255, before each item in
-the pattern. For example, if PCRE_AUTO_CALLOUT is used with the pattern
-
- A(\d{2}|--)
-
-it is processed as if it were
-
-
-(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
-
-
-Notice that there is a callout before and after each parenthesis and
-alternation bar. Automatic callouts can be used for tracking the progress of
-pattern matching. The
-pcretest
-command has an option that sets automatic callouts; when it is used, the output
-indicates how the pattern is matched. This is useful information when you are
-trying to optimize the performance of a particular pattern.
-
-
MISSING CALLOUTS
-
-You should be aware that, because of optimizations in the way PCRE matches
-patterns by default, callouts sometimes do not happen. For example, if the
-pattern is
-
- ab(?C4)cd
-
-PCRE knows that any matching string must contain the letter "d". If the subject
-string is "abyz", the lack of "d" means that matching doesn't ever start, and
-the callout is never reached. However, with "abyd", though the result is still
-no match, the callout is obeyed.
-
-
-You can disable these optimizations by passing the PCRE_NO_START_OPTIMIZE
-option to pcre_exec() or pcre_dfa_exec(). This slows down the
-matching process, but does ensure that callouts such as the example above are
-obeyed.
-
-
THE CALLOUT INTERFACE
-
-During matching, when PCRE reaches a callout point, the external function
-defined by pcre_callout is called (if it is set). This applies to both
-the pcre_exec() and the pcre_dfa_exec() matching functions. The
-only argument to the callout function is a pointer to a pcre_callout
-block. This structure contains the following fields:
-
- int version;
- int callout_number;
- int *offset_vector;
- const char *subject;
- int subject_length;
- int start_match;
- int current_position;
- int capture_top;
- int capture_last;
- void *callout_data;
- int pattern_position;
- int next_item_length;
-
-The version field is an integer containing the version number of the
-block format. The initial version was 0; the current version is 1. The version
-number will change again in future if additional fields are added, but the
-intention is never to remove any of the existing fields.
-
-
-The callout_number field contains the number of the callout, as compiled
-into the pattern (that is, the number after ?C for manual callouts, and 255 for
-automatically generated callouts).
-
-
-The offset_vector field is a pointer to the vector of offsets that was
-passed by the caller to pcre_exec() or pcre_dfa_exec(). When
-pcre_exec() is used, the contents can be inspected in order to extract
-substrings that have been matched so far, in the same way as for extracting
-substrings after a match has completed. For pcre_dfa_exec() this field is
-not useful.
-
-
-The subject and subject_length fields contain copies of the values
-that were passed to pcre_exec().
-
-
-The start_match field normally contains the offset within the subject at
-which the current match attempt started. However, if the escape sequence \K
-has been encountered, this value is changed to reflect the modified starting
-point. If the pattern is not anchored, the callout function may be called
-several times from the same point in the pattern for different starting points
-in the subject.
-
-
-The current_position field contains the offset within the subject of the
-current match pointer.
-
-
-When the pcre_exec() function is used, the capture_top field
-contains one more than the number of the highest numbered captured substring so
-far. If no substrings have been captured, the value of capture_top is
-one. This is always the case when pcre_dfa_exec() is used, because it
-does not support captured substrings.
-
-
-The capture_last field contains the number of the most recently captured
-substring. If no substrings have been captured, its value is -1. This is always
-the case when pcre_dfa_exec() is used.
-
-
-The callout_data field contains a value that is passed to
-pcre_exec() or pcre_dfa_exec() specifically so that it can be
-passed back in callouts. It is passed in the pcre_callout field of the
-pcre_extra data structure. If no such data was passed, the value of
-callout_data in a pcre_callout block is NULL. There is a
-description of the pcre_extra structure in the
-pcreapi
-documentation.
-
-
-The pattern_position field is present from version 1 of the
-pcre_callout structure. It contains the offset to the next item to be
-matched in the pattern string.
-
-
-The next_item_length field is present from version 1 of the
-pcre_callout structure. It contains the length of the next item to be
-matched in the pattern string. When the callout immediately precedes an
-alternation bar, a closing parenthesis, or the end of the pattern, the length
-is zero. When the callout precedes an opening parenthesis, the length is that
-of the entire subpattern.
-
-
-The pattern_position and next_item_length fields are intended to
-help in distinguishing between different automatic callouts, which all have the
-same callout number. However, they are set for all callouts.
-
-
RETURN VALUES
-
-The external callout function returns an integer to PCRE. If the value is zero,
-matching proceeds as normal. If the value is greater than zero, matching fails
-at the current point, but the testing of other matching possibilities goes
-ahead, just as if a lookahead assertion had failed. If the value is less than
-zero, the match is abandoned, and pcre_exec() (or pcre_dfa_exec())
-returns the negative value.
-
-
-Negative values should normally be chosen from the set of PCRE_ERROR_xxx
-values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
-The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
-it will never be used by PCRE itself.
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 15 March 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcrecompat.html b/libs/pcre/doc/html/pcrecompat.html
deleted file mode 100644
index d1b93d043d..0000000000
--- a/libs/pcre/doc/html/pcrecompat.html
+++ /dev/null
@@ -1,179 +0,0 @@
-
-
-pcrecompat specification
-
-
-pcrecompat man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-DIFFERENCES BETWEEN PCRE AND PERL
-
-
-This document describes the differences in the ways that PCRE and Perl handle
-regular expressions. The differences described here are mainly with respect to
-Perl 5.8, though PCRE versions 7.0 and later contain some features that are
-expected to be in the forthcoming Perl 5.10.
-
-
-1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
-it does have are given in the
-section on UTF-8 support
-in the main
-pcre
-page.
-
-
-2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
-them, but they do not mean what you might think. For example, (?!a){3} does
-not assert that the next three characters are not "a". It just asserts that the
-next character is not "a" three times.
-
-
-3. Capturing subpatterns that occur inside negative lookahead assertions are
-counted, but their entries in the offsets vector are never set. Perl sets its
-numerical variables from any such patterns that are matched before the
-assertion fails to match something (thereby succeeding), but only if the
-negative lookahead assertion contains just one branch.
-
-
-4. Though binary zero characters are supported in the subject string, they are
-not allowed in a pattern string because it is passed as a normal C string,
-terminated by zero. The escape sequence \0 can be used in the pattern to
-represent a binary zero.
-
-
-5. The following Perl escape sequences are not supported: \l, \u, \L,
-\U, and \N. In fact these are implemented by Perl's general string-handling
-and are not part of its pattern matching engine. If any of these are
-encountered by PCRE, an error is generated.
-
-
-6. The Perl escape sequences \p, \P, and \X are supported only if PCRE is
-built with Unicode character property support. The properties that can be
-tested with \p and \P are limited to the general category properties such as
-Lu and Nd, script names such as Greek or Han, and the derived properties Any
-and L&.
-
-
-7. PCRE does support the \Q...\E escape for quoting substrings. Characters in
-between are treated as literals. This is slightly different from Perl in that $
-and @ are also handled as literals inside the quotes. In Perl, they cause
-variable interpolation (but of course PCRE does not have variables). Note the
-following examples:
-
- Pattern PCRE matches Perl matches
-
- \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz
- \Qabc\$xyz\E abc\$xyz abc\$xyz
- \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
-
-The \Q...\E sequence is recognized both inside and outside character classes.
-
-
-8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
-constructions. However, there is support for recursive patterns. This is not
-available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
-feature allows an external function to be called during pattern matching. See
-the
-pcrecallout
-documentation for details.
-
-
-9. Subpatterns that are called recursively or as "subroutines" are always
-treated as atomic groups in PCRE. This is like Python, but unlike Perl.
-
-
-10. There are some differences that are concerned with the settings of captured
-strings when part of a pattern is repeated. For example, matching "aba" against
-the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
-
-
-11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
-(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
-argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
-parentheses, PCRE does not set that capture group; this is different to Perl.
-
-
-12. PCRE provides some extensions to the Perl regular expression facilities.
-Perl 5.10 will include new features that are not in earlier versions, some of
-which (such as named parentheses) have been in PCRE for some time. This list is
-with respect to Perl 5.10:
-
-
-(a) Although lookbehind assertions must match fixed length strings, each
-alternative branch of a lookbehind assertion can match a different length of
-string. Perl requires them all to have the same length.
-
-
-(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
-meta-character matches only at the very end of the string.
-
-
-(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
-meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
-(Perl can be made to issue a warning.)
-
-
-(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
-inverted, that is, by default they are not greedy, but if followed by a
-question mark they are.
-
-
-(e) PCRE_ANCHORED can be used at matching time to force a pattern to be tried
-only at the first matching position in the subject string.
-
-
-(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAPTURE
-options for pcre_exec() have no Perl equivalents.
-
-
-(g) The \R escape sequence can be restricted to match only CR, LF, or CRLF
-by the PCRE_BSR_ANYCRLF option.
-
-
-(h) The callout facility is PCRE-specific.
-
-
-(i) The partial matching facility is PCRE-specific.
-
-
-(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
-different hosts that have the other endianness.
-
-
-(k) The alternative matching function (pcre_dfa_exec()) matches in a
-different way and is not Perl-compatible.
-
-
-(l) PCRE recognizes some special sequences such as (*CR) at the start of
-a pattern that set overall options that cannot be changed within the pattern.
-
-
-AUTHOR
-
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
-REVISION
-
-
-Last updated: 11 September 2007
-
-Copyright © 1997-2007 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcrecpp.html b/libs/pcre/doc/html/pcrecpp.html
deleted file mode 100644
index 7b52d92fda..0000000000
--- a/libs/pcre/doc/html/pcrecpp.html
+++ /dev/null
@@ -1,370 +0,0 @@
-
-
-pcrecpp specification
-
-
-pcrecpp man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
SYNOPSIS OF C++ WRAPPER
-
-#include <pcrecpp.h>
-
-
DESCRIPTION
-
-The C++ wrapper for PCRE was provided by Google Inc. Some additional
-functionality was added by Giuseppe Maxia. This brief man page was constructed
-from the notes in the pcrecpp.h file, which should be consulted for
-further details.
-
-
MATCHING INTERFACE
-
-The "FullMatch" operation checks that supplied text matches a supplied pattern
-exactly. If pointer arguments are supplied, it copies matched sub-strings that
-match sub-patterns into them.
-
- Example: successful match
- pcrecpp::RE re("h.*o");
- re.FullMatch("hello");
-
- Example: unsuccessful match (requires full match):
- pcrecpp::RE re("e");
- !re.FullMatch("hello");
-
- Example: creating a temporary RE object:
- pcrecpp::RE("h.*o").FullMatch("hello");
-
-You can pass in a "const char*" or a "string" for "text". The examples below
-tend to use a const char*. You can, as in the different examples above, store
-the RE object explicitly in a variable or use a temporary RE object. The
-examples below use one mode or the other arbitrarily. Either could correctly be
-used for any of these examples.
-
-
-You must supply extra pointer arguments to extract matched subpieces.
-
- Example: extracts "ruby" into "s" and 1234 into "i"
- int i;
- string s;
- pcrecpp::RE re("(\\w+):(\\d+)");
- re.FullMatch("ruby:1234", &s, &i);
-
- Example: does not try to extract any extra sub-patterns
- re.FullMatch("ruby:1234", &s);
-
- Example: does not try to extract into NULL
- re.FullMatch("ruby:1234", NULL, &i);
-
- Example: integer overflow causes failure
- !re.FullMatch("ruby:1234567891234", NULL, &i);
-
- Example: fails because there aren't enough sub-patterns:
- !pcrecpp::RE("\\w+:\\d+").FullMatch("ruby:1234", &s);
-
- Example: fails because string cannot be stored in integer
- !pcrecpp::RE("(.*)").FullMatch("ruby", &i);
-
-The provided pointer arguments can be pointers to any scalar numeric
-type, or one of:
-
- string (matched piece is copied to string)
- StringPiece (StringPiece is mutated to point to matched piece)
- T (where "bool T::ParseFrom(const char*, int)" exists)
- NULL (the corresponding matched sub-pattern is not copied)
-
-The function returns true iff all of the following conditions are satisfied:
-
- a. "text" matches "pattern" exactly;
-
- b. The number of matched sub-patterns is >= number of supplied
- pointers;
-
- c. The "i"th argument has a suitable type for holding the
- string captured as the "i"th sub-pattern. If you pass in
- void * NULL for the "i"th argument, or a non-void * NULL
- of the correct type, or pass fewer arguments than the
- number of sub-patterns, "i"th captured sub-pattern is
- ignored.
-
-CAVEAT: An optional sub-pattern that does not exist in the matched
-string is assigned the empty string. Therefore, the following will
-return false (because the empty string is not a valid number):
-
- int number;
- pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
-
-The matching interface supports at most 16 arguments per call.
-If you need more, consider using the more general interface
-pcrecpp::RE::DoMatch. See pcrecpp.h for the signature for
-DoMatch.
-
-
-NOTE: Do not use no_arg, which is used internally to mark the end of a
-list of optional arguments, as a placeholder for missing arguments, as this can
-lead to segfaults.
-
-
QUOTING METACHARACTERS
-
-You can use the "QuoteMeta" operation to insert backslashes before all
-potentially meaningful characters in a string. The returned string, used as a
-regular expression, will exactly match the original string.
-
- Example:
- string quoted = RE::QuoteMeta(unquoted);
-
-Note that it's legal to escape a character even if it has no special meaning in
-a regular expression -- so this function does that. (This also makes it
-identical to the perl function of the same name; see "perldoc -f quotemeta".)
-For example, "1.5-2.0?" becomes "1\.5\-2\.0\?".
-
-
PARTIAL MATCHES
-
-You can use the "PartialMatch" operation when you want the pattern
-to match any substring of the text.
-
- Example: simple search for a string:
- pcrecpp::RE("ell").PartialMatch("hello");
-
- Example: find first number in a string:
- int number;
- pcrecpp::RE re("(\\d+)");
- re.PartialMatch("x*100 + 20", &number);
- assert(number == 100);
-
-
-
UTF-8 AND THE MATCHING INTERFACE
-
-By default, pattern and text are plain text, one byte per character. The UTF8
-flag, passed to the constructor, causes both pattern and string to be treated
-as UTF-8 text, still a byte stream but potentially multiple bytes per
-character. In practice, the text is likelier to be UTF-8 than the pattern, but
-the match returned may depend on the UTF8 flag, so always use it when matching
-UTF8 text. For example, "." will match one byte normally but with UTF8 set may
-match up to three bytes of a multi-byte character.
-
- Example:
- pcrecpp::RE_Options options;
- options.set_utf8();
- pcrecpp::RE re(utf8_pattern, options);
- re.FullMatch(utf8_string);
-
- Example: using the convenience function UTF8():
- pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8());
- re.FullMatch(utf8_string);
-
-NOTE: The UTF8 flag is ignored if pcre was not configured with the
-
- --enable-utf8 flag.
-
-
-
PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
-
-PCRE defines some modifiers to change the behavior of the regular expression
-engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to
-pass such modifiers to a RE class. Currently, the following modifiers are
-supported:
-
- modifier description Perl corresponding
-
- PCRE_CASELESS case insensitive match /i
- PCRE_MULTILINE multiple lines match /m
- PCRE_DOTALL dot matches newlines /s
- PCRE_DOLLAR_ENDONLY $ matches only at end N/A
- PCRE_EXTRA strict escape parsing N/A
- PCRE_EXTENDED ignore whitespaces /x
- PCRE_UTF8 handles UTF8 chars built-in
- PCRE_UNGREEDY reverses * and *? N/A
- PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*)
-
-(*) Both Perl and PCRE allow non capturing parentheses by means of the
-"?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
-capture, while (ab|cd) does.
-
-
-For a full account on how each modifier works, please check the
-PCRE API reference page.
-
-
-For each modifier, there are two member functions whose name is made
-out of the modifier in lowercase, without the "PCRE_" prefix. For
-instance, PCRE_CASELESS is handled by
-
- bool caseless()
-
-which returns true if the modifier is set, and
-
- RE_Options & set_caseless(bool)
-
-which sets or unsets the modifier. Moreover, PCRE_EXTRA_MATCH_LIMIT can be
-accessed through the set_match_limit() and match_limit() member
-functions. Setting match_limit to a non-zero value will limit the
-execution of pcre to keep it from doing bad things like blowing the stack or
-taking an eternity to return a result. A value of 5000 is good enough to stop
-stack blowup in a 2MB thread stack. Setting match_limit to zero disables
-match limiting. Alternatively, you can call match_limit_recursion()
-which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much PCRE
-recurses. match_limit() limits the number of matches PCRE does;
-match_limit_recursion() limits the depth of internal recursion, and
-therefore the amount of stack that is used.
-
-
-Normally, to pass one or more modifiers to a RE class, you declare
-a RE_Options object, set the appropriate options, and pass this
-object to a RE constructor. Example:
-
- RE_options opt;
- opt.set_caseless(true);
- if (RE("HELLO", opt).PartialMatch("hello world")) ...
-
-RE_options has two constructors. The default constructor takes no arguments and
-creates a set of flags that are off by default. The optional parameter
-option_flags is to facilitate transfer of legacy code from C programs.
-This lets you do
-
- RE(pattern,
- RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
-
-However, new code is better off doing
-
- RE(pattern,
- RE_Options().set_caseless(true).set_multiline(true))
- .PartialMatch(str);
-
-If you are going to pass one of the most used modifiers, there are some
-convenience functions that return a RE_Options class with the
-appropriate modifier already set: CASELESS(), UTF8(),
-MULTILINE(), DOTALL(), and EXTENDED().
-
-
-If you need to set several options at once, and you don't want to go through
-the pains of declaring a RE_Options object and setting several options, there
-is a parallel method that give you such ability on the fly. You can concatenate
-several set_xxxxx() member functions, since each of them returns a
-reference to its class object. For example, to pass PCRE_CASELESS,
-PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write:
-
- RE(" ^ xyz \\s+ .* blah$",
- RE_Options()
- .set_caseless(true)
- .set_extended(true)
- .set_multiline(true)).PartialMatch(sometext);
-
-
-
-
SCANNING TEXT INCREMENTALLY
-
-The "Consume" operation may be useful if you want to repeatedly
-match regular expressions at the front of a string and skip over
-them as they match. This requires use of the "StringPiece" type,
-which represents a sub-range of a real string. Like RE, StringPiece
-is defined in the pcrecpp namespace.
-
- Example: read lines of the form "var = value" from a string.
- string contents = ...; // Fill string somehow
- pcrecpp::StringPiece input(contents); // Wrap in a StringPiece
-
-
-
-
- string var;
- int value;
- pcrecpp::RE re("(\\w+) = (\\d+)\n");
- while (re.Consume(&input, &var, &value)) {
- ...;
- }
-
-Each successful call to "Consume" will set "var/value", and also
-advance "input" so it points past the matched text.
-
-
-The "FindAndConsume" operation is similar to "Consume" but does not
-anchor your match at the beginning of the string. For example, you
-could extract all words from a string by repeatedly calling
-
- pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word)
-
-
-
PARSING HEX/OCTAL/C-RADIX NUMBERS
-
-By default, if you pass a pointer to a numeric value, the
-corresponding text is interpreted as a base-10 number. You can
-instead wrap the pointer with a call to one of the operators Hex(),
-Octal(), or CRadix() to interpret the text in another base. The
-CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
-prefixes, but defaults to base-10.
-
- Example:
- int a, b, c, d;
- pcrecpp::RE re("(.*) (.*) (.*) (.*)");
- re.FullMatch("100 40 0100 0x40",
- pcrecpp::Octal(&a), pcrecpp::Hex(&b),
- pcrecpp::CRadix(&c), pcrecpp::CRadix(&d));
-
-will leave 64 in a, b, c, and d.
-
-
REPLACING PARTS OF STRINGS
-
-You can replace the first match of "pattern" in "str" with "rewrite".
-Within "rewrite", backslash-escaped digits (\1 to \9) can be
-used to insert text matching corresponding parenthesized group
-from the pattern. \0 in "rewrite" refers to the entire matching
-text. For example:
-
- string s = "yabba dabba doo";
- pcrecpp::RE("b+").Replace("d", &s);
-
-will leave "s" containing "yada dabba doo". The result is true if the pattern
-matches and a replacement occurs, false otherwise.
-
-
-GlobalReplace is like Replace except that it replaces all
-occurrences of the pattern in the string with the rewrite. Replacements are
-not subject to re-matching. For example:
-
- string s = "yabba dabba doo";
- pcrecpp::RE("b+").GlobalReplace("d", &s);
-
-will leave "s" containing "yada dada doo". It returns the number of
-replacements made.
-
-
-Extract is like Replace, except that if the pattern matches,
-"rewrite" is copied into "out" (an additional argument) with substitutions.
-The non-matching portions of "text" are ignored. Returns true iff a match
-occurred and the extraction happened successfully; if no match occurs, the
-string is left unaffected.
-
-
AUTHOR
-
-The C++ wrapper was contributed by Google Inc.
-
-Copyright © 2007 Google Inc.
-
-
-
REVISION
-
-Last updated: 17 March 2009
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcregrep.html b/libs/pcre/doc/html/pcregrep.html
deleted file mode 100644
index 13e45d90ae..0000000000
--- a/libs/pcre/doc/html/pcregrep.html
+++ /dev/null
@@ -1,533 +0,0 @@
-
-
-pcregrep specification
-
-
-pcregrep man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
SYNOPSIS
-
-pcregrep [options] [long options] [pattern] [path1 path2 ...]
-
-
DESCRIPTION
-
-pcregrep searches files for character patterns, in the same way as other
-grep commands do, but it uses the PCRE regular expression library to support
-patterns that are compatible with the regular expressions of Perl 5. See
-pcrepattern(3)
-for a full description of syntax and semantics of the regular expressions
-that PCRE supports.
-
-
-Patterns, whether supplied on the command line or in a separate file, are given
-without delimiters. For example:
-
- pcregrep Thursday /etc/motd
-
-If you attempt to use delimiters (for example, by surrounding a pattern with
-slashes, as is common in Perl scripts), they are interpreted as part of the
-pattern. Quotes can of course be used to delimit patterns on the command line
-because they are interpreted by the shell, and indeed they are required if a
-pattern contains white space or shell metacharacters.
-
-
-The first argument that follows any option settings is treated as the single
-pattern to be matched when neither -e nor -f is present.
-Conversely, when one or both of these options are used to specify patterns, all
-arguments are treated as path names. At least one of -e, -f, or an
-argument pattern must be provided.
-
-
-If no files are specified, pcregrep reads the standard input. The
-standard input can also be referenced by a name consisting of a single hyphen.
-For example:
-
- pcregrep some-pattern /file1 - /file3
-
-By default, each line that matches a pattern is copied to the standard
-output, and if there is more than one file, the file name is output at the
-start of each line, followed by a colon. However, there are options that can
-change how pcregrep behaves. In particular, the -M option makes it
-possible to search for patterns that span line boundaries. What defines a line
-boundary is controlled by the -N (--newline) option.
-
-
-Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
-BUFSIZ is defined in <stdio.h>. When there is more than one pattern
-(specified by the use of -e and/or -f), each pattern is applied to
-each line in the order in which they are defined, except that all the -e
-patterns are tried before the -f patterns.
-
-
-By default, as soon as one pattern matches (or fails to match when -v is
-used), no further patterns are considered. However, if --colour (or
---color) is used to colour the matching substrings, or if
---only-matching, --file-offsets, or --line-offsets is used to
-output only the part of the line that matched (either shown literally, or as an
-offset), scanning resumes immediately following the match, so that further
-matches on the same line can be found. If there are multiple patterns, they are
-all tried on the remainder of the line, but patterns that follow the one that
-matched are not tried on the earlier part of the line.
-
-
-This is the same behaviour as GNU grep, but it does mean that the order in
-which multiple patterns are specified can affect the output when one of the
-above options is used.
-
-
-Patterns that can match an empty string are accepted, but empty string
-matches are not recognized. An example is the pattern "(super)?(man)?", in
-which all components are optional. This pattern finds all occurrences of both
-"super" and "man"; the output differs from matching with "super|man" when only
-the matching substrings are being shown.
-
-
-If the LC_ALL or LC_CTYPE environment variable is set,
-pcregrep uses the value to set a locale when calling the PCRE library.
-The --locale option can be used to override this.
-
-
SUPPORT FOR COMPRESSED FILES
-
-It is possible to compile pcregrep so that it uses libz or
-libbz2 to read files whose names end in .gz or .bz2,
-respectively. You can find out whether your binary has support for one or both
-of these file types by running it with the --help option. If the
-appropriate support is not present, files are treated as plain text. The
-standard input is always so treated.
-
-
OPTIONS
-
---
-This terminate the list of options. It is useful if the next item on the
-command line starts with a hyphen but is not an option. This allows for the
-processing of patterns and filenames that start with hyphens.
-
-
--A number, --after-context=number
-Output number lines of context after each matching line. If filenames
-and/or line numbers are being output, a hyphen separator is used instead of a
-colon for the context lines. A line containing "--" is output between each
-group of lines, unless they are in fact contiguous in the input file. The value
-of number is expected to be relatively small. However, pcregrep
-guarantees to have up to 8K of following text available for context output.
-
-
--B number, --before-context=number
-Output number lines of context before each matching line. If filenames
-and/or line numbers are being output, a hyphen separator is used instead of a
-colon for the context lines. A line containing "--" is output between each
-group of lines, unless they are in fact contiguous in the input file. The value
-of number is expected to be relatively small. However, pcregrep
-guarantees to have up to 8K of preceding text available for context output.
-
-
--C number, --context=number
-Output number lines of context both before and after each matching line.
-This is equivalent to setting both -A and -B to the same value.
-
-
--c, --count
-Do not output individual lines; instead just output a count of the number of
-lines that would otherwise have been output. If several files are given, a
-count is output for each of them. In this mode, the -A, -B, and
--C options are ignored.
-
-
---colour, --color
-If this option is given without any data, it is equivalent to "--colour=auto".
-If data is required, it must be given in the same shell item, separated by an
-equals sign.
-
-
---colour=value, --color=value
-This option specifies under what circumstances the parts of a line that matched
-a pattern should be coloured in the output. By default, the output is not
-coloured. The value (which is optional, see above) may be "never", "always", or
-"auto". In the latter case, colouring happens only if the standard output is
-connected to a terminal. More resources are used when colouring is enabled,
-because pcregrep has to search for all possible matches in a line, not
-just one, in order to colour them all.
-
-
-The colour that is used can be specified by setting the environment variable
-PCREGREP_COLOUR or PCREGREP_COLOR. The value of this variable should be a
-string of two numbers, separated by a semicolon. They are copied directly into
-the control string for setting colour on a terminal, so it is your
-responsibility to ensure that they make sense. If neither of the environment
-variables is set, the default is "1;31", which gives red.
-
-
--D action, --devices=action
-If an input path is not a regular file or a directory, "action" specifies how
-it is to be processed. Valid values are "read" (the default) or "skip"
-(silently skip the path).
-
-
--d action, --directories=action
-If an input path is a directory, "action" specifies how it is to be processed.
-Valid values are "read" (the default), "recurse" (equivalent to the -r
-option), or "skip" (silently skip the path). In the default case, directories
-are read as if they were ordinary files. In some operating systems the effect
-of reading a directory like this is an immediate end-of-file.
-
-
--e pattern, --regex=pattern, --regexp=pattern
-Specify a pattern to be matched. This option can be used multiple times in
-order to specify several patterns. It can also be used as a way of specifying a
-single pattern that starts with a hyphen. When -e is used, no argument
-pattern is taken from the command line; all arguments are treated as file
-names. There is an overall maximum of 100 patterns. They are applied to each
-line in the order in which they are defined until one matches (or fails to
-match if -v is used). If -f is used with -e, the command line
-patterns are matched first, followed by the patterns from the file, independent
-of the order in which these options are specified. Note that multiple use of
--e is not the same as a single pattern with alternatives. For example,
-X|Y finds the first character in a line that is X or Y, whereas if the two
-patterns are given separately, pcregrep finds X if it is present, even if
-it follows Y in the line. It finds Y only if there is no X in the line. This
-really matters only if you are using -o to show the part(s) of the line
-that matched.
-
-
---exclude=pattern
-When pcregrep is searching the files in a directory as a consequence of
-the -r (recursive search) option, any regular files whose names match the
-pattern are excluded. Subdirectories are not excluded by this option; they are
-searched recursively, subject to the --exclude_dir and
---include_dir options. The pattern is a PCRE regular expression, and is
-matched against the final component of the file name (not the entire path). If
-a file name matches both --include and --exclude, it is excluded.
-There is no short form for this option.
-
-
---exclude_dir=pattern
-When pcregrep is searching the contents of a directory as a consequence
-of the -r (recursive search) option, any subdirectories whose names match
-the pattern are excluded. (Note that the \fP--exclude\fP option does not affect
-subdirectories.) The pattern is a PCRE regular expression, and is matched
-against the final component of the name (not the entire path). If a
-subdirectory name matches both --include_dir and --exclude_dir, it
-is excluded. There is no short form for this option.
-
-
--F, --fixed-strings
-Interpret each pattern as a list of fixed strings, separated by newlines,
-instead of as a regular expression. The -w (match as a word) and -x
-(match whole line) options can be used with -F. They apply to each of the
-fixed strings. A line is selected if any of the fixed strings are found in it
-(subject to -w or -x, if present).
-
-
--f filename, --file=filename
-Read a number of patterns from the file, one per line, and match them against
-each line of input. A data line is output if any of the patterns match it. The
-filename can be given as "-" to refer to the standard input. When -f is
-used, patterns specified on the command line using -e may also be
-present; they are tested before the file's patterns. However, no other pattern
-is taken from the command line; all arguments are treated as file names. There
-is an overall maximum of 100 patterns. Trailing white space is removed from
-each line, and blank lines are ignored. An empty file contains no patterns and
-therefore matches nothing. See also the comments about multiple patterns versus
-a single pattern with alternatives in the description of -e above.
-
-
---file-offsets
-Instead of showing lines or parts of lines that match, show each match as an
-offset from the start of the file and a length, separated by a comma. In this
-mode, no context is shown. That is, the -A, -B, and -C
-options are ignored. If there is more than one match in a line, each of them is
-shown separately. This option is mutually exclusive with --line-offsets
-and --only-matching.
-
-
--H, --with-filename
-Force the inclusion of the filename at the start of output lines when searching
-a single file. By default, the filename is not shown in this case. For matching
-lines, the filename is followed by a colon; for context lines, a hyphen
-separator is used. If a line number is also being output, it follows the file
-name.
-
-
--h, --no-filename
-Suppress the output filenames when searching multiple files. By default,
-filenames are shown when multiple files are searched. For matching lines, the
-filename is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name.
-
-
---help
-Output a help message, giving brief details of the command options and file
-type support, and then exit.
-
-
--i, --ignore-case
-Ignore upper/lower case distinctions during comparisons.
-
-
---include=pattern
-When pcregrep is searching the files in a directory as a consequence of
-the -r (recursive search) option, only those regular files whose names
-match the pattern are included. Subdirectories are always included and searched
-recursively, subject to the \fP--include_dir\fP and --exclude_dir
-options. The pattern is a PCRE regular expression, and is matched against the
-final component of the file name (not the entire path). If a file name matches
-both --include and --exclude, it is excluded. There is no short
-form for this option.
-
-
---include_dir=pattern
-When pcregrep is searching the contents of a directory as a consequence
-of the -r (recursive search) option, only those subdirectories whose
-names match the pattern are included. (Note that the --include option
-does not affect subdirectories.) The pattern is a PCRE regular expression, and
-is matched against the final component of the name (not the entire path). If a
-subdirectory name matches both --include_dir and --exclude_dir, it
-is excluded. There is no short form for this option.
-
-
--L, --files-without-match
-Instead of outputting lines from the files, just output the names of the files
-that do not contain any lines that would have been output. Each file name is
-output once, on a separate line.
-
-
--l, --files-with-matches
-Instead of outputting lines from the files, just output the names of the files
-containing lines that would have been output. Each file name is output
-once, on a separate line. Searching stops as soon as a matching line is found
-in a file.
-
-
---label=name
-This option supplies a name to be used for the standard input when file names
-are being output. If not supplied, "(standard input)" is used. There is no
-short form for this option.
-
-
---line-offsets
-Instead of showing lines or parts of lines that match, show each match as a
-line number, the offset from the start of the line, and a length. The line
-number is terminated by a colon (as usual; see the -n option), and the
-offset and length are separated by a comma. In this mode, no context is shown.
-That is, the -A, -B, and -C options are ignored. If there is
-more than one match in a line, each of them is shown separately. This option is
-mutually exclusive with --file-offsets and --only-matching.
-
-
---locale=locale-name
-This option specifies a locale to be used for pattern matching. It overrides
-the value in the LC_ALL or LC_CTYPE environment variables. If no
-locale is specified, the PCRE library's default (usually the "C" locale) is
-used. There is no short form for this option.
-
-
--M, --multiline
-Allow patterns to match more than one line. When this option is given, patterns
-may usefully contain literal newline characters and internal occurrences of ^
-and $ characters. The output for any one match may consist of more than one
-line. When this option is set, the PCRE library is called in "multiline" mode.
-There is a limit to the number of lines that can be matched, imposed by the way
-that pcregrep buffers the input file as it scans it. However,
-pcregrep ensures that at least 8K characters or the rest of the document
-(whichever is the shorter) are available for forward matching, and similarly
-the previous 8K characters (or all the previous characters, if fewer than 8K)
-are guaranteed to be available for lookbehind assertions.
-
-
--N newline-type, --newline=newline-type
-The PCRE library supports five different conventions for indicating
-the ends of lines. They are the single-character sequences CR (carriage return)
-and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
-which recognizes any of the preceding three types, and an "any" convention, in
-which any Unicode line ending sequence is assumed to end a line. The Unicode
-sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
-(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
-PS (paragraph separator, U+2029).
-
-
-When the PCRE library is built, a default line-ending sequence is specified.
-This is normally the standard sequence for the operating system. Unless
-otherwise specified by this option, pcregrep uses the library's default.
-The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
-makes it possible to use pcregrep on files that have come from other
-environments without having to modify their line endings. If the data that is
-being scanned does not agree with the convention set by this option,
-pcregrep may behave in strange ways.
-
-
--n, --line-number
-Precede each output line by its line number in the file, followed by a colon
-for matching lines or a hyphen for context lines. If the filename is also being
-output, it precedes the line number. This option is forced if
---line-offsets is used.
-
-
--o, --only-matching
-Show only the part of the line that matched a pattern. In this mode, no
-context is shown. That is, the -A, -B, and -C options are
-ignored. If there is more than one match in a line, each of them is shown
-separately. If -o is combined with -v (invert the sense of the
-match to find non-matching lines), no output is generated, but the return code
-is set appropriately. This option is mutually exclusive with
---file-offsets and --line-offsets.
-
-
--q, --quiet
-Work quietly, that is, display nothing except error messages. The exit
-status indicates whether or not any matches were found.
-
-
--r, --recursive
-If any given path is a directory, recursively scan the files it contains,
-taking note of any --include and --exclude settings. By default, a
-directory is read as a normal file; in some operating systems this gives an
-immediate end-of-file. This option is a shorthand for setting the -d
-option to "recurse".
-
-
--s, --no-messages
-Suppress error messages about non-existent or unreadable files. Such files are
-quietly skipped. However, the return code is still 2, even if matches were
-found in other files.
-
-
--u, --utf-8
-Operate in UTF-8 mode. This option is available only if PCRE has been compiled
-with UTF-8 support. Both patterns and subject lines must be valid strings of
-UTF-8 characters.
-
-
--V, --version
-Write the version numbers of pcregrep and the PCRE library that is being
-used to the standard error stream.
-
-
--v, --invert-match
-Invert the sense of the match, so that lines which do not match any of
-the patterns are the ones that are found.
-
-
--w, --word-regex, --word-regexp
-Force the patterns to match only whole words. This is equivalent to having \b
-at the start and end of the pattern.
-
-
--x, --line-regex, --line-regexp
-Force the patterns to be anchored (each must start matching at the beginning of
-a line) and in addition, require them to match entire lines. This is
-equivalent to having ^ and $ characters at the start and end of each
-alternative branch in every pattern.
-
-
ENVIRONMENT VARIABLES
-
-The environment variables LC_ALL and LC_CTYPE are examined, in that
-order, for a locale. The first one that is set is used. This can be overridden
-by the --locale option. If no locale is set, the PCRE library's default
-(usually the "C" locale) is used.
-
-
NEWLINES
-
-The -N (--newline) option allows pcregrep to scan files with
-different newline conventions from the default. However, the setting of this
-option does not affect the way in which pcregrep writes information to
-the standard error and output streams. It uses the string "\n" in C
-printf() calls to indicate newlines, relying on the C I/O library to
-convert this to an appropriate sequence if the output is sent to a file.
-
-
OPTIONS COMPATIBILITY
-
-The majority of short and long forms of pcregrep's options are the same
-as in the GNU grep program. Any long option of the form
---xxx-regexp (GNU terminology) is also available as --xxx-regex
-(PCRE terminology). However, the --locale, -M, --multiline,
--u, and --utf-8 options are specific to pcregrep.
-
-
OPTIONS WITH DATA
-
-There are four different ways in which an option with data can be specified.
-If a short form option is used, the data may follow immediately, or in the next
-command line item. For example:
-
- -f/some/file
- -f /some/file
-
-If a long form option is used, the data may appear in the same command line
-item, separated by an equals character, or (with one exception) it may appear
-in the next command line item. For example:
-
- --file=/some/file
- --file /some/file
-
-Note, however, that if you want to supply a file name beginning with ~ as data
-in a shell command, and have the shell expand ~ to a home directory, you must
-separate the file name from the option, because the shell does not treat ~
-specially unless it is at the start of an item.
-
-
-The exception to the above is the --colour (or --color) option,
-for which the data is optional. If this option does have data, it must be given
-in the first form, using an equals character. Otherwise it will be assumed that
-it has no data.
-
-
MATCHING ERRORS
-
-It is possible to supply a regular expression that takes a very long time to
-fail to match certain lines. Such patterns normally involve nested indefinite
-repeats, for example: (a+)*\d when matched against a line of a's with no final
-digit. The PCRE matching function has a resource limit that causes it to abort
-in these circumstances. If this happens, pcregrep outputs an error
-message and the line that caused the problem to the standard error stream. If
-there are more than 20 such errors, pcregrep gives up.
-
-
DIAGNOSTICS
-
-Exit status is 0 if any matches were found, 1 if no matches were found, and 2
-for syntax errors and non-existent or inacessible files (even if matches were
-found in other files) or too many matching errors. Using the -s option to
-suppress error messages about inaccessble files does not affect the return
-code.
-
-
SEE ALSO
-
-pcrepattern(3), pcretest(1).
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 01 March 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcrematching.html b/libs/pcre/doc/html/pcrematching.html
deleted file mode 100644
index 2cad88b061..0000000000
--- a/libs/pcre/doc/html/pcrematching.html
+++ /dev/null
@@ -1,224 +0,0 @@
-
-
-pcrematching specification
-
-
-pcrematching man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
PCRE MATCHING ALGORITHMS
-
-This document describes the two different algorithms that are available in PCRE
-for matching a compiled regular expression against a given subject string. The
-"standard" algorithm is the one provided by the pcre_exec() function.
-This works in the same was as Perl's matching function, and provides a
-Perl-compatible matching operation.
-
-
-An alternative algorithm is provided by the pcre_dfa_exec() function;
-this operates in a different way, and is not Perl-compatible. It has advantages
-and disadvantages compared with the standard algorithm, and these are described
-below.
-
-
-When there is only one possible way in which a given subject string can match a
-pattern, the two algorithms give the same answer. A difference arises, however,
-when there are multiple possibilities. For example, if the pattern
-
- ^<.*>
-
-is matched against the string
-
- <something> <something else> <something further>
-
-there are three possible answers. The standard algorithm finds only one of
-them, whereas the alternative algorithm finds all three.
-
-
REGULAR EXPRESSIONS AS TREES
-
-The set of strings that are matched by a regular expression can be represented
-as a tree structure. An unlimited repetition in the pattern makes the tree of
-infinite size, but it is still a tree. Matching the pattern to a given subject
-string (from a given starting point) can be thought of as a search of the tree.
-There are two ways to search a tree: depth-first and breadth-first, and these
-correspond to the two matching algorithms provided by PCRE.
-
-
THE STANDARD MATCHING ALGORITHM
-
-In the terminology of Jeffrey Friedl's book "Mastering Regular
-Expressions", the standard algorithm is an "NFA algorithm". It conducts a
-depth-first search of the pattern tree. That is, it proceeds along a single
-path through the tree, checking that the subject matches what is required. When
-there is a mismatch, the algorithm tries any alternatives at the current point,
-and if they all fail, it backs up to the previous branch point in the tree, and
-tries the next alternative branch at that level. This often involves backing up
-(moving to the left) in the subject string as well. The order in which
-repetition branches are tried is controlled by the greedy or ungreedy nature of
-the quantifier.
-
-
-If a leaf node is reached, a matching string has been found, and at that point
-the algorithm stops. Thus, if there is more than one possible match, this
-algorithm returns the first one that it finds. Whether this is the shortest,
-the longest, or some intermediate length depends on the way the greedy and
-ungreedy repetition quantifiers are specified in the pattern.
-
-
-Because it ends up with a single path through the tree, it is relatively
-straightforward for this algorithm to keep track of the substrings that are
-matched by portions of the pattern in parentheses. This provides support for
-capturing parentheses and back references.
-
-
THE ALTERNATIVE MATCHING ALGORITHM
-
-This algorithm conducts a breadth-first search of the tree. Starting from the
-first matching point in the subject, it scans the subject string from left to
-right, once, character by character, and as it does this, it remembers all the
-paths through the tree that represent valid matches. In Friedl's terminology,
-this is a kind of "DFA algorithm", though it is not implemented as a
-traditional finite state machine (it keeps multiple states active
-simultaneously).
-
-
-The scan continues until either the end of the subject is reached, or there are
-no more unterminated paths. At this point, terminated paths represent the
-different matching possibilities (if there are none, the match has failed).
-Thus, if there is more than one possible match, this algorithm finds all of
-them, and in particular, it finds the longest. In PCRE, there is an option to
-stop the algorithm after the first match (which is necessarily the shortest)
-has been found.
-
-
-Note that all the matches that are found start at the same point in the
-subject. If the pattern
-
- cat(er(pillar)?)
-
-is matched against the string "the caterpillar catchment", the result will be
-the three strings "cat", "cater", and "caterpillar" that start at the fourth
-character of the subject. The algorithm does not automatically move on to find
-matches that start at later positions.
-
-
-There are a number of features of PCRE regular expressions that are not
-supported by the alternative matching algorithm. They are as follows:
-
-
-1. Because the algorithm finds all possible matches, the greedy or ungreedy
-nature of repetition quantifiers is not relevant. Greedy and ungreedy
-quantifiers are treated in exactly the same way. However, possessive
-quantifiers can make a difference when what follows could also match what is
-quantified, for example in a pattern like this:
-
- ^a++\w!
-
-This pattern matches "aaab!" but not "aaa!", which would be matched by a
-non-possessive quantifier. Similarly, if an atomic group is present, it is
-matched as if it were a standalone pattern at the current point, and the
-longest match is then "locked in" for the rest of the overall pattern.
-
-
-2. When dealing with multiple paths through the tree simultaneously, it is not
-straightforward to keep track of captured substrings for the different matching
-possibilities, and PCRE's implementation of this algorithm does not attempt to
-do this. This means that no captured substrings are available.
-
-
-3. Because no substrings are captured, back references within the pattern are
-not supported, and cause errors if encountered.
-
-
-4. For the same reason, conditional expressions that use a backreference as the
-condition or test for a specific group recursion are not supported.
-
-
-5. Because many paths through the tree may be active, the \K escape sequence,
-which resets the start of the match when encountered (but may be on some paths
-and not on others), is not supported. It causes an error if encountered.
-
-
-6. Callouts are supported, but the value of the capture_top field is
-always 1, and the value of the capture_last field is always -1.
-
-
-7. The \C escape sequence, which (in the standard algorithm) matches a single
-byte, even in UTF-8 mode, is not supported because the alternative algorithm
-moves through the subject string one character at a time, for all active paths
-through the tree.
-
-
-8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
-supported. (*FAIL) is supported, and behaves like a failing negative assertion.
-
-
ADVANTAGES OF THE ALTERNATIVE ALGORITHM
-
-Using the alternative matching algorithm provides the following advantages:
-
-
-1. All possible matches (at a single point in the subject) are automatically
-found, and in particular, the longest match is found. To find more than one
-match using the standard algorithm, you have to do kludgy things with
-callouts.
-
-
-2. There is much better support for partial matching. The restrictions on the
-content of the pattern that apply when using the standard algorithm for partial
-matching do not apply to the alternative algorithm. For non-anchored patterns,
-the starting position of a partial match is available.
-
-
-3. Because the alternative algorithm scans the subject string just once, and
-never needs to backtrack, it is possible to pass very long subject strings to
-the matching function in several pieces, checking for partial matching each
-time.
-
-
DISADVANTAGES OF THE ALTERNATIVE ALGORITHM
-
-The alternative algorithm suffers from a number of disadvantages:
-
-
-1. It is substantially slower than the standard algorithm. This is partly
-because it has to search for all possible matches, but is also because it is
-less susceptible to optimization.
-
-
-2. Capturing parentheses and back references are not supported.
-
-
-3. Although atomic groups are supported, their use does not provide the
-performance advantage that it does for the standard algorithm.
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 19 April 2008
-
-Copyright © 1997-2008 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcrepartial.html b/libs/pcre/doc/html/pcrepartial.html
deleted file mode 100644
index 1fab23c771..0000000000
--- a/libs/pcre/doc/html/pcrepartial.html
+++ /dev/null
@@ -1,242 +0,0 @@
-
-
-pcrepartial specification
-
-
-pcrepartial man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
PARTIAL MATCHING IN PCRE
-
-In normal use of PCRE, if the subject string that is passed to
-pcre_exec() or pcre_dfa_exec() matches as far as it goes, but is
-too short to match the entire pattern, PCRE_ERROR_NOMATCH is returned. There
-are circumstances where it might be helpful to distinguish this case from other
-cases in which there is no match.
-
-
-Consider, for example, an application where a human is required to type in data
-for a field with specific formatting requirements. An example might be a date
-in the form ddmmmyy, defined by this pattern:
-
- ^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
-
-If the application sees the user's keystrokes one by one, and can check that
-what has been typed so far is potentially valid, it is able to raise an error
-as soon as a mistake is made, possibly beeping and not reflecting the
-character that has been typed. This immediate feedback is likely to be a better
-user interface than a check that is delayed until the entire string has been
-entered.
-
-
-PCRE supports the concept of partial matching by means of the PCRE_PARTIAL
-option, which can be set when calling pcre_exec() or
-pcre_dfa_exec(). When this flag is set for pcre_exec(), the return
-code PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if at any time
-during the matching process the last part of the subject string matched part of
-the pattern. Unfortunately, for non-anchored matching, it is not possible to
-obtain the position of the start of the partial match. No captured data is set
-when PCRE_ERROR_PARTIAL is returned.
-
-
-When PCRE_PARTIAL is set for pcre_dfa_exec(), the return code
-PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if the end of the
-subject is reached, there have been no complete matches, but there is still at
-least one matching possibility. The portion of the string that provided the
-partial match is set as the first matching string.
-
-
-Using PCRE_PARTIAL disables one of PCRE's optimizations. PCRE remembers the
-last literal byte in a pattern, and abandons matching immediately if such a
-byte is not present in the subject string. This optimization cannot be used
-for a subject string that might match only partially.
-
-
RESTRICTED PATTERNS FOR PCRE_PARTIAL
-
-Because of the way certain internal optimizations are implemented in the
-pcre_exec() function, the PCRE_PARTIAL option cannot be used with all
-patterns. These restrictions do not apply when pcre_dfa_exec() is used.
-For pcre_exec(), repeated single characters such as
-
- a{2,4}
-
-and repeated single metasequences such as
-
- \d+
-
-are not permitted if the maximum number of occurrences is greater than one.
-Optional items such as \d? (where the maximum is one) are permitted.
-Quantifiers with any values are permitted after parentheses, so the invalid
-examples above can be coded thus:
-
- (a){2,4}
- (\d)+
-
-These constructions run more slowly, but for the kinds of application that are
-envisaged for this facility, this is not felt to be a major restriction.
-
-
-If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
-pcre_exec() returns the error code PCRE_ERROR_BADPARTIAL (-13).
-You can use the PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out
-if a compiled pattern can be used for partial matching.
-
-
EXAMPLE OF PARTIAL MATCHING USING PCRETEST
-
-If the escape sequence \P is present in a pcretest data line, the
-PCRE_PARTIAL flag is used for the match. Here is a run of pcretest that
-uses the date example quoted above:
-
- re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
- data> 25jun04\P
- 0: 25jun04
- 1: jun
- data> 25dec3\P
- Partial match
- data> 3ju\P
- Partial match
- data> 3juj\P
- No match
- data> j\P
- No match
-
-The first data string is matched completely, so pcretest shows the
-matched substrings. The remaining four strings do not match the complete
-pattern, but the first two are partial matches. The same test, using
-pcre_dfa_exec() matching (by means of the \D escape sequence), produces
-the following output:
-
- re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
- data> 25jun04\P\D
- 0: 25jun04
- data> 23dec3\P\D
- Partial match: 23dec3
- data> 3ju\P\D
- Partial match: 3ju
- data> 3juj\P\D
- No match
- data> j\P\D
- No match
-
-Notice that in this case the portion of the string that was matched is made
-available.
-
-
MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()
-
-When a partial match has been found using pcre_dfa_exec(), it is possible
-to continue the match by providing additional subject data and calling
-pcre_dfa_exec() again with the same compiled regular expression, this
-time setting the PCRE_DFA_RESTART option. You must also pass the same working
-space as before, because this is where details of the previous partial match
-are stored. Here is an example using pcretest, using the \R escape
-sequence to set the PCRE_DFA_RESTART option (\P and \D are as above):
-
- re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
- data> 23ja\P\D
- Partial match: 23ja
- data> n05\R\D
- 0: n05
-
-The first call has "23ja" as the subject, and requests partial matching; the
-second call has "n05" as the subject for the continued (restarted) match.
-Notice that when the match is complete, only the last part is shown; PCRE does
-not retain the previously partially-matched string. It is up to the calling
-program to do that if it needs to.
-
-
-You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
-over multiple segments. This facility can be used to pass very long subject
-strings to pcre_dfa_exec(). However, some care is needed for certain
-types of pattern.
-
-
-1. If the pattern contains tests for the beginning or end of a line, you need
-to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropriate, when the
-subject string for any call does not contain the beginning or end of a line.
-
-
-2. If the pattern contains backward assertions (including \b or \B), you need
-to arrange for some overlap in the subject strings to allow for this. For
-example, you could pass the subject in chunks that are 500 bytes long, but in
-a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
-bytes at the start of the buffer.
-
-
-3. Matching a subject string that is split into multiple segments does not
-always produce exactly the same result as matching over one single long string.
-The difference arises when there are multiple matching possibilities, because a
-partial match result is given only when there are no completed matches in a
-call to pcre_dfa_exec(). This means that as soon as the shortest match has
-been found, continuation to a new subject segment is no longer possible.
-Consider this pcretest example:
-
- re> /dog(sbody)?/
- data> do\P\D
- Partial match: do
- data> gsb\R\P\D
- 0: g
- data> dogsbody\D
- 0: dogsbody
- 1: dog
-
-The pattern matches the words "dog" or "dogsbody". When the subject is
-presented in several parts ("do" and "gsb" being the first two) the match stops
-when "dog" has been found, and it is not possible to continue. On the other
-hand, if "dogsbody" is presented as a single string, both matches are found.
-
-
-Because of this phenomenon, it does not usually make sense to end a pattern
-that is going to be matched in this way with a variable repeat.
-
-
-4. Patterns that contain alternatives at the top level which do not all
-start with the same pattern item may not work as expected. For example,
-consider this pattern:
-
- 1234|3789
-
-If the first part of the subject is "ABC123", a partial match of the first
-alternative is found at offset 3. There is no partial match for the second
-alternative, because such a match does not start at the same point in the
-subject string. Attempting to continue with the string "789" does not yield a
-match because only those alternatives that match at one point in the subject
-are remembered. The problem arises because the start of the second alternative
-matches within the first alternative. There is no problem with anchored
-patterns or patterns such as:
-
- 1234|ABCD
-
-where no string can be a partial match for both alternatives.
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 04 June 2007
-
-Copyright © 1997-2007 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcrepattern.html b/libs/pcre/doc/html/pcrepattern.html
deleted file mode 100644
index 5881bc3811..0000000000
--- a/libs/pcre/doc/html/pcrepattern.html
+++ /dev/null
@@ -1,2262 +0,0 @@
-
-
-pcrepattern specification
-
-
-pcrepattern man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
PCRE REGULAR EXPRESSION DETAILS
-
-The syntax and semantics of the regular expressions that are supported by PCRE
-are described in detail below. There is a quick-reference syntax summary in the
-pcresyntax
-page. PCRE tries to match Perl syntax and semantics as closely as it can. PCRE
-also supports some alternative regular expression syntax (which does not
-conflict with the Perl syntax) in order to provide some compatibility with
-regular expressions in Python, .NET, and Oniguruma.
-
-
-Perl's regular expressions are described in its own documentation, and
-regular expressions in general are covered in a number of books, some of which
-have copious examples. Jeffrey Friedl's "Mastering Regular Expressions",
-published by O'Reilly, covers regular expressions in great detail. This
-description of PCRE's regular expressions is intended as reference material.
-
-
-The original operation of PCRE was on strings of one-byte characters. However,
-there is now also support for UTF-8 character strings. To use this, you must
-build PCRE to include UTF-8 support, and then call pcre_compile() with
-the PCRE_UTF8 option. There is also a special sequence that can be given at the
-start of a pattern:
-
- (*UTF8)
-
-Starting a pattern with this sequence is equivalent to setting the PCRE_UTF8
-option. This feature is not Perl-compatible. How setting UTF-8 mode affects
-pattern matching is mentioned in several places below. There is also a summary
-of UTF-8 features in the
-section on UTF-8 support
-in the main
-pcre
-page.
-
-
-The remainder of this document discusses the patterns that are supported by
-PCRE when its main matching function, pcre_exec(), is used.
-From release 6.0, PCRE offers a second matching function,
-pcre_dfa_exec(), which matches using a different algorithm that is not
-Perl-compatible. Some of the features discussed below are not available when
-pcre_dfa_exec() is used. The advantages and disadvantages of the
-alternative function, and how it differs from the normal function, are
-discussed in the
-pcrematching
-page.
-
-
NEWLINE CONVENTIONS
-
-PCRE supports five different conventions for indicating line breaks in
-strings: a single CR (carriage return) character, a single LF (linefeed)
-character, the two-character sequence CRLF, any of the three preceding, or any
-Unicode newline sequence. The
-pcreapi
-page has
-further discussion
-about newlines, and shows how to set the newline convention in the
-options arguments for the compiling and matching functions.
-
-
-It is also possible to specify a newline convention by starting a pattern
-string with one of the following five sequences:
-
- (*CR) carriage return
- (*LF) linefeed
- (*CRLF) carriage return, followed by linefeed
- (*ANYCRLF) any of the three above
- (*ANY) all Unicode newline sequences
-
-These override the default and the options given to pcre_compile(). For
-example, on a Unix system where LF is the default newline sequence, the pattern
-
- (*CR)a.b
-
-changes the convention to CR. That pattern matches "a\nb" because LF is no
-longer a newline. Note that these special settings, which are not
-Perl-compatible, are recognized only at the very start of a pattern, and that
-they must be in upper case. If more than one of them is present, the last one
-is used.
-
-
-The newline convention does not affect what the \R escape sequence matches. By
-default, this is any Unicode newline sequence, for Perl compatibility. However,
-this can be changed; see the description of \R in the section entitled
-"Newline sequences"
-below. A change of \R setting can be combined with a change of newline
-convention.
-
-
CHARACTERS AND METACHARACTERS
-
-A regular expression is a pattern that is matched against a subject string from
-left to right. Most characters stand for themselves in a pattern, and match the
-corresponding characters in the subject. As a trivial example, the pattern
-
- The quick brown fox
-
-matches a portion of a subject string that is identical to itself. When
-caseless matching is specified (the PCRE_CASELESS option), letters are matched
-independently of case. In UTF-8 mode, PCRE always understands the concept of
-case for characters whose values are less than 128, so caseless matching is
-always possible. For characters with higher values, the concept of case is
-supported if PCRE is compiled with Unicode property support, but not otherwise.
-If you want to use caseless matching for characters 128 and above, you must
-ensure that PCRE is compiled with Unicode property support as well as with
-UTF-8 support.
-
-
-The power of regular expressions comes from the ability to include alternatives
-and repetitions in the pattern. These are encoded in the pattern by the use of
-metacharacters, which do not stand for themselves but instead are
-interpreted in some special way.
-
-
-There are two different sets of metacharacters: those that are recognized
-anywhere in the pattern except within square brackets, and those that are
-recognized within square brackets. Outside square brackets, the metacharacters
-are as follows:
-
- \ general escape character with several uses
- ^ assert start of string (or line, in multiline mode)
- $ assert end of string (or line, in multiline mode)
- . match any character except newline (by default)
- [ start character class definition
- | start of alternative branch
- ( start subpattern
- ) end subpattern
- ? extends the meaning of (
- also 0 or 1 quantifier
- also quantifier minimizer
- * 0 or more quantifier
- + 1 or more quantifier
- also "possessive quantifier"
- { start min/max quantifier
-
-Part of a pattern that is in square brackets is called a "character class". In
-a character class the only metacharacters are:
-
- \ general escape character
- ^ negate the class, but only if the first character
- - indicates character range
- [ POSIX character class (only if followed by POSIX syntax)
- ] terminates the character class
-
-The following sections describe the use of each of the metacharacters.
-
-
BACKSLASH
-
-The backslash character has several uses. Firstly, if it is followed by a
-non-alphanumeric character, it takes away any special meaning that character
-may have. This use of backslash as an escape character applies both inside and
-outside character classes.
-
-
-For example, if you want to match a * character, you write \* in the pattern.
-This escaping action applies whether or not the following character would
-otherwise be interpreted as a metacharacter, so it is always safe to precede a
-non-alphanumeric with backslash to specify that it stands for itself. In
-particular, if you want to match a backslash, you write \\.
-
-
-If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
-pattern (other than in a character class) and characters between a # outside
-a character class and the next newline are ignored. An escaping backslash can
-be used to include a whitespace or # character as part of the pattern.
-
-
-If you want to remove the special meaning from a sequence of characters, you
-can do so by putting them between \Q and \E. This is different from Perl in
-that $ and @ are handled as literals in \Q...\E sequences in PCRE, whereas in
-Perl, $ and @ cause variable interpolation. Note the following examples:
-
- Pattern PCRE matches Perl matches
-
- \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz
- \Qabc\$xyz\E abc\$xyz abc\$xyz
- \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
-
-The \Q...\E sequence is recognized both inside and outside character classes.
-
-
-Non-printing characters
-
-
-A second use of backslash provides a way of encoding non-printing characters
-in patterns in a visible manner. There is no restriction on the appearance of
-non-printing characters, apart from the binary zero that terminates a pattern,
-but when a pattern is being prepared by text editing, it is usually easier to
-use one of the following escape sequences than the binary character it
-represents:
-
- \a alarm, that is, the BEL character (hex 07)
- \cx "control-x", where x is any character
- \e escape (hex 1B)
- \f formfeed (hex 0C)
- \n linefeed (hex 0A)
- \r carriage return (hex 0D)
- \t tab (hex 09)
- \ddd character with octal code ddd, or backreference
- \xhh character with hex code hh
- \x{hhh..} character with hex code hhh..
-
-The precise effect of \cx is as follows: if x is a lower case letter, it
-is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
-Thus \cz becomes hex 1A, but \c{ becomes hex 3B, while \c; becomes hex
-7B.
-
-
-After \x, from zero to two hexadecimal digits are read (letters can be in
-upper or lower case). Any number of hexadecimal digits may appear between \x{
-and }, but the value of the character code must be less than 256 in non-UTF-8
-mode, and less than 2**31 in UTF-8 mode. That is, the maximum value in
-hexadecimal is 7FFFFFFF. Note that this is bigger than the largest Unicode code
-point, which is 10FFFF.
-
-
-If characters other than hexadecimal digits appear between \x{ and }, or if
-there is no terminating }, this form of escape is not recognized. Instead, the
-initial \x will be interpreted as a basic hexadecimal escape, with no
-following digits, giving a character whose value is zero.
-
-
-Characters whose value is less than 256 can be defined by either of the two
-syntaxes for \x. There is no difference in the way they are handled. For
-example, \xdc is exactly the same as \x{dc}.
-
-
-After \0 up to two further octal digits are read. If there are fewer than two
-digits, just those that are present are used. Thus the sequence \0\x\07
-specifies two binary zeros followed by a BEL character (code value 7). Make
-sure you supply two digits after the initial zero if the pattern character that
-follows is itself an octal digit.
-
-
-The handling of a backslash followed by a digit other than 0 is complicated.
-Outside a character class, PCRE reads it and any following digits as a decimal
-number. If the number is less than 10, or if there have been at least that many
-previous capturing left parentheses in the expression, the entire sequence is
-taken as a back reference. A description of how this works is given
-later,
-following the discussion of
-parenthesized subpatterns.
-
-
-Inside a character class, or if the decimal number is greater than 9 and there
-have not been that many capturing subpatterns, PCRE re-reads up to three octal
-digits following the backslash, and uses them to generate a data character. Any
-subsequent digits stand for themselves. In non-UTF-8 mode, the value of a
-character specified in octal must be less than \400. In UTF-8 mode, values up
-to \777 are permitted. For example:
-
- \040 is another way of writing a space
- \40 is the same, provided there are fewer than 40 previous capturing subpatterns
- \7 is always a back reference
- \11 might be a back reference, or another way of writing a tab
- \011 is always a tab
- \0113 is a tab followed by the character "3"
- \113 might be a back reference, otherwise the character with octal code 113
- \377 might be a back reference, otherwise the byte consisting entirely of 1 bits
- \81 is either a back reference, or a binary zero followed by the two characters "8" and "1"
-
-Note that octal values of 100 or greater must not be introduced by a leading
-zero, because no more than three octal digits are ever read.
-
-
-All the sequences that define a single character value can be used both inside
-and outside character classes. In addition, inside a character class, the
-sequence \b is interpreted as the backspace character (hex 08), and the
-sequences \R and \X are interpreted as the characters "R" and "X",
-respectively. Outside a character class, these sequences have different
-meanings
-(see below).
-
-
-Absolute and relative back references
-
-
-The sequence \g followed by an unsigned or a negative number, optionally
-enclosed in braces, is an absolute or relative back reference. A named back
-reference can be coded as \g{name}. Back references are discussed
-later,
-following the discussion of
-parenthesized subpatterns.
-
-
-Absolute and relative subroutine calls
-
-
-For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or
-a number enclosed either in angle brackets or single quotes, is an alternative
-syntax for referencing a subpattern as a "subroutine". Details are discussed
-later.
-Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
-synonymous. The former is a back reference; the latter is a subroutine call.
-
-
-Generic character types
-
-
-Another use of backslash is for specifying generic character types. The
-following are always recognized:
-
- \d any decimal digit
- \D any character that is not a decimal digit
- \h any horizontal whitespace character
- \H any character that is not a horizontal whitespace character
- \s any whitespace character
- \S any character that is not a whitespace character
- \v any vertical whitespace character
- \V any character that is not a vertical whitespace character
- \w any "word" character
- \W any "non-word" character
-
-Each pair of escape sequences partitions the complete set of characters into
-two disjoint sets. Any given character matches one, and only one, of each pair.
-
-
-These character type sequences can appear both inside and outside character
-classes. They each match one character of the appropriate type. If the current
-matching point is at the end of the subject string, all of them fail, since
-there is no character to match.
-
-
-For compatibility with Perl, \s does not match the VT character (code 11).
-This makes it different from the the POSIX "space" class. The \s characters
-are HT (9), LF (10), FF (12), CR (13), and space (32). If "use locale;" is
-included in a Perl script, \s may match the VT character. In PCRE, it never
-does.
-
-
-In UTF-8 mode, characters with values greater than 128 never match \d, \s, or
-\w, and always match \D, \S, and \W. This is true even when Unicode
-character property support is available. These sequences retain their original
-meanings from before UTF-8 support was available, mainly for efficiency
-reasons. Note that this also affects \b, because it is defined in terms of \w
-and \W.
-
-
-The sequences \h, \H, \v, and \V are Perl 5.10 features. In contrast to the
-other sequences, these do match certain high-valued codepoints in UTF-8 mode.
-The horizontal space characters are:
-
- U+0009 Horizontal tab
- U+0020 Space
- U+00A0 Non-break space
- U+1680 Ogham space mark
- U+180E Mongolian vowel separator
- U+2000 En quad
- U+2001 Em quad
- U+2002 En space
- U+2003 Em space
- U+2004 Three-per-em space
- U+2005 Four-per-em space
- U+2006 Six-per-em space
- U+2007 Figure space
- U+2008 Punctuation space
- U+2009 Thin space
- U+200A Hair space
- U+202F Narrow no-break space
- U+205F Medium mathematical space
- U+3000 Ideographic space
-
-The vertical space characters are:
-
- U+000A Linefeed
- U+000B Vertical tab
- U+000C Formfeed
- U+000D Carriage return
- U+0085 Next line
- U+2028 Line separator
- U+2029 Paragraph separator
-
-
-
-A "word" character is an underscore or any character less than 256 that is a
-letter or digit. The definition of letters and digits is controlled by PCRE's
-low-valued character tables, and may vary if locale-specific matching is taking
-place (see
-"Locale support"
-in the
-pcreapi
-page). For example, in a French locale such as "fr_FR" in Unix-like systems,
-or "french" in Windows, some character codes greater than 128 are used for
-accented letters, and these are matched by \w. The use of locales with Unicode
-is discouraged.
-
-
-Newline sequences
-
-
-Outside a character class, by default, the escape sequence \R matches any
-Unicode newline sequence. This is a Perl 5.10 feature. In non-UTF-8 mode \R is
-equivalent to the following:
-
- (?>\r\n|\n|\x0b|\f|\r|\x85)
-
-This is an example of an "atomic group", details of which are given
-below.
-This particular group matches either the two-character sequence CR followed by
-LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab,
-U+000B), FF (formfeed, U+000C), CR (carriage return, U+000D), or NEL (next
-line, U+0085). The two-character sequence is treated as a single unit that
-cannot be split.
-
-
-In UTF-8 mode, two additional characters whose codepoints are greater than 255
-are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029).
-Unicode character property support is not needed for these characters to be
-recognized.
-
-
-It is possible to restrict \R to match only CR, LF, or CRLF (instead of the
-complete set of Unicode line endings) by setting the option PCRE_BSR_ANYCRLF
-either at compile time or when the pattern is matched. (BSR is an abbrevation
-for "backslash R".) This can be made the default when PCRE is built; if this is
-the case, the other behaviour can be requested via the PCRE_BSR_UNICODE option.
-It is also possible to specify these settings by starting a pattern string with
-one of the following sequences:
-
- (*BSR_ANYCRLF) CR, LF, or CRLF only
- (*BSR_UNICODE) any Unicode newline sequence
-
-These override the default and the options given to pcre_compile(), but
-they can be overridden by options given to pcre_exec(). Note that these
-special settings, which are not Perl-compatible, are recognized only at the
-very start of a pattern, and that they must be in upper case. If more than one
-of them is present, the last one is used. They can be combined with a change of
-newline convention, for example, a pattern can start with:
-
- (*ANY)(*BSR_ANYCRLF)
-
-Inside a character class, \R matches the letter "R".
-
-
-Unicode character properties
-
-
-When PCRE is built with Unicode character property support, three additional
-escape sequences that match characters with specific properties are available.
-When not in UTF-8 mode, these sequences are of course limited to testing
-characters whose codepoints are less than 256, but they do work in this mode.
-The extra escape sequences are:
-
- \p{xx} a character with the xx property
- \P{xx} a character without the xx property
- \X an extended Unicode sequence
-
-The property names represented by xx above are limited to the Unicode
-script names, the general category properties, and "Any", which matches any
-character (including newline). Other properties such as "InMusicalSymbols" are
-not currently supported by PCRE. Note that \P{Any} does not match any
-characters, so always causes a match failure.
-
-
-Sets of Unicode characters are defined as belonging to certain scripts. A
-character from one of these sets can be matched using a script name. For
-example:
-
- \p{Greek}
- \P{Han}
-
-Those that are not part of an identified script are lumped together as
-"Common". The current list of scripts is:
-
-
-Arabic,
-Armenian,
-Balinese,
-Bengali,
-Bopomofo,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Cherokee,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Greek,
-Gujarati,
-Gurmukhi,
-Han,
-Hangul,
-Hanunoo,
-Hebrew,
-Hiragana,
-Inherited,
-Kannada,
-Katakana,
-Kharoshthi,
-Khmer,
-Lao,
-Latin,
-Limbu,
-Linear_B,
-Malayalam,
-Mongolian,
-Myanmar,
-New_Tai_Lue,
-Nko,
-Ogham,
-Old_Italic,
-Old_Persian,
-Oriya,
-Osmanya,
-Phags_Pa,
-Phoenician,
-Runic,
-Shavian,
-Sinhala,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tamil,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Ugaritic,
-Yi.
-
-
-Each character has exactly one general category property, specified by a
-two-letter abbreviation. For compatibility with Perl, negation can be specified
-by including a circumflex between the opening brace and the property name. For
-example, \p{^Lu} is the same as \P{Lu}.
-
-
-If only one letter is specified with \p or \P, it includes all the general
-category properties that start with that letter. In this case, in the absence
-of negation, the curly brackets in the escape sequence are optional; these two
-examples have the same effect:
-
- \p{L}
- \pL
-
-The following general category property codes are supported:
-
- C Other
- Cc Control
- Cf Format
- Cn Unassigned
- Co Private use
- Cs Surrogate
-
- L Letter
- Ll Lower case letter
- Lm Modifier letter
- Lo Other letter
- Lt Title case letter
- Lu Upper case letter
-
- M Mark
- Mc Spacing mark
- Me Enclosing mark
- Mn Non-spacing mark
-
- N Number
- Nd Decimal number
- Nl Letter number
- No Other number
-
- P Punctuation
- Pc Connector punctuation
- Pd Dash punctuation
- Pe Close punctuation
- Pf Final punctuation
- Pi Initial punctuation
- Po Other punctuation
- Ps Open punctuation
-
- S Symbol
- Sc Currency symbol
- Sk Modifier symbol
- Sm Mathematical symbol
- So Other symbol
-
- Z Separator
- Zl Line separator
- Zp Paragraph separator
- Zs Space separator
-
-The special property L& is also supported: it matches a character that has
-the Lu, Ll, or Lt property, in other words, a letter that is not classified as
-a modifier or "other".
-
-
-The Cs (Surrogate) property applies only to characters in the range U+D800 to
-U+DFFF. Such characters are not valid in UTF-8 strings (see RFC 3629) and so
-cannot be tested by PCRE, unless UTF-8 validity checking has been turned off
-(see the discussion of PCRE_NO_UTF8_CHECK in the
-pcreapi
-page).
-
-
-The long synonyms for these properties that Perl supports (such as \p{Letter})
-are not supported by PCRE, nor is it permitted to prefix any of these
-properties with "Is".
-
-
-No character that is in the Unicode table has the Cn (unassigned) property.
-Instead, this property is assumed for any code point that is not in the
-Unicode table.
-
-
-Specifying caseless matching does not affect these escape sequences. For
-example, \p{Lu} always matches only upper case letters.
-
-
-The \X escape matches any number of Unicode characters that form an extended
-Unicode sequence. \X is equivalent to
-
- (?>\PM\pM*)
-
-That is, it matches a character without the "mark" property, followed by zero
-or more characters with the "mark" property, and treats the sequence as an
-atomic group
-(see below).
-Characters with the "mark" property are typically accents that affect the
-preceding character. None of them have codepoints less than 256, so in
-non-UTF-8 mode \X matches any one character.
-
-
-Matching characters by Unicode property is not fast, because PCRE has to search
-a structure that contains data for over fifteen thousand characters. That is
-why the traditional escape sequences such as \d and \w do not use Unicode
-properties in PCRE.
-
-
-Resetting the match start
-
-
-The escape sequence \K, which is a Perl 5.10 feature, causes any previously
-matched characters not to be included in the final matched sequence. For
-example, the pattern:
-
- foo\Kbar
-
-matches "foobar", but reports that it has matched "bar". This feature is
-similar to a lookbehind assertion
-(described below).
-However, in this case, the part of the subject before the real match does not
-have to be of fixed length, as lookbehind assertions do. The use of \K does
-not interfere with the setting of
-captured substrings.
-For example, when the pattern
-
- (foo)\Kbar
-
-matches "foobar", the first substring is still set to "foo".
-
-
-Simple assertions
-
-
-The final use of backslash is for certain simple assertions. An assertion
-specifies a condition that has to be met at a particular point in a match,
-without consuming any characters from the subject string. The use of
-subpatterns for more complicated assertions is described
-below.
-The backslashed assertions are:
-
- \b matches at a word boundary
- \B matches when not at a word boundary
- \A matches at the start of the subject
- \Z matches at the end of the subject
- also matches before a newline at the end of the subject
- \z matches only at the end of the subject
- \G matches at the first matching position in the subject
-
-These assertions may not appear in character classes (but note that \b has a
-different meaning, namely the backspace character, inside a character class).
-
-
-A word boundary is a position in the subject string where the current character
-and the previous character do not both match \w or \W (i.e. one matches
-\w and the other matches \W), or the start or end of the string if the
-first or last character matches \w, respectively.
-
-
-The \A, \Z, and \z assertions differ from the traditional circumflex and
-dollar (described in the next section) in that they only ever match at the very
-start and end of the subject string, whatever options are set. Thus, they are
-independent of multiline mode. These three assertions are not affected by the
-PCRE_NOTBOL or PCRE_NOTEOL options, which affect only the behaviour of the
-circumflex and dollar metacharacters. However, if the startoffset
-argument of pcre_exec() is non-zero, indicating that matching is to start
-at a point other than the beginning of the subject, \A can never match. The
-difference between \Z and \z is that \Z matches before a newline at the end
-of the string as well as at the very end, whereas \z matches only at the end.
-
-
-The \G assertion is true only when the current matching position is at the
-start point of the match, as specified by the startoffset argument of
-pcre_exec(). It differs from \A when the value of startoffset is
-non-zero. By calling pcre_exec() multiple times with appropriate
-arguments, you can mimic Perl's /g option, and it is in this kind of
-implementation where \G can be useful.
-
-
-Note, however, that PCRE's interpretation of \G, as the start of the current
-match, is subtly different from Perl's, which defines it as the end of the
-previous match. In Perl, these can be different when the previously matched
-string was empty. Because PCRE does just one match at a time, it cannot
-reproduce this behaviour.
-
-
-If all the alternatives of a pattern begin with \G, the expression is anchored
-to the starting match position, and the "anchored" flag is set in the compiled
-regular expression.
-
-
CIRCUMFLEX AND DOLLAR
-
-Outside a character class, in the default matching mode, the circumflex
-character is an assertion that is true only if the current matching point is
-at the start of the subject string. If the startoffset argument of
-pcre_exec() is non-zero, circumflex can never match if the PCRE_MULTILINE
-option is unset. Inside a character class, circumflex has an entirely different
-meaning
-(see below).
-
-
-Circumflex need not be the first character of the pattern if a number of
-alternatives are involved, but it should be the first thing in each alternative
-in which it appears if the pattern is ever to match that branch. If all
-possible alternatives start with a circumflex, that is, if the pattern is
-constrained to match only at the start of the subject, it is said to be an
-"anchored" pattern. (There are also other constructs that can cause a pattern
-to be anchored.)
-
-
-A dollar character is an assertion that is true only if the current matching
-point is at the end of the subject string, or immediately before a newline
-at the end of the string (by default). Dollar need not be the last character of
-the pattern if a number of alternatives are involved, but it should be the last
-item in any branch in which it appears. Dollar has no special meaning in a
-character class.
-
-
-The meaning of dollar can be changed so that it matches only at the very end of
-the string, by setting the PCRE_DOLLAR_ENDONLY option at compile time. This
-does not affect the \Z assertion.
-
-
-The meanings of the circumflex and dollar characters are changed if the
-PCRE_MULTILINE option is set. When this is the case, a circumflex matches
-immediately after internal newlines as well as at the start of the subject
-string. It does not match after a newline that ends the string. A dollar
-matches before any newlines in the string, as well as at the very end, when
-PCRE_MULTILINE is set. When newline is specified as the two-character
-sequence CRLF, isolated CR and LF characters do not indicate newlines.
-
-
-For example, the pattern /^abc$/ matches the subject string "def\nabc" (where
-\n represents a newline) in multiline mode, but not otherwise. Consequently,
-patterns that are anchored in single line mode because all branches start with
-^ are not anchored in multiline mode, and a match for circumflex is possible
-when the startoffset argument of pcre_exec() is non-zero. The
-PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
-
-
-Note that the sequences \A, \Z, and \z can be used to match the start and
-end of the subject in both modes, and if all branches of a pattern start with
-\A it is always anchored, whether or not PCRE_MULTILINE is set.
-
-
FULL STOP (PERIOD, DOT)
-
-Outside a character class, a dot in the pattern matches any one character in
-the subject string except (by default) a character that signifies the end of a
-line. In UTF-8 mode, the matched character may be more than one byte long.
-
-
-When a line ending is defined as a single character, dot never matches that
-character; when the two-character sequence CRLF is used, dot does not match CR
-if it is immediately followed by LF, but otherwise it matches all characters
-(including isolated CRs and LFs). When any Unicode line endings are being
-recognized, dot does not match CR or LF or any of the other line ending
-characters.
-
-
-The behaviour of dot with regard to newlines can be changed. If the PCRE_DOTALL
-option is set, a dot matches any one character, without exception. If the
-two-character sequence CRLF is present in the subject string, it takes two dots
-to match it.
-
-
-The handling of dot is entirely independent of the handling of circumflex and
-dollar, the only relationship being that they both involve newlines. Dot has no
-special meaning in a character class.
-
-
MATCHING A SINGLE BYTE
-
-Outside a character class, the escape sequence \C matches any one byte, both
-in and out of UTF-8 mode. Unlike a dot, it always matches any line-ending
-characters. The feature is provided in Perl in order to match individual bytes
-in UTF-8 mode. Because it breaks up UTF-8 characters into individual bytes,
-what remains in the string may be a malformed UTF-8 string. For this reason,
-the \C escape sequence is best avoided.
-
-
-PCRE does not allow \C to appear in lookbehind assertions
-(described below),
-because in UTF-8 mode this would make it impossible to calculate the length of
-the lookbehind.
-
-
SQUARE BRACKETS AND CHARACTER CLASSES
-
-An opening square bracket introduces a character class, terminated by a closing
-square bracket. A closing square bracket on its own is not special. If a
-closing square bracket is required as a member of the class, it should be the
-first data character in the class (after an initial circumflex, if present) or
-escaped with a backslash.
-
-
-A character class matches a single character in the subject. In UTF-8 mode, the
-character may occupy more than one byte. A matched character must be in the set
-of characters defined by the class, unless the first character in the class
-definition is a circumflex, in which case the subject character must not be in
-the set defined by the class. If a circumflex is actually required as a member
-of the class, ensure it is not the first character, or escape it with a
-backslash.
-
-
-For example, the character class [aeiou] matches any lower case vowel, while
-[^aeiou] matches any character that is not a lower case vowel. Note that a
-circumflex is just a convenient notation for specifying the characters that
-are in the class by enumerating those that are not. A class that starts with a
-circumflex is not an assertion: it still consumes a character from the subject
-string, and therefore it fails if the current pointer is at the end of the
-string.
-
-
-In UTF-8 mode, characters with values greater than 255 can be included in a
-class as a literal string of bytes, or by using the \x{ escaping mechanism.
-
-
-When caseless matching is set, any letters in a class represent both their
-upper case and lower case versions, so for example, a caseless [aeiou] matches
-"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a
-caseful version would. In UTF-8 mode, PCRE always understands the concept of
-case for characters whose values are less than 128, so caseless matching is
-always possible. For characters with higher values, the concept of case is
-supported if PCRE is compiled with Unicode property support, but not otherwise.
-If you want to use caseless matching for characters 128 and above, you must
-ensure that PCRE is compiled with Unicode property support as well as with
-UTF-8 support.
-
-
-Characters that might indicate line breaks are never treated in any special way
-when matching character classes, whatever line-ending sequence is in use, and
-whatever setting of the PCRE_DOTALL and PCRE_MULTILINE options is used. A class
-such as [^a] always matches one of these characters.
-
-
-The minus (hyphen) character can be used to specify a range of characters in a
-character class. For example, [d-m] matches any letter between d and m,
-inclusive. If a minus character is required in a class, it must be escaped with
-a backslash or appear in a position where it cannot be interpreted as
-indicating a range, typically as the first or last character in the class.
-
-
-It is not possible to have the literal character "]" as the end character of a
-range. A pattern such as [W-]46] is interpreted as a class of two characters
-("W" and "-") followed by a literal string "46]", so it would match "W46]" or
-"-46]". However, if the "]" is escaped with a backslash it is interpreted as
-the end of range, so [W-\]46] is interpreted as a class containing a range
-followed by two other characters. The octal or hexadecimal representation of
-"]" can also be used to end a range.
-
-
-Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\000-\037]. In UTF-8
-mode, ranges can include characters whose values are greater than 255, for
-example [\x{100}-\x{2ff}].
-
-
-If a range that includes letters is used when caseless matching is set, it
-matches the letters in either case. For example, [W-c] is equivalent to
-[][\\^_`wxyzabc], matched caselessly, and in non-UTF-8 mode, if character
-tables for a French locale are in use, [\xc8-\xcb] matches accented E
-characters in both cases. In UTF-8 mode, PCRE supports the concept of case for
-characters with values greater than 128 only when it is compiled with Unicode
-property support.
-
-
-The character types \d, \D, \p, \P, \s, \S, \w, and \W may also appear
-in a character class, and add the characters that they match to the class. For
-example, [\dABCDEF] matches any hexadecimal digit. A circumflex can
-conveniently be used with the upper case character types to specify a more
-restricted set of characters than the matching lower case type. For example,
-the class [^\W_] matches any letter or digit, but not underscore.
-
-
-The only metacharacters that are recognized in character classes are backslash,
-hyphen (only where it can be interpreted as specifying a range), circumflex
-(only at the start), opening square bracket (only when it can be interpreted as
-introducing a POSIX class name - see the next section), and the terminating
-closing square bracket. However, escaping other non-alphanumeric characters
-does no harm.
-
-
POSIX CHARACTER CLASSES
-
-Perl supports the POSIX notation for character classes. This uses names
-enclosed by [: and :] within the enclosing square brackets. PCRE also supports
-this notation. For example,
-
- [01[:alpha:]%]
-
-matches "0", "1", any alphabetic character, or "%". The supported class names
-are
-
- alnum letters and digits
- alpha letters
- ascii character codes 0 - 127
- blank space or tab only
- cntrl control characters
- digit decimal digits (same as \d)
- graph printing characters, excluding space
- lower lower case letters
- print printing characters, including space
- punct printing characters, excluding letters and digits
- space white space (not quite the same as \s)
- upper upper case letters
- word "word" characters (same as \w)
- xdigit hexadecimal digits
-
-The "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), and
-space (32). Notice that this list includes the VT character (code 11). This
-makes "space" different to \s, which does not include VT (for Perl
-compatibility).
-
-
-The name "word" is a Perl extension, and "blank" is a GNU extension from Perl
-5.8. Another Perl extension is negation, which is indicated by a ^ character
-after the colon. For example,
-
- [12[:^digit:]]
-
-matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
-syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
-supported, and an error is given if they are encountered.
-
-
-In UTF-8 mode, characters with values greater than 128 do not match any of
-the POSIX character classes.
-
-
VERTICAL BAR
-
-Vertical bar characters are used to separate alternative patterns. For example,
-the pattern
-
- gilbert|sullivan
-
-matches either "gilbert" or "sullivan". Any number of alternatives may appear,
-and an empty alternative is permitted (matching the empty string). The matching
-process tries each alternative in turn, from left to right, and the first one
-that succeeds is used. If the alternatives are within a subpattern
-(defined below),
-"succeeds" means matching the rest of the main pattern as well as the
-alternative in the subpattern.
-
-
INTERNAL OPTION SETTING
-
-The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
-PCRE_EXTENDED options (which are Perl-compatible) can be changed from within
-the pattern by a sequence of Perl option letters enclosed between "(?" and ")".
-The option letters are
-
- i for PCRE_CASELESS
- m for PCRE_MULTILINE
- s for PCRE_DOTALL
- x for PCRE_EXTENDED
-
-For example, (?im) sets caseless, multiline matching. It is also possible to
-unset these options by preceding the letter with a hyphen, and a combined
-setting and unsetting such as (?im-sx), which sets PCRE_CASELESS and
-PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, is also
-permitted. If a letter appears both before and after the hyphen, the option is
-unset.
-
-
-The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA can be
-changed in the same way as the Perl-compatible options by using the characters
-J, U and X respectively.
-
-
-When one of these option changes occurs at top level (that is, not inside
-subpattern parentheses), the change applies to the remainder of the pattern
-that follows. If the change is placed right at the start of a pattern, PCRE
-extracts it into the global options (and it will therefore show up in data
-extracted by the pcre_fullinfo() function).
-
-
-An option change within a subpattern (see below for a description of
-subpatterns) affects only that part of the current pattern that follows it, so
-
- (a(?i)b)c
-
-matches abc and aBc and no other strings (assuming PCRE_CASELESS is not used).
-By this means, options can be made to have different settings in different
-parts of the pattern. Any changes made in one alternative do carry on
-into subsequent branches within the same subpattern. For example,
-
- (a(?i)b|c)
-
-matches "ab", "aB", "c", and "C", even though when matching "C" the first
-branch is abandoned before the option setting. This is because the effects of
-option settings happen at compile time. There would be some very weird
-behaviour otherwise.
-
-
-Note: There are other PCRE-specific options that can be set by the
-application when the compile or match functions are called. In some cases the
-pattern can contain special leading sequences such as (*CRLF) to override what
-the application has set or what has been defaulted. Details are given in the
-section entitled
-"Newline sequences"
-above. There is also the (*UTF8) leading sequence that can be used to set UTF-8
-mode; this is equivalent to setting the PCRE_UTF8 option.
-
-
SUBPATTERNS
-
-Subpatterns are delimited by parentheses (round brackets), which can be nested.
-Turning part of a pattern into a subpattern does two things:
-
-
-1. It localizes a set of alternatives. For example, the pattern
-
- cat(aract|erpillar|)
-
-matches one of the words "cat", "cataract", or "caterpillar". Without the
-parentheses, it would match "cataract", "erpillar" or an empty string.
-
-
-2. It sets up the subpattern as a capturing subpattern. This means that, when
-the whole pattern matches, that portion of the subject string that matched the
-subpattern is passed back to the caller via the ovector argument of
-pcre_exec(). Opening parentheses are counted from left to right (starting
-from 1) to obtain numbers for the capturing subpatterns.
-
-
-For example, if the string "the red king" is matched against the pattern
-
- the ((red|white) (king|queen))
-
-the captured substrings are "red king", "red", and "king", and are numbered 1,
-2, and 3, respectively.
-
-
-The fact that plain parentheses fulfil two functions is not always helpful.
-There are often times when a grouping subpattern is required without a
-capturing requirement. If an opening parenthesis is followed by a question mark
-and a colon, the subpattern does not do any capturing, and is not counted when
-computing the number of any subsequent capturing subpatterns. For example, if
-the string "the white queen" is matched against the pattern
-
- the ((?:red|white) (king|queen))
-
-the captured substrings are "white queen" and "queen", and are numbered 1 and
-2. The maximum number of capturing subpatterns is 65535.
-
-
-As a convenient shorthand, if any option settings are required at the start of
-a non-capturing subpattern, the option letters may appear between the "?" and
-the ":". Thus the two patterns
-
- (?i:saturday|sunday)
- (?:(?i)saturday|sunday)
-
-match exactly the same set of strings. Because alternative branches are tried
-from left to right, and options are not reset until the end of the subpattern
-is reached, an option setting in one branch does affect subsequent branches, so
-the above patterns match "SUNDAY" as well as "Saturday".
-
-
DUPLICATE SUBPATTERN NUMBERS
-
-Perl 5.10 introduced a feature whereby each alternative in a subpattern uses
-the same numbers for its capturing parentheses. Such a subpattern starts with
-(?| and is itself a non-capturing subpattern. For example, consider this
-pattern:
-
- (?|(Sat)ur|(Sun))day
-
-Because the two alternatives are inside a (?| group, both sets of capturing
-parentheses are numbered one. Thus, when the pattern matches, you can look
-at captured substring number one, whichever alternative matched. This construct
-is useful when you want to capture part, but not all, of one of a number of
-alternatives. Inside a (?| group, parentheses are numbered as usual, but the
-number is reset at the start of each branch. The numbers of any capturing
-buffers that follow the subpattern start after the highest number used in any
-branch. The following example is taken from the Perl documentation.
-The numbers underneath show in which buffer the captured content will be
-stored.
-
- # before ---------------branch-reset----------- after
- / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
- # 1 2 2 3 2 3 4
-
-A backreference or a recursive call to a numbered subpattern always refers to
-the first one in the pattern with the given number.
-
-
-An alternative approach to using this "branch reset" feature is to use
-duplicate named subpatterns, as described in the next section.
-
-
NAMED SUBPATTERNS
-
-Identifying capturing parentheses by number is simple, but it can be very hard
-to keep track of the numbers in complicated regular expressions. Furthermore,
-if an expression is modified, the numbers may change. To help with this
-difficulty, PCRE supports the naming of subpatterns. This feature was not
-added to Perl until release 5.10. Python had the feature earlier, and PCRE
-introduced it at release 4.0, using the Python syntax. PCRE now supports both
-the Perl and the Python syntax.
-
-
-In PCRE, a subpattern can be named in one of three ways: (?<name>...) or
-(?'name'...) as in Perl, or (?P<name>...) as in Python. References to capturing
-parentheses from other parts of the pattern, such as
-backreferences,
-recursion,
-and
-conditions,
-can be made by name as well as by number.
-
-
-Names consist of up to 32 alphanumeric characters and underscores. Named
-capturing parentheses are still allocated numbers as well as names, exactly as
-if the names were not present. The PCRE API provides function calls for
-extracting the name-to-number translation table from a compiled pattern. There
-is also a convenience function for extracting a captured substring by name.
-
-
-By default, a name must be unique within a pattern, but it is possible to relax
-this constraint by setting the PCRE_DUPNAMES option at compile time. This can
-be useful for patterns where only one instance of the named parentheses can
-match. Suppose you want to match the name of a weekday, either as a 3-letter
-abbreviation or as the full name, and in both cases you want to extract the
-abbreviation. This pattern (ignoring the line breaks) does the job:
-
- (?<DN>Mon|Fri|Sun)(?:day)?|
- (?<DN>Tue)(?:sday)?|
- (?<DN>Wed)(?:nesday)?|
- (?<DN>Thu)(?:rsday)?|
- (?<DN>Sat)(?:urday)?
-
-There are five capturing substrings, but only one is ever set after a match.
-(An alternative way of solving this problem is to use a "branch reset"
-subpattern, as described in the previous section.)
-
-
-The convenience function for extracting the data by name returns the substring
-for the first (and in this example, the only) subpattern of that name that
-matched. This saves searching to find which numbered subpattern it was. If you
-make a reference to a non-unique named subpattern from elsewhere in the
-pattern, the one that corresponds to the lowest number is used. For further
-details of the interfaces for handling named subpatterns, see the
-pcreapi
-documentation.
-
-
-Warning: You cannot use different names to distinguish between two
-subpatterns with the same number (see the previous section) because PCRE uses
-only the numbers when matching.
-
-
REPETITION
-
-Repetition is specified by quantifiers, which can follow any of the following
-items:
-
- a literal data character
- the dot metacharacter
- the \C escape sequence
- the \X escape sequence (in UTF-8 mode with Unicode properties)
- the \R escape sequence
- an escape such as \d that matches a single character
- a character class
- a back reference (see next section)
- a parenthesized subpattern (unless it is an assertion)
-
-The general repetition quantifier specifies a minimum and maximum number of
-permitted matches, by giving the two numbers in curly brackets (braces),
-separated by a comma. The numbers must be less than 65536, and the first must
-be less than or equal to the second. For example:
-
- z{2,4}
-
-matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
-character. If the second number is omitted, but the comma is present, there is
-no upper limit; if the second number and the comma are both omitted, the
-quantifier specifies an exact number of required matches. Thus
-
- [aeiou]{3,}
-
-matches at least 3 successive vowels, but may match many more, while
-
- \d{8}
-
-matches exactly 8 digits. An opening curly bracket that appears in a position
-where a quantifier is not allowed, or one that does not match the syntax of a
-quantifier, is taken as a literal character. For example, {,6} is not a
-quantifier, but a literal string of four characters.
-
-
-In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to individual
-bytes. Thus, for example, \x{100}{2} matches two UTF-8 characters, each of
-which is represented by a two-byte sequence. Similarly, when Unicode property
-support is available, \X{3} matches three Unicode extended sequences, each of
-which may be several bytes long (and they may be of different lengths).
-
-
-The quantifier {0} is permitted, causing the expression to behave as if the
-previous item and the quantifier were not present. This may be useful for
-subpatterns that are referenced as
-subroutines
-from elsewhere in the pattern. Items other than subpatterns that have a {0}
-quantifier are omitted from the compiled pattern.
-
-
-For convenience, the three most common quantifiers have single-character
-abbreviations:
-
- * is equivalent to {0,}
- + is equivalent to {1,}
- ? is equivalent to {0,1}
-
-It is possible to construct infinite loops by following a subpattern that can
-match no characters with a quantifier that has no upper limit, for example:
-
- (a?)*
-
-Earlier versions of Perl and PCRE used to give an error at compile time for
-such patterns. However, because there are cases where this can be useful, such
-patterns are now accepted, but if any repetition of the subpattern does in fact
-match no characters, the loop is forcibly broken.
-
-
-By default, the quantifiers are "greedy", that is, they match as much as
-possible (up to the maximum number of permitted times), without causing the
-rest of the pattern to fail. The classic example of where this gives problems
-is in trying to match comments in C programs. These appear between /* and */
-and within the comment, individual * and / characters may appear. An attempt to
-match C comments by applying the pattern
-
- /\*.*\*/
-
-to the string
-
- /* first comment */ not comment /* second comment */
-
-fails, because it matches the entire string owing to the greediness of the .*
-item.
-
-
-However, if a quantifier is followed by a question mark, it ceases to be
-greedy, and instead matches the minimum number of times possible, so the
-pattern
-
- /\*.*?\*/
-
-does the right thing with the C comments. The meaning of the various
-quantifiers is not otherwise changed, just the preferred number of matches.
-Do not confuse this use of question mark with its use as a quantifier in its
-own right. Because it has two uses, it can sometimes appear doubled, as in
-
- \d??\d
-
-which matches one digit by preference, but can match two if that is the only
-way the rest of the pattern matches.
-
-
-If the PCRE_UNGREEDY option is set (an option that is not available in Perl),
-the quantifiers are not greedy by default, but individual ones can be made
-greedy by following them with a question mark. In other words, it inverts the
-default behaviour.
-
-
-When a parenthesized subpattern is quantified with a minimum repeat count that
-is greater than 1 or with a limited maximum, more memory is required for the
-compiled pattern, in proportion to the size of the minimum or maximum.
-
-
-If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
-to Perl's /s) is set, thus allowing the dot to match newlines, the pattern is
-implicitly anchored, because whatever follows will be tried against every
-character position in the subject string, so there is no point in retrying the
-overall match at any position after the first. PCRE normally treats such a
-pattern as though it were preceded by \A.
-
-
-In cases where it is known that the subject string contains no newlines, it is
-worth setting PCRE_DOTALL in order to obtain this optimization, or
-alternatively using ^ to indicate anchoring explicitly.
-
-
-However, there is one situation where the optimization cannot be used. When .*
-is inside capturing parentheses that are the subject of a backreference
-elsewhere in the pattern, a match at the start may fail where a later one
-succeeds. Consider, for example:
-
- (.*)abc\1
-
-If the subject is "xyz123abc123" the match point is the fourth character. For
-this reason, such a pattern is not implicitly anchored.
-
-
-When a capturing subpattern is repeated, the value captured is the substring
-that matched the final iteration. For example, after
-
- (tweedle[dume]{3}\s*)+
-
-has matched "tweedledum tweedledee" the value of the captured substring is
-"tweedledee". However, if there are nested capturing subpatterns, the
-corresponding captured values may have been set in previous iterations. For
-example, after
-
- /(a|(b))+/
-
-matches "aba" the value of the second captured substring is "b".
-
-
ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
-
-With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
-repetition, failure of what follows normally causes the repeated item to be
-re-evaluated to see if a different number of repeats allows the rest of the
-pattern to match. Sometimes it is useful to prevent this, either to change the
-nature of the match, or to cause it fail earlier than it otherwise might, when
-the author of the pattern knows there is no point in carrying on.
-
-
-Consider, for example, the pattern \d+foo when applied to the subject line
-
- 123456bar
-
-After matching all 6 digits and then failing to match "foo", the normal
-action of the matcher is to try again with only 5 digits matching the \d+
-item, and then with 4, and so on, before ultimately failing. "Atomic grouping"
-(a term taken from Jeffrey Friedl's book) provides the means for specifying
-that once a subpattern has matched, it is not to be re-evaluated in this way.
-
-
-If we use atomic grouping for the previous example, the matcher gives up
-immediately on failing to match "foo" the first time. The notation is a kind of
-special parenthesis, starting with (?> as in this example:
-
- (?>\d+)foo
-
-This kind of parenthesis "locks up" the part of the pattern it contains once
-it has matched, and a failure further into the pattern is prevented from
-backtracking into it. Backtracking past it to previous items, however, works as
-normal.
-
-
-An alternative description is that a subpattern of this type matches the string
-of characters that an identical standalone pattern would match, if anchored at
-the current point in the subject string.
-
-
-Atomic grouping subpatterns are not capturing subpatterns. Simple cases such as
-the above example can be thought of as a maximizing repeat that must swallow
-everything it can. So, while both \d+ and \d+? are prepared to adjust the
-number of digits they match in order to make the rest of the pattern match,
-(?>\d+) can only match an entire sequence of digits.
-
-
-Atomic groups in general can of course contain arbitrarily complicated
-subpatterns, and can be nested. However, when the subpattern for an atomic
-group is just a single repeated item, as in the example above, a simpler
-notation, called a "possessive quantifier" can be used. This consists of an
-additional + character following a quantifier. Using this notation, the
-previous example can be rewritten as
-
- \d++foo
-
-Note that a possessive quantifier can be used with an entire group, for
-example:
-
- (abc|xyz){2,3}+
-
-Possessive quantifiers are always greedy; the setting of the PCRE_UNGREEDY
-option is ignored. They are a convenient notation for the simpler forms of
-atomic group. However, there is no difference in the meaning of a possessive
-quantifier and the equivalent atomic group, though there may be a performance
-difference; possessive quantifiers should be slightly faster.
-
-
-The possessive quantifier syntax is an extension to the Perl 5.8 syntax.
-Jeffrey Friedl originated the idea (and the name) in the first edition of his
-book. Mike McCloskey liked it, so implemented it when he built Sun's Java
-package, and PCRE copied it from there. It ultimately found its way into Perl
-at release 5.10.
-
-
-PCRE has an optimization that automatically "possessifies" certain simple
-pattern constructs. For example, the sequence A+B is treated as A++B because
-there is no point in backtracking into a sequence of A's when B must follow.
-
-
-When a pattern contains an unlimited repeat inside a subpattern that can itself
-be repeated an unlimited number of times, the use of an atomic group is the
-only way to avoid some failing matches taking a very long time indeed. The
-pattern
-
- (\D+|<\d+>)*[!?]
-
-matches an unlimited number of substrings that either consist of non-digits, or
-digits enclosed in <>, followed by either ! or ?. When it matches, it runs
-quickly. However, if it is applied to
-
- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-
-it takes a long time before reporting failure. This is because the string can
-be divided between the internal \D+ repeat and the external * repeat in a
-large number of ways, and all have to be tried. (The example uses [!?] rather
-than a single character at the end, because both PCRE and Perl have an
-optimization that allows for fast failure when a single character is used. They
-remember the last single character that is required for a match, and fail early
-if it is not present in the string.) If the pattern is changed so that it uses
-an atomic group, like this:
-
- ((?>\D+)|<\d+>)*[!?]
-
-sequences of non-digits cannot be broken, and failure happens quickly.
-
-
BACK REFERENCES
-
-Outside a character class, a backslash followed by a digit greater than 0 (and
-possibly further digits) is a back reference to a capturing subpattern earlier
-(that is, to its left) in the pattern, provided there have been that many
-previous capturing left parentheses.
-
-
-However, if the decimal number following the backslash is less than 10, it is
-always taken as a back reference, and causes an error only if there are not
-that many capturing left parentheses in the entire pattern. In other words, the
-parentheses that are referenced need not be to the left of the reference for
-numbers less than 10. A "forward back reference" of this type can make sense
-when a repetition is involved and the subpattern to the right has participated
-in an earlier iteration.
-
-
-It is not possible to have a numerical "forward back reference" to a subpattern
-whose number is 10 or more using this syntax because a sequence such as \50 is
-interpreted as a character defined in octal. See the subsection entitled
-"Non-printing characters"
-above
-for further details of the handling of digits following a backslash. There is
-no such problem when named parentheses are used. A back reference to any
-subpattern is possible using named parentheses (see below).
-
-
-Another way of avoiding the ambiguity inherent in the use of digits following a
-backslash is to use the \g escape sequence, which is a feature introduced in
-Perl 5.10. This escape must be followed by an unsigned number or a negative
-number, optionally enclosed in braces. These examples are all identical:
-
- (ring), \1
- (ring), \g1
- (ring), \g{1}
-
-An unsigned number specifies an absolute reference without the ambiguity that
-is present in the older syntax. It is also useful when literal digits follow
-the reference. A negative number is a relative reference. Consider this
-example:
-
- (abc(def)ghi)\g{-1}
-
-The sequence \g{-1} is a reference to the most recently started capturing
-subpattern before \g, that is, is it equivalent to \2. Similarly, \g{-2}
-would be equivalent to \1. The use of relative references can be helpful in
-long patterns, and also in patterns that are created by joining together
-fragments that contain references within themselves.
-
-
-A back reference matches whatever actually matched the capturing subpattern in
-the current subject string, rather than anything matching the subpattern
-itself (see
-"Subpatterns as subroutines"
-below for a way of doing that). So the pattern
-
- (sens|respons)e and \1ibility
-
-matches "sense and sensibility" and "response and responsibility", but not
-"sense and responsibility". If caseful matching is in force at the time of the
-back reference, the case of letters is relevant. For example,
-
- ((?i)rah)\s+\1
-
-matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original
-capturing subpattern is matched caselessly.
-
-
-There are several different ways of writing back references to named
-subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or
-\k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's unified
-back reference syntax, in which \g can be used for both numeric and named
-references, is also supported. We could rewrite the above example in any of
-the following ways:
-
- (?<p1>(?i)rah)\s+\k<p1>
- (?'p1'(?i)rah)\s+\k{p1}
- (?P<p1>(?i)rah)\s+(?P=p1)
- (?<p1>(?i)rah)\s+\g{p1}
-
-A subpattern that is referenced by name may appear in the pattern before or
-after the reference.
-
-
-There may be more than one back reference to the same subpattern. If a
-subpattern has not actually been used in a particular match, any back
-references to it always fail. For example, the pattern
-
- (a|(bc))\2
-
-always fails if it starts to match "a" rather than "bc". Because there may be
-many capturing parentheses in a pattern, all digits following the backslash are
-taken as part of a potential back reference number. If the pattern continues
-with a digit character, some delimiter must be used to terminate the back
-reference. If the PCRE_EXTENDED option is set, this can be whitespace.
-Otherwise an empty comment (see
-"Comments"
-below) can be used.
-
-
-A back reference that occurs inside the parentheses to which it refers fails
-when the subpattern is first used, so, for example, (a\1) never matches.
-However, such references can be useful inside repeated subpatterns. For
-example, the pattern
-
- (a|b\1)+
-
-matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
-the subpattern, the back reference matches the character string corresponding
-to the previous iteration. In order for this to work, the pattern must be such
-that the first iteration does not need to match the back reference. This can be
-done using alternation, as in the example above, or by a quantifier with a
-minimum of zero.
-
-
ASSERTIONS
-
-An assertion is a test on the characters following or preceding the current
-matching point that does not actually consume any characters. The simple
-assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described
-above.
-
-
-More complicated assertions are coded as subpatterns. There are two kinds:
-those that look ahead of the current position in the subject string, and those
-that look behind it. An assertion subpattern is matched in the normal way,
-except that it does not cause the current matching position to be changed.
-
-
-Assertion subpatterns are not capturing subpatterns, and may not be repeated,
-because it makes no sense to assert the same thing several times. If any kind
-of assertion contains capturing subpatterns within it, these are counted for
-the purposes of numbering the capturing subpatterns in the whole pattern.
-However, substring capturing is carried out only for positive assertions,
-because it does not make sense for negative assertions.
-
-
-Lookahead assertions
-
-
-Lookahead assertions start with (?= for positive assertions and (?! for
-negative assertions. For example,
-
- \w+(?=;)
-
-matches a word followed by a semicolon, but does not include the semicolon in
-the match, and
-
- foo(?!bar)
-
-matches any occurrence of "foo" that is not followed by "bar". Note that the
-apparently similar pattern
-
- (?!foo)bar
-
-does not find an occurrence of "bar" that is preceded by something other than
-"foo"; it finds any occurrence of "bar" whatsoever, because the assertion
-(?!foo) is always true when the next three characters are "bar". A
-lookbehind assertion is needed to achieve the other effect.
-
-
-If you want to force a matching failure at some point in a pattern, the most
-convenient way to do it is with (?!) because an empty string always matches, so
-an assertion that requires there not to be an empty string must always fail.
-
-
-Lookbehind assertions
-
-
-Lookbehind assertions start with (?<= for positive assertions and (?<! for
-negative assertions. For example,
-
- (?<!foo)bar
-
-does find an occurrence of "bar" that is not preceded by "foo". The contents of
-a lookbehind assertion are restricted such that all the strings it matches must
-have a fixed length. However, if there are several top-level alternatives, they
-do not all have to have the same fixed length. Thus
-
- (?<=bullock|donkey)
-
-is permitted, but
-
- (?<!dogs?|cats?)
-
-causes an error at compile time. Branches that match different length strings
-are permitted only at the top level of a lookbehind assertion. This is an
-extension compared with Perl (at least for 5.8), which requires all branches to
-match the same length of string. An assertion such as
-
- (?<=ab(c|de))
-
-is not permitted, because its single top-level branch can match two different
-lengths, but it is acceptable if rewritten to use two top-level branches:
-
- (?<=abc|abde)
-
-In some cases, the Perl 5.10 escape sequence \K
-(see above)
-can be used instead of a lookbehind assertion; this is not restricted to a
-fixed-length.
-
-
-The implementation of lookbehind assertions is, for each alternative, to
-temporarily move the current position back by the fixed length and then try to
-match. If there are insufficient characters before the current position, the
-assertion fails.
-
-
-PCRE does not allow the \C escape (which matches a single byte in UTF-8 mode)
-to appear in lookbehind assertions, because it makes it impossible to calculate
-the length of the lookbehind. The \X and \R escapes, which can match
-different numbers of bytes, are also not permitted.
-
-
-Possessive quantifiers can be used in conjunction with lookbehind assertions to
-specify efficient matching at the end of the subject string. Consider a simple
-pattern such as
-
- abcd$
-
-when applied to a long string that does not match. Because matching proceeds
-from left to right, PCRE will look for each "a" in the subject and then see if
-what follows matches the rest of the pattern. If the pattern is specified as
-
- ^.*abcd$
-
-the initial .* matches the entire string at first, but when this fails (because
-there is no following "a"), it backtracks to match all but the last character,
-then all but the last two characters, and so on. Once again the search for "a"
-covers the entire string, from right to left, so we are no better off. However,
-if the pattern is written as
-
- ^.*+(?<=abcd)
-
-there can be no backtracking for the .*+ item; it can match only the entire
-string. The subsequent lookbehind assertion does a single test on the last four
-characters. If it fails, the match fails immediately. For long strings, this
-approach makes a significant difference to the processing time.
-
-
-Using multiple assertions
-
-
-Several assertions (of any sort) may occur in succession. For example,
-
- (?<=\d{3})(?<!999)foo
-
-matches "foo" preceded by three digits that are not "999". Notice that each of
-the assertions is applied independently at the same point in the subject
-string. First there is a check that the previous three characters are all
-digits, and then there is a check that the same three characters are not "999".
-This pattern does not match "foo" preceded by six characters, the first
-of which are digits and the last three of which are not "999". For example, it
-doesn't match "123abcfoo". A pattern to do that is
-
- (?<=\d{3}...)(?<!999)foo
-
-This time the first assertion looks at the preceding six characters, checking
-that the first three are digits, and then the second assertion checks that the
-preceding three characters are not "999".
-
-
-Assertions can be nested in any combination. For example,
-
- (?<=(?<!foo)bar)baz
-
-matches an occurrence of "baz" that is preceded by "bar" which in turn is not
-preceded by "foo", while
-
- (?<=\d{3}(?!999)...)foo
-
-is another pattern that matches "foo" preceded by three digits and any three
-characters that are not "999".
-
-
CONDITIONAL SUBPATTERNS
-
-It is possible to cause the matching process to obey a subpattern
-conditionally or to choose between two alternative subpatterns, depending on
-the result of an assertion, or whether a previous capturing subpattern matched
-or not. The two possible forms of conditional subpattern are
-
- (?(condition)yes-pattern)
- (?(condition)yes-pattern|no-pattern)
-
-If the condition is satisfied, the yes-pattern is used; otherwise the
-no-pattern (if present) is used. If there are more than two alternatives in the
-subpattern, a compile-time error occurs.
-
-
-There are four kinds of condition: references to subpatterns, references to
-recursion, a pseudo-condition called DEFINE, and assertions.
-
-
-Checking for a used subpattern by number
-
-
-If the text between the parentheses consists of a sequence of digits, the
-condition is true if the capturing subpattern of that number has previously
-matched. An alternative notation is to precede the digits with a plus or minus
-sign. In this case, the subpattern number is relative rather than absolute.
-The most recently opened parentheses can be referenced by (?(-1), the next most
-recent by (?(-2), and so on. In looping constructs it can also make sense to
-refer to subsequent groups with constructs such as (?(+2).
-
-
-Consider the following pattern, which contains non-significant white space to
-make it more readable (assume the PCRE_EXTENDED option) and to divide it into
-three parts for ease of discussion:
-
- ( \( )? [^()]+ (?(1) \) )
-
-The first part matches an optional opening parenthesis, and if that
-character is present, sets it as the first captured substring. The second part
-matches one or more characters that are not parentheses. The third part is a
-conditional subpattern that tests whether the first set of parentheses matched
-or not. If they did, that is, if subject started with an opening parenthesis,
-the condition is true, and so the yes-pattern is executed and a closing
-parenthesis is required. Otherwise, since no-pattern is not present, the
-subpattern matches nothing. In other words, this pattern matches a sequence of
-non-parentheses, optionally enclosed in parentheses.
-
-
-If you were embedding this pattern in a larger one, you could use a relative
-reference:
-
- ...other stuff... ( \( )? [^()]+ (?(-1) \) ) ...
-
-This makes the fragment independent of the parentheses in the larger pattern.
-
-
-Checking for a used subpattern by name
-
-
-Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used
-subpattern by name. For compatibility with earlier versions of PCRE, which had
-this facility before Perl, the syntax (?(name)...) is also recognized. However,
-there is a possible ambiguity with this syntax, because subpattern names may
-consist entirely of digits. PCRE looks first for a named subpattern; if it
-cannot find one and the name consists entirely of digits, PCRE looks for a
-subpattern of that number, which must be greater than zero. Using subpattern
-names that consist entirely of digits is not recommended.
-
-
-Rewriting the above example to use a named subpattern gives this:
-
- (?<OPEN> \( )? [^()]+ (?(<OPEN>) \) )
-
-
-
-
-Checking for pattern recursion
-
-
-If the condition is the string (R), and there is no subpattern with the name R,
-the condition is true if a recursive call to the whole pattern or any
-subpattern has been made. If digits or a name preceded by ampersand follow the
-letter R, for example:
-
- (?(R3)...) or (?(R&name)...)
-
-the condition is true if the most recent recursion is into the subpattern whose
-number or name is given. This condition does not check the entire recursion
-stack.
-
-
-At "top level", all these recursion test conditions are false. Recursive
-patterns are described below.
-
-
-Defining subpatterns for use by reference only
-
-
-If the condition is the string (DEFINE), and there is no subpattern with the
-name DEFINE, the condition is always false. In this case, there may be only one
-alternative in the subpattern. It is always skipped if control reaches this
-point in the pattern; the idea of DEFINE is that it can be used to define
-"subroutines" that can be referenced from elsewhere. (The use of "subroutines"
-is described below.) For example, a pattern to match an IPv4 address could be
-written like this (ignore whitespace and line breaks):
-
- (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
- \b (?&byte) (\.(?&byte)){3} \b
-
-The first part of the pattern is a DEFINE group inside which a another group
-named "byte" is defined. This matches an individual component of an IPv4
-address (a number less than 256). When matching takes place, this part of the
-pattern is skipped because DEFINE acts like a false condition.
-
-
-The rest of the pattern uses references to the named group to match the four
-dot-separated components of an IPv4 address, insisting on a word boundary at
-each end.
-
-
-Assertion conditions
-
-
-If the condition is not in any of the above formats, it must be an assertion.
-This may be a positive or negative lookahead or lookbehind assertion. Consider
-this pattern, again containing non-significant white space, and with the two
-alternatives on the second line:
-
- (?(?=[^a-z]*[a-z])
- \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
-
-The condition is a positive lookahead assertion that matches an optional
-sequence of non-letters followed by a letter. In other words, it tests for the
-presence of at least one letter in the subject. If a letter is found, the
-subject is matched against the first alternative; otherwise it is matched
-against the second. This pattern matches strings in one of the two forms
-dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
-
-
COMMENTS
-
-The sequence (?# marks the start of a comment that continues up to the next
-closing parenthesis. Nested parentheses are not permitted. The characters
-that make up a comment play no part in the pattern matching at all.
-
-
-If the PCRE_EXTENDED option is set, an unescaped # character outside a
-character class introduces a comment that continues to immediately after the
-next newline in the pattern.
-
-
RECURSIVE PATTERNS
-
-Consider the problem of matching a string in parentheses, allowing for
-unlimited nested parentheses. Without the use of recursion, the best that can
-be done is to use a pattern that matches up to some fixed depth of nesting. It
-is not possible to handle an arbitrary nesting depth.
-
-
-For some time, Perl has provided a facility that allows regular expressions to
-recurse (amongst other things). It does this by interpolating Perl code in the
-expression at run time, and the code can refer to the expression itself. A Perl
-pattern using code interpolation to solve the parentheses problem can be
-created like this:
-
- $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
-
-The (?p{...}) item interpolates Perl code at run time, and in this case refers
-recursively to the pattern in which it appears.
-
-
-Obviously, PCRE cannot support the interpolation of Perl code. Instead, it
-supports special syntax for recursion of the entire pattern, and also for
-individual subpattern recursion. After its introduction in PCRE and Python,
-this kind of recursion was introduced into Perl at release 5.10.
-
-
-A special item that consists of (? followed by a number greater than zero and a
-closing parenthesis is a recursive call of the subpattern of the given number,
-provided that it occurs inside that subpattern. (If not, it is a "subroutine"
-call, which is described in the next section.) The special item (?R) or (?0) is
-a recursive call of the entire regular expression.
-
-
-In PCRE (like Python, but unlike Perl), a recursive subpattern call is always
-treated as an atomic group. That is, once it has matched some of the subject
-string, it is never re-entered, even if it contains untried alternatives and
-there is a subsequent matching failure.
-
-
-This PCRE pattern solves the nested parentheses problem (assume the
-PCRE_EXTENDED option is set so that white space is ignored):
-
- \( ( (?>[^()]+) | (?R) )* \)
-
-First it matches an opening parenthesis. Then it matches any number of
-substrings which can either be a sequence of non-parentheses, or a recursive
-match of the pattern itself (that is, a correctly parenthesized substring).
-Finally there is a closing parenthesis.
-
-
-If this were part of a larger pattern, you would not want to recurse the entire
-pattern, so instead you could use this:
-
- ( \( ( (?>[^()]+) | (?1) )* \) )
-
-We have put the pattern into parentheses, and caused the recursion to refer to
-them instead of the whole pattern.
-
-
-In a larger pattern, keeping track of parenthesis numbers can be tricky. This
-is made easier by the use of relative references. (A Perl 5.10 feature.)
-Instead of (?1) in the pattern above you can write (?-2) to refer to the second
-most recently opened parentheses preceding the recursion. In other words, a
-negative number counts capturing parentheses leftwards from the point at which
-it is encountered.
-
-
-It is also possible to refer to subsequently opened parentheses, by writing
-references such as (?+2). However, these cannot be recursive because the
-reference is not inside the parentheses that are referenced. They are always
-"subroutine" calls, as described in the next section.
-
-
-An alternative approach is to use named parentheses instead. The Perl syntax
-for this is (?&name); PCRE's earlier syntax (?P>name) is also supported. We
-could rewrite the above example as follows:
-
- (?<pn> \( ( (?>[^()]+) | (?&pn) )* \) )
-
-If there is more than one subpattern with the same name, the earliest one is
-used.
-
-
-This particular example pattern that we have been looking at contains nested
-unlimited repeats, and so the use of atomic grouping for matching strings of
-non-parentheses is important when applying the pattern to strings that do not
-match. For example, when this pattern is applied to
-
- (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
-
-it yields "no match" quickly. However, if atomic grouping is not used,
-the match runs for a very long time indeed because there are so many different
-ways the + and * repeats can carve up the subject, and all have to be tested
-before failure can be reported.
-
-
-At the end of a match, the values set for any capturing subpatterns are those
-from the outermost level of the recursion at which the subpattern value is set.
-If you want to obtain intermediate values, a callout function can be used (see
-below and the
-pcrecallout
-documentation). If the pattern above is matched against
-
- (ab(cd)ef)
-
-the value for the capturing parentheses is "ef", which is the last value taken
-on at the top level. If additional parentheses are added, giving
-
- \( ( ( (?>[^()]+) | (?R) )* ) \)
- ^ ^
- ^ ^
-
-the string they capture is "ab(cd)ef", the contents of the top level
-parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
-has to obtain extra memory to store data during a recursion, which it does by
-using pcre_malloc, freeing it via pcre_free afterwards. If no
-memory can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error.
-
-
-Do not confuse the (?R) item with the condition (R), which tests for recursion.
-Consider this pattern, which matches text in angle brackets, allowing for
-arbitrary nesting. Only digits are allowed in nested brackets (that is, when
-recursing), whereas any characters are permitted at the outer level.
-
- < (?: (?(R) \d++ | [^<>]*+) | (?R)) * >
-
-In this pattern, (?(R) is the start of a conditional subpattern, with two
-different alternatives for the recursive and non-recursive cases. The (?R) item
-is the actual recursive call.
-
-
SUBPATTERNS AS SUBROUTINES
-
-If the syntax for a recursive subpattern reference (either by number or by
-name) is used outside the parentheses to which it refers, it operates like a
-subroutine in a programming language. The "called" subpattern may be defined
-before or after the reference. A numbered reference can be absolute or
-relative, as in these examples:
-
- (...(absolute)...)...(?2)...
- (...(relative)...)...(?-1)...
- (...(?+1)...(relative)...
-
-An earlier example pointed out that the pattern
-
- (sens|respons)e and \1ibility
-
-matches "sense and sensibility" and "response and responsibility", but not
-"sense and responsibility". If instead the pattern
-
- (sens|respons)e and (?1)ibility
-
-is used, it does match "sense and responsibility" as well as the other two
-strings. Another example is given in the discussion of DEFINE above.
-
-
-Like recursive subpatterns, a "subroutine" call is always treated as an atomic
-group. That is, once it has matched some of the subject string, it is never
-re-entered, even if it contains untried alternatives and there is a subsequent
-matching failure.
-
-
-When a subpattern is used as a subroutine, processing options such as
-case-independence are fixed when the subpattern is defined. They cannot be
-changed for different calls. For example, consider this pattern:
-
- (abc)(?i:(?-1))
-
-It matches "abcabc". It does not match "abcABC" because the change of
-processing option does not affect the called subpattern.
-
-
ONIGURUMA SUBROUTINE SYNTAX
-
-For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or
-a number enclosed either in angle brackets or single quotes, is an alternative
-syntax for referencing a subpattern as a subroutine, possibly recursively. Here
-are two of the examples used above, rewritten using this syntax:
-
- (?<pn> \( ( (?>[^()]+) | \g<pn> )* \) )
- (sens|respons)e and \g'1'ibility
-
-PCRE supports an extension to Oniguruma: if a number is preceded by a
-plus or a minus sign it is taken as a relative reference. For example:
-
- (abc)(?i:\g<-1>)
-
-Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
-synonymous. The former is a back reference; the latter is a subroutine call.
-
-
CALLOUTS
-
-Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl
-code to be obeyed in the middle of matching a regular expression. This makes it
-possible, amongst other things, to extract different substrings that match the
-same pair of parentheses when there is a repetition.
-
-
-PCRE provides a similar feature, but of course it cannot obey arbitrary Perl
-code. The feature is called "callout". The caller of PCRE provides an external
-function by putting its entry point in the global variable pcre_callout.
-By default, this variable contains NULL, which disables all calling out.
-
-
-Within a regular expression, (?C) indicates the points at which the external
-function is to be called. If you want to identify different callout points, you
-can put a number less than 256 after the letter C. The default value is zero.
-For example, this pattern has two callout points:
-
- (?C1)abc(?C2)def
-
-If the PCRE_AUTO_CALLOUT flag is passed to pcre_compile(), callouts are
-automatically installed before each item in the pattern. They are all numbered
-255.
-
-
-During matching, when PCRE reaches a callout point (and pcre_callout is
-set), the external function is called. It is provided with the number of the
-callout, the position in the pattern, and, optionally, one item of data
-originally supplied by the caller of pcre_exec(). The callout function
-may cause matching to proceed, to backtrack, or to fail altogether. A complete
-description of the interface to the callout function is given in the
-pcrecallout
-documentation.
-
-
BACKTRACKING CONTROL
-
-Perl 5.10 introduced a number of "Special Backtracking Control Verbs", which
-are described in the Perl documentation as "experimental and subject to change
-or removal in a future version of Perl". It goes on to say: "Their usage in
-production code should be noted to avoid problems during upgrades." The same
-remarks apply to the PCRE features described in this section.
-
-
-Since these verbs are specifically related to backtracking, most of them can be
-used only when the pattern is to be matched using pcre_exec(), which uses
-a backtracking algorithm. With the exception of (*FAIL), which behaves like a
-failing negative assertion, they cause an error if encountered by
-pcre_dfa_exec().
-
-
-The new verbs make use of what was previously invalid syntax: an opening
-parenthesis followed by an asterisk. In Perl, they are generally of the form
-(*VERB:ARG) but PCRE does not support the use of arguments, so its general
-form is just (*VERB). Any number of these verbs may occur in a pattern. There
-are two kinds:
-
-
-Verbs that act immediately
-
-
-The following verbs act as soon as they are encountered:
-
- (*ACCEPT)
-
-This verb causes the match to end successfully, skipping the remainder of the
-pattern. When inside a recursion, only the innermost pattern is ended
-immediately. PCRE differs from Perl in what happens if the (*ACCEPT) is inside
-capturing parentheses. In Perl, the data so far is captured: in PCRE no data is
-captured. For example:
-
- A(A|B(*ACCEPT)|C)D
-
-This matches "AB", "AAD", or "ACD", but when it matches "AB", no data is
-captured.
-
- (*FAIL) or (*F)
-
-This verb causes the match to fail, forcing backtracking to occur. It is
-equivalent to (?!) but easier to read. The Perl documentation notes that it is
-probably useful only when combined with (?{}) or (??{}). Those are, of course,
-Perl features that are not present in PCRE. The nearest equivalent is the
-callout feature, as for example in this pattern:
-
- a+(?C)(*FAIL)
-
-A match with the string "aaaa" always fails, but the callout is taken before
-each backtrack happens (in this example, 10 times).
-
-
-Verbs that act after backtracking
-
-
-The following verbs do nothing when they are encountered. Matching continues
-with what follows, but if there is no subsequent match, a failure is forced.
-The verbs differ in exactly what kind of failure occurs.
-
- (*COMMIT)
-
-This verb causes the whole match to fail outright if the rest of the pattern
-does not match. Even if the pattern is unanchored, no further attempts to find
-a match by advancing the start point take place. Once (*COMMIT) has been
-passed, pcre_exec() is committed to finding a match at the current
-starting point, or not at all. For example:
-
- a+(*COMMIT)b
-
-This matches "xxaab" but not "aacaab". It can be thought of as a kind of
-dynamic anchor, or "I've started, so I must finish."
-
- (*PRUNE)
-
-This verb causes the match to fail at the current position if the rest of the
-pattern does not match. If the pattern is unanchored, the normal "bumpalong"
-advance to the next starting character then happens. Backtracking can occur as
-usual to the left of (*PRUNE), or when matching to the right of (*PRUNE), but
-if there is no match to the right, backtracking cannot cross (*PRUNE).
-In simple cases, the use of (*PRUNE) is just an alternative to an atomic
-group or possessive quantifier, but there are some uses of (*PRUNE) that cannot
-be expressed in any other way.
-
- (*SKIP)
-
-This verb is like (*PRUNE), except that if the pattern is unanchored, the
-"bumpalong" advance is not to the next character, but to the position in the
-subject where (*SKIP) was encountered. (*SKIP) signifies that whatever text
-was matched leading up to it cannot be part of a successful match. Consider:
-
- a+(*SKIP)b
-
-If the subject is "aaaac...", after the first match attempt fails (starting at
-the first character in the string), the starting point skips on to start the
-next attempt at "c". Note that a possessive quantifer does not have the same
-effect in this example; although it would suppress backtracking during the
-first match attempt, the second attempt would start at the second character
-instead of skipping on to "c".
-
- (*THEN)
-
-This verb causes a skip to the next alternation if the rest of the pattern does
-not match. That is, it cancels pending backtracking, but only within the
-current alternation. Its name comes from the observation that it can be used
-for a pattern-based if-then-else block:
-
- ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
-
-If the COND1 pattern matches, FOO is tried (and possibly further items after
-the end of the group if FOO succeeds); on failure the matcher skips to the
-second alternative and tries COND2, without backtracking into COND1. If (*THEN)
-is used outside of any alternation, it acts exactly like (*PRUNE).
-
-
SEE ALSO
-
-pcreapi(3), pcrecallout(3), pcrematching(3), pcre(3).
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 11 April 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcreperform.html b/libs/pcre/doc/html/pcreperform.html
deleted file mode 100644
index 41d893d706..0000000000
--- a/libs/pcre/doc/html/pcreperform.html
+++ /dev/null
@@ -1,173 +0,0 @@
-
-
-pcreperform specification
-
-
-pcreperform man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-PCRE PERFORMANCE
-
-
-Two aspects of performance are discussed below: memory usage and processing
-time. The way you express your pattern as a regular expression can affect both
-of them.
-
-
-MEMORY USAGE
-
-
-Patterns are compiled by PCRE into a reasonably efficient byte code, so that
-most simple patterns do not use much memory. However, there is one case where
-memory usage can be unexpectedly large. When a parenthesized subpattern has a
-quantifier with a minimum greater than 1 and/or a limited maximum, the whole
-subpattern is repeated in the compiled code. For example, the pattern
-
- (abc|def){2,4}
-
-is compiled as if it were
-
- (abc|def)(abc|def)((abc|def)(abc|def)?)?
-
-(Technical aside: It is done this way so that backtrack points within each of
-the repetitions can be independently maintained.)
-
-
-For regular expressions whose quantifiers use only small numbers, this is not
-usually a problem. However, if the numbers are large, and particularly if such
-repetitions are nested, the memory usage can become an embarrassment. For
-example, the very simple pattern
-
- ((ab){1,1000}c){1,3}
-
-uses 51K bytes when compiled. When PCRE is compiled with its default internal
-pointer size of two bytes, the size limit on a compiled pattern is 64K, and
-this is reached with the above pattern if the outer repetition is increased
-from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
-handle larger compiled patterns, but it is better to try to rewrite your
-pattern to use less memory if you can.
-
-
-One way of reducing the memory usage for such patterns is to make use of PCRE's
-"subroutine"
-facility. Re-writing the above pattern as
-
- ((ab)(?2){0,999}c)(?1){0,2}
-
-reduces the memory requirements to 18K, and indeed it remains under 20K even
-with the outer repetition increased to 100. However, this pattern is not
-exactly equivalent, because the "subroutine" calls are treated as
-atomic groups
-into which there can be no backtracking if there is a subsequent matching
-failure. Therefore, PCRE cannot do this kind of rewriting automatically.
-Furthermore, there is a noticeable loss of speed when executing the modified
-pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
-speed is acceptable, this kind of rewriting will allow you to process patterns
-that PCRE cannot otherwise handle.
-
-
-PROCESSING TIME
-
-
-Certain items in regular expression patterns are processed more efficiently
-than others. It is more efficient to use a character class like [aeiou] than a
-set of single-character alternatives such as (a|e|i|o|u). In general, the
-simplest construction that provides the required behaviour is usually the most
-efficient. Jeffrey Friedl's book contains a lot of useful general discussion
-about optimizing regular expressions for efficient performance. This document
-contains a few observations about PCRE.
-
-
-Using Unicode character properties (the \p, \P, and \X escapes) is slow,
-because PCRE has to scan a structure that contains data for over fifteen
-thousand characters whenever it needs a character's property. If you can find
-an alternative pattern that does not use character properties, it will probably
-be faster.
-
-
-When a pattern begins with .* not in parentheses, or in parentheses that are
-not the subject of a backreference, and the PCRE_DOTALL option is set, the
-pattern is implicitly anchored by PCRE, since it can match only at the start of
-a subject string. However, if PCRE_DOTALL is not set, PCRE cannot make this
-optimization, because the . metacharacter does not then match a newline, and if
-the subject string contains newlines, the pattern may match from the character
-immediately following one of them instead of from the very start. For example,
-the pattern
-
- .*second
-
-matches the subject "first\nand second" (where \n stands for a newline
-character), with the match starting at the seventh character. In order to do
-this, PCRE has to retry the match starting after every newline in the subject.
-
-
-If you are using such a pattern with subject strings that do not contain
-newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
-the pattern with ^.* or ^.*? to indicate explicit anchoring. That saves PCRE
-from having to scan along the subject looking for a newline to restart at.
-
-
-Beware of patterns that contain nested indefinite repeats. These can take a
-long time to run when applied to a string that does not match. Consider the
-pattern fragment
-
- ^(a+)*
-
-This can match "aaaa" in 16 different ways, and this number increases very
-rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
-times, and for each of those cases other than 0 or 4, the + repeats can match
-different numbers of times.) When the remainder of the pattern is such that the
-entire match is going to fail, PCRE has in principle to try every possible
-variation, and this can take an extremely long time, even for relatively short
-strings.
-
-
-An optimization catches some of the more simple cases such as
-
- (a+)*b
-
-where a literal character follows. Before embarking on the standard matching
-procedure, PCRE checks that there is a "b" later in the subject string, and if
-there is not, it fails the match immediately. However, when there is no
-following literal this optimization cannot be used. You can see the difference
-by comparing the behaviour of
-
- (a+)*\d
-
-with the pattern above. The former gives a failure almost instantly when
-applied to a whole line of "a" characters, whereas the latter takes an
-appreciable time with strings longer than about 20 characters.
-
-
-In many cases, the solution to this kind of performance issue is to use an
-atomic group or a possessive quantifier.
-
-
-AUTHOR
-
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
-REVISION
-
-
-Last updated: 06 March 2007
-
-Copyright © 1997-2007 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcreposix.html b/libs/pcre/doc/html/pcreposix.html
deleted file mode 100644
index 3a35664f7e..0000000000
--- a/libs/pcre/doc/html/pcreposix.html
+++ /dev/null
@@ -1,266 +0,0 @@
-
-
-pcreposix specification
-
-
-pcreposix man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
SYNOPSIS OF POSIX API
-
-#include <pcreposix.h>
-
-
-int regcomp(regex_t *preg, const char *pattern,
-int cflags);
-
-
-int regexec(regex_t *preg, const char *string,
-size_t nmatch, regmatch_t pmatch[], int eflags);
-
-
-size_t regerror(int errcode, const regex_t *preg,
-char *errbuf, size_t errbuf_size);
-
-
-void regfree(regex_t *preg);
-
-
DESCRIPTION
-
-This set of functions provides a POSIX-style API to the PCRE regular expression
-package. See the
-pcreapi
-documentation for a description of PCRE's native API, which contains much
-additional functionality.
-
-
-The functions described here are just wrapper functions that ultimately call
-the PCRE native API. Their prototypes are defined in the pcreposix.h
-header file, and on Unix systems the library itself is called
-pcreposix.a, so can be accessed by adding -lpcreposix to the
-command for linking an application that uses them. Because the POSIX functions
-call the native ones, it is also necessary to add -lpcre.
-
-
-I have implemented only those POSIX option bits that can be reasonably mapped
-to PCRE native options. In addition, the option REG_EXTENDED is defined with
-the value zero. This has no effect, but since programs that are written to the
-POSIX interface often use it, this makes it easier to slot in PCRE as a
-replacement library. Other POSIX options are not even defined.
-
-
-When PCRE is called via these functions, it is only the API that is POSIX-like
-in style. The syntax and semantics of the regular expressions themselves are
-still those of Perl, subject to the setting of various PCRE options, as
-described below. "POSIX-like in style" means that the API approximates to the
-POSIX definition; it is not fully POSIX-compatible, and in multi-byte encoding
-domains it is probably even less compatible.
-
-
-The header for these functions is supplied as pcreposix.h to avoid any
-potential clash with other POSIX libraries. It can, of course, be renamed or
-aliased as regex.h, which is the "correct" name. It provides two
-structure types, regex_t for compiled internal forms, and
-regmatch_t for returning captured substrings. It also defines some
-constants whose names start with "REG_"; these are used for setting options and
-identifying error codes.
-
-
-
-
COMPILING A PATTERN
-
-The function regcomp() is called to compile a pattern into an
-internal form. The pattern is a C string terminated by a binary zero, and
-is passed in the argument pattern. The preg argument is a pointer
-to a regex_t structure that is used as a base for storing information
-about the compiled regular expression.
-
-
-The argument cflags is either zero, or contains one or more of the bits
-defined by the following macros:
-
- REG_DOTALL
-
-The PCRE_DOTALL option is set when the regular expression is passed for
-compilation to the native function. Note that REG_DOTALL is not part of the
-POSIX standard.
-
- REG_ICASE
-
-The PCRE_CASELESS option is set when the regular expression is passed for
-compilation to the native function.
-
- REG_NEWLINE
-
-The PCRE_MULTILINE option is set when the regular expression is passed for
-compilation to the native function. Note that this does not mimic the
-defined POSIX behaviour for REG_NEWLINE (see the following section).
-
- REG_NOSUB
-
-The PCRE_NO_AUTO_CAPTURE option is set when the regular expression is passed
-for compilation to the native function. In addition, when a pattern that is
-compiled with this flag is passed to regexec() for matching, the
-nmatch and pmatch arguments are ignored, and no captured strings
-are returned.
-
- REG_UTF8
-
-The PCRE_UTF8 option is set when the regular expression is passed for
-compilation to the native function. This causes the pattern itself and all data
-strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF8
-is not part of the POSIX standard.
-
-
-In the absence of these flags, no options are passed to the native function.
-This means the the regex is compiled with PCRE default semantics. In
-particular, the way it handles newline characters in the subject string is the
-Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
-some of the effects specified for REG_NEWLINE. It does not affect the way
-newlines are matched by . (they aren't) or by a negative class such as [^a]
-(they are).
-
-
-The yield of regcomp() is zero on success, and non-zero otherwise. The
-preg structure is filled in on success, and one member of the structure
-is public: re_nsub contains the number of capturing subpatterns in
-the regular expression. Various error codes are defined in the header file.
-
-
MATCHING NEWLINE CHARACTERS
-
-This area is not simple, because POSIX and Perl take different views of things.
-It is not possible to get PCRE to obey POSIX semantics, but then PCRE was never
-intended to be a POSIX engine. The following table lists the different
-possibilities for matching newline characters in PCRE:
-
- Default Change with
-
- . matches newline no PCRE_DOTALL
- newline matches [^a] yes not changeable
- $ matches \n at end yes PCRE_DOLLARENDONLY
- $ matches \n in middle no PCRE_MULTILINE
- ^ matches \n in middle no PCRE_MULTILINE
-
-This is the equivalent table for POSIX:
-
- Default Change with
-
- . matches newline yes REG_NEWLINE
- newline matches [^a] yes REG_NEWLINE
- $ matches \n at end no REG_NEWLINE
- $ matches \n in middle no REG_NEWLINE
- ^ matches \n in middle no REG_NEWLINE
-
-PCRE's behaviour is the same as Perl's, except that there is no equivalent for
-PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is no way to stop
-newline from matching [^a].
-
-
-The default POSIX newline handling can be obtained by setting PCRE_DOTALL and
-PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE behave exactly as for the
-REG_NEWLINE action.
-
-
MATCHING A PATTERN
-
-The function regexec() is called to match a compiled pattern preg
-against a given string, which is by default terminated by a zero byte
-(but see REG_STARTEND below), subject to the options in eflags. These can
-be:
-
- REG_NOTBOL
-
-The PCRE_NOTBOL option is set when calling the underlying PCRE matching
-function.
-
- REG_NOTEMPTY
-
-The PCRE_NOTEMPTY option is set when calling the underlying PCRE matching
-function. Note that REG_NOTEMPTY is not part of the POSIX standard. However,
-setting this option can give more POSIX-like behaviour in some situations.
-
- REG_NOTEOL
-
-The PCRE_NOTEOL option is set when calling the underlying PCRE matching
-function.
-
- REG_STARTEND
-
-The string is considered to start at string + pmatch[0].rm_so and
-to have a terminating NUL located at string + pmatch[0].rm_eo
-(there need not actually be a NUL at that location), regardless of the value of
-nmatch. This is a BSD extension, compatible with but not specified by
-IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
-intended to be portable to other systems. Note that a non-zero rm_so does
-not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
-how it is matched.
-
-
-If the pattern was compiled with the REG_NOSUB flag, no data about any matched
-strings is returned. The nmatch and pmatch arguments of
-regexec() are ignored.
-
-
-Otherwise,the portion of the string that was matched, and also any captured
-substrings, are returned via the pmatch argument, which points to an
-array of nmatch structures of type regmatch_t, containing the
-members rm_so and rm_eo. These contain the offset to the first
-character of each substring and the offset to the first character after the end
-of each substring, respectively. The 0th element of the vector relates to the
-entire portion of string that was matched; subsequent elements relate to
-the capturing subpatterns of the regular expression. Unused entries in the
-array have both structure members set to -1.
-
-
-A successful match yields a zero return; various error codes are defined in the
-header file, of which REG_NOMATCH is the "expected" failure code.
-
-
ERROR MESSAGES
-
-The regerror() function maps a non-zero errorcode from either
-regcomp() or regexec() to a printable message. If preg is not
-NULL, the error should have arisen from the use of that structure. A message
-terminated by a binary zero is placed in errbuf. The length of the
-message, including the zero, is limited to errbuf_size. The yield of the
-function is the size of buffer needed to hold the whole message.
-
-
MEMORY USAGE
-
-Compiling a regular expression causes memory to be allocated and associated
-with the preg structure. The function regfree() frees all such
-memory, after which preg may no longer be used as a compiled expression.
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 11 March 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcreprecompile.html b/libs/pcre/doc/html/pcreprecompile.html
deleted file mode 100644
index 83da22673d..0000000000
--- a/libs/pcre/doc/html/pcreprecompile.html
+++ /dev/null
@@ -1,148 +0,0 @@
-
-
-pcreprecompile specification
-
-
-pcreprecompile man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
SAVING AND RE-USING PRECOMPILED PCRE PATTERNS
-
-If you are running an application that uses a large number of regular
-expression patterns, it may be useful to store them in a precompiled form
-instead of having to compile them every time the application is run.
-If you are not using any private character tables (see the
-pcre_maketables()
-documentation), this is relatively straightforward. If you are using private
-tables, it is a little bit more complicated.
-
-
-If you save compiled patterns to a file, you can copy them to a different host
-and run them there. This works even if the new host has the opposite endianness
-to the one on which the patterns were compiled. There may be a small
-performance penalty, but it should be insignificant. However, compiling regular
-expressions with one version of PCRE for use with a different version is not
-guaranteed to work and may cause crashes.
-
-
SAVING A COMPILED PATTERN
-
-The value returned by pcre_compile() points to a single block of memory
-that holds the compiled pattern and associated data. You can find the length of
-this block in bytes by calling pcre_fullinfo() with an argument of
-PCRE_INFO_SIZE. You can then save the data in any appropriate manner. Here is
-sample code that compiles a pattern and writes it to a file. It assumes that
-the variable fd refers to a file that is open for output:
-
- int erroroffset, rc, size;
- char *error;
- pcre *re;
-
- re = pcre_compile("my pattern", 0, &error, &erroroffset, NULL);
- if (re == NULL) { ... handle errors ... }
- rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size);
- if (rc < 0) { ... handle errors ... }
- rc = fwrite(re, 1, size, fd);
- if (rc != size) { ... handle errors ... }
-
-In this example, the bytes that comprise the compiled pattern are copied
-exactly. Note that this is binary data that may contain any of the 256 possible
-byte values. On systems that make a distinction between binary and non-binary
-data, be sure that the file is opened for binary output.
-
-
-If you want to write more than one pattern to a file, you will have to devise a
-way of separating them. For binary data, preceding each pattern with its length
-is probably the most straightforward approach. Another possibility is to write
-out the data in hexadecimal instead of binary, one pattern to a line.
-
-
-Saving compiled patterns in a file is only one possible way of storing them for
-later use. They could equally well be saved in a database, or in the memory of
-some daemon process that passes them via sockets to the processes that want
-them.
-
-
-If the pattern has been studied, it is also possible to save the study data in
-a similar way to the compiled pattern itself. When studying generates
-additional information, pcre_study() returns a pointer to a
-pcre_extra data block. Its format is defined in the
-section on matching a pattern
-in the
-pcreapi
-documentation. The study_data field points to the binary study data, and
-this is what you must save (not the pcre_extra block itself). The length
-of the study data can be obtained by calling pcre_fullinfo() with an
-argument of PCRE_INFO_STUDYSIZE. Remember to check that pcre_study() did
-return a non-NULL value before trying to save the study data.
-
-
RE-USING A PRECOMPILED PATTERN
-
-Re-using a precompiled pattern is straightforward. Having reloaded it into main
-memory, you pass its pointer to pcre_exec() or pcre_dfa_exec() in
-the usual way. This should work even on another host, and even if that host has
-the opposite endianness to the one where the pattern was compiled.
-
-
-However, if you passed a pointer to custom character tables when the pattern
-was compiled (the tableptr argument of pcre_compile()), you must
-now pass a similar pointer to pcre_exec() or pcre_dfa_exec(),
-because the value saved with the compiled pattern will obviously be nonsense. A
-field in a pcre_extra() block is used to pass this data, as described in
-the
-section on matching a pattern
-in the
-pcreapi
-documentation.
-
-
-If you did not provide custom character tables when the pattern was compiled,
-the pointer in the compiled pattern is NULL, which causes pcre_exec() to
-use PCRE's internal tables. Thus, you do not need to take any special action at
-run time in this case.
-
-
-If you saved study data with the compiled pattern, you need to create your own
-pcre_extra data block and set the study_data field to point to the
-reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA bit in the
-flags field to indicate that study data is present. Then pass the
-pcre_extra block to pcre_exec() or pcre_dfa_exec() in the
-usual way.
-
-
COMPATIBILITY WITH DIFFERENT PCRE RELEASES
-
-In general, it is safest to recompile all saved patterns when you update to a
-new PCRE release, though not all updates actually require this. Recompiling is
-definitely needed for release 7.2.
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 13 June 2007
-
-Copyright © 1997-2007 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcresample.html b/libs/pcre/doc/html/pcresample.html
deleted file mode 100644
index 6243be6a5a..0000000000
--- a/libs/pcre/doc/html/pcresample.html
+++ /dev/null
@@ -1,96 +0,0 @@
-
-
-pcresample specification
-
-
-pcresample man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-PCRE SAMPLE PROGRAM
-
-
-A simple, complete demonstration program, to get you started with using PCRE,
-is supplied in the file pcredemo.c in the PCRE distribution.
-
-
-The program compiles the regular expression that is its first argument, and
-matches it against the subject string in its second argument. No PCRE options
-are set, and default character tables are used. If matching succeeds, the
-program outputs the portion of the subject that matched, together with the
-contents of any captured substrings.
-
-
-If the -g option is given on the command line, the program then goes on to
-check for further matches of the same regular expression in the same subject
-string. The logic is a little bit tricky because of the possibility of matching
-an empty string. Comments in the code explain what is going on.
-
-
-If PCRE is installed in the standard include and library directories for your
-system, you should be able to compile the demonstration program using this
-command:
-
- gcc -o pcredemo pcredemo.c -lpcre
-
-If PCRE is installed elsewhere, you may need to add additional options to the
-command line. For example, on a Unix-like system that has PCRE installed in
-/usr/local, you can compile the demonstration program using a command
-like this:
-
- gcc -o pcredemo -I/usr/local/include pcredemo.c -L/usr/local/lib -lpcre
-
-Once you have compiled the demonstration program, you can run simple tests like
-this:
-
- ./pcredemo 'cat|dog' 'the cat sat on the mat'
- ./pcredemo -g 'cat|dog' 'the dog sat on the cat'
-
-Note that there is a much more comprehensive test program, called
-pcretest,
-which supports many more facilities for testing regular expressions and the
-PCRE library. The pcredemo program is provided as a simple coding
-example.
-
-
-On some operating systems (e.g. Solaris), when PCRE is not installed in the
-standard library directory, you may get an error like this when you try to run
-pcredemo:
-
- ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
-
-This is caused by the way shared library support works on those systems. You
-need to add
-
- -R/usr/local/lib
-
-(for example) to the compile command to get round this problem.
-
-
-AUTHOR
-
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
-REVISION
-
-
-Last updated: 23 January 2008
-
-Copyright © 1997-2008 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcrestack.html b/libs/pcre/doc/html/pcrestack.html
deleted file mode 100644
index 60488280e9..0000000000
--- a/libs/pcre/doc/html/pcrestack.html
+++ /dev/null
@@ -1,172 +0,0 @@
-
-
-pcrestack specification
-
-
-pcrestack man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-PCRE DISCUSSION OF STACK USAGE
-
-
-When you call pcre_exec(), it makes use of an internal function called
-match(). This calls itself recursively at branch points in the pattern,
-in order to remember the state of the match so that it can back up and try a
-different alternative if the first one fails. As matching proceeds deeper and
-deeper into the tree of possibilities, the recursion depth increases.
-
-
-Not all calls of match() increase the recursion depth; for an item such
-as a* it may be called several times at the same level, after matching
-different numbers of a's. Furthermore, in a number of cases where the result of
-the recursive call would immediately be passed back as the result of the
-current call (a "tail recursion"), the function is just restarted instead.
-
-
-The pcre_dfa_exec() function operates in an entirely different way, and
-hardly uses recursion at all. The limit on its complexity is the amount of
-workspace it is given. The comments that follow do NOT apply to
-pcre_dfa_exec(); they are relevant only for pcre_exec().
-
-
-You can set limits on the number of times that match() is called, both in
-total and recursively. If the limit is exceeded, an error occurs. For details,
-see the
-section on extra data for pcre_exec()
-in the
-pcreapi
-documentation.
-
-
-Each time that match() is actually called recursively, it uses memory
-from the process stack. For certain kinds of pattern and data, very large
-amounts of stack may be needed, despite the recognition of "tail recursion".
-You can often reduce the amount of recursion, and therefore the amount of stack
-used, by modifying the pattern that is being matched. Consider, for example,
-this pattern:
-
- ([^<]|<(?!inet))+
-
-It matches from wherever it starts until it encounters "<inet" or the end of
-the data, and is the kind of pattern that might be used when processing an XML
-file. Each iteration of the outer parentheses matches either one character that
-is not "<" or a "<" that is not followed by "inet". However, each time a
-parenthesis is processed, a recursion occurs, so this formulation uses a stack
-frame for each matched character. For a long string, a lot of stack is
-required. Consider now this rewritten pattern, which matches exactly the same
-strings:
-
- ([^<]++|<(?!inet))+
-
-This uses very much less stack, because runs of characters that do not contain
-"<" are "swallowed" in one item inside the parentheses. Recursion happens only
-when a "<" character that is not followed by "inet" is encountered (and we
-assume this is relatively rare). A possessive quantifier is used to stop any
-backtracking into the runs of non-"<" characters, but that is not related to
-stack usage.
-
-
-This example shows that one way of avoiding stack problems when matching long
-subject strings is to write repeated parenthesized subpatterns to match more
-than one character whenever possible.
-
-
-Compiling PCRE to use heap instead of stack
-
-
-In environments where stack memory is constrained, you might want to compile
-PCRE to use heap memory instead of stack for remembering back-up points. This
-makes it run a lot more slowly, however. Details of how to do this are given in
-the
-pcrebuild
-documentation. When built in this way, instead of using the stack, PCRE obtains
-and frees memory by calling the functions that are pointed to by the
-pcre_stack_malloc and pcre_stack_free variables. By default, these
-point to malloc() and free(), but you can replace the pointers to
-cause PCRE to use your own functions. Since the block sizes are always the
-same, and are always freed in reverse order, it may be possible to implement
-customized memory handlers that are more efficient than the standard functions.
-
-
-Limiting PCRE's stack usage
-
-
-PCRE has an internal counter that can be used to limit the depth of recursion,
-and thus cause pcre_exec() to give an error code before it runs out of
-stack. By default, the limit is very large, and unlikely ever to operate. It
-can be changed when PCRE is built, and it can also be set when
-pcre_exec() is called. For details of these interfaces, see the
-pcrebuild
-and
-pcreapi
-documentation.
-
-
-As a very rough rule of thumb, you should reckon on about 500 bytes per
-recursion. Thus, if you want to limit your stack usage to 8Mb, you
-should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
-support around 128000 recursions. The pcretest test program has a command
-line option (-S) that can be used to increase the size of its stack.
-
-
-Changing stack size in Unix-like systems
-
-
-In Unix-like environments, there is not often a problem with the stack unless
-very long strings are involved, though the default limit on stack size varies
-from system to system. Values from 8Mb to 64Mb are common. You can find your
-default limit by running the command:
-
- ulimit -s
-
-Unfortunately, the effect of running out of stack is often SIGSEGV, though
-sometimes a more explicit error message is given. You can normally increase the
-limit on stack size by code such as this:
-
- struct rlimit rlim;
- getrlimit(RLIMIT_STACK, &rlim);
- rlim.rlim_cur = 100*1024*1024;
- setrlimit(RLIMIT_STACK, &rlim);
-
-This reads the current limits (soft and hard) using getrlimit(), then
-attempts to increase the soft limit to 100Mb using setrlimit(). You must
-do this before calling pcre_exec().
-
-
-Changing stack size in Mac OS X
-
-
-Using setrlimit(), as described above, should also work on Mac OS X. It
-is also possible to set a stack size when linking a program. There is a
-discussion about stack sizes in Mac OS X at this web site:
-http://developer.apple.com/qa/qa2005/qa1419.html.
-
-
-AUTHOR
-
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
-REVISION
-
-
-Last updated: 09 July 2008
-
-Copyright © 1997-2008 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcresyntax.html b/libs/pcre/doc/html/pcresyntax.html
deleted file mode 100644
index 2a1a686284..0000000000
--- a/libs/pcre/doc/html/pcresyntax.html
+++ /dev/null
@@ -1,473 +0,0 @@
-
-
-pcresyntax specification
-
-
-pcresyntax man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
PCRE REGULAR EXPRESSION SYNTAX SUMMARY
-
-The full syntax and semantics of the regular expressions that are supported by
-PCRE are described in the
-pcrepattern
-documentation. This document contains just a quick-reference summary of the
-syntax.
-
-
QUOTING
-
-
- \x where x is non-alphanumeric is a literal x
- \Q...\E treat enclosed characters as literal
-
-
-
CHARACTERS
-
-
- \a alarm, that is, the BEL character (hex 07)
- \cx "control-x", where x is any character
- \e escape (hex 1B)
- \f formfeed (hex 0C)
- \n newline (hex 0A)
- \r carriage return (hex 0D)
- \t tab (hex 09)
- \ddd character with octal code ddd, or backreference
- \xhh character with hex code hh
- \x{hhh..} character with hex code hhh..
-
-
-
CHARACTER TYPES
-
-
- . any character except newline;
- in dotall mode, any character whatsoever
- \C one byte, even in UTF-8 mode (best avoided)
- \d a decimal digit
- \D a character that is not a decimal digit
- \h a horizontal whitespace character
- \H a character that is not a horizontal whitespace character
- \p{xx} a character with the xx property
- \P{xx} a character without the xx property
- \R a newline sequence
- \s a whitespace character
- \S a character that is not a whitespace character
- \v a vertical whitespace character
- \V a character that is not a vertical whitespace character
- \w a "word" character
- \W a "non-word" character
- \X an extended Unicode sequence
-
-In PCRE, \d, \D, \s, \S, \w, and \W recognize only ASCII characters.
-
-
GENERAL CATEGORY PROPERTY CODES FOR \p and \P
-
-
- C Other
- Cc Control
- Cf Format
- Cn Unassigned
- Co Private use
- Cs Surrogate
-
- L Letter
- Ll Lower case letter
- Lm Modifier letter
- Lo Other letter
- Lt Title case letter
- Lu Upper case letter
- L& Ll, Lu, or Lt
-
- M Mark
- Mc Spacing mark
- Me Enclosing mark
- Mn Non-spacing mark
-
- N Number
- Nd Decimal number
- Nl Letter number
- No Other number
-
- P Punctuation
- Pc Connector punctuation
- Pd Dash punctuation
- Pe Close punctuation
- Pf Final punctuation
- Pi Initial punctuation
- Po Other punctuation
- Ps Open punctuation
-
- S Symbol
- Sc Currency symbol
- Sk Modifier symbol
- Sm Mathematical symbol
- So Other symbol
-
- Z Separator
- Zl Line separator
- Zp Paragraph separator
- Zs Space separator
-
-
-
SCRIPT NAMES FOR \p AND \P
-
-Arabic,
-Armenian,
-Balinese,
-Bengali,
-Bopomofo,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Cham,
-Cherokee,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Greek,
-Gujarati,
-Gurmukhi,
-Han,
-Hangul,
-Hanunoo,
-Hebrew,
-Hiragana,
-Inherited,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khmer,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_B,
-Lycian,
-Lydian,
-Malayalam,
-Mongolian,
-Myanmar,
-New_Tai_Lue,
-Nko,
-Ogham,
-Old_Italic,
-Old_Persian,
-Ol_Chiki,
-Oriya,
-Osmanya,
-Phags_Pa,
-Phoenician,
-Rejang,
-Runic,
-Saurashtra,
-Shavian,
-Sinhala,
-Sudanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tamil,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Ugaritic,
-Vai,
-Yi.
-
-
CHARACTER CLASSES
-
-
- [...] positive character class
- [^...] negative character class
- [x-y] range (can be used for hex characters)
- [[:xxx:]] positive POSIX named set
- [[:^xxx:]] negative POSIX named set
-
- alnum alphanumeric
- alpha alphabetic
- ascii 0-127
- blank space or tab
- cntrl control character
- digit decimal digit
- graph printing, excluding space
- lower lower case letter
- print printing, including space
- punct printing, excluding alphanumeric
- space whitespace
- upper upper case letter
- word same as \w
- xdigit hexadecimal digit
-
-In PCRE, POSIX character set names recognize only ASCII characters. You can use
-\Q...\E inside a character class.
-
-
QUANTIFIERS
-
-
- ? 0 or 1, greedy
- ?+ 0 or 1, possessive
- ?? 0 or 1, lazy
- * 0 or more, greedy
- *+ 0 or more, possessive
- *? 0 or more, lazy
- + 1 or more, greedy
- ++ 1 or more, possessive
- +? 1 or more, lazy
- {n} exactly n
- {n,m} at least n, no more than m, greedy
- {n,m}+ at least n, no more than m, possessive
- {n,m}? at least n, no more than m, lazy
- {n,} n or more, greedy
- {n,}+ n or more, possessive
- {n,}? n or more, lazy
-
-
-
ANCHORS AND SIMPLE ASSERTIONS
-
-
- \b word boundary (only ASCII letters recognized)
- \B not a word boundary
- ^ start of subject
- also after internal newline in multiline mode
- \A start of subject
- $ end of subject
- also before newline at end of subject
- also before internal newline in multiline mode
- \Z end of subject
- also before newline at end of subject
- \z end of subject
- \G first matching position in subject
-
-
-
MATCH POINT RESET
-
-
- \K reset start of match
-
-
-
ALTERNATION
-
-
- expr|expr|expr...
-
-
-
CAPTURING
-
-
- (...) capturing group
- (?<name>...) named capturing group (Perl)
- (?'name'...) named capturing group (Perl)
- (?P<name>...) named capturing group (Python)
- (?:...) non-capturing group
- (?|...) non-capturing group; reset group numbers for
- capturing groups in each alternative
-
-
-
ATOMIC GROUPS
-
-
- (?>...) atomic, non-capturing group
-
-
-
COMMENT
-
-
- (?#....) comment (not nestable)
-
-
-
OPTION SETTING
-
-
- (?i) caseless
- (?J) allow duplicate names
- (?m) multiline
- (?s) single line (dotall)
- (?U) default ungreedy (lazy)
- (?x) extended (ignore white space)
- (?-...) unset option(s)
-
-The following is recognized only at the start of a pattern or after one of the
-newline-setting options with similar syntax:
-
- (*UTF8) set UTF-8 mode
-
-
-
LOOKAHEAD AND LOOKBEHIND ASSERTIONS
-
-
- (?=...) positive look ahead
- (?!...) negative look ahead
- (?<=...) positive look behind
- (?<!...) negative look behind
-
-Each top-level branch of a look behind must be of a fixed length.
-
-
BACKREFERENCES
-
-
- \n reference by number (can be ambiguous)
- \gn reference by number
- \g{n} reference by number
- \g{-n} relative reference by number
- \k<name> reference by name (Perl)
- \k'name' reference by name (Perl)
- \g{name} reference by name (Perl)
- \k{name} reference by name (.NET)
- (?P=name) reference by name (Python)
-
-
-
SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
-
-
- (?R) recurse whole pattern
- (?n) call subpattern by absolute number
- (?+n) call subpattern by relative number
- (?-n) call subpattern by relative number
- (?&name) call subpattern by name (Perl)
- (?P>name) call subpattern by name (Python)
- \g<name> call subpattern by name (Oniguruma)
- \g'name' call subpattern by name (Oniguruma)
- \g<n> call subpattern by absolute number (Oniguruma)
- \g'n' call subpattern by absolute number (Oniguruma)
- \g<+n> call subpattern by relative number (PCRE extension)
- \g'+n' call subpattern by relative number (PCRE extension)
- \g<-n> call subpattern by relative number (PCRE extension)
- \g'-n' call subpattern by relative number (PCRE extension)
-
-
-
CONDITIONAL PATTERNS
-
-
- (?(condition)yes-pattern)
- (?(condition)yes-pattern|no-pattern)
-
- (?(n)... absolute reference condition
- (?(+n)... relative reference condition
- (?(-n)... relative reference condition
- (?(<name>)... named reference condition (Perl)
- (?('name')... named reference condition (Perl)
- (?(name)... named reference condition (PCRE)
- (?(R)... overall recursion condition
- (?(Rn)... specific group recursion condition
- (?(R&name)... specific recursion condition
- (?(DEFINE)... define subpattern for reference
- (?(assert)... assertion condition
-
-
-
BACKTRACKING CONTROL
-
-The following act immediately they are reached:
-
- (*ACCEPT) force successful match
- (*FAIL) force backtrack; synonym (*F)
-
-The following act only when a subsequent match failure causes a backtrack to
-reach them. They all force a match failure, but they differ in what happens
-afterwards. Those that advance the start-of-match point do so only if the
-pattern is not anchored.
-
- (*COMMIT) overall failure, no advance of starting point
- (*PRUNE) advance to next starting character
- (*SKIP) advance start to current matching position
- (*THEN) local failure, backtrack to next alternation
-
-
-
NEWLINE CONVENTIONS
-
-These are recognized only at the very start of the pattern or after a
-(*BSR_...) or (*UTF8) option.
-
- (*CR) carriage return only
- (*LF) linefeed only
- (*CRLF) carriage return followed by linefeed
- (*ANYCRLF) all three of the above
- (*ANY) any Unicode newline sequence
-
-
-
WHAT \R MATCHES
-
-These are recognized only at the very start of the pattern or after a
-(*...) option that sets the newline convention or UTF-8 mode.
-
- (*BSR_ANYCRLF) CR, LF, or CRLF
- (*BSR_UNICODE) any Unicode newline sequence
-
-
-
CALLOUTS
-
-
- (?C) callout
- (?Cn) callout with data n
-
-
-
SEE ALSO
-
-pcrepattern(3), pcreapi(3), pcrecallout(3),
-pcrematching(3), pcre(3).
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 11 April 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/html/pcretest.html b/libs/pcre/doc/html/pcretest.html
deleted file mode 100644
index 0358958b74..0000000000
--- a/libs/pcre/doc/html/pcretest.html
+++ /dev/null
@@ -1,712 +0,0 @@
-
-
-pcretest specification
-
-
-pcretest man page
-
-Return to the PCRE index page.
-
-
-This page is part of the PCRE HTML documentation. It was generated automatically
-from the original man page. If there is any nonsense in it, please consult the
-man page, in case the conversion went wrong.
-
-
-
SYNOPSIS
-
-pcretest [options] [source] [destination]
-
-
-pcretest was written as a test program for the PCRE regular expression
-library itself, but it can also be used for experimenting with regular
-expressions. This document describes the features of the test program; for
-details of the regular expressions themselves, see the
-pcrepattern
-documentation. For details of the PCRE library function calls and their
-options, see the
-pcreapi
-documentation.
-
-
OPTIONS
-
--b
-Behave as if each regex has the /B (show bytecode) modifier; the internal
-form is output after compilation.
-
-
--C
-Output the version number of the PCRE library, and all available information
-about the optional features that are included, and then exit.
-
-
--d
-Behave as if each regex has the /D (debug) modifier; the internal
-form and information about the compiled pattern is output after compilation;
--d is equivalent to -b -i.
-
-
--dfa
-Behave as if each data line contains the \D escape sequence; this causes the
-alternative matching function, pcre_dfa_exec(), to be used instead of the
-standard pcre_exec() function (more detail is given below).
-
-
--help
-Output a brief summary these options and then exit.
-
-
--i
-Behave as if each regex has the /I modifier; information about the
-compiled pattern is given after compilation.
-
-
--M
-Behave as if each data line contains the \M escape sequence; this causes
-PCRE to discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings by
-calling pcre_exec() repeatedly with different limits.
-
-
--m
-Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding /M to each regular expression. For compatibility
-with earlier versions of pcretest, -s is a synonym for -m.
-
-
--o osize
-Set the number of elements in the output vector that is used when calling
-pcre_exec() or pcre_dfa_exec() to be osize. The default value
-is 45, which is enough for 14 capturing subexpressions for pcre_exec() or
-22 different matches for pcre_dfa_exec(). The vector size can be
-changed for individual matching calls by including \O in the data line (see
-below).
-
-
--p
-Behave as if each regex has the /P modifier; the POSIX wrapper API is
-used to call PCRE. None of the other options has any effect when -p is
-set.
-
-
--q
-Do not output the version number of pcretest at the start of execution.
-
-
--S size
-On Unix-like systems, set the size of the runtime stack to size
-megabytes.
-
-
--t
-Run each compile, study, and match many times with a timer, and output
-resulting time per compile or match (in milliseconds). Do not set -m with
--t, because you will then get the size output a zillion times, and the
-timing will be distorted. You can control the number of iterations that are
-used for timing by following -t with a number (as a separate item on the
-command line). For example, "-t 1000" would iterate 1000 times. The default is
-to iterate 500000 times.
-
-
--tm
-This is like -t except that it times only the matching phase, not the
-compile or study phases.
-
-
DESCRIPTION
-
-If pcretest is given two filename arguments, it reads from the first and
-writes to the second. If it is given only one filename argument, it reads from
-that file and writes to stdout. Otherwise, it reads from stdin and writes to
-stdout, and prompts for each line of input, using "re>" to prompt for regular
-expressions, and "data>" to prompt for data lines.
-
-
-When pcretest is built, a configuration option can specify that it should
-be linked with the libreadline library. When this is done, if the input
-is from a terminal, it is read using the readline() function. This
-provides line-editing and history facilities. The output from the -help
-option states whether or not readline() will be used.
-
-
-The program handles any number of sets of input on a single input file. Each
-set starts with a regular expression, and continues with any number of data
-lines to be matched against the pattern.
-
-
-Each data line is matched separately and independently. If you want to do
-multi-line matches, you have to use the \n escape sequence (or \r or \r\n,
-etc., depending on the newline setting) in a single line of input to encode the
-newline sequences. There is no limit on the length of data lines; the input
-buffer is automatically extended if it is too small.
-
-
-An empty line signals the end of the data lines, at which point a new regular
-expression is read. The regular expressions are given enclosed in any
-non-alphanumeric delimiters other than backslash, for example:
-
- /(a|bc)x+yz/
-
-White space before the initial delimiter is ignored. A regular expression may
-be continued over several input lines, in which case the newline characters are
-included within it. It is possible to include the delimiter within the pattern
-by escaping it, for example
-
- /abc\/def/
-
-If you do so, the escape and the delimiter form part of the pattern, but since
-delimiters are always non-alphanumeric, this does not affect its interpretation.
-If the terminating delimiter is immediately followed by a backslash, for
-example,
-
- /abc/\
-
-then a backslash is added to the end of the pattern. This is done to provide a
-way of testing the error condition that arises if a pattern finishes with a
-backslash, because
-
- /abc\/
-
-is interpreted as the first line of a pattern that starts with "abc/", causing
-pcretest to read the next line as a continuation of the regular expression.
-
-
PATTERN MODIFIERS
-
-A pattern may be followed by any number of modifiers, which are mostly single
-characters. Following Perl usage, these are referred to below as, for example,
-"the /i modifier", even though the delimiter of the pattern need not
-always be a slash, and no slash is used when writing modifiers. Whitespace may
-appear between the final pattern delimiter and the first modifier, and between
-the modifiers themselves.
-
-
-The /i, /m, /s, and /x modifiers set the PCRE_CASELESS,
-PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when
-pcre_compile() is called. These four modifier letters have the same
-effect as they do in Perl. For example:
-
- /caseless/i
-
-The following table shows additional modifiers for setting PCRE options that do
-not correspond to anything in Perl:
-
- /A PCRE_ANCHORED
- /C PCRE_AUTO_CALLOUT
- /E PCRE_DOLLAR_ENDONLY
- /f PCRE_FIRSTLINE
- /J PCRE_DUPNAMES
- /N PCRE_NO_AUTO_CAPTURE
- /U PCRE_UNGREEDY
- /X PCRE_EXTRA
- /<JS> PCRE_JAVASCRIPT_COMPAT
- /<cr> PCRE_NEWLINE_CR
- /<lf> PCRE_NEWLINE_LF
- /<crlf> PCRE_NEWLINE_CRLF
- /<anycrlf> PCRE_NEWLINE_ANYCRLF
- /<any> PCRE_NEWLINE_ANY
- /<bsr_anycrlf> PCRE_BSR_ANYCRLF
- /<bsr_unicode> PCRE_BSR_UNICODE
-
-Those specifying line ending sequences are literal strings as shown, but the
-letters can be in either case. This example sets multiline matching with CRLF
-as the line ending sequence:
-
- /^abc/m<crlf>
-
-Details of the meanings of these PCRE options are given in the
-pcreapi
-documentation.
-
-
-Finding all matches in a string
-
-
-Searching for all possible matches within each subject string can be requested
-by the /g or /G modifier. After finding a match, PCRE is called
-again to search the remainder of the subject string. The difference between
-/g and /G is that the former uses the startoffset argument to
-pcre_exec() to start searching at a new point within the entire string
-(which is in effect what Perl does), whereas the latter passes over a shortened
-substring. This makes a difference to the matching process if the pattern
-begins with a lookbehind assertion (including \b or \B).
-
-
-If any call to pcre_exec() in a /g or /G sequence matches an
-empty string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
-flags set in order to search for another, non-empty, match at the same point.
-If this second match fails, the start offset is advanced by one, and the normal
-match is retried. This imitates the way Perl handles such cases when using the
-/g modifier or the split() function.
-
-
-Other modifiers
-
-
-There are yet more modifiers for controlling the way pcretest
-operates.
-
-
-The /+ modifier requests that as well as outputting the substring that
-matched the entire pattern, pcretest should in addition output the remainder of
-the subject string. This is useful for tests where the subject contains
-multiple copies of the same substring.
-
-
-The /B modifier is a debugging feature. It requests that pcretest
-output a representation of the compiled byte code after compilation. Normally
-this information contains length and offset values; however, if /Z is
-also present, this data is replaced by spaces. This is a special feature for
-use in the automatic test scripts; it ensures that the same output is generated
-for different internal link sizes.
-
-
-The /L modifier must be followed directly by the name of a locale, for
-example,
-
- /pattern/Lfr_FR
-
-For this reason, it must be the last modifier. The given locale is set,
-pcre_maketables() is called to build a set of character tables for the
-locale, and this is then passed to pcre_compile() when compiling the
-regular expression. Without an /L modifier, NULL is passed as the tables
-pointer; that is, /L applies only to the expression on which it appears.
-
-
-The /I modifier requests that pcretest output information about the
-compiled pattern (whether it is anchored, has a fixed first character, and
-so on). It does this by calling pcre_fullinfo() after compiling a
-pattern. If the pattern is studied, the results of that are also output.
-
-
-The /D modifier is a PCRE debugging feature, and is equivalent to
-/BI, that is, both the /B and the /I modifiers.
-
-
-The /F modifier causes pcretest to flip the byte order of the
-fields in the compiled pattern that contain 2-byte and 4-byte numbers. This
-facility is for testing the feature in PCRE that allows it to execute patterns
-that were compiled on a host with a different endianness. This feature is not
-available when the POSIX interface to PCRE is being used, that is, when the
-/P pattern modifier is specified. See also the section about saving and
-reloading compiled patterns below.
-
-
-The /S modifier causes pcre_study() to be called after the
-expression has been compiled, and the results used when the expression is
-matched.
-
-
-The /M modifier causes the size of memory block used to hold the compiled
-pattern to be output.
-
-
-The /P modifier causes pcretest to call PCRE via the POSIX wrapper
-API rather than its native API. When this is done, all other modifiers except
-/i, /m, and /+ are ignored. REG_ICASE is set if /i is
-present, and REG_NEWLINE is set if /m is present. The wrapper functions
-force PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
-
-
-The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8
-option set. This turns on support for UTF-8 character handling in PCRE,
-provided that it was compiled with this support enabled. This modifier also
-causes any non-printing characters in output strings to be printed using the
-\x{hh...} notation if they are valid UTF-8 sequences.
-
-
-If the /? modifier is used with /8, it causes pcretest to
-call pcre_compile() with the PCRE_NO_UTF8_CHECK option, to suppress the
-checking of the string for UTF-8 validity.
-
-
DATA LINES
-
-Before each data line is passed to pcre_exec(), leading and trailing
-whitespace is removed, and it is then scanned for \ escapes. Some of these are
-pretty esoteric features, intended for checking out some of the more
-complicated features of PCRE. If you are just testing "ordinary" regular
-expressions, you probably don't need any of these. The following escapes are
-recognized:
-
- \a alarm (BEL, \x07)
- \b backspace (\x08)
- \e escape (\x27)
- \f formfeed (\x0c)
- \n newline (\x0a)
- \qdd set the PCRE_MATCH_LIMIT limit to dd (any number of digits)
- \r carriage return (\x0d)
- \t tab (\x09)
- \v vertical tab (\x0b)
- \nnn octal character (up to 3 octal digits)
- \xhh hexadecimal character (up to 2 hex digits)
- \x{hh...} hexadecimal character, any number of digits in UTF-8 mode
- \A pass the PCRE_ANCHORED option to pcre_exec() or pcre_dfa_exec()
- \B pass the PCRE_NOTBOL option to pcre_exec() or pcre_dfa_exec()
- \Cdd call pcre_copy_substring() for substring dd after a successful match (number less than 32)
- \Cname call pcre_copy_named_substring() for substring "name" after a successful match (name termin-
- ated by next non alphanumeric character)
- \C+ show the current captured substrings at callout time
- \C- do not supply a callout function
- \C!n return 1 instead of 0 when callout number n is reached
- \C!n!m return 1 instead of 0 when callout number n is reached for the nth time
- \C*n pass the number n (may be negative) as callout data; this is used as the callout return value
- \D use the pcre_dfa_exec() match function
- \F only shortest match for pcre_dfa_exec()
- \Gdd call pcre_get_substring() for substring dd after a successful match (number less than 32)
- \Gname call pcre_get_named_substring() for substring "name" after a successful match (name termin-
- ated by next non-alphanumeric character)
- \L call pcre_get_substringlist() after a successful match
- \M discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings
- \N pass the PCRE_NOTEMPTY option to pcre_exec() or pcre_dfa_exec()
- \Odd set the size of the output vector passed to pcre_exec() to dd (any number of digits)
- \P pass the PCRE_PARTIAL option to pcre_exec() or pcre_dfa_exec()
- \Qdd set the PCRE_MATCH_LIMIT_RECURSION limit to dd (any number of digits)
- \R pass the PCRE_DFA_RESTART option to pcre_dfa_exec()
- \S output details of memory get/free calls during matching
- \Z pass the PCRE_NOTEOL option to pcre_exec() or pcre_dfa_exec()
- \? pass the PCRE_NO_UTF8_CHECK option to pcre_exec() or pcre_dfa_exec()
- \>dd start the match at offset dd (any number of digits);
- this sets the startoffset argument for pcre_exec() or pcre_dfa_exec()
- \<cr> pass the PCRE_NEWLINE_CR option to pcre_exec() or pcre_dfa_exec()
- \<lf> pass the PCRE_NEWLINE_LF option to pcre_exec() or pcre_dfa_exec()
- \<crlf> pass the PCRE_NEWLINE_CRLF option to pcre_exec() or pcre_dfa_exec()
- \<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to pcre_exec() or pcre_dfa_exec()
- \<any> pass the PCRE_NEWLINE_ANY option to pcre_exec() or pcre_dfa_exec()
-
-The escapes that specify line ending sequences are literal strings, exactly as
-shown. No more than one newline setting should be present in any data line.
-
-
-A backslash followed by anything else just escapes the anything else. If
-the very last character is a backslash, it is ignored. This gives a way of
-passing an empty line as data, since a real empty line terminates the data
-input.
-
-
-If \M is present, pcretest calls pcre_exec() several times, with
-different values in the match_limit and match_limit_recursion
-fields of the pcre_extra data structure, until it finds the minimum
-numbers for each parameter that allow pcre_exec() to complete. The
-match_limit number is a measure of the amount of backtracking that takes
-place, and checking it out can be instructive. For most simple matches, the
-number is quite small, but for patterns with very large numbers of matching
-possibilities, it can become large very quickly with increasing length of
-subject string. The match_limit_recursion number is a measure of how much
-stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is needed
-to complete the match attempt.
-
-
-When \O is used, the value specified may be higher or lower than the size set
-by the -O command line option (or defaulted to 45); \O applies only to
-the call of pcre_exec() for the line in which it appears.
-
-
-If the /P modifier was present on the pattern, causing the POSIX wrapper
-API to be used, the only option-setting sequences that have any effect are \B
-and \Z, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
-regexec().
-
-
-The use of \x{hh...} to represent UTF-8 characters is not dependent on the use
-of the /8 modifier on the pattern. It is recognized always. There may be
-any number of hexadecimal digits inside the braces. The result is from one to
-six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
-allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
-valid Unicode code points, or indeed valid UTF-8 characters according to the
-later rules in RFC 3629.
-
-
THE ALTERNATIVE MATCHING FUNCTION
-
-By default, pcretest uses the standard PCRE matching function,
-pcre_exec() to match each data line. From release 6.0, PCRE supports an
-alternative matching function, pcre_dfa_test(), which operates in a
-different way, and has some restrictions. The differences between the two
-functions are described in the
-pcrematching
-documentation.
-
-
-If a data line contains the \D escape sequence, or if the command line
-contains the -dfa option, the alternative matching function is called.
-This function finds all possible matches at a given point. If, however, the \F
-escape sequence is present in the data line, it stops after the first match is
-found. This is always the shortest possible match.
-
-
DEFAULT OUTPUT FROM PCRETEST
-
-This section describes the output when the normal matching function,
-pcre_exec(), is being used.
-
-
-When a match succeeds, pcretest outputs the list of captured substrings that
-pcre_exec() returns, starting with number 0 for the string that matched
-the whole pattern. Otherwise, it outputs "No match" or "Partial match"
-when pcre_exec() returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PARTIAL,
-respectively, and otherwise the PCRE negative error number. Here is an example
-of an interactive pcretest run.
-
- $ pcretest
- PCRE version 7.0 30-Nov-2006
-
- re> /^abc(\d+)/
- data> abc123
- 0: abc123
- 1: 123
- data> xyz
- No match
-
-Note that unset capturing substrings that are not followed by one that is set
-are not returned by pcre_exec(), and are not shown by pcretest. In
-the following example, there are two capturing substrings, but when the first
-data line is matched, the second, unset substring is not shown. An "internal"
-unset substring is shown as "<unset>", as for the second data line.
-
- re> /(a)|(b)/
- data> a
- 0: a
- 1: a
- data> b
- 0: b
- 1: <unset>
- 2: b
-
-If the strings contain any non-printing characters, they are output as \0x
-escapes, or as \x{...} escapes if the /8 modifier was present on the
-pattern. See below for the definition of non-printing characters. If the
-pattern has the /+ modifier, the output for substring 0 is followed by
-the the rest of the subject string, identified by "0+" like this:
-
- re> /cat/+
- data> cataract
- 0: cat
- 0+ aract
-
-If the pattern has the /g or /G modifier, the results of successive
-matching attempts are output in sequence, like this:
-
- re> /\Bi(\w\w)/g
- data> Mississippi
- 0: iss
- 1: ss
- 0: iss
- 1: ss
- 0: ipp
- 1: pp
-
-"No match" is output only if the first match attempt fails.
-
-
-If any of the sequences \C, \G, or \L are present in a
-data line that is successfully matched, the substrings extracted by the
-convenience functions are output with C, G, or L after the string number
-instead of a colon. This is in addition to the normal full list. The string
-length (that is, the return from the extraction function) is given in
-parentheses after each string for \C and \G.
-
-
-Note that whereas patterns can be continued over several lines (a plain ">"
-prompt is used for continuations), data lines may not. However newlines can be
-included in data by means of the \n escape (or \r, \r\n, etc., depending on
-the newline sequence setting).
-
-
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
-
-When the alternative matching function, pcre_dfa_exec(), is used (by
-means of the \D escape sequence or the -dfa command line option), the
-output consists of a list of all the matches that start at the first point in
-the subject where there is at least one match. For example:
-
- re> /(tang|tangerine|tan)/
- data> yellow tangerine\D
- 0: tangerine
- 1: tang
- 2: tan
-
-(Using the normal matching function on this data finds only "tang".) The
-longest matching string is always given first (and numbered zero).
-
-
-If /g is present on the pattern, the search for further matches resumes
-at the end of the longest match. For example:
-
- re> /(tang|tangerine|tan)/g
- data> yellow tangerine and tangy sultana\D
- 0: tangerine
- 1: tang
- 2: tan
- 0: tang
- 1: tan
- 0: tan
-
-Since the matching function does not support substring capture, the escape
-sequences that are concerned with captured substrings are not relevant.
-
-
RESTARTING AFTER A PARTIAL MATCH
-
-When the alternative matching function has given the PCRE_ERROR_PARTIAL return,
-indicating that the subject partially matched the pattern, you can restart the
-match with additional subject data by means of the \R escape sequence. For
-example:
-
- re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
- data> 23ja\P\D
- Partial match: 23ja
- data> n05\R\D
- 0: n05
-
-For further information about partial matching, see the
-pcrepartial
-documentation.
-
-
CALLOUTS
-
-If the pattern contains any callout requests, pcretest's callout function
-is called during matching. This works with both matching functions. By default,
-the called function displays the callout number, the start and current
-positions in the text at the callout time, and the next pattern item to be
-tested. For example, the output
-
- --->pqrabcdef
- 0 ^ ^ \d
-
-indicates that callout number 0 occurred for a match attempt starting at the
-fourth character of the subject string, when the pointer was at the seventh
-character of the data, and when the next pattern item was \d. Just one
-circumflex is output if the start and current positions are the same.
-
-
-Callouts numbered 255 are assumed to be automatic callouts, inserted as a
-result of the /C pattern modifier. In this case, instead of showing the
-callout number, the offset in the pattern, preceded by a plus, is output. For
-example:
-
- re> /\d?[A-E]\*/C
- data> E*
- --->E*
- +0 ^ \d?
- +3 ^ [A-E]
- +8 ^^ \*
- +10 ^ ^
- 0: E*
-
-The callout function in pcretest returns zero (carry on matching) by
-default, but you can use a \C item in a data line (as described above) to
-change this.
-
-
-Inserting callouts can be helpful when using pcretest to check
-complicated regular expressions. For further information about callouts, see
-the
-pcrecallout
-documentation.
-
-
NON-PRINTING CHARACTERS
-
-When pcretest is outputting text in the compiled version of a pattern,
-bytes other than 32-126 are always treated as non-printing characters are are
-therefore shown as hex escapes.
-
-
-When pcretest is outputting text that is a matched part of a subject
-string, it behaves in the same way, unless a different locale has been set for
-the pattern (using the /L modifier). In this case, the isprint()
-function to distinguish printing and non-printing characters.
-
-
SAVING AND RELOADING COMPILED PATTERNS
-
-The facilities described in this section are not available when the POSIX
-inteface to PCRE is being used, that is, when the /P pattern modifier is
-specified.
-
-
-When the POSIX interface is not in use, you can cause pcretest to write a
-compiled pattern to a file, by following the modifiers with > and a file name.
-For example:
-
- /pattern/im >/some/file
-
-See the
-pcreprecompile
-documentation for a discussion about saving and re-using compiled patterns.
-
-
-The data that is written is binary. The first eight bytes are the length of the
-compiled pattern data followed by the length of the optional study data, each
-written as four bytes in big-endian order (most significant byte first). If
-there is no study data (either the pattern was not studied, or studying did not
-return any data), the second length is zero. The lengths are followed by an
-exact copy of the compiled pattern. If there is additional study data, this
-follows immediately after the compiled pattern. After writing the file,
-pcretest expects to read a new pattern.
-
-
-A saved pattern can be reloaded into pcretest by specifing < and a file
-name instead of a pattern. The name of the file must not contain a < character,
-as otherwise pcretest will interpret the line as a pattern delimited by <
-characters.
-For example:
-
- re> </some/file
- Compiled regex loaded from /some/file
- No study data
-
-When the pattern has been loaded, pcretest proceeds to read data lines in
-the usual way.
-
-
-You can copy a file written by pcretest to a different host and reload it
-there, even if the new host has opposite endianness to the one on which the
-pattern was compiled. For example, you can compile on an i86 machine and run on
-a SPARC machine.
-
-
-File names for saving and reloading can be absolute or relative, but note that
-the shell facility of expanding a file name that starts with a tilde (~) is not
-available.
-
-
-The ability to save and reload files in pcretest is intended for testing
-and experimentation. It is not intended for production use because only a
-single pattern can be written to a file. Furthermore, there is no facility for
-supplying custom character tables for use with a reloaded pattern. If the
-original pattern was compiled with custom tables, an attempt to match a subject
-string using a reloaded pattern is likely to cause pcretest to crash.
-Finally, if you attempt to load a file that is not in the correct format, the
-result is undefined.
-
-
SEE ALSO
-
-pcre(3), pcreapi(3), pcrecallout(3), pcrematching(3),
-pcrepartial(d), pcrepattern(3), pcreprecompile(3).
-
-
AUTHOR
-
-Philip Hazel
-
-University Computing Service
-
-Cambridge CB2 3QH, England.
-
-
-
REVISION
-
-Last updated: 10 March 2009
-
-Copyright © 1997-2009 University of Cambridge.
-
-
-Return to the PCRE index page.
-
diff --git a/libs/pcre/doc/index.html.src b/libs/pcre/doc/index.html.src
deleted file mode 100644
index 888471ff75..0000000000
--- a/libs/pcre/doc/index.html.src
+++ /dev/null
@@ -1,140 +0,0 @@
-
-
-
-PCRE specification
-
-
-Perl-compatible Regular Expressions (PCRE)
-
-The HTML documentation for PCRE comprises the following pages:
-
-
-
-pcre |
- Introductory page |
-
-pcre-config |
- Information about the installation configuration |
-
-pcreapi |
- PCRE's native API |
-
-pcrebuild |
- Options for building PCRE |
-
-pcrecallout |
- The callout facility |
-
-pcrecompat |
- Compability with Perl |
-
-pcrecpp |
- The C++ wrapper for the PCRE library |
-
-pcregrep |
- The pcregrep command |
-
-pcrematching |
- Discussion of the two matching algorithms |
-
-pcrepartial |
- Using PCRE for partial matching |
-
-pcrepattern |
- Specification of the regular expressions supported by PCRE |
-
-pcreperform |
- Some comments on performance |
-
-pcreposix |
- The POSIX API to the PCRE library |
-
-pcreprecompile |
- How to save and re-use compiled patterns |
-
-pcresample |
- Description of the sample program |
-
-pcrestack |
- Discussion of PCRE's stack usage |
-
-pcresyntax |
- Syntax quick-reference summary |
-
-pcretest |
- The pcretest command for testing PCRE |
-
-
-
-There are also individual pages that summarize the interface for each function
-in the library:
-
-
-
-
-
diff --git a/libs/pcre/doc/pcre-config.1 b/libs/pcre/doc/pcre-config.1
deleted file mode 100644
index afbd3a0182..0000000000
--- a/libs/pcre/doc/pcre-config.1
+++ /dev/null
@@ -1,73 +0,0 @@
-.TH PCRE-CONFIG 1
-.SH NAME
-pcre-config - program to return PCRE configuration
-.SH SYNOPSIS
-.rs
-.sp
-.B pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
-.ti +5n
-.B [--libs-posix] [--cflags] [--cflags-posix]
-.
-.
-.SH DESCRIPTION
-.rs
-.sp
-\fBpcre-config\fP returns the configuration of the installed PCRE
-libraries and the options required to compile a program to use them.
-.
-.
-.SH OPTIONS
-.rs
-.TP 10
-\fB--prefix\fP
-Writes the directory prefix used in the PCRE installation for architecture
-independent files (\fI/usr\fP on many systems, \fI/usr/local\fP on some
-systems) to the standard output.
-.TP 10
-\fB--exec-prefix\fP
-Writes the directory prefix used in the PCRE installation for architecture
-dependent files (normally the same as \fB--prefix\fP) to the standard output.
-.TP 10
-\fB--version\fP
-Writes the version number of the installed PCRE libraries to the standard
-output.
-.TP 10
-\fB--libs\fP
-Writes to the standard output the command line options required to link
-with PCRE (\fB-lpcre\fP on many systems).
-.TP 10
-\fB--libs-posix\fP
-Writes to the standard output the command line options required to link with
-the PCRE posix emulation library (\fB-lpcreposix\fP \fB-lpcre\fP on many
-systems).
-.TP 10
-\fB--cflags\fP
-Writes to the standard output the command line options required to compile
-files that use PCRE (this may include some \fB-I\fP options, but is blank on
-many systems).
-.TP 10
-\fB--cflags-posix\fP
-Writes to the standard output the command line options required to compile
-files that use the PCRE posix emulation library (this may include some \fB-I\fP
-options, but is blank on many systems).
-.
-.
-.SH "SEE ALSO"
-.rs
-.sp
-\fBpcre(3)\fP
-.
-.
-.SH AUTHOR
-.rs
-.sp
-This manual page was originally written by Mark Baker for the Debian GNU/Linux
-system. It has been slightly revised as a generic PCRE man page.
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 18 April 2007
-.fi
diff --git a/libs/pcre/doc/pcre-config.txt b/libs/pcre/doc/pcre-config.txt
deleted file mode 100644
index c979d455e8..0000000000
--- a/libs/pcre/doc/pcre-config.txt
+++ /dev/null
@@ -1,67 +0,0 @@
-PCRE-CONFIG(1) PCRE-CONFIG(1)
-
-
-
-NAME
- pcre-config - program to return PCRE configuration
-
-SYNOPSIS
-
- pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
- [--libs-posix] [--cflags] [--cflags-posix]
-
-
-DESCRIPTION
-
- pcre-config returns the configuration of the installed PCRE libraries
- and the options required to compile a program to use them.
-
-
-OPTIONS
-
- --prefix Writes the directory prefix used in the PCRE installation for
- architecture independent files (/usr on many systems,
- /usr/local on some systems) to the standard output.
-
- --exec-prefix
- Writes the directory prefix used in the PCRE installation for
- architecture dependent files (normally the same as --prefix)
- to the standard output.
-
- --version Writes the version number of the installed PCRE libraries to
- the standard output.
-
- --libs Writes to the standard output the command line options
- required to link with PCRE (-lpcre on many systems).
-
- --libs-posix
- Writes to the standard output the command line options
- required to link with the PCRE posix emulation library
- (-lpcreposix -lpcre on many systems).
-
- --cflags Writes to the standard output the command line options
- required to compile files that use PCRE (this may include
- some -I options, but is blank on many systems).
-
- --cflags-posix
- Writes to the standard output the command line options
- required to compile files that use the PCRE posix emulation
- library (this may include some -I options, but is blank on
- many systems).
-
-
-SEE ALSO
-
- pcre(3)
-
-
-AUTHOR
-
- This manual page was originally written by Mark Baker for the Debian
- GNU/Linux system. It has been slightly revised as a generic PCRE man
- page.
-
-
-REVISION
-
- Last updated: 18 April 2007
diff --git a/libs/pcre/doc/pcre.3 b/libs/pcre/doc/pcre.3
deleted file mode 100644
index 6f174adf1e..0000000000
--- a/libs/pcre/doc/pcre.3
+++ /dev/null
@@ -1,296 +0,0 @@
-.TH PCRE 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH INTRODUCTION
-.rs
-.sp
-The PCRE library is a set of functions that implement regular expression
-pattern matching using the same syntax and semantics as Perl, with just a few
-differences. Certain features that appeared in Python and PCRE before they
-appeared in Perl are also available using the Python syntax. There is also some
-support for certain .NET and Oniguruma syntax items, and there is an option for
-requesting some minor changes that give better JavaScript compatibility.
-.P
-The current implementation of PCRE (release 7.x) corresponds approximately with
-Perl 5.10, including support for UTF-8 encoded strings and Unicode general
-category properties. However, UTF-8 and Unicode support has to be explicitly
-enabled; it is not the default. The Unicode tables correspond to Unicode
-release 5.1.
-.P
-In addition to the Perl-compatible matching function, PCRE contains an
-alternative matching function that matches the same compiled patterns in a
-different way. In certain circumstances, the alternative function has some
-advantages. For a discussion of the two matching algorithms, see the
-.\" HREF
-\fBpcrematching\fP
-.\"
-page.
-.P
-PCRE is written in C and released as a C library. A number of people have
-written wrappers and interfaces of various kinds. In particular, Google Inc.
-have provided a comprehensive C++ wrapper. This is now included as part of the
-PCRE distribution. The
-.\" HREF
-\fBpcrecpp\fP
-.\"
-page has details of this interface. Other people's contributions can be found
-in the \fIContrib\fR directory at the primary FTP site, which is:
-.sp
-.\" HTML
-.\"
-ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
-.P
-Details of exactly which Perl regular expression features are and are not
-supported by PCRE are given in separate documents. See the
-.\" HREF
-\fBpcrepattern\fR
-.\"
-and
-.\" HREF
-\fBpcrecompat\fR
-.\"
-pages. There is a syntax summary in the
-.\" HREF
-\fBpcresyntax\fR
-.\"
-page.
-.P
-Some features of PCRE can be included, excluded, or changed when the library is
-built. The
-.\" HREF
-\fBpcre_config()\fR
-.\"
-function makes it possible for a client to discover which features are
-available. The features themselves are described in the
-.\" HREF
-\fBpcrebuild\fP
-.\"
-page. Documentation about building PCRE for various operating systems can be
-found in the \fBREADME\fP file in the source distribution.
-.P
-The library contains a number of undocumented internal functions and data
-tables that are used by more than one of the exported external functions, but
-which are not intended for use by external callers. Their names all begin with
-"_pcre_", which hopefully will not provoke any name clashes. In some
-environments, it is possible to control which external symbols are exported
-when a shared library is built, and in these cases the undocumented symbols are
-not exported.
-.
-.
-.SH "USER DOCUMENTATION"
-.rs
-.sp
-The user documentation for PCRE comprises a number of different sections. In
-the "man" format, each of these is a separate "man page". In the HTML format,
-each is a separate page, linked from the index page. In the plain text format,
-all the sections are concatenated, for ease of searching. The sections are as
-follows:
-.sp
- pcre this document
- pcre-config show PCRE installation configuration information
- pcreapi details of PCRE's native C API
- pcrebuild options for building PCRE
- pcrecallout details of the callout feature
- pcrecompat discussion of Perl compatibility
- pcrecpp details of the C++ wrapper
- pcregrep description of the \fBpcregrep\fP command
- pcrematching discussion of the two matching algorithms
- pcrepartial details of the partial matching facility
-.\" JOIN
- pcrepattern syntax and semantics of supported
- regular expressions
- pcresyntax quick syntax reference
- pcreperform discussion of performance issues
- pcreposix the POSIX-compatible C API
- pcreprecompile details of saving and re-using precompiled patterns
- pcresample discussion of the sample program
- pcrestack discussion of stack usage
- pcretest description of the \fBpcretest\fP testing command
-.sp
-In addition, in the "man" and HTML formats, there is a short page for each
-C library function, listing its arguments and results.
-.
-.
-.SH LIMITATIONS
-.rs
-.sp
-There are some size limitations in PCRE but it is hoped that they will never in
-practice be relevant.
-.P
-The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
-compiled with the default internal linkage size of 2. If you want to process
-regular expressions that are truly enormous, you can compile PCRE with an
-internal linkage size of 3 or 4 (see the \fBREADME\fP file in the source
-distribution and the
-.\" HREF
-\fBpcrebuild\fP
-.\"
-documentation for details). In these cases the limit is substantially larger.
-However, the speed of execution is slower.
-.P
-All values in repeating quantifiers must be less than 65536.
-.P
-There is no limit to the number of parenthesized subpatterns, but there can be
-no more than 65535 capturing subpatterns.
-.P
-The maximum length of name for a named subpattern is 32 characters, and the
-maximum number of named subpatterns is 10000.
-.P
-The maximum length of a subject string is the largest positive number that an
-integer variable can hold. However, when using the traditional matching
-function, PCRE uses recursion to handle subpatterns and indefinite repetition.
-This means that the available stack space may limit the size of a subject
-string that can be processed by certain patterns. For a discussion of stack
-issues, see the
-.\" HREF
-\fBpcrestack\fP
-.\"
-documentation.
-.
-.\" HTML
-.
-.
-.SH "UTF-8 AND UNICODE PROPERTY SUPPORT"
-.rs
-.sp
-From release 3.3, PCRE has had some support for character strings encoded in
-the UTF-8 format. For release 4.0 this was greatly extended to cover most
-common requirements, and in release 5.0 additional support for Unicode general
-category properties was added.
-.P
-In order process UTF-8 strings, you must build PCRE to include UTF-8 support in
-the code, and, in addition, you must call
-.\" HREF
-\fBpcre_compile()\fP
-.\"
-with the PCRE_UTF8 option flag, or the pattern must start with the sequence
-(*UTF8). When either of these is the case, both the pattern and any subject
-strings that are matched against it are treated as UTF-8 strings instead of
-just strings of bytes.
-.P
-If you compile PCRE with UTF-8 support, but do not use it at run time, the
-library will be a bit bigger, but the additional run time overhead is limited
-to testing the PCRE_UTF8 flag occasionally, so should not be very big.
-.P
-If PCRE is built with Unicode character property support (which implies UTF-8
-support), the escape sequences \ep{..}, \eP{..}, and \eX are supported.
-The available properties that can be tested are limited to the general
-category properties such as Lu for an upper case letter or Nd for a decimal
-number, the Unicode script names such as Arabic or Han, and the derived
-properties Any and L&. A full list is given in the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation. Only the short names for properties are supported. For example,
-\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
-Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
-compatibility with Perl 5.6. PCRE does not support this.
-.
-.\" HTML
-.
-.SS "Validity of UTF-8 strings"
-.rs
-.sp
-When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
-are (by default) checked for validity on entry to the relevant functions. From
-release 7.3 of PCRE, the check is according the rules of RFC 3629, which are
-themselves derived from the Unicode specification. Earlier releases of PCRE
-followed the rules of RFC 2279, which allows the full range of 31-bit values (0
-to 0x7FFFFFFF). The current check allows only values in the range U+0 to
-U+10FFFF, excluding U+D800 to U+DFFF.
-.P
-The excluded code points are the "Low Surrogate Area" of Unicode, of which the
-Unicode Standard says this: "The Low Surrogate Area does not contain any
-character assignments, consequently no character code charts or namelists are
-provided for this area. Surrogates are reserved for use with UTF-16 and then
-must be used in pairs." The code points that are encoded by UTF-16 pairs are
-available as independent code points in the UTF-8 encoding. (In other words,
-the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
-UTF-8.)
-.P
-If an invalid UTF-8 string is passed to PCRE, an error return
-(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
-your strings are valid, and therefore want to skip these checks in order to
-improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
-at run time, PCRE assumes that the pattern or subject it is given
-(respectively) contains only valid UTF-8 codes. In this case, it does not
-diagnose an invalid UTF-8 string.
-.P
-If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
-happens depends on why the string is invalid. If the string conforms to the
-"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
-in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
-test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
-rules of RFC 2279. However, if the string does not even conform to RFC 2279,
-the result is undefined. Your program may crash.
-.P
-If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
-encoded in a UTF-8-like manner as per the old RFC, you can set
-PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
-situation, you will have to apply your own validity check.
-.
-.SS "General comments about UTF-8 mode"
-.rs
-.sp
-1. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
-UTF-8 character if the value is greater than 127.
-.P
-2. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
-characters for values greater than \e177.
-.P
-3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
-bytes, for example: \ex{100}{3}.
-.P
-4. The dot metacharacter matches one UTF-8 character instead of a single byte.
-.P
-5. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
-but its use can lead to some strange effects. This facility is not available in
-the alternative matching function, \fBpcre_dfa_exec()\fP.
-.P
-6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
-test characters of any code value, but the characters that PCRE recognizes as
-digits, spaces, or word characters remain the same set as before, all with
-values less than 256. This remains true even when PCRE includes Unicode
-property support, because to do otherwise would slow down PCRE in many common
-cases. If you really want to test for a wider sense of, say, "digit", you
-must use Unicode property tests such as \ep{Nd}. Note that this also applies to
-\eb, because it is defined in terms of \ew and \eW.
-.P
-7. Similarly, characters that match the POSIX named character classes are all
-low-valued characters.
-.P
-8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
-(\eh, \eH, \ev, and \eV) do match all the appropriate Unicode characters.
-.P
-9. Case-insensitive matching applies only to characters whose values are less
-than 128, unless PCRE is built with Unicode property support. Even when Unicode
-property support is available, PCRE still uses its own character tables when
-checking the case of low-valued characters, so as not to degrade performance.
-The Unicode property information is used only for characters with higher
-values. Even when Unicode property support is available, PCRE supports
-case-insensitive matching only when there is a one-to-one mapping between a
-letter's cases. There are a small number of many-to-one mappings in Unicode;
-these are not supported by PCRE.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.P
-Putting an actual email address here seems to have been a spam magnet, so I've
-taken it away. If you want to email me, use my two initials, followed by the
-two digits 10, at the domain cam.ac.uk.
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 11 April 2009
-Copyright (c) 1997-2009 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcre.txt b/libs/pcre/doc/pcre.txt
deleted file mode 100644
index 9a2ce31598..0000000000
--- a/libs/pcre/doc/pcre.txt
+++ /dev/null
@@ -1,6680 +0,0 @@
------------------------------------------------------------------------------
-This file contains a concatenation of the PCRE man pages, converted to plain
-text format for ease of searching with a text editor, or for use on systems
-that do not have a man page processor. The small individual files that give
-synopses of each function in the library have not been included. There are
-separate text files for the pcregrep and pcretest commands.
------------------------------------------------------------------------------
-
-
-PCRE(3) PCRE(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-INTRODUCTION
-
- The PCRE library is a set of functions that implement regular expres-
- sion pattern matching using the same syntax and semantics as Perl, with
- just a few differences. Certain features that appeared in Python and
- PCRE before they appeared in Perl are also available using the Python
- syntax. There is also some support for certain .NET and Oniguruma syn-
- tax items, and there is an option for requesting some minor changes
- that give better JavaScript compatibility.
-
- The current implementation of PCRE (release 7.x) corresponds approxi-
- mately with Perl 5.10, including support for UTF-8 encoded strings and
- Unicode general category properties. However, UTF-8 and Unicode support
- has to be explicitly enabled; it is not the default. The Unicode tables
- correspond to Unicode release 5.1.
-
- In addition to the Perl-compatible matching function, PCRE contains an
- alternative matching function that matches the same compiled patterns
- in a different way. In certain circumstances, the alternative function
- has some advantages. For a discussion of the two matching algorithms,
- see the pcrematching page.
-
- PCRE is written in C and released as a C library. A number of people
- have written wrappers and interfaces of various kinds. In particular,
- Google Inc. have provided a comprehensive C++ wrapper. This is now
- included as part of the PCRE distribution. The pcrecpp page has details
- of this interface. Other people's contributions can be found in the
- Contrib directory at the primary FTP site, which is:
-
- ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
-
- Details of exactly which Perl regular expression features are and are
- not supported by PCRE are given in separate documents. See the pcrepat-
- tern and pcrecompat pages. There is a syntax summary in the pcresyntax
- page.
-
- Some features of PCRE can be included, excluded, or changed when the
- library is built. The pcre_config() function makes it possible for a
- client to discover which features are available. The features them-
- selves are described in the pcrebuild page. Documentation about build-
- ing PCRE for various operating systems can be found in the README file
- in the source distribution.
-
- The library contains a number of undocumented internal functions and
- data tables that are used by more than one of the exported external
- functions, but which are not intended for use by external callers.
- Their names all begin with "_pcre_", which hopefully will not provoke
- any name clashes. In some environments, it is possible to control which
- external symbols are exported when a shared library is built, and in
- these cases the undocumented symbols are not exported.
-
-
-USER DOCUMENTATION
-
- The user documentation for PCRE comprises a number of different sec-
- tions. In the "man" format, each of these is a separate "man page". In
- the HTML format, each is a separate page, linked from the index page.
- In the plain text format, all the sections are concatenated, for ease
- of searching. The sections are as follows:
-
- pcre this document
- pcre-config show PCRE installation configuration information
- pcreapi details of PCRE's native C API
- pcrebuild options for building PCRE
- pcrecallout details of the callout feature
- pcrecompat discussion of Perl compatibility
- pcrecpp details of the C++ wrapper
- pcregrep description of the pcregrep command
- pcrematching discussion of the two matching algorithms
- pcrepartial details of the partial matching facility
- pcrepattern syntax and semantics of supported
- regular expressions
- pcresyntax quick syntax reference
- pcreperform discussion of performance issues
- pcreposix the POSIX-compatible C API
- pcreprecompile details of saving and re-using precompiled patterns
- pcresample discussion of the sample program
- pcrestack discussion of stack usage
- pcretest description of the pcretest testing command
-
- In addition, in the "man" and HTML formats, there is a short page for
- each C library function, listing its arguments and results.
-
-
-LIMITATIONS
-
- There are some size limitations in PCRE but it is hoped that they will
- never in practice be relevant.
-
- The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE
- is compiled with the default internal linkage size of 2. If you want to
- process regular expressions that are truly enormous, you can compile
- PCRE with an internal linkage size of 3 or 4 (see the README file in
- the source distribution and the pcrebuild documentation for details).
- In these cases the limit is substantially larger. However, the speed
- of execution is slower.
-
- All values in repeating quantifiers must be less than 65536.
-
- There is no limit to the number of parenthesized subpatterns, but there
- can be no more than 65535 capturing subpatterns.
-
- The maximum length of name for a named subpattern is 32 characters, and
- the maximum number of named subpatterns is 10000.
-
- The maximum length of a subject string is the largest positive number
- that an integer variable can hold. However, when using the traditional
- matching function, PCRE uses recursion to handle subpatterns and indef-
- inite repetition. This means that the available stack space may limit
- the size of a subject string that can be processed by certain patterns.
- For a discussion of stack issues, see the pcrestack documentation.
-
-
-UTF-8 AND UNICODE PROPERTY SUPPORT
-
- From release 3.3, PCRE has had some support for character strings
- encoded in the UTF-8 format. For release 4.0 this was greatly extended
- to cover most common requirements, and in release 5.0 additional sup-
- port for Unicode general category properties was added.
-
- In order process UTF-8 strings, you must build PCRE to include UTF-8
- support in the code, and, in addition, you must call pcre_compile()
- with the PCRE_UTF8 option flag, or the pattern must start with the
- sequence (*UTF8). When either of these is the case, both the pattern
- and any subject strings that are matched against it are treated as
- UTF-8 strings instead of just strings of bytes.
-
- If you compile PCRE with UTF-8 support, but do not use it at run time,
- the library will be a bit bigger, but the additional run time overhead
- is limited to testing the PCRE_UTF8 flag occasionally, so should not be
- very big.
-
- If PCRE is built with Unicode character property support (which implies
- UTF-8 support), the escape sequences \p{..}, \P{..}, and \X are sup-
- ported. The available properties that can be tested are limited to the
- general category properties such as Lu for an upper case letter or Nd
- for a decimal number, the Unicode script names such as Arabic or Han,
- and the derived properties Any and L&. A full list is given in the
- pcrepattern documentation. Only the short names for properties are sup-
- ported. For example, \p{L} matches a letter. Its Perl synonym, \p{Let-
- ter}, is not supported. Furthermore, in Perl, many properties may
- optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE
- does not support this.
-
- Validity of UTF-8 strings
-
- When you set the PCRE_UTF8 flag, the strings passed as patterns and
- subjects are (by default) checked for validity on entry to the relevant
- functions. From release 7.3 of PCRE, the check is according the rules
- of RFC 3629, which are themselves derived from the Unicode specifica-
- tion. Earlier releases of PCRE followed the rules of RFC 2279, which
- allows the full range of 31-bit values (0 to 0x7FFFFFFF). The current
- check allows only values in the range U+0 to U+10FFFF, excluding U+D800
- to U+DFFF.
-
- The excluded code points are the "Low Surrogate Area" of Unicode, of
- which the Unicode Standard says this: "The Low Surrogate Area does not
- contain any character assignments, consequently no character code
- charts or namelists are provided for this area. Surrogates are reserved
- for use with UTF-16 and then must be used in pairs." The code points
- that are encoded by UTF-16 pairs are available as independent code
- points in the UTF-8 encoding. (In other words, the whole surrogate
- thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
-
- If an invalid UTF-8 string is passed to PCRE, an error return
- (PCRE_ERROR_BADUTF8) is given. In some situations, you may already know
- that your strings are valid, and therefore want to skip these checks in
- order to improve performance. If you set the PCRE_NO_UTF8_CHECK flag at
- compile time or at run time, PCRE assumes that the pattern or subject
- it is given (respectively) contains only valid UTF-8 codes. In this
- case, it does not diagnose an invalid UTF-8 string.
-
- If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set,
- what happens depends on why the string is invalid. If the string con-
- forms to the "old" definition of UTF-8 (RFC 2279), it is processed as a
- string of characters in the range 0 to 0x7FFFFFFF. In other words,
- apart from the initial validity test, PCRE (when in UTF-8 mode) handles
- strings according to the more liberal rules of RFC 2279. However, if
- the string does not even conform to RFC 2279, the result is undefined.
- Your program may crash.
-
- If you want to process strings of values in the full range 0 to
- 0x7FFFFFFF, encoded in a UTF-8-like manner as per the old RFC, you can
- set PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in
- this situation, you will have to apply your own validity check.
-
- General comments about UTF-8 mode
-
- 1. An unbraced hexadecimal escape sequence (such as \xb3) matches a
- two-byte UTF-8 character if the value is greater than 127.
-
- 2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
- characters for values greater than \177.
-
- 3. Repeat quantifiers apply to complete UTF-8 characters, not to indi-
- vidual bytes, for example: \x{100}{3}.
-
- 4. The dot metacharacter matches one UTF-8 character instead of a sin-
- gle byte.
-
- 5. The escape sequence \C can be used to match a single byte in UTF-8
- mode, but its use can lead to some strange effects. This facility is
- not available in the alternative matching function, pcre_dfa_exec().
-
- 6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
- test characters of any code value, but the characters that PCRE recog-
- nizes as digits, spaces, or word characters remain the same set as
- before, all with values less than 256. This remains true even when PCRE
- includes Unicode property support, because to do otherwise would slow
- down PCRE in many common cases. If you really want to test for a wider
- sense of, say, "digit", you must use Unicode property tests such as
- \p{Nd}. Note that this also applies to \b, because it is defined in
- terms of \w and \W.
-
- 7. Similarly, characters that match the POSIX named character classes
- are all low-valued characters.
-
- 8. However, the Perl 5.10 horizontal and vertical whitespace matching
- escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
- acters.
-
- 9. Case-insensitive matching applies only to characters whose values
- are less than 128, unless PCRE is built with Unicode property support.
- Even when Unicode property support is available, PCRE still uses its
- own character tables when checking the case of low-valued characters,
- so as not to degrade performance. The Unicode property information is
- used only for characters with higher values. Even when Unicode property
- support is available, PCRE supports case-insensitive matching only when
- there is a one-to-one mapping between a letter's cases. There are a
- small number of many-to-one mappings in Unicode; these are not sup-
- ported by PCRE.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
- Putting an actual email address here seems to have been a spam magnet,
- so I've taken it away. If you want to email me, use my two initials,
- followed by the two digits 10, at the domain cam.ac.uk.
-
-
-REVISION
-
- Last updated: 11 April 2009
- Copyright (c) 1997-2009 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCREBUILD(3) PCREBUILD(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE BUILD-TIME OPTIONS
-
- This document describes the optional features of PCRE that can be
- selected when the library is compiled. It assumes use of the configure
- script, where the optional features are selected or deselected by pro-
- viding options to configure before running the make command. However,
- the same options can be selected in both Unix-like and non-Unix-like
- environments using the GUI facility of CMakeSetup if you are using
- CMake instead of configure to build PCRE.
-
- The complete list of options for configure (which includes the standard
- ones such as the selection of the installation directory) can be
- obtained by running
-
- ./configure --help
-
- The following sections include descriptions of options whose names
- begin with --enable or --disable. These settings specify changes to the
- defaults for the configure command. Because of the way that configure
- works, --enable and --disable always come in pairs, so the complemen-
- tary option always exists as well, but as it specifies the default, it
- is not described.
-
-
-C++ SUPPORT
-
- By default, the configure script will search for a C++ compiler and C++
- header files. If it finds them, it automatically builds the C++ wrapper
- library for PCRE. You can disable this by adding
-
- --disable-cpp
-
- to the configure command.
-
-
-UTF-8 SUPPORT
-
- To build PCRE with support for UTF-8 Unicode character strings, add
-
- --enable-utf8
-
- to the configure command. Of itself, this does not make PCRE treat
- strings as UTF-8. As well as compiling PCRE with this option, you also
- have have to set the PCRE_UTF8 option when you call the pcre_compile()
- function.
-
- If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE
- expects its input to be either ASCII or UTF-8 (depending on the runtime
- option). It is not possible to support both EBCDIC and UTF-8 codes in
- the same version of the library. Consequently, --enable-utf8 and
- --enable-ebcdic are mutually exclusive.
-
-
-UNICODE CHARACTER PROPERTY SUPPORT
-
- UTF-8 support allows PCRE to process character values greater than 255
- in the strings that it handles. On its own, however, it does not pro-
- vide any facilities for accessing the properties of such characters. If
- you want to be able to use the pattern escapes \P, \p, and \X, which
- refer to Unicode character properties, you must add
-
- --enable-unicode-properties
-
- to the configure command. This implies UTF-8 support, even if you have
- not explicitly requested it.
-
- Including Unicode property support adds around 30K of tables to the
- PCRE library. Only the general category properties such as Lu and Nd
- are supported. Details are given in the pcrepattern documentation.
-
-
-CODE VALUE OF NEWLINE
-
- By default, PCRE interprets the linefeed (LF) character as indicating
- the end of a line. This is the normal newline character on Unix-like
- systems. You can compile PCRE to use carriage return (CR) instead, by
- adding
-
- --enable-newline-is-cr
-
- to the configure command. There is also a --enable-newline-is-lf
- option, which explicitly specifies linefeed as the newline character.
-
- Alternatively, you can specify that line endings are to be indicated by
- the two character sequence CRLF. If you want this, add
-
- --enable-newline-is-crlf
-
- to the configure command. There is a fourth option, specified by
-
- --enable-newline-is-anycrlf
-
- which causes PCRE to recognize any of the three sequences CR, LF, or
- CRLF as indicating a line ending. Finally, a fifth option, specified by
-
- --enable-newline-is-any
-
- causes PCRE to recognize any Unicode newline sequence.
-
- Whatever line ending convention is selected when PCRE is built can be
- overridden when the library functions are called. At build time it is
- conventional to use the standard for your operating system.
-
-
-WHAT \R MATCHES
-
- By default, the sequence \R in a pattern matches any Unicode newline
- sequence, whatever has been selected as the line ending sequence. If
- you specify
-
- --enable-bsr-anycrlf
-
- the default is changed so that \R matches only CR, LF, or CRLF. What-
- ever is selected when PCRE is built can be overridden when the library
- functions are called.
-
-
-BUILDING SHARED AND STATIC LIBRARIES
-
- The PCRE building process uses libtool to build both shared and static
- Unix libraries by default. You can suppress one of these by adding one
- of
-
- --disable-shared
- --disable-static
-
- to the configure command, as required.
-
-
-POSIX MALLOC USAGE
-
- When PCRE is called through the POSIX interface (see the pcreposix doc-
- umentation), additional working storage is required for holding the
- pointers to capturing substrings, because PCRE requires three integers
- per substring, whereas the POSIX interface provides only two. If the
- number of expected substrings is small, the wrapper function uses space
- on the stack, because this is faster than using malloc() for each call.
- The default threshold above which the stack is no longer used is 10; it
- can be changed by adding a setting such as
-
- --with-posix-malloc-threshold=20
-
- to the configure command.
-
-
-HANDLING VERY LARGE PATTERNS
-
- Within a compiled pattern, offset values are used to point from one
- part to another (for example, from an opening parenthesis to an alter-
- nation metacharacter). By default, two-byte values are used for these
- offsets, leading to a maximum size for a compiled pattern of around
- 64K. This is sufficient to handle all but the most gigantic patterns.
- Nevertheless, some people do want to process enormous patterns, so it
- is possible to compile PCRE to use three-byte or four-byte offsets by
- adding a setting such as
-
- --with-link-size=3
-
- to the configure command. The value given must be 2, 3, or 4. Using
- longer offsets slows down the operation of PCRE because it has to load
- additional bytes when handling them.
-
-
-AVOIDING EXCESSIVE STACK USAGE
-
- When matching with the pcre_exec() function, PCRE implements backtrack-
- ing by making recursive calls to an internal function called match().
- In environments where the size of the stack is limited, this can se-
- verely limit PCRE's operation. (The Unix environment does not usually
- suffer from this problem, but it may sometimes be necessary to increase
- the maximum stack size. There is a discussion in the pcrestack docu-
- mentation.) An alternative approach to recursion that uses memory from
- the heap to remember data, instead of using recursive function calls,
- has been implemented to work round the problem of limited stack size.
- If you want to build a version of PCRE that works this way, add
-
- --disable-stack-for-recursion
-
- to the configure command. With this configuration, PCRE will use the
- pcre_stack_malloc and pcre_stack_free variables to call memory manage-
- ment functions. By default these point to malloc() and free(), but you
- can replace the pointers so that your own functions are used.
-
- Separate functions are provided rather than using pcre_malloc and
- pcre_free because the usage is very predictable: the block sizes
- requested are always the same, and the blocks are always freed in
- reverse order. A calling program might be able to implement optimized
- functions that perform better than malloc() and free(). PCRE runs
- noticeably more slowly when built in this way. This option affects only
- the pcre_exec() function; it is not relevant for the the
- pcre_dfa_exec() function.
-
-
-LIMITING PCRE RESOURCE USAGE
-
- Internally, PCRE has a function called match(), which it calls repeat-
- edly (sometimes recursively) when matching a pattern with the
- pcre_exec() function. By controlling the maximum number of times this
- function may be called during a single matching operation, a limit can
- be placed on the resources used by a single call to pcre_exec(). The
- limit can be changed at run time, as described in the pcreapi documen-
- tation. The default is 10 million, but this can be changed by adding a
- setting such as
-
- --with-match-limit=500000
-
- to the configure command. This setting has no effect on the
- pcre_dfa_exec() matching function.
-
- In some environments it is desirable to limit the depth of recursive
- calls of match() more strictly than the total number of calls, in order
- to restrict the maximum amount of stack (or heap, if --disable-stack-
- for-recursion is specified) that is used. A second limit controls this;
- it defaults to the value that is set for --with-match-limit, which
- imposes no additional constraints. However, you can set a lower limit
- by adding, for example,
-
- --with-match-limit-recursion=10000
-
- to the configure command. This value can also be overridden at run
- time.
-
-
-CREATING CHARACTER TABLES AT BUILD TIME
-
- PCRE uses fixed tables for processing characters whose code values are
- less than 256. By default, PCRE is built with a set of tables that are
- distributed in the file pcre_chartables.c.dist. These tables are for
- ASCII codes only. If you add
-
- --enable-rebuild-chartables
-
- to the configure command, the distributed tables are no longer used.
- Instead, a program called dftables is compiled and run. This outputs
- the source for new set of tables, created in the default locale of your
- C runtime system. (This method of replacing the tables does not work if
- you are cross compiling, because dftables is run on the local host. If
- you need to create alternative tables when cross compiling, you will
- have to do so "by hand".)
-
-
-USING EBCDIC CODE
-
- PCRE assumes by default that it will run in an environment where the
- character code is ASCII (or Unicode, which is a superset of ASCII).
- This is the case for most computer operating systems. PCRE can, how-
- ever, be compiled to run in an EBCDIC environment by adding
-
- --enable-ebcdic
-
- to the configure command. This setting implies --enable-rebuild-charta-
- bles. You should only use it if you know that you are in an EBCDIC
- environment (for example, an IBM mainframe operating system). The
- --enable-ebcdic option is incompatible with --enable-utf8.
-
-
-PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT
-
- By default, pcregrep reads all files as plain text. You can build it so
- that it recognizes files whose names end in .gz or .bz2, and reads them
- with libz or libbz2, respectively, by adding one or both of
-
- --enable-pcregrep-libz
- --enable-pcregrep-libbz2
-
- to the configure command. These options naturally require that the rel-
- evant libraries are installed on your system. Configuration will fail
- if they are not.
-
-
-PCRETEST OPTION FOR LIBREADLINE SUPPORT
-
- If you add
-
- --enable-pcretest-libreadline
-
- to the configure command, pcretest is linked with the libreadline
- library, and when its input is from a terminal, it reads it using the
- readline() function. This provides line-editing and history facilities.
- Note that libreadline is GPL-licenced, so if you distribute a binary of
- pcretest linked in this way, there may be licensing issues.
-
- Setting this option causes the -lreadline option to be added to the
- pcretest build. In many operating environments with a sytem-installed
- libreadline this is sufficient. However, in some environments (e.g. if
- an unmodified distribution version of readline is in use), some extra
- configuration may be necessary. The INSTALL file for libreadline says
- this:
-
- "Readline uses the termcap functions, but does not link with the
- termcap or curses library itself, allowing applications which link
- with readline the to choose an appropriate library."
-
- If your environment has not been set up so that an appropriate library
- is automatically included, you may need to add something like
-
- LIBS="-ncurses"
-
- immediately before the configure command.
-
-
-SEE ALSO
-
- pcreapi(3), pcre_config(3).
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 17 March 2009
- Copyright (c) 1997-2009 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCREMATCHING(3) PCREMATCHING(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE MATCHING ALGORITHMS
-
- This document describes the two different algorithms that are available
- in PCRE for matching a compiled regular expression against a given sub-
- ject string. The "standard" algorithm is the one provided by the
- pcre_exec() function. This works in the same was as Perl's matching
- function, and provides a Perl-compatible matching operation.
-
- An alternative algorithm is provided by the pcre_dfa_exec() function;
- this operates in a different way, and is not Perl-compatible. It has
- advantages and disadvantages compared with the standard algorithm, and
- these are described below.
-
- When there is only one possible way in which a given subject string can
- match a pattern, the two algorithms give the same answer. A difference
- arises, however, when there are multiple possibilities. For example, if
- the pattern
-
- ^<.*>
-
- is matched against the string
-
-
-
- there are three possible answers. The standard algorithm finds only one
- of them, whereas the alternative algorithm finds all three.
-
-
-REGULAR EXPRESSIONS AS TREES
-
- The set of strings that are matched by a regular expression can be rep-
- resented as a tree structure. An unlimited repetition in the pattern
- makes the tree of infinite size, but it is still a tree. Matching the
- pattern to a given subject string (from a given starting point) can be
- thought of as a search of the tree. There are two ways to search a
- tree: depth-first and breadth-first, and these correspond to the two
- matching algorithms provided by PCRE.
-
-
-THE STANDARD MATCHING ALGORITHM
-
- In the terminology of Jeffrey Friedl's book "Mastering Regular Expres-
- sions", the standard algorithm is an "NFA algorithm". It conducts a
- depth-first search of the pattern tree. That is, it proceeds along a
- single path through the tree, checking that the subject matches what is
- required. When there is a mismatch, the algorithm tries any alterna-
- tives at the current point, and if they all fail, it backs up to the
- previous branch point in the tree, and tries the next alternative
- branch at that level. This often involves backing up (moving to the
- left) in the subject string as well. The order in which repetition
- branches are tried is controlled by the greedy or ungreedy nature of
- the quantifier.
-
- If a leaf node is reached, a matching string has been found, and at
- that point the algorithm stops. Thus, if there is more than one possi-
- ble match, this algorithm returns the first one that it finds. Whether
- this is the shortest, the longest, or some intermediate length depends
- on the way the greedy and ungreedy repetition quantifiers are specified
- in the pattern.
-
- Because it ends up with a single path through the tree, it is rela-
- tively straightforward for this algorithm to keep track of the sub-
- strings that are matched by portions of the pattern in parentheses.
- This provides support for capturing parentheses and back references.
-
-
-THE ALTERNATIVE MATCHING ALGORITHM
-
- This algorithm conducts a breadth-first search of the tree. Starting
- from the first matching point in the subject, it scans the subject
- string from left to right, once, character by character, and as it does
- this, it remembers all the paths through the tree that represent valid
- matches. In Friedl's terminology, this is a kind of "DFA algorithm",
- though it is not implemented as a traditional finite state machine (it
- keeps multiple states active simultaneously).
-
- The scan continues until either the end of the subject is reached, or
- there are no more unterminated paths. At this point, terminated paths
- represent the different matching possibilities (if there are none, the
- match has failed). Thus, if there is more than one possible match,
- this algorithm finds all of them, and in particular, it finds the long-
- est. In PCRE, there is an option to stop the algorithm after the first
- match (which is necessarily the shortest) has been found.
-
- Note that all the matches that are found start at the same point in the
- subject. If the pattern
-
- cat(er(pillar)?)
-
- is matched against the string "the caterpillar catchment", the result
- will be the three strings "cat", "cater", and "caterpillar" that start
- at the fourth character of the subject. The algorithm does not automat-
- ically move on to find matches that start at later positions.
-
- There are a number of features of PCRE regular expressions that are not
- supported by the alternative matching algorithm. They are as follows:
-
- 1. Because the algorithm finds all possible matches, the greedy or
- ungreedy nature of repetition quantifiers is not relevant. Greedy and
- ungreedy quantifiers are treated in exactly the same way. However, pos-
- sessive quantifiers can make a difference when what follows could also
- match what is quantified, for example in a pattern like this:
-
- ^a++\w!
-
- This pattern matches "aaab!" but not "aaa!", which would be matched by
- a non-possessive quantifier. Similarly, if an atomic group is present,
- it is matched as if it were a standalone pattern at the current point,
- and the longest match is then "locked in" for the rest of the overall
- pattern.
-
- 2. When dealing with multiple paths through the tree simultaneously, it
- is not straightforward to keep track of captured substrings for the
- different matching possibilities, and PCRE's implementation of this
- algorithm does not attempt to do this. This means that no captured sub-
- strings are available.
-
- 3. Because no substrings are captured, back references within the pat-
- tern are not supported, and cause errors if encountered.
-
- 4. For the same reason, conditional expressions that use a backrefer-
- ence as the condition or test for a specific group recursion are not
- supported.
-
- 5. Because many paths through the tree may be active, the \K escape
- sequence, which resets the start of the match when encountered (but may
- be on some paths and not on others), is not supported. It causes an
- error if encountered.
-
- 6. Callouts are supported, but the value of the capture_top field is
- always 1, and the value of the capture_last field is always -1.
-
- 7. The \C escape sequence, which (in the standard algorithm) matches a
- single byte, even in UTF-8 mode, is not supported because the alterna-
- tive algorithm moves through the subject string one character at a
- time, for all active paths through the tree.
-
- 8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE)
- are not supported. (*FAIL) is supported, and behaves like a failing
- negative assertion.
-
-
-ADVANTAGES OF THE ALTERNATIVE ALGORITHM
-
- Using the alternative matching algorithm provides the following advan-
- tages:
-
- 1. All possible matches (at a single point in the subject) are automat-
- ically found, and in particular, the longest match is found. To find
- more than one match using the standard algorithm, you have to do kludgy
- things with callouts.
-
- 2. There is much better support for partial matching. The restrictions
- on the content of the pattern that apply when using the standard algo-
- rithm for partial matching do not apply to the alternative algorithm.
- For non-anchored patterns, the starting position of a partial match is
- available.
-
- 3. Because the alternative algorithm scans the subject string just
- once, and never needs to backtrack, it is possible to pass very long
- subject strings to the matching function in several pieces, checking
- for partial matching each time.
-
-
-DISADVANTAGES OF THE ALTERNATIVE ALGORITHM
-
- The alternative algorithm suffers from a number of disadvantages:
-
- 1. It is substantially slower than the standard algorithm. This is
- partly because it has to search for all possible matches, but is also
- because it is less susceptible to optimization.
-
- 2. Capturing parentheses and back references are not supported.
-
- 3. Although atomic groups are supported, their use does not provide the
- performance advantage that it does for the standard algorithm.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 19 April 2008
- Copyright (c) 1997-2008 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCREAPI(3) PCREAPI(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE NATIVE API
-
- #include
-
- pcre *pcre_compile(const char *pattern, int options,
- const char **errptr, int *erroffset,
- const unsigned char *tableptr);
-
- pcre *pcre_compile2(const char *pattern, int options,
- int *errorcodeptr,
- const char **errptr, int *erroffset,
- const unsigned char *tableptr);
-
- pcre_extra *pcre_study(const pcre *code, int options,
- const char **errptr);
-
- int pcre_exec(const pcre *code, const pcre_extra *extra,
- const char *subject, int length, int startoffset,
- int options, int *ovector, int ovecsize);
-
- int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
- const char *subject, int length, int startoffset,
- int options, int *ovector, int ovecsize,
- int *workspace, int wscount);
-
- int pcre_copy_named_substring(const pcre *code,
- const char *subject, int *ovector,
- int stringcount, const char *stringname,
- char *buffer, int buffersize);
-
- int pcre_copy_substring(const char *subject, int *ovector,
- int stringcount, int stringnumber, char *buffer,
- int buffersize);
-
- int pcre_get_named_substring(const pcre *code,
- const char *subject, int *ovector,
- int stringcount, const char *stringname,
- const char **stringptr);
-
- int pcre_get_stringnumber(const pcre *code,
- const char *name);
-
- int pcre_get_stringtable_entries(const pcre *code,
- const char *name, char **first, char **last);
-
- int pcre_get_substring(const char *subject, int *ovector,
- int stringcount, int stringnumber,
- const char **stringptr);
-
- int pcre_get_substring_list(const char *subject,
- int *ovector, int stringcount, const char ***listptr);
-
- void pcre_free_substring(const char *stringptr);
-
- void pcre_free_substring_list(const char **stringptr);
-
- const unsigned char *pcre_maketables(void);
-
- int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
- int what, void *where);
-
- int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
-
- int pcre_refcount(pcre *code, int adjust);
-
- int pcre_config(int what, void *where);
-
- char *pcre_version(void);
-
- void *(*pcre_malloc)(size_t);
-
- void (*pcre_free)(void *);
-
- void *(*pcre_stack_malloc)(size_t);
-
- void (*pcre_stack_free)(void *);
-
- int (*pcre_callout)(pcre_callout_block *);
-
-
-PCRE API OVERVIEW
-
- PCRE has its own native API, which is described in this document. There
- are also some wrapper functions that correspond to the POSIX regular
- expression API. These are described in the pcreposix documentation.
- Both of these APIs define a set of C function calls. A C++ wrapper is
- distributed with PCRE. It is documented in the pcrecpp page.
-
- The native API C function prototypes are defined in the header file
- pcre.h, and on Unix systems the library itself is called libpcre. It
- can normally be accessed by adding -lpcre to the command for linking an
- application that uses PCRE. The header file defines the macros
- PCRE_MAJOR and PCRE_MINOR to contain the major and minor release num-
- bers for the library. Applications can use these to include support
- for different releases of PCRE.
-
- The functions pcre_compile(), pcre_compile2(), pcre_study(), and
- pcre_exec() are used for compiling and matching regular expressions in
- a Perl-compatible manner. A sample program that demonstrates the sim-
- plest way of using them is provided in the file called pcredemo.c in
- the source distribution. The pcresample documentation describes how to
- compile and run it.
-
- A second matching function, pcre_dfa_exec(), which is not Perl-compati-
- ble, is also provided. This uses a different algorithm for the match-
- ing. The alternative algorithm finds all possible matches (at a given
- point in the subject), and scans the subject just once. However, this
- algorithm does not return captured substrings. A description of the two
- matching algorithms and their advantages and disadvantages is given in
- the pcrematching documentation.
-
- In addition to the main compiling and matching functions, there are
- convenience functions for extracting captured substrings from a subject
- string that is matched by pcre_exec(). They are:
-
- pcre_copy_substring()
- pcre_copy_named_substring()
- pcre_get_substring()
- pcre_get_named_substring()
- pcre_get_substring_list()
- pcre_get_stringnumber()
- pcre_get_stringtable_entries()
-
- pcre_free_substring() and pcre_free_substring_list() are also provided,
- to free the memory used for extracted strings.
-
- The function pcre_maketables() is used to build a set of character
- tables in the current locale for passing to pcre_compile(),
- pcre_exec(), or pcre_dfa_exec(). This is an optional facility that is
- provided for specialist use. Most commonly, no special tables are
- passed, in which case internal tables that are generated when PCRE is
- built are used.
-
- The function pcre_fullinfo() is used to find out information about a
- compiled pattern; pcre_info() is an obsolete version that returns only
- some of the available information, but is retained for backwards com-
- patibility. The function pcre_version() returns a pointer to a string
- containing the version of PCRE and its date of release.
-
- The function pcre_refcount() maintains a reference count in a data
- block containing a compiled pattern. This is provided for the benefit
- of object-oriented applications.
-
- The global variables pcre_malloc and pcre_free initially contain the
- entry points of the standard malloc() and free() functions, respec-
- tively. PCRE calls the memory management functions via these variables,
- so a calling program can replace them if it wishes to intercept the
- calls. This should be done before calling any PCRE functions.
-
- The global variables pcre_stack_malloc and pcre_stack_free are also
- indirections to memory management functions. These special functions
- are used only when PCRE is compiled to use the heap for remembering
- data, instead of recursive function calls, when running the pcre_exec()
- function. See the pcrebuild documentation for details of how to do
- this. It is a non-standard way of building PCRE, for use in environ-
- ments that have limited stacks. Because of the greater use of memory
- management, it runs more slowly. Separate functions are provided so
- that special-purpose external code can be used for this case. When
- used, these functions are always called in a stack-like manner (last
- obtained, first freed), and always for memory blocks of the same size.
- There is a discussion about PCRE's stack usage in the pcrestack docu-
- mentation.
-
- The global variable pcre_callout initially contains NULL. It can be set
- by the caller to a "callout" function, which PCRE will then call at
- specified points during a matching operation. Details are given in the
- pcrecallout documentation.
-
-
-NEWLINES
-
- PCRE supports five different conventions for indicating line breaks in
- strings: a single CR (carriage return) character, a single LF (line-
- feed) character, the two-character sequence CRLF, any of the three pre-
- ceding, or any Unicode newline sequence. The Unicode newline sequences
- are the three just mentioned, plus the single characters VT (vertical
- tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
- separator, U+2028), and PS (paragraph separator, U+2029).
-
- Each of the first three conventions is used by at least one operating
- system as its standard newline sequence. When PCRE is built, a default
- can be specified. The default default is LF, which is the Unix stan-
- dard. When PCRE is run, the default can be overridden, either when a
- pattern is compiled, or when it is matched.
-
- At compile time, the newline convention can be specified by the options
- argument of pcre_compile(), or it can be specified by special text at
- the start of the pattern itself; this overrides any other settings. See
- the pcrepattern page for details of the special character sequences.
-
- In the PCRE documentation the word "newline" is used to mean "the char-
- acter or pair of characters that indicate a line break". The choice of
- newline convention affects the handling of the dot, circumflex, and
- dollar metacharacters, the handling of #-comments in /x mode, and, when
- CRLF is a recognized line ending sequence, the match position advance-
- ment for a non-anchored pattern. There is more detail about this in the
- section on pcre_exec() options below.
-
- The choice of newline convention does not affect the interpretation of
- the \n or \r escape sequences, nor does it affect what \R matches,
- which is controlled in a similar way, but by separate options.
-
-
-MULTITHREADING
-
- The PCRE functions can be used in multi-threading applications, with
- the proviso that the memory management functions pointed to by
- pcre_malloc, pcre_free, pcre_stack_malloc, and pcre_stack_free, and the
- callout function pointed to by pcre_callout, are shared by all threads.
-
- The compiled form of a regular expression is not altered during match-
- ing, so the same compiled pattern can safely be used by several threads
- at once.
-
-
-SAVING PRECOMPILED PATTERNS FOR LATER USE
-
- The compiled form of a regular expression can be saved and re-used at a
- later time, possibly by a different program, and even on a host other
- than the one on which it was compiled. Details are given in the
- pcreprecompile documentation. However, compiling a regular expression
- with one version of PCRE for use with a different version is not guar-
- anteed to work and may cause crashes.
-
-
-CHECKING BUILD-TIME OPTIONS
-
- int pcre_config(int what, void *where);
-
- The function pcre_config() makes it possible for a PCRE client to dis-
- cover which optional features have been compiled into the PCRE library.
- The pcrebuild documentation has more details about these optional fea-
- tures.
-
- The first argument for pcre_config() is an integer, specifying which
- information is required; the second argument is a pointer to a variable
- into which the information is placed. The following information is
- available:
-
- PCRE_CONFIG_UTF8
-
- The output is an integer that is set to one if UTF-8 support is avail-
- able; otherwise it is set to zero.
-
- PCRE_CONFIG_UNICODE_PROPERTIES
-
- The output is an integer that is set to one if support for Unicode
- character properties is available; otherwise it is set to zero.
-
- PCRE_CONFIG_NEWLINE
-
- The output is an integer whose value specifies the default character
- sequence that is recognized as meaning "newline". The four values that
- are supported are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF,
- and -1 for ANY. Though they are derived from ASCII, the same values
- are returned in EBCDIC environments. The default should normally corre-
- spond to the standard sequence for your operating system.
-
- PCRE_CONFIG_BSR
-
- The output is an integer whose value indicates what character sequences
- the \R escape sequence matches by default. A value of 0 means that \R
- matches any Unicode line ending sequence; a value of 1 means that \R
- matches only CR, LF, or CRLF. The default can be overridden when a pat-
- tern is compiled or matched.
-
- PCRE_CONFIG_LINK_SIZE
-
- The output is an integer that contains the number of bytes used for
- internal linkage in compiled regular expressions. The value is 2, 3, or
- 4. Larger values allow larger regular expressions to be compiled, at
- the expense of slower matching. The default value of 2 is sufficient
- for all but the most massive patterns, since it allows the compiled
- pattern to be up to 64K in size.
-
- PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
-
- The output is an integer that contains the threshold above which the
- POSIX interface uses malloc() for output vectors. Further details are
- given in the pcreposix documentation.
-
- PCRE_CONFIG_MATCH_LIMIT
-
- The output is a long integer that gives the default limit for the num-
- ber of internal matching function calls in a pcre_exec() execution.
- Further details are given with pcre_exec() below.
-
- PCRE_CONFIG_MATCH_LIMIT_RECURSION
-
- The output is a long integer that gives the default limit for the depth
- of recursion when calling the internal matching function in a
- pcre_exec() execution. Further details are given with pcre_exec()
- below.
-
- PCRE_CONFIG_STACKRECURSE
-
- The output is an integer that is set to one if internal recursion when
- running pcre_exec() is implemented by recursive function calls that use
- the stack to remember their state. This is the usual way that PCRE is
- compiled. The output is zero if PCRE was compiled to use blocks of data
- on the heap instead of recursive function calls. In this case,
- pcre_stack_malloc and pcre_stack_free are called to manage memory
- blocks on the heap, thus avoiding the use of the stack.
-
-
-COMPILING A PATTERN
-
- pcre *pcre_compile(const char *pattern, int options,
- const char **errptr, int *erroffset,
- const unsigned char *tableptr);
-
- pcre *pcre_compile2(const char *pattern, int options,
- int *errorcodeptr,
- const char **errptr, int *erroffset,
- const unsigned char *tableptr);
-
- Either of the functions pcre_compile() or pcre_compile2() can be called
- to compile a pattern into an internal form. The only difference between
- the two interfaces is that pcre_compile2() has an additional argument,
- errorcodeptr, via which a numerical error code can be returned.
-
- The pattern is a C string terminated by a binary zero, and is passed in
- the pattern argument. A pointer to a single block of memory that is
- obtained via pcre_malloc is returned. This contains the compiled code
- and related data. The pcre type is defined for the returned block; this
- is a typedef for a structure whose contents are not externally defined.
- It is up to the caller to free the memory (via pcre_free) when it is no
- longer required.
-
- Although the compiled code of a PCRE regex is relocatable, that is, it
- does not depend on memory location, the complete pcre data block is not
- fully relocatable, because it may contain a copy of the tableptr argu-
- ment, which is an address (see below).
-
- The options argument contains various bit settings that affect the com-
- pilation. It should be zero if no options are required. The available
- options are described below. Some of them (in particular, those that
- are compatible with Perl, but also some others) can also be set and
- unset from within the pattern (see the detailed description in the
- pcrepattern documentation). For those options that can be different in
- different parts of the pattern, the contents of the options argument
- specifies their initial settings at the start of compilation and execu-
- tion. The PCRE_ANCHORED and PCRE_NEWLINE_xxx options can be set at the
- time of matching as well as at compile time.
-
- If errptr is NULL, pcre_compile() returns NULL immediately. Otherwise,
- if compilation of a pattern fails, pcre_compile() returns NULL, and
- sets the variable pointed to by errptr to point to a textual error mes-
- sage. This is a static string that is part of the library. You must not
- try to free it. The offset from the start of the pattern to the charac-
- ter where the error was discovered is placed in the variable pointed to
- by erroffset, which must not be NULL. If it is, an immediate error is
- given.
-
- If pcre_compile2() is used instead of pcre_compile(), and the error-
- codeptr argument is not NULL, a non-zero error code number is returned
- via this argument in the event of an error. This is in addition to the
- textual error message. Error codes and messages are listed below.
-
- If the final argument, tableptr, is NULL, PCRE uses a default set of
- character tables that are built when PCRE is compiled, using the
- default C locale. Otherwise, tableptr must be an address that is the
- result of a call to pcre_maketables(). This value is stored with the
- compiled pattern, and used again by pcre_exec(), unless another table
- pointer is passed to it. For more discussion, see the section on locale
- support below.
-
- This code fragment shows a typical straightforward call to pcre_com-
- pile():
-
- pcre *re;
- const char *error;
- int erroffset;
- re = pcre_compile(
- "^A.*Z", /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-
- The following names for option bits are defined in the pcre.h header
- file:
-
- PCRE_ANCHORED
-
- If this bit is set, the pattern is forced to be "anchored", that is, it
- is constrained to match only at the first matching point in the string
- that is being searched (the "subject string"). This effect can also be
- achieved by appropriate constructs in the pattern itself, which is the
- only way to do it in Perl.
-
- PCRE_AUTO_CALLOUT
-
- If this bit is set, pcre_compile() automatically inserts callout items,
- all with number 255, before each pattern item. For discussion of the
- callout facility, see the pcrecallout documentation.
-
- PCRE_BSR_ANYCRLF
- PCRE_BSR_UNICODE
-
- These options (which are mutually exclusive) control what the \R escape
- sequence matches. The choice is either to match only CR, LF, or CRLF,
- or to match any Unicode newline sequence. The default is specified when
- PCRE is built. It can be overridden from within the pattern, or by set-
- ting an option when a compiled pattern is matched.
-
- PCRE_CASELESS
-
- If this bit is set, letters in the pattern match both upper and lower
- case letters. It is equivalent to Perl's /i option, and it can be
- changed within a pattern by a (?i) option setting. In UTF-8 mode, PCRE
- always understands the concept of case for characters whose values are
- less than 128, so caseless matching is always possible. For characters
- with higher values, the concept of case is supported if PCRE is com-
- piled with Unicode property support, but not otherwise. If you want to
- use caseless matching for characters 128 and above, you must ensure
- that PCRE is compiled with Unicode property support as well as with
- UTF-8 support.
-
- PCRE_DOLLAR_ENDONLY
-
- If this bit is set, a dollar metacharacter in the pattern matches only
- at the end of the subject string. Without this option, a dollar also
- matches immediately before a newline at the end of the string (but not
- before any other newlines). The PCRE_DOLLAR_ENDONLY option is ignored
- if PCRE_MULTILINE is set. There is no equivalent to this option in
- Perl, and no way to set it within a pattern.
-
- PCRE_DOTALL
-
- If this bit is set, a dot metacharater in the pattern matches all char-
- acters, including those that indicate newline. Without it, a dot does
- not match when the current position is at a newline. This option is
- equivalent to Perl's /s option, and it can be changed within a pattern
- by a (?s) option setting. A negative class such as [^a] always matches
- newline characters, independent of the setting of this option.
-
- PCRE_DUPNAMES
-
- If this bit is set, names used to identify capturing subpatterns need
- not be unique. This can be helpful for certain types of pattern when it
- is known that only one instance of the named subpattern can ever be
- matched. There are more details of named subpatterns below; see also
- the pcrepattern documentation.
-
- PCRE_EXTENDED
-
- If this bit is set, whitespace data characters in the pattern are
- totally ignored except when escaped or inside a character class. White-
- space does not include the VT character (code 11). In addition, charac-
- ters between an unescaped # outside a character class and the next new-
- line, inclusive, are also ignored. This is equivalent to Perl's /x
- option, and it can be changed within a pattern by a (?x) option set-
- ting.
-
- This option makes it possible to include comments inside complicated
- patterns. Note, however, that this applies only to data characters.
- Whitespace characters may never appear within special character
- sequences in a pattern, for example within the sequence (?( which
- introduces a conditional subpattern.
-
- PCRE_EXTRA
-
- This option was invented in order to turn on additional functionality
- of PCRE that is incompatible with Perl, but it is currently of very
- little use. When set, any backslash in a pattern that is followed by a
- letter that has no special meaning causes an error, thus reserving
- these combinations for future expansion. By default, as in Perl, a
- backslash followed by a letter with no special meaning is treated as a
- literal. (Perl can, however, be persuaded to give a warning for this.)
- There are at present no other features controlled by this option. It
- can also be set by a (?X) option setting within a pattern.
-
- PCRE_FIRSTLINE
-
- If this option is set, an unanchored pattern is required to match
- before or at the first newline in the subject string, though the
- matched text may continue over the newline.
-
- PCRE_JAVASCRIPT_COMPAT
-
- If this option is set, PCRE's behaviour is changed in some ways so that
- it is compatible with JavaScript rather than Perl. The changes are as
- follows:
-
- (1) A lone closing square bracket in a pattern causes a compile-time
- error, because this is illegal in JavaScript (by default it is treated
- as a data character). Thus, the pattern AB]CD becomes illegal when this
- option is set.
-
- (2) At run time, a back reference to an unset subpattern group matches
- an empty string (by default this causes the current matching alterna-
- tive to fail). A pattern such as (\1)(a) succeeds when this option is
- set (assuming it can find an "a" in the subject), whereas it fails by
- default, for Perl compatibility.
-
- PCRE_MULTILINE
-
- By default, PCRE treats the subject string as consisting of a single
- line of characters (even if it actually contains newlines). The "start
- of line" metacharacter (^) matches only at the start of the string,
- while the "end of line" metacharacter ($) matches only at the end of
- the string, or before a terminating newline (unless PCRE_DOLLAR_ENDONLY
- is set). This is the same as Perl.
-
- When PCRE_MULTILINE it is set, the "start of line" and "end of line"
- constructs match immediately following or immediately before internal
- newlines in the subject string, respectively, as well as at the very
- start and end. This is equivalent to Perl's /m option, and it can be
- changed within a pattern by a (?m) option setting. If there are no new-
- lines in a subject string, or no occurrences of ^ or $ in a pattern,
- setting PCRE_MULTILINE has no effect.
-
- PCRE_NEWLINE_CR
- PCRE_NEWLINE_LF
- PCRE_NEWLINE_CRLF
- PCRE_NEWLINE_ANYCRLF
- PCRE_NEWLINE_ANY
-
- These options override the default newline definition that was chosen
- when PCRE was built. Setting the first or the second specifies that a
- newline is indicated by a single character (CR or LF, respectively).
- Setting PCRE_NEWLINE_CRLF specifies that a newline is indicated by the
- two-character CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies
- that any of the three preceding sequences should be recognized. Setting
- PCRE_NEWLINE_ANY specifies that any Unicode newline sequence should be
- recognized. The Unicode newline sequences are the three just mentioned,
- plus the single characters VT (vertical tab, U+000B), FF (formfeed,
- U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
- (paragraph separator, U+2029). The last two are recognized only in
- UTF-8 mode.
-
- The newline setting in the options word uses three bits that are
- treated as a number, giving eight possibilities. Currently only six are
- used (default plus the five values above). This means that if you set
- more than one newline option, the combination may or may not be sensi-
- ble. For example, PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to
- PCRE_NEWLINE_CRLF, but other combinations may yield unused numbers and
- cause an error.
-
- The only time that a line break is specially recognized when compiling
- a pattern is if PCRE_EXTENDED is set, and an unescaped # outside a
- character class is encountered. This indicates a comment that lasts
- until after the next line break sequence. In other circumstances, line
- break sequences are treated as literal data, except that in
- PCRE_EXTENDED mode, both CR and LF are treated as whitespace characters
- and are therefore ignored.
-
- The newline option that is set at compile time becomes the default that
- is used for pcre_exec() and pcre_dfa_exec(), but it can be overridden.
-
- PCRE_NO_AUTO_CAPTURE
-
- If this option is set, it disables the use of numbered capturing paren-
- theses in the pattern. Any opening parenthesis that is not followed by
- ? behaves as if it were followed by ?: but named parentheses can still
- be used for capturing (and they acquire numbers in the usual way).
- There is no equivalent of this option in Perl.
-
- PCRE_UNGREEDY
-
- This option inverts the "greediness" of the quantifiers so that they
- are not greedy by default, but become greedy if followed by "?". It is
- not compatible with Perl. It can also be set by a (?U) option setting
- within the pattern.
-
- PCRE_UTF8
-
- This option causes PCRE to regard both the pattern and the subject as
- strings of UTF-8 characters instead of single-byte character strings.
- However, it is available only when PCRE is built to include UTF-8 sup-
- port. If not, the use of this option provokes an error. Details of how
- this option changes the behaviour of PCRE are given in the section on
- UTF-8 support in the main pcre page.
-
- PCRE_NO_UTF8_CHECK
-
- When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
- automatically checked. There is a discussion about the validity of
- UTF-8 strings in the main pcre page. If an invalid UTF-8 sequence of
- bytes is found, pcre_compile() returns an error. If you already know
- that your pattern is valid, and you want to skip this check for perfor-
- mance reasons, you can set the PCRE_NO_UTF8_CHECK option. When it is
- set, the effect of passing an invalid UTF-8 string as a pattern is
- undefined. It may cause your program to crash. Note that this option
- can also be passed to pcre_exec() and pcre_dfa_exec(), to suppress the
- UTF-8 validity checking of subject strings.
-
-
-COMPILATION ERROR CODES
-
- The following table lists the error codes than may be returned by
- pcre_compile2(), along with the error messages that may be returned by
- both compiling functions. As PCRE has developed, some error codes have
- fallen out of use. To avoid confusion, they have not been re-used.
-
- 0 no error
- 1 \ at end of pattern
- 2 \c at end of pattern
- 3 unrecognized character follows \
- 4 numbers out of order in {} quantifier
- 5 number too big in {} quantifier
- 6 missing terminating ] for character class
- 7 invalid escape sequence in character class
- 8 range out of order in character class
- 9 nothing to repeat
- 10 [this code is not in use]
- 11 internal error: unexpected repeat
- 12 unrecognized character after (? or (?-
- 13 POSIX named classes are supported only within a class
- 14 missing )
- 15 reference to non-existent subpattern
- 16 erroffset passed as NULL
- 17 unknown option bit(s) set
- 18 missing ) after comment
- 19 [this code is not in use]
- 20 regular expression is too large
- 21 failed to get memory
- 22 unmatched parentheses
- 23 internal error: code overflow
- 24 unrecognized character after (?<
- 25 lookbehind assertion is not fixed length
- 26 malformed number or name after (?(
- 27 conditional group contains more than two branches
- 28 assertion expected after (?(
- 29 (?R or (?[+-]digits must be followed by )
- 30 unknown POSIX class name
- 31 POSIX collating elements are not supported
- 32 this version of PCRE is not compiled with PCRE_UTF8 support
- 33 [this code is not in use]
- 34 character value in \x{...} sequence is too large
- 35 invalid condition (?(0)
- 36 \C not allowed in lookbehind assertion
- 37 PCRE does not support \L, \l, \N, \U, or \u
- 38 number after (?C is > 255
- 39 closing ) for (?C expected
- 40 recursive call could loop indefinitely
- 41 unrecognized character after (?P
- 42 syntax error in subpattern name (missing terminator)
- 43 two named subpatterns have the same name
- 44 invalid UTF-8 string
- 45 support for \P, \p, and \X has not been compiled
- 46 malformed \P or \p sequence
- 47 unknown property name after \P or \p
- 48 subpattern name is too long (maximum 32 characters)
- 49 too many named subpatterns (maximum 10000)
- 50 [this code is not in use]
- 51 octal value is greater than \377 (not in UTF-8 mode)
- 52 internal error: overran compiling workspace
- 53 internal error: previously-checked referenced subpattern not
- found
- 54 DEFINE group contains more than one branch
- 55 repeating a DEFINE group is not allowed
- 56 inconsistent NEWLINE options
- 57 \g is not followed by a braced, angle-bracketed, or quoted
- name/number or by a plain number
- 58 a numbered reference must not be zero
- 59 (*VERB) with an argument is not supported
- 60 (*VERB) not recognized
- 61 number is too big
- 62 subpattern name expected
- 63 digit expected after (?+
- 64 ] is an invalid data character in JavaScript compatibility mode
-
- The numbers 32 and 10000 in errors 48 and 49 are defaults; different
- values may be used if the limits were changed when PCRE was built.
-
-
-STUDYING A PATTERN
-
- pcre_extra *pcre_study(const pcre *code, int options
- const char **errptr);
-
- If a compiled pattern is going to be used several times, it is worth
- spending more time analyzing it in order to speed up the time taken for
- matching. The function pcre_study() takes a pointer to a compiled pat-
- tern as its first argument. If studying the pattern produces additional
- information that will help speed up matching, pcre_study() returns a
- pointer to a pcre_extra block, in which the study_data field points to
- the results of the study.
-
- The returned value from pcre_study() can be passed directly to
- pcre_exec(). However, a pcre_extra block also contains other fields
- that can be set by the caller before the block is passed; these are
- described below in the section on matching a pattern.
-
- If studying the pattern does not produce any additional information
- pcre_study() returns NULL. In that circumstance, if the calling program
- wants to pass any of the other fields to pcre_exec(), it must set up
- its own pcre_extra block.
-
- The second argument of pcre_study() contains option bits. At present,
- no options are defined, and this argument should always be zero.
-
- The third argument for pcre_study() is a pointer for an error message.
- If studying succeeds (even if no data is returned), the variable it
- points to is set to NULL. Otherwise it is set to point to a textual
- error message. This is a static string that is part of the library. You
- must not try to free it. You should test the error pointer for NULL
- after calling pcre_study(), to be sure that it has run successfully.
-
- This is a typical call to pcre_study():
-
- pcre_extra *pe;
- pe = pcre_study(
- re, /* result of pcre_compile() */
- 0, /* no options exist */
- &error); /* set to NULL or points to a message */
-
- At present, studying a pattern is useful only for non-anchored patterns
- that do not have a single fixed starting character. A bitmap of possi-
- ble starting bytes is created.
-
-
-LOCALE SUPPORT
-
- PCRE handles caseless matching, and determines whether characters are
- letters, digits, or whatever, by reference to a set of tables, indexed
- by character value. When running in UTF-8 mode, this applies only to
- characters with codes less than 128. Higher-valued codes never match
- escapes such as \w or \d, but can be tested with \p if PCRE is built
- with Unicode character property support. The use of locales with Uni-
- code is discouraged. If you are handling characters with codes greater
- than 128, you should either use UTF-8 and Unicode, or use locales, but
- not try to mix the two.
-
- PCRE contains an internal set of tables that are used when the final
- argument of pcre_compile() is NULL. These are sufficient for many
- applications. Normally, the internal tables recognize only ASCII char-
- acters. However, when PCRE is built, it is possible to cause the inter-
- nal tables to be rebuilt in the default "C" locale of the local system,
- which may cause them to be different.
-
- The internal tables can always be overridden by tables supplied by the
- application that calls PCRE. These may be created in a different locale
- from the default. As more and more applications change to using Uni-
- code, the need for this locale support is expected to die away.
-
- External tables are built by calling the pcre_maketables() function,
- which has no arguments, in the relevant locale. The result can then be
- passed to pcre_compile() or pcre_exec() as often as necessary. For
- example, to build and use tables that are appropriate for the French
- locale (where accented characters with values greater than 128 are
- treated as letters), the following code could be used:
-
- setlocale(LC_CTYPE, "fr_FR");
- tables = pcre_maketables();
- re = pcre_compile(..., tables);
-
- The locale name "fr_FR" is used on Linux and other Unix-like systems;
- if you are using Windows, the name for the French locale is "french".
-
- When pcre_maketables() runs, the tables are built in memory that is
- obtained via pcre_malloc. It is the caller's responsibility to ensure
- that the memory containing the tables remains available for as long as
- it is needed.
-
- The pointer that is passed to pcre_compile() is saved with the compiled
- pattern, and the same tables are used via this pointer by pcre_study()
- and normally also by pcre_exec(). Thus, by default, for any single pat-
- tern, compilation, studying and matching all happen in the same locale,
- but different patterns can be compiled in different locales.
-
- It is possible to pass a table pointer or NULL (indicating the use of
- the internal tables) to pcre_exec(). Although not intended for this
- purpose, this facility could be used to match a pattern in a different
- locale from the one in which it was compiled. Passing table pointers at
- run time is discussed below in the section on matching a pattern.
-
-
-INFORMATION ABOUT A PATTERN
-
- int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
- int what, void *where);
-
- The pcre_fullinfo() function returns information about a compiled pat-
- tern. It replaces the obsolete pcre_info() function, which is neverthe-
- less retained for backwards compability (and is documented below).
-
- The first argument for pcre_fullinfo() is a pointer to the compiled
- pattern. The second argument is the result of pcre_study(), or NULL if
- the pattern was not studied. The third argument specifies which piece
- of information is required, and the fourth argument is a pointer to a
- variable to receive the data. The yield of the function is zero for
- success, or one of the following negative numbers:
-
- PCRE_ERROR_NULL the argument code was NULL
- the argument where was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADOPTION the value of what was invalid
-
- The "magic number" is placed at the start of each compiled pattern as
- an simple check against passing an arbitrary memory pointer. Here is a
- typical call of pcre_fullinfo(), to obtain the length of the compiled
- pattern:
-
- int rc;
- size_t length;
- rc = pcre_fullinfo(
- re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
- PCRE_INFO_SIZE, /* what is required */
- &length); /* where to put the data */
-
- The possible values for the third argument are defined in pcre.h, and
- are as follows:
-
- PCRE_INFO_BACKREFMAX
-
- Return the number of the highest back reference in the pattern. The
- fourth argument should point to an int variable. Zero is returned if
- there are no back references.
-
- PCRE_INFO_CAPTURECOUNT
-
- Return the number of capturing subpatterns in the pattern. The fourth
- argument should point to an int variable.
-
- PCRE_INFO_DEFAULT_TABLES
-
- Return a pointer to the internal default character tables within PCRE.
- The fourth argument should point to an unsigned char * variable. This
- information call is provided for internal use by the pcre_study() func-
- tion. External callers can cause PCRE to use its internal tables by
- passing a NULL table pointer.
-
- PCRE_INFO_FIRSTBYTE
-
- Return information about the first byte of any matched string, for a
- non-anchored pattern. The fourth argument should point to an int vari-
- able. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name
- is still recognized for backwards compatibility.)
-
- If there is a fixed first byte, for example, from a pattern such as
- (cat|cow|coyote), its value is returned. Otherwise, if either
-
- (a) the pattern was compiled with the PCRE_MULTILINE option, and every
- branch starts with "^", or
-
- (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not
- set (if it were set, the pattern would be anchored),
-
- -1 is returned, indicating that the pattern matches only at the start
- of a subject string or after any newline within the string. Otherwise
- -2 is returned. For anchored patterns, -2 is returned.
-
- PCRE_INFO_FIRSTTABLE
-
- If the pattern was studied, and this resulted in the construction of a
- 256-bit table indicating a fixed set of bytes for the first byte in any
- matching string, a pointer to the table is returned. Otherwise NULL is
- returned. The fourth argument should point to an unsigned char * vari-
- able.
-
- PCRE_INFO_HASCRORLF
-
- Return 1 if the pattern contains any explicit matches for CR or LF
- characters, otherwise 0. The fourth argument should point to an int
- variable. An explicit match is either a literal CR or LF character, or
- \r or \n.
-
- PCRE_INFO_JCHANGED
-
- Return 1 if the (?J) or (?-J) option setting is used in the pattern,
- otherwise 0. The fourth argument should point to an int variable. (?J)
- and (?-J) set and unset the local PCRE_DUPNAMES option, respectively.
-
- PCRE_INFO_LASTLITERAL
-
- Return the value of the rightmost literal byte that must exist in any
- matched string, other than at its start, if such a byte has been
- recorded. The fourth argument should point to an int variable. If there
- is no such byte, -1 is returned. For anchored patterns, a last literal
- byte is recorded only if it follows something of variable length. For
- example, for the pattern /^a\d+z\d+/ the returned value is "z", but for
- /^a\dz\d/ the returned value is -1.
-
- PCRE_INFO_NAMECOUNT
- PCRE_INFO_NAMEENTRYSIZE
- PCRE_INFO_NAMETABLE
-
- PCRE supports the use of named as well as numbered capturing parenthe-
- ses. The names are just an additional way of identifying the parenthe-
- ses, which still acquire numbers. Several convenience functions such as
- pcre_get_named_substring() are provided for extracting captured sub-
- strings by name. It is also possible to extract the data directly, by
- first converting the name to a number in order to access the correct
- pointers in the output vector (described with pcre_exec() below). To do
- the conversion, you need to use the name-to-number map, which is
- described by these three values.
-
- The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT
- gives the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size
- of each entry; both of these return an int value. The entry size
- depends on the length of the longest name. PCRE_INFO_NAMETABLE returns
- a pointer to the first entry of the table (a pointer to char). The
- first two bytes of each entry are the number of the capturing parenthe-
- sis, most significant byte first. The rest of the entry is the corre-
- sponding name, zero terminated. The names are in alphabetical order.
- When PCRE_DUPNAMES is set, duplicate names are in order of their paren-
- theses numbers. For example, consider the following pattern (assume
- PCRE_EXTENDED is set, so white space - including newlines - is
- ignored):
-
- (? (?(\d\d)?\d\d) -
- (?\d\d) - (?\d\d) )
-
- There are four named subpatterns, so the table has four entries, and
- each entry in the table is eight bytes long. The table is as follows,
- with non-printing bytes shows in hexadecimal, and undefined bytes shown
- as ??:
-
- 00 01 d a t e 00 ??
- 00 05 d a y 00 ?? ??
- 00 04 m o n t h 00
- 00 02 y e a r 00 ??
-
- When writing code to extract data from named subpatterns using the
- name-to-number map, remember that the length of the entries is likely
- to be different for each compiled pattern.
-
- PCRE_INFO_OKPARTIAL
-
- Return 1 if the pattern can be used for partial matching, otherwise 0.
- The fourth argument should point to an int variable. The pcrepartial
- documentation lists the restrictions that apply to patterns when par-
- tial matching is used.
-
- PCRE_INFO_OPTIONS
-
- Return a copy of the options with which the pattern was compiled. The
- fourth argument should point to an unsigned long int variable. These
- option bits are those specified in the call to pcre_compile(), modified
- by any top-level option settings at the start of the pattern itself. In
- other words, they are the options that will be in force when matching
- starts. For example, if the pattern /(?im)abc(?-i)d/ is compiled with
- the PCRE_EXTENDED option, the result is PCRE_CASELESS, PCRE_MULTILINE,
- and PCRE_EXTENDED.
-
- A pattern is automatically anchored by PCRE if all of its top-level
- alternatives begin with one of the following:
-
- ^ unless PCRE_MULTILINE is set
- \A always
- \G always
- .* if PCRE_DOTALL is set and there are no back
- references to the subpattern in which .* appears
-
- For such patterns, the PCRE_ANCHORED bit is set in the options returned
- by pcre_fullinfo().
-
- PCRE_INFO_SIZE
-
- Return the size of the compiled pattern, that is, the value that was
- passed as the argument to pcre_malloc() when PCRE was getting memory in
- which to place the compiled data. The fourth argument should point to a
- size_t variable.
-
- PCRE_INFO_STUDYSIZE
-
- Return the size of the data block pointed to by the study_data field in
- a pcre_extra block. That is, it is the value that was passed to
- pcre_malloc() when PCRE was getting memory into which to place the data
- created by pcre_study(). The fourth argument should point to a size_t
- variable.
-
-
-OBSOLETE INFO FUNCTION
-
- int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
-
- The pcre_info() function is now obsolete because its interface is too
- restrictive to return all the available data about a compiled pattern.
- New programs should use pcre_fullinfo() instead. The yield of
- pcre_info() is the number of capturing subpatterns, or one of the fol-
- lowing negative numbers:
-
- PCRE_ERROR_NULL the argument code was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
-
- If the optptr argument is not NULL, a copy of the options with which
- the pattern was compiled is placed in the integer it points to (see
- PCRE_INFO_OPTIONS above).
-
- If the pattern is not anchored and the firstcharptr argument is not
- NULL, it is used to pass back information about the first character of
- any matched string (see PCRE_INFO_FIRSTBYTE above).
-
-
-REFERENCE COUNTS
-
- int pcre_refcount(pcre *code, int adjust);
-
- The pcre_refcount() function is used to maintain a reference count in
- the data block that contains a compiled pattern. It is provided for the
- benefit of applications that operate in an object-oriented manner,
- where different parts of the application may be using the same compiled
- pattern, but you want to free the block when they are all done.
-
- When a pattern is compiled, the reference count field is initialized to
- zero. It is changed only by calling this function, whose action is to
- add the adjust value (which may be positive or negative) to it. The
- yield of the function is the new value. However, the value of the count
- is constrained to lie between 0 and 65535, inclusive. If the new value
- is outside these limits, it is forced to the appropriate limit value.
-
- Except when it is zero, the reference count is not correctly preserved
- if a pattern is compiled on one host and then transferred to a host
- whose byte-order is different. (This seems a highly unlikely scenario.)
-
-
-MATCHING A PATTERN: THE TRADITIONAL FUNCTION
-
- int pcre_exec(const pcre *code, const pcre_extra *extra,
- const char *subject, int length, int startoffset,
- int options, int *ovector, int ovecsize);
-
- The function pcre_exec() is called to match a subject string against a
- compiled pattern, which is passed in the code argument. If the pattern
- has been studied, the result of the study should be passed in the extra
- argument. This function is the main matching facility of the library,
- and it operates in a Perl-like manner. For specialist use there is also
- an alternative matching function, which is described below in the sec-
- tion about the pcre_dfa_exec() function.
-
- In most applications, the pattern will have been compiled (and option-
- ally studied) in the same process that calls pcre_exec(). However, it
- is possible to save compiled patterns and study data, and then use them
- later in different processes, possibly even on different hosts. For a
- discussion about this, see the pcreprecompile documentation.
-
- Here is an example of a simple call to pcre_exec():
-
- int rc;
- int ovector[30];
- rc = pcre_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector of integers for substring information */
- 30); /* number of elements (NOT size in bytes) */
-
- Extra data for pcre_exec()
-
- If the extra argument is not NULL, it must point to a pcre_extra data
- block. The pcre_study() function returns such a block (when it doesn't
- return NULL), but you can also create one for yourself, and pass addi-
- tional information in it. The pcre_extra block contains the following
- fields (not necessarily in this order):
-
- unsigned long int flags;
- void *study_data;
- unsigned long int match_limit;
- unsigned long int match_limit_recursion;
- void *callout_data;
- const unsigned char *tables;
-
- The flags field is a bitmap that specifies which of the other fields
- are set. The flag bits are:
-
- PCRE_EXTRA_STUDY_DATA
- PCRE_EXTRA_MATCH_LIMIT
- PCRE_EXTRA_MATCH_LIMIT_RECURSION
- PCRE_EXTRA_CALLOUT_DATA
- PCRE_EXTRA_TABLES
-
- Other flag bits should be set to zero. The study_data field is set in
- the pcre_extra block that is returned by pcre_study(), together with
- the appropriate flag bit. You should not set this yourself, but you may
- add to the block by setting the other fields and their corresponding
- flag bits.
-
- The match_limit field provides a means of preventing PCRE from using up
- a vast amount of resources when running patterns that are not going to
- match, but which have a very large number of possibilities in their
- search trees. The classic example is the use of nested unlimited
- repeats.
-
- Internally, PCRE uses a function called match() which it calls repeat-
- edly (sometimes recursively). The limit set by match_limit is imposed
- on the number of times this function is called during a match, which
- has the effect of limiting the amount of backtracking that can take
- place. For patterns that are not anchored, the count restarts from zero
- for each position in the subject string.
-
- The default value for the limit can be set when PCRE is built; the
- default default is 10 million, which handles all but the most extreme
- cases. You can override the default by suppling pcre_exec() with a
- pcre_extra block in which match_limit is set, and
- PCRE_EXTRA_MATCH_LIMIT is set in the flags field. If the limit is
- exceeded, pcre_exec() returns PCRE_ERROR_MATCHLIMIT.
-
- The match_limit_recursion field is similar to match_limit, but instead
- of limiting the total number of times that match() is called, it limits
- the depth of recursion. The recursion depth is a smaller number than
- the total number of calls, because not all calls to match() are recur-
- sive. This limit is of use only if it is set smaller than match_limit.
-
- Limiting the recursion depth limits the amount of stack that can be
- used, or, when PCRE has been compiled to use memory on the heap instead
- of the stack, the amount of heap memory that can be used.
-
- The default value for match_limit_recursion can be set when PCRE is
- built; the default default is the same value as the default for
- match_limit. You can override the default by suppling pcre_exec() with
- a pcre_extra block in which match_limit_recursion is set, and
- PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the flags field. If the
- limit is exceeded, pcre_exec() returns PCRE_ERROR_RECURSIONLIMIT.
-
- The pcre_callout field is used in conjunction with the "callout" fea-
- ture, which is described in the pcrecallout documentation.
-
- The tables field is used to pass a character tables pointer to
- pcre_exec(); this overrides the value that is stored with the compiled
- pattern. A non-NULL value is stored with the compiled pattern only if
- custom tables were supplied to pcre_compile() via its tableptr argu-
- ment. If NULL is passed to pcre_exec() using this mechanism, it forces
- PCRE's internal tables to be used. This facility is helpful when re-
- using patterns that have been saved after compiling with an external
- set of tables, because the external tables might be at a different
- address when pcre_exec() is called. See the pcreprecompile documenta-
- tion for a discussion of saving compiled patterns for later use.
-
- Option bits for pcre_exec()
-
- The unused bits of the options argument for pcre_exec() must be zero.
- The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_xxx,
- PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE,
- PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
-
- PCRE_ANCHORED
-
- The PCRE_ANCHORED option limits pcre_exec() to matching at the first
- matching position. If a pattern was compiled with PCRE_ANCHORED, or
- turned out to be anchored by virtue of its contents, it cannot be made
- unachored at matching time.
-
- PCRE_BSR_ANYCRLF
- PCRE_BSR_UNICODE
-
- These options (which are mutually exclusive) control what the \R escape
- sequence matches. The choice is either to match only CR, LF, or CRLF,
- or to match any Unicode newline sequence. These options override the
- choice that was made or defaulted when the pattern was compiled.
-
- PCRE_NEWLINE_CR
- PCRE_NEWLINE_LF
- PCRE_NEWLINE_CRLF
- PCRE_NEWLINE_ANYCRLF
- PCRE_NEWLINE_ANY
-
- These options override the newline definition that was chosen or
- defaulted when the pattern was compiled. For details, see the descrip-
- tion of pcre_compile() above. During matching, the newline choice
- affects the behaviour of the dot, circumflex, and dollar metacharac-
- ters. It may also alter the way the match position is advanced after a
- match failure for an unanchored pattern.
-
- When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is
- set, and a match attempt for an unanchored pattern fails when the cur-
- rent position is at a CRLF sequence, and the pattern contains no
- explicit matches for CR or LF characters, the match position is
- advanced by two characters instead of one, in other words, to after the
- CRLF.
-
- The above rule is a compromise that makes the most common cases work as
- expected. For example, if the pattern is .+A (and the PCRE_DOTALL
- option is not set), it does not match the string "\r\nA" because, after
- failing at the start, it skips both the CR and the LF before retrying.
- However, the pattern [\r\n]A does match that string, because it con-
- tains an explicit CR or LF reference, and so advances only by one char-
- acter after the first failure.
-
- An explicit match for CR of LF is either a literal appearance of one of
- those characters, or one of the \r or \n escape sequences. Implicit
- matches such as [^X] do not count, nor does \s (which includes CR and
- LF in the characters that it matches).
-
- Notwithstanding the above, anomalous effects may still occur when CRLF
- is a valid newline sequence and explicit \r or \n escapes appear in the
- pattern.
-
- PCRE_NOTBOL
-
- This option specifies that first character of the subject string is not
- the beginning of a line, so the circumflex metacharacter should not
- match before it. Setting this without PCRE_MULTILINE (at compile time)
- causes circumflex never to match. This option affects only the behav-
- iour of the circumflex metacharacter. It does not affect \A.
-
- PCRE_NOTEOL
-
- This option specifies that the end of the subject string is not the end
- of a line, so the dollar metacharacter should not match it nor (except
- in multiline mode) a newline immediately before it. Setting this with-
- out PCRE_MULTILINE (at compile time) causes dollar never to match. This
- option affects only the behaviour of the dollar metacharacter. It does
- not affect \Z or \z.
-
- PCRE_NOTEMPTY
-
- An empty string is not considered to be a valid match if this option is
- set. If there are alternatives in the pattern, they are tried. If all
- the alternatives match the empty string, the entire match fails. For
- example, if the pattern
-
- a?b?
-
- is applied to a string not beginning with "a" or "b", it matches the
- empty string at the start of the subject. With PCRE_NOTEMPTY set, this
- match is not valid, so PCRE searches further into the string for occur-
- rences of "a" or "b".
-
- Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a spe-
- cial case of a pattern match of the empty string within its split()
- function, and when using the /g modifier. It is possible to emulate
- Perl's behaviour after matching a null string by first trying the match
- again at the same offset with PCRE_NOTEMPTY and PCRE_ANCHORED, and then
- if that fails by advancing the starting offset (see below) and trying
- an ordinary match again. There is some code that demonstrates how to do
- this in the pcredemo.c sample program.
-
- PCRE_NO_START_OPTIMIZE
-
- There are a number of optimizations that pcre_exec() uses at the start
- of a match, in order to speed up the process. For example, if it is
- known that a match must start with a specific character, it searches
- the subject for that character, and fails immediately if it cannot find
- it, without actually running the main matching function. When callouts
- are in use, these optimizations can cause them to be skipped. This
- option disables the "start-up" optimizations, causing performance to
- suffer, but ensuring that the callouts do occur.
-
- PCRE_NO_UTF8_CHECK
-
- When PCRE_UTF8 is set at compile time, the validity of the subject as a
- UTF-8 string is automatically checked when pcre_exec() is subsequently
- called. The value of startoffset is also checked to ensure that it
- points to the start of a UTF-8 character. There is a discussion about
- the validity of UTF-8 strings in the section on UTF-8 support in the
- main pcre page. If an invalid UTF-8 sequence of bytes is found,
- pcre_exec() returns the error PCRE_ERROR_BADUTF8. If startoffset con-
- tains an invalid value, PCRE_ERROR_BADUTF8_OFFSET is returned.
-
- If you already know that your subject is valid, and you want to skip
- these checks for performance reasons, you can set the
- PCRE_NO_UTF8_CHECK option when calling pcre_exec(). You might want to
- do this for the second and subsequent calls to pcre_exec() if you are
- making repeated calls to find all the matches in a single subject
- string. However, you should be sure that the value of startoffset
- points to the start of a UTF-8 character. When PCRE_NO_UTF8_CHECK is
- set, the effect of passing an invalid UTF-8 string as a subject, or a
- value of startoffset that does not point to the start of a UTF-8 char-
- acter, is undefined. Your program may crash.
-
- PCRE_PARTIAL
-
- This option turns on the partial matching feature. If the subject
- string fails to match the pattern, but at some point during the match-
- ing process the end of the subject was reached (that is, the subject
- partially matches the pattern and the failure to match occurred only
- because there were not enough subject characters), pcre_exec() returns
- PCRE_ERROR_PARTIAL instead of PCRE_ERROR_NOMATCH. When PCRE_PARTIAL is
- used, there are restrictions on what may appear in the pattern. These
- are discussed in the pcrepartial documentation.
-
- The string to be matched by pcre_exec()
-
- The subject string is passed to pcre_exec() as a pointer in subject, a
- length (in bytes) in length, and a starting byte offset in startoffset.
- In UTF-8 mode, the byte offset must point to the start of a UTF-8 char-
- acter. Unlike the pattern string, the subject may contain binary zero
- bytes. When the starting offset is zero, the search for a match starts
- at the beginning of the subject, and this is by far the most common
- case.
-
- A non-zero starting offset is useful when searching for another match
- in the same subject by calling pcre_exec() again after a previous suc-
- cess. Setting startoffset differs from just passing over a shortened
- string and setting PCRE_NOTBOL in the case of a pattern that begins
- with any kind of lookbehind. For example, consider the pattern
-
- \Biss\B
-
- which finds occurrences of "iss" in the middle of words. (\B matches
- only if the current position in the subject is not a word boundary.)
- When applied to the string "Mississipi" the first call to pcre_exec()
- finds the first occurrence. If pcre_exec() is called again with just
- the remainder of the subject, namely "issipi", it does not match,
- because \B is always false at the start of the subject, which is deemed
- to be a word boundary. However, if pcre_exec() is passed the entire
- string again, but with startoffset set to 4, it finds the second occur-
- rence of "iss" because it is able to look behind the starting point to
- discover that it is preceded by a letter.
-
- If a non-zero starting offset is passed when the pattern is anchored,
- one attempt to match at the given offset is made. This can only succeed
- if the pattern does not require the match to be at the start of the
- subject.
-
- How pcre_exec() returns captured substrings
-
- In general, a pattern matches a certain portion of the subject, and in
- addition, further substrings from the subject may be picked out by
- parts of the pattern. Following the usage in Jeffrey Friedl's book,
- this is called "capturing" in what follows, and the phrase "capturing
- subpattern" is used for a fragment of a pattern that picks out a sub-
- string. PCRE supports several other kinds of parenthesized subpattern
- that do not cause substrings to be captured.
-
- Captured substrings are returned to the caller via a vector of integers
- whose address is passed in ovector. The number of elements in the vec-
- tor is passed in ovecsize, which must be a non-negative number. Note:
- this argument is NOT the size of ovector in bytes.
-
- The first two-thirds of the vector is used to pass back captured sub-
- strings, each substring using a pair of integers. The remaining third
- of the vector is used as workspace by pcre_exec() while matching cap-
- turing subpatterns, and is not available for passing back information.
- The number passed in ovecsize should always be a multiple of three. If
- it is not, it is rounded down.
-
- When a match is successful, information about captured substrings is
- returned in pairs of integers, starting at the beginning of ovector,
- and continuing up to two-thirds of its length at the most. The first
- element of each pair is set to the byte offset of the first character
- in a substring, and the second is set to the byte offset of the first
- character after the end of a substring. Note: these values are always
- byte offsets, even in UTF-8 mode. They are not character counts.
-
- The first pair of integers, ovector[0] and ovector[1], identify the
- portion of the subject string matched by the entire pattern. The next
- pair is used for the first capturing subpattern, and so on. The value
- returned by pcre_exec() is one more than the highest numbered pair that
- has been set. For example, if two substrings have been captured, the
- returned value is 3. If there are no capturing subpatterns, the return
- value from a successful match is 1, indicating that just the first pair
- of offsets has been set.
-
- If a capturing subpattern is matched repeatedly, it is the last portion
- of the string that it matched that is returned.
-
- If the vector is too small to hold all the captured substring offsets,
- it is used as far as possible (up to two-thirds of its length), and the
- function returns a value of zero. If the substring offsets are not of
- interest, pcre_exec() may be called with ovector passed as NULL and
- ovecsize as zero. However, if the pattern contains back references and
- the ovector is not big enough to remember the related substrings, PCRE
- has to get additional memory for use during matching. Thus it is usu-
- ally advisable to supply an ovector.
-
- The pcre_info() function can be used to find out how many capturing
- subpatterns there are in a compiled pattern. The smallest size for
- ovector that will allow for n captured substrings, in addition to the
- offsets of the substring matched by the whole pattern, is (n+1)*3.
-
- It is possible for capturing subpattern number n+1 to match some part
- of the subject when subpattern n has not been used at all. For example,
- if the string "abc" is matched against the pattern (a|(z))(bc) the
- return from the function is 4, and subpatterns 1 and 3 are matched, but
- 2 is not. When this happens, both values in the offset pairs corre-
- sponding to unused subpatterns are set to -1.
-
- Offset values that correspond to unused subpatterns at the end of the
- expression are also set to -1. For example, if the string "abc" is
- matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not
- matched. The return from the function is 2, because the highest used
- capturing subpattern number is 1. However, you can refer to the offsets
- for the second and third capturing subpatterns if you wish (assuming
- the vector is large enough, of course).
-
- Some convenience functions are provided for extracting the captured
- substrings as separate strings. These are described below.
-
- Error return values from pcre_exec()
-
- If pcre_exec() fails, it returns a negative number. The following are
- defined in the header file:
-
- PCRE_ERROR_NOMATCH (-1)
-
- The subject string did not match the pattern.
-
- PCRE_ERROR_NULL (-2)
-
- Either code or subject was passed as NULL, or ovector was NULL and
- ovecsize was not zero.
-
- PCRE_ERROR_BADOPTION (-3)
-
- An unrecognized bit was set in the options argument.
-
- PCRE_ERROR_BADMAGIC (-4)
-
- PCRE stores a 4-byte "magic number" at the start of the compiled code,
- to catch the case when it is passed a junk pointer and to detect when a
- pattern that was compiled in an environment of one endianness is run in
- an environment with the other endianness. This is the error that PCRE
- gives when the magic number is not present.
-
- PCRE_ERROR_UNKNOWN_OPCODE (-5)
-
- While running the pattern match, an unknown item was encountered in the
- compiled pattern. This error could be caused by a bug in PCRE or by
- overwriting of the compiled pattern.
-
- PCRE_ERROR_NOMEMORY (-6)
-
- If a pattern contains back references, but the ovector that is passed
- to pcre_exec() is not big enough to remember the referenced substrings,
- PCRE gets a block of memory at the start of matching to use for this
- purpose. If the call via pcre_malloc() fails, this error is given. The
- memory is automatically freed at the end of matching.
-
- PCRE_ERROR_NOSUBSTRING (-7)
-
- This error is used by the pcre_copy_substring(), pcre_get_substring(),
- and pcre_get_substring_list() functions (see below). It is never
- returned by pcre_exec().
-
- PCRE_ERROR_MATCHLIMIT (-8)
-
- The backtracking limit, as specified by the match_limit field in a
- pcre_extra structure (or defaulted) was reached. See the description
- above.
-
- PCRE_ERROR_CALLOUT (-9)
-
- This error is never generated by pcre_exec() itself. It is provided for
- use by callout functions that want to yield a distinctive error code.
- See the pcrecallout documentation for details.
-
- PCRE_ERROR_BADUTF8 (-10)
-
- A string that contains an invalid UTF-8 byte sequence was passed as a
- subject.
-
- PCRE_ERROR_BADUTF8_OFFSET (-11)
-
- The UTF-8 byte sequence that was passed as a subject was valid, but the
- value of startoffset did not point to the beginning of a UTF-8 charac-
- ter.
-
- PCRE_ERROR_PARTIAL (-12)
-
- The subject string did not match, but it did match partially. See the
- pcrepartial documentation for details of partial matching.
-
- PCRE_ERROR_BADPARTIAL (-13)
-
- The PCRE_PARTIAL option was used with a compiled pattern containing
- items that are not supported for partial matching. See the pcrepartial
- documentation for details of partial matching.
-
- PCRE_ERROR_INTERNAL (-14)
-
- An unexpected internal error has occurred. This error could be caused
- by a bug in PCRE or by overwriting of the compiled pattern.
-
- PCRE_ERROR_BADCOUNT (-15)
-
- This error is given if the value of the ovecsize argument is negative.
-
- PCRE_ERROR_RECURSIONLIMIT (-21)
-
- The internal recursion limit, as specified by the match_limit_recursion
- field in a pcre_extra structure (or defaulted) was reached. See the
- description above.
-
- PCRE_ERROR_BADNEWLINE (-23)
-
- An invalid combination of PCRE_NEWLINE_xxx options was given.
-
- Error numbers -16 to -20 and -22 are not used by pcre_exec().
-
-
-EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
-
- int pcre_copy_substring(const char *subject, int *ovector,
- int stringcount, int stringnumber, char *buffer,
- int buffersize);
-
- int pcre_get_substring(const char *subject, int *ovector,
- int stringcount, int stringnumber,
- const char **stringptr);
-
- int pcre_get_substring_list(const char *subject,
- int *ovector, int stringcount, const char ***listptr);
-
- Captured substrings can be accessed directly by using the offsets
- returned by pcre_exec() in ovector. For convenience, the functions
- pcre_copy_substring(), pcre_get_substring(), and pcre_get_sub-
- string_list() are provided for extracting captured substrings as new,
- separate, zero-terminated strings. These functions identify substrings
- by number. The next section describes functions for extracting named
- substrings.
-
- A substring that contains a binary zero is correctly extracted and has
- a further zero added on the end, but the result is not, of course, a C
- string. However, you can process such a string by referring to the
- length that is returned by pcre_copy_substring() and pcre_get_sub-
- string(). Unfortunately, the interface to pcre_get_substring_list() is
- not adequate for handling strings containing binary zeros, because the
- end of the final string is not independently indicated.
-
- The first three arguments are the same for all three of these func-
- tions: subject is the subject string that has just been successfully
- matched, ovector is a pointer to the vector of integer offsets that was
- passed to pcre_exec(), and stringcount is the number of substrings that
- were captured by the match, including the substring that matched the
- entire regular expression. This is the value returned by pcre_exec() if
- it is greater than zero. If pcre_exec() returned zero, indicating that
- it ran out of space in ovector, the value passed as stringcount should
- be the number of elements in the vector divided by three.
-
- The functions pcre_copy_substring() and pcre_get_substring() extract a
- single substring, whose number is given as stringnumber. A value of
- zero extracts the substring that matched the entire pattern, whereas
- higher values extract the captured substrings. For pcre_copy_sub-
- string(), the string is placed in buffer, whose length is given by
- buffersize, while for pcre_get_substring() a new block of memory is
- obtained via pcre_malloc, and its address is returned via stringptr.
- The yield of the function is the length of the string, not including
- the terminating zero, or one of these error codes:
-
- PCRE_ERROR_NOMEMORY (-6)
-
- The buffer was too small for pcre_copy_substring(), or the attempt to
- get memory failed for pcre_get_substring().
-
- PCRE_ERROR_NOSUBSTRING (-7)
-
- There is no substring whose number is stringnumber.
-
- The pcre_get_substring_list() function extracts all available sub-
- strings and builds a list of pointers to them. All this is done in a
- single block of memory that is obtained via pcre_malloc. The address of
- the memory block is returned via listptr, which is also the start of
- the list of string pointers. The end of the list is marked by a NULL
- pointer. The yield of the function is zero if all went well, or the
- error code
-
- PCRE_ERROR_NOMEMORY (-6)
-
- if the attempt to get the memory block failed.
-
- When any of these functions encounter a substring that is unset, which
- can happen when capturing subpattern number n+1 matches some part of
- the subject, but subpattern n has not been used at all, they return an
- empty string. This can be distinguished from a genuine zero-length sub-
- string by inspecting the appropriate offset in ovector, which is nega-
- tive for unset substrings.
-
- The two convenience functions pcre_free_substring() and pcre_free_sub-
- string_list() can be used to free the memory returned by a previous
- call of pcre_get_substring() or pcre_get_substring_list(), respec-
- tively. They do nothing more than call the function pointed to by
- pcre_free, which of course could be called directly from a C program.
- However, PCRE is used in some situations where it is linked via a spe-
- cial interface to another programming language that cannot use
- pcre_free directly; it is for these cases that the functions are pro-
- vided.
-
-
-EXTRACTING CAPTURED SUBSTRINGS BY NAME
-
- int pcre_get_stringnumber(const pcre *code,
- const char *name);
-
- int pcre_copy_named_substring(const pcre *code,
- const char *subject, int *ovector,
- int stringcount, const char *stringname,
- char *buffer, int buffersize);
-
- int pcre_get_named_substring(const pcre *code,
- const char *subject, int *ovector,
- int stringcount, const char *stringname,
- const char **stringptr);
-
- To extract a substring by name, you first have to find associated num-
- ber. For example, for this pattern
-
- (a+)b(?\d+)...
-
- the number of the subpattern called "xxx" is 2. If the name is known to
- be unique (PCRE_DUPNAMES was not set), you can find the number from the
- name by calling pcre_get_stringnumber(). The first argument is the com-
- piled pattern, and the second is the name. The yield of the function is
- the subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no
- subpattern of that name.
-
- Given the number, you can extract the substring directly, or use one of
- the functions described in the previous section. For convenience, there
- are also two functions that do the whole job.
-
- Most of the arguments of pcre_copy_named_substring() and
- pcre_get_named_substring() are the same as those for the similarly
- named functions that extract by number. As these are described in the
- previous section, they are not re-described here. There are just two
- differences:
-
- First, instead of a substring number, a substring name is given. Sec-
- ond, there is an extra argument, given at the start, which is a pointer
- to the compiled pattern. This is needed in order to gain access to the
- name-to-number translation table.
-
- These functions call pcre_get_stringnumber(), and if it succeeds, they
- then call pcre_copy_substring() or pcre_get_substring(), as appropri-
- ate. NOTE: If PCRE_DUPNAMES is set and there are duplicate names, the
- behaviour may not be what you want (see the next section).
-
- Warning: If the pattern uses the "(?|" feature to set up multiple sub-
- patterns with the same number, you cannot use names to distinguish
- them, because names are not included in the compiled code. The matching
- process uses only numbers.
-
-
-DUPLICATE SUBPATTERN NAMES
-
- int pcre_get_stringtable_entries(const pcre *code,
- const char *name, char **first, char **last);
-
- When a pattern is compiled with the PCRE_DUPNAMES option, names for
- subpatterns are not required to be unique. Normally, patterns with
- duplicate names are such that in any one match, only one of the named
- subpatterns participates. An example is shown in the pcrepattern docu-
- mentation.
-
- When duplicates are present, pcre_copy_named_substring() and
- pcre_get_named_substring() return the first substring corresponding to
- the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING
- (-7) is returned; no data is returned. The pcre_get_stringnumber()
- function returns one of the numbers that are associated with the name,
- but it is not defined which it is.
-
- If you want to get full details of all captured substrings for a given
- name, you must use the pcre_get_stringtable_entries() function. The
- first argument is the compiled pattern, and the second is the name. The
- third and fourth are pointers to variables which are updated by the
- function. After it has run, they point to the first and last entries in
- the name-to-number table for the given name. The function itself
- returns the length of each entry, or PCRE_ERROR_NOSUBSTRING (-7) if
- there are none. The format of the table is described above in the sec-
- tion entitled Information about a pattern. Given all the relevant
- entries for the name, you can extract each of their numbers, and hence
- the captured data, if any.
-
-
-FINDING ALL POSSIBLE MATCHES
-
- The traditional matching function uses a similar algorithm to Perl,
- which stops when it finds the first match, starting at a given point in
- the subject. If you want to find all possible matches, or the longest
- possible match, consider using the alternative matching function (see
- below) instead. If you cannot use the alternative function, but still
- need to find all possible matches, you can kludge it up by making use
- of the callout facility, which is described in the pcrecallout documen-
- tation.
-
- What you have to do is to insert a callout right at the end of the pat-
- tern. When your callout function is called, extract and save the cur-
- rent matched substring. Then return 1, which forces pcre_exec() to
- backtrack and try other alternatives. Ultimately, when it runs out of
- matches, pcre_exec() will yield PCRE_ERROR_NOMATCH.
-
-
-MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
-
- int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
- const char *subject, int length, int startoffset,
- int options, int *ovector, int ovecsize,
- int *workspace, int wscount);
-
- The function pcre_dfa_exec() is called to match a subject string
- against a compiled pattern, using a matching algorithm that scans the
- subject string just once, and does not backtrack. This has different
- characteristics to the normal algorithm, and is not compatible with
- Perl. Some of the features of PCRE patterns are not supported. Never-
- theless, there are times when this kind of matching can be useful. For
- a discussion of the two matching algorithms, see the pcrematching docu-
- mentation.
-
- The arguments for the pcre_dfa_exec() function are the same as for
- pcre_exec(), plus two extras. The ovector argument is used in a differ-
- ent way, and this is described below. The other common arguments are
- used in the same way as for pcre_exec(), so their description is not
- repeated here.
-
- The two additional arguments provide workspace for the function. The
- workspace vector should contain at least 20 elements. It is used for
- keeping track of multiple paths through the pattern tree. More
- workspace will be needed for patterns and subjects where there are a
- lot of potential matches.
-
- Here is an example of a simple call to pcre_dfa_exec():
-
- int rc;
- int ovector[10];
- int wspace[20];
- rc = pcre_dfa_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector of integers for substring information */
- 10, /* number of elements (NOT size in bytes) */
- wspace, /* working space vector */
- 20); /* number of elements (NOT size in bytes) */
-
- Option bits for pcre_dfa_exec()
-
- The unused bits of the options argument for pcre_dfa_exec() must be
- zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEW-
- LINE_xxx, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK,
- PCRE_PARTIAL, PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last
- three of these are the same as for pcre_exec(), so their description is
- not repeated here.
-
- PCRE_PARTIAL
-
- This has the same general effect as it does for pcre_exec(), but the
- details are slightly different. When PCRE_PARTIAL is set for
- pcre_dfa_exec(), the return code PCRE_ERROR_NOMATCH is converted into
- PCRE_ERROR_PARTIAL if the end of the subject is reached, there have
- been no complete matches, but there is still at least one matching pos-
- sibility. The portion of the string that provided the partial match is
- set as the first matching string.
-
- PCRE_DFA_SHORTEST
-
- Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to
- stop as soon as it has found one match. Because of the way the alterna-
- tive algorithm works, this is necessarily the shortest possible match
- at the first possible matching point in the subject string.
-
- PCRE_DFA_RESTART
-
- When pcre_dfa_exec() is called with the PCRE_PARTIAL option, and
- returns a partial match, it is possible to call it again, with addi-
- tional subject characters, and have it continue with the same match.
- The PCRE_DFA_RESTART option requests this action; when it is set, the
- workspace and wscount options must reference the same vector as before
- because data about the match so far is left in them after a partial
- match. There is more discussion of this facility in the pcrepartial
- documentation.
-
- Successful returns from pcre_dfa_exec()
-
- When pcre_dfa_exec() succeeds, it may have matched more than one sub-
- string in the subject. Note, however, that all the matches from one run
- of the function start at the same point in the subject. The shorter
- matches are all initial substrings of the longer matches. For example,
- if the pattern
-
- <.*>
-
- is matched against the string
-
- This is no more
-
- the three matched strings are
-
-
-
-
-
- On success, the yield of the function is a number greater than zero,
- which is the number of matched substrings. The substrings themselves
- are returned in ovector. Each string uses two elements; the first is
- the offset to the start, and the second is the offset to the end. In
- fact, all the strings have the same start offset. (Space could have
- been saved by giving this only once, but it was decided to retain some
- compatibility with the way pcre_exec() returns data, even though the
- meaning of the strings is different.)
-
- The strings are returned in reverse order of length; that is, the long-
- est matching string is given first. If there were too many matches to
- fit into ovector, the yield of the function is zero, and the vector is
- filled with the longest matches.
-
- Error returns from pcre_dfa_exec()
-
- The pcre_dfa_exec() function returns a negative number when it fails.
- Many of the errors are the same as for pcre_exec(), and these are
- described above. There are in addition the following errors that are
- specific to pcre_dfa_exec():
-
- PCRE_ERROR_DFA_UITEM (-16)
-
- This return is given if pcre_dfa_exec() encounters an item in the pat-
- tern that it does not support, for instance, the use of \C or a back
- reference.
-
- PCRE_ERROR_DFA_UCOND (-17)
-
- This return is given if pcre_dfa_exec() encounters a condition item
- that uses a back reference for the condition, or a test for recursion
- in a specific group. These are not supported.
-
- PCRE_ERROR_DFA_UMLIMIT (-18)
-
- This return is given if pcre_dfa_exec() is called with an extra block
- that contains a setting of the match_limit field. This is not supported
- (it is meaningless).
-
- PCRE_ERROR_DFA_WSSIZE (-19)
-
- This return is given if pcre_dfa_exec() runs out of space in the
- workspace vector.
-
- PCRE_ERROR_DFA_RECURSE (-20)
-
- When a recursive subpattern is processed, the matching function calls
- itself recursively, using private vectors for ovector and workspace.
- This error is given if the output vector is not large enough. This
- should be extremely rare, as a vector of size 1000 is used.
-
-
-SEE ALSO
-
- pcrebuild(3), pcrecallout(3), pcrecpp(3)(3), pcrematching(3), pcrepar-
- tial(3), pcreposix(3), pcreprecompile(3), pcresample(3), pcrestack(3).
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 11 April 2009
- Copyright (c) 1997-2009 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCRECALLOUT(3) PCRECALLOUT(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE CALLOUTS
-
- int (*pcre_callout)(pcre_callout_block *);
-
- PCRE provides a feature called "callout", which is a means of temporar-
- ily passing control to the caller of PCRE in the middle of pattern
- matching. The caller of PCRE provides an external function by putting
- its entry point in the global variable pcre_callout. By default, this
- variable contains NULL, which disables all calling out.
-
- Within a regular expression, (?C) indicates the points at which the
- external function is to be called. Different callout points can be
- identified by putting a number less than 256 after the letter C. The
- default value is zero. For example, this pattern has two callout
- points:
-
- (?C1)abc(?C2)def
-
- If the PCRE_AUTO_CALLOUT option bit is set when pcre_compile() is
- called, PCRE automatically inserts callouts, all with number 255,
- before each item in the pattern. For example, if PCRE_AUTO_CALLOUT is
- used with the pattern
-
- A(\d{2}|--)
-
- it is processed as if it were
-
- (?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
-
- Notice that there is a callout before and after each parenthesis and
- alternation bar. Automatic callouts can be used for tracking the
- progress of pattern matching. The pcretest command has an option that
- sets automatic callouts; when it is used, the output indicates how the
- pattern is matched. This is useful information when you are trying to
- optimize the performance of a particular pattern.
-
-
-MISSING CALLOUTS
-
- You should be aware that, because of optimizations in the way PCRE
- matches patterns by default, callouts sometimes do not happen. For
- example, if the pattern is
-
- ab(?C4)cd
-
- PCRE knows that any matching string must contain the letter "d". If the
- subject string is "abyz", the lack of "d" means that matching doesn't
- ever start, and the callout is never reached. However, with "abyd",
- though the result is still no match, the callout is obeyed.
-
- You can disable these optimizations by passing the PCRE_NO_START_OPTI-
- MIZE option to pcre_exec() or pcre_dfa_exec(). This slows down the
- matching process, but does ensure that callouts such as the example
- above are obeyed.
-
-
-THE CALLOUT INTERFACE
-
- During matching, when PCRE reaches a callout point, the external func-
- tion defined by pcre_callout is called (if it is set). This applies to
- both the pcre_exec() and the pcre_dfa_exec() matching functions. The
- only argument to the callout function is a pointer to a pcre_callout
- block. This structure contains the following fields:
-
- int version;
- int callout_number;
- int *offset_vector;
- const char *subject;
- int subject_length;
- int start_match;
- int current_position;
- int capture_top;
- int capture_last;
- void *callout_data;
- int pattern_position;
- int next_item_length;
-
- The version field is an integer containing the version number of the
- block format. The initial version was 0; the current version is 1. The
- version number will change again in future if additional fields are
- added, but the intention is never to remove any of the existing fields.
-
- The callout_number field contains the number of the callout, as com-
- piled into the pattern (that is, the number after ?C for manual call-
- outs, and 255 for automatically generated callouts).
-
- The offset_vector field is a pointer to the vector of offsets that was
- passed by the caller to pcre_exec() or pcre_dfa_exec(). When
- pcre_exec() is used, the contents can be inspected in order to extract
- substrings that have been matched so far, in the same way as for
- extracting substrings after a match has completed. For pcre_dfa_exec()
- this field is not useful.
-
- The subject and subject_length fields contain copies of the values that
- were passed to pcre_exec().
-
- The start_match field normally contains the offset within the subject
- at which the current match attempt started. However, if the escape
- sequence \K has been encountered, this value is changed to reflect the
- modified starting point. If the pattern is not anchored, the callout
- function may be called several times from the same point in the pattern
- for different starting points in the subject.
-
- The current_position field contains the offset within the subject of
- the current match pointer.
-
- When the pcre_exec() function is used, the capture_top field contains
- one more than the number of the highest numbered captured substring so
- far. If no substrings have been captured, the value of capture_top is
- one. This is always the case when pcre_dfa_exec() is used, because it
- does not support captured substrings.
-
- The capture_last field contains the number of the most recently cap-
- tured substring. If no substrings have been captured, its value is -1.
- This is always the case when pcre_dfa_exec() is used.
-
- The callout_data field contains a value that is passed to pcre_exec()
- or pcre_dfa_exec() specifically so that it can be passed back in call-
- outs. It is passed in the pcre_callout field of the pcre_extra data
- structure. If no such data was passed, the value of callout_data in a
- pcre_callout block is NULL. There is a description of the pcre_extra
- structure in the pcreapi documentation.
-
- The pattern_position field is present from version 1 of the pcre_call-
- out structure. It contains the offset to the next item to be matched in
- the pattern string.
-
- The next_item_length field is present from version 1 of the pcre_call-
- out structure. It contains the length of the next item to be matched in
- the pattern string. When the callout immediately precedes an alterna-
- tion bar, a closing parenthesis, or the end of the pattern, the length
- is zero. When the callout precedes an opening parenthesis, the length
- is that of the entire subpattern.
-
- The pattern_position and next_item_length fields are intended to help
- in distinguishing between different automatic callouts, which all have
- the same callout number. However, they are set for all callouts.
-
-
-RETURN VALUES
-
- The external callout function returns an integer to PCRE. If the value
- is zero, matching proceeds as normal. If the value is greater than
- zero, matching fails at the current point, but the testing of other
- matching possibilities goes ahead, just as if a lookahead assertion had
- failed. If the value is less than zero, the match is abandoned, and
- pcre_exec() (or pcre_dfa_exec()) returns the negative value.
-
- Negative values should normally be chosen from the set of
- PCRE_ERROR_xxx values. In particular, PCRE_ERROR_NOMATCH forces a stan-
- dard "no match" failure. The error number PCRE_ERROR_CALLOUT is
- reserved for use by callout functions; it will never be used by PCRE
- itself.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 15 March 2009
- Copyright (c) 1997-2009 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCRECOMPAT(3) PCRECOMPAT(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-DIFFERENCES BETWEEN PCRE AND PERL
-
- This document describes the differences in the ways that PCRE and Perl
- handle regular expressions. The differences described here are mainly
- with respect to Perl 5.8, though PCRE versions 7.0 and later contain
- some features that are expected to be in the forthcoming Perl 5.10.
-
- 1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details
- of what it does have are given in the section on UTF-8 support in the
- main pcre page.
-
- 2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl
- permits them, but they do not mean what you might think. For example,
- (?!a){3} does not assert that the next three characters are not "a". It
- just asserts that the next character is not "a" three times.
-
- 3. Capturing subpatterns that occur inside negative lookahead asser-
- tions are counted, but their entries in the offsets vector are never
- set. Perl sets its numerical variables from any such patterns that are
- matched before the assertion fails to match something (thereby succeed-
- ing), but only if the negative lookahead assertion contains just one
- branch.
-
- 4. Though binary zero characters are supported in the subject string,
- they are not allowed in a pattern string because it is passed as a nor-
- mal C string, terminated by zero. The escape sequence \0 can be used in
- the pattern to represent a binary zero.
-
- 5. The following Perl escape sequences are not supported: \l, \u, \L,
- \U, and \N. In fact these are implemented by Perl's general string-han-
- dling and are not part of its pattern matching engine. If any of these
- are encountered by PCRE, an error is generated.
-
- 6. The Perl escape sequences \p, \P, and \X are supported only if PCRE
- is built with Unicode character property support. The properties that
- can be tested with \p and \P are limited to the general category prop-
- erties such as Lu and Nd, script names such as Greek or Han, and the
- derived properties Any and L&.
-
- 7. PCRE does support the \Q...\E escape for quoting substrings. Charac-
- ters in between are treated as literals. This is slightly different
- from Perl in that $ and @ are also handled as literals inside the
- quotes. In Perl, they cause variable interpolation (but of course PCRE
- does not have variables). Note the following examples:
-
- Pattern PCRE matches Perl matches
-
- \Qabc$xyz\E abc$xyz abc followed by the
- contents of $xyz
- \Qabc\$xyz\E abc\$xyz abc\$xyz
- \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
-
- The \Q...\E sequence is recognized both inside and outside character
- classes.
-
- 8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
- constructions. However, there is support for recursive patterns. This
- is not available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE
- "callout" feature allows an external function to be called during pat-
- tern matching. See the pcrecallout documentation for details.
-
- 9. Subpatterns that are called recursively or as "subroutines" are
- always treated as atomic groups in PCRE. This is like Python, but
- unlike Perl.
-
- 10. There are some differences that are concerned with the settings of
- captured strings when part of a pattern is repeated. For example,
- matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
- unset, but in PCRE it is set to "b".
-
- 11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT),
- (*FAIL), (*F), (*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in
- the forms without an argument. PCRE does not support (*MARK). If
- (*ACCEPT) is within capturing parentheses, PCRE does not set that cap-
- ture group; this is different to Perl.
-
- 12. PCRE provides some extensions to the Perl regular expression facil-
- ities. Perl 5.10 will include new features that are not in earlier
- versions, some of which (such as named parentheses) have been in PCRE
- for some time. This list is with respect to Perl 5.10:
-
- (a) Although lookbehind assertions must match fixed length strings,
- each alternative branch of a lookbehind assertion can match a different
- length of string. Perl requires them all to have the same length.
-
- (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
- meta-character matches only at the very end of the string.
-
- (c) If PCRE_EXTRA is set, a backslash followed by a letter with no spe-
- cial meaning is faulted. Otherwise, like Perl, the backslash is quietly
- ignored. (Perl can be made to issue a warning.)
-
- (d) If PCRE_UNGREEDY is set, the greediness of the repetition quanti-
- fiers is inverted, that is, by default they are not greedy, but if fol-
- lowed by a question mark they are.
-
- (e) PCRE_ANCHORED can be used at matching time to force a pattern to be
- tried only at the first matching position in the subject string.
-
- (f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAP-
- TURE options for pcre_exec() have no Perl equivalents.
-
- (g) The \R escape sequence can be restricted to match only CR, LF, or
- CRLF by the PCRE_BSR_ANYCRLF option.
-
- (h) The callout facility is PCRE-specific.
-
- (i) The partial matching facility is PCRE-specific.
-
- (j) Patterns compiled by PCRE can be saved and re-used at a later time,
- even on different hosts that have the other endianness.
-
- (k) The alternative matching function (pcre_dfa_exec()) matches in a
- different way and is not Perl-compatible.
-
- (l) PCRE recognizes some special sequences such as (*CR) at the start
- of a pattern that set overall options that cannot be changed within the
- pattern.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 11 September 2007
- Copyright (c) 1997-2007 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCREPATTERN(3) PCREPATTERN(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE REGULAR EXPRESSION DETAILS
-
- The syntax and semantics of the regular expressions that are supported
- by PCRE are described in detail below. There is a quick-reference syn-
- tax summary in the pcresyntax page. PCRE tries to match Perl syntax and
- semantics as closely as it can. PCRE also supports some alternative
- regular expression syntax (which does not conflict with the Perl syn-
- tax) in order to provide some compatibility with regular expressions in
- Python, .NET, and Oniguruma.
-
- Perl's regular expressions are described in its own documentation, and
- regular expressions in general are covered in a number of books, some
- of which have copious examples. Jeffrey Friedl's "Mastering Regular
- Expressions", published by O'Reilly, covers regular expressions in
- great detail. This description of PCRE's regular expressions is
- intended as reference material.
-
- The original operation of PCRE was on strings of one-byte characters.
- However, there is now also support for UTF-8 character strings. To use
- this, you must build PCRE to include UTF-8 support, and then call
- pcre_compile() with the PCRE_UTF8 option. There is also a special
- sequence that can be given at the start of a pattern:
-
- (*UTF8)
-
- Starting a pattern with this sequence is equivalent to setting the
- PCRE_UTF8 option. This feature is not Perl-compatible. How setting
- UTF-8 mode affects pattern matching is mentioned in several places
- below. There is also a summary of UTF-8 features in the section on
- UTF-8 support in the main pcre page.
-
- The remainder of this document discusses the patterns that are sup-
- ported by PCRE when its main matching function, pcre_exec(), is used.
- From release 6.0, PCRE offers a second matching function,
- pcre_dfa_exec(), which matches using a different algorithm that is not
- Perl-compatible. Some of the features discussed below are not available
- when pcre_dfa_exec() is used. The advantages and disadvantages of the
- alternative function, and how it differs from the normal function, are
- discussed in the pcrematching page.
-
-
-NEWLINE CONVENTIONS
-
- PCRE supports five different conventions for indicating line breaks in
- strings: a single CR (carriage return) character, a single LF (line-
- feed) character, the two-character sequence CRLF, any of the three pre-
- ceding, or any Unicode newline sequence. The pcreapi page has further
- discussion about newlines, and shows how to set the newline convention
- in the options arguments for the compiling and matching functions.
-
- It is also possible to specify a newline convention by starting a pat-
- tern string with one of the following five sequences:
-
- (*CR) carriage return
- (*LF) linefeed
- (*CRLF) carriage return, followed by linefeed
- (*ANYCRLF) any of the three above
- (*ANY) all Unicode newline sequences
-
- These override the default and the options given to pcre_compile(). For
- example, on a Unix system where LF is the default newline sequence, the
- pattern
-
- (*CR)a.b
-
- changes the convention to CR. That pattern matches "a\nb" because LF is
- no longer a newline. Note that these special settings, which are not
- Perl-compatible, are recognized only at the very start of a pattern,
- and that they must be in upper case. If more than one of them is
- present, the last one is used.
-
- The newline convention does not affect what the \R escape sequence
- matches. By default, this is any Unicode newline sequence, for Perl
- compatibility. However, this can be changed; see the description of \R
- in the section entitled "Newline sequences" below. A change of \R set-
- ting can be combined with a change of newline convention.
-
-
-CHARACTERS AND METACHARACTERS
-
- A regular expression is a pattern that is matched against a subject
- string from left to right. Most characters stand for themselves in a
- pattern, and match the corresponding characters in the subject. As a
- trivial example, the pattern
-
- The quick brown fox
-
- matches a portion of a subject string that is identical to itself. When
- caseless matching is specified (the PCRE_CASELESS option), letters are
- matched independently of case. In UTF-8 mode, PCRE always understands
- the concept of case for characters whose values are less than 128, so
- caseless matching is always possible. For characters with higher val-
- ues, the concept of case is supported if PCRE is compiled with Unicode
- property support, but not otherwise. If you want to use caseless
- matching for characters 128 and above, you must ensure that PCRE is
- compiled with Unicode property support as well as with UTF-8 support.
-
- The power of regular expressions comes from the ability to include
- alternatives and repetitions in the pattern. These are encoded in the
- pattern by the use of metacharacters, which do not stand for themselves
- but instead are interpreted in some special way.
-
- There are two different sets of metacharacters: those that are recog-
- nized anywhere in the pattern except within square brackets, and those
- that are recognized within square brackets. Outside square brackets,
- the metacharacters are as follows:
-
- \ general escape character with several uses
- ^ assert start of string (or line, in multiline mode)
- $ assert end of string (or line, in multiline mode)
- . match any character except newline (by default)
- [ start character class definition
- | start of alternative branch
- ( start subpattern
- ) end subpattern
- ? extends the meaning of (
- also 0 or 1 quantifier
- also quantifier minimizer
- * 0 or more quantifier
- + 1 or more quantifier
- also "possessive quantifier"
- { start min/max quantifier
-
- Part of a pattern that is in square brackets is called a "character
- class". In a character class the only metacharacters are:
-
- \ general escape character
- ^ negate the class, but only if the first character
- - indicates character range
- [ POSIX character class (only if followed by POSIX
- syntax)
- ] terminates the character class
-
- The following sections describe the use of each of the metacharacters.
-
-
-BACKSLASH
-
- The backslash character has several uses. Firstly, if it is followed by
- a non-alphanumeric character, it takes away any special meaning that
- character may have. This use of backslash as an escape character
- applies both inside and outside character classes.
-
- For example, if you want to match a * character, you write \* in the
- pattern. This escaping action applies whether or not the following
- character would otherwise be interpreted as a metacharacter, so it is
- always safe to precede a non-alphanumeric with backslash to specify
- that it stands for itself. In particular, if you want to match a back-
- slash, you write \\.
-
- If a pattern is compiled with the PCRE_EXTENDED option, whitespace in
- the pattern (other than in a character class) and characters between a
- # outside a character class and the next newline are ignored. An escap-
- ing backslash can be used to include a whitespace or # character as
- part of the pattern.
-
- If you want to remove the special meaning from a sequence of charac-
- ters, you can do so by putting them between \Q and \E. This is differ-
- ent from Perl in that $ and @ are handled as literals in \Q...\E
- sequences in PCRE, whereas in Perl, $ and @ cause variable interpola-
- tion. Note the following examples:
-
- Pattern PCRE matches Perl matches
-
- \Qabc$xyz\E abc$xyz abc followed by the
- contents of $xyz
- \Qabc\$xyz\E abc\$xyz abc\$xyz
- \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
-
- The \Q...\E sequence is recognized both inside and outside character
- classes.
-
- Non-printing characters
-
- A second use of backslash provides a way of encoding non-printing char-
- acters in patterns in a visible manner. There is no restriction on the
- appearance of non-printing characters, apart from the binary zero that
- terminates a pattern, but when a pattern is being prepared by text
- editing, it is usually easier to use one of the following escape
- sequences than the binary character it represents:
-
- \a alarm, that is, the BEL character (hex 07)
- \cx "control-x", where x is any character
- \e escape (hex 1B)
- \f formfeed (hex 0C)
- \n linefeed (hex 0A)
- \r carriage return (hex 0D)
- \t tab (hex 09)
- \ddd character with octal code ddd, or backreference
- \xhh character with hex code hh
- \x{hhh..} character with hex code hhh..
-
- The precise effect of \cx is as follows: if x is a lower case letter,
- it is converted to upper case. Then bit 6 of the character (hex 40) is
- inverted. Thus \cz becomes hex 1A, but \c{ becomes hex 3B, while \c;
- becomes hex 7B.
-
- After \x, from zero to two hexadecimal digits are read (letters can be
- in upper or lower case). Any number of hexadecimal digits may appear
- between \x{ and }, but the value of the character code must be less
- than 256 in non-UTF-8 mode, and less than 2**31 in UTF-8 mode. That is,
- the maximum value in hexadecimal is 7FFFFFFF. Note that this is bigger
- than the largest Unicode code point, which is 10FFFF.
-
- If characters other than hexadecimal digits appear between \x{ and },
- or if there is no terminating }, this form of escape is not recognized.
- Instead, the initial \x will be interpreted as a basic hexadecimal
- escape, with no following digits, giving a character whose value is
- zero.
-
- Characters whose value is less than 256 can be defined by either of the
- two syntaxes for \x. There is no difference in the way they are han-
- dled. For example, \xdc is exactly the same as \x{dc}.
-
- After \0 up to two further octal digits are read. If there are fewer
- than two digits, just those that are present are used. Thus the
- sequence \0\x\07 specifies two binary zeros followed by a BEL character
- (code value 7). Make sure you supply two digits after the initial zero
- if the pattern character that follows is itself an octal digit.
-
- The handling of a backslash followed by a digit other than 0 is compli-
- cated. Outside a character class, PCRE reads it and any following dig-
- its as a decimal number. If the number is less than 10, or if there
- have been at least that many previous capturing left parentheses in the
- expression, the entire sequence is taken as a back reference. A
- description of how this works is given later, following the discussion
- of parenthesized subpatterns.
-
- Inside a character class, or if the decimal number is greater than 9
- and there have not been that many capturing subpatterns, PCRE re-reads
- up to three octal digits following the backslash, and uses them to gen-
- erate a data character. Any subsequent digits stand for themselves. In
- non-UTF-8 mode, the value of a character specified in octal must be
- less than \400. In UTF-8 mode, values up to \777 are permitted. For
- example:
-
- \040 is another way of writing a space
- \40 is the same, provided there are fewer than 40
- previous capturing subpatterns
- \7 is always a back reference
- \11 might be a back reference, or another way of
- writing a tab
- \011 is always a tab
- \0113 is a tab followed by the character "3"
- \113 might be a back reference, otherwise the
- character with octal code 113
- \377 might be a back reference, otherwise
- the byte consisting entirely of 1 bits
- \81 is either a back reference, or a binary zero
- followed by the two characters "8" and "1"
-
- Note that octal values of 100 or greater must not be introduced by a
- leading zero, because no more than three octal digits are ever read.
-
- All the sequences that define a single character value can be used both
- inside and outside character classes. In addition, inside a character
- class, the sequence \b is interpreted as the backspace character (hex
- 08), and the sequences \R and \X are interpreted as the characters "R"
- and "X", respectively. Outside a character class, these sequences have
- different meanings (see below).
-
- Absolute and relative back references
-
- The sequence \g followed by an unsigned or a negative number, option-
- ally enclosed in braces, is an absolute or relative back reference. A
- named back reference can be coded as \g{name}. Back references are dis-
- cussed later, following the discussion of parenthesized subpatterns.
-
- Absolute and relative subroutine calls
-
- For compatibility with Oniguruma, the non-Perl syntax \g followed by a
- name or a number enclosed either in angle brackets or single quotes, is
- an alternative syntax for referencing a subpattern as a "subroutine".
- Details are discussed later. Note that \g{...} (Perl syntax) and
- \g<...> (Oniguruma syntax) are not synonymous. The former is a back
- reference; the latter is a subroutine call.
-
- Generic character types
-
- Another use of backslash is for specifying generic character types. The
- following are always recognized:
-
- \d any decimal digit
- \D any character that is not a decimal digit
- \h any horizontal whitespace character
- \H any character that is not a horizontal whitespace character
- \s any whitespace character
- \S any character that is not a whitespace character
- \v any vertical whitespace character
- \V any character that is not a vertical whitespace character
- \w any "word" character
- \W any "non-word" character
-
- Each pair of escape sequences partitions the complete set of characters
- into two disjoint sets. Any given character matches one, and only one,
- of each pair.
-
- These character type sequences can appear both inside and outside char-
- acter classes. They each match one character of the appropriate type.
- If the current matching point is at the end of the subject string, all
- of them fail, since there is no character to match.
-
- For compatibility with Perl, \s does not match the VT character (code
- 11). This makes it different from the the POSIX "space" class. The \s
- characters are HT (9), LF (10), FF (12), CR (13), and space (32). If
- "use locale;" is included in a Perl script, \s may match the VT charac-
- ter. In PCRE, it never does.
-
- In UTF-8 mode, characters with values greater than 128 never match \d,
- \s, or \w, and always match \D, \S, and \W. This is true even when Uni-
- code character property support is available. These sequences retain
- their original meanings from before UTF-8 support was available, mainly
- for efficiency reasons. Note that this also affects \b, because it is
- defined in terms of \w and \W.
-
- The sequences \h, \H, \v, and \V are Perl 5.10 features. In contrast to
- the other sequences, these do match certain high-valued codepoints in
- UTF-8 mode. The horizontal space characters are:
-
- U+0009 Horizontal tab
- U+0020 Space
- U+00A0 Non-break space
- U+1680 Ogham space mark
- U+180E Mongolian vowel separator
- U+2000 En quad
- U+2001 Em quad
- U+2002 En space
- U+2003 Em space
- U+2004 Three-per-em space
- U+2005 Four-per-em space
- U+2006 Six-per-em space
- U+2007 Figure space
- U+2008 Punctuation space
- U+2009 Thin space
- U+200A Hair space
- U+202F Narrow no-break space
- U+205F Medium mathematical space
- U+3000 Ideographic space
-
- The vertical space characters are:
-
- U+000A Linefeed
- U+000B Vertical tab
- U+000C Formfeed
- U+000D Carriage return
- U+0085 Next line
- U+2028 Line separator
- U+2029 Paragraph separator
-
- A "word" character is an underscore or any character less than 256 that
- is a letter or digit. The definition of letters and digits is con-
- trolled by PCRE's low-valued character tables, and may vary if locale-
- specific matching is taking place (see "Locale support" in the pcreapi
- page). For example, in a French locale such as "fr_FR" in Unix-like
- systems, or "french" in Windows, some character codes greater than 128
- are used for accented letters, and these are matched by \w. The use of
- locales with Unicode is discouraged.
-
- Newline sequences
-
- Outside a character class, by default, the escape sequence \R matches
- any Unicode newline sequence. This is a Perl 5.10 feature. In non-UTF-8
- mode \R is equivalent to the following:
-
- (?>\r\n|\n|\x0b|\f|\r|\x85)
-
- This is an example of an "atomic group", details of which are given
- below. This particular group matches either the two-character sequence
- CR followed by LF, or one of the single characters LF (linefeed,
- U+000A), VT (vertical tab, U+000B), FF (formfeed, U+000C), CR (carriage
- return, U+000D), or NEL (next line, U+0085). The two-character sequence
- is treated as a single unit that cannot be split.
-
- In UTF-8 mode, two additional characters whose codepoints are greater
- than 255 are added: LS (line separator, U+2028) and PS (paragraph sepa-
- rator, U+2029). Unicode character property support is not needed for
- these characters to be recognized.
-
- It is possible to restrict \R to match only CR, LF, or CRLF (instead of
- the complete set of Unicode line endings) by setting the option
- PCRE_BSR_ANYCRLF either at compile time or when the pattern is matched.
- (BSR is an abbrevation for "backslash R".) This can be made the default
- when PCRE is built; if this is the case, the other behaviour can be
- requested via the PCRE_BSR_UNICODE option. It is also possible to
- specify these settings by starting a pattern string with one of the
- following sequences:
-
- (*BSR_ANYCRLF) CR, LF, or CRLF only
- (*BSR_UNICODE) any Unicode newline sequence
-
- These override the default and the options given to pcre_compile(), but
- they can be overridden by options given to pcre_exec(). Note that these
- special settings, which are not Perl-compatible, are recognized only at
- the very start of a pattern, and that they must be in upper case. If
- more than one of them is present, the last one is used. They can be
- combined with a change of newline convention, for example, a pattern
- can start with:
-
- (*ANY)(*BSR_ANYCRLF)
-
- Inside a character class, \R matches the letter "R".
-
- Unicode character properties
-
- When PCRE is built with Unicode character property support, three addi-
- tional escape sequences that match characters with specific properties
- are available. When not in UTF-8 mode, these sequences are of course
- limited to testing characters whose codepoints are less than 256, but
- they do work in this mode. The extra escape sequences are:
-
- \p{xx} a character with the xx property
- \P{xx} a character without the xx property
- \X an extended Unicode sequence
-
- The property names represented by xx above are limited to the Unicode
- script names, the general category properties, and "Any", which matches
- any character (including newline). Other properties such as "InMusical-
- Symbols" are not currently supported by PCRE. Note that \P{Any} does
- not match any characters, so always causes a match failure.
-
- Sets of Unicode characters are defined as belonging to certain scripts.
- A character from one of these sets can be matched using a script name.
- For example:
-
- \p{Greek}
- \P{Han}
-
- Those that are not part of an identified script are lumped together as
- "Common". The current list of scripts is:
-
- Arabic, Armenian, Balinese, Bengali, Bopomofo, Braille, Buginese,
- Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic, Cuneiform,
- Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian, Glagolitic,
- Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, Hira-
- gana, Inherited, Kannada, Katakana, Kharoshthi, Khmer, Lao, Latin,
- Limbu, Linear_B, Malayalam, Mongolian, Myanmar, New_Tai_Lue, Nko,
- Ogham, Old_Italic, Old_Persian, Oriya, Osmanya, Phags_Pa, Phoenician,
- Runic, Shavian, Sinhala, Syloti_Nagri, Syriac, Tagalog, Tagbanwa,
- Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan, Tifinagh, Ugaritic, Yi.
-
- Each character has exactly one general category property, specified by
- a two-letter abbreviation. For compatibility with Perl, negation can be
- specified by including a circumflex between the opening brace and the
- property name. For example, \p{^Lu} is the same as \P{Lu}.
-
- If only one letter is specified with \p or \P, it includes all the gen-
- eral category properties that start with that letter. In this case, in
- the absence of negation, the curly brackets in the escape sequence are
- optional; these two examples have the same effect:
-
- \p{L}
- \pL
-
- The following general category property codes are supported:
-
- C Other
- Cc Control
- Cf Format
- Cn Unassigned
- Co Private use
- Cs Surrogate
-
- L Letter
- Ll Lower case letter
- Lm Modifier letter
- Lo Other letter
- Lt Title case letter
- Lu Upper case letter
-
- M Mark
- Mc Spacing mark
- Me Enclosing mark
- Mn Non-spacing mark
-
- N Number
- Nd Decimal number
- Nl Letter number
- No Other number
-
- P Punctuation
- Pc Connector punctuation
- Pd Dash punctuation
- Pe Close punctuation
- Pf Final punctuation
- Pi Initial punctuation
- Po Other punctuation
- Ps Open punctuation
-
- S Symbol
- Sc Currency symbol
- Sk Modifier symbol
- Sm Mathematical symbol
- So Other symbol
-
- Z Separator
- Zl Line separator
- Zp Paragraph separator
- Zs Space separator
-
- The special property L& is also supported: it matches a character that
- has the Lu, Ll, or Lt property, in other words, a letter that is not
- classified as a modifier or "other".
-
- The Cs (Surrogate) property applies only to characters in the range
- U+D800 to U+DFFF. Such characters are not valid in UTF-8 strings (see
- RFC 3629) and so cannot be tested by PCRE, unless UTF-8 validity check-
- ing has been turned off (see the discussion of PCRE_NO_UTF8_CHECK in
- the pcreapi page).
-
- The long synonyms for these properties that Perl supports (such as
- \p{Letter}) are not supported by PCRE, nor is it permitted to prefix
- any of these properties with "Is".
-
- No character that is in the Unicode table has the Cn (unassigned) prop-
- erty. Instead, this property is assumed for any code point that is not
- in the Unicode table.
-
- Specifying caseless matching does not affect these escape sequences.
- For example, \p{Lu} always matches only upper case letters.
-
- The \X escape matches any number of Unicode characters that form an
- extended Unicode sequence. \X is equivalent to
-
- (?>\PM\pM*)
-
- That is, it matches a character without the "mark" property, followed
- by zero or more characters with the "mark" property, and treats the
- sequence as an atomic group (see below). Characters with the "mark"
- property are typically accents that affect the preceding character.
- None of them have codepoints less than 256, so in non-UTF-8 mode \X
- matches any one character.
-
- Matching characters by Unicode property is not fast, because PCRE has
- to search a structure that contains data for over fifteen thousand
- characters. That is why the traditional escape sequences such as \d and
- \w do not use Unicode properties in PCRE.
-
- Resetting the match start
-
- The escape sequence \K, which is a Perl 5.10 feature, causes any previ-
- ously matched characters not to be included in the final matched
- sequence. For example, the pattern:
-
- foo\Kbar
-
- matches "foobar", but reports that it has matched "bar". This feature
- is similar to a lookbehind assertion (described below). However, in
- this case, the part of the subject before the real match does not have
- to be of fixed length, as lookbehind assertions do. The use of \K does
- not interfere with the setting of captured substrings. For example,
- when the pattern
-
- (foo)\Kbar
-
- matches "foobar", the first substring is still set to "foo".
-
- Simple assertions
-
- The final use of backslash is for certain simple assertions. An asser-
- tion specifies a condition that has to be met at a particular point in
- a match, without consuming any characters from the subject string. The
- use of subpatterns for more complicated assertions is described below.
- The backslashed assertions are:
-
- \b matches at a word boundary
- \B matches when not at a word boundary
- \A matches at the start of the subject
- \Z matches at the end of the subject
- also matches before a newline at the end of the subject
- \z matches only at the end of the subject
- \G matches at the first matching position in the subject
-
- These assertions may not appear in character classes (but note that \b
- has a different meaning, namely the backspace character, inside a char-
- acter class).
-
- A word boundary is a position in the subject string where the current
- character and the previous character do not both match \w or \W (i.e.
- one matches \w and the other matches \W), or the start or end of the
- string if the first or last character matches \w, respectively.
-
- The \A, \Z, and \z assertions differ from the traditional circumflex
- and dollar (described in the next section) in that they only ever match
- at the very start and end of the subject string, whatever options are
- set. Thus, they are independent of multiline mode. These three asser-
- tions are not affected by the PCRE_NOTBOL or PCRE_NOTEOL options, which
- affect only the behaviour of the circumflex and dollar metacharacters.
- However, if the startoffset argument of pcre_exec() is non-zero, indi-
- cating that matching is to start at a point other than the beginning of
- the subject, \A can never match. The difference between \Z and \z is
- that \Z matches before a newline at the end of the string as well as at
- the very end, whereas \z matches only at the end.
-
- The \G assertion is true only when the current matching position is at
- the start point of the match, as specified by the startoffset argument
- of pcre_exec(). It differs from \A when the value of startoffset is
- non-zero. By calling pcre_exec() multiple times with appropriate argu-
- ments, you can mimic Perl's /g option, and it is in this kind of imple-
- mentation where \G can be useful.
-
- Note, however, that PCRE's interpretation of \G, as the start of the
- current match, is subtly different from Perl's, which defines it as the
- end of the previous match. In Perl, these can be different when the
- previously matched string was empty. Because PCRE does just one match
- at a time, it cannot reproduce this behaviour.
-
- If all the alternatives of a pattern begin with \G, the expression is
- anchored to the starting match position, and the "anchored" flag is set
- in the compiled regular expression.
-
-
-CIRCUMFLEX AND DOLLAR
-
- Outside a character class, in the default matching mode, the circumflex
- character is an assertion that is true only if the current matching
- point is at the start of the subject string. If the startoffset argu-
- ment of pcre_exec() is non-zero, circumflex can never match if the
- PCRE_MULTILINE option is unset. Inside a character class, circumflex
- has an entirely different meaning (see below).
-
- Circumflex need not be the first character of the pattern if a number
- of alternatives are involved, but it should be the first thing in each
- alternative in which it appears if the pattern is ever to match that
- branch. If all possible alternatives start with a circumflex, that is,
- if the pattern is constrained to match only at the start of the sub-
- ject, it is said to be an "anchored" pattern. (There are also other
- constructs that can cause a pattern to be anchored.)
-
- A dollar character is an assertion that is true only if the current
- matching point is at the end of the subject string, or immediately
- before a newline at the end of the string (by default). Dollar need not
- be the last character of the pattern if a number of alternatives are
- involved, but it should be the last item in any branch in which it
- appears. Dollar has no special meaning in a character class.
-
- The meaning of dollar can be changed so that it matches only at the
- very end of the string, by setting the PCRE_DOLLAR_ENDONLY option at
- compile time. This does not affect the \Z assertion.
-
- The meanings of the circumflex and dollar characters are changed if the
- PCRE_MULTILINE option is set. When this is the case, a circumflex
- matches immediately after internal newlines as well as at the start of
- the subject string. It does not match after a newline that ends the
- string. A dollar matches before any newlines in the string, as well as
- at the very end, when PCRE_MULTILINE is set. When newline is specified
- as the two-character sequence CRLF, isolated CR and LF characters do
- not indicate newlines.
-
- For example, the pattern /^abc$/ matches the subject string "def\nabc"
- (where \n represents a newline) in multiline mode, but not otherwise.
- Consequently, patterns that are anchored in single line mode because
- all branches start with ^ are not anchored in multiline mode, and a
- match for circumflex is possible when the startoffset argument of
- pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
- PCRE_MULTILINE is set.
-
- Note that the sequences \A, \Z, and \z can be used to match the start
- and end of the subject in both modes, and if all branches of a pattern
- start with \A it is always anchored, whether or not PCRE_MULTILINE is
- set.
-
-
-FULL STOP (PERIOD, DOT)
-
- Outside a character class, a dot in the pattern matches any one charac-
- ter in the subject string except (by default) a character that signi-
- fies the end of a line. In UTF-8 mode, the matched character may be
- more than one byte long.
-
- When a line ending is defined as a single character, dot never matches
- that character; when the two-character sequence CRLF is used, dot does
- not match CR if it is immediately followed by LF, but otherwise it
- matches all characters (including isolated CRs and LFs). When any Uni-
- code line endings are being recognized, dot does not match CR or LF or
- any of the other line ending characters.
-
- The behaviour of dot with regard to newlines can be changed. If the
- PCRE_DOTALL option is set, a dot matches any one character, without
- exception. If the two-character sequence CRLF is present in the subject
- string, it takes two dots to match it.
-
- The handling of dot is entirely independent of the handling of circum-
- flex and dollar, the only relationship being that they both involve
- newlines. Dot has no special meaning in a character class.
-
-
-MATCHING A SINGLE BYTE
-
- Outside a character class, the escape sequence \C matches any one byte,
- both in and out of UTF-8 mode. Unlike a dot, it always matches any
- line-ending characters. The feature is provided in Perl in order to
- match individual bytes in UTF-8 mode. Because it breaks up UTF-8 char-
- acters into individual bytes, what remains in the string may be a mal-
- formed UTF-8 string. For this reason, the \C escape sequence is best
- avoided.
-
- PCRE does not allow \C to appear in lookbehind assertions (described
- below), because in UTF-8 mode this would make it impossible to calcu-
- late the length of the lookbehind.
-
-
-SQUARE BRACKETS AND CHARACTER CLASSES
-
- An opening square bracket introduces a character class, terminated by a
- closing square bracket. A closing square bracket on its own is not spe-
- cial. If a closing square bracket is required as a member of the class,
- it should be the first data character in the class (after an initial
- circumflex, if present) or escaped with a backslash.
-
- A character class matches a single character in the subject. In UTF-8
- mode, the character may occupy more than one byte. A matched character
- must be in the set of characters defined by the class, unless the first
- character in the class definition is a circumflex, in which case the
- subject character must not be in the set defined by the class. If a
- circumflex is actually required as a member of the class, ensure it is
- not the first character, or escape it with a backslash.
-
- For example, the character class [aeiou] matches any lower case vowel,
- while [^aeiou] matches any character that is not a lower case vowel.
- Note that a circumflex is just a convenient notation for specifying the
- characters that are in the class by enumerating those that are not. A
- class that starts with a circumflex is not an assertion: it still con-
- sumes a character from the subject string, and therefore it fails if
- the current pointer is at the end of the string.
-
- In UTF-8 mode, characters with values greater than 255 can be included
- in a class as a literal string of bytes, or by using the \x{ escaping
- mechanism.
-
- When caseless matching is set, any letters in a class represent both
- their upper case and lower case versions, so for example, a caseless
- [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
- match "A", whereas a caseful version would. In UTF-8 mode, PCRE always
- understands the concept of case for characters whose values are less
- than 128, so caseless matching is always possible. For characters with
- higher values, the concept of case is supported if PCRE is compiled
- with Unicode property support, but not otherwise. If you want to use
- caseless matching for characters 128 and above, you must ensure that
- PCRE is compiled with Unicode property support as well as with UTF-8
- support.
-
- Characters that might indicate line breaks are never treated in any
- special way when matching character classes, whatever line-ending
- sequence is in use, and whatever setting of the PCRE_DOTALL and
- PCRE_MULTILINE options is used. A class such as [^a] always matches one
- of these characters.
-
- The minus (hyphen) character can be used to specify a range of charac-
- ters in a character class. For example, [d-m] matches any letter
- between d and m, inclusive. If a minus character is required in a
- class, it must be escaped with a backslash or appear in a position
- where it cannot be interpreted as indicating a range, typically as the
- first or last character in the class.
-
- It is not possible to have the literal character "]" as the end charac-
- ter of a range. A pattern such as [W-]46] is interpreted as a class of
- two characters ("W" and "-") followed by a literal string "46]", so it
- would match "W46]" or "-46]". However, if the "]" is escaped with a
- backslash it is interpreted as the end of range, so [W-\]46] is inter-
- preted as a class containing a range followed by two other characters.
- The octal or hexadecimal representation of "]" can also be used to end
- a range.
-
- Ranges operate in the collating sequence of character values. They can
- also be used for characters specified numerically, for example
- [\000-\037]. In UTF-8 mode, ranges can include characters whose values
- are greater than 255, for example [\x{100}-\x{2ff}].
-
- If a range that includes letters is used when caseless matching is set,
- it matches the letters in either case. For example, [W-c] is equivalent
- to [][\\^_`wxyzabc], matched caselessly, and in non-UTF-8 mode, if
- character tables for a French locale are in use, [\xc8-\xcb] matches
- accented E characters in both cases. In UTF-8 mode, PCRE supports the
- concept of case for characters with values greater than 128 only when
- it is compiled with Unicode property support.
-
- The character types \d, \D, \p, \P, \s, \S, \w, and \W may also appear
- in a character class, and add the characters that they match to the
- class. For example, [\dABCDEF] matches any hexadecimal digit. A circum-
- flex can conveniently be used with the upper case character types to
- specify a more restricted set of characters than the matching lower
- case type. For example, the class [^\W_] matches any letter or digit,
- but not underscore.
-
- The only metacharacters that are recognized in character classes are
- backslash, hyphen (only where it can be interpreted as specifying a
- range), circumflex (only at the start), opening square bracket (only
- when it can be interpreted as introducing a POSIX class name - see the
- next section), and the terminating closing square bracket. However,
- escaping other non-alphanumeric characters does no harm.
-
-
-POSIX CHARACTER CLASSES
-
- Perl supports the POSIX notation for character classes. This uses names
- enclosed by [: and :] within the enclosing square brackets. PCRE also
- supports this notation. For example,
-
- [01[:alpha:]%]
-
- matches "0", "1", any alphabetic character, or "%". The supported class
- names are
-
- alnum letters and digits
- alpha letters
- ascii character codes 0 - 127
- blank space or tab only
- cntrl control characters
- digit decimal digits (same as \d)
- graph printing characters, excluding space
- lower lower case letters
- print printing characters, including space
- punct printing characters, excluding letters and digits
- space white space (not quite the same as \s)
- upper upper case letters
- word "word" characters (same as \w)
- xdigit hexadecimal digits
-
- The "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13),
- and space (32). Notice that this list includes the VT character (code
- 11). This makes "space" different to \s, which does not include VT (for
- Perl compatibility).
-
- The name "word" is a Perl extension, and "blank" is a GNU extension
- from Perl 5.8. Another Perl extension is negation, which is indicated
- by a ^ character after the colon. For example,
-
- [12[:^digit:]]
-
- matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the
- POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but
- these are not supported, and an error is given if they are encountered.
-
- In UTF-8 mode, characters with values greater than 128 do not match any
- of the POSIX character classes.
-
-
-VERTICAL BAR
-
- Vertical bar characters are used to separate alternative patterns. For
- example, the pattern
-
- gilbert|sullivan
-
- matches either "gilbert" or "sullivan". Any number of alternatives may
- appear, and an empty alternative is permitted (matching the empty
- string). The matching process tries each alternative in turn, from left
- to right, and the first one that succeeds is used. If the alternatives
- are within a subpattern (defined below), "succeeds" means matching the
- rest of the main pattern as well as the alternative in the subpattern.
-
-
-INTERNAL OPTION SETTING
-
- The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
- PCRE_EXTENDED options (which are Perl-compatible) can be changed from
- within the pattern by a sequence of Perl option letters enclosed
- between "(?" and ")". The option letters are
-
- i for PCRE_CASELESS
- m for PCRE_MULTILINE
- s for PCRE_DOTALL
- x for PCRE_EXTENDED
-
- For example, (?im) sets caseless, multiline matching. It is also possi-
- ble to unset these options by preceding the letter with a hyphen, and a
- combined setting and unsetting such as (?im-sx), which sets PCRE_CASE-
- LESS and PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED,
- is also permitted. If a letter appears both before and after the
- hyphen, the option is unset.
-
- The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA
- can be changed in the same way as the Perl-compatible options by using
- the characters J, U and X respectively.
-
- When one of these option changes occurs at top level (that is, not
- inside subpattern parentheses), the change applies to the remainder of
- the pattern that follows. If the change is placed right at the start of
- a pattern, PCRE extracts it into the global options (and it will there-
- fore show up in data extracted by the pcre_fullinfo() function).
-
- An option change within a subpattern (see below for a description of
- subpatterns) affects only that part of the current pattern that follows
- it, so
-
- (a(?i)b)c
-
- matches abc and aBc and no other strings (assuming PCRE_CASELESS is not
- used). By this means, options can be made to have different settings
- in different parts of the pattern. Any changes made in one alternative
- do carry on into subsequent branches within the same subpattern. For
- example,
-
- (a(?i)b|c)
-
- matches "ab", "aB", "c", and "C", even though when matching "C" the
- first branch is abandoned before the option setting. This is because
- the effects of option settings happen at compile time. There would be
- some very weird behaviour otherwise.
-
- Note: There are other PCRE-specific options that can be set by the
- application when the compile or match functions are called. In some
- cases the pattern can contain special leading sequences such as (*CRLF)
- to override what the application has set or what has been defaulted.
- Details are given in the section entitled "Newline sequences" above.
- There is also the (*UTF8) leading sequence that can be used to set
- UTF-8 mode; this is equivalent to setting the PCRE_UTF8 option.
-
-
-SUBPATTERNS
-
- Subpatterns are delimited by parentheses (round brackets), which can be
- nested. Turning part of a pattern into a subpattern does two things:
-
- 1. It localizes a set of alternatives. For example, the pattern
-
- cat(aract|erpillar|)
-
- matches one of the words "cat", "cataract", or "caterpillar". Without
- the parentheses, it would match "cataract", "erpillar" or an empty
- string.
-
- 2. It sets up the subpattern as a capturing subpattern. This means
- that, when the whole pattern matches, that portion of the subject
- string that matched the subpattern is passed back to the caller via the
- ovector argument of pcre_exec(). Opening parentheses are counted from
- left to right (starting from 1) to obtain numbers for the capturing
- subpatterns.
-
- For example, if the string "the red king" is matched against the pat-
- tern
-
- the ((red|white) (king|queen))
-
- the captured substrings are "red king", "red", and "king", and are num-
- bered 1, 2, and 3, respectively.
-
- The fact that plain parentheses fulfil two functions is not always
- helpful. There are often times when a grouping subpattern is required
- without a capturing requirement. If an opening parenthesis is followed
- by a question mark and a colon, the subpattern does not do any captur-
- ing, and is not counted when computing the number of any subsequent
- capturing subpatterns. For example, if the string "the white queen" is
- matched against the pattern
-
- the ((?:red|white) (king|queen))
-
- the captured substrings are "white queen" and "queen", and are numbered
- 1 and 2. The maximum number of capturing subpatterns is 65535.
-
- As a convenient shorthand, if any option settings are required at the
- start of a non-capturing subpattern, the option letters may appear
- between the "?" and the ":". Thus the two patterns
-
- (?i:saturday|sunday)
- (?:(?i)saturday|sunday)
-
- match exactly the same set of strings. Because alternative branches are
- tried from left to right, and options are not reset until the end of
- the subpattern is reached, an option setting in one branch does affect
- subsequent branches, so the above patterns match "SUNDAY" as well as
- "Saturday".
-
-
-DUPLICATE SUBPATTERN NUMBERS
-
- Perl 5.10 introduced a feature whereby each alternative in a subpattern
- uses the same numbers for its capturing parentheses. Such a subpattern
- starts with (?| and is itself a non-capturing subpattern. For example,
- consider this pattern:
-
- (?|(Sat)ur|(Sun))day
-
- Because the two alternatives are inside a (?| group, both sets of cap-
- turing parentheses are numbered one. Thus, when the pattern matches,
- you can look at captured substring number one, whichever alternative
- matched. This construct is useful when you want to capture part, but
- not all, of one of a number of alternatives. Inside a (?| group, paren-
- theses are numbered as usual, but the number is reset at the start of
- each branch. The numbers of any capturing buffers that follow the sub-
- pattern start after the highest number used in any branch. The follow-
- ing example is taken from the Perl documentation. The numbers under-
- neath show in which buffer the captured content will be stored.
-
- # before ---------------branch-reset----------- after
- / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
- # 1 2 2 3 2 3 4
-
- A backreference or a recursive call to a numbered subpattern always
- refers to the first one in the pattern with the given number.
-
- An alternative approach to using this "branch reset" feature is to use
- duplicate named subpatterns, as described in the next section.
-
-
-NAMED SUBPATTERNS
-
- Identifying capturing parentheses by number is simple, but it can be
- very hard to keep track of the numbers in complicated regular expres-
- sions. Furthermore, if an expression is modified, the numbers may
- change. To help with this difficulty, PCRE supports the naming of sub-
- patterns. This feature was not added to Perl until release 5.10. Python
- had the feature earlier, and PCRE introduced it at release 4.0, using
- the Python syntax. PCRE now supports both the Perl and the Python syn-
- tax.
-
- In PCRE, a subpattern can be named in one of three ways: (?...)
- or (?'name'...) as in Perl, or (?P...) as in Python. References
- to capturing parentheses from other parts of the pattern, such as back-
- references, recursion, and conditions, can be made by name as well as
- by number.
-
- Names consist of up to 32 alphanumeric characters and underscores.
- Named capturing parentheses are still allocated numbers as well as
- names, exactly as if the names were not present. The PCRE API provides
- function calls for extracting the name-to-number translation table from
- a compiled pattern. There is also a convenience function for extracting
- a captured substring by name.
-
- By default, a name must be unique within a pattern, but it is possible
- to relax this constraint by setting the PCRE_DUPNAMES option at compile
- time. This can be useful for patterns where only one instance of the
- named parentheses can match. Suppose you want to match the name of a
- weekday, either as a 3-letter abbreviation or as the full name, and in
- both cases you want to extract the abbreviation. This pattern (ignoring
- the line breaks) does the job:
-
- (?Mon|Fri|Sun)(?:day)?|
- (?Tue)(?:sday)?|
- (?Wed)(?:nesday)?|
- (?Thu)(?:rsday)?|
- (?Sat)(?:urday)?
-
- There are five capturing substrings, but only one is ever set after a
- match. (An alternative way of solving this problem is to use a "branch
- reset" subpattern, as described in the previous section.)
-
- The convenience function for extracting the data by name returns the
- substring for the first (and in this example, the only) subpattern of
- that name that matched. This saves searching to find which numbered
- subpattern it was. If you make a reference to a non-unique named sub-
- pattern from elsewhere in the pattern, the one that corresponds to the
- lowest number is used. For further details of the interfaces for han-
- dling named subpatterns, see the pcreapi documentation.
-
- Warning: You cannot use different names to distinguish between two sub-
- patterns with the same number (see the previous section) because PCRE
- uses only the numbers when matching.
-
-
-REPETITION
-
- Repetition is specified by quantifiers, which can follow any of the
- following items:
-
- a literal data character
- the dot metacharacter
- the \C escape sequence
- the \X escape sequence (in UTF-8 mode with Unicode properties)
- the \R escape sequence
- an escape such as \d that matches a single character
- a character class
- a back reference (see next section)
- a parenthesized subpattern (unless it is an assertion)
-
- The general repetition quantifier specifies a minimum and maximum num-
- ber of permitted matches, by giving the two numbers in curly brackets
- (braces), separated by a comma. The numbers must be less than 65536,
- and the first must be less than or equal to the second. For example:
-
- z{2,4}
-
- matches "zz", "zzz", or "zzzz". A closing brace on its own is not a
- special character. If the second number is omitted, but the comma is
- present, there is no upper limit; if the second number and the comma
- are both omitted, the quantifier specifies an exact number of required
- matches. Thus
-
- [aeiou]{3,}
-
- matches at least 3 successive vowels, but may match many more, while
-
- \d{8}
-
- matches exactly 8 digits. An opening curly bracket that appears in a
- position where a quantifier is not allowed, or one that does not match
- the syntax of a quantifier, is taken as a literal character. For exam-
- ple, {,6} is not a quantifier, but a literal string of four characters.
-
- In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to
- individual bytes. Thus, for example, \x{100}{2} matches two UTF-8 char-
- acters, each of which is represented by a two-byte sequence. Similarly,
- when Unicode property support is available, \X{3} matches three Unicode
- extended sequences, each of which may be several bytes long (and they
- may be of different lengths).
-
- The quantifier {0} is permitted, causing the expression to behave as if
- the previous item and the quantifier were not present. This may be use-
- ful for subpatterns that are referenced as subroutines from elsewhere
- in the pattern. Items other than subpatterns that have a {0} quantifier
- are omitted from the compiled pattern.
-
- For convenience, the three most common quantifiers have single-charac-
- ter abbreviations:
-
- * is equivalent to {0,}
- + is equivalent to {1,}
- ? is equivalent to {0,1}
-
- It is possible to construct infinite loops by following a subpattern
- that can match no characters with a quantifier that has no upper limit,
- for example:
-
- (a?)*
-
- Earlier versions of Perl and PCRE used to give an error at compile time
- for such patterns. However, because there are cases where this can be
- useful, such patterns are now accepted, but if any repetition of the
- subpattern does in fact match no characters, the loop is forcibly bro-
- ken.
-
- By default, the quantifiers are "greedy", that is, they match as much
- as possible (up to the maximum number of permitted times), without
- causing the rest of the pattern to fail. The classic example of where
- this gives problems is in trying to match comments in C programs. These
- appear between /* and */ and within the comment, individual * and /
- characters may appear. An attempt to match C comments by applying the
- pattern
-
- /\*.*\*/
-
- to the string
-
- /* first comment */ not comment /* second comment */
-
- fails, because it matches the entire string owing to the greediness of
- the .* item.
-
- However, if a quantifier is followed by a question mark, it ceases to
- be greedy, and instead matches the minimum number of times possible, so
- the pattern
-
- /\*.*?\*/
-
- does the right thing with the C comments. The meaning of the various
- quantifiers is not otherwise changed, just the preferred number of
- matches. Do not confuse this use of question mark with its use as a
- quantifier in its own right. Because it has two uses, it can sometimes
- appear doubled, as in
-
- \d??\d
-
- which matches one digit by preference, but can match two if that is the
- only way the rest of the pattern matches.
-
- If the PCRE_UNGREEDY option is set (an option that is not available in
- Perl), the quantifiers are not greedy by default, but individual ones
- can be made greedy by following them with a question mark. In other
- words, it inverts the default behaviour.
-
- When a parenthesized subpattern is quantified with a minimum repeat
- count that is greater than 1 or with a limited maximum, more memory is
- required for the compiled pattern, in proportion to the size of the
- minimum or maximum.
-
- If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equiv-
- alent to Perl's /s) is set, thus allowing the dot to match newlines,
- the pattern is implicitly anchored, because whatever follows will be
- tried against every character position in the subject string, so there
- is no point in retrying the overall match at any position after the
- first. PCRE normally treats such a pattern as though it were preceded
- by \A.
-
- In cases where it is known that the subject string contains no new-
- lines, it is worth setting PCRE_DOTALL in order to obtain this opti-
- mization, or alternatively using ^ to indicate anchoring explicitly.
-
- However, there is one situation where the optimization cannot be used.
- When .* is inside capturing parentheses that are the subject of a
- backreference elsewhere in the pattern, a match at the start may fail
- where a later one succeeds. Consider, for example:
-
- (.*)abc\1
-
- If the subject is "xyz123abc123" the match point is the fourth charac-
- ter. For this reason, such a pattern is not implicitly anchored.
-
- When a capturing subpattern is repeated, the value captured is the sub-
- string that matched the final iteration. For example, after
-
- (tweedle[dume]{3}\s*)+
-
- has matched "tweedledum tweedledee" the value of the captured substring
- is "tweedledee". However, if there are nested capturing subpatterns,
- the corresponding captured values may have been set in previous itera-
- tions. For example, after
-
- /(a|(b))+/
-
- matches "aba" the value of the second captured substring is "b".
-
-
-ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
-
- With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
- repetition, failure of what follows normally causes the repeated item
- to be re-evaluated to see if a different number of repeats allows the
- rest of the pattern to match. Sometimes it is useful to prevent this,
- either to change the nature of the match, or to cause it fail earlier
- than it otherwise might, when the author of the pattern knows there is
- no point in carrying on.
-
- Consider, for example, the pattern \d+foo when applied to the subject
- line
-
- 123456bar
-
- After matching all 6 digits and then failing to match "foo", the normal
- action of the matcher is to try again with only 5 digits matching the
- \d+ item, and then with 4, and so on, before ultimately failing.
- "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides
- the means for specifying that once a subpattern has matched, it is not
- to be re-evaluated in this way.
-
- If we use atomic grouping for the previous example, the matcher gives
- up immediately on failing to match "foo" the first time. The notation
- is a kind of special parenthesis, starting with (?> as in this example:
-
- (?>\d+)foo
-
- This kind of parenthesis "locks up" the part of the pattern it con-
- tains once it has matched, and a failure further into the pattern is
- prevented from backtracking into it. Backtracking past it to previous
- items, however, works as normal.
-
- An alternative description is that a subpattern of this type matches
- the string of characters that an identical standalone pattern would
- match, if anchored at the current point in the subject string.
-
- Atomic grouping subpatterns are not capturing subpatterns. Simple cases
- such as the above example can be thought of as a maximizing repeat that
- must swallow everything it can. So, while both \d+ and \d+? are pre-
- pared to adjust the number of digits they match in order to make the
- rest of the pattern match, (?>\d+) can only match an entire sequence of
- digits.
-
- Atomic groups in general can of course contain arbitrarily complicated
- subpatterns, and can be nested. However, when the subpattern for an
- atomic group is just a single repeated item, as in the example above, a
- simpler notation, called a "possessive quantifier" can be used. This
- consists of an additional + character following a quantifier. Using
- this notation, the previous example can be rewritten as
-
- \d++foo
-
- Note that a possessive quantifier can be used with an entire group, for
- example:
-
- (abc|xyz){2,3}+
-
- Possessive quantifiers are always greedy; the setting of the
- PCRE_UNGREEDY option is ignored. They are a convenient notation for the
- simpler forms of atomic group. However, there is no difference in the
- meaning of a possessive quantifier and the equivalent atomic group,
- though there may be a performance difference; possessive quantifiers
- should be slightly faster.
-
- The possessive quantifier syntax is an extension to the Perl 5.8 syn-
- tax. Jeffrey Friedl originated the idea (and the name) in the first
- edition of his book. Mike McCloskey liked it, so implemented it when he
- built Sun's Java package, and PCRE copied it from there. It ultimately
- found its way into Perl at release 5.10.
-
- PCRE has an optimization that automatically "possessifies" certain sim-
- ple pattern constructs. For example, the sequence A+B is treated as
- A++B because there is no point in backtracking into a sequence of A's
- when B must follow.
-
- When a pattern contains an unlimited repeat inside a subpattern that
- can itself be repeated an unlimited number of times, the use of an
- atomic group is the only way to avoid some failing matches taking a
- very long time indeed. The pattern
-
- (\D+|<\d+>)*[!?]
-
- matches an unlimited number of substrings that either consist of non-
- digits, or digits enclosed in <>, followed by either ! or ?. When it
- matches, it runs quickly. However, if it is applied to
-
- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-
- it takes a long time before reporting failure. This is because the
- string can be divided between the internal \D+ repeat and the external
- * repeat in a large number of ways, and all have to be tried. (The
- example uses [!?] rather than a single character at the end, because
- both PCRE and Perl have an optimization that allows for fast failure
- when a single character is used. They remember the last single charac-
- ter that is required for a match, and fail early if it is not present
- in the string.) If the pattern is changed so that it uses an atomic
- group, like this:
-
- ((?>\D+)|<\d+>)*[!?]
-
- sequences of non-digits cannot be broken, and failure happens quickly.
-
-
-BACK REFERENCES
-
- Outside a character class, a backslash followed by a digit greater than
- 0 (and possibly further digits) is a back reference to a capturing sub-
- pattern earlier (that is, to its left) in the pattern, provided there
- have been that many previous capturing left parentheses.
-
- However, if the decimal number following the backslash is less than 10,
- it is always taken as a back reference, and causes an error only if
- there are not that many capturing left parentheses in the entire pat-
- tern. In other words, the parentheses that are referenced need not be
- to the left of the reference for numbers less than 10. A "forward back
- reference" of this type can make sense when a repetition is involved
- and the subpattern to the right has participated in an earlier itera-
- tion.
-
- It is not possible to have a numerical "forward back reference" to a
- subpattern whose number is 10 or more using this syntax because a
- sequence such as \50 is interpreted as a character defined in octal.
- See the subsection entitled "Non-printing characters" above for further
- details of the handling of digits following a backslash. There is no
- such problem when named parentheses are used. A back reference to any
- subpattern is possible using named parentheses (see below).
-
- Another way of avoiding the ambiguity inherent in the use of digits
- following a backslash is to use the \g escape sequence, which is a fea-
- ture introduced in Perl 5.10. This escape must be followed by an
- unsigned number or a negative number, optionally enclosed in braces.
- These examples are all identical:
-
- (ring), \1
- (ring), \g1
- (ring), \g{1}
-
- An unsigned number specifies an absolute reference without the ambigu-
- ity that is present in the older syntax. It is also useful when literal
- digits follow the reference. A negative number is a relative reference.
- Consider this example:
-
- (abc(def)ghi)\g{-1}
-
- The sequence \g{-1} is a reference to the most recently started captur-
- ing subpattern before \g, that is, is it equivalent to \2. Similarly,
- \g{-2} would be equivalent to \1. The use of relative references can be
- helpful in long patterns, and also in patterns that are created by
- joining together fragments that contain references within themselves.
-
- A back reference matches whatever actually matched the capturing sub-
- pattern in the current subject string, rather than anything matching
- the subpattern itself (see "Subpatterns as subroutines" below for a way
- of doing that). So the pattern
-
- (sens|respons)e and \1ibility
-
- matches "sense and sensibility" and "response and responsibility", but
- not "sense and responsibility". If caseful matching is in force at the
- time of the back reference, the case of letters is relevant. For exam-
- ple,
-
- ((?i)rah)\s+\1
-
- matches "rah rah" and "RAH RAH", but not "RAH rah", even though the
- original capturing subpattern is matched caselessly.
-
- There are several different ways of writing back references to named
- subpatterns. The .NET syntax \k{name} and the Perl syntax \k or
- \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's
- unified back reference syntax, in which \g can be used for both numeric
- and named references, is also supported. We could rewrite the above
- example in any of the following ways:
-
- (?(?i)rah)\s+\k
- (?'p1'(?i)rah)\s+\k{p1}
- (?P(?i)rah)\s+(?P=p1)
- (?(?i)rah)\s+\g{p1}
-
- A subpattern that is referenced by name may appear in the pattern
- before or after the reference.
-
- There may be more than one back reference to the same subpattern. If a
- subpattern has not actually been used in a particular match, any back
- references to it always fail. For example, the pattern
-
- (a|(bc))\2
-
- always fails if it starts to match "a" rather than "bc". Because there
- may be many capturing parentheses in a pattern, all digits following
- the backslash are taken as part of a potential back reference number.
- If the pattern continues with a digit character, some delimiter must be
- used to terminate the back reference. If the PCRE_EXTENDED option is
- set, this can be whitespace. Otherwise an empty comment (see "Com-
- ments" below) can be used.
-
- A back reference that occurs inside the parentheses to which it refers
- fails when the subpattern is first used, so, for example, (a\1) never
- matches. However, such references can be useful inside repeated sub-
- patterns. For example, the pattern
-
- (a|b\1)+
-
- matches any number of "a"s and also "aba", "ababbaa" etc. At each iter-
- ation of the subpattern, the back reference matches the character
- string corresponding to the previous iteration. In order for this to
- work, the pattern must be such that the first iteration does not need
- to match the back reference. This can be done using alternation, as in
- the example above, or by a quantifier with a minimum of zero.
-
-
-ASSERTIONS
-
- An assertion is a test on the characters following or preceding the
- current matching point that does not actually consume any characters.
- The simple assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are
- described above.
-
- More complicated assertions are coded as subpatterns. There are two
- kinds: those that look ahead of the current position in the subject
- string, and those that look behind it. An assertion subpattern is
- matched in the normal way, except that it does not cause the current
- matching position to be changed.
-
- Assertion subpatterns are not capturing subpatterns, and may not be
- repeated, because it makes no sense to assert the same thing several
- times. If any kind of assertion contains capturing subpatterns within
- it, these are counted for the purposes of numbering the capturing sub-
- patterns in the whole pattern. However, substring capturing is carried
- out only for positive assertions, because it does not make sense for
- negative assertions.
-
- Lookahead assertions
-
- Lookahead assertions start with (?= for positive assertions and (?! for
- negative assertions. For example,
-
- \w+(?=;)
-
- matches a word followed by a semicolon, but does not include the semi-
- colon in the match, and
-
- foo(?!bar)
-
- matches any occurrence of "foo" that is not followed by "bar". Note
- that the apparently similar pattern
-
- (?!foo)bar
-
- does not find an occurrence of "bar" that is preceded by something
- other than "foo"; it finds any occurrence of "bar" whatsoever, because
- the assertion (?!foo) is always true when the next three characters are
- "bar". A lookbehind assertion is needed to achieve the other effect.
-
- If you want to force a matching failure at some point in a pattern, the
- most convenient way to do it is with (?!) because an empty string
- always matches, so an assertion that requires there not to be an empty
- string must always fail.
-
- Lookbehind assertions
-
- Lookbehind assertions start with (?<= for positive assertions and (?)...) or (?('name')...) to test for a
- used subpattern by name. For compatibility with earlier versions of
- PCRE, which had this facility before Perl, the syntax (?(name)...) is
- also recognized. However, there is a possible ambiguity with this syn-
- tax, because subpattern names may consist entirely of digits. PCRE
- looks first for a named subpattern; if it cannot find one and the name
- consists entirely of digits, PCRE looks for a subpattern of that num-
- ber, which must be greater than zero. Using subpattern names that con-
- sist entirely of digits is not recommended.
-
- Rewriting the above example to use a named subpattern gives this:
-
- (? \( )? [^()]+ (?() \) )
-
-
- Checking for pattern recursion
-
- If the condition is the string (R), and there is no subpattern with the
- name R, the condition is true if a recursive call to the whole pattern
- or any subpattern has been made. If digits or a name preceded by amper-
- sand follow the letter R, for example:
-
- (?(R3)...) or (?(R&name)...)
-
- the condition is true if the most recent recursion is into the subpat-
- tern whose number or name is given. This condition does not check the
- entire recursion stack.
-
- At "top level", all these recursion test conditions are false. Recur-
- sive patterns are described below.
-
- Defining subpatterns for use by reference only
-
- If the condition is the string (DEFINE), and there is no subpattern
- with the name DEFINE, the condition is always false. In this case,
- there may be only one alternative in the subpattern. It is always
- skipped if control reaches this point in the pattern; the idea of
- DEFINE is that it can be used to define "subroutines" that can be ref-
- erenced from elsewhere. (The use of "subroutines" is described below.)
- For example, a pattern to match an IPv4 address could be written like
- this (ignore whitespace and line breaks):
-
- (?(DEFINE) (? 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
- \b (?&byte) (\.(?&byte)){3} \b
-
- The first part of the pattern is a DEFINE group inside which a another
- group named "byte" is defined. This matches an individual component of
- an IPv4 address (a number less than 256). When matching takes place,
- this part of the pattern is skipped because DEFINE acts like a false
- condition.
-
- The rest of the pattern uses references to the named group to match the
- four dot-separated components of an IPv4 address, insisting on a word
- boundary at each end.
-
- Assertion conditions
-
- If the condition is not in any of the above formats, it must be an
- assertion. This may be a positive or negative lookahead or lookbehind
- assertion. Consider this pattern, again containing non-significant
- white space, and with the two alternatives on the second line:
-
- (?(?=[^a-z]*[a-z])
- \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
-
- The condition is a positive lookahead assertion that matches an
- optional sequence of non-letters followed by a letter. In other words,
- it tests for the presence of at least one letter in the subject. If a
- letter is found, the subject is matched against the first alternative;
- otherwise it is matched against the second. This pattern matches
- strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
- letters and dd are digits.
-
-
-COMMENTS
-
- The sequence (?# marks the start of a comment that continues up to the
- next closing parenthesis. Nested parentheses are not permitted. The
- characters that make up a comment play no part in the pattern matching
- at all.
-
- If the PCRE_EXTENDED option is set, an unescaped # character outside a
- character class introduces a comment that continues to immediately
- after the next newline in the pattern.
-
-
-RECURSIVE PATTERNS
-
- Consider the problem of matching a string in parentheses, allowing for
- unlimited nested parentheses. Without the use of recursion, the best
- that can be done is to use a pattern that matches up to some fixed
- depth of nesting. It is not possible to handle an arbitrary nesting
- depth.
-
- For some time, Perl has provided a facility that allows regular expres-
- sions to recurse (amongst other things). It does this by interpolating
- Perl code in the expression at run time, and the code can refer to the
- expression itself. A Perl pattern using code interpolation to solve the
- parentheses problem can be created like this:
-
- $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
-
- The (?p{...}) item interpolates Perl code at run time, and in this case
- refers recursively to the pattern in which it appears.
-
- Obviously, PCRE cannot support the interpolation of Perl code. Instead,
- it supports special syntax for recursion of the entire pattern, and
- also for individual subpattern recursion. After its introduction in
- PCRE and Python, this kind of recursion was introduced into Perl at
- release 5.10.
-
- A special item that consists of (? followed by a number greater than
- zero and a closing parenthesis is a recursive call of the subpattern of
- the given number, provided that it occurs inside that subpattern. (If
- not, it is a "subroutine" call, which is described in the next sec-
- tion.) The special item (?R) or (?0) is a recursive call of the entire
- regular expression.
-
- In PCRE (like Python, but unlike Perl), a recursive subpattern call is
- always treated as an atomic group. That is, once it has matched some of
- the subject string, it is never re-entered, even if it contains untried
- alternatives and there is a subsequent matching failure.
-
- This PCRE pattern solves the nested parentheses problem (assume the
- PCRE_EXTENDED option is set so that white space is ignored):
-
- \( ( (?>[^()]+) | (?R) )* \)
-
- First it matches an opening parenthesis. Then it matches any number of
- substrings which can either be a sequence of non-parentheses, or a
- recursive match of the pattern itself (that is, a correctly parenthe-
- sized substring). Finally there is a closing parenthesis.
-
- If this were part of a larger pattern, you would not want to recurse
- the entire pattern, so instead you could use this:
-
- ( \( ( (?>[^()]+) | (?1) )* \) )
-
- We have put the pattern into parentheses, and caused the recursion to
- refer to them instead of the whole pattern.
-
- In a larger pattern, keeping track of parenthesis numbers can be
- tricky. This is made easier by the use of relative references. (A Perl
- 5.10 feature.) Instead of (?1) in the pattern above you can write
- (?-2) to refer to the second most recently opened parentheses preceding
- the recursion. In other words, a negative number counts capturing
- parentheses leftwards from the point at which it is encountered.
-
- It is also possible to refer to subsequently opened parentheses, by
- writing references such as (?+2). However, these cannot be recursive
- because the reference is not inside the parentheses that are refer-
- enced. They are always "subroutine" calls, as described in the next
- section.
-
- An alternative approach is to use named parentheses instead. The Perl
- syntax for this is (?&name); PCRE's earlier syntax (?P>name) is also
- supported. We could rewrite the above example as follows:
-
- (? \( ( (?>[^()]+) | (?&pn) )* \) )
-
- If there is more than one subpattern with the same name, the earliest
- one is used.
-
- This particular example pattern that we have been looking at contains
- nested unlimited repeats, and so the use of atomic grouping for match-
- ing strings of non-parentheses is important when applying the pattern
- to strings that do not match. For example, when this pattern is applied
- to
-
- (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
-
- it yields "no match" quickly. However, if atomic grouping is not used,
- the match runs for a very long time indeed because there are so many
- different ways the + and * repeats can carve up the subject, and all
- have to be tested before failure can be reported.
-
- At the end of a match, the values set for any capturing subpatterns are
- those from the outermost level of the recursion at which the subpattern
- value is set. If you want to obtain intermediate values, a callout
- function can be used (see below and the pcrecallout documentation). If
- the pattern above is matched against
-
- (ab(cd)ef)
-
- the value for the capturing parentheses is "ef", which is the last
- value taken on at the top level. If additional parentheses are added,
- giving
-
- \( ( ( (?>[^()]+) | (?R) )* ) \)
- ^ ^
- ^ ^
-
- the string they capture is "ab(cd)ef", the contents of the top level
- parentheses. If there are more than 15 capturing parentheses in a pat-
- tern, PCRE has to obtain extra memory to store data during a recursion,
- which it does by using pcre_malloc, freeing it via pcre_free after-
- wards. If no memory can be obtained, the match fails with the
- PCRE_ERROR_NOMEMORY error.
-
- Do not confuse the (?R) item with the condition (R), which tests for
- recursion. Consider this pattern, which matches text in angle brack-
- ets, allowing for arbitrary nesting. Only digits are allowed in nested
- brackets (that is, when recursing), whereas any characters are permit-
- ted at the outer level.
-
- < (?: (?(R) \d++ | [^<>]*+) | (?R)) * >
-
- In this pattern, (?(R) is the start of a conditional subpattern, with
- two different alternatives for the recursive and non-recursive cases.
- The (?R) item is the actual recursive call.
-
-
-SUBPATTERNS AS SUBROUTINES
-
- If the syntax for a recursive subpattern reference (either by number or
- by name) is used outside the parentheses to which it refers, it oper-
- ates like a subroutine in a programming language. The "called" subpat-
- tern may be defined before or after the reference. A numbered reference
- can be absolute or relative, as in these examples:
-
- (...(absolute)...)...(?2)...
- (...(relative)...)...(?-1)...
- (...(?+1)...(relative)...
-
- An earlier example pointed out that the pattern
-
- (sens|respons)e and \1ibility
-
- matches "sense and sensibility" and "response and responsibility", but
- not "sense and responsibility". If instead the pattern
-
- (sens|respons)e and (?1)ibility
-
- is used, it does match "sense and responsibility" as well as the other
- two strings. Another example is given in the discussion of DEFINE
- above.
-
- Like recursive subpatterns, a "subroutine" call is always treated as an
- atomic group. That is, once it has matched some of the subject string,
- it is never re-entered, even if it contains untried alternatives and
- there is a subsequent matching failure.
-
- When a subpattern is used as a subroutine, processing options such as
- case-independence are fixed when the subpattern is defined. They cannot
- be changed for different calls. For example, consider this pattern:
-
- (abc)(?i:(?-1))
-
- It matches "abcabc". It does not match "abcABC" because the change of
- processing option does not affect the called subpattern.
-
-
-ONIGURUMA SUBROUTINE SYNTAX
-
- For compatibility with Oniguruma, the non-Perl syntax \g followed by a
- name or a number enclosed either in angle brackets or single quotes, is
- an alternative syntax for referencing a subpattern as a subroutine,
- possibly recursively. Here are two of the examples used above, rewrit-
- ten using this syntax:
-
- (? \( ( (?>[^()]+) | \g )* \) )
- (sens|respons)e and \g'1'ibility
-
- PCRE supports an extension to Oniguruma: if a number is preceded by a
- plus or a minus sign it is taken as a relative reference. For example:
-
- (abc)(?i:\g<-1>)
-
- Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not
- synonymous. The former is a back reference; the latter is a subroutine
- call.
-
-
-CALLOUTS
-
- Perl has a feature whereby using the sequence (?{...}) causes arbitrary
- Perl code to be obeyed in the middle of matching a regular expression.
- This makes it possible, amongst other things, to extract different sub-
- strings that match the same pair of parentheses when there is a repeti-
- tion.
-
- PCRE provides a similar feature, but of course it cannot obey arbitrary
- Perl code. The feature is called "callout". The caller of PCRE provides
- an external function by putting its entry point in the global variable
- pcre_callout. By default, this variable contains NULL, which disables
- all calling out.
-
- Within a regular expression, (?C) indicates the points at which the
- external function is to be called. If you want to identify different
- callout points, you can put a number less than 256 after the letter C.
- The default value is zero. For example, this pattern has two callout
- points:
-
- (?C1)abc(?C2)def
-
- If the PCRE_AUTO_CALLOUT flag is passed to pcre_compile(), callouts are
- automatically installed before each item in the pattern. They are all
- numbered 255.
-
- During matching, when PCRE reaches a callout point (and pcre_callout is
- set), the external function is called. It is provided with the number
- of the callout, the position in the pattern, and, optionally, one item
- of data originally supplied by the caller of pcre_exec(). The callout
- function may cause matching to proceed, to backtrack, or to fail alto-
- gether. A complete description of the interface to the callout function
- is given in the pcrecallout documentation.
-
-
-BACKTRACKING CONTROL
-
- Perl 5.10 introduced a number of "Special Backtracking Control Verbs",
- which are described in the Perl documentation as "experimental and sub-
- ject to change or removal in a future version of Perl". It goes on to
- say: "Their usage in production code should be noted to avoid problems
- during upgrades." The same remarks apply to the PCRE features described
- in this section.
-
- Since these verbs are specifically related to backtracking, most of
- them can be used only when the pattern is to be matched using
- pcre_exec(), which uses a backtracking algorithm. With the exception of
- (*FAIL), which behaves like a failing negative assertion, they cause an
- error if encountered by pcre_dfa_exec().
-
- The new verbs make use of what was previously invalid syntax: an open-
- ing parenthesis followed by an asterisk. In Perl, they are generally of
- the form (*VERB:ARG) but PCRE does not support the use of arguments, so
- its general form is just (*VERB). Any number of these verbs may occur
- in a pattern. There are two kinds:
-
- Verbs that act immediately
-
- The following verbs act as soon as they are encountered:
-
- (*ACCEPT)
-
- This verb causes the match to end successfully, skipping the remainder
- of the pattern. When inside a recursion, only the innermost pattern is
- ended immediately. PCRE differs from Perl in what happens if the
- (*ACCEPT) is inside capturing parentheses. In Perl, the data so far is
- captured: in PCRE no data is captured. For example:
-
- A(A|B(*ACCEPT)|C)D
-
- This matches "AB", "AAD", or "ACD", but when it matches "AB", no data
- is captured.
-
- (*FAIL) or (*F)
-
- This verb causes the match to fail, forcing backtracking to occur. It
- is equivalent to (?!) but easier to read. The Perl documentation notes
- that it is probably useful only when combined with (?{}) or (??{}).
- Those are, of course, Perl features that are not present in PCRE. The
- nearest equivalent is the callout feature, as for example in this pat-
- tern:
-
- a+(?C)(*FAIL)
-
- A match with the string "aaaa" always fails, but the callout is taken
- before each backtrack happens (in this example, 10 times).
-
- Verbs that act after backtracking
-
- The following verbs do nothing when they are encountered. Matching con-
- tinues with what follows, but if there is no subsequent match, a fail-
- ure is forced. The verbs differ in exactly what kind of failure
- occurs.
-
- (*COMMIT)
-
- This verb causes the whole match to fail outright if the rest of the
- pattern does not match. Even if the pattern is unanchored, no further
- attempts to find a match by advancing the start point take place. Once
- (*COMMIT) has been passed, pcre_exec() is committed to finding a match
- at the current starting point, or not at all. For example:
-
- a+(*COMMIT)b
-
- This matches "xxaab" but not "aacaab". It can be thought of as a kind
- of dynamic anchor, or "I've started, so I must finish."
-
- (*PRUNE)
-
- This verb causes the match to fail at the current position if the rest
- of the pattern does not match. If the pattern is unanchored, the normal
- "bumpalong" advance to the next starting character then happens. Back-
- tracking can occur as usual to the left of (*PRUNE), or when matching
- to the right of (*PRUNE), but if there is no match to the right, back-
- tracking cannot cross (*PRUNE). In simple cases, the use of (*PRUNE)
- is just an alternative to an atomic group or possessive quantifier, but
- there are some uses of (*PRUNE) that cannot be expressed in any other
- way.
-
- (*SKIP)
-
- This verb is like (*PRUNE), except that if the pattern is unanchored,
- the "bumpalong" advance is not to the next character, but to the posi-
- tion in the subject where (*SKIP) was encountered. (*SKIP) signifies
- that whatever text was matched leading up to it cannot be part of a
- successful match. Consider:
-
- a+(*SKIP)b
-
- If the subject is "aaaac...", after the first match attempt fails
- (starting at the first character in the string), the starting point
- skips on to start the next attempt at "c". Note that a possessive quan-
- tifer does not have the same effect in this example; although it would
- suppress backtracking during the first match attempt, the second
- attempt would start at the second character instead of skipping on to
- "c".
-
- (*THEN)
-
- This verb causes a skip to the next alternation if the rest of the pat-
- tern does not match. That is, it cancels pending backtracking, but only
- within the current alternation. Its name comes from the observation
- that it can be used for a pattern-based if-then-else block:
-
- ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
-
- If the COND1 pattern matches, FOO is tried (and possibly further items
- after the end of the group if FOO succeeds); on failure the matcher
- skips to the second alternative and tries COND2, without backtracking
- into COND1. If (*THEN) is used outside of any alternation, it acts
- exactly like (*PRUNE).
-
-
-SEE ALSO
-
- pcreapi(3), pcrecallout(3), pcrematching(3), pcre(3).
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 11 April 2009
- Copyright (c) 1997-2009 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCRESYNTAX(3) PCRESYNTAX(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE REGULAR EXPRESSION SYNTAX SUMMARY
-
- The full syntax and semantics of the regular expressions that are sup-
- ported by PCRE are described in the pcrepattern documentation. This
- document contains just a quick-reference summary of the syntax.
-
-
-QUOTING
-
- \x where x is non-alphanumeric is a literal x
- \Q...\E treat enclosed characters as literal
-
-
-CHARACTERS
-
- \a alarm, that is, the BEL character (hex 07)
- \cx "control-x", where x is any character
- \e escape (hex 1B)
- \f formfeed (hex 0C)
- \n newline (hex 0A)
- \r carriage return (hex 0D)
- \t tab (hex 09)
- \ddd character with octal code ddd, or backreference
- \xhh character with hex code hh
- \x{hhh..} character with hex code hhh..
-
-
-CHARACTER TYPES
-
- . any character except newline;
- in dotall mode, any character whatsoever
- \C one byte, even in UTF-8 mode (best avoided)
- \d a decimal digit
- \D a character that is not a decimal digit
- \h a horizontal whitespace character
- \H a character that is not a horizontal whitespace character
- \p{xx} a character with the xx property
- \P{xx} a character without the xx property
- \R a newline sequence
- \s a whitespace character
- \S a character that is not a whitespace character
- \v a vertical whitespace character
- \V a character that is not a vertical whitespace character
- \w a "word" character
- \W a "non-word" character
- \X an extended Unicode sequence
-
- In PCRE, \d, \D, \s, \S, \w, and \W recognize only ASCII characters.
-
-
-GENERAL CATEGORY PROPERTY CODES FOR \p and \P
-
- C Other
- Cc Control
- Cf Format
- Cn Unassigned
- Co Private use
- Cs Surrogate
-
- L Letter
- Ll Lower case letter
- Lm Modifier letter
- Lo Other letter
- Lt Title case letter
- Lu Upper case letter
- L& Ll, Lu, or Lt
-
- M Mark
- Mc Spacing mark
- Me Enclosing mark
- Mn Non-spacing mark
-
- N Number
- Nd Decimal number
- Nl Letter number
- No Other number
-
- P Punctuation
- Pc Connector punctuation
- Pd Dash punctuation
- Pe Close punctuation
- Pf Final punctuation
- Pi Initial punctuation
- Po Other punctuation
- Ps Open punctuation
-
- S Symbol
- Sc Currency symbol
- Sk Modifier symbol
- Sm Mathematical symbol
- So Other symbol
-
- Z Separator
- Zl Line separator
- Zp Paragraph separator
- Zs Space separator
-
-
-SCRIPT NAMES FOR \p AND \P
-
- Arabic, Armenian, Balinese, Bengali, Bopomofo, Braille, Buginese,
- Buhid, Canadian_Aboriginal, Carian, Cham, Cherokee, Common, Coptic, Cu-
- neiform, Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian,
- Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo,
- Hebrew, Hiragana, Inherited, Kannada, Katakana, Kayah_Li, Kharoshthi,
- Khmer, Lao, Latin, Lepcha, Limbu, Linear_B, Lycian, Lydian, Malayalam,
- Mongolian, Myanmar, New_Tai_Lue, Nko, Ogham, Old_Italic, Old_Persian,
- Ol_Chiki, Oriya, Osmanya, Phags_Pa, Phoenician, Rejang, Runic, Saurash-
- tra, Shavian, Sinhala, Sudanese, Syloti_Nagri, Syriac, Tagalog, Tag-
- banwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan, Tifinagh,
- Ugaritic, Vai, Yi.
-
-
-CHARACTER CLASSES
-
- [...] positive character class
- [^...] negative character class
- [x-y] range (can be used for hex characters)
- [[:xxx:]] positive POSIX named set
- [[:^xxx:]] negative POSIX named set
-
- alnum alphanumeric
- alpha alphabetic
- ascii 0-127
- blank space or tab
- cntrl control character
- digit decimal digit
- graph printing, excluding space
- lower lower case letter
- print printing, including space
- punct printing, excluding alphanumeric
- space whitespace
- upper upper case letter
- word same as \w
- xdigit hexadecimal digit
-
- In PCRE, POSIX character set names recognize only ASCII characters. You
- can use \Q...\E inside a character class.
-
-
-QUANTIFIERS
-
- ? 0 or 1, greedy
- ?+ 0 or 1, possessive
- ?? 0 or 1, lazy
- * 0 or more, greedy
- *+ 0 or more, possessive
- *? 0 or more, lazy
- + 1 or more, greedy
- ++ 1 or more, possessive
- +? 1 or more, lazy
- {n} exactly n
- {n,m} at least n, no more than m, greedy
- {n,m}+ at least n, no more than m, possessive
- {n,m}? at least n, no more than m, lazy
- {n,} n or more, greedy
- {n,}+ n or more, possessive
- {n,}? n or more, lazy
-
-
-ANCHORS AND SIMPLE ASSERTIONS
-
- \b word boundary (only ASCII letters recognized)
- \B not a word boundary
- ^ start of subject
- also after internal newline in multiline mode
- \A start of subject
- $ end of subject
- also before newline at end of subject
- also before internal newline in multiline mode
- \Z end of subject
- also before newline at end of subject
- \z end of subject
- \G first matching position in subject
-
-
-MATCH POINT RESET
-
- \K reset start of match
-
-
-ALTERNATION
-
- expr|expr|expr...
-
-
-CAPTURING
-
- (...) capturing group
- (?...) named capturing group (Perl)
- (?'name'...) named capturing group (Perl)
- (?P...) named capturing group (Python)
- (?:...) non-capturing group
- (?|...) non-capturing group; reset group numbers for
- capturing groups in each alternative
-
-
-ATOMIC GROUPS
-
- (?>...) atomic, non-capturing group
-
-
-COMMENT
-
- (?#....) comment (not nestable)
-
-
-OPTION SETTING
-
- (?i) caseless
- (?J) allow duplicate names
- (?m) multiline
- (?s) single line (dotall)
- (?U) default ungreedy (lazy)
- (?x) extended (ignore white space)
- (?-...) unset option(s)
-
- The following is recognized only at the start of a pattern or after one
- of the newline-setting options with similar syntax:
-
- (*UTF8) set UTF-8 mode
-
-
-LOOKAHEAD AND LOOKBEHIND ASSERTIONS
-
- (?=...) positive look ahead
- (?!...) negative look ahead
- (?<=...) positive look behind
- (? reference by name (Perl)
- \k'name' reference by name (Perl)
- \g{name} reference by name (Perl)
- \k{name} reference by name (.NET)
- (?P=name) reference by name (Python)
-
-
-SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
-
- (?R) recurse whole pattern
- (?n) call subpattern by absolute number
- (?+n) call subpattern by relative number
- (?-n) call subpattern by relative number
- (?&name) call subpattern by name (Perl)
- (?P>name) call subpattern by name (Python)
- \g call subpattern by name (Oniguruma)
- \g'name' call subpattern by name (Oniguruma)
- \g call subpattern by absolute number (Oniguruma)
- \g'n' call subpattern by absolute number (Oniguruma)
- \g<+n> call subpattern by relative number (PCRE extension)
- \g'+n' call subpattern by relative number (PCRE extension)
- \g<-n> call subpattern by relative number (PCRE extension)
- \g'-n' call subpattern by relative number (PCRE extension)
-
-
-CONDITIONAL PATTERNS
-
- (?(condition)yes-pattern)
- (?(condition)yes-pattern|no-pattern)
-
- (?(n)... absolute reference condition
- (?(+n)... relative reference condition
- (?(-n)... relative reference condition
- (?()... named reference condition (Perl)
- (?('name')... named reference condition (Perl)
- (?(name)... named reference condition (PCRE)
- (?(R)... overall recursion condition
- (?(Rn)... specific group recursion condition
- (?(R&name)... specific recursion condition
- (?(DEFINE)... define subpattern for reference
- (?(assert)... assertion condition
-
-
-BACKTRACKING CONTROL
-
- The following act immediately they are reached:
-
- (*ACCEPT) force successful match
- (*FAIL) force backtrack; synonym (*F)
-
- The following act only when a subsequent match failure causes a back-
- track to reach them. They all force a match failure, but they differ in
- what happens afterwards. Those that advance the start-of-match point do
- so only if the pattern is not anchored.
-
- (*COMMIT) overall failure, no advance of starting point
- (*PRUNE) advance to next starting character
- (*SKIP) advance start to current matching position
- (*THEN) local failure, backtrack to next alternation
-
-
-NEWLINE CONVENTIONS
-
- These are recognized only at the very start of the pattern or after a
- (*BSR_...) or (*UTF8) option.
-
- (*CR) carriage return only
- (*LF) linefeed only
- (*CRLF) carriage return followed by linefeed
- (*ANYCRLF) all three of the above
- (*ANY) any Unicode newline sequence
-
-
-WHAT \R MATCHES
-
- These are recognized only at the very start of the pattern or after a
- (*...) option that sets the newline convention or UTF-8 mode.
-
- (*BSR_ANYCRLF) CR, LF, or CRLF
- (*BSR_UNICODE) any Unicode newline sequence
-
-
-CALLOUTS
-
- (?C) callout
- (?Cn) callout with data n
-
-
-SEE ALSO
-
- pcrepattern(3), pcreapi(3), pcrecallout(3), pcrematching(3), pcre(3).
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 11 April 2009
- Copyright (c) 1997-2009 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCREPARTIAL(3) PCREPARTIAL(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PARTIAL MATCHING IN PCRE
-
- In normal use of PCRE, if the subject string that is passed to
- pcre_exec() or pcre_dfa_exec() matches as far as it goes, but is too
- short to match the entire pattern, PCRE_ERROR_NOMATCH is returned.
- There are circumstances where it might be helpful to distinguish this
- case from other cases in which there is no match.
-
- Consider, for example, an application where a human is required to type
- in data for a field with specific formatting requirements. An example
- might be a date in the form ddmmmyy, defined by this pattern:
-
- ^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
-
- If the application sees the user's keystrokes one by one, and can check
- that what has been typed so far is potentially valid, it is able to
- raise an error as soon as a mistake is made, possibly beeping and not
- reflecting the character that has been typed. This immediate feedback
- is likely to be a better user interface than a check that is delayed
- until the entire string has been entered.
-
- PCRE supports the concept of partial matching by means of the PCRE_PAR-
- TIAL option, which can be set when calling pcre_exec() or
- pcre_dfa_exec(). When this flag is set for pcre_exec(), the return code
- PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if at any time
- during the matching process the last part of the subject string matched
- part of the pattern. Unfortunately, for non-anchored matching, it is
- not possible to obtain the position of the start of the partial match.
- No captured data is set when PCRE_ERROR_PARTIAL is returned.
-
- When PCRE_PARTIAL is set for pcre_dfa_exec(), the return code
- PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if the end of
- the subject is reached, there have been no complete matches, but there
- is still at least one matching possibility. The portion of the string
- that provided the partial match is set as the first matching string.
-
- Using PCRE_PARTIAL disables one of PCRE's optimizations. PCRE remembers
- the last literal byte in a pattern, and abandons matching immediately
- if such a byte is not present in the subject string. This optimization
- cannot be used for a subject string that might match only partially.
-
-
-RESTRICTED PATTERNS FOR PCRE_PARTIAL
-
- Because of the way certain internal optimizations are implemented in
- the pcre_exec() function, the PCRE_PARTIAL option cannot be used with
- all patterns. These restrictions do not apply when pcre_dfa_exec() is
- used. For pcre_exec(), repeated single characters such as
-
- a{2,4}
-
- and repeated single metasequences such as
-
- \d+
-
- are not permitted if the maximum number of occurrences is greater than
- one. Optional items such as \d? (where the maximum is one) are permit-
- ted. Quantifiers with any values are permitted after parentheses, so
- the invalid examples above can be coded thus:
-
- (a){2,4}
- (\d)+
-
- These constructions run more slowly, but for the kinds of application
- that are envisaged for this facility, this is not felt to be a major
- restriction.
-
- If PCRE_PARTIAL is set for a pattern that does not conform to the
- restrictions, pcre_exec() returns the error code PCRE_ERROR_BADPARTIAL
- (-13). You can use the PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to
- find out if a compiled pattern can be used for partial matching.
-
-
-EXAMPLE OF PARTIAL MATCHING USING PCRETEST
-
- If the escape sequence \P is present in a pcretest data line, the
- PCRE_PARTIAL flag is used for the match. Here is a run of pcretest that
- uses the date example quoted above:
-
- re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
- data> 25jun04\P
- 0: 25jun04
- 1: jun
- data> 25dec3\P
- Partial match
- data> 3ju\P
- Partial match
- data> 3juj\P
- No match
- data> j\P
- No match
-
- The first data string is matched completely, so pcretest shows the
- matched substrings. The remaining four strings do not match the com-
- plete pattern, but the first two are partial matches. The same test,
- using pcre_dfa_exec() matching (by means of the \D escape sequence),
- produces the following output:
-
- re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
- data> 25jun04\P\D
- 0: 25jun04
- data> 23dec3\P\D
- Partial match: 23dec3
- data> 3ju\P\D
- Partial match: 3ju
- data> 3juj\P\D
- No match
- data> j\P\D
- No match
-
- Notice that in this case the portion of the string that was matched is
- made available.
-
-
-MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()
-
- When a partial match has been found using pcre_dfa_exec(), it is possi-
- ble to continue the match by providing additional subject data and
- calling pcre_dfa_exec() again with the same compiled regular expres-
- sion, this time setting the PCRE_DFA_RESTART option. You must also pass
- the same working space as before, because this is where details of the
- previous partial match are stored. Here is an example using pcretest,
- using the \R escape sequence to set the PCRE_DFA_RESTART option (\P and
- \D are as above):
-
- re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
- data> 23ja\P\D
- Partial match: 23ja
- data> n05\R\D
- 0: n05
-
- The first call has "23ja" as the subject, and requests partial match-
- ing; the second call has "n05" as the subject for the continued
- (restarted) match. Notice that when the match is complete, only the
- last part is shown; PCRE does not retain the previously partially-
- matched string. It is up to the calling program to do that if it needs
- to.
-
- You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial
- matching over multiple segments. This facility can be used to pass very
- long subject strings to pcre_dfa_exec(). However, some care is needed
- for certain types of pattern.
-
- 1. If the pattern contains tests for the beginning or end of a line,
- you need to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropri-
- ate, when the subject string for any call does not contain the begin-
- ning or end of a line.
-
- 2. If the pattern contains backward assertions (including \b or \B),
- you need to arrange for some overlap in the subject strings to allow
- for this. For example, you could pass the subject in chunks that are
- 500 bytes long, but in a buffer of 700 bytes, with the starting offset
- set to 200 and the previous 200 bytes at the start of the buffer.
-
- 3. Matching a subject string that is split into multiple segments does
- not always produce exactly the same result as matching over one single
- long string. The difference arises when there are multiple matching
- possibilities, because a partial match result is given only when there
- are no completed matches in a call to pcre_dfa_exec(). This means that
- as soon as the shortest match has been found, continuation to a new
- subject segment is no longer possible. Consider this pcretest example:
-
- re> /dog(sbody)?/
- data> do\P\D
- Partial match: do
- data> gsb\R\P\D
- 0: g
- data> dogsbody\D
- 0: dogsbody
- 1: dog
-
- The pattern matches the words "dog" or "dogsbody". When the subject is
- presented in several parts ("do" and "gsb" being the first two) the
- match stops when "dog" has been found, and it is not possible to con-
- tinue. On the other hand, if "dogsbody" is presented as a single
- string, both matches are found.
-
- Because of this phenomenon, it does not usually make sense to end a
- pattern that is going to be matched in this way with a variable repeat.
-
- 4. Patterns that contain alternatives at the top level which do not all
- start with the same pattern item may not work as expected. For example,
- consider this pattern:
-
- 1234|3789
-
- If the first part of the subject is "ABC123", a partial match of the
- first alternative is found at offset 3. There is no partial match for
- the second alternative, because such a match does not start at the same
- point in the subject string. Attempting to continue with the string
- "789" does not yield a match because only those alternatives that match
- at one point in the subject are remembered. The problem arises because
- the start of the second alternative matches within the first alterna-
- tive. There is no problem with anchored patterns or patterns such as:
-
- 1234|ABCD
-
- where no string can be a partial match for both alternatives.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 04 June 2007
- Copyright (c) 1997-2007 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCREPRECOMPILE(3) PCREPRECOMPILE(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-SAVING AND RE-USING PRECOMPILED PCRE PATTERNS
-
- If you are running an application that uses a large number of regular
- expression patterns, it may be useful to store them in a precompiled
- form instead of having to compile them every time the application is
- run. If you are not using any private character tables (see the
- pcre_maketables() documentation), this is relatively straightforward.
- If you are using private tables, it is a little bit more complicated.
-
- If you save compiled patterns to a file, you can copy them to a differ-
- ent host and run them there. This works even if the new host has the
- opposite endianness to the one on which the patterns were compiled.
- There may be a small performance penalty, but it should be insignifi-
- cant. However, compiling regular expressions with one version of PCRE
- for use with a different version is not guaranteed to work and may
- cause crashes.
-
-
-SAVING A COMPILED PATTERN
- The value returned by pcre_compile() points to a single block of memory
- that holds the compiled pattern and associated data. You can find the
- length of this block in bytes by calling pcre_fullinfo() with an argu-
- ment of PCRE_INFO_SIZE. You can then save the data in any appropriate
- manner. Here is sample code that compiles a pattern and writes it to a
- file. It assumes that the variable fd refers to a file that is open for
- output:
-
- int erroroffset, rc, size;
- char *error;
- pcre *re;
-
- re = pcre_compile("my pattern", 0, &error, &erroroffset, NULL);
- if (re == NULL) { ... handle errors ... }
- rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size);
- if (rc < 0) { ... handle errors ... }
- rc = fwrite(re, 1, size, fd);
- if (rc != size) { ... handle errors ... }
-
- In this example, the bytes that comprise the compiled pattern are
- copied exactly. Note that this is binary data that may contain any of
- the 256 possible byte values. On systems that make a distinction
- between binary and non-binary data, be sure that the file is opened for
- binary output.
-
- If you want to write more than one pattern to a file, you will have to
- devise a way of separating them. For binary data, preceding each pat-
- tern with its length is probably the most straightforward approach.
- Another possibility is to write out the data in hexadecimal instead of
- binary, one pattern to a line.
-
- Saving compiled patterns in a file is only one possible way of storing
- them for later use. They could equally well be saved in a database, or
- in the memory of some daemon process that passes them via sockets to
- the processes that want them.
-
- If the pattern has been studied, it is also possible to save the study
- data in a similar way to the compiled pattern itself. When studying
- generates additional information, pcre_study() returns a pointer to a
- pcre_extra data block. Its format is defined in the section on matching
- a pattern in the pcreapi documentation. The study_data field points to
- the binary study data, and this is what you must save (not the
- pcre_extra block itself). The length of the study data can be obtained
- by calling pcre_fullinfo() with an argument of PCRE_INFO_STUDYSIZE.
- Remember to check that pcre_study() did return a non-NULL value before
- trying to save the study data.
-
-
-RE-USING A PRECOMPILED PATTERN
-
- Re-using a precompiled pattern is straightforward. Having reloaded it
- into main memory, you pass its pointer to pcre_exec() or
- pcre_dfa_exec() in the usual way. This should work even on another
- host, and even if that host has the opposite endianness to the one
- where the pattern was compiled.
-
- However, if you passed a pointer to custom character tables when the
- pattern was compiled (the tableptr argument of pcre_compile()), you
- must now pass a similar pointer to pcre_exec() or pcre_dfa_exec(),
- because the value saved with the compiled pattern will obviously be
- nonsense. A field in a pcre_extra() block is used to pass this data, as
- described in the section on matching a pattern in the pcreapi documen-
- tation.
-
- If you did not provide custom character tables when the pattern was
- compiled, the pointer in the compiled pattern is NULL, which causes
- pcre_exec() to use PCRE's internal tables. Thus, you do not need to
- take any special action at run time in this case.
-
- If you saved study data with the compiled pattern, you need to create
- your own pcre_extra data block and set the study_data field to point to
- the reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA
- bit in the flags field to indicate that study data is present. Then
- pass the pcre_extra block to pcre_exec() or pcre_dfa_exec() in the
- usual way.
-
-
-COMPATIBILITY WITH DIFFERENT PCRE RELEASES
-
- In general, it is safest to recompile all saved patterns when you
- update to a new PCRE release, though not all updates actually require
- this. Recompiling is definitely needed for release 7.2.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 13 June 2007
- Copyright (c) 1997-2007 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCREPERFORM(3) PCREPERFORM(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE PERFORMANCE
-
- Two aspects of performance are discussed below: memory usage and pro-
- cessing time. The way you express your pattern as a regular expression
- can affect both of them.
-
-
-MEMORY USAGE
-
- Patterns are compiled by PCRE into a reasonably efficient byte code, so
- that most simple patterns do not use much memory. However, there is one
- case where memory usage can be unexpectedly large. When a parenthesized
- subpattern has a quantifier with a minimum greater than 1 and/or a lim-
- ited maximum, the whole subpattern is repeated in the compiled code.
- For example, the pattern
-
- (abc|def){2,4}
-
- is compiled as if it were
-
- (abc|def)(abc|def)((abc|def)(abc|def)?)?
-
- (Technical aside: It is done this way so that backtrack points within
- each of the repetitions can be independently maintained.)
-
- For regular expressions whose quantifiers use only small numbers, this
- is not usually a problem. However, if the numbers are large, and par-
- ticularly if such repetitions are nested, the memory usage can become
- an embarrassment. For example, the very simple pattern
-
- ((ab){1,1000}c){1,3}
-
- uses 51K bytes when compiled. When PCRE is compiled with its default
- internal pointer size of two bytes, the size limit on a compiled pat-
- tern is 64K, and this is reached with the above pattern if the outer
- repetition is increased from 3 to 4. PCRE can be compiled to use larger
- internal pointers and thus handle larger compiled patterns, but it is
- better to try to rewrite your pattern to use less memory if you can.
-
- One way of reducing the memory usage for such patterns is to make use
- of PCRE's "subroutine" facility. Re-writing the above pattern as
-
- ((ab)(?2){0,999}c)(?1){0,2}
-
- reduces the memory requirements to 18K, and indeed it remains under 20K
- even with the outer repetition increased to 100. However, this pattern
- is not exactly equivalent, because the "subroutine" calls are treated
- as atomic groups into which there can be no backtracking if there is a
- subsequent matching failure. Therefore, PCRE cannot do this kind of
- rewriting automatically. Furthermore, there is a noticeable loss of
- speed when executing the modified pattern. Nevertheless, if the atomic
- grouping is not a problem and the loss of speed is acceptable, this
- kind of rewriting will allow you to process patterns that PCRE cannot
- otherwise handle.
-
-
-PROCESSING TIME
-
- Certain items in regular expression patterns are processed more effi-
- ciently than others. It is more efficient to use a character class like
- [aeiou] than a set of single-character alternatives such as
- (a|e|i|o|u). In general, the simplest construction that provides the
- required behaviour is usually the most efficient. Jeffrey Friedl's book
- contains a lot of useful general discussion about optimizing regular
- expressions for efficient performance. This document contains a few
- observations about PCRE.
-
- Using Unicode character properties (the \p, \P, and \X escapes) is
- slow, because PCRE has to scan a structure that contains data for over
- fifteen thousand characters whenever it needs a character's property.
- If you can find an alternative pattern that does not use character
- properties, it will probably be faster.
-
- When a pattern begins with .* not in parentheses, or in parentheses
- that are not the subject of a backreference, and the PCRE_DOTALL option
- is set, the pattern is implicitly anchored by PCRE, since it can match
- only at the start of a subject string. However, if PCRE_DOTALL is not
- set, PCRE cannot make this optimization, because the . metacharacter
- does not then match a newline, and if the subject string contains new-
- lines, the pattern may match from the character immediately following
- one of them instead of from the very start. For example, the pattern
-
- .*second
-
- matches the subject "first\nand second" (where \n stands for a newline
- character), with the match starting at the seventh character. In order
- to do this, PCRE has to retry the match starting after every newline in
- the subject.
-
- If you are using such a pattern with subject strings that do not con-
- tain newlines, the best performance is obtained by setting PCRE_DOTALL,
- or starting the pattern with ^.* or ^.*? to indicate explicit anchor-
- ing. That saves PCRE from having to scan along the subject looking for
- a newline to restart at.
-
- Beware of patterns that contain nested indefinite repeats. These can
- take a long time to run when applied to a string that does not match.
- Consider the pattern fragment
-
- ^(a+)*
-
- This can match "aaaa" in 16 different ways, and this number increases
- very rapidly as the string gets longer. (The * repeat can match 0, 1,
- 2, 3, or 4 times, and for each of those cases other than 0 or 4, the +
- repeats can match different numbers of times.) When the remainder of
- the pattern is such that the entire match is going to fail, PCRE has in
- principle to try every possible variation, and this can take an
- extremely long time, even for relatively short strings.
-
- An optimization catches some of the more simple cases such as
-
- (a+)*b
-
- where a literal character follows. Before embarking on the standard
- matching procedure, PCRE checks that there is a "b" later in the sub-
- ject string, and if there is not, it fails the match immediately. How-
- ever, when there is no following literal this optimization cannot be
- used. You can see the difference by comparing the behaviour of
-
- (a+)*\d
-
- with the pattern above. The former gives a failure almost instantly
- when applied to a whole line of "a" characters, whereas the latter
- takes an appreciable time with strings longer than about 20 characters.
-
- In many cases, the solution to this kind of performance issue is to use
- an atomic group or a possessive quantifier.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 06 March 2007
- Copyright (c) 1997-2007 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCREPOSIX(3) PCREPOSIX(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions.
-
-
-SYNOPSIS OF POSIX API
-
- #include
-
- int regcomp(regex_t *preg, const char *pattern,
- int cflags);
-
- int regexec(regex_t *preg, const char *string,
- size_t nmatch, regmatch_t pmatch[], int eflags);
-
- size_t regerror(int errcode, const regex_t *preg,
- char *errbuf, size_t errbuf_size);
-
- void regfree(regex_t *preg);
-
-
-DESCRIPTION
-
- This set of functions provides a POSIX-style API to the PCRE regular
- expression package. See the pcreapi documentation for a description of
- PCRE's native API, which contains much additional functionality.
-
- The functions described here are just wrapper functions that ultimately
- call the PCRE native API. Their prototypes are defined in the
- pcreposix.h header file, and on Unix systems the library itself is
- called pcreposix.a, so can be accessed by adding -lpcreposix to the
- command for linking an application that uses them. Because the POSIX
- functions call the native ones, it is also necessary to add -lpcre.
-
- I have implemented only those POSIX option bits that can be reasonably
- mapped to PCRE native options. In addition, the option REG_EXTENDED is
- defined with the value zero. This has no effect, but since programs
- that are written to the POSIX interface often use it, this makes it
- easier to slot in PCRE as a replacement library. Other POSIX options
- are not even defined.
-
- When PCRE is called via these functions, it is only the API that is
- POSIX-like in style. The syntax and semantics of the regular expres-
- sions themselves are still those of Perl, subject to the setting of
- various PCRE options, as described below. "POSIX-like in style" means
- that the API approximates to the POSIX definition; it is not fully
- POSIX-compatible, and in multi-byte encoding domains it is probably
- even less compatible.
-
- The header for these functions is supplied as pcreposix.h to avoid any
- potential clash with other POSIX libraries. It can, of course, be
- renamed or aliased as regex.h, which is the "correct" name. It provides
- two structure types, regex_t for compiled internal forms, and reg-
- match_t for returning captured substrings. It also defines some con-
- stants whose names start with "REG_"; these are used for setting
- options and identifying error codes.
-
-
-COMPILING A PATTERN
-
- The function regcomp() is called to compile a pattern into an internal
- form. The pattern is a C string terminated by a binary zero, and is
- passed in the argument pattern. The preg argument is a pointer to a
- regex_t structure that is used as a base for storing information about
- the compiled regular expression.
-
- The argument cflags is either zero, or contains one or more of the bits
- defined by the following macros:
-
- REG_DOTALL
-
- The PCRE_DOTALL option is set when the regular expression is passed for
- compilation to the native function. Note that REG_DOTALL is not part of
- the POSIX standard.
-
- REG_ICASE
-
- The PCRE_CASELESS option is set when the regular expression is passed
- for compilation to the native function.
-
- REG_NEWLINE
-
- The PCRE_MULTILINE option is set when the regular expression is passed
- for compilation to the native function. Note that this does not mimic
- the defined POSIX behaviour for REG_NEWLINE (see the following sec-
- tion).
-
- REG_NOSUB
-
- The PCRE_NO_AUTO_CAPTURE option is set when the regular expression is
- passed for compilation to the native function. In addition, when a pat-
- tern that is compiled with this flag is passed to regexec() for match-
- ing, the nmatch and pmatch arguments are ignored, and no captured
- strings are returned.
-
- REG_UTF8
-
- The PCRE_UTF8 option is set when the regular expression is passed for
- compilation to the native function. This causes the pattern itself and
- all data strings used for matching it to be treated as UTF-8 strings.
- Note that REG_UTF8 is not part of the POSIX standard.
-
- In the absence of these flags, no options are passed to the native
- function. This means the the regex is compiled with PCRE default
- semantics. In particular, the way it handles newline characters in the
- subject string is the Perl way, not the POSIX way. Note that setting
- PCRE_MULTILINE has only some of the effects specified for REG_NEWLINE.
- It does not affect the way newlines are matched by . (they aren't) or
- by a negative class such as [^a] (they are).
-
- The yield of regcomp() is zero on success, and non-zero otherwise. The
- preg structure is filled in on success, and one member of the structure
- is public: re_nsub contains the number of capturing subpatterns in the
- regular expression. Various error codes are defined in the header file.
-
-
-MATCHING NEWLINE CHARACTERS
-
- This area is not simple, because POSIX and Perl take different views of
- things. It is not possible to get PCRE to obey POSIX semantics, but
- then PCRE was never intended to be a POSIX engine. The following table
- lists the different possibilities for matching newline characters in
- PCRE:
-
- Default Change with
-
- . matches newline no PCRE_DOTALL
- newline matches [^a] yes not changeable
- $ matches \n at end yes PCRE_DOLLARENDONLY
- $ matches \n in middle no PCRE_MULTILINE
- ^ matches \n in middle no PCRE_MULTILINE
-
- This is the equivalent table for POSIX:
-
- Default Change with
-
- . matches newline yes REG_NEWLINE
- newline matches [^a] yes REG_NEWLINE
- $ matches \n at end no REG_NEWLINE
- $ matches \n in middle no REG_NEWLINE
- ^ matches \n in middle no REG_NEWLINE
-
- PCRE's behaviour is the same as Perl's, except that there is no equiva-
- lent for PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is
- no way to stop newline from matching [^a].
-
- The default POSIX newline handling can be obtained by setting
- PCRE_DOTALL and PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE
- behave exactly as for the REG_NEWLINE action.
-
-
-MATCHING A PATTERN
-
- The function regexec() is called to match a compiled pattern preg
- against a given string, which is by default terminated by a zero byte
- (but see REG_STARTEND below), subject to the options in eflags. These
- can be:
-
- REG_NOTBOL
-
- The PCRE_NOTBOL option is set when calling the underlying PCRE matching
- function.
-
- REG_NOTEMPTY
-
- The PCRE_NOTEMPTY option is set when calling the underlying PCRE match-
- ing function. Note that REG_NOTEMPTY is not part of the POSIX standard.
- However, setting this option can give more POSIX-like behaviour in some
- situations.
-
- REG_NOTEOL
-
- The PCRE_NOTEOL option is set when calling the underlying PCRE matching
- function.
-
- REG_STARTEND
-
- The string is considered to start at string + pmatch[0].rm_so and to
- have a terminating NUL located at string + pmatch[0].rm_eo (there need
- not actually be a NUL at that location), regardless of the value of
- nmatch. This is a BSD extension, compatible with but not specified by
- IEEE Standard 1003.2 (POSIX.2), and should be used with caution in
- software intended to be portable to other systems. Note that a non-zero
- rm_so does not imply REG_NOTBOL; REG_STARTEND affects only the location
- of the string, not how it is matched.
-
- If the pattern was compiled with the REG_NOSUB flag, no data about any
- matched strings is returned. The nmatch and pmatch arguments of
- regexec() are ignored.
-
- Otherwise,the portion of the string that was matched, and also any cap-
- tured substrings, are returned via the pmatch argument, which points to
- an array of nmatch structures of type regmatch_t, containing the mem-
- bers rm_so and rm_eo. These contain the offset to the first character
- of each substring and the offset to the first character after the end
- of each substring, respectively. The 0th element of the vector relates
- to the entire portion of string that was matched; subsequent elements
- relate to the capturing subpatterns of the regular expression. Unused
- entries in the array have both structure members set to -1.
-
- A successful match yields a zero return; various error codes are
- defined in the header file, of which REG_NOMATCH is the "expected"
- failure code.
-
-
-ERROR MESSAGES
-
- The regerror() function maps a non-zero errorcode from either regcomp()
- or regexec() to a printable message. If preg is not NULL, the error
- should have arisen from the use of that structure. A message terminated
- by a binary zero is placed in errbuf. The length of the message,
- including the zero, is limited to errbuf_size. The yield of the func-
- tion is the size of buffer needed to hold the whole message.
-
-
-MEMORY USAGE
-
- Compiling a regular expression causes memory to be allocated and asso-
- ciated with the preg structure. The function regfree() frees all such
- memory, after which preg may no longer be used as a compiled expres-
- sion.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 11 March 2009
- Copyright (c) 1997-2009 University of Cambridge.
-------------------------------------------------------------------------------
-
-
-PCRECPP(3) PCRECPP(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions.
-
-
-SYNOPSIS OF C++ WRAPPER
-
- #include
-
-
-DESCRIPTION
-
- The C++ wrapper for PCRE was provided by Google Inc. Some additional
- functionality was added by Giuseppe Maxia. This brief man page was con-
- structed from the notes in the pcrecpp.h file, which should be con-
- sulted for further details.
-
-
-MATCHING INTERFACE
-
- The "FullMatch" operation checks that supplied text matches a supplied
- pattern exactly. If pointer arguments are supplied, it copies matched
- sub-strings that match sub-patterns into them.
-
- Example: successful match
- pcrecpp::RE re("h.*o");
- re.FullMatch("hello");
-
- Example: unsuccessful match (requires full match):
- pcrecpp::RE re("e");
- !re.FullMatch("hello");
-
- Example: creating a temporary RE object:
- pcrecpp::RE("h.*o").FullMatch("hello");
-
- You can pass in a "const char*" or a "string" for "text". The examples
- below tend to use a const char*. You can, as in the different examples
- above, store the RE object explicitly in a variable or use a temporary
- RE object. The examples below use one mode or the other arbitrarily.
- Either could correctly be used for any of these examples.
-
- You must supply extra pointer arguments to extract matched subpieces.
-
- Example: extracts "ruby" into "s" and 1234 into "i"
- int i;
- string s;
- pcrecpp::RE re("(\\w+):(\\d+)");
- re.FullMatch("ruby:1234", &s, &i);
-
- Example: does not try to extract any extra sub-patterns
- re.FullMatch("ruby:1234", &s);
-
- Example: does not try to extract into NULL
- re.FullMatch("ruby:1234", NULL, &i);
-
- Example: integer overflow causes failure
- !re.FullMatch("ruby:1234567891234", NULL, &i);
-
- Example: fails because there aren't enough sub-patterns:
- !pcrecpp::RE("\\w+:\\d+").FullMatch("ruby:1234", &s);
-
- Example: fails because string cannot be stored in integer
- !pcrecpp::RE("(.*)").FullMatch("ruby", &i);
-
- The provided pointer arguments can be pointers to any scalar numeric
- type, or one of:
-
- string (matched piece is copied to string)
- StringPiece (StringPiece is mutated to point to matched piece)
- T (where "bool T::ParseFrom(const char*, int)" exists)
- NULL (the corresponding matched sub-pattern is not copied)
-
- The function returns true iff all of the following conditions are sat-
- isfied:
-
- a. "text" matches "pattern" exactly;
-
- b. The number of matched sub-patterns is >= number of supplied
- pointers;
-
- c. The "i"th argument has a suitable type for holding the
- string captured as the "i"th sub-pattern. If you pass in
- void * NULL for the "i"th argument, or a non-void * NULL
- of the correct type, or pass fewer arguments than the
- number of sub-patterns, "i"th captured sub-pattern is
- ignored.
-
- CAVEAT: An optional sub-pattern that does not exist in the matched
- string is assigned the empty string. Therefore, the following will
- return false (because the empty string is not a valid number):
-
- int number;
- pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
-
- The matching interface supports at most 16 arguments per call. If you
- need more, consider using the more general interface
- pcrecpp::RE::DoMatch. See pcrecpp.h for the signature for DoMatch.
-
- NOTE: Do not use no_arg, which is used internally to mark the end of a
- list of optional arguments, as a placeholder for missing arguments, as
- this can lead to segfaults.
-
-
-QUOTING METACHARACTERS
-
- You can use the "QuoteMeta" operation to insert backslashes before all
- potentially meaningful characters in a string. The returned string,
- used as a regular expression, will exactly match the original string.
-
- Example:
- string quoted = RE::QuoteMeta(unquoted);
-
- Note that it's legal to escape a character even if it has no special
- meaning in a regular expression -- so this function does that. (This
- also makes it identical to the perl function of the same name; see
- "perldoc -f quotemeta".) For example, "1.5-2.0?" becomes
- "1\.5\-2\.0\?".
-
-
-PARTIAL MATCHES
-
- You can use the "PartialMatch" operation when you want the pattern to
- match any substring of the text.
-
- Example: simple search for a string:
- pcrecpp::RE("ell").PartialMatch("hello");
-
- Example: find first number in a string:
- int number;
- pcrecpp::RE re("(\\d+)");
- re.PartialMatch("x*100 + 20", &number);
- assert(number == 100);
-
-
-UTF-8 AND THE MATCHING INTERFACE
-
- By default, pattern and text are plain text, one byte per character.
- The UTF8 flag, passed to the constructor, causes both pattern and
- string to be treated as UTF-8 text, still a byte stream but potentially
- multiple bytes per character. In practice, the text is likelier to be
- UTF-8 than the pattern, but the match returned may depend on the UTF8
- flag, so always use it when matching UTF8 text. For example, "." will
- match one byte normally but with UTF8 set may match up to three bytes
- of a multi-byte character.
-
- Example:
- pcrecpp::RE_Options options;
- options.set_utf8();
- pcrecpp::RE re(utf8_pattern, options);
- re.FullMatch(utf8_string);
-
- Example: using the convenience function UTF8():
- pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8());
- re.FullMatch(utf8_string);
-
- NOTE: The UTF8 flag is ignored if pcre was not configured with the
- --enable-utf8 flag.
-
-
-PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
-
- PCRE defines some modifiers to change the behavior of the regular
- expression engine. The C++ wrapper defines an auxiliary class,
- RE_Options, as a vehicle to pass such modifiers to a RE class. Cur-
- rently, the following modifiers are supported:
-
- modifier description Perl corresponding
-
- PCRE_CASELESS case insensitive match /i
- PCRE_MULTILINE multiple lines match /m
- PCRE_DOTALL dot matches newlines /s
- PCRE_DOLLAR_ENDONLY $ matches only at end N/A
- PCRE_EXTRA strict escape parsing N/A
- PCRE_EXTENDED ignore whitespaces /x
- PCRE_UTF8 handles UTF8 chars built-in
- PCRE_UNGREEDY reverses * and *? N/A
- PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*)
-
- (*) Both Perl and PCRE allow non capturing parentheses by means of the
- "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not cap-
- ture, while (ab|cd) does.
-
- For a full account on how each modifier works, please check the PCRE
- API reference page.
-
- For each modifier, there are two member functions whose name is made
- out of the modifier in lowercase, without the "PCRE_" prefix. For
- instance, PCRE_CASELESS is handled by
-
- bool caseless()
-
- which returns true if the modifier is set, and
-
- RE_Options & set_caseless(bool)
-
- which sets or unsets the modifier. Moreover, PCRE_EXTRA_MATCH_LIMIT can
- be accessed through the set_match_limit() and match_limit() member
- functions. Setting match_limit to a non-zero value will limit the exe-
- cution of pcre to keep it from doing bad things like blowing the stack
- or taking an eternity to return a result. A value of 5000 is good
- enough to stop stack blowup in a 2MB thread stack. Setting match_limit
- to zero disables match limiting. Alternatively, you can call
- match_limit_recursion() which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to
- limit how much PCRE recurses. match_limit() limits the number of
- matches PCRE does; match_limit_recursion() limits the depth of internal
- recursion, and therefore the amount of stack that is used.
-
- Normally, to pass one or more modifiers to a RE class, you declare a
- RE_Options object, set the appropriate options, and pass this object to
- a RE constructor. Example:
-
- RE_options opt;
- opt.set_caseless(true);
- if (RE("HELLO", opt).PartialMatch("hello world")) ...
-
- RE_options has two constructors. The default constructor takes no argu-
- ments and creates a set of flags that are off by default. The optional
- parameter option_flags is to facilitate transfer of legacy code from C
- programs. This lets you do
-
- RE(pattern,
- RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
-
- However, new code is better off doing
-
- RE(pattern,
- RE_Options().set_caseless(true).set_multiline(true))
- .PartialMatch(str);
-
- If you are going to pass one of the most used modifiers, there are some
- convenience functions that return a RE_Options class with the appropri-
- ate modifier already set: CASELESS(), UTF8(), MULTILINE(), DOTALL(),
- and EXTENDED().
-
- If you need to set several options at once, and you don't want to go
- through the pains of declaring a RE_Options object and setting several
- options, there is a parallel method that give you such ability on the
- fly. You can concatenate several set_xxxxx() member functions, since
- each of them returns a reference to its class object. For example, to
- pass PCRE_CASELESS, PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one
- statement, you may write:
-
- RE(" ^ xyz \\s+ .* blah$",
- RE_Options()
- .set_caseless(true)
- .set_extended(true)
- .set_multiline(true)).PartialMatch(sometext);
-
-
-SCANNING TEXT INCREMENTALLY
-
- The "Consume" operation may be useful if you want to repeatedly match
- regular expressions at the front of a string and skip over them as they
- match. This requires use of the "StringPiece" type, which represents a
- sub-range of a real string. Like RE, StringPiece is defined in the
- pcrecpp namespace.
-
- Example: read lines of the form "var = value" from a string.
- string contents = ...; // Fill string somehow
- pcrecpp::StringPiece input(contents); // Wrap in a StringPiece
-
- string var;
- int value;
- pcrecpp::RE re("(\\w+) = (\\d+)\n");
- while (re.Consume(&input, &var, &value)) {
- ...;
- }
-
- Each successful call to "Consume" will set "var/value", and also
- advance "input" so it points past the matched text.
-
- The "FindAndConsume" operation is similar to "Consume" but does not
- anchor your match at the beginning of the string. For example, you
- could extract all words from a string by repeatedly calling
-
- pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word)
-
-
-PARSING HEX/OCTAL/C-RADIX NUMBERS
-
- By default, if you pass a pointer to a numeric value, the corresponding
- text is interpreted as a base-10 number. You can instead wrap the
- pointer with a call to one of the operators Hex(), Octal(), or CRadix()
- to interpret the text in another base. The CRadix operator interprets
- C-style "0" (base-8) and "0x" (base-16) prefixes, but defaults to
- base-10.
-
- Example:
- int a, b, c, d;
- pcrecpp::RE re("(.*) (.*) (.*) (.*)");
- re.FullMatch("100 40 0100 0x40",
- pcrecpp::Octal(&a), pcrecpp::Hex(&b),
- pcrecpp::CRadix(&c), pcrecpp::CRadix(&d));
-
- will leave 64 in a, b, c, and d.
-
-
-REPLACING PARTS OF STRINGS
-
- You can replace the first match of "pattern" in "str" with "rewrite".
- Within "rewrite", backslash-escaped digits (\1 to \9) can be used to
- insert text matching corresponding parenthesized group from the pat-
- tern. \0 in "rewrite" refers to the entire matching text. For example:
-
- string s = "yabba dabba doo";
- pcrecpp::RE("b+").Replace("d", &s);
-
- will leave "s" containing "yada dabba doo". The result is true if the
- pattern matches and a replacement occurs, false otherwise.
-
- GlobalReplace is like Replace except that it replaces all occurrences
- of the pattern in the string with the rewrite. Replacements are not
- subject to re-matching. For example:
-
- string s = "yabba dabba doo";
- pcrecpp::RE("b+").GlobalReplace("d", &s);
-
- will leave "s" containing "yada dada doo". It returns the number of
- replacements made.
-
- Extract is like Replace, except that if the pattern matches, "rewrite"
- is copied into "out" (an additional argument) with substitutions. The
- non-matching portions of "text" are ignored. Returns true iff a match
- occurred and the extraction happened successfully; if no match occurs,
- the string is left unaffected.
-
-
-AUTHOR
-
- The C++ wrapper was contributed by Google Inc.
- Copyright (c) 2007 Google Inc.
-
-
-REVISION
-
- Last updated: 17 March 2009
-------------------------------------------------------------------------------
-
-
-PCRESAMPLE(3) PCRESAMPLE(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE SAMPLE PROGRAM
-
- A simple, complete demonstration program, to get you started with using
- PCRE, is supplied in the file pcredemo.c in the PCRE distribution.
-
- The program compiles the regular expression that is its first argument,
- and matches it against the subject string in its second argument. No
- PCRE options are set, and default character tables are used. If match-
- ing succeeds, the program outputs the portion of the subject that
- matched, together with the contents of any captured substrings.
-
- If the -g option is given on the command line, the program then goes on
- to check for further matches of the same regular expression in the same
- subject string. The logic is a little bit tricky because of the possi-
- bility of matching an empty string. Comments in the code explain what
- is going on.
-
- If PCRE is installed in the standard include and library directories
- for your system, you should be able to compile the demonstration pro-
- gram using this command:
-
- gcc -o pcredemo pcredemo.c -lpcre
-
- If PCRE is installed elsewhere, you may need to add additional options
- to the command line. For example, on a Unix-like system that has PCRE
- installed in /usr/local, you can compile the demonstration program
- using a command like this:
-
- gcc -o pcredemo -I/usr/local/include pcredemo.c \
- -L/usr/local/lib -lpcre
-
- Once you have compiled the demonstration program, you can run simple
- tests like this:
-
- ./pcredemo 'cat|dog' 'the cat sat on the mat'
- ./pcredemo -g 'cat|dog' 'the dog sat on the cat'
-
- Note that there is a much more comprehensive test program, called
- pcretest, which supports many more facilities for testing regular
- expressions and the PCRE library. The pcredemo program is provided as a
- simple coding example.
-
- On some operating systems (e.g. Solaris), when PCRE is not installed in
- the standard library directory, you may get an error like this when you
- try to run pcredemo:
-
- ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or
- directory
-
- This is caused by the way shared library support works on those sys-
- tems. You need to add
-
- -R/usr/local/lib
-
- (for example) to the compile command to get round this problem.
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 23 January 2008
- Copyright (c) 1997-2008 University of Cambridge.
-------------------------------------------------------------------------------
-PCRESTACK(3) PCRESTACK(3)
-
-
-NAME
- PCRE - Perl-compatible regular expressions
-
-
-PCRE DISCUSSION OF STACK USAGE
-
- When you call pcre_exec(), it makes use of an internal function called
- match(). This calls itself recursively at branch points in the pattern,
- in order to remember the state of the match so that it can back up and
- try a different alternative if the first one fails. As matching pro-
- ceeds deeper and deeper into the tree of possibilities, the recursion
- depth increases.
-
- Not all calls of match() increase the recursion depth; for an item such
- as a* it may be called several times at the same level, after matching
- different numbers of a's. Furthermore, in a number of cases where the
- result of the recursive call would immediately be passed back as the
- result of the current call (a "tail recursion"), the function is just
- restarted instead.
-
- The pcre_dfa_exec() function operates in an entirely different way, and
- hardly uses recursion at all. The limit on its complexity is the amount
- of workspace it is given. The comments that follow do NOT apply to
- pcre_dfa_exec(); they are relevant only for pcre_exec().
-
- You can set limits on the number of times that match() is called, both
- in total and recursively. If the limit is exceeded, an error occurs.
- For details, see the section on extra data for pcre_exec() in the
- pcreapi documentation.
-
- Each time that match() is actually called recursively, it uses memory
- from the process stack. For certain kinds of pattern and data, very
- large amounts of stack may be needed, despite the recognition of "tail
- recursion". You can often reduce the amount of recursion, and there-
- fore the amount of stack used, by modifying the pattern that is being
- matched. Consider, for example, this pattern:
-
- ([^<]|<(?!inet))+
-
- It matches from wherever it starts until it encounters "
-.PP
-.SM
-.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
-.ti +5n
-.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
-.ti +5n
-.B const unsigned char *\fItableptr\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function compiles a regular expression into an internal form. It is the
-same as \fBpcre_compile2()\fP, except for the absence of the \fIerrorcodeptr\fP
-argument. Its arguments are:
-.sp
- \fIpattern\fR A zero-terminated string containing the
- regular expression to be compiled
- \fIoptions\fR Zero or more option bits
- \fIerrptr\fR Where to put an error message
- \fIerroffset\fR Offset in pattern where error was found
- \fItableptr\fR Pointer to character tables, or NULL to
- use the built-in default
-.sp
-The option bits are:
-.sp
- PCRE_ANCHORED Force pattern anchoring
- PCRE_AUTO_CALLOUT Compile automatic callouts
- PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE \eR matches all Unicode line endings
- PCRE_CASELESS Do caseless matching
- PCRE_DOLLAR_ENDONLY $ not to match newline at end
- PCRE_DOTALL . matches anything including NL
- PCRE_DUPNAMES Allow duplicate names for subpatterns
- PCRE_EXTENDED Ignore whitespace and # comments
- PCRE_EXTRA PCRE extra features
- (not much use currently)
- PCRE_FIRSTLINE Force matching to be before newline
- PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
- PCRE_MULTILINE ^ and $ match newlines within data
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
- sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
- theses (named ones available)
- PCRE_UNGREEDY Invert greediness of quantifiers
- PCRE_UTF8 Run in UTF-8 mode
- PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
- validity (only relevant if
- PCRE_UTF8 is set)
-.sp
-PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
-PCRE_NO_UTF8_CHECK.
-.P
-The yield of the function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected. Note that
-compiling regular expressions with one version of PCRE for use with a different
-version is not guaranteed to work and may cause crashes.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fR
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fR
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_compile2.3 b/libs/pcre/doc/pcre_compile2.3
deleted file mode 100644
index 1e71aff60e..0000000000
--- a/libs/pcre/doc/pcre_compile2.3
+++ /dev/null
@@ -1,77 +0,0 @@
-.TH PCRE_COMPILE2 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
-.ti +5n
-.B int *\fIerrorcodeptr\fP,
-.ti +5n
-.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
-.ti +5n
-.B const unsigned char *\fItableptr\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function compiles a regular expression into an internal form. It is the
-same as \fBpcre_compile()\fP, except for the addition of the \fIerrorcodeptr\fP
-argument. The arguments are:
-
-.sp
- \fIpattern\fR A zero-terminated string containing the
- regular expression to be compiled
- \fIoptions\fR Zero or more option bits
- \fIerrorcodeptr\fP Where to put an error code
- \fIerrptr\fR Where to put an error message
- \fIerroffset\fR Offset in pattern where error was found
- \fItableptr\fR Pointer to character tables, or NULL to
- use the built-in default
-.sp
-The option bits are:
-.sp
- PCRE_ANCHORED Force pattern anchoring
- PCRE_AUTO_CALLOUT Compile automatic callouts
- PCRE_CASELESS Do caseless matching
- PCRE_DOLLAR_ENDONLY $ not to match newline at end
- PCRE_DOTALL . matches anything including NL
- PCRE_DUPNAMES Allow duplicate names for subpatterns
- PCRE_EXTENDED Ignore whitespace and # comments
- PCRE_EXTRA PCRE extra features
- (not much use currently)
- PCRE_FIRSTLINE Force matching to be before newline
- PCRE_MULTILINE ^ and $ match newlines within data
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
- theses (named ones available)
- PCRE_UNGREEDY Invert greediness of quantifiers
- PCRE_UTF8 Run in UTF-8 mode
- PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
- validity (only relevant if
- PCRE_UTF8 is set)
-.sp
-PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
-PCRE_NO_UTF8_CHECK.
-.P
-The yield of the function is a pointer to a private data structure that
-contains the compiled pattern, or NULL if an error was detected. Note that
-compiling regular expressions with one version of PCRE for use with a different
-version is not guaranteed to work and may cause crashes.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fR
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fR
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_config.3 b/libs/pcre/doc/pcre_config.3
deleted file mode 100644
index b111a70cad..0000000000
--- a/libs/pcre/doc/pcre_config.3
+++ /dev/null
@@ -1,57 +0,0 @@
-.TH PCRE_CONFIG 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function makes it possible for a client program to find out which optional
-features are available in the version of the PCRE library it is using. Its
-arguments are as follows:
-.sp
- \fIwhat\fR A code specifying what information is required
- \fIwhere\fR Points to where to put the data
-.sp
-The available codes are:
-.sp
- PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
- PCRE_CONFIG_MATCH_LIMIT Internal resource limit
- PCRE_CONFIG_MATCH_LIMIT_RECURSION
- Internal recursion depth limit
- PCRE_CONFIG_NEWLINE Value of the default newline sequence:
- 13 (0x000d) for CR
- 10 (0x000a) for LF
- 3338 (0x0d0a) for CRLF
- -2 for ANYCRLF
- -1 for ANY
- PCRE_CONFIG_BSR Indicates what \eR matches by default:
- 0 all Unicode line endings
- 1 CR, LF, or CRLF only
- PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
- Threshold of return slots, above
- which \fBmalloc()\fR is used by
- the POSIX API
- PCRE_CONFIG_STACKRECURSE Recursion implementation (1=stack 0=heap)
- PCRE_CONFIG_UTF8 Availability of UTF-8 support (1=yes 0=no)
- PCRE_CONFIG_UNICODE_PROPERTIES
- Availability of Unicode property support
- (1=yes 0=no)
-.sp
-The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fR
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fR
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_copy_named_substring.3 b/libs/pcre/doc/pcre_copy_named_substring.3
deleted file mode 100644
index 9ad682655b..0000000000
--- a/libs/pcre/doc/pcre_copy_named_substring.3
+++ /dev/null
@@ -1,43 +0,0 @@
-.TH PCRE_COPY_NAMED_SUBSTRING 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, const char *\fIstringname\fP,
-.ti +5n
-.B char *\fIbuffer\fP, int \fIbuffersize\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This is a convenience function for extracting a captured substring, identified
-by name, into a given buffer. The arguments are:
-.sp
- \fIcode\fP Pattern that was successfully matched
- \fIsubject\fP Subject that has been successfully matched
- \fIovector\fP Offset vector that \fBpcre_exec()\fP used
- \fIstringcount\fP Value returned by \fBpcre_exec()\fP
- \fIstringname\fP Name of the required substring
- \fIbuffer\fP Buffer to receive the string
- \fIbuffersize\fP Size of buffer
-.sp
-The yield is the length of the substring, PCRE_ERROR_NOMEMORY if the buffer was
-too small, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_copy_substring.3 b/libs/pcre/doc/pcre_copy_substring.3
deleted file mode 100644
index 1910d185e7..0000000000
--- a/libs/pcre/doc/pcre_copy_substring.3
+++ /dev/null
@@ -1,40 +0,0 @@
-.TH PCRE_COPY_SUBSTRING 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
-.ti +5n
-.B int \fIbuffersize\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This is a convenience function for extracting a captured substring into a given
-buffer. The arguments are:
-.sp
- \fIsubject\fP Subject that has been successfully matched
- \fIovector\fP Offset vector that \fBpcre_exec()\fP used
- \fIstringcount\fP Value returned by \fBpcre_exec()\fP
- \fIstringnumber\fP Number of the required substring
- \fIbuffer\fP Buffer to receive the string
- \fIbuffersize\fP Size of buffer
-.sp
-The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
-too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_dfa_exec.3 b/libs/pcre/doc/pcre_dfa_exec.3
deleted file mode 100644
index a046276d63..0000000000
--- a/libs/pcre/doc/pcre_dfa_exec.3
+++ /dev/null
@@ -1,89 +0,0 @@
-.TH PCRE_DFA_EXEC 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
-.ti +5n
-.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,
-.ti +5n
-.B int *\fIworkspace\fP, int \fIwscount\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function matches a compiled regular expression against a given subject
-string, using an alternative matching algorithm that scans the subject string
-just once (\fInot\fP Perl-compatible). Note that the main, Perl-compatible,
-matching function is \fBpcre_exec()\fP. The arguments for this function are:
-.sp
- \fIcode\fP Points to the compiled pattern
- \fIextra\fP Points to an associated \fBpcre_extra\fP structure,
- or is NULL
- \fIsubject\fP Points to the subject string
- \fIlength\fP Length of the subject string, in bytes
- \fIstartoffset\fP Offset in bytes in the subject at which to
- start matching
- \fIoptions\fP Option bits
- \fIovector\fP Points to a vector of ints for result offsets
- \fIovecsize\fP Number of elements in the vector
- \fIworkspace\fP Points to a vector of ints used as working space
- \fIwscount\fP Number of elements in the vector
-.sp
-The options are:
-.sp
- PCRE_ANCHORED Match only at the first position
- PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE \eR matches all Unicode line endings
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NOTBOL Subject is not the beginning of a line
- PCRE_NOTEOL Subject is not the end of a line
- PCRE_NOTEMPTY An empty string is not a valid match
- PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
- PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
- validity (only relevant if PCRE_UTF8
- was set at compile time)
- PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
- PCRE_DFA_SHORTEST Return only the shortest match
- PCRE_DFA_RESTART This is a restart after a partial match
-.sp
-There are restrictions on what may appear in a pattern when using this matching
-function. Details are given in the
-.\" HREF
-\fBpcrematching\fP
-.\"
-documentation.
-.P
-A \fBpcre_extra\fP structure contains the following fields:
-.sp
- \fIflags\fP Bits indicating which fields are set
- \fIstudy_data\fP Opaque data from \fBpcre_study()\fP
- \fImatch_limit\fP Limit on internal resource use
- \fImatch_limit_recursion\fP Limit on internal recursion depth
- \fIcallout_data\fP Opaque data passed back to callouts
- \fItables\fP Points to character tables or is NULL
-.sp
-The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
-PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
-PCRE_EXTRA_TABLES. For this matching function, the \fImatch_limit\fP and
-\fImatch_limit_recursion\fP fields are not used, and must not be set.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_exec.3 b/libs/pcre/doc/pcre_exec.3
deleted file mode 100644
index 21bf6da5e0..0000000000
--- a/libs/pcre/doc/pcre_exec.3
+++ /dev/null
@@ -1,81 +0,0 @@
-.TH PCRE_EXEC 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
-.ti +5n
-.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function matches a compiled regular expression against a given subject
-string, using a matching algorithm that is similar to Perl's. It returns
-offsets to captured substrings. Its arguments are:
-.sp
- \fIcode\fP Points to the compiled pattern
- \fIextra\fP Points to an associated \fBpcre_extra\fP structure,
- or is NULL
- \fIsubject\fP Points to the subject string
- \fIlength\fP Length of the subject string, in bytes
- \fIstartoffset\fP Offset in bytes in the subject at which to
- start matching
- \fIoptions\fP Option bits
- \fIovector\fP Points to a vector of ints for result offsets
- \fIovecsize\fP Number of elements in the vector (a multiple of 3)
-.sp
-The options are:
-.sp
- PCRE_ANCHORED Match only at the first position
- PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE \eR matches all Unicode line endings
- PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
- PCRE_NEWLINE_CR Set CR as the newline sequence
- PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
- PCRE_NEWLINE_LF Set LF as the newline sequence
- PCRE_NOTBOL Subject is not the beginning of a line
- PCRE_NOTEOL Subject is not the end of a line
- PCRE_NOTEMPTY An empty string is not a valid match
- PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
- PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
- validity (only relevant if PCRE_UTF8
- was set at compile time)
- PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
-.sp
-There are restrictions on what may appear in a pattern when partial matching is
-requested. For details, see the
-.\" HREF
-\fBpcrepartial\fP
-.\"
-page.
-.P
-A \fBpcre_extra\fP structure contains the following fields:
-.sp
- \fIflags\fP Bits indicating which fields are set
- \fIstudy_data\fP Opaque data from \fBpcre_study()\fP
- \fImatch_limit\fP Limit on internal resource use
- \fImatch_limit_recursion\fP Limit on internal recursion depth
- \fIcallout_data\fP Opaque data passed back to callouts
- \fItables\fP Points to character tables or is NULL
-.sp
-The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
-PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
-PCRE_EXTRA_TABLES.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_free_substring.3 b/libs/pcre/doc/pcre_free_substring.3
deleted file mode 100644
index ed3999a746..0000000000
--- a/libs/pcre/doc/pcre_free_substring.3
+++ /dev/null
@@ -1,27 +0,0 @@
-.TH PCRE_FREE_SUBSTRING 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B void pcre_free_substring(const char *\fIstringptr\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This is a convenience function for freeing the store obtained by a previous
-call to \fBpcre_get_substring()\fP or \fBpcre_get_named_substring()\fP. Its
-only argument is a pointer to the string.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_free_substring_list.3 b/libs/pcre/doc/pcre_free_substring_list.3
deleted file mode 100644
index 89b7078524..0000000000
--- a/libs/pcre/doc/pcre_free_substring_list.3
+++ /dev/null
@@ -1,27 +0,0 @@
-.TH PCRE_FREE_SUBSTRING_LIST 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B void pcre_free_substring_list(const char **\fIstringptr\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This is a convenience function for freeing the store obtained by a previous
-call to \fBpcre_get_substring_list()\fP. Its only argument is a pointer to the
-list of string pointers.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_fullinfo.3 b/libs/pcre/doc/pcre_fullinfo.3
deleted file mode 100644
index 3cf8cbddef..0000000000
--- a/libs/pcre/doc/pcre_fullinfo.3
+++ /dev/null
@@ -1,59 +0,0 @@
-.TH PCRE_FULLINFO 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B int \fIwhat\fP, void *\fIwhere\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function returns information about a compiled pattern. Its arguments are:
-.sp
- \fIcode\fP Compiled regular expression
- \fIextra\fP Result of \fBpcre_study()\fP or NULL
- \fIwhat\fP What information is required
- \fIwhere\fP Where to put the information
-.sp
-The following information is available:
-.sp
- PCRE_INFO_BACKREFMAX Number of highest back reference
- PCRE_INFO_CAPTURECOUNT Number of capturing subpatterns
- PCRE_INFO_DEFAULT_TABLES Pointer to default tables
- PCRE_INFO_FIRSTBYTE Fixed first byte for a match, or
- -1 for start of string
- or after newline, or
- -2 otherwise
- PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
- PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
- PCRE_INFO_LASTLITERAL Literal last byte required
- PCRE_INFO_NAMECOUNT Number of named subpatterns
- PCRE_INFO_NAMEENTRYSIZE Size of name table entry
- PCRE_INFO_NAMETABLE Pointer to name table
- PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
- PCRE_INFO_OPTIONS Option bits used for compilation
- PCRE_INFO_SIZE Size of compiled pattern
- PCRE_INFO_STUDYSIZE Size of study data
-.sp
-The yield of the function is zero on success or:
-.sp
- PCRE_ERROR_NULL the argument \fIcode\fP was NULL
- the argument \fIwhere\fP was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_get_named_substring.3 b/libs/pcre/doc/pcre_get_named_substring.3
deleted file mode 100644
index 22d0c1be32..0000000000
--- a/libs/pcre/doc/pcre_get_named_substring.3
+++ /dev/null
@@ -1,45 +0,0 @@
-.TH PCRE_GET_NAMED_SUBSTRING 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_get_named_substring(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, const char *\fIstringname\fP,
-.ti +5n
-.B const char **\fIstringptr\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This is a convenience function for extracting a captured substring by name. The
-arguments are:
-.sp
- \fIcode\fP Compiled pattern
- \fIsubject\fP Subject that has been successfully matched
- \fIovector\fP Offset vector that \fBpcre_exec()\fP used
- \fIstringcount\fP Value returned by \fBpcre_exec()\fP
- \fIstringname\fP Name of the required substring
- \fIstringptr\fP Where to put the string pointer
-.sp
-The memory in which the substring is placed is obtained by calling
-\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
-be used to free it when it is no longer needed. The yield of the function is
-the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
-could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_get_stringnumber.3 b/libs/pcre/doc/pcre_get_stringnumber.3
deleted file mode 100644
index f6017ffba4..0000000000
--- a/libs/pcre/doc/pcre_get_stringnumber.3
+++ /dev/null
@@ -1,37 +0,0 @@
-.TH PCRE_GET_STRINGNUMBER 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIname\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This convenience function finds the number of a named substring capturing
-parenthesis in a compiled pattern. Its arguments are:
-.sp
- \fIcode\fP Compiled regular expression
- \fIname\fP Name whose number is required
-.sp
-The yield of the function is the number of the parenthesis if the name is
-found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
-(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
-\fBpcre_get_stringnumber()\fP. You can obtain the complete list by calling
-\fBpcre_get_stringtable_entries()\fP.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_get_stringtable_entries.3 b/libs/pcre/doc/pcre_get_stringtable_entries.3
deleted file mode 100644
index 979c4be5b9..0000000000
--- a/libs/pcre/doc/pcre_get_stringtable_entries.3
+++ /dev/null
@@ -1,40 +0,0 @@
-.TH PCRE_GET_STRINGTABLE_ENTRIES 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This convenience function finds, for a compiled pattern, the first and last
-entries for a given name in the table that translates capturing parenthesis
-names into numbers. When names are required to be unique (PCRE_DUPNAMES is
-\fInot\fP set), it is usually easier to use \fBpcre_get_stringnumber()\fP
-instead.
-.sp
- \fIcode\fP Compiled regular expression
- \fIname\fP Name whose entries required
- \fIfirst\fP Where to return a pointer to the first entry
- \fIlast\fP Where to return a pointer to the last entry
-.sp
-The yield of the function is the length of each entry, or
-PCRE_ERROR_NOSUBSTRING if none are found.
-.P
-There is a complete description of the PCRE native API, including the format of
-the table entries, in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page, and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_get_substring.3 b/libs/pcre/doc/pcre_get_substring.3
deleted file mode 100644
index 8fb11ec6d1..0000000000
--- a/libs/pcre/doc/pcre_get_substring.3
+++ /dev/null
@@ -1,42 +0,0 @@
-.TH PCRE_GET_SUBSTRING 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, int \fIstringnumber\fP,
-.ti +5n
-.B const char **\fIstringptr\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This is a convenience function for extracting a captured substring. The
-arguments are:
-.sp
- \fIsubject\fP Subject that has been successfully matched
- \fIovector\fP Offset vector that \fBpcre_exec()\fP used
- \fIstringcount\fP Value returned by \fBpcre_exec()\fP
- \fIstringnumber\fP Number of the required substring
- \fIstringptr\fP Where to put the string pointer
-.sp
-The memory in which the substring is placed is obtained by calling
-\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
-be used to free it when it is no longer needed. The yield of the function is
-the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
-be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_get_substring_list.3 b/libs/pcre/doc/pcre_get_substring_list.3
deleted file mode 100644
index 647ae39191..0000000000
--- a/libs/pcre/doc/pcre_get_substring_list.3
+++ /dev/null
@@ -1,41 +0,0 @@
-.TH PCRE_GET_SUBSTRING_LIST 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_get_substring_list(const char *\fIsubject\fP,
-.ti +5n
-.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
-.
-.SH DESCRIPTION
-.rs
-.sp
-This is a convenience function for extracting a list of all the captured
-substrings. The arguments are:
-.sp
- \fIsubject\fP Subject that has been successfully matched
- \fIovector\fP Offset vector that \fBpcre_exec\fP used
- \fIstringcount\fP Value returned by \fBpcre_exec\fP
- \fIlistptr\fP Where to put a pointer to the list
-.sp
-The memory in which the substrings and the list are placed is obtained by
-calling \fBpcre_malloc()\fP. The convenience function
-\fBpcre_free_substring_list()\fP can be used to free it when it is no longer
-needed. A pointer to a list of pointers is put in the variable whose address is
-in \fIlistptr\fP. The list is terminated by a NULL pointer. The yield of the
-function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
-not be obtained.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_info.3 b/libs/pcre/doc/pcre_info.3
deleted file mode 100644
index 8c78121de1..0000000000
--- a/libs/pcre/doc/pcre_info.3
+++ /dev/null
@@ -1,26 +0,0 @@
-.TH PCRE_INFO 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
-.B *\fIfirstcharptr\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function is obsolete. You should be using \fBpcre_fullinfo()\fP instead.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_maketables.3 b/libs/pcre/doc/pcre_maketables.3
deleted file mode 100644
index 8d3978c251..0000000000
--- a/libs/pcre/doc/pcre_maketables.3
+++ /dev/null
@@ -1,29 +0,0 @@
-.TH PCRE_MAKETABLES 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B const unsigned char *pcre_maketables(void);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function builds a set of character tables for character values less than
-256. These can be passed to \fBpcre_compile()\fP to override PCRE's internal,
-built-in tables (which were made by \fBpcre_maketables()\fP when PCRE was
-compiled). You might want to do this if you are using a non-standard locale.
-The function yields a pointer to the tables.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_refcount.3 b/libs/pcre/doc/pcre_refcount.3
deleted file mode 100644
index 6ab9f4fee7..0000000000
--- a/libs/pcre/doc/pcre_refcount.3
+++ /dev/null
@@ -1,32 +0,0 @@
-.TH PCRE_REFCOUNT 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function is used to maintain a reference count inside a data block that
-contains a compiled pattern. Its arguments are:
-.sp
- \fIcode\fP Compiled regular expression
- \fIadjust\fP Adjustment to reference value
-.sp
-The yield of the function is the adjusted reference value, which is constrained
-to lie between 0 and 65535.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_study.3 b/libs/pcre/doc/pcre_study.3
deleted file mode 100644
index 53f5bc1b48..0000000000
--- a/libs/pcre/doc/pcre_study.3
+++ /dev/null
@@ -1,42 +0,0 @@
-.TH PCRE_STUDY 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
-.ti +5n
-.B const char **\fIerrptr\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function studies a compiled pattern, to see if additional information can
-be extracted that might speed up matching. Its arguments are:
-.sp
- \fIcode\fP A compiled regular expression
- \fIoptions\fP Options for \fBpcre_study()\fP
- \fIerrptr\fP Where to put an error message
-.sp
-If the function succeeds, it returns a value that can be passed to
-\fBpcre_exec()\fP via its \fIextra\fP argument.
-.P
-If the function returns NULL, either it could not find any additional
-information, or there was an error. You can tell the difference by looking at
-the error value. It is NULL in first case.
-.P
-There are currently no options defined; the value of the second argument should
-always be zero.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcre_version.3 b/libs/pcre/doc/pcre_version.3
deleted file mode 100644
index f1563fac6a..0000000000
--- a/libs/pcre/doc/pcre_version.3
+++ /dev/null
@@ -1,26 +0,0 @@
-.TH PCRE_VERSION 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH SYNOPSIS
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B char *pcre_version(void);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This function returns a character string that gives the version number of the
-PCRE library and the date of its release.
-.P
-There is a complete description of the PCRE native API in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page and a description of the POSIX API in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-page.
diff --git a/libs/pcre/doc/pcreapi.3 b/libs/pcre/doc/pcreapi.3
deleted file mode 100644
index 7c830ff454..0000000000
--- a/libs/pcre/doc/pcreapi.3
+++ /dev/null
@@ -1,2001 +0,0 @@
-.TH PCREAPI 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE NATIVE API"
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
-.ti +5n
-.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
-.ti +5n
-.B const unsigned char *\fItableptr\fP);
-.PP
-.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
-.ti +5n
-.B int *\fIerrorcodeptr\fP,
-.ti +5n
-.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
-.ti +5n
-.B const unsigned char *\fItableptr\fP);
-.PP
-.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
-.ti +5n
-.B const char **\fIerrptr\fP);
-.PP
-.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
-.ti +5n
-.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
-.PP
-.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
-.ti +5n
-.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,
-.ti +5n
-.B int *\fIworkspace\fP, int \fIwscount\fP);
-.PP
-.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, const char *\fIstringname\fP,
-.ti +5n
-.B char *\fIbuffer\fP, int \fIbuffersize\fP);
-.PP
-.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
-.ti +5n
-.B int \fIbuffersize\fP);
-.PP
-.B int pcre_get_named_substring(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, const char *\fIstringname\fP,
-.ti +5n
-.B const char **\fIstringptr\fP);
-.PP
-.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIname\fP);
-.PP
-.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
-.PP
-.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, int \fIstringnumber\fP,
-.ti +5n
-.B const char **\fIstringptr\fP);
-.PP
-.B int pcre_get_substring_list(const char *\fIsubject\fP,
-.ti +5n
-.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
-.PP
-.B void pcre_free_substring(const char *\fIstringptr\fP);
-.PP
-.B void pcre_free_substring_list(const char **\fIstringptr\fP);
-.PP
-.B const unsigned char *pcre_maketables(void);
-.PP
-.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B int \fIwhat\fP, void *\fIwhere\fP);
-.PP
-.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
-.B *\fIfirstcharptr\fP);
-.PP
-.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
-.PP
-.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
-.PP
-.B char *pcre_version(void);
-.PP
-.B void *(*pcre_malloc)(size_t);
-.PP
-.B void (*pcre_free)(void *);
-.PP
-.B void *(*pcre_stack_malloc)(size_t);
-.PP
-.B void (*pcre_stack_free)(void *);
-.PP
-.B int (*pcre_callout)(pcre_callout_block *);
-.
-.
-.SH "PCRE API OVERVIEW"
-.rs
-.sp
-PCRE has its own native API, which is described in this document. There are
-also some wrapper functions that correspond to the POSIX regular expression
-API. These are described in the
-.\" HREF
-\fBpcreposix\fP
-.\"
-documentation. Both of these APIs define a set of C function calls. A C++
-wrapper is distributed with PCRE. It is documented in the
-.\" HREF
-\fBpcrecpp\fP
-.\"
-page.
-.P
-The native API C function prototypes are defined in the header file
-\fBpcre.h\fP, and on Unix systems the library itself is called \fBlibpcre\fP.
-It can normally be accessed by adding \fB-lpcre\fP to the command for linking
-an application that uses PCRE. The header file defines the macros PCRE_MAJOR
-and PCRE_MINOR to contain the major and minor release numbers for the library.
-Applications can use these to include support for different releases of PCRE.
-.P
-The functions \fBpcre_compile()\fP, \fBpcre_compile2()\fP, \fBpcre_study()\fP,
-and \fBpcre_exec()\fP are used for compiling and matching regular expressions
-in a Perl-compatible manner. A sample program that demonstrates the simplest
-way of using them is provided in the file called \fIpcredemo.c\fP in the source
-distribution. The
-.\" HREF
-\fBpcresample\fP
-.\"
-documentation describes how to compile and run it.
-.P
-A second matching function, \fBpcre_dfa_exec()\fP, which is not
-Perl-compatible, is also provided. This uses a different algorithm for the
-matching. The alternative algorithm finds all possible matches (at a given
-point in the subject), and scans the subject just once. However, this algorithm
-does not return captured substrings. A description of the two matching
-algorithms and their advantages and disadvantages is given in the
-.\" HREF
-\fBpcrematching\fP
-.\"
-documentation.
-.P
-In addition to the main compiling and matching functions, there are convenience
-functions for extracting captured substrings from a subject string that is
-matched by \fBpcre_exec()\fP. They are:
-.sp
- \fBpcre_copy_substring()\fP
- \fBpcre_copy_named_substring()\fP
- \fBpcre_get_substring()\fP
- \fBpcre_get_named_substring()\fP
- \fBpcre_get_substring_list()\fP
- \fBpcre_get_stringnumber()\fP
- \fBpcre_get_stringtable_entries()\fP
-.sp
-\fBpcre_free_substring()\fP and \fBpcre_free_substring_list()\fP are also
-provided, to free the memory used for extracted strings.
-.P
-The function \fBpcre_maketables()\fP is used to build a set of character tables
-in the current locale for passing to \fBpcre_compile()\fP, \fBpcre_exec()\fP,
-or \fBpcre_dfa_exec()\fP. This is an optional facility that is provided for
-specialist use. Most commonly, no special tables are passed, in which case
-internal tables that are generated when PCRE is built are used.
-.P
-The function \fBpcre_fullinfo()\fP is used to find out information about a
-compiled pattern; \fBpcre_info()\fP is an obsolete version that returns only
-some of the available information, but is retained for backwards compatibility.
-The function \fBpcre_version()\fP returns a pointer to a string containing the
-version of PCRE and its date of release.
-.P
-The function \fBpcre_refcount()\fP maintains a reference count in a data block
-containing a compiled pattern. This is provided for the benefit of
-object-oriented applications.
-.P
-The global variables \fBpcre_malloc\fP and \fBpcre_free\fP initially contain
-the entry points of the standard \fBmalloc()\fP and \fBfree()\fP functions,
-respectively. PCRE calls the memory management functions via these variables,
-so a calling program can replace them if it wishes to intercept the calls. This
-should be done before calling any PCRE functions.
-.P
-The global variables \fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP are also
-indirections to memory management functions. These special functions are used
-only when PCRE is compiled to use the heap for remembering data, instead of
-recursive function calls, when running the \fBpcre_exec()\fP function. See the
-.\" HREF
-\fBpcrebuild\fP
-.\"
-documentation for details of how to do this. It is a non-standard way of
-building PCRE, for use in environments that have limited stacks. Because of the
-greater use of memory management, it runs more slowly. Separate functions are
-provided so that special-purpose external code can be used for this case. When
-used, these functions are always called in a stack-like manner (last obtained,
-first freed), and always for memory blocks of the same size. There is a
-discussion about PCRE's stack usage in the
-.\" HREF
-\fBpcrestack\fP
-.\"
-documentation.
-.P
-The global variable \fBpcre_callout\fP initially contains NULL. It can be set
-by the caller to a "callout" function, which PCRE will then call at specified
-points during a matching operation. Details are given in the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation.
-.
-.
-.\" HTML
-.SH NEWLINES
-.rs
-.sp
-PCRE supports five different conventions for indicating line breaks in
-strings: a single CR (carriage return) character, a single LF (linefeed)
-character, the two-character sequence CRLF, any of the three preceding, or any
-Unicode newline sequence. The Unicode newline sequences are the three just
-mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
-U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
-(paragraph separator, U+2029).
-.P
-Each of the first three conventions is used by at least one operating system as
-its standard newline sequence. When PCRE is built, a default can be specified.
-The default default is LF, which is the Unix standard. When PCRE is run, the
-default can be overridden, either when a pattern is compiled, or when it is
-matched.
-.P
-At compile time, the newline convention can be specified by the \fIoptions\fP
-argument of \fBpcre_compile()\fP, or it can be specified by special text at the
-start of the pattern itself; this overrides any other settings. See the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-page for details of the special character sequences.
-.P
-In the PCRE documentation the word "newline" is used to mean "the character or
-pair of characters that indicate a line break". The choice of newline
-convention affects the handling of the dot, circumflex, and dollar
-metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
-recognized line ending sequence, the match position advancement for a
-non-anchored pattern. There is more detail about this in the
-.\" HTML
-.\"
-section on \fBpcre_exec()\fP options
-.\"
-below.
-.P
-The choice of newline convention does not affect the interpretation of
-the \en or \er escape sequences, nor does it affect what \eR matches, which is
-controlled in a similar way, but by separate options.
-.
-.
-.SH MULTITHREADING
-.rs
-.sp
-The PCRE functions can be used in multi-threading applications, with the
-proviso that the memory management functions pointed to by \fBpcre_malloc\fP,
-\fBpcre_free\fP, \fBpcre_stack_malloc\fP, and \fBpcre_stack_free\fP, and the
-callout function pointed to by \fBpcre_callout\fP, are shared by all threads.
-.P
-The compiled form of a regular expression is not altered during matching, so
-the same compiled pattern can safely be used by several threads at once.
-.
-.
-.SH "SAVING PRECOMPILED PATTERNS FOR LATER USE"
-.rs
-.sp
-The compiled form of a regular expression can be saved and re-used at a later
-time, possibly by a different program, and even on a host other than the one on
-which it was compiled. Details are given in the
-.\" HREF
-\fBpcreprecompile\fP
-.\"
-documentation. However, compiling a regular expression with one version of PCRE
-for use with a different version is not guaranteed to work and may cause
-crashes.
-.
-.
-.SH "CHECKING BUILD-TIME OPTIONS"
-.rs
-.sp
-.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
-.PP
-The function \fBpcre_config()\fP makes it possible for a PCRE client to
-discover which optional features have been compiled into the PCRE library. The
-.\" HREF
-\fBpcrebuild\fP
-.\"
-documentation has more details about these optional features.
-.P
-The first argument for \fBpcre_config()\fP is an integer, specifying which
-information is required; the second argument is a pointer to a variable into
-which the information is placed. The following information is available:
-.sp
- PCRE_CONFIG_UTF8
-.sp
-The output is an integer that is set to one if UTF-8 support is available;
-otherwise it is set to zero.
-.sp
- PCRE_CONFIG_UNICODE_PROPERTIES
-.sp
-The output is an integer that is set to one if support for Unicode character
-properties is available; otherwise it is set to zero.
-.sp
- PCRE_CONFIG_NEWLINE
-.sp
-The output is an integer whose value specifies the default character sequence
-that is recognized as meaning "newline". The four values that are supported
-are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY.
-Though they are derived from ASCII, the same values are returned in EBCDIC
-environments. The default should normally correspond to the standard sequence
-for your operating system.
-.sp
- PCRE_CONFIG_BSR
-.sp
-The output is an integer whose value indicates what character sequences the \eR
-escape sequence matches by default. A value of 0 means that \eR matches any
-Unicode line ending sequence; a value of 1 means that \eR matches only CR, LF,
-or CRLF. The default can be overridden when a pattern is compiled or matched.
-.sp
- PCRE_CONFIG_LINK_SIZE
-.sp
-The output is an integer that contains the number of bytes used for internal
-linkage in compiled regular expressions. The value is 2, 3, or 4. Larger values
-allow larger regular expressions to be compiled, at the expense of slower
-matching. The default value of 2 is sufficient for all but the most massive
-patterns, since it allows the compiled pattern to be up to 64K in size.
-.sp
- PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
-.sp
-The output is an integer that contains the threshold above which the POSIX
-interface uses \fBmalloc()\fP for output vectors. Further details are given in
-the
-.\" HREF
-\fBpcreposix\fP
-.\"
-documentation.
-.sp
- PCRE_CONFIG_MATCH_LIMIT
-.sp
-The output is a long integer that gives the default limit for the number of
-internal matching function calls in a \fBpcre_exec()\fP execution. Further
-details are given with \fBpcre_exec()\fP below.
-.sp
- PCRE_CONFIG_MATCH_LIMIT_RECURSION
-.sp
-The output is a long integer that gives the default limit for the depth of
-recursion when calling the internal matching function in a \fBpcre_exec()\fP
-execution. Further details are given with \fBpcre_exec()\fP below.
-.sp
- PCRE_CONFIG_STACKRECURSE
-.sp
-The output is an integer that is set to one if internal recursion when running
-\fBpcre_exec()\fP is implemented by recursive function calls that use the stack
-to remember their state. This is the usual way that PCRE is compiled. The
-output is zero if PCRE was compiled to use blocks of data on the heap instead
-of recursive function calls. In this case, \fBpcre_stack_malloc\fP and
-\fBpcre_stack_free\fP are called to manage memory blocks on the heap, thus
-avoiding the use of the stack.
-.
-.
-.SH "COMPILING A PATTERN"
-.rs
-.sp
-.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
-.ti +5n
-.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
-.ti +5n
-.B const unsigned char *\fItableptr\fP);
-.sp
-.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
-.ti +5n
-.B int *\fIerrorcodeptr\fP,
-.ti +5n
-.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
-.ti +5n
-.B const unsigned char *\fItableptr\fP);
-.P
-Either of the functions \fBpcre_compile()\fP or \fBpcre_compile2()\fP can be
-called to compile a pattern into an internal form. The only difference between
-the two interfaces is that \fBpcre_compile2()\fP has an additional argument,
-\fIerrorcodeptr\fP, via which a numerical error code can be returned.
-.P
-The pattern is a C string terminated by a binary zero, and is passed in the
-\fIpattern\fP argument. A pointer to a single block of memory that is obtained
-via \fBpcre_malloc\fP is returned. This contains the compiled code and related
-data. The \fBpcre\fP type is defined for the returned block; this is a typedef
-for a structure whose contents are not externally defined. It is up to the
-caller to free the memory (via \fBpcre_free\fP) when it is no longer required.
-.P
-Although the compiled code of a PCRE regex is relocatable, that is, it does not
-depend on memory location, the complete \fBpcre\fP data block is not
-fully relocatable, because it may contain a copy of the \fItableptr\fP
-argument, which is an address (see below).
-.P
-The \fIoptions\fP argument contains various bit settings that affect the
-compilation. It should be zero if no options are required. The available
-options are described below. Some of them (in particular, those that are
-compatible with Perl, but also some others) can also be set and unset from
-within the pattern (see the detailed description in the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation). For those options that can be different in different parts of
-the pattern, the contents of the \fIoptions\fP argument specifies their initial
-settings at the start of compilation and execution. The PCRE_ANCHORED and
-PCRE_NEWLINE_\fIxxx\fP options can be set at the time of matching as well as at
-compile time.
-.P
-If \fIerrptr\fP is NULL, \fBpcre_compile()\fP returns NULL immediately.
-Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fP returns
-NULL, and sets the variable pointed to by \fIerrptr\fP to point to a textual
-error message. This is a static string that is part of the library. You must
-not try to free it. The offset from the start of the pattern to the character
-where the error was discovered is placed in the variable pointed to by
-\fIerroffset\fP, which must not be NULL. If it is, an immediate error is given.
-.P
-If \fBpcre_compile2()\fP is used instead of \fBpcre_compile()\fP, and the
-\fIerrorcodeptr\fP argument is not NULL, a non-zero error code number is
-returned via this argument in the event of an error. This is in addition to the
-textual error message. Error codes and messages are listed below.
-.P
-If the final argument, \fItableptr\fP, is NULL, PCRE uses a default set of
-character tables that are built when PCRE is compiled, using the default C
-locale. Otherwise, \fItableptr\fP must be an address that is the result of a
-call to \fBpcre_maketables()\fP. This value is stored with the compiled
-pattern, and used again by \fBpcre_exec()\fP, unless another table pointer is
-passed to it. For more discussion, see the section on locale support below.
-.P
-This code fragment shows a typical straightforward call to \fBpcre_compile()\fP:
-.sp
- pcre *re;
- const char *error;
- int erroffset;
- re = pcre_compile(
- "^A.*Z", /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-.sp
-The following names for option bits are defined in the \fBpcre.h\fP header
-file:
-.sp
- PCRE_ANCHORED
-.sp
-If this bit is set, the pattern is forced to be "anchored", that is, it is
-constrained to match only at the first matching point in the string that is
-being searched (the "subject string"). This effect can also be achieved by
-appropriate constructs in the pattern itself, which is the only way to do it in
-Perl.
-.sp
- PCRE_AUTO_CALLOUT
-.sp
-If this bit is set, \fBpcre_compile()\fP automatically inserts callout items,
-all with number 255, before each pattern item. For discussion of the callout
-facility, see the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation.
-.sp
- PCRE_BSR_ANYCRLF
- PCRE_BSR_UNICODE
-.sp
-These options (which are mutually exclusive) control what the \eR escape
-sequence matches. The choice is either to match only CR, LF, or CRLF, or to
-match any Unicode newline sequence. The default is specified when PCRE is
-built. It can be overridden from within the pattern, or by setting an option
-when a compiled pattern is matched.
-.sp
- PCRE_CASELESS
-.sp
-If this bit is set, letters in the pattern match both upper and lower case
-letters. It is equivalent to Perl's /i option, and it can be changed within a
-pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the
-concept of case for characters whose values are less than 128, so caseless
-matching is always possible. For characters with higher values, the concept of
-case is supported if PCRE is compiled with Unicode property support, but not
-otherwise. If you want to use caseless matching for characters 128 and above,
-you must ensure that PCRE is compiled with Unicode property support as well as
-with UTF-8 support.
-.sp
- PCRE_DOLLAR_ENDONLY
-.sp
-If this bit is set, a dollar metacharacter in the pattern matches only at the
-end of the subject string. Without this option, a dollar also matches
-immediately before a newline at the end of the string (but not before any other
-newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
-There is no equivalent to this option in Perl, and no way to set it within a
-pattern.
-.sp
- PCRE_DOTALL
-.sp
-If this bit is set, a dot metacharater in the pattern matches all characters,
-including those that indicate newline. Without it, a dot does not match when
-the current position is at a newline. This option is equivalent to Perl's /s
-option, and it can be changed within a pattern by a (?s) option setting. A
-negative class such as [^a] always matches newline characters, independent of
-the setting of this option.
-.sp
- PCRE_DUPNAMES
-.sp
-If this bit is set, names used to identify capturing subpatterns need not be
-unique. This can be helpful for certain types of pattern when it is known that
-only one instance of the named subpattern can ever be matched. There are more
-details of named subpatterns below; see also the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation.
-.sp
- PCRE_EXTENDED
-.sp
-If this bit is set, whitespace data characters in the pattern are totally
-ignored except when escaped or inside a character class. Whitespace does not
-include the VT character (code 11). In addition, characters between an
-unescaped # outside a character class and the next newline, inclusive, are also
-ignored. This is equivalent to Perl's /x option, and it can be changed within a
-pattern by a (?x) option setting.
-.P
-This option makes it possible to include comments inside complicated patterns.
-Note, however, that this applies only to data characters. Whitespace characters
-may never appear within special character sequences in a pattern, for example
-within the sequence (?( which introduces a conditional subpattern.
-.sp
- PCRE_EXTRA
-.sp
-This option was invented in order to turn on additional functionality of PCRE
-that is incompatible with Perl, but it is currently of very little use. When
-set, any backslash in a pattern that is followed by a letter that has no
-special meaning causes an error, thus reserving these combinations for future
-expansion. By default, as in Perl, a backslash followed by a letter with no
-special meaning is treated as a literal. (Perl can, however, be persuaded to
-give a warning for this.) There are at present no other features controlled by
-this option. It can also be set by a (?X) option setting within a pattern.
-.sp
- PCRE_FIRSTLINE
-.sp
-If this option is set, an unanchored pattern is required to match before or at
-the first newline in the subject string, though the matched text may continue
-over the newline.
-.sp
- PCRE_JAVASCRIPT_COMPAT
-.sp
-If this option is set, PCRE's behaviour is changed in some ways so that it is
-compatible with JavaScript rather than Perl. The changes are as follows:
-.P
-(1) A lone closing square bracket in a pattern causes a compile-time error,
-because this is illegal in JavaScript (by default it is treated as a data
-character). Thus, the pattern AB]CD becomes illegal when this option is set.
-.P
-(2) At run time, a back reference to an unset subpattern group matches an empty
-string (by default this causes the current matching alternative to fail). A
-pattern such as (\e1)(a) succeeds when this option is set (assuming it can find
-an "a" in the subject), whereas it fails by default, for Perl compatibility.
-.sp
- PCRE_MULTILINE
-.sp
-By default, PCRE treats the subject string as consisting of a single line of
-characters (even if it actually contains newlines). The "start of line"
-metacharacter (^) matches only at the start of the string, while the "end of
-line" metacharacter ($) matches only at the end of the string, or before a
-terminating newline (unless PCRE_DOLLAR_ENDONLY is set). This is the same as
-Perl.
-.P
-When PCRE_MULTILINE it is set, the "start of line" and "end of line" constructs
-match immediately following or immediately before internal newlines in the
-subject string, respectively, as well as at the very start and end. This is
-equivalent to Perl's /m option, and it can be changed within a pattern by a
-(?m) option setting. If there are no newlines in a subject string, or no
-occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
-.sp
- PCRE_NEWLINE_CR
- PCRE_NEWLINE_LF
- PCRE_NEWLINE_CRLF
- PCRE_NEWLINE_ANYCRLF
- PCRE_NEWLINE_ANY
-.sp
-These options override the default newline definition that was chosen when PCRE
-was built. Setting the first or the second specifies that a newline is
-indicated by a single character (CR or LF, respectively). Setting
-PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
-CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
-preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
-that any Unicode newline sequence should be recognized. The Unicode newline
-sequences are the three just mentioned, plus the single characters VT (vertical
-tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
-separator, U+2028), and PS (paragraph separator, U+2029). The last two are
-recognized only in UTF-8 mode.
-.P
-The newline setting in the options word uses three bits that are treated
-as a number, giving eight possibilities. Currently only six are used (default
-plus the five values above). This means that if you set more than one newline
-option, the combination may or may not be sensible. For example,
-PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
-other combinations may yield unused numbers and cause an error.
-.P
-The only time that a line break is specially recognized when compiling a
-pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
-class is encountered. This indicates a comment that lasts until after the next
-line break sequence. In other circumstances, line break sequences are treated
-as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated
-as whitespace characters and are therefore ignored.
-.P
-The newline option that is set at compile time becomes the default that is used
-for \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, but it can be overridden.
-.sp
- PCRE_NO_AUTO_CAPTURE
-.sp
-If this option is set, it disables the use of numbered capturing parentheses in
-the pattern. Any opening parenthesis that is not followed by ? behaves as if it
-were followed by ?: but named parentheses can still be used for capturing (and
-they acquire numbers in the usual way). There is no equivalent of this option
-in Perl.
-.sp
- PCRE_UNGREEDY
-.sp
-This option inverts the "greediness" of the quantifiers so that they are not
-greedy by default, but become greedy if followed by "?". It is not compatible
-with Perl. It can also be set by a (?U) option setting within the pattern.
-.sp
- PCRE_UTF8
-.sp
-This option causes PCRE to regard both the pattern and the subject as strings
-of UTF-8 characters instead of single-byte character strings. However, it is
-available only when PCRE is built to include UTF-8 support. If not, the use
-of this option provokes an error. Details of how this option changes the
-behaviour of PCRE are given in the
-.\" HTML
-.\"
-section on UTF-8 support
-.\"
-in the main
-.\" HREF
-\fBpcre\fP
-.\"
-page.
-.sp
- PCRE_NO_UTF8_CHECK
-.sp
-When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
-automatically checked. There is a discussion about the
-.\" HTML
-.\"
-validity of UTF-8 strings
-.\"
-in the main
-.\" HREF
-\fBpcre\fP
-.\"
-page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_compile()\fP
-returns an error. If you already know that your pattern is valid, and you want
-to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK
-option. When it is set, the effect of passing an invalid UTF-8 string as a
-pattern is undefined. It may cause your program to crash. Note that this option
-can also be passed to \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, to suppress
-the UTF-8 validity checking of subject strings.
-.
-.
-.SH "COMPILATION ERROR CODES"
-.rs
-.sp
-The following table lists the error codes than may be returned by
-\fBpcre_compile2()\fP, along with the error messages that may be returned by
-both compiling functions. As PCRE has developed, some error codes have fallen
-out of use. To avoid confusion, they have not been re-used.
-.sp
- 0 no error
- 1 \e at end of pattern
- 2 \ec at end of pattern
- 3 unrecognized character follows \e
- 4 numbers out of order in {} quantifier
- 5 number too big in {} quantifier
- 6 missing terminating ] for character class
- 7 invalid escape sequence in character class
- 8 range out of order in character class
- 9 nothing to repeat
- 10 [this code is not in use]
- 11 internal error: unexpected repeat
- 12 unrecognized character after (? or (?-
- 13 POSIX named classes are supported only within a class
- 14 missing )
- 15 reference to non-existent subpattern
- 16 erroffset passed as NULL
- 17 unknown option bit(s) set
- 18 missing ) after comment
- 19 [this code is not in use]
- 20 regular expression is too large
- 21 failed to get memory
- 22 unmatched parentheses
- 23 internal error: code overflow
- 24 unrecognized character after (?<
- 25 lookbehind assertion is not fixed length
- 26 malformed number or name after (?(
- 27 conditional group contains more than two branches
- 28 assertion expected after (?(
- 29 (?R or (?[+-]digits must be followed by )
- 30 unknown POSIX class name
- 31 POSIX collating elements are not supported
- 32 this version of PCRE is not compiled with PCRE_UTF8 support
- 33 [this code is not in use]
- 34 character value in \ex{...} sequence is too large
- 35 invalid condition (?(0)
- 36 \eC not allowed in lookbehind assertion
- 37 PCRE does not support \eL, \el, \eN, \eU, or \eu
- 38 number after (?C is > 255
- 39 closing ) for (?C expected
- 40 recursive call could loop indefinitely
- 41 unrecognized character after (?P
- 42 syntax error in subpattern name (missing terminator)
- 43 two named subpatterns have the same name
- 44 invalid UTF-8 string
- 45 support for \eP, \ep, and \eX has not been compiled
- 46 malformed \eP or \ep sequence
- 47 unknown property name after \eP or \ep
- 48 subpattern name is too long (maximum 32 characters)
- 49 too many named subpatterns (maximum 10000)
- 50 [this code is not in use]
- 51 octal value is greater than \e377 (not in UTF-8 mode)
- 52 internal error: overran compiling workspace
- 53 internal error: previously-checked referenced subpattern not found
- 54 DEFINE group contains more than one branch
- 55 repeating a DEFINE group is not allowed
- 56 inconsistent NEWLINE options
- 57 \eg is not followed by a braced, angle-bracketed, or quoted
- name/number or by a plain number
- 58 a numbered reference must not be zero
- 59 (*VERB) with an argument is not supported
- 60 (*VERB) not recognized
- 61 number is too big
- 62 subpattern name expected
- 63 digit expected after (?+
- 64 ] is an invalid data character in JavaScript compatibility mode
-.sp
-The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
-be used if the limits were changed when PCRE was built.
-.
-.
-.SH "STUDYING A PATTERN"
-.rs
-.sp
-.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP
-.ti +5n
-.B const char **\fIerrptr\fP);
-.PP
-If a compiled pattern is going to be used several times, it is worth spending
-more time analyzing it in order to speed up the time taken for matching. The
-function \fBpcre_study()\fP takes a pointer to a compiled pattern as its first
-argument. If studying the pattern produces additional information that will
-help speed up matching, \fBpcre_study()\fP returns a pointer to a
-\fBpcre_extra\fP block, in which the \fIstudy_data\fP field points to the
-results of the study.
-.P
-The returned value from \fBpcre_study()\fP can be passed directly to
-\fBpcre_exec()\fP. However, a \fBpcre_extra\fP block also contains other
-fields that can be set by the caller before the block is passed; these are
-described
-.\" HTML
-.\"
-below
-.\"
-in the section on matching a pattern.
-.P
-If studying the pattern does not produce any additional information
-\fBpcre_study()\fP returns NULL. In that circumstance, if the calling program
-wants to pass any of the other fields to \fBpcre_exec()\fP, it must set up its
-own \fBpcre_extra\fP block.
-.P
-The second argument of \fBpcre_study()\fP contains option bits. At present, no
-options are defined, and this argument should always be zero.
-.P
-The third argument for \fBpcre_study()\fP is a pointer for an error message. If
-studying succeeds (even if no data is returned), the variable it points to is
-set to NULL. Otherwise it is set to point to a textual error message. This is a
-static string that is part of the library. You must not try to free it. You
-should test the error pointer for NULL after calling \fBpcre_study()\fP, to be
-sure that it has run successfully.
-.P
-This is a typical call to \fBpcre_study\fP():
-.sp
- pcre_extra *pe;
- pe = pcre_study(
- re, /* result of pcre_compile() */
- 0, /* no options exist */
- &error); /* set to NULL or points to a message */
-.sp
-At present, studying a pattern is useful only for non-anchored patterns that do
-not have a single fixed starting character. A bitmap of possible starting
-bytes is created.
-.
-.
-.\" HTML
-.SH "LOCALE SUPPORT"
-.rs
-.sp
-PCRE handles caseless matching, and determines whether characters are letters,
-digits, or whatever, by reference to a set of tables, indexed by character
-value. When running in UTF-8 mode, this applies only to characters with codes
-less than 128. Higher-valued codes never match escapes such as \ew or \ed, but
-can be tested with \ep if PCRE is built with Unicode character property
-support. The use of locales with Unicode is discouraged. If you are handling
-characters with codes greater than 128, you should either use UTF-8 and
-Unicode, or use locales, but not try to mix the two.
-.P
-PCRE contains an internal set of tables that are used when the final argument
-of \fBpcre_compile()\fP is NULL. These are sufficient for many applications.
-Normally, the internal tables recognize only ASCII characters. However, when
-PCRE is built, it is possible to cause the internal tables to be rebuilt in the
-default "C" locale of the local system, which may cause them to be different.
-.P
-The internal tables can always be overridden by tables supplied by the
-application that calls PCRE. These may be created in a different locale from
-the default. As more and more applications change to using Unicode, the need
-for this locale support is expected to die away.
-.P
-External tables are built by calling the \fBpcre_maketables()\fP function,
-which has no arguments, in the relevant locale. The result can then be passed
-to \fBpcre_compile()\fP or \fBpcre_exec()\fP as often as necessary. For
-example, to build and use tables that are appropriate for the French locale
-(where accented characters with values greater than 128 are treated as letters),
-the following code could be used:
-.sp
- setlocale(LC_CTYPE, "fr_FR");
- tables = pcre_maketables();
- re = pcre_compile(..., tables);
-.sp
-The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
-are using Windows, the name for the French locale is "french".
-.P
-When \fBpcre_maketables()\fP runs, the tables are built in memory that is
-obtained via \fBpcre_malloc\fP. It is the caller's responsibility to ensure
-that the memory containing the tables remains available for as long as it is
-needed.
-.P
-The pointer that is passed to \fBpcre_compile()\fP is saved with the compiled
-pattern, and the same tables are used via this pointer by \fBpcre_study()\fP
-and normally also by \fBpcre_exec()\fP. Thus, by default, for any single
-pattern, compilation, studying and matching all happen in the same locale, but
-different patterns can be compiled in different locales.
-.P
-It is possible to pass a table pointer or NULL (indicating the use of the
-internal tables) to \fBpcre_exec()\fP. Although not intended for this purpose,
-this facility could be used to match a pattern in a different locale from the
-one in which it was compiled. Passing table pointers at run time is discussed
-below in the section on matching a pattern.
-.
-.
-.SH "INFORMATION ABOUT A PATTERN"
-.rs
-.sp
-.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B int \fIwhat\fP, void *\fIwhere\fP);
-.PP
-The \fBpcre_fullinfo()\fP function returns information about a compiled
-pattern. It replaces the obsolete \fBpcre_info()\fP function, which is
-nevertheless retained for backwards compability (and is documented below).
-.P
-The first argument for \fBpcre_fullinfo()\fP is a pointer to the compiled
-pattern. The second argument is the result of \fBpcre_study()\fP, or NULL if
-the pattern was not studied. The third argument specifies which piece of
-information is required, and the fourth argument is a pointer to a variable
-to receive the data. The yield of the function is zero for success, or one of
-the following negative numbers:
-.sp
- PCRE_ERROR_NULL the argument \fIcode\fP was NULL
- the argument \fIwhere\fP was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid
-.sp
-The "magic number" is placed at the start of each compiled pattern as an simple
-check against passing an arbitrary memory pointer. Here is a typical call of
-\fBpcre_fullinfo()\fP, to obtain the length of the compiled pattern:
-.sp
- int rc;
- size_t length;
- rc = pcre_fullinfo(
- re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
- PCRE_INFO_SIZE, /* what is required */
- &length); /* where to put the data */
-.sp
-The possible values for the third argument are defined in \fBpcre.h\fP, and are
-as follows:
-.sp
- PCRE_INFO_BACKREFMAX
-.sp
-Return the number of the highest back reference in the pattern. The fourth
-argument should point to an \fBint\fP variable. Zero is returned if there are
-no back references.
-.sp
- PCRE_INFO_CAPTURECOUNT
-.sp
-Return the number of capturing subpatterns in the pattern. The fourth argument
-should point to an \fBint\fP variable.
-.sp
- PCRE_INFO_DEFAULT_TABLES
-.sp
-Return a pointer to the internal default character tables within PCRE. The
-fourth argument should point to an \fBunsigned char *\fP variable. This
-information call is provided for internal use by the \fBpcre_study()\fP
-function. External callers can cause PCRE to use its internal tables by passing
-a NULL table pointer.
-.sp
- PCRE_INFO_FIRSTBYTE
-.sp
-Return information about the first byte of any matched string, for a
-non-anchored pattern. The fourth argument should point to an \fBint\fP
-variable. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name is
-still recognized for backwards compatibility.)
-.P
-If there is a fixed first byte, for example, from a pattern such as
-(cat|cow|coyote), its value is returned. Otherwise, if either
-.sp
-(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
-starts with "^", or
-.sp
-(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
-(if it were set, the pattern would be anchored),
-.sp
--1 is returned, indicating that the pattern matches only at the start of a
-subject string or after any newline within the string. Otherwise -2 is
-returned. For anchored patterns, -2 is returned.
-.sp
- PCRE_INFO_FIRSTTABLE
-.sp
-If the pattern was studied, and this resulted in the construction of a 256-bit
-table indicating a fixed set of bytes for the first byte in any matching
-string, a pointer to the table is returned. Otherwise NULL is returned. The
-fourth argument should point to an \fBunsigned char *\fP variable.
-.sp
- PCRE_INFO_HASCRORLF
-.sp
-Return 1 if the pattern contains any explicit matches for CR or LF characters,
-otherwise 0. The fourth argument should point to an \fBint\fP variable. An
-explicit match is either a literal CR or LF character, or \er or \en.
-.sp
- PCRE_INFO_JCHANGED
-.sp
-Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
-0. The fourth argument should point to an \fBint\fP variable. (?J) and
-(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
-.sp
- PCRE_INFO_LASTLITERAL
-.sp
-Return the value of the rightmost literal byte that must exist in any matched
-string, other than at its start, if such a byte has been recorded. The fourth
-argument should point to an \fBint\fP variable. If there is no such byte, -1 is
-returned. For anchored patterns, a last literal byte is recorded only if it
-follows something of variable length. For example, for the pattern
-/^a\ed+z\ed+/ the returned value is "z", but for /^a\edz\ed/ the returned value
-is -1.
-.sp
- PCRE_INFO_NAMECOUNT
- PCRE_INFO_NAMEENTRYSIZE
- PCRE_INFO_NAMETABLE
-.sp
-PCRE supports the use of named as well as numbered capturing parentheses. The
-names are just an additional way of identifying the parentheses, which still
-acquire numbers. Several convenience functions such as
-\fBpcre_get_named_substring()\fP are provided for extracting captured
-substrings by name. It is also possible to extract the data directly, by first
-converting the name to a number in order to access the correct pointers in the
-output vector (described with \fBpcre_exec()\fP below). To do the conversion,
-you need to use the name-to-number map, which is described by these three
-values.
-.P
-The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT gives
-the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size of each
-entry; both of these return an \fBint\fP value. The entry size depends on the
-length of the longest name. PCRE_INFO_NAMETABLE returns a pointer to the first
-entry of the table (a pointer to \fBchar\fP). The first two bytes of each entry
-are the number of the capturing parenthesis, most significant byte first. The
-rest of the entry is the corresponding name, zero terminated. The names are in
-alphabetical order. When PCRE_DUPNAMES is set, duplicate names are in order of
-their parentheses numbers. For example, consider the following pattern (assume
-PCRE_EXTENDED is set, so white space - including newlines - is ignored):
-.sp
-.\" JOIN
- (? (?(\ed\ed)?\ed\ed) -
- (?\ed\ed) - (?\ed\ed) )
-.sp
-There are four named subpatterns, so the table has four entries, and each entry
-in the table is eight bytes long. The table is as follows, with non-printing
-bytes shows in hexadecimal, and undefined bytes shown as ??:
-.sp
- 00 01 d a t e 00 ??
- 00 05 d a y 00 ?? ??
- 00 04 m o n t h 00
- 00 02 y e a r 00 ??
-.sp
-When writing code to extract data from named subpatterns using the
-name-to-number map, remember that the length of the entries is likely to be
-different for each compiled pattern.
-.sp
- PCRE_INFO_OKPARTIAL
-.sp
-Return 1 if the pattern can be used for partial matching, otherwise 0. The
-fourth argument should point to an \fBint\fP variable. The
-.\" HREF
-\fBpcrepartial\fP
-.\"
-documentation lists the restrictions that apply to patterns when partial
-matching is used.
-.sp
- PCRE_INFO_OPTIONS
-.sp
-Return a copy of the options with which the pattern was compiled. The fourth
-argument should point to an \fBunsigned long int\fP variable. These option bits
-are those specified in the call to \fBpcre_compile()\fP, modified by any
-top-level option settings at the start of the pattern itself. In other words,
-they are the options that will be in force when matching starts. For example,
-if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
-result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
-.P
-A pattern is automatically anchored by PCRE if all of its top-level
-alternatives begin with one of the following:
-.sp
- ^ unless PCRE_MULTILINE is set
- \eA always
- \eG always
-.\" JOIN
- .* if PCRE_DOTALL is set and there are no back
- references to the subpattern in which .* appears
-.sp
-For such patterns, the PCRE_ANCHORED bit is set in the options returned by
-\fBpcre_fullinfo()\fP.
-.sp
- PCRE_INFO_SIZE
-.sp
-Return the size of the compiled pattern, that is, the value that was passed as
-the argument to \fBpcre_malloc()\fP when PCRE was getting memory in which to
-place the compiled data. The fourth argument should point to a \fBsize_t\fP
-variable.
-.sp
- PCRE_INFO_STUDYSIZE
-.sp
-Return the size of the data block pointed to by the \fIstudy_data\fP field in
-a \fBpcre_extra\fP block. That is, it is the value that was passed to
-\fBpcre_malloc()\fP when PCRE was getting memory into which to place the data
-created by \fBpcre_study()\fP. The fourth argument should point to a
-\fBsize_t\fP variable.
-.
-.
-.SH "OBSOLETE INFO FUNCTION"
-.rs
-.sp
-.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
-.B *\fIfirstcharptr\fP);
-.PP
-The \fBpcre_info()\fP function is now obsolete because its interface is too
-restrictive to return all the available data about a compiled pattern. New
-programs should use \fBpcre_fullinfo()\fP instead. The yield of
-\fBpcre_info()\fP is the number of capturing subpatterns, or one of the
-following negative numbers:
-.sp
- PCRE_ERROR_NULL the argument \fIcode\fP was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
-.sp
-If the \fIoptptr\fP argument is not NULL, a copy of the options with which the
-pattern was compiled is placed in the integer it points to (see
-PCRE_INFO_OPTIONS above).
-.P
-If the pattern is not anchored and the \fIfirstcharptr\fP argument is not NULL,
-it is used to pass back information about the first character of any matched
-string (see PCRE_INFO_FIRSTBYTE above).
-.
-.
-.SH "REFERENCE COUNTS"
-.rs
-.sp
-.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
-.PP
-The \fBpcre_refcount()\fP function is used to maintain a reference count in the
-data block that contains a compiled pattern. It is provided for the benefit of
-applications that operate in an object-oriented manner, where different parts
-of the application may be using the same compiled pattern, but you want to free
-the block when they are all done.
-.P
-When a pattern is compiled, the reference count field is initialized to zero.
-It is changed only by calling this function, whose action is to add the
-\fIadjust\fP value (which may be positive or negative) to it. The yield of the
-function is the new value. However, the value of the count is constrained to
-lie between 0 and 65535, inclusive. If the new value is outside these limits,
-it is forced to the appropriate limit value.
-.P
-Except when it is zero, the reference count is not correctly preserved if a
-pattern is compiled on one host and then transferred to a host whose byte-order
-is different. (This seems a highly unlikely scenario.)
-.
-.
-.SH "MATCHING A PATTERN: THE TRADITIONAL FUNCTION"
-.rs
-.sp
-.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
-.ti +5n
-.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
-.P
-The function \fBpcre_exec()\fP is called to match a subject string against a
-compiled pattern, which is passed in the \fIcode\fP argument. If the
-pattern has been studied, the result of the study should be passed in the
-\fIextra\fP argument. This function is the main matching facility of the
-library, and it operates in a Perl-like manner. For specialist use there is
-also an alternative matching function, which is described
-.\" HTML
-.\"
-below
-.\"
-in the section about the \fBpcre_dfa_exec()\fP function.
-.P
-In most applications, the pattern will have been compiled (and optionally
-studied) in the same process that calls \fBpcre_exec()\fP. However, it is
-possible to save compiled patterns and study data, and then use them later
-in different processes, possibly even on different hosts. For a discussion
-about this, see the
-.\" HREF
-\fBpcreprecompile\fP
-.\"
-documentation.
-.P
-Here is an example of a simple call to \fBpcre_exec()\fP:
-.sp
- int rc;
- int ovector[30];
- rc = pcre_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector of integers for substring information */
- 30); /* number of elements (NOT size in bytes) */
-.
-.\" HTML
-.SS "Extra data for \fBpcre_exec()\fR"
-.rs
-.sp
-If the \fIextra\fP argument is not NULL, it must point to a \fBpcre_extra\fP
-data block. The \fBpcre_study()\fP function returns such a block (when it
-doesn't return NULL), but you can also create one for yourself, and pass
-additional information in it. The \fBpcre_extra\fP block contains the following
-fields (not necessarily in this order):
-.sp
- unsigned long int \fIflags\fP;
- void *\fIstudy_data\fP;
- unsigned long int \fImatch_limit\fP;
- unsigned long int \fImatch_limit_recursion\fP;
- void *\fIcallout_data\fP;
- const unsigned char *\fItables\fP;
-.sp
-The \fIflags\fP field is a bitmap that specifies which of the other fields
-are set. The flag bits are:
-.sp
- PCRE_EXTRA_STUDY_DATA
- PCRE_EXTRA_MATCH_LIMIT
- PCRE_EXTRA_MATCH_LIMIT_RECURSION
- PCRE_EXTRA_CALLOUT_DATA
- PCRE_EXTRA_TABLES
-.sp
-Other flag bits should be set to zero. The \fIstudy_data\fP field is set in the
-\fBpcre_extra\fP block that is returned by \fBpcre_study()\fP, together with
-the appropriate flag bit. You should not set this yourself, but you may add to
-the block by setting the other fields and their corresponding flag bits.
-.P
-The \fImatch_limit\fP field provides a means of preventing PCRE from using up a
-vast amount of resources when running patterns that are not going to match,
-but which have a very large number of possibilities in their search trees. The
-classic example is the use of nested unlimited repeats.
-.P
-Internally, PCRE uses a function called \fBmatch()\fP which it calls repeatedly
-(sometimes recursively). The limit set by \fImatch_limit\fP is imposed on the
-number of times this function is called during a match, which has the effect of
-limiting the amount of backtracking that can take place. For patterns that are
-not anchored, the count restarts from zero for each position in the subject
-string.
-.P
-The default value for the limit can be set when PCRE is built; the default
-default is 10 million, which handles all but the most extreme cases. You can
-override the default by suppling \fBpcre_exec()\fP with a \fBpcre_extra\fP
-block in which \fImatch_limit\fP is set, and PCRE_EXTRA_MATCH_LIMIT is set in
-the \fIflags\fP field. If the limit is exceeded, \fBpcre_exec()\fP returns
-PCRE_ERROR_MATCHLIMIT.
-.P
-The \fImatch_limit_recursion\fP field is similar to \fImatch_limit\fP, but
-instead of limiting the total number of times that \fBmatch()\fP is called, it
-limits the depth of recursion. The recursion depth is a smaller number than the
-total number of calls, because not all calls to \fBmatch()\fP are recursive.
-This limit is of use only if it is set smaller than \fImatch_limit\fP.
-.P
-Limiting the recursion depth limits the amount of stack that can be used, or,
-when PCRE has been compiled to use memory on the heap instead of the stack, the
-amount of heap memory that can be used.
-.P
-The default value for \fImatch_limit_recursion\fP can be set when PCRE is
-built; the default default is the same value as the default for
-\fImatch_limit\fP. You can override the default by suppling \fBpcre_exec()\fP
-with a \fBpcre_extra\fP block in which \fImatch_limit_recursion\fP is set, and
-PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the \fIflags\fP field. If the limit
-is exceeded, \fBpcre_exec()\fP returns PCRE_ERROR_RECURSIONLIMIT.
-.P
-The \fIpcre_callout\fP field is used in conjunction with the "callout" feature,
-which is described in the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation.
-.P
-The \fItables\fP field is used to pass a character tables pointer to
-\fBpcre_exec()\fP; this overrides the value that is stored with the compiled
-pattern. A non-NULL value is stored with the compiled pattern only if custom
-tables were supplied to \fBpcre_compile()\fP via its \fItableptr\fP argument.
-If NULL is passed to \fBpcre_exec()\fP using this mechanism, it forces PCRE's
-internal tables to be used. This facility is helpful when re-using patterns
-that have been saved after compiling with an external set of tables, because
-the external tables might be at a different address when \fBpcre_exec()\fP is
-called. See the
-.\" HREF
-\fBpcreprecompile\fP
-.\"
-documentation for a discussion of saving compiled patterns for later use.
-.
-.\" HTML
-.SS "Option bits for \fBpcre_exec()\fP"
-.rs
-.sp
-The unused bits of the \fIoptions\fP argument for \fBpcre_exec()\fP must be
-zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP,
-PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE,
-PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
-.sp
- PCRE_ANCHORED
-.sp
-The PCRE_ANCHORED option limits \fBpcre_exec()\fP to matching at the first
-matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
-to be anchored by virtue of its contents, it cannot be made unachored at
-matching time.
-.sp
- PCRE_BSR_ANYCRLF
- PCRE_BSR_UNICODE
-.sp
-These options (which are mutually exclusive) control what the \eR escape
-sequence matches. The choice is either to match only CR, LF, or CRLF, or to
-match any Unicode newline sequence. These options override the choice that was
-made or defaulted when the pattern was compiled.
-.sp
- PCRE_NEWLINE_CR
- PCRE_NEWLINE_LF
- PCRE_NEWLINE_CRLF
- PCRE_NEWLINE_ANYCRLF
- PCRE_NEWLINE_ANY
-.sp
-These options override the newline definition that was chosen or defaulted when
-the pattern was compiled. For details, see the description of
-\fBpcre_compile()\fP above. During matching, the newline choice affects the
-behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
-the way the match position is advanced after a match failure for an unanchored
-pattern.
-.P
-When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
-match attempt for an unanchored pattern fails when the current position is at a
-CRLF sequence, and the pattern contains no explicit matches for CR or LF
-characters, the match position is advanced by two characters instead of one, in
-other words, to after the CRLF.
-.P
-The above rule is a compromise that makes the most common cases work as
-expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
-set), it does not match the string "\er\enA" because, after failing at the
-start, it skips both the CR and the LF before retrying. However, the pattern
-[\er\en]A does match that string, because it contains an explicit CR or LF
-reference, and so advances only by one character after the first failure.
-.P
-An explicit match for CR of LF is either a literal appearance of one of those
-characters, or one of the \er or \en escape sequences. Implicit matches such as
-[^X] do not count, nor does \es (which includes CR and LF in the characters
-that it matches).
-.P
-Notwithstanding the above, anomalous effects may still occur when CRLF is a
-valid newline sequence and explicit \er or \en escapes appear in the pattern.
-.sp
- PCRE_NOTBOL
-.sp
-This option specifies that first character of the subject string is not the
-beginning of a line, so the circumflex metacharacter should not match before
-it. Setting this without PCRE_MULTILINE (at compile time) causes circumflex
-never to match. This option affects only the behaviour of the circumflex
-metacharacter. It does not affect \eA.
-.sp
- PCRE_NOTEOL
-.sp
-This option specifies that the end of the subject string is not the end of a
-line, so the dollar metacharacter should not match it nor (except in multiline
-mode) a newline immediately before it. Setting this without PCRE_MULTILINE (at
-compile time) causes dollar never to match. This option affects only the
-behaviour of the dollar metacharacter. It does not affect \eZ or \ez.
-.sp
- PCRE_NOTEMPTY
-.sp
-An empty string is not considered to be a valid match if this option is set. If
-there are alternatives in the pattern, they are tried. If all the alternatives
-match the empty string, the entire match fails. For example, if the pattern
-.sp
- a?b?
-.sp
-is applied to a string not beginning with "a" or "b", it matches the empty
-string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
-valid, so PCRE searches further into the string for occurrences of "a" or "b".
-.P
-Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case
-of a pattern match of the empty string within its \fBsplit()\fP function, and
-when using the /g modifier. It is possible to emulate Perl's behaviour after
-matching a null string by first trying the match again at the same offset with
-PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the
-starting offset (see below) and trying an ordinary match again. There is some
-code that demonstrates how to do this in the \fIpcredemo.c\fP sample program.
-.sp
- PCRE_NO_START_OPTIMIZE
-.sp
-There are a number of optimizations that \fBpcre_exec()\fP uses at the start of
-a match, in order to speed up the process. For example, if it is known that a
-match must start with a specific character, it searches the subject for that
-character, and fails immediately if it cannot find it, without actually running
-the main matching function. When callouts are in use, these optimizations can
-cause them to be skipped. This option disables the "start-up" optimizations,
-causing performance to suffer, but ensuring that the callouts do occur.
-.sp
- PCRE_NO_UTF8_CHECK
-.sp
-When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
-string is automatically checked when \fBpcre_exec()\fP is subsequently called.
-The value of \fIstartoffset\fP is also checked to ensure that it points to the
-start of a UTF-8 character. There is a discussion about the validity of UTF-8
-strings in the
-.\" HTML
-.\"
-section on UTF-8 support
-.\"
-in the main
-.\" HREF
-\fBpcre\fP
-.\"
-page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_exec()\fP returns
-the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP contains an invalid value,
-PCRE_ERROR_BADUTF8_OFFSET is returned.
-.P
-If you already know that your subject is valid, and you want to skip these
-checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
-calling \fBpcre_exec()\fP. You might want to do this for the second and
-subsequent calls to \fBpcre_exec()\fP if you are making repeated calls to find
-all the matches in a single subject string. However, you should be sure that
-the value of \fIstartoffset\fP points to the start of a UTF-8 character. When
-PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8 string as a
-subject, or a value of \fIstartoffset\fP that does not point to the start of a
-UTF-8 character, is undefined. Your program may crash.
-.sp
- PCRE_PARTIAL
-.sp
-This option turns on the partial matching feature. If the subject string fails
-to match the pattern, but at some point during the matching process the end of
-the subject was reached (that is, the subject partially matches the pattern and
-the failure to match occurred only because there were not enough subject
-characters), \fBpcre_exec()\fP returns PCRE_ERROR_PARTIAL instead of
-PCRE_ERROR_NOMATCH. When PCRE_PARTIAL is used, there are restrictions on what
-may appear in the pattern. These are discussed in the
-.\" HREF
-\fBpcrepartial\fP
-.\"
-documentation.
-.
-.SS "The string to be matched by \fBpcre_exec()\fP"
-.rs
-.sp
-The subject string is passed to \fBpcre_exec()\fP as a pointer in
-\fIsubject\fP, a length (in bytes) in \fIlength\fP, and a starting byte offset
-in \fIstartoffset\fP. In UTF-8 mode, the byte offset must point to the start of
-a UTF-8 character. Unlike the pattern string, the subject may contain binary
-zero bytes. When the starting offset is zero, the search for a match starts at
-the beginning of the subject, and this is by far the most common case.
-.P
-A non-zero starting offset is useful when searching for another match in the
-same subject by calling \fBpcre_exec()\fP again after a previous success.
-Setting \fIstartoffset\fP differs from just passing over a shortened string and
-setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
-lookbehind. For example, consider the pattern
-.sp
- \eBiss\eB
-.sp
-which finds occurrences of "iss" in the middle of words. (\eB matches only if
-the current position in the subject is not a word boundary.) When applied to
-the string "Mississipi" the first call to \fBpcre_exec()\fP finds the first
-occurrence. If \fBpcre_exec()\fP is called again with just the remainder of the
-subject, namely "issipi", it does not match, because \eB is always false at the
-start of the subject, which is deemed to be a word boundary. However, if
-\fBpcre_exec()\fP is passed the entire string again, but with \fIstartoffset\fP
-set to 4, it finds the second occurrence of "iss" because it is able to look
-behind the starting point to discover that it is preceded by a letter.
-.P
-If a non-zero starting offset is passed when the pattern is anchored, one
-attempt to match at the given offset is made. This can only succeed if the
-pattern does not require the match to be at the start of the subject.
-.
-.SS "How \fBpcre_exec()\fP returns captured substrings"
-.rs
-.sp
-In general, a pattern matches a certain portion of the subject, and in
-addition, further substrings from the subject may be picked out by parts of the
-pattern. Following the usage in Jeffrey Friedl's book, this is called
-"capturing" in what follows, and the phrase "capturing subpattern" is used for
-a fragment of a pattern that picks out a substring. PCRE supports several other
-kinds of parenthesized subpattern that do not cause substrings to be captured.
-.P
-Captured substrings are returned to the caller via a vector of integers whose
-address is passed in \fIovector\fP. The number of elements in the vector is
-passed in \fIovecsize\fP, which must be a non-negative number. \fBNote\fP: this
-argument is NOT the size of \fIovector\fP in bytes.
-.P
-The first two-thirds of the vector is used to pass back captured substrings,
-each substring using a pair of integers. The remaining third of the vector is
-used as workspace by \fBpcre_exec()\fP while matching capturing subpatterns,
-and is not available for passing back information. The number passed in
-\fIovecsize\fP should always be a multiple of three. If it is not, it is
-rounded down.
-.P
-When a match is successful, information about captured substrings is returned
-in pairs of integers, starting at the beginning of \fIovector\fP, and
-continuing up to two-thirds of its length at the most. The first element of
-each pair is set to the byte offset of the first character in a substring, and
-the second is set to the byte offset of the first character after the end of a
-substring. \fBNote\fP: these values are always byte offsets, even in UTF-8
-mode. They are not character counts.
-.P
-The first pair of integers, \fIovector[0]\fP and \fIovector[1]\fP, identify the
-portion of the subject string matched by the entire pattern. The next pair is
-used for the first capturing subpattern, and so on. The value returned by
-\fBpcre_exec()\fP is one more than the highest numbered pair that has been set.
-For example, if two substrings have been captured, the returned value is 3. If
-there are no capturing subpatterns, the return value from a successful match is
-1, indicating that just the first pair of offsets has been set.
-.P
-If a capturing subpattern is matched repeatedly, it is the last portion of the
-string that it matched that is returned.
-.P
-If the vector is too small to hold all the captured substring offsets, it is
-used as far as possible (up to two-thirds of its length), and the function
-returns a value of zero. If the substring offsets are not of interest,
-\fBpcre_exec()\fP may be called with \fIovector\fP passed as NULL and
-\fIovecsize\fP as zero. However, if the pattern contains back references and
-the \fIovector\fP is not big enough to remember the related substrings, PCRE
-has to get additional memory for use during matching. Thus it is usually
-advisable to supply an \fIovector\fP.
-.P
-The \fBpcre_info()\fP function can be used to find out how many capturing
-subpatterns there are in a compiled pattern. The smallest size for
-\fIovector\fP that will allow for \fIn\fP captured substrings, in addition to
-the offsets of the substring matched by the whole pattern, is (\fIn\fP+1)*3.
-.P
-It is possible for capturing subpattern number \fIn+1\fP to match some part of
-the subject when subpattern \fIn\fP has not been used at all. For example, if
-the string "abc" is matched against the pattern (a|(z))(bc) the return from the
-function is 4, and subpatterns 1 and 3 are matched, but 2 is not. When this
-happens, both values in the offset pairs corresponding to unused subpatterns
-are set to -1.
-.P
-Offset values that correspond to unused subpatterns at the end of the
-expression are also set to -1. For example, if the string "abc" is matched
-against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched. The
-return from the function is 2, because the highest used capturing subpattern
-number is 1. However, you can refer to the offsets for the second and third
-capturing subpatterns if you wish (assuming the vector is large enough, of
-course).
-.P
-Some convenience functions are provided for extracting the captured substrings
-as separate strings. These are described below.
-.
-.\" HTML
-.SS "Error return values from \fBpcre_exec()\fP"
-.rs
-.sp
-If \fBpcre_exec()\fP fails, it returns a negative number. The following are
-defined in the header file:
-.sp
- PCRE_ERROR_NOMATCH (-1)
-.sp
-The subject string did not match the pattern.
-.sp
- PCRE_ERROR_NULL (-2)
-.sp
-Either \fIcode\fP or \fIsubject\fP was passed as NULL, or \fIovector\fP was
-NULL and \fIovecsize\fP was not zero.
-.sp
- PCRE_ERROR_BADOPTION (-3)
-.sp
-An unrecognized bit was set in the \fIoptions\fP argument.
-.sp
- PCRE_ERROR_BADMAGIC (-4)
-.sp
-PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
-the case when it is passed a junk pointer and to detect when a pattern that was
-compiled in an environment of one endianness is run in an environment with the
-other endianness. This is the error that PCRE gives when the magic number is
-not present.
-.sp
- PCRE_ERROR_UNKNOWN_OPCODE (-5)
-.sp
-While running the pattern match, an unknown item was encountered in the
-compiled pattern. This error could be caused by a bug in PCRE or by overwriting
-of the compiled pattern.
-.sp
- PCRE_ERROR_NOMEMORY (-6)
-.sp
-If a pattern contains back references, but the \fIovector\fP that is passed to
-\fBpcre_exec()\fP is not big enough to remember the referenced substrings, PCRE
-gets a block of memory at the start of matching to use for this purpose. If the
-call via \fBpcre_malloc()\fP fails, this error is given. The memory is
-automatically freed at the end of matching.
-.sp
- PCRE_ERROR_NOSUBSTRING (-7)
-.sp
-This error is used by the \fBpcre_copy_substring()\fP,
-\fBpcre_get_substring()\fP, and \fBpcre_get_substring_list()\fP functions (see
-below). It is never returned by \fBpcre_exec()\fP.
-.sp
- PCRE_ERROR_MATCHLIMIT (-8)
-.sp
-The backtracking limit, as specified by the \fImatch_limit\fP field in a
-\fBpcre_extra\fP structure (or defaulted) was reached. See the description
-above.
-.sp
- PCRE_ERROR_CALLOUT (-9)
-.sp
-This error is never generated by \fBpcre_exec()\fP itself. It is provided for
-use by callout functions that want to yield a distinctive error code. See the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation for details.
-.sp
- PCRE_ERROR_BADUTF8 (-10)
-.sp
-A string that contains an invalid UTF-8 byte sequence was passed as a subject.
-.sp
- PCRE_ERROR_BADUTF8_OFFSET (-11)
-.sp
-The UTF-8 byte sequence that was passed as a subject was valid, but the value
-of \fIstartoffset\fP did not point to the beginning of a UTF-8 character.
-.sp
- PCRE_ERROR_PARTIAL (-12)
-.sp
-The subject string did not match, but it did match partially. See the
-.\" HREF
-\fBpcrepartial\fP
-.\"
-documentation for details of partial matching.
-.sp
- PCRE_ERROR_BADPARTIAL (-13)
-.sp
-The PCRE_PARTIAL option was used with a compiled pattern containing items that
-are not supported for partial matching. See the
-.\" HREF
-\fBpcrepartial\fP
-.\"
-documentation for details of partial matching.
-.sp
- PCRE_ERROR_INTERNAL (-14)
-.sp
-An unexpected internal error has occurred. This error could be caused by a bug
-in PCRE or by overwriting of the compiled pattern.
-.sp
- PCRE_ERROR_BADCOUNT (-15)
-.sp
-This error is given if the value of the \fIovecsize\fP argument is negative.
-.sp
- PCRE_ERROR_RECURSIONLIMIT (-21)
-.sp
-The internal recursion limit, as specified by the \fImatch_limit_recursion\fP
-field in a \fBpcre_extra\fP structure (or defaulted) was reached. See the
-description above.
-.sp
- PCRE_ERROR_BADNEWLINE (-23)
-.sp
-An invalid combination of PCRE_NEWLINE_\fIxxx\fP options was given.
-.P
-Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
-.
-.
-.SH "EXTRACTING CAPTURED SUBSTRINGS BY NUMBER"
-.rs
-.sp
-.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
-.ti +5n
-.B int \fIbuffersize\fP);
-.PP
-.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, int \fIstringnumber\fP,
-.ti +5n
-.B const char **\fIstringptr\fP);
-.PP
-.B int pcre_get_substring_list(const char *\fIsubject\fP,
-.ti +5n
-.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
-.PP
-Captured substrings can be accessed directly by using the offsets returned by
-\fBpcre_exec()\fP in \fIovector\fP. For convenience, the functions
-\fBpcre_copy_substring()\fP, \fBpcre_get_substring()\fP, and
-\fBpcre_get_substring_list()\fP are provided for extracting captured substrings
-as new, separate, zero-terminated strings. These functions identify substrings
-by number. The next section describes functions for extracting named
-substrings.
-.P
-A substring that contains a binary zero is correctly extracted and has a
-further zero added on the end, but the result is not, of course, a C string.
-However, you can process such a string by referring to the length that is
-returned by \fBpcre_copy_substring()\fP and \fBpcre_get_substring()\fP.
-Unfortunately, the interface to \fBpcre_get_substring_list()\fP is not adequate
-for handling strings containing binary zeros, because the end of the final
-string is not independently indicated.
-.P
-The first three arguments are the same for all three of these functions:
-\fIsubject\fP is the subject string that has just been successfully matched,
-\fIovector\fP is a pointer to the vector of integer offsets that was passed to
-\fBpcre_exec()\fP, and \fIstringcount\fP is the number of substrings that were
-captured by the match, including the substring that matched the entire regular
-expression. This is the value returned by \fBpcre_exec()\fP if it is greater
-than zero. If \fBpcre_exec()\fP returned zero, indicating that it ran out of
-space in \fIovector\fP, the value passed as \fIstringcount\fP should be the
-number of elements in the vector divided by three.
-.P
-The functions \fBpcre_copy_substring()\fP and \fBpcre_get_substring()\fP
-extract a single substring, whose number is given as \fIstringnumber\fP. A
-value of zero extracts the substring that matched the entire pattern, whereas
-higher values extract the captured substrings. For \fBpcre_copy_substring()\fP,
-the string is placed in \fIbuffer\fP, whose length is given by
-\fIbuffersize\fP, while for \fBpcre_get_substring()\fP a new block of memory is
-obtained via \fBpcre_malloc\fP, and its address is returned via
-\fIstringptr\fP. The yield of the function is the length of the string, not
-including the terminating zero, or one of these error codes:
-.sp
- PCRE_ERROR_NOMEMORY (-6)
-.sp
-The buffer was too small for \fBpcre_copy_substring()\fP, or the attempt to get
-memory failed for \fBpcre_get_substring()\fP.
-.sp
- PCRE_ERROR_NOSUBSTRING (-7)
-.sp
-There is no substring whose number is \fIstringnumber\fP.
-.P
-The \fBpcre_get_substring_list()\fP function extracts all available substrings
-and builds a list of pointers to them. All this is done in a single block of
-memory that is obtained via \fBpcre_malloc\fP. The address of the memory block
-is returned via \fIlistptr\fP, which is also the start of the list of string
-pointers. The end of the list is marked by a NULL pointer. The yield of the
-function is zero if all went well, or the error code
-.sp
- PCRE_ERROR_NOMEMORY (-6)
-.sp
-if the attempt to get the memory block failed.
-.P
-When any of these functions encounter a substring that is unset, which can
-happen when capturing subpattern number \fIn+1\fP matches some part of the
-subject, but subpattern \fIn\fP has not been used at all, they return an empty
-string. This can be distinguished from a genuine zero-length substring by
-inspecting the appropriate offset in \fIovector\fP, which is negative for unset
-substrings.
-.P
-The two convenience functions \fBpcre_free_substring()\fP and
-\fBpcre_free_substring_list()\fP can be used to free the memory returned by
-a previous call of \fBpcre_get_substring()\fP or
-\fBpcre_get_substring_list()\fP, respectively. They do nothing more than call
-the function pointed to by \fBpcre_free\fP, which of course could be called
-directly from a C program. However, PCRE is used in some situations where it is
-linked via a special interface to another programming language that cannot use
-\fBpcre_free\fP directly; it is for these cases that the functions are
-provided.
-.
-.
-.SH "EXTRACTING CAPTURED SUBSTRINGS BY NAME"
-.rs
-.sp
-.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIname\fP);
-.PP
-.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, const char *\fIstringname\fP,
-.ti +5n
-.B char *\fIbuffer\fP, int \fIbuffersize\fP);
-.PP
-.B int pcre_get_named_substring(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIsubject\fP, int *\fIovector\fP,
-.ti +5n
-.B int \fIstringcount\fP, const char *\fIstringname\fP,
-.ti +5n
-.B const char **\fIstringptr\fP);
-.PP
-To extract a substring by name, you first have to find associated number.
-For example, for this pattern
-.sp
- (a+)b(?\ed+)...
-.sp
-the number of the subpattern called "xxx" is 2. If the name is known to be
-unique (PCRE_DUPNAMES was not set), you can find the number from the name by
-calling \fBpcre_get_stringnumber()\fP. The first argument is the compiled
-pattern, and the second is the name. The yield of the function is the
-subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no subpattern of
-that name.
-.P
-Given the number, you can extract the substring directly, or use one of the
-functions described in the previous section. For convenience, there are also
-two functions that do the whole job.
-.P
-Most of the arguments of \fBpcre_copy_named_substring()\fP and
-\fBpcre_get_named_substring()\fP are the same as those for the similarly named
-functions that extract by number. As these are described in the previous
-section, they are not re-described here. There are just two differences:
-.P
-First, instead of a substring number, a substring name is given. Second, there
-is an extra argument, given at the start, which is a pointer to the compiled
-pattern. This is needed in order to gain access to the name-to-number
-translation table.
-.P
-These functions call \fBpcre_get_stringnumber()\fP, and if it succeeds, they
-then call \fBpcre_copy_substring()\fP or \fBpcre_get_substring()\fP, as
-appropriate. \fBNOTE:\fP If PCRE_DUPNAMES is set and there are duplicate names,
-the behaviour may not be what you want (see the next section).
-.P
-\fBWarning:\fP If the pattern uses the "(?|" feature to set up multiple
-subpatterns with the same number, you cannot use names to distinguish them,
-because names are not included in the compiled code. The matching process uses
-only numbers.
-.
-.SH "DUPLICATE SUBPATTERN NAMES"
-.rs
-.sp
-.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
-.ti +5n
-.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
-.PP
-When a pattern is compiled with the PCRE_DUPNAMES option, names for subpatterns
-are not required to be unique. Normally, patterns with duplicate names are such
-that in any one match, only one of the named subpatterns participates. An
-example is shown in the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation.
-.P
-When duplicates are present, \fBpcre_copy_named_substring()\fP and
-\fBpcre_get_named_substring()\fP return the first substring corresponding to
-the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
-returned; no data is returned. The \fBpcre_get_stringnumber()\fP function
-returns one of the numbers that are associated with the name, but it is not
-defined which it is.
-.P
-If you want to get full details of all captured substrings for a given name,
-you must use the \fBpcre_get_stringtable_entries()\fP function. The first
-argument is the compiled pattern, and the second is the name. The third and
-fourth are pointers to variables which are updated by the function. After it
-has run, they point to the first and last entries in the name-to-number table
-for the given name. The function itself returns the length of each entry, or
-PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
-described above in the section entitled \fIInformation about a pattern\fP.
-Given all the relevant entries for the name, you can extract each of their
-numbers, and hence the captured data, if any.
-.
-.
-.SH "FINDING ALL POSSIBLE MATCHES"
-.rs
-.sp
-The traditional matching function uses a similar algorithm to Perl, which stops
-when it finds the first match, starting at a given point in the subject. If you
-want to find all possible matches, or the longest possible match, consider
-using the alternative matching function (see below) instead. If you cannot use
-the alternative function, but still need to find all possible matches, you
-can kludge it up by making use of the callout facility, which is described in
-the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation.
-.P
-What you have to do is to insert a callout right at the end of the pattern.
-When your callout function is called, extract and save the current matched
-substring. Then return 1, which forces \fBpcre_exec()\fP to backtrack and try
-other alternatives. Ultimately, when it runs out of matches, \fBpcre_exec()\fP
-will yield PCRE_ERROR_NOMATCH.
-.
-.
-.\" HTML
-.SH "MATCHING A PATTERN: THE ALTERNATIVE FUNCTION"
-.rs
-.sp
-.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
-.ti +5n
-.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
-.ti +5n
-.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,
-.ti +5n
-.B int *\fIworkspace\fP, int \fIwscount\fP);
-.P
-The function \fBpcre_dfa_exec()\fP is called to match a subject string against
-a compiled pattern, using a matching algorithm that scans the subject string
-just once, and does not backtrack. This has different characteristics to the
-normal algorithm, and is not compatible with Perl. Some of the features of PCRE
-patterns are not supported. Nevertheless, there are times when this kind of
-matching can be useful. For a discussion of the two matching algorithms, see
-the
-.\" HREF
-\fBpcrematching\fP
-.\"
-documentation.
-.P
-The arguments for the \fBpcre_dfa_exec()\fP function are the same as for
-\fBpcre_exec()\fP, plus two extras. The \fIovector\fP argument is used in a
-different way, and this is described below. The other common arguments are used
-in the same way as for \fBpcre_exec()\fP, so their description is not repeated
-here.
-.P
-The two additional arguments provide workspace for the function. The workspace
-vector should contain at least 20 elements. It is used for keeping track of
-multiple paths through the pattern tree. More workspace will be needed for
-patterns and subjects where there are a lot of potential matches.
-.P
-Here is an example of a simple call to \fBpcre_dfa_exec()\fP:
-.sp
- int rc;
- int ovector[10];
- int wspace[20];
- rc = pcre_dfa_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector of integers for substring information */
- 10, /* number of elements (NOT size in bytes) */
- wspace, /* working space vector */
- 20); /* number of elements (NOT size in bytes) */
-.
-.SS "Option bits for \fBpcre_dfa_exec()\fP"
-.rs
-.sp
-The unused bits of the \fIoptions\fP argument for \fBpcre_dfa_exec()\fP must be
-zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP,
-PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL,
-PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last three of these are
-the same as for \fBpcre_exec()\fP, so their description is not repeated here.
-.sp
- PCRE_PARTIAL
-.sp
-This has the same general effect as it does for \fBpcre_exec()\fP, but the
-details are slightly different. When PCRE_PARTIAL is set for
-\fBpcre_dfa_exec()\fP, the return code PCRE_ERROR_NOMATCH is converted into
-PCRE_ERROR_PARTIAL if the end of the subject is reached, there have been no
-complete matches, but there is still at least one matching possibility. The
-portion of the string that provided the partial match is set as the first
-matching string.
-.sp
- PCRE_DFA_SHORTEST
-.sp
-Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
-soon as it has found one match. Because of the way the alternative algorithm
-works, this is necessarily the shortest possible match at the first possible
-matching point in the subject string.
-.sp
- PCRE_DFA_RESTART
-.sp
-When \fBpcre_dfa_exec()\fP is called with the PCRE_PARTIAL option, and returns
-a partial match, it is possible to call it again, with additional subject
-characters, and have it continue with the same match. The PCRE_DFA_RESTART
-option requests this action; when it is set, the \fIworkspace\fP and
-\fIwscount\fP options must reference the same vector as before because data
-about the match so far is left in them after a partial match. There is more
-discussion of this facility in the
-.\" HREF
-\fBpcrepartial\fP
-.\"
-documentation.
-.
-.SS "Successful returns from \fBpcre_dfa_exec()\fP"
-.rs
-.sp
-When \fBpcre_dfa_exec()\fP succeeds, it may have matched more than one
-substring in the subject. Note, however, that all the matches from one run of
-the function start at the same point in the subject. The shorter matches are
-all initial substrings of the longer matches. For example, if the pattern
-.sp
- <.*>
-.sp
-is matched against the string
-.sp
- This is no more
-.sp
-the three matched strings are
-.sp
-
-
-
-.sp
-On success, the yield of the function is a number greater than zero, which is
-the number of matched substrings. The substrings themselves are returned in
-\fIovector\fP. Each string uses two elements; the first is the offset to the
-start, and the second is the offset to the end. In fact, all the strings have
-the same start offset. (Space could have been saved by giving this only once,
-but it was decided to retain some compatibility with the way \fBpcre_exec()\fP
-returns data, even though the meaning of the strings is different.)
-.P
-The strings are returned in reverse order of length; that is, the longest
-matching string is given first. If there were too many matches to fit into
-\fIovector\fP, the yield of the function is zero, and the vector is filled with
-the longest matches.
-.
-.SS "Error returns from \fBpcre_dfa_exec()\fP"
-.rs
-.sp
-The \fBpcre_dfa_exec()\fP function returns a negative number when it fails.
-Many of the errors are the same as for \fBpcre_exec()\fP, and these are
-described
-.\" HTML
-.\"
-above.
-.\"
-There are in addition the following errors that are specific to
-\fBpcre_dfa_exec()\fP:
-.sp
- PCRE_ERROR_DFA_UITEM (-16)
-.sp
-This return is given if \fBpcre_dfa_exec()\fP encounters an item in the pattern
-that it does not support, for instance, the use of \eC or a back reference.
-.sp
- PCRE_ERROR_DFA_UCOND (-17)
-.sp
-This return is given if \fBpcre_dfa_exec()\fP encounters a condition item that
-uses a back reference for the condition, or a test for recursion in a specific
-group. These are not supported.
-.sp
- PCRE_ERROR_DFA_UMLIMIT (-18)
-.sp
-This return is given if \fBpcre_dfa_exec()\fP is called with an \fIextra\fP
-block that contains a setting of the \fImatch_limit\fP field. This is not
-supported (it is meaningless).
-.sp
- PCRE_ERROR_DFA_WSSIZE (-19)
-.sp
-This return is given if \fBpcre_dfa_exec()\fP runs out of space in the
-\fIworkspace\fP vector.
-.sp
- PCRE_ERROR_DFA_RECURSE (-20)
-.sp
-When a recursive subpattern is processed, the matching function calls itself
-recursively, using private vectors for \fIovector\fP and \fIworkspace\fP. This
-error is given if the output vector is not large enough. This should be
-extremely rare, as a vector of size 1000 is used.
-.
-.
-.SH "SEE ALSO"
-.rs
-.sp
-\fBpcrebuild\fP(3), \fBpcrecallout\fP(3), \fBpcrecpp(3)\fP(3),
-\fBpcrematching\fP(3), \fBpcrepartial\fP(3), \fBpcreposix\fP(3),
-\fBpcreprecompile\fP(3), \fBpcresample\fP(3), \fBpcrestack\fP(3).
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 11 April 2009
-Copyright (c) 1997-2009 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcrebuild.3 b/libs/pcre/doc/pcrebuild.3
deleted file mode 100644
index 926a4af10c..0000000000
--- a/libs/pcre/doc/pcrebuild.3
+++ /dev/null
@@ -1,334 +0,0 @@
-.TH PCREBUILD 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE BUILD-TIME OPTIONS"
-.rs
-.sp
-This document describes the optional features of PCRE that can be selected when
-the library is compiled. It assumes use of the \fBconfigure\fP script, where
-the optional features are selected or deselected by providing options to
-\fBconfigure\fP before running the \fBmake\fP command. However, the same
-options can be selected in both Unix-like and non-Unix-like environments using
-the GUI facility of \fBCMakeSetup\fP if you are using \fBCMake\fP instead of
-\fBconfigure\fP to build PCRE.
-.P
-The complete list of options for \fBconfigure\fP (which includes the standard
-ones such as the selection of the installation directory) can be obtained by
-running
-.sp
- ./configure --help
-.sp
-The following sections include descriptions of options whose names begin with
---enable or --disable. These settings specify changes to the defaults for the
-\fBconfigure\fP command. Because of the way that \fBconfigure\fP works,
---enable and --disable always come in pairs, so the complementary option always
-exists as well, but as it specifies the default, it is not described.
-.
-.SH "C++ SUPPORT"
-.rs
-.sp
-By default, the \fBconfigure\fP script will search for a C++ compiler and C++
-header files. If it finds them, it automatically builds the C++ wrapper library
-for PCRE. You can disable this by adding
-.sp
- --disable-cpp
-.sp
-to the \fBconfigure\fP command.
-.
-.SH "UTF-8 SUPPORT"
-.rs
-.sp
-To build PCRE with support for UTF-8 Unicode character strings, add
-.sp
- --enable-utf8
-.sp
-to the \fBconfigure\fP command. Of itself, this does not make PCRE treat
-strings as UTF-8. As well as compiling PCRE with this option, you also have
-have to set the PCRE_UTF8 option when you call the \fBpcre_compile()\fP
-function.
-.P
-If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE expects
-its input to be either ASCII or UTF-8 (depending on the runtime option). It is
-not possible to support both EBCDIC and UTF-8 codes in the same version of the
-library. Consequently, --enable-utf8 and --enable-ebcdic are mutually
-exclusive.
-.
-.SH "UNICODE CHARACTER PROPERTY SUPPORT"
-.rs
-.sp
-UTF-8 support allows PCRE to process character values greater than 255 in the
-strings that it handles. On its own, however, it does not provide any
-facilities for accessing the properties of such characters. If you want to be
-able to use the pattern escapes \eP, \ep, and \eX, which refer to Unicode
-character properties, you must add
-.sp
- --enable-unicode-properties
-.sp
-to the \fBconfigure\fP command. This implies UTF-8 support, even if you have
-not explicitly requested it.
-.P
-Including Unicode property support adds around 30K of tables to the PCRE
-library. Only the general category properties such as \fILu\fP and \fINd\fP are
-supported. Details are given in the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation.
-.
-.SH "CODE VALUE OF NEWLINE"
-.rs
-.sp
-By default, PCRE interprets the linefeed (LF) character as indicating the end
-of a line. This is the normal newline character on Unix-like systems. You can
-compile PCRE to use carriage return (CR) instead, by adding
-.sp
- --enable-newline-is-cr
-.sp
-to the \fBconfigure\fP command. There is also a --enable-newline-is-lf option,
-which explicitly specifies linefeed as the newline character.
-.sp
-Alternatively, you can specify that line endings are to be indicated by the two
-character sequence CRLF. If you want this, add
-.sp
- --enable-newline-is-crlf
-.sp
-to the \fBconfigure\fP command. There is a fourth option, specified by
-.sp
- --enable-newline-is-anycrlf
-.sp
-which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
-indicating a line ending. Finally, a fifth option, specified by
-.sp
- --enable-newline-is-any
-.sp
-causes PCRE to recognize any Unicode newline sequence.
-.P
-Whatever line ending convention is selected when PCRE is built can be
-overridden when the library functions are called. At build time it is
-conventional to use the standard for your operating system.
-.
-.SH "WHAT \eR MATCHES"
-.rs
-.sp
-By default, the sequence \eR in a pattern matches any Unicode newline sequence,
-whatever has been selected as the line ending sequence. If you specify
-.sp
- --enable-bsr-anycrlf
-.sp
-the default is changed so that \eR matches only CR, LF, or CRLF. Whatever is
-selected when PCRE is built can be overridden when the library functions are
-called.
-.
-.SH "BUILDING SHARED AND STATIC LIBRARIES"
-.rs
-.sp
-The PCRE building process uses \fBlibtool\fP to build both shared and static
-Unix libraries by default. You can suppress one of these by adding one of
-.sp
- --disable-shared
- --disable-static
-.sp
-to the \fBconfigure\fP command, as required.
-.
-.SH "POSIX MALLOC USAGE"
-.rs
-.sp
-When PCRE is called through the POSIX interface (see the
-.\" HREF
-\fBpcreposix\fP
-.\"
-documentation), additional working storage is required for holding the pointers
-to capturing substrings, because PCRE requires three integers per substring,
-whereas the POSIX interface provides only two. If the number of expected
-substrings is small, the wrapper function uses space on the stack, because this
-is faster than using \fBmalloc()\fP for each call. The default threshold above
-which the stack is no longer used is 10; it can be changed by adding a setting
-such as
-.sp
- --with-posix-malloc-threshold=20
-.sp
-to the \fBconfigure\fP command.
-.
-.SH "HANDLING VERY LARGE PATTERNS"
-.rs
-.sp
-Within a compiled pattern, offset values are used to point from one part to
-another (for example, from an opening parenthesis to an alternation
-metacharacter). By default, two-byte values are used for these offsets, leading
-to a maximum size for a compiled pattern of around 64K. This is sufficient to
-handle all but the most gigantic patterns. Nevertheless, some people do want to
-process enormous patterns, so it is possible to compile PCRE to use three-byte
-or four-byte offsets by adding a setting such as
-.sp
- --with-link-size=3
-.sp
-to the \fBconfigure\fP command. The value given must be 2, 3, or 4. Using
-longer offsets slows down the operation of PCRE because it has to load
-additional bytes when handling them.
-.
-.SH "AVOIDING EXCESSIVE STACK USAGE"
-.rs
-.sp
-When matching with the \fBpcre_exec()\fP function, PCRE implements backtracking
-by making recursive calls to an internal function called \fBmatch()\fP. In
-environments where the size of the stack is limited, this can severely limit
-PCRE's operation. (The Unix environment does not usually suffer from this
-problem, but it may sometimes be necessary to increase the maximum stack size.
-There is a discussion in the
-.\" HREF
-\fBpcrestack\fP
-.\"
-documentation.) An alternative approach to recursion that uses memory from the
-heap to remember data, instead of using recursive function calls, has been
-implemented to work round the problem of limited stack size. If you want to
-build a version of PCRE that works this way, add
-.sp
- --disable-stack-for-recursion
-.sp
-to the \fBconfigure\fP command. With this configuration, PCRE will use the
-\fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP variables to call memory
-management functions. By default these point to \fBmalloc()\fP and
-\fBfree()\fP, but you can replace the pointers so that your own functions are
-used.
-.P
-Separate functions are provided rather than using \fBpcre_malloc\fP and
-\fBpcre_free\fP because the usage is very predictable: the block sizes
-requested are always the same, and the blocks are always freed in reverse
-order. A calling program might be able to implement optimized functions that
-perform better than \fBmalloc()\fP and \fBfree()\fP. PCRE runs noticeably more
-slowly when built in this way. This option affects only the \fBpcre_exec()\fP
-function; it is not relevant for the the \fBpcre_dfa_exec()\fP function.
-.
-.SH "LIMITING PCRE RESOURCE USAGE"
-.rs
-.sp
-Internally, PCRE has a function called \fBmatch()\fP, which it calls repeatedly
-(sometimes recursively) when matching a pattern with the \fBpcre_exec()\fP
-function. By controlling the maximum number of times this function may be
-called during a single matching operation, a limit can be placed on the
-resources used by a single call to \fBpcre_exec()\fP. The limit can be changed
-at run time, as described in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation. The default is 10 million, but this can be changed by adding a
-setting such as
-.sp
- --with-match-limit=500000
-.sp
-to the \fBconfigure\fP command. This setting has no effect on the
-\fBpcre_dfa_exec()\fP matching function.
-.P
-In some environments it is desirable to limit the depth of recursive calls of
-\fBmatch()\fP more strictly than the total number of calls, in order to
-restrict the maximum amount of stack (or heap, if --disable-stack-for-recursion
-is specified) that is used. A second limit controls this; it defaults to the
-value that is set for --with-match-limit, which imposes no additional
-constraints. However, you can set a lower limit by adding, for example,
-.sp
- --with-match-limit-recursion=10000
-.sp
-to the \fBconfigure\fP command. This value can also be overridden at run time.
-.
-.SH "CREATING CHARACTER TABLES AT BUILD TIME"
-.rs
-.sp
-PCRE uses fixed tables for processing characters whose code values are less
-than 256. By default, PCRE is built with a set of tables that are distributed
-in the file \fIpcre_chartables.c.dist\fP. These tables are for ASCII codes
-only. If you add
-.sp
- --enable-rebuild-chartables
-.sp
-to the \fBconfigure\fP command, the distributed tables are no longer used.
-Instead, a program called \fBdftables\fP is compiled and run. This outputs the
-source for new set of tables, created in the default locale of your C runtime
-system. (This method of replacing the tables does not work if you are cross
-compiling, because \fBdftables\fP is run on the local host. If you need to
-create alternative tables when cross compiling, you will have to do so "by
-hand".)
-.
-.SH "USING EBCDIC CODE"
-.rs
-.sp
-PCRE assumes by default that it will run in an environment where the character
-code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
-most computer operating systems. PCRE can, however, be compiled to run in an
-EBCDIC environment by adding
-.sp
- --enable-ebcdic
-.sp
-to the \fBconfigure\fP command. This setting implies
---enable-rebuild-chartables. You should only use it if you know that you are in
-an EBCDIC environment (for example, an IBM mainframe operating system). The
---enable-ebcdic option is incompatible with --enable-utf8.
-.
-.SH "PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT"
-.rs
-.sp
-By default, \fBpcregrep\fP reads all files as plain text. You can build it so
-that it recognizes files whose names end in \fB.gz\fP or \fB.bz2\fP, and reads
-them with \fBlibz\fP or \fBlibbz2\fP, respectively, by adding one or both of
-.sp
- --enable-pcregrep-libz
- --enable-pcregrep-libbz2
-.sp
-to the \fBconfigure\fP command. These options naturally require that the
-relevant libraries are installed on your system. Configuration will fail if
-they are not.
-.
-.SH "PCRETEST OPTION FOR LIBREADLINE SUPPORT"
-.rs
-.sp
-If you add
-.sp
- --enable-pcretest-libreadline
-.sp
-to the \fBconfigure\fP command, \fBpcretest\fP is linked with the
-\fBlibreadline\fP library, and when its input is from a terminal, it reads it
-using the \fBreadline()\fP function. This provides line-editing and history
-facilities. Note that \fBlibreadline\fP is GPL-licenced, so if you distribute a
-binary of \fBpcretest\fP linked in this way, there may be licensing issues.
-.P
-Setting this option causes the \fB-lreadline\fP option to be added to the
-\fBpcretest\fP build. In many operating environments with a sytem-installed
-\fBlibreadline\fP this is sufficient. However, in some environments (e.g.
-if an unmodified distribution version of readline is in use), some extra
-configuration may be necessary. The INSTALL file for \fBlibreadline\fP says
-this:
-.sp
- "Readline uses the termcap functions, but does not link with the
- termcap or curses library itself, allowing applications which link
- with readline the to choose an appropriate library."
-.sp
-If your environment has not been set up so that an appropriate library is
-automatically included, you may need to add something like
-.sp
- LIBS="-ncurses"
-.sp
-immediately before the \fBconfigure\fP command.
-.
-.
-.SH "SEE ALSO"
-.rs
-.sp
-\fBpcreapi\fP(3), \fBpcre_config\fP(3).
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 17 March 2009
-Copyright (c) 1997-2009 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcrecallout.3 b/libs/pcre/doc/pcrecallout.3
deleted file mode 100644
index abdbaed054..0000000000
--- a/libs/pcre/doc/pcrecallout.3
+++ /dev/null
@@ -1,183 +0,0 @@
-.TH PCRECALLOUT 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE CALLOUTS"
-.rs
-.sp
-.B int (*pcre_callout)(pcre_callout_block *);
-.PP
-PCRE provides a feature called "callout", which is a means of temporarily
-passing control to the caller of PCRE in the middle of pattern matching. The
-caller of PCRE provides an external function by putting its entry point in the
-global variable \fIpcre_callout\fP. By default, this variable contains NULL,
-which disables all calling out.
-.P
-Within a regular expression, (?C) indicates the points at which the external
-function is to be called. Different callout points can be identified by putting
-a number less than 256 after the letter C. The default value is zero.
-For example, this pattern has two callout points:
-.sp
- (?C1)abc(?C2)def
-.sp
-If the PCRE_AUTO_CALLOUT option bit is set when \fBpcre_compile()\fP is called,
-PCRE automatically inserts callouts, all with number 255, before each item in
-the pattern. For example, if PCRE_AUTO_CALLOUT is used with the pattern
-.sp
- A(\ed{2}|--)
-.sp
-it is processed as if it were
-.sp
-(?C255)A(?C255)((?C255)\ed{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
-.sp
-Notice that there is a callout before and after each parenthesis and
-alternation bar. Automatic callouts can be used for tracking the progress of
-pattern matching. The
-.\" HREF
-\fBpcretest\fP
-.\"
-command has an option that sets automatic callouts; when it is used, the output
-indicates how the pattern is matched. This is useful information when you are
-trying to optimize the performance of a particular pattern.
-.
-.
-.SH "MISSING CALLOUTS"
-.rs
-.sp
-You should be aware that, because of optimizations in the way PCRE matches
-patterns by default, callouts sometimes do not happen. For example, if the
-pattern is
-.sp
- ab(?C4)cd
-.sp
-PCRE knows that any matching string must contain the letter "d". If the subject
-string is "abyz", the lack of "d" means that matching doesn't ever start, and
-the callout is never reached. However, with "abyd", though the result is still
-no match, the callout is obeyed.
-.P
-You can disable these optimizations by passing the PCRE_NO_START_OPTIMIZE
-option to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP. This slows down the
-matching process, but does ensure that callouts such as the example above are
-obeyed.
-.
-.
-.SH "THE CALLOUT INTERFACE"
-.rs
-.sp
-During matching, when PCRE reaches a callout point, the external function
-defined by \fIpcre_callout\fP is called (if it is set). This applies to both
-the \fBpcre_exec()\fP and the \fBpcre_dfa_exec()\fP matching functions. The
-only argument to the callout function is a pointer to a \fBpcre_callout\fP
-block. This structure contains the following fields:
-.sp
- int \fIversion\fP;
- int \fIcallout_number\fP;
- int *\fIoffset_vector\fP;
- const char *\fIsubject\fP;
- int \fIsubject_length\fP;
- int \fIstart_match\fP;
- int \fIcurrent_position\fP;
- int \fIcapture_top\fP;
- int \fIcapture_last\fP;
- void *\fIcallout_data\fP;
- int \fIpattern_position\fP;
- int \fInext_item_length\fP;
-.sp
-The \fIversion\fP field is an integer containing the version number of the
-block format. The initial version was 0; the current version is 1. The version
-number will change again in future if additional fields are added, but the
-intention is never to remove any of the existing fields.
-.P
-The \fIcallout_number\fP field contains the number of the callout, as compiled
-into the pattern (that is, the number after ?C for manual callouts, and 255 for
-automatically generated callouts).
-.P
-The \fIoffset_vector\fP field is a pointer to the vector of offsets that was
-passed by the caller to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP. When
-\fBpcre_exec()\fP is used, the contents can be inspected in order to extract
-substrings that have been matched so far, in the same way as for extracting
-substrings after a match has completed. For \fBpcre_dfa_exec()\fP this field is
-not useful.
-.P
-The \fIsubject\fP and \fIsubject_length\fP fields contain copies of the values
-that were passed to \fBpcre_exec()\fP.
-.P
-The \fIstart_match\fP field normally contains the offset within the subject at
-which the current match attempt started. However, if the escape sequence \eK
-has been encountered, this value is changed to reflect the modified starting
-point. If the pattern is not anchored, the callout function may be called
-several times from the same point in the pattern for different starting points
-in the subject.
-.P
-The \fIcurrent_position\fP field contains the offset within the subject of the
-current match pointer.
-.P
-When the \fBpcre_exec()\fP function is used, the \fIcapture_top\fP field
-contains one more than the number of the highest numbered captured substring so
-far. If no substrings have been captured, the value of \fIcapture_top\fP is
-one. This is always the case when \fBpcre_dfa_exec()\fP is used, because it
-does not support captured substrings.
-.P
-The \fIcapture_last\fP field contains the number of the most recently captured
-substring. If no substrings have been captured, its value is -1. This is always
-the case when \fBpcre_dfa_exec()\fP is used.
-.P
-The \fIcallout_data\fP field contains a value that is passed to
-\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP specifically so that it can be
-passed back in callouts. It is passed in the \fIpcre_callout\fP field of the
-\fBpcre_extra\fP data structure. If no such data was passed, the value of
-\fIcallout_data\fP in a \fBpcre_callout\fP block is NULL. There is a
-description of the \fBpcre_extra\fP structure in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation.
-.P
-The \fIpattern_position\fP field is present from version 1 of the
-\fIpcre_callout\fP structure. It contains the offset to the next item to be
-matched in the pattern string.
-.P
-The \fInext_item_length\fP field is present from version 1 of the
-\fIpcre_callout\fP structure. It contains the length of the next item to be
-matched in the pattern string. When the callout immediately precedes an
-alternation bar, a closing parenthesis, or the end of the pattern, the length
-is zero. When the callout precedes an opening parenthesis, the length is that
-of the entire subpattern.
-.P
-The \fIpattern_position\fP and \fInext_item_length\fP fields are intended to
-help in distinguishing between different automatic callouts, which all have the
-same callout number. However, they are set for all callouts.
-.
-.
-.SH "RETURN VALUES"
-.rs
-.sp
-The external callout function returns an integer to PCRE. If the value is zero,
-matching proceeds as normal. If the value is greater than zero, matching fails
-at the current point, but the testing of other matching possibilities goes
-ahead, just as if a lookahead assertion had failed. If the value is less than
-zero, the match is abandoned, and \fBpcre_exec()\fP (or \fBpcre_dfa_exec()\fP)
-returns the negative value.
-.P
-Negative values should normally be chosen from the set of PCRE_ERROR_xxx
-values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
-The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
-it will never be used by PCRE itself.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 15 March 2009
-Copyright (c) 1997-2009 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcrecompat.3 b/libs/pcre/doc/pcrecompat.3
deleted file mode 100644
index 3be6a6a38d..0000000000
--- a/libs/pcre/doc/pcrecompat.3
+++ /dev/null
@@ -1,148 +0,0 @@
-.TH PCRECOMPAT 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "DIFFERENCES BETWEEN PCRE AND PERL"
-.rs
-.sp
-This document describes the differences in the ways that PCRE and Perl handle
-regular expressions. The differences described here are mainly with respect to
-Perl 5.8, though PCRE versions 7.0 and later contain some features that are
-expected to be in the forthcoming Perl 5.10.
-.P
-1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
-it does have are given in the
-.\" HTML
-.\"
-section on UTF-8 support
-.\"
-in the main
-.\" HREF
-\fBpcre\fP
-.\"
-page.
-.P
-2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
-them, but they do not mean what you might think. For example, (?!a){3} does
-not assert that the next three characters are not "a". It just asserts that the
-next character is not "a" three times.
-.P
-3. Capturing subpatterns that occur inside negative lookahead assertions are
-counted, but their entries in the offsets vector are never set. Perl sets its
-numerical variables from any such patterns that are matched before the
-assertion fails to match something (thereby succeeding), but only if the
-negative lookahead assertion contains just one branch.
-.P
-4. Though binary zero characters are supported in the subject string, they are
-not allowed in a pattern string because it is passed as a normal C string,
-terminated by zero. The escape sequence \e0 can be used in the pattern to
-represent a binary zero.
-.P
-5. The following Perl escape sequences are not supported: \el, \eu, \eL,
-\eU, and \eN. In fact these are implemented by Perl's general string-handling
-and are not part of its pattern matching engine. If any of these are
-encountered by PCRE, an error is generated.
-.P
-6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE is
-built with Unicode character property support. The properties that can be
-tested with \ep and \eP are limited to the general category properties such as
-Lu and Nd, script names such as Greek or Han, and the derived properties Any
-and L&.
-.P
-7. PCRE does support the \eQ...\eE escape for quoting substrings. Characters in
-between are treated as literals. This is slightly different from Perl in that $
-and @ are also handled as literals inside the quotes. In Perl, they cause
-variable interpolation (but of course PCRE does not have variables). Note the
-following examples:
-.sp
- Pattern PCRE matches Perl matches
-.sp
-.\" JOIN
- \eQabc$xyz\eE abc$xyz abc followed by the
- contents of $xyz
- \eQabc\e$xyz\eE abc\e$xyz abc\e$xyz
- \eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz
-.sp
-The \eQ...\eE sequence is recognized both inside and outside character classes.
-.P
-8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
-constructions. However, there is support for recursive patterns. This is not
-available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
-feature allows an external function to be called during pattern matching. See
-the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation for details.
-.P
-9. Subpatterns that are called recursively or as "subroutines" are always
-treated as atomic groups in PCRE. This is like Python, but unlike Perl.
-.P
-10. There are some differences that are concerned with the settings of captured
-strings when part of a pattern is repeated. For example, matching "aba" against
-the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
-.P
-11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
-(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
-argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
-parentheses, PCRE does not set that capture group; this is different to Perl.
-.P
-12. PCRE provides some extensions to the Perl regular expression facilities.
-Perl 5.10 will include new features that are not in earlier versions, some of
-which (such as named parentheses) have been in PCRE for some time. This list is
-with respect to Perl 5.10:
-.sp
-(a) Although lookbehind assertions must match fixed length strings, each
-alternative branch of a lookbehind assertion can match a different length of
-string. Perl requires them all to have the same length.
-.sp
-(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
-meta-character matches only at the very end of the string.
-.sp
-(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
-meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
-(Perl can be made to issue a warning.)
-.sp
-(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
-inverted, that is, by default they are not greedy, but if followed by a
-question mark they are.
-.sp
-(e) PCRE_ANCHORED can be used at matching time to force a pattern to be tried
-only at the first matching position in the subject string.
-.sp
-(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAPTURE
-options for \fBpcre_exec()\fP have no Perl equivalents.
-.sp
-(g) The \eR escape sequence can be restricted to match only CR, LF, or CRLF
-by the PCRE_BSR_ANYCRLF option.
-.sp
-(h) The callout facility is PCRE-specific.
-.sp
-(i) The partial matching facility is PCRE-specific.
-.sp
-(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
-different hosts that have the other endianness.
-.sp
-(k) The alternative matching function (\fBpcre_dfa_exec()\fP) matches in a
-different way and is not Perl-compatible.
-.sp
-(l) PCRE recognizes some special sequences such as (*CR) at the start of
-a pattern that set overall options that cannot be changed within the pattern.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 11 September 2007
-Copyright (c) 1997-2007 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcrecpp.3 b/libs/pcre/doc/pcrecpp.3
deleted file mode 100644
index fa8c5581be..0000000000
--- a/libs/pcre/doc/pcrecpp.3
+++ /dev/null
@@ -1,347 +0,0 @@
-.TH PCRECPP 3
-.SH NAME
-PCRE - Perl-compatible regular expressions.
-.SH "SYNOPSIS OF C++ WRAPPER"
-.rs
-.sp
-.B #include
-.
-.SH DESCRIPTION
-.rs
-.sp
-The C++ wrapper for PCRE was provided by Google Inc. Some additional
-functionality was added by Giuseppe Maxia. This brief man page was constructed
-from the notes in the \fIpcrecpp.h\fP file, which should be consulted for
-further details.
-.
-.
-.SH "MATCHING INTERFACE"
-.rs
-.sp
-The "FullMatch" operation checks that supplied text matches a supplied pattern
-exactly. If pointer arguments are supplied, it copies matched sub-strings that
-match sub-patterns into them.
-.sp
- Example: successful match
- pcrecpp::RE re("h.*o");
- re.FullMatch("hello");
-.sp
- Example: unsuccessful match (requires full match):
- pcrecpp::RE re("e");
- !re.FullMatch("hello");
-.sp
- Example: creating a temporary RE object:
- pcrecpp::RE("h.*o").FullMatch("hello");
-.sp
-You can pass in a "const char*" or a "string" for "text". The examples below
-tend to use a const char*. You can, as in the different examples above, store
-the RE object explicitly in a variable or use a temporary RE object. The
-examples below use one mode or the other arbitrarily. Either could correctly be
-used for any of these examples.
-.P
-You must supply extra pointer arguments to extract matched subpieces.
-.sp
- Example: extracts "ruby" into "s" and 1234 into "i"
- int i;
- string s;
- pcrecpp::RE re("(\e\ew+):(\e\ed+)");
- re.FullMatch("ruby:1234", &s, &i);
-.sp
- Example: does not try to extract any extra sub-patterns
- re.FullMatch("ruby:1234", &s);
-.sp
- Example: does not try to extract into NULL
- re.FullMatch("ruby:1234", NULL, &i);
-.sp
- Example: integer overflow causes failure
- !re.FullMatch("ruby:1234567891234", NULL, &i);
-.sp
- Example: fails because there aren't enough sub-patterns:
- !pcrecpp::RE("\e\ew+:\e\ed+").FullMatch("ruby:1234", &s);
-.sp
- Example: fails because string cannot be stored in integer
- !pcrecpp::RE("(.*)").FullMatch("ruby", &i);
-.sp
-The provided pointer arguments can be pointers to any scalar numeric
-type, or one of:
-.sp
- string (matched piece is copied to string)
- StringPiece (StringPiece is mutated to point to matched piece)
- T (where "bool T::ParseFrom(const char*, int)" exists)
- NULL (the corresponding matched sub-pattern is not copied)
-.sp
-The function returns true iff all of the following conditions are satisfied:
-.sp
- a. "text" matches "pattern" exactly;
-.sp
- b. The number of matched sub-patterns is >= number of supplied
- pointers;
-.sp
- c. The "i"th argument has a suitable type for holding the
- string captured as the "i"th sub-pattern. If you pass in
- void * NULL for the "i"th argument, or a non-void * NULL
- of the correct type, or pass fewer arguments than the
- number of sub-patterns, "i"th captured sub-pattern is
- ignored.
-.sp
-CAVEAT: An optional sub-pattern that does not exist in the matched
-string is assigned the empty string. Therefore, the following will
-return false (because the empty string is not a valid number):
-.sp
- int number;
- pcrecpp::RE::FullMatch("abc", "[a-z]+(\e\ed+)?", &number);
-.sp
-The matching interface supports at most 16 arguments per call.
-If you need more, consider using the more general interface
-\fBpcrecpp::RE::DoMatch\fP. See \fBpcrecpp.h\fP for the signature for
-\fBDoMatch\fP.
-.P
-NOTE: Do not use \fBno_arg\fP, which is used internally to mark the end of a
-list of optional arguments, as a placeholder for missing arguments, as this can
-lead to segfaults.
-.
-.
-.SH "QUOTING METACHARACTERS"
-.rs
-.sp
-You can use the "QuoteMeta" operation to insert backslashes before all
-potentially meaningful characters in a string. The returned string, used as a
-regular expression, will exactly match the original string.
-.sp
- Example:
- string quoted = RE::QuoteMeta(unquoted);
-.sp
-Note that it's legal to escape a character even if it has no special meaning in
-a regular expression -- so this function does that. (This also makes it
-identical to the perl function of the same name; see "perldoc -f quotemeta".)
-For example, "1.5-2.0?" becomes "1\e.5\e-2\e.0\e?".
-.
-.SH "PARTIAL MATCHES"
-.rs
-.sp
-You can use the "PartialMatch" operation when you want the pattern
-to match any substring of the text.
-.sp
- Example: simple search for a string:
- pcrecpp::RE("ell").PartialMatch("hello");
-.sp
- Example: find first number in a string:
- int number;
- pcrecpp::RE re("(\e\ed+)");
- re.PartialMatch("x*100 + 20", &number);
- assert(number == 100);
-.
-.
-.SH "UTF-8 AND THE MATCHING INTERFACE"
-.rs
-.sp
-By default, pattern and text are plain text, one byte per character. The UTF8
-flag, passed to the constructor, causes both pattern and string to be treated
-as UTF-8 text, still a byte stream but potentially multiple bytes per
-character. In practice, the text is likelier to be UTF-8 than the pattern, but
-the match returned may depend on the UTF8 flag, so always use it when matching
-UTF8 text. For example, "." will match one byte normally but with UTF8 set may
-match up to three bytes of a multi-byte character.
-.sp
- Example:
- pcrecpp::RE_Options options;
- options.set_utf8();
- pcrecpp::RE re(utf8_pattern, options);
- re.FullMatch(utf8_string);
-.sp
- Example: using the convenience function UTF8():
- pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8());
- re.FullMatch(utf8_string);
-.sp
-NOTE: The UTF8 flag is ignored if pcre was not configured with the
- --enable-utf8 flag.
-.
-.
-.SH "PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE"
-.rs
-.sp
-PCRE defines some modifiers to change the behavior of the regular expression
-engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to
-pass such modifiers to a RE class. Currently, the following modifiers are
-supported:
-.sp
- modifier description Perl corresponding
-.sp
- PCRE_CASELESS case insensitive match /i
- PCRE_MULTILINE multiple lines match /m
- PCRE_DOTALL dot matches newlines /s
- PCRE_DOLLAR_ENDONLY $ matches only at end N/A
- PCRE_EXTRA strict escape parsing N/A
- PCRE_EXTENDED ignore whitespaces /x
- PCRE_UTF8 handles UTF8 chars built-in
- PCRE_UNGREEDY reverses * and *? N/A
- PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*)
-.sp
-(*) Both Perl and PCRE allow non capturing parentheses by means of the
-"?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
-capture, while (ab|cd) does.
-.P
-For a full account on how each modifier works, please check the
-PCRE API reference page.
-.P
-For each modifier, there are two member functions whose name is made
-out of the modifier in lowercase, without the "PCRE_" prefix. For
-instance, PCRE_CASELESS is handled by
-.sp
- bool caseless()
-.sp
-which returns true if the modifier is set, and
-.sp
- RE_Options & set_caseless(bool)
-.sp
-which sets or unsets the modifier. Moreover, PCRE_EXTRA_MATCH_LIMIT can be
-accessed through the \fBset_match_limit()\fR and \fBmatch_limit()\fR member
-functions. Setting \fImatch_limit\fR to a non-zero value will limit the
-execution of pcre to keep it from doing bad things like blowing the stack or
-taking an eternity to return a result. A value of 5000 is good enough to stop
-stack blowup in a 2MB thread stack. Setting \fImatch_limit\fR to zero disables
-match limiting. Alternatively, you can call \fBmatch_limit_recursion()\fP
-which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much PCRE
-recurses. \fBmatch_limit()\fP limits the number of matches PCRE does;
-\fBmatch_limit_recursion()\fP limits the depth of internal recursion, and
-therefore the amount of stack that is used.
-.P
-Normally, to pass one or more modifiers to a RE class, you declare
-a \fIRE_Options\fR object, set the appropriate options, and pass this
-object to a RE constructor. Example:
-.sp
- RE_options opt;
- opt.set_caseless(true);
- if (RE("HELLO", opt).PartialMatch("hello world")) ...
-.sp
-RE_options has two constructors. The default constructor takes no arguments and
-creates a set of flags that are off by default. The optional parameter
-\fIoption_flags\fR is to facilitate transfer of legacy code from C programs.
-This lets you do
-.sp
- RE(pattern,
- RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
-.sp
-However, new code is better off doing
-.sp
- RE(pattern,
- RE_Options().set_caseless(true).set_multiline(true))
- .PartialMatch(str);
-.sp
-If you are going to pass one of the most used modifiers, there are some
-convenience functions that return a RE_Options class with the
-appropriate modifier already set: \fBCASELESS()\fR, \fBUTF8()\fR,
-\fBMULTILINE()\fR, \fBDOTALL\fR(), and \fBEXTENDED()\fR.
-.P
-If you need to set several options at once, and you don't want to go through
-the pains of declaring a RE_Options object and setting several options, there
-is a parallel method that give you such ability on the fly. You can concatenate
-several \fBset_xxxxx()\fR member functions, since each of them returns a
-reference to its class object. For example, to pass PCRE_CASELESS,
-PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write:
-.sp
- RE(" ^ xyz \e\es+ .* blah$",
- RE_Options()
- .set_caseless(true)
- .set_extended(true)
- .set_multiline(true)).PartialMatch(sometext);
-.sp
-.
-.
-.SH "SCANNING TEXT INCREMENTALLY"
-.rs
-.sp
-The "Consume" operation may be useful if you want to repeatedly
-match regular expressions at the front of a string and skip over
-them as they match. This requires use of the "StringPiece" type,
-which represents a sub-range of a real string. Like RE, StringPiece
-is defined in the pcrecpp namespace.
-.sp
- Example: read lines of the form "var = value" from a string.
- string contents = ...; // Fill string somehow
- pcrecpp::StringPiece input(contents); // Wrap in a StringPiece
-
- string var;
- int value;
- pcrecpp::RE re("(\e\ew+) = (\e\ed+)\en");
- while (re.Consume(&input, &var, &value)) {
- ...;
- }
-.sp
-Each successful call to "Consume" will set "var/value", and also
-advance "input" so it points past the matched text.
-.P
-The "FindAndConsume" operation is similar to "Consume" but does not
-anchor your match at the beginning of the string. For example, you
-could extract all words from a string by repeatedly calling
-.sp
- pcrecpp::RE("(\e\ew+)").FindAndConsume(&input, &word)
-.
-.
-.SH "PARSING HEX/OCTAL/C-RADIX NUMBERS"
-.rs
-.sp
-By default, if you pass a pointer to a numeric value, the
-corresponding text is interpreted as a base-10 number. You can
-instead wrap the pointer with a call to one of the operators Hex(),
-Octal(), or CRadix() to interpret the text in another base. The
-CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
-prefixes, but defaults to base-10.
-.sp
- Example:
- int a, b, c, d;
- pcrecpp::RE re("(.*) (.*) (.*) (.*)");
- re.FullMatch("100 40 0100 0x40",
- pcrecpp::Octal(&a), pcrecpp::Hex(&b),
- pcrecpp::CRadix(&c), pcrecpp::CRadix(&d));
-.sp
-will leave 64 in a, b, c, and d.
-.
-.
-.SH "REPLACING PARTS OF STRINGS"
-.rs
-.sp
-You can replace the first match of "pattern" in "str" with "rewrite".
-Within "rewrite", backslash-escaped digits (\e1 to \e9) can be
-used to insert text matching corresponding parenthesized group
-from the pattern. \e0 in "rewrite" refers to the entire matching
-text. For example:
-.sp
- string s = "yabba dabba doo";
- pcrecpp::RE("b+").Replace("d", &s);
-.sp
-will leave "s" containing "yada dabba doo". The result is true if the pattern
-matches and a replacement occurs, false otherwise.
-.P
-\fBGlobalReplace\fP is like \fBReplace\fP except that it replaces all
-occurrences of the pattern in the string with the rewrite. Replacements are
-not subject to re-matching. For example:
-.sp
- string s = "yabba dabba doo";
- pcrecpp::RE("b+").GlobalReplace("d", &s);
-.sp
-will leave "s" containing "yada dada doo". It returns the number of
-replacements made.
-.P
-\fBExtract\fP is like \fBReplace\fP, except that if the pattern matches,
-"rewrite" is copied into "out" (an additional argument) with substitutions.
-The non-matching portions of "text" are ignored. Returns true iff a match
-occurred and the extraction happened successfully; if no match occurs, the
-string is left unaffected.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-The C++ wrapper was contributed by Google Inc.
-Copyright (c) 2007 Google Inc.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 17 March 2009
-.fi
diff --git a/libs/pcre/doc/pcregrep.1 b/libs/pcre/doc/pcregrep.1
deleted file mode 100644
index e0d05197d1..0000000000
--- a/libs/pcre/doc/pcregrep.1
+++ /dev/null
@@ -1,477 +0,0 @@
-.TH PCREGREP 1
-.SH NAME
-pcregrep - a grep with Perl-compatible regular expressions.
-.SH SYNOPSIS
-.B pcregrep [options] [long options] [pattern] [path1 path2 ...]
-.
-.SH DESCRIPTION
-.rs
-.sp
-\fBpcregrep\fP searches files for character patterns, in the same way as other
-grep commands do, but it uses the PCRE regular expression library to support
-patterns that are compatible with the regular expressions of Perl 5. See
-.\" HREF
-\fBpcrepattern\fP(3)
-.\"
-for a full description of syntax and semantics of the regular expressions
-that PCRE supports.
-.P
-Patterns, whether supplied on the command line or in a separate file, are given
-without delimiters. For example:
-.sp
- pcregrep Thursday /etc/motd
-.sp
-If you attempt to use delimiters (for example, by surrounding a pattern with
-slashes, as is common in Perl scripts), they are interpreted as part of the
-pattern. Quotes can of course be used to delimit patterns on the command line
-because they are interpreted by the shell, and indeed they are required if a
-pattern contains white space or shell metacharacters.
-.P
-The first argument that follows any option settings is treated as the single
-pattern to be matched when neither \fB-e\fP nor \fB-f\fP is present.
-Conversely, when one or both of these options are used to specify patterns, all
-arguments are treated as path names. At least one of \fB-e\fP, \fB-f\fP, or an
-argument pattern must be provided.
-.P
-If no files are specified, \fBpcregrep\fP reads the standard input. The
-standard input can also be referenced by a name consisting of a single hyphen.
-For example:
-.sp
- pcregrep some-pattern /file1 - /file3
-.sp
-By default, each line that matches a pattern is copied to the standard
-output, and if there is more than one file, the file name is output at the
-start of each line, followed by a colon. However, there are options that can
-change how \fBpcregrep\fP behaves. In particular, the \fB-M\fP option makes it
-possible to search for patterns that span line boundaries. What defines a line
-boundary is controlled by the \fB-N\fP (\fB--newline\fP) option.
-.P
-Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
-BUFSIZ is defined in \fB\fP. When there is more than one pattern
-(specified by the use of \fB-e\fP and/or \fB-f\fP), each pattern is applied to
-each line in the order in which they are defined, except that all the \fB-e\fP
-patterns are tried before the \fB-f\fP patterns.
-.P
-By default, as soon as one pattern matches (or fails to match when \fB-v\fP is
-used), no further patterns are considered. However, if \fB--colour\fP (or
-\fB--color\fP) is used to colour the matching substrings, or if
-\fB--only-matching\fP, \fB--file-offsets\fP, or \fB--line-offsets\fP is used to
-output only the part of the line that matched (either shown literally, or as an
-offset), scanning resumes immediately following the match, so that further
-matches on the same line can be found. If there are multiple patterns, they are
-all tried on the remainder of the line, but patterns that follow the one that
-matched are not tried on the earlier part of the line.
-.P
-This is the same behaviour as GNU grep, but it does mean that the order in
-which multiple patterns are specified can affect the output when one of the
-above options is used.
-.P
-Patterns that can match an empty string are accepted, but empty string
-matches are not recognized. An example is the pattern "(super)?(man)?", in
-which all components are optional. This pattern finds all occurrences of both
-"super" and "man"; the output differs from matching with "super|man" when only
-the matching substrings are being shown.
-.P
-If the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variable is set,
-\fBpcregrep\fP uses the value to set a locale when calling the PCRE library.
-The \fB--locale\fP option can be used to override this.
-.
-.SH "SUPPORT FOR COMPRESSED FILES"
-.rs
-.sp
-It is possible to compile \fBpcregrep\fP so that it uses \fBlibz\fP or
-\fBlibbz2\fP to read files whose names end in \fB.gz\fP or \fB.bz2\fP,
-respectively. You can find out whether your binary has support for one or both
-of these file types by running it with the \fB--help\fP option. If the
-appropriate support is not present, files are treated as plain text. The
-standard input is always so treated.
-.
-.SH OPTIONS
-.rs
-.TP 10
-\fB--\fP
-This terminate the list of options. It is useful if the next item on the
-command line starts with a hyphen but is not an option. This allows for the
-processing of patterns and filenames that start with hyphens.
-.TP
-\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP
-Output \fInumber\fP lines of context after each matching line. If filenames
-and/or line numbers are being output, a hyphen separator is used instead of a
-colon for the context lines. A line containing "--" is output between each
-group of lines, unless they are in fact contiguous in the input file. The value
-of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP
-guarantees to have up to 8K of following text available for context output.
-.TP
-\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
-Output \fInumber\fP lines of context before each matching line. If filenames
-and/or line numbers are being output, a hyphen separator is used instead of a
-colon for the context lines. A line containing "--" is output between each
-group of lines, unless they are in fact contiguous in the input file. The value
-of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP
-guarantees to have up to 8K of preceding text available for context output.
-.TP
-\fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP
-Output \fInumber\fP lines of context both before and after each matching line.
-This is equivalent to setting both \fB-A\fP and \fB-B\fP to the same value.
-.TP
-\fB-c\fP, \fB--count\fP
-Do not output individual lines; instead just output a count of the number of
-lines that would otherwise have been output. If several files are given, a
-count is output for each of them. In this mode, the \fB-A\fP, \fB-B\fP, and
-\fB-C\fP options are ignored.
-.TP
-\fB--colour\fP, \fB--color\fP
-If this option is given without any data, it is equivalent to "--colour=auto".
-If data is required, it must be given in the same shell item, separated by an
-equals sign.
-.TP
-\fB--colour=\fP\fIvalue\fP, \fB--color=\fP\fIvalue\fP
-This option specifies under what circumstances the parts of a line that matched
-a pattern should be coloured in the output. By default, the output is not
-coloured. The value (which is optional, see above) may be "never", "always", or
-"auto". In the latter case, colouring happens only if the standard output is
-connected to a terminal. More resources are used when colouring is enabled,
-because \fBpcregrep\fP has to search for all possible matches in a line, not
-just one, in order to colour them all.
-
-The colour that is used can be specified by setting the environment variable
-PCREGREP_COLOUR or PCREGREP_COLOR. The value of this variable should be a
-string of two numbers, separated by a semicolon. They are copied directly into
-the control string for setting colour on a terminal, so it is your
-responsibility to ensure that they make sense. If neither of the environment
-variables is set, the default is "1;31", which gives red.
-.TP
-\fB-D\fP \fIaction\fP, \fB--devices=\fP\fIaction\fP
-If an input path is not a regular file or a directory, "action" specifies how
-it is to be processed. Valid values are "read" (the default) or "skip"
-(silently skip the path).
-.TP
-\fB-d\fP \fIaction\fP, \fB--directories=\fP\fIaction\fP
-If an input path is a directory, "action" specifies how it is to be processed.
-Valid values are "read" (the default), "recurse" (equivalent to the \fB-r\fP
-option), or "skip" (silently skip the path). In the default case, directories
-are read as if they were ordinary files. In some operating systems the effect
-of reading a directory like this is an immediate end-of-file.
-.TP
-\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP
-Specify a pattern to be matched. This option can be used multiple times in
-order to specify several patterns. It can also be used as a way of specifying a
-single pattern that starts with a hyphen. When \fB-e\fP is used, no argument
-pattern is taken from the command line; all arguments are treated as file
-names. There is an overall maximum of 100 patterns. They are applied to each
-line in the order in which they are defined until one matches (or fails to
-match if \fB-v\fP is used). If \fB-f\fP is used with \fB-e\fP, the command line
-patterns are matched first, followed by the patterns from the file, independent
-of the order in which these options are specified. Note that multiple use of
-\fB-e\fP is not the same as a single pattern with alternatives. For example,
-X|Y finds the first character in a line that is X or Y, whereas if the two
-patterns are given separately, \fBpcregrep\fP finds X if it is present, even if
-it follows Y in the line. It finds Y only if there is no X in the line. This
-really matters only if you are using \fB-o\fP to show the part(s) of the line
-that matched.
-.TP
-\fB--exclude\fP=\fIpattern\fP
-When \fBpcregrep\fP is searching the files in a directory as a consequence of
-the \fB-r\fP (recursive search) option, any regular files whose names match the
-pattern are excluded. Subdirectories are not excluded by this option; they are
-searched recursively, subject to the \fB--exclude_dir\fP and
-\fB--include_dir\fP options. The pattern is a PCRE regular expression, and is
-matched against the final component of the file name (not the entire path). If
-a file name matches both \fB--include\fP and \fB--exclude\fP, it is excluded.
-There is no short form for this option.
-.TP
-\fB--exclude_dir\fP=\fIpattern\fP
-When \fBpcregrep\fP is searching the contents of a directory as a consequence
-of the \fB-r\fP (recursive search) option, any subdirectories whose names match
-the pattern are excluded. (Note that the \fP--exclude\fP option does not affect
-subdirectories.) The pattern is a PCRE regular expression, and is matched
-against the final component of the name (not the entire path). If a
-subdirectory name matches both \fB--include_dir\fP and \fB--exclude_dir\fP, it
-is excluded. There is no short form for this option.
-.TP
-\fB-F\fP, \fB--fixed-strings\fP
-Interpret each pattern as a list of fixed strings, separated by newlines,
-instead of as a regular expression. The \fB-w\fP (match as a word) and \fB-x\fP
-(match whole line) options can be used with \fB-F\fP. They apply to each of the
-fixed strings. A line is selected if any of the fixed strings are found in it
-(subject to \fB-w\fP or \fB-x\fP, if present).
-.TP
-\fB-f\fP \fIfilename\fP, \fB--file=\fP\fIfilename\fP
-Read a number of patterns from the file, one per line, and match them against
-each line of input. A data line is output if any of the patterns match it. The
-filename can be given as "-" to refer to the standard input. When \fB-f\fP is
-used, patterns specified on the command line using \fB-e\fP may also be
-present; they are tested before the file's patterns. However, no other pattern
-is taken from the command line; all arguments are treated as file names. There
-is an overall maximum of 100 patterns. Trailing white space is removed from
-each line, and blank lines are ignored. An empty file contains no patterns and
-therefore matches nothing. See also the comments about multiple patterns versus
-a single pattern with alternatives in the description of \fB-e\fP above.
-.TP
-\fB--file-offsets\fP
-Instead of showing lines or parts of lines that match, show each match as an
-offset from the start of the file and a length, separated by a comma. In this
-mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP
-options are ignored. If there is more than one match in a line, each of them is
-shown separately. This option is mutually exclusive with \fB--line-offsets\fP
-and \fB--only-matching\fP.
-.TP
-\fB-H\fP, \fB--with-filename\fP
-Force the inclusion of the filename at the start of output lines when searching
-a single file. By default, the filename is not shown in this case. For matching
-lines, the filename is followed by a colon; for context lines, a hyphen
-separator is used. If a line number is also being output, it follows the file
-name.
-.TP
-\fB-h\fP, \fB--no-filename\fP
-Suppress the output filenames when searching multiple files. By default,
-filenames are shown when multiple files are searched. For matching lines, the
-filename is followed by a colon; for context lines, a hyphen separator is used.
-If a line number is also being output, it follows the file name.
-.TP
-\fB--help\fP
-Output a help message, giving brief details of the command options and file
-type support, and then exit.
-.TP
-\fB-i\fP, \fB--ignore-case\fP
-Ignore upper/lower case distinctions during comparisons.
-.TP
-\fB--include\fP=\fIpattern\fP
-When \fBpcregrep\fP is searching the files in a directory as a consequence of
-the \fB-r\fP (recursive search) option, only those regular files whose names
-match the pattern are included. Subdirectories are always included and searched
-recursively, subject to the \fP--include_dir\fP and \fB--exclude_dir\fP
-options. The pattern is a PCRE regular expression, and is matched against the
-final component of the file name (not the entire path). If a file name matches
-both \fB--include\fP and \fB--exclude\fP, it is excluded. There is no short
-form for this option.
-.TP
-\fB--include_dir\fP=\fIpattern\fP
-When \fBpcregrep\fP is searching the contents of a directory as a consequence
-of the \fB-r\fP (recursive search) option, only those subdirectories whose
-names match the pattern are included. (Note that the \fB--include\fP option
-does not affect subdirectories.) The pattern is a PCRE regular expression, and
-is matched against the final component of the name (not the entire path). If a
-subdirectory name matches both \fB--include_dir\fP and \fB--exclude_dir\fP, it
-is excluded. There is no short form for this option.
-.TP
-\fB-L\fP, \fB--files-without-match\fP
-Instead of outputting lines from the files, just output the names of the files
-that do not contain any lines that would have been output. Each file name is
-output once, on a separate line.
-.TP
-\fB-l\fP, \fB--files-with-matches\fP
-Instead of outputting lines from the files, just output the names of the files
-containing lines that would have been output. Each file name is output
-once, on a separate line. Searching stops as soon as a matching line is found
-in a file.
-.TP
-\fB--label\fP=\fIname\fP
-This option supplies a name to be used for the standard input when file names
-are being output. If not supplied, "(standard input)" is used. There is no
-short form for this option.
-.TP
-\fB--line-offsets\fP
-Instead of showing lines or parts of lines that match, show each match as a
-line number, the offset from the start of the line, and a length. The line
-number is terminated by a colon (as usual; see the \fB-n\fP option), and the
-offset and length are separated by a comma. In this mode, no context is shown.
-That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is
-more than one match in a line, each of them is shown separately. This option is
-mutually exclusive with \fB--file-offsets\fP and \fB--only-matching\fP.
-.TP
-\fB--locale\fP=\fIlocale-name\fP
-This option specifies a locale to be used for pattern matching. It overrides
-the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
-locale is specified, the PCRE library's default (usually the "C" locale) is
-used. There is no short form for this option.
-.TP
-\fB-M\fP, \fB--multiline\fP
-Allow patterns to match more than one line. When this option is given, patterns
-may usefully contain literal newline characters and internal occurrences of ^
-and $ characters. The output for any one match may consist of more than one
-line. When this option is set, the PCRE library is called in "multiline" mode.
-There is a limit to the number of lines that can be matched, imposed by the way
-that \fBpcregrep\fP buffers the input file as it scans it. However,
-\fBpcregrep\fP ensures that at least 8K characters or the rest of the document
-(whichever is the shorter) are available for forward matching, and similarly
-the previous 8K characters (or all the previous characters, if fewer than 8K)
-are guaranteed to be available for lookbehind assertions.
-.TP
-\fB-N\fP \fInewline-type\fP, \fB--newline=\fP\fInewline-type\fP
-The PCRE library supports five different conventions for indicating
-the ends of lines. They are the single-character sequences CR (carriage return)
-and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
-which recognizes any of the preceding three types, and an "any" convention, in
-which any Unicode line ending sequence is assumed to end a line. The Unicode
-sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
-(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
-PS (paragraph separator, U+2029).
-.sp
-When the PCRE library is built, a default line-ending sequence is specified.
-This is normally the standard sequence for the operating system. Unless
-otherwise specified by this option, \fBpcregrep\fP uses the library's default.
-The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
-makes it possible to use \fBpcregrep\fP on files that have come from other
-environments without having to modify their line endings. If the data that is
-being scanned does not agree with the convention set by this option,
-\fBpcregrep\fP may behave in strange ways.
-.TP
-\fB-n\fP, \fB--line-number\fP
-Precede each output line by its line number in the file, followed by a colon
-for matching lines or a hyphen for context lines. If the filename is also being
-output, it precedes the line number. This option is forced if
-\fB--line-offsets\fP is used.
-.TP
-\fB-o\fP, \fB--only-matching\fP
-Show only the part of the line that matched a pattern. In this mode, no
-context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are
-ignored. If there is more than one match in a line, each of them is shown
-separately. If \fB-o\fP is combined with \fB-v\fP (invert the sense of the
-match to find non-matching lines), no output is generated, but the return code
-is set appropriately. This option is mutually exclusive with
-\fB--file-offsets\fP and \fB--line-offsets\fP.
-.TP
-\fB-q\fP, \fB--quiet\fP
-Work quietly, that is, display nothing except error messages. The exit
-status indicates whether or not any matches were found.
-.TP
-\fB-r\fP, \fB--recursive\fP
-If any given path is a directory, recursively scan the files it contains,
-taking note of any \fB--include\fP and \fB--exclude\fP settings. By default, a
-directory is read as a normal file; in some operating systems this gives an
-immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
-option to "recurse".
-.TP
-\fB-s\fP, \fB--no-messages\fP
-Suppress error messages about non-existent or unreadable files. Such files are
-quietly skipped. However, the return code is still 2, even if matches were
-found in other files.
-.TP
-\fB-u\fP, \fB--utf-8\fP
-Operate in UTF-8 mode. This option is available only if PCRE has been compiled
-with UTF-8 support. Both patterns and subject lines must be valid strings of
-UTF-8 characters.
-.TP
-\fB-V\fP, \fB--version\fP
-Write the version numbers of \fBpcregrep\fP and the PCRE library that is being
-used to the standard error stream.
-.TP
-\fB-v\fP, \fB--invert-match\fP
-Invert the sense of the match, so that lines which do \fInot\fP match any of
-the patterns are the ones that are found.
-.TP
-\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
-Force the patterns to match only whole words. This is equivalent to having \eb
-at the start and end of the pattern.
-.TP
-\fB-x\fP, \fB--line-regex\fP, \fB--line-regexp\fP
-Force the patterns to be anchored (each must start matching at the beginning of
-a line) and in addition, require them to match entire lines. This is
-equivalent to having ^ and $ characters at the start and end of each
-alternative branch in every pattern.
-.
-.
-.SH "ENVIRONMENT VARIABLES"
-.rs
-.sp
-The environment variables \fBLC_ALL\fP and \fBLC_CTYPE\fP are examined, in that
-order, for a locale. The first one that is set is used. This can be overridden
-by the \fB--locale\fP option. If no locale is set, the PCRE library's default
-(usually the "C" locale) is used.
-.
-.
-.SH "NEWLINES"
-.rs
-.sp
-The \fB-N\fP (\fB--newline\fP) option allows \fBpcregrep\fP to scan files with
-different newline conventions from the default. However, the setting of this
-option does not affect the way in which \fBpcregrep\fP writes information to
-the standard error and output streams. It uses the string "\en" in C
-\fBprintf()\fP calls to indicate newlines, relying on the C I/O library to
-convert this to an appropriate sequence if the output is sent to a file.
-.
-.
-.SH "OPTIONS COMPATIBILITY"
-.rs
-.sp
-The majority of short and long forms of \fBpcregrep\fP's options are the same
-as in the GNU \fBgrep\fP program. Any long option of the form
-\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
-(PCRE terminology). However, the \fB--locale\fP, \fB-M\fP, \fB--multiline\fP,
-\fB-u\fP, and \fB--utf-8\fP options are specific to \fBpcregrep\fP.
-.
-.
-.SH "OPTIONS WITH DATA"
-.rs
-.sp
-There are four different ways in which an option with data can be specified.
-If a short form option is used, the data may follow immediately, or in the next
-command line item. For example:
-.sp
- -f/some/file
- -f /some/file
-.sp
-If a long form option is used, the data may appear in the same command line
-item, separated by an equals character, or (with one exception) it may appear
-in the next command line item. For example:
-.sp
- --file=/some/file
- --file /some/file
-.sp
-Note, however, that if you want to supply a file name beginning with ~ as data
-in a shell command, and have the shell expand ~ to a home directory, you must
-separate the file name from the option, because the shell does not treat ~
-specially unless it is at the start of an item.
-.P
-The exception to the above is the \fB--colour\fP (or \fB--color\fP) option,
-for which the data is optional. If this option does have data, it must be given
-in the first form, using an equals character. Otherwise it will be assumed that
-it has no data.
-.
-.
-.SH "MATCHING ERRORS"
-.rs
-.sp
-It is possible to supply a regular expression that takes a very long time to
-fail to match certain lines. Such patterns normally involve nested indefinite
-repeats, for example: (a+)*\ed when matched against a line of a's with no final
-digit. The PCRE matching function has a resource limit that causes it to abort
-in these circumstances. If this happens, \fBpcregrep\fP outputs an error
-message and the line that caused the problem to the standard error stream. If
-there are more than 20 such errors, \fBpcregrep\fP gives up.
-.
-.
-.SH DIAGNOSTICS
-.rs
-.sp
-Exit status is 0 if any matches were found, 1 if no matches were found, and 2
-for syntax errors and non-existent or inacessible files (even if matches were
-found in other files) or too many matching errors. Using the \fB-s\fP option to
-suppress error messages about inaccessble files does not affect the return
-code.
-.
-.
-.SH "SEE ALSO"
-.rs
-.sp
-\fBpcrepattern\fP(3), \fBpcretest\fP(1).
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 01 March 2009
-Copyright (c) 1997-2009 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcregrep.txt b/libs/pcre/doc/pcregrep.txt
deleted file mode 100644
index 0163d58060..0000000000
--- a/libs/pcre/doc/pcregrep.txt
+++ /dev/null
@@ -1,512 +0,0 @@
-PCREGREP(1) PCREGREP(1)
-
-
-NAME
- pcregrep - a grep with Perl-compatible regular expressions.
-
-
-SYNOPSIS
- pcregrep [options] [long options] [pattern] [path1 path2 ...]
-
-
-DESCRIPTION
-
- pcregrep searches files for character patterns, in the same way as
- other grep commands do, but it uses the PCRE regular expression library
- to support patterns that are compatible with the regular expressions of
- Perl 5. See pcrepattern(3) for a full description of syntax and seman-
- tics of the regular expressions that PCRE supports.
-
- Patterns, whether supplied on the command line or in a separate file,
- are given without delimiters. For example:
-
- pcregrep Thursday /etc/motd
-
- If you attempt to use delimiters (for example, by surrounding a pattern
- with slashes, as is common in Perl scripts), they are interpreted as
- part of the pattern. Quotes can of course be used to delimit patterns
- on the command line because they are interpreted by the shell, and
- indeed they are required if a pattern contains white space or shell
- metacharacters.
-
- The first argument that follows any option settings is treated as the
- single pattern to be matched when neither -e nor -f is present. Con-
- versely, when one or both of these options are used to specify pat-
- terns, all arguments are treated as path names. At least one of -e, -f,
- or an argument pattern must be provided.
-
- If no files are specified, pcregrep reads the standard input. The stan-
- dard input can also be referenced by a name consisting of a single
- hyphen. For example:
-
- pcregrep some-pattern /file1 - /file3
-
- By default, each line that matches a pattern is copied to the standard
- output, and if there is more than one file, the file name is output at
- the start of each line, followed by a colon. However, there are options
- that can change how pcregrep behaves. In particular, the -M option
- makes it possible to search for patterns that span line boundaries.
- What defines a line boundary is controlled by the -N (--newline)
- option.
-
- Patterns are limited to 8K or BUFSIZ characters, whichever is the
- greater. BUFSIZ is defined in . When there is more than one
- pattern (specified by the use of -e and/or -f), each pattern is applied
- to each line in the order in which they are defined, except that all
- the -e patterns are tried before the -f patterns.
-
- By default, as soon as one pattern matches (or fails to match when -v
- is used), no further patterns are considered. However, if --colour (or
- --color) is used to colour the matching substrings, or if --only-match-
- ing, --file-offsets, or --line-offsets is used to output only the part
- of the line that matched (either shown literally, or as an offset),
- scanning resumes immediately following the match, so that further
- matches on the same line can be found. If there are multiple patterns,
- they are all tried on the remainder of the line, but patterns that fol-
- low the one that matched are not tried on the earlier part of the line.
-
- This is the same behaviour as GNU grep, but it does mean that the order
- in which multiple patterns are specified can affect the output when one
- of the above options is used.
-
- Patterns that can match an empty string are accepted, but empty string
- matches are not recognized. An example is the pattern "(super)?(man)?",
- in which all components are optional. This pattern finds all occur-
- rences of both "super" and "man"; the output differs from matching with
- "super|man" when only the matching substrings are being shown.
-
- If the LC_ALL or LC_CTYPE environment variable is set, pcregrep uses
- the value to set a locale when calling the PCRE library. The --locale
- option can be used to override this.
-
-
-SUPPORT FOR COMPRESSED FILES
-
- It is possible to compile pcregrep so that it uses libz or libbz2 to
- read files whose names end in .gz or .bz2, respectively. You can find
- out whether your binary has support for one or both of these file types
- by running it with the --help option. If the appropriate support is not
- present, files are treated as plain text. The standard input is always
- so treated.
-
-
-OPTIONS
-
- -- This terminate the list of options. It is useful if the next
- item on the command line starts with a hyphen but is not an
- option. This allows for the processing of patterns and file-
- names that start with hyphens.
-
- -A number, --after-context=number
- Output number lines of context after each matching line. If
- filenames and/or line numbers are being output, a hyphen sep-
- arator is used instead of a colon for the context lines. A
- line containing "--" is output between each group of lines,
- unless they are in fact contiguous in the input file. The
- value of number is expected to be relatively small. However,
- pcregrep guarantees to have up to 8K of following text avail-
- able for context output.
-
- -B number, --before-context=number
- Output number lines of context before each matching line. If
- filenames and/or line numbers are being output, a hyphen sep-
- arator is used instead of a colon for the context lines. A
- line containing "--" is output between each group of lines,
- unless they are in fact contiguous in the input file. The
- value of number is expected to be relatively small. However,
- pcregrep guarantees to have up to 8K of preceding text avail-
- able for context output.
-
- -C number, --context=number
- Output number lines of context both before and after each
- matching line. This is equivalent to setting both -A and -B
- to the same value.
-
- -c, --count
- Do not output individual lines; instead just output a count
- of the number of lines that would otherwise have been output.
- If several files are given, a count is output for each of
- them. In this mode, the -A, -B, and -C options are ignored.
-
- --colour, --color
- If this option is given without any data, it is equivalent to
- "--colour=auto". If data is required, it must be given in
- the same shell item, separated by an equals sign.
-
- --colour=value, --color=value
- This option specifies under what circumstances the parts of a
- line that matched a pattern should be coloured in the output.
- By default, the output is not coloured. The value (which is
- optional, see above) may be "never", "always", or "auto". In
- the latter case, colouring happens only if the standard out-
- put is connected to a terminal. More resources are used when
- colouring is enabled, because pcregrep has to search for all
- possible matches in a line, not just one, in order to colour
- them all.
-
- The colour that is used can be specified by setting the envi-
- ronment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
- of this variable should be a string of two numbers, separated
- by a semicolon. They are copied directly into the control
- string for setting colour on a terminal, so it is your
- responsibility to ensure that they make sense. If neither of
- the environment variables is set, the default is "1;31",
- which gives red.
-
- -D action, --devices=action
- If an input path is not a regular file or a directory,
- "action" specifies how it is to be processed. Valid values
- are "read" (the default) or "skip" (silently skip the path).
-
- -d action, --directories=action
- If an input path is a directory, "action" specifies how it is
- to be processed. Valid values are "read" (the default),
- "recurse" (equivalent to the -r option), or "skip" (silently
- skip the path). In the default case, directories are read as
- if they were ordinary files. In some operating systems the
- effect of reading a directory like this is an immediate end-
- of-file.
-
- -e pattern, --regex=pattern, --regexp=pattern
- Specify a pattern to be matched. This option can be used mul-
- tiple times in order to specify several patterns. It can also
- be used as a way of specifying a single pattern that starts
- with a hyphen. When -e is used, no argument pattern is taken
- from the command line; all arguments are treated as file
- names. There is an overall maximum of 100 patterns. They are
- applied to each line in the order in which they are defined
- until one matches (or fails to match if -v is used). If -f is
- used with -e, the command line patterns are matched first,
- followed by the patterns from the file, independent of the
- order in which these options are specified. Note that multi-
- ple use of -e is not the same as a single pattern with alter-
- natives. For example, X|Y finds the first character in a line
- that is X or Y, whereas if the two patterns are given sepa-
- rately, pcregrep finds X if it is present, even if it follows
- Y in the line. It finds Y only if there is no X in the line.
- This really matters only if you are using -o to show the
- part(s) of the line that matched.
-
- --exclude=pattern
- When pcregrep is searching the files in a directory as a con-
- sequence of the -r (recursive search) option, any regular
- files whose names match the pattern are excluded. Subdirecto-
- ries are not excluded by this option; they are searched
- recursively, subject to the --exclude_dir and --include_dir
- options. The pattern is a PCRE regular expression, and is
- matched against the final component of the file name (not the
- entire path). If a file name matches both --include and
- --exclude, it is excluded. There is no short form for this
- option.
-
- --exclude_dir=pattern
- When pcregrep is searching the contents of a directory as a
- consequence of the -r (recursive search) option, any subdi-
- rectories whose names match the pattern are excluded. (Note
- that the --exclude option does not affect subdirectories.)
- The pattern is a PCRE regular expression, and is matched
- against the final component of the name (not the entire
- path). If a subdirectory name matches both --include_dir and
- --exclude_dir, it is excluded. There is no short form for
- this option.
-
- -F, --fixed-strings
- Interpret each pattern as a list of fixed strings, separated
- by newlines, instead of as a regular expression. The -w
- (match as a word) and -x (match whole line) options can be
- used with -F. They apply to each of the fixed strings. A line
- is selected if any of the fixed strings are found in it (sub-
- ject to -w or -x, if present).
-
- -f filename, --file=filename
- Read a number of patterns from the file, one per line, and
- match them against each line of input. A data line is output
- if any of the patterns match it. The filename can be given as
- "-" to refer to the standard input. When -f is used, patterns
- specified on the command line using -e may also be present;
- they are tested before the file's patterns. However, no other
- pattern is taken from the command line; all arguments are
- treated as file names. There is an overall maximum of 100
- patterns. Trailing white space is removed from each line, and
- blank lines are ignored. An empty file contains no patterns
- and therefore matches nothing. See also the comments about
- multiple patterns versus a single pattern with alternatives
- in the description of -e above.
-
- --file-offsets
- Instead of showing lines or parts of lines that match, show
- each match as an offset from the start of the file and a
- length, separated by a comma. In this mode, no context is
- shown. That is, the -A, -B, and -C options are ignored. If
- there is more than one match in a line, each of them is shown
- separately. This option is mutually exclusive with --line-
- offsets and --only-matching.
-
- -H, --with-filename
- Force the inclusion of the filename at the start of output
- lines when searching a single file. By default, the filename
- is not shown in this case. For matching lines, the filename
- is followed by a colon; for context lines, a hyphen separator
- is used. If a line number is also being output, it follows
- the file name.
-
- -h, --no-filename
- Suppress the output filenames when searching multiple files.
- By default, filenames are shown when multiple files are
- searched. For matching lines, the filename is followed by a
- colon; for context lines, a hyphen separator is used. If a
- line number is also being output, it follows the file name.
-
- --help Output a help message, giving brief details of the command
- options and file type support, and then exit.
-
- -i, --ignore-case
- Ignore upper/lower case distinctions during comparisons.
-
- --include=pattern
- When pcregrep is searching the files in a directory as a con-
- sequence of the -r (recursive search) option, only those reg-
- ular files whose names match the pattern are included. Subdi-
- rectories are always included and searched recursively, sub-
- ject to the --include_dir and --exclude_dir options. The pat-
- tern is a PCRE regular expression, and is matched against the
- final component of the file name (not the entire path). If a
- file name matches both --include and --exclude, it is
- excluded. There is no short form for this option.
-
- --include_dir=pattern
- When pcregrep is searching the contents of a directory as a
- consequence of the -r (recursive search) option, only those
- subdirectories whose names match the pattern are included.
- (Note that the --include option does not affect subdirecto-
- ries.) The pattern is a PCRE regular expression, and is
- matched against the final component of the name (not the
- entire path). If a subdirectory name matches both
- --include_dir and --exclude_dir, it is excluded. There is no
- short form for this option.
-
- -L, --files-without-match
- Instead of outputting lines from the files, just output the
- names of the files that do not contain any lines that would
- have been output. Each file name is output once, on a sepa-
- rate line.
-
- -l, --files-with-matches
- Instead of outputting lines from the files, just output the
- names of the files containing lines that would have been out-
- put. Each file name is output once, on a separate line.
- Searching stops as soon as a matching line is found in a
- file.
-
- --label=name
- This option supplies a name to be used for the standard input
- when file names are being output. If not supplied, "(standard
- input)" is used. There is no short form for this option.
-
- --line-offsets
- Instead of showing lines or parts of lines that match, show
- each match as a line number, the offset from the start of the
- line, and a length. The line number is terminated by a colon
- (as usual; see the -n option), and the offset and length are
- separated by a comma. In this mode, no context is shown.
- That is, the -A, -B, and -C options are ignored. If there is
- more than one match in a line, each of them is shown sepa-
- rately. This option is mutually exclusive with --file-offsets
- and --only-matching.
-
- --locale=locale-name
- This option specifies a locale to be used for pattern match-
- ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
- ronment variables. If no locale is specified, the PCRE
- library's default (usually the "C" locale) is used. There is
- no short form for this option.
-
- -M, --multiline
- Allow patterns to match more than one line. When this option
- is given, patterns may usefully contain literal newline char-
- acters and internal occurrences of ^ and $ characters. The
- output for any one match may consist of more than one line.
- When this option is set, the PCRE library is called in "mul-
- tiline" mode. There is a limit to the number of lines that
- can be matched, imposed by the way that pcregrep buffers the
- input file as it scans it. However, pcregrep ensures that at
- least 8K characters or the rest of the document (whichever is
- the shorter) are available for forward matching, and simi-
- larly the previous 8K characters (or all the previous charac-
- ters, if fewer than 8K) are guaranteed to be available for
- lookbehind assertions.
-
- -N newline-type, --newline=newline-type
- The PCRE library supports five different conventions for
- indicating the ends of lines. They are the single-character
- sequences CR (carriage return) and LF (linefeed), the two-
- character sequence CRLF, an "anycrlf" convention, which rec-
- ognizes any of the preceding three types, and an "any" con-
- vention, in which any Unicode line ending sequence is assumed
- to end a line. The Unicode sequences are the three just men-
- tioned, plus VT (vertical tab, U+000B), FF (formfeed,
- U+000C), NEL (next line, U+0085), LS (line separator,
- U+2028), and PS (paragraph separator, U+2029).
-
- When the PCRE library is built, a default line-ending
- sequence is specified. This is normally the standard
- sequence for the operating system. Unless otherwise specified
- by this option, pcregrep uses the library's default. The
- possible values for this option are CR, LF, CRLF, ANYCRLF, or
- ANY. This makes it possible to use pcregrep on files that
- have come from other environments without having to modify
- their line endings. If the data that is being scanned does
- not agree with the convention set by this option, pcregrep
- may behave in strange ways.
-
- -n, --line-number
- Precede each output line by its line number in the file, fol-
- lowed by a colon for matching lines or a hyphen for context
- lines. If the filename is also being output, it precedes the
- line number. This option is forced if --line-offsets is used.
-
- -o, --only-matching
- Show only the part of the line that matched a pattern. In
- this mode, no context is shown. That is, the -A, -B, and -C
- options are ignored. If there is more than one match in a
- line, each of them is shown separately. If -o is combined
- with -v (invert the sense of the match to find non-matching
- lines), no output is generated, but the return code is set
- appropriately. This option is mutually exclusive with --file-
- offsets and --line-offsets.
-
- -q, --quiet
- Work quietly, that is, display nothing except error messages.
- The exit status indicates whether or not any matches were
- found.
-
- -r, --recursive
- If any given path is a directory, recursively scan the files
- it contains, taking note of any --include and --exclude set-
- tings. By default, a directory is read as a normal file; in
- some operating systems this gives an immediate end-of-file.
- This option is a shorthand for setting the -d option to
- "recurse".
-
- -s, --no-messages
- Suppress error messages about non-existent or unreadable
- files. Such files are quietly skipped. However, the return
- code is still 2, even if matches were found in other files.
-
- -u, --utf-8
- Operate in UTF-8 mode. This option is available only if PCRE
- has been compiled with UTF-8 support. Both patterns and sub-
- ject lines must be valid strings of UTF-8 characters.
-
- -V, --version
- Write the version numbers of pcregrep and the PCRE library
- that is being used to the standard error stream.
-
- -v, --invert-match
- Invert the sense of the match, so that lines which do not
- match any of the patterns are the ones that are found.
-
- -w, --word-regex, --word-regexp
- Force the patterns to match only whole words. This is equiva-
- lent to having \b at the start and end of the pattern.
-
- -x, --line-regex, --line-regexp
- Force the patterns to be anchored (each must start matching
- at the beginning of a line) and in addition, require them to
- match entire lines. This is equivalent to having ^ and $
- characters at the start and end of each alternative branch in
- every pattern.
-
-
-ENVIRONMENT VARIABLES
-
- The environment variables LC_ALL and LC_CTYPE are examined, in that
- order, for a locale. The first one that is set is used. This can be
- overridden by the --locale option. If no locale is set, the PCRE
- library's default (usually the "C" locale) is used.
-
-
-NEWLINES
-
- The -N (--newline) option allows pcregrep to scan files with different
- newline conventions from the default. However, the setting of this
- option does not affect the way in which pcregrep writes information to
- the standard error and output streams. It uses the string "\n" in C
- printf() calls to indicate newlines, relying on the C I/O library to
- convert this to an appropriate sequence if the output is sent to a
- file.
-
-
-OPTIONS COMPATIBILITY
-
- The majority of short and long forms of pcregrep's options are the same
- as in the GNU grep program. Any long option of the form --xxx-regexp
- (GNU terminology) is also available as --xxx-regex (PCRE terminology).
- However, the --locale, -M, --multiline, -u, and --utf-8 options are
- specific to pcregrep.
-
-
-OPTIONS WITH DATA
-
- There are four different ways in which an option with data can be spec-
- ified. If a short form option is used, the data may follow immedi-
- ately, or in the next command line item. For example:
-
- -f/some/file
- -f /some/file
-
- If a long form option is used, the data may appear in the same command
- line item, separated by an equals character, or (with one exception) it
- may appear in the next command line item. For example:
-
- --file=/some/file
- --file /some/file
-
- Note, however, that if you want to supply a file name beginning with ~
- as data in a shell command, and have the shell expand ~ to a home
- directory, you must separate the file name from the option, because the
- shell does not treat ~ specially unless it is at the start of an item.
-
- The exception to the above is the --colour (or --color) option, for
- which the data is optional. If this option does have data, it must be
- given in the first form, using an equals character. Otherwise it will
- be assumed that it has no data.
-
-
-MATCHING ERRORS
-
- It is possible to supply a regular expression that takes a very long
- time to fail to match certain lines. Such patterns normally involve
- nested indefinite repeats, for example: (a+)*\d when matched against a
- line of a's with no final digit. The PCRE matching function has a
- resource limit that causes it to abort in these circumstances. If this
- happens, pcregrep outputs an error message and the line that caused the
- problem to the standard error stream. If there are more than 20 such
- errors, pcregrep gives up.
-
-
-DIAGNOSTICS
-
- Exit status is 0 if any matches were found, 1 if no matches were found,
- and 2 for syntax errors and non-existent or inacessible files (even if
- matches were found in other files) or too many matching errors. Using
- the -s option to suppress error messages about inaccessble files does
- not affect the return code.
-
-
-SEE ALSO
-
- pcrepattern(3), pcretest(1).
-
-
-AUTHOR
-
- Philip Hazel
- University Computing Service
- Cambridge CB2 3QH, England.
-
-
-REVISION
-
- Last updated: 01 March 2009
- Copyright (c) 1997-2009 University of Cambridge.
diff --git a/libs/pcre/doc/pcrematching.3 b/libs/pcre/doc/pcrematching.3
deleted file mode 100644
index 560a48c022..0000000000
--- a/libs/pcre/doc/pcrematching.3
+++ /dev/null
@@ -1,188 +0,0 @@
-.TH PCREMATCHING 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE MATCHING ALGORITHMS"
-.rs
-.sp
-This document describes the two different algorithms that are available in PCRE
-for matching a compiled regular expression against a given subject string. The
-"standard" algorithm is the one provided by the \fBpcre_exec()\fP function.
-This works in the same was as Perl's matching function, and provides a
-Perl-compatible matching operation.
-.P
-An alternative algorithm is provided by the \fBpcre_dfa_exec()\fP function;
-this operates in a different way, and is not Perl-compatible. It has advantages
-and disadvantages compared with the standard algorithm, and these are described
-below.
-.P
-When there is only one possible way in which a given subject string can match a
-pattern, the two algorithms give the same answer. A difference arises, however,
-when there are multiple possibilities. For example, if the pattern
-.sp
- ^<.*>
-.sp
-is matched against the string
-.sp
-
-.sp
-there are three possible answers. The standard algorithm finds only one of
-them, whereas the alternative algorithm finds all three.
-.
-.SH "REGULAR EXPRESSIONS AS TREES"
-.rs
-.sp
-The set of strings that are matched by a regular expression can be represented
-as a tree structure. An unlimited repetition in the pattern makes the tree of
-infinite size, but it is still a tree. Matching the pattern to a given subject
-string (from a given starting point) can be thought of as a search of the tree.
-There are two ways to search a tree: depth-first and breadth-first, and these
-correspond to the two matching algorithms provided by PCRE.
-.
-.SH "THE STANDARD MATCHING ALGORITHM"
-.rs
-.sp
-In the terminology of Jeffrey Friedl's book "Mastering Regular
-Expressions", the standard algorithm is an "NFA algorithm". It conducts a
-depth-first search of the pattern tree. That is, it proceeds along a single
-path through the tree, checking that the subject matches what is required. When
-there is a mismatch, the algorithm tries any alternatives at the current point,
-and if they all fail, it backs up to the previous branch point in the tree, and
-tries the next alternative branch at that level. This often involves backing up
-(moving to the left) in the subject string as well. The order in which
-repetition branches are tried is controlled by the greedy or ungreedy nature of
-the quantifier.
-.P
-If a leaf node is reached, a matching string has been found, and at that point
-the algorithm stops. Thus, if there is more than one possible match, this
-algorithm returns the first one that it finds. Whether this is the shortest,
-the longest, or some intermediate length depends on the way the greedy and
-ungreedy repetition quantifiers are specified in the pattern.
-.P
-Because it ends up with a single path through the tree, it is relatively
-straightforward for this algorithm to keep track of the substrings that are
-matched by portions of the pattern in parentheses. This provides support for
-capturing parentheses and back references.
-.
-.SH "THE ALTERNATIVE MATCHING ALGORITHM"
-.rs
-.sp
-This algorithm conducts a breadth-first search of the tree. Starting from the
-first matching point in the subject, it scans the subject string from left to
-right, once, character by character, and as it does this, it remembers all the
-paths through the tree that represent valid matches. In Friedl's terminology,
-this is a kind of "DFA algorithm", though it is not implemented as a
-traditional finite state machine (it keeps multiple states active
-simultaneously).
-.P
-The scan continues until either the end of the subject is reached, or there are
-no more unterminated paths. At this point, terminated paths represent the
-different matching possibilities (if there are none, the match has failed).
-Thus, if there is more than one possible match, this algorithm finds all of
-them, and in particular, it finds the longest. In PCRE, there is an option to
-stop the algorithm after the first match (which is necessarily the shortest)
-has been found.
-.P
-Note that all the matches that are found start at the same point in the
-subject. If the pattern
-.sp
- cat(er(pillar)?)
-.sp
-is matched against the string "the caterpillar catchment", the result will be
-the three strings "cat", "cater", and "caterpillar" that start at the fourth
-character of the subject. The algorithm does not automatically move on to find
-matches that start at later positions.
-.P
-There are a number of features of PCRE regular expressions that are not
-supported by the alternative matching algorithm. They are as follows:
-.P
-1. Because the algorithm finds all possible matches, the greedy or ungreedy
-nature of repetition quantifiers is not relevant. Greedy and ungreedy
-quantifiers are treated in exactly the same way. However, possessive
-quantifiers can make a difference when what follows could also match what is
-quantified, for example in a pattern like this:
-.sp
- ^a++\ew!
-.sp
-This pattern matches "aaab!" but not "aaa!", which would be matched by a
-non-possessive quantifier. Similarly, if an atomic group is present, it is
-matched as if it were a standalone pattern at the current point, and the
-longest match is then "locked in" for the rest of the overall pattern.
-.P
-2. When dealing with multiple paths through the tree simultaneously, it is not
-straightforward to keep track of captured substrings for the different matching
-possibilities, and PCRE's implementation of this algorithm does not attempt to
-do this. This means that no captured substrings are available.
-.P
-3. Because no substrings are captured, back references within the pattern are
-not supported, and cause errors if encountered.
-.P
-4. For the same reason, conditional expressions that use a backreference as the
-condition or test for a specific group recursion are not supported.
-.P
-5. Because many paths through the tree may be active, the \eK escape sequence,
-which resets the start of the match when encountered (but may be on some paths
-and not on others), is not supported. It causes an error if encountered.
-.P
-6. Callouts are supported, but the value of the \fIcapture_top\fP field is
-always 1, and the value of the \fIcapture_last\fP field is always -1.
-.P
-7. The \eC escape sequence, which (in the standard algorithm) matches a single
-byte, even in UTF-8 mode, is not supported because the alternative algorithm
-moves through the subject string one character at a time, for all active paths
-through the tree.
-.P
-8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
-supported. (*FAIL) is supported, and behaves like a failing negative assertion.
-.
-.SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
-.rs
-.sp
-Using the alternative matching algorithm provides the following advantages:
-.P
-1. All possible matches (at a single point in the subject) are automatically
-found, and in particular, the longest match is found. To find more than one
-match using the standard algorithm, you have to do kludgy things with
-callouts.
-.P
-2. There is much better support for partial matching. The restrictions on the
-content of the pattern that apply when using the standard algorithm for partial
-matching do not apply to the alternative algorithm. For non-anchored patterns,
-the starting position of a partial match is available.
-.P
-3. Because the alternative algorithm scans the subject string just once, and
-never needs to backtrack, it is possible to pass very long subject strings to
-the matching function in several pieces, checking for partial matching each
-time.
-.
-.SH "DISADVANTAGES OF THE ALTERNATIVE ALGORITHM"
-.rs
-.sp
-The alternative algorithm suffers from a number of disadvantages:
-.P
-1. It is substantially slower than the standard algorithm. This is partly
-because it has to search for all possible matches, but is also because it is
-less susceptible to optimization.
-.P
-2. Capturing parentheses and back references are not supported.
-.P
-3. Although atomic groups are supported, their use does not provide the
-performance advantage that it does for the standard algorithm.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 19 April 2008
-Copyright (c) 1997-2008 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcrepartial.3 b/libs/pcre/doc/pcrepartial.3
deleted file mode 100644
index e418734073..0000000000
--- a/libs/pcre/doc/pcrepartial.3
+++ /dev/null
@@ -1,219 +0,0 @@
-.TH PCREPARTIAL 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PARTIAL MATCHING IN PCRE"
-.rs
-.sp
-In normal use of PCRE, if the subject string that is passed to
-\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP matches as far as it goes, but is
-too short to match the entire pattern, PCRE_ERROR_NOMATCH is returned. There
-are circumstances where it might be helpful to distinguish this case from other
-cases in which there is no match.
-.P
-Consider, for example, an application where a human is required to type in data
-for a field with specific formatting requirements. An example might be a date
-in the form \fIddmmmyy\fP, defined by this pattern:
-.sp
- ^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$
-.sp
-If the application sees the user's keystrokes one by one, and can check that
-what has been typed so far is potentially valid, it is able to raise an error
-as soon as a mistake is made, possibly beeping and not reflecting the
-character that has been typed. This immediate feedback is likely to be a better
-user interface than a check that is delayed until the entire string has been
-entered.
-.P
-PCRE supports the concept of partial matching by means of the PCRE_PARTIAL
-option, which can be set when calling \fBpcre_exec()\fP or
-\fBpcre_dfa_exec()\fP. When this flag is set for \fBpcre_exec()\fP, the return
-code PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if at any time
-during the matching process the last part of the subject string matched part of
-the pattern. Unfortunately, for non-anchored matching, it is not possible to
-obtain the position of the start of the partial match. No captured data is set
-when PCRE_ERROR_PARTIAL is returned.
-.P
-When PCRE_PARTIAL is set for \fBpcre_dfa_exec()\fP, the return code
-PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if the end of the
-subject is reached, there have been no complete matches, but there is still at
-least one matching possibility. The portion of the string that provided the
-partial match is set as the first matching string.
-.P
-Using PCRE_PARTIAL disables one of PCRE's optimizations. PCRE remembers the
-last literal byte in a pattern, and abandons matching immediately if such a
-byte is not present in the subject string. This optimization cannot be used
-for a subject string that might match only partially.
-.
-.
-.SH "RESTRICTED PATTERNS FOR PCRE_PARTIAL"
-.rs
-.sp
-Because of the way certain internal optimizations are implemented in the
-\fBpcre_exec()\fP function, the PCRE_PARTIAL option cannot be used with all
-patterns. These restrictions do not apply when \fBpcre_dfa_exec()\fP is used.
-For \fBpcre_exec()\fP, repeated single characters such as
-.sp
- a{2,4}
-.sp
-and repeated single metasequences such as
-.sp
- \ed+
-.sp
-are not permitted if the maximum number of occurrences is greater than one.
-Optional items such as \ed? (where the maximum is one) are permitted.
-Quantifiers with any values are permitted after parentheses, so the invalid
-examples above can be coded thus:
-.sp
- (a){2,4}
- (\ed)+
-.sp
-These constructions run more slowly, but for the kinds of application that are
-envisaged for this facility, this is not felt to be a major restriction.
-.P
-If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
-\fBpcre_exec()\fP returns the error code PCRE_ERROR_BADPARTIAL (-13).
-You can use the PCRE_INFO_OKPARTIAL call to \fBpcre_fullinfo()\fP to find out
-if a compiled pattern can be used for partial matching.
-.
-.
-.SH "EXAMPLE OF PARTIAL MATCHING USING PCRETEST"
-.rs
-.sp
-If the escape sequence \eP is present in a \fBpcretest\fP data line, the
-PCRE_PARTIAL flag is used for the match. Here is a run of \fBpcretest\fP that
-uses the date example quoted above:
-.sp
- re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
- data> 25jun04\eP
- 0: 25jun04
- 1: jun
- data> 25dec3\eP
- Partial match
- data> 3ju\eP
- Partial match
- data> 3juj\eP
- No match
- data> j\eP
- No match
-.sp
-The first data string is matched completely, so \fBpcretest\fP shows the
-matched substrings. The remaining four strings do not match the complete
-pattern, but the first two are partial matches. The same test, using
-\fBpcre_dfa_exec()\fP matching (by means of the \eD escape sequence), produces
-the following output:
-.sp
- re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
- data> 25jun04\eP\eD
- 0: 25jun04
- data> 23dec3\eP\eD
- Partial match: 23dec3
- data> 3ju\eP\eD
- Partial match: 3ju
- data> 3juj\eP\eD
- No match
- data> j\eP\eD
- No match
-.sp
-Notice that in this case the portion of the string that was matched is made
-available.
-.
-.
-.SH "MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()"
-.rs
-.sp
-When a partial match has been found using \fBpcre_dfa_exec()\fP, it is possible
-to continue the match by providing additional subject data and calling
-\fBpcre_dfa_exec()\fP again with the same compiled regular expression, this
-time setting the PCRE_DFA_RESTART option. You must also pass the same working
-space as before, because this is where details of the previous partial match
-are stored. Here is an example using \fBpcretest\fP, using the \eR escape
-sequence to set the PCRE_DFA_RESTART option (\eP and \eD are as above):
-.sp
- re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
- data> 23ja\eP\eD
- Partial match: 23ja
- data> n05\eR\eD
- 0: n05
-.sp
-The first call has "23ja" as the subject, and requests partial matching; the
-second call has "n05" as the subject for the continued (restarted) match.
-Notice that when the match is complete, only the last part is shown; PCRE does
-not retain the previously partially-matched string. It is up to the calling
-program to do that if it needs to.
-.P
-You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
-over multiple segments. This facility can be used to pass very long subject
-strings to \fBpcre_dfa_exec()\fP. However, some care is needed for certain
-types of pattern.
-.P
-1. If the pattern contains tests for the beginning or end of a line, you need
-to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropriate, when the
-subject string for any call does not contain the beginning or end of a line.
-.P
-2. If the pattern contains backward assertions (including \eb or \eB), you need
-to arrange for some overlap in the subject strings to allow for this. For
-example, you could pass the subject in chunks that are 500 bytes long, but in
-a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
-bytes at the start of the buffer.
-.P
-3. Matching a subject string that is split into multiple segments does not
-always produce exactly the same result as matching over one single long string.
-The difference arises when there are multiple matching possibilities, because a
-partial match result is given only when there are no completed matches in a
-call to \fBpcre_dfa_exec()\fP. This means that as soon as the shortest match has
-been found, continuation to a new subject segment is no longer possible.
-Consider this \fBpcretest\fP example:
-.sp
- re> /dog(sbody)?/
- data> do\eP\eD
- Partial match: do
- data> gsb\eR\eP\eD
- 0: g
- data> dogsbody\eD
- 0: dogsbody
- 1: dog
-.sp
-The pattern matches the words "dog" or "dogsbody". When the subject is
-presented in several parts ("do" and "gsb" being the first two) the match stops
-when "dog" has been found, and it is not possible to continue. On the other
-hand, if "dogsbody" is presented as a single string, both matches are found.
-.P
-Because of this phenomenon, it does not usually make sense to end a pattern
-that is going to be matched in this way with a variable repeat.
-.P
-4. Patterns that contain alternatives at the top level which do not all
-start with the same pattern item may not work as expected. For example,
-consider this pattern:
-.sp
- 1234|3789
-.sp
-If the first part of the subject is "ABC123", a partial match of the first
-alternative is found at offset 3. There is no partial match for the second
-alternative, because such a match does not start at the same point in the
-subject string. Attempting to continue with the string "789" does not yield a
-match because only those alternatives that match at one point in the subject
-are remembered. The problem arises because the start of the second alternative
-matches within the first alternative. There is no problem with anchored
-patterns or patterns such as:
-.sp
- 1234|ABCD
-.sp
-where no string can be a partial match for both alternatives.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 04 June 2007
-Copyright (c) 1997-2007 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcrepattern.3 b/libs/pcre/doc/pcrepattern.3
deleted file mode 100644
index c6ea30a94a..0000000000
--- a/libs/pcre/doc/pcrepattern.3
+++ /dev/null
@@ -1,2259 +0,0 @@
-.TH PCREPATTERN 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE REGULAR EXPRESSION DETAILS"
-.rs
-.sp
-The syntax and semantics of the regular expressions that are supported by PCRE
-are described in detail below. There is a quick-reference syntax summary in the
-.\" HREF
-\fBpcresyntax\fP
-.\"
-page. PCRE tries to match Perl syntax and semantics as closely as it can. PCRE
-also supports some alternative regular expression syntax (which does not
-conflict with the Perl syntax) in order to provide some compatibility with
-regular expressions in Python, .NET, and Oniguruma.
-.P
-Perl's regular expressions are described in its own documentation, and
-regular expressions in general are covered in a number of books, some of which
-have copious examples. Jeffrey Friedl's "Mastering Regular Expressions",
-published by O'Reilly, covers regular expressions in great detail. This
-description of PCRE's regular expressions is intended as reference material.
-.P
-The original operation of PCRE was on strings of one-byte characters. However,
-there is now also support for UTF-8 character strings. To use this, you must
-build PCRE to include UTF-8 support, and then call \fBpcre_compile()\fP with
-the PCRE_UTF8 option. There is also a special sequence that can be given at the
-start of a pattern:
-.sp
- (*UTF8)
-.sp
-Starting a pattern with this sequence is equivalent to setting the PCRE_UTF8
-option. This feature is not Perl-compatible. How setting UTF-8 mode affects
-pattern matching is mentioned in several places below. There is also a summary
-of UTF-8 features in the
-.\" HTML
-.\"
-section on UTF-8 support
-.\"
-in the main
-.\" HREF
-\fBpcre\fP
-.\"
-page.
-.P
-The remainder of this document discusses the patterns that are supported by
-PCRE when its main matching function, \fBpcre_exec()\fP, is used.
-From release 6.0, PCRE offers a second matching function,
-\fBpcre_dfa_exec()\fP, which matches using a different algorithm that is not
-Perl-compatible. Some of the features discussed below are not available when
-\fBpcre_dfa_exec()\fP is used. The advantages and disadvantages of the
-alternative function, and how it differs from the normal function, are
-discussed in the
-.\" HREF
-\fBpcrematching\fP
-.\"
-page.
-.
-.
-.SH "NEWLINE CONVENTIONS"
-.rs
-.sp
-PCRE supports five different conventions for indicating line breaks in
-strings: a single CR (carriage return) character, a single LF (linefeed)
-character, the two-character sequence CRLF, any of the three preceding, or any
-Unicode newline sequence. The
-.\" HREF
-\fBpcreapi\fP
-.\"
-page has
-.\" HTML
-.\"
-further discussion
-.\"
-about newlines, and shows how to set the newline convention in the
-\fIoptions\fP arguments for the compiling and matching functions.
-.P
-It is also possible to specify a newline convention by starting a pattern
-string with one of the following five sequences:
-.sp
- (*CR) carriage return
- (*LF) linefeed
- (*CRLF) carriage return, followed by linefeed
- (*ANYCRLF) any of the three above
- (*ANY) all Unicode newline sequences
-.sp
-These override the default and the options given to \fBpcre_compile()\fP. For
-example, on a Unix system where LF is the default newline sequence, the pattern
-.sp
- (*CR)a.b
-.sp
-changes the convention to CR. That pattern matches "a\enb" because LF is no
-longer a newline. Note that these special settings, which are not
-Perl-compatible, are recognized only at the very start of a pattern, and that
-they must be in upper case. If more than one of them is present, the last one
-is used.
-.P
-The newline convention does not affect what the \eR escape sequence matches. By
-default, this is any Unicode newline sequence, for Perl compatibility. However,
-this can be changed; see the description of \eR in the section entitled
-.\" HTML
-.\"
-"Newline sequences"
-.\"
-below. A change of \eR setting can be combined with a change of newline
-convention.
-.
-.
-.SH "CHARACTERS AND METACHARACTERS"
-.rs
-.sp
-A regular expression is a pattern that is matched against a subject string from
-left to right. Most characters stand for themselves in a pattern, and match the
-corresponding characters in the subject. As a trivial example, the pattern
-.sp
- The quick brown fox
-.sp
-matches a portion of a subject string that is identical to itself. When
-caseless matching is specified (the PCRE_CASELESS option), letters are matched
-independently of case. In UTF-8 mode, PCRE always understands the concept of
-case for characters whose values are less than 128, so caseless matching is
-always possible. For characters with higher values, the concept of case is
-supported if PCRE is compiled with Unicode property support, but not otherwise.
-If you want to use caseless matching for characters 128 and above, you must
-ensure that PCRE is compiled with Unicode property support as well as with
-UTF-8 support.
-.P
-The power of regular expressions comes from the ability to include alternatives
-and repetitions in the pattern. These are encoded in the pattern by the use of
-\fImetacharacters\fP, which do not stand for themselves but instead are
-interpreted in some special way.
-.P
-There are two different sets of metacharacters: those that are recognized
-anywhere in the pattern except within square brackets, and those that are
-recognized within square brackets. Outside square brackets, the metacharacters
-are as follows:
-.sp
- \e general escape character with several uses
- ^ assert start of string (or line, in multiline mode)
- $ assert end of string (or line, in multiline mode)
- . match any character except newline (by default)
- [ start character class definition
- | start of alternative branch
- ( start subpattern
- ) end subpattern
- ? extends the meaning of (
- also 0 or 1 quantifier
- also quantifier minimizer
- * 0 or more quantifier
- + 1 or more quantifier
- also "possessive quantifier"
- { start min/max quantifier
-.sp
-Part of a pattern that is in square brackets is called a "character class". In
-a character class the only metacharacters are:
-.sp
- \e general escape character
- ^ negate the class, but only if the first character
- - indicates character range
-.\" JOIN
- [ POSIX character class (only if followed by POSIX
- syntax)
- ] terminates the character class
-.sp
-The following sections describe the use of each of the metacharacters.
-.
-.
-.SH BACKSLASH
-.rs
-.sp
-The backslash character has several uses. Firstly, if it is followed by a
-non-alphanumeric character, it takes away any special meaning that character
-may have. This use of backslash as an escape character applies both inside and
-outside character classes.
-.P
-For example, if you want to match a * character, you write \e* in the pattern.
-This escaping action applies whether or not the following character would
-otherwise be interpreted as a metacharacter, so it is always safe to precede a
-non-alphanumeric with backslash to specify that it stands for itself. In
-particular, if you want to match a backslash, you write \e\e.
-.P
-If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
-pattern (other than in a character class) and characters between a # outside
-a character class and the next newline are ignored. An escaping backslash can
-be used to include a whitespace or # character as part of the pattern.
-.P
-If you want to remove the special meaning from a sequence of characters, you
-can do so by putting them between \eQ and \eE. This is different from Perl in
-that $ and @ are handled as literals in \eQ...\eE sequences in PCRE, whereas in
-Perl, $ and @ cause variable interpolation. Note the following examples:
-.sp
- Pattern PCRE matches Perl matches
-.sp
-.\" JOIN
- \eQabc$xyz\eE abc$xyz abc followed by the
- contents of $xyz
- \eQabc\e$xyz\eE abc\e$xyz abc\e$xyz
- \eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz
-.sp
-The \eQ...\eE sequence is recognized both inside and outside character classes.
-.
-.
-.\" HTML
-.SS "Non-printing characters"
-.rs
-.sp
-A second use of backslash provides a way of encoding non-printing characters
-in patterns in a visible manner. There is no restriction on the appearance of
-non-printing characters, apart from the binary zero that terminates a pattern,
-but when a pattern is being prepared by text editing, it is usually easier to
-use one of the following escape sequences than the binary character it
-represents:
-.sp
- \ea alarm, that is, the BEL character (hex 07)
- \ecx "control-x", where x is any character
- \ee escape (hex 1B)
- \ef formfeed (hex 0C)
- \en linefeed (hex 0A)
- \er carriage return (hex 0D)
- \et tab (hex 09)
- \eddd character with octal code ddd, or backreference
- \exhh character with hex code hh
- \ex{hhh..} character with hex code hhh..
-.sp
-The precise effect of \ecx is as follows: if x is a lower case letter, it
-is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
-Thus \ecz becomes hex 1A, but \ec{ becomes hex 3B, while \ec; becomes hex
-7B.
-.P
-After \ex, from zero to two hexadecimal digits are read (letters can be in
-upper or lower case). Any number of hexadecimal digits may appear between \ex{
-and }, but the value of the character code must be less than 256 in non-UTF-8
-mode, and less than 2**31 in UTF-8 mode. That is, the maximum value in
-hexadecimal is 7FFFFFFF. Note that this is bigger than the largest Unicode code
-point, which is 10FFFF.
-.P
-If characters other than hexadecimal digits appear between \ex{ and }, or if
-there is no terminating }, this form of escape is not recognized. Instead, the
-initial \ex will be interpreted as a basic hexadecimal escape, with no
-following digits, giving a character whose value is zero.
-.P
-Characters whose value is less than 256 can be defined by either of the two
-syntaxes for \ex. There is no difference in the way they are handled. For
-example, \exdc is exactly the same as \ex{dc}.
-.P
-After \e0 up to two further octal digits are read. If there are fewer than two
-digits, just those that are present are used. Thus the sequence \e0\ex\e07
-specifies two binary zeros followed by a BEL character (code value 7). Make
-sure you supply two digits after the initial zero if the pattern character that
-follows is itself an octal digit.
-.P
-The handling of a backslash followed by a digit other than 0 is complicated.
-Outside a character class, PCRE reads it and any following digits as a decimal
-number. If the number is less than 10, or if there have been at least that many
-previous capturing left parentheses in the expression, the entire sequence is
-taken as a \fIback reference\fP. A description of how this works is given
-.\" HTML
-.\"
-later,
-.\"
-following the discussion of
-.\" HTML
-.\"
-parenthesized subpatterns.
-.\"
-.P
-Inside a character class, or if the decimal number is greater than 9 and there
-have not been that many capturing subpatterns, PCRE re-reads up to three octal
-digits following the backslash, and uses them to generate a data character. Any
-subsequent digits stand for themselves. In non-UTF-8 mode, the value of a
-character specified in octal must be less than \e400. In UTF-8 mode, values up
-to \e777 are permitted. For example:
-.sp
- \e040 is another way of writing a space
-.\" JOIN
- \e40 is the same, provided there are fewer than 40
- previous capturing subpatterns
- \e7 is always a back reference
-.\" JOIN
- \e11 might be a back reference, or another way of
- writing a tab
- \e011 is always a tab
- \e0113 is a tab followed by the character "3"
-.\" JOIN
- \e113 might be a back reference, otherwise the
- character with octal code 113
-.\" JOIN
- \e377 might be a back reference, otherwise
- the byte consisting entirely of 1 bits
-.\" JOIN
- \e81 is either a back reference, or a binary zero
- followed by the two characters "8" and "1"
-.sp
-Note that octal values of 100 or greater must not be introduced by a leading
-zero, because no more than three octal digits are ever read.
-.P
-All the sequences that define a single character value can be used both inside
-and outside character classes. In addition, inside a character class, the
-sequence \eb is interpreted as the backspace character (hex 08), and the
-sequences \eR and \eX are interpreted as the characters "R" and "X",
-respectively. Outside a character class, these sequences have different
-meanings
-.\" HTML
-.\"
-(see below).
-.\"
-.
-.
-.SS "Absolute and relative back references"
-.rs
-.sp
-The sequence \eg followed by an unsigned or a negative number, optionally
-enclosed in braces, is an absolute or relative back reference. A named back
-reference can be coded as \eg{name}. Back references are discussed
-.\" HTML
-.\"
-later,
-.\"
-following the discussion of
-.\" HTML
-.\"
-parenthesized subpatterns.
-.\"
-.
-.
-.SS "Absolute and relative subroutine calls"
-.rs
-.sp
-For compatibility with Oniguruma, the non-Perl syntax \eg followed by a name or
-a number enclosed either in angle brackets or single quotes, is an alternative
-syntax for referencing a subpattern as a "subroutine". Details are discussed
-.\" HTML
-.\"
-later.
-.\"
-Note that \eg{...} (Perl syntax) and \eg<...> (Oniguruma syntax) are \fInot\fP
-synonymous. The former is a back reference; the latter is a subroutine call.
-.
-.
-.SS "Generic character types"
-.rs
-.sp
-Another use of backslash is for specifying generic character types. The
-following are always recognized:
-.sp
- \ed any decimal digit
- \eD any character that is not a decimal digit
- \eh any horizontal whitespace character
- \eH any character that is not a horizontal whitespace character
- \es any whitespace character
- \eS any character that is not a whitespace character
- \ev any vertical whitespace character
- \eV any character that is not a vertical whitespace character
- \ew any "word" character
- \eW any "non-word" character
-.sp
-Each pair of escape sequences partitions the complete set of characters into
-two disjoint sets. Any given character matches one, and only one, of each pair.
-.P
-These character type sequences can appear both inside and outside character
-classes. They each match one character of the appropriate type. If the current
-matching point is at the end of the subject string, all of them fail, since
-there is no character to match.
-.P
-For compatibility with Perl, \es does not match the VT character (code 11).
-This makes it different from the the POSIX "space" class. The \es characters
-are HT (9), LF (10), FF (12), CR (13), and space (32). If "use locale;" is
-included in a Perl script, \es may match the VT character. In PCRE, it never
-does.
-.P
-In UTF-8 mode, characters with values greater than 128 never match \ed, \es, or
-\ew, and always match \eD, \eS, and \eW. This is true even when Unicode
-character property support is available. These sequences retain their original
-meanings from before UTF-8 support was available, mainly for efficiency
-reasons. Note that this also affects \eb, because it is defined in terms of \ew
-and \eW.
-.P
-The sequences \eh, \eH, \ev, and \eV are Perl 5.10 features. In contrast to the
-other sequences, these do match certain high-valued codepoints in UTF-8 mode.
-The horizontal space characters are:
-.sp
- U+0009 Horizontal tab
- U+0020 Space
- U+00A0 Non-break space
- U+1680 Ogham space mark
- U+180E Mongolian vowel separator
- U+2000 En quad
- U+2001 Em quad
- U+2002 En space
- U+2003 Em space
- U+2004 Three-per-em space
- U+2005 Four-per-em space
- U+2006 Six-per-em space
- U+2007 Figure space
- U+2008 Punctuation space
- U+2009 Thin space
- U+200A Hair space
- U+202F Narrow no-break space
- U+205F Medium mathematical space
- U+3000 Ideographic space
-.sp
-The vertical space characters are:
-.sp
- U+000A Linefeed
- U+000B Vertical tab
- U+000C Formfeed
- U+000D Carriage return
- U+0085 Next line
- U+2028 Line separator
- U+2029 Paragraph separator
-.P
-A "word" character is an underscore or any character less than 256 that is a
-letter or digit. The definition of letters and digits is controlled by PCRE's
-low-valued character tables, and may vary if locale-specific matching is taking
-place (see
-.\" HTML
-.\"
-"Locale support"
-.\"
-in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page). For example, in a French locale such as "fr_FR" in Unix-like systems,
-or "french" in Windows, some character codes greater than 128 are used for
-accented letters, and these are matched by \ew. The use of locales with Unicode
-is discouraged.
-.
-.
-.\" HTML
-.SS "Newline sequences"
-.rs
-.sp
-Outside a character class, by default, the escape sequence \eR matches any
-Unicode newline sequence. This is a Perl 5.10 feature. In non-UTF-8 mode \eR is
-equivalent to the following:
-.sp
- (?>\er\en|\en|\ex0b|\ef|\er|\ex85)
-.sp
-This is an example of an "atomic group", details of which are given
-.\" HTML
-.\"
-below.
-.\"
-This particular group matches either the two-character sequence CR followed by
-LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab,
-U+000B), FF (formfeed, U+000C), CR (carriage return, U+000D), or NEL (next
-line, U+0085). The two-character sequence is treated as a single unit that
-cannot be split.
-.P
-In UTF-8 mode, two additional characters whose codepoints are greater than 255
-are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029).
-Unicode character property support is not needed for these characters to be
-recognized.
-.P
-It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the
-complete set of Unicode line endings) by setting the option PCRE_BSR_ANYCRLF
-either at compile time or when the pattern is matched. (BSR is an abbrevation
-for "backslash R".) This can be made the default when PCRE is built; if this is
-the case, the other behaviour can be requested via the PCRE_BSR_UNICODE option.
-It is also possible to specify these settings by starting a pattern string with
-one of the following sequences:
-.sp
- (*BSR_ANYCRLF) CR, LF, or CRLF only
- (*BSR_UNICODE) any Unicode newline sequence
-.sp
-These override the default and the options given to \fBpcre_compile()\fP, but
-they can be overridden by options given to \fBpcre_exec()\fP. Note that these
-special settings, which are not Perl-compatible, are recognized only at the
-very start of a pattern, and that they must be in upper case. If more than one
-of them is present, the last one is used. They can be combined with a change of
-newline convention, for example, a pattern can start with:
-.sp
- (*ANY)(*BSR_ANYCRLF)
-.sp
-Inside a character class, \eR matches the letter "R".
-.
-.
-.\" HTML
-.SS Unicode character properties
-.rs
-.sp
-When PCRE is built with Unicode character property support, three additional
-escape sequences that match characters with specific properties are available.
-When not in UTF-8 mode, these sequences are of course limited to testing
-characters whose codepoints are less than 256, but they do work in this mode.
-The extra escape sequences are:
-.sp
- \ep{\fIxx\fP} a character with the \fIxx\fP property
- \eP{\fIxx\fP} a character without the \fIxx\fP property
- \eX an extended Unicode sequence
-.sp
-The property names represented by \fIxx\fP above are limited to the Unicode
-script names, the general category properties, and "Any", which matches any
-character (including newline). Other properties such as "InMusicalSymbols" are
-not currently supported by PCRE. Note that \eP{Any} does not match any
-characters, so always causes a match failure.
-.P
-Sets of Unicode characters are defined as belonging to certain scripts. A
-character from one of these sets can be matched using a script name. For
-example:
-.sp
- \ep{Greek}
- \eP{Han}
-.sp
-Those that are not part of an identified script are lumped together as
-"Common". The current list of scripts is:
-.P
-Arabic,
-Armenian,
-Balinese,
-Bengali,
-Bopomofo,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Cherokee,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Greek,
-Gujarati,
-Gurmukhi,
-Han,
-Hangul,
-Hanunoo,
-Hebrew,
-Hiragana,
-Inherited,
-Kannada,
-Katakana,
-Kharoshthi,
-Khmer,
-Lao,
-Latin,
-Limbu,
-Linear_B,
-Malayalam,
-Mongolian,
-Myanmar,
-New_Tai_Lue,
-Nko,
-Ogham,
-Old_Italic,
-Old_Persian,
-Oriya,
-Osmanya,
-Phags_Pa,
-Phoenician,
-Runic,
-Shavian,
-Sinhala,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tamil,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Ugaritic,
-Yi.
-.P
-Each character has exactly one general category property, specified by a
-two-letter abbreviation. For compatibility with Perl, negation can be specified
-by including a circumflex between the opening brace and the property name. For
-example, \ep{^Lu} is the same as \eP{Lu}.
-.P
-If only one letter is specified with \ep or \eP, it includes all the general
-category properties that start with that letter. In this case, in the absence
-of negation, the curly brackets in the escape sequence are optional; these two
-examples have the same effect:
-.sp
- \ep{L}
- \epL
-.sp
-The following general category property codes are supported:
-.sp
- C Other
- Cc Control
- Cf Format
- Cn Unassigned
- Co Private use
- Cs Surrogate
-.sp
- L Letter
- Ll Lower case letter
- Lm Modifier letter
- Lo Other letter
- Lt Title case letter
- Lu Upper case letter
-.sp
- M Mark
- Mc Spacing mark
- Me Enclosing mark
- Mn Non-spacing mark
-.sp
- N Number
- Nd Decimal number
- Nl Letter number
- No Other number
-.sp
- P Punctuation
- Pc Connector punctuation
- Pd Dash punctuation
- Pe Close punctuation
- Pf Final punctuation
- Pi Initial punctuation
- Po Other punctuation
- Ps Open punctuation
-.sp
- S Symbol
- Sc Currency symbol
- Sk Modifier symbol
- Sm Mathematical symbol
- So Other symbol
-.sp
- Z Separator
- Zl Line separator
- Zp Paragraph separator
- Zs Space separator
-.sp
-The special property L& is also supported: it matches a character that has
-the Lu, Ll, or Lt property, in other words, a letter that is not classified as
-a modifier or "other".
-.P
-The Cs (Surrogate) property applies only to characters in the range U+D800 to
-U+DFFF. Such characters are not valid in UTF-8 strings (see RFC 3629) and so
-cannot be tested by PCRE, unless UTF-8 validity checking has been turned off
-(see the discussion of PCRE_NO_UTF8_CHECK in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-page).
-.P
-The long synonyms for these properties that Perl supports (such as \ep{Letter})
-are not supported by PCRE, nor is it permitted to prefix any of these
-properties with "Is".
-.P
-No character that is in the Unicode table has the Cn (unassigned) property.
-Instead, this property is assumed for any code point that is not in the
-Unicode table.
-.P
-Specifying caseless matching does not affect these escape sequences. For
-example, \ep{Lu} always matches only upper case letters.
-.P
-The \eX escape matches any number of Unicode characters that form an extended
-Unicode sequence. \eX is equivalent to
-.sp
- (?>\ePM\epM*)
-.sp
-That is, it matches a character without the "mark" property, followed by zero
-or more characters with the "mark" property, and treats the sequence as an
-atomic group
-.\" HTML
-.\"
-(see below).
-.\"
-Characters with the "mark" property are typically accents that affect the
-preceding character. None of them have codepoints less than 256, so in
-non-UTF-8 mode \eX matches any one character.
-.P
-Matching characters by Unicode property is not fast, because PCRE has to search
-a structure that contains data for over fifteen thousand characters. That is
-why the traditional escape sequences such as \ed and \ew do not use Unicode
-properties in PCRE.
-.
-.
-.\" HTML
-.SS "Resetting the match start"
-.rs
-.sp
-The escape sequence \eK, which is a Perl 5.10 feature, causes any previously
-matched characters not to be included in the final matched sequence. For
-example, the pattern:
-.sp
- foo\eKbar
-.sp
-matches "foobar", but reports that it has matched "bar". This feature is
-similar to a lookbehind assertion
-.\" HTML
-.\"
-(described below).
-.\"
-However, in this case, the part of the subject before the real match does not
-have to be of fixed length, as lookbehind assertions do. The use of \eK does
-not interfere with the setting of
-.\" HTML
-.\"
-captured substrings.
-.\"
-For example, when the pattern
-.sp
- (foo)\eKbar
-.sp
-matches "foobar", the first substring is still set to "foo".
-.
-.
-.\" HTML
-.SS "Simple assertions"
-.rs
-.sp
-The final use of backslash is for certain simple assertions. An assertion
-specifies a condition that has to be met at a particular point in a match,
-without consuming any characters from the subject string. The use of
-subpatterns for more complicated assertions is described
-.\" HTML
-.\"
-below.
-.\"
-The backslashed assertions are:
-.sp
- \eb matches at a word boundary
- \eB matches when not at a word boundary
- \eA matches at the start of the subject
- \eZ matches at the end of the subject
- also matches before a newline at the end of the subject
- \ez matches only at the end of the subject
- \eG matches at the first matching position in the subject
-.sp
-These assertions may not appear in character classes (but note that \eb has a
-different meaning, namely the backspace character, inside a character class).
-.P
-A word boundary is a position in the subject string where the current character
-and the previous character do not both match \ew or \eW (i.e. one matches
-\ew and the other matches \eW), or the start or end of the string if the
-first or last character matches \ew, respectively.
-.P
-The \eA, \eZ, and \ez assertions differ from the traditional circumflex and
-dollar (described in the next section) in that they only ever match at the very
-start and end of the subject string, whatever options are set. Thus, they are
-independent of multiline mode. These three assertions are not affected by the
-PCRE_NOTBOL or PCRE_NOTEOL options, which affect only the behaviour of the
-circumflex and dollar metacharacters. However, if the \fIstartoffset\fP
-argument of \fBpcre_exec()\fP is non-zero, indicating that matching is to start
-at a point other than the beginning of the subject, \eA can never match. The
-difference between \eZ and \ez is that \eZ matches before a newline at the end
-of the string as well as at the very end, whereas \ez matches only at the end.
-.P
-The \eG assertion is true only when the current matching position is at the
-start point of the match, as specified by the \fIstartoffset\fP argument of
-\fBpcre_exec()\fP. It differs from \eA when the value of \fIstartoffset\fP is
-non-zero. By calling \fBpcre_exec()\fP multiple times with appropriate
-arguments, you can mimic Perl's /g option, and it is in this kind of
-implementation where \eG can be useful.
-.P
-Note, however, that PCRE's interpretation of \eG, as the start of the current
-match, is subtly different from Perl's, which defines it as the end of the
-previous match. In Perl, these can be different when the previously matched
-string was empty. Because PCRE does just one match at a time, it cannot
-reproduce this behaviour.
-.P
-If all the alternatives of a pattern begin with \eG, the expression is anchored
-to the starting match position, and the "anchored" flag is set in the compiled
-regular expression.
-.
-.
-.SH "CIRCUMFLEX AND DOLLAR"
-.rs
-.sp
-Outside a character class, in the default matching mode, the circumflex
-character is an assertion that is true only if the current matching point is
-at the start of the subject string. If the \fIstartoffset\fP argument of
-\fBpcre_exec()\fP is non-zero, circumflex can never match if the PCRE_MULTILINE
-option is unset. Inside a character class, circumflex has an entirely different
-meaning
-.\" HTML
-.\"
-(see below).
-.\"
-.P
-Circumflex need not be the first character of the pattern if a number of
-alternatives are involved, but it should be the first thing in each alternative
-in which it appears if the pattern is ever to match that branch. If all
-possible alternatives start with a circumflex, that is, if the pattern is
-constrained to match only at the start of the subject, it is said to be an
-"anchored" pattern. (There are also other constructs that can cause a pattern
-to be anchored.)
-.P
-A dollar character is an assertion that is true only if the current matching
-point is at the end of the subject string, or immediately before a newline
-at the end of the string (by default). Dollar need not be the last character of
-the pattern if a number of alternatives are involved, but it should be the last
-item in any branch in which it appears. Dollar has no special meaning in a
-character class.
-.P
-The meaning of dollar can be changed so that it matches only at the very end of
-the string, by setting the PCRE_DOLLAR_ENDONLY option at compile time. This
-does not affect the \eZ assertion.
-.P
-The meanings of the circumflex and dollar characters are changed if the
-PCRE_MULTILINE option is set. When this is the case, a circumflex matches
-immediately after internal newlines as well as at the start of the subject
-string. It does not match after a newline that ends the string. A dollar
-matches before any newlines in the string, as well as at the very end, when
-PCRE_MULTILINE is set. When newline is specified as the two-character
-sequence CRLF, isolated CR and LF characters do not indicate newlines.
-.P
-For example, the pattern /^abc$/ matches the subject string "def\enabc" (where
-\en represents a newline) in multiline mode, but not otherwise. Consequently,
-patterns that are anchored in single line mode because all branches start with
-^ are not anchored in multiline mode, and a match for circumflex is possible
-when the \fIstartoffset\fP argument of \fBpcre_exec()\fP is non-zero. The
-PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
-.P
-Note that the sequences \eA, \eZ, and \ez can be used to match the start and
-end of the subject in both modes, and if all branches of a pattern start with
-\eA it is always anchored, whether or not PCRE_MULTILINE is set.
-.
-.
-.SH "FULL STOP (PERIOD, DOT)"
-.rs
-.sp
-Outside a character class, a dot in the pattern matches any one character in
-the subject string except (by default) a character that signifies the end of a
-line. In UTF-8 mode, the matched character may be more than one byte long.
-.P
-When a line ending is defined as a single character, dot never matches that
-character; when the two-character sequence CRLF is used, dot does not match CR
-if it is immediately followed by LF, but otherwise it matches all characters
-(including isolated CRs and LFs). When any Unicode line endings are being
-recognized, dot does not match CR or LF or any of the other line ending
-characters.
-.P
-The behaviour of dot with regard to newlines can be changed. If the PCRE_DOTALL
-option is set, a dot matches any one character, without exception. If the
-two-character sequence CRLF is present in the subject string, it takes two dots
-to match it.
-.P
-The handling of dot is entirely independent of the handling of circumflex and
-dollar, the only relationship being that they both involve newlines. Dot has no
-special meaning in a character class.
-.
-.
-.SH "MATCHING A SINGLE BYTE"
-.rs
-.sp
-Outside a character class, the escape sequence \eC matches any one byte, both
-in and out of UTF-8 mode. Unlike a dot, it always matches any line-ending
-characters. The feature is provided in Perl in order to match individual bytes
-in UTF-8 mode. Because it breaks up UTF-8 characters into individual bytes,
-what remains in the string may be a malformed UTF-8 string. For this reason,
-the \eC escape sequence is best avoided.
-.P
-PCRE does not allow \eC to appear in lookbehind assertions
-.\" HTML
-.\"
-(described below),
-.\"
-because in UTF-8 mode this would make it impossible to calculate the length of
-the lookbehind.
-.
-.
-.\" HTML
-.SH "SQUARE BRACKETS AND CHARACTER CLASSES"
-.rs
-.sp
-An opening square bracket introduces a character class, terminated by a closing
-square bracket. A closing square bracket on its own is not special. If a
-closing square bracket is required as a member of the class, it should be the
-first data character in the class (after an initial circumflex, if present) or
-escaped with a backslash.
-.P
-A character class matches a single character in the subject. In UTF-8 mode, the
-character may occupy more than one byte. A matched character must be in the set
-of characters defined by the class, unless the first character in the class
-definition is a circumflex, in which case the subject character must not be in
-the set defined by the class. If a circumflex is actually required as a member
-of the class, ensure it is not the first character, or escape it with a
-backslash.
-.P
-For example, the character class [aeiou] matches any lower case vowel, while
-[^aeiou] matches any character that is not a lower case vowel. Note that a
-circumflex is just a convenient notation for specifying the characters that
-are in the class by enumerating those that are not. A class that starts with a
-circumflex is not an assertion: it still consumes a character from the subject
-string, and therefore it fails if the current pointer is at the end of the
-string.
-.P
-In UTF-8 mode, characters with values greater than 255 can be included in a
-class as a literal string of bytes, or by using the \ex{ escaping mechanism.
-.P
-When caseless matching is set, any letters in a class represent both their
-upper case and lower case versions, so for example, a caseless [aeiou] matches
-"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a
-caseful version would. In UTF-8 mode, PCRE always understands the concept of
-case for characters whose values are less than 128, so caseless matching is
-always possible. For characters with higher values, the concept of case is
-supported if PCRE is compiled with Unicode property support, but not otherwise.
-If you want to use caseless matching for characters 128 and above, you must
-ensure that PCRE is compiled with Unicode property support as well as with
-UTF-8 support.
-.P
-Characters that might indicate line breaks are never treated in any special way
-when matching character classes, whatever line-ending sequence is in use, and
-whatever setting of the PCRE_DOTALL and PCRE_MULTILINE options is used. A class
-such as [^a] always matches one of these characters.
-.P
-The minus (hyphen) character can be used to specify a range of characters in a
-character class. For example, [d-m] matches any letter between d and m,
-inclusive. If a minus character is required in a class, it must be escaped with
-a backslash or appear in a position where it cannot be interpreted as
-indicating a range, typically as the first or last character in the class.
-.P
-It is not possible to have the literal character "]" as the end character of a
-range. A pattern such as [W-]46] is interpreted as a class of two characters
-("W" and "-") followed by a literal string "46]", so it would match "W46]" or
-"-46]". However, if the "]" is escaped with a backslash it is interpreted as
-the end of range, so [W-\e]46] is interpreted as a class containing a range
-followed by two other characters. The octal or hexadecimal representation of
-"]" can also be used to end a range.
-.P
-Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\e000-\e037]. In UTF-8
-mode, ranges can include characters whose values are greater than 255, for
-example [\ex{100}-\ex{2ff}].
-.P
-If a range that includes letters is used when caseless matching is set, it
-matches the letters in either case. For example, [W-c] is equivalent to
-[][\e\e^_`wxyzabc], matched caselessly, and in non-UTF-8 mode, if character
-tables for a French locale are in use, [\exc8-\excb] matches accented E
-characters in both cases. In UTF-8 mode, PCRE supports the concept of case for
-characters with values greater than 128 only when it is compiled with Unicode
-property support.
-.P
-The character types \ed, \eD, \ep, \eP, \es, \eS, \ew, and \eW may also appear
-in a character class, and add the characters that they match to the class. For
-example, [\edABCDEF] matches any hexadecimal digit. A circumflex can
-conveniently be used with the upper case character types to specify a more
-restricted set of characters than the matching lower case type. For example,
-the class [^\eW_] matches any letter or digit, but not underscore.
-.P
-The only metacharacters that are recognized in character classes are backslash,
-hyphen (only where it can be interpreted as specifying a range), circumflex
-(only at the start), opening square bracket (only when it can be interpreted as
-introducing a POSIX class name - see the next section), and the terminating
-closing square bracket. However, escaping other non-alphanumeric characters
-does no harm.
-.
-.
-.SH "POSIX CHARACTER CLASSES"
-.rs
-.sp
-Perl supports the POSIX notation for character classes. This uses names
-enclosed by [: and :] within the enclosing square brackets. PCRE also supports
-this notation. For example,
-.sp
- [01[:alpha:]%]
-.sp
-matches "0", "1", any alphabetic character, or "%". The supported class names
-are
-.sp
- alnum letters and digits
- alpha letters
- ascii character codes 0 - 127
- blank space or tab only
- cntrl control characters
- digit decimal digits (same as \ed)
- graph printing characters, excluding space
- lower lower case letters
- print printing characters, including space
- punct printing characters, excluding letters and digits
- space white space (not quite the same as \es)
- upper upper case letters
- word "word" characters (same as \ew)
- xdigit hexadecimal digits
-.sp
-The "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), and
-space (32). Notice that this list includes the VT character (code 11). This
-makes "space" different to \es, which does not include VT (for Perl
-compatibility).
-.P
-The name "word" is a Perl extension, and "blank" is a GNU extension from Perl
-5.8. Another Perl extension is negation, which is indicated by a ^ character
-after the colon. For example,
-.sp
- [12[:^digit:]]
-.sp
-matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
-syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
-supported, and an error is given if they are encountered.
-.P
-In UTF-8 mode, characters with values greater than 128 do not match any of
-the POSIX character classes.
-.
-.
-.SH "VERTICAL BAR"
-.rs
-.sp
-Vertical bar characters are used to separate alternative patterns. For example,
-the pattern
-.sp
- gilbert|sullivan
-.sp
-matches either "gilbert" or "sullivan". Any number of alternatives may appear,
-and an empty alternative is permitted (matching the empty string). The matching
-process tries each alternative in turn, from left to right, and the first one
-that succeeds is used. If the alternatives are within a subpattern
-.\" HTML
-.\"
-(defined below),
-.\"
-"succeeds" means matching the rest of the main pattern as well as the
-alternative in the subpattern.
-.
-.
-.SH "INTERNAL OPTION SETTING"
-.rs
-.sp
-The settings of the PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and
-PCRE_EXTENDED options (which are Perl-compatible) can be changed from within
-the pattern by a sequence of Perl option letters enclosed between "(?" and ")".
-The option letters are
-.sp
- i for PCRE_CASELESS
- m for PCRE_MULTILINE
- s for PCRE_DOTALL
- x for PCRE_EXTENDED
-.sp
-For example, (?im) sets caseless, multiline matching. It is also possible to
-unset these options by preceding the letter with a hyphen, and a combined
-setting and unsetting such as (?im-sx), which sets PCRE_CASELESS and
-PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, is also
-permitted. If a letter appears both before and after the hyphen, the option is
-unset.
-.P
-The PCRE-specific options PCRE_DUPNAMES, PCRE_UNGREEDY, and PCRE_EXTRA can be
-changed in the same way as the Perl-compatible options by using the characters
-J, U and X respectively.
-.P
-When one of these option changes occurs at top level (that is, not inside
-subpattern parentheses), the change applies to the remainder of the pattern
-that follows. If the change is placed right at the start of a pattern, PCRE
-extracts it into the global options (and it will therefore show up in data
-extracted by the \fBpcre_fullinfo()\fP function).
-.P
-An option change within a subpattern (see below for a description of
-subpatterns) affects only that part of the current pattern that follows it, so
-.sp
- (a(?i)b)c
-.sp
-matches abc and aBc and no other strings (assuming PCRE_CASELESS is not used).
-By this means, options can be made to have different settings in different
-parts of the pattern. Any changes made in one alternative do carry on
-into subsequent branches within the same subpattern. For example,
-.sp
- (a(?i)b|c)
-.sp
-matches "ab", "aB", "c", and "C", even though when matching "C" the first
-branch is abandoned before the option setting. This is because the effects of
-option settings happen at compile time. There would be some very weird
-behaviour otherwise.
-.P
-\fBNote:\fP There are other PCRE-specific options that can be set by the
-application when the compile or match functions are called. In some cases the
-pattern can contain special leading sequences such as (*CRLF) to override what
-the application has set or what has been defaulted. Details are given in the
-section entitled
-.\" HTML
-.\"
-"Newline sequences"
-.\"
-above. There is also the (*UTF8) leading sequence that can be used to set UTF-8
-mode; this is equivalent to setting the PCRE_UTF8 option.
-.
-.
-.\" HTML
-.SH SUBPATTERNS
-.rs
-.sp
-Subpatterns are delimited by parentheses (round brackets), which can be nested.
-Turning part of a pattern into a subpattern does two things:
-.sp
-1. It localizes a set of alternatives. For example, the pattern
-.sp
- cat(aract|erpillar|)
-.sp
-matches one of the words "cat", "cataract", or "caterpillar". Without the
-parentheses, it would match "cataract", "erpillar" or an empty string.
-.sp
-2. It sets up the subpattern as a capturing subpattern. This means that, when
-the whole pattern matches, that portion of the subject string that matched the
-subpattern is passed back to the caller via the \fIovector\fP argument of
-\fBpcre_exec()\fP. Opening parentheses are counted from left to right (starting
-from 1) to obtain numbers for the capturing subpatterns.
-.P
-For example, if the string "the red king" is matched against the pattern
-.sp
- the ((red|white) (king|queen))
-.sp
-the captured substrings are "red king", "red", and "king", and are numbered 1,
-2, and 3, respectively.
-.P
-The fact that plain parentheses fulfil two functions is not always helpful.
-There are often times when a grouping subpattern is required without a
-capturing requirement. If an opening parenthesis is followed by a question mark
-and a colon, the subpattern does not do any capturing, and is not counted when
-computing the number of any subsequent capturing subpatterns. For example, if
-the string "the white queen" is matched against the pattern
-.sp
- the ((?:red|white) (king|queen))
-.sp
-the captured substrings are "white queen" and "queen", and are numbered 1 and
-2. The maximum number of capturing subpatterns is 65535.
-.P
-As a convenient shorthand, if any option settings are required at the start of
-a non-capturing subpattern, the option letters may appear between the "?" and
-the ":". Thus the two patterns
-.sp
- (?i:saturday|sunday)
- (?:(?i)saturday|sunday)
-.sp
-match exactly the same set of strings. Because alternative branches are tried
-from left to right, and options are not reset until the end of the subpattern
-is reached, an option setting in one branch does affect subsequent branches, so
-the above patterns match "SUNDAY" as well as "Saturday".
-.
-.
-.SH "DUPLICATE SUBPATTERN NUMBERS"
-.rs
-.sp
-Perl 5.10 introduced a feature whereby each alternative in a subpattern uses
-the same numbers for its capturing parentheses. Such a subpattern starts with
-(?| and is itself a non-capturing subpattern. For example, consider this
-pattern:
-.sp
- (?|(Sat)ur|(Sun))day
-.sp
-Because the two alternatives are inside a (?| group, both sets of capturing
-parentheses are numbered one. Thus, when the pattern matches, you can look
-at captured substring number one, whichever alternative matched. This construct
-is useful when you want to capture part, but not all, of one of a number of
-alternatives. Inside a (?| group, parentheses are numbered as usual, but the
-number is reset at the start of each branch. The numbers of any capturing
-buffers that follow the subpattern start after the highest number used in any
-branch. The following example is taken from the Perl documentation.
-The numbers underneath show in which buffer the captured content will be
-stored.
-.sp
- # before ---------------branch-reset----------- after
- / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
- # 1 2 2 3 2 3 4
-.sp
-A backreference or a recursive call to a numbered subpattern always refers to
-the first one in the pattern with the given number.
-.P
-An alternative approach to using this "branch reset" feature is to use
-duplicate named subpatterns, as described in the next section.
-.
-.
-.SH "NAMED SUBPATTERNS"
-.rs
-.sp
-Identifying capturing parentheses by number is simple, but it can be very hard
-to keep track of the numbers in complicated regular expressions. Furthermore,
-if an expression is modified, the numbers may change. To help with this
-difficulty, PCRE supports the naming of subpatterns. This feature was not
-added to Perl until release 5.10. Python had the feature earlier, and PCRE
-introduced it at release 4.0, using the Python syntax. PCRE now supports both
-the Perl and the Python syntax.
-.P
-In PCRE, a subpattern can be named in one of three ways: (?...) or
-(?'name'...) as in Perl, or (?P...) as in Python. References to capturing
-parentheses from other parts of the pattern, such as
-.\" HTML
-.\"
-backreferences,
-.\"
-.\" HTML
-.\"
-recursion,
-.\"
-and
-.\" HTML
-.\"
-conditions,
-.\"
-can be made by name as well as by number.
-.P
-Names consist of up to 32 alphanumeric characters and underscores. Named
-capturing parentheses are still allocated numbers as well as names, exactly as
-if the names were not present. The PCRE API provides function calls for
-extracting the name-to-number translation table from a compiled pattern. There
-is also a convenience function for extracting a captured substring by name.
-.P
-By default, a name must be unique within a pattern, but it is possible to relax
-this constraint by setting the PCRE_DUPNAMES option at compile time. This can
-be useful for patterns where only one instance of the named parentheses can
-match. Suppose you want to match the name of a weekday, either as a 3-letter
-abbreviation or as the full name, and in both cases you want to extract the
-abbreviation. This pattern (ignoring the line breaks) does the job:
-.sp
- (?Mon|Fri|Sun)(?:day)?|
- (?Tue)(?:sday)?|
- (?Wed)(?:nesday)?|
- (?Thu)(?:rsday)?|
- (?Sat)(?:urday)?
-.sp
-There are five capturing substrings, but only one is ever set after a match.
-(An alternative way of solving this problem is to use a "branch reset"
-subpattern, as described in the previous section.)
-.P
-The convenience function for extracting the data by name returns the substring
-for the first (and in this example, the only) subpattern of that name that
-matched. This saves searching to find which numbered subpattern it was. If you
-make a reference to a non-unique named subpattern from elsewhere in the
-pattern, the one that corresponds to the lowest number is used. For further
-details of the interfaces for handling named subpatterns, see the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation.
-.P
-\fBWarning:\fP You cannot use different names to distinguish between two
-subpatterns with the same number (see the previous section) because PCRE uses
-only the numbers when matching.
-.
-.
-.SH REPETITION
-.rs
-.sp
-Repetition is specified by quantifiers, which can follow any of the following
-items:
-.sp
- a literal data character
- the dot metacharacter
- the \eC escape sequence
- the \eX escape sequence (in UTF-8 mode with Unicode properties)
- the \eR escape sequence
- an escape such as \ed that matches a single character
- a character class
- a back reference (see next section)
- a parenthesized subpattern (unless it is an assertion)
-.sp
-The general repetition quantifier specifies a minimum and maximum number of
-permitted matches, by giving the two numbers in curly brackets (braces),
-separated by a comma. The numbers must be less than 65536, and the first must
-be less than or equal to the second. For example:
-.sp
- z{2,4}
-.sp
-matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
-character. If the second number is omitted, but the comma is present, there is
-no upper limit; if the second number and the comma are both omitted, the
-quantifier specifies an exact number of required matches. Thus
-.sp
- [aeiou]{3,}
-.sp
-matches at least 3 successive vowels, but may match many more, while
-.sp
- \ed{8}
-.sp
-matches exactly 8 digits. An opening curly bracket that appears in a position
-where a quantifier is not allowed, or one that does not match the syntax of a
-quantifier, is taken as a literal character. For example, {,6} is not a
-quantifier, but a literal string of four characters.
-.P
-In UTF-8 mode, quantifiers apply to UTF-8 characters rather than to individual
-bytes. Thus, for example, \ex{100}{2} matches two UTF-8 characters, each of
-which is represented by a two-byte sequence. Similarly, when Unicode property
-support is available, \eX{3} matches three Unicode extended sequences, each of
-which may be several bytes long (and they may be of different lengths).
-.P
-The quantifier {0} is permitted, causing the expression to behave as if the
-previous item and the quantifier were not present. This may be useful for
-subpatterns that are referenced as
-.\" HTML
-.\"
-subroutines
-.\"
-from elsewhere in the pattern. Items other than subpatterns that have a {0}
-quantifier are omitted from the compiled pattern.
-.P
-For convenience, the three most common quantifiers have single-character
-abbreviations:
-.sp
- * is equivalent to {0,}
- + is equivalent to {1,}
- ? is equivalent to {0,1}
-.sp
-It is possible to construct infinite loops by following a subpattern that can
-match no characters with a quantifier that has no upper limit, for example:
-.sp
- (a?)*
-.sp
-Earlier versions of Perl and PCRE used to give an error at compile time for
-such patterns. However, because there are cases where this can be useful, such
-patterns are now accepted, but if any repetition of the subpattern does in fact
-match no characters, the loop is forcibly broken.
-.P
-By default, the quantifiers are "greedy", that is, they match as much as
-possible (up to the maximum number of permitted times), without causing the
-rest of the pattern to fail. The classic example of where this gives problems
-is in trying to match comments in C programs. These appear between /* and */
-and within the comment, individual * and / characters may appear. An attempt to
-match C comments by applying the pattern
-.sp
- /\e*.*\e*/
-.sp
-to the string
-.sp
- /* first comment */ not comment /* second comment */
-.sp
-fails, because it matches the entire string owing to the greediness of the .*
-item.
-.P
-However, if a quantifier is followed by a question mark, it ceases to be
-greedy, and instead matches the minimum number of times possible, so the
-pattern
-.sp
- /\e*.*?\e*/
-.sp
-does the right thing with the C comments. The meaning of the various
-quantifiers is not otherwise changed, just the preferred number of matches.
-Do not confuse this use of question mark with its use as a quantifier in its
-own right. Because it has two uses, it can sometimes appear doubled, as in
-.sp
- \ed??\ed
-.sp
-which matches one digit by preference, but can match two if that is the only
-way the rest of the pattern matches.
-.P
-If the PCRE_UNGREEDY option is set (an option that is not available in Perl),
-the quantifiers are not greedy by default, but individual ones can be made
-greedy by following them with a question mark. In other words, it inverts the
-default behaviour.
-.P
-When a parenthesized subpattern is quantified with a minimum repeat count that
-is greater than 1 or with a limited maximum, more memory is required for the
-compiled pattern, in proportion to the size of the minimum or maximum.
-.P
-If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
-to Perl's /s) is set, thus allowing the dot to match newlines, the pattern is
-implicitly anchored, because whatever follows will be tried against every
-character position in the subject string, so there is no point in retrying the
-overall match at any position after the first. PCRE normally treats such a
-pattern as though it were preceded by \eA.
-.P
-In cases where it is known that the subject string contains no newlines, it is
-worth setting PCRE_DOTALL in order to obtain this optimization, or
-alternatively using ^ to indicate anchoring explicitly.
-.P
-However, there is one situation where the optimization cannot be used. When .*
-is inside capturing parentheses that are the subject of a backreference
-elsewhere in the pattern, a match at the start may fail where a later one
-succeeds. Consider, for example:
-.sp
- (.*)abc\e1
-.sp
-If the subject is "xyz123abc123" the match point is the fourth character. For
-this reason, such a pattern is not implicitly anchored.
-.P
-When a capturing subpattern is repeated, the value captured is the substring
-that matched the final iteration. For example, after
-.sp
- (tweedle[dume]{3}\es*)+
-.sp
-has matched "tweedledum tweedledee" the value of the captured substring is
-"tweedledee". However, if there are nested capturing subpatterns, the
-corresponding captured values may have been set in previous iterations. For
-example, after
-.sp
- /(a|(b))+/
-.sp
-matches "aba" the value of the second captured substring is "b".
-.
-.
-.\" HTML
-.SH "ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS"
-.rs
-.sp
-With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy")
-repetition, failure of what follows normally causes the repeated item to be
-re-evaluated to see if a different number of repeats allows the rest of the
-pattern to match. Sometimes it is useful to prevent this, either to change the
-nature of the match, or to cause it fail earlier than it otherwise might, when
-the author of the pattern knows there is no point in carrying on.
-.P
-Consider, for example, the pattern \ed+foo when applied to the subject line
-.sp
- 123456bar
-.sp
-After matching all 6 digits and then failing to match "foo", the normal
-action of the matcher is to try again with only 5 digits matching the \ed+
-item, and then with 4, and so on, before ultimately failing. "Atomic grouping"
-(a term taken from Jeffrey Friedl's book) provides the means for specifying
-that once a subpattern has matched, it is not to be re-evaluated in this way.
-.P
-If we use atomic grouping for the previous example, the matcher gives up
-immediately on failing to match "foo" the first time. The notation is a kind of
-special parenthesis, starting with (?> as in this example:
-.sp
- (?>\ed+)foo
-.sp
-This kind of parenthesis "locks up" the part of the pattern it contains once
-it has matched, and a failure further into the pattern is prevented from
-backtracking into it. Backtracking past it to previous items, however, works as
-normal.
-.P
-An alternative description is that a subpattern of this type matches the string
-of characters that an identical standalone pattern would match, if anchored at
-the current point in the subject string.
-.P
-Atomic grouping subpatterns are not capturing subpatterns. Simple cases such as
-the above example can be thought of as a maximizing repeat that must swallow
-everything it can. So, while both \ed+ and \ed+? are prepared to adjust the
-number of digits they match in order to make the rest of the pattern match,
-(?>\ed+) can only match an entire sequence of digits.
-.P
-Atomic groups in general can of course contain arbitrarily complicated
-subpatterns, and can be nested. However, when the subpattern for an atomic
-group is just a single repeated item, as in the example above, a simpler
-notation, called a "possessive quantifier" can be used. This consists of an
-additional + character following a quantifier. Using this notation, the
-previous example can be rewritten as
-.sp
- \ed++foo
-.sp
-Note that a possessive quantifier can be used with an entire group, for
-example:
-.sp
- (abc|xyz){2,3}+
-.sp
-Possessive quantifiers are always greedy; the setting of the PCRE_UNGREEDY
-option is ignored. They are a convenient notation for the simpler forms of
-atomic group. However, there is no difference in the meaning of a possessive
-quantifier and the equivalent atomic group, though there may be a performance
-difference; possessive quantifiers should be slightly faster.
-.P
-The possessive quantifier syntax is an extension to the Perl 5.8 syntax.
-Jeffrey Friedl originated the idea (and the name) in the first edition of his
-book. Mike McCloskey liked it, so implemented it when he built Sun's Java
-package, and PCRE copied it from there. It ultimately found its way into Perl
-at release 5.10.
-.P
-PCRE has an optimization that automatically "possessifies" certain simple
-pattern constructs. For example, the sequence A+B is treated as A++B because
-there is no point in backtracking into a sequence of A's when B must follow.
-.P
-When a pattern contains an unlimited repeat inside a subpattern that can itself
-be repeated an unlimited number of times, the use of an atomic group is the
-only way to avoid some failing matches taking a very long time indeed. The
-pattern
-.sp
- (\eD+|<\ed+>)*[!?]
-.sp
-matches an unlimited number of substrings that either consist of non-digits, or
-digits enclosed in <>, followed by either ! or ?. When it matches, it runs
-quickly. However, if it is applied to
-.sp
- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-.sp
-it takes a long time before reporting failure. This is because the string can
-be divided between the internal \eD+ repeat and the external * repeat in a
-large number of ways, and all have to be tried. (The example uses [!?] rather
-than a single character at the end, because both PCRE and Perl have an
-optimization that allows for fast failure when a single character is used. They
-remember the last single character that is required for a match, and fail early
-if it is not present in the string.) If the pattern is changed so that it uses
-an atomic group, like this:
-.sp
- ((?>\eD+)|<\ed+>)*[!?]
-.sp
-sequences of non-digits cannot be broken, and failure happens quickly.
-.
-.
-.\" HTML
-.SH "BACK REFERENCES"
-.rs
-.sp
-Outside a character class, a backslash followed by a digit greater than 0 (and
-possibly further digits) is a back reference to a capturing subpattern earlier
-(that is, to its left) in the pattern, provided there have been that many
-previous capturing left parentheses.
-.P
-However, if the decimal number following the backslash is less than 10, it is
-always taken as a back reference, and causes an error only if there are not
-that many capturing left parentheses in the entire pattern. In other words, the
-parentheses that are referenced need not be to the left of the reference for
-numbers less than 10. A "forward back reference" of this type can make sense
-when a repetition is involved and the subpattern to the right has participated
-in an earlier iteration.
-.P
-It is not possible to have a numerical "forward back reference" to a subpattern
-whose number is 10 or more using this syntax because a sequence such as \e50 is
-interpreted as a character defined in octal. See the subsection entitled
-"Non-printing characters"
-.\" HTML
-.\"
-above
-.\"
-for further details of the handling of digits following a backslash. There is
-no such problem when named parentheses are used. A back reference to any
-subpattern is possible using named parentheses (see below).
-.P
-Another way of avoiding the ambiguity inherent in the use of digits following a
-backslash is to use the \eg escape sequence, which is a feature introduced in
-Perl 5.10. This escape must be followed by an unsigned number or a negative
-number, optionally enclosed in braces. These examples are all identical:
-.sp
- (ring), \e1
- (ring), \eg1
- (ring), \eg{1}
-.sp
-An unsigned number specifies an absolute reference without the ambiguity that
-is present in the older syntax. It is also useful when literal digits follow
-the reference. A negative number is a relative reference. Consider this
-example:
-.sp
- (abc(def)ghi)\eg{-1}
-.sp
-The sequence \eg{-1} is a reference to the most recently started capturing
-subpattern before \eg, that is, is it equivalent to \e2. Similarly, \eg{-2}
-would be equivalent to \e1. The use of relative references can be helpful in
-long patterns, and also in patterns that are created by joining together
-fragments that contain references within themselves.
-.P
-A back reference matches whatever actually matched the capturing subpattern in
-the current subject string, rather than anything matching the subpattern
-itself (see
-.\" HTML
-.\"
-"Subpatterns as subroutines"
-.\"
-below for a way of doing that). So the pattern
-.sp
- (sens|respons)e and \e1ibility
-.sp
-matches "sense and sensibility" and "response and responsibility", but not
-"sense and responsibility". If caseful matching is in force at the time of the
-back reference, the case of letters is relevant. For example,
-.sp
- ((?i)rah)\es+\e1
-.sp
-matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original
-capturing subpattern is matched caselessly.
-.P
-There are several different ways of writing back references to named
-subpatterns. The .NET syntax \ek{name} and the Perl syntax \ek or
-\ek'name' are supported, as is the Python syntax (?P=name). Perl 5.10's unified
-back reference syntax, in which \eg can be used for both numeric and named
-references, is also supported. We could rewrite the above example in any of
-the following ways:
-.sp
- (?(?i)rah)\es+\ek
- (?'p1'(?i)rah)\es+\ek{p1}
- (?P(?i)rah)\es+(?P=p1)
- (?(?i)rah)\es+\eg{p1}
-.sp
-A subpattern that is referenced by name may appear in the pattern before or
-after the reference.
-.P
-There may be more than one back reference to the same subpattern. If a
-subpattern has not actually been used in a particular match, any back
-references to it always fail. For example, the pattern
-.sp
- (a|(bc))\e2
-.sp
-always fails if it starts to match "a" rather than "bc". Because there may be
-many capturing parentheses in a pattern, all digits following the backslash are
-taken as part of a potential back reference number. If the pattern continues
-with a digit character, some delimiter must be used to terminate the back
-reference. If the PCRE_EXTENDED option is set, this can be whitespace.
-Otherwise an empty comment (see
-.\" HTML
-.\"
-"Comments"
-.\"
-below) can be used.
-.P
-A back reference that occurs inside the parentheses to which it refers fails
-when the subpattern is first used, so, for example, (a\e1) never matches.
-However, such references can be useful inside repeated subpatterns. For
-example, the pattern
-.sp
- (a|b\e1)+
-.sp
-matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
-the subpattern, the back reference matches the character string corresponding
-to the previous iteration. In order for this to work, the pattern must be such
-that the first iteration does not need to match the back reference. This can be
-done using alternation, as in the example above, or by a quantifier with a
-minimum of zero.
-.
-.
-.\" HTML
-.SH ASSERTIONS
-.rs
-.sp
-An assertion is a test on the characters following or preceding the current
-matching point that does not actually consume any characters. The simple
-assertions coded as \eb, \eB, \eA, \eG, \eZ, \ez, ^ and $ are described
-.\" HTML
-.\"
-above.
-.\"
-.P
-More complicated assertions are coded as subpatterns. There are two kinds:
-those that look ahead of the current position in the subject string, and those
-that look behind it. An assertion subpattern is matched in the normal way,
-except that it does not cause the current matching position to be changed.
-.P
-Assertion subpatterns are not capturing subpatterns, and may not be repeated,
-because it makes no sense to assert the same thing several times. If any kind
-of assertion contains capturing subpatterns within it, these are counted for
-the purposes of numbering the capturing subpatterns in the whole pattern.
-However, substring capturing is carried out only for positive assertions,
-because it does not make sense for negative assertions.
-.
-.
-.SS "Lookahead assertions"
-.rs
-.sp
-Lookahead assertions start with (?= for positive assertions and (?! for
-negative assertions. For example,
-.sp
- \ew+(?=;)
-.sp
-matches a word followed by a semicolon, but does not include the semicolon in
-the match, and
-.sp
- foo(?!bar)
-.sp
-matches any occurrence of "foo" that is not followed by "bar". Note that the
-apparently similar pattern
-.sp
- (?!foo)bar
-.sp
-does not find an occurrence of "bar" that is preceded by something other than
-"foo"; it finds any occurrence of "bar" whatsoever, because the assertion
-(?!foo) is always true when the next three characters are "bar". A
-lookbehind assertion is needed to achieve the other effect.
-.P
-If you want to force a matching failure at some point in a pattern, the most
-convenient way to do it is with (?!) because an empty string always matches, so
-an assertion that requires there not to be an empty string must always fail.
-.
-.
-.\" HTML
-.SS "Lookbehind assertions"
-.rs
-.sp
-Lookbehind assertions start with (?<= for positive assertions and (?
-.\"
-(see above)
-.\"
-can be used instead of a lookbehind assertion; this is not restricted to a
-fixed-length.
-.P
-The implementation of lookbehind assertions is, for each alternative, to
-temporarily move the current position back by the fixed length and then try to
-match. If there are insufficient characters before the current position, the
-assertion fails.
-.P
-PCRE does not allow the \eC escape (which matches a single byte in UTF-8 mode)
-to appear in lookbehind assertions, because it makes it impossible to calculate
-the length of the lookbehind. The \eX and \eR escapes, which can match
-different numbers of bytes, are also not permitted.
-.P
-Possessive quantifiers can be used in conjunction with lookbehind assertions to
-specify efficient matching at the end of the subject string. Consider a simple
-pattern such as
-.sp
- abcd$
-.sp
-when applied to a long string that does not match. Because matching proceeds
-from left to right, PCRE will look for each "a" in the subject and then see if
-what follows matches the rest of the pattern. If the pattern is specified as
-.sp
- ^.*abcd$
-.sp
-the initial .* matches the entire string at first, but when this fails (because
-there is no following "a"), it backtracks to match all but the last character,
-then all but the last two characters, and so on. Once again the search for "a"
-covers the entire string, from right to left, so we are no better off. However,
-if the pattern is written as
-.sp
- ^.*+(?<=abcd)
-.sp
-there can be no backtracking for the .*+ item; it can match only the entire
-string. The subsequent lookbehind assertion does a single test on the last four
-characters. If it fails, the match fails immediately. For long strings, this
-approach makes a significant difference to the processing time.
-.
-.
-.SS "Using multiple assertions"
-.rs
-.sp
-Several assertions (of any sort) may occur in succession. For example,
-.sp
- (?<=\ed{3})(?
-.SH "CONDITIONAL SUBPATTERNS"
-.rs
-.sp
-It is possible to cause the matching process to obey a subpattern
-conditionally or to choose between two alternative subpatterns, depending on
-the result of an assertion, or whether a previous capturing subpattern matched
-or not. The two possible forms of conditional subpattern are
-.sp
- (?(condition)yes-pattern)
- (?(condition)yes-pattern|no-pattern)
-.sp
-If the condition is satisfied, the yes-pattern is used; otherwise the
-no-pattern (if present) is used. If there are more than two alternatives in the
-subpattern, a compile-time error occurs.
-.P
-There are four kinds of condition: references to subpatterns, references to
-recursion, a pseudo-condition called DEFINE, and assertions.
-.
-.SS "Checking for a used subpattern by number"
-.rs
-.sp
-If the text between the parentheses consists of a sequence of digits, the
-condition is true if the capturing subpattern of that number has previously
-matched. An alternative notation is to precede the digits with a plus or minus
-sign. In this case, the subpattern number is relative rather than absolute.
-The most recently opened parentheses can be referenced by (?(-1), the next most
-recent by (?(-2), and so on. In looping constructs it can also make sense to
-refer to subsequent groups with constructs such as (?(+2).
-.P
-Consider the following pattern, which contains non-significant white space to
-make it more readable (assume the PCRE_EXTENDED option) and to divide it into
-three parts for ease of discussion:
-.sp
- ( \e( )? [^()]+ (?(1) \e) )
-.sp
-The first part matches an optional opening parenthesis, and if that
-character is present, sets it as the first captured substring. The second part
-matches one or more characters that are not parentheses. The third part is a
-conditional subpattern that tests whether the first set of parentheses matched
-or not. If they did, that is, if subject started with an opening parenthesis,
-the condition is true, and so the yes-pattern is executed and a closing
-parenthesis is required. Otherwise, since no-pattern is not present, the
-subpattern matches nothing. In other words, this pattern matches a sequence of
-non-parentheses, optionally enclosed in parentheses.
-.P
-If you were embedding this pattern in a larger one, you could use a relative
-reference:
-.sp
- ...other stuff... ( \e( )? [^()]+ (?(-1) \e) ) ...
-.sp
-This makes the fragment independent of the parentheses in the larger pattern.
-.
-.SS "Checking for a used subpattern by name"
-.rs
-.sp
-Perl uses the syntax (?()...) or (?('name')...) to test for a used
-subpattern by name. For compatibility with earlier versions of PCRE, which had
-this facility before Perl, the syntax (?(name)...) is also recognized. However,
-there is a possible ambiguity with this syntax, because subpattern names may
-consist entirely of digits. PCRE looks first for a named subpattern; if it
-cannot find one and the name consists entirely of digits, PCRE looks for a
-subpattern of that number, which must be greater than zero. Using subpattern
-names that consist entirely of digits is not recommended.
-.P
-Rewriting the above example to use a named subpattern gives this:
-.sp
- (? \e( )? [^()]+ (?() \e) )
-.sp
-.
-.SS "Checking for pattern recursion"
-.rs
-.sp
-If the condition is the string (R), and there is no subpattern with the name R,
-the condition is true if a recursive call to the whole pattern or any
-subpattern has been made. If digits or a name preceded by ampersand follow the
-letter R, for example:
-.sp
- (?(R3)...) or (?(R&name)...)
-.sp
-the condition is true if the most recent recursion is into the subpattern whose
-number or name is given. This condition does not check the entire recursion
-stack.
-.P
-At "top level", all these recursion test conditions are false. Recursive
-patterns are described below.
-.
-.SS "Defining subpatterns for use by reference only"
-.rs
-.sp
-If the condition is the string (DEFINE), and there is no subpattern with the
-name DEFINE, the condition is always false. In this case, there may be only one
-alternative in the subpattern. It is always skipped if control reaches this
-point in the pattern; the idea of DEFINE is that it can be used to define
-"subroutines" that can be referenced from elsewhere. (The use of "subroutines"
-is described below.) For example, a pattern to match an IPv4 address could be
-written like this (ignore whitespace and line breaks):
-.sp
- (?(DEFINE) (? 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) )
- \eb (?&byte) (\e.(?&byte)){3} \eb
-.sp
-The first part of the pattern is a DEFINE group inside which a another group
-named "byte" is defined. This matches an individual component of an IPv4
-address (a number less than 256). When matching takes place, this part of the
-pattern is skipped because DEFINE acts like a false condition.
-.P
-The rest of the pattern uses references to the named group to match the four
-dot-separated components of an IPv4 address, insisting on a word boundary at
-each end.
-.
-.SS "Assertion conditions"
-.rs
-.sp
-If the condition is not in any of the above formats, it must be an assertion.
-This may be a positive or negative lookahead or lookbehind assertion. Consider
-this pattern, again containing non-significant white space, and with the two
-alternatives on the second line:
-.sp
- (?(?=[^a-z]*[a-z])
- \ed{2}-[a-z]{3}-\ed{2} | \ed{2}-\ed{2}-\ed{2} )
-.sp
-The condition is a positive lookahead assertion that matches an optional
-sequence of non-letters followed by a letter. In other words, it tests for the
-presence of at least one letter in the subject. If a letter is found, the
-subject is matched against the first alternative; otherwise it is matched
-against the second. This pattern matches strings in one of the two forms
-dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
-.
-.
-.\" HTML
-.SH COMMENTS
-.rs
-.sp
-The sequence (?# marks the start of a comment that continues up to the next
-closing parenthesis. Nested parentheses are not permitted. The characters
-that make up a comment play no part in the pattern matching at all.
-.P
-If the PCRE_EXTENDED option is set, an unescaped # character outside a
-character class introduces a comment that continues to immediately after the
-next newline in the pattern.
-.
-.
-.\" HTML
-.SH "RECURSIVE PATTERNS"
-.rs
-.sp
-Consider the problem of matching a string in parentheses, allowing for
-unlimited nested parentheses. Without the use of recursion, the best that can
-be done is to use a pattern that matches up to some fixed depth of nesting. It
-is not possible to handle an arbitrary nesting depth.
-.P
-For some time, Perl has provided a facility that allows regular expressions to
-recurse (amongst other things). It does this by interpolating Perl code in the
-expression at run time, and the code can refer to the expression itself. A Perl
-pattern using code interpolation to solve the parentheses problem can be
-created like this:
-.sp
- $re = qr{\e( (?: (?>[^()]+) | (?p{$re}) )* \e)}x;
-.sp
-The (?p{...}) item interpolates Perl code at run time, and in this case refers
-recursively to the pattern in which it appears.
-.P
-Obviously, PCRE cannot support the interpolation of Perl code. Instead, it
-supports special syntax for recursion of the entire pattern, and also for
-individual subpattern recursion. After its introduction in PCRE and Python,
-this kind of recursion was introduced into Perl at release 5.10.
-.P
-A special item that consists of (? followed by a number greater than zero and a
-closing parenthesis is a recursive call of the subpattern of the given number,
-provided that it occurs inside that subpattern. (If not, it is a "subroutine"
-call, which is described in the next section.) The special item (?R) or (?0) is
-a recursive call of the entire regular expression.
-.P
-In PCRE (like Python, but unlike Perl), a recursive subpattern call is always
-treated as an atomic group. That is, once it has matched some of the subject
-string, it is never re-entered, even if it contains untried alternatives and
-there is a subsequent matching failure.
-.P
-This PCRE pattern solves the nested parentheses problem (assume the
-PCRE_EXTENDED option is set so that white space is ignored):
-.sp
- \e( ( (?>[^()]+) | (?R) )* \e)
-.sp
-First it matches an opening parenthesis. Then it matches any number of
-substrings which can either be a sequence of non-parentheses, or a recursive
-match of the pattern itself (that is, a correctly parenthesized substring).
-Finally there is a closing parenthesis.
-.P
-If this were part of a larger pattern, you would not want to recurse the entire
-pattern, so instead you could use this:
-.sp
- ( \e( ( (?>[^()]+) | (?1) )* \e) )
-.sp
-We have put the pattern into parentheses, and caused the recursion to refer to
-them instead of the whole pattern.
-.P
-In a larger pattern, keeping track of parenthesis numbers can be tricky. This
-is made easier by the use of relative references. (A Perl 5.10 feature.)
-Instead of (?1) in the pattern above you can write (?-2) to refer to the second
-most recently opened parentheses preceding the recursion. In other words, a
-negative number counts capturing parentheses leftwards from the point at which
-it is encountered.
-.P
-It is also possible to refer to subsequently opened parentheses, by writing
-references such as (?+2). However, these cannot be recursive because the
-reference is not inside the parentheses that are referenced. They are always
-"subroutine" calls, as described in the next section.
-.P
-An alternative approach is to use named parentheses instead. The Perl syntax
-for this is (?&name); PCRE's earlier syntax (?P>name) is also supported. We
-could rewrite the above example as follows:
-.sp
- (? \e( ( (?>[^()]+) | (?&pn) )* \e) )
-.sp
-If there is more than one subpattern with the same name, the earliest one is
-used.
-.P
-This particular example pattern that we have been looking at contains nested
-unlimited repeats, and so the use of atomic grouping for matching strings of
-non-parentheses is important when applying the pattern to strings that do not
-match. For example, when this pattern is applied to
-.sp
- (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
-.sp
-it yields "no match" quickly. However, if atomic grouping is not used,
-the match runs for a very long time indeed because there are so many different
-ways the + and * repeats can carve up the subject, and all have to be tested
-before failure can be reported.
-.P
-At the end of a match, the values set for any capturing subpatterns are those
-from the outermost level of the recursion at which the subpattern value is set.
-If you want to obtain intermediate values, a callout function can be used (see
-below and the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation). If the pattern above is matched against
-.sp
- (ab(cd)ef)
-.sp
-the value for the capturing parentheses is "ef", which is the last value taken
-on at the top level. If additional parentheses are added, giving
-.sp
- \e( ( ( (?>[^()]+) | (?R) )* ) \e)
- ^ ^
- ^ ^
-.sp
-the string they capture is "ab(cd)ef", the contents of the top level
-parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
-has to obtain extra memory to store data during a recursion, which it does by
-using \fBpcre_malloc\fP, freeing it via \fBpcre_free\fP afterwards. If no
-memory can be obtained, the match fails with the PCRE_ERROR_NOMEMORY error.
-.P
-Do not confuse the (?R) item with the condition (R), which tests for recursion.
-Consider this pattern, which matches text in angle brackets, allowing for
-arbitrary nesting. Only digits are allowed in nested brackets (that is, when
-recursing), whereas any characters are permitted at the outer level.
-.sp
- < (?: (?(R) \ed++ | [^<>]*+) | (?R)) * >
-.sp
-In this pattern, (?(R) is the start of a conditional subpattern, with two
-different alternatives for the recursive and non-recursive cases. The (?R) item
-is the actual recursive call.
-.
-.
-.\" HTML
-.SH "SUBPATTERNS AS SUBROUTINES"
-.rs
-.sp
-If the syntax for a recursive subpattern reference (either by number or by
-name) is used outside the parentheses to which it refers, it operates like a
-subroutine in a programming language. The "called" subpattern may be defined
-before or after the reference. A numbered reference can be absolute or
-relative, as in these examples:
-.sp
- (...(absolute)...)...(?2)...
- (...(relative)...)...(?-1)...
- (...(?+1)...(relative)...
-.sp
-An earlier example pointed out that the pattern
-.sp
- (sens|respons)e and \e1ibility
-.sp
-matches "sense and sensibility" and "response and responsibility", but not
-"sense and responsibility". If instead the pattern
-.sp
- (sens|respons)e and (?1)ibility
-.sp
-is used, it does match "sense and responsibility" as well as the other two
-strings. Another example is given in the discussion of DEFINE above.
-.P
-Like recursive subpatterns, a "subroutine" call is always treated as an atomic
-group. That is, once it has matched some of the subject string, it is never
-re-entered, even if it contains untried alternatives and there is a subsequent
-matching failure.
-.P
-When a subpattern is used as a subroutine, processing options such as
-case-independence are fixed when the subpattern is defined. They cannot be
-changed for different calls. For example, consider this pattern:
-.sp
- (abc)(?i:(?-1))
-.sp
-It matches "abcabc". It does not match "abcABC" because the change of
-processing option does not affect the called subpattern.
-.
-.
-.\" HTML
-.SH "ONIGURUMA SUBROUTINE SYNTAX"
-.rs
-.sp
-For compatibility with Oniguruma, the non-Perl syntax \eg followed by a name or
-a number enclosed either in angle brackets or single quotes, is an alternative
-syntax for referencing a subpattern as a subroutine, possibly recursively. Here
-are two of the examples used above, rewritten using this syntax:
-.sp
- (? \e( ( (?>[^()]+) | \eg )* \e) )
- (sens|respons)e and \eg'1'ibility
-.sp
-PCRE supports an extension to Oniguruma: if a number is preceded by a
-plus or a minus sign it is taken as a relative reference. For example:
-.sp
- (abc)(?i:\eg<-1>)
-.sp
-Note that \eg{...} (Perl syntax) and \eg<...> (Oniguruma syntax) are \fInot\fP
-synonymous. The former is a back reference; the latter is a subroutine call.
-.
-.
-.SH CALLOUTS
-.rs
-.sp
-Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl
-code to be obeyed in the middle of matching a regular expression. This makes it
-possible, amongst other things, to extract different substrings that match the
-same pair of parentheses when there is a repetition.
-.P
-PCRE provides a similar feature, but of course it cannot obey arbitrary Perl
-code. The feature is called "callout". The caller of PCRE provides an external
-function by putting its entry point in the global variable \fIpcre_callout\fP.
-By default, this variable contains NULL, which disables all calling out.
-.P
-Within a regular expression, (?C) indicates the points at which the external
-function is to be called. If you want to identify different callout points, you
-can put a number less than 256 after the letter C. The default value is zero.
-For example, this pattern has two callout points:
-.sp
- (?C1)abc(?C2)def
-.sp
-If the PCRE_AUTO_CALLOUT flag is passed to \fBpcre_compile()\fP, callouts are
-automatically installed before each item in the pattern. They are all numbered
-255.
-.P
-During matching, when PCRE reaches a callout point (and \fIpcre_callout\fP is
-set), the external function is called. It is provided with the number of the
-callout, the position in the pattern, and, optionally, one item of data
-originally supplied by the caller of \fBpcre_exec()\fP. The callout function
-may cause matching to proceed, to backtrack, or to fail altogether. A complete
-description of the interface to the callout function is given in the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation.
-.
-.
-.SH "BACKTRACKING CONTROL"
-.rs
-.sp
-Perl 5.10 introduced a number of "Special Backtracking Control Verbs", which
-are described in the Perl documentation as "experimental and subject to change
-or removal in a future version of Perl". It goes on to say: "Their usage in
-production code should be noted to avoid problems during upgrades." The same
-remarks apply to the PCRE features described in this section.
-.P
-Since these verbs are specifically related to backtracking, most of them can be
-used only when the pattern is to be matched using \fBpcre_exec()\fP, which uses
-a backtracking algorithm. With the exception of (*FAIL), which behaves like a
-failing negative assertion, they cause an error if encountered by
-\fBpcre_dfa_exec()\fP.
-.P
-The new verbs make use of what was previously invalid syntax: an opening
-parenthesis followed by an asterisk. In Perl, they are generally of the form
-(*VERB:ARG) but PCRE does not support the use of arguments, so its general
-form is just (*VERB). Any number of these verbs may occur in a pattern. There
-are two kinds:
-.
-.SS "Verbs that act immediately"
-.rs
-.sp
-The following verbs act as soon as they are encountered:
-.sp
- (*ACCEPT)
-.sp
-This verb causes the match to end successfully, skipping the remainder of the
-pattern. When inside a recursion, only the innermost pattern is ended
-immediately. PCRE differs from Perl in what happens if the (*ACCEPT) is inside
-capturing parentheses. In Perl, the data so far is captured: in PCRE no data is
-captured. For example:
-.sp
- A(A|B(*ACCEPT)|C)D
-.sp
-This matches "AB", "AAD", or "ACD", but when it matches "AB", no data is
-captured.
-.sp
- (*FAIL) or (*F)
-.sp
-This verb causes the match to fail, forcing backtracking to occur. It is
-equivalent to (?!) but easier to read. The Perl documentation notes that it is
-probably useful only when combined with (?{}) or (??{}). Those are, of course,
-Perl features that are not present in PCRE. The nearest equivalent is the
-callout feature, as for example in this pattern:
-.sp
- a+(?C)(*FAIL)
-.sp
-A match with the string "aaaa" always fails, but the callout is taken before
-each backtrack happens (in this example, 10 times).
-.
-.SS "Verbs that act after backtracking"
-.rs
-.sp
-The following verbs do nothing when they are encountered. Matching continues
-with what follows, but if there is no subsequent match, a failure is forced.
-The verbs differ in exactly what kind of failure occurs.
-.sp
- (*COMMIT)
-.sp
-This verb causes the whole match to fail outright if the rest of the pattern
-does not match. Even if the pattern is unanchored, no further attempts to find
-a match by advancing the start point take place. Once (*COMMIT) has been
-passed, \fBpcre_exec()\fP is committed to finding a match at the current
-starting point, or not at all. For example:
-.sp
- a+(*COMMIT)b
-.sp
-This matches "xxaab" but not "aacaab". It can be thought of as a kind of
-dynamic anchor, or "I've started, so I must finish."
-.sp
- (*PRUNE)
-.sp
-This verb causes the match to fail at the current position if the rest of the
-pattern does not match. If the pattern is unanchored, the normal "bumpalong"
-advance to the next starting character then happens. Backtracking can occur as
-usual to the left of (*PRUNE), or when matching to the right of (*PRUNE), but
-if there is no match to the right, backtracking cannot cross (*PRUNE).
-In simple cases, the use of (*PRUNE) is just an alternative to an atomic
-group or possessive quantifier, but there are some uses of (*PRUNE) that cannot
-be expressed in any other way.
-.sp
- (*SKIP)
-.sp
-This verb is like (*PRUNE), except that if the pattern is unanchored, the
-"bumpalong" advance is not to the next character, but to the position in the
-subject where (*SKIP) was encountered. (*SKIP) signifies that whatever text
-was matched leading up to it cannot be part of a successful match. Consider:
-.sp
- a+(*SKIP)b
-.sp
-If the subject is "aaaac...", after the first match attempt fails (starting at
-the first character in the string), the starting point skips on to start the
-next attempt at "c". Note that a possessive quantifer does not have the same
-effect in this example; although it would suppress backtracking during the
-first match attempt, the second attempt would start at the second character
-instead of skipping on to "c".
-.sp
- (*THEN)
-.sp
-This verb causes a skip to the next alternation if the rest of the pattern does
-not match. That is, it cancels pending backtracking, but only within the
-current alternation. Its name comes from the observation that it can be used
-for a pattern-based if-then-else block:
-.sp
- ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
-.sp
-If the COND1 pattern matches, FOO is tried (and possibly further items after
-the end of the group if FOO succeeds); on failure the matcher skips to the
-second alternative and tries COND2, without backtracking into COND1. If (*THEN)
-is used outside of any alternation, it acts exactly like (*PRUNE).
-.
-.
-.SH "SEE ALSO"
-.rs
-.sp
-\fBpcreapi\fP(3), \fBpcrecallout\fP(3), \fBpcrematching\fP(3), \fBpcre\fP(3).
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 11 April 2009
-Copyright (c) 1997-2009 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcreperform.3 b/libs/pcre/doc/pcreperform.3
deleted file mode 100644
index 915f7b7854..0000000000
--- a/libs/pcre/doc/pcreperform.3
+++ /dev/null
@@ -1,153 +0,0 @@
-.TH PCREPERFORM 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE PERFORMANCE"
-.rs
-.sp
-Two aspects of performance are discussed below: memory usage and processing
-time. The way you express your pattern as a regular expression can affect both
-of them.
-.
-.SH "MEMORY USAGE"
-.rs
-.sp
-Patterns are compiled by PCRE into a reasonably efficient byte code, so that
-most simple patterns do not use much memory. However, there is one case where
-memory usage can be unexpectedly large. When a parenthesized subpattern has a
-quantifier with a minimum greater than 1 and/or a limited maximum, the whole
-subpattern is repeated in the compiled code. For example, the pattern
-.sp
- (abc|def){2,4}
-.sp
-is compiled as if it were
-.sp
- (abc|def)(abc|def)((abc|def)(abc|def)?)?
-.sp
-(Technical aside: It is done this way so that backtrack points within each of
-the repetitions can be independently maintained.)
-.P
-For regular expressions whose quantifiers use only small numbers, this is not
-usually a problem. However, if the numbers are large, and particularly if such
-repetitions are nested, the memory usage can become an embarrassment. For
-example, the very simple pattern
-.sp
- ((ab){1,1000}c){1,3}
-.sp
-uses 51K bytes when compiled. When PCRE is compiled with its default internal
-pointer size of two bytes, the size limit on a compiled pattern is 64K, and
-this is reached with the above pattern if the outer repetition is increased
-from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
-handle larger compiled patterns, but it is better to try to rewrite your
-pattern to use less memory if you can.
-.P
-One way of reducing the memory usage for such patterns is to make use of PCRE's
-.\" HTML
-.\"
-"subroutine"
-.\"
-facility. Re-writing the above pattern as
-.sp
- ((ab)(?2){0,999}c)(?1){0,2}
-.sp
-reduces the memory requirements to 18K, and indeed it remains under 20K even
-with the outer repetition increased to 100. However, this pattern is not
-exactly equivalent, because the "subroutine" calls are treated as
-.\" HTML
-.\"
-atomic groups
-.\"
-into which there can be no backtracking if there is a subsequent matching
-failure. Therefore, PCRE cannot do this kind of rewriting automatically.
-Furthermore, there is a noticeable loss of speed when executing the modified
-pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
-speed is acceptable, this kind of rewriting will allow you to process patterns
-that PCRE cannot otherwise handle.
-.
-.SH "PROCESSING TIME"
-.rs
-.sp
-Certain items in regular expression patterns are processed more efficiently
-than others. It is more efficient to use a character class like [aeiou] than a
-set of single-character alternatives such as (a|e|i|o|u). In general, the
-simplest construction that provides the required behaviour is usually the most
-efficient. Jeffrey Friedl's book contains a lot of useful general discussion
-about optimizing regular expressions for efficient performance. This document
-contains a few observations about PCRE.
-.P
-Using Unicode character properties (the \ep, \eP, and \eX escapes) is slow,
-because PCRE has to scan a structure that contains data for over fifteen
-thousand characters whenever it needs a character's property. If you can find
-an alternative pattern that does not use character properties, it will probably
-be faster.
-.P
-When a pattern begins with .* not in parentheses, or in parentheses that are
-not the subject of a backreference, and the PCRE_DOTALL option is set, the
-pattern is implicitly anchored by PCRE, since it can match only at the start of
-a subject string. However, if PCRE_DOTALL is not set, PCRE cannot make this
-optimization, because the . metacharacter does not then match a newline, and if
-the subject string contains newlines, the pattern may match from the character
-immediately following one of them instead of from the very start. For example,
-the pattern
-.sp
- .*second
-.sp
-matches the subject "first\enand second" (where \en stands for a newline
-character), with the match starting at the seventh character. In order to do
-this, PCRE has to retry the match starting after every newline in the subject.
-.P
-If you are using such a pattern with subject strings that do not contain
-newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
-the pattern with ^.* or ^.*? to indicate explicit anchoring. That saves PCRE
-from having to scan along the subject looking for a newline to restart at.
-.P
-Beware of patterns that contain nested indefinite repeats. These can take a
-long time to run when applied to a string that does not match. Consider the
-pattern fragment
-.sp
- ^(a+)*
-.sp
-This can match "aaaa" in 16 different ways, and this number increases very
-rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
-times, and for each of those cases other than 0 or 4, the + repeats can match
-different numbers of times.) When the remainder of the pattern is such that the
-entire match is going to fail, PCRE has in principle to try every possible
-variation, and this can take an extremely long time, even for relatively short
-strings.
-.P
-An optimization catches some of the more simple cases such as
-.sp
- (a+)*b
-.sp
-where a literal character follows. Before embarking on the standard matching
-procedure, PCRE checks that there is a "b" later in the subject string, and if
-there is not, it fails the match immediately. However, when there is no
-following literal this optimization cannot be used. You can see the difference
-by comparing the behaviour of
-.sp
- (a+)*\ed
-.sp
-with the pattern above. The former gives a failure almost instantly when
-applied to a whole line of "a" characters, whereas the latter takes an
-appreciable time with strings longer than about 20 characters.
-.P
-In many cases, the solution to this kind of performance issue is to use an
-atomic group or a possessive quantifier.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 06 March 2007
-Copyright (c) 1997-2007 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcreposix.3 b/libs/pcre/doc/pcreposix.3
deleted file mode 100644
index d63ac035a7..0000000000
--- a/libs/pcre/doc/pcreposix.3
+++ /dev/null
@@ -1,244 +0,0 @@
-.TH PCREPOSIX 3
-.SH NAME
-PCRE - Perl-compatible regular expressions.
-.SH "SYNOPSIS OF POSIX API"
-.rs
-.sp
-.B #include
-.PP
-.SM
-.B int regcomp(regex_t *\fIpreg\fP, const char *\fIpattern\fP,
-.ti +5n
-.B int \fIcflags\fP);
-.PP
-.B int regexec(regex_t *\fIpreg\fP, const char *\fIstring\fP,
-.ti +5n
-.B size_t \fInmatch\fP, regmatch_t \fIpmatch\fP[], int \fIeflags\fP);
-.PP
-.B size_t regerror(int \fIerrcode\fP, const regex_t *\fIpreg\fP,
-.ti +5n
-.B char *\fIerrbuf\fP, size_t \fIerrbuf_size\fP);
-.PP
-.B void regfree(regex_t *\fIpreg\fP);
-.
-.SH DESCRIPTION
-.rs
-.sp
-This set of functions provides a POSIX-style API to the PCRE regular expression
-package. See the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation for a description of PCRE's native API, which contains much
-additional functionality.
-.P
-The functions described here are just wrapper functions that ultimately call
-the PCRE native API. Their prototypes are defined in the \fBpcreposix.h\fP
-header file, and on Unix systems the library itself is called
-\fBpcreposix.a\fP, so can be accessed by adding \fB-lpcreposix\fP to the
-command for linking an application that uses them. Because the POSIX functions
-call the native ones, it is also necessary to add \fB-lpcre\fP.
-.P
-I have implemented only those POSIX option bits that can be reasonably mapped
-to PCRE native options. In addition, the option REG_EXTENDED is defined with
-the value zero. This has no effect, but since programs that are written to the
-POSIX interface often use it, this makes it easier to slot in PCRE as a
-replacement library. Other POSIX options are not even defined.
-.P
-When PCRE is called via these functions, it is only the API that is POSIX-like
-in style. The syntax and semantics of the regular expressions themselves are
-still those of Perl, subject to the setting of various PCRE options, as
-described below. "POSIX-like in style" means that the API approximates to the
-POSIX definition; it is not fully POSIX-compatible, and in multi-byte encoding
-domains it is probably even less compatible.
-.P
-The header for these functions is supplied as \fBpcreposix.h\fP to avoid any
-potential clash with other POSIX libraries. It can, of course, be renamed or
-aliased as \fBregex.h\fP, which is the "correct" name. It provides two
-structure types, \fIregex_t\fP for compiled internal forms, and
-\fIregmatch_t\fP for returning captured substrings. It also defines some
-constants whose names start with "REG_"; these are used for setting options and
-identifying error codes.
-.P
-.SH "COMPILING A PATTERN"
-.rs
-.sp
-The function \fBregcomp()\fP is called to compile a pattern into an
-internal form. The pattern is a C string terminated by a binary zero, and
-is passed in the argument \fIpattern\fP. The \fIpreg\fP argument is a pointer
-to a \fBregex_t\fP structure that is used as a base for storing information
-about the compiled regular expression.
-.P
-The argument \fIcflags\fP is either zero, or contains one or more of the bits
-defined by the following macros:
-.sp
- REG_DOTALL
-.sp
-The PCRE_DOTALL option is set when the regular expression is passed for
-compilation to the native function. Note that REG_DOTALL is not part of the
-POSIX standard.
-.sp
- REG_ICASE
-.sp
-The PCRE_CASELESS option is set when the regular expression is passed for
-compilation to the native function.
-.sp
- REG_NEWLINE
-.sp
-The PCRE_MULTILINE option is set when the regular expression is passed for
-compilation to the native function. Note that this does \fInot\fP mimic the
-defined POSIX behaviour for REG_NEWLINE (see the following section).
-.sp
- REG_NOSUB
-.sp
-The PCRE_NO_AUTO_CAPTURE option is set when the regular expression is passed
-for compilation to the native function. In addition, when a pattern that is
-compiled with this flag is passed to \fBregexec()\fP for matching, the
-\fInmatch\fP and \fIpmatch\fP arguments are ignored, and no captured strings
-are returned.
-.sp
- REG_UTF8
-.sp
-The PCRE_UTF8 option is set when the regular expression is passed for
-compilation to the native function. This causes the pattern itself and all data
-strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF8
-is not part of the POSIX standard.
-.P
-In the absence of these flags, no options are passed to the native function.
-This means the the regex is compiled with PCRE default semantics. In
-particular, the way it handles newline characters in the subject string is the
-Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
-\fIsome\fP of the effects specified for REG_NEWLINE. It does not affect the way
-newlines are matched by . (they aren't) or by a negative class such as [^a]
-(they are).
-.P
-The yield of \fBregcomp()\fP is zero on success, and non-zero otherwise. The
-\fIpreg\fP structure is filled in on success, and one member of the structure
-is public: \fIre_nsub\fP contains the number of capturing subpatterns in
-the regular expression. Various error codes are defined in the header file.
-.
-.
-.SH "MATCHING NEWLINE CHARACTERS"
-.rs
-.sp
-This area is not simple, because POSIX and Perl take different views of things.
-It is not possible to get PCRE to obey POSIX semantics, but then PCRE was never
-intended to be a POSIX engine. The following table lists the different
-possibilities for matching newline characters in PCRE:
-.sp
- Default Change with
-.sp
- . matches newline no PCRE_DOTALL
- newline matches [^a] yes not changeable
- $ matches \en at end yes PCRE_DOLLARENDONLY
- $ matches \en in middle no PCRE_MULTILINE
- ^ matches \en in middle no PCRE_MULTILINE
-.sp
-This is the equivalent table for POSIX:
-.sp
- Default Change with
-.sp
- . matches newline yes REG_NEWLINE
- newline matches [^a] yes REG_NEWLINE
- $ matches \en at end no REG_NEWLINE
- $ matches \en in middle no REG_NEWLINE
- ^ matches \en in middle no REG_NEWLINE
-.sp
-PCRE's behaviour is the same as Perl's, except that there is no equivalent for
-PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is no way to stop
-newline from matching [^a].
-.P
-The default POSIX newline handling can be obtained by setting PCRE_DOTALL and
-PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE behave exactly as for the
-REG_NEWLINE action.
-.
-.
-.SH "MATCHING A PATTERN"
-.rs
-.sp
-The function \fBregexec()\fP is called to match a compiled pattern \fIpreg\fP
-against a given \fIstring\fP, which is by default terminated by a zero byte
-(but see REG_STARTEND below), subject to the options in \fIeflags\fP. These can
-be:
-.sp
- REG_NOTBOL
-.sp
-The PCRE_NOTBOL option is set when calling the underlying PCRE matching
-function.
-.sp
- REG_NOTEMPTY
-.sp
-The PCRE_NOTEMPTY option is set when calling the underlying PCRE matching
-function. Note that REG_NOTEMPTY is not part of the POSIX standard. However,
-setting this option can give more POSIX-like behaviour in some situations.
-.sp
- REG_NOTEOL
-.sp
-The PCRE_NOTEOL option is set when calling the underlying PCRE matching
-function.
-.sp
- REG_STARTEND
-.sp
-The string is considered to start at \fIstring\fP + \fIpmatch[0].rm_so\fP and
-to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
-(there need not actually be a NUL at that location), regardless of the value of
-\fInmatch\fP. This is a BSD extension, compatible with but not specified by
-IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
-intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
-not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
-how it is matched.
-.P
-If the pattern was compiled with the REG_NOSUB flag, no data about any matched
-strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of
-\fBregexec()\fP are ignored.
-.P
-Otherwise,the portion of the string that was matched, and also any captured
-substrings, are returned via the \fIpmatch\fP argument, which points to an
-array of \fInmatch\fP structures of type \fIregmatch_t\fP, containing the
-members \fIrm_so\fP and \fIrm_eo\fP. These contain the offset to the first
-character of each substring and the offset to the first character after the end
-of each substring, respectively. The 0th element of the vector relates to the
-entire portion of \fIstring\fP that was matched; subsequent elements relate to
-the capturing subpatterns of the regular expression. Unused entries in the
-array have both structure members set to -1.
-.P
-A successful match yields a zero return; various error codes are defined in the
-header file, of which REG_NOMATCH is the "expected" failure code.
-.
-.
-.SH "ERROR MESSAGES"
-.rs
-.sp
-The \fBregerror()\fP function maps a non-zero errorcode from either
-\fBregcomp()\fP or \fBregexec()\fP to a printable message. If \fIpreg\fP is not
-NULL, the error should have arisen from the use of that structure. A message
-terminated by a binary zero is placed in \fIerrbuf\fP. The length of the
-message, including the zero, is limited to \fIerrbuf_size\fP. The yield of the
-function is the size of buffer needed to hold the whole message.
-.
-.
-.SH MEMORY USAGE
-.rs
-.sp
-Compiling a regular expression causes memory to be allocated and associated
-with the \fIpreg\fP structure. The function \fBregfree()\fP frees all such
-memory, after which \fIpreg\fP may no longer be used as a compiled expression.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 11 March 2009
-Copyright (c) 1997-2009 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcreprecompile.3 b/libs/pcre/doc/pcreprecompile.3
deleted file mode 100644
index aa525426bd..0000000000
--- a/libs/pcre/doc/pcreprecompile.3
+++ /dev/null
@@ -1,142 +0,0 @@
-.TH PCREPRECOMPILE 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "SAVING AND RE-USING PRECOMPILED PCRE PATTERNS"
-.rs
-.sp
-If you are running an application that uses a large number of regular
-expression patterns, it may be useful to store them in a precompiled form
-instead of having to compile them every time the application is run.
-If you are not using any private character tables (see the
-.\" HREF
-\fBpcre_maketables()\fP
-.\"
-documentation), this is relatively straightforward. If you are using private
-tables, it is a little bit more complicated.
-.P
-If you save compiled patterns to a file, you can copy them to a different host
-and run them there. This works even if the new host has the opposite endianness
-to the one on which the patterns were compiled. There may be a small
-performance penalty, but it should be insignificant. However, compiling regular
-expressions with one version of PCRE for use with a different version is not
-guaranteed to work and may cause crashes.
-.
-.
-.SH "SAVING A COMPILED PATTERN"
-.rs
-.sh
-The value returned by \fBpcre_compile()\fP points to a single block of memory
-that holds the compiled pattern and associated data. You can find the length of
-this block in bytes by calling \fBpcre_fullinfo()\fP with an argument of
-PCRE_INFO_SIZE. You can then save the data in any appropriate manner. Here is
-sample code that compiles a pattern and writes it to a file. It assumes that
-the variable \fIfd\fP refers to a file that is open for output:
-.sp
- int erroroffset, rc, size;
- char *error;
- pcre *re;
-.sp
- re = pcre_compile("my pattern", 0, &error, &erroroffset, NULL);
- if (re == NULL) { ... handle errors ... }
- rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size);
- if (rc < 0) { ... handle errors ... }
- rc = fwrite(re, 1, size, fd);
- if (rc != size) { ... handle errors ... }
-.sp
-In this example, the bytes that comprise the compiled pattern are copied
-exactly. Note that this is binary data that may contain any of the 256 possible
-byte values. On systems that make a distinction between binary and non-binary
-data, be sure that the file is opened for binary output.
-.P
-If you want to write more than one pattern to a file, you will have to devise a
-way of separating them. For binary data, preceding each pattern with its length
-is probably the most straightforward approach. Another possibility is to write
-out the data in hexadecimal instead of binary, one pattern to a line.
-.P
-Saving compiled patterns in a file is only one possible way of storing them for
-later use. They could equally well be saved in a database, or in the memory of
-some daemon process that passes them via sockets to the processes that want
-them.
-.P
-If the pattern has been studied, it is also possible to save the study data in
-a similar way to the compiled pattern itself. When studying generates
-additional information, \fBpcre_study()\fP returns a pointer to a
-\fBpcre_extra\fP data block. Its format is defined in the
-.\" HTML
-.\"
-section on matching a pattern
-.\"
-in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation. The \fIstudy_data\fP field points to the binary study data, and
-this is what you must save (not the \fBpcre_extra\fP block itself). The length
-of the study data can be obtained by calling \fBpcre_fullinfo()\fP with an
-argument of PCRE_INFO_STUDYSIZE. Remember to check that \fBpcre_study()\fP did
-return a non-NULL value before trying to save the study data.
-.
-.
-.SH "RE-USING A PRECOMPILED PATTERN"
-.rs
-.sp
-Re-using a precompiled pattern is straightforward. Having reloaded it into main
-memory, you pass its pointer to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP in
-the usual way. This should work even on another host, and even if that host has
-the opposite endianness to the one where the pattern was compiled.
-.P
-However, if you passed a pointer to custom character tables when the pattern
-was compiled (the \fItableptr\fP argument of \fBpcre_compile()\fP), you must
-now pass a similar pointer to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP,
-because the value saved with the compiled pattern will obviously be nonsense. A
-field in a \fBpcre_extra()\fP block is used to pass this data, as described in
-the
-.\" HTML
-.\"
-section on matching a pattern
-.\"
-in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation.
-.P
-If you did not provide custom character tables when the pattern was compiled,
-the pointer in the compiled pattern is NULL, which causes \fBpcre_exec()\fP to
-use PCRE's internal tables. Thus, you do not need to take any special action at
-run time in this case.
-.P
-If you saved study data with the compiled pattern, you need to create your own
-\fBpcre_extra\fP data block and set the \fIstudy_data\fP field to point to the
-reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA bit in the
-\fIflags\fP field to indicate that study data is present. Then pass the
-\fBpcre_extra\fP block to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP in the
-usual way.
-.
-.
-.SH "COMPATIBILITY WITH DIFFERENT PCRE RELEASES"
-.rs
-.sp
-In general, it is safest to recompile all saved patterns when you update to a
-new PCRE release, though not all updates actually require this. Recompiling is
-definitely needed for release 7.2.
-.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 13 June 2007
-Copyright (c) 1997-2007 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcresample.3 b/libs/pcre/doc/pcresample.3
deleted file mode 100644
index d27690a78c..0000000000
--- a/libs/pcre/doc/pcresample.3
+++ /dev/null
@@ -1,80 +0,0 @@
-.TH PCRESAMPLE 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE SAMPLE PROGRAM"
-.rs
-.sp
-A simple, complete demonstration program, to get you started with using PCRE,
-is supplied in the file \fIpcredemo.c\fP in the PCRE distribution.
-.P
-The program compiles the regular expression that is its first argument, and
-matches it against the subject string in its second argument. No PCRE options
-are set, and default character tables are used. If matching succeeds, the
-program outputs the portion of the subject that matched, together with the
-contents of any captured substrings.
-.P
-If the -g option is given on the command line, the program then goes on to
-check for further matches of the same regular expression in the same subject
-string. The logic is a little bit tricky because of the possibility of matching
-an empty string. Comments in the code explain what is going on.
-.P
-If PCRE is installed in the standard include and library directories for your
-system, you should be able to compile the demonstration program using this
-command:
-.sp
- gcc -o pcredemo pcredemo.c -lpcre
-.sp
-If PCRE is installed elsewhere, you may need to add additional options to the
-command line. For example, on a Unix-like system that has PCRE installed in
-\fI/usr/local\fP, you can compile the demonstration program using a command
-like this:
-.sp
-.\" JOINSH
- gcc -o pcredemo -I/usr/local/include pcredemo.c \e
- -L/usr/local/lib -lpcre
-.sp
-Once you have compiled the demonstration program, you can run simple tests like
-this:
-.sp
- ./pcredemo 'cat|dog' 'the cat sat on the mat'
- ./pcredemo -g 'cat|dog' 'the dog sat on the cat'
-.sp
-Note that there is a much more comprehensive test program, called
-.\" HREF
-\fBpcretest\fP,
-.\"
-which supports many more facilities for testing regular expressions and the
-PCRE library. The \fBpcredemo\fP program is provided as a simple coding
-example.
-.P
-On some operating systems (e.g. Solaris), when PCRE is not installed in the
-standard library directory, you may get an error like this when you try to run
-\fBpcredemo\fP:
-.sp
- ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
-.sp
-This is caused by the way shared library support works on those systems. You
-need to add
-.sp
- -R/usr/local/lib
-.sp
-(for example) to the compile command to get round this problem.
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 23 January 2008
-Copyright (c) 1997-2008 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcrestack.3 b/libs/pcre/doc/pcrestack.3
deleted file mode 100644
index 845425d97d..0000000000
--- a/libs/pcre/doc/pcrestack.3
+++ /dev/null
@@ -1,160 +0,0 @@
-.TH PCRESTACK 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE DISCUSSION OF STACK USAGE"
-.rs
-.sp
-When you call \fBpcre_exec()\fP, it makes use of an internal function called
-\fBmatch()\fP. This calls itself recursively at branch points in the pattern,
-in order to remember the state of the match so that it can back up and try a
-different alternative if the first one fails. As matching proceeds deeper and
-deeper into the tree of possibilities, the recursion depth increases.
-.P
-Not all calls of \fBmatch()\fP increase the recursion depth; for an item such
-as a* it may be called several times at the same level, after matching
-different numbers of a's. Furthermore, in a number of cases where the result of
-the recursive call would immediately be passed back as the result of the
-current call (a "tail recursion"), the function is just restarted instead.
-.P
-The \fBpcre_dfa_exec()\fP function operates in an entirely different way, and
-hardly uses recursion at all. The limit on its complexity is the amount of
-workspace it is given. The comments that follow do NOT apply to
-\fBpcre_dfa_exec()\fP; they are relevant only for \fBpcre_exec()\fP.
-.P
-You can set limits on the number of times that \fBmatch()\fP is called, both in
-total and recursively. If the limit is exceeded, an error occurs. For details,
-see the
-.\" HTML
-.\"
-section on extra data for \fBpcre_exec()\fP
-.\"
-in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation.
-.P
-Each time that \fBmatch()\fP is actually called recursively, it uses memory
-from the process stack. For certain kinds of pattern and data, very large
-amounts of stack may be needed, despite the recognition of "tail recursion".
-You can often reduce the amount of recursion, and therefore the amount of stack
-used, by modifying the pattern that is being matched. Consider, for example,
-this pattern:
-.sp
- ([^<]|<(?!inet))+
-.sp
-It matches from wherever it starts until it encounters "
-.\"
-http://developer.apple.com/qa/qa2005/qa1419.html.
-.\"
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 09 July 2008
-Copyright (c) 1997-2008 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcresyntax.3 b/libs/pcre/doc/pcresyntax.3
deleted file mode 100644
index 13c1e0f3a0..0000000000
--- a/libs/pcre/doc/pcresyntax.3
+++ /dev/null
@@ -1,449 +0,0 @@
-.TH PCRESYNTAX 3
-.SH NAME
-PCRE - Perl-compatible regular expressions
-.SH "PCRE REGULAR EXPRESSION SYNTAX SUMMARY"
-.rs
-.sp
-The full syntax and semantics of the regular expressions that are supported by
-PCRE are described in the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation. This document contains just a quick-reference summary of the
-syntax.
-.
-.
-.SH "QUOTING"
-.rs
-.sp
- \ex where x is non-alphanumeric is a literal x
- \eQ...\eE treat enclosed characters as literal
-.
-.
-.SH "CHARACTERS"
-.rs
-.sp
- \ea alarm, that is, the BEL character (hex 07)
- \ecx "control-x", where x is any character
- \ee escape (hex 1B)
- \ef formfeed (hex 0C)
- \en newline (hex 0A)
- \er carriage return (hex 0D)
- \et tab (hex 09)
- \eddd character with octal code ddd, or backreference
- \exhh character with hex code hh
- \ex{hhh..} character with hex code hhh..
-.
-.
-.SH "CHARACTER TYPES"
-.rs
-.sp
- . any character except newline;
- in dotall mode, any character whatsoever
- \eC one byte, even in UTF-8 mode (best avoided)
- \ed a decimal digit
- \eD a character that is not a decimal digit
- \eh a horizontal whitespace character
- \eH a character that is not a horizontal whitespace character
- \ep{\fIxx\fP} a character with the \fIxx\fP property
- \eP{\fIxx\fP} a character without the \fIxx\fP property
- \eR a newline sequence
- \es a whitespace character
- \eS a character that is not a whitespace character
- \ev a vertical whitespace character
- \eV a character that is not a vertical whitespace character
- \ew a "word" character
- \eW a "non-word" character
- \eX an extended Unicode sequence
-.sp
-In PCRE, \ed, \eD, \es, \eS, \ew, and \eW recognize only ASCII characters.
-.
-.
-.SH "GENERAL CATEGORY PROPERTY CODES FOR \ep and \eP"
-.rs
-.sp
- C Other
- Cc Control
- Cf Format
- Cn Unassigned
- Co Private use
- Cs Surrogate
-.sp
- L Letter
- Ll Lower case letter
- Lm Modifier letter
- Lo Other letter
- Lt Title case letter
- Lu Upper case letter
- L& Ll, Lu, or Lt
-.sp
- M Mark
- Mc Spacing mark
- Me Enclosing mark
- Mn Non-spacing mark
-.sp
- N Number
- Nd Decimal number
- Nl Letter number
- No Other number
-.sp
- P Punctuation
- Pc Connector punctuation
- Pd Dash punctuation
- Pe Close punctuation
- Pf Final punctuation
- Pi Initial punctuation
- Po Other punctuation
- Ps Open punctuation
-.sp
- S Symbol
- Sc Currency symbol
- Sk Modifier symbol
- Sm Mathematical symbol
- So Other symbol
-.sp
- Z Separator
- Zl Line separator
- Zp Paragraph separator
- Zs Space separator
-.
-.
-.SH "SCRIPT NAMES FOR \ep AND \eP"
-.rs
-.sp
-Arabic,
-Armenian,
-Balinese,
-Bengali,
-Bopomofo,
-Braille,
-Buginese,
-Buhid,
-Canadian_Aboriginal,
-Carian,
-Cham,
-Cherokee,
-Common,
-Coptic,
-Cuneiform,
-Cypriot,
-Cyrillic,
-Deseret,
-Devanagari,
-Ethiopic,
-Georgian,
-Glagolitic,
-Gothic,
-Greek,
-Gujarati,
-Gurmukhi,
-Han,
-Hangul,
-Hanunoo,
-Hebrew,
-Hiragana,
-Inherited,
-Kannada,
-Katakana,
-Kayah_Li,
-Kharoshthi,
-Khmer,
-Lao,
-Latin,
-Lepcha,
-Limbu,
-Linear_B,
-Lycian,
-Lydian,
-Malayalam,
-Mongolian,
-Myanmar,
-New_Tai_Lue,
-Nko,
-Ogham,
-Old_Italic,
-Old_Persian,
-Ol_Chiki,
-Oriya,
-Osmanya,
-Phags_Pa,
-Phoenician,
-Rejang,
-Runic,
-Saurashtra,
-Shavian,
-Sinhala,
-Sudanese,
-Syloti_Nagri,
-Syriac,
-Tagalog,
-Tagbanwa,
-Tai_Le,
-Tamil,
-Telugu,
-Thaana,
-Thai,
-Tibetan,
-Tifinagh,
-Ugaritic,
-Vai,
-Yi.
-.
-.
-.SH "CHARACTER CLASSES"
-.rs
-.sp
- [...] positive character class
- [^...] negative character class
- [x-y] range (can be used for hex characters)
- [[:xxx:]] positive POSIX named set
- [[:^xxx:]] negative POSIX named set
-.sp
- alnum alphanumeric
- alpha alphabetic
- ascii 0-127
- blank space or tab
- cntrl control character
- digit decimal digit
- graph printing, excluding space
- lower lower case letter
- print printing, including space
- punct printing, excluding alphanumeric
- space whitespace
- upper upper case letter
- word same as \ew
- xdigit hexadecimal digit
-.sp
-In PCRE, POSIX character set names recognize only ASCII characters. You can use
-\eQ...\eE inside a character class.
-.
-.
-.SH "QUANTIFIERS"
-.rs
-.sp
- ? 0 or 1, greedy
- ?+ 0 or 1, possessive
- ?? 0 or 1, lazy
- * 0 or more, greedy
- *+ 0 or more, possessive
- *? 0 or more, lazy
- + 1 or more, greedy
- ++ 1 or more, possessive
- +? 1 or more, lazy
- {n} exactly n
- {n,m} at least n, no more than m, greedy
- {n,m}+ at least n, no more than m, possessive
- {n,m}? at least n, no more than m, lazy
- {n,} n or more, greedy
- {n,}+ n or more, possessive
- {n,}? n or more, lazy
-.
-.
-.SH "ANCHORS AND SIMPLE ASSERTIONS"
-.rs
-.sp
- \eb word boundary (only ASCII letters recognized)
- \eB not a word boundary
- ^ start of subject
- also after internal newline in multiline mode
- \eA start of subject
- $ end of subject
- also before newline at end of subject
- also before internal newline in multiline mode
- \eZ end of subject
- also before newline at end of subject
- \ez end of subject
- \eG first matching position in subject
-.
-.
-.SH "MATCH POINT RESET"
-.rs
-.sp
- \eK reset start of match
-.
-.
-.SH "ALTERNATION"
-.rs
-.sp
- expr|expr|expr...
-.
-.
-.SH "CAPTURING"
-.rs
-.sp
- (...) capturing group
- (?...) named capturing group (Perl)
- (?'name'...) named capturing group (Perl)
- (?P...) named capturing group (Python)
- (?:...) non-capturing group
- (?|...) non-capturing group; reset group numbers for
- capturing groups in each alternative
-.
-.
-.SH "ATOMIC GROUPS"
-.rs
-.sp
- (?>...) atomic, non-capturing group
-.
-.
-.
-.
-.SH "COMMENT"
-.rs
-.sp
- (?#....) comment (not nestable)
-.
-.
-.SH "OPTION SETTING"
-.rs
-.sp
- (?i) caseless
- (?J) allow duplicate names
- (?m) multiline
- (?s) single line (dotall)
- (?U) default ungreedy (lazy)
- (?x) extended (ignore white space)
- (?-...) unset option(s)
-.sp
-The following is recognized only at the start of a pattern or after one of the
-newline-setting options with similar syntax:
-.sp
- (*UTF8) set UTF-8 mode
-.
-.
-.SH "LOOKAHEAD AND LOOKBEHIND ASSERTIONS"
-.rs
-.sp
- (?=...) positive look ahead
- (?!...) negative look ahead
- (?<=...) positive look behind
- (? reference by name (Perl)
- \ek'name' reference by name (Perl)
- \eg{name} reference by name (Perl)
- \ek{name} reference by name (.NET)
- (?P=name) reference by name (Python)
-.
-.
-.SH "SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)"
-.rs
-.sp
- (?R) recurse whole pattern
- (?n) call subpattern by absolute number
- (?+n) call subpattern by relative number
- (?-n) call subpattern by relative number
- (?&name) call subpattern by name (Perl)
- (?P>name) call subpattern by name (Python)
- \eg call subpattern by name (Oniguruma)
- \eg'name' call subpattern by name (Oniguruma)
- \eg call subpattern by absolute number (Oniguruma)
- \eg'n' call subpattern by absolute number (Oniguruma)
- \eg<+n> call subpattern by relative number (PCRE extension)
- \eg'+n' call subpattern by relative number (PCRE extension)
- \eg<-n> call subpattern by relative number (PCRE extension)
- \eg'-n' call subpattern by relative number (PCRE extension)
-.
-.
-.SH "CONDITIONAL PATTERNS"
-.rs
-.sp
- (?(condition)yes-pattern)
- (?(condition)yes-pattern|no-pattern)
-.sp
- (?(n)... absolute reference condition
- (?(+n)... relative reference condition
- (?(-n)... relative reference condition
- (?()... named reference condition (Perl)
- (?('name')... named reference condition (Perl)
- (?(name)... named reference condition (PCRE)
- (?(R)... overall recursion condition
- (?(Rn)... specific group recursion condition
- (?(R&name)... specific recursion condition
- (?(DEFINE)... define subpattern for reference
- (?(assert)... assertion condition
-.
-.
-.SH "BACKTRACKING CONTROL"
-.rs
-.sp
-The following act immediately they are reached:
-.sp
- (*ACCEPT) force successful match
- (*FAIL) force backtrack; synonym (*F)
-.sp
-The following act only when a subsequent match failure causes a backtrack to
-reach them. They all force a match failure, but they differ in what happens
-afterwards. Those that advance the start-of-match point do so only if the
-pattern is not anchored.
-.sp
- (*COMMIT) overall failure, no advance of starting point
- (*PRUNE) advance to next starting character
- (*SKIP) advance start to current matching position
- (*THEN) local failure, backtrack to next alternation
-.
-.
-.SH "NEWLINE CONVENTIONS"
-.rs
-.sp
-These are recognized only at the very start of the pattern or after a
-(*BSR_...) or (*UTF8) option.
-.sp
- (*CR) carriage return only
- (*LF) linefeed only
- (*CRLF) carriage return followed by linefeed
- (*ANYCRLF) all three of the above
- (*ANY) any Unicode newline sequence
-.
-.
-.SH "WHAT \eR MATCHES"
-.rs
-.sp
-These are recognized only at the very start of the pattern or after a
-(*...) option that sets the newline convention or UTF-8 mode.
-.sp
- (*BSR_ANYCRLF) CR, LF, or CRLF
- (*BSR_UNICODE) any Unicode newline sequence
-.
-.
-.SH "CALLOUTS"
-.rs
-.sp
- (?C) callout
- (?Cn) callout with data n
-.
-.
-.SH "SEE ALSO"
-.rs
-.sp
-\fBpcrepattern\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3),
-\fBpcrematching\fP(3), \fBpcre\fP(3).
-.
-.
-.SH AUTHOR
-.rs
-.sp
-.nf
-Philip Hazel
-University Computing Service
-Cambridge CB2 3QH, England.
-.fi
-.
-.
-.SH REVISION
-.rs
-.sp
-.nf
-Last updated: 11 April 2009
-Copyright (c) 1997-2009 University of Cambridge.
-.fi
diff --git a/libs/pcre/doc/pcretest.1 b/libs/pcre/doc/pcretest.1
deleted file mode 100644
index dc2f4a8c13..0000000000
--- a/libs/pcre/doc/pcretest.1
+++ /dev/null
@@ -1,728 +0,0 @@
-.TH PCRETEST 1
-.SH NAME
-pcretest - a program for testing Perl-compatible regular expressions.
-.SH SYNOPSIS
-.rs
-.sp
-.B pcretest "[options] [source] [destination]"
-.sp
-\fBpcretest\fP was written as a test program for the PCRE regular expression
-library itself, but it can also be used for experimenting with regular
-expressions. This document describes the features of the test program; for
-details of the regular expressions themselves, see the
-.\" HREF
-\fBpcrepattern\fP
-.\"
-documentation. For details of the PCRE library function calls and their
-options, see the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation.
-.
-.
-.SH OPTIONS
-.rs
-.TP 10
-\fB-b\fP
-Behave as if each regex has the \fB/B\fP (show bytecode) modifier; the internal
-form is output after compilation.
-.TP 10
-\fB-C\fP
-Output the version number of the PCRE library, and all available information
-about the optional features that are included, and then exit.
-.TP 10
-\fB-d\fP
-Behave as if each regex has the \fB/D\fP (debug) modifier; the internal
-form and information about the compiled pattern is output after compilation;
-\fB-d\fP is equivalent to \fB-b -i\fP.
-.TP 10
-\fB-dfa\fP
-Behave as if each data line contains the \eD escape sequence; this causes the
-alternative matching function, \fBpcre_dfa_exec()\fP, to be used instead of the
-standard \fBpcre_exec()\fP function (more detail is given below).
-.TP 10
-\fB-help\fP
-Output a brief summary these options and then exit.
-.TP 10
-\fB-i\fP
-Behave as if each regex has the \fB/I\fP modifier; information about the
-compiled pattern is given after compilation.
-.TP 10
-\fB-M\fP
-Behave as if each data line contains the \eM escape sequence; this causes
-PCRE to discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings by
-calling \fBpcre_exec()\fP repeatedly with different limits.
-.TP 10
-\fB-m\fP
-Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding \fB/M\fP to each regular expression. For compatibility
-with earlier versions of pcretest, \fB-s\fP is a synonym for \fB-m\fP.
-.TP 10
-\fB-o\fP \fIosize\fP
-Set the number of elements in the output vector that is used when calling
-\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP to be \fIosize\fP. The default value
-is 45, which is enough for 14 capturing subexpressions for \fBpcre_exec()\fP or
-22 different matches for \fBpcre_dfa_exec()\fP. The vector size can be
-changed for individual matching calls by including \eO in the data line (see
-below).
-.TP 10
-\fB-p\fP
-Behave as if each regex has the \fB/P\fP modifier; the POSIX wrapper API is
-used to call PCRE. None of the other options has any effect when \fB-p\fP is
-set.
-.TP 10
-\fB-q\fP
-Do not output the version number of \fBpcretest\fP at the start of execution.
-.TP 10
-\fB-S\fP \fIsize\fP
-On Unix-like systems, set the size of the runtime stack to \fIsize\fP
-megabytes.
-.TP 10
-\fB-t\fP
-Run each compile, study, and match many times with a timer, and output
-resulting time per compile or match (in milliseconds). Do not set \fB-m\fP with
-\fB-t\fP, because you will then get the size output a zillion times, and the
-timing will be distorted. You can control the number of iterations that are
-used for timing by following \fB-t\fP with a number (as a separate item on the
-command line). For example, "-t 1000" would iterate 1000 times. The default is
-to iterate 500000 times.
-.TP 10
-\fB-tm\fP
-This is like \fB-t\fP except that it times only the matching phase, not the
-compile or study phases.
-.
-.
-.SH DESCRIPTION
-.rs
-.sp
-If \fBpcretest\fP is given two filename arguments, it reads from the first and
-writes to the second. If it is given only one filename argument, it reads from
-that file and writes to stdout. Otherwise, it reads from stdin and writes to
-stdout, and prompts for each line of input, using "re>" to prompt for regular
-expressions, and "data>" to prompt for data lines.
-.P
-When \fBpcretest\fP is built, a configuration option can specify that it should
-be linked with the \fBlibreadline\fP library. When this is done, if the input
-is from a terminal, it is read using the \fBreadline()\fP function. This
-provides line-editing and history facilities. The output from the \fB-help\fP
-option states whether or not \fBreadline()\fP will be used.
-.P
-The program handles any number of sets of input on a single input file. Each
-set starts with a regular expression, and continues with any number of data
-lines to be matched against the pattern.
-.P
-Each data line is matched separately and independently. If you want to do
-multi-line matches, you have to use the \en escape sequence (or \er or \er\en,
-etc., depending on the newline setting) in a single line of input to encode the
-newline sequences. There is no limit on the length of data lines; the input
-buffer is automatically extended if it is too small.
-.P
-An empty line signals the end of the data lines, at which point a new regular
-expression is read. The regular expressions are given enclosed in any
-non-alphanumeric delimiters other than backslash, for example:
-.sp
- /(a|bc)x+yz/
-.sp
-White space before the initial delimiter is ignored. A regular expression may
-be continued over several input lines, in which case the newline characters are
-included within it. It is possible to include the delimiter within the pattern
-by escaping it, for example
-.sp
- /abc\e/def/
-.sp
-If you do so, the escape and the delimiter form part of the pattern, but since
-delimiters are always non-alphanumeric, this does not affect its interpretation.
-If the terminating delimiter is immediately followed by a backslash, for
-example,
-.sp
- /abc/\e
-.sp
-then a backslash is added to the end of the pattern. This is done to provide a
-way of testing the error condition that arises if a pattern finishes with a
-backslash, because
-.sp
- /abc\e/
-.sp
-is interpreted as the first line of a pattern that starts with "abc/", causing
-pcretest to read the next line as a continuation of the regular expression.
-.
-.
-.SH "PATTERN MODIFIERS"
-.rs
-.sp
-A pattern may be followed by any number of modifiers, which are mostly single
-characters. Following Perl usage, these are referred to below as, for example,
-"the \fB/i\fP modifier", even though the delimiter of the pattern need not
-always be a slash, and no slash is used when writing modifiers. Whitespace may
-appear between the final pattern delimiter and the first modifier, and between
-the modifiers themselves.
-.P
-The \fB/i\fP, \fB/m\fP, \fB/s\fP, and \fB/x\fP modifiers set the PCRE_CASELESS,
-PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when
-\fBpcre_compile()\fP is called. These four modifier letters have the same
-effect as they do in Perl. For example:
-.sp
- /caseless/i
-.sp
-The following table shows additional modifiers for setting PCRE options that do
-not correspond to anything in Perl:
-.sp
- \fB/A\fP PCRE_ANCHORED
- \fB/C\fP PCRE_AUTO_CALLOUT
- \fB/E\fP PCRE_DOLLAR_ENDONLY
- \fB/f\fP PCRE_FIRSTLINE
- \fB/J\fP PCRE_DUPNAMES
- \fB/N\fP PCRE_NO_AUTO_CAPTURE
- \fB/U\fP PCRE_UNGREEDY
- \fB/X\fP PCRE_EXTRA
- \fB/\fP PCRE_JAVASCRIPT_COMPAT
- \fB/\fP PCRE_NEWLINE_CR
- \fB/\fP PCRE_NEWLINE_LF
- \fB/\fP PCRE_NEWLINE_CRLF
- \fB/\fP PCRE_NEWLINE_ANYCRLF
- \fB/\fP PCRE_NEWLINE_ANY
- \fB/\fP PCRE_BSR_ANYCRLF
- \fB/\fP PCRE_BSR_UNICODE
-.sp
-Those specifying line ending sequences are literal strings as shown, but the
-letters can be in either case. This example sets multiline matching with CRLF
-as the line ending sequence:
-.sp
- /^abc/m
-.sp
-Details of the meanings of these PCRE options are given in the
-.\" HREF
-\fBpcreapi\fP
-.\"
-documentation.
-.
-.
-.SS "Finding all matches in a string"
-.rs
-.sp
-Searching for all possible matches within each subject string can be requested
-by the \fB/g\fP or \fB/G\fP modifier. After finding a match, PCRE is called
-again to search the remainder of the subject string. The difference between
-\fB/g\fP and \fB/G\fP is that the former uses the \fIstartoffset\fP argument to
-\fBpcre_exec()\fP to start searching at a new point within the entire string
-(which is in effect what Perl does), whereas the latter passes over a shortened
-substring. This makes a difference to the matching process if the pattern
-begins with a lookbehind assertion (including \eb or \eB).
-.P
-If any call to \fBpcre_exec()\fP in a \fB/g\fP or \fB/G\fP sequence matches an
-empty string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
-flags set in order to search for another, non-empty, match at the same point.
-If this second match fails, the start offset is advanced by one, and the normal
-match is retried. This imitates the way Perl handles such cases when using the
-\fB/g\fP modifier or the \fBsplit()\fP function.
-.
-.
-.SS "Other modifiers"
-.rs
-.sp
-There are yet more modifiers for controlling the way \fBpcretest\fP
-operates.
-.P
-The \fB/+\fP modifier requests that as well as outputting the substring that
-matched the entire pattern, pcretest should in addition output the remainder of
-the subject string. This is useful for tests where the subject contains
-multiple copies of the same substring.
-.P
-The \fB/B\fP modifier is a debugging feature. It requests that \fBpcretest\fP
-output a representation of the compiled byte code after compilation. Normally
-this information contains length and offset values; however, if \fB/Z\fP is
-also present, this data is replaced by spaces. This is a special feature for
-use in the automatic test scripts; it ensures that the same output is generated
-for different internal link sizes.
-.P
-The \fB/L\fP modifier must be followed directly by the name of a locale, for
-example,
-.sp
- /pattern/Lfr_FR
-.sp
-For this reason, it must be the last modifier. The given locale is set,
-\fBpcre_maketables()\fP is called to build a set of character tables for the
-locale, and this is then passed to \fBpcre_compile()\fP when compiling the
-regular expression. Without an \fB/L\fP modifier, NULL is passed as the tables
-pointer; that is, \fB/L\fP applies only to the expression on which it appears.
-.P
-The \fB/I\fP modifier requests that \fBpcretest\fP output information about the
-compiled pattern (whether it is anchored, has a fixed first character, and
-so on). It does this by calling \fBpcre_fullinfo()\fP after compiling a
-pattern. If the pattern is studied, the results of that are also output.
-.P
-The \fB/D\fP modifier is a PCRE debugging feature, and is equivalent to
-\fB/BI\fP, that is, both the \fB/B\fP and the \fB/I\fP modifiers.
-.P
-The \fB/F\fP modifier causes \fBpcretest\fP to flip the byte order of the
-fields in the compiled pattern that contain 2-byte and 4-byte numbers. This
-facility is for testing the feature in PCRE that allows it to execute patterns
-that were compiled on a host with a different endianness. This feature is not
-available when the POSIX interface to PCRE is being used, that is, when the
-\fB/P\fP pattern modifier is specified. See also the section about saving and
-reloading compiled patterns below.
-.P
-The \fB/S\fP modifier causes \fBpcre_study()\fP to be called after the
-expression has been compiled, and the results used when the expression is
-matched.
-.P
-The \fB/M\fP modifier causes the size of memory block used to hold the compiled
-pattern to be output.
-.P
-The \fB/P\fP modifier causes \fBpcretest\fP to call PCRE via the POSIX wrapper
-API rather than its native API. When this is done, all other modifiers except
-\fB/i\fP, \fB/m\fP, and \fB/+\fP are ignored. REG_ICASE is set if \fB/i\fP is
-present, and REG_NEWLINE is set if \fB/m\fP is present. The wrapper functions
-force PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
-.P
-The \fB/8\fP modifier causes \fBpcretest\fP to call PCRE with the PCRE_UTF8
-option set. This turns on support for UTF-8 character handling in PCRE,
-provided that it was compiled with this support enabled. This modifier also
-causes any non-printing characters in output strings to be printed using the
-\ex{hh...} notation if they are valid UTF-8 sequences.
-.P
-If the \fB/?\fP modifier is used with \fB/8\fP, it causes \fBpcretest\fP to
-call \fBpcre_compile()\fP with the PCRE_NO_UTF8_CHECK option, to suppress the
-checking of the string for UTF-8 validity.
-.
-.
-.SH "DATA LINES"
-.rs
-.sp
-Before each data line is passed to \fBpcre_exec()\fP, leading and trailing
-whitespace is removed, and it is then scanned for \e escapes. Some of these are
-pretty esoteric features, intended for checking out some of the more
-complicated features of PCRE. If you are just testing "ordinary" regular
-expressions, you probably don't need any of these. The following escapes are
-recognized:
-.sp
- \ea alarm (BEL, \ex07)
- \eb backspace (\ex08)
- \ee escape (\ex27)
- \ef formfeed (\ex0c)
- \en newline (\ex0a)
-.\" JOIN
- \eqdd set the PCRE_MATCH_LIMIT limit to dd
- (any number of digits)
- \er carriage return (\ex0d)
- \et tab (\ex09)
- \ev vertical tab (\ex0b)
- \ennn octal character (up to 3 octal digits)
- \exhh hexadecimal character (up to 2 hex digits)
-.\" JOIN
- \ex{hh...} hexadecimal character, any number of digits
- in UTF-8 mode
-.\" JOIN
- \eA pass the PCRE_ANCHORED option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \eB pass the PCRE_NOTBOL option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \eCdd call pcre_copy_substring() for substring dd
- after a successful match (number less than 32)
-.\" JOIN
- \eCname call pcre_copy_named_substring() for substring
- "name" after a successful match (name termin-
- ated by next non alphanumeric character)
-.\" JOIN
- \eC+ show the current captured substrings at callout
- time
- \eC- do not supply a callout function
-.\" JOIN
- \eC!n return 1 instead of 0 when callout number n is
- reached
-.\" JOIN
- \eC!n!m return 1 instead of 0 when callout number n is
- reached for the nth time
-.\" JOIN
- \eC*n pass the number n (may be negative) as callout
- data; this is used as the callout return value
- \eD use the \fBpcre_dfa_exec()\fP match function
- \eF only shortest match for \fBpcre_dfa_exec()\fP
-.\" JOIN
- \eGdd call pcre_get_substring() for substring dd
- after a successful match (number less than 32)
-.\" JOIN
- \eGname call pcre_get_named_substring() for substring
- "name" after a successful match (name termin-
- ated by next non-alphanumeric character)
-.\" JOIN
- \eL call pcre_get_substringlist() after a
- successful match
-.\" JOIN
- \eM discover the minimum MATCH_LIMIT and
- MATCH_LIMIT_RECURSION settings
-.\" JOIN
- \eN pass the PCRE_NOTEMPTY option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \eOdd set the size of the output vector passed to
- \fBpcre_exec()\fP to dd (any number of digits)
-.\" JOIN
- \eP pass the PCRE_PARTIAL option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \eQdd set the PCRE_MATCH_LIMIT_RECURSION limit to dd
- (any number of digits)
- \eR pass the PCRE_DFA_RESTART option to \fBpcre_dfa_exec()\fP
- \eS output details of memory get/free calls during matching
-.\" JOIN
- \eZ pass the PCRE_NOTEOL option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \e? pass the PCRE_NO_UTF8_CHECK option to
- \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP
- \e>dd start the match at offset dd (any number of digits);
-.\" JOIN
- this sets the \fIstartoffset\fP argument for \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \e pass the PCRE_NEWLINE_CR option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \e pass the PCRE_NEWLINE_LF option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \e pass the PCRE_NEWLINE_CRLF option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \e pass the PCRE_NEWLINE_ANYCRLF option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.\" JOIN
- \e pass the PCRE_NEWLINE_ANY option to \fBpcre_exec()\fP
- or \fBpcre_dfa_exec()\fP
-.sp
-The escapes that specify line ending sequences are literal strings, exactly as
-shown. No more than one newline setting should be present in any data line.
-.P
-A backslash followed by anything else just escapes the anything else. If
-the very last character is a backslash, it is ignored. This gives a way of
-passing an empty line as data, since a real empty line terminates the data
-input.
-.P
-If \eM is present, \fBpcretest\fP calls \fBpcre_exec()\fP several times, with
-different values in the \fImatch_limit\fP and \fImatch_limit_recursion\fP
-fields of the \fBpcre_extra\fP data structure, until it finds the minimum
-numbers for each parameter that allow \fBpcre_exec()\fP to complete. The
-\fImatch_limit\fP number is a measure of the amount of backtracking that takes
-place, and checking it out can be instructive. For most simple matches, the
-number is quite small, but for patterns with very large numbers of matching
-possibilities, it can become large very quickly with increasing length of
-subject string. The \fImatch_limit_recursion\fP number is a measure of how much
-stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is needed
-to complete the match attempt.
-.P
-When \eO is used, the value specified may be higher or lower than the size set
-by the \fB-O\fP command line option (or defaulted to 45); \eO applies only to
-the call of \fBpcre_exec()\fP for the line in which it appears.
-.P
-If the \fB/P\fP modifier was present on the pattern, causing the POSIX wrapper
-API to be used, the only option-setting sequences that have any effect are \eB
-and \eZ, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
-\fBregexec()\fP.
-.P
-The use of \ex{hh...} to represent UTF-8 characters is not dependent on the use
-of the \fB/8\fP modifier on the pattern. It is recognized always. There may be
-any number of hexadecimal digits inside the braces. The result is from one to
-six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
-allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
-valid Unicode code points, or indeed valid UTF-8 characters according to the
-later rules in RFC 3629.
-.
-.
-.SH "THE ALTERNATIVE MATCHING FUNCTION"
-.rs
-.sp
-By default, \fBpcretest\fP uses the standard PCRE matching function,
-\fBpcre_exec()\fP to match each data line. From release 6.0, PCRE supports an
-alternative matching function, \fBpcre_dfa_test()\fP, which operates in a
-different way, and has some restrictions. The differences between the two
-functions are described in the
-.\" HREF
-\fBpcrematching\fP
-.\"
-documentation.
-.P
-If a data line contains the \eD escape sequence, or if the command line
-contains the \fB-dfa\fP option, the alternative matching function is called.
-This function finds all possible matches at a given point. If, however, the \eF
-escape sequence is present in the data line, it stops after the first match is
-found. This is always the shortest possible match.
-.
-.
-.SH "DEFAULT OUTPUT FROM PCRETEST"
-.rs
-.sp
-This section describes the output when the normal matching function,
-\fBpcre_exec()\fP, is being used.
-.P
-When a match succeeds, pcretest outputs the list of captured substrings that
-\fBpcre_exec()\fP returns, starting with number 0 for the string that matched
-the whole pattern. Otherwise, it outputs "No match" or "Partial match"
-when \fBpcre_exec()\fP returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PARTIAL,
-respectively, and otherwise the PCRE negative error number. Here is an example
-of an interactive \fBpcretest\fP run.
-.sp
- $ pcretest
- PCRE version 7.0 30-Nov-2006
-.sp
- re> /^abc(\ed+)/
- data> abc123
- 0: abc123
- 1: 123
- data> xyz
- No match
-.sp
-Note that unset capturing substrings that are not followed by one that is set
-are not returned by \fBpcre_exec()\fP, and are not shown by \fBpcretest\fP. In
-the following example, there are two capturing substrings, but when the first
-data line is matched, the second, unset substring is not shown. An "internal"
-unset substring is shown as "", as for the second data line.
-.sp
- re> /(a)|(b)/
- data> a
- 0: a
- 1: a
- data> b
- 0: b
- 1:
- 2: b
-.sp
-If the strings contain any non-printing characters, they are output as \e0x
-escapes, or as \ex{...} escapes if the \fB/8\fP modifier was present on the
-pattern. See below for the definition of non-printing characters. If the
-pattern has the \fB/+\fP modifier, the output for substring 0 is followed by
-the the rest of the subject string, identified by "0+" like this:
-.sp
- re> /cat/+
- data> cataract
- 0: cat
- 0+ aract
-.sp
-If the pattern has the \fB/g\fP or \fB/G\fP modifier, the results of successive
-matching attempts are output in sequence, like this:
-.sp
- re> /\eBi(\ew\ew)/g
- data> Mississippi
- 0: iss
- 1: ss
- 0: iss
- 1: ss
- 0: ipp
- 1: pp
-.sp
-"No match" is output only if the first match attempt fails.
-.P
-If any of the sequences \fB\eC\fP, \fB\eG\fP, or \fB\eL\fP are present in a
-data line that is successfully matched, the substrings extracted by the
-convenience functions are output with C, G, or L after the string number
-instead of a colon. This is in addition to the normal full list. The string
-length (that is, the return from the extraction function) is given in
-parentheses after each string for \fB\eC\fP and \fB\eG\fP.
-.P
-Note that whereas patterns can be continued over several lines (a plain ">"
-prompt is used for continuations), data lines may not. However newlines can be
-included in data by means of the \en escape (or \er, \er\en, etc., depending on
-the newline sequence setting).
-.
-.
-.
-.SH "OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION"
-.rs
-.sp
-When the alternative matching function, \fBpcre_dfa_exec()\fP, is used (by
-means of the \eD escape sequence or the \fB-dfa\fP command line option), the
-output consists of a list of all the matches that start at the first point in
-the subject where there is at least one match. For example:
-.sp
- re> /(tang|tangerine|tan)/
- data> yellow tangerine\eD
- 0: tangerine
- 1: tang
- 2: tan
-.sp
-(Using the normal matching function on this data finds only "tang".) The
-longest matching string is always given first (and numbered zero).
-.P
-If \fB/g\fP is present on the pattern, the search for further matches resumes
-at the end of the longest match. For example:
-.sp
- re> /(tang|tangerine|tan)/g
- data> yellow tangerine and tangy sultana\eD
- 0: tangerine
- 1: tang
- 2: tan
- 0: tang
- 1: tan
- 0: tan
-.sp
-Since the matching function does not support substring capture, the escape
-sequences that are concerned with captured substrings are not relevant.
-.
-.
-.SH "RESTARTING AFTER A PARTIAL MATCH"
-.rs
-.sp
-When the alternative matching function has given the PCRE_ERROR_PARTIAL return,
-indicating that the subject partially matched the pattern, you can restart the
-match with additional subject data by means of the \eR escape sequence. For
-example:
-.sp
- re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
- data> 23ja\eP\eD
- Partial match: 23ja
- data> n05\eR\eD
- 0: n05
-.sp
-For further information about partial matching, see the
-.\" HREF
-\fBpcrepartial\fP
-.\"
-documentation.
-.
-.
-.SH CALLOUTS
-.rs
-.sp
-If the pattern contains any callout requests, \fBpcretest\fP's callout function
-is called during matching. This works with both matching functions. By default,
-the called function displays the callout number, the start and current
-positions in the text at the callout time, and the next pattern item to be
-tested. For example, the output
-.sp
- --->pqrabcdef
- 0 ^ ^ \ed
-.sp
-indicates that callout number 0 occurred for a match attempt starting at the
-fourth character of the subject string, when the pointer was at the seventh
-character of the data, and when the next pattern item was \ed. Just one
-circumflex is output if the start and current positions are the same.
-.P
-Callouts numbered 255 are assumed to be automatic callouts, inserted as a
-result of the \fB/C\fP pattern modifier. In this case, instead of showing the
-callout number, the offset in the pattern, preceded by a plus, is output. For
-example:
-.sp
- re> /\ed?[A-E]\e*/C
- data> E*
- --->E*
- +0 ^ \ed?
- +3 ^ [A-E]
- +8 ^^ \e*
- +10 ^ ^
- 0: E*
-.sp
-The callout function in \fBpcretest\fP returns zero (carry on matching) by
-default, but you can use a \eC item in a data line (as described above) to
-change this.
-.P
-Inserting callouts can be helpful when using \fBpcretest\fP to check
-complicated regular expressions. For further information about callouts, see
-the
-.\" HREF
-\fBpcrecallout\fP
-.\"
-documentation.
-.
-.
-.
-.SH "NON-PRINTING CHARACTERS"
-.rs
-.sp
-When \fBpcretest\fP is outputting text in the compiled version of a pattern,
-bytes other than 32-126 are always treated as non-printing characters are are
-therefore shown as hex escapes.
-.P
-When \fBpcretest\fP is outputting text that is a matched part of a subject
-string, it behaves in the same way, unless a different locale has been set for
-the pattern (using the \fB/L\fP modifier). In this case, the \fBisprint()\fP
-function to distinguish printing and non-printing characters.
-.
-.
-.
-.SH "SAVING AND RELOADING COMPILED PATTERNS"
-.rs
-.sp
-The facilities described in this section are not available when the POSIX
-inteface to PCRE is being used, that is, when the \fB/P\fP pattern modifier is
-specified.
-.P
-When the POSIX interface is not in use, you can cause \fBpcretest\fP to write a
-compiled pattern to a file, by following the modifiers with > and a file name.
-For example:
-.sp
- /pattern/im >/some/file
-.sp
-See the
-.\" HREF
-\fBpcreprecompile\fP
-.\"
-documentation for a discussion about saving and re-using compiled patterns.
-.P
-The data that is written is binary. The first eight bytes are the length of the
-compiled pattern data followed by the length of the optional study data, each
-written as four bytes in big-endian order (most significant byte first). If
-there is no study data (either the pattern was not studied, or studying did not
-return any data), the second length is zero. The lengths are followed by an
-exact copy of the compiled pattern. If there is additional study data, this
-follows immediately after the compiled pattern. After writing the file,
-\fBpcretest\fP expects to read a new pattern.
-.P
-A saved pattern can be reloaded into \fBpcretest\fP by specifing < and a file
-name instead of a pattern. The name of the file must not contain a < character,
-as otherwise \fBpcretest\fP will interpret the line as a pattern delimited by <
-characters.
-For example:
-.sp
- re> " to prompt for regular expressions, and "data>" to prompt for data
- lines.
-
- When pcretest is built, a configuration option can specify that it
- should be linked with the libreadline library. When this is done, if
- the input is from a terminal, it is read using the readline() function.
- This provides line-editing and history facilities. The output from the
- -help option states whether or not readline() will be used.
-
- The program handles any number of sets of input on a single input file.
- Each set starts with a regular expression, and continues with any num-
- ber of data lines to be matched against the pattern.
-
- Each data line is matched separately and independently. If you want to
- do multi-line matches, you have to use the \n escape sequence (or \r or
- \r\n, etc., depending on the newline setting) in a single line of input
- to encode the newline sequences. There is no limit on the length of
- data lines; the input buffer is automatically extended if it is too
- small.
-
- An empty line signals the end of the data lines, at which point a new
- regular expression is read. The regular expressions are given enclosed
- in any non-alphanumeric delimiters other than backslash, for example:
-
- /(a|bc)x+yz/
-
- White space before the initial delimiter is ignored. A regular expres-
- sion may be continued over several input lines, in which case the new-
- line characters are included within it. It is possible to include the
- delimiter within the pattern by escaping it, for example
-
- /abc\/def/
-
- If you do so, the escape and the delimiter form part of the pattern,
- but since delimiters are always non-alphanumeric, this does not affect
- its interpretation. If the terminating delimiter is immediately fol-
- lowed by a backslash, for example,
-
- /abc/\
-
- then a backslash is added to the end of the pattern. This is done to
- provide a way of testing the error condition that arises if a pattern
- finishes with a backslash, because
-
- /abc\/
-
- is interpreted as the first line of a pattern that starts with "abc/",
- causing pcretest to read the next line as a continuation of the regular
- expression.
-
-
-PATTERN MODIFIERS
-
- A pattern may be followed by any number of modifiers, which are mostly
- single characters. Following Perl usage, these are referred to below
- as, for example, "the /i modifier", even though the delimiter of the
- pattern need not always be a slash, and no slash is used when writing
- modifiers. Whitespace may appear between the final pattern delimiter
- and the first modifier, and between the modifiers themselves.
-
- The /i, /m, /s, and /x modifiers set the PCRE_CASELESS, PCRE_MULTILINE,
- PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when pcre_com-
- pile() is called. These four modifier letters have the same effect as
- they do in Perl. For example:
-
- /caseless/i
-
- The following table shows additional modifiers for setting PCRE options
- that do not correspond to anything in Perl:
-
- /A PCRE_ANCHORED
- /C PCRE_AUTO_CALLOUT
- /E PCRE_DOLLAR_ENDONLY
- /f PCRE_FIRSTLINE
- /J PCRE_DUPNAMES
- /N PCRE_NO_AUTO_CAPTURE
- /U PCRE_UNGREEDY
- /X PCRE_EXTRA
- / PCRE_JAVASCRIPT_COMPAT
- / PCRE_NEWLINE_CR
- / PCRE_NEWLINE_LF
- / PCRE_NEWLINE_CRLF
- / PCRE_NEWLINE_ANYCRLF
- / PCRE_NEWLINE_ANY
- / PCRE_BSR_ANYCRLF
- / PCRE_BSR_UNICODE
-
- Those specifying line ending sequences are literal strings as shown,
- but the letters can be in either case. This example sets multiline
- matching with CRLF as the line ending sequence:
-
- /^abc/m
-
- Details of the meanings of these PCRE options are given in the pcreapi
- documentation.
-
- Finding all matches in a string
-
- Searching for all possible matches within each subject string can be
- requested by the /g or /G modifier. After finding a match, PCRE is
- called again to search the remainder of the subject string. The differ-
- ence between /g and /G is that the former uses the startoffset argument
- to pcre_exec() to start searching at a new point within the entire
- string (which is in effect what Perl does), whereas the latter passes
- over a shortened substring. This makes a difference to the matching
- process if the pattern begins with a lookbehind assertion (including \b
- or \B).
-
- If any call to pcre_exec() in a /g or /G sequence matches an empty
- string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
- flags set in order to search for another, non-empty, match at the same
- point. If this second match fails, the start offset is advanced by
- one, and the normal match is retried. This imitates the way Perl han-
- dles such cases when using the /g modifier or the split() function.
-
- Other modifiers
-
- There are yet more modifiers for controlling the way pcretest operates.
-
- The /+ modifier requests that as well as outputting the substring that
- matched the entire pattern, pcretest should in addition output the
- remainder of the subject string. This is useful for tests where the
- subject contains multiple copies of the same substring.
-
- The /B modifier is a debugging feature. It requests that pcretest out-
- put a representation of the compiled byte code after compilation. Nor-
- mally this information contains length and offset values; however, if
- /Z is also present, this data is replaced by spaces. This is a special
- feature for use in the automatic test scripts; it ensures that the same
- output is generated for different internal link sizes.
-
- The /L modifier must be followed directly by the name of a locale, for
- example,
-
- /pattern/Lfr_FR
-
- For this reason, it must be the last modifier. The given locale is set,
- pcre_maketables() is called to build a set of character tables for the
- locale, and this is then passed to pcre_compile() when compiling the
- regular expression. Without an /L modifier, NULL is passed as the
- tables pointer; that is, /L applies only to the expression on which it
- appears.
-
- The /I modifier requests that pcretest output information about the
- compiled pattern (whether it is anchored, has a fixed first character,
- and so on). It does this by calling pcre_fullinfo() after compiling a
- pattern. If the pattern is studied, the results of that are also out-
- put.
-
- The /D modifier is a PCRE debugging feature, and is equivalent to /BI,
- that is, both the /B and the /I modifiers.
-
- The /F modifier causes pcretest to flip the byte order of the fields in
- the compiled pattern that contain 2-byte and 4-byte numbers. This
- facility is for testing the feature in PCRE that allows it to execute
- patterns that were compiled on a host with a different endianness. This
- feature is not available when the POSIX interface to PCRE is being
- used, that is, when the /P pattern modifier is specified. See also the
- section about saving and reloading compiled patterns below.
-
- The /S modifier causes pcre_study() to be called after the expression
- has been compiled, and the results used when the expression is matched.
-
- The /M modifier causes the size of memory block used to hold the com-
- piled pattern to be output.
-
- The /P modifier causes pcretest to call PCRE via the POSIX wrapper API
- rather than its native API. When this is done, all other modifiers
- except /i, /m, and /+ are ignored. REG_ICASE is set if /i is present,
- and REG_NEWLINE is set if /m is present. The wrapper functions force
- PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
-
- The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8 option
- set. This turns on support for UTF-8 character handling in PCRE, pro-
- vided that it was compiled with this support enabled. This modifier
- also causes any non-printing characters in output strings to be printed
- using the \x{hh...} notation if they are valid UTF-8 sequences.
-
- If the /? modifier is used with /8, it causes pcretest to call
- pcre_compile() with the PCRE_NO_UTF8_CHECK option, to suppress the
- checking of the string for UTF-8 validity.
-
-
-DATA LINES
-
- Before each data line is passed to pcre_exec(), leading and trailing
- whitespace is removed, and it is then scanned for \ escapes. Some of
- these are pretty esoteric features, intended for checking out some of
- the more complicated features of PCRE. If you are just testing "ordi-
- nary" regular expressions, you probably don't need any of these. The
- following escapes are recognized:
-
- \a alarm (BEL, \x07)
- \b backspace (\x08)
- \e escape (\x27)
- \f formfeed (\x0c)
- \n newline (\x0a)
- \qdd set the PCRE_MATCH_LIMIT limit to dd
- (any number of digits)
- \r carriage return (\x0d)
- \t tab (\x09)
- \v vertical tab (\x0b)
- \nnn octal character (up to 3 octal digits)
- \xhh hexadecimal character (up to 2 hex digits)
- \x{hh...} hexadecimal character, any number of digits
- in UTF-8 mode
- \A pass the PCRE_ANCHORED option to pcre_exec()
- or pcre_dfa_exec()
- \B pass the PCRE_NOTBOL option to pcre_exec()
- or pcre_dfa_exec()
- \Cdd call pcre_copy_substring() for substring dd
- after a successful match (number less than 32)
- \Cname call pcre_copy_named_substring() for substring
- "name" after a successful match (name termin-
- ated by next non alphanumeric character)
- \C+ show the current captured substrings at callout
- time
- \C- do not supply a callout function
- \C!n return 1 instead of 0 when callout number n is
- reached
- \C!n!m return 1 instead of 0 when callout number n is
- reached for the nth time
- \C*n pass the number n (may be negative) as callout
- data; this is used as the callout return value
- \D use the pcre_dfa_exec() match function
- \F only shortest match for pcre_dfa_exec()
- \Gdd call pcre_get_substring() for substring dd
- after a successful match (number less than 32)
- \Gname call pcre_get_named_substring() for substring
- "name" after a successful match (name termin-
- ated by next non-alphanumeric character)
- \L call pcre_get_substringlist() after a
- successful match
- \M discover the minimum MATCH_LIMIT and
- MATCH_LIMIT_RECURSION settings
- \N pass the PCRE_NOTEMPTY option to pcre_exec()
- or pcre_dfa_exec()
- \Odd set the size of the output vector passed to
- pcre_exec() to dd (any number of digits)
- \P pass the PCRE_PARTIAL option to pcre_exec()
- or pcre_dfa_exec()
- \Qdd set the PCRE_MATCH_LIMIT_RECURSION limit to dd
- (any number of digits)
- \R pass the PCRE_DFA_RESTART option to pcre_dfa_exec()
- \S output details of memory get/free calls during matching
- \Z pass the PCRE_NOTEOL option to pcre_exec()
- or pcre_dfa_exec()
- \? pass the PCRE_NO_UTF8_CHECK option to
- pcre_exec() or pcre_dfa_exec()
- \>dd start the match at offset dd (any number of digits);
- this sets the startoffset argument for pcre_exec()
- or pcre_dfa_exec()
- \ pass the PCRE_NEWLINE_CR option to pcre_exec()
- or pcre_dfa_exec()
- \ pass the PCRE_NEWLINE_LF option to pcre_exec()
- or pcre_dfa_exec()
- \ pass the PCRE_NEWLINE_CRLF option to pcre_exec()
- or pcre_dfa_exec()
- \ pass the PCRE_NEWLINE_ANYCRLF option to pcre_exec()
- or pcre_dfa_exec()
- \ pass the PCRE_NEWLINE_ANY option to pcre_exec()
- or pcre_dfa_exec()
-
- The escapes that specify line ending sequences are literal strings,
- exactly as shown. No more than one newline setting should be present in
- any data line.
-
- A backslash followed by anything else just escapes the anything else.
- If the very last character is a backslash, it is ignored. This gives a
- way of passing an empty line as data, since a real empty line termi-
- nates the data input.
-
- If \M is present, pcretest calls pcre_exec() several times, with dif-
- ferent values in the match_limit and match_limit_recursion fields of
- the pcre_extra data structure, until it finds the minimum numbers for
- each parameter that allow pcre_exec() to complete. The match_limit num-
- ber is a measure of the amount of backtracking that takes place, and
- checking it out can be instructive. For most simple matches, the number
- is quite small, but for patterns with very large numbers of matching
- possibilities, it can become large very quickly with increasing length
- of subject string. The match_limit_recursion number is a measure of how
- much stack (or, if PCRE is compiled with NO_RECURSE, how much heap)
- memory is needed to complete the match attempt.
-
- When \O is used, the value specified may be higher or lower than the
- size set by the -O command line option (or defaulted to 45); \O applies
- only to the call of pcre_exec() for the line in which it appears.
-
- If the /P modifier was present on the pattern, causing the POSIX wrap-
- per API to be used, the only option-setting sequences that have any
- effect are \B and \Z, causing REG_NOTBOL and REG_NOTEOL, respectively,
- to be passed to regexec().
-
- The use of \x{hh...} to represent UTF-8 characters is not dependent on
- the use of the /8 modifier on the pattern. It is recognized always.
- There may be any number of hexadecimal digits inside the braces. The
- result is from one to six bytes, encoded according to the original
- UTF-8 rules of RFC 2279. This allows for values in the range 0 to
- 0x7FFFFFFF. Note that not all of those are valid Unicode code points,
- or indeed valid UTF-8 characters according to the later rules in RFC
- 3629.
-
-
-THE ALTERNATIVE MATCHING FUNCTION
-
- By default, pcretest uses the standard PCRE matching function,
- pcre_exec() to match each data line. From release 6.0, PCRE supports an
- alternative matching function, pcre_dfa_test(), which operates in a
- different way, and has some restrictions. The differences between the
- two functions are described in the pcrematching documentation.
-
- If a data line contains the \D escape sequence, or if the command line
- contains the -dfa option, the alternative matching function is called.
- This function finds all possible matches at a given point. If, however,
- the \F escape sequence is present in the data line, it stops after the
- first match is found. This is always the shortest possible match.
-
-
-DEFAULT OUTPUT FROM PCRETEST
-
- This section describes the output when the normal matching function,
- pcre_exec(), is being used.
-
- When a match succeeds, pcretest outputs the list of captured substrings
- that pcre_exec() returns, starting with number 0 for the string that
- matched the whole pattern. Otherwise, it outputs "No match" or "Partial
- match" when pcre_exec() returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PAR-
- TIAL, respectively, and otherwise the PCRE negative error number. Here
- is an example of an interactive pcretest run.
-
- $ pcretest
- PCRE version 7.0 30-Nov-2006
-
- re> /^abc(\d+)/
- data> abc123
- 0: abc123
- 1: 123
- data> xyz
- No match
-
- Note that unset capturing substrings that are not followed by one that
- is set are not returned by pcre_exec(), and are not shown by pcretest.
- In the following example, there are two capturing substrings, but when
- the first data line is matched, the second, unset substring is not
- shown. An "internal" unset substring is shown as "", as for the
- second data line.
-
- re> /(a)|(b)/
- data> a
- 0: a
- 1: a
- data> b
- 0: b
- 1:
- 2: b
-
- If the strings contain any non-printing characters, they are output as
- \0x escapes, or as \x{...} escapes if the /8 modifier was present on
- the pattern. See below for the definition of non-printing characters.
- If the pattern has the /+ modifier, the output for substring 0 is fol-
- lowed by the the rest of the subject string, identified by "0+" like
- this:
-
- re> /cat/+
- data> cataract
- 0: cat
- 0+ aract
-
- If the pattern has the /g or /G modifier, the results of successive
- matching attempts are output in sequence, like this:
-
- re> /\Bi(\w\w)/g
- data> Mississippi
- 0: iss
- 1: ss
- 0: iss
- 1: ss
- 0: ipp
- 1: pp
-
- "No match" is output only if the first match attempt fails.
-
- If any of the sequences \C, \G, or \L are present in a data line that
- is successfully matched, the substrings extracted by the convenience
- functions are output with C, G, or L after the string number instead of
- a colon. This is in addition to the normal full list. The string length
- (that is, the return from the extraction function) is given in paren-
- theses after each string for \C and \G.
-
- Note that whereas patterns can be continued over several lines (a plain
- ">" prompt is used for continuations), data lines may not. However new-
- lines can be included in data by means of the \n escape (or \r, \r\n,
- etc., depending on the newline sequence setting).
-
-
-OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
-
- When the alternative matching function, pcre_dfa_exec(), is used (by
- means of the \D escape sequence or the -dfa command line option), the
- output consists of a list of all the matches that start at the first
- point in the subject where there is at least one match. For example:
-
- re> /(tang|tangerine|tan)/
- data> yellow tangerine\D
- 0: tangerine
- 1: tang
- 2: tan
-
- (Using the normal matching function on this data finds only "tang".)
- The longest matching string is always given first (and numbered zero).
-
- If /g is present on the pattern, the search for further matches resumes
- at the end of the longest match. For example:
-
- re> /(tang|tangerine|tan)/g
- data> yellow tangerine and tangy sultana\D
- 0: tangerine
- 1: tang
- 2: tan
- 0: tang
- 1: tan
- 0: tan
-
- Since the matching function does not support substring capture, the
- escape sequences that are concerned with captured substrings are not
- relevant.
-
-
-RESTARTING AFTER A PARTIAL MATCH
-
- When the alternative matching function has given the PCRE_ERROR_PARTIAL
- return, indicating that the subject partially matched the pattern, you
- can restart the match with additional subject data by means of the \R
- escape sequence. For example:
-
- re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
- data> 23ja\P\D
- Partial match: 23ja
- data> n05\R\D
- 0: n05
-
- For further information about partial matching, see the pcrepartial
- documentation.
-
-
-CALLOUTS
-
- If the pattern contains any callout requests, pcretest's callout func-
- tion is called during matching. This works with both matching func-
- tions. By default, the called function displays the callout number, the
- start and current positions in the text at the callout time, and the
- next pattern item to be tested. For example, the output
-
- --->pqrabcdef
- 0 ^ ^ \d
-
- indicates that callout number 0 occurred for a match attempt starting
- at the fourth character of the subject string, when the pointer was at
- the seventh character of the data, and when the next pattern item was
- \d. Just one circumflex is output if the start and current positions
- are the same.
-
- Callouts numbered 255 are assumed to be automatic callouts, inserted as
- a result of the /C pattern modifier. In this case, instead of showing
- the callout number, the offset in the pattern, preceded by a plus, is
- output. For example:
-
- re> /\d?[A-E]\*/C
- data> E*
- --->E*
- +0 ^ \d?
- +3 ^ [A-E]
- +8 ^^ \*
- +10 ^ ^
- 0: E*
-
- The callout function in pcretest returns zero (carry on matching) by
- default, but you can use a \C item in a data line (as described above)
- to change this.
-
- Inserting callouts can be helpful when using pcretest to check compli-
- cated regular expressions. For further information about callouts, see
- the pcrecallout documentation.
-
-
-NON-PRINTING CHARACTERS
-
- When pcretest is outputting text in the compiled version of a pattern,
- bytes other than 32-126 are always treated as non-printing characters
- are are therefore shown as hex escapes.
-
- When pcretest is outputting text that is a matched part of a subject
- string, it behaves in the same way, unless a different locale has been
- set for the pattern (using the /L modifier). In this case, the
- isprint() function to distinguish printing and non-printing characters.
-
-
-SAVING AND RELOADING COMPILED PATTERNS
-
- The facilities described in this section are not available when the
- POSIX inteface to PCRE is being used, that is, when the /P pattern mod-
- ifier is specified.
-
- When the POSIX interface is not in use, you can cause pcretest to write
- a compiled pattern to a file, by following the modifiers with > and a
- file name. For example:
-
- /pattern/im >/some/file
-
- See the pcreprecompile documentation for a discussion about saving and
- re-using compiled patterns.
-
- The data that is written is binary. The first eight bytes are the
- length of the compiled pattern data followed by the length of the
- optional study data, each written as four bytes in big-endian order
- (most significant byte first). If there is no study data (either the
- pattern was not studied, or studying did not return any data), the sec-
- ond length is zero. The lengths are followed by an exact copy of the
- compiled pattern. If there is additional study data, this follows imme-
- diately after the compiled pattern. After writing the file, pcretest
- expects to read a new pattern.
-
- A saved pattern can be reloaded into pcretest by specifing < and a file
- name instead of a pattern. The name of the file must not contain a <
- character, as otherwise pcretest will interpret the line as a pattern
- delimited by < characters. For example:
-
- re> nul 2>nul
-
-:: sh configure
-
-:: check for needed header files
-if not exist pcre.h copy pcre.h.generic pcre.h
-if not exist config.h copy config.h.generic config.h
-
-bcc32 -DDFTABLES %COMPILE_DEFAULTS% -L%BORLAND%\lib dftables.c
-IF ERRORLEVEL 1 GOTO ERROR
-
-:: dftables > chartables.c
-dftables pcre_chartables.c
-
-REM compile and link the PCRE library into lib: option -B for ASM compile works too
-bcc32 -a4 -c -RT- -y- -v- -u- -R- -Q- -X -d -fp -ff -P- -O2 -Oc -Ov -3 -w-8004 -w-8064 -w-8065 -w-8012 -UDFTABLES -DVPCOMPAT %COMPILE_DEFAULTS% @makevp_c.txt
-IF ERRORLEVEL 1 GOTO ERROR
-
-tlib %BORLAND%\lib\cw32.lib *calloc *del *strncmp *memcpy *memmove *memset *memcmp *strlen
-IF ERRORLEVEL 1 GOTO ERROR
-tlib pcre%PCRE_VER%.lib @makevp_l.txt +calloc.obj +del.obj +strncmp.obj +memcpy.obj +memmove.obj +memset.obj +memcmp.obj +strlen.obj
-IF ERRORLEVEL 1 GOTO ERROR
-
-del *.obj *.tds *.bak >nul 2>nul
-
-echo ---
-echo Now the library should be complete. Please check all messages above.
-echo Don't care for warnings, it's OK.
-goto END
-
-:ERROR
-echo ---
-echo Error while compiling PCRE. Aborting...
-pause
-goto END
-
-:END
diff --git a/libs/pcre/makevp_c.txt b/libs/pcre/makevp_c.txt
deleted file mode 100644
index 931b8ab88d..0000000000
--- a/libs/pcre/makevp_c.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-pcre_chartables.c
-pcre_compile.c
-pcre_config.c
-pcre_dfa_exec.c
-pcre_exec.c
-pcre_fullinfo.c
-pcre_get.c
-pcre_globals.c
-pcre_info.c
-pcre_maketables.c
-pcre_newline.c
-pcre_ord2utf8.c
-pcre_refcount.c
-pcre_study.c
-pcre_tables.c
-pcre_try_flipped.c
-pcre_ucd.c
-pcre_valid_utf8.c
-pcre_version.c
-pcre_xclass.c
diff --git a/libs/pcre/makevp_l.txt b/libs/pcre/makevp_l.txt
deleted file mode 100644
index 6de1cb438e..0000000000
--- a/libs/pcre/makevp_l.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-+pcre_chartables.obj &
-+pcre_compile.obj &
-+pcre_config.obj &
-+pcre_dfa_exec.obj &
-+pcre_exec.obj &
-+pcre_fullinfo.obj &
-+pcre_get.obj &
-+pcre_globals.obj &
-+pcre_info.obj &
-+pcre_maketables.obj &
-+pcre_newline.obj &
-+pcre_ord2utf8.obj &
-+pcre_refcount.obj &
-+pcre_study.obj &
-+pcre_tables.obj &
-+pcre_try_flipped.obj &
-+pcre_ucd.obj &
-+pcre_valid_utf8.obj &
-+pcre_version.obj &
-+pcre_xclass.obj
diff --git a/libs/pcre/pcre-config.in b/libs/pcre/pcre-config.in
deleted file mode 100644
index 3b52101b9b..0000000000
--- a/libs/pcre/pcre-config.in
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/sh
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-exec_prefix_set=no
-
-usage="\
-Usage: pcre-config [--prefix] [--exec-prefix] [--version] [--libs] [--libs-posix] [--cflags] [--cflags-posix]"
-
-if test $# -eq 0; then
- echo "${usage}" 1>&2
- exit 1
-fi
-
-libR=
-case `uname -s` in
- *SunOS*)
- libR=" -R@libdir@"
- ;;
- *BSD*)
- libR=" -Wl,-R@libdir@"
- ;;
-esac
-
-while test $# -gt 0; do
- case "$1" in
- -*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
- *) optarg= ;;
- esac
-
- case $1 in
- --prefix=*)
- prefix=$optarg
- if test $exec_prefix_set = no ; then
- exec_prefix=$optarg
- fi
- ;;
- --prefix)
- echo $prefix
- ;;
- --exec-prefix=*)
- exec_prefix=$optarg
- exec_prefix_set=yes
- ;;
- --exec-prefix)
- echo $exec_prefix
- ;;
- --version)
- echo @PACKAGE_VERSION@
- ;;
- --cflags | --cflags-posix)
- if test @includedir@ != /usr/include ; then
- includes=-I@includedir@
- fi
- echo $includes
- ;;
- --libs-posix)
- echo -L@libdir@$libR -lpcreposix -lpcre
- ;;
- --libs)
- echo -L@libdir@$libR -lpcre
- ;;
- *)
- echo "${usage}" 1>&2
- exit 1
- ;;
- esac
- shift
-done
diff --git a/libs/pcre/pcre.def b/libs/pcre/pcre.def
deleted file mode 100644
index ec2c7b395b..0000000000
--- a/libs/pcre/pcre.def
+++ /dev/null
@@ -1,29 +0,0 @@
-EXPORTS
-
-pcre_malloc DATA
-pcre_free DATA
-
-pcre_compile
-pcre_compile2
-pcre_config
-pcre_copy_named_substring
-pcre_copy_substring
-pcre_dfa_exec
-pcre_exec
-pcre_free_substring
-pcre_free_substring_list
-pcre_fullinfo
-pcre_get_named_substring
-pcre_get_stringnumber
-pcre_get_substring
-pcre_get_substring_list
-pcre_info
-pcre_maketables
-pcre_refcount
-pcre_study
-pcre_version
-
-regcomp
-regexec
-regerror
-regfree
diff --git a/libs/pcre/pcre.h b/libs/pcre/pcre.h
deleted file mode 100644
index c5fc4c13e4..0000000000
--- a/libs/pcre/pcre.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* This is the public header file for the PCRE library, to be #included by
-applications that call the PCRE functions.
-
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-#ifndef _PCRE_H
-#define _PCRE_H
-
-/* The current PCRE version information. */
-
-#define PCRE_MAJOR 7
-#define PCRE_MINOR 9
-#define PCRE_PRERELEASE
-#define PCRE_DATE 2009-04-11
-
-/* When an application links to a PCRE DLL in Windows, the symbols that are
-imported have to be identified as such. When building PCRE, the appropriate
-export setting is defined in pcre_internal.h, which includes this file. So we
-don't change existing definitions of PCRE_EXP_DECL and PCRECPP_EXP_DECL. */
-
-#if defined(_WIN32) && !defined(PCRE_STATIC)
-# ifndef PCRE_EXP_DECL
-# define PCRE_EXP_DECL extern __declspec(dllimport)
-# endif
-# ifdef __cplusplus
-# ifndef PCRECPP_EXP_DECL
-# define PCRECPP_EXP_DECL extern __declspec(dllimport)
-# endif
-# ifndef PCRECPP_EXP_DEFN
-# define PCRECPP_EXP_DEFN __declspec(dllimport)
-# endif
-# endif
-#endif
-
-/* By default, we use the standard "extern" declarations. */
-
-#ifndef PCRE_EXP_DECL
-# ifdef __cplusplus
-# define PCRE_EXP_DECL extern "C"
-# else
-# define PCRE_EXP_DECL extern
-# endif
-#endif
-
-#ifdef __cplusplus
-# ifndef PCRECPP_EXP_DECL
-# define PCRECPP_EXP_DECL extern
-# endif
-# ifndef PCRECPP_EXP_DEFN
-# define PCRECPP_EXP_DEFN
-# endif
-#endif
-
-/* Have to include stdlib.h in order to ensure that size_t is defined;
-it is needed here for malloc. */
-
-#include
-
-/* Allow for C++ users */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Options. Some are compile-time only, some are run-time only, and some are
-both, so we keep them all distinct. */
-
-#define PCRE_CASELESS 0x00000001
-#define PCRE_MULTILINE 0x00000002
-#define PCRE_DOTALL 0x00000004
-#define PCRE_EXTENDED 0x00000008
-#define PCRE_ANCHORED 0x00000010
-#define PCRE_DOLLAR_ENDONLY 0x00000020
-#define PCRE_EXTRA 0x00000040
-#define PCRE_NOTBOL 0x00000080
-#define PCRE_NOTEOL 0x00000100
-#define PCRE_UNGREEDY 0x00000200
-#define PCRE_NOTEMPTY 0x00000400
-#define PCRE_UTF8 0x00000800
-#define PCRE_NO_AUTO_CAPTURE 0x00001000
-#define PCRE_NO_UTF8_CHECK 0x00002000
-#define PCRE_AUTO_CALLOUT 0x00004000
-#define PCRE_PARTIAL 0x00008000
-#define PCRE_DFA_SHORTEST 0x00010000
-#define PCRE_DFA_RESTART 0x00020000
-#define PCRE_FIRSTLINE 0x00040000
-#define PCRE_DUPNAMES 0x00080000
-#define PCRE_NEWLINE_CR 0x00100000
-#define PCRE_NEWLINE_LF 0x00200000
-#define PCRE_NEWLINE_CRLF 0x00300000
-#define PCRE_NEWLINE_ANY 0x00400000
-#define PCRE_NEWLINE_ANYCRLF 0x00500000
-#define PCRE_BSR_ANYCRLF 0x00800000
-#define PCRE_BSR_UNICODE 0x01000000
-#define PCRE_JAVASCRIPT_COMPAT 0x02000000
-#define PCRE_NO_START_OPTIMIZE 0x04000000
-#define PCRE_NO_START_OPTIMISE 0x04000000
-
-/* Exec-time and get/set-time error codes */
-
-#define PCRE_ERROR_NOMATCH (-1)
-#define PCRE_ERROR_NULL (-2)
-#define PCRE_ERROR_BADOPTION (-3)
-#define PCRE_ERROR_BADMAGIC (-4)
-#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
-#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
-#define PCRE_ERROR_NOMEMORY (-6)
-#define PCRE_ERROR_NOSUBSTRING (-7)
-#define PCRE_ERROR_MATCHLIMIT (-8)
-#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
-#define PCRE_ERROR_BADUTF8 (-10)
-#define PCRE_ERROR_BADUTF8_OFFSET (-11)
-#define PCRE_ERROR_PARTIAL (-12)
-#define PCRE_ERROR_BADPARTIAL (-13)
-#define PCRE_ERROR_INTERNAL (-14)
-#define PCRE_ERROR_BADCOUNT (-15)
-#define PCRE_ERROR_DFA_UITEM (-16)
-#define PCRE_ERROR_DFA_UCOND (-17)
-#define PCRE_ERROR_DFA_UMLIMIT (-18)
-#define PCRE_ERROR_DFA_WSSIZE (-19)
-#define PCRE_ERROR_DFA_RECURSE (-20)
-#define PCRE_ERROR_RECURSIONLIMIT (-21)
-#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
-#define PCRE_ERROR_BADNEWLINE (-23)
-
-/* Request types for pcre_fullinfo() */
-
-#define PCRE_INFO_OPTIONS 0
-#define PCRE_INFO_SIZE 1
-#define PCRE_INFO_CAPTURECOUNT 2
-#define PCRE_INFO_BACKREFMAX 3
-#define PCRE_INFO_FIRSTBYTE 4
-#define PCRE_INFO_FIRSTCHAR 4 /* For backwards compatibility */
-#define PCRE_INFO_FIRSTTABLE 5
-#define PCRE_INFO_LASTLITERAL 6
-#define PCRE_INFO_NAMEENTRYSIZE 7
-#define PCRE_INFO_NAMECOUNT 8
-#define PCRE_INFO_NAMETABLE 9
-#define PCRE_INFO_STUDYSIZE 10
-#define PCRE_INFO_DEFAULT_TABLES 11
-#define PCRE_INFO_OKPARTIAL 12
-#define PCRE_INFO_JCHANGED 13
-#define PCRE_INFO_HASCRORLF 14
-
-/* Request types for pcre_config(). Do not re-arrange, in order to remain
-compatible. */
-
-#define PCRE_CONFIG_UTF8 0
-#define PCRE_CONFIG_NEWLINE 1
-#define PCRE_CONFIG_LINK_SIZE 2
-#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
-#define PCRE_CONFIG_MATCH_LIMIT 4
-#define PCRE_CONFIG_STACKRECURSE 5
-#define PCRE_CONFIG_UNICODE_PROPERTIES 6
-#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
-#define PCRE_CONFIG_BSR 8
-
-/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine
-these bits, just add new ones on the end, in order to remain compatible. */
-
-#define PCRE_EXTRA_STUDY_DATA 0x0001
-#define PCRE_EXTRA_MATCH_LIMIT 0x0002
-#define PCRE_EXTRA_CALLOUT_DATA 0x0004
-#define PCRE_EXTRA_TABLES 0x0008
-#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
-
-/* Types */
-
-struct real_pcre; /* declaration; the definition is private */
-typedef struct real_pcre pcre;
-
-/* When PCRE is compiled as a C++ library, the subject pointer type can be
-replaced with a custom type. For conventional use, the public interface is a
-const char *. */
-
-#ifndef PCRE_SPTR
-#define PCRE_SPTR const char *
-#endif
-
-/* The structure for passing additional data to pcre_exec(). This is defined in
-such as way as to be extensible. Always add new fields at the end, in order to
-remain compatible. */
-
-typedef struct pcre_extra {
- unsigned long int flags; /* Bits for which fields are set */
- void *study_data; /* Opaque data from pcre_study() */
- unsigned long int match_limit; /* Maximum number of calls to match() */
- void *callout_data; /* Data passed back in callouts */
- const unsigned char *tables; /* Pointer to character tables */
- unsigned long int match_limit_recursion; /* Max recursive calls to match() */
-} pcre_extra;
-
-/* The structure for passing out data via the pcre_callout_function. We use a
-structure so that new fields can be added on the end in future versions,
-without changing the API of the function, thereby allowing old clients to work
-without modification. */
-
-typedef struct pcre_callout_block {
- int version; /* Identifies version of block */
- /* ------------------------ Version 0 ------------------------------- */
- int callout_number; /* Number compiled into pattern */
- int *offset_vector; /* The offset vector */
- PCRE_SPTR subject; /* The subject being matched */
- int subject_length; /* The length of the subject */
- int start_match; /* Offset to start of this match attempt */
- int current_position; /* Where we currently are in the subject */
- int capture_top; /* Max current capture */
- int capture_last; /* Most recently closed capture */
- void *callout_data; /* Data passed in with the call */
- /* ------------------- Added for Version 1 -------------------------- */
- int pattern_position; /* Offset to next item in the pattern */
- int next_item_length; /* Length of next item in the pattern */
- /* ------------------------------------------------------------------ */
-} pcre_callout_block;
-
-/* Indirection for store get and free functions. These can be set to
-alternative malloc/free functions if required. Special ones are used in the
-non-recursive case for "frames". There is also an optional callout function
-that is triggered by the (?) regex item. For Virtual Pascal, these definitions
-have to take another form. */
-
-#ifndef VPCOMPAT
-PCRE_EXP_DECL void *(*pcre_malloc)(size_t);
-PCRE_EXP_DECL void (*pcre_free)(void *);
-PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t);
-PCRE_EXP_DECL void (*pcre_stack_free)(void *);
-PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *);
-#else /* VPCOMPAT */
-PCRE_EXP_DECL void *pcre_malloc(size_t);
-PCRE_EXP_DECL void pcre_free(void *);
-PCRE_EXP_DECL void *pcre_stack_malloc(size_t);
-PCRE_EXP_DECL void pcre_stack_free(void *);
-PCRE_EXP_DECL int pcre_callout(pcre_callout_block *);
-#endif /* VPCOMPAT */
-
-/* Exported PCRE functions */
-
-PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
- const unsigned char *);
-PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **,
- int *, const unsigned char *);
-PCRE_EXP_DECL int pcre_config(int, void *);
-PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *,
- int *, int, const char *, char *, int);
-PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *,
- int);
-PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *,
- const char *, int, int, int, int *, int , int *, int);
-PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR,
- int, int, int, int *, int);
-PCRE_EXP_DECL void pcre_free_substring(const char *);
-PCRE_EXP_DECL void pcre_free_substring_list(const char **);
-PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int,
- void *);
-PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *,
- int *, int, const char *, const char **);
-PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *);
-PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *,
- char **, char **);
-PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int,
- const char **);
-PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
- const char ***);
-PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
-PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
-PCRE_EXP_DECL int pcre_refcount(pcre *, int);
-PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
-PCRE_EXP_DECL const char *pcre_version(void);
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* End of pcre.h */
diff --git a/libs/pcre/pcre.h.generic b/libs/pcre/pcre.h.generic
deleted file mode 100644
index c5fc4c13e4..0000000000
--- a/libs/pcre/pcre.h.generic
+++ /dev/null
@@ -1,307 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* This is the public header file for the PCRE library, to be #included by
-applications that call the PCRE functions.
-
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-#ifndef _PCRE_H
-#define _PCRE_H
-
-/* The current PCRE version information. */
-
-#define PCRE_MAJOR 7
-#define PCRE_MINOR 9
-#define PCRE_PRERELEASE
-#define PCRE_DATE 2009-04-11
-
-/* When an application links to a PCRE DLL in Windows, the symbols that are
-imported have to be identified as such. When building PCRE, the appropriate
-export setting is defined in pcre_internal.h, which includes this file. So we
-don't change existing definitions of PCRE_EXP_DECL and PCRECPP_EXP_DECL. */
-
-#if defined(_WIN32) && !defined(PCRE_STATIC)
-# ifndef PCRE_EXP_DECL
-# define PCRE_EXP_DECL extern __declspec(dllimport)
-# endif
-# ifdef __cplusplus
-# ifndef PCRECPP_EXP_DECL
-# define PCRECPP_EXP_DECL extern __declspec(dllimport)
-# endif
-# ifndef PCRECPP_EXP_DEFN
-# define PCRECPP_EXP_DEFN __declspec(dllimport)
-# endif
-# endif
-#endif
-
-/* By default, we use the standard "extern" declarations. */
-
-#ifndef PCRE_EXP_DECL
-# ifdef __cplusplus
-# define PCRE_EXP_DECL extern "C"
-# else
-# define PCRE_EXP_DECL extern
-# endif
-#endif
-
-#ifdef __cplusplus
-# ifndef PCRECPP_EXP_DECL
-# define PCRECPP_EXP_DECL extern
-# endif
-# ifndef PCRECPP_EXP_DEFN
-# define PCRECPP_EXP_DEFN
-# endif
-#endif
-
-/* Have to include stdlib.h in order to ensure that size_t is defined;
-it is needed here for malloc. */
-
-#include
-
-/* Allow for C++ users */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Options. Some are compile-time only, some are run-time only, and some are
-both, so we keep them all distinct. */
-
-#define PCRE_CASELESS 0x00000001
-#define PCRE_MULTILINE 0x00000002
-#define PCRE_DOTALL 0x00000004
-#define PCRE_EXTENDED 0x00000008
-#define PCRE_ANCHORED 0x00000010
-#define PCRE_DOLLAR_ENDONLY 0x00000020
-#define PCRE_EXTRA 0x00000040
-#define PCRE_NOTBOL 0x00000080
-#define PCRE_NOTEOL 0x00000100
-#define PCRE_UNGREEDY 0x00000200
-#define PCRE_NOTEMPTY 0x00000400
-#define PCRE_UTF8 0x00000800
-#define PCRE_NO_AUTO_CAPTURE 0x00001000
-#define PCRE_NO_UTF8_CHECK 0x00002000
-#define PCRE_AUTO_CALLOUT 0x00004000
-#define PCRE_PARTIAL 0x00008000
-#define PCRE_DFA_SHORTEST 0x00010000
-#define PCRE_DFA_RESTART 0x00020000
-#define PCRE_FIRSTLINE 0x00040000
-#define PCRE_DUPNAMES 0x00080000
-#define PCRE_NEWLINE_CR 0x00100000
-#define PCRE_NEWLINE_LF 0x00200000
-#define PCRE_NEWLINE_CRLF 0x00300000
-#define PCRE_NEWLINE_ANY 0x00400000
-#define PCRE_NEWLINE_ANYCRLF 0x00500000
-#define PCRE_BSR_ANYCRLF 0x00800000
-#define PCRE_BSR_UNICODE 0x01000000
-#define PCRE_JAVASCRIPT_COMPAT 0x02000000
-#define PCRE_NO_START_OPTIMIZE 0x04000000
-#define PCRE_NO_START_OPTIMISE 0x04000000
-
-/* Exec-time and get/set-time error codes */
-
-#define PCRE_ERROR_NOMATCH (-1)
-#define PCRE_ERROR_NULL (-2)
-#define PCRE_ERROR_BADOPTION (-3)
-#define PCRE_ERROR_BADMAGIC (-4)
-#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
-#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
-#define PCRE_ERROR_NOMEMORY (-6)
-#define PCRE_ERROR_NOSUBSTRING (-7)
-#define PCRE_ERROR_MATCHLIMIT (-8)
-#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
-#define PCRE_ERROR_BADUTF8 (-10)
-#define PCRE_ERROR_BADUTF8_OFFSET (-11)
-#define PCRE_ERROR_PARTIAL (-12)
-#define PCRE_ERROR_BADPARTIAL (-13)
-#define PCRE_ERROR_INTERNAL (-14)
-#define PCRE_ERROR_BADCOUNT (-15)
-#define PCRE_ERROR_DFA_UITEM (-16)
-#define PCRE_ERROR_DFA_UCOND (-17)
-#define PCRE_ERROR_DFA_UMLIMIT (-18)
-#define PCRE_ERROR_DFA_WSSIZE (-19)
-#define PCRE_ERROR_DFA_RECURSE (-20)
-#define PCRE_ERROR_RECURSIONLIMIT (-21)
-#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
-#define PCRE_ERROR_BADNEWLINE (-23)
-
-/* Request types for pcre_fullinfo() */
-
-#define PCRE_INFO_OPTIONS 0
-#define PCRE_INFO_SIZE 1
-#define PCRE_INFO_CAPTURECOUNT 2
-#define PCRE_INFO_BACKREFMAX 3
-#define PCRE_INFO_FIRSTBYTE 4
-#define PCRE_INFO_FIRSTCHAR 4 /* For backwards compatibility */
-#define PCRE_INFO_FIRSTTABLE 5
-#define PCRE_INFO_LASTLITERAL 6
-#define PCRE_INFO_NAMEENTRYSIZE 7
-#define PCRE_INFO_NAMECOUNT 8
-#define PCRE_INFO_NAMETABLE 9
-#define PCRE_INFO_STUDYSIZE 10
-#define PCRE_INFO_DEFAULT_TABLES 11
-#define PCRE_INFO_OKPARTIAL 12
-#define PCRE_INFO_JCHANGED 13
-#define PCRE_INFO_HASCRORLF 14
-
-/* Request types for pcre_config(). Do not re-arrange, in order to remain
-compatible. */
-
-#define PCRE_CONFIG_UTF8 0
-#define PCRE_CONFIG_NEWLINE 1
-#define PCRE_CONFIG_LINK_SIZE 2
-#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
-#define PCRE_CONFIG_MATCH_LIMIT 4
-#define PCRE_CONFIG_STACKRECURSE 5
-#define PCRE_CONFIG_UNICODE_PROPERTIES 6
-#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
-#define PCRE_CONFIG_BSR 8
-
-/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine
-these bits, just add new ones on the end, in order to remain compatible. */
-
-#define PCRE_EXTRA_STUDY_DATA 0x0001
-#define PCRE_EXTRA_MATCH_LIMIT 0x0002
-#define PCRE_EXTRA_CALLOUT_DATA 0x0004
-#define PCRE_EXTRA_TABLES 0x0008
-#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
-
-/* Types */
-
-struct real_pcre; /* declaration; the definition is private */
-typedef struct real_pcre pcre;
-
-/* When PCRE is compiled as a C++ library, the subject pointer type can be
-replaced with a custom type. For conventional use, the public interface is a
-const char *. */
-
-#ifndef PCRE_SPTR
-#define PCRE_SPTR const char *
-#endif
-
-/* The structure for passing additional data to pcre_exec(). This is defined in
-such as way as to be extensible. Always add new fields at the end, in order to
-remain compatible. */
-
-typedef struct pcre_extra {
- unsigned long int flags; /* Bits for which fields are set */
- void *study_data; /* Opaque data from pcre_study() */
- unsigned long int match_limit; /* Maximum number of calls to match() */
- void *callout_data; /* Data passed back in callouts */
- const unsigned char *tables; /* Pointer to character tables */
- unsigned long int match_limit_recursion; /* Max recursive calls to match() */
-} pcre_extra;
-
-/* The structure for passing out data via the pcre_callout_function. We use a
-structure so that new fields can be added on the end in future versions,
-without changing the API of the function, thereby allowing old clients to work
-without modification. */
-
-typedef struct pcre_callout_block {
- int version; /* Identifies version of block */
- /* ------------------------ Version 0 ------------------------------- */
- int callout_number; /* Number compiled into pattern */
- int *offset_vector; /* The offset vector */
- PCRE_SPTR subject; /* The subject being matched */
- int subject_length; /* The length of the subject */
- int start_match; /* Offset to start of this match attempt */
- int current_position; /* Where we currently are in the subject */
- int capture_top; /* Max current capture */
- int capture_last; /* Most recently closed capture */
- void *callout_data; /* Data passed in with the call */
- /* ------------------- Added for Version 1 -------------------------- */
- int pattern_position; /* Offset to next item in the pattern */
- int next_item_length; /* Length of next item in the pattern */
- /* ------------------------------------------------------------------ */
-} pcre_callout_block;
-
-/* Indirection for store get and free functions. These can be set to
-alternative malloc/free functions if required. Special ones are used in the
-non-recursive case for "frames". There is also an optional callout function
-that is triggered by the (?) regex item. For Virtual Pascal, these definitions
-have to take another form. */
-
-#ifndef VPCOMPAT
-PCRE_EXP_DECL void *(*pcre_malloc)(size_t);
-PCRE_EXP_DECL void (*pcre_free)(void *);
-PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t);
-PCRE_EXP_DECL void (*pcre_stack_free)(void *);
-PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *);
-#else /* VPCOMPAT */
-PCRE_EXP_DECL void *pcre_malloc(size_t);
-PCRE_EXP_DECL void pcre_free(void *);
-PCRE_EXP_DECL void *pcre_stack_malloc(size_t);
-PCRE_EXP_DECL void pcre_stack_free(void *);
-PCRE_EXP_DECL int pcre_callout(pcre_callout_block *);
-#endif /* VPCOMPAT */
-
-/* Exported PCRE functions */
-
-PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
- const unsigned char *);
-PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **,
- int *, const unsigned char *);
-PCRE_EXP_DECL int pcre_config(int, void *);
-PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *,
- int *, int, const char *, char *, int);
-PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *,
- int);
-PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *,
- const char *, int, int, int, int *, int , int *, int);
-PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR,
- int, int, int, int *, int);
-PCRE_EXP_DECL void pcre_free_substring(const char *);
-PCRE_EXP_DECL void pcre_free_substring_list(const char **);
-PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int,
- void *);
-PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *,
- int *, int, const char *, const char **);
-PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *);
-PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *,
- char **, char **);
-PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int,
- const char **);
-PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
- const char ***);
-PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
-PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
-PCRE_EXP_DECL int pcre_refcount(pcre *, int);
-PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
-PCRE_EXP_DECL const char *pcre_version(void);
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* End of pcre.h */
diff --git a/libs/pcre/pcre.h.in b/libs/pcre/pcre.h.in
deleted file mode 100644
index 484ddfd3d5..0000000000
--- a/libs/pcre/pcre.h.in
+++ /dev/null
@@ -1,307 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* This is the public header file for the PCRE library, to be #included by
-applications that call the PCRE functions.
-
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-#ifndef _PCRE_H
-#define _PCRE_H
-
-/* The current PCRE version information. */
-
-#define PCRE_MAJOR @PCRE_MAJOR@
-#define PCRE_MINOR @PCRE_MINOR@
-#define PCRE_PRERELEASE @PCRE_PRERELEASE@
-#define PCRE_DATE @PCRE_DATE@
-
-/* When an application links to a PCRE DLL in Windows, the symbols that are
-imported have to be identified as such. When building PCRE, the appropriate
-export setting is defined in pcre_internal.h, which includes this file. So we
-don't change existing definitions of PCRE_EXP_DECL and PCRECPP_EXP_DECL. */
-
-#if defined(_WIN32) && !defined(PCRE_STATIC)
-# ifndef PCRE_EXP_DECL
-# define PCRE_EXP_DECL extern __declspec(dllimport)
-# endif
-# ifdef __cplusplus
-# ifndef PCRECPP_EXP_DECL
-# define PCRECPP_EXP_DECL extern __declspec(dllimport)
-# endif
-# ifndef PCRECPP_EXP_DEFN
-# define PCRECPP_EXP_DEFN __declspec(dllimport)
-# endif
-# endif
-#endif
-
-/* By default, we use the standard "extern" declarations. */
-
-#ifndef PCRE_EXP_DECL
-# ifdef __cplusplus
-# define PCRE_EXP_DECL extern "C"
-# else
-# define PCRE_EXP_DECL extern
-# endif
-#endif
-
-#ifdef __cplusplus
-# ifndef PCRECPP_EXP_DECL
-# define PCRECPP_EXP_DECL extern
-# endif
-# ifndef PCRECPP_EXP_DEFN
-# define PCRECPP_EXP_DEFN
-# endif
-#endif
-
-/* Have to include stdlib.h in order to ensure that size_t is defined;
-it is needed here for malloc. */
-
-#include
-
-/* Allow for C++ users */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Options. Some are compile-time only, some are run-time only, and some are
-both, so we keep them all distinct. */
-
-#define PCRE_CASELESS 0x00000001
-#define PCRE_MULTILINE 0x00000002
-#define PCRE_DOTALL 0x00000004
-#define PCRE_EXTENDED 0x00000008
-#define PCRE_ANCHORED 0x00000010
-#define PCRE_DOLLAR_ENDONLY 0x00000020
-#define PCRE_EXTRA 0x00000040
-#define PCRE_NOTBOL 0x00000080
-#define PCRE_NOTEOL 0x00000100
-#define PCRE_UNGREEDY 0x00000200
-#define PCRE_NOTEMPTY 0x00000400
-#define PCRE_UTF8 0x00000800
-#define PCRE_NO_AUTO_CAPTURE 0x00001000
-#define PCRE_NO_UTF8_CHECK 0x00002000
-#define PCRE_AUTO_CALLOUT 0x00004000
-#define PCRE_PARTIAL 0x00008000
-#define PCRE_DFA_SHORTEST 0x00010000
-#define PCRE_DFA_RESTART 0x00020000
-#define PCRE_FIRSTLINE 0x00040000
-#define PCRE_DUPNAMES 0x00080000
-#define PCRE_NEWLINE_CR 0x00100000
-#define PCRE_NEWLINE_LF 0x00200000
-#define PCRE_NEWLINE_CRLF 0x00300000
-#define PCRE_NEWLINE_ANY 0x00400000
-#define PCRE_NEWLINE_ANYCRLF 0x00500000
-#define PCRE_BSR_ANYCRLF 0x00800000
-#define PCRE_BSR_UNICODE 0x01000000
-#define PCRE_JAVASCRIPT_COMPAT 0x02000000
-#define PCRE_NO_START_OPTIMIZE 0x04000000
-#define PCRE_NO_START_OPTIMISE 0x04000000
-
-/* Exec-time and get/set-time error codes */
-
-#define PCRE_ERROR_NOMATCH (-1)
-#define PCRE_ERROR_NULL (-2)
-#define PCRE_ERROR_BADOPTION (-3)
-#define PCRE_ERROR_BADMAGIC (-4)
-#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
-#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
-#define PCRE_ERROR_NOMEMORY (-6)
-#define PCRE_ERROR_NOSUBSTRING (-7)
-#define PCRE_ERROR_MATCHLIMIT (-8)
-#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
-#define PCRE_ERROR_BADUTF8 (-10)
-#define PCRE_ERROR_BADUTF8_OFFSET (-11)
-#define PCRE_ERROR_PARTIAL (-12)
-#define PCRE_ERROR_BADPARTIAL (-13)
-#define PCRE_ERROR_INTERNAL (-14)
-#define PCRE_ERROR_BADCOUNT (-15)
-#define PCRE_ERROR_DFA_UITEM (-16)
-#define PCRE_ERROR_DFA_UCOND (-17)
-#define PCRE_ERROR_DFA_UMLIMIT (-18)
-#define PCRE_ERROR_DFA_WSSIZE (-19)
-#define PCRE_ERROR_DFA_RECURSE (-20)
-#define PCRE_ERROR_RECURSIONLIMIT (-21)
-#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
-#define PCRE_ERROR_BADNEWLINE (-23)
-
-/* Request types for pcre_fullinfo() */
-
-#define PCRE_INFO_OPTIONS 0
-#define PCRE_INFO_SIZE 1
-#define PCRE_INFO_CAPTURECOUNT 2
-#define PCRE_INFO_BACKREFMAX 3
-#define PCRE_INFO_FIRSTBYTE 4
-#define PCRE_INFO_FIRSTCHAR 4 /* For backwards compatibility */
-#define PCRE_INFO_FIRSTTABLE 5
-#define PCRE_INFO_LASTLITERAL 6
-#define PCRE_INFO_NAMEENTRYSIZE 7
-#define PCRE_INFO_NAMECOUNT 8
-#define PCRE_INFO_NAMETABLE 9
-#define PCRE_INFO_STUDYSIZE 10
-#define PCRE_INFO_DEFAULT_TABLES 11
-#define PCRE_INFO_OKPARTIAL 12
-#define PCRE_INFO_JCHANGED 13
-#define PCRE_INFO_HASCRORLF 14
-
-/* Request types for pcre_config(). Do not re-arrange, in order to remain
-compatible. */
-
-#define PCRE_CONFIG_UTF8 0
-#define PCRE_CONFIG_NEWLINE 1
-#define PCRE_CONFIG_LINK_SIZE 2
-#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
-#define PCRE_CONFIG_MATCH_LIMIT 4
-#define PCRE_CONFIG_STACKRECURSE 5
-#define PCRE_CONFIG_UNICODE_PROPERTIES 6
-#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
-#define PCRE_CONFIG_BSR 8
-
-/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine
-these bits, just add new ones on the end, in order to remain compatible. */
-
-#define PCRE_EXTRA_STUDY_DATA 0x0001
-#define PCRE_EXTRA_MATCH_LIMIT 0x0002
-#define PCRE_EXTRA_CALLOUT_DATA 0x0004
-#define PCRE_EXTRA_TABLES 0x0008
-#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
-
-/* Types */
-
-struct real_pcre; /* declaration; the definition is private */
-typedef struct real_pcre pcre;
-
-/* When PCRE is compiled as a C++ library, the subject pointer type can be
-replaced with a custom type. For conventional use, the public interface is a
-const char *. */
-
-#ifndef PCRE_SPTR
-#define PCRE_SPTR const char *
-#endif
-
-/* The structure for passing additional data to pcre_exec(). This is defined in
-such as way as to be extensible. Always add new fields at the end, in order to
-remain compatible. */
-
-typedef struct pcre_extra {
- unsigned long int flags; /* Bits for which fields are set */
- void *study_data; /* Opaque data from pcre_study() */
- unsigned long int match_limit; /* Maximum number of calls to match() */
- void *callout_data; /* Data passed back in callouts */
- const unsigned char *tables; /* Pointer to character tables */
- unsigned long int match_limit_recursion; /* Max recursive calls to match() */
-} pcre_extra;
-
-/* The structure for passing out data via the pcre_callout_function. We use a
-structure so that new fields can be added on the end in future versions,
-without changing the API of the function, thereby allowing old clients to work
-without modification. */
-
-typedef struct pcre_callout_block {
- int version; /* Identifies version of block */
- /* ------------------------ Version 0 ------------------------------- */
- int callout_number; /* Number compiled into pattern */
- int *offset_vector; /* The offset vector */
- PCRE_SPTR subject; /* The subject being matched */
- int subject_length; /* The length of the subject */
- int start_match; /* Offset to start of this match attempt */
- int current_position; /* Where we currently are in the subject */
- int capture_top; /* Max current capture */
- int capture_last; /* Most recently closed capture */
- void *callout_data; /* Data passed in with the call */
- /* ------------------- Added for Version 1 -------------------------- */
- int pattern_position; /* Offset to next item in the pattern */
- int next_item_length; /* Length of next item in the pattern */
- /* ------------------------------------------------------------------ */
-} pcre_callout_block;
-
-/* Indirection for store get and free functions. These can be set to
-alternative malloc/free functions if required. Special ones are used in the
-non-recursive case for "frames". There is also an optional callout function
-that is triggered by the (?) regex item. For Virtual Pascal, these definitions
-have to take another form. */
-
-#ifndef VPCOMPAT
-PCRE_EXP_DECL void *(*pcre_malloc)(size_t);
-PCRE_EXP_DECL void (*pcre_free)(void *);
-PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t);
-PCRE_EXP_DECL void (*pcre_stack_free)(void *);
-PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *);
-#else /* VPCOMPAT */
-PCRE_EXP_DECL void *pcre_malloc(size_t);
-PCRE_EXP_DECL void pcre_free(void *);
-PCRE_EXP_DECL void *pcre_stack_malloc(size_t);
-PCRE_EXP_DECL void pcre_stack_free(void *);
-PCRE_EXP_DECL int pcre_callout(pcre_callout_block *);
-#endif /* VPCOMPAT */
-
-/* Exported PCRE functions */
-
-PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
- const unsigned char *);
-PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **,
- int *, const unsigned char *);
-PCRE_EXP_DECL int pcre_config(int, void *);
-PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *,
- int *, int, const char *, char *, int);
-PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *,
- int);
-PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *,
- const char *, int, int, int, int *, int , int *, int);
-PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR,
- int, int, int, int *, int);
-PCRE_EXP_DECL void pcre_free_substring(const char *);
-PCRE_EXP_DECL void pcre_free_substring_list(const char **);
-PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int,
- void *);
-PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *,
- int *, int, const char *, const char **);
-PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *);
-PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *,
- char **, char **);
-PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int,
- const char **);
-PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
- const char ***);
-PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
-PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
-PCRE_EXP_DECL int pcre_refcount(pcre *, int);
-PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
-PCRE_EXP_DECL const char *pcre_version(void);
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* End of pcre.h */
diff --git a/libs/pcre/pcre_chartables.c.dist b/libs/pcre/pcre_chartables.c.dist
deleted file mode 100644
index ae45db0ca3..0000000000
--- a/libs/pcre/pcre_chartables.c.dist
+++ /dev/null
@@ -1,198 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* This file contains character tables that are used when no external tables
-are passed to PCRE by the application that calls it. The tables are used only
-for characters whose code values are less than 256.
-
-This is a default version of the tables that assumes ASCII encoding. A program
-called dftables (which is distributed with PCRE) can be used to build
-alternative versions of this file. This is necessary if you are running in an
-EBCDIC environment, or if you want to default to a different encoding, for
-example ISO-8859-1. When dftables is run, it creates these tables in the
-current locale. If PCRE is configured with --enable-rebuild-chartables, this
-happens automatically.
-
-The following #includes are present because without the gcc 4.x may remove the
-array definition from the final binary if PCRE is built into a static library
-and dead code stripping is activated. This leads to link errors. Pulling in the
-header ensures that the array gets flagged as "someone outside this compilation
-unit might reference this" and so it will always be supplied to the linker. */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-const unsigned char _pcre_default_tables[] = {
-
-/* This table is a lower casing table. */
-
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39,
- 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55,
- 56, 57, 58, 59, 60, 61, 62, 63,
- 64, 97, 98, 99,100,101,102,103,
- 104,105,106,107,108,109,110,111,
- 112,113,114,115,116,117,118,119,
- 120,121,122, 91, 92, 93, 94, 95,
- 96, 97, 98, 99,100,101,102,103,
- 104,105,106,107,108,109,110,111,
- 112,113,114,115,116,117,118,119,
- 120,121,122,123,124,125,126,127,
- 128,129,130,131,132,133,134,135,
- 136,137,138,139,140,141,142,143,
- 144,145,146,147,148,149,150,151,
- 152,153,154,155,156,157,158,159,
- 160,161,162,163,164,165,166,167,
- 168,169,170,171,172,173,174,175,
- 176,177,178,179,180,181,182,183,
- 184,185,186,187,188,189,190,191,
- 192,193,194,195,196,197,198,199,
- 200,201,202,203,204,205,206,207,
- 208,209,210,211,212,213,214,215,
- 216,217,218,219,220,221,222,223,
- 224,225,226,227,228,229,230,231,
- 232,233,234,235,236,237,238,239,
- 240,241,242,243,244,245,246,247,
- 248,249,250,251,252,253,254,255,
-
-/* This table is a case flipping table. */
-
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39,
- 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55,
- 56, 57, 58, 59, 60, 61, 62, 63,
- 64, 97, 98, 99,100,101,102,103,
- 104,105,106,107,108,109,110,111,
- 112,113,114,115,116,117,118,119,
- 120,121,122, 91, 92, 93, 94, 95,
- 96, 65, 66, 67, 68, 69, 70, 71,
- 72, 73, 74, 75, 76, 77, 78, 79,
- 80, 81, 82, 83, 84, 85, 86, 87,
- 88, 89, 90,123,124,125,126,127,
- 128,129,130,131,132,133,134,135,
- 136,137,138,139,140,141,142,143,
- 144,145,146,147,148,149,150,151,
- 152,153,154,155,156,157,158,159,
- 160,161,162,163,164,165,166,167,
- 168,169,170,171,172,173,174,175,
- 176,177,178,179,180,181,182,183,
- 184,185,186,187,188,189,190,191,
- 192,193,194,195,196,197,198,199,
- 200,201,202,203,204,205,206,207,
- 208,209,210,211,212,213,214,215,
- 216,217,218,219,220,221,222,223,
- 224,225,226,227,228,229,230,231,
- 232,233,234,235,236,237,238,239,
- 240,241,242,243,244,245,246,247,
- 248,249,250,251,252,253,254,255,
-
-/* This table contains bit maps for various character classes. Each map is 32
-bytes long and the bits run from the least significant end of each byte. The
-classes that have their own maps are: space, xdigit, digit, upper, lower, word,
-graph, print, punct, and cntrl. Other classes are built from combinations. */
-
- 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
- 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
- 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff,
- 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,
- 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc,
- 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
-/* This table identifies various classes of character by individual bits:
- 0x01 white space character
- 0x02 letter
- 0x04 decimal digit
- 0x08 hexadecimal digit
- 0x10 alphanumeric or '_'
- 0x80 regular expression metacharacter or binary zero
-*/
-
- 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
- 0x00,0x01,0x01,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
- 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
- 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
- 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
- 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
- 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
- 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
- 0x12,0x12,0x12,0x80,0x80,0x00,0x80,0x10, /* X - _ */
- 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
- 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
- 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
- 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
-
-/* End of pcre_chartables.c */
diff --git a/libs/pcre/pcre_compile.c b/libs/pcre/pcre_compile.c
deleted file mode 100644
index a90f105b06..0000000000
--- a/libs/pcre/pcre_compile.c
+++ /dev/null
@@ -1,6605 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains the external function pcre_compile(), along with
-supporting internal functions that are not used by other modules. */
-
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#define NLBLOCK cd /* Block containing newline information */
-#define PSSTART start_pattern /* Field containing processed string start */
-#define PSEND end_pattern /* Field containing processed string end */
-
-#include "pcre_internal.h"
-
-
-/* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is also
-used by pcretest. PCRE_DEBUG is not defined when building a production library. */
-
-#ifdef PCRE_DEBUG
-#include "pcre_printint.src"
-#endif
-
-
-/* Macro for setting individual bits in class bitmaps. */
-
-#define SETBIT(a,b) a[b/8] |= (1 << (b%8))
-
-/* Maximum length value to check against when making sure that the integer that
-holds the compiled pattern length does not overflow. We make it a bit less than
-INT_MAX to allow for adding in group terminating bytes, so that we don't have
-to check them every time. */
-
-#define OFLOW_MAX (INT_MAX - 20)
-
-
-/*************************************************
-* Code parameters and static tables *
-*************************************************/
-
-/* This value specifies the size of stack workspace that is used during the
-first pre-compile phase that determines how much memory is required. The regex
-is partly compiled into this space, but the compiled parts are discarded as
-soon as they can be, so that hopefully there will never be an overrun. The code
-does, however, check for an overrun. The largest amount I've seen used is 218,
-so this number is very generous.
-
-The same workspace is used during the second, actual compile phase for
-remembering forward references to groups so that they can be filled in at the
-end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
-is 4 there is plenty of room. */
-
-#define COMPILE_WORK_SIZE (4096)
-
-
-/* Table for handling escaped characters in the range '0'-'z'. Positive returns
-are simple data values; negative values are for special things like \d and so
-on. Zero means further processing is needed (for things like \x), or the escape
-is invalid. */
-
-#ifndef EBCDIC
-
-/* This is the "normal" table for ASCII systems or for EBCDIC systems running
-in UTF-8 mode. */
-
-static const short int escapes[] = {
- 0, 0,
- 0, 0,
- 0, 0,
- 0, 0,
- 0, 0,
- CHAR_COLON, CHAR_SEMICOLON,
- CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
- CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
- CHAR_COMMERCIAL_AT, -ESC_A,
- -ESC_B, -ESC_C,
- -ESC_D, -ESC_E,
- 0, -ESC_G,
- -ESC_H, 0,
- 0, -ESC_K,
- 0, 0,
- 0, 0,
- -ESC_P, -ESC_Q,
- -ESC_R, -ESC_S,
- 0, 0,
- -ESC_V, -ESC_W,
- -ESC_X, 0,
- -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
- CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
- CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
- CHAR_GRAVE_ACCENT, 7,
- -ESC_b, 0,
- -ESC_d, ESC_e,
- ESC_f, 0,
- -ESC_h, 0,
- 0, -ESC_k,
- 0, 0,
- ESC_n, 0,
- -ESC_p, 0,
- ESC_r, -ESC_s,
- ESC_tee, 0,
- -ESC_v, -ESC_w,
- 0, 0,
- -ESC_z
-};
-
-#else
-
-/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
-
-static const short int escapes[] = {
-/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
-/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
-/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
-/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
-/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
-/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
-/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
-/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
-/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
-/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
-/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
-/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
-/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
-/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
-/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
-/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
-/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
-/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
-/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
-/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
-/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
-/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
-/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
-};
-#endif
-
-
-/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
-searched linearly. Put all the names into a single string, in order to reduce
-the number of relocations when a shared library is dynamically linked. The
-string is built from string macros so that it works in UTF-8 mode on EBCDIC
-platforms. */
-
-typedef struct verbitem {
- int len;
- int op;
-} verbitem;
-
-static const char verbnames[] =
- STRING_ACCEPT0
- STRING_COMMIT0
- STRING_F0
- STRING_FAIL0
- STRING_PRUNE0
- STRING_SKIP0
- STRING_THEN;
-
-static const verbitem verbs[] = {
- { 6, OP_ACCEPT },
- { 6, OP_COMMIT },
- { 1, OP_FAIL },
- { 4, OP_FAIL },
- { 5, OP_PRUNE },
- { 4, OP_SKIP },
- { 4, OP_THEN }
-};
-
-static const int verbcount = sizeof(verbs)/sizeof(verbitem);
-
-
-/* Tables of names of POSIX character classes and their lengths. The names are
-now all in a single string, to reduce the number of relocations when a shared
-library is dynamically loaded. The list of lengths is terminated by a zero
-length entry. The first three must be alpha, lower, upper, as this is assumed
-for handling case independence. */
-
-static const char posix_names[] =
- STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
- STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
- STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
- STRING_word0 STRING_xdigit;
-
-static const uschar posix_name_lengths[] = {
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
-
-/* Table of class bit maps for each POSIX class. Each class is formed from a
-base map, with an optional addition or removal of another map. Then, for some
-classes, there is some additional tweaking: for [:blank:] the vertical space
-characters are removed, and for [:alpha:] and [:alnum:] the underscore
-character is removed. The triples in the table consist of the base map offset,
-second map offset or -1 if no second map, and a non-negative value for map
-addition or a negative value for map subtraction (if there are two maps). The
-absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
-remove vertical space characters, 2 => remove underscore. */
-
-static const int posix_class_maps[] = {
- cbit_word, cbit_digit, -2, /* alpha */
- cbit_lower, -1, 0, /* lower */
- cbit_upper, -1, 0, /* upper */
- cbit_word, -1, 2, /* alnum - word without underscore */
- cbit_print, cbit_cntrl, 0, /* ascii */
- cbit_space, -1, 1, /* blank - a GNU extension */
- cbit_cntrl, -1, 0, /* cntrl */
- cbit_digit, -1, 0, /* digit */
- cbit_graph, -1, 0, /* graph */
- cbit_print, -1, 0, /* print */
- cbit_punct, -1, 0, /* punct */
- cbit_space, -1, 0, /* space */
- cbit_word, -1, 0, /* word - a Perl extension */
- cbit_xdigit,-1, 0 /* xdigit */
-};
-
-
-#define STRING(a) # a
-#define XSTRING(s) STRING(s)
-
-/* The texts of compile-time error messages. These are "char *" because they
-are passed to the outside world. Do not ever re-use any error number, because
-they are documented. Always add a new error instead. Messages marked DEAD below
-are no longer used. This used to be a table of strings, but in order to reduce
-the number of relocations needed when a shared library is loaded dynamically,
-it is now one long string. We cannot use a table of offsets, because the
-lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
-simply count through to the one we want - this isn't a performance issue
-because these strings are used only when there is a compilation error. */
-
-static const char error_texts[] =
- "no error\0"
- "\\ at end of pattern\0"
- "\\c at end of pattern\0"
- "unrecognized character follows \\\0"
- "numbers out of order in {} quantifier\0"
- /* 5 */
- "number too big in {} quantifier\0"
- "missing terminating ] for character class\0"
- "invalid escape sequence in character class\0"
- "range out of order in character class\0"
- "nothing to repeat\0"
- /* 10 */
- "operand of unlimited repeat could match the empty string\0" /** DEAD **/
- "internal error: unexpected repeat\0"
- "unrecognized character after (? or (?-\0"
- "POSIX named classes are supported only within a class\0"
- "missing )\0"
- /* 15 */
- "reference to non-existent subpattern\0"
- "erroffset passed as NULL\0"
- "unknown option bit(s) set\0"
- "missing ) after comment\0"
- "parentheses nested too deeply\0" /** DEAD **/
- /* 20 */
- "regular expression is too large\0"
- "failed to get memory\0"
- "unmatched parentheses\0"
- "internal error: code overflow\0"
- "unrecognized character after (?<\0"
- /* 25 */
- "lookbehind assertion is not fixed length\0"
- "malformed number or name after (?(\0"
- "conditional group contains more than two branches\0"
- "assertion expected after (?(\0"
- "(?R or (?[+-]digits must be followed by )\0"
- /* 30 */
- "unknown POSIX class name\0"
- "POSIX collating elements are not supported\0"
- "this version of PCRE is not compiled with PCRE_UTF8 support\0"
- "spare error\0" /** DEAD **/
- "character value in \\x{...} sequence is too large\0"
- /* 35 */
- "invalid condition (?(0)\0"
- "\\C not allowed in lookbehind assertion\0"
- "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
- "number after (?C is > 255\0"
- "closing ) for (?C expected\0"
- /* 40 */
- "recursive call could loop indefinitely\0"
- "unrecognized character after (?P\0"
- "syntax error in subpattern name (missing terminator)\0"
- "two named subpatterns have the same name\0"
- "invalid UTF-8 string\0"
- /* 45 */
- "support for \\P, \\p, and \\X has not been compiled\0"
- "malformed \\P or \\p sequence\0"
- "unknown property name after \\P or \\p\0"
- "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
- "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
- /* 50 */
- "repeated subpattern is too long\0" /** DEAD **/
- "octal value is greater than \\377 (not in UTF-8 mode)\0"
- "internal error: overran compiling workspace\0"
- "internal error: previously-checked referenced subpattern not found\0"
- "DEFINE group contains more than one branch\0"
- /* 55 */
- "repeating a DEFINE group is not allowed\0"
- "inconsistent NEWLINE options\0"
- "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
- "a numbered reference must not be zero\0"
- "(*VERB) with an argument is not supported\0"
- /* 60 */
- "(*VERB) not recognized\0"
- "number is too big\0"
- "subpattern name expected\0"
- "digit expected after (?+\0"
- "] is an invalid data character in JavaScript compatibility mode";
-
-
-/* Table to identify digits and hex digits. This is used when compiling
-patterns. Note that the tables in chartables are dependent on the locale, and
-may mark arbitrary characters as digits - but the PCRE compiling code expects
-to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
-a private table here. It costs 256 bytes, but it is a lot faster than doing
-character value tests (at least in some simple cases I timed), and in some
-applications one wants PCRE to compile efficiently as well as match
-efficiently.
-
-For convenience, we use the same bit definitions as in chartables:
-
- 0x04 decimal digit
- 0x08 hexadecimal digit
-
-Then we can use ctype_digit and ctype_xdigit in the code. */
-
-#ifndef EBCDIC
-
-/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
-UTF-8 mode. */
-
-static const unsigned char digitab[] =
- {
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
- 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
- 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
-
-#else
-
-/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
-
-static const unsigned char digitab[] =
- {
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
- 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
- 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
-
-static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
- 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
- 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
- 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
- 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
- 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
- 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
- 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
- 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
- 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
- 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
- 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
- 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
- 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
- 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
- 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
-#endif
-
-
-/* Definition to allow mutual recursion */
-
-static BOOL
- compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
- int *, int *, branch_chain *, compile_data *, int *);
-
-
-
-/*************************************************
-* Find an error text *
-*************************************************/
-
-/* The error texts are now all in one long string, to save on relocations. As
-some of the text is of unknown length, we can't use a table of offsets.
-Instead, just count through the strings. This is not a performance issue
-because it happens only when there has been a compilation error.
-
-Argument: the error number
-Returns: pointer to the error string
-*/
-
-static const char *
-find_error_text(int n)
-{
-const char *s = error_texts;
-for (; n > 0; n--) while (*s++ != 0) {};
-return s;
-}
-
-
-/*************************************************
-* Handle escapes *
-*************************************************/
-
-/* This function is called when a \ has been encountered. It either returns a
-positive value for a simple escape such as \n, or a negative value which
-encodes one of the more complicated things such as \d. A backreference to group
-n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
-UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
-ptr is pointing at the \. On exit, it is on the final character of the escape
-sequence.
-
-Arguments:
- ptrptr points to the pattern position pointer
- errorcodeptr points to the errorcode variable
- bracount number of previous extracting brackets
- options the options bits
- isclass TRUE if inside a character class
-
-Returns: zero or positive => a data character
- negative => a special escape sequence
- on error, errorcodeptr is set
-*/
-
-static int
-check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
- int options, BOOL isclass)
-{
-BOOL utf8 = (options & PCRE_UTF8) != 0;
-const uschar *ptr = *ptrptr + 1;
-int c, i;
-
-GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
-ptr--; /* Set pointer back to the last byte */
-
-/* If backslash is at the end of the pattern, it's an error. */
-
-if (c == 0) *errorcodeptr = ERR1;
-
-/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
-in a table. A non-zero result is something that can be returned immediately.
-Otherwise further processing may be required. */
-
-#ifndef EBCDIC /* ASCII/UTF-8 coding */
-else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
-else if ((i = escapes[c - CHAR_0]) != 0) c = i;
-
-#else /* EBCDIC coding */
-else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
-else if ((i = escapes[c - 0x48]) != 0) c = i;
-#endif
-
-/* Escapes that need further processing, or are illegal. */
-
-else
- {
- const uschar *oldptr;
- BOOL braced, negated;
-
- switch (c)
- {
- /* A number of Perl escapes are not handled by PCRE. We give an explicit
- error. */
-
- case CHAR_l:
- case CHAR_L:
- case CHAR_N:
- case CHAR_u:
- case CHAR_U:
- *errorcodeptr = ERR37;
- break;
-
- /* \g must be followed by one of a number of specific things:
-
- (1) A number, either plain or braced. If positive, it is an absolute
- backreference. If negative, it is a relative backreference. This is a Perl
- 5.10 feature.
-
- (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
- is part of Perl's movement towards a unified syntax for back references. As
- this is synonymous with \k{name}, we fudge it up by pretending it really
- was \k.
-
- (3) For Oniguruma compatibility we also support \g followed by a name or a
- number either in angle brackets or in single quotes. However, these are
- (possibly recursive) subroutine calls, _not_ backreferences. Just return
- the -ESC_g code (cf \k). */
-
- case CHAR_g:
- if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
- {
- c = -ESC_g;
- break;
- }
-
- /* Handle the Perl-compatible cases */
-
- if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
- {
- const uschar *p;
- for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
- if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
- if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
- {
- c = -ESC_k;
- break;
- }
- braced = TRUE;
- ptr++;
- }
- else braced = FALSE;
-
- if (ptr[1] == CHAR_MINUS)
- {
- negated = TRUE;
- ptr++;
- }
- else negated = FALSE;
-
- c = 0;
- while ((digitab[ptr[1]] & ctype_digit) != 0)
- c = c * 10 + *(++ptr) - CHAR_0;
-
- if (c < 0) /* Integer overflow */
- {
- *errorcodeptr = ERR61;
- break;
- }
-
- if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
- {
- *errorcodeptr = ERR57;
- break;
- }
-
- if (c == 0)
- {
- *errorcodeptr = ERR58;
- break;
- }
-
- if (negated)
- {
- if (c > bracount)
- {
- *errorcodeptr = ERR15;
- break;
- }
- c = bracount - (c - 1);
- }
-
- c = -(ESC_REF + c);
- break;
-
- /* The handling of escape sequences consisting of a string of digits
- starting with one that is not zero is not straightforward. By experiment,
- the way Perl works seems to be as follows:
-
- Outside a character class, the digits are read as a decimal number. If the
- number is less than 10, or if there are that many previous extracting
- left brackets, then it is a back reference. Otherwise, up to three octal
- digits are read to form an escaped byte. Thus \123 is likely to be octal
- 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
- value is greater than 377, the least significant 8 bits are taken. Inside a
- character class, \ followed by a digit is always an octal number. */
-
- case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
- case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
-
- if (!isclass)
- {
- oldptr = ptr;
- c -= CHAR_0;
- while ((digitab[ptr[1]] & ctype_digit) != 0)
- c = c * 10 + *(++ptr) - CHAR_0;
- if (c < 0) /* Integer overflow */
- {
- *errorcodeptr = ERR61;
- break;
- }
- if (c < 10 || c <= bracount)
- {
- c = -(ESC_REF + c);
- break;
- }
- ptr = oldptr; /* Put the pointer back and fall through */
- }
-
- /* Handle an octal number following \. If the first digit is 8 or 9, Perl
- generates a binary zero byte and treats the digit as a following literal.
- Thus we have to pull back the pointer by one. */
-
- if ((c = *ptr) >= CHAR_8)
- {
- ptr--;
- c = 0;
- break;
- }
-
- /* \0 always starts an octal number, but we may drop through to here with a
- larger first octal digit. The original code used just to take the least
- significant 8 bits of octal numbers (I think this is what early Perls used
- to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
- than 3 octal digits. */
-
- case CHAR_0:
- c -= CHAR_0;
- while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
- c = c * 8 + *(++ptr) - CHAR_0;
- if (!utf8 && c > 255) *errorcodeptr = ERR51;
- break;
-
- /* \x is complicated. \x{ddd} is a character number which can be greater
- than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
- treated as a data character. */
-
- case CHAR_x:
- if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
- {
- const uschar *pt = ptr + 2;
- int count = 0;
-
- c = 0;
- while ((digitab[*pt] & ctype_xdigit) != 0)
- {
- register int cc = *pt++;
- if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
- count++;
-
-#ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
- c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
-#else /* EBCDIC coding */
- if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
- c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
-#endif
- }
-
- if (*pt == CHAR_RIGHT_CURLY_BRACKET)
- {
- if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
- ptr = pt;
- break;
- }
-
- /* If the sequence of hex digits does not end with '}', then we don't
- recognize this construct; fall through to the normal \x handling. */
- }
-
- /* Read just a single-byte hex-defined char */
-
- c = 0;
- while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
- {
- int cc; /* Some compilers don't like */
- cc = *(++ptr); /* ++ in initializers */
-#ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
- c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
-#else /* EBCDIC coding */
- if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
- c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
-#endif
- }
- break;
-
- /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
- This coding is ASCII-specific, but then the whole concept of \cx is
- ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
-
- case CHAR_c:
- c = *(++ptr);
- if (c == 0)
- {
- *errorcodeptr = ERR2;
- break;
- }
-
-#ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (c >= CHAR_a && c <= CHAR_z) c -= 32;
- c ^= 0x40;
-#else /* EBCDIC coding */
- if (c >= CHAR_a && c <= CHAR_z) c += 64;
- c ^= 0xC0;
-#endif
- break;
-
- /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
- other alphanumeric following \ is an error if PCRE_EXTRA was set;
- otherwise, for Perl compatibility, it is a literal. This code looks a bit
- odd, but there used to be some cases other than the default, and there may
- be again in future, so I haven't "optimized" it. */
-
- default:
- if ((options & PCRE_EXTRA) != 0) switch(c)
- {
- default:
- *errorcodeptr = ERR3;
- break;
- }
- break;
- }
- }
-
-*ptrptr = ptr;
-return c;
-}
-
-
-
-#ifdef SUPPORT_UCP
-/*************************************************
-* Handle \P and \p *
-*************************************************/
-
-/* This function is called after \P or \p has been encountered, provided that
-PCRE is compiled with support for Unicode properties. On entry, ptrptr is
-pointing at the P or p. On exit, it is pointing at the final character of the
-escape sequence.
-
-Argument:
- ptrptr points to the pattern position pointer
- negptr points to a boolean that is set TRUE for negation else FALSE
- dptr points to an int that is set to the detailed property value
- errorcodeptr points to the error code variable
-
-Returns: type value from ucp_type_table, or -1 for an invalid type
-*/
-
-static int
-get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
-{
-int c, i, bot, top;
-const uschar *ptr = *ptrptr;
-char name[32];
-
-c = *(++ptr);
-if (c == 0) goto ERROR_RETURN;
-
-*negptr = FALSE;
-
-/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
-negation. */
-
-if (c == CHAR_LEFT_CURLY_BRACKET)
- {
- if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
- {
- *negptr = TRUE;
- ptr++;
- }
- for (i = 0; i < (int)sizeof(name) - 1; i++)
- {
- c = *(++ptr);
- if (c == 0) goto ERROR_RETURN;
- if (c == CHAR_RIGHT_CURLY_BRACKET) break;
- name[i] = c;
- }
- if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
- name[i] = 0;
- }
-
-/* Otherwise there is just one following character */
-
-else
- {
- name[0] = c;
- name[1] = 0;
- }
-
-*ptrptr = ptr;
-
-/* Search for a recognized property name using binary chop */
-
-bot = 0;
-top = _pcre_utt_size;
-
-while (bot < top)
- {
- i = (bot + top) >> 1;
- c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
- if (c == 0)
- {
- *dptr = _pcre_utt[i].value;
- return _pcre_utt[i].type;
- }
- if (c > 0) bot = i + 1; else top = i;
- }
-
-*errorcodeptr = ERR47;
-*ptrptr = ptr;
-return -1;
-
-ERROR_RETURN:
-*errorcodeptr = ERR46;
-*ptrptr = ptr;
-return -1;
-}
-#endif
-
-
-
-
-/*************************************************
-* Check for counted repeat *
-*************************************************/
-
-/* This function is called when a '{' is encountered in a place where it might
-start a quantifier. It looks ahead to see if it really is a quantifier or not.
-It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
-where the ddds are digits.
-
-Arguments:
- p pointer to the first char after '{'
-
-Returns: TRUE or FALSE
-*/
-
-static BOOL
-is_counted_repeat(const uschar *p)
-{
-if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
-while ((digitab[*p] & ctype_digit) != 0) p++;
-if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
-
-if (*p++ != CHAR_COMMA) return FALSE;
-if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
-
-if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
-while ((digitab[*p] & ctype_digit) != 0) p++;
-
-return (*p == CHAR_RIGHT_CURLY_BRACKET);
-}
-
-
-
-/*************************************************
-* Read repeat counts *
-*************************************************/
-
-/* Read an item of the form {n,m} and return the values. This is called only
-after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
-so the syntax is guaranteed to be correct, but we need to check the values.
-
-Arguments:
- p pointer to first char after '{'
- minp pointer to int for min
- maxp pointer to int for max
- returned as -1 if no max
- errorcodeptr points to error code variable
-
-Returns: pointer to '}' on success;
- current ptr on error, with errorcodeptr set non-zero
-*/
-
-static const uschar *
-read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
-{
-int min = 0;
-int max = -1;
-
-/* Read the minimum value and do a paranoid check: a negative value indicates
-an integer overflow. */
-
-while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
-if (min < 0 || min > 65535)
- {
- *errorcodeptr = ERR5;
- return p;
- }
-
-/* Read the maximum value if there is one, and again do a paranoid on its size.
-Also, max must not be less than min. */
-
-if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
- {
- if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
- {
- max = 0;
- while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
- if (max < 0 || max > 65535)
- {
- *errorcodeptr = ERR5;
- return p;
- }
- if (max < min)
- {
- *errorcodeptr = ERR4;
- return p;
- }
- }
- }
-
-/* Fill in the required variables, and pass back the pointer to the terminating
-'}'. */
-
-*minp = min;
-*maxp = max;
-return p;
-}
-
-
-
-/*************************************************
-* Subroutine for finding forward reference *
-*************************************************/
-
-/* This recursive function is called only from find_parens() below. The
-top-level call starts at the beginning of the pattern. All other calls must
-start at a parenthesis. It scans along a pattern's text looking for capturing
-subpatterns, and counting them. If it finds a named pattern that matches the
-name it is given, it returns its number. Alternatively, if the name is NULL, it
-returns when it reaches a given numbered subpattern. We know that if (?P< is
-encountered, the name will be terminated by '>' because that is checked in the
-first pass. Recursion is used to keep track of subpatterns that reset the
-capturing group numbers - the (?| feature.
-
-Arguments:
- ptrptr address of the current character pointer (updated)
- cd compile background data
- name name to seek, or NULL if seeking a numbered subpattern
- lorn name length, or subpattern number if name is NULL
- xmode TRUE if we are in /x mode
- count pointer to the current capturing subpattern number (updated)
-
-Returns: the number of the named subpattern, or -1 if not found
-*/
-
-static int
-find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
- BOOL xmode, int *count)
-{
-uschar *ptr = *ptrptr;
-int start_count = *count;
-int hwm_count = start_count;
-BOOL dup_parens = FALSE;
-
-/* If the first character is a parenthesis, check on the type of group we are
-dealing with. The very first call may not start with a parenthesis. */
-
-if (ptr[0] == CHAR_LEFT_PARENTHESIS)
- {
- if (ptr[1] == CHAR_QUESTION_MARK &&
- ptr[2] == CHAR_VERTICAL_LINE)
- {
- ptr += 3;
- dup_parens = TRUE;
- }
-
- /* Handle a normal, unnamed capturing parenthesis */
-
- else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
- {
- *count += 1;
- if (name == NULL && *count == lorn) return *count;
- ptr++;
- }
-
- /* Handle a condition. If it is an assertion, just carry on so that it
- is processed as normal. If not, skip to the closing parenthesis of the
- condition (there can't be any nested parens. */
-
- else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
- {
- ptr += 2;
- if (ptr[1] != CHAR_QUESTION_MARK)
- {
- while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
- if (*ptr != 0) ptr++;
- }
- }
-
- /* We have either (? or (* and not a condition */
-
- else
- {
- ptr += 2;
- if (*ptr == CHAR_P) ptr++; /* Allow optional P */
-
- /* We have to disambiguate (? for named groups */
-
- if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
- ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
- {
- int term;
- const uschar *thisname;
- *count += 1;
- if (name == NULL && *count == lorn) return *count;
- term = *ptr++;
- if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
- thisname = ptr;
- while (*ptr != term) ptr++;
- if (name != NULL && lorn == ptr - thisname &&
- strncmp((const char *)name, (const char *)thisname, lorn) == 0)
- return *count;
- }
- }
- }
-
-/* Past any initial parenthesis handling, scan for parentheses or vertical
-bars. */
-
-for (; *ptr != 0; ptr++)
- {
- /* Skip over backslashed characters and also entire \Q...\E */
-
- if (*ptr == CHAR_BACKSLASH)
- {
- if (*(++ptr) == 0) goto FAIL_EXIT;
- if (*ptr == CHAR_Q) for (;;)
- {
- while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
- if (*ptr == 0) goto FAIL_EXIT;
- if (*(++ptr) == CHAR_E) break;
- }
- continue;
- }
-
- /* Skip over character classes; this logic must be similar to the way they
- are handled for real. If the first character is '^', skip it. Also, if the
- first few characters (either before or after ^) are \Q\E or \E we skip them
- too. This makes for compatibility with Perl. Note the use of STR macros to
- encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
-
- if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
- {
- BOOL negate_class = FALSE;
- for (;;)
- {
- int c = *(++ptr);
- if (c == CHAR_BACKSLASH)
- {
- if (ptr[1] == CHAR_E)
- ptr++;
- else if (strncmp((const char *)ptr+1,
- STR_Q STR_BACKSLASH STR_E, 3) == 0)
- ptr += 3;
- else
- break;
- }
- else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
- negate_class = TRUE;
- else break;
- }
-
- /* If the next character is ']', it is a data character that must be
- skipped, except in JavaScript compatibility mode. */
-
- if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
- (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
- ptr++;
-
- while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
- {
- if (*ptr == 0) return -1;
- if (*ptr == CHAR_BACKSLASH)
- {
- if (*(++ptr) == 0) goto FAIL_EXIT;
- if (*ptr == CHAR_Q) for (;;)
- {
- while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
- if (*ptr == 0) goto FAIL_EXIT;
- if (*(++ptr) == CHAR_E) break;
- }
- continue;
- }
- }
- continue;
- }
-
- /* Skip comments in /x mode */
-
- if (xmode && *ptr == CHAR_NUMBER_SIGN)
- {
- while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
- if (*ptr == 0) goto FAIL_EXIT;
- continue;
- }
-
- /* Check for the special metacharacters */
-
- if (*ptr == CHAR_LEFT_PARENTHESIS)
- {
- int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
- if (rc > 0) return rc;
- if (*ptr == 0) goto FAIL_EXIT;
- }
-
- else if (*ptr == CHAR_RIGHT_PARENTHESIS)
- {
- if (dup_parens && *count < hwm_count) *count = hwm_count;
- *ptrptr = ptr;
- return -1;
- }
-
- else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
- {
- if (*count > hwm_count) hwm_count = *count;
- *count = start_count;
- }
- }
-
-FAIL_EXIT:
-*ptrptr = ptr;
-return -1;
-}
-
-
-
-
-/*************************************************
-* Find forward referenced subpattern *
-*************************************************/
-
-/* This function scans along a pattern's text looking for capturing
-subpatterns, and counting them. If it finds a named pattern that matches the
-name it is given, it returns its number. Alternatively, if the name is NULL, it
-returns when it reaches a given numbered subpattern. This is used for forward
-references to subpatterns. We used to be able to start this scan from the
-current compiling point, using the current count value from cd->bracount, and
-do it all in a single loop, but the addition of the possibility of duplicate
-subpattern numbers means that we have to scan from the very start, in order to
-take account of such duplicates, and to use a recursive function to keep track
-of the different types of group.
-
-Arguments:
- cd compile background data
- name name to seek, or NULL if seeking a numbered subpattern
- lorn name length, or subpattern number if name is NULL
- xmode TRUE if we are in /x mode
-
-Returns: the number of the found subpattern, or -1 if not found
-*/
-
-static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
-{
-uschar *ptr = (uschar *)cd->start_pattern;
-int count = 0;
-int rc;
-
-/* If the pattern does not start with an opening parenthesis, the first call
-to find_parens_sub() will scan right to the end (if necessary). However, if it
-does start with a parenthesis, find_parens_sub() will return when it hits the
-matching closing parens. That is why we have to have a loop. */
-
-for (;;)
- {
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
- if (rc > 0 || *ptr++ == 0) break;
- }
-
-return rc;
-}
-
-
-
-
-/*************************************************
-* Find first significant op code *
-*************************************************/
-
-/* This is called by several functions that scan a compiled expression looking
-for a fixed first character, or an anchoring op code etc. It skips over things
-that do not influence this. For some calls, a change of option is important.
-For some calls, it makes sense to skip negative forward and all backward
-assertions, and also the \b assertion; for others it does not.
-
-Arguments:
- code pointer to the start of the group
- options pointer to external options
- optbit the option bit whose changing is significant, or
- zero if none are
- skipassert TRUE if certain assertions are to be skipped
-
-Returns: pointer to the first significant opcode
-*/
-
-static const uschar*
-first_significant_code(const uschar *code, int *options, int optbit,
- BOOL skipassert)
-{
-for (;;)
- {
- switch ((int)*code)
- {
- case OP_OPT:
- if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
- *options = (int)code[1];
- code += 2;
- break;
-
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- if (!skipassert) return code;
- do code += GET(code, 1); while (*code == OP_ALT);
- code += _pcre_OP_lengths[*code];
- break;
-
- case OP_WORD_BOUNDARY:
- case OP_NOT_WORD_BOUNDARY:
- if (!skipassert) return code;
- /* Fall through */
-
- case OP_CALLOUT:
- case OP_CREF:
- case OP_RREF:
- case OP_DEF:
- code += _pcre_OP_lengths[*code];
- break;
-
- default:
- return code;
- }
- }
-/* Control never reaches here */
-}
-
-
-
-
-/*************************************************
-* Find the fixed length of a pattern *
-*************************************************/
-
-/* Scan a pattern and compute the fixed length of subject that will match it,
-if the length is fixed. This is needed for dealing with backward assertions.
-In UTF8 mode, the result is in characters rather than bytes.
-
-Arguments:
- code points to the start of the pattern (the bracket)
- options the compiling options
-
-Returns: the fixed length, or -1 if there is no fixed length,
- or -2 if \C was encountered
-*/
-
-static int
-find_fixedlength(uschar *code, int options)
-{
-int length = -1;
-
-register int branchlength = 0;
-register uschar *cc = code + 1 + LINK_SIZE;
-
-/* Scan along the opcodes for this branch. If we get to the end of the
-branch, check the length against that of the other branches. */
-
-for (;;)
- {
- int d;
- register int op = *cc;
- switch (op)
- {
- case OP_CBRA:
- case OP_BRA:
- case OP_ONCE:
- case OP_COND:
- d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
- if (d < 0) return d;
- branchlength += d;
- do cc += GET(cc, 1); while (*cc == OP_ALT);
- cc += 1 + LINK_SIZE;
- break;
-
- /* Reached end of a branch; if it's a ket it is the end of a nested
- call. If it's ALT it is an alternation in a nested call. If it is
- END it's the end of the outer call. All can be handled by the same code. */
-
- case OP_ALT:
- case OP_KET:
- case OP_KETRMAX:
- case OP_KETRMIN:
- case OP_END:
- if (length < 0) length = branchlength;
- else if (length != branchlength) return -1;
- if (*cc != OP_ALT) return length;
- cc += 1 + LINK_SIZE;
- branchlength = 0;
- break;
-
- /* Skip over assertive subpatterns */
-
- case OP_ASSERT:
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- do cc += GET(cc, 1); while (*cc == OP_ALT);
- /* Fall through */
-
- /* Skip over things that don't match chars */
-
- case OP_REVERSE:
- case OP_CREF:
- case OP_RREF:
- case OP_DEF:
- case OP_OPT:
- case OP_CALLOUT:
- case OP_SOD:
- case OP_SOM:
- case OP_EOD:
- case OP_EODN:
- case OP_CIRC:
- case OP_DOLL:
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- cc += _pcre_OP_lengths[*cc];
- break;
-
- /* Handle literal characters */
-
- case OP_CHAR:
- case OP_CHARNC:
- case OP_NOT:
- branchlength++;
- cc += 2;
-#ifdef SUPPORT_UTF8
- if ((options & PCRE_UTF8) != 0)
- {
- while ((*cc & 0xc0) == 0x80) cc++;
- }
-#endif
- break;
-
- /* Handle exact repetitions. The count is already in characters, but we
- need to skip over a multibyte character in UTF8 mode. */
-
- case OP_EXACT:
- branchlength += GET2(cc,1);
- cc += 4;
-#ifdef SUPPORT_UTF8
- if ((options & PCRE_UTF8) != 0)
- {
- while((*cc & 0x80) == 0x80) cc++;
- }
-#endif
- break;
-
- case OP_TYPEEXACT:
- branchlength += GET2(cc,1);
- if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
- cc += 4;
- break;
-
- /* Handle single-char matchers */
-
- case OP_PROP:
- case OP_NOTPROP:
- cc += 2;
- /* Fall through */
-
- case OP_NOT_DIGIT:
- case OP_DIGIT:
- case OP_NOT_WHITESPACE:
- case OP_WHITESPACE:
- case OP_NOT_WORDCHAR:
- case OP_WORDCHAR:
- case OP_ANY:
- case OP_ALLANY:
- branchlength++;
- cc++;
- break;
-
- /* The single-byte matcher isn't allowed */
-
- case OP_ANYBYTE:
- return -2;
-
- /* Check a class for variable quantification */
-
-#ifdef SUPPORT_UTF8
- case OP_XCLASS:
- cc += GET(cc, 1) - 33;
- /* Fall through */
-#endif
-
- case OP_CLASS:
- case OP_NCLASS:
- cc += 33;
-
- switch (*cc)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- return -1;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- if (GET2(cc,1) != GET2(cc,3)) return -1;
- branchlength += GET2(cc,1);
- cc += 5;
- break;
-
- default:
- branchlength++;
- }
- break;
-
- /* Anything else is variable length */
-
- default:
- return -1;
- }
- }
-/* Control never gets here */
-}
-
-
-
-
-/*************************************************
-* Scan compiled regex for numbered bracket *
-*************************************************/
-
-/* This little function scans through a compiled pattern until it finds a
-capturing bracket with the given number.
-
-Arguments:
- code points to start of expression
- utf8 TRUE in UTF-8 mode
- number the required bracket number
-
-Returns: pointer to the opcode for the bracket, or NULL if not found
-*/
-
-static const uschar *
-find_bracket(const uschar *code, BOOL utf8, int number)
-{
-for (;;)
- {
- register int c = *code;
- if (c == OP_END) return NULL;
-
- /* XCLASS is used for classes that cannot be represented just by a bit
- map. This includes negated single high-valued characters. The length in
- the table is zero; the actual length is stored in the compiled code. */
-
- if (c == OP_XCLASS) code += GET(code, 1);
-
- /* Handle capturing bracket */
-
- else if (c == OP_CBRA)
- {
- int n = GET2(code, 1+LINK_SIZE);
- if (n == number) return (uschar *)code;
- code += _pcre_OP_lengths[c];
- }
-
- /* Otherwise, we can get the item's length from the table, except that for
- repeated character types, we have to test for \p and \P, which have an extra
- two bytes of parameters. */
-
- else
- {
- switch(c)
- {
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- case OP_TYPEPOSSTAR:
- case OP_TYPEPOSPLUS:
- case OP_TYPEPOSQUERY:
- if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
- break;
-
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- case OP_TYPEEXACT:
- case OP_TYPEPOSUPTO:
- if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
- break;
- }
-
- /* Add in the fixed length from the table */
-
- code += _pcre_OP_lengths[c];
-
- /* In UTF-8 mode, opcodes that are followed by a character may be followed by
- a multi-byte character. The length in the table is a minimum, so we have to
- arrange to skip the extra bytes. */
-
-#ifdef SUPPORT_UTF8
- if (utf8) switch(c)
- {
- case OP_CHAR:
- case OP_CHARNC:
- case OP_EXACT:
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_POSUPTO:
- case OP_STAR:
- case OP_MINSTAR:
- case OP_POSSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_POSPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_POSQUERY:
- if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
- break;
- }
-#else
- (void)(utf8); /* Keep compiler happy by referencing function argument */
-#endif
- }
- }
-}
-
-
-
-/*************************************************
-* Scan compiled regex for recursion reference *
-*************************************************/
-
-/* This little function scans through a compiled pattern until it finds an
-instance of OP_RECURSE.
-
-Arguments:
- code points to start of expression
- utf8 TRUE in UTF-8 mode
-
-Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
-*/
-
-static const uschar *
-find_recurse(const uschar *code, BOOL utf8)
-{
-for (;;)
- {
- register int c = *code;
- if (c == OP_END) return NULL;
- if (c == OP_RECURSE) return code;
-
- /* XCLASS is used for classes that cannot be represented just by a bit
- map. This includes negated single high-valued characters. The length in
- the table is zero; the actual length is stored in the compiled code. */
-
- if (c == OP_XCLASS) code += GET(code, 1);
-
- /* Otherwise, we can get the item's length from the table, except that for
- repeated character types, we have to test for \p and \P, which have an extra
- two bytes of parameters. */
-
- else
- {
- switch(c)
- {
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- case OP_TYPEPOSSTAR:
- case OP_TYPEPOSPLUS:
- case OP_TYPEPOSQUERY:
- if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
- break;
-
- case OP_TYPEPOSUPTO:
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- case OP_TYPEEXACT:
- if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
- break;
- }
-
- /* Add in the fixed length from the table */
-
- code += _pcre_OP_lengths[c];
-
- /* In UTF-8 mode, opcodes that are followed by a character may be followed
- by a multi-byte character. The length in the table is a minimum, so we have
- to arrange to skip the extra bytes. */
-
-#ifdef SUPPORT_UTF8
- if (utf8) switch(c)
- {
- case OP_CHAR:
- case OP_CHARNC:
- case OP_EXACT:
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_POSUPTO:
- case OP_STAR:
- case OP_MINSTAR:
- case OP_POSSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_POSPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_POSQUERY:
- if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
- break;
- }
-#else
- (void)(utf8); /* Keep compiler happy by referencing function argument */
-#endif
- }
- }
-}
-
-
-
-/*************************************************
-* Scan compiled branch for non-emptiness *
-*************************************************/
-
-/* This function scans through a branch of a compiled pattern to see whether it
-can match the empty string or not. It is called from could_be_empty()
-below and from compile_branch() when checking for an unlimited repeat of a
-group that can match nothing. Note that first_significant_code() skips over
-backward and negative forward assertions when its final argument is TRUE. If we
-hit an unclosed bracket, we return "empty" - this means we've struck an inner
-bracket whose current branch will already have been scanned.
-
-Arguments:
- code points to start of search
- endcode points to where to stop
- utf8 TRUE if in UTF8 mode
-
-Returns: TRUE if what is matched could be empty
-*/
-
-static BOOL
-could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
-{
-register int c;
-for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
- code < endcode;
- code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
- {
- const uschar *ccode;
-
- c = *code;
-
- /* Skip over forward assertions; the other assertions are skipped by
- first_significant_code() with a TRUE final argument. */
-
- if (c == OP_ASSERT)
- {
- do code += GET(code, 1); while (*code == OP_ALT);
- c = *code;
- continue;
- }
-
- /* Groups with zero repeats can of course be empty; skip them. */
-
- if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
- {
- code += _pcre_OP_lengths[c];
- do code += GET(code, 1); while (*code == OP_ALT);
- c = *code;
- continue;
- }
-
- /* For other groups, scan the branches. */
-
- if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
- {
- BOOL empty_branch;
- if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
-
- /* If a conditional group has only one branch, there is a second, implied,
- empty branch, so just skip over the conditional, because it could be empty.
- Otherwise, scan the individual branches of the group. */
-
- if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
- code += GET(code, 1);
- else
- {
- empty_branch = FALSE;
- do
- {
- if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
- empty_branch = TRUE;
- code += GET(code, 1);
- }
- while (*code == OP_ALT);
- if (!empty_branch) return FALSE; /* All branches are non-empty */
- }
-
- c = *code;
- continue;
- }
-
- /* Handle the other opcodes */
-
- switch (c)
- {
- /* Check for quantifiers after a class. XCLASS is used for classes that
- cannot be represented just by a bit map. This includes negated single
- high-valued characters. The length in _pcre_OP_lengths[] is zero; the
- actual length is stored in the compiled code, so we must update "code"
- here. */
-
-#ifdef SUPPORT_UTF8
- case OP_XCLASS:
- ccode = code += GET(code, 1);
- goto CHECK_CLASS_REPEAT;
-#endif
-
- case OP_CLASS:
- case OP_NCLASS:
- ccode = code + 33;
-
-#ifdef SUPPORT_UTF8
- CHECK_CLASS_REPEAT:
-#endif
-
- switch (*ccode)
- {
- case OP_CRSTAR: /* These could be empty; continue */
- case OP_CRMINSTAR:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- break;
-
- default: /* Non-repeat => class must match */
- case OP_CRPLUS: /* These repeats aren't empty */
- case OP_CRMINPLUS:
- return FALSE;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
- break;
- }
- break;
-
- /* Opcodes that must match a character */
-
- case OP_PROP:
- case OP_NOTPROP:
- case OP_EXTUNI:
- case OP_NOT_DIGIT:
- case OP_DIGIT:
- case OP_NOT_WHITESPACE:
- case OP_WHITESPACE:
- case OP_NOT_WORDCHAR:
- case OP_WORDCHAR:
- case OP_ANY:
- case OP_ALLANY:
- case OP_ANYBYTE:
- case OP_CHAR:
- case OP_CHARNC:
- case OP_NOT:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_POSPLUS:
- case OP_EXACT:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTPOSPLUS:
- case OP_NOTEXACT:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEPOSPLUS:
- case OP_TYPEEXACT:
- return FALSE;
-
- /* These are going to continue, as they may be empty, but we have to
- fudge the length for the \p and \P cases. */
-
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPOSSTAR:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- case OP_TYPEPOSQUERY:
- if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
- break;
-
- /* Same for these */
-
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- case OP_TYPEPOSUPTO:
- if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
- break;
-
- /* End of branch */
-
- case OP_KET:
- case OP_KETRMAX:
- case OP_KETRMIN:
- case OP_ALT:
- return TRUE;
-
- /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
- MINUPTO, and POSUPTO may be followed by a multibyte character */
-
-#ifdef SUPPORT_UTF8
- case OP_STAR:
- case OP_MINSTAR:
- case OP_POSSTAR:
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_POSQUERY:
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_POSUPTO:
- if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
- break;
-#endif
- }
- }
-
-return TRUE;
-}
-
-
-
-/*************************************************
-* Scan compiled regex for non-emptiness *
-*************************************************/
-
-/* This function is called to check for left recursive calls. We want to check
-the current branch of the current pattern to see if it could match the empty
-string. If it could, we must look outwards for branches at other levels,
-stopping when we pass beyond the bracket which is the subject of the recursion.
-
-Arguments:
- code points to start of the recursion
- endcode points to where to stop (current RECURSE item)
- bcptr points to the chain of current (unclosed) branch starts
- utf8 TRUE if in UTF-8 mode
-
-Returns: TRUE if what is matched could be empty
-*/
-
-static BOOL
-could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
- BOOL utf8)
-{
-while (bcptr != NULL && bcptr->current >= code)
- {
- if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
- bcptr = bcptr->outer;
- }
-return TRUE;
-}
-
-
-
-/*************************************************
-* Check for POSIX class syntax *
-*************************************************/
-
-/* This function is called when the sequence "[:" or "[." or "[=" is
-encountered in a character class. It checks whether this is followed by a
-sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
-reach an unescaped ']' without the special preceding character, return FALSE.
-
-Originally, this function only recognized a sequence of letters between the
-terminators, but it seems that Perl recognizes any sequence of characters,
-though of course unknown POSIX names are subsequently rejected. Perl gives an
-"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
-didn't consider this to be a POSIX class. Likewise for [:1234:].
-
-The problem in trying to be exactly like Perl is in the handling of escapes. We
-have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
-class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
-below handles the special case of \], but does not try to do any other escape
-processing. This makes it different from Perl for cases such as [:l\ower:]
-where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
-"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
-I think.
-
-Arguments:
- ptr pointer to the initial [
- endptr where to return the end pointer
-
-Returns: TRUE or FALSE
-*/
-
-static BOOL
-check_posix_syntax(const uschar *ptr, const uschar **endptr)
-{
-int terminator; /* Don't combine these lines; the Solaris cc */
-terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
-for (++ptr; *ptr != 0; ptr++)
- {
- if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
- {
- if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
- if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
- {
- *endptr = ptr;
- return TRUE;
- }
- }
- }
-return FALSE;
-}
-
-
-
-
-/*************************************************
-* Check POSIX class name *
-*************************************************/
-
-/* This function is called to check the name given in a POSIX-style class entry
-such as [:alnum:].
-
-Arguments:
- ptr points to the first letter
- len the length of the name
-
-Returns: a value representing the name, or -1 if unknown
-*/
-
-static int
-check_posix_name(const uschar *ptr, int len)
-{
-const char *pn = posix_names;
-register int yield = 0;
-while (posix_name_lengths[yield] != 0)
- {
- if (len == posix_name_lengths[yield] &&
- strncmp((const char *)ptr, pn, len) == 0) return yield;
- pn += posix_name_lengths[yield] + 1;
- yield++;
- }
-return -1;
-}
-
-
-/*************************************************
-* Adjust OP_RECURSE items in repeated group *
-*************************************************/
-
-/* OP_RECURSE items contain an offset from the start of the regex to the group
-that is referenced. This means that groups can be replicated for fixed
-repetition simply by copying (because the recursion is allowed to refer to
-earlier groups that are outside the current group). However, when a group is
-optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
-inserted before it, after it has been compiled. This means that any OP_RECURSE
-items within it that refer to the group itself or any contained groups have to
-have their offsets adjusted. That one of the jobs of this function. Before it
-is called, the partially compiled regex must be temporarily terminated with
-OP_END.
-
-This function has been extended with the possibility of forward references for
-recursions and subroutine calls. It must also check the list of such references
-for the group we are dealing with. If it finds that one of the recursions in
-the current group is on this list, it adjusts the offset in the list, not the
-value in the reference (which is a group number).
-
-Arguments:
- group points to the start of the group
- adjust the amount by which the group is to be moved
- utf8 TRUE in UTF-8 mode
- cd contains pointers to tables etc.
- save_hwm the hwm forward reference pointer at the start of the group
-
-Returns: nothing
-*/
-
-static void
-adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
- uschar *save_hwm)
-{
-uschar *ptr = group;
-
-while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
- {
- int offset;
- uschar *hc;
-
- /* See if this recursion is on the forward reference list. If so, adjust the
- reference. */
-
- for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
- {
- offset = GET(hc, 0);
- if (cd->start_code + offset == ptr + 1)
- {
- PUT(hc, 0, offset + adjust);
- break;
- }
- }
-
- /* Otherwise, adjust the recursion offset if it's after the start of this
- group. */
-
- if (hc >= cd->hwm)
- {
- offset = GET(ptr, 1);
- if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
- }
-
- ptr += 1 + LINK_SIZE;
- }
-}
-
-
-
-/*************************************************
-* Insert an automatic callout point *
-*************************************************/
-
-/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
-callout points before each pattern item.
-
-Arguments:
- code current code pointer
- ptr current pattern pointer
- cd pointers to tables etc
-
-Returns: new code pointer
-*/
-
-static uschar *
-auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
-{
-*code++ = OP_CALLOUT;
-*code++ = 255;
-PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
-PUT(code, LINK_SIZE, 0); /* Default length */
-return code + 2*LINK_SIZE;
-}
-
-
-
-/*************************************************
-* Complete a callout item *
-*************************************************/
-
-/* A callout item contains the length of the next item in the pattern, which
-we can't fill in till after we have reached the relevant point. This is used
-for both automatic and manual callouts.
-
-Arguments:
- previous_callout points to previous callout item
- ptr current pattern pointer
- cd pointers to tables etc
-
-Returns: nothing
-*/
-
-static void
-complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
-{
-int length = ptr - cd->start_pattern - GET(previous_callout, 2);
-PUT(previous_callout, 2 + LINK_SIZE, length);
-}
-
-
-
-#ifdef SUPPORT_UCP
-/*************************************************
-* Get othercase range *
-*************************************************/
-
-/* This function is passed the start and end of a class range, in UTF-8 mode
-with UCP support. It searches up the characters, looking for internal ranges of
-characters in the "other" case. Each call returns the next one, updating the
-start address.
-
-Arguments:
- cptr points to starting character value; updated
- d end value
- ocptr where to put start of othercase range
- odptr where to put end of othercase range
-
-Yield: TRUE when range returned; FALSE when no more
-*/
-
-static BOOL
-get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
- unsigned int *odptr)
-{
-unsigned int c, othercase, next;
-
-for (c = *cptr; c <= d; c++)
- { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
-
-if (c > d) return FALSE;
-
-*ocptr = othercase;
-next = othercase + 1;
-
-for (++c; c <= d; c++)
- {
- if (UCD_OTHERCASE(c) != next) break;
- next++;
- }
-
-*odptr = next - 1;
-*cptr = c;
-
-return TRUE;
-}
-#endif /* SUPPORT_UCP */
-
-
-
-/*************************************************
-* Check if auto-possessifying is possible *
-*************************************************/
-
-/* This function is called for unlimited repeats of certain items, to see
-whether the next thing could possibly match the repeated item. If not, it makes
-sense to automatically possessify the repeated item.
-
-Arguments:
- op_code the repeated op code
- this data for this item, depends on the opcode
- utf8 TRUE in UTF-8 mode
- utf8_char used for utf8 character bytes, NULL if not relevant
- ptr next character in pattern
- options options bits
- cd contains pointers to tables etc.
-
-Returns: TRUE if possessifying is wanted
-*/
-
-static BOOL
-check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
- const uschar *ptr, int options, compile_data *cd)
-{
-int next;
-
-/* Skip whitespace and comments in extended mode */
-
-if ((options & PCRE_EXTENDED) != 0)
- {
- for (;;)
- {
- while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
- if (*ptr == CHAR_NUMBER_SIGN)
- {
- while (*(++ptr) != 0)
- if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
- }
- else break;
- }
- }
-
-/* If the next item is one that we can handle, get its value. A non-negative
-value is a character, a negative value is an escape value. */
-
-if (*ptr == CHAR_BACKSLASH)
- {
- int temperrorcode = 0;
- next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
- if (temperrorcode != 0) return FALSE;
- ptr++; /* Point after the escape sequence */
- }
-
-else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
- {
-#ifdef SUPPORT_UTF8
- if (utf8) { GETCHARINC(next, ptr); } else
-#endif
- next = *ptr++;
- }
-
-else return FALSE;
-
-/* Skip whitespace and comments in extended mode */
-
-if ((options & PCRE_EXTENDED) != 0)
- {
- for (;;)
- {
- while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
- if (*ptr == CHAR_NUMBER_SIGN)
- {
- while (*(++ptr) != 0)
- if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
- }
- else break;
- }
- }
-
-/* If the next thing is itself optional, we have to give up. */
-
-if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
- strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
- return FALSE;
-
-/* Now compare the next item with the previous opcode. If the previous is a
-positive single character match, "item" either contains the character or, if
-"item" is greater than 127 in utf8 mode, the character's bytes are in
-utf8_char. */
-
-
-/* Handle cases when the next item is a character. */
-
-if (next >= 0) switch(op_code)
- {
- case OP_CHAR:
-#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
-#else
- (void)(utf8_char); /* Keep compiler happy by referencing function argument */
-#endif
- return item != next;
-
- /* For CHARNC (caseless character) we must check the other case. If we have
- Unicode property support, we can use it to test the other case of
- high-valued characters. */
-
- case OP_CHARNC:
-#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
-#endif
- if (item == next) return FALSE;
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- unsigned int othercase;
- if (next < 128) othercase = cd->fcc[next]; else
-#ifdef SUPPORT_UCP
- othercase = UCD_OTHERCASE((unsigned int)next);
-#else
- othercase = NOTACHAR;
-#endif
- return (unsigned int)item != othercase;
- }
- else
-#endif /* SUPPORT_UTF8 */
- return (item != cd->fcc[next]); /* Non-UTF-8 mode */
-
- /* For OP_NOT, "item" must be a single-byte character. */
-
- case OP_NOT:
- if (item == next) return TRUE;
- if ((options & PCRE_CASELESS) == 0) return FALSE;
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- unsigned int othercase;
- if (next < 128) othercase = cd->fcc[next]; else
-#ifdef SUPPORT_UCP
- othercase = UCD_OTHERCASE(next);
-#else
- othercase = NOTACHAR;
-#endif
- return (unsigned int)item == othercase;
- }
- else
-#endif /* SUPPORT_UTF8 */
- return (item == cd->fcc[next]); /* Non-UTF-8 mode */
-
- case OP_DIGIT:
- return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
-
- case OP_NOT_DIGIT:
- return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
-
- case OP_WHITESPACE:
- return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
-
- case OP_NOT_WHITESPACE:
- return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
-
- case OP_WORDCHAR:
- return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
-
- case OP_NOT_WORDCHAR:
- return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
-
- case OP_HSPACE:
- case OP_NOT_HSPACE:
- switch(next)
- {
- case 0x09:
- case 0x20:
- case 0xa0:
- case 0x1680:
- case 0x180e:
- case 0x2000:
- case 0x2001:
- case 0x2002:
- case 0x2003:
- case 0x2004:
- case 0x2005:
- case 0x2006:
- case 0x2007:
- case 0x2008:
- case 0x2009:
- case 0x200A:
- case 0x202f:
- case 0x205f:
- case 0x3000:
- return op_code != OP_HSPACE;
- default:
- return op_code == OP_HSPACE;
- }
-
- case OP_VSPACE:
- case OP_NOT_VSPACE:
- switch(next)
- {
- case 0x0a:
- case 0x0b:
- case 0x0c:
- case 0x0d:
- case 0x85:
- case 0x2028:
- case 0x2029:
- return op_code != OP_VSPACE;
- default:
- return op_code == OP_VSPACE;
- }
-
- default:
- return FALSE;
- }
-
-
-/* Handle the case when the next item is \d, \s, etc. */
-
-switch(op_code)
- {
- case OP_CHAR:
- case OP_CHARNC:
-#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
-#endif
- switch(-next)
- {
- case ESC_d:
- return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
-
- case ESC_D:
- return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
-
- case ESC_s:
- return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
-
- case ESC_S:
- return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
-
- case ESC_w:
- return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
-
- case ESC_W:
- return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
-
- case ESC_h:
- case ESC_H:
- switch(item)
- {
- case 0x09:
- case 0x20:
- case 0xa0:
- case 0x1680:
- case 0x180e:
- case 0x2000:
- case 0x2001:
- case 0x2002:
- case 0x2003:
- case 0x2004:
- case 0x2005:
- case 0x2006:
- case 0x2007:
- case 0x2008:
- case 0x2009:
- case 0x200A:
- case 0x202f:
- case 0x205f:
- case 0x3000:
- return -next != ESC_h;
- default:
- return -next == ESC_h;
- }
-
- case ESC_v:
- case ESC_V:
- switch(item)
- {
- case 0x0a:
- case 0x0b:
- case 0x0c:
- case 0x0d:
- case 0x85:
- case 0x2028:
- case 0x2029:
- return -next != ESC_v;
- default:
- return -next == ESC_v;
- }
-
- default:
- return FALSE;
- }
-
- case OP_DIGIT:
- return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
- next == -ESC_h || next == -ESC_v;
-
- case OP_NOT_DIGIT:
- return next == -ESC_d;
-
- case OP_WHITESPACE:
- return next == -ESC_S || next == -ESC_d || next == -ESC_w;
-
- case OP_NOT_WHITESPACE:
- return next == -ESC_s || next == -ESC_h || next == -ESC_v;
-
- case OP_HSPACE:
- return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
-
- case OP_NOT_HSPACE:
- return next == -ESC_h;
-
- /* Can't have \S in here because VT matches \S (Perl anomaly) */
- case OP_VSPACE:
- return next == -ESC_V || next == -ESC_d || next == -ESC_w;
-
- case OP_NOT_VSPACE:
- return next == -ESC_v;
-
- case OP_WORDCHAR:
- return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
-
- case OP_NOT_WORDCHAR:
- return next == -ESC_w || next == -ESC_d;
-
- default:
- return FALSE;
- }
-
-/* Control does not reach here */
-}
-
-
-
-/*************************************************
-* Compile one branch *
-*************************************************/
-
-/* Scan the pattern, compiling it into the a vector. If the options are
-changed during the branch, the pointer is used to change the external options
-bits. This function is used during the pre-compile phase when we are trying
-to find out the amount of memory needed, as well as during the real compile
-phase. The value of lengthptr distinguishes the two phases.
-
-Arguments:
- optionsptr pointer to the option bits
- codeptr points to the pointer to the current code point
- ptrptr points to the current pattern pointer
- errorcodeptr points to error code variable
- firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
- reqbyteptr set to the last literal character required, else < 0
- bcptr points to current branch chain
- cd contains pointers to tables etc.
- lengthptr NULL during the real compile phase
- points to length accumulator during pre-compile phase
-
-Returns: TRUE on success
- FALSE, with *errorcodeptr set non-zero on error
-*/
-
-static BOOL
-compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
- int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
- compile_data *cd, int *lengthptr)
-{
-int repeat_type, op_type;
-int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
-int bravalue = 0;
-int greedy_default, greedy_non_default;
-int firstbyte, reqbyte;
-int zeroreqbyte, zerofirstbyte;
-int req_caseopt, reqvary, tempreqvary;
-int options = *optionsptr;
-int after_manual_callout = 0;
-int length_prevgroup = 0;
-register int c;
-register uschar *code = *codeptr;
-uschar *last_code = code;
-uschar *orig_code = code;
-uschar *tempcode;
-BOOL inescq = FALSE;
-BOOL groupsetfirstbyte = FALSE;
-const uschar *ptr = *ptrptr;
-const uschar *tempptr;
-uschar *previous = NULL;
-uschar *previous_callout = NULL;
-uschar *save_hwm = NULL;
-uschar classbits[32];
-
-#ifdef SUPPORT_UTF8
-BOOL class_utf8;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
-uschar *class_utf8data;
-uschar *class_utf8data_base;
-uschar utf8_char[6];
-#else
-BOOL utf8 = FALSE;
-uschar *utf8_char = NULL;
-#endif
-
-#ifdef PCRE_DEBUG
-if (lengthptr != NULL) DPRINTF((">> start branch\n"));
-#endif
-
-/* Set up the default and non-default settings for greediness */
-
-greedy_default = ((options & PCRE_UNGREEDY) != 0);
-greedy_non_default = greedy_default ^ 1;
-
-/* Initialize no first byte, no required byte. REQ_UNSET means "no char
-matching encountered yet". It gets changed to REQ_NONE if we hit something that
-matches a non-fixed char first char; reqbyte just remains unset if we never
-find one.
-
-When we hit a repeat whose minimum is zero, we may have to adjust these values
-to take the zero repeat into account. This is implemented by setting them to
-zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
-item types that can be repeated set these backoff variables appropriately. */
-
-firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
-
-/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
-according to the current setting of the caseless flag. REQ_CASELESS is a bit
-value > 255. It is added into the firstbyte or reqbyte variables to record the
-case status of the value. This is used only for ASCII characters. */
-
-req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
-
-/* Switch on next character until the end of the branch */
-
-for (;; ptr++)
- {
- BOOL negate_class;
- BOOL should_flip_negation;
- BOOL possessive_quantifier;
- BOOL is_quantifier;
- BOOL is_recurse;
- BOOL reset_bracount;
- int class_charcount;
- int class_lastchar;
- int newoptions;
- int recno;
- int refsign;
- int skipbytes;
- int subreqbyte;
- int subfirstbyte;
- int terminator;
- int mclength;
- uschar mcbuffer[8];
-
- /* Get next byte in the pattern */
-
- c = *ptr;
-
- /* If we are in the pre-compile phase, accumulate the length used for the
- previous cycle of this loop. */
-
- if (lengthptr != NULL)
- {
-#ifdef PCRE_DEBUG
- if (code > cd->hwm) cd->hwm = code; /* High water info */
-#endif
- if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
- {
- *errorcodeptr = ERR52;
- goto FAILED;
- }
-
- /* There is at least one situation where code goes backwards: this is the
- case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
- the class is simply eliminated. However, it is created first, so we have to
- allow memory for it. Therefore, don't ever reduce the length at this point.
- */
-
- if (code < last_code) code = last_code;
-
- /* Paranoid check for integer overflow */
-
- if (OFLOW_MAX - *lengthptr < code - last_code)
- {
- *errorcodeptr = ERR20;
- goto FAILED;
- }
-
- *lengthptr += code - last_code;
- DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
-
- /* If "previous" is set and it is not at the start of the work space, move
- it back to there, in order to avoid filling up the work space. Otherwise,
- if "previous" is NULL, reset the current code pointer to the start. */
-
- if (previous != NULL)
- {
- if (previous > orig_code)
- {
- memmove(orig_code, previous, code - previous);
- code -= previous - orig_code;
- previous = orig_code;
- }
- }
- else code = orig_code;
-
- /* Remember where this code item starts so we can pick up the length
- next time round. */
-
- last_code = code;
- }
-
- /* In the real compile phase, just check the workspace used by the forward
- reference list. */
-
- else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
- {
- *errorcodeptr = ERR52;
- goto FAILED;
- }
-
- /* If in \Q...\E, check for the end; if not, we have a literal */
-
- if (inescq && c != 0)
- {
- if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
- {
- inescq = FALSE;
- ptr++;
- continue;
- }
- else
- {
- if (previous_callout != NULL)
- {
- if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
- complete_callout(previous_callout, ptr, cd);
- previous_callout = NULL;
- }
- if ((options & PCRE_AUTO_CALLOUT) != 0)
- {
- previous_callout = code;
- code = auto_callout(code, ptr, cd);
- }
- goto NORMAL_CHAR;
- }
- }
-
- /* Fill in length of a previous callout, except when the next thing is
- a quantifier. */
-
- is_quantifier =
- c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
- (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
-
- if (!is_quantifier && previous_callout != NULL &&
- after_manual_callout-- <= 0)
- {
- if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
- complete_callout(previous_callout, ptr, cd);
- previous_callout = NULL;
- }
-
- /* In extended mode, skip white space and comments */
-
- if ((options & PCRE_EXTENDED) != 0)
- {
- if ((cd->ctypes[c] & ctype_space) != 0) continue;
- if (c == CHAR_NUMBER_SIGN)
- {
- while (*(++ptr) != 0)
- {
- if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
- }
- if (*ptr != 0) continue;
-
- /* Else fall through to handle end of string */
- c = 0;
- }
- }
-
- /* No auto callout for quantifiers. */
-
- if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
- {
- previous_callout = code;
- code = auto_callout(code, ptr, cd);
- }
-
- switch(c)
- {
- /* ===================================================================*/
- case 0: /* The branch terminates at string end */
- case CHAR_VERTICAL_LINE: /* or | or ) */
- case CHAR_RIGHT_PARENTHESIS:
- *firstbyteptr = firstbyte;
- *reqbyteptr = reqbyte;
- *codeptr = code;
- *ptrptr = ptr;
- if (lengthptr != NULL)
- {
- if (OFLOW_MAX - *lengthptr < code - last_code)
- {
- *errorcodeptr = ERR20;
- goto FAILED;
- }
- *lengthptr += code - last_code; /* To include callout length */
- DPRINTF((">> end branch\n"));
- }
- return TRUE;
-
-
- /* ===================================================================*/
- /* Handle single-character metacharacters. In multiline mode, ^ disables
- the setting of any following char as a first character. */
-
- case CHAR_CIRCUMFLEX_ACCENT:
- if ((options & PCRE_MULTILINE) != 0)
- {
- if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
- }
- previous = NULL;
- *code++ = OP_CIRC;
- break;
-
- case CHAR_DOLLAR_SIGN:
- previous = NULL;
- *code++ = OP_DOLL;
- break;
-
- /* There can never be a first char if '.' is first, whatever happens about
- repeats. The value of reqbyte doesn't change either. */
-
- case CHAR_DOT:
- if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
- zerofirstbyte = firstbyte;
- zeroreqbyte = reqbyte;
- previous = code;
- *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
- break;
-
-
- /* ===================================================================*/
- /* Character classes. If the included characters are all < 256, we build a
- 32-byte bitmap of the permitted characters, except in the special case
- where there is only one such character. For negated classes, we build the
- map as usual, then invert it at the end. However, we use a different opcode
- so that data characters > 255 can be handled correctly.
-
- If the class contains characters outside the 0-255 range, a different
- opcode is compiled. It may optionally have a bit map for characters < 256,
- but those above are are explicitly listed afterwards. A flag byte tells
- whether the bitmap is present, and whether this is a negated class or not.
-
- In JavaScript compatibility mode, an isolated ']' causes an error. In
- default (Perl) mode, it is treated as a data character. */
-
- case CHAR_RIGHT_SQUARE_BRACKET:
- if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
- {
- *errorcodeptr = ERR64;
- goto FAILED;
- }
- goto NORMAL_CHAR;
-
- case CHAR_LEFT_SQUARE_BRACKET:
- previous = code;
-
- /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
- they are encountered at the top level, so we'll do that too. */
-
- if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
- ptr[1] == CHAR_EQUALS_SIGN) &&
- check_posix_syntax(ptr, &tempptr))
- {
- *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
- goto FAILED;
- }
-
- /* If the first character is '^', set the negation flag and skip it. Also,
- if the first few characters (either before or after ^) are \Q\E or \E we
- skip them too. This makes for compatibility with Perl. */
-
- negate_class = FALSE;
- for (;;)
- {
- c = *(++ptr);
- if (c == CHAR_BACKSLASH)
- {
- if (ptr[1] == CHAR_E)
- ptr++;
- else if (strncmp((const char *)ptr+1,
- STR_Q STR_BACKSLASH STR_E, 3) == 0)
- ptr += 3;
- else
- break;
- }
- else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
- negate_class = TRUE;
- else break;
- }
-
- /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
- an initial ']' is taken as a data character -- the code below handles
- that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
- [^] must match any character, so generate OP_ALLANY. */
-
- if (c == CHAR_RIGHT_SQUARE_BRACKET &&
- (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
- {
- *code++ = negate_class? OP_ALLANY : OP_FAIL;
- if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
- zerofirstbyte = firstbyte;
- break;
- }
-
- /* If a class contains a negative special such as \S, we need to flip the
- negation flag at the end, so that support for characters > 255 works
- correctly (they are all included in the class). */
-
- should_flip_negation = FALSE;
-
- /* Keep a count of chars with values < 256 so that we can optimize the case
- of just a single character (as long as it's < 256). However, For higher
- valued UTF-8 characters, we don't yet do any optimization. */
-
- class_charcount = 0;
- class_lastchar = -1;
-
- /* Initialize the 32-char bit map to all zeros. We build the map in a
- temporary bit of memory, in case the class contains only 1 character (less
- than 256), because in that case the compiled code doesn't use the bit map.
- */
-
- memset(classbits, 0, 32 * sizeof(uschar));
-
-#ifdef SUPPORT_UTF8
- class_utf8 = FALSE; /* No chars >= 256 */
- class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
- class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
-#endif
-
- /* Process characters until ] is reached. By writing this as a "do" it
- means that an initial ] is taken as a data character. At the start of the
- loop, c contains the first byte of the character. */
-
- if (c != 0) do
- {
- const uschar *oldptr;
-
-#ifdef SUPPORT_UTF8
- if (utf8 && c > 127)
- { /* Braces are required because the */
- GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
- }
-
- /* In the pre-compile phase, accumulate the length of any UTF-8 extra
- data and reset the pointer. This is so that very large classes that
- contain a zillion UTF-8 characters no longer overwrite the work space
- (which is on the stack). */
-
- if (lengthptr != NULL)
- {
- *lengthptr += class_utf8data - class_utf8data_base;
- class_utf8data = class_utf8data_base;
- }
-
-#endif
-
- /* Inside \Q...\E everything is literal except \E */
-
- if (inescq)
- {
- if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
- {
- inescq = FALSE; /* Reset literal state */
- ptr++; /* Skip the 'E' */
- continue; /* Carry on with next */
- }
- goto CHECK_RANGE; /* Could be range if \E follows */
- }
-
- /* Handle POSIX class names. Perl allows a negation extension of the
- form [:^name:]. A square bracket that doesn't match the syntax is
- treated as a literal. We also recognize the POSIX constructions
- [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
- 5.6 and 5.8 do. */
-
- if (c == CHAR_LEFT_SQUARE_BRACKET &&
- (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
- ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
- {
- BOOL local_negate = FALSE;
- int posix_class, taboffset, tabopt;
- register const uschar *cbits = cd->cbits;
- uschar pbits[32];
-
- if (ptr[1] != CHAR_COLON)
- {
- *errorcodeptr = ERR31;
- goto FAILED;
- }
-
- ptr += 2;
- if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
- {
- local_negate = TRUE;
- should_flip_negation = TRUE; /* Note negative special */
- ptr++;
- }
-
- posix_class = check_posix_name(ptr, tempptr - ptr);
- if (posix_class < 0)
- {
- *errorcodeptr = ERR30;
- goto FAILED;
- }
-
- /* If matching is caseless, upper and lower are converted to
- alpha. This relies on the fact that the class table starts with
- alpha, lower, upper as the first 3 entries. */
-
- if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
- posix_class = 0;
-
- /* We build the bit map for the POSIX class in a chunk of local store
- because we may be adding and subtracting from it, and we don't want to
- subtract bits that may be in the main map already. At the end we or the
- result into the bit map that is being built. */
-
- posix_class *= 3;
-
- /* Copy in the first table (always present) */
-
- memcpy(pbits, cbits + posix_class_maps[posix_class],
- 32 * sizeof(uschar));
-
- /* If there is a second table, add or remove it as required. */
-
- taboffset = posix_class_maps[posix_class + 1];
- tabopt = posix_class_maps[posix_class + 2];
-
- if (taboffset >= 0)
- {
- if (tabopt >= 0)
- for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
- else
- for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
- }
-
- /* Not see if we need to remove any special characters. An option
- value of 1 removes vertical space and 2 removes underscore. */
-
- if (tabopt < 0) tabopt = -tabopt;
- if (tabopt == 1) pbits[1] &= ~0x3c;
- else if (tabopt == 2) pbits[11] &= 0x7f;
-
- /* Add the POSIX table or its complement into the main table that is
- being built and we are done. */
-
- if (local_negate)
- for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
- else
- for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
-
- ptr = tempptr + 1;
- class_charcount = 10; /* Set > 1; assumes more than 1 per class */
- continue; /* End of POSIX syntax handling */
- }
-
- /* Backslash may introduce a single character, or it may introduce one
- of the specials, which just set a flag. The sequence \b is a special
- case. Inside a class (and only there) it is treated as backspace.
- Elsewhere it marks a word boundary. Other escapes have preset maps ready
- to 'or' into the one we are building. We assume they have more than one
- character in them, so set class_charcount bigger than one. */
-
- if (c == CHAR_BACKSLASH)
- {
- c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
- if (*errorcodeptr != 0) goto FAILED;
-
- if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
- else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
- else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
- else if (-c == ESC_Q) /* Handle start of quoted string */
- {
- if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
- {
- ptr += 2; /* avoid empty string */
- }
- else inescq = TRUE;
- continue;
- }
- else if (-c == ESC_E) continue; /* Ignore orphan \E */
-
- if (c < 0)
- {
- register const uschar *cbits = cd->cbits;
- class_charcount += 2; /* Greater than 1 is what matters */
-
- /* Save time by not doing this in the pre-compile phase. */
-
- if (lengthptr == NULL) switch (-c)
- {
- case ESC_d:
- for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
- continue;
-
- case ESC_D:
- should_flip_negation = TRUE;
- for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
- continue;
-
- case ESC_w:
- for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
- continue;
-
- case ESC_W:
- should_flip_negation = TRUE;
- for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
- continue;
-
- case ESC_s:
- for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
- classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
- continue;
-
- case ESC_S:
- should_flip_negation = TRUE;
- for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
- classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
- continue;
-
- default: /* Not recognized; fall through */
- break; /* Need "default" setting to stop compiler warning. */
- }
-
- /* In the pre-compile phase, just do the recognition. */
-
- else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
- c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
-
- /* We need to deal with \H, \h, \V, and \v in both phases because
- they use extra memory. */
-
- if (-c == ESC_h)
- {
- SETBIT(classbits, 0x09); /* VT */
- SETBIT(classbits, 0x20); /* SPACE */
- SETBIT(classbits, 0xa0); /* NSBP */
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
- }
-#endif
- continue;
- }
-
- if (-c == ESC_H)
- {
- for (c = 0; c < 32; c++)
- {
- int x = 0xff;
- switch (c)
- {
- case 0x09/8: x ^= 1 << (0x09%8); break;
- case 0x20/8: x ^= 1 << (0x20%8); break;
- case 0xa0/8: x ^= 1 << (0xa0%8); break;
- default: break;
- }
- classbits[c] |= x;
- }
-
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
- }
-#endif
- continue;
- }
-
- if (-c == ESC_v)
- {
- SETBIT(classbits, 0x0a); /* LF */
- SETBIT(classbits, 0x0b); /* VT */
- SETBIT(classbits, 0x0c); /* FF */
- SETBIT(classbits, 0x0d); /* CR */
- SETBIT(classbits, 0x85); /* NEL */
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
- }
-#endif
- continue;
- }
-
- if (-c == ESC_V)
- {
- for (c = 0; c < 32; c++)
- {
- int x = 0xff;
- switch (c)
- {
- case 0x0a/8: x ^= 1 << (0x0a%8);
- x ^= 1 << (0x0b%8);
- x ^= 1 << (0x0c%8);
- x ^= 1 << (0x0d%8);
- break;
- case 0x85/8: x ^= 1 << (0x85%8); break;
- default: break;
- }
- classbits[c] |= x;
- }
-
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
- class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
- }
-#endif
- continue;
- }
-
- /* We need to deal with \P and \p in both phases. */
-
-#ifdef SUPPORT_UCP
- if (-c == ESC_p || -c == ESC_P)
- {
- BOOL negated;
- int pdata;
- int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
- if (ptype < 0) goto FAILED;
- class_utf8 = TRUE;
- *class_utf8data++ = ((-c == ESC_p) != negated)?
- XCL_PROP : XCL_NOTPROP;
- *class_utf8data++ = ptype;
- *class_utf8data++ = pdata;
- class_charcount -= 2; /* Not a < 256 character */
- continue;
- }
-#endif
- /* Unrecognized escapes are faulted if PCRE is running in its
- strict mode. By default, for compatibility with Perl, they are
- treated as literals. */
-
- if ((options & PCRE_EXTRA) != 0)
- {
- *errorcodeptr = ERR7;
- goto FAILED;
- }
-
- class_charcount -= 2; /* Undo the default count from above */
- c = *ptr; /* Get the final character and fall through */
- }
-
- /* Fall through if we have a single character (c >= 0). This may be
- greater than 256 in UTF-8 mode. */
-
- } /* End of backslash handling */
-
- /* A single character may be followed by '-' to form a range. However,
- Perl does not permit ']' to be the end of the range. A '-' character
- at the end is treated as a literal. Perl ignores orphaned \E sequences
- entirely. The code for handling \Q and \E is messy. */
-
- CHECK_RANGE:
- while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
- {
- inescq = FALSE;
- ptr += 2;
- }
-
- oldptr = ptr;
-
- /* Remember \r or \n */
-
- if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
-
- /* Check for range */
-
- if (!inescq && ptr[1] == CHAR_MINUS)
- {
- int d;
- ptr += 2;
- while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
-
- /* If we hit \Q (not followed by \E) at this point, go into escaped
- mode. */
-
- while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
- {
- ptr += 2;
- if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
- { ptr += 2; continue; }
- inescq = TRUE;
- break;
- }
-
- if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
- {
- ptr = oldptr;
- goto LONE_SINGLE_CHARACTER;
- }
-
-#ifdef SUPPORT_UTF8
- if (utf8)
- { /* Braces are required because the */
- GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
- }
- else
-#endif
- d = *ptr; /* Not UTF-8 mode */
-
- /* The second part of a range can be a single-character escape, but
- not any of the other escapes. Perl 5.6 treats a hyphen as a literal
- in such circumstances. */
-
- if (!inescq && d == CHAR_BACKSLASH)
- {
- d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
- if (*errorcodeptr != 0) goto FAILED;
-
- /* \b is backspace; \X is literal X; \R is literal R; any other
- special means the '-' was literal */
-
- if (d < 0)
- {
- if (d == -ESC_b) d = CHAR_BS;
- else if (d == -ESC_X) d = CHAR_X;
- else if (d == -ESC_R) d = CHAR_R; else
- {
- ptr = oldptr;
- goto LONE_SINGLE_CHARACTER; /* A few lines below */
- }
- }
- }
-
- /* Check that the two values are in the correct order. Optimize
- one-character ranges */
-
- if (d < c)
- {
- *errorcodeptr = ERR8;
- goto FAILED;
- }
-
- if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
-
- /* Remember \r or \n */
-
- if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
-
- /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
- matching, we have to use an XCLASS with extra data items. Caseless
- matching for characters > 127 is available only if UCP support is
- available. */
-
-#ifdef SUPPORT_UTF8
- if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
- {
- class_utf8 = TRUE;
-
- /* With UCP support, we can find the other case equivalents of
- the relevant characters. There may be several ranges. Optimize how
- they fit with the basic range. */
-
-#ifdef SUPPORT_UCP
- if ((options & PCRE_CASELESS) != 0)
- {
- unsigned int occ, ocd;
- unsigned int cc = c;
- unsigned int origd = d;
- while (get_othercase_range(&cc, origd, &occ, &ocd))
- {
- if (occ >= (unsigned int)c &&
- ocd <= (unsigned int)d)
- continue; /* Skip embedded ranges */
-
- if (occ < (unsigned int)c &&
- ocd >= (unsigned int)c - 1) /* Extend the basic range */
- { /* if there is overlap, */
- c = occ; /* noting that if occ < c */
- continue; /* we can't have ocd > d */
- } /* because a subrange is */
- if (ocd > (unsigned int)d &&
- occ <= (unsigned int)d + 1) /* always shorter than */
- { /* the basic range. */
- d = ocd;
- continue;
- }
-
- if (occ == ocd)
- {
- *class_utf8data++ = XCL_SINGLE;
- }
- else
- {
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
- }
- class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
- }
- }
-#endif /* SUPPORT_UCP */
-
- /* Now record the original range, possibly modified for UCP caseless
- overlapping ranges. */
-
- *class_utf8data++ = XCL_RANGE;
- class_utf8data += _pcre_ord2utf8(c, class_utf8data);
- class_utf8data += _pcre_ord2utf8(d, class_utf8data);
-
- /* With UCP support, we are done. Without UCP support, there is no
- caseless matching for UTF-8 characters > 127; we can use the bit map
- for the smaller ones. */
-
-#ifdef SUPPORT_UCP
- continue; /* With next character in the class */
-#else
- if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
-
- /* Adjust upper limit and fall through to set up the map */
-
- d = 127;
-
-#endif /* SUPPORT_UCP */
- }
-#endif /* SUPPORT_UTF8 */
-
- /* We use the bit map for all cases when not in UTF-8 mode; else
- ranges that lie entirely within 0-127 when there is UCP support; else
- for partial ranges without UCP support. */
-
- class_charcount += d - c + 1;
- class_lastchar = d;
-
- /* We can save a bit of time by skipping this in the pre-compile. */
-
- if (lengthptr == NULL) for (; c <= d; c++)
- {
- classbits[c/8] |= (1 << (c&7));
- if ((options & PCRE_CASELESS) != 0)
- {
- int uc = cd->fcc[c]; /* flip case */
- classbits[uc/8] |= (1 << (uc&7));
- }
- }
-
- continue; /* Go get the next char in the class */
- }
-
- /* Handle a lone single character - we can get here for a normal
- non-escape char, or after \ that introduces a single character or for an
- apparent range that isn't. */
-
- LONE_SINGLE_CHARACTER:
-
- /* Handle a character that cannot go in the bit map */
-
-#ifdef SUPPORT_UTF8
- if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
- {
- class_utf8 = TRUE;
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += _pcre_ord2utf8(c, class_utf8data);
-
-#ifdef SUPPORT_UCP
- if ((options & PCRE_CASELESS) != 0)
- {
- unsigned int othercase;
- if ((othercase = UCD_OTHERCASE(c)) != c)
- {
- *class_utf8data++ = XCL_SINGLE;
- class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
- }
- }
-#endif /* SUPPORT_UCP */
-
- }
- else
-#endif /* SUPPORT_UTF8 */
-
- /* Handle a single-byte character */
- {
- classbits[c/8] |= (1 << (c&7));
- if ((options & PCRE_CASELESS) != 0)
- {
- c = cd->fcc[c]; /* flip case */
- classbits[c/8] |= (1 << (c&7));
- }
- class_charcount++;
- class_lastchar = c;
- }
- }
-
- /* Loop until ']' reached. This "while" is the end of the "do" above. */
-
- while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
-
- if (c == 0) /* Missing terminating ']' */
- {
- *errorcodeptr = ERR6;
- goto FAILED;
- }
-
-
-/* This code has been disabled because it would mean that \s counts as
-an explicit \r or \n reference, and that's not really what is wanted. Now
-we set the flag only if there is a literal "\r" or "\n" in the class. */
-
-#if 0
- /* Remember whether \r or \n are in this class */
-
- if (negate_class)
- {
- if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
- }
- else
- {
- if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
- }
-#endif
-
-
- /* If class_charcount is 1, we saw precisely one character whose value is
- less than 256. As long as there were no characters >= 128 and there was no
- use of \p or \P, in other words, no use of any XCLASS features, we can
- optimize.
-
- In UTF-8 mode, we can optimize the negative case only if there were no
- characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
- operate on single-bytes only. This is an historical hangover. Maybe one day
- we can tidy these opcodes to handle multi-byte characters.
-
- The optimization throws away the bit map. We turn the item into a
- 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
- that OP_NOT does not support multibyte characters. In the positive case, it
- can cause firstbyte to be set. Otherwise, there can be no first char if
- this item is first, whatever repeat count may follow. In the case of
- reqbyte, save the previous value for reinstating. */
-
-#ifdef SUPPORT_UTF8
- if (class_charcount == 1 && !class_utf8 &&
- (!utf8 || !negate_class || class_lastchar < 128))
-#else
- if (class_charcount == 1)
-#endif
- {
- zeroreqbyte = reqbyte;
-
- /* The OP_NOT opcode works on one-byte characters only. */
-
- if (negate_class)
- {
- if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
- zerofirstbyte = firstbyte;
- *code++ = OP_NOT;
- *code++ = class_lastchar;
- break;
- }
-
- /* For a single, positive character, get the value into mcbuffer, and
- then we can handle this with the normal one-character code. */
-
-#ifdef SUPPORT_UTF8
- if (utf8 && class_lastchar > 127)
- mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
- else
-#endif
- {
- mcbuffer[0] = class_lastchar;
- mclength = 1;
- }
- goto ONE_CHAR;
- } /* End of 1-char optimization */
-
- /* The general case - not the one-char optimization. If this is the first
- thing in the branch, there can be no first char setting, whatever the
- repeat count. Any reqbyte setting must remain unchanged after any kind of
- repeat. */
-
- if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
- zerofirstbyte = firstbyte;
- zeroreqbyte = reqbyte;
-
- /* If there are characters with values > 255, we have to compile an
- extended class, with its own opcode, unless there was a negated special
- such as \S in the class, because in that case all characters > 255 are in
- the class, so any that were explicitly given as well can be ignored. If
- (when there are explicit characters > 255 that must be listed) there are no
- characters < 256, we can omit the bitmap in the actual compiled code. */
-
-#ifdef SUPPORT_UTF8
- if (class_utf8 && !should_flip_negation)
- {
- *class_utf8data++ = XCL_END; /* Marks the end of extra data */
- *code++ = OP_XCLASS;
- code += LINK_SIZE;
- *code = negate_class? XCL_NOT : 0;
-
- /* If the map is required, move up the extra data to make room for it;
- otherwise just move the code pointer to the end of the extra data. */
-
- if (class_charcount > 0)
- {
- *code++ |= XCL_MAP;
- memmove(code + 32, code, class_utf8data - code);
- memcpy(code, classbits, 32);
- code = class_utf8data + 32;
- }
- else code = class_utf8data;
-
- /* Now fill in the complete length of the item */
-
- PUT(previous, 1, code - previous);
- break; /* End of class handling */
- }
-#endif
-
- /* If there are no characters > 255, set the opcode to OP_CLASS or
- OP_NCLASS, depending on whether the whole class was negated and whether
- there were negative specials such as \S in the class. Then copy the 32-byte
- map into the code vector, negating it if necessary. */
-
- *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
- if (negate_class)
- {
- if (lengthptr == NULL) /* Save time in the pre-compile phase */
- for (c = 0; c < 32; c++) code[c] = ~classbits[c];
- }
- else
- {
- memcpy(code, classbits, 32);
- }
- code += 32;
- break;
-
-
- /* ===================================================================*/
- /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
- has been tested above. */
-
- case CHAR_LEFT_CURLY_BRACKET:
- if (!is_quantifier) goto NORMAL_CHAR;
- ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
- if (*errorcodeptr != 0) goto FAILED;
- goto REPEAT;
-
- case CHAR_ASTERISK:
- repeat_min = 0;
- repeat_max = -1;
- goto REPEAT;
-
- case CHAR_PLUS:
- repeat_min = 1;
- repeat_max = -1;
- goto REPEAT;
-
- case CHAR_QUESTION_MARK:
- repeat_min = 0;
- repeat_max = 1;
-
- REPEAT:
- if (previous == NULL)
- {
- *errorcodeptr = ERR9;
- goto FAILED;
- }
-
- if (repeat_min == 0)
- {
- firstbyte = zerofirstbyte; /* Adjust for zero repeat */
- reqbyte = zeroreqbyte; /* Ditto */
- }
-
- /* Remember whether this is a variable length repeat */
-
- reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
-
- op_type = 0; /* Default single-char op codes */
- possessive_quantifier = FALSE; /* Default not possessive quantifier */
-
- /* Save start of previous item, in case we have to move it up to make space
- for an inserted OP_ONCE for the additional '+' extension. */
-
- tempcode = previous;
-
- /* If the next character is '+', we have a possessive quantifier. This
- implies greediness, whatever the setting of the PCRE_UNGREEDY option.
- If the next character is '?' this is a minimizing repeat, by default,
- but if PCRE_UNGREEDY is set, it works the other way round. We change the
- repeat type to the non-default. */
-
- if (ptr[1] == CHAR_PLUS)
- {
- repeat_type = 0; /* Force greedy */
- possessive_quantifier = TRUE;
- ptr++;
- }
- else if (ptr[1] == CHAR_QUESTION_MARK)
- {
- repeat_type = greedy_non_default;
- ptr++;
- }
- else repeat_type = greedy_default;
-
- /* If previous was a character match, abolish the item and generate a
- repeat item instead. If a char item has a minumum of more than one, ensure
- that it is set in reqbyte - it might not be if a sequence such as x{3} is
- the first thing in a branch because the x will have gone into firstbyte
- instead. */
-
- if (*previous == OP_CHAR || *previous == OP_CHARNC)
- {
- /* Deal with UTF-8 characters that take up more than one byte. It's
- easier to write this out separately than try to macrify it. Use c to
- hold the length of the character in bytes, plus 0x80 to flag that it's a
- length rather than a small character. */
-
-#ifdef SUPPORT_UTF8
- if (utf8 && (code[-1] & 0x80) != 0)
- {
- uschar *lastchar = code - 1;
- while((*lastchar & 0xc0) == 0x80) lastchar--;
- c = code - lastchar; /* Length of UTF-8 character */
- memcpy(utf8_char, lastchar, c); /* Save the char */
- c |= 0x80; /* Flag c as a length */
- }
- else
-#endif
-
- /* Handle the case of a single byte - either with no UTF8 support, or
- with UTF-8 disabled, or for a UTF-8 character < 128. */
-
- {
- c = code[-1];
- if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
- }
-
- /* If the repetition is unlimited, it pays to see if the next thing on
- the line is something that cannot possibly match this character. If so,
- automatically possessifying this item gains some performance in the case
- where the match fails. */
-
- if (!possessive_quantifier &&
- repeat_max < 0 &&
- check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
- options, cd))
- {
- repeat_type = 0; /* Force greedy */
- possessive_quantifier = TRUE;
- }
-
- goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
- }
-
- /* If previous was a single negated character ([^a] or similar), we use
- one of the special opcodes, replacing it. The code is shared with single-
- character repeats by setting opt_type to add a suitable offset into
- repeat_type. We can also test for auto-possessification. OP_NOT is
- currently used only for single-byte chars. */
-
- else if (*previous == OP_NOT)
- {
- op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
- c = previous[1];
- if (!possessive_quantifier &&
- repeat_max < 0 &&
- check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
- {
- repeat_type = 0; /* Force greedy */
- possessive_quantifier = TRUE;
- }
- goto OUTPUT_SINGLE_REPEAT;
- }
-
- /* If previous was a character type match (\d or similar), abolish it and
- create a suitable repeat item. The code is shared with single-character
- repeats by setting op_type to add a suitable offset into repeat_type. Note
- the the Unicode property types will be present only when SUPPORT_UCP is
- defined, but we don't wrap the little bits of code here because it just
- makes it horribly messy. */
-
- else if (*previous < OP_EODN)
- {
- uschar *oldcode;
- int prop_type, prop_value;
- op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
- c = *previous;
-
- if (!possessive_quantifier &&
- repeat_max < 0 &&
- check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
- {
- repeat_type = 0; /* Force greedy */
- possessive_quantifier = TRUE;
- }
-
- OUTPUT_SINGLE_REPEAT:
- if (*previous == OP_PROP || *previous == OP_NOTPROP)
- {
- prop_type = previous[1];
- prop_value = previous[2];
- }
- else prop_type = prop_value = -1;
-
- oldcode = code;
- code = previous; /* Usually overwrite previous item */
-
- /* If the maximum is zero then the minimum must also be zero; Perl allows
- this case, so we do too - by simply omitting the item altogether. */
-
- if (repeat_max == 0) goto END_REPEAT;
-
- /* All real repeats make it impossible to handle partial matching (maybe
- one day we will be able to remove this restriction). */
-
- if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
-
- /* Combine the op_type with the repeat_type */
-
- repeat_type += op_type;
-
- /* A minimum of zero is handled either as the special case * or ?, or as
- an UPTO, with the maximum given. */
-
- if (repeat_min == 0)
- {
- if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
- else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
- else
- {
- *code++ = OP_UPTO + repeat_type;
- PUT2INC(code, 0, repeat_max);
- }
- }
-
- /* A repeat minimum of 1 is optimized into some special cases. If the
- maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
- left in place and, if the maximum is greater than 1, we use OP_UPTO with
- one less than the maximum. */
-
- else if (repeat_min == 1)
- {
- if (repeat_max == -1)
- *code++ = OP_PLUS + repeat_type;
- else
- {
- code = oldcode; /* leave previous item in place */
- if (repeat_max == 1) goto END_REPEAT;
- *code++ = OP_UPTO + repeat_type;
- PUT2INC(code, 0, repeat_max - 1);
- }
- }
-
- /* The case {n,n} is just an EXACT, while the general case {n,m} is
- handled as an EXACT followed by an UPTO. */
-
- else
- {
- *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
- PUT2INC(code, 0, repeat_min);
-
- /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
- we have to insert the character for the previous code. For a repeated
- Unicode property match, there are two extra bytes that define the
- required property. In UTF-8 mode, long characters have their length in
- c, with the 0x80 bit as a flag. */
-
- if (repeat_max < 0)
- {
-#ifdef SUPPORT_UTF8
- if (utf8 && c >= 128)
- {
- memcpy(code, utf8_char, c & 7);
- code += c & 7;
- }
- else
-#endif
- {
- *code++ = c;
- if (prop_type >= 0)
- {
- *code++ = prop_type;
- *code++ = prop_value;
- }
- }
- *code++ = OP_STAR + repeat_type;
- }
-
- /* Else insert an UPTO if the max is greater than the min, again
- preceded by the character, for the previously inserted code. If the
- UPTO is just for 1 instance, we can use QUERY instead. */
-
- else if (repeat_max != repeat_min)
- {
-#ifdef SUPPORT_UTF8
- if (utf8 && c >= 128)
- {
- memcpy(code, utf8_char, c & 7);
- code += c & 7;
- }
- else
-#endif
- *code++ = c;
- if (prop_type >= 0)
- {
- *code++ = prop_type;
- *code++ = prop_value;
- }
- repeat_max -= repeat_min;
-
- if (repeat_max == 1)
- {
- *code++ = OP_QUERY + repeat_type;
- }
- else
- {
- *code++ = OP_UPTO + repeat_type;
- PUT2INC(code, 0, repeat_max);
- }
- }
- }
-
- /* The character or character type itself comes last in all cases. */
-
-#ifdef SUPPORT_UTF8
- if (utf8 && c >= 128)
- {
- memcpy(code, utf8_char, c & 7);
- code += c & 7;
- }
- else
-#endif
- *code++ = c;
-
- /* For a repeated Unicode property match, there are two extra bytes that
- define the required property. */
-
-#ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- *code++ = prop_type;
- *code++ = prop_value;
- }
-#endif
- }
-
- /* If previous was a character class or a back reference, we put the repeat
- stuff after it, but just skip the item if the repeat was {0,0}. */
-
- else if (*previous == OP_CLASS ||
- *previous == OP_NCLASS ||
-#ifdef SUPPORT_UTF8
- *previous == OP_XCLASS ||
-#endif
- *previous == OP_REF)
- {
- if (repeat_max == 0)
- {
- code = previous;
- goto END_REPEAT;
- }
-
- /* All real repeats make it impossible to handle partial matching (maybe
- one day we will be able to remove this restriction). */
-
- if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
-
- if (repeat_min == 0 && repeat_max == -1)
- *code++ = OP_CRSTAR + repeat_type;
- else if (repeat_min == 1 && repeat_max == -1)
- *code++ = OP_CRPLUS + repeat_type;
- else if (repeat_min == 0 && repeat_max == 1)
- *code++ = OP_CRQUERY + repeat_type;
- else
- {
- *code++ = OP_CRRANGE + repeat_type;
- PUT2INC(code, 0, repeat_min);
- if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
- PUT2INC(code, 0, repeat_max);
- }
- }
-
- /* If previous was a bracket group, we may have to replicate it in certain
- cases. */
-
- else if (*previous == OP_BRA || *previous == OP_CBRA ||
- *previous == OP_ONCE || *previous == OP_COND)
- {
- register int i;
- int ketoffset = 0;
- int len = code - previous;
- uschar *bralink = NULL;
-
- /* Repeating a DEFINE group is pointless */
-
- if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
- {
- *errorcodeptr = ERR55;
- goto FAILED;
- }
-
- /* If the maximum repeat count is unlimited, find the end of the bracket
- by scanning through from the start, and compute the offset back to it
- from the current code pointer. There may be an OP_OPT setting following
- the final KET, so we can't find the end just by going back from the code
- pointer. */
-
- if (repeat_max == -1)
- {
- register uschar *ket = previous;
- do ket += GET(ket, 1); while (*ket != OP_KET);
- ketoffset = code - ket;
- }
-
- /* The case of a zero minimum is special because of the need to stick
- OP_BRAZERO in front of it, and because the group appears once in the
- data, whereas in other cases it appears the minimum number of times. For
- this reason, it is simplest to treat this case separately, as otherwise
- the code gets far too messy. There are several special subcases when the
- minimum is zero. */
-
- if (repeat_min == 0)
- {
- /* If the maximum is also zero, we used to just omit the group from the
- output altogether, like this:
-
- ** if (repeat_max == 0)
- ** {
- ** code = previous;
- ** goto END_REPEAT;
- ** }
-
- However, that fails when a group is referenced as a subroutine from
- elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
- so that it is skipped on execution. As we don't have a list of which
- groups are referenced, we cannot do this selectively.
-
- If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
- and do no more at this point. However, we do need to adjust any
- OP_RECURSE calls inside the group that refer to the group itself or any
- internal or forward referenced group, because the offset is from the
- start of the whole regex. Temporarily terminate the pattern while doing
- this. */
-
- if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
- {
- *code = OP_END;
- adjust_recurse(previous, 1, utf8, cd, save_hwm);
- memmove(previous+1, previous, len);
- code++;
- if (repeat_max == 0)
- {
- *previous++ = OP_SKIPZERO;
- goto END_REPEAT;
- }
- *previous++ = OP_BRAZERO + repeat_type;
- }
-
- /* If the maximum is greater than 1 and limited, we have to replicate
- in a nested fashion, sticking OP_BRAZERO before each set of brackets.
- The first one has to be handled carefully because it's the original
- copy, which has to be moved up. The remainder can be handled by code
- that is common with the non-zero minimum case below. We have to
- adjust the value or repeat_max, since one less copy is required. Once
- again, we may have to adjust any OP_RECURSE calls inside the group. */
-
- else
- {
- int offset;
- *code = OP_END;
- adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
- memmove(previous + 2 + LINK_SIZE, previous, len);
- code += 2 + LINK_SIZE;
- *previous++ = OP_BRAZERO + repeat_type;
- *previous++ = OP_BRA;
-
- /* We chain together the bracket offset fields that have to be
- filled in later when the ends of the brackets are reached. */
-
- offset = (bralink == NULL)? 0 : previous - bralink;
- bralink = previous;
- PUTINC(previous, 0, offset);
- }
-
- repeat_max--;
- }
-
- /* If the minimum is greater than zero, replicate the group as many
- times as necessary, and adjust the maximum to the number of subsequent
- copies that we need. If we set a first char from the group, and didn't
- set a required char, copy the latter from the former. If there are any
- forward reference subroutine calls in the group, there will be entries on
- the workspace list; replicate these with an appropriate increment. */
-
- else
- {
- if (repeat_min > 1)
- {
- /* In the pre-compile phase, we don't actually do the replication. We
- just adjust the length as if we had. Do some paranoid checks for
- potential integer overflow. */
-
- if (lengthptr != NULL)
- {
- int delta = (repeat_min - 1)*length_prevgroup;
- if ((double)(repeat_min - 1)*(double)length_prevgroup >
- (double)INT_MAX ||
- OFLOW_MAX - *lengthptr < delta)
- {
- *errorcodeptr = ERR20;
- goto FAILED;
- }
- *lengthptr += delta;
- }
-
- /* This is compiling for real */
-
- else
- {
- if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
- for (i = 1; i < repeat_min; i++)
- {
- uschar *hc;
- uschar *this_hwm = cd->hwm;
- memcpy(code, previous, len);
- for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
- {
- PUT(cd->hwm, 0, GET(hc, 0) + len);
- cd->hwm += LINK_SIZE;
- }
- save_hwm = this_hwm;
- code += len;
- }
- }
- }
-
- if (repeat_max > 0) repeat_max -= repeat_min;
- }
-
- /* This code is common to both the zero and non-zero minimum cases. If
- the maximum is limited, it replicates the group in a nested fashion,
- remembering the bracket starts on a stack. In the case of a zero minimum,
- the first one was set up above. In all cases the repeat_max now specifies
- the number of additional copies needed. Again, we must remember to
- replicate entries on the forward reference list. */
-
- if (repeat_max >= 0)
- {
- /* In the pre-compile phase, we don't actually do the replication. We
- just adjust the length as if we had. For each repetition we must add 1
- to the length for BRAZERO and for all but the last repetition we must
- add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
- paranoid checks to avoid integer overflow. */
-
- if (lengthptr != NULL && repeat_max > 0)
- {
- int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
- 2 - 2*LINK_SIZE; /* Last one doesn't nest */
- if ((double)repeat_max *
- (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
- > (double)INT_MAX ||
- OFLOW_MAX - *lengthptr < delta)
- {
- *errorcodeptr = ERR20;
- goto FAILED;
- }
- *lengthptr += delta;
- }
-
- /* This is compiling for real */
-
- else for (i = repeat_max - 1; i >= 0; i--)
- {
- uschar *hc;
- uschar *this_hwm = cd->hwm;
-
- *code++ = OP_BRAZERO + repeat_type;
-
- /* All but the final copy start a new nesting, maintaining the
- chain of brackets outstanding. */
-
- if (i != 0)
- {
- int offset;
- *code++ = OP_BRA;
- offset = (bralink == NULL)? 0 : code - bralink;
- bralink = code;
- PUTINC(code, 0, offset);
- }
-
- memcpy(code, previous, len);
- for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
- {
- PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
- cd->hwm += LINK_SIZE;
- }
- save_hwm = this_hwm;
- code += len;
- }
-
- /* Now chain through the pending brackets, and fill in their length
- fields (which are holding the chain links pro tem). */
-
- while (bralink != NULL)
- {
- int oldlinkoffset;
- int offset = code - bralink + 1;
- uschar *bra = code - offset;
- oldlinkoffset = GET(bra, 1);
- bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
- *code++ = OP_KET;
- PUTINC(code, 0, offset);
- PUT(bra, 1, offset);
- }
- }
-
- /* If the maximum is unlimited, set a repeater in the final copy. We
- can't just offset backwards from the current code point, because we
- don't know if there's been an options resetting after the ket. The
- correct offset was computed above.
-
- Then, when we are doing the actual compile phase, check to see whether
- this group is a non-atomic one that could match an empty string. If so,
- convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
- that runtime checking can be done. [This check is also applied to
- atomic groups at runtime, but in a different way.] */
-
- else
- {
- uschar *ketcode = code - ketoffset;
- uschar *bracode = ketcode - GET(ketcode, 1);
- *ketcode = OP_KETRMAX + repeat_type;
- if (lengthptr == NULL && *bracode != OP_ONCE)
- {
- uschar *scode = bracode;
- do
- {
- if (could_be_empty_branch(scode, ketcode, utf8))
- {
- *bracode += OP_SBRA - OP_BRA;
- break;
- }
- scode += GET(scode, 1);
- }
- while (*scode == OP_ALT);
- }
- }
- }
-
- /* If previous is OP_FAIL, it was generated by an empty class [] in
- JavaScript mode. The other ways in which OP_FAIL can be generated, that is
- by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
- error above. We can just ignore the repeat in JS case. */
-
- else if (*previous == OP_FAIL) goto END_REPEAT;
-
- /* Else there's some kind of shambles */
-
- else
- {
- *errorcodeptr = ERR11;
- goto FAILED;
- }
-
- /* If the character following a repeat is '+', or if certain optimization
- tests above succeeded, possessive_quantifier is TRUE. For some of the
- simpler opcodes, there is an special alternative opcode for this. For
- anything else, we wrap the entire repeated item inside OP_ONCE brackets.
- The '+' notation is just syntactic sugar, taken from Sun's Java package,
- but the special opcodes can optimize it a bit. The repeated item starts at
- tempcode, not at previous, which might be the first part of a string whose
- (former) last char we repeated.
-
- Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
- an 'upto' may follow. We skip over an 'exact' item, and then test the
- length of what remains before proceeding. */
-
- if (possessive_quantifier)
- {
- int len;
- if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
- *tempcode == OP_NOTEXACT)
- tempcode += _pcre_OP_lengths[*tempcode] +
- ((*tempcode == OP_TYPEEXACT &&
- (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
- len = code - tempcode;
- if (len > 0) switch (*tempcode)
- {
- case OP_STAR: *tempcode = OP_POSSTAR; break;
- case OP_PLUS: *tempcode = OP_POSPLUS; break;
- case OP_QUERY: *tempcode = OP_POSQUERY; break;
- case OP_UPTO: *tempcode = OP_POSUPTO; break;
-
- case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
- case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
- case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
- case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
-
- case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
- case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
- case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
- case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
-
- default:
- memmove(tempcode + 1+LINK_SIZE, tempcode, len);
- code += 1 + LINK_SIZE;
- len += 1 + LINK_SIZE;
- tempcode[0] = OP_ONCE;
- *code++ = OP_KET;
- PUTINC(code, 0, len);
- PUT(tempcode, 1, len);
- break;
- }
- }
-
- /* In all case we no longer have a previous item. We also set the
- "follows varying string" flag for subsequently encountered reqbytes if
- it isn't already set and we have just passed a varying length item. */
-
- END_REPEAT:
- previous = NULL;
- cd->req_varyopt |= reqvary;
- break;
-
-
- /* ===================================================================*/
- /* Start of nested parenthesized sub-expression, or comment or lookahead or
- lookbehind or option setting or condition or all the other extended
- parenthesis forms. */
-
- case CHAR_LEFT_PARENTHESIS:
- newoptions = options;
- skipbytes = 0;
- bravalue = OP_CBRA;
- save_hwm = cd->hwm;
- reset_bracount = FALSE;
-
- /* First deal with various "verbs" that can be introduced by '*'. */
-
- if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
- {
- int i, namelen;
- const char *vn = verbnames;
- const uschar *name = ++ptr;
- previous = NULL;
- while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
- if (*ptr == CHAR_COLON)
- {
- *errorcodeptr = ERR59; /* Not supported */
- goto FAILED;
- }
- if (*ptr != CHAR_RIGHT_PARENTHESIS)
- {
- *errorcodeptr = ERR60;
- goto FAILED;
- }
- namelen = ptr - name;
- for (i = 0; i < verbcount; i++)
- {
- if (namelen == verbs[i].len &&
- strncmp((char *)name, vn, namelen) == 0)
- {
- *code = verbs[i].op;
- if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
- break;
- }
- vn += verbs[i].len + 1;
- }
- if (i < verbcount) continue;
- *errorcodeptr = ERR60;
- goto FAILED;
- }
-
- /* Deal with the extended parentheses; all are introduced by '?', and the
- appearance of any of them means that this is not a capturing group. */
-
- else if (*ptr == CHAR_QUESTION_MARK)
- {
- int i, set, unset, namelen;
- int *optset;
- const uschar *name;
- uschar *slot;
-
- switch (*(++ptr))
- {
- case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
- ptr++;
- while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
- if (*ptr == 0)
- {
- *errorcodeptr = ERR18;
- goto FAILED;
- }
- continue;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
- reset_bracount = TRUE;
- /* Fall through */
-
- /* ------------------------------------------------------------ */
- case CHAR_COLON: /* Non-capturing bracket */
- bravalue = OP_BRA;
- ptr++;
- break;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_LEFT_PARENTHESIS:
- bravalue = OP_COND; /* Conditional group */
-
- /* A condition can be an assertion, a number (referring to a numbered
- group), a name (referring to a named group), or 'R', referring to
- recursion. R and R&name are also permitted for recursion tests.
-
- There are several syntaxes for testing a named group: (?(name)) is used
- by Python; Perl 5.10 onwards uses (?() or (?('name')).
-
- There are two unfortunate ambiguities, caused by history. (a) 'R' can
- be the recursive thing or the name 'R' (and similarly for 'R' followed
- by digits), and (b) a number could be a name that consists of digits.
- In both cases, we look for a name first; if not found, we try the other
- cases. */
-
- /* For conditions that are assertions, check the syntax, and then exit
- the switch. This will take control down to where bracketed groups,
- including assertions, are processed. */
-
- if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
- ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
- break;
-
- /* Most other conditions use OP_CREF (a couple change to OP_RREF
- below), and all need to skip 3 bytes at the start of the group. */
-
- code[1+LINK_SIZE] = OP_CREF;
- skipbytes = 3;
- refsign = -1;
-
- /* Check for a test for recursion in a named group. */
-
- if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
- {
- terminator = -1;
- ptr += 2;
- code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
- }
-
- /* Check for a test for a named group's having been set, using the Perl
- syntax (?() or (?('name') */
-
- else if (ptr[1] == CHAR_LESS_THAN_SIGN)
- {
- terminator = CHAR_GREATER_THAN_SIGN;
- ptr++;
- }
- else if (ptr[1] == CHAR_APOSTROPHE)
- {
- terminator = CHAR_APOSTROPHE;
- ptr++;
- }
- else
- {
- terminator = 0;
- if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
- }
-
- /* We now expect to read a name; any thing else is an error */
-
- if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
- {
- ptr += 1; /* To get the right offset */
- *errorcodeptr = ERR28;
- goto FAILED;
- }
-
- /* Read the name, but also get it as a number if it's all digits */
-
- recno = 0;
- name = ++ptr;
- while ((cd->ctypes[*ptr] & ctype_word) != 0)
- {
- if (recno >= 0)
- recno = ((digitab[*ptr] & ctype_digit) != 0)?
- recno * 10 + *ptr - CHAR_0 : -1;
- ptr++;
- }
- namelen = ptr - name;
-
- if ((terminator > 0 && *ptr++ != terminator) ||
- *ptr++ != CHAR_RIGHT_PARENTHESIS)
- {
- ptr--; /* Error offset */
- *errorcodeptr = ERR26;
- goto FAILED;
- }
-
- /* Do no further checking in the pre-compile phase. */
-
- if (lengthptr != NULL) break;
-
- /* In the real compile we do the work of looking for the actual
- reference. If the string started with "+" or "-" we require the rest to
- be digits, in which case recno will be set. */
-
- if (refsign > 0)
- {
- if (recno <= 0)
- {
- *errorcodeptr = ERR58;
- goto FAILED;
- }
- recno = (refsign == CHAR_MINUS)?
- cd->bracount - recno + 1 : recno +cd->bracount;
- if (recno <= 0 || recno > cd->final_bracount)
- {
- *errorcodeptr = ERR15;
- goto FAILED;
- }
- PUT2(code, 2+LINK_SIZE, recno);
- break;
- }
-
- /* Otherwise (did not start with "+" or "-"), start by looking for the
- name. */
-
- slot = cd->name_table;
- for (i = 0; i < cd->names_found; i++)
- {
- if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
- slot += cd->name_entry_size;
- }
-
- /* Found a previous named subpattern */
-
- if (i < cd->names_found)
- {
- recno = GET2(slot, 0);
- PUT2(code, 2+LINK_SIZE, recno);
- }
-
- /* Search the pattern for a forward reference */
-
- else if ((i = find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) > 0)
- {
- PUT2(code, 2+LINK_SIZE, i);
- }
-
- /* If terminator == 0 it means that the name followed directly after
- the opening parenthesis [e.g. (?(abc)...] and in this case there are
- some further alternatives to try. For the cases where terminator != 0
- [things like (?(... or (?('name')... or (?(R&name)... ] we have
- now checked all the possibilities, so give an error. */
-
- else if (terminator != 0)
- {
- *errorcodeptr = ERR15;
- goto FAILED;
- }
-
- /* Check for (?(R) for recursion. Allow digits after R to specify a
- specific group number. */
-
- else if (*name == CHAR_R)
- {
- recno = 0;
- for (i = 1; i < namelen; i++)
- {
- if ((digitab[name[i]] & ctype_digit) == 0)
- {
- *errorcodeptr = ERR15;
- goto FAILED;
- }
- recno = recno * 10 + name[i] - CHAR_0;
- }
- if (recno == 0) recno = RREF_ANY;
- code[1+LINK_SIZE] = OP_RREF; /* Change test type */
- PUT2(code, 2+LINK_SIZE, recno);
- }
-
- /* Similarly, check for the (?(DEFINE) "condition", which is always
- false. */
-
- else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
- {
- code[1+LINK_SIZE] = OP_DEF;
- skipbytes = 1;
- }
-
- /* Check for the "name" actually being a subpattern number. We are
- in the second pass here, so final_bracount is set. */
-
- else if (recno > 0 && recno <= cd->final_bracount)
- {
- PUT2(code, 2+LINK_SIZE, recno);
- }
-
- /* Either an unidentified subpattern, or a reference to (?(0) */
-
- else
- {
- *errorcodeptr = (recno == 0)? ERR35: ERR15;
- goto FAILED;
- }
- break;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_EQUALS_SIGN: /* Positive lookahead */
- bravalue = OP_ASSERT;
- ptr++;
- break;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
- ptr++;
- if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
- {
- *code++ = OP_FAIL;
- previous = NULL;
- continue;
- }
- bravalue = OP_ASSERT_NOT;
- break;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
- switch (ptr[1])
- {
- case CHAR_EQUALS_SIGN: /* Positive lookbehind */
- bravalue = OP_ASSERTBACK;
- ptr += 2;
- break;
-
- case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
- bravalue = OP_ASSERTBACK_NOT;
- ptr += 2;
- break;
-
- default: /* Could be name define, else bad */
- if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
- ptr++; /* Correct offset for error */
- *errorcodeptr = ERR24;
- goto FAILED;
- }
- break;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
- bravalue = OP_ONCE;
- ptr++;
- break;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_C: /* Callout - may be followed by digits; */
- previous_callout = code; /* Save for later completion */
- after_manual_callout = 1; /* Skip one item before completing */
- *code++ = OP_CALLOUT;
- {
- int n = 0;
- while ((digitab[*(++ptr)] & ctype_digit) != 0)
- n = n * 10 + *ptr - CHAR_0;
- if (*ptr != CHAR_RIGHT_PARENTHESIS)
- {
- *errorcodeptr = ERR39;
- goto FAILED;
- }
- if (n > 255)
- {
- *errorcodeptr = ERR38;
- goto FAILED;
- }
- *code++ = n;
- PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
- PUT(code, LINK_SIZE, 0); /* Default length */
- code += 2 * LINK_SIZE;
- }
- previous = NULL;
- continue;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_P: /* Python-style named subpattern handling */
- if (*(++ptr) == CHAR_EQUALS_SIGN ||
- *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
- {
- is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
- terminator = CHAR_RIGHT_PARENTHESIS;
- goto NAMED_REF_OR_RECURSE;
- }
- else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
- {
- *errorcodeptr = ERR41;
- goto FAILED;
- }
- /* Fall through to handle (?P< as (?< is handled */
-
-
- /* ------------------------------------------------------------ */
- DEFINE_NAME: /* Come here from (?< handling */
- case CHAR_APOSTROPHE:
- {
- terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
- CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
- name = ++ptr;
-
- while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
- namelen = ptr - name;
-
- /* In the pre-compile phase, just do a syntax check. */
-
- if (lengthptr != NULL)
- {
- if (*ptr != terminator)
- {
- *errorcodeptr = ERR42;
- goto FAILED;
- }
- if (cd->names_found >= MAX_NAME_COUNT)
- {
- *errorcodeptr = ERR49;
- goto FAILED;
- }
- if (namelen + 3 > cd->name_entry_size)
- {
- cd->name_entry_size = namelen + 3;
- if (namelen > MAX_NAME_SIZE)
- {
- *errorcodeptr = ERR48;
- goto FAILED;
- }
- }
- }
-
- /* In the real compile, create the entry in the table */
-
- else
- {
- slot = cd->name_table;
- for (i = 0; i < cd->names_found; i++)
- {
- int crc = memcmp(name, slot+2, namelen);
- if (crc == 0)
- {
- if (slot[2+namelen] == 0)
- {
- if ((options & PCRE_DUPNAMES) == 0)
- {
- *errorcodeptr = ERR43;
- goto FAILED;
- }
- }
- else crc = -1; /* Current name is substring */
- }
- if (crc < 0)
- {
- memmove(slot + cd->name_entry_size, slot,
- (cd->names_found - i) * cd->name_entry_size);
- break;
- }
- slot += cd->name_entry_size;
- }
-
- PUT2(slot, 0, cd->bracount + 1);
- memcpy(slot + 2, name, namelen);
- slot[2+namelen] = 0;
- }
- }
-
- /* In both cases, count the number of names we've encountered. */
-
- ptr++; /* Move past > or ' */
- cd->names_found++;
- goto NUMBERED_GROUP;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
- terminator = CHAR_RIGHT_PARENTHESIS;
- is_recurse = TRUE;
- /* Fall through */
-
- /* We come here from the Python syntax above that handles both
- references (?P=name) and recursion (?P>name), as well as falling
- through from the Perl recursion syntax (?&name). We also come here from
- the Perl \k or \k'name' back reference syntax and the \k{name}
- .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
-
- NAMED_REF_OR_RECURSE:
- name = ++ptr;
- while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
- namelen = ptr - name;
-
- /* In the pre-compile phase, do a syntax check and set a dummy
- reference number. */
-
- if (lengthptr != NULL)
- {
- if (namelen == 0)
- {
- *errorcodeptr = ERR62;
- goto FAILED;
- }
- if (*ptr != terminator)
- {
- *errorcodeptr = ERR42;
- goto FAILED;
- }
- if (namelen > MAX_NAME_SIZE)
- {
- *errorcodeptr = ERR48;
- goto FAILED;
- }
- recno = 0;
- }
-
- /* In the real compile, seek the name in the table. We check the name
- first, and then check that we have reached the end of the name in the
- table. That way, if the name that is longer than any in the table,
- the comparison will fail without reading beyond the table entry. */
-
- else
- {
- slot = cd->name_table;
- for (i = 0; i < cd->names_found; i++)
- {
- if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
- slot[2+namelen] == 0)
- break;
- slot += cd->name_entry_size;
- }
-
- if (i < cd->names_found) /* Back reference */
- {
- recno = GET2(slot, 0);
- }
- else if ((recno = /* Forward back reference */
- find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) <= 0)
- {
- *errorcodeptr = ERR15;
- goto FAILED;
- }
- }
-
- /* In both phases, we can now go to the code than handles numerical
- recursion or backreferences. */
-
- if (is_recurse) goto HANDLE_RECURSION;
- else goto HANDLE_REFERENCE;
-
-
- /* ------------------------------------------------------------ */
- case CHAR_R: /* Recursion */
- ptr++; /* Same as (?0) */
- /* Fall through */
-
-
- /* ------------------------------------------------------------ */
- case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
- case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
- case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
- {
- const uschar *called;
- terminator = CHAR_RIGHT_PARENTHESIS;
-
- /* Come here from the \g<...> and \g'...' code (Oniguruma
- compatibility). However, the syntax has been checked to ensure that
- the ... are a (signed) number, so that neither ERR63 nor ERR29 will
- be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY
- ever be taken. */
-
- HANDLE_NUMERICAL_RECURSION:
-
- if ((refsign = *ptr) == CHAR_PLUS)
- {
- ptr++;
- if ((digitab[*ptr] & ctype_digit) == 0)
- {
- *errorcodeptr = ERR63;
- goto FAILED;
- }
- }
- else if (refsign == CHAR_MINUS)
- {
- if ((digitab[ptr[1]] & ctype_digit) == 0)
- goto OTHER_CHAR_AFTER_QUERY;
- ptr++;
- }
-
- recno = 0;
- while((digitab[*ptr] & ctype_digit) != 0)
- recno = recno * 10 + *ptr++ - CHAR_0;
-
- if (*ptr != terminator)
- {
- *errorcodeptr = ERR29;
- goto FAILED;
- }
-
- if (refsign == CHAR_MINUS)
- {
- if (recno == 0)
- {
- *errorcodeptr = ERR58;
- goto FAILED;
- }
- recno = cd->bracount - recno + 1;
- if (recno <= 0)
- {
- *errorcodeptr = ERR15;
- goto FAILED;
- }
- }
- else if (refsign == CHAR_PLUS)
- {
- if (recno == 0)
- {
- *errorcodeptr = ERR58;
- goto FAILED;
- }
- recno += cd->bracount;
- }
-
- /* Come here from code above that handles a named recursion */
-
- HANDLE_RECURSION:
-
- previous = code;
- called = cd->start_code;
-
- /* When we are actually compiling, find the bracket that is being
- referenced. Temporarily end the regex in case it doesn't exist before
- this point. If we end up with a forward reference, first check that
- the bracket does occur later so we can give the error (and position)
- now. Then remember this forward reference in the workspace so it can
- be filled in at the end. */
-
- if (lengthptr == NULL)
- {
- *code = OP_END;
- if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
-
- /* Forward reference */
-
- if (called == NULL)
- {
- if (find_parens(cd, NULL, recno,
- (options & PCRE_EXTENDED) != 0) < 0)
- {
- *errorcodeptr = ERR15;
- goto FAILED;
- }
- called = cd->start_code + recno;
- PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
- }
-
- /* If not a forward reference, and the subpattern is still open,
- this is a recursive call. We check to see if this is a left
- recursion that could loop for ever, and diagnose that case. */
-
- else if (GET(called, 1) == 0 &&
- could_be_empty(called, code, bcptr, utf8))
- {
- *errorcodeptr = ERR40;
- goto FAILED;
- }
- }
-
- /* Insert the recursion/subroutine item, automatically wrapped inside
- "once" brackets. Set up a "previous group" length so that a
- subsequent quantifier will work. */
-
- *code = OP_ONCE;
- PUT(code, 1, 2 + 2*LINK_SIZE);
- code += 1 + LINK_SIZE;
-
- *code = OP_RECURSE;
- PUT(code, 1, called - cd->start_code);
- code += 1 + LINK_SIZE;
-
- *code = OP_KET;
- PUT(code, 1, 2 + 2*LINK_SIZE);
- code += 1 + LINK_SIZE;
-
- length_prevgroup = 3 + 3*LINK_SIZE;
- }
-
- /* Can't determine a first byte now */
-
- if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
- continue;
-
-
- /* ------------------------------------------------------------ */
- default: /* Other characters: check option setting */
- OTHER_CHAR_AFTER_QUERY:
- set = unset = 0;
- optset = &set;
-
- while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
- {
- switch (*ptr++)
- {
- case CHAR_MINUS: optset = &unset; break;
-
- case CHAR_J: /* Record that it changed in the external options */
- *optset |= PCRE_DUPNAMES;
- cd->external_flags |= PCRE_JCHANGED;
- break;
-
- case CHAR_i: *optset |= PCRE_CASELESS; break;
- case CHAR_m: *optset |= PCRE_MULTILINE; break;
- case CHAR_s: *optset |= PCRE_DOTALL; break;
- case CHAR_x: *optset |= PCRE_EXTENDED; break;
- case CHAR_U: *optset |= PCRE_UNGREEDY; break;
- case CHAR_X: *optset |= PCRE_EXTRA; break;
-
- default: *errorcodeptr = ERR12;
- ptr--; /* Correct the offset */
- goto FAILED;
- }
- }
-
- /* Set up the changed option bits, but don't change anything yet. */
-
- newoptions = (options | set) & (~unset);
-
- /* If the options ended with ')' this is not the start of a nested
- group with option changes, so the options change at this level. If this
- item is right at the start of the pattern, the options can be
- abstracted and made external in the pre-compile phase, and ignored in
- the compile phase. This can be helpful when matching -- for instance in
- caseless checking of required bytes.
-
- If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
- definitely *not* at the start of the pattern because something has been
- compiled. In the pre-compile phase, however, the code pointer can have
- that value after the start, because it gets reset as code is discarded
- during the pre-compile. However, this can happen only at top level - if
- we are within parentheses, the starting BRA will still be present. At
- any parenthesis level, the length value can be used to test if anything
- has been compiled at that level. Thus, a test for both these conditions
- is necessary to ensure we correctly detect the start of the pattern in
- both phases.
-
- If we are not at the pattern start, compile code to change the ims
- options if this setting actually changes any of them, and reset the
- greedy defaults and the case value for firstbyte and reqbyte. */
-
- if (*ptr == CHAR_RIGHT_PARENTHESIS)
- {
- if (code == cd->start_code + 1 + LINK_SIZE &&
- (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
- {
- cd->external_options = newoptions;
- }
- else
- {
- if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
- {
- *code++ = OP_OPT;
- *code++ = newoptions & PCRE_IMS;
- }
- greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
- greedy_non_default = greedy_default ^ 1;
- req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
- }
-
- /* Change options at this level, and pass them back for use
- in subsequent branches. When not at the start of the pattern, this
- information is also necessary so that a resetting item can be
- compiled at the end of a group (if we are in a group). */
-
- *optionsptr = options = newoptions;
- previous = NULL; /* This item can't be repeated */
- continue; /* It is complete */
- }
-
- /* If the options ended with ':' we are heading into a nested group
- with possible change of options. Such groups are non-capturing and are
- not assertions of any kind. All we need to do is skip over the ':';
- the newoptions value is handled below. */
-
- bravalue = OP_BRA;
- ptr++;
- } /* End of switch for character following (? */
- } /* End of (? handling */
-
- /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
- all unadorned brackets become non-capturing and behave like (?:...)
- brackets. */
-
- else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
- {
- bravalue = OP_BRA;
- }
-
- /* Else we have a capturing group. */
-
- else
- {
- NUMBERED_GROUP:
- cd->bracount += 1;
- PUT2(code, 1+LINK_SIZE, cd->bracount);
- skipbytes = 2;
- }
-
- /* Process nested bracketed regex. Assertions may not be repeated, but
- other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
- non-register variable in order to be able to pass its address because some
- compilers complain otherwise. Pass in a new setting for the ims options if
- they have changed. */
-
- previous = (bravalue >= OP_ONCE)? code : NULL;
- *code = bravalue;
- tempcode = code;
- tempreqvary = cd->req_varyopt; /* Save value before bracket */
- length_prevgroup = 0; /* Initialize for pre-compile phase */
-
- if (!compile_regex(
- newoptions, /* The complete new option state */
- options & PCRE_IMS, /* The previous ims option state */
- &tempcode, /* Where to put code (updated) */
- &ptr, /* Input pointer (updated) */
- errorcodeptr, /* Where to put an error message */
- (bravalue == OP_ASSERTBACK ||
- bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
- reset_bracount, /* True if (?| group */
- skipbytes, /* Skip over bracket number */
- &subfirstbyte, /* For possible first char */
- &subreqbyte, /* For possible last char */
- bcptr, /* Current branch chain */
- cd, /* Tables block */
- (lengthptr == NULL)? NULL : /* Actual compile phase */
- &length_prevgroup /* Pre-compile phase */
- ))
- goto FAILED;
-
- /* At the end of compiling, code is still pointing to the start of the
- group, while tempcode has been updated to point past the end of the group
- and any option resetting that may follow it. The pattern pointer (ptr)
- is on the bracket. */
-
- /* If this is a conditional bracket, check that there are no more than
- two branches in the group, or just one if it's a DEFINE group. We do this
- in the real compile phase, not in the pre-pass, where the whole group may
- not be available. */
-
- if (bravalue == OP_COND && lengthptr == NULL)
- {
- uschar *tc = code;
- int condcount = 0;
-
- do {
- condcount++;
- tc += GET(tc,1);
- }
- while (*tc != OP_KET);
-
- /* A DEFINE group is never obeyed inline (the "condition" is always
- false). It must have only one branch. */
-
- if (code[LINK_SIZE+1] == OP_DEF)
- {
- if (condcount > 1)
- {
- *errorcodeptr = ERR54;
- goto FAILED;
- }
- bravalue = OP_DEF; /* Just a flag to suppress char handling below */
- }
-
- /* A "normal" conditional group. If there is just one branch, we must not
- make use of its firstbyte or reqbyte, because this is equivalent to an
- empty second branch. */
-
- else
- {
- if (condcount > 2)
- {
- *errorcodeptr = ERR27;
- goto FAILED;
- }
- if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
- }
- }
-
- /* Error if hit end of pattern */
-
- if (*ptr != CHAR_RIGHT_PARENTHESIS)
- {
- *errorcodeptr = ERR14;
- goto FAILED;
- }
-
- /* In the pre-compile phase, update the length by the length of the group,
- less the brackets at either end. Then reduce the compiled code to just a
- set of non-capturing brackets so that it doesn't use much memory if it is
- duplicated by a quantifier.*/
-
- if (lengthptr != NULL)
- {
- if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
- {
- *errorcodeptr = ERR20;
- goto FAILED;
- }
- *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
- *code++ = OP_BRA;
- PUTINC(code, 0, 1 + LINK_SIZE);
- *code++ = OP_KET;
- PUTINC(code, 0, 1 + LINK_SIZE);
- break; /* No need to waste time with special character handling */
- }
-
- /* Otherwise update the main code pointer to the end of the group. */
-
- code = tempcode;
-
- /* For a DEFINE group, required and first character settings are not
- relevant. */
-
- if (bravalue == OP_DEF) break;
-
- /* Handle updating of the required and first characters for other types of
- group. Update for normal brackets of all kinds, and conditions with two
- branches (see code above). If the bracket is followed by a quantifier with
- zero repeat, we have to back off. Hence the definition of zeroreqbyte and
- zerofirstbyte outside the main loop so that they can be accessed for the
- back off. */
-
- zeroreqbyte = reqbyte;
- zerofirstbyte = firstbyte;
- groupsetfirstbyte = FALSE;
-
- if (bravalue >= OP_ONCE)
- {
- /* If we have not yet set a firstbyte in this branch, take it from the
- subpattern, remembering that it was set here so that a repeat of more
- than one can replicate it as reqbyte if necessary. If the subpattern has
- no firstbyte, set "none" for the whole branch. In both cases, a zero
- repeat forces firstbyte to "none". */
-
- if (firstbyte == REQ_UNSET)
- {
- if (subfirstbyte >= 0)
- {
- firstbyte = subfirstbyte;
- groupsetfirstbyte = TRUE;
- }
- else firstbyte = REQ_NONE;
- zerofirstbyte = REQ_NONE;
- }
-
- /* If firstbyte was previously set, convert the subpattern's firstbyte
- into reqbyte if there wasn't one, using the vary flag that was in
- existence beforehand. */
-
- else if (subfirstbyte >= 0 && subreqbyte < 0)
- subreqbyte = subfirstbyte | tempreqvary;
-
- /* If the subpattern set a required byte (or set a first byte that isn't
- really the first byte - see above), set it. */
-
- if (subreqbyte >= 0) reqbyte = subreqbyte;
- }
-
- /* For a forward assertion, we take the reqbyte, if set. This can be
- helpful if the pattern that follows the assertion doesn't set a different
- char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
- for an assertion, however because it leads to incorrect effect for patterns
- such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
- of a firstbyte. This is overcome by a scan at the end if there's no
- firstbyte, looking for an asserted first char. */
-
- else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
- break; /* End of processing '(' */
-
-
- /* ===================================================================*/
- /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
- are arranged to be the negation of the corresponding OP_values. For the
- back references, the values are ESC_REF plus the reference number. Only
- back references and those types that consume a character may be repeated.
- We can test for values between ESC_b and ESC_Z for the latter; this may
- have to change if any new ones are ever created. */
-
- case CHAR_BACKSLASH:
- tempptr = ptr;
- c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
- if (*errorcodeptr != 0) goto FAILED;
-
- if (c < 0)
- {
- if (-c == ESC_Q) /* Handle start of quoted string */
- {
- if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
- ptr += 2; /* avoid empty string */
- else inescq = TRUE;
- continue;
- }
-
- if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
-
- /* For metasequences that actually match a character, we disable the
- setting of a first character if it hasn't already been set. */
-
- if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
- firstbyte = REQ_NONE;
-
- /* Set values to reset to if this is followed by a zero repeat. */
-
- zerofirstbyte = firstbyte;
- zeroreqbyte = reqbyte;
-
- /* \g or \g'name' is a subroutine call by name and \g or \g'n'
- is a subroutine call by number (Oniguruma syntax). In fact, the value
- -ESC_g is returned only for these cases. So we don't need to check for <
- or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
- -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
- that is a synonym for a named back reference). */
-
- if (-c == ESC_g)
- {
- const uschar *p;
- save_hwm = cd->hwm; /* Normally this is set when '(' is read */
- terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
- CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
-
- /* These two statements stop the compiler for warning about possibly
- unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
- fact, because we actually check for a number below, the paths that
- would actually be in error are never taken. */
-
- skipbytes = 0;
- reset_bracount = FALSE;
-
- /* Test for a name */
-
- if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
- {
- BOOL isnumber = TRUE;
- for (p = ptr + 1; *p != 0 && *p != terminator; p++)
- {
- if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
- if ((cd->ctypes[*p] & ctype_word) == 0) break;
- }
- if (*p != terminator)
- {
- *errorcodeptr = ERR57;
- break;
- }
- if (isnumber)
- {
- ptr++;
- goto HANDLE_NUMERICAL_RECURSION;
- }
- is_recurse = TRUE;
- goto NAMED_REF_OR_RECURSE;
- }
-
- /* Test a signed number in angle brackets or quotes. */
-
- p = ptr + 2;
- while ((digitab[*p] & ctype_digit) != 0) p++;
- if (*p != terminator)
- {
- *errorcodeptr = ERR57;
- break;
- }
- ptr++;
- goto HANDLE_NUMERICAL_RECURSION;
- }
-
- /* \k or \k'name' is a back reference by name (Perl syntax).
- We also support \k{name} (.NET syntax) */
-
- if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
- ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
- {
- is_recurse = FALSE;
- terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
- CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
- CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
- goto NAMED_REF_OR_RECURSE;
- }
-
- /* Back references are handled specially; must disable firstbyte if
- not set to cope with cases like (?=(\w+))\1: which would otherwise set
- ':' later. */
-
- if (-c >= ESC_REF)
- {
- recno = -c - ESC_REF;
-
- HANDLE_REFERENCE: /* Come here from named backref handling */
- if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
- previous = code;
- *code++ = OP_REF;
- PUT2INC(code, 0, recno);
- cd->backref_map |= (recno < 32)? (1 << recno) : 1;
- if (recno > cd->top_backref) cd->top_backref = recno;
- }
-
- /* So are Unicode property matches, if supported. */
-
-#ifdef SUPPORT_UCP
- else if (-c == ESC_P || -c == ESC_p)
- {
- BOOL negated;
- int pdata;
- int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
- if (ptype < 0) goto FAILED;
- previous = code;
- *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
- *code++ = ptype;
- *code++ = pdata;
- }
-#else
-
- /* If Unicode properties are not supported, \X, \P, and \p are not
- allowed. */
-
- else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
- {
- *errorcodeptr = ERR45;
- goto FAILED;
- }
-#endif
-
- /* For the rest (including \X when Unicode properties are supported), we
- can obtain the OP value by negating the escape value. */
-
- else
- {
- previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
- *code++ = -c;
- }
- continue;
- }
-
- /* We have a data character whose value is in c. In UTF-8 mode it may have
- a value > 127. We set its representation in the length/buffer, and then
- handle it as a data character. */
-
-#ifdef SUPPORT_UTF8
- if (utf8 && c > 127)
- mclength = _pcre_ord2utf8(c, mcbuffer);
- else
-#endif
-
- {
- mcbuffer[0] = c;
- mclength = 1;
- }
- goto ONE_CHAR;
-
-
- /* ===================================================================*/
- /* Handle a literal character. It is guaranteed not to be whitespace or #
- when the extended flag is set. If we are in UTF-8 mode, it may be a
- multi-byte literal character. */
-
- default:
- NORMAL_CHAR:
- mclength = 1;
- mcbuffer[0] = c;
-
-#ifdef SUPPORT_UTF8
- if (utf8 && c >= 0xc0)
- {
- while ((ptr[1] & 0xc0) == 0x80)
- mcbuffer[mclength++] = *(++ptr);
- }
-#endif
-
- /* At this point we have the character's bytes in mcbuffer, and the length
- in mclength. When not in UTF-8 mode, the length is always 1. */
-
- ONE_CHAR:
- previous = code;
- *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
- for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
-
- /* Remember if \r or \n were seen */
-
- if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
- cd->external_flags |= PCRE_HASCRORLF;
-
- /* Set the first and required bytes appropriately. If no previous first
- byte, set it from this character, but revert to none on a zero repeat.
- Otherwise, leave the firstbyte value alone, and don't change it on a zero
- repeat. */
-
- if (firstbyte == REQ_UNSET)
- {
- zerofirstbyte = REQ_NONE;
- zeroreqbyte = reqbyte;
-
- /* If the character is more than one byte long, we can set firstbyte
- only if it is not to be matched caselessly. */
-
- if (mclength == 1 || req_caseopt == 0)
- {
- firstbyte = mcbuffer[0] | req_caseopt;
- if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
- }
- else firstbyte = reqbyte = REQ_NONE;
- }
-
- /* firstbyte was previously set; we can set reqbyte only the length is
- 1 or the matching is caseful. */
-
- else
- {
- zerofirstbyte = firstbyte;
- zeroreqbyte = reqbyte;
- if (mclength == 1 || req_caseopt == 0)
- reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
- }
-
- break; /* End of literal character handling */
- }
- } /* end of big loop */
-
-
-/* Control never reaches here by falling through, only by a goto for all the
-error states. Pass back the position in the pattern so that it can be displayed
-to the user for diagnosing the error. */
-
-FAILED:
-*ptrptr = ptr;
-return FALSE;
-}
-
-
-
-
-/*************************************************
-* Compile sequence of alternatives *
-*************************************************/
-
-/* On entry, ptr is pointing past the bracket character, but on return it
-points to the closing bracket, or vertical bar, or end of string. The code
-variable is pointing at the byte into which the BRA operator has been stored.
-If the ims options are changed at the start (for a (?ims: group) or during any
-branch, we need to insert an OP_OPT item at the start of every following branch
-to ensure they get set correctly at run time, and also pass the new options
-into every subsequent branch compile.
-
-This function is used during the pre-compile phase when we are trying to find
-out the amount of memory needed, as well as during the real compile phase. The
-value of lengthptr distinguishes the two phases.
-
-Arguments:
- options option bits, including any changes for this subpattern
- oldims previous settings of ims option bits
- codeptr -> the address of the current code pointer
- ptrptr -> the address of the current pattern pointer
- errorcodeptr -> pointer to error code variable
- lookbehind TRUE if this is a lookbehind assertion
- reset_bracount TRUE to reset the count for each branch
- skipbytes skip this many bytes at start (for brackets and OP_COND)
- firstbyteptr place to put the first required character, or a negative number
- reqbyteptr place to put the last required character, or a negative number
- bcptr pointer to the chain of currently open branches
- cd points to the data block with tables pointers etc.
- lengthptr NULL during the real compile phase
- points to length accumulator during pre-compile phase
-
-Returns: TRUE on success
-*/
-
-static BOOL
-compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
- int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
- int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
- int *lengthptr)
-{
-const uschar *ptr = *ptrptr;
-uschar *code = *codeptr;
-uschar *last_branch = code;
-uschar *start_bracket = code;
-uschar *reverse_count = NULL;
-int firstbyte, reqbyte;
-int branchfirstbyte, branchreqbyte;
-int length;
-int orig_bracount;
-int max_bracount;
-branch_chain bc;
-
-bc.outer = bcptr;
-bc.current = code;
-
-firstbyte = reqbyte = REQ_UNSET;
-
-/* Accumulate the length for use in the pre-compile phase. Start with the
-length of the BRA and KET and any extra bytes that are required at the
-beginning. We accumulate in a local variable to save frequent testing of
-lenthptr for NULL. We cannot do this by looking at the value of code at the
-start and end of each alternative, because compiled items are discarded during
-the pre-compile phase so that the work space is not exceeded. */
-
-length = 2 + 2*LINK_SIZE + skipbytes;
-
-/* WARNING: If the above line is changed for any reason, you must also change
-the code that abstracts option settings at the start of the pattern and makes
-them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
-pre-compile phase to find out whether anything has yet been compiled or not. */
-
-/* Offset is set zero to mark that this bracket is still open */
-
-PUT(code, 1, 0);
-code += 1 + LINK_SIZE + skipbytes;
-
-/* Loop for each alternative branch */
-
-orig_bracount = max_bracount = cd->bracount;
-for (;;)
- {
- /* For a (?| group, reset the capturing bracket count so that each branch
- uses the same numbers. */
-
- if (reset_bracount) cd->bracount = orig_bracount;
-
- /* Handle a change of ims options at the start of the branch */
-
- if ((options & PCRE_IMS) != oldims)
- {
- *code++ = OP_OPT;
- *code++ = options & PCRE_IMS;
- length += 2;
- }
-
- /* Set up dummy OP_REVERSE if lookbehind assertion */
-
- if (lookbehind)
- {
- *code++ = OP_REVERSE;
- reverse_count = code;
- PUTINC(code, 0, 0);
- length += 1 + LINK_SIZE;
- }
-
- /* Now compile the branch; in the pre-compile phase its length gets added
- into the length. */
-
- if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
- &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
- {
- *ptrptr = ptr;
- return FALSE;
- }
-
- /* Keep the highest bracket count in case (?| was used and some branch
- has fewer than the rest. */
-
- if (cd->bracount > max_bracount) max_bracount = cd->bracount;
-
- /* In the real compile phase, there is some post-processing to be done. */
-
- if (lengthptr == NULL)
- {
- /* If this is the first branch, the firstbyte and reqbyte values for the
- branch become the values for the regex. */
-
- if (*last_branch != OP_ALT)
- {
- firstbyte = branchfirstbyte;
- reqbyte = branchreqbyte;
- }
-
- /* If this is not the first branch, the first char and reqbyte have to
- match the values from all the previous branches, except that if the
- previous value for reqbyte didn't have REQ_VARY set, it can still match,
- and we set REQ_VARY for the regex. */
-
- else
- {
- /* If we previously had a firstbyte, but it doesn't match the new branch,
- we have to abandon the firstbyte for the regex, but if there was
- previously no reqbyte, it takes on the value of the old firstbyte. */
-
- if (firstbyte >= 0 && firstbyte != branchfirstbyte)
- {
- if (reqbyte < 0) reqbyte = firstbyte;
- firstbyte = REQ_NONE;
- }
-
- /* If we (now or from before) have no firstbyte, a firstbyte from the
- branch becomes a reqbyte if there isn't a branch reqbyte. */
-
- if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
- branchreqbyte = branchfirstbyte;
-
- /* Now ensure that the reqbytes match */
-
- if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
- reqbyte = REQ_NONE;
- else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
- }
-
- /* If lookbehind, check that this branch matches a fixed-length string, and
- put the length into the OP_REVERSE item. Temporarily mark the end of the
- branch with OP_END. */
-
- if (lookbehind)
- {
- int fixed_length;
- *code = OP_END;
- fixed_length = find_fixedlength(last_branch, options);
- DPRINTF(("fixed length = %d\n", fixed_length));
- if (fixed_length < 0)
- {
- *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
- *ptrptr = ptr;
- return FALSE;
- }
- PUT(reverse_count, 0, fixed_length);
- }
- }
-
- /* Reached end of expression, either ')' or end of pattern. In the real
- compile phase, go back through the alternative branches and reverse the chain
- of offsets, with the field in the BRA item now becoming an offset to the
- first alternative. If there are no alternatives, it points to the end of the
- group. The length in the terminating ket is always the length of the whole
- bracketed item. If any of the ims options were changed inside the group,
- compile a resetting op-code following, except at the very end of the pattern.
- Return leaving the pointer at the terminating char. */
-
- if (*ptr != CHAR_VERTICAL_LINE)
- {
- if (lengthptr == NULL)
- {
- int branch_length = code - last_branch;
- do
- {
- int prev_length = GET(last_branch, 1);
- PUT(last_branch, 1, branch_length);
- branch_length = prev_length;
- last_branch -= branch_length;
- }
- while (branch_length > 0);
- }
-
- /* Fill in the ket */
-
- *code = OP_KET;
- PUT(code, 1, code - start_bracket);
- code += 1 + LINK_SIZE;
-
- /* Resetting option if needed */
-
- if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
- {
- *code++ = OP_OPT;
- *code++ = oldims;
- length += 2;
- }
-
- /* Retain the highest bracket number, in case resetting was used. */
-
- cd->bracount = max_bracount;
-
- /* Set values to pass back */
-
- *codeptr = code;
- *ptrptr = ptr;
- *firstbyteptr = firstbyte;
- *reqbyteptr = reqbyte;
- if (lengthptr != NULL)
- {
- if (OFLOW_MAX - *lengthptr < length)
- {
- *errorcodeptr = ERR20;
- return FALSE;
- }
- *lengthptr += length;
- }
- return TRUE;
- }
-
- /* Another branch follows. In the pre-compile phase, we can move the code
- pointer back to where it was for the start of the first branch. (That is,
- pretend that each branch is the only one.)
-
- In the real compile phase, insert an ALT node. Its length field points back
- to the previous branch while the bracket remains open. At the end the chain
- is reversed. It's done like this so that the start of the bracket has a
- zero offset until it is closed, making it possible to detect recursion. */
-
- if (lengthptr != NULL)
- {
- code = *codeptr + 1 + LINK_SIZE + skipbytes;
- length += 1 + LINK_SIZE;
- }
- else
- {
- *code = OP_ALT;
- PUT(code, 1, code - last_branch);
- bc.current = last_branch = code;
- code += 1 + LINK_SIZE;
- }
-
- ptr++;
- }
-/* Control never reaches here */
-}
-
-
-
-
-/*************************************************
-* Check for anchored expression *
-*************************************************/
-
-/* Try to find out if this is an anchored regular expression. Consider each
-alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
-all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
-it's anchored. However, if this is a multiline pattern, then only OP_SOD
-counts, since OP_CIRC can match in the middle.
-
-We can also consider a regex to be anchored if OP_SOM starts all its branches.
-This is the code for \G, which means "match at start of match position, taking
-into account the match offset".
-
-A branch is also implicitly anchored if it starts with .* and DOTALL is set,
-because that will try the rest of the pattern at all possible matching points,
-so there is no point trying again.... er ....
-
-.... except when the .* appears inside capturing parentheses, and there is a
-subsequent back reference to those parentheses. We haven't enough information
-to catch that case precisely.
-
-At first, the best we could do was to detect when .* was in capturing brackets
-and the highest back reference was greater than or equal to that level.
-However, by keeping a bitmap of the first 31 back references, we can catch some
-of the more common cases more precisely.
-
-Arguments:
- code points to start of expression (the bracket)
- options points to the options setting
- bracket_map a bitmap of which brackets we are inside while testing; this
- handles up to substring 31; after that we just have to take
- the less precise approach
- backref_map the back reference bitmap
-
-Returns: TRUE or FALSE
-*/
-
-static BOOL
-is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
- unsigned int backref_map)
-{
-do {
- const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
- options, PCRE_MULTILINE, FALSE);
- register int op = *scode;
-
- /* Non-capturing brackets */
-
- if (op == OP_BRA)
- {
- if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
- }
-
- /* Capturing brackets */
-
- else if (op == OP_CBRA)
- {
- int n = GET2(scode, 1+LINK_SIZE);
- int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
- if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
- }
-
- /* Other brackets */
-
- else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
- {
- if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
- }
-
- /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
- it isn't in brackets that are or may be referenced. */
-
- else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
- op == OP_TYPEPOSSTAR))
- {
- if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
- return FALSE;
- }
-
- /* Check for explicit anchoring */
-
- else if (op != OP_SOD && op != OP_SOM &&
- ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
- return FALSE;
- code += GET(code, 1);
- }
-while (*code == OP_ALT); /* Loop for each alternative */
-return TRUE;
-}
-
-
-
-/*************************************************
-* Check for starting with ^ or .* *
-*************************************************/
-
-/* This is called to find out if every branch starts with ^ or .* so that
-"first char" processing can be done to speed things up in multiline
-matching and for non-DOTALL patterns that start with .* (which must start at
-the beginning or after \n). As in the case of is_anchored() (see above), we
-have to take account of back references to capturing brackets that contain .*
-because in that case we can't make the assumption.
-
-Arguments:
- code points to start of expression (the bracket)
- bracket_map a bitmap of which brackets we are inside while testing; this
- handles up to substring 31; after that we just have to take
- the less precise approach
- backref_map the back reference bitmap
-
-Returns: TRUE or FALSE
-*/
-
-static BOOL
-is_startline(const uschar *code, unsigned int bracket_map,
- unsigned int backref_map)
-{
-do {
- const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
- NULL, 0, FALSE);
- register int op = *scode;
-
- /* If we are at the start of a conditional assertion group, *both* the
- conditional assertion *and* what follows the condition must satisfy the test
- for start of line. Other kinds of condition fail. Note that there may be an
- auto-callout at the start of a condition. */
-
- if (op == OP_COND)
- {
- scode += 1 + LINK_SIZE;
- if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
- switch (*scode)
- {
- case OP_CREF:
- case OP_RREF:
- case OP_DEF:
- return FALSE;
-
- default: /* Assertion */
- if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
- do scode += GET(scode, 1); while (*scode == OP_ALT);
- scode += 1 + LINK_SIZE;
- break;
- }
- scode = first_significant_code(scode, NULL, 0, FALSE);
- op = *scode;
- }
-
- /* Non-capturing brackets */
-
- if (op == OP_BRA)
- {
- if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
- }
-
- /* Capturing brackets */
-
- else if (op == OP_CBRA)
- {
- int n = GET2(scode, 1+LINK_SIZE);
- int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
- if (!is_startline(scode, new_map, backref_map)) return FALSE;
- }
-
- /* Other brackets */
-
- else if (op == OP_ASSERT || op == OP_ONCE)
- {
- if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
- }
-
- /* .* means "start at start or after \n" if it isn't in brackets that
- may be referenced. */
-
- else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
- {
- if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
- }
-
- /* Check for explicit circumflex */
-
- else if (op != OP_CIRC) return FALSE;
-
- /* Move on to the next alternative */
-
- code += GET(code, 1);
- }
-while (*code == OP_ALT); /* Loop for each alternative */
-return TRUE;
-}
-
-
-
-/*************************************************
-* Check for asserted fixed first char *
-*************************************************/
-
-/* During compilation, the "first char" settings from forward assertions are
-discarded, because they can cause conflicts with actual literals that follow.
-However, if we end up without a first char setting for an unanchored pattern,
-it is worth scanning the regex to see if there is an initial asserted first
-char. If all branches start with the same asserted char, or with a bracket all
-of whose alternatives start with the same asserted char (recurse ad lib), then
-we return that char, otherwise -1.
-
-Arguments:
- code points to start of expression (the bracket)
- options pointer to the options (used to check casing changes)
- inassert TRUE if in an assertion
-
-Returns: -1 or the fixed first char
-*/
-
-static int
-find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
-{
-register int c = -1;
-do {
- int d;
- const uschar *scode =
- first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
- register int op = *scode;
-
- switch(op)
- {
- default:
- return -1;
-
- case OP_BRA:
- case OP_CBRA:
- case OP_ASSERT:
- case OP_ONCE:
- case OP_COND:
- if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
- return -1;
- if (c < 0) c = d; else if (c != d) return -1;
- break;
-
- case OP_EXACT: /* Fall through */
- scode += 2;
-
- case OP_CHAR:
- case OP_CHARNC:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_POSPLUS:
- if (!inassert) return -1;
- if (c < 0)
- {
- c = scode[1];
- if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
- }
- else if (c != scode[1]) return -1;
- break;
- }
-
- code += GET(code, 1);
- }
-while (*code == OP_ALT);
-return c;
-}
-
-
-
-/*************************************************
-* Compile a Regular Expression *
-*************************************************/
-
-/* This function takes a string and returns a pointer to a block of store
-holding a compiled version of the expression. The original API for this
-function had no error code return variable; it is retained for backwards
-compatibility. The new function is given a new name.
-
-Arguments:
- pattern the regular expression
- options various option bits
- errorcodeptr pointer to error code variable (pcre_compile2() only)
- can be NULL if you don't want a code value
- errorptr pointer to pointer to error text
- erroroffset ptr offset in pattern where error was detected
- tables pointer to character tables or NULL
-
-Returns: pointer to compiled data block, or NULL on error,
- with errorptr and erroroffset set
-*/
-
-PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
-pcre_compile(const char *pattern, int options, const char **errorptr,
- int *erroroffset, const unsigned char *tables)
-{
-return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
-}
-
-
-PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
-pcre_compile2(const char *pattern, int options, int *errorcodeptr,
- const char **errorptr, int *erroroffset, const unsigned char *tables)
-{
-real_pcre *re;
-int length = 1; /* For final END opcode */
-int firstbyte, reqbyte, newline;
-int errorcode = 0;
-int skipatstart = 0;
-#ifdef SUPPORT_UTF8
-BOOL utf8;
-#endif
-size_t size;
-uschar *code;
-const uschar *codestart;
-const uschar *ptr;
-compile_data compile_block;
-compile_data *cd = &compile_block;
-
-/* This space is used for "compiling" into during the first phase, when we are
-computing the amount of memory that is needed. Compiled items are thrown away
-as soon as possible, so that a fairly large buffer should be sufficient for
-this purpose. The same space is used in the second phase for remembering where
-to fill in forward references to subpatterns. */
-
-uschar cworkspace[COMPILE_WORK_SIZE];
-
-/* Set this early so that early errors get offset 0. */
-
-ptr = (const uschar *)pattern;
-
-/* We can't pass back an error message if errorptr is NULL; I guess the best we
-can do is just return NULL, but we can set a code value if there is a code
-pointer. */
-
-if (errorptr == NULL)
- {
- if (errorcodeptr != NULL) *errorcodeptr = 99;
- return NULL;
- }
-
-*errorptr = NULL;
-if (errorcodeptr != NULL) *errorcodeptr = ERR0;
-
-/* However, we can give a message for this error */
-
-if (erroroffset == NULL)
- {
- errorcode = ERR16;
- goto PCRE_EARLY_ERROR_RETURN2;
- }
-
-*erroroffset = 0;
-
-/* Set up pointers to the individual character tables */
-
-if (tables == NULL) tables = _pcre_default_tables;
-cd->lcc = tables + lcc_offset;
-cd->fcc = tables + fcc_offset;
-cd->cbits = tables + cbits_offset;
-cd->ctypes = tables + ctypes_offset;
-
-/* Check that all undefined public option bits are zero */
-
-if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
- {
- errorcode = ERR17;
- goto PCRE_EARLY_ERROR_RETURN;
- }
-
-/* Check for global one-time settings at the start of the pattern, and remember
-the offset for later. */
-
-while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
- ptr[skipatstart+1] == CHAR_ASTERISK)
- {
- int newnl = 0;
- int newbsr = 0;
-
- if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
- { skipatstart += 7; options |= PCRE_UTF8; continue; }
-
- if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
- { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
- else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
- { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
- else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
- { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
- else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
- { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
- else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
- { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
-
- else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
- { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
- else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
- { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
-
- if (newnl != 0)
- options = (options & ~PCRE_NEWLINE_BITS) | newnl;
- else if (newbsr != 0)
- options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
- else break;
- }
-
-/* Can't support UTF8 unless PCRE has been compiled to include the code. */
-
-#ifdef SUPPORT_UTF8
-utf8 = (options & PCRE_UTF8) != 0;
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
- (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
- {
- errorcode = ERR44;
- goto PCRE_EARLY_ERROR_RETURN2;
- }
-#else
-if ((options & PCRE_UTF8) != 0)
- {
- errorcode = ERR32;
- goto PCRE_EARLY_ERROR_RETURN;
- }
-#endif
-
-/* Check validity of \R options. */
-
-switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
- {
- case 0:
- case PCRE_BSR_ANYCRLF:
- case PCRE_BSR_UNICODE:
- break;
- default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
- }
-
-/* Handle different types of newline. The three bits give seven cases. The
-current code allows for fixed one- or two-byte sequences, plus "any" and
-"anycrlf". */
-
-switch (options & PCRE_NEWLINE_BITS)
- {
- case 0: newline = NEWLINE; break; /* Build-time default */
- case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
- case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
- case PCRE_NEWLINE_CR+
- PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
- case PCRE_NEWLINE_ANY: newline = -1; break;
- case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
- default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
- }
-
-if (newline == -2)
- {
- cd->nltype = NLTYPE_ANYCRLF;
- }
-else if (newline < 0)
- {
- cd->nltype = NLTYPE_ANY;
- }
-else
- {
- cd->nltype = NLTYPE_FIXED;
- if (newline > 255)
- {
- cd->nllen = 2;
- cd->nl[0] = (newline >> 8) & 255;
- cd->nl[1] = newline & 255;
- }
- else
- {
- cd->nllen = 1;
- cd->nl[0] = newline;
- }
- }
-
-/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
-references to help in deciding whether (.*) can be treated as anchored or not.
-*/
-
-cd->top_backref = 0;
-cd->backref_map = 0;
-
-/* Reflect pattern for debugging output */
-
-DPRINTF(("------------------------------------------------------------------\n"));
-DPRINTF(("%s\n", pattern));
-
-/* Pretend to compile the pattern while actually just accumulating the length
-of memory required. This behaviour is triggered by passing a non-NULL final
-argument to compile_regex(). We pass a block of workspace (cworkspace) for it
-to compile parts of the pattern into; the compiled code is discarded when it is
-no longer needed, so hopefully this workspace will never overflow, though there
-is a test for its doing so. */
-
-cd->bracount = cd->final_bracount = 0;
-cd->names_found = 0;
-cd->name_entry_size = 0;
-cd->name_table = NULL;
-cd->start_workspace = cworkspace;
-cd->start_code = cworkspace;
-cd->hwm = cworkspace;
-cd->start_pattern = (const uschar *)pattern;
-cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
-cd->req_varyopt = 0;
-cd->external_options = options;
-cd->external_flags = 0;
-
-/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
-don't need to look at the result of the function here. The initial options have
-been put into the cd block so that they can be changed if an option setting is
-found within the regex right at the beginning. Bringing initial option settings
-outside can help speed up starting point checks. */
-
-ptr += skipatstart;
-code = cworkspace;
-*code = OP_BRA;
-(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
- &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
- &length);
-if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
-
-DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
- cd->hwm - cworkspace));
-
-if (length > MAX_PATTERN_SIZE)
- {
- errorcode = ERR20;
- goto PCRE_EARLY_ERROR_RETURN;
- }
-
-/* Compute the size of data block needed and get it, either from malloc or
-externally provided function. Integer overflow should no longer be possible
-because nowadays we limit the maximum value of cd->names_found and
-cd->name_entry_size. */
-
-size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
-re = (real_pcre *)(pcre_malloc)(size);
-
-if (re == NULL)
- {
- errorcode = ERR21;
- goto PCRE_EARLY_ERROR_RETURN;
- }
-
-/* Put in the magic number, and save the sizes, initial options, internal
-flags, and character table pointer. NULL is used for the default character
-tables. The nullpad field is at the end; it's there to help in the case when a
-regex compiled on a system with 4-byte pointers is run on another with 8-byte
-pointers. */
-
-re->magic_number = MAGIC_NUMBER;
-re->size = size;
-re->options = cd->external_options;
-re->flags = cd->external_flags;
-re->dummy1 = 0;
-re->first_byte = 0;
-re->req_byte = 0;
-re->name_table_offset = sizeof(real_pcre);
-re->name_entry_size = cd->name_entry_size;
-re->name_count = cd->names_found;
-re->ref_count = 0;
-re->tables = (tables == _pcre_default_tables)? NULL : tables;
-re->nullpad = NULL;
-
-/* The starting points of the name/number translation table and of the code are
-passed around in the compile data block. The start/end pattern and initial
-options are already set from the pre-compile phase, as is the name_entry_size
-field. Reset the bracket count and the names_found field. Also reset the hwm
-field; this time it's used for remembering forward references to subpatterns.
-*/
-
-cd->final_bracount = cd->bracount; /* Save for checking forward references */
-cd->bracount = 0;
-cd->names_found = 0;
-cd->name_table = (uschar *)re + re->name_table_offset;
-codestart = cd->name_table + re->name_entry_size * re->name_count;
-cd->start_code = codestart;
-cd->hwm = cworkspace;
-cd->req_varyopt = 0;
-cd->had_accept = FALSE;
-
-/* Set up a starting, non-extracting bracket, then compile the expression. On
-error, errorcode will be set non-zero, so we don't need to look at the result
-of the function here. */
-
-ptr = (const uschar *)pattern + skipatstart;
-code = (uschar *)codestart;
-*code = OP_BRA;
-(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
- &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
-re->top_bracket = cd->bracount;
-re->top_backref = cd->top_backref;
-re->flags = cd->external_flags;
-
-if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
-
-/* If not reached end of pattern on success, there's an excess bracket. */
-
-if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
-
-/* Fill in the terminating state and check for disastrous overflow, but
-if debugging, leave the test till after things are printed out. */
-
-*code++ = OP_END;
-
-#ifndef PCRE_DEBUG
-if (code - codestart > length) errorcode = ERR23;
-#endif
-
-/* Fill in any forward references that are required. */
-
-while (errorcode == 0 && cd->hwm > cworkspace)
- {
- int offset, recno;
- const uschar *groupptr;
- cd->hwm -= LINK_SIZE;
- offset = GET(cd->hwm, 0);
- recno = GET(codestart, offset);
- groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
- if (groupptr == NULL) errorcode = ERR53;
- else PUT(((uschar *)codestart), offset, groupptr - codestart);
- }
-
-/* Give an error if there's back reference to a non-existent capturing
-subpattern. */
-
-if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
-
-/* Failed to compile, or error while post-processing */
-
-if (errorcode != 0)
- {
- (pcre_free)(re);
- PCRE_EARLY_ERROR_RETURN:
- *erroroffset = ptr - (const uschar *)pattern;
- PCRE_EARLY_ERROR_RETURN2:
- *errorptr = find_error_text(errorcode);
- if (errorcodeptr != NULL) *errorcodeptr = errorcode;
- return NULL;
- }
-
-/* If the anchored option was not passed, set the flag if we can determine that
-the pattern is anchored by virtue of ^ characters or \A or anything else (such
-as starting with .* when DOTALL is set).
-
-Otherwise, if we know what the first byte has to be, save it, because that
-speeds up unanchored matches no end. If not, see if we can set the
-PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
-start with ^. and also when all branches start with .* for non-DOTALL matches.
-*/
-
-if ((re->options & PCRE_ANCHORED) == 0)
- {
- int temp_options = re->options; /* May get changed during these scans */
- if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
- re->options |= PCRE_ANCHORED;
- else
- {
- if (firstbyte < 0)
- firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
- if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
- {
- int ch = firstbyte & 255;
- re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
- cd->fcc[ch] == ch)? ch : firstbyte;
- re->flags |= PCRE_FIRSTSET;
- }
- else if (is_startline(codestart, 0, cd->backref_map))
- re->flags |= PCRE_STARTLINE;
- }
- }
-
-/* For an anchored pattern, we use the "required byte" only if it follows a
-variable length item in the regex. Remove the caseless flag for non-caseable
-bytes. */
-
-if (reqbyte >= 0 &&
- ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
- {
- int ch = reqbyte & 255;
- re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
- cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
- re->flags |= PCRE_REQCHSET;
- }
-
-/* Print out the compiled data if debugging is enabled. This is never the
-case when building a production library. */
-
-#ifdef PCRE_DEBUG
-
-printf("Length = %d top_bracket = %d top_backref = %d\n",
- length, re->top_bracket, re->top_backref);
-
-printf("Options=%08x\n", re->options);
-
-if ((re->flags & PCRE_FIRSTSET) != 0)
- {
- int ch = re->first_byte & 255;
- const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
- "" : " (caseless)";
- if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
- else printf("First char = \\x%02x%s\n", ch, caseless);
- }
-
-if ((re->flags & PCRE_REQCHSET) != 0)
- {
- int ch = re->req_byte & 255;
- const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
- "" : " (caseless)";
- if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
- else printf("Req char = \\x%02x%s\n", ch, caseless);
- }
-
-pcre_printint(re, stdout, TRUE);
-
-/* This check is done here in the debugging case so that the code that
-was compiled can be seen. */
-
-if (code - codestart > length)
- {
- (pcre_free)(re);
- *errorptr = find_error_text(ERR23);
- *erroroffset = ptr - (uschar *)pattern;
- if (errorcodeptr != NULL) *errorcodeptr = ERR23;
- return NULL;
- }
-#endif /* PCRE_DEBUG */
-
-return (pcre *)re;
-}
-
-/* End of pcre_compile.c */
diff --git a/libs/pcre/pcre_config.c b/libs/pcre/pcre_config.c
deleted file mode 100644
index 78e8560ba6..0000000000
--- a/libs/pcre/pcre_config.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains the external function pcre_config(). */
-
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-
-/*************************************************
-* Return info about what features are configured *
-*************************************************/
-
-/* This function has an extensible interface so that additional items can be
-added compatibly.
-
-Arguments:
- what what information is required
- where where to put the information
-
-Returns: 0 if data returned, negative on error
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_config(int what, void *where)
-{
-switch (what)
- {
- case PCRE_CONFIG_UTF8:
-#ifdef SUPPORT_UTF8
- *((int *)where) = 1;
-#else
- *((int *)where) = 0;
-#endif
- break;
-
- case PCRE_CONFIG_UNICODE_PROPERTIES:
-#ifdef SUPPORT_UCP
- *((int *)where) = 1;
-#else
- *((int *)where) = 0;
-#endif
- break;
-
- case PCRE_CONFIG_NEWLINE:
- *((int *)where) = NEWLINE;
- break;
-
- case PCRE_CONFIG_BSR:
-#ifdef BSR_ANYCRLF
- *((int *)where) = 1;
-#else
- *((int *)where) = 0;
-#endif
- break;
-
- case PCRE_CONFIG_LINK_SIZE:
- *((int *)where) = LINK_SIZE;
- break;
-
- case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
- *((int *)where) = POSIX_MALLOC_THRESHOLD;
- break;
-
- case PCRE_CONFIG_MATCH_LIMIT:
- *((unsigned long int *)where) = MATCH_LIMIT;
- break;
-
- case PCRE_CONFIG_MATCH_LIMIT_RECURSION:
- *((unsigned long int *)where) = MATCH_LIMIT_RECURSION;
- break;
-
- case PCRE_CONFIG_STACKRECURSE:
-#ifdef NO_RECURSE
- *((int *)where) = 0;
-#else
- *((int *)where) = 1;
-#endif
- break;
-
- default: return PCRE_ERROR_BADOPTION;
- }
-
-return 0;
-}
-
-/* End of pcre_config.c */
diff --git a/libs/pcre/pcre_dfa_exec.c b/libs/pcre/pcre_dfa_exec.c
deleted file mode 100644
index d0d0ee5174..0000000000
--- a/libs/pcre/pcre_dfa_exec.c
+++ /dev/null
@@ -1,2963 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language (but see
-below for why this module is different).
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains the external function pcre_dfa_exec(), which is an
-alternative matching function that uses a sort of DFA algorithm (not a true
-FSM). This is NOT Perl- compatible, but it has advantages in certain
-applications. */
-
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#define NLBLOCK md /* Block containing newline information */
-#define PSSTART start_subject /* Field containing processed string start */
-#define PSEND end_subject /* Field containing processed string end */
-
-#include "pcre_internal.h"
-
-
-/* For use to indent debugging output */
-
-#define SP " "
-
-
-/*************************************************
-* Code parameters and static tables *
-*************************************************/
-
-/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
-into others, under special conditions. A gap of 20 between the blocks should be
-enough. The resulting opcodes don't have to be less than 256 because they are
-never stored, so we push them well clear of the normal opcodes. */
-
-#define OP_PROP_EXTRA 300
-#define OP_EXTUNI_EXTRA 320
-#define OP_ANYNL_EXTRA 340
-#define OP_HSPACE_EXTRA 360
-#define OP_VSPACE_EXTRA 380
-
-
-/* This table identifies those opcodes that are followed immediately by a
-character that is to be tested in some way. This makes is possible to
-centralize the loading of these characters. In the case of Type * etc, the
-"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
-small value. ***NOTE*** If the start of this table is modified, the two tables
-that follow must also be modified. */
-
-static const uschar coptable[] = {
- 0, /* End */
- 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
- 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
- 0, 0, 0, /* Any, AllAny, Anybyte */
- 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
- 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
- 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
- 1, /* Char */
- 1, /* Charnc */
- 1, /* not */
- /* Positive single-char repeats */
- 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
- 3, 3, 3, /* upto, minupto, exact */
- 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
- /* Negative single-char repeats - only for chars < 256 */
- 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
- 3, 3, 3, /* NOT upto, minupto, exact */
- 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
- /* Positive type repeats */
- 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
- 3, 3, 3, /* Type upto, minupto, exact */
- 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
- /* Character class & ref repeats */
- 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
- 0, 0, /* CRRANGE, CRMINRANGE */
- 0, /* CLASS */
- 0, /* NCLASS */
- 0, /* XCLASS - variable length */
- 0, /* REF */
- 0, /* RECURSE */
- 0, /* CALLOUT */
- 0, /* Alt */
- 0, /* Ket */
- 0, /* KetRmax */
- 0, /* KetRmin */
- 0, /* Assert */
- 0, /* Assert not */
- 0, /* Assert behind */
- 0, /* Assert behind not */
- 0, /* Reverse */
- 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
- 0, 0, 0, /* SBRA, SCBRA, SCOND */
- 0, /* CREF */
- 0, /* RREF */
- 0, /* DEF */
- 0, 0, /* BRAZERO, BRAMINZERO */
- 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
- 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
-};
-
-/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
-and \w */
-
-static const uschar toptable1[] = {
- 0, 0, 0, 0, 0, 0,
- ctype_digit, ctype_digit,
- ctype_space, ctype_space,
- ctype_word, ctype_word,
- 0, 0 /* OP_ANY, OP_ALLANY */
-};
-
-static const uschar toptable2[] = {
- 0, 0, 0, 0, 0, 0,
- ctype_digit, 0,
- ctype_space, 0,
- ctype_word, 0,
- 1, 1 /* OP_ANY, OP_ALLANY */
-};
-
-
-/* Structure for holding data about a particular state, which is in effect the
-current data for an active path through the match tree. It must consist
-entirely of ints because the working vector we are passed, and which we put
-these structures in, is a vector of ints. */
-
-typedef struct stateblock {
- int offset; /* Offset to opcode */
- int count; /* Count for repeats */
- int ims; /* ims flag bits */
- int data; /* Some use extra data */
-} stateblock;
-
-#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
-
-
-#ifdef PCRE_DEBUG
-/*************************************************
-* Print character string *
-*************************************************/
-
-/* Character string printing function for debugging.
-
-Arguments:
- p points to string
- length number of bytes
- f where to print
-
-Returns: nothing
-*/
-
-static void
-pchars(unsigned char *p, int length, FILE *f)
-{
-int c;
-while (length-- > 0)
- {
- if (isprint(c = *(p++)))
- fprintf(f, "%c", c);
- else
- fprintf(f, "\\x%02x", c);
- }
-}
-#endif
-
-
-
-/*************************************************
-* Execute a Regular Expression - DFA engine *
-*************************************************/
-
-/* This internal function applies a compiled pattern to a subject string,
-starting at a given point, using a DFA engine. This function is called from the
-external one, possibly multiple times if the pattern is not anchored. The
-function calls itself recursively for some kinds of subpattern.
-
-Arguments:
- md the match_data block with fixed information
- this_start_code the opening bracket of this subexpression's code
- current_subject where we currently are in the subject string
- start_offset start offset in the subject string
- offsets vector to contain the matching string offsets
- offsetcount size of same
- workspace vector of workspace
- wscount size of same
- ims the current ims flags
- rlevel function call recursion level
- recursing regex recursive call level
-
-Returns: > 0 => number of match offset pairs placed in offsets
- = 0 => offsets overflowed; longest matches are present
- -1 => failed to match
- < -1 => some kind of unexpected problem
-
-The following macros are used for adding states to the two state vectors (one
-for the current character, one for the following character). */
-
-#define ADD_ACTIVE(x,y) \
- if (active_count++ < wscount) \
- { \
- next_active_state->offset = (x); \
- next_active_state->count = (y); \
- next_active_state->ims = ims; \
- next_active_state++; \
- DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
- } \
- else return PCRE_ERROR_DFA_WSSIZE
-
-#define ADD_ACTIVE_DATA(x,y,z) \
- if (active_count++ < wscount) \
- { \
- next_active_state->offset = (x); \
- next_active_state->count = (y); \
- next_active_state->ims = ims; \
- next_active_state->data = (z); \
- next_active_state++; \
- DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
- } \
- else return PCRE_ERROR_DFA_WSSIZE
-
-#define ADD_NEW(x,y) \
- if (new_count++ < wscount) \
- { \
- next_new_state->offset = (x); \
- next_new_state->count = (y); \
- next_new_state->ims = ims; \
- next_new_state++; \
- DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
- } \
- else return PCRE_ERROR_DFA_WSSIZE
-
-#define ADD_NEW_DATA(x,y,z) \
- if (new_count++ < wscount) \
- { \
- next_new_state->offset = (x); \
- next_new_state->count = (y); \
- next_new_state->ims = ims; \
- next_new_state->data = (z); \
- next_new_state++; \
- DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
- } \
- else return PCRE_ERROR_DFA_WSSIZE
-
-/* And now, here is the code */
-
-static int
-internal_dfa_exec(
- dfa_match_data *md,
- const uschar *this_start_code,
- const uschar *current_subject,
- int start_offset,
- int *offsets,
- int offsetcount,
- int *workspace,
- int wscount,
- int ims,
- int rlevel,
- int recursing)
-{
-stateblock *active_states, *new_states, *temp_states;
-stateblock *next_active_state, *next_new_state;
-
-const uschar *ctypes, *lcc, *fcc;
-const uschar *ptr;
-const uschar *end_code, *first_op;
-
-int active_count, new_count, match_count;
-
-/* Some fields in the md block are frequently referenced, so we load them into
-independent variables in the hope that this will perform better. */
-
-const uschar *start_subject = md->start_subject;
-const uschar *end_subject = md->end_subject;
-const uschar *start_code = md->start_code;
-
-#ifdef SUPPORT_UTF8
-BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
-#else
-BOOL utf8 = FALSE;
-#endif
-
-rlevel++;
-offsetcount &= (-2);
-
-wscount -= 2;
-wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
- (2 * INTS_PER_STATEBLOCK);
-
-DPRINTF(("\n%.*s---------------------\n"
- "%.*sCall to internal_dfa_exec f=%d r=%d\n",
- rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
-
-ctypes = md->tables + ctypes_offset;
-lcc = md->tables + lcc_offset;
-fcc = md->tables + fcc_offset;
-
-match_count = PCRE_ERROR_NOMATCH; /* A negative number */
-
-active_states = (stateblock *)(workspace + 2);
-next_new_state = new_states = active_states + wscount;
-new_count = 0;
-
-first_op = this_start_code + 1 + LINK_SIZE +
- ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
-
-/* The first thing in any (sub) pattern is a bracket of some sort. Push all
-the alternative states onto the list, and find out where the end is. This
-makes is possible to use this function recursively, when we want to stop at a
-matching internal ket rather than at the end.
-
-If the first opcode in the first alternative is OP_REVERSE, we are dealing with
-a backward assertion. In that case, we have to find out the maximum amount to
-move back, and set up each alternative appropriately. */
-
-if (*first_op == OP_REVERSE)
- {
- int max_back = 0;
- int gone_back;
-
- end_code = this_start_code;
- do
- {
- int back = GET(end_code, 2+LINK_SIZE);
- if (back > max_back) max_back = back;
- end_code += GET(end_code, 1);
- }
- while (*end_code == OP_ALT);
-
- /* If we can't go back the amount required for the longest lookbehind
- pattern, go back as far as we can; some alternatives may still be viable. */
-
-#ifdef SUPPORT_UTF8
- /* In character mode we have to step back character by character */
-
- if (utf8)
- {
- for (gone_back = 0; gone_back < max_back; gone_back++)
- {
- if (current_subject <= start_subject) break;
- current_subject--;
- while (current_subject > start_subject &&
- (*current_subject & 0xc0) == 0x80)
- current_subject--;
- }
- }
- else
-#endif
-
- /* In byte-mode we can do this quickly. */
-
- {
- gone_back = (current_subject - max_back < start_subject)?
- current_subject - start_subject : max_back;
- current_subject -= gone_back;
- }
-
- /* Now we can process the individual branches. */
-
- end_code = this_start_code;
- do
- {
- int back = GET(end_code, 2+LINK_SIZE);
- if (back <= gone_back)
- {
- int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
- ADD_NEW_DATA(-bstate, 0, gone_back - back);
- }
- end_code += GET(end_code, 1);
- }
- while (*end_code == OP_ALT);
- }
-
-/* This is the code for a "normal" subpattern (not a backward assertion). The
-start of a whole pattern is always one of these. If we are at the top level,
-we may be asked to restart matching from the same point that we reached for a
-previous partial match. We still have to scan through the top-level branches to
-find the end state. */
-
-else
- {
- end_code = this_start_code;
-
- /* Restarting */
-
- if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
- {
- do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
- new_count = workspace[1];
- if (!workspace[0])
- memcpy(new_states, active_states, new_count * sizeof(stateblock));
- }
-
- /* Not restarting */
-
- else
- {
- int length = 1 + LINK_SIZE +
- ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
- do
- {
- ADD_NEW(end_code - start_code + length, 0);
- end_code += GET(end_code, 1);
- length = 1 + LINK_SIZE;
- }
- while (*end_code == OP_ALT);
- }
- }
-
-workspace[0] = 0; /* Bit indicating which vector is current */
-
-DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
-
-/* Loop for scanning the subject */
-
-ptr = current_subject;
-for (;;)
- {
- int i, j;
- int clen, dlen;
- unsigned int c, d;
-
- /* Make the new state list into the active state list and empty the
- new state list. */
-
- temp_states = active_states;
- active_states = new_states;
- new_states = temp_states;
- active_count = new_count;
- new_count = 0;
-
- workspace[0] ^= 1; /* Remember for the restarting feature */
- workspace[1] = active_count;
-
-#ifdef PCRE_DEBUG
- printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
- pchars((uschar *)ptr, strlen((char *)ptr), stdout);
- printf("\"\n");
-
- printf("%.*sActive states: ", rlevel*2-2, SP);
- for (i = 0; i < active_count; i++)
- printf("%d/%d ", active_states[i].offset, active_states[i].count);
- printf("\n");
-#endif
-
- /* Set the pointers for adding new states */
-
- next_active_state = active_states + active_count;
- next_new_state = new_states;
-
- /* Load the current character from the subject outside the loop, as many
- different states may want to look at it, and we assume that at least one
- will. */
-
- if (ptr < end_subject)
- {
- clen = 1; /* Number of bytes in the character */
-#ifdef SUPPORT_UTF8
- if (utf8) { GETCHARLEN(c, ptr, clen); } else
-#endif /* SUPPORT_UTF8 */
- c = *ptr;
- }
- else
- {
- clen = 0; /* This indicates the end of the subject */
- c = NOTACHAR; /* This value should never actually be used */
- }
-
- /* Scan up the active states and act on each one. The result of an action
- may be to add more states to the currently active list (e.g. on hitting a
- parenthesis) or it may be to put states on the new list, for considering
- when we move the character pointer on. */
-
- for (i = 0; i < active_count; i++)
- {
- stateblock *current_state = active_states + i;
- const uschar *code;
- int state_offset = current_state->offset;
- int count, codevalue, rrc;
-
-#ifdef PCRE_DEBUG
- printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
- if (clen == 0) printf("EOL\n");
- else if (c > 32 && c < 127) printf("'%c'\n", c);
- else printf("0x%02x\n", c);
-#endif
-
- /* This variable is referred to implicity in the ADD_xxx macros. */
-
- ims = current_state->ims;
-
- /* A negative offset is a special case meaning "hold off going to this
- (negated) state until the number of characters in the data field have
- been skipped". */
-
- if (state_offset < 0)
- {
- if (current_state->data > 0)
- {
- DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
- ADD_NEW_DATA(state_offset, current_state->count,
- current_state->data - 1);
- continue;
- }
- else
- {
- current_state->offset = state_offset = -state_offset;
- }
- }
-
- /* Check for a duplicate state with the same count, and skip if found. */
-
- for (j = 0; j < i; j++)
- {
- if (active_states[j].offset == state_offset &&
- active_states[j].count == current_state->count)
- {
- DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
- goto NEXT_ACTIVE_STATE;
- }
- }
-
- /* The state offset is the offset to the opcode */
-
- code = start_code + state_offset;
- codevalue = *code;
-
- /* If this opcode is followed by an inline character, load it. It is
- tempting to test for the presence of a subject character here, but that
- is wrong, because sometimes zero repetitions of the subject are
- permitted.
-
- We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
- argument that is not a data character - but is always one byte long. We
- have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
- this case. To keep the other cases fast, convert these ones to new opcodes.
- */
-
- if (coptable[codevalue] > 0)
- {
- dlen = 1;
-#ifdef SUPPORT_UTF8
- if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
-#endif /* SUPPORT_UTF8 */
- d = code[coptable[codevalue]];
- if (codevalue >= OP_TYPESTAR)
- {
- switch(d)
- {
- case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
- case OP_NOTPROP:
- case OP_PROP: codevalue += OP_PROP_EXTRA; break;
- case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
- case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
- case OP_NOT_HSPACE:
- case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
- case OP_NOT_VSPACE:
- case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
- default: break;
- }
- }
- }
- else
- {
- dlen = 0; /* Not strictly necessary, but compilers moan */
- d = NOTACHAR; /* if these variables are not set. */
- }
-
-
- /* Now process the individual opcodes */
-
- switch (codevalue)
- {
-
-/* ========================================================================== */
- /* Reached a closing bracket. If not at the end of the pattern, carry
- on with the next opcode. Otherwise, unless we have an empty string and
- PCRE_NOTEMPTY is set, save the match data, shifting up all previous
- matches so we always have the longest first. */
-
- case OP_KET:
- case OP_KETRMIN:
- case OP_KETRMAX:
- if (code != end_code)
- {
- ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
- if (codevalue != OP_KET)
- {
- ADD_ACTIVE(state_offset - GET(code, 1), 0);
- }
- }
- else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
- {
- if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
- else if (match_count > 0 && ++match_count * 2 >= offsetcount)
- match_count = 0;
- count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
- if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
- if (offsetcount >= 2)
- {
- offsets[0] = current_subject - start_subject;
- offsets[1] = ptr - start_subject;
- DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
- offsets[1] - offsets[0], current_subject));
- }
- if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
- {
- DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
- "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
- match_count, rlevel*2-2, SP));
- return match_count;
- }
- }
- break;
-
-/* ========================================================================== */
- /* These opcodes add to the current list of states without looking
- at the current character. */
-
- /*-----------------------------------------------------------------*/
- case OP_ALT:
- do { code += GET(code, 1); } while (*code == OP_ALT);
- ADD_ACTIVE(code - start_code, 0);
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_BRA:
- case OP_SBRA:
- do
- {
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
- code += GET(code, 1);
- }
- while (*code == OP_ALT);
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_CBRA:
- case OP_SCBRA:
- ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
- code += GET(code, 1);
- while (*code == OP_ALT)
- {
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
- code += GET(code, 1);
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_BRAZERO:
- case OP_BRAMINZERO:
- ADD_ACTIVE(state_offset + 1, 0);
- code += 1 + GET(code, 2);
- while (*code == OP_ALT) code += GET(code, 1);
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_SKIPZERO:
- code += 1 + GET(code, 2);
- while (*code == OP_ALT) code += GET(code, 1);
- ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_CIRC:
- if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
- ((ims & PCRE_MULTILINE) != 0 &&
- ptr != end_subject &&
- WAS_NEWLINE(ptr)))
- { ADD_ACTIVE(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_EOD:
- if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_OPT:
- ims = code[1];
- ADD_ACTIVE(state_offset + 2, 0);
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_SOD:
- if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_SOM:
- if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
- break;
-
-
-/* ========================================================================== */
- /* These opcodes inspect the next subject character, and sometimes
- the previous one as well, but do not have an argument. The variable
- clen contains the length of the current character and is zero if we are
- at the end of the subject. */
-
- /*-----------------------------------------------------------------*/
- case OP_ANY:
- if (clen > 0 && !IS_NEWLINE(ptr))
- { ADD_NEW(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_ALLANY:
- if (clen > 0)
- { ADD_NEW(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_EODN:
- if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
- { ADD_ACTIVE(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_DOLL:
- if ((md->moptions & PCRE_NOTEOL) == 0)
- {
- if (clen == 0 ||
- ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
- ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
- ))
- { ADD_ACTIVE(state_offset + 1, 0); }
- }
- else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
- { ADD_ACTIVE(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
-
- case OP_DIGIT:
- case OP_WHITESPACE:
- case OP_WORDCHAR:
- if (clen > 0 && c < 256 &&
- ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
- { ADD_NEW(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_NOT_DIGIT:
- case OP_NOT_WHITESPACE:
- case OP_NOT_WORDCHAR:
- if (clen > 0 && (c >= 256 ||
- ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
- { ADD_NEW(state_offset + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_WORD_BOUNDARY:
- case OP_NOT_WORD_BOUNDARY:
- {
- int left_word, right_word;
-
- if (ptr > start_subject)
- {
- const uschar *temp = ptr - 1;
-#ifdef SUPPORT_UTF8
- if (utf8) BACKCHAR(temp);
-#endif
- GETCHARTEST(d, temp);
- left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
- }
- else left_word = 0;
-
- if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
- else right_word = 0;
-
- if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
- { ADD_ACTIVE(state_offset + 1, 0); }
- }
- break;
-
-
- /*-----------------------------------------------------------------*/
- /* Check the next character by Unicode property. We will get here only
- if the support is in the binary; otherwise a compile-time error occurs.
- */
-
-#ifdef SUPPORT_UCP
- case OP_PROP:
- case OP_NOTPROP:
- if (clen > 0)
- {
- BOOL OK;
- const ucd_record * prop = GET_UCD(c);
- switch(code[1])
- {
- case PT_ANY:
- OK = TRUE;
- break;
-
- case PT_LAMP:
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
- break;
-
- case PT_GC:
- OK = _pcre_ucp_gentype[prop->chartype] == code[2];
- break;
-
- case PT_PC:
- OK = prop->chartype == code[2];
- break;
-
- case PT_SC:
- OK = prop->script == code[2];
- break;
-
- /* Should never occur, but keep compilers from grumbling. */
-
- default:
- OK = codevalue != OP_PROP;
- break;
- }
-
- if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
- }
- break;
-#endif
-
-
-
-/* ========================================================================== */
- /* These opcodes likewise inspect the subject character, but have an
- argument that is not a data character. It is one of these opcodes:
- OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
- OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
-
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEPOSPLUS:
- count = current_state->count; /* Already matched */
- if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
- if (clen > 0)
- {
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
- (c < 256 &&
- (d != OP_ANY || !IS_NEWLINE(ptr)) &&
- ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
- {
- if (count > 0 && codevalue == OP_TYPEPOSPLUS)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- count++;
- ADD_NEW(state_offset, count);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- case OP_TYPEPOSQUERY:
- ADD_ACTIVE(state_offset + 2, 0);
- if (clen > 0)
- {
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
- (c < 256 &&
- (d != OP_ANY || !IS_NEWLINE(ptr)) &&
- ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
- {
- if (codevalue == OP_TYPEPOSQUERY)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- ADD_NEW(state_offset + 2, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPOSSTAR:
- ADD_ACTIVE(state_offset + 2, 0);
- if (clen > 0)
- {
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
- (c < 256 &&
- (d != OP_ANY || !IS_NEWLINE(ptr)) &&
- ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
- {
- if (codevalue == OP_TYPEPOSSTAR)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- ADD_NEW(state_offset, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_TYPEEXACT:
- count = current_state->count; /* Number already matched */
- if (clen > 0)
- {
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
- (c < 256 &&
- (d != OP_ANY || !IS_NEWLINE(ptr)) &&
- ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
- {
- if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + 4, 0); }
- else
- { ADD_NEW(state_offset, count); }
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- case OP_TYPEPOSUPTO:
- ADD_ACTIVE(state_offset + 4, 0);
- count = current_state->count; /* Number already matched */
- if (clen > 0)
- {
- if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
- (c < 256 &&
- (d != OP_ANY || !IS_NEWLINE(ptr)) &&
- ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
- {
- if (codevalue == OP_TYPEPOSUPTO)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + 4, 0); }
- else
- { ADD_NEW(state_offset, count); }
- }
- }
- break;
-
-/* ========================================================================== */
- /* These are virtual opcodes that are used when something like
- OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
- argument. It keeps the code above fast for the other cases. The argument
- is in the d variable. */
-
-#ifdef SUPPORT_UCP
- case OP_PROP_EXTRA + OP_TYPEPLUS:
- case OP_PROP_EXTRA + OP_TYPEMINPLUS:
- case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
- count = current_state->count; /* Already matched */
- if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
- if (clen > 0)
- {
- BOOL OK;
- const ucd_record * prop = GET_UCD(c);
- switch(code[2])
- {
- case PT_ANY:
- OK = TRUE;
- break;
-
- case PT_LAMP:
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
- break;
-
- case PT_GC:
- OK = _pcre_ucp_gentype[prop->chartype] == code[3];
- break;
-
- case PT_PC:
- OK = prop->chartype == code[3];
- break;
-
- case PT_SC:
- OK = prop->script == code[3];
- break;
-
- /* Should never occur, but keep compilers from grumbling. */
-
- default:
- OK = codevalue != OP_PROP;
- break;
- }
-
- if (OK == (d == OP_PROP))
- {
- if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- count++;
- ADD_NEW(state_offset, count);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
- case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
- case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
- count = current_state->count; /* Already matched */
- if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
- if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
- {
- const uschar *nptr = ptr + clen;
- int ncount = 0;
- if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- while (nptr < end_subject)
- {
- int nd;
- int ndlen = 1;
- GETCHARLEN(nd, nptr, ndlen);
- if (UCD_CATEGORY(nd) != ucp_M) break;
- ncount++;
- nptr += ndlen;
- }
- count++;
- ADD_NEW_DATA(-state_offset, count, ncount);
- }
- break;
-#endif
-
- /*-----------------------------------------------------------------*/
- case OP_ANYNL_EXTRA + OP_TYPEPLUS:
- case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
- case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
- count = current_state->count; /* Already matched */
- if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
- if (clen > 0)
- {
- int ncount = 0;
- switch (c)
- {
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
- goto ANYNL01;
-
- case 0x000d:
- if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
- /* Fall through */
-
- ANYNL01:
- case 0x000a:
- if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- count++;
- ADD_NEW_DATA(-state_offset, count, ncount);
- break;
-
- default:
- break;
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_VSPACE_EXTRA + OP_TYPEPLUS:
- case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
- case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
- count = current_state->count; /* Already matched */
- if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
- if (clen > 0)
- {
- BOOL OK;
- switch (c)
- {
- case 0x000a:
- case 0x000b:
- case 0x000c:
- case 0x000d:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- OK = TRUE;
- break;
-
- default:
- OK = FALSE;
- break;
- }
-
- if (OK == (d == OP_VSPACE))
- {
- if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- count++;
- ADD_NEW_DATA(-state_offset, count, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_HSPACE_EXTRA + OP_TYPEPLUS:
- case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
- case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
- count = current_state->count; /* Already matched */
- if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
- if (clen > 0)
- {
- BOOL OK;
- switch (c)
- {
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- OK = TRUE;
- break;
-
- default:
- OK = FALSE;
- break;
- }
-
- if (OK == (d == OP_HSPACE))
- {
- if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- count++;
- ADD_NEW_DATA(-state_offset, count, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
-#ifdef SUPPORT_UCP
- case OP_PROP_EXTRA + OP_TYPEQUERY:
- case OP_PROP_EXTRA + OP_TYPEMINQUERY:
- case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
- count = 4;
- goto QS1;
-
- case OP_PROP_EXTRA + OP_TYPESTAR:
- case OP_PROP_EXTRA + OP_TYPEMINSTAR:
- case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
- count = 0;
-
- QS1:
-
- ADD_ACTIVE(state_offset + 4, 0);
- if (clen > 0)
- {
- BOOL OK;
- const ucd_record * prop = GET_UCD(c);
- switch(code[2])
- {
- case PT_ANY:
- OK = TRUE;
- break;
-
- case PT_LAMP:
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
- break;
-
- case PT_GC:
- OK = _pcre_ucp_gentype[prop->chartype] == code[3];
- break;
-
- case PT_PC:
- OK = prop->chartype == code[3];
- break;
-
- case PT_SC:
- OK = prop->script == code[3];
- break;
-
- /* Should never occur, but keep compilers from grumbling. */
-
- default:
- OK = codevalue != OP_PROP;
- break;
- }
-
- if (OK == (d == OP_PROP))
- {
- if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
- codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- ADD_NEW(state_offset + count, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
- case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
- case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
- count = 2;
- goto QS2;
-
- case OP_EXTUNI_EXTRA + OP_TYPESTAR:
- case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
- case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
- count = 0;
-
- QS2:
-
- ADD_ACTIVE(state_offset + 2, 0);
- if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
- {
- const uschar *nptr = ptr + clen;
- int ncount = 0;
- if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
- codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- while (nptr < end_subject)
- {
- int nd;
- int ndlen = 1;
- GETCHARLEN(nd, nptr, ndlen);
- if (UCD_CATEGORY(nd) != ucp_M) break;
- ncount++;
- nptr += ndlen;
- }
- ADD_NEW_DATA(-(state_offset + count), 0, ncount);
- }
- break;
-#endif
-
- /*-----------------------------------------------------------------*/
- case OP_ANYNL_EXTRA + OP_TYPEQUERY:
- case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
- case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
- count = 2;
- goto QS3;
-
- case OP_ANYNL_EXTRA + OP_TYPESTAR:
- case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
- case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
- count = 0;
-
- QS3:
- ADD_ACTIVE(state_offset + 2, 0);
- if (clen > 0)
- {
- int ncount = 0;
- switch (c)
- {
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
- goto ANYNL02;
-
- case 0x000d:
- if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
- /* Fall through */
-
- ANYNL02:
- case 0x000a:
- if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
- codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- ADD_NEW_DATA(-(state_offset + count), 0, ncount);
- break;
-
- default:
- break;
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_VSPACE_EXTRA + OP_TYPEQUERY:
- case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
- case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
- count = 2;
- goto QS4;
-
- case OP_VSPACE_EXTRA + OP_TYPESTAR:
- case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
- case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
- count = 0;
-
- QS4:
- ADD_ACTIVE(state_offset + 2, 0);
- if (clen > 0)
- {
- BOOL OK;
- switch (c)
- {
- case 0x000a:
- case 0x000b:
- case 0x000c:
- case 0x000d:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- OK = TRUE;
- break;
-
- default:
- OK = FALSE;
- break;
- }
- if (OK == (d == OP_VSPACE))
- {
- if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
- codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- ADD_NEW_DATA(-(state_offset + count), 0, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_HSPACE_EXTRA + OP_TYPEQUERY:
- case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
- case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
- count = 2;
- goto QS5;
-
- case OP_HSPACE_EXTRA + OP_TYPESTAR:
- case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
- case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
- count = 0;
-
- QS5:
- ADD_ACTIVE(state_offset + 2, 0);
- if (clen > 0)
- {
- BOOL OK;
- switch (c)
- {
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- OK = TRUE;
- break;
-
- default:
- OK = FALSE;
- break;
- }
-
- if (OK == (d == OP_HSPACE))
- {
- if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
- codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- ADD_NEW_DATA(-(state_offset + count), 0, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
-#ifdef SUPPORT_UCP
- case OP_PROP_EXTRA + OP_TYPEEXACT:
- case OP_PROP_EXTRA + OP_TYPEUPTO:
- case OP_PROP_EXTRA + OP_TYPEMINUPTO:
- case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
- if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 6, 0); }
- count = current_state->count; /* Number already matched */
- if (clen > 0)
- {
- BOOL OK;
- const ucd_record * prop = GET_UCD(c);
- switch(code[4])
- {
- case PT_ANY:
- OK = TRUE;
- break;
-
- case PT_LAMP:
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
- break;
-
- case PT_GC:
- OK = _pcre_ucp_gentype[prop->chartype] == code[5];
- break;
-
- case PT_PC:
- OK = prop->chartype == code[5];
- break;
-
- case PT_SC:
- OK = prop->script == code[5];
- break;
-
- /* Should never occur, but keep compilers from grumbling. */
-
- default:
- OK = codevalue != OP_PROP;
- break;
- }
-
- if (OK == (d == OP_PROP))
- {
- if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + 6, 0); }
- else
- { ADD_NEW(state_offset, count); }
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
- case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
- case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
- case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
- if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 4, 0); }
- count = current_state->count; /* Number already matched */
- if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
- {
- const uschar *nptr = ptr + clen;
- int ncount = 0;
- if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- while (nptr < end_subject)
- {
- int nd;
- int ndlen = 1;
- GETCHARLEN(nd, nptr, ndlen);
- if (UCD_CATEGORY(nd) != ucp_M) break;
- ncount++;
- nptr += ndlen;
- }
- if (++count >= GET2(code, 1))
- { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
- else
- { ADD_NEW_DATA(-state_offset, count, ncount); }
- }
- break;
-#endif
-
- /*-----------------------------------------------------------------*/
- case OP_ANYNL_EXTRA + OP_TYPEEXACT:
- case OP_ANYNL_EXTRA + OP_TYPEUPTO:
- case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
- case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
- if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 4, 0); }
- count = current_state->count; /* Number already matched */
- if (clen > 0)
- {
- int ncount = 0;
- switch (c)
- {
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
- goto ANYNL03;
-
- case 0x000d:
- if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
- /* Fall through */
-
- ANYNL03:
- case 0x000a:
- if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- if (++count >= GET2(code, 1))
- { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
- else
- { ADD_NEW_DATA(-state_offset, count, ncount); }
- break;
-
- default:
- break;
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_VSPACE_EXTRA + OP_TYPEEXACT:
- case OP_VSPACE_EXTRA + OP_TYPEUPTO:
- case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
- case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
- if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 4, 0); }
- count = current_state->count; /* Number already matched */
- if (clen > 0)
- {
- BOOL OK;
- switch (c)
- {
- case 0x000a:
- case 0x000b:
- case 0x000c:
- case 0x000d:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- OK = TRUE;
- break;
-
- default:
- OK = FALSE;
- }
-
- if (OK == (d == OP_VSPACE))
- {
- if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- if (++count >= GET2(code, 1))
- { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
- else
- { ADD_NEW_DATA(-state_offset, count, 0); }
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_HSPACE_EXTRA + OP_TYPEEXACT:
- case OP_HSPACE_EXTRA + OP_TYPEUPTO:
- case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
- case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
- if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
- { ADD_ACTIVE(state_offset + 4, 0); }
- count = current_state->count; /* Number already matched */
- if (clen > 0)
- {
- BOOL OK;
- switch (c)
- {
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- OK = TRUE;
- break;
-
- default:
- OK = FALSE;
- break;
- }
-
- if (OK == (d == OP_HSPACE))
- {
- if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- if (++count >= GET2(code, 1))
- { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
- else
- { ADD_NEW_DATA(-state_offset, count, 0); }
- }
- }
- break;
-
-/* ========================================================================== */
- /* These opcodes are followed by a character that is usually compared
- to the current subject character; it is loaded into d. We still get
- here even if there is no subject character, because in some cases zero
- repetitions are permitted. */
-
- /*-----------------------------------------------------------------*/
- case OP_CHAR:
- if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_CHARNC:
- if (clen == 0) break;
-
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
- {
- unsigned int othercase;
- if (c < 128) othercase = fcc[c]; else
-
- /* If we have Unicode property support, we can use it to test the
- other case of the character. */
-
-#ifdef SUPPORT_UCP
- othercase = UCD_OTHERCASE(c);
-#else
- othercase = NOTACHAR;
-#endif
-
- if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
- }
- }
- else
-#endif /* SUPPORT_UTF8 */
-
- /* Non-UTF-8 mode */
- {
- if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
- }
- break;
-
-
-#ifdef SUPPORT_UCP
- /*-----------------------------------------------------------------*/
- /* This is a tricky one because it can match more than one character.
- Find out how many characters to skip, and then set up a negative state
- to wait for them to pass before continuing. */
-
- case OP_EXTUNI:
- if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
- {
- const uschar *nptr = ptr + clen;
- int ncount = 0;
- while (nptr < end_subject)
- {
- int nclen = 1;
- GETCHARLEN(c, nptr, nclen);
- if (UCD_CATEGORY(c) != ucp_M) break;
- ncount++;
- nptr += nclen;
- }
- ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
- }
- break;
-#endif
-
- /*-----------------------------------------------------------------*/
- /* This is a tricky like EXTUNI because it too can match more than one
- character (when CR is followed by LF). In this case, set up a negative
- state to wait for one character to pass before continuing. */
-
- case OP_ANYNL:
- if (clen > 0) switch(c)
- {
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
-
- case 0x000a:
- ADD_NEW(state_offset + 1, 0);
- break;
-
- case 0x000d:
- if (ptr + 1 < end_subject && ptr[1] == 0x0a)
- {
- ADD_NEW_DATA(-(state_offset + 1), 0, 1);
- }
- else
- {
- ADD_NEW(state_offset + 1, 0);
- }
- break;
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_NOT_VSPACE:
- if (clen > 0) switch(c)
- {
- case 0x000a:
- case 0x000b:
- case 0x000c:
- case 0x000d:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- break;
-
- default:
- ADD_NEW(state_offset + 1, 0);
- break;
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_VSPACE:
- if (clen > 0) switch(c)
- {
- case 0x000a:
- case 0x000b:
- case 0x000c:
- case 0x000d:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- ADD_NEW(state_offset + 1, 0);
- break;
-
- default: break;
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_NOT_HSPACE:
- if (clen > 0) switch(c)
- {
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
-
- default:
- ADD_NEW(state_offset + 1, 0);
- break;
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_HSPACE:
- if (clen > 0) switch(c)
- {
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- ADD_NEW(state_offset + 1, 0);
- break;
- }
- break;
-
- /*-----------------------------------------------------------------*/
- /* Match a negated single character. This is only used for one-byte
- characters, that is, we know that d < 256. The character we are
- checking (c) can be multibyte. */
-
- case OP_NOT:
- if (clen > 0)
- {
- unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
- if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_POSPLUS:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTPOSPLUS:
- count = current_state->count; /* Already matched */
- if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
- if (clen > 0)
- {
- unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
- {
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
- {
-#ifdef SUPPORT_UCP
- otherd = UCD_OTHERCASE(d);
-#endif /* SUPPORT_UCP */
- }
- else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
- }
- if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
- {
- if (count > 0 &&
- (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- count++;
- ADD_NEW(state_offset, count);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_POSQUERY:
- case OP_NOTQUERY:
- case OP_NOTMINQUERY:
- case OP_NOTPOSQUERY:
- ADD_ACTIVE(state_offset + dlen + 1, 0);
- if (clen > 0)
- {
- unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
- {
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
- {
-#ifdef SUPPORT_UCP
- otherd = UCD_OTHERCASE(d);
-#endif /* SUPPORT_UCP */
- }
- else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
- }
- if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
- {
- if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- ADD_NEW(state_offset + dlen + 1, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_STAR:
- case OP_MINSTAR:
- case OP_POSSTAR:
- case OP_NOTSTAR:
- case OP_NOTMINSTAR:
- case OP_NOTPOSSTAR:
- ADD_ACTIVE(state_offset + dlen + 1, 0);
- if (clen > 0)
- {
- unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
- {
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
- {
-#ifdef SUPPORT_UCP
- otherd = UCD_OTHERCASE(d);
-#endif /* SUPPORT_UCP */
- }
- else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
- }
- if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
- {
- if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- ADD_NEW(state_offset, 0);
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_EXACT:
- case OP_NOTEXACT:
- count = current_state->count; /* Number already matched */
- if (clen > 0)
- {
- unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
- {
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
- {
-#ifdef SUPPORT_UCP
- otherd = UCD_OTHERCASE(d);
-#endif /* SUPPORT_UCP */
- }
- else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
- }
- if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
- {
- if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + dlen + 3, 0); }
- else
- { ADD_NEW(state_offset, count); }
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_POSUPTO:
- case OP_NOTUPTO:
- case OP_NOTMINUPTO:
- case OP_NOTPOSUPTO:
- ADD_ACTIVE(state_offset + dlen + 3, 0);
- count = current_state->count; /* Number already matched */
- if (clen > 0)
- {
- unsigned int otherd = NOTACHAR;
- if ((ims & PCRE_CASELESS) != 0)
- {
-#ifdef SUPPORT_UTF8
- if (utf8 && d >= 128)
- {
-#ifdef SUPPORT_UCP
- otherd = UCD_OTHERCASE(d);
-#endif /* SUPPORT_UCP */
- }
- else
-#endif /* SUPPORT_UTF8 */
- otherd = fcc[d];
- }
- if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
- {
- if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
- {
- active_count--; /* Remove non-match possibility */
- next_active_state--;
- }
- if (++count >= GET2(code, 1))
- { ADD_NEW(state_offset + dlen + 3, 0); }
- else
- { ADD_NEW(state_offset, count); }
- }
- }
- break;
-
-
-/* ========================================================================== */
- /* These are the class-handling opcodes */
-
- case OP_CLASS:
- case OP_NCLASS:
- case OP_XCLASS:
- {
- BOOL isinclass = FALSE;
- int next_state_offset;
- const uschar *ecode;
-
- /* For a simple class, there is always just a 32-byte table, and we
- can set isinclass from it. */
-
- if (codevalue != OP_XCLASS)
- {
- ecode = code + 33;
- if (clen > 0)
- {
- isinclass = (c > 255)? (codevalue == OP_NCLASS) :
- ((code[1 + c/8] & (1 << (c&7))) != 0);
- }
- }
-
- /* An extended class may have a table or a list of single characters,
- ranges, or both, and it may be positive or negative. There's a
- function that sorts all this out. */
-
- else
- {
- ecode = code + GET(code, 1);
- if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
- }
-
- /* At this point, isinclass is set for all kinds of class, and ecode
- points to the byte after the end of the class. If there is a
- quantifier, this is where it will be. */
-
- next_state_offset = ecode - start_code;
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- ADD_ACTIVE(next_state_offset + 1, 0);
- if (isinclass) { ADD_NEW(state_offset, 0); }
- break;
-
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- count = current_state->count; /* Already matched */
- if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
- if (isinclass) { count++; ADD_NEW(state_offset, count); }
- break;
-
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- ADD_ACTIVE(next_state_offset + 1, 0);
- if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- count = current_state->count; /* Already matched */
- if (count >= GET2(ecode, 1))
- { ADD_ACTIVE(next_state_offset + 5, 0); }
- if (isinclass)
- {
- int max = GET2(ecode, 3);
- if (++count >= max && max != 0) /* Max 0 => no limit */
- { ADD_NEW(next_state_offset + 5, 0); }
- else
- { ADD_NEW(state_offset, count); }
- }
- break;
-
- default:
- if (isinclass) { ADD_NEW(next_state_offset, 0); }
- break;
- }
- }
- break;
-
-/* ========================================================================== */
- /* These are the opcodes for fancy brackets of various kinds. We have
- to use recursion in order to handle them. The "always failing" assersion
- (?!) is optimised when compiling to OP_FAIL, so we have to support that,
- though the other "backtracking verbs" are not supported. */
-
- case OP_FAIL:
- break;
-
- case OP_ASSERT:
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- {
- int rc;
- int local_offsets[2];
- int local_workspace[1000];
- const uschar *endasscode = code + GET(code, 1);
-
- while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
-
- rc = internal_dfa_exec(
- md, /* static match data */
- code, /* this subexpression's code */
- ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
- local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(int), /* size of same */
- local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- ims, /* the current ims flags */
- rlevel, /* function recursion level */
- recursing); /* pass on regex recursion */
-
- if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
- { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_COND:
- case OP_SCOND:
- {
- int local_offsets[1000];
- int local_workspace[1000];
- int codelink = GET(code, 1);
- int condcode;
-
- /* Because of the way auto-callout works during compile, a callout item
- is inserted between OP_COND and an assertion condition. This does not
- happen for the other conditions. */
-
- if (code[LINK_SIZE+1] == OP_CALLOUT)
- {
- rrc = 0;
- if (pcre_callout != NULL)
- {
- pcre_callout_block cb;
- cb.version = 1; /* Version 1 of the callout block */
- cb.callout_number = code[LINK_SIZE+2];
- cb.offset_vector = offsets;
- cb.subject = (PCRE_SPTR)start_subject;
- cb.subject_length = end_subject - start_subject;
- cb.start_match = current_subject - start_subject;
- cb.current_position = ptr - start_subject;
- cb.pattern_position = GET(code, LINK_SIZE + 3);
- cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
- cb.capture_top = 1;
- cb.capture_last = -1;
- cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
- }
- if (rrc > 0) break; /* Fail this thread */
- code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
- }
-
- condcode = code[LINK_SIZE+1];
-
- /* Back reference conditions are not supported */
-
- if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
-
- /* The DEFINE condition is always false */
-
- if (condcode == OP_DEF)
- { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
-
- /* The only supported version of OP_RREF is for the value RREF_ANY,
- which means "test if in any recursion". We can't test for specifically
- recursed groups. */
-
- else if (condcode == OP_RREF)
- {
- int value = GET2(code, LINK_SIZE+2);
- if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
- if (recursing > 0)
- { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
- else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
- }
-
- /* Otherwise, the condition is an assertion */
-
- else
- {
- int rc;
- const uschar *asscode = code + LINK_SIZE + 1;
- const uschar *endasscode = asscode + GET(asscode, 1);
-
- while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
-
- rc = internal_dfa_exec(
- md, /* fixed match data */
- asscode, /* this subexpression's code */
- ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
- local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(int), /* size of same */
- local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- ims, /* the current ims flags */
- rlevel, /* function recursion level */
- recursing); /* pass on regex recursion */
-
- if ((rc >= 0) ==
- (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
- { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
- else
- { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
- }
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_RECURSE:
- {
- int local_offsets[1000];
- int local_workspace[1000];
- int rc;
-
- DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
- recursing + 1));
-
- rc = internal_dfa_exec(
- md, /* fixed match data */
- start_code + GET(code, 1), /* this subexpression's code */
- ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
- local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(int), /* size of same */
- local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- ims, /* the current ims flags */
- rlevel, /* function recursion level */
- recursing + 1); /* regex recurse level */
-
- DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
- recursing + 1, rc));
-
- /* Ran out of internal offsets */
-
- if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
-
- /* For each successful matched substring, set up the next state with a
- count of characters to skip before trying it. Note that the count is in
- characters, not bytes. */
-
- if (rc > 0)
- {
- for (rc = rc*2 - 2; rc >= 0; rc -= 2)
- {
- const uschar *p = start_subject + local_offsets[rc];
- const uschar *pp = start_subject + local_offsets[rc+1];
- int charcount = local_offsets[rc+1] - local_offsets[rc];
- while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
- if (charcount > 0)
- {
- ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
- }
- else
- {
- ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
- }
- }
- }
- else if (rc != PCRE_ERROR_NOMATCH) return rc;
- }
- break;
-
- /*-----------------------------------------------------------------*/
- case OP_ONCE:
- {
- int local_offsets[2];
- int local_workspace[1000];
-
- int rc = internal_dfa_exec(
- md, /* fixed match data */
- code, /* this subexpression's code */
- ptr, /* where we currently are */
- ptr - start_subject, /* start offset */
- local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(int), /* size of same */
- local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- ims, /* the current ims flags */
- rlevel, /* function recursion level */
- recursing); /* pass on regex recursion */
-
- if (rc >= 0)
- {
- const uschar *end_subpattern = code;
- int charcount = local_offsets[1] - local_offsets[0];
- int next_state_offset, repeat_state_offset;
-
- do { end_subpattern += GET(end_subpattern, 1); }
- while (*end_subpattern == OP_ALT);
- next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
-
- /* If the end of this subpattern is KETRMAX or KETRMIN, we must
- arrange for the repeat state also to be added to the relevant list.
- Calculate the offset, or set -1 for no repeat. */
-
- repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
- *end_subpattern == OP_KETRMIN)?
- end_subpattern - start_code - GET(end_subpattern, 1) : -1;
-
- /* If we have matched an empty string, add the next state at the
- current character pointer. This is important so that the duplicate
- checking kicks in, which is what breaks infinite loops that match an
- empty string. */
-
- if (charcount == 0)
- {
- ADD_ACTIVE(next_state_offset, 0);
- }
-
- /* Optimization: if there are no more active states, and there
- are no new states yet set up, then skip over the subject string
- right here, to save looping. Otherwise, set up the new state to swing
- into action when the end of the substring is reached. */
-
- else if (i + 1 >= active_count && new_count == 0)
- {
- ptr += charcount;
- clen = 0;
- ADD_NEW(next_state_offset, 0);
-
- /* If we are adding a repeat state at the new character position,
- we must fudge things so that it is the only current state.
- Otherwise, it might be a duplicate of one we processed before, and
- that would cause it to be skipped. */
-
- if (repeat_state_offset >= 0)
- {
- next_active_state = active_states;
- active_count = 0;
- i = -1;
- ADD_ACTIVE(repeat_state_offset, 0);
- }
- }
- else
- {
- const uschar *p = start_subject + local_offsets[0];
- const uschar *pp = start_subject + local_offsets[1];
- while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
- ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
- if (repeat_state_offset >= 0)
- { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
- }
-
- }
- else if (rc != PCRE_ERROR_NOMATCH) return rc;
- }
- break;
-
-
-/* ========================================================================== */
- /* Handle callouts */
-
- case OP_CALLOUT:
- rrc = 0;
- if (pcre_callout != NULL)
- {
- pcre_callout_block cb;
- cb.version = 1; /* Version 1 of the callout block */
- cb.callout_number = code[1];
- cb.offset_vector = offsets;
- cb.subject = (PCRE_SPTR)start_subject;
- cb.subject_length = end_subject - start_subject;
- cb.start_match = current_subject - start_subject;
- cb.current_position = ptr - start_subject;
- cb.pattern_position = GET(code, 2);
- cb.next_item_length = GET(code, 2 + LINK_SIZE);
- cb.capture_top = 1;
- cb.capture_last = -1;
- cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
- }
- if (rrc == 0)
- { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
- break;
-
-
-/* ========================================================================== */
- default: /* Unsupported opcode */
- return PCRE_ERROR_DFA_UITEM;
- }
-
- NEXT_ACTIVE_STATE: continue;
-
- } /* End of loop scanning active states */
-
- /* We have finished the processing at the current subject character. If no
- new states have been set for the next character, we have found all the
- matches that we are going to find. If we are at the top level and partial
- matching has been requested, check for appropriate conditions. */
-
- if (new_count <= 0)
- {
- if (match_count < 0 && /* No matches found */
- rlevel == 1 && /* Top level match function */
- (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
- ptr >= end_subject && /* Reached end of subject */
- ptr > current_subject) /* Matched non-empty string */
- {
- if (offsetcount >= 2)
- {
- offsets[0] = current_subject - start_subject;
- offsets[1] = end_subject - start_subject;
- }
- match_count = PCRE_ERROR_PARTIAL;
- }
-
- DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
- "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
- rlevel*2-2, SP));
- break; /* In effect, "return", but see the comment below */
- }
-
- /* One or more states are active for the next character. */
-
- ptr += clen; /* Advance to next subject character */
- } /* Loop to move along the subject string */
-
-/* Control gets here from "break" a few lines above. We do it this way because
-if we use "return" above, we have compiler trouble. Some compilers warn if
-there's nothing here because they think the function doesn't return a value. On
-the other hand, if we put a dummy statement here, some more clever compilers
-complain that it can't be reached. Sigh. */
-
-return match_count;
-}
-
-
-
-
-/*************************************************
-* Execute a Regular Expression - DFA engine *
-*************************************************/
-
-/* This external function applies a compiled re to a subject string using a DFA
-engine. This function calls the internal function multiple times if the pattern
-is not anchored.
-
-Arguments:
- argument_re points to the compiled expression
- extra_data points to extra data or is NULL
- subject points to the subject string
- length length of subject string (may contain binary zeros)
- start_offset where to start in the subject string
- options option bits
- offsets vector of match offsets
- offsetcount size of same
- workspace workspace vector
- wscount size of same
-
-Returns: > 0 => number of match offset pairs placed in offsets
- = 0 => offsets overflowed; longest matches are present
- -1 => failed to match
- < -1 => some kind of unexpected problem
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
- const char *subject, int length, int start_offset, int options, int *offsets,
- int offsetcount, int *workspace, int wscount)
-{
-real_pcre *re = (real_pcre *)argument_re;
-dfa_match_data match_block;
-dfa_match_data *md = &match_block;
-BOOL utf8, anchored, startline, firstline;
-const uschar *current_subject, *end_subject, *lcc;
-
-pcre_study_data internal_study;
-const pcre_study_data *study = NULL;
-real_pcre internal_re;
-
-const uschar *req_byte_ptr;
-const uschar *start_bits = NULL;
-BOOL first_byte_caseless = FALSE;
-BOOL req_byte_caseless = FALSE;
-int first_byte = -1;
-int req_byte = -1;
-int req_byte2 = -1;
-int newline;
-
-/* Plausibility checks */
-
-if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
-if (re == NULL || subject == NULL || workspace == NULL ||
- (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
-if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
-if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
-
-/* We need to find the pointer to any study data before we test for byte
-flipping, so we scan the extra_data block first. This may set two fields in the
-match block, so we must initialize them beforehand. However, the other fields
-in the match block must not be set until after the byte flipping. */
-
-md->tables = re->tables;
-md->callout_data = NULL;
-
-if (extra_data != NULL)
- {
- unsigned int flags = extra_data->flags;
- if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
- study = (const pcre_study_data *)extra_data->study_data;
- if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
- if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
- return PCRE_ERROR_DFA_UMLIMIT;
- if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
- md->callout_data = extra_data->callout_data;
- if ((flags & PCRE_EXTRA_TABLES) != 0)
- md->tables = extra_data->tables;
- }
-
-/* Check that the first field in the block is the magic number. If it is not,
-test for a regex that was compiled on a host of opposite endianness. If this is
-the case, flipped values are put in internal_re and internal_study if there was
-study data too. */
-
-if (re->magic_number != MAGIC_NUMBER)
- {
- re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
- if (re == NULL) return PCRE_ERROR_BADMAGIC;
- if (study != NULL) study = &internal_study;
- }
-
-/* Set some local values */
-
-current_subject = (const unsigned char *)subject + start_offset;
-end_subject = (const unsigned char *)subject + length;
-req_byte_ptr = current_subject - 1;
-
-#ifdef SUPPORT_UTF8
-utf8 = (re->options & PCRE_UTF8) != 0;
-#else
-utf8 = FALSE;
-#endif
-
-anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
- (re->options & PCRE_ANCHORED) != 0;
-
-/* The remaining fixed data for passing around. */
-
-md->start_code = (const uschar *)argument_re +
- re->name_table_offset + re->name_count * re->name_entry_size;
-md->start_subject = (const unsigned char *)subject;
-md->end_subject = end_subject;
-md->moptions = options;
-md->poptions = re->options;
-
-/* If the BSR option is not set at match time, copy what was set
-at compile time. */
-
-if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
- {
- if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
- md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
-#ifdef BSR_ANYCRLF
- else md->moptions |= PCRE_BSR_ANYCRLF;
-#endif
- }
-
-/* Handle different types of newline. The three bits give eight cases. If
-nothing is set at run time, whatever was used at compile time applies. */
-
-switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
- PCRE_NEWLINE_BITS)
- {
- case 0: newline = NEWLINE; break; /* Compile-time default */
- case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
- case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
- case PCRE_NEWLINE_CR+
- PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
- case PCRE_NEWLINE_ANY: newline = -1; break;
- case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
- default: return PCRE_ERROR_BADNEWLINE;
- }
-
-if (newline == -2)
- {
- md->nltype = NLTYPE_ANYCRLF;
- }
-else if (newline < 0)
- {
- md->nltype = NLTYPE_ANY;
- }
-else
- {
- md->nltype = NLTYPE_FIXED;
- if (newline > 255)
- {
- md->nllen = 2;
- md->nl[0] = (newline >> 8) & 255;
- md->nl[1] = newline & 255;
- }
- else
- {
- md->nllen = 1;
- md->nl[0] = newline;
- }
- }
-
-/* Check a UTF-8 string if required. Unfortunately there's no way of passing
-back the character offset. */
-
-#ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
- {
- if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
- if (start_offset > 0 && start_offset < length)
- {
- int tb = ((uschar *)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
- }
- }
-#endif
-
-/* If the exec call supplied NULL for tables, use the inbuilt ones. This
-is a feature that makes it possible to save compiled regex and re-use them
-in other programs later. */
-
-if (md->tables == NULL) md->tables = _pcre_default_tables;
-
-/* The lower casing table and the "must be at the start of a line" flag are
-used in a loop when finding where to start. */
-
-lcc = md->tables + lcc_offset;
-startline = (re->flags & PCRE_STARTLINE) != 0;
-firstline = (re->options & PCRE_FIRSTLINE) != 0;
-
-/* Set up the first character to match, if available. The first_byte value is
-never set for an anchored regular expression, but the anchoring may be forced
-at run time, so we have to test for anchoring. The first char may be unset for
-an unanchored pattern, of course. If there's no first char and the pattern was
-studied, there may be a bitmap of possible first characters. */
-
-if (!anchored)
- {
- if ((re->flags & PCRE_FIRSTSET) != 0)
- {
- first_byte = re->first_byte & 255;
- if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
- first_byte = lcc[first_byte];
- }
- else
- {
- if (startline && study != NULL &&
- (study->options & PCRE_STUDY_MAPPED) != 0)
- start_bits = study->start_bits;
- }
- }
-
-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
-
-if ((re->flags & PCRE_REQCHSET) != 0)
- {
- req_byte = re->req_byte & 255;
- req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
- req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
- }
-
-/* Call the main matching function, looping for a non-anchored regex after a
-failed match. If not restarting, perform certain optimizations at the start of
-a match. */
-
-for (;;)
- {
- int rc;
-
- if ((options & PCRE_DFA_RESTART) == 0)
- {
- const uschar *save_end_subject = end_subject;
-
- /* If firstline is TRUE, the start of the match is constrained to the first
- line of a multiline string. Implement this by temporarily adjusting
- end_subject so that we stop scanning at a newline. If the match fails at
- the newline, later code breaks this loop. */
-
- if (firstline)
- {
- USPTR t = current_subject;
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (t < md->end_subject && !IS_NEWLINE(t))
- {
- t++;
- while (t < end_subject && (*t & 0xc0) == 0x80) t++;
- }
- }
- else
-#endif
- while (t < md->end_subject && !IS_NEWLINE(t)) t++;
- end_subject = t;
- }
-
- /* There are some optimizations that avoid running the match if a known
- starting point is not found, or if a known later character is not present.
- However, there is an option that disables these, for testing and for
- ensuring that all callouts do actually occur. */
-
- if ((options & PCRE_NO_START_OPTIMIZE) == 0)
- {
-
- /* Advance to a known first byte. */
-
- if (first_byte >= 0)
- {
- if (first_byte_caseless)
- while (current_subject < end_subject &&
- lcc[*current_subject] != first_byte)
- current_subject++;
- else
- while (current_subject < end_subject &&
- *current_subject != first_byte)
- current_subject++;
- }
-
- /* Or to just after a linebreak for a multiline match if possible */
-
- else if (startline)
- {
- if (current_subject > md->start_subject + start_offset)
- {
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (current_subject < end_subject &&
- !WAS_NEWLINE(current_subject))
- {
- current_subject++;
- while(current_subject < end_subject &&
- (*current_subject & 0xc0) == 0x80)
- current_subject++;
- }
- }
- else
-#endif
- while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
- current_subject++;
-
- /* If we have just passed a CR and the newline option is ANY or
- ANYCRLF, and we are now at a LF, advance the match position by one
- more character. */
-
- if (current_subject[-1] == CHAR_CR &&
- (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
- current_subject < end_subject &&
- *current_subject == CHAR_NL)
- current_subject++;
- }
- }
-
- /* Or to a non-unique first char after study */
-
- else if (start_bits != NULL)
- {
- while (current_subject < end_subject)
- {
- register unsigned int c = *current_subject;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
- else break;
- }
- }
- }
-
- /* Restore fudged end_subject */
-
- end_subject = save_end_subject;
- }
-
- /* If req_byte is set, we know that that character must appear in the subject
- for the match to succeed. If the first character is set, req_byte must be
- later in the subject; otherwise the test starts at the match point. This
- optimization can save a huge amount of work in patterns with nested unlimited
- repeats that aren't going to match. Writing separate code for cased/caseless
- versions makes it go faster, as does using an autoincrement and backing off
- on a match.
-
- HOWEVER: when the subject string is very, very long, searching to its end can
- take a long time, and give bad performance on quite ordinary patterns. This
- showed up when somebody was matching /^C/ on a 32-megabyte string... so we
- don't do this when the string is sufficiently long.
-
- ALSO: this processing is disabled when partial matching is requested, and can
- also be explicitly deactivated. */
-
- if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
- req_byte >= 0 &&
- end_subject - current_subject < REQ_BYTE_MAX &&
- (options & PCRE_PARTIAL) == 0)
- {
- register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
-
- /* We don't need to repeat the search if we haven't yet reached the
- place we found it at last time. */
-
- if (p > req_byte_ptr)
- {
- if (req_byte_caseless)
- {
- while (p < end_subject)
- {
- register int pp = *p++;
- if (pp == req_byte || pp == req_byte2) { p--; break; }
- }
- }
- else
- {
- while (p < end_subject)
- {
- if (*p++ == req_byte) { p--; break; }
- }
- }
-
- /* If we can't find the required character, break the matching loop,
- which will cause a return or PCRE_ERROR_NOMATCH. */
-
- if (p >= end_subject) break;
-
- /* If we have found the required character, save the point where we
- found it, so that we don't search again next time round the loop if
- the start hasn't passed this character yet. */
-
- req_byte_ptr = p;
- }
- }
-
- /* OK, now we can do the business */
-
- rc = internal_dfa_exec(
- md, /* fixed match data */
- md->start_code, /* this subexpression's code */
- current_subject, /* where we currently are */
- start_offset, /* start offset in subject */
- offsets, /* offset vector */
- offsetcount, /* size of same */
- workspace, /* workspace vector */
- wscount, /* size of same */
- re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
- 0, /* function recurse level */
- 0); /* regex recurse level */
-
- /* Anything other than "no match" means we are done, always; otherwise, carry
- on only if not anchored. */
-
- if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
-
- /* Advance to the next subject character unless we are at the end of a line
- and firstline is set. */
-
- if (firstline && IS_NEWLINE(current_subject)) break;
- current_subject++;
- if (utf8)
- {
- while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
- current_subject++;
- }
- if (current_subject > end_subject) break;
-
- /* If we have just passed a CR and we are now at a LF, and the pattern does
- not contain any explicit matches for \r or \n, and the newline option is CRLF
- or ANY or ANYCRLF, advance the match position by one more character. */
-
- if (current_subject[-1] == CHAR_CR &&
- current_subject < end_subject &&
- *current_subject == CHAR_NL &&
- (re->flags & PCRE_HASCRORLF) == 0 &&
- (md->nltype == NLTYPE_ANY ||
- md->nltype == NLTYPE_ANYCRLF ||
- md->nllen == 2))
- current_subject++;
-
- } /* "Bumpalong" loop */
-
-return PCRE_ERROR_NOMATCH;
-}
-
-/* End of pcre_dfa_exec.c */
diff --git a/libs/pcre/pcre_exec.c b/libs/pcre/pcre_exec.c
deleted file mode 100644
index 9756362445..0000000000
--- a/libs/pcre/pcre_exec.c
+++ /dev/null
@@ -1,5050 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains pcre_exec(), the externally visible function that does
-pattern matching using an NFA algorithm, trying to mimic Perl as closely as
-possible. There are also some static supporting functions. */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#define NLBLOCK md /* Block containing newline information */
-#define PSSTART start_subject /* Field containing processed string start */
-#define PSEND end_subject /* Field containing processed string end */
-
-#include "pcre_internal.h"
-
-/* Undefine some potentially clashing cpp symbols */
-
-#undef min
-#undef max
-
-/* Flag bits for the match() function */
-
-#define match_condassert 0x01 /* Called to check a condition assertion */
-#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
-
-/* Non-error returns from the match() function. Error returns are externally
-defined PCRE_ERROR_xxx codes, which are all negative. */
-
-#define MATCH_MATCH 1
-#define MATCH_NOMATCH 0
-
-/* Special internal returns from the match() function. Make them sufficiently
-negative to avoid the external error codes. */
-
-#define MATCH_COMMIT (-999)
-#define MATCH_PRUNE (-998)
-#define MATCH_SKIP (-997)
-#define MATCH_THEN (-996)
-
-/* Maximum number of ints of offset to save on the stack for recursive calls.
-If the offset vector is bigger, malloc is used. This should be a multiple of 3,
-because the offset vector is always a multiple of 3 long. */
-
-#define REC_STACK_SAVE_MAX 30
-
-/* Min and max values for the common repeats; for the maxima, 0 => infinity */
-
-static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
-static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
-
-
-
-#ifdef PCRE_DEBUG
-/*************************************************
-* Debugging function to print chars *
-*************************************************/
-
-/* Print a sequence of chars in printable format, stopping at the end of the
-subject if the requested.
-
-Arguments:
- p points to characters
- length number to print
- is_subject TRUE if printing from within md->start_subject
- md pointer to matching data block, if is_subject is TRUE
-
-Returns: nothing
-*/
-
-static void
-pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
-{
-unsigned int c;
-if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
-while (length-- > 0)
- if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
-}
-#endif
-
-
-
-/*************************************************
-* Match a back-reference *
-*************************************************/
-
-/* If a back reference hasn't been set, the length that is passed is greater
-than the number of characters left in the string, so the match fails.
-
-Arguments:
- offset index into the offset vector
- eptr points into the subject
- length length to be matched
- md points to match data block
- ims the ims flags
-
-Returns: TRUE if matched
-*/
-
-static BOOL
-match_ref(int offset, register USPTR eptr, int length, match_data *md,
- unsigned long int ims)
-{
-USPTR p = md->start_subject + md->offset_vector[offset];
-
-#ifdef PCRE_DEBUG
-if (eptr >= md->end_subject)
- printf("matching subject ");
-else
- {
- printf("matching subject ");
- pchars(eptr, length, TRUE, md);
- }
-printf(" against backref ");
-pchars(p, length, FALSE, md);
-printf("\n");
-#endif
-
-/* Always fail if not enough characters left */
-
-if (length > md->end_subject - eptr) return FALSE;
-
-/* Separate the caseless case for speed. In UTF-8 mode we can only do this
-properly if Unicode properties are supported. Otherwise, we can check only
-ASCII characters. */
-
-if ((ims & PCRE_CASELESS) != 0)
- {
-#ifdef SUPPORT_UTF8
-#ifdef SUPPORT_UCP
- if (md->utf8)
- {
- USPTR endptr = eptr + length;
- while (eptr < endptr)
- {
- int c, d;
- GETCHARINC(c, eptr);
- GETCHARINC(d, p);
- if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
- }
- }
- else
-#endif
-#endif
-
- /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
- is no UCP support. */
-
- while (length-- > 0)
- { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
- }
-
-/* In the caseful case, we can just compare the bytes, whether or not we
-are in UTF-8 mode. */
-
-else
- { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
-
-return TRUE;
-}
-
-
-
-/***************************************************************************
-****************************************************************************
- RECURSION IN THE match() FUNCTION
-
-The match() function is highly recursive, though not every recursive call
-increases the recursive depth. Nevertheless, some regular expressions can cause
-it to recurse to a great depth. I was writing for Unix, so I just let it call
-itself recursively. This uses the stack for saving everything that has to be
-saved for a recursive call. On Unix, the stack can be large, and this works
-fine.
-
-It turns out that on some non-Unix-like systems there are problems with
-programs that use a lot of stack. (This despite the fact that every last chip
-has oodles of memory these days, and techniques for extending the stack have
-been known for decades.) So....
-
-There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
-calls by keeping local variables that need to be preserved in blocks of memory
-obtained from malloc() instead instead of on the stack. Macros are used to
-achieve this so that the actual code doesn't look very different to what it
-always used to.
-
-The original heap-recursive code used longjmp(). However, it seems that this
-can be very slow on some operating systems. Following a suggestion from Stan
-Switzer, the use of longjmp() has been abolished, at the cost of having to
-provide a unique number for each call to RMATCH. There is no way of generating
-a sequence of numbers at compile time in C. I have given them names, to make
-them stand out more clearly.
-
-Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
-FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
-tests. Furthermore, not using longjmp() means that local dynamic variables
-don't have indeterminate values; this has meant that the frame size can be
-reduced because the result can be "passed back" by straight setting of the
-variable instead of being passed in the frame.
-****************************************************************************
-***************************************************************************/
-
-/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
-below must be updated in sync. */
-
-enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
- RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
- RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
- RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
- RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
- RM51, RM52, RM53, RM54 };
-
-/* These versions of the macros use the stack, as normal. There are debugging
-versions and production versions. Note that the "rw" argument of RMATCH isn't
-actuall used in this definition. */
-
-#ifndef NO_RECURSE
-#define REGISTER register
-
-#ifdef PCRE_DEBUG
-#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
- { \
- printf("match() called in line %d\n", __LINE__); \
- rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
- printf("to line %d\n", __LINE__); \
- }
-#define RRETURN(ra) \
- { \
- printf("match() returned %d from line %d ", ra, __LINE__); \
- return ra; \
- }
-#else
-#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
- rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
-#define RRETURN(ra) return ra
-#endif
-
-#else
-
-
-/* These versions of the macros manage a private stack on the heap. Note that
-the "rd" argument of RMATCH isn't actually used in this definition. It's the md
-argument of match(), which never changes. */
-
-#define REGISTER
-
-#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
- {\
- heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
- frame->Xwhere = rw; \
- newframe->Xeptr = ra;\
- newframe->Xecode = rb;\
- newframe->Xmstart = mstart;\
- newframe->Xoffset_top = rc;\
- newframe->Xims = re;\
- newframe->Xeptrb = rf;\
- newframe->Xflags = rg;\
- newframe->Xrdepth = frame->Xrdepth + 1;\
- newframe->Xprevframe = frame;\
- frame = newframe;\
- DPRINTF(("restarting from line %d\n", __LINE__));\
- goto HEAP_RECURSE;\
- L_##rw:\
- DPRINTF(("jumped back to line %d\n", __LINE__));\
- }
-
-#define RRETURN(ra)\
- {\
- heapframe *newframe = frame;\
- frame = newframe->Xprevframe;\
- (pcre_stack_free)(newframe);\
- if (frame != NULL)\
- {\
- rrc = ra;\
- goto HEAP_RETURN;\
- }\
- return ra;\
- }
-
-
-/* Structure for remembering the local variables in a private frame */
-
-typedef struct heapframe {
- struct heapframe *Xprevframe;
-
- /* Function arguments that may change */
-
- USPTR Xeptr;
- const uschar *Xecode;
- USPTR Xmstart;
- int Xoffset_top;
- long int Xims;
- eptrblock *Xeptrb;
- int Xflags;
- unsigned int Xrdepth;
-
- /* Function local variables */
-
- USPTR Xcallpat;
-#ifdef SUPPORT_UTF8
- USPTR Xcharptr;
-#endif
- USPTR Xdata;
- USPTR Xnext;
- USPTR Xpp;
- USPTR Xprev;
- USPTR Xsaved_eptr;
-
- recursion_info Xnew_recursive;
-
- BOOL Xcur_is_word;
- BOOL Xcondition;
- BOOL Xprev_is_word;
-
- unsigned long int Xoriginal_ims;
-
-#ifdef SUPPORT_UCP
- int Xprop_type;
- int Xprop_value;
- int Xprop_fail_result;
- int Xprop_category;
- int Xprop_chartype;
- int Xprop_script;
- int Xoclength;
- uschar Xocchars[8];
-#endif
-
- int Xcodelink;
- int Xctype;
- unsigned int Xfc;
- int Xfi;
- int Xlength;
- int Xmax;
- int Xmin;
- int Xnumber;
- int Xoffset;
- int Xop;
- int Xsave_capture_last;
- int Xsave_offset1, Xsave_offset2, Xsave_offset3;
- int Xstacksave[REC_STACK_SAVE_MAX];
-
- eptrblock Xnewptrb;
-
- /* Where to jump back to */
-
- int Xwhere;
-
-} heapframe;
-
-#endif
-
-
-/***************************************************************************
-***************************************************************************/
-
-
-
-/*************************************************
-* Match from current position *
-*************************************************/
-
-/* This function is called recursively in many circumstances. Whenever it
-returns a negative (error) response, the outer incarnation must also return the
-same response.
-
-Performance note: It might be tempting to extract commonly used fields from the
-md structure (e.g. utf8, end_subject) into individual variables to improve
-performance. Tests using gcc on a SPARC disproved this; in the first case, it
-made performance worse.
-
-Arguments:
- eptr pointer to current character in subject
- ecode pointer to current position in compiled code
- mstart pointer to the current match start position (can be modified
- by encountering \K)
- offset_top current top pointer
- md pointer to "static" info for the match
- ims current /i, /m, and /s options
- eptrb pointer to chain of blocks containing eptr at start of
- brackets - for testing for empty matches
- flags can contain
- match_condassert - this is an assertion condition
- match_cbegroup - this is the start of an unlimited repeat
- group that can match an empty string
- rdepth the recursion depth
-
-Returns: MATCH_MATCH if matched ) these values are >= 0
- MATCH_NOMATCH if failed to match )
- a negative PCRE_ERROR_xxx value if aborted by an error condition
- (e.g. stopped by repeated call or recursion limit)
-*/
-
-static int
-match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
- int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
- int flags, unsigned int rdepth)
-{
-/* These variables do not need to be preserved over recursion in this function,
-so they can be ordinary variables in all cases. Mark some of them with
-"register" because they are used a lot in loops. */
-
-register int rrc; /* Returns from recursive calls */
-register int i; /* Used for loops not involving calls to RMATCH() */
-register unsigned int c; /* Character values not kept over RMATCH() calls */
-register BOOL utf8; /* Local copy of UTF-8 flag for speed */
-
-BOOL minimize, possessive; /* Quantifier options */
-int condcode;
-
-/* When recursion is not being used, all "local" variables that have to be
-preserved over calls to RMATCH() are part of a "frame" which is obtained from
-heap storage. Set up the top-level frame here; others are obtained from the
-heap whenever RMATCH() does a "recursion". See the macro definitions above. */
-
-#ifdef NO_RECURSE
-heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
-frame->Xprevframe = NULL; /* Marks the top level */
-
-/* Copy in the original argument variables */
-
-frame->Xeptr = eptr;
-frame->Xecode = ecode;
-frame->Xmstart = mstart;
-frame->Xoffset_top = offset_top;
-frame->Xims = ims;
-frame->Xeptrb = eptrb;
-frame->Xflags = flags;
-frame->Xrdepth = rdepth;
-
-/* This is where control jumps back to to effect "recursion" */
-
-HEAP_RECURSE:
-
-/* Macros make the argument variables come from the current frame */
-
-#define eptr frame->Xeptr
-#define ecode frame->Xecode
-#define mstart frame->Xmstart
-#define offset_top frame->Xoffset_top
-#define ims frame->Xims
-#define eptrb frame->Xeptrb
-#define flags frame->Xflags
-#define rdepth frame->Xrdepth
-
-/* Ditto for the local variables */
-
-#ifdef SUPPORT_UTF8
-#define charptr frame->Xcharptr
-#endif
-#define callpat frame->Xcallpat
-#define codelink frame->Xcodelink
-#define data frame->Xdata
-#define next frame->Xnext
-#define pp frame->Xpp
-#define prev frame->Xprev
-#define saved_eptr frame->Xsaved_eptr
-
-#define new_recursive frame->Xnew_recursive
-
-#define cur_is_word frame->Xcur_is_word
-#define condition frame->Xcondition
-#define prev_is_word frame->Xprev_is_word
-
-#define original_ims frame->Xoriginal_ims
-
-#ifdef SUPPORT_UCP
-#define prop_type frame->Xprop_type
-#define prop_value frame->Xprop_value
-#define prop_fail_result frame->Xprop_fail_result
-#define prop_category frame->Xprop_category
-#define prop_chartype frame->Xprop_chartype
-#define prop_script frame->Xprop_script
-#define oclength frame->Xoclength
-#define occhars frame->Xocchars
-#endif
-
-#define ctype frame->Xctype
-#define fc frame->Xfc
-#define fi frame->Xfi
-#define length frame->Xlength
-#define max frame->Xmax
-#define min frame->Xmin
-#define number frame->Xnumber
-#define offset frame->Xoffset
-#define op frame->Xop
-#define save_capture_last frame->Xsave_capture_last
-#define save_offset1 frame->Xsave_offset1
-#define save_offset2 frame->Xsave_offset2
-#define save_offset3 frame->Xsave_offset3
-#define stacksave frame->Xstacksave
-
-#define newptrb frame->Xnewptrb
-
-/* When recursion is being used, local variables are allocated on the stack and
-get preserved during recursion in the normal way. In this environment, fi and
-i, and fc and c, can be the same variables. */
-
-#else /* NO_RECURSE not defined */
-#define fi i
-#define fc c
-
-
-#ifdef SUPPORT_UTF8 /* Many of these variables are used only */
-const uschar *charptr; /* in small blocks of the code. My normal */
-#endif /* style of coding would have declared */
-const uschar *callpat; /* them within each of those blocks. */
-const uschar *data; /* However, in order to accommodate the */
-const uschar *next; /* version of this code that uses an */
-USPTR pp; /* external "stack" implemented on the */
-const uschar *prev; /* heap, it is easier to declare them all */
-USPTR saved_eptr; /* here, so the declarations can be cut */
- /* out in a block. The only declarations */
-recursion_info new_recursive; /* within blocks below are for variables */
- /* that do not have to be preserved over */
-BOOL cur_is_word; /* a recursive call to RMATCH(). */
-BOOL condition;
-BOOL prev_is_word;
-
-unsigned long int original_ims;
-
-#ifdef SUPPORT_UCP
-int prop_type;
-int prop_value;
-int prop_fail_result;
-int prop_category;
-int prop_chartype;
-int prop_script;
-int oclength;
-uschar occhars[8];
-#endif
-
-int codelink;
-int ctype;
-int length;
-int max;
-int min;
-int number;
-int offset;
-int op;
-int save_capture_last;
-int save_offset1, save_offset2, save_offset3;
-int stacksave[REC_STACK_SAVE_MAX];
-
-eptrblock newptrb;
-#endif /* NO_RECURSE */
-
-/* These statements are here to stop the compiler complaining about unitialized
-variables. */
-
-#ifdef SUPPORT_UCP
-prop_value = 0;
-prop_fail_result = 0;
-#endif
-
-
-/* This label is used for tail recursion, which is used in a few cases even
-when NO_RECURSE is not defined, in order to reduce the amount of stack that is
-used. Thanks to Ian Taylor for noticing this possibility and sending the
-original patch. */
-
-TAIL_RECURSE:
-
-/* OK, now we can get on with the real code of the function. Recursive calls
-are specified by the macro RMATCH and RRETURN is used to return. When
-NO_RECURSE is *not* defined, these just turn into a recursive call to match()
-and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
-defined). However, RMATCH isn't like a function call because it's quite a
-complicated macro. It has to be used in one particular way. This shouldn't,
-however, impact performance when true recursion is being used. */
-
-#ifdef SUPPORT_UTF8
-utf8 = md->utf8; /* Local copy of the flag */
-#else
-utf8 = FALSE;
-#endif
-
-/* First check that we haven't called match() too many times, or that we
-haven't exceeded the recursive call limit. */
-
-if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
-if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
-
-original_ims = ims; /* Save for resetting on ')' */
-
-/* At the start of a group with an unlimited repeat that may match an empty
-string, the match_cbegroup flag is set. When this is the case, add the current
-subject pointer to the chain of such remembered pointers, to be checked when we
-hit the closing ket, in order to break infinite loops that match no characters.
-When match() is called in other circumstances, don't add to the chain. The
-match_cbegroup flag must NOT be used with tail recursion, because the memory
-block that is used is on the stack, so a new one may be required for each
-match(). */
-
-if ((flags & match_cbegroup) != 0)
- {
- newptrb.epb_saved_eptr = eptr;
- newptrb.epb_prev = eptrb;
- eptrb = &newptrb;
- }
-
-/* Now start processing the opcodes. */
-
-for (;;)
- {
- minimize = possessive = FALSE;
- op = *ecode;
-
- /* For partial matching, remember if we ever hit the end of the subject after
- matching at least one subject character. */
-
- if (md->partial &&
- eptr >= md->end_subject &&
- eptr > mstart)
- md->hitend = TRUE;
-
- switch(op)
- {
- case OP_FAIL:
- RRETURN(MATCH_NOMATCH);
-
- case OP_PRUNE:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM51);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_PRUNE);
-
- case OP_COMMIT:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM52);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_COMMIT);
-
- case OP_SKIP:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM53);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- md->start_match_ptr = eptr; /* Pass back current position */
- RRETURN(MATCH_SKIP);
-
- case OP_THEN:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM54);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_THEN);
-
- /* Handle a capturing bracket. If there is space in the offset vector, save
- the current subject position in the working slot at the top of the vector.
- We mustn't change the current values of the data slot, because they may be
- set from a previous iteration of this group, and be referred to by a
- reference inside the group.
-
- If the bracket fails to match, we need to restore this value and also the
- values of the final offsets, in case they were set by a previous iteration
- of the same bracket.
-
- If there isn't enough space in the offset vector, treat this as if it were
- a non-capturing bracket. Don't worry about setting the flag for the error
- case here; that is handled in the code for KET. */
-
- case OP_CBRA:
- case OP_SCBRA:
- number = GET2(ecode, 1+LINK_SIZE);
- offset = number << 1;
-
-#ifdef PCRE_DEBUG
- printf("start bracket %d\n", number);
- printf("subject=");
- pchars(eptr, 16, TRUE, md);
- printf("\n");
-#endif
-
- if (offset < md->offset_max)
- {
- save_offset1 = md->offset_vector[offset];
- save_offset2 = md->offset_vector[offset+1];
- save_offset3 = md->offset_vector[md->offset_end - number];
- save_capture_last = md->capture_last;
-
- DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
- md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
-
- flags = (op == OP_SCBRA)? match_cbegroup : 0;
- do
- {
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM1);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- md->capture_last = save_capture_last;
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
-
- DPRINTF(("bracket %d failed\n", number));
-
- md->offset_vector[offset] = save_offset1;
- md->offset_vector[offset+1] = save_offset2;
- md->offset_vector[md->offset_end - number] = save_offset3;
-
- RRETURN(MATCH_NOMATCH);
- }
-
- /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
- as a non-capturing bracket. */
-
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
-
- DPRINTF(("insufficient capture room: treat as non-capturing\n"));
-
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
-
- /* Non-capturing bracket. Loop for all the alternatives. When we get to the
- final alternative within the brackets, we would return the result of a
- recursive call to match() whatever happened. We can reduce stack usage by
- turning this into a tail recursion, except in the case when match_cbegroup
- is set.*/
-
- case OP_BRA:
- case OP_SBRA:
- DPRINTF(("start non-capturing bracket\n"));
- flags = (op >= OP_SBRA)? match_cbegroup : 0;
- for (;;)
- {
- if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
- {
- if (flags == 0) /* Not a possibly empty group */
- {
- ecode += _pcre_OP_lengths[*ecode];
- DPRINTF(("bracket 0 tail recursion\n"));
- goto TAIL_RECURSE;
- }
-
- /* Possibly empty group; can't use tail recursion. */
-
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
- eptrb, flags, RM48);
- RRETURN(rrc);
- }
-
- /* For non-final alternatives, continue the loop for a NOMATCH result;
- otherwise return. */
-
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
- eptrb, flags, RM2);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode, 1);
- }
- /* Control never reaches here. */
-
- /* Conditional group: compilation checked that there are no more than
- two branches. If the condition is false, skipping the first branch takes us
- past the end if there is only one branch, but that's OK because that is
- exactly what going to the ket would do. As there is only one branch to be
- obeyed, we can use tail recursion to avoid using another stack frame. */
-
- case OP_COND:
- case OP_SCOND:
- codelink= GET(ecode, 1);
-
- /* Because of the way auto-callout works during compile, a callout item is
- inserted between OP_COND and an assertion condition. */
-
- if (ecode[LINK_SIZE+1] == OP_CALLOUT)
- {
- if (pcre_callout != NULL)
- {
- pcre_callout_block cb;
- cb.version = 1; /* Version 1 of the callout block */
- cb.callout_number = ecode[LINK_SIZE+2];
- cb.offset_vector = md->offset_vector;
- cb.subject = (PCRE_SPTR)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = mstart - md->start_subject;
- cb.current_position = eptr - md->start_subject;
- cb.pattern_position = GET(ecode, LINK_SIZE + 3);
- cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
- cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
- cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
- if (rrc < 0) RRETURN(rrc);
- }
- ecode += _pcre_OP_lengths[OP_CALLOUT];
- }
-
- condcode = ecode[LINK_SIZE+1];
-
- /* Now see what the actual condition is */
-
- if (condcode == OP_RREF) /* Recursion test */
- {
- offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
- condition = md->recursive != NULL &&
- (offset == RREF_ANY || offset == md->recursive->group_num);
- ecode += condition? 3 : GET(ecode, 1);
- }
-
- else if (condcode == OP_CREF) /* Group used test */
- {
- offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
- condition = offset < offset_top && md->offset_vector[offset] >= 0;
- ecode += condition? 3 : GET(ecode, 1);
- }
-
- else if (condcode == OP_DEF) /* DEFINE - always false */
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
-
- /* The condition is an assertion. Call match() to evaluate it - setting
- the final argument match_condassert causes it to stop at the end of an
- assertion. */
-
- else
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_condassert, RM3);
- if (rrc == MATCH_MATCH)
- {
- condition = TRUE;
- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
- while (*ecode == OP_ALT) ecode += GET(ecode, 1);
- }
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
- {
- RRETURN(rrc); /* Need braces because of following else */
- }
- else
- {
- condition = FALSE;
- ecode += codelink;
- }
- }
-
- /* We are now at the branch that is to be obeyed. As there is only one,
- we can use tail recursion to avoid using another stack frame, except when
- match_cbegroup is required for an unlimited repeat of a possibly empty
- group. If the second alternative doesn't exist, we can just plough on. */
-
- if (condition || *ecode == OP_ALT)
- {
- ecode += 1 + LINK_SIZE;
- if (op == OP_SCOND) /* Possibly empty group */
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
- RRETURN(rrc);
- }
- else /* Group must match something */
- {
- flags = 0;
- goto TAIL_RECURSE;
- }
- }
- else /* Condition false & no alternative */
- {
- ecode += 1 + LINK_SIZE;
- }
- break;
-
-
- /* End of the pattern, either real or forced. If we are in a top-level
- recursion, we should restore the offsets appropriately and continue from
- after the call. */
-
- case OP_ACCEPT:
- case OP_END:
- if (md->recursive != NULL && md->recursive->group_num == 0)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("End of pattern in a (?0) recursion\n"));
- md->recursive = rec->prevrec;
- memmove(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- mstart = rec->save_start;
- ims = original_ims;
- ecode = rec->after_call;
- break;
- }
-
- /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
- string - backtracking will then try other alternatives, if any. */
-
- if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
- md->end_match_ptr = eptr; /* Record where we ended */
- md->end_offset_top = offset_top; /* and how many extracts were taken */
- md->start_match_ptr = mstart; /* and the start (\K can modify) */
- RRETURN(MATCH_MATCH);
-
- /* Change option settings */
-
- case OP_OPT:
- ims = ecode[1];
- ecode += 2;
- DPRINTF(("ims set to %02lx\n", ims));
- break;
-
- /* Assertion brackets. Check the alternative branches in turn - the
- matching won't pass the KET for an assertion. If any one branch matches,
- the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
- start of each branch to move the current point backwards, so the code at
- this level is identical to the lookahead case. */
-
- case OP_ASSERT:
- case OP_ASSERTBACK:
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
- RM4);
- if (rrc == MATCH_MATCH) break;
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
- if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
-
- /* If checking an assertion for a condition, return MATCH_MATCH. */
-
- if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
-
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
-
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- ecode += 1 + LINK_SIZE;
- offset_top = md->end_offset_top;
- continue;
-
- /* Negative assertion: all branches must fail to match */
-
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK_NOT:
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
- RM5);
- if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
-
- if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
-
- ecode += 1 + LINK_SIZE;
- continue;
-
- /* Move the subject pointer back. This occurs only at the start of
- each branch of a lookbehind assertion. If we are too close to the start to
- move back, this match function fails. When working with UTF-8 we move
- back a number of characters, not bytes. */
-
- case OP_REVERSE:
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- i = GET(ecode, 1);
- while (i-- > 0)
- {
- eptr--;
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
- BACKCHAR(eptr);
- }
- }
- else
-#endif
-
- /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
-
- {
- eptr -= GET(ecode, 1);
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
- }
-
- /* Skip to next op code */
-
- ecode += 1 + LINK_SIZE;
- break;
-
- /* The callout item calls an external function, if one is provided, passing
- details of the match so far. This is mainly for debugging, though the
- function is able to force a failure. */
-
- case OP_CALLOUT:
- if (pcre_callout != NULL)
- {
- pcre_callout_block cb;
- cb.version = 1; /* Version 1 of the callout block */
- cb.callout_number = ecode[1];
- cb.offset_vector = md->offset_vector;
- cb.subject = (PCRE_SPTR)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = mstart - md->start_subject;
- cb.current_position = eptr - md->start_subject;
- cb.pattern_position = GET(ecode, 2);
- cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
- cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
- cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
- if (rrc < 0) RRETURN(rrc);
- }
- ecode += 2 + 2*LINK_SIZE;
- break;
-
- /* Recursion either matches the current regex, or some subexpression. The
- offset data is the offset to the starting bracket from the start of the
- whole pattern. (This is so that it works from duplicated subpatterns.)
-
- If there are any capturing brackets started but not finished, we have to
- save their starting points and reinstate them after the recursion. However,
- we don't know how many such there are (offset_top records the completed
- total) so we just have to save all the potential data. There may be up to
- 65535 such values, which is too large to put on the stack, but using malloc
- for small numbers seems expensive. As a compromise, the stack is used when
- there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
- is used. A problem is what to do if the malloc fails ... there is no way of
- returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
- values on the stack, and accept that the rest may be wrong.
-
- There are also other values that have to be saved. We use a chained
- sequence of blocks that actually live on the stack. Thanks to Robin Houston
- for the original version of this logic. */
-
- case OP_RECURSE:
- {
- callpat = md->start_code + GET(ecode, 1);
- new_recursive.group_num = (callpat == md->start_code)? 0 :
- GET2(callpat, 1 + LINK_SIZE);
-
- /* Add to "recursing stack" */
-
- new_recursive.prevrec = md->recursive;
- md->recursive = &new_recursive;
-
- /* Find where to continue from afterwards */
-
- ecode += 1 + LINK_SIZE;
- new_recursive.after_call = ecode;
-
- /* Now save the offset data. */
-
- new_recursive.saved_max = md->offset_end;
- if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
- new_recursive.offset_save = stacksave;
- else
- {
- new_recursive.offset_save =
- (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
- if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
- }
-
- memcpy(new_recursive.offset_save, md->offset_vector,
- new_recursive.saved_max * sizeof(int));
- new_recursive.save_start = mstart;
- mstart = eptr;
-
- /* OK, now we can do the recursion. For each top-level alternative we
- restore the offset and recursion data. */
-
- DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
- flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
- do
- {
- RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
- md, ims, eptrb, flags, RM6);
- if (rrc == MATCH_MATCH)
- {
- DPRINTF(("Recursion matched\n"));
- md->recursive = new_recursive.prevrec;
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_MATCH);
- }
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
- {
- DPRINTF(("Recursion gave error %d\n", rrc));
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- RRETURN(rrc);
- }
-
- md->recursive = &new_recursive;
- memcpy(md->offset_vector, new_recursive.offset_save,
- new_recursive.saved_max * sizeof(int));
- callpat += GET(callpat, 1);
- }
- while (*callpat == OP_ALT);
-
- DPRINTF(("Recursion didn't match\n"));
- md->recursive = new_recursive.prevrec;
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never reaches here */
-
- /* "Once" brackets are like assertion brackets except that after a match,
- the point in the subject string is not moved back. Thus there can never be
- a move back into the brackets. Friedl calls these "atomic" subpatterns.
- Check the alternative branches in turn - the matching won't pass the KET
- for this kind of subpattern. If any one branch matches, we carry on as at
- the end of a normal bracket, leaving the subject pointer. */
-
- case OP_ONCE:
- prev = ecode;
- saved_eptr = eptr;
-
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
- if (rrc == MATCH_MATCH) break;
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
-
- /* If hit the end of the group (which could be repeated), fail */
-
- if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
-
- /* Continue as from after the assertion, updating the offsets high water
- mark, since extracts may have been taken. */
-
- do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
-
- offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
-
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
-
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1+LINK_SIZE;
- break;
- }
-
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. The second "call" of match()
- uses tail recursion, to avoid using another stack frame. We need to reset
- any options that changed within the bracket before re-running it, so
- check the next opcode. */
-
- if (ecode[1+LINK_SIZE] == OP_OPT)
- {
- ims = (ims & ~PCRE_IMS) | ecode[4];
- DPRINTF(("ims set to %02lx at group repeat\n", ims));
- }
-
- if (*ecode == OP_KETRMIN)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode = prev;
- flags = 0;
- goto TAIL_RECURSE;
- }
- else /* OP_KETRMAX */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += 1 + LINK_SIZE;
- flags = 0;
- goto TAIL_RECURSE;
- }
- /* Control never gets here */
-
- /* An alternation is the end of a branch; scan along to find the end of the
- bracketed group and go to there. */
-
- case OP_ALT:
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- break;
-
- /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
- indicating that it may occur zero times. It may repeat infinitely, or not
- at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
- with fixed upper repeat limits are compiled as a number of copies, with the
- optional ones preceded by BRAZERO or BRAMINZERO. */
-
- case OP_BRAZERO:
- {
- next = ecode+1;
- RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- do next += GET(next,1); while (*next == OP_ALT);
- ecode = next + 1 + LINK_SIZE;
- }
- break;
-
- case OP_BRAMINZERO:
- {
- next = ecode+1;
- do next += GET(next, 1); while (*next == OP_ALT);
- RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode++;
- }
- break;
-
- case OP_SKIPZERO:
- {
- next = ecode+1;
- do next += GET(next,1); while (*next == OP_ALT);
- ecode = next + 1 + LINK_SIZE;
- }
- break;
-
- /* End of a group, repeated or non-repeating. */
-
- case OP_KET:
- case OP_KETRMIN:
- case OP_KETRMAX:
- prev = ecode - GET(ecode, 1);
-
- /* If this was a group that remembered the subject start, in order to break
- infinite repeats of empty string matches, retrieve the subject start from
- the chain. Otherwise, set it NULL. */
-
- if (*prev >= OP_SBRA)
- {
- saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
- eptrb = eptrb->epb_prev; /* Backup to previous group */
- }
- else saved_eptr = NULL;
-
- /* If we are at the end of an assertion group, stop matching and return
- MATCH_MATCH, but record the current high water mark for use by positive
- assertions. Do this also for the "once" (atomic) groups. */
-
- if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
- *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
- *prev == OP_ONCE)
- {
- md->end_match_ptr = eptr; /* For ONCE */
- md->end_offset_top = offset_top;
- RRETURN(MATCH_MATCH);
- }
-
- /* For capturing groups we have to check the group number back at the start
- and if necessary complete handling an extraction by setting the offsets and
- bumping the high water mark. Note that whole-pattern recursion is coded as
- a recurse into group 0, so it won't be picked up here. Instead, we catch it
- when the OP_END is reached. Other recursion is handled here. */
-
- if (*prev == OP_CBRA || *prev == OP_SCBRA)
- {
- number = GET2(prev, 1+LINK_SIZE);
- offset = number << 1;
-
-#ifdef PCRE_DEBUG
- printf("end bracket %d", number);
- printf("\n");
-#endif
-
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
- {
- md->offset_vector[offset] =
- md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
- if (offset_top <= offset) offset_top = offset + 2;
- }
-
- /* Handle a recursively called group. Restore the offsets
- appropriately and continue from after the call. */
-
- if (md->recursive != NULL && md->recursive->group_num == number)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
- md->recursive = rec->prevrec;
- mstart = rec->save_start;
- memcpy(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- ecode = rec->after_call;
- ims = original_ims;
- break;
- }
- }
-
- /* For both capturing and non-capturing groups, reset the value of the ims
- flags, in case they got changed during the group. */
-
- ims = original_ims;
- DPRINTF(("ims reset to %02lx\n", ims));
-
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
-
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1 + LINK_SIZE;
- break;
- }
-
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. In the second case, we can use
- tail recursion to avoid using another stack frame, unless we have an
- unlimited repeat of a group that can match an empty string. */
-
- flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
-
- if (*ecode == OP_KETRMIN)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (flags != 0) /* Could match an empty string */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
- RRETURN(rrc);
- }
- ecode = prev;
- goto TAIL_RECURSE;
- }
- else /* OP_KETRMAX */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += 1 + LINK_SIZE;
- flags = 0;
- goto TAIL_RECURSE;
- }
- /* Control never gets here */
-
- /* Start of subject unless notbol, or after internal newline if multiline */
-
- case OP_CIRC:
- if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr != md->start_subject &&
- (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- }
- /* ... else fall through */
-
- /* Start of subject assertion */
-
- case OP_SOD:
- if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Start of match assertion */
-
- case OP_SOM:
- if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Reset the start of match point */
-
- case OP_SET_SOM:
- mstart = eptr;
- ecode++;
- break;
-
- /* Assert before internal newline if multiline, or before a terminating
- newline unless endonly is set, else end of subject unless noteol is set. */
-
- case OP_DOLL:
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr < md->end_subject)
- { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
- else
- { if (md->noteol) RRETURN(MATCH_NOMATCH); }
- ecode++;
- break;
- }
- else
- {
- if (md->noteol) RRETURN(MATCH_NOMATCH);
- if (!md->endonly)
- {
- if (eptr != md->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- }
- }
- /* ... else fall through for endonly */
-
- /* End of subject assertion (\z) */
-
- case OP_EOD:
- if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* End of subject or ending \n assertion (\Z) */
-
- case OP_EODN:
- if (eptr != md->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Word boundary assertions */
-
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- {
-
- /* Find out if the previous and current characters are "word" characters.
- It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
- be "non-word" characters. */
-
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- if (eptr == md->start_subject) prev_is_word = FALSE; else
- {
- USPTR lastptr = eptr - 1;
- while((*lastptr & 0xc0) == 0x80) lastptr--;
- GETCHAR(c, lastptr);
- prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
- }
- if (eptr >= md->end_subject) cur_is_word = FALSE; else
- {
- GETCHAR(c, eptr);
- cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
- }
- }
- else
-#endif
-
- /* More streamlined when not in UTF-8 mode */
-
- {
- prev_is_word = (eptr != md->start_subject) &&
- ((md->ctypes[eptr[-1]] & ctype_word) != 0);
- cur_is_word = (eptr < md->end_subject) &&
- ((md->ctypes[*eptr] & ctype_word) != 0);
- }
-
- /* Now see if the situation is what we want */
-
- if ((*ecode++ == OP_WORD_BOUNDARY)?
- cur_is_word == prev_is_word : cur_is_word != prev_is_word)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- /* Match a single character type; inline for speed */
-
- case OP_ANY:
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- /* Fall through */
-
- case OP_ALLANY:
- if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- ecode++;
- break;
-
- /* Match a single byte, even in UTF-8 mode. This opcode really does match
- any byte, even newline, independent of the setting of PCRE_DOTALL. */
-
- case OP_ANYBYTE:
- if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_NOT_DIGIT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c < 256 &&
-#endif
- (md->ctypes[c] & ctype_digit) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_DIGIT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
-#endif
- (md->ctypes[c] & ctype_digit) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_NOT_WHITESPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c < 256 &&
-#endif
- (md->ctypes[c] & ctype_space) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_WHITESPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
-#endif
- (md->ctypes[c] & ctype_space) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_NOT_WORDCHAR:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c < 256 &&
-#endif
- (md->ctypes[c] & ctype_word) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_WORDCHAR:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
-#endif
- (md->ctypes[c] & ctype_word) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_ANYNL:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
-
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- ecode++;
- break;
-
- case OP_NOT_HSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- ecode++;
- break;
-
- case OP_HSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- ecode++;
- break;
-
- case OP_NOT_VSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- ecode++;
- break;
-
- case OP_VSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- ecode++;
- break;
-
-#ifdef SUPPORT_UCP
- /* Check the next character by Unicode property. We will get here only
- if the support is in the binary; otherwise a compile-time error occurs. */
-
- case OP_PROP:
- case OP_NOTPROP:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- {
- const ucd_record *prop = GET_UCD(c);
-
- switch(ecode[1])
- {
- case PT_ANY:
- if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_LAMP:
- if ((prop->chartype == ucp_Lu ||
- prop->chartype == ucp_Ll ||
- prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_GC:
- if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_PC:
- if ((ecode[2] != prop->chartype) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_SC:
- if ((ecode[2] != prop->script) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
-
- ecode += 3;
- }
- break;
-
- /* Match an extended Unicode sequence. We will get here only if the support
- is in the binary; otherwise a compile-time error occurs. */
-
- case OP_EXTUNI:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- {
- int category = UCD_CATEGORY(c);
- if (category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- category = UCD_CATEGORY(c);
- if (category != ucp_M) break;
- eptr += len;
- }
- }
- ecode++;
- break;
-#endif
-
-
- /* Match a back reference, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following. The code is similar
- to that for character classes, but repeated for efficiency. Then obey
- similar code to character type repeats - written out again for speed.
- However, if the referenced string is the empty string, always treat
- it as matched, any number of times (otherwise there could be infinite
- loops). */
-
- case OP_REF:
- {
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3;
-
- /* If the reference is unset, there are two possibilities:
-
- (a) In the default, Perl-compatible state, set the length to be longer
- than the amount of subject left; this ensures that every attempt at a
- match fails. We can't just fail here, because of the possibility of
- quantifiers with zero minima.
-
- (b) If the JavaScript compatibility flag is set, set the length to zero
- so that the back reference matches an empty string.
-
- Otherwise, set the length to the length of what was matched by the
- referenced subpattern. */
-
- if (offset >= offset_top || md->offset_vector[offset] < 0)
- length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
- else
- length = md->offset_vector[offset+1] - md->offset_vector[offset];
-
- /* Set up for repetition, or handle the non-repeated case */
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
-
- default: /* No repeat follows */
- if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
- eptr += length;
- continue; /* With the main loop */
- }
-
- /* If the length of the reference is zero, just continue with the
- main loop. */
-
- if (length == 0) continue;
-
- /* First, ensure the minimum number of matches are present. We get back
- the length of the reference string explicitly rather than passing the
- address of eptr, so that eptr can be a register variable. */
-
- for (i = 1; i <= min; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
- eptr += length;
- }
-
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
-
- if (min == max) continue;
-
- /* If minimizing, keep trying and advancing the pointer */
-
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || !match_ref(offset, eptr, length, md, ims))
- RRETURN(MATCH_NOMATCH);
- eptr += length;
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest string and work backwards */
-
- else
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) break;
- eptr += length;
- }
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr -= length;
- }
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
-
-
-
- /* Match a bit-mapped character class, possibly repeatedly. This op code is
- used when all the characters in the class have values in the range 0-255,
- and either the matching is caseful, or the characters are in the range
- 0-127 when UTF-8 processing is enabled. The only difference between
- OP_CLASS and OP_NCLASS occurs when a data character outside the range is
- encountered.
-
- First, look past the end of the item to see if there is repeat information
- following. Then obey similar code to character type repeats - written out
- again for speed. */
-
- case OP_NCLASS:
- case OP_CLASS:
- {
- data = ecode + 1; /* Save for matching */
- ecode += 33; /* Advance past the item */
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
-
- default: /* No repeat follows */
- min = max = 1;
- break;
- }
-
- /* First, ensure the minimum number of matches are present. */
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c > 255)
- {
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
-
- /* If max == min we can continue with the main loop without the
- need to recurse. */
-
- if (min == max) continue;
-
- /* If minimizing, keep testing the rest of the expression and advancing
- the pointer while it matches the class. */
-
- if (minimize)
- {
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c > 255)
- {
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest possible run, then work backwards. */
-
- else
- {
- pp = eptr;
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c > 255)
- {
- if (op == OP_CLASS) break;
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- }
- eptr += len;
- }
- for (;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- eptr++;
- }
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
-
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
-
-
- /* Match an extended character class. This opcode is encountered only
- when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8
- mode, because Unicode properties are supported in non-UTF-8 mode. */
-
-#ifdef SUPPORT_UTF8
- case OP_XCLASS:
- {
- data = ecode + 1 + LINK_SIZE; /* Save for matching */
- ecode += GET(ecode, 1); /* Advance past the item */
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
-
- default: /* No repeat follows */
- min = max = 1;
- break;
- }
-
- /* First, ensure the minimum number of matches are present. */
-
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
- }
-
- /* If max == min we can continue with the main loop without the
- need to recurse. */
-
- if (min == max) continue;
-
- /* If minimizing, keep testing the rest of the expression and advancing
- the pointer while it matches the class. */
-
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest possible run, then work backwards. */
-
- else
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLENTEST(c, eptr, len);
- if (!_pcre_xclass(c, data)) break;
- eptr += len;
- }
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- if (utf8) BACKCHAR(eptr);
- }
- RRETURN(MATCH_NOMATCH);
- }
-
- /* Control never gets here */
- }
-#endif /* End of XCLASS */
-
- /* Match a single character, casefully */
-
- case OP_CHAR:
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- ecode++;
- GETCHARLEN(fc, ecode, length);
- if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
- }
- else
-#endif
-
- /* Non-UTF-8 mode */
- {
- if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
- if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
- ecode += 2;
- }
- break;
-
- /* Match a single character, caselessly */
-
- case OP_CHARNC:
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- ecode++;
- GETCHARLEN(fc, ecode, length);
-
- if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
-
- /* If the pattern character's value is < 128, we have only one byte, and
- can use the fast lookup table. */
-
- if (fc < 128)
- {
- if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- }
-
- /* Otherwise we must pick up the subject character */
-
- else
- {
- unsigned int dc;
- GETCHARINC(dc, eptr);
- ecode += length;
-
- /* If we have Unicode property support, we can use it to test the other
- case of the character, if there is one. */
-
- if (fc != dc)
- {
-#ifdef SUPPORT_UCP
- if (dc != UCD_OTHERCASE(fc))
-#endif
- RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
-#endif /* SUPPORT_UTF8 */
-
- /* Non-UTF-8 mode */
- {
- if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
- if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- ecode += 2;
- }
- break;
-
- /* Match a single character repeatedly. */
-
- case OP_EXACT:
- min = max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATCHAR;
-
- case OP_POSUPTO:
- possessive = TRUE;
- /* Fall through */
-
- case OP_UPTO:
- case OP_MINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_MINUPTO;
- ecode += 3;
- goto REPEATCHAR;
-
- case OP_POSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATCHAR;
-
- case OP_POSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATCHAR;
-
- case OP_POSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATCHAR;
-
- case OP_STAR:
- case OP_MINSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- c = *ecode++ - OP_STAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single-character matches. We can give
- up quickly if there are fewer than the minimum number of characters left in
- the subject. */
-
- REPEATCHAR:
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- charptr = ecode;
- GETCHARLEN(fc, ecode, length);
- if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- ecode += length;
-
- /* Handle multibyte character matching specially here. There is
- support for caseless matching if UCP support is present. */
-
- if (length > 1)
- {
-#ifdef SUPPORT_UCP
- unsigned int othercase;
- if ((ims & PCRE_CASELESS) != 0 &&
- (othercase = UCD_OTHERCASE(fc)) != fc)
- oclength = _pcre_ord2utf8(othercase, occhars);
- else oclength = 0;
-#endif /* SUPPORT_UCP */
-
- for (i = 1; i <= min; i++)
- {
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
-#ifdef SUPPORT_UCP
- /* Need braces because of following else */
- else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
- eptr += oclength;
- }
-#else /* without SUPPORT_UCP */
- else { RRETURN(MATCH_NOMATCH); }
-#endif /* SUPPORT_UCP */
- }
-
- if (min == max) continue;
-
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
-#ifdef SUPPORT_UCP
- /* Need braces because of following else */
- else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
- eptr += oclength;
- }
-#else /* without SUPPORT_UCP */
- else { RRETURN (MATCH_NOMATCH); }
-#endif /* SUPPORT_UCP */
- }
- /* Control never gets here */
- }
-
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr > md->end_subject - length) break;
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
-#ifdef SUPPORT_UCP
- else if (oclength == 0) break;
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) break;
- eptr += oclength;
- }
-#else /* without SUPPORT_UCP */
- else break;
-#endif /* SUPPORT_UCP */
- }
-
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) RRETURN(MATCH_NOMATCH);
-#ifdef SUPPORT_UCP
- eptr--;
- BACKCHAR(eptr);
-#else /* without SUPPORT_UCP */
- eptr -= length;
-#endif /* SUPPORT_UCP */
- }
- }
- /* Control never gets here */
- }
-
- /* If the length of a UTF-8 character is 1, we fall through here, and
- obey the code as for non-UTF-8 characters below, though in this case the
- value of fc will always be < 128. */
- }
- else
-#endif /* SUPPORT_UTF8 */
-
- /* When not in UTF-8 mode, load a single-byte character. */
- {
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- fc = *ecode++;
- }
-
- /* The value of fc at this point is always less than 256, though we may or
- may not be in UTF-8 mode. The code is duplicated for the caseless and
- caseful cases, for speed, since matching characters is likely to be quite
- common. First, ensure the minimum number of matches are present. If min =
- max, continue at the same level without recursing. Otherwise, if
- minimizing, keep trying the rest of the expression and advancing one
- matching character if failing, up to the maximum. Alternatively, if
- maximizing, find the maximum number of characters and work backwards. */
-
- DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
- max, eptr));
-
- if ((ims & PCRE_CASELESS) != 0)
- {
- fc = md->lcc[fc];
- for (i = 1; i <= min; i++)
- if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- if (min == max) continue;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- fc != md->lcc[*eptr++])
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
-
- /* Caseful comparisons (includes all multi-byte characters) */
-
- else
- {
- for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
- if (min == max) continue;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc != *eptr) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
-
- /* Match a negated single one-byte character. The character we are
- checking can be multibyte. */
-
- case OP_NOT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- GETCHARINCTEST(c, eptr);
- if ((ims & PCRE_CASELESS) != 0)
- {
-#ifdef SUPPORT_UTF8
- if (c < 256)
-#endif
- c = md->lcc[c];
- if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
- }
- break;
-
- /* Match a negated single one-byte character repeatedly. This is almost a
- repeat of the code for a repeated single character, but I haven't found a
- nice way of commoning these up that doesn't require a test of the
- positive/negative option for each character match. Maybe that wouldn't add
- very much to the time taken, but character matching *is* what this is all
- about... */
-
- case OP_NOTEXACT:
- min = max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATNOTCHAR;
-
- case OP_NOTUPTO:
- case OP_NOTMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_NOTMINUPTO;
- ecode += 3;
- goto REPEATNOTCHAR;
-
- case OP_NOTPOSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATNOTCHAR;
-
- case OP_NOTPOSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATNOTCHAR;
-
- case OP_NOTPOSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATNOTCHAR;
-
- case OP_NOTPOSUPTO:
- possessive = TRUE;
- min = 0;
- max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATNOTCHAR;
-
- case OP_NOTSTAR:
- case OP_NOTMINSTAR:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTQUERY:
- case OP_NOTMINQUERY:
- c = *ecode++ - OP_NOTSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single-byte matches. We can give up quickly
- if there are fewer than the minimum number of bytes left in the
- subject. */
-
- REPEATNOTCHAR:
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- fc = *ecode++;
-
- /* The code is duplicated for the caseless and caseful cases, for speed,
- since matching characters is likely to be quite common. First, ensure the
- minimum number of matches are present. If min = max, continue at the same
- level without recursing. Otherwise, if minimizing, keep trying the rest of
- the expression and advancing one matching character if failing, up to the
- maximum. Alternatively, if maximizing, find the maximum number of
- characters and work backwards. */
-
- DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
- max, eptr));
-
- if ((ims & PCRE_CASELESS) != 0)
- {
- fc = md->lcc[fc];
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = 1; i <= min; i++)
- {
- GETCHARINC(d, eptr);
- if (d < 256) d = md->lcc[d];
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
-
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- }
-
- if (min == max) continue;
-
- if (minimize)
- {
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(d, eptr);
- if (d < 256) d = md->lcc[d];
- if (fc == d) RRETURN(MATCH_NOMATCH);
-
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
-
- /* Maximize case */
-
- else
- {
- pp = eptr;
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(d, eptr, len);
- if (d < 256) d = md->lcc[d];
- if (fc == d) break;
- eptr += len;
- }
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
-
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
-
- /* Caseful comparisons */
-
- else
- {
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = 1; i <= min; i++)
- {
- GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
- }
-
- if (min == max) continue;
-
- if (minimize)
- {
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
-
- /* Maximize case */
-
- else
- {
- pp = eptr;
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(d, eptr, len);
- if (fc == d) break;
- eptr += len;
- }
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc == *eptr) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
-
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
-
- /* Match a single character type repeatedly; several different opcodes
- share code. This is very similar to the code for single characters, but we
- repeat it in the interests of efficiency. */
-
- case OP_TYPEEXACT:
- min = max = GET2(ecode, 1);
- minimize = TRUE;
- ecode += 3;
- goto REPEATTYPE;
-
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_TYPEMINUPTO;
- ecode += 3;
- goto REPEATTYPE;
-
- case OP_TYPEPOSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATTYPE;
-
- case OP_TYPEPOSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATTYPE;
-
- case OP_TYPEPOSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATTYPE;
-
- case OP_TYPEPOSUPTO:
- possessive = TRUE;
- min = 0;
- max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATTYPE;
-
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- c = *ecode++ - OP_TYPESTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single character type matches. Note that
- in UTF-8 mode, '.' matches a character of any length, but for the other
- character types, the valid characters are all one-byte long. */
-
- REPEATTYPE:
- ctype = *ecode++; /* Code for the character type */
-
-#ifdef SUPPORT_UCP
- if (ctype == OP_PROP || ctype == OP_NOTPROP)
- {
- prop_fail_result = ctype == OP_NOTPROP;
- prop_type = *ecode++;
- prop_value = *ecode++;
- }
- else prop_type = -1;
-#endif
-
- /* First, ensure the minimum number of matches are present. Use inline
- code for maximizing the speed, and do the type test once at the start
- (i.e. keep it out of the loop). Also we can test that there are at least
- the minimum number of bytes before we start. This isn't as effective in
- UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
- is tidier. Also separate the UCP code, which can be the same for both UTF-8
- and single-bytes. */
-
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- if (min > 0)
- {
-#ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- }
- break;
-
- case PT_LAMP:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_chartype = UCD_CHARTYPE(c);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case PT_GC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = UCD_CATEGORY(c);
- if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case PT_PC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_chartype = UCD_CHARTYPE(c);
- if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case PT_SC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_script = UCD_SCRIPT(c);
- if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
-
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
-
- else if (ctype == OP_EXTUNI)
- {
- for (i = 1; i <= min; i++)
- {
- GETCHARINCTEST(c, eptr);
- prop_category = UCD_CATEGORY(c);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = UCD_CATEGORY(c);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
- }
-
- else
-#endif /* SUPPORT_UCP */
-
-/* Handle all other cases when the coding is UTF-8 */
-
-#ifdef SUPPORT_UTF8
- if (utf8) switch(ctype)
- {
- case OP_ANY:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- break;
-
- case OP_ALLANY:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- break;
-
- case OP_ANYBYTE:
- eptr += min;
- break;
-
- case OP_ANYNL:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
-
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- }
- break;
-
- case OP_NOT_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
-
- case OP_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- }
- break;
-
- case OP_NOT_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
-
- case OP_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- }
- break;
-
- case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_DIGIT:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
- RRETURN(MATCH_NOMATCH);
- while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
- }
- break;
-
- case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
- RRETURN(MATCH_NOMATCH);
- while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
- }
- break;
-
- case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- } /* End switch(ctype) */
-
- else
-#endif /* SUPPORT_UTF8 */
-
- /* Code for the non-UTF-8 case for minimum matching of operators other
- than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
- number of bytes present, as this was tested above. */
-
- switch(ctype)
- {
- case OP_ANY:
- for (i = 1; i <= min; i++)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
- break;
-
- case OP_ALLANY:
- eptr += min;
- break;
-
- case OP_ANYBYTE:
- eptr += min;
- break;
-
- /* Because of the CRLF case, we can't assume the minimum number of
- bytes are present in this case. */
-
- case OP_ANYNL:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- }
- break;
-
- case OP_NOT_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
-
- case OP_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- break;
- }
- }
- break;
-
- case OP_NOT_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
-
- case OP_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- break;
- }
- }
- break;
-
- case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
-
- /* If min = max, continue at the same level without recursing */
-
- if (min == max) continue;
-
- /* If minimizing, we have to test the rest of the pattern before each
- subsequent match. Again, separate the UTF-8 case for speed, and also
- separate the UCP cases. */
-
- if (minimize)
- {
-#ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- case PT_LAMP:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_chartype = UCD_CHARTYPE(c);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- case PT_GC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = UCD_CATEGORY(c);
- if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- case PT_PC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_chartype = UCD_CHARTYPE(c);
- if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- case PT_SC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_script = UCD_SCRIPT(c);
- if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
-
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
-
- else if (ctype == OP_EXTUNI)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = UCD_CATEGORY(c);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = UCD_CATEGORY(c);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
- }
-
- else
-#endif /* SUPPORT_UCP */
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- (ctype == OP_ANY && IS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
-
- GETCHARINC(c, eptr);
- switch(ctype)
- {
- case OP_ANY: /* This is the non-NL case */
- case OP_ALLANY:
- case OP_ANYBYTE:
- break;
-
- case OP_ANYNL:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- break;
-
- case OP_NOT_HSPACE:
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_HSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- break;
-
- case OP_NOT_VSPACE:
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_VSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- break;
-
- case OP_NOT_DIGIT:
- if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_DIGIT:
- if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WHITESPACE:
- if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WHITESPACE:
- if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WORDCHAR:
- if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WORDCHAR:
- if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- (ctype == OP_ANY && IS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
-
- c = *eptr++;
- switch(ctype)
- {
- case OP_ANY: /* This is the non-NL case */
- case OP_ALLANY:
- case OP_ANYBYTE:
- break;
-
- case OP_ANYNL:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
-
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- break;
-
- case OP_NOT_HSPACE:
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_HSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- break;
- }
- break;
-
- case OP_NOT_VSPACE:
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_VSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- break;
- }
- break;
-
- case OP_NOT_DIGIT:
- if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_DIGIT:
- if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- }
- /* Control never gets here */
- }
-
- /* If maximizing, it is worth using inline code for speed, doing the type
- test once at the start (i.e. keep it out of the loop). Again, keep the
- UTF-8 and UCP stuff separate. */
-
- else
- {
- pp = eptr; /* Remember where we started */
-
-#ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (prop_fail_result) break;
- eptr+= len;
- }
- break;
-
- case PT_LAMP:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_chartype = UCD_CHARTYPE(c);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
-
- case PT_GC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = UCD_CATEGORY(c);
- if ((prop_category == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
-
- case PT_PC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_chartype = UCD_CHARTYPE(c);
- if ((prop_chartype == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
-
- case PT_SC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_script = UCD_SCRIPT(c);
- if ((prop_script == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
- }
-
- /* eptr is now past the end of the maximum run */
-
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- if (utf8) BACKCHAR(eptr);
- }
- }
-
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
-
- else if (ctype == OP_EXTUNI)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- GETCHARINCTEST(c, eptr);
- prop_category = UCD_CATEGORY(c);
- if (prop_category == ucp_M) break;
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = UCD_CATEGORY(c);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
-
- /* eptr is now past the end of the maximum run */
-
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- for (;;) /* Move back over one extended */
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- BACKCHAR(eptr);
- GETCHARLEN(c, eptr, len);
- }
- prop_category = UCD_CATEGORY(c);
- if (prop_category != ucp_M) break;
- eptr--;
- }
- }
- }
-
- else
-#endif /* SUPPORT_UCP */
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
-
- if (utf8)
- {
- switch(ctype)
- {
- case OP_ANY:
- if (max < INT_MAX)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
-
- /* Handle unlimited UTF-8 repeat */
-
- else
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- break;
-
- case OP_ALLANY:
- if (max < INT_MAX)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
- break;
-
- /* The byte case is the same as non-UTF8 */
-
- case OP_ANYBYTE:
- c = max - min;
- if (c > (unsigned int)(md->end_subject - eptr))
- c = md->end_subject - eptr;
- eptr += c;
- break;
-
- case OP_ANYNL:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c == 0x000d)
- {
- if (++eptr >= md->end_subject) break;
- if (*eptr == 0x000a) eptr++;
- }
- else
- {
- if (c != 0x000a &&
- (md->bsr_anycrlf ||
- (c != 0x000b && c != 0x000c &&
- c != 0x0085 && c != 0x2028 && c != 0x2029)))
- break;
- eptr += len;
- }
- }
- break;
-
- case OP_NOT_HSPACE:
- case OP_HSPACE:
- for (i = min; i < max; i++)
- {
- BOOL gotspace;
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- switch(c)
- {
- default: gotspace = FALSE; break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- gotspace = TRUE;
- break;
- }
- if (gotspace == (ctype == OP_NOT_HSPACE)) break;
- eptr += len;
- }
- break;
-
- case OP_NOT_VSPACE:
- case OP_VSPACE:
- for (i = min; i < max; i++)
- {
- BOOL gotspace;
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- switch(c)
- {
- default: gotspace = FALSE; break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- gotspace = TRUE;
- break;
- }
- if (gotspace == (ctype == OP_NOT_VSPACE)) break;
- eptr += len;
- }
- break;
-
- case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
- eptr+= len;
- }
- break;
-
- case OP_DIGIT:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
- eptr+= len;
- }
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
- eptr+= len;
- }
- break;
-
- case OP_WHITESPACE:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
- eptr+= len;
- }
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
- eptr+= len;
- }
- break;
-
- case OP_WORDCHAR:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
- eptr+= len;
- }
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
-
- /* eptr is now past the end of the maximum run */
-
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
-#endif /* SUPPORT_UTF8 */
-
- /* Not UTF-8 mode */
- {
- switch(ctype)
- {
- case OP_ANY:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- }
- break;
-
- case OP_ALLANY:
- case OP_ANYBYTE:
- c = max - min;
- if (c > (unsigned int)(md->end_subject - eptr))
- c = md->end_subject - eptr;
- eptr += c;
- break;
-
- case OP_ANYNL:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x000d)
- {
- if (++eptr >= md->end_subject) break;
- if (*eptr == 0x000a) eptr++;
- }
- else
- {
- if (c != 0x000a &&
- (md->bsr_anycrlf ||
- (c != 0x000b && c != 0x000c && c != 0x0085)))
- break;
- eptr++;
- }
- }
- break;
-
- case OP_NOT_HSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x09 || c == 0x20 || c == 0xa0) break;
- eptr++;
- }
- break;
-
- case OP_HSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c != 0x09 && c != 0x20 && c != 0xa0) break;
- eptr++;
- }
- break;
-
- case OP_NOT_VSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
- break;
- eptr++;
- }
- break;
-
- case OP_VSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
- break;
- eptr++;
- }
- break;
-
- case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
- break;
- eptr++;
- }
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
- break;
- eptr++;
- }
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
- break;
- eptr++;
- }
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
-
- /* eptr is now past the end of the maximum run */
-
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- }
-
- /* Get here if we can't make it match with any permitted repetitions */
-
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- /* There's been some horrible disaster. Arrival here can only mean there is
- something seriously wrong in the code above or the OP_xxx definitions. */
-
- default:
- DPRINTF(("Unknown opcode %d\n", *ecode));
- RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
- }
-
- /* Do not stick any code in here without much thought; it is assumed
- that "continue" in the code above comes out to here to repeat the main
- loop. */
-
- } /* End of main loop */
-/* Control never reaches here */
-
-
-/* When compiling to use the heap rather than the stack for recursive calls to
-match(), the RRETURN() macro jumps here. The number that is saved in
-frame->Xwhere indicates which label we actually want to return to. */
-
-#ifdef NO_RECURSE
-#define LBL(val) case val: goto L_RM##val;
-HEAP_RETURN:
-switch (frame->Xwhere)
- {
- LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
- LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
- LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
- LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
- LBL(53) LBL(54)
-#ifdef SUPPORT_UTF8
- LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
- LBL(32) LBL(34) LBL(42) LBL(46)
-#ifdef SUPPORT_UCP
- LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
-#endif /* SUPPORT_UCP */
-#endif /* SUPPORT_UTF8 */
- default:
- DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
- return PCRE_ERROR_INTERNAL;
- }
-#undef LBL
-#endif /* NO_RECURSE */
-}
-
-
-/***************************************************************************
-****************************************************************************
- RECURSION IN THE match() FUNCTION
-
-Undefine all the macros that were defined above to handle this. */
-
-#ifdef NO_RECURSE
-#undef eptr
-#undef ecode
-#undef mstart
-#undef offset_top
-#undef ims
-#undef eptrb
-#undef flags
-
-#undef callpat
-#undef charptr
-#undef data
-#undef next
-#undef pp
-#undef prev
-#undef saved_eptr
-
-#undef new_recursive
-
-#undef cur_is_word
-#undef condition
-#undef prev_is_word
-
-#undef original_ims
-
-#undef ctype
-#undef length
-#undef max
-#undef min
-#undef number
-#undef offset
-#undef op
-#undef save_capture_last
-#undef save_offset1
-#undef save_offset2
-#undef save_offset3
-#undef stacksave
-
-#undef newptrb
-
-#endif
-
-/* These two are defined as macros in both cases */
-
-#undef fc
-#undef fi
-
-/***************************************************************************
-***************************************************************************/
-
-
-
-/*************************************************
-* Execute a Regular Expression *
-*************************************************/
-
-/* This function applies a compiled re to a subject string and picks out
-portions of the string if it matches. Two elements in the vector are set for
-each substring: the offsets to the start and end of the substring.
-
-Arguments:
- argument_re points to the compiled expression
- extra_data points to extra data or is NULL
- subject points to the subject string
- length length of subject string (may contain binary zeros)
- start_offset where to start in the subject string
- options option bits
- offsets points to a vector of ints to be filled in with offsets
- offsetcount the number of elements in the vector
-
-Returns: > 0 => success; value is the number of elements filled in
- = 0 => success, but offsets is not big enough
- -1 => failed to match
- < -1 => some kind of unexpected problem
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
- PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
- int offsetcount)
-{
-int rc, resetcount, ocount;
-int first_byte = -1;
-int req_byte = -1;
-int req_byte2 = -1;
-int newline;
-unsigned long int ims;
-BOOL using_temporary_offsets = FALSE;
-BOOL anchored;
-BOOL startline;
-BOOL firstline;
-BOOL first_byte_caseless = FALSE;
-BOOL req_byte_caseless = FALSE;
-BOOL utf8;
-match_data match_block;
-match_data *md = &match_block;
-const uschar *tables;
-const uschar *start_bits = NULL;
-USPTR start_match = (USPTR)subject + start_offset;
-USPTR end_subject;
-USPTR req_byte_ptr = start_match - 1;
-
-pcre_study_data internal_study;
-const pcre_study_data *study;
-
-real_pcre internal_re;
-const real_pcre *external_re = (const real_pcre *)argument_re;
-const real_pcre *re = external_re;
-
-/* Plausibility checks */
-
-if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
-if (re == NULL || subject == NULL ||
- (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
-if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
-
-/* Fish out the optional data from the extra_data structure, first setting
-the default values. */
-
-study = NULL;
-md->match_limit = MATCH_LIMIT;
-md->match_limit_recursion = MATCH_LIMIT_RECURSION;
-md->callout_data = NULL;
-
-/* The table pointer is always in native byte order. */
-
-tables = external_re->tables;
-
-if (extra_data != NULL)
- {
- register unsigned int flags = extra_data->flags;
- if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
- study = (const pcre_study_data *)extra_data->study_data;
- if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
- md->match_limit = extra_data->match_limit;
- if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
- md->match_limit_recursion = extra_data->match_limit_recursion;
- if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
- md->callout_data = extra_data->callout_data;
- if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
- }
-
-/* If the exec call supplied NULL for tables, use the inbuilt ones. This
-is a feature that makes it possible to save compiled regex and re-use them
-in other programs later. */
-
-if (tables == NULL) tables = _pcre_default_tables;
-
-/* Check that the first field in the block is the magic number. If it is not,
-test for a regex that was compiled on a host of opposite endianness. If this is
-the case, flipped values are put in internal_re and internal_study if there was
-study data too. */
-
-if (re->magic_number != MAGIC_NUMBER)
- {
- re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
- if (re == NULL) return PCRE_ERROR_BADMAGIC;
- if (study != NULL) study = &internal_study;
- }
-
-/* Set up other data */
-
-anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
-startline = (re->flags & PCRE_STARTLINE) != 0;
-firstline = (re->options & PCRE_FIRSTLINE) != 0;
-
-/* The code starts after the real_pcre block and the capture name table. */
-
-md->start_code = (const uschar *)external_re + re->name_table_offset +
- re->name_count * re->name_entry_size;
-
-md->start_subject = (USPTR)subject;
-md->start_offset = start_offset;
-md->end_subject = md->start_subject + length;
-end_subject = md->end_subject;
-
-md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
-utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
-md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
-
-md->notbol = (options & PCRE_NOTBOL) != 0;
-md->noteol = (options & PCRE_NOTEOL) != 0;
-md->notempty = (options & PCRE_NOTEMPTY) != 0;
-md->partial = (options & PCRE_PARTIAL) != 0;
-md->hitend = FALSE;
-
-md->recursive = NULL; /* No recursion at top level */
-
-md->lcc = tables + lcc_offset;
-md->ctypes = tables + ctypes_offset;
-
-/* Handle different \R options. */
-
-switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
- {
- case 0:
- if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
- md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
- else
-#ifdef BSR_ANYCRLF
- md->bsr_anycrlf = TRUE;
-#else
- md->bsr_anycrlf = FALSE;
-#endif
- break;
-
- case PCRE_BSR_ANYCRLF:
- md->bsr_anycrlf = TRUE;
- break;
-
- case PCRE_BSR_UNICODE:
- md->bsr_anycrlf = FALSE;
- break;
-
- default: return PCRE_ERROR_BADNEWLINE;
- }
-
-/* Handle different types of newline. The three bits give eight cases. If
-nothing is set at run time, whatever was used at compile time applies. */
-
-switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
- (pcre_uint32)options) & PCRE_NEWLINE_BITS)
- {
- case 0: newline = NEWLINE; break; /* Compile-time default */
- case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
- case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
- case PCRE_NEWLINE_CR+
- PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
- case PCRE_NEWLINE_ANY: newline = -1; break;
- case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
- default: return PCRE_ERROR_BADNEWLINE;
- }
-
-if (newline == -2)
- {
- md->nltype = NLTYPE_ANYCRLF;
- }
-else if (newline < 0)
- {
- md->nltype = NLTYPE_ANY;
- }
-else
- {
- md->nltype = NLTYPE_FIXED;
- if (newline > 255)
- {
- md->nllen = 2;
- md->nl[0] = (newline >> 8) & 255;
- md->nl[1] = newline & 255;
- }
- else
- {
- md->nllen = 1;
- md->nl[0] = newline;
- }
- }
-
-/* Partial matching is supported only for a restricted set of regexes at the
-moment. */
-
-if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
- return PCRE_ERROR_BADPARTIAL;
-
-/* Check a UTF-8 string if required. Unfortunately there's no way of passing
-back the character offset. */
-
-#ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
- {
- if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
- if (start_offset > 0 && start_offset < length)
- {
- int tb = ((USPTR)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
- }
- }
-#endif
-
-/* The ims options can vary during the matching as a result of the presence
-of (?ims) items in the pattern. They are kept in a local variable so that
-restoring at the exit of a group is easy. */
-
-ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
-
-/* If the expression has got more back references than the offsets supplied can
-hold, we get a temporary chunk of working store to use during the matching.
-Otherwise, we can use the vector supplied, rounding down its size to a multiple
-of 3. */
-
-ocount = offsetcount - (offsetcount % 3);
-
-if (re->top_backref > 0 && re->top_backref >= ocount/3)
- {
- ocount = re->top_backref * 3 + 3;
- md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
- if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
- using_temporary_offsets = TRUE;
- DPRINTF(("Got memory to hold back references\n"));
- }
-else md->offset_vector = offsets;
-
-md->offset_end = ocount;
-md->offset_max = (2*ocount)/3;
-md->offset_overflow = FALSE;
-md->capture_last = -1;
-
-/* Compute the minimum number of offsets that we need to reset each time. Doing
-this makes a huge difference to execution time when there aren't many brackets
-in the pattern. */
-
-resetcount = 2 + re->top_bracket * 2;
-if (resetcount > offsetcount) resetcount = ocount;
-
-/* Reset the working variable associated with each extraction. These should
-never be used unless previously set, but they get saved and restored, and so we
-initialize them to avoid reading uninitialized locations. */
-
-if (md->offset_vector != NULL)
- {
- register int *iptr = md->offset_vector + ocount;
- register int *iend = iptr - resetcount/2 + 1;
- while (--iptr >= iend) *iptr = -1;
- }
-
-/* Set up the first character to match, if available. The first_byte value is
-never set for an anchored regular expression, but the anchoring may be forced
-at run time, so we have to test for anchoring. The first char may be unset for
-an unanchored pattern, of course. If there's no first char and the pattern was
-studied, there may be a bitmap of possible first characters. */
-
-if (!anchored)
- {
- if ((re->flags & PCRE_FIRSTSET) != 0)
- {
- first_byte = re->first_byte & 255;
- if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
- first_byte = md->lcc[first_byte];
- }
- else
- if (!startline && study != NULL &&
- (study->options & PCRE_STUDY_MAPPED) != 0)
- start_bits = study->start_bits;
- }
-
-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
-
-if ((re->flags & PCRE_REQCHSET) != 0)
- {
- req_byte = re->req_byte & 255;
- req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
- req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
- }
-
-
-/* ==========================================================================*/
-
-/* Loop for handling unanchored repeated matching attempts; for anchored regexs
-the loop runs just once. */
-
-for(;;)
- {
- USPTR save_end_subject = end_subject;
- USPTR new_start_match;
-
- /* Reset the maximum number of extractions we might see. */
-
- if (md->offset_vector != NULL)
- {
- register int *iptr = md->offset_vector;
- register int *iend = iptr + resetcount;
- while (iptr < iend) *iptr++ = -1;
- }
-
- /* If firstline is TRUE, the start of the match is constrained to the first
- line of a multiline string. That is, the match must be before or at the first
- newline. Implement this by temporarily adjusting end_subject so that we stop
- scanning at a newline. If the match fails at the newline, later code breaks
- this loop. */
-
- if (firstline)
- {
- USPTR t = start_match;
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (t < md->end_subject && !IS_NEWLINE(t))
- {
- t++;
- while (t < end_subject && (*t & 0xc0) == 0x80) t++;
- }
- }
- else
-#endif
- while (t < md->end_subject && !IS_NEWLINE(t)) t++;
- end_subject = t;
- }
-
- /* There are some optimizations that avoid running the match if a known
- starting point is not found, or if a known later character is not present.
- However, there is an option that disables these, for testing and for ensuring
- that all callouts do actually occur. */
-
- if ((options & PCRE_NO_START_OPTIMIZE) == 0)
- {
- /* Advance to a unique first byte if there is one. */
-
- if (first_byte >= 0)
- {
- if (first_byte_caseless)
- while (start_match < end_subject && md->lcc[*start_match] != first_byte)
- start_match++;
- else
- while (start_match < end_subject && *start_match != first_byte)
- start_match++;
- }
-
- /* Or to just after a linebreak for a multiline match */
-
- else if (startline)
- {
- if (start_match > md->start_subject + start_offset)
- {
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (start_match < end_subject && !WAS_NEWLINE(start_match))
- {
- start_match++;
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
- }
- }
- else
-#endif
- while (start_match < end_subject && !WAS_NEWLINE(start_match))
- start_match++;
-
- /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
- and we are now at a LF, advance the match position by one more character.
- */
-
- if (start_match[-1] == CHAR_CR &&
- (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
- start_match < end_subject &&
- *start_match == CHAR_NL)
- start_match++;
- }
- }
-
- /* Or to a non-unique first byte after study */
-
- else if (start_bits != NULL)
- {
- while (start_match < end_subject)
- {
- register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
- else break;
- }
- }
- } /* Starting optimizations */
-
- /* Restore fudged end_subject */
-
- end_subject = save_end_subject;
-
-#ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
- printf(">>>> Match against: ");
- pchars(start_match, end_subject - start_match, TRUE, md);
- printf("\n");
-#endif
-
- /* If req_byte is set, we know that that character must appear in the
- subject for the match to succeed. If the first character is set, req_byte
- must be later in the subject; otherwise the test starts at the match point.
- This optimization can save a huge amount of backtracking in patterns with
- nested unlimited repeats that aren't going to match. Writing separate code
- for cased/caseless versions makes it go faster, as does using an
- autoincrement and backing off on a match.
-
- HOWEVER: when the subject string is very, very long, searching to its end
- can take a long time, and give bad performance on quite ordinary patterns.
- This showed up when somebody was matching something like /^\d+C/ on a
- 32-megabyte string... so we don't do this when the string is sufficiently
- long.
-
- ALSO: this processing is disabled when partial matching is requested, or if
- disabling is explicitly requested. */
-
- if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
- req_byte >= 0 &&
- end_subject - start_match < REQ_BYTE_MAX &&
- !md->partial)
- {
- register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
-
- /* We don't need to repeat the search if we haven't yet reached the
- place we found it at last time. */
-
- if (p > req_byte_ptr)
- {
- if (req_byte_caseless)
- {
- while (p < end_subject)
- {
- register int pp = *p++;
- if (pp == req_byte || pp == req_byte2) { p--; break; }
- }
- }
- else
- {
- while (p < end_subject)
- {
- if (*p++ == req_byte) { p--; break; }
- }
- }
-
- /* If we can't find the required character, break the matching loop,
- forcing a match failure. */
-
- if (p >= end_subject)
- {
- rc = MATCH_NOMATCH;
- break;
- }
-
- /* If we have found the required character, save the point where we
- found it, so that we don't search again next time round the loop if
- the start hasn't passed this character yet. */
-
- req_byte_ptr = p;
- }
- }
-
- /* OK, we can now run the match. */
-
- md->start_match_ptr = start_match;
- md->match_call_count = 0;
- rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
-
- switch(rc)
- {
- /* NOMATCH and PRUNE advance by one character. THEN at this level acts
- exactly like PRUNE. */
-
- case MATCH_NOMATCH:
- case MATCH_PRUNE:
- case MATCH_THEN:
- new_start_match = start_match + 1;
-#ifdef SUPPORT_UTF8
- if (utf8)
- while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
- new_start_match++;
-#endif
- break;
-
- /* SKIP passes back the next starting point explicitly. */
-
- case MATCH_SKIP:
- new_start_match = md->start_match_ptr;
- break;
-
- /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
-
- case MATCH_COMMIT:
- rc = MATCH_NOMATCH;
- goto ENDLOOP;
-
- /* Any other return is some kind of error. */
-
- default:
- goto ENDLOOP;
- }
-
- /* Control reaches here for the various types of "no match at this point"
- result. Reset the code to MATCH_NOMATCH for subsequent checking. */
-
- rc = MATCH_NOMATCH;
-
- /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
- newline in the subject (though it may continue over the newline). Therefore,
- if we have just failed to match, starting at a newline, do not continue. */
-
- if (firstline && IS_NEWLINE(start_match)) break;
-
- /* Advance to new matching position */
-
- start_match = new_start_match;
-
- /* Break the loop if the pattern is anchored or if we have passed the end of
- the subject. */
-
- if (anchored || start_match > end_subject) break;
-
- /* If we have just passed a CR and we are now at a LF, and the pattern does
- not contain any explicit matches for \r or \n, and the newline option is CRLF
- or ANY or ANYCRLF, advance the match position by one more character. */
-
- if (start_match[-1] == CHAR_CR &&
- start_match < end_subject &&
- *start_match == CHAR_NL &&
- (re->flags & PCRE_HASCRORLF) == 0 &&
- (md->nltype == NLTYPE_ANY ||
- md->nltype == NLTYPE_ANYCRLF ||
- md->nllen == 2))
- start_match++;
-
- } /* End of for(;;) "bumpalong" loop */
-
-/* ==========================================================================*/
-
-/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
-conditions is true:
-
-(1) The pattern is anchored or the match was failed by (*COMMIT);
-
-(2) We are past the end of the subject;
-
-(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
- this option requests that a match occur at or before the first newline in
- the subject.
-
-When we have a match and the offset vector is big enough to deal with any
-backreferences, captured substring offsets will already be set up. In the case
-where we had to get some local store to hold offsets for backreference
-processing, copy those that we can. In this case there need not be overflow if
-certain parts of the pattern were not used, even though there are more
-capturing parentheses than vector slots. */
-
-ENDLOOP:
-
-if (rc == MATCH_MATCH)
- {
- if (using_temporary_offsets)
- {
- if (offsetcount >= 4)
- {
- memcpy(offsets + 2, md->offset_vector + 2,
- (offsetcount - 2) * sizeof(int));
- DPRINTF(("Copied offsets from temporary memory\n"));
- }
- if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
- DPRINTF(("Freeing temporary memory\n"));
- (pcre_free)(md->offset_vector);
- }
-
- /* Set the return code to the number of captured strings, or 0 if there are
- too many to fit into the vector. */
-
- rc = md->offset_overflow? 0 : md->end_offset_top/2;
-
- /* If there is space, set up the whole thing as substring 0. The value of
- md->start_match_ptr might be modified if \K was encountered on the success
- matching path. */
-
- if (offsetcount < 2) rc = 0; else
- {
- offsets[0] = md->start_match_ptr - md->start_subject;
- offsets[1] = md->end_match_ptr - md->start_subject;
- }
-
- DPRINTF((">>>> returning %d\n", rc));
- return rc;
- }
-
-/* Control gets here if there has been an error, or if the overall match
-attempt has failed at all permitted starting positions. */
-
-if (using_temporary_offsets)
- {
- DPRINTF(("Freeing temporary memory\n"));
- (pcre_free)(md->offset_vector);
- }
-
-if (rc != MATCH_NOMATCH)
- {
- DPRINTF((">>>> error: returning %d\n", rc));
- return rc;
- }
-else if (md->partial && md->hitend)
- {
- DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
- return PCRE_ERROR_PARTIAL;
- }
-else
- {
- DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
- return PCRE_ERROR_NOMATCH;
- }
-}
-
-/* End of pcre_exec.c */
diff --git a/libs/pcre/pcre_fullinfo.c b/libs/pcre/pcre_fullinfo.c
deleted file mode 100644
index 3a343bd4a1..0000000000
--- a/libs/pcre/pcre_fullinfo.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains the external function pcre_fullinfo(), which returns
-information about a compiled pattern. */
-
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-
-/*************************************************
-* Return info about compiled pattern *
-*************************************************/
-
-/* This is a newer "info" function which has an extensible interface so
-that additional items can be added compatibly.
-
-Arguments:
- argument_re points to compiled code
- extra_data points extra data, or NULL
- what what information is required
- where where to put the information
-
-Returns: 0 if data returned, negative on error
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
- void *where)
-{
-real_pcre internal_re;
-pcre_study_data internal_study;
-const real_pcre *re = (const real_pcre *)argument_re;
-const pcre_study_data *study = NULL;
-
-if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
-
-if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
- study = (const pcre_study_data *)extra_data->study_data;
-
-if (re->magic_number != MAGIC_NUMBER)
- {
- re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
- if (re == NULL) return PCRE_ERROR_BADMAGIC;
- if (study != NULL) study = &internal_study;
- }
-
-switch (what)
- {
- case PCRE_INFO_OPTIONS:
- *((unsigned long int *)where) = re->options & PUBLIC_COMPILE_OPTIONS;
- break;
-
- case PCRE_INFO_SIZE:
- *((size_t *)where) = re->size;
- break;
-
- case PCRE_INFO_STUDYSIZE:
- *((size_t *)where) = (study == NULL)? 0 : study->size;
- break;
-
- case PCRE_INFO_CAPTURECOUNT:
- *((int *)where) = re->top_bracket;
- break;
-
- case PCRE_INFO_BACKREFMAX:
- *((int *)where) = re->top_backref;
- break;
-
- case PCRE_INFO_FIRSTBYTE:
- *((int *)where) =
- ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
- ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
- break;
-
- /* Make sure we pass back the pointer to the bit vector in the external
- block, not the internal copy (with flipped integer fields). */
-
- case PCRE_INFO_FIRSTTABLE:
- *((const uschar **)where) =
- (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
- ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
- break;
-
- case PCRE_INFO_LASTLITERAL:
- *((int *)where) =
- ((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
- break;
-
- case PCRE_INFO_NAMEENTRYSIZE:
- *((int *)where) = re->name_entry_size;
- break;
-
- case PCRE_INFO_NAMECOUNT:
- *((int *)where) = re->name_count;
- break;
-
- case PCRE_INFO_NAMETABLE:
- *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
- break;
-
- case PCRE_INFO_DEFAULT_TABLES:
- *((const uschar **)where) = (const uschar *)(_pcre_default_tables);
- break;
-
- case PCRE_INFO_OKPARTIAL:
- *((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0;
- break;
-
- case PCRE_INFO_JCHANGED:
- *((int *)where) = (re->flags & PCRE_JCHANGED) != 0;
- break;
-
- case PCRE_INFO_HASCRORLF:
- *((int *)where) = (re->flags & PCRE_HASCRORLF) != 0;
- break;
-
- default: return PCRE_ERROR_BADOPTION;
- }
-
-return 0;
-}
-
-/* End of pcre_fullinfo.c */
diff --git a/libs/pcre/pcre_get.c b/libs/pcre/pcre_get.c
deleted file mode 100644
index 6117786409..0000000000
--- a/libs/pcre/pcre_get.c
+++ /dev/null
@@ -1,465 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains some convenience functions for extracting substrings
-from the subject string after a regex match has succeeded. The original idea
-for these functions came from Scott Wimer. */
-
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-
-/*************************************************
-* Find number for named string *
-*************************************************/
-
-/* This function is used by the get_first_set() function below, as well
-as being generally available. It assumes that names are unique.
-
-Arguments:
- code the compiled regex
- stringname the name whose number is required
-
-Returns: the number of the named parentheses, or a negative number
- (PCRE_ERROR_NOSUBSTRING) if not found
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_get_stringnumber(const pcre *code, const char *stringname)
-{
-int rc;
-int entrysize;
-int top, bot;
-uschar *nametable;
-
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
- return rc;
-if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
-
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
- return rc;
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
- return rc;
-
-bot = 0;
-while (top > bot)
- {
- int mid = (top + bot) / 2;
- uschar *entry = nametable + entrysize*mid;
- int c = strcmp(stringname, (char *)(entry + 2));
- if (c == 0) return (entry[0] << 8) + entry[1];
- if (c > 0) bot = mid + 1; else top = mid;
- }
-
-return PCRE_ERROR_NOSUBSTRING;
-}
-
-
-
-/*************************************************
-* Find (multiple) entries for named string *
-*************************************************/
-
-/* This is used by the get_first_set() function below, as well as being
-generally available. It is used when duplicated names are permitted.
-
-Arguments:
- code the compiled regex
- stringname the name whose entries required
- firstptr where to put the pointer to the first entry
- lastptr where to put the pointer to the last entry
-
-Returns: the length of each entry, or a negative number
- (PCRE_ERROR_NOSUBSTRING) if not found
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_get_stringtable_entries(const pcre *code, const char *stringname,
- char **firstptr, char **lastptr)
-{
-int rc;
-int entrysize;
-int top, bot;
-uschar *nametable, *lastentry;
-
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
- return rc;
-if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
-
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
- return rc;
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
- return rc;
-
-lastentry = nametable + entrysize * (top - 1);
-bot = 0;
-while (top > bot)
- {
- int mid = (top + bot) / 2;
- uschar *entry = nametable + entrysize*mid;
- int c = strcmp(stringname, (char *)(entry + 2));
- if (c == 0)
- {
- uschar *first = entry;
- uschar *last = entry;
- while (first > nametable)
- {
- if (strcmp(stringname, (char *)(first - entrysize + 2)) != 0) break;
- first -= entrysize;
- }
- while (last < lastentry)
- {
- if (strcmp(stringname, (char *)(last + entrysize + 2)) != 0) break;
- last += entrysize;
- }
- *firstptr = (char *)first;
- *lastptr = (char *)last;
- return entrysize;
- }
- if (c > 0) bot = mid + 1; else top = mid;
- }
-
-return PCRE_ERROR_NOSUBSTRING;
-}
-
-
-
-/*************************************************
-* Find first set of multiple named strings *
-*************************************************/
-
-/* This function allows for duplicate names in the table of named substrings.
-It returns the number of the first one that was set in a pattern match.
-
-Arguments:
- code the compiled regex
- stringname the name of the capturing substring
- ovector the vector of matched substrings
-
-Returns: the number of the first that is set,
- or the number of the last one if none are set,
- or a negative number on error
-*/
-
-static int
-get_first_set(const pcre *code, const char *stringname, int *ovector)
-{
-const real_pcre *re = (const real_pcre *)code;
-int entrysize;
-char *first, *last;
-uschar *entry;
-if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0)
- return pcre_get_stringnumber(code, stringname);
-entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last);
-if (entrysize <= 0) return entrysize;
-for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize)
- {
- int n = (entry[0] << 8) + entry[1];
- if (ovector[n*2] >= 0) return n;
- }
-return (first[0] << 8) + first[1];
-}
-
-
-
-
-/*************************************************
-* Copy captured string to given buffer *
-*************************************************/
-
-/* This function copies a single captured substring into a given buffer.
-Note that we use memcpy() rather than strncpy() in case there are binary zeros
-in the string.
-
-Arguments:
- subject the subject string that was matched
- ovector pointer to the offsets table
- stringcount the number of substrings that were captured
- (i.e. the yield of the pcre_exec call, unless
- that was zero, in which case it should be 1/3
- of the offset table size)
- stringnumber the number of the required substring
- buffer where to put the substring
- size the size of the buffer
-
-Returns: if successful:
- the length of the copied string, not including the zero
- that is put on the end; can be zero
- if not successful:
- PCRE_ERROR_NOMEMORY (-6) buffer too small
- PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_copy_substring(const char *subject, int *ovector, int stringcount,
- int stringnumber, char *buffer, int size)
-{
-int yield;
-if (stringnumber < 0 || stringnumber >= stringcount)
- return PCRE_ERROR_NOSUBSTRING;
-stringnumber *= 2;
-yield = ovector[stringnumber+1] - ovector[stringnumber];
-if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
-memcpy(buffer, subject + ovector[stringnumber], yield);
-buffer[yield] = 0;
-return yield;
-}
-
-
-
-/*************************************************
-* Copy named captured string to given buffer *
-*************************************************/
-
-/* This function copies a single captured substring into a given buffer,
-identifying it by name. If the regex permits duplicate names, the first
-substring that is set is chosen.
-
-Arguments:
- code the compiled regex
- subject the subject string that was matched
- ovector pointer to the offsets table
- stringcount the number of substrings that were captured
- (i.e. the yield of the pcre_exec call, unless
- that was zero, in which case it should be 1/3
- of the offset table size)
- stringname the name of the required substring
- buffer where to put the substring
- size the size of the buffer
-
-Returns: if successful:
- the length of the copied string, not including the zero
- that is put on the end; can be zero
- if not successful:
- PCRE_ERROR_NOMEMORY (-6) buffer too small
- PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector,
- int stringcount, const char *stringname, char *buffer, int size)
-{
-int n = get_first_set(code, stringname, ovector);
-if (n <= 0) return n;
-return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size);
-}
-
-
-
-/*************************************************
-* Copy all captured strings to new store *
-*************************************************/
-
-/* This function gets one chunk of store and builds a list of pointers and all
-of the captured substrings in it. A NULL pointer is put on the end of the list.
-
-Arguments:
- subject the subject string that was matched
- ovector pointer to the offsets table
- stringcount the number of substrings that were captured
- (i.e. the yield of the pcre_exec call, unless
- that was zero, in which case it should be 1/3
- of the offset table size)
- listptr set to point to the list of pointers
-
-Returns: if successful: 0
- if not successful:
- PCRE_ERROR_NOMEMORY (-6) failed to get store
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
- const char ***listptr)
-{
-int i;
-int size = sizeof(char *);
-int double_count = stringcount * 2;
-char **stringlist;
-char *p;
-
-for (i = 0; i < double_count; i += 2)
- size += sizeof(char *) + ovector[i+1] - ovector[i] + 1;
-
-stringlist = (char **)(pcre_malloc)(size);
-if (stringlist == NULL) return PCRE_ERROR_NOMEMORY;
-
-*listptr = (const char **)stringlist;
-p = (char *)(stringlist + stringcount + 1);
-
-for (i = 0; i < double_count; i += 2)
- {
- int len = ovector[i+1] - ovector[i];
- memcpy(p, subject + ovector[i], len);
- *stringlist++ = p;
- p += len;
- *p++ = 0;
- }
-
-*stringlist = NULL;
-return 0;
-}
-
-
-
-/*************************************************
-* Free store obtained by get_substring_list *
-*************************************************/
-
-/* This function exists for the benefit of people calling PCRE from non-C
-programs that can call its functions, but not free() or (pcre_free)() directly.
-
-Argument: the result of a previous pcre_get_substring_list()
-Returns: nothing
-*/
-
-PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
-pcre_free_substring_list(const char **pointer)
-{
-(pcre_free)((void *)pointer);
-}
-
-
-
-/*************************************************
-* Copy captured string to new store *
-*************************************************/
-
-/* This function copies a single captured substring into a piece of new
-store
-
-Arguments:
- subject the subject string that was matched
- ovector pointer to the offsets table
- stringcount the number of substrings that were captured
- (i.e. the yield of the pcre_exec call, unless
- that was zero, in which case it should be 1/3
- of the offset table size)
- stringnumber the number of the required substring
- stringptr where to put a pointer to the substring
-
-Returns: if successful:
- the length of the string, not including the zero that
- is put on the end; can be zero
- if not successful:
- PCRE_ERROR_NOMEMORY (-6) failed to get store
- PCRE_ERROR_NOSUBSTRING (-7) substring not present
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_get_substring(const char *subject, int *ovector, int stringcount,
- int stringnumber, const char **stringptr)
-{
-int yield;
-char *substring;
-if (stringnumber < 0 || stringnumber >= stringcount)
- return PCRE_ERROR_NOSUBSTRING;
-stringnumber *= 2;
-yield = ovector[stringnumber+1] - ovector[stringnumber];
-substring = (char *)(pcre_malloc)(yield + 1);
-if (substring == NULL) return PCRE_ERROR_NOMEMORY;
-memcpy(substring, subject + ovector[stringnumber], yield);
-substring[yield] = 0;
-*stringptr = substring;
-return yield;
-}
-
-
-
-/*************************************************
-* Copy named captured string to new store *
-*************************************************/
-
-/* This function copies a single captured substring, identified by name, into
-new store. If the regex permits duplicate names, the first substring that is
-set is chosen.
-
-Arguments:
- code the compiled regex
- subject the subject string that was matched
- ovector pointer to the offsets table
- stringcount the number of substrings that were captured
- (i.e. the yield of the pcre_exec call, unless
- that was zero, in which case it should be 1/3
- of the offset table size)
- stringname the name of the required substring
- stringptr where to put the pointer
-
-Returns: if successful:
- the length of the copied string, not including the zero
- that is put on the end; can be zero
- if not successful:
- PCRE_ERROR_NOMEMORY (-6) couldn't get memory
- PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
- int stringcount, const char *stringname, const char **stringptr)
-{
-int n = get_first_set(code, stringname, ovector);
-if (n <= 0) return n;
-return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
-}
-
-
-
-
-/*************************************************
-* Free store obtained by get_substring *
-*************************************************/
-
-/* This function exists for the benefit of people calling PCRE from non-C
-programs that can call its functions, but not free() or (pcre_free)() directly.
-
-Argument: the result of a previous pcre_get_substring()
-Returns: nothing
-*/
-
-PCRE_EXP_DEFN void PCRE_CALL_CONVENTION
-pcre_free_substring(const char *pointer)
-{
-(pcre_free)((void *)pointer);
-}
-
-/* End of pcre_get.c */
diff --git a/libs/pcre/pcre_globals.c b/libs/pcre/pcre_globals.c
deleted file mode 100644
index 24ed03d396..0000000000
--- a/libs/pcre/pcre_globals.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains global variables that are exported by the PCRE library.
-PCRE is thread-clean and doesn't use any global variables in the normal sense.
-However, it calls memory allocation and freeing functions via the four
-indirections below, and it can optionally do callouts, using the fifth
-indirection. These values can be changed by the caller, but are shared between
-all threads. However, when compiling for Virtual Pascal, things are done
-differently, and global variables are not used (see pcre.in). */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-#ifndef VPCOMPAT
-PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = malloc;
-PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = free;
-PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = malloc;
-PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = free;
-PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
-#endif
-
-/* End of pcre_globals.c */
diff --git a/libs/pcre/pcre_info.c b/libs/pcre/pcre_info.c
deleted file mode 100644
index f35f398db5..0000000000
--- a/libs/pcre/pcre_info.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains the external function pcre_info(), which gives some
-information about a compiled pattern. However, use of this function is now
-deprecated, as it has been superseded by pcre_fullinfo(). */
-
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-
-/*************************************************
-* (Obsolete) Return info about compiled pattern *
-*************************************************/
-
-/* This is the original "info" function. It picks potentially useful data out
-of the private structure, but its interface was too rigid. It remains for
-backwards compatibility. The public options are passed back in an int - though
-the re->options field has been expanded to a long int, all the public options
-at the low end of it, and so even on 16-bit systems this will still be OK.
-Therefore, I haven't changed the API for pcre_info().
-
-Arguments:
- argument_re points to compiled code
- optptr where to pass back the options
- first_byte where to pass back the first character,
- or -1 if multiline and all branches start ^,
- or -2 otherwise
-
-Returns: number of capturing subpatterns
- or negative values on error
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
-{
-real_pcre internal_re;
-const real_pcre *re = (const real_pcre *)argument_re;
-if (re == NULL) return PCRE_ERROR_NULL;
-if (re->magic_number != MAGIC_NUMBER)
- {
- re = _pcre_try_flipped(re, &internal_re, NULL, NULL);
- if (re == NULL) return PCRE_ERROR_BADMAGIC;
- }
-if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
-if (first_byte != NULL)
- *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
- ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
-return re->top_bracket;
-}
-
-/* End of pcre_info.c */
diff --git a/libs/pcre/pcre_internal.h b/libs/pcre/pcre_internal.h
deleted file mode 100644
index 850b3b2a60..0000000000
--- a/libs/pcre/pcre_internal.h
+++ /dev/null
@@ -1,1744 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-/* This header contains definitions that are shared between the different
-modules, but which are not relevant to the exported API. This includes some
-functions whose names all begin with "_pcre_". */
-
-#ifndef PCRE_INTERNAL_H
-#define PCRE_INTERNAL_H
-
-/* Define PCRE_DEBUG to get debugging output on stdout. */
-
-#if 0
-#define PCRE_DEBUG
-#endif
-
-/* We do not support both EBCDIC and UTF-8 at the same time. The "configure"
-script prevents both being selected, but not everybody uses "configure". */
-
-#if defined EBCDIC && defined SUPPORT_UTF8
-#error The use of both EBCDIC and SUPPORT_UTF8 is not supported.
-#endif
-
-/* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The
-"configure" script ensures this, but not everybody uses "configure". */
-
-#if defined SUPPORT_UCP && !defined SUPPORT_UTF8
-#define SUPPORT_UTF8 1
-#endif
-
-/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
-inline, and there are *still* stupid compilers about that don't like indented
-pre-processor statements, or at least there were when I first wrote this. After
-all, it had only been about 10 years then...
-
-It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
-be absolutely sure we get our version. */
-
-#undef DPRINTF
-#ifdef PCRE_DEBUG
-#define DPRINTF(p) printf p
-#else
-#define DPRINTF(p) /* Nothing */
-#endif
-
-
-/* Standard C headers plus the external interface definition. The only time
-setjmp and stdarg are used is when NO_RECURSE is set. */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-/* When compiling a DLL for Windows, the exported symbols have to be declared
-using some MS magic. I found some useful information on this web page:
-http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the
-information there, using __declspec(dllexport) without "extern" we have a
-definition; with "extern" we have a declaration. The settings here override the
-setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL,
-which is all that is needed for applications (they just import the symbols). We
-use:
-
- PCRE_EXP_DECL for declarations
- PCRE_EXP_DEFN for definitions of exported functions
- PCRE_EXP_DATA_DEFN for definitions of exported variables
-
-The reason for the two DEFN macros is that in non-Windows environments, one
-does not want to have "extern" before variable definitions because it leads to
-compiler warnings. So we distinguish between functions and variables. In
-Windows, the two should always be the same.
-
-The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest,
-which is an application, but needs to import this file in order to "peek" at
-internals, can #include pcre.h first to get an application's-eye view.
-
-In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon,
-special-purpose environments) might want to stick other stuff in front of
-exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and
-PCRE_EXP_DATA_DEFN only if they are not already set. */
-
-#ifndef PCRE_EXP_DECL
-# ifdef _WIN32
-# ifndef PCRE_STATIC
-# define PCRE_EXP_DECL extern __declspec(dllexport)
-# define PCRE_EXP_DEFN __declspec(dllexport)
-# define PCRE_EXP_DATA_DEFN __declspec(dllexport)
-# else
-# define PCRE_EXP_DECL extern
-# define PCRE_EXP_DEFN
-# define PCRE_EXP_DATA_DEFN
-# endif
-# else
-# ifdef __cplusplus
-# define PCRE_EXP_DECL extern "C"
-# else
-# define PCRE_EXP_DECL extern
-# endif
-# ifndef PCRE_EXP_DEFN
-# define PCRE_EXP_DEFN PCRE_EXP_DECL
-# endif
-# ifndef PCRE_EXP_DATA_DEFN
-# define PCRE_EXP_DATA_DEFN
-# endif
-# endif
-#endif
-
-/* When compiling with the MSVC compiler, it is sometimes necessary to include
-a "calling convention" before exported function names. (This is secondhand
-information; I know nothing about MSVC myself). For example, something like
-
- void __cdecl function(....)
-
-might be needed. In order so make this easy, all the exported functions have
-PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not
-set, we ensure here that it has no effect. */
-
-#ifndef PCRE_CALL_CONVENTION
-#define PCRE_CALL_CONVENTION
-#endif
-
-/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
-cannot determine these outside the compilation (e.g. by running a program as
-part of "configure") because PCRE is often cross-compiled for use on other
-systems. Instead we make use of the maximum sizes that are available at
-preprocessor time in standard C environments. */
-
-#if USHRT_MAX == 65535
- typedef unsigned short pcre_uint16;
- typedef short pcre_int16;
-#elif UINT_MAX == 65535
- typedef unsigned int pcre_uint16;
- typedef int pcre_int16;
-#else
- #error Cannot determine a type for 16-bit unsigned integers
-#endif
-
-#if UINT_MAX == 4294967295
- typedef unsigned int pcre_uint32;
- typedef int pcre_int32;
-#elif ULONG_MAX == 4294967295
- typedef unsigned long int pcre_uint32;
- typedef long int pcre_int32;
-#else
- #error Cannot determine a type for 32-bit unsigned integers
-#endif
-
-/* All character handling must be done as unsigned characters. Otherwise there
-are problems with top-bit-set characters and functions such as isspace().
-However, we leave the interface to the outside world as char *, because that
-should make things easier for callers. We define a short type for unsigned char
-to save lots of typing. I tried "uchar", but it causes problems on Digital
-Unix, where it is defined in sys/types, so use "uschar" instead. */
-
-typedef unsigned char uschar;
-
-/* This is an unsigned int value that no character can ever have. UTF-8
-characters only go up to 0x7fffffff (though Unicode doesn't go beyond
-0x0010ffff). */
-
-#define NOTACHAR 0xffffffff
-
-/* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
-"any" and "anycrlf" at present). The following macros are used to package up
-testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various
-modules to indicate in which datablock the parameters exist, and what the
-start/end of string field names are. */
-
-#define NLTYPE_FIXED 0 /* Newline is a fixed length string */
-#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
-#define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */
-
-/* This macro checks for a newline at the given position */
-
-#define IS_NEWLINE(p) \
- ((NLBLOCK->nltype != NLTYPE_FIXED)? \
- ((p) < NLBLOCK->PSEND && \
- _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\
- utf8)) \
- : \
- ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
- (p)[0] == NLBLOCK->nl[0] && \
- (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \
- ) \
- )
-
-/* This macro checks for a newline immediately preceding the given position */
-
-#define WAS_NEWLINE(p) \
- ((NLBLOCK->nltype != NLTYPE_FIXED)? \
- ((p) > NLBLOCK->PSSTART && \
- _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
- &(NLBLOCK->nllen), utf8)) \
- : \
- ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
- (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
- (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \
- ) \
- )
-
-/* When PCRE is compiled as a C++ library, the subject pointer can be replaced
-with a custom type. This makes it possible, for example, to allow pcre_exec()
-to process subject strings that are discontinuous by using a smart pointer
-class. It must always be possible to inspect all of the subject string in
-pcre_exec() because of the way it backtracks. Two macros are required in the
-normal case, for sign-unspecified and unsigned char pointers. The former is
-used for the external interface and appears in pcre.h, which is why its name
-must begin with PCRE_. */
-
-#ifdef CUSTOM_SUBJECT_PTR
-#define PCRE_SPTR CUSTOM_SUBJECT_PTR
-#define USPTR CUSTOM_SUBJECT_PTR
-#else
-#define PCRE_SPTR const char *
-#define USPTR const unsigned char *
-#endif
-
-
-
-/* Include the public PCRE header and the definitions of UCP character property
-values. */
-
-#include "pcre.h"
-#include "ucp.h"
-
-/* When compiling for use with the Virtual Pascal compiler, these functions
-need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
-option on the command line. */
-
-#ifdef VPCOMPAT
-#define strlen(s) _strlen(s)
-#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
-#define memcmp(s,c,n) _memcmp(s,c,n)
-#define memcpy(d,s,n) _memcpy(d,s,n)
-#define memmove(d,s,n) _memmove(d,s,n)
-#define memset(s,c,n) _memset(s,c,n)
-#else /* VPCOMPAT */
-
-/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
-define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
-is set. Otherwise, include an emulating function for those systems that have
-neither (there some non-Unix environments where this is the case). */
-
-#ifndef HAVE_MEMMOVE
-#undef memmove /* some systems may have a macro */
-#ifdef HAVE_BCOPY
-#define memmove(a, b, c) bcopy(b, a, c)
-#else /* HAVE_BCOPY */
-static void *
-pcre_memmove(void *d, const void *s, size_t n)
-{
-size_t i;
-unsigned char *dest = (unsigned char *)d;
-const unsigned char *src = (const unsigned char *)s;
-if (dest > src)
- {
- dest += n;
- src += n;
- for (i = 0; i < n; ++i) *(--dest) = *(--src);
- return (void *)dest;
- }
-else
- {
- for (i = 0; i < n; ++i) *dest++ = *src++;
- return (void *)(dest - n);
- }
-}
-#define memmove(a, b, c) pcre_memmove(a, b, c)
-#endif /* not HAVE_BCOPY */
-#endif /* not HAVE_MEMMOVE */
-#endif /* not VPCOMPAT */
-
-
-/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
-in big-endian order) by default. These are used, for example, to link from the
-start of a subpattern to its alternatives and its end. The use of 2 bytes per
-offset limits the size of the compiled regex to around 64K, which is big enough
-for almost everybody. However, I received a request for an even bigger limit.
-For this reason, and also to make the code easier to maintain, the storing and
-loading of offsets from the byte string is now handled by the macros that are
-defined here.
-
-The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
-the config.h file, but can be overridden by using -D on the command line. This
-is automated on Unix systems via the "configure" command. */
-
-#if LINK_SIZE == 2
-
-#define PUT(a,n,d) \
- (a[n] = (d) >> 8), \
- (a[(n)+1] = (d) & 255)
-
-#define GET(a,n) \
- (((a)[n] << 8) | (a)[(n)+1])
-
-#define MAX_PATTERN_SIZE (1 << 16)
-
-
-#elif LINK_SIZE == 3
-
-#define PUT(a,n,d) \
- (a[n] = (d) >> 16), \
- (a[(n)+1] = (d) >> 8), \
- (a[(n)+2] = (d) & 255)
-
-#define GET(a,n) \
- (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
-
-#define MAX_PATTERN_SIZE (1 << 24)
-
-
-#elif LINK_SIZE == 4
-
-#define PUT(a,n,d) \
- (a[n] = (d) >> 24), \
- (a[(n)+1] = (d) >> 16), \
- (a[(n)+2] = (d) >> 8), \
- (a[(n)+3] = (d) & 255)
-
-#define GET(a,n) \
- (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
-
-#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
-
-
-#else
-#error LINK_SIZE must be either 2, 3, or 4
-#endif
-
-
-/* Convenience macro defined in terms of the others */
-
-#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
-
-
-/* PCRE uses some other 2-byte quantities that do not change when the size of
-offsets changes. There are used for repeat counts and for other things such as
-capturing parenthesis numbers in back references. */
-
-#define PUT2(a,n,d) \
- a[n] = (d) >> 8; \
- a[(n)+1] = (d) & 255
-
-#define GET2(a,n) \
- (((a)[n] << 8) | (a)[(n)+1])
-
-#define PUT2INC(a,n,d) PUT2(a,n,d), a += 2
-
-
-/* When UTF-8 encoding is being used, a character is no longer just a single
-byte. The macros for character handling generate simple sequences when used in
-byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
-never be called in byte mode. To make sure it can never even appear when UTF-8
-support is omitted, we don't even define it. */
-
-#ifndef SUPPORT_UTF8
-#define GETCHAR(c, eptr) c = *eptr;
-#define GETCHARTEST(c, eptr) c = *eptr;
-#define GETCHARINC(c, eptr) c = *eptr++;
-#define GETCHARINCTEST(c, eptr) c = *eptr++;
-#define GETCHARLEN(c, eptr, len) c = *eptr;
-/* #define BACKCHAR(eptr) */
-
-#else /* SUPPORT_UTF8 */
-
-/* Get the next UTF-8 character, not advancing the pointer. This is called when
-we know we are in UTF-8 mode. */
-
-#define GETCHAR(c, eptr) \
- c = *eptr; \
- if (c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- }
-
-/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
-pointer. */
-
-#define GETCHARTEST(c, eptr) \
- c = *eptr; \
- if (utf8 && c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- }
-
-/* Get the next UTF-8 character, advancing the pointer. This is called when we
-know we are in UTF-8 mode. */
-
-#define GETCHARINC(c, eptr) \
- c = *eptr++; \
- if (c >= 0xc0) \
- { \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- while (gcaa-- > 0) \
- { \
- gcss -= 6; \
- c |= (*eptr++ & 0x3f) << gcss; \
- } \
- }
-
-/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
-
-#define GETCHARINCTEST(c, eptr) \
- c = *eptr++; \
- if (utf8 && c >= 0xc0) \
- { \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- while (gcaa-- > 0) \
- { \
- gcss -= 6; \
- c |= (*eptr++ & 0x3f) << gcss; \
- } \
- }
-
-/* Get the next UTF-8 character, not advancing the pointer, incrementing length
-if there are extra bytes. This is called when we know we are in UTF-8 mode. */
-
-#define GETCHARLEN(c, eptr, len) \
- c = *eptr; \
- if (c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- len += gcaa; \
- }
-
-/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
-pointer, incrementing length if there are extra bytes. This is called when we
-know we are in UTF-8 mode. */
-
-#define GETCHARLENTEST(c, eptr, len) \
- c = *eptr; \
- if (utf8 && c >= 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- len += gcaa; \
- }
-
-/* If the pointer is not at the start of a character, move it back until
-it is. This is called only in UTF-8 mode - we don't put a test within the macro
-because almost all calls are already within a block of UTF-8 only code. */
-
-#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
-
-#endif
-
-
-/* In case there is no definition of offsetof() provided - though any proper
-Standard C system should have one. */
-
-#ifndef offsetof
-#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
-#endif
-
-
-/* These are the public options that can change during matching. */
-
-#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
-
-/* Private flags containing information about the compiled regex. They used to
-live at the top end of the options word, but that got almost full, so now they
-are in a 16-bit flags word. */
-
-#define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */
-#define PCRE_FIRSTSET 0x0002 /* first_byte is set */
-#define PCRE_REQCHSET 0x0004 /* req_byte is set */
-#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */
-#define PCRE_JCHANGED 0x0010 /* j option used in regex */
-#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */
-
-/* Options for the "extra" block produced by pcre_study(). */
-
-#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
-
-/* Masks for identifying the public options that are permitted at compile
-time, run time, or study time, respectively. */
-
-#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \
- PCRE_NEWLINE_ANYCRLF)
-
-#define PUBLIC_COMPILE_OPTIONS \
- (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
- PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
- PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
- PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
- PCRE_JAVASCRIPT_COMPAT)
-
-#define PUBLIC_EXEC_OPTIONS \
- (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
- PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
- PCRE_NO_START_OPTIMIZE)
-
-#define PUBLIC_DFA_EXEC_OPTIONS \
- (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
- PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \
- PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE)
-
-#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
-
-/* Magic number to provide a small check against being handed junk. Also used
-to detect whether a pattern was compiled on a host of different endianness. */
-
-#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
-
-/* Negative values for the firstchar and reqchar variables */
-
-#define REQ_UNSET (-2)
-#define REQ_NONE (-1)
-
-/* The maximum remaining length of subject we are prepared to search for a
-req_byte match. */
-
-#define REQ_BYTE_MAX 1000
-
-/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
-variable-length repeat, or a anything other than literal characters. */
-
-#define REQ_CASELESS 0x0100 /* indicates caselessness */
-#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
-
-/* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in
-environments where these macros are defined elsewhere. Unfortunately, there
-is no way to do the same for the typedef. */
-
-typedef int BOOL;
-
-#ifndef FALSE
-#define FALSE 0
-#define TRUE 1
-#endif
-
-/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
-character constants like '*' because the compiler would emit their EBCDIC code,
-which is different from their ASCII/UTF-8 code. Instead we define macros for
-the characters so that they always use the ASCII/UTF-8 code when UTF-8 support
-is enabled. When UTF-8 support is not enabled, the definitions use character
-literals. Both character and string versions of each character are needed, and
-there are some longer strings as well.
-
-This means that, on EBCDIC platforms, the PCRE library can handle either
-EBCDIC, or UTF-8, but not both. To support both in the same compiled library
-would need different lookups depending on whether PCRE_UTF8 was set or not.
-This would make it impossible to use characters in switch/case statements,
-which would reduce performance. For a theoretical use (which nobody has asked
-for) in a minority area (EBCDIC platforms), this is not sensible. Any
-application that did need both could compile two versions of the library, using
-macros to give the functions distinct names. */
-
-#ifndef SUPPORT_UTF8
-
-/* UTF-8 support is not enabled; use the platform-dependent character literals
-so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
-
-#define CHAR_HT '\t'
-#define CHAR_VT '\v'
-#define CHAR_FF '\f'
-#define CHAR_CR '\r'
-#define CHAR_NL '\n'
-#define CHAR_BS '\b'
-#define CHAR_BEL '\a'
-#ifdef EBCDIC
-#define CHAR_ESC '\047'
-#define CHAR_DEL '\007'
-#else
-#define CHAR_ESC '\033'
-#define CHAR_DEL '\177'
-#endif
-
-#define CHAR_SPACE ' '
-#define CHAR_EXCLAMATION_MARK '!'
-#define CHAR_QUOTATION_MARK '"'
-#define CHAR_NUMBER_SIGN '#'
-#define CHAR_DOLLAR_SIGN '$'
-#define CHAR_PERCENT_SIGN '%'
-#define CHAR_AMPERSAND '&'
-#define CHAR_APOSTROPHE '\''
-#define CHAR_LEFT_PARENTHESIS '('
-#define CHAR_RIGHT_PARENTHESIS ')'
-#define CHAR_ASTERISK '*'
-#define CHAR_PLUS '+'
-#define CHAR_COMMA ','
-#define CHAR_MINUS '-'
-#define CHAR_DOT '.'
-#define CHAR_SLASH '/'
-#define CHAR_0 '0'
-#define CHAR_1 '1'
-#define CHAR_2 '2'
-#define CHAR_3 '3'
-#define CHAR_4 '4'
-#define CHAR_5 '5'
-#define CHAR_6 '6'
-#define CHAR_7 '7'
-#define CHAR_8 '8'
-#define CHAR_9 '9'
-#define CHAR_COLON ':'
-#define CHAR_SEMICOLON ';'
-#define CHAR_LESS_THAN_SIGN '<'
-#define CHAR_EQUALS_SIGN '='
-#define CHAR_GREATER_THAN_SIGN '>'
-#define CHAR_QUESTION_MARK '?'
-#define CHAR_COMMERCIAL_AT '@'
-#define CHAR_A 'A'
-#define CHAR_B 'B'
-#define CHAR_C 'C'
-#define CHAR_D 'D'
-#define CHAR_E 'E'
-#define CHAR_F 'F'
-#define CHAR_G 'G'
-#define CHAR_H 'H'
-#define CHAR_I 'I'
-#define CHAR_J 'J'
-#define CHAR_K 'K'
-#define CHAR_L 'L'
-#define CHAR_M 'M'
-#define CHAR_N 'N'
-#define CHAR_O 'O'
-#define CHAR_P 'P'
-#define CHAR_Q 'Q'
-#define CHAR_R 'R'
-#define CHAR_S 'S'
-#define CHAR_T 'T'
-#define CHAR_U 'U'
-#define CHAR_V 'V'
-#define CHAR_W 'W'
-#define CHAR_X 'X'
-#define CHAR_Y 'Y'
-#define CHAR_Z 'Z'
-#define CHAR_LEFT_SQUARE_BRACKET '['
-#define CHAR_BACKSLASH '\\'
-#define CHAR_RIGHT_SQUARE_BRACKET ']'
-#define CHAR_CIRCUMFLEX_ACCENT '^'
-#define CHAR_UNDERSCORE '_'
-#define CHAR_GRAVE_ACCENT '`'
-#define CHAR_a 'a'
-#define CHAR_b 'b'
-#define CHAR_c 'c'
-#define CHAR_d 'd'
-#define CHAR_e 'e'
-#define CHAR_f 'f'
-#define CHAR_g 'g'
-#define CHAR_h 'h'
-#define CHAR_i 'i'
-#define CHAR_j 'j'
-#define CHAR_k 'k'
-#define CHAR_l 'l'
-#define CHAR_m 'm'
-#define CHAR_n 'n'
-#define CHAR_o 'o'
-#define CHAR_p 'p'
-#define CHAR_q 'q'
-#define CHAR_r 'r'
-#define CHAR_s 's'
-#define CHAR_t 't'
-#define CHAR_u 'u'
-#define CHAR_v 'v'
-#define CHAR_w 'w'
-#define CHAR_x 'x'
-#define CHAR_y 'y'
-#define CHAR_z 'z'
-#define CHAR_LEFT_CURLY_BRACKET '{'
-#define CHAR_VERTICAL_LINE '|'
-#define CHAR_RIGHT_CURLY_BRACKET '}'
-#define CHAR_TILDE '~'
-
-#define STR_HT "\t"
-#define STR_VT "\v"
-#define STR_FF "\f"
-#define STR_CR "\r"
-#define STR_NL "\n"
-#define STR_BS "\b"
-#define STR_BEL "\a"
-#ifdef EBCDIC
-#define STR_ESC "\047"
-#define STR_DEL "\007"
-#else
-#define STR_ESC "\033"
-#define STR_DEL "\177"
-#endif
-
-#define STR_SPACE " "
-#define STR_EXCLAMATION_MARK "!"
-#define STR_QUOTATION_MARK "\""
-#define STR_NUMBER_SIGN "#"
-#define STR_DOLLAR_SIGN "$"
-#define STR_PERCENT_SIGN "%"
-#define STR_AMPERSAND "&"
-#define STR_APOSTROPHE "'"
-#define STR_LEFT_PARENTHESIS "("
-#define STR_RIGHT_PARENTHESIS ")"
-#define STR_ASTERISK "*"
-#define STR_PLUS "+"
-#define STR_COMMA ","
-#define STR_MINUS "-"
-#define STR_DOT "."
-#define STR_SLASH "/"
-#define STR_0 "0"
-#define STR_1 "1"
-#define STR_2 "2"
-#define STR_3 "3"
-#define STR_4 "4"
-#define STR_5 "5"
-#define STR_6 "6"
-#define STR_7 "7"
-#define STR_8 "8"
-#define STR_9 "9"
-#define STR_COLON ":"
-#define STR_SEMICOLON ";"
-#define STR_LESS_THAN_SIGN "<"
-#define STR_EQUALS_SIGN "="
-#define STR_GREATER_THAN_SIGN ">"
-#define STR_QUESTION_MARK "?"
-#define STR_COMMERCIAL_AT "@"
-#define STR_A "A"
-#define STR_B "B"
-#define STR_C "C"
-#define STR_D "D"
-#define STR_E "E"
-#define STR_F "F"
-#define STR_G "G"
-#define STR_H "H"
-#define STR_I "I"
-#define STR_J "J"
-#define STR_K "K"
-#define STR_L "L"
-#define STR_M "M"
-#define STR_N "N"
-#define STR_O "O"
-#define STR_P "P"
-#define STR_Q "Q"
-#define STR_R "R"
-#define STR_S "S"
-#define STR_T "T"
-#define STR_U "U"
-#define STR_V "V"
-#define STR_W "W"
-#define STR_X "X"
-#define STR_Y "Y"
-#define STR_Z "Z"
-#define STR_LEFT_SQUARE_BRACKET "["
-#define STR_BACKSLASH "\\"
-#define STR_RIGHT_SQUARE_BRACKET "]"
-#define STR_CIRCUMFLEX_ACCENT "^"
-#define STR_UNDERSCORE "_"
-#define STR_GRAVE_ACCENT "`"
-#define STR_a "a"
-#define STR_b "b"
-#define STR_c "c"
-#define STR_d "d"
-#define STR_e "e"
-#define STR_f "f"
-#define STR_g "g"
-#define STR_h "h"
-#define STR_i "i"
-#define STR_j "j"
-#define STR_k "k"
-#define STR_l "l"
-#define STR_m "m"
-#define STR_n "n"
-#define STR_o "o"
-#define STR_p "p"
-#define STR_q "q"
-#define STR_r "r"
-#define STR_s "s"
-#define STR_t "t"
-#define STR_u "u"
-#define STR_v "v"
-#define STR_w "w"
-#define STR_x "x"
-#define STR_y "y"
-#define STR_z "z"
-#define STR_LEFT_CURLY_BRACKET "{"
-#define STR_VERTICAL_LINE "|"
-#define STR_RIGHT_CURLY_BRACKET "}"
-#define STR_TILDE "~"
-
-#define STRING_ACCEPT0 "ACCEPT\0"
-#define STRING_COMMIT0 "COMMIT\0"
-#define STRING_F0 "F\0"
-#define STRING_FAIL0 "FAIL\0"
-#define STRING_PRUNE0 "PRUNE\0"
-#define STRING_SKIP0 "SKIP\0"
-#define STRING_THEN "THEN"
-
-#define STRING_alpha0 "alpha\0"
-#define STRING_lower0 "lower\0"
-#define STRING_upper0 "upper\0"
-#define STRING_alnum0 "alnum\0"
-#define STRING_ascii0 "ascii\0"
-#define STRING_blank0 "blank\0"
-#define STRING_cntrl0 "cntrl\0"
-#define STRING_digit0 "digit\0"
-#define STRING_graph0 "graph\0"
-#define STRING_print0 "print\0"
-#define STRING_punct0 "punct\0"
-#define STRING_space0 "space\0"
-#define STRING_word0 "word\0"
-#define STRING_xdigit "xdigit"
-
-#define STRING_DEFINE "DEFINE"
-
-#define STRING_CR_RIGHTPAR "CR)"
-#define STRING_LF_RIGHTPAR "LF)"
-#define STRING_CRLF_RIGHTPAR "CRLF)"
-#define STRING_ANY_RIGHTPAR "ANY)"
-#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
-#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
-#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
-#define STRING_UTF8_RIGHTPAR "UTF8)"
-
-#else /* SUPPORT_UTF8 */
-
-/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
-works in both modes non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode
-only. */
-
-#define CHAR_HT '\011'
-#define CHAR_VT '\013'
-#define CHAR_FF '\014'
-#define CHAR_CR '\015'
-#define CHAR_NL '\012'
-#define CHAR_BS '\010'
-#define CHAR_BEL '\007'
-#define CHAR_ESC '\033'
-#define CHAR_DEL '\177'
-
-#define CHAR_SPACE '\040'
-#define CHAR_EXCLAMATION_MARK '\041'
-#define CHAR_QUOTATION_MARK '\042'
-#define CHAR_NUMBER_SIGN '\043'
-#define CHAR_DOLLAR_SIGN '\044'
-#define CHAR_PERCENT_SIGN '\045'
-#define CHAR_AMPERSAND '\046'
-#define CHAR_APOSTROPHE '\047'
-#define CHAR_LEFT_PARENTHESIS '\050'
-#define CHAR_RIGHT_PARENTHESIS '\051'
-#define CHAR_ASTERISK '\052'
-#define CHAR_PLUS '\053'
-#define CHAR_COMMA '\054'
-#define CHAR_MINUS '\055'
-#define CHAR_DOT '\056'
-#define CHAR_SLASH '\057'
-#define CHAR_0 '\060'
-#define CHAR_1 '\061'
-#define CHAR_2 '\062'
-#define CHAR_3 '\063'
-#define CHAR_4 '\064'
-#define CHAR_5 '\065'
-#define CHAR_6 '\066'
-#define CHAR_7 '\067'
-#define CHAR_8 '\070'
-#define CHAR_9 '\071'
-#define CHAR_COLON '\072'
-#define CHAR_SEMICOLON '\073'
-#define CHAR_LESS_THAN_SIGN '\074'
-#define CHAR_EQUALS_SIGN '\075'
-#define CHAR_GREATER_THAN_SIGN '\076'
-#define CHAR_QUESTION_MARK '\077'
-#define CHAR_COMMERCIAL_AT '\100'
-#define CHAR_A '\101'
-#define CHAR_B '\102'
-#define CHAR_C '\103'
-#define CHAR_D '\104'
-#define CHAR_E '\105'
-#define CHAR_F '\106'
-#define CHAR_G '\107'
-#define CHAR_H '\110'
-#define CHAR_I '\111'
-#define CHAR_J '\112'
-#define CHAR_K '\113'
-#define CHAR_L '\114'
-#define CHAR_M '\115'
-#define CHAR_N '\116'
-#define CHAR_O '\117'
-#define CHAR_P '\120'
-#define CHAR_Q '\121'
-#define CHAR_R '\122'
-#define CHAR_S '\123'
-#define CHAR_T '\124'
-#define CHAR_U '\125'
-#define CHAR_V '\126'
-#define CHAR_W '\127'
-#define CHAR_X '\130'
-#define CHAR_Y '\131'
-#define CHAR_Z '\132'
-#define CHAR_LEFT_SQUARE_BRACKET '\133'
-#define CHAR_BACKSLASH '\134'
-#define CHAR_RIGHT_SQUARE_BRACKET '\135'
-#define CHAR_CIRCUMFLEX_ACCENT '\136'
-#define CHAR_UNDERSCORE '\137'
-#define CHAR_GRAVE_ACCENT '\140'
-#define CHAR_a '\141'
-#define CHAR_b '\142'
-#define CHAR_c '\143'
-#define CHAR_d '\144'
-#define CHAR_e '\145'
-#define CHAR_f '\146'
-#define CHAR_g '\147'
-#define CHAR_h '\150'
-#define CHAR_i '\151'
-#define CHAR_j '\152'
-#define CHAR_k '\153'
-#define CHAR_l '\154'
-#define CHAR_m '\155'
-#define CHAR_n '\156'
-#define CHAR_o '\157'
-#define CHAR_p '\160'
-#define CHAR_q '\161'
-#define CHAR_r '\162'
-#define CHAR_s '\163'
-#define CHAR_t '\164'
-#define CHAR_u '\165'
-#define CHAR_v '\166'
-#define CHAR_w '\167'
-#define CHAR_x '\170'
-#define CHAR_y '\171'
-#define CHAR_z '\172'
-#define CHAR_LEFT_CURLY_BRACKET '\173'
-#define CHAR_VERTICAL_LINE '\174'
-#define CHAR_RIGHT_CURLY_BRACKET '\175'
-#define CHAR_TILDE '\176'
-
-#define STR_HT "\011"
-#define STR_VT "\013"
-#define STR_FF "\014"
-#define STR_CR "\015"
-#define STR_NL "\012"
-#define STR_BS "\010"
-#define STR_BEL "\007"
-#define STR_ESC "\033"
-#define STR_DEL "\177"
-
-#define STR_SPACE "\040"
-#define STR_EXCLAMATION_MARK "\041"
-#define STR_QUOTATION_MARK "\042"
-#define STR_NUMBER_SIGN "\043"
-#define STR_DOLLAR_SIGN "\044"
-#define STR_PERCENT_SIGN "\045"
-#define STR_AMPERSAND "\046"
-#define STR_APOSTROPHE "\047"
-#define STR_LEFT_PARENTHESIS "\050"
-#define STR_RIGHT_PARENTHESIS "\051"
-#define STR_ASTERISK "\052"
-#define STR_PLUS "\053"
-#define STR_COMMA "\054"
-#define STR_MINUS "\055"
-#define STR_DOT "\056"
-#define STR_SLASH "\057"
-#define STR_0 "\060"
-#define STR_1 "\061"
-#define STR_2 "\062"
-#define STR_3 "\063"
-#define STR_4 "\064"
-#define STR_5 "\065"
-#define STR_6 "\066"
-#define STR_7 "\067"
-#define STR_8 "\070"
-#define STR_9 "\071"
-#define STR_COLON "\072"
-#define STR_SEMICOLON "\073"
-#define STR_LESS_THAN_SIGN "\074"
-#define STR_EQUALS_SIGN "\075"
-#define STR_GREATER_THAN_SIGN "\076"
-#define STR_QUESTION_MARK "\077"
-#define STR_COMMERCIAL_AT "\100"
-#define STR_A "\101"
-#define STR_B "\102"
-#define STR_C "\103"
-#define STR_D "\104"
-#define STR_E "\105"
-#define STR_F "\106"
-#define STR_G "\107"
-#define STR_H "\110"
-#define STR_I "\111"
-#define STR_J "\112"
-#define STR_K "\113"
-#define STR_L "\114"
-#define STR_M "\115"
-#define STR_N "\116"
-#define STR_O "\117"
-#define STR_P "\120"
-#define STR_Q "\121"
-#define STR_R "\122"
-#define STR_S "\123"
-#define STR_T "\124"
-#define STR_U "\125"
-#define STR_V "\126"
-#define STR_W "\127"
-#define STR_X "\130"
-#define STR_Y "\131"
-#define STR_Z "\132"
-#define STR_LEFT_SQUARE_BRACKET "\133"
-#define STR_BACKSLASH "\134"
-#define STR_RIGHT_SQUARE_BRACKET "\135"
-#define STR_CIRCUMFLEX_ACCENT "\136"
-#define STR_UNDERSCORE "\137"
-#define STR_GRAVE_ACCENT "\140"
-#define STR_a "\141"
-#define STR_b "\142"
-#define STR_c "\143"
-#define STR_d "\144"
-#define STR_e "\145"
-#define STR_f "\146"
-#define STR_g "\147"
-#define STR_h "\150"
-#define STR_i "\151"
-#define STR_j "\152"
-#define STR_k "\153"
-#define STR_l "\154"
-#define STR_m "\155"
-#define STR_n "\156"
-#define STR_o "\157"
-#define STR_p "\160"
-#define STR_q "\161"
-#define STR_r "\162"
-#define STR_s "\163"
-#define STR_t "\164"
-#define STR_u "\165"
-#define STR_v "\166"
-#define STR_w "\167"
-#define STR_x "\170"
-#define STR_y "\171"
-#define STR_z "\172"
-#define STR_LEFT_CURLY_BRACKET "\173"
-#define STR_VERTICAL_LINE "\174"
-#define STR_RIGHT_CURLY_BRACKET "\175"
-#define STR_TILDE "\176"
-
-#define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0"
-#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
-#define STRING_F0 STR_F "\0"
-#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
-#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
-#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
-#define STRING_THEN STR_T STR_H STR_E STR_N
-
-#define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0"
-#define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0"
-#define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0"
-#define STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0"
-#define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0"
-#define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0"
-#define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0"
-#define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0"
-#define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0"
-#define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0"
-#define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0"
-#define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0"
-#define STRING_word0 STR_w STR_o STR_r STR_d "\0"
-#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t
-
-#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
-
-#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
-#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
-#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
-#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
-
-#endif /* SUPPORT_UTF8 */
-
-/* Escape items that are just an encoding of a particular data value. */
-
-#ifndef ESC_e
-#define ESC_e CHAR_ESC
-#endif
-
-#ifndef ESC_f
-#define ESC_f CHAR_FF
-#endif
-
-#ifndef ESC_n
-#define ESC_n CHAR_NL
-#endif
-
-#ifndef ESC_r
-#define ESC_r CHAR_CR
-#endif
-
-/* We can't officially use ESC_t because it is a POSIX reserved identifier
-(presumably because of all the others like size_t). */
-
-#ifndef ESC_tee
-#define ESC_tee CHAR_HT
-#endif
-
-/* Codes for different types of Unicode property */
-
-#define PT_ANY 0 /* Any property - matches all chars */
-#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
-#define PT_GC 2 /* General characteristic (e.g. L) */
-#define PT_PC 3 /* Particular characteristic (e.g. Lu) */
-#define PT_SC 4 /* Script (e.g. Han) */
-
-/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
-contain UTF-8 characters with values greater than 255. */
-
-#define XCL_NOT 0x01 /* Flag: this is a negative class */
-#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
-
-#define XCL_END 0 /* Marks end of individual items */
-#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
-#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
-#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
-#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
-
-/* These are escaped items that aren't just an encoding of a particular data
-value such as \n. They must have non-zero values, as check_escape() returns
-their negation. Also, they must appear in the same order as in the opcode
-definitions below, up to ESC_z. There's a dummy for OP_ANY because it
-corresponds to "." rather than an escape sequence, and another for OP_ALLANY
-(which is used for [^] in JavaScript compatibility mode).
-
-The final escape must be ESC_REF as subsequent values are used for
-backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
-greater than ESC_b and less than ESC_Z to detect the types that may be
-repeated. These are the types that consume characters. If any new escapes are
-put in between that don't consume a character, that code will have to change.
-*/
-
-enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
- ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
- ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
- ESC_REF };
-
-
-/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
-OP_EOD must correspond in order to the list of escapes immediately above.
-
-*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
-that follow must also be updated to match. There is also a table called
-"coptable" in pcre_dfa_exec.c that must be updated. */
-
-enum {
- OP_END, /* 0 End of pattern */
-
- /* Values corresponding to backslashed metacharacters */
-
- OP_SOD, /* 1 Start of data: \A */
- OP_SOM, /* 2 Start of match (subject + offset): \G */
- OP_SET_SOM, /* 3 Set start of match (\K) */
- OP_NOT_WORD_BOUNDARY, /* 4 \B */
- OP_WORD_BOUNDARY, /* 5 \b */
- OP_NOT_DIGIT, /* 6 \D */
- OP_DIGIT, /* 7 \d */
- OP_NOT_WHITESPACE, /* 8 \S */
- OP_WHITESPACE, /* 9 \s */
- OP_NOT_WORDCHAR, /* 10 \W */
- OP_WORDCHAR, /* 11 \w */
- OP_ANY, /* 12 Match any character (subject to DOTALL) */
- OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
- OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
- OP_NOTPROP, /* 15 \P (not Unicode property) */
- OP_PROP, /* 16 \p (Unicode property) */
- OP_ANYNL, /* 17 \R (any newline sequence) */
- OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
- OP_HSPACE, /* 19 \h (horizontal whitespace) */
- OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
- OP_VSPACE, /* 21 \v (vertical whitespace) */
- OP_EXTUNI, /* 22 \X (extended Unicode sequence */
- OP_EODN, /* 23 End of data or \n at end of data: \Z. */
- OP_EOD, /* 24 End of data: \z */
-
- OP_OPT, /* 25 Set runtime options */
- OP_CIRC, /* 26 Start of line - varies with multiline switch */
- OP_DOLL, /* 27 End of line - varies with multiline switch */
- OP_CHAR, /* 28 Match one character, casefully */
- OP_CHARNC, /* 29 Match one character, caselessly */
- OP_NOT, /* 30 Match one character, not the following one */
-
- OP_STAR, /* 31 The maximizing and minimizing versions of */
- OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */
- OP_PLUS, /* 33 the minimizing one second. */
- OP_MINPLUS, /* 34 This first set applies to single characters.*/
- OP_QUERY, /* 35 */
- OP_MINQUERY, /* 36 */
-
- OP_UPTO, /* 37 From 0 to n matches */
- OP_MINUPTO, /* 38 */
- OP_EXACT, /* 39 Exactly n matches */
-
- OP_POSSTAR, /* 40 Possessified star */
- OP_POSPLUS, /* 41 Possessified plus */
- OP_POSQUERY, /* 42 Posesssified query */
- OP_POSUPTO, /* 43 Possessified upto */
-
- OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */
- OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */
- OP_NOTPLUS, /* 46 the minimizing one second. They must be in */
- OP_NOTMINPLUS, /* 47 exactly the same order as those above. */
- OP_NOTQUERY, /* 48 This set applies to "not" single characters. */
- OP_NOTMINQUERY, /* 49 */
-
- OP_NOTUPTO, /* 50 From 0 to n matches */
- OP_NOTMINUPTO, /* 51 */
- OP_NOTEXACT, /* 52 Exactly n matches */
-
- OP_NOTPOSSTAR, /* 53 Possessified versions */
- OP_NOTPOSPLUS, /* 54 */
- OP_NOTPOSQUERY, /* 55 */
- OP_NOTPOSUPTO, /* 56 */
-
- OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */
- OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */
- OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */
- OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */
- OP_TYPEQUERY, /* 61 This set applies to character types such as \d */
- OP_TYPEMINQUERY, /* 62 */
-
- OP_TYPEUPTO, /* 63 From 0 to n matches */
- OP_TYPEMINUPTO, /* 64 */
- OP_TYPEEXACT, /* 65 Exactly n matches */
-
- OP_TYPEPOSSTAR, /* 66 Possessified versions */
- OP_TYPEPOSPLUS, /* 67 */
- OP_TYPEPOSQUERY, /* 68 */
- OP_TYPEPOSUPTO, /* 69 */
-
- OP_CRSTAR, /* 70 The maximizing and minimizing versions of */
- OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */
- OP_CRPLUS, /* 72 the minimizing one second. These codes must */
- OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */
- OP_CRQUERY, /* 74 These are for character classes and back refs */
- OP_CRMINQUERY, /* 75 */
- OP_CRRANGE, /* 76 These are different to the three sets above. */
- OP_CRMINRANGE, /* 77 */
-
- OP_CLASS, /* 78 Match a character class, chars < 256 only */
- OP_NCLASS, /* 79 Same, but the bitmap was created from a negative
- class - the difference is relevant only when a UTF-8
- character > 255 is encountered. */
-
- OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the
- class. This does both positive and negative. */
-
- OP_REF, /* 81 Match a back reference */
- OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */
- OP_CALLOUT, /* 83 Call out to external function if provided */
-
- OP_ALT, /* 84 Start of alternation */
- OP_KET, /* 85 End of group that doesn't have an unbounded repeat */
- OP_KETRMAX, /* 86 These two must remain together and in this */
- OP_KETRMIN, /* 87 order. They are for groups the repeat for ever. */
-
- /* The assertions must come before BRA, CBRA, ONCE, and COND.*/
-
- OP_ASSERT, /* 88 Positive lookahead */
- OP_ASSERT_NOT, /* 89 Negative lookahead */
- OP_ASSERTBACK, /* 90 Positive lookbehind */
- OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
- OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */
-
- /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
- as there's a test for >= ONCE for a subpattern that isn't an assertion. */
-
- OP_ONCE, /* 93 Atomic group */
- OP_BRA, /* 94 Start of non-capturing bracket */
- OP_CBRA, /* 95 Start of capturing bracket */
- OP_COND, /* 96 Conditional group */
-
- /* These three must follow the previous three, in the same order. There's a
- check for >= SBRA to distinguish the two sets. */
-
- OP_SBRA, /* 97 Start of non-capturing bracket, check empty */
- OP_SCBRA, /* 98 Start of capturing bracket, check empty */
- OP_SCOND, /* 99 Conditional group, check empty */
-
- OP_CREF, /* 100 Used to hold a capture number as condition */
- OP_RREF, /* 101 Used to hold a recursion number as condition */
- OP_DEF, /* 102 The DEFINE condition */
-
- OP_BRAZERO, /* 103 These two must remain together and in this */
- OP_BRAMINZERO, /* 104 order. */
-
- /* These are backtracking control verbs */
-
- OP_PRUNE, /* 105 */
- OP_SKIP, /* 106 */
- OP_THEN, /* 107 */
- OP_COMMIT, /* 108 */
-
- /* These are forced failure and success verbs */
-
- OP_FAIL, /* 109 */
- OP_ACCEPT, /* 110 */
-
- /* This is used to skip a subpattern with a {0} quantifier */
-
- OP_SKIPZERO /* 111 */
-};
-
-
-/* This macro defines textual names for all the opcodes. These are used only
-for debugging. The macro is referenced only in pcre_printint.c. */
-
-#define OP_NAME_LIST \
- "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
- "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
- "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
- "extuni", "\\Z", "\\z", \
- "Opt", "^", "$", "char", "charnc", "not", \
- "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
- "*+","++", "?+", "{", \
- "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
- "*+","++", "?+", "{", \
- "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
- "*+","++", "?+", "{", \
- "*", "*?", "+", "+?", "?", "??", "{", "{", \
- "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
- "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
- "AssertB", "AssertB not", "Reverse", \
- "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
- "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \
- "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
- "Skip zero"
-
-
-/* This macro defines the length of fixed length operations in the compiled
-regex. The lengths are used when searching for specific things, and also in the
-debugging printing of a compiled regex. We use a macro so that it can be
-defined close to the definitions of the opcodes themselves.
-
-As things have been extended, some of these are no longer fixed lenths, but are
-minima instead. For example, the length of a single-character repeat may vary
-in UTF-8 mode. The code that uses this table must know about such things. */
-
-#define OP_LENGTHS \
- 1, /* End */ \
- 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
- 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
- 1, 1, 1, /* Any, AllAny, Anybyte */ \
- 3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \
- 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
- 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
- 2, /* Char - the minimum length */ \
- 2, /* Charnc - the minimum length */ \
- 2, /* not */ \
- /* Positive single-char repeats ** These are */ \
- 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
- 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
- 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
- /* Negative single-char repeats - only for chars < 256 */ \
- 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
- 4, 4, 4, /* NOT upto, minupto, exact */ \
- 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \
- /* Positive type repeats */ \
- 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
- 4, 4, 4, /* Type upto, minupto, exact */ \
- 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \
- /* Character class & ref repeats */ \
- 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
- 5, 5, /* CRRANGE, CRMINRANGE */ \
- 33, /* CLASS */ \
- 33, /* NCLASS */ \
- 0, /* XCLASS - variable length */ \
- 3, /* REF */ \
- 1+LINK_SIZE, /* RECURSE */ \
- 2+2*LINK_SIZE, /* CALLOUT */ \
- 1+LINK_SIZE, /* Alt */ \
- 1+LINK_SIZE, /* Ket */ \
- 1+LINK_SIZE, /* KetRmax */ \
- 1+LINK_SIZE, /* KetRmin */ \
- 1+LINK_SIZE, /* Assert */ \
- 1+LINK_SIZE, /* Assert not */ \
- 1+LINK_SIZE, /* Assert behind */ \
- 1+LINK_SIZE, /* Assert behind not */ \
- 1+LINK_SIZE, /* Reverse */ \
- 1+LINK_SIZE, /* ONCE */ \
- 1+LINK_SIZE, /* BRA */ \
- 3+LINK_SIZE, /* CBRA */ \
- 1+LINK_SIZE, /* COND */ \
- 1+LINK_SIZE, /* SBRA */ \
- 3+LINK_SIZE, /* SCBRA */ \
- 1+LINK_SIZE, /* SCOND */ \
- 3, /* CREF */ \
- 3, /* RREF */ \
- 1, /* DEF */ \
- 1, 1, /* BRAZERO, BRAMINZERO */ \
- 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
- 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */
-
-
-/* A magic value for OP_RREF to indicate the "any recursion" condition. */
-
-#define RREF_ANY 0xffff
-
-/* Error code numbers. They are given names so that they can more easily be
-tracked. */
-
-enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
- ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
- ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
- ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
- ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
- ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
- ERR60, ERR61, ERR62, ERR63, ERR64 };
-
-/* The real format of the start of the pcre block; the index of names and the
-code vector run on as long as necessary after the end. We store an explicit
-offset to the name table so that if a regex is compiled on one host, saved, and
-then run on another where the size of pointers is different, all might still
-be well. For the case of compiled-on-4 and run-on-8, we include an extra
-pointer that is always NULL. For future-proofing, a few dummy fields were
-originally included - even though you can never get this planning right - but
-there is only one left now.
-
-NOTE NOTE NOTE:
-Because people can now save and re-use compiled patterns, any additions to this
-structure should be made at the end, and something earlier (e.g. a new
-flag in the options or one of the dummy fields) should indicate that the new
-fields are present. Currently PCRE always sets the dummy fields to zero.
-NOTE NOTE NOTE:
-*/
-
-typedef struct real_pcre {
- pcre_uint32 magic_number;
- pcre_uint32 size; /* Total that was malloced */
- pcre_uint32 options; /* Public options */
- pcre_uint16 flags; /* Private flags */
- pcre_uint16 dummy1; /* For future use */
- pcre_uint16 top_bracket;
- pcre_uint16 top_backref;
- pcre_uint16 first_byte;
- pcre_uint16 req_byte;
- pcre_uint16 name_table_offset; /* Offset to name table that follows */
- pcre_uint16 name_entry_size; /* Size of any name items */
- pcre_uint16 name_count; /* Number of name items */
- pcre_uint16 ref_count; /* Reference count */
-
- const unsigned char *tables; /* Pointer to tables or NULL for std */
- const unsigned char *nullpad; /* NULL padding */
-} real_pcre;
-
-/* The format of the block used to store data from pcre_study(). The same
-remark (see NOTE above) about extending this structure applies. */
-
-typedef struct pcre_study_data {
- pcre_uint32 size; /* Total that was malloced */
- pcre_uint32 options;
- uschar start_bits[32];
-} pcre_study_data;
-
-/* Structure for passing "static" information around between the functions
-doing the compiling, so that they are thread-safe. */
-
-typedef struct compile_data {
- const uschar *lcc; /* Points to lower casing table */
- const uschar *fcc; /* Points to case-flipping table */
- const uschar *cbits; /* Points to character type table */
- const uschar *ctypes; /* Points to table of type maps */
- const uschar *start_workspace;/* The start of working space */
- const uschar *start_code; /* The start of the compiled code */
- const uschar *start_pattern; /* The start of the pattern */
- const uschar *end_pattern; /* The end of the pattern */
- uschar *hwm; /* High watermark of workspace */
- uschar *name_table; /* The name/number table */
- int names_found; /* Number of entries so far */
- int name_entry_size; /* Size of each entry */
- int bracount; /* Count of capturing parens as we compile */
- int final_bracount; /* Saved value after first pass */
- int top_backref; /* Maximum back reference */
- unsigned int backref_map; /* Bitmap of low back refs */
- int external_options; /* External (initial) options */
- int external_flags; /* External flag bits to be set */
- int req_varyopt; /* "After variable item" flag for reqbyte */
- BOOL had_accept; /* (*ACCEPT) encountered */
- int nltype; /* Newline type */
- int nllen; /* Newline string length */
- uschar nl[4]; /* Newline string when fixed length */
-} compile_data;
-
-/* Structure for maintaining a chain of pointers to the currently incomplete
-branches, for testing for left recursion. */
-
-typedef struct branch_chain {
- struct branch_chain *outer;
- uschar *current;
-} branch_chain;
-
-/* Structure for items in a linked list that represents an explicit recursive
-call within the pattern. */
-
-typedef struct recursion_info {
- struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
- int group_num; /* Number of group that was called */
- const uschar *after_call; /* "Return value": points after the call in the expr */
- USPTR save_start; /* Old value of mstart */
- int *offset_save; /* Pointer to start of saved offsets */
- int saved_max; /* Number of saved offsets */
-} recursion_info;
-
-/* Structure for building a chain of data for holding the values of the subject
-pointer at the start of each subpattern, so as to detect when an empty string
-has been matched by a subpattern - to break infinite loops. */
-
-typedef struct eptrblock {
- struct eptrblock *epb_prev;
- USPTR epb_saved_eptr;
-} eptrblock;
-
-
-/* Structure for passing "static" information around between the functions
-doing traditional NFA matching, so that they are thread-safe. */
-
-typedef struct match_data {
- unsigned long int match_call_count; /* As it says */
- unsigned long int match_limit; /* As it says */
- unsigned long int match_limit_recursion; /* As it says */
- int *offset_vector; /* Offset vector */
- int offset_end; /* One past the end */
- int offset_max; /* The maximum usable for return data */
- int nltype; /* Newline type */
- int nllen; /* Newline string length */
- uschar nl[4]; /* Newline string when fixed */
- const uschar *lcc; /* Points to lower casing table */
- const uschar *ctypes; /* Points to table of type maps */
- BOOL offset_overflow; /* Set if too many extractions */
- BOOL notbol; /* NOTBOL flag */
- BOOL noteol; /* NOTEOL flag */
- BOOL utf8; /* UTF8 flag */
- BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
- BOOL endonly; /* Dollar not before final \n */
- BOOL notempty; /* Empty string match not wanted */
- BOOL partial; /* PARTIAL flag */
- BOOL hitend; /* Hit the end of the subject at some point */
- BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
- const uschar *start_code; /* For use when recursing */
- USPTR start_subject; /* Start of the subject string */
- USPTR end_subject; /* End of the subject string */
- USPTR start_match_ptr; /* Start of matched string */
- USPTR end_match_ptr; /* Subject position at end match */
- int end_offset_top; /* Highwater mark at end of match */
- int capture_last; /* Most recent capture number */
- int start_offset; /* The start offset value */
- eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
- int eptrn; /* Next free eptrblock */
- recursion_info *recursive; /* Linked list of recursion data */
- void *callout_data; /* To pass back to callouts */
-} match_data;
-
-/* A similar structure is used for the same purpose by the DFA matching
-functions. */
-
-typedef struct dfa_match_data {
- const uschar *start_code; /* Start of the compiled pattern */
- const uschar *start_subject; /* Start of the subject string */
- const uschar *end_subject; /* End of subject string */
- const uschar *tables; /* Character tables */
- int moptions; /* Match options */
- int poptions; /* Pattern options */
- int nltype; /* Newline type */
- int nllen; /* Newline string length */
- uschar nl[4]; /* Newline string when fixed */
- void *callout_data; /* To pass back to callouts */
-} dfa_match_data;
-
-/* Bit definitions for entries in the pcre_ctypes table. */
-
-#define ctype_space 0x01
-#define ctype_letter 0x02
-#define ctype_digit 0x04
-#define ctype_xdigit 0x08
-#define ctype_word 0x10 /* alphanumeric or '_' */
-#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
-
-/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
-of bits for a class map. Some classes are built by combining these tables. */
-
-#define cbit_space 0 /* [:space:] or \s */
-#define cbit_xdigit 32 /* [:xdigit:] */
-#define cbit_digit 64 /* [:digit:] or \d */
-#define cbit_upper 96 /* [:upper:] */
-#define cbit_lower 128 /* [:lower:] */
-#define cbit_word 160 /* [:word:] or \w */
-#define cbit_graph 192 /* [:graph:] */
-#define cbit_print 224 /* [:print:] */
-#define cbit_punct 256 /* [:punct:] */
-#define cbit_cntrl 288 /* [:cntrl:] */
-#define cbit_length 320 /* Length of the cbits table */
-
-/* Offsets of the various tables from the base tables pointer, and
-total length. */
-
-#define lcc_offset 0
-#define fcc_offset 256
-#define cbits_offset 512
-#define ctypes_offset (cbits_offset + cbit_length)
-#define tables_length (ctypes_offset + 256)
-
-/* Layout of the UCP type table that translates property names into types and
-codes. Each entry used to point directly to a name, but to reduce the number of
-relocations in shared libraries, it now has an offset into a single string
-instead. */
-
-typedef struct {
- pcre_uint16 name_offset;
- pcre_uint16 type;
- pcre_uint16 value;
-} ucp_type_table;
-
-
-/* Internal shared data tables. These are tables that are used by more than one
-of the exported public functions. They have to be "external" in the C sense,
-but are not part of the PCRE public API. The data for these tables is in the
-pcre_tables.c module. */
-
-extern const int _pcre_utf8_table1[];
-extern const int _pcre_utf8_table2[];
-extern const int _pcre_utf8_table3[];
-extern const uschar _pcre_utf8_table4[];
-
-extern const int _pcre_utf8_table1_size;
-
-extern const char _pcre_utt_names[];
-extern const ucp_type_table _pcre_utt[];
-extern const int _pcre_utt_size;
-
-extern const uschar _pcre_default_tables[];
-
-extern const uschar _pcre_OP_lengths[];
-
-
-/* Internal shared functions. These are functions that are used by more than
-one of the exported public functions. They have to be "external" in the C
-sense, but are not part of the PCRE public API. */
-
-extern BOOL _pcre_is_newline(const uschar *, int, const uschar *,
- int *, BOOL);
-extern int _pcre_ord2utf8(int, uschar *);
-extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
- const pcre_study_data *, pcre_study_data *);
-extern int _pcre_valid_utf8(const uschar *, int);
-extern BOOL _pcre_was_newline(const uschar *, int, const uschar *,
- int *, BOOL);
-extern BOOL _pcre_xclass(int, const uschar *);
-
-
-/* Unicode character database (UCD) */
-
-typedef struct {
- uschar script;
- uschar chartype;
- pcre_int32 other_case;
-} ucd_record;
-
-extern const ucd_record _pcre_ucd_records[];
-extern const uschar _pcre_ucd_stage1[];
-extern const pcre_uint16 _pcre_ucd_stage2[];
-extern const int _pcre_ucp_gentype[];
-
-
-/* UCD access macros */
-
-#define UCD_BLOCK_SIZE 128
-#define GET_UCD(ch) (_pcre_ucd_records + \
- _pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \
- UCD_BLOCK_SIZE + ch % UCD_BLOCK_SIZE])
-
-#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
-#define UCD_SCRIPT(ch) GET_UCD(ch)->script
-#define UCD_CATEGORY(ch) _pcre_ucp_gentype[UCD_CHARTYPE(ch)]
-#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)
-
-#endif
-
-/* End of pcre_internal.h */
diff --git a/libs/pcre/pcre_maketables.c b/libs/pcre/pcre_maketables.c
deleted file mode 100644
index 219973e378..0000000000
--- a/libs/pcre/pcre_maketables.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains the external function pcre_maketables(), which builds
-character tables for PCRE in the current locale. The file is compiled on its
-own as part of the PCRE library. However, it is also included in the
-compilation of dftables.c, in which case the macro DFTABLES is defined. */
-
-
-#ifndef DFTABLES
-# ifdef HAVE_CONFIG_H
-# include "config.h"
-# endif
-# include "pcre_internal.h"
-#endif
-
-
-/*************************************************
-* Create PCRE character tables *
-*************************************************/
-
-/* This function builds a set of character tables for use by PCRE and returns
-a pointer to them. They are build using the ctype functions, and consequently
-their contents will depend upon the current locale setting. When compiled as
-part of the library, the store is obtained via pcre_malloc(), but when compiled
-inside dftables, use malloc().
-
-Arguments: none
-Returns: pointer to the contiguous block of data
-*/
-
-const unsigned char *
-pcre_maketables(void)
-{
-unsigned char *yield, *p;
-int i;
-
-#ifndef DFTABLES
-yield = (unsigned char*)(pcre_malloc)(tables_length);
-#else
-yield = (unsigned char*)malloc(tables_length);
-#endif
-
-if (yield == NULL) return NULL;
-p = yield;
-
-/* First comes the lower casing table */
-
-for (i = 0; i < 256; i++) *p++ = tolower(i);
-
-/* Next the case-flipping table */
-
-for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
-
-/* Then the character class tables. Don't try to be clever and save effort on
-exclusive ones - in some locales things may be different. Note that the table
-for "space" includes everything "isspace" gives, including VT in the default
-locale. This makes it work for the POSIX class [:space:]. Note also that it is
-possible for a character to be alnum or alpha without being lower or upper,
-such as "male and female ordinals" (\xAA and \xBA) in the fr_FR locale (at
-least under Debian Linux's locales as of 12/2005). So we must test for alnum
-specially. */
-
-memset(p, 0, cbit_length);
-for (i = 0; i < 256; i++)
- {
- if (isdigit(i)) p[cbit_digit + i/8] |= 1 << (i&7);
- if (isupper(i)) p[cbit_upper + i/8] |= 1 << (i&7);
- if (islower(i)) p[cbit_lower + i/8] |= 1 << (i&7);
- if (isalnum(i)) p[cbit_word + i/8] |= 1 << (i&7);
- if (i == '_') p[cbit_word + i/8] |= 1 << (i&7);
- if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7);
- if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7);
- if (isgraph(i)) p[cbit_graph + i/8] |= 1 << (i&7);
- if (isprint(i)) p[cbit_print + i/8] |= 1 << (i&7);
- if (ispunct(i)) p[cbit_punct + i/8] |= 1 << (i&7);
- if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1 << (i&7);
- }
-p += cbit_length;
-
-/* Finally, the character type table. In this, we exclude VT from the white
-space chars, because Perl doesn't recognize it as such for \s and for comments
-within regexes. */
-
-for (i = 0; i < 256; i++)
- {
- int x = 0;
- if (i != 0x0b && isspace(i)) x += ctype_space;
- if (isalpha(i)) x += ctype_letter;
- if (isdigit(i)) x += ctype_digit;
- if (isxdigit(i)) x += ctype_xdigit;
- if (isalnum(i) || i == '_') x += ctype_word;
-
- /* Note: strchr includes the terminating zero in the characters it considers.
- In this instance, that is ok because we want binary zero to be flagged as a
- meta-character, which in this sense is any character that terminates a run
- of data characters. */
-
- if (strchr("\\*+?{^.$|()[", i) != 0) x += ctype_meta;
- *p++ = x;
- }
-
-return yield;
-}
-
-/* End of pcre_maketables.c */
diff --git a/libs/pcre/pcre_newline.c b/libs/pcre/pcre_newline.c
deleted file mode 100644
index 38cf7f72f8..0000000000
--- a/libs/pcre/pcre_newline.c
+++ /dev/null
@@ -1,162 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains internal functions for testing newlines when more than
-one kind of newline is to be recognized. When a newline is found, its length is
-returned. In principle, we could implement several newline "types", each
-referring to a different set of newline characters. At present, PCRE supports
-only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF,
-and NLTYPE_ANY. The full list of Unicode newline characters is taken from
-http://unicode.org/unicode/reports/tr18/. */
-
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-
-
-/*************************************************
-* Check for newline at given position *
-*************************************************/
-
-/* It is guaranteed that the initial value of ptr is less than the end of the
-string that is being processed.
-
-Arguments:
- ptr pointer to possible newline
- type the newline type
- endptr pointer to the end of the string
- lenptr where to return the length
- utf8 TRUE if in utf8 mode
-
-Returns: TRUE or FALSE
-*/
-
-BOOL
-_pcre_is_newline(USPTR ptr, int type, USPTR endptr, int *lenptr, BOOL utf8)
-{
-int c;
-if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
-
-if (type == NLTYPE_ANYCRLF) switch(c)
- {
- case 0x000a: *lenptr = 1; return TRUE; /* LF */
- case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
- return TRUE; /* CR */
- default: return FALSE;
- }
-
-/* NLTYPE_ANY */
-
-else switch(c)
- {
- case 0x000a: /* LF */
- case 0x000b: /* VT */
- case 0x000c: *lenptr = 1; return TRUE; /* FF */
- case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
- return TRUE; /* CR */
- case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
- case 0x2028: /* LS */
- case 0x2029: *lenptr = 3; return TRUE; /* PS */
- default: return FALSE;
- }
-}
-
-
-
-/*************************************************
-* Check for newline at previous position *
-*************************************************/
-
-/* It is guaranteed that the initial value of ptr is greater than the start of
-the string that is being processed.
-
-Arguments:
- ptr pointer to possible newline
- type the newline type
- startptr pointer to the start of the string
- lenptr where to return the length
- utf8 TRUE if in utf8 mode
-
-Returns: TRUE or FALSE
-*/
-
-BOOL
-_pcre_was_newline(USPTR ptr, int type, USPTR startptr, int *lenptr, BOOL utf8)
-{
-int c;
-ptr--;
-#ifdef SUPPORT_UTF8
-if (utf8)
- {
- BACKCHAR(ptr);
- GETCHAR(c, ptr);
- }
-else c = *ptr;
-#else /* no UTF-8 support */
-c = *ptr;
-#endif /* SUPPORT_UTF8 */
-
-if (type == NLTYPE_ANYCRLF) switch(c)
- {
- case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1;
- return TRUE; /* LF */
- case 0x000d: *lenptr = 1; return TRUE; /* CR */
- default: return FALSE;
- }
-
-else switch(c)
- {
- case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1;
- return TRUE; /* LF */
- case 0x000b: /* VT */
- case 0x000c: /* FF */
- case 0x000d: *lenptr = 1; return TRUE; /* CR */
- case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
- case 0x2028: /* LS */
- case 0x2029: *lenptr = 3; return TRUE; /* PS */
- default: return FALSE;
- }
-}
-
-/* End of pcre_newline.c */
diff --git a/libs/pcre/pcre_ord2utf8.c b/libs/pcre/pcre_ord2utf8.c
deleted file mode 100644
index 6f4eb9ebe9..0000000000
--- a/libs/pcre/pcre_ord2utf8.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This file contains a private PCRE function that converts an ordinal
-character value into a UTF8 string. */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-
-/*************************************************
-* Convert character value to UTF-8 *
-*************************************************/
-
-/* This function takes an integer value in the range 0 - 0x7fffffff
-and encodes it as a UTF-8 character in 0 to 6 bytes.
-
-Arguments:
- cvalue the character value
- buffer pointer to buffer for result - at least 6 bytes long
-
-Returns: number of characters placed in the buffer
-*/
-
-int
-_pcre_ord2utf8(int cvalue, uschar *buffer)
-{
-#ifdef SUPPORT_UTF8
-register int i, j;
-for (i = 0; i < _pcre_utf8_table1_size; i++)
- if (cvalue <= _pcre_utf8_table1[i]) break;
-buffer += i;
-for (j = i; j > 0; j--)
- {
- *buffer-- = 0x80 | (cvalue & 0x3f);
- cvalue >>= 6;
- }
-*buffer = _pcre_utf8_table2[i] | cvalue;
-return i + 1;
-#else
-(void)(cvalue); /* Keep compiler happy; this function won't ever be */
-(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
-return 0;
-#endif
-}
-
-/* End of pcre_ord2utf8.c */
diff --git a/libs/pcre/pcre_printint.src b/libs/pcre/pcre_printint.src
deleted file mode 100644
index 5f45fc1985..0000000000
--- a/libs/pcre/pcre_printint.src
+++ /dev/null
@@ -1,516 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2009 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains a PCRE private debugging function for printing out the
-internal form of a compiled regular expression, along with some supporting
-local functions. This source file is used in two places:
-
-(1) It is #included by pcre_compile.c when it is compiled in debugging mode
-(DEBUG defined in pcre_internal.h). It is not included in production compiles.
-
-(2) It is always #included by pcretest.c, which can be asked to print out a
-compiled regex for debugging purposes. */
-
-
-/* Macro that decides whether a character should be output as a literal or in
-hexadecimal. We don't use isprint() because that can vary from system to system
-(even without the use of locales) and we want the output always to be the same,
-for testing purposes. This macro is used in pcretest as well as in this file. */
-
-#ifdef EBCDIC
-#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
-#else
-#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
-#endif
-
-/* The table of operator names. */
-
-static const char *OP_names[] = { OP_NAME_LIST };
-
-
-
-/*************************************************
-* Print single- or multi-byte character *
-*************************************************/
-
-static int
-print_char(FILE *f, uschar *ptr, BOOL utf8)
-{
-int c = *ptr;
-
-#ifndef SUPPORT_UTF8
-utf8 = utf8; /* Avoid compiler warning */
-if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
-return 0;
-
-#else
-if (!utf8 || (c & 0xc0) != 0xc0)
- {
- if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
- return 0;
- }
-else
- {
- int i;
- int a = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
- int s = 6*a;
- c = (c & _pcre_utf8_table3[a]) << s;
- for (i = 1; i <= a; i++)
- {
- /* This is a check for malformed UTF-8; it should only occur if the sanity
- check has been turned off. Rather than swallow random bytes, just stop if
- we hit a bad one. Print it with \X instead of \x as an indication. */
-
- if ((ptr[i] & 0xc0) != 0x80)
- {
- fprintf(f, "\\X{%x}", c);
- return i - 1;
- }
-
- /* The byte is OK */
-
- s -= 6;
- c |= (ptr[i] & 0x3f) << s;
- }
- if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
- return a;
- }
-#endif
-}
-
-
-
-/*************************************************
-* Find Unicode property name *
-*************************************************/
-
-static const char *
-get_ucpname(int ptype, int pvalue)
-{
-#ifdef SUPPORT_UCP
-int i;
-for (i = _pcre_utt_size - 1; i >= 0; i--)
- {
- if (ptype == _pcre_utt[i].type && pvalue == _pcre_utt[i].value) break;
- }
-return (i >= 0)? _pcre_utt_names + _pcre_utt[i].name_offset : "??";
-#else
-/* It gets harder and harder to shut off unwanted compiler warnings. */
-ptype = ptype * pvalue;
-return (ptype == pvalue)? "??" : "??";
-#endif
-}
-
-
-
-/*************************************************
-* Print compiled regex *
-*************************************************/
-
-/* Make this function work for a regex with integers either byte order.
-However, we assume that what we are passed is a compiled regex. The
-print_lengths flag controls whether offsets and lengths of items are printed.
-They can be turned off from pcretest so that automatic tests on bytecode can be
-written that do not depend on the value of LINK_SIZE. */
-
-static void
-pcre_printint(pcre *external_re, FILE *f, BOOL print_lengths)
-{
-real_pcre *re = (real_pcre *)external_re;
-uschar *codestart, *code;
-BOOL utf8;
-
-unsigned int options = re->options;
-int offset = re->name_table_offset;
-int count = re->name_count;
-int size = re->name_entry_size;
-
-if (re->magic_number != MAGIC_NUMBER)
- {
- offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff);
- count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff);
- size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff);
- options = ((options << 24) & 0xff000000) |
- ((options << 8) & 0x00ff0000) |
- ((options >> 8) & 0x0000ff00) |
- ((options >> 24) & 0x000000ff);
- }
-
-code = codestart = (uschar *)re + offset + count * size;
-utf8 = (options & PCRE_UTF8) != 0;
-
-for(;;)
- {
- uschar *ccode;
- int c;
- int extra = 0;
-
- if (print_lengths)
- fprintf(f, "%3d ", (int)(code - codestart));
- else
- fprintf(f, " ");
-
- switch(*code)
- {
- case OP_END:
- fprintf(f, " %s\n", OP_names[*code]);
- fprintf(f, "------------------------------------------------------------------\n");
- return;
-
- case OP_OPT:
- fprintf(f, " %.2x %s", code[1], OP_names[*code]);
- break;
-
- case OP_CHAR:
- fprintf(f, " ");
- do
- {
- code++;
- code += 1 + print_char(f, code, utf8);
- }
- while (*code == OP_CHAR);
- fprintf(f, "\n");
- continue;
-
- case OP_CHARNC:
- fprintf(f, " NC ");
- do
- {
- code++;
- code += 1 + print_char(f, code, utf8);
- }
- while (*code == OP_CHARNC);
- fprintf(f, "\n");
- continue;
-
- case OP_CBRA:
- case OP_SCBRA:
- if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
- else fprintf(f, " ");
- fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
- break;
-
- case OP_BRA:
- case OP_SBRA:
- case OP_KETRMAX:
- case OP_KETRMIN:
- case OP_ALT:
- case OP_KET:
- case OP_ASSERT:
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- case OP_ONCE:
- case OP_COND:
- case OP_SCOND:
- case OP_REVERSE:
- if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
- else fprintf(f, " ");
- fprintf(f, "%s", OP_names[*code]);
- break;
-
- case OP_CREF:
- fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
- break;
-
- case OP_RREF:
- c = GET2(code, 1);
- if (c == RREF_ANY)
- fprintf(f, " Cond recurse any");
- else
- fprintf(f, " Cond recurse %d", c);
- break;
-
- case OP_DEF:
- fprintf(f, " Cond def");
- break;
-
- case OP_STAR:
- case OP_MINSTAR:
- case OP_POSSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_POSPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_POSQUERY:
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPOSSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEPOSPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- case OP_TYPEPOSQUERY:
- fprintf(f, " ");
- if (*code >= OP_TYPESTAR)
- {
- fprintf(f, "%s", OP_names[code[1]]);
- if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
- {
- fprintf(f, " %s ", get_ucpname(code[2], code[3]));
- extra = 2;
- }
- }
- else extra = print_char(f, code+1, utf8);
- fprintf(f, "%s", OP_names[*code]);
- break;
-
- case OP_EXACT:
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_POSUPTO:
- fprintf(f, " ");
- extra = print_char(f, code+3, utf8);
- fprintf(f, "{");
- if (*code != OP_EXACT) fprintf(f, "0,");
- fprintf(f, "%d}", GET2(code,1));
- if (*code == OP_MINUPTO) fprintf(f, "?");
- else if (*code == OP_POSUPTO) fprintf(f, "+");
- break;
-
- case OP_TYPEEXACT:
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- case OP_TYPEPOSUPTO:
- fprintf(f, " %s", OP_names[code[3]]);
- if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
- {
- fprintf(f, " %s ", get_ucpname(code[4], code[5]));
- extra = 2;
- }
- fprintf(f, "{");
- if (*code != OP_TYPEEXACT) fprintf(f, "0,");
- fprintf(f, "%d}", GET2(code,1));
- if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
- else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
- break;
-
- case OP_NOT:
- c = code[1];
- if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
- else fprintf(f, " [^\\x%02x]", c);
- break;
-
- case OP_NOTSTAR:
- case OP_NOTMINSTAR:
- case OP_NOTPOSSTAR:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTPOSPLUS:
- case OP_NOTQUERY:
- case OP_NOTMINQUERY:
- case OP_NOTPOSQUERY:
- c = code[1];
- if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
- else fprintf(f, " [^\\x%02x]", c);
- fprintf(f, "%s", OP_names[*code]);
- break;
-
- case OP_NOTEXACT:
- case OP_NOTUPTO:
- case OP_NOTMINUPTO:
- case OP_NOTPOSUPTO:
- c = code[3];
- if (PRINTABLE(c)) fprintf(f, " [^%c]{", c);
- else fprintf(f, " [^\\x%02x]{", c);
- if (*code != OP_NOTEXACT) fprintf(f, "0,");
- fprintf(f, "%d}", GET2(code,1));
- if (*code == OP_NOTMINUPTO) fprintf(f, "?");
- else if (*code == OP_NOTPOSUPTO) fprintf(f, "+");
- break;
-
- case OP_RECURSE:
- if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
- else fprintf(f, " ");
- fprintf(f, "%s", OP_names[*code]);
- break;
-
- case OP_REF:
- fprintf(f, " \\%d", GET2(code,1));
- ccode = code + _pcre_OP_lengths[*code];
- goto CLASS_REF_REPEAT;
-
- case OP_CALLOUT:
- fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
- GET(code, 2 + LINK_SIZE));
- break;
-
- case OP_PROP:
- case OP_NOTPROP:
- fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1], code[2]));
- break;
-
- /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
- having this code always here, and it makes it less messy without all those
- #ifdefs. */
-
- case OP_CLASS:
- case OP_NCLASS:
- case OP_XCLASS:
- {
- int i, min, max;
- BOOL printmap;
-
- fprintf(f, " [");
-
- if (*code == OP_XCLASS)
- {
- extra = GET(code, 1);
- ccode = code + LINK_SIZE + 1;
- printmap = (*ccode & XCL_MAP) != 0;
- if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^");
- }
- else
- {
- printmap = TRUE;
- ccode = code + 1;
- }
-
- /* Print a bit map */
-
- if (printmap)
- {
- for (i = 0; i < 256; i++)
- {
- if ((ccode[i/8] & (1 << (i&7))) != 0)
- {
- int j;
- for (j = i+1; j < 256; j++)
- if ((ccode[j/8] & (1 << (j&7))) == 0) break;
- if (i == '-' || i == ']') fprintf(f, "\\");
- if (PRINTABLE(i)) fprintf(f, "%c", i);
- else fprintf(f, "\\x%02x", i);
- if (--j > i)
- {
- if (j != i + 1) fprintf(f, "-");
- if (j == '-' || j == ']') fprintf(f, "\\");
- if (PRINTABLE(j)) fprintf(f, "%c", j);
- else fprintf(f, "\\x%02x", j);
- }
- i = j;
- }
- }
- ccode += 32;
- }
-
- /* For an XCLASS there is always some additional data */
-
- if (*code == OP_XCLASS)
- {
- int ch;
- while ((ch = *ccode++) != XCL_END)
- {
- if (ch == XCL_PROP)
- {
- int ptype = *ccode++;
- int pvalue = *ccode++;
- fprintf(f, "\\p{%s}", get_ucpname(ptype, pvalue));
- }
- else if (ch == XCL_NOTPROP)
- {
- int ptype = *ccode++;
- int pvalue = *ccode++;
- fprintf(f, "\\P{%s}", get_ucpname(ptype, pvalue));
- }
- else
- {
- ccode += 1 + print_char(f, ccode, TRUE);
- if (ch == XCL_RANGE)
- {
- fprintf(f, "-");
- ccode += 1 + print_char(f, ccode, TRUE);
- }
- }
- }
- }
-
- /* Indicate a non-UTF8 class which was created by negation */
-
- fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
-
- /* Handle repeats after a class or a back reference */
-
- CLASS_REF_REPEAT:
- switch(*ccode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- fprintf(f, "%s", OP_names[*ccode]);
- extra += _pcre_OP_lengths[*ccode];
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- min = GET2(ccode,1);
- max = GET2(ccode,3);
- if (max == 0) fprintf(f, "{%d,}", min);
- else fprintf(f, "{%d,%d}", min, max);
- if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
- extra += _pcre_OP_lengths[*ccode];
- break;
-
- /* Do nothing if it's not a repeat; this code stops picky compilers
- warning about the lack of a default code path. */
-
- default:
- break;
- }
- }
- break;
-
- /* Anything else is just an item with no data*/
-
- default:
- fprintf(f, " %s", OP_names[*code]);
- break;
- }
-
- code += _pcre_OP_lengths[*code] + extra;
- fprintf(f, "\n");
- }
-}
-
-/* End of pcre_printint.src */
diff --git a/libs/pcre/pcre_refcount.c b/libs/pcre/pcre_refcount.c
deleted file mode 100644
index 92e4b8505c..0000000000
--- a/libs/pcre/pcre_refcount.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains the external function pcre_refcount(), which is an
-auxiliary function that can be used to maintain a reference count in a compiled
-pattern data block. This might be helpful in applications where the block is
-shared by different users. */
-
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "pcre_internal.h"
-
-
-/*************************************************
-* Maintain reference count *
-*************************************************/
-
-/* The reference count is a 16-bit field, initialized to zero. It is not
-possible to transfer a non-zero count from one host to a different host that
-has a different byte order - though I can't see why anyone in their right mind
-would ever want to do that!
-
-Arguments:
- argument_re points to compiled code
- adjust value to add to the count
-
-Returns: the (possibly updated) count value (a non-negative number), or
- a negative error number
-*/
-
-PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_refcount(pcre *argument_re, int adjust)
-{
-real_pcre *re = (real_pcre *)argument_re;
-if (re == NULL) return PCRE_ERROR_NULL;
-re->ref_count = (-adjust > re->ref_count)? 0 :
- (adjust + re->ref_count > 65535)? 65535 :
- re->ref_count + adjust;
-return re->ref_count;
-}
-
-/* End of pcre_refcount.c */
diff --git a/libs/pcre/pcre_scanner.cc b/libs/pcre/pcre_scanner.cc
deleted file mode 100644
index a817a684e2..0000000000
--- a/libs/pcre/pcre_scanner.cc
+++ /dev/null
@@ -1,199 +0,0 @@
-// Copyright (c) 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: Sanjay Ghemawat
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include
-#include
-
-#include "pcrecpp_internal.h"
-#include "pcre_scanner.h"
-
-using std::vector;
-
-namespace pcrecpp {
-
-Scanner::Scanner()
- : data_(),
- input_(data_),
- skip_(NULL),
- should_skip_(false),
- skip_repeat_(false),
- save_comments_(false),
- comments_(NULL),
- comments_offset_(0) {
-}
-
-Scanner::Scanner(const string& in)
- : data_(in),
- input_(data_),
- skip_(NULL),
- should_skip_(false),
- skip_repeat_(false),
- save_comments_(false),
- comments_(NULL),
- comments_offset_(0) {
-}
-
-Scanner::~Scanner() {
- delete skip_;
- delete comments_;
-}
-
-void Scanner::SetSkipExpression(const char* re) {
- delete skip_;
- if (re != NULL) {
- skip_ = new RE(re);
- should_skip_ = true;
- skip_repeat_ = true;
- ConsumeSkip();
- } else {
- skip_ = NULL;
- should_skip_ = false;
- skip_repeat_ = false;
- }
-}
-
-void Scanner::Skip(const char* re) {
- delete skip_;
- if (re != NULL) {
- skip_ = new RE(re);
- should_skip_ = true;
- skip_repeat_ = false;
- ConsumeSkip();
- } else {
- skip_ = NULL;
- should_skip_ = false;
- skip_repeat_ = false;
- }
-}
-
-void Scanner::DisableSkip() {
- assert(skip_ != NULL);
- should_skip_ = false;
-}
-
-void Scanner::EnableSkip() {
- assert(skip_ != NULL);
- should_skip_ = true;
- ConsumeSkip();
-}
-
-int Scanner::LineNumber() const {
- // TODO: Make it more efficient by keeping track of the last point
- // where we computed line numbers and counting newlines since then.
- // We could use std:count, but not all systems have it. :-(
- int count = 1;
- for (const char* p = data_.data(); p < input_.data(); ++p)
- if (*p == '\n')
- ++count;
- return count;
-}
-
-int Scanner::Offset() const {
- return input_.data() - data_.c_str();
-}
-
-bool Scanner::LookingAt(const RE& re) const {
- int consumed;
- return re.DoMatch(input_, RE::ANCHOR_START, &consumed, 0, 0);
-}
-
-
-bool Scanner::Consume(const RE& re,
- const Arg& arg0,
- const Arg& arg1,
- const Arg& arg2) {
- const bool result = re.Consume(&input_, arg0, arg1, arg2);
- if (result && should_skip_) ConsumeSkip();
- return result;
-}
-
-// helper function to consume *skip_ and honour save_comments_
-void Scanner::ConsumeSkip() {
- const char* start_data = input_.data();
- while (skip_->Consume(&input_)) {
- if (!skip_repeat_) {
- // Only one skip allowed.
- break;
- }
- }
- if (save_comments_) {
- if (comments_ == NULL) {
- comments_ = new vector;
- }
- // already pointing one past end, so no need to +1
- int length = input_.data() - start_data;
- if (length > 0) {
- comments_->push_back(StringPiece(start_data, length));
- }
- }
-}
-
-
-void Scanner::GetComments(int start, int end, vector *ranges) {
- // short circuit out if we've not yet initialized comments_
- // (e.g., when save_comments is false)
- if (!comments_) {
- return;
- }
- // TODO: if we guarantee that comments_ will contain StringPieces
- // that are ordered by their start, then we can do a binary search
- // for the first StringPiece at or past start and then scan for the
- // ones contained in the range, quit early (use equal_range or
- // lower_bound)
- for (vector::const_iterator it = comments_->begin();
- it != comments_->end(); ++it) {
- if ((it->data() >= data_.c_str() + start &&
- it->data() + it->size() <= data_.c_str() + end)) {
- ranges->push_back(*it);
- }
- }
-}
-
-
-void Scanner::GetNextComments(vector *ranges) {
- // short circuit out if we've not yet initialized comments_
- // (e.g., when save_comments is false)
- if (!comments_) {
- return;
- }
- for (vector::const_iterator it =
- comments_->begin() + comments_offset_;
- it != comments_->end(); ++it) {
- ranges->push_back(*it);
- ++comments_offset_;
- }
-}
-
-} // namespace pcrecpp
diff --git a/libs/pcre/pcre_scanner.h b/libs/pcre/pcre_scanner.h
deleted file mode 100644
index 5617e4515c..0000000000
--- a/libs/pcre/pcre_scanner.h
+++ /dev/null
@@ -1,172 +0,0 @@
-// Copyright (c) 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: Sanjay Ghemawat
-//
-// Regular-expression based scanner for parsing an input stream.
-//
-// Example 1: parse a sequence of "var = number" entries from input:
-//
-// Scanner scanner(input);
-// string var;
-// int number;
-// scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
-// while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
-// ...;
-// }
-
-#ifndef _PCRE_SCANNER_H
-#define _PCRE_SCANNER_H
-
-#include
-#include
-#include
-
-#include
-#include
-
-namespace pcrecpp {
-
-class PCRECPP_EXP_DEFN Scanner {
- public:
- Scanner();
- explicit Scanner(const std::string& input);
- ~Scanner();
-
- // Return current line number. The returned line-number is
- // one-based. I.e. it returns 1 + the number of consumed newlines.
- //
- // Note: this method may be slow. It may take time proportional to
- // the size of the input.
- int LineNumber() const;
-
- // Return the byte-offset that the scanner is looking in the
- // input data;
- int Offset() const;
-
- // Return true iff the start of the remaining input matches "re"
- bool LookingAt(const RE& re) const;
-
- // Return true iff all of the following are true
- // a. the start of the remaining input matches "re",
- // b. if any arguments are supplied, matched sub-patterns can be
- // parsed and stored into the arguments.
- // If it returns true, it skips over the matched input and any
- // following input that matches the "skip" regular expression.
- bool Consume(const RE& re,
- const Arg& arg0 = RE::no_arg,
- const Arg& arg1 = RE::no_arg,
- const Arg& arg2 = RE::no_arg
- // TODO: Allow more arguments?
- );
-
- // Set the "skip" regular expression. If after consuming some data,
- // a prefix of the input matches this RE, it is automatically
- // skipped. For example, a programming language scanner would use
- // a skip RE that matches white space and comments.
- //
- // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
- //
- // Skipping repeats as long as it succeeds. We used to let people do
- // this by writing "(...)*" in the regular expression, but that added
- // up to lots of recursive calls within the pcre library, so now we
- // control repetition explicitly via the function call API.
- //
- // You can pass NULL for "re" if you do not want any data to be skipped.
- void Skip(const char* re); // DEPRECATED; does *not* repeat
- void SetSkipExpression(const char* re);
-
- // Temporarily pause "skip"ing. This
- // Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
- // is similar to
- // Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo");
- // but avoids creating/deleting new RE objects.
- void DisableSkip();
-
- // Reenable previously paused skipping. Any prefix of the input
- // that matches the skip pattern is immediately dropped.
- void EnableSkip();
-
- /***** Special wrappers around SetSkip() for some common idioms *****/
-
- // Arranges to skip whitespace, C comments, C++ comments.
- // The overall RE is a disjunction of the following REs:
- // \\s whitespace
- // //.*\n C++ comment
- // /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x)
- // We get repetition via the semantics of SetSkipExpression, not by using *
- void SkipCXXComments() {
- SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
- }
-
- void set_save_comments(bool comments) {
- save_comments_ = comments;
- }
-
- bool save_comments() {
- return save_comments_;
- }
-
- // Append to vector ranges the comments found in the
- // byte range [start,end] (inclusive) of the input data.
- // Only comments that were extracted entirely within that
- // range are returned: no range splitting of atomically-extracted
- // comments is performed.
- void GetComments(int start, int end, std::vector *ranges);
-
- // Append to vector ranges the comments added
- // since the last time this was called. This
- // functionality is provided for efficiency when
- // interleaving scanning with parsing.
- void GetNextComments(std::vector *ranges);
-
- private:
- std::string data_; // All the input data
- StringPiece input_; // Unprocessed input
- RE* skip_; // If non-NULL, RE for skipping input
- bool should_skip_; // If true, use skip_
- bool skip_repeat_; // If true, repeat skip_ as long as it works
- bool save_comments_; // If true, aggregate the skip expression
-
- // the skipped comments
- // TODO: later consider requiring that the StringPieces be added
- // in order by their start position
- std::vector *comments_;
-
- // the offset into comments_ that has been returned by GetNextComments
- int comments_offset_;
-
- // helper function to consume *skip_ and honour
- // save_comments_
- void ConsumeSkip();
-};
-
-} // namespace pcrecpp
-
-#endif /* _PCRE_SCANNER_H */
diff --git a/libs/pcre/pcre_scanner_unittest.cc b/libs/pcre/pcre_scanner_unittest.cc
deleted file mode 100644
index 284c8ea99e..0000000000
--- a/libs/pcre/pcre_scanner_unittest.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-// Copyright (c) 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: Greg J. Badros
-//
-// Unittest for scanner, especially GetNextComments and GetComments()
-// functionality.
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include
-#include
-#include
-
-#include "pcrecpp.h"
-#include "pcre_stringpiece.h"
-#include "pcre_scanner.h"
-
-#define FLAGS_unittest_stack_size 49152
-
-// Dies with a fatal error if the two values are not equal.
-#define CHECK_EQ(a, b) do { \
- if ( (a) != (b) ) { \
- fprintf(stderr, "%s:%d: Check failed because %s != %s\n", \
- __FILE__, __LINE__, #a, #b); \
- exit(1); \
- } \
-} while (0)
-
-using std::vector;
-using pcrecpp::StringPiece;
-using pcrecpp::Scanner;
-
-static void TestScanner() {
- const char input[] = "\n"
- "alpha = 1; // this sets alpha\n"
- "bravo = 2; // bravo is set here\n"
- "gamma = 33; /* and here is gamma */\n";
-
- const char *re = "(\\w+) = (\\d+);";
-
- Scanner s(input);
- string var;
- int number;
- s.SkipCXXComments();
- s.set_save_comments(true);
- vector