Merge in change to require system libpcre

libpcre is now a hard system dependency for building and running
FreeSWITCH.

FS-353
This commit is contained in:
Travis Cross 2014-03-20 14:44:09 +00:00
commit aec04d474c
208 changed files with 9 additions and 125437 deletions

View File

@ -84,7 +84,6 @@ CORE_CFLAGS = $(AM_LIBAPR_CFLAGS) $(AM_LIBAPR_CPPFLAGS)
CORE_CFLAGS += $(AM_LIBAPU_CPPFLAGS)
CORE_CFLAGS += -I$(switch_srcdir)/libs/libtpl-1.5/src
CORE_CFLAGS += -I$(switch_builddir)/libs/sqlite
CORE_CFLAGS += -I$(switch_srcdir)/libs/pcre
CORE_CFLAGS += -I$(switch_srcdir)/libs/speex/include -Ilibs/speex/include
CORE_CFLAGS += -I$(switch_srcdir)/libs/srtp/include
CORE_CFLAGS += -I$(switch_srcdir)/libs/srtp/crypto/include -Ilibs/srtp/crypto/include
@ -93,7 +92,7 @@ CORE_CFLAGS += -I$(switch_builddir)/libs/tiff-4.0.2/libtiff -I$(switch_srcdir)/l
APR_LIBS = $(AM_LIBAPU_LIBS) $(AM_LIBAPR_LIBS)
CORE_LIBS = libs/apr-util/libaprutil-1.la libs/apr/libapr-1.la
CORE_LIBS += libs/sqlite/libsqlite3.la libs/pcre/libpcre.la libs/speex/libspeex/libspeexdsp.la libs/speex/libspeex/libspeex.la
CORE_LIBS += libs/sqlite/libsqlite3.la libs/speex/libspeex/libspeexdsp.la libs/speex/libspeex/libspeex.la
if ENABLE_SRTP
CORE_CFLAGS += -DENABLE_SRTP
@ -125,9 +124,9 @@ libfreeswitch_spandsp_la_SOURCES = libs/spandsp/src/plc.c libs/spandsp/src/alloc
libfreeswitch_spandsp_la_CFLAGS = -Ilibs/spandsp/src $(CORE_CFLAGS) $(AM_CFLAGS)
CORE_LIBS+=libfreeswitch_spandsp.la
lib_LTLIBRARIES = libfreeswitch.la
libfreeswitch_la_CFLAGS = $(CORE_CFLAGS) $(AM_CFLAGS)
libfreeswitch_la_CFLAGS = $(CORE_CFLAGS) $(PCRE_CFLAGS) $(AM_CFLAGS)
libfreeswitch_la_LDFLAGS = -version-info 1:0:0 $(AM_LDFLAGS) $(PLATFORM_CORE_LDFLAGS) -shared -no-undefined
libfreeswitch_la_LIBADD = $(CORE_LIBS) $(APR_LIBS) $(PLATFORM_CORE_LIBS)
libfreeswitch_la_LIBADD = $(CORE_LIBS) $(APR_LIBS) $(PCRE_LIBS) $(PLATFORM_CORE_LIBS)
libfreeswitch_la_DEPENDENCIES = $(BUILT_SOURCES)
if CURL_BUILTIN
@ -482,11 +481,6 @@ libs/sqlite/Makefile: libs/sqlite/configure.ac
cd libs/sqlite && ./config.status
@$(TOUCH_TARGET)
libs/pcre/libpcre.la: libs/pcre libs/pcre/.update
touch $(switch_srcdir)/src/include/switch.h
@cd libs/pcre && $(MAKE)
@$(TOUCH_TARGET)
SRTP_SRC = libs/srtp/srtp/srtp.c libs/srtp/srtp/ekt.c libs/srtp/crypto/cipher/cipher.c libs/srtp/crypto/cipher/null_cipher.c \
libs/srtp/crypto/cipher/aes.c libs/srtp/crypto/cipher/aes_icm.c \
libs/srtp/crypto/cipher/aes_cbc.c \

View File

@ -1102,6 +1102,7 @@ AM_CONDITIONAL([CURL_BUILTIN],[test "${ac_cv_use_system_curl}" != "yes"])
AC_SUBST(LIBCURL_DEPS)
PKG_CHECK_MODULES([PCRE], [libpcre >= 7.9])
AC_ARG_ENABLE(core-libedit-support,
[AS_HELP_STRING([--disable-core-libedit-support], [Compile without libedit Support])], [enable_core_libedit_support="$enableval"], [enable_core_libedit_support="yes"])
@ -1492,7 +1493,6 @@ if test "$enable_core_libedit_support" = "yes" ; then
AC_CONFIG_SUBDIRS([libs/libedit])
fi
AC_CONFIG_SUBDIRS([libs/pcre])
AC_CONFIG_SUBDIRS([libs/apr])
AC_CONFIG_SUBDIRS([libs/apr-util])
AC_CONFIG_SUBDIRS([libs/ilbc])

1
debian/bootstrap.sh vendored
View File

@ -283,6 +283,7 @@ Build-Depends:
# core build
dpkg-dev (>= 1.15.8.12), gcc (>= 4:4.4.5), g++ (>= 4:4.4.5),
libc6-dev (>= 2.11.3), make (>= 3.81),
libpcre3-dev,
wget, pkg-config,
# core codecs
libogg-dev,

8
debian/copyright vendored
View File

@ -1821,14 +1821,6 @@ Copyright: 2008-2010, Eric des Courtis <eric.des.courtis@benbria.com>
Benbria.
License: MPL-1.1
Files: libs/pcre/*
Copyright: 1997-2009 University of Cambridge
2003 and onwards Google Inc.
2005-2006, Google Inc
2001 Alexander Tokarev <dwalin@dwalin.ru>
2001 Peter S. Voronov aka Chem O'Dun <petervrn@yahoo.com>
License: BSD-3-clause
Files: libs/silk/*
Copyright: 2006-2011, Skype Limited.
License: BSD-2-clause

View File

@ -1243,7 +1243,7 @@ SEARCH_INCLUDES = YES
INCLUDE_PATH =../libs/apr ../libs/apr-util ../libs/curl \
../libs/iksemel ../libs/ilbc ../libs/js \
../libs/libedit ../libs/libg722_1 ../libs/libnatpmp \
../libs/libsndfile ../libs/miniupnpc ../libs/pcre \
../libs/libsndfile ../libs/miniupnpc \
../libs/portaudio ../libs/sofia-sip ../libs/spandsp \
../libs/speex ../libs/sqlite ../libs/srtp \
../libs/tiff-4.0.2 ../libs/udns \

15
libs/.gitignore vendored
View File

@ -408,20 +408,6 @@ opal
/openzap/INSTALL
/openzap/Makefile.in
/opus-*/
/pcre/config.h
/pcre/Makefile
/pcre/Makefile.in
/pcre/pcre_chartables.c
/pcre/pcre-config
/pcre/pcrecpparg.h
/pcre/pcrecpp_unittest
/pcre/pcregrep
/pcre/pcre_scanner_unittest
/pcre/pcre_stringpiece.h
/pcre/pcre_stringpiece_unittest
/pcre/pcretest
/pcre/stamp-h1
/pcre-*/
/pocketsphinx-*/
/portaudio/bin/
/portaudio/bin-stamp
@ -1012,7 +998,6 @@ opal
!/libdingaling/missing
!/libg722_1/config/depcomp
!/libg722_1/config/missing
!/pcre/depcomp
!/portaudio/bindings/cpp/build/gnu/aclocal.m4
!/portaudio/bindings/cpp/build/gnu/config.guess
!/portaudio/bindings/cpp/build/gnu/config.sub

View File

@ -1 +0,0 @@
Mon Jun 8 19:51:53 EDT 2009

View File

@ -1,296 +0,0 @@
#! /usr/bin/perl -w
# Script to turn PCRE man pages into HTML
# Subroutine that converts one line of man-page text for HTML output: angle
# brackets become numeric entities, nroff font switches become <i>/<b> tags,
# and a couple of other escapes are translated. The line is passed as the only
# argument; the converted copy is returned.
sub do_line {
    my $text = shift;

    for ($text) {
        # Escape < and > first, before any real tags are generated below.
        s/</&#60;/g;
        s/>/&#62;/g;
        s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;     # \fI ... \fR or \fP  => italic
        s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;     # \fB ... \fR or \fP  => bold
        s"\\e"\\"g;                         # \e => a literal backslash
        s/(?<=Copyright )\(c\)/&copy;/g;    # "(c)" right after "Copyright "
    }

    return $text;
}
# Subroutine to make sure no paragraph is left open. If one is open, its
# closing tag is written to TEMP (preceded by the closing tag of a literal
# section when one is active); the paragraph, literal and "text written"
# flags are then all reset.
sub end_para {
    print TEMP "</PRE>\n" if $inpara && $inpre;
    print TEMP "</P>\n"   if $inpara;

    $inpre     = 0;
    $inpara    = 0;
    $wrotetext = 0;
}
# Subroutine to begin a fresh paragraph: whatever paragraph is currently open
# is finished first, then the opening tag is written to TEMP and the
# "in paragraph" flag is raised.
sub new_para {
    end_para();
    print TEMP "<P>\n";
    $inpara = 1;
}
# Main program

# Conversion state shared with the subroutines above.
($innf, $inpara, $inpre, $wrotetext) = (0, 0, 0, 0);
$toc = 0;    # becomes 1 when -toc is given
$ref = 1;    # number used for the next TOC/SEC anchor pair

# Consume the leading option arguments. Only -toc has any effect; any other
# argument starting with "-" is silently dropped.
while (@ARGV && $ARGV[0] =~ /^-/) {
    my $option = shift @ARGV;
    $toc = 1 if $option eq "-toc";
}
# Initial output to STDOUT: the fixed page header, with the first remaining
# command-line argument used as the page name.
print <<HEADER ;
<html>
<head>
<title>$ARGV[0] specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$ARGV[0] man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
HEADER

# With -toc the contents list goes straight to STDOUT, so open it here.
print "<ul>\n" if $toc;

# The page body is collected in a temporary file named after the process id
# and copied to STDOUT once all of the input has been read.
unless (open(TEMP, ">/tmp/$$")) {
    die "Can't open /tmp/$$ for output\n";
}
# Main conversion loop. Reads the man page source from STDIN one line at a
# time. Lines starting with a dot are treated as nroff requests or special
# comments; everything else is text. The HTML body is written to TEMP, while
# table-of-contents entries (with -toc) go directly to STDOUT.
while (<STDIN>) {
    # Handle lines beginning with a dot
    if (/^\./) {
        # Some of the PCRE man pages used to contain instances of .br. However,
        # they should have all been removed because they cause trouble in some
        # (other) automated systems that translate man pages to HTML. Complain if
        # we find .br or .in (another macro that is deprecated).
        if (/^\.br/ || /^\.in/) {
            print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
            print STDERR "*** $_\n";
            die "*** Processing abandoned\n";
        }
        # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.
        elsif (/^\.nf/) {
            $innf = 1;
        }
        elsif (/^\.fi/) {
            $innf = 0;
        }
        # Handling .sp is subtle. If it is inside a literal section, do nothing if
        # the next line is a non literal text line; similarly, if not inside a
        # literal section, do nothing if a literal follows. The point being that
        # the <pre> and </pre> that delimit literal sections will do the spacing.
        # Always skip if no previous output.
        elsif (/^\.sp/) {
            if ($wrotetext) {
                $_ = <STDIN>;
                if ($inpre) {
                    print TEMP "\n" if (/^[\s.]/);
                }
                else {
                    print TEMP "<br>\n<br>\n" if (!/^[\s.]/);
                }
                redo;    # Now process the lookahead line we just read
            }
        }
        elsif (/^\.TP/ || /^\.PP/ || /^\.P/) {
            &new_para();
        }
        elsif (/^\.SH\s*("?)(.*)\1/) {
            # Ignore the NAME section, together with the line that follows it
            if ($2 =~ /^NAME\b/) {
                <STDIN>;
                next;
            }
            &end_para();
            my($title) = &do_line($2);
            if ($toc) {
                # The title is passed as a %s argument rather than being
                # interpolated into the format string, so that a '%' in a
                # heading is output literally instead of being taken as a
                # conversion specification.
                printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
                    $ref, $ref, $title);
                printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
                    $ref, $title);
                $ref++;
            }
            else {
                print TEMP "<br><b>\n$title\n</b><br>\n";
            }
        }
        elsif (/^\.SS\s*("?)(.*)\1/) {
            &end_para();
            my($title) = &do_line($2);
            print TEMP "<br><b>\n$title\n</b><br>\n";
        }
        elsif (/^\.B\s*(.*)/) {
            &new_para() if (!$inpara);
            $_ = &do_line($1);
            s/"(.*?)"/$1/g;    # Drop the quotes around quoted arguments
            print TEMP "<b>$_</b>\n";
            $wrotetext = 1;
        }
        elsif (/^\.I\s*(.*)/) {
            &new_para() if (!$inpara);
            $_ = &do_line($1);
            s/"(.*?)"/$1/g;    # Drop the quotes around quoted arguments
            print TEMP "<i>$_</i>\n";
            $wrotetext = 1;
        }
        # A comment that starts "HREF" takes the next line as a name that
        # is turned into a hyperlink, using the text given, which might be
        # in a special font. If it ends in () or (digits) or punctuation, they
        # aren't part of the link.
        elsif (/^\.\\"\s*HREF/) {
            $_=<STDIN>;
            chomp;
            $_ = &do_line($_);
            $_ =~ s/\s+$//;
            $_ =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/;
            print TEMP "<a href=\"$1.html\">$_</a>\n";
        }
        # A comment that starts "HTML" inserts literal HTML
        elsif (/^\.\\"\s*HTML\s*(.*)/) {
            print TEMP $1;
        }
        # A comment that starts < inserts that HTML at the end of the
        # *next* input line - so as not to get a newline between them.
        elsif (/^\.\\"\s*(<.*>)/) {
            my($markup) = $1;
            $_=<STDIN>;
            chomp;
            $_ = &do_line($_);
            $_ =~ s/\s+$//;
            print TEMP "$_$markup\n";
        }
        # A comment that starts JOIN joins the next two lines together, with one
        # space between them. Then that line is processed. This is used in some
        # displays where two lines are needed for the "man" version. JOINSH works
        # the same, except that it assumes this is a shell command, so removes
        # continuation backslashes.
        elsif (/^\.\\"\s*JOIN(SH)?/) {
            my($one,$two);
            $one = <STDIN>;
            $two = <STDIN>;
            $one =~ s/\s*\\e\s*$// if (defined($1));    # $1 is set only for JOINSH
            chomp($one);
            $two =~ s/^\s+//;
            $_ = "$one $two";
            redo;    # Process the joined lines
        }
        # Ignore anything not recognized
        next;
    }

    # Line does not begin with a dot. Replace blank lines with new paragraphs
    if (/^\s*$/) {
        &end_para() if ($wrotetext);
        next;
    }

    # Convert fonts changes and output an ordinary line. Ensure that indented
    # lines are marked as literal.
    $_ = &do_line($_);
    &new_para() if (!$inpara);
    if (/^\s/) {
        if (!$inpre) {
            print TEMP "<pre>\n";
            $inpre = 1;
        }
    }
    elsif ($inpre) {
        print TEMP "</pre>\n";
        $inpre = 0;
    }

    # Add <br> to the end of a non-literal line if we are within .nf/.fi
    $_ .= "<br>\n" if (!$inpre && $innf);
    print TEMP;
    $wrotetext = 1;
}
# The TOC, if present, will have been written - terminate it
print "</ul>\n" if $toc;

# The body is complete: close the temporary file, replay it onto standard
# output after what has already been printed there, add the page trailer,
# and finally remove the temporary file.
close(TEMP);
open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n";
while (defined(my $line = <TEMP>)) {
    print $line;
}
print <<TRAILER ;
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
TRAILER
close(TEMP);
unlink("/tmp/$$");
# End

View File

@ -1,23 +0,0 @@
THE MAIN PCRE LIBRARY
---------------------
Written by: Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
Copyright (c) 1997-2009 University of Cambridge
All rights reserved
THE C++ WRAPPER LIBRARY
-----------------------
Written by: Google Inc.
Copyright (c) 2007-2008 Google Inc
All rights reserved
####

View File

@ -1,578 +0,0 @@
# CMakeLists.txt
#
#
# This file allows building PCRE with the CMake configuration and build
# tool. Download CMake in source or binary form from http://www.cmake.org/
#
# Original listfile by Christian Ehrlicher <Ch.Ehrlicher@gmx.de>
# Refined and expanded by Daniel Richard G. <skunk@iSKUNK.ORG>
# 2007-09-14 mod by Sheri so 7.4 supported configuration options can be entered
# 2007-09-19 Adjusted by PH to retain previous default settings
# 2007-12-26 (a) On UNIX, use names libpcre instead of just pcre
# (b) Ensure pcretest and pcregrep link with the local library,
# not a previously-installed one.
# (c) Add PCRE_SUPPORT_LIBREADLINE, PCRE_SUPPORT_LIBZ, and
# PCRE_SUPPORT_LIBBZ2.
# 2008-01-20 Brought up to date to include several new features by Christian
# Ehrlicher.
# 2008-01-22 Sheri added options for backward compatibility of library names
# when building with minGW:
# if "ON", NON_STANDARD_LIB_PREFIX causes shared libraries to
# be built without "lib" as prefix. (The libraries will be named
# pcre.dll, pcreposix.dll and pcrecpp.dll).
# if "ON", NON_STANDARD_LIB_SUFFIX causes shared libraries to
# be built with suffix of "-0.dll". (The libraries will be named
# libpcre-0.dll, libpcreposix-0.dll and libpcrecpp-0.dll - same names
# built by default with Configure and Make).
# 2008-01-23 PH removed the automatic build of pcredemo.
# 2008-04-22 PH modified READLINE support so it finds NCURSES when needed.
# 2008-07-03 PH updated for revised UCP property support (change of files)
# 2009-03-23 PH applied Steven Van Ingelgem's patch to change the name
# CMAKE_BINARY_DIR to PROJECT_BINARY_DIR so that it works when PCRE
# is included within another project.
# 2009-03-23 PH applied a modified version of Steven Van Ingelgem's patches to
# add options to stop the building of pcregrep and the tests, and
# to disable the final configuration report.
# 2009-04-11 PH applied Christian Ehrlicher's patch to show compiler flags that
# are set by specifying a release type.
PROJECT(PCRE C CXX)
CMAKE_MINIMUM_REQUIRED(VERSION 2.4.6)
SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake
# external packages
FIND_PACKAGE( BZip2 )
FIND_PACKAGE( ZLIB )
FIND_PACKAGE( Readline )
# Configuration checks
INCLUDE(CheckIncludeFile)
INCLUDE(CheckIncludeFileCXX)
INCLUDE(CheckFunctionExists)
INCLUDE(CheckTypeSize)
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H)
CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
CHECK_INCLUDE_FILE_CXX(type_traits.h HAVE_TYPE_TRAITS_H)
CHECK_INCLUDE_FILE_CXX(bits/type_traits.h HAVE_BITS_TYPE_TRAITS_H)
CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY)
CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE)
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR)
CHECK_FUNCTION_EXISTS(strtoll HAVE_STRTOLL)
CHECK_FUNCTION_EXISTS(strtoq HAVE_STRTOQ)
CHECK_FUNCTION_EXISTS(_strtoi64 HAVE__STRTOI64)
CHECK_TYPE_SIZE("long long" LONG_LONG)
CHECK_TYPE_SIZE("unsigned long long" UNSIGNED_LONG_LONG)
# User-configurable options
#
# (Note: CMakeSetup displays these in alphabetical order, regardless of
# the order we use here)
SET(BUILD_SHARED_LIBS OFF CACHE BOOL
"Build shared libraries instead of static ones.")
OPTION(PCRE_BUILD_PCRECPP "Build the PCRE C++ library (pcrecpp)." ON)
SET(PCRE_EBCDIC OFF CACHE BOOL
"Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems)")
SET(PCRE_LINK_SIZE "2" CACHE STRING
"Internal link size (2, 3 or 4 allowed). See LINK_SIZE in config.h.in for details.")
SET(PCRE_MATCH_LIMIT "10000000" CACHE STRING
"Default limit on internal looping. See MATCH_LIMIT in config.h.in for details.")
SET(PCRE_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING
"Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.")
SET(PCRE_NEWLINE "LF" CACHE STRING
"What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).")
SET(PCRE_NO_RECURSE OFF CACHE BOOL
"If ON, then don't use stack recursion when matching. See NO_RECURSE in config.h.in for details.")
SET(PCRE_POSIX_MALLOC_THRESHOLD "10" CACHE STRING
"Threshold for malloc() usage. See POSIX_MALLOC_THRESHOLD in config.h.in for details.")
SET(PCRE_SUPPORT_UNICODE_PROPERTIES OFF CACHE BOOL
"Enable support for Unicode properties. (If set, UTF-8 support will be enabled as well)")
SET(PCRE_SUPPORT_UTF8 OFF CACHE BOOL
"Enable support for the Unicode UTF-8 encoding.")
SET(PCRE_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
OPTION(PCRE_SHOW_REPORT "Show the final configuration report" ON)
OPTION(PCRE_BUILD_PCREGREP "Build pcregrep" ON)
OPTION(PCRE_BUILD_TESTS "Build the tests" ON)
IF (PCRE_BUILD_TESTS)
IF (NOT PCRE_BUILD_PCREGREP)
MESSAGE(STATUS "** Building tests requires pcregrep: PCRE_BUILD_PCREGREP forced ON")
SET(PCRE_BUILD_PCREGREP ON)
ENDIF(NOT PCRE_BUILD_PCREGREP)
ENDIF(PCRE_BUILD_TESTS)
IF (MINGW)
OPTION(NON_STANDARD_LIB_PREFIX
"ON=Shared libraries built in mingw will be named pcre.dll, etc., instead of libpcre.dll, etc."
OFF)
OPTION(NON_STANDARD_LIB_SUFFIX
"ON=Shared libraries built in mingw will be named libpcre-0.dll, etc., instead of libpcre.dll, etc."
OFF)
ENDIF(MINGW)
# bzip2 lib
IF(BZIP2_FOUND)
OPTION (PCRE_SUPPORT_LIBBZ2 "Enable support for linking pcregrep with libbz2." ON)
ENDIF(BZIP2_FOUND)
IF(PCRE_SUPPORT_LIBBZ2)
INCLUDE_DIRECTORIES(${BZIP2_INCLUDE_DIR})
ENDIF(PCRE_SUPPORT_LIBBZ2)
# zlib
IF(ZLIB_FOUND)
OPTION (PCRE_SUPPORT_LIBZ "Enable support for linking pcregrep with libz." ON)
ENDIF(ZLIB_FOUND)
IF(PCRE_SUPPORT_LIBZ)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
ENDIF(PCRE_SUPPORT_LIBZ)
# readline lib
IF(READLINE_FOUND)
OPTION (PCRE_SUPPORT_LIBREADLINE "Enable support for linking pcretest with libreadline." ON)
ENDIF(READLINE_FOUND)
IF(PCRE_SUPPORT_LIBREADLINE)
INCLUDE_DIRECTORIES(${READLINE_INCLUDE_DIR})
ENDIF(PCRE_SUPPORT_LIBREADLINE)
# Prepare build configuration
SET(pcre_have_type_traits 0)
SET(pcre_have_bits_type_traits 0)
IF(HAVE_TYPE_TRAITS_H)
SET(pcre_have_type_traits 1)
ENDIF(HAVE_TYPE_TRAITS_H)
IF(HAVE_BITS_TYPE_TRAITS_H)
SET(pcre_have_bits_type_traits 1)
ENDIF(HAVE_BITS_TYPE_TRAITS_H)
SET(pcre_have_long_long 0)
SET(pcre_have_ulong_long 0)
IF(HAVE_LONG_LONG)
SET(pcre_have_long_long 1)
ENDIF(HAVE_LONG_LONG)
IF(HAVE_UNSIGNED_LONG_LONG)
SET(pcre_have_ulong_long 1)
ENDIF(HAVE_UNSIGNED_LONG_LONG)
IF(NOT BUILD_SHARED_LIBS)
SET(PCRE_STATIC 1)
ENDIF(NOT BUILD_SHARED_LIBS)
IF(PCRE_SUPPORT_BSR_ANYCRLF)
SET(BSR_ANYCRLF 1)
ENDIF(PCRE_SUPPORT_BSR_ANYCRLF)
IF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
SET(SUPPORT_UTF8 1)
ENDIF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
IF(PCRE_SUPPORT_UNICODE_PROPERTIES)
SET(SUPPORT_UCP 1)
ENDIF(PCRE_SUPPORT_UNICODE_PROPERTIES)
# This next one used to contain
# SET(PCRETEST_LIBS ${READLINE_LIBRARY})
# but I was advised to add the NCURSES test as well, along with
# some modifications to cmake/FindReadline.cmake which should
# make it possible to override the default if necessary. PH
IF(PCRE_SUPPORT_LIBREADLINE)
SET(SUPPORT_LIBREADLINE 1)
SET(PCRETEST_LIBS ${READLINE_LIBRARY} ${NCURSES_LIBRARY})
ENDIF(PCRE_SUPPORT_LIBREADLINE)
IF(PCRE_SUPPORT_LIBZ)
SET(SUPPORT_LIBZ 1)
SET(PCREGREP_LIBS ${PCREGREP_LIBS} ${ZLIB_LIBRARIES})
ENDIF(PCRE_SUPPORT_LIBZ)
IF(PCRE_SUPPORT_LIBBZ2)
SET(SUPPORT_LIBBZ2 1)
SET(PCREGREP_LIBS ${PCREGREP_LIBS} ${BZIP2_LIBRARIES})
ENDIF(PCRE_SUPPORT_LIBBZ2)
# Translate the symbolic PCRE_NEWLINE option into the numeric NEWLINE value.
# NEWLINE starts out empty so that an unrecognised option value can be
# detected (and rejected) once all the known spellings have been tried.
SET(NEWLINE "")
# 10 is the ASCII code of LF.
IF(PCRE_NEWLINE STREQUAL "LF")
SET(NEWLINE "10")
ENDIF(PCRE_NEWLINE STREQUAL "LF")
# 13 is the ASCII code of CR.
IF(PCRE_NEWLINE STREQUAL "CR")
SET(NEWLINE "13")
ENDIF(PCRE_NEWLINE STREQUAL "CR")
# 3338 equals 13*256 + 10 - presumably CR and LF packed into one number;
# TODO confirm against the NEWLINE description in config.h.in.
IF(PCRE_NEWLINE STREQUAL "CRLF")
SET(NEWLINE "3338")
ENDIF(PCRE_NEWLINE STREQUAL "CRLF")
# The two remaining choices are represented by negative values.
IF(PCRE_NEWLINE STREQUAL "ANY")
SET(NEWLINE "-1")
ENDIF(PCRE_NEWLINE STREQUAL "ANY")
IF(PCRE_NEWLINE STREQUAL "ANYCRLF")
SET(NEWLINE "-2")
ENDIF(PCRE_NEWLINE STREQUAL "ANYCRLF")
# Still empty here means that none of the accepted spellings matched.
IF(NEWLINE STREQUAL "")
MESSAGE(FATAL_ERROR "The PCRE_NEWLINE variable must be set to one of the following values: \"LF\", \"CR\", \"CRLF\", \"ANY\", \"ANYCRLF\".")
ENDIF(NEWLINE STREQUAL "")
IF(PCRE_EBCDIC)
SET(EBCDIC 1)
ENDIF(PCRE_EBCDIC)
IF(PCRE_NO_RECURSE)
SET(NO_RECURSE 1)
ENDIF(PCRE_NO_RECURSE)
# Output files
CONFIGURE_FILE(config-cmake.h.in
${PROJECT_BINARY_DIR}/config.h
@ONLY)
CONFIGURE_FILE(pcre.h.generic
${PROJECT_BINARY_DIR}/pcre.h
COPYONLY)
# What about pcre-config and libpcre.pc?
IF(PCRE_BUILD_PCRECPP)
CONFIGURE_FILE(pcre_stringpiece.h.in
${PROJECT_BINARY_DIR}/pcre_stringpiece.h
@ONLY)
CONFIGURE_FILE(pcrecpparg.h.in
${PROJECT_BINARY_DIR}/pcrecpparg.h
@ONLY)
ENDIF(PCRE_BUILD_PCRECPP)
# Character table generation
OPTION(PCRE_REBUILD_CHARTABLES "Rebuild char tables" OFF)
IF(PCRE_REBUILD_CHARTABLES)
ADD_EXECUTABLE(dftables dftables.c)
GET_TARGET_PROPERTY(DFTABLES_EXE dftables LOCATION)
ADD_CUSTOM_COMMAND(
COMMENT "Generating character tables (pcre_chartables.c) for current locale"
DEPENDS dftables
COMMAND ${DFTABLES_EXE}
ARGS ${PROJECT_BINARY_DIR}/pcre_chartables.c
OUTPUT ${PROJECT_BINARY_DIR}/pcre_chartables.c
)
ELSE(PCRE_REBUILD_CHARTABLES)
CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/pcre_chartables.c.dist
${PROJECT_BINARY_DIR}/pcre_chartables.c
COPYONLY)
ENDIF(PCRE_REBUILD_CHARTABLES)
# Source code
SET(PCRE_HEADERS ${PROJECT_BINARY_DIR}/pcre.h)
SET(PCRE_SOURCES
${PROJECT_BINARY_DIR}/pcre_chartables.c
pcre_compile.c
pcre_config.c
pcre_dfa_exec.c
pcre_exec.c
pcre_fullinfo.c
pcre_get.c
pcre_globals.c
pcre_info.c
pcre_newline.c
pcre_maketables.c
pcre_ord2utf8.c
pcre_refcount.c
pcre_study.c
pcre_tables.c
pcre_try_flipped.c
pcre_ucd.c
pcre_valid_utf8.c
pcre_version.c
pcre_xclass.c
)
SET(PCREPOSIX_HEADERS pcreposix.h)
SET(PCREPOSIX_SOURCES pcreposix.c)
SET(PCRECPP_HEADERS
pcrecpp.h
pcre_scanner.h
${PROJECT_BINARY_DIR}/pcrecpparg.h
${PROJECT_BINARY_DIR}/pcre_stringpiece.h
)
SET(PCRECPP_SOURCES
pcrecpp.cc
pcre_scanner.cc
pcre_stringpiece.cc
)
# Build setup
ADD_DEFINITIONS(-DHAVE_CONFIG_H)
IF(MSVC)
ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE)
ENDIF(MSVC)
SET(CMAKE_INCLUDE_CURRENT_DIR 1)
# needed to make sure to not link debug libs
# against release libs and vice versa
IF(WIN32)
SET(CMAKE_DEBUG_POSTFIX "d")
ENDIF(WIN32)
SET(targets)
# Libraries
# pcre
ADD_LIBRARY(pcre ${PCRE_HEADERS} ${PCRE_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET(targets ${targets} pcre)
ADD_LIBRARY(pcreposix ${PCREPOSIX_HEADERS} ${PCREPOSIX_SOURCES})
SET(targets ${targets} pcreposix)
TARGET_LINK_LIBRARIES(pcreposix pcre)
IF(MINGW AND NOT PCRE_STATIC)
IF(NON_STANDARD_LIB_PREFIX)
SET_TARGET_PROPERTIES(pcre pcreposix
PROPERTIES PREFIX ""
)
ENDIF(NON_STANDARD_LIB_PREFIX)
IF(NON_STANDARD_LIB_SUFFIX)
SET_TARGET_PROPERTIES(pcre pcreposix
PROPERTIES SUFFIX "-0.dll"
)
ENDIF(NON_STANDARD_LIB_SUFFIX)
ENDIF(MINGW AND NOT PCRE_STATIC)
# pcrecpp
IF(PCRE_BUILD_PCRECPP)
ADD_LIBRARY(pcrecpp ${PCRECPP_HEADERS} ${PCRECPP_SOURCES})
SET(targets ${targets} pcrecpp)
TARGET_LINK_LIBRARIES(pcrecpp pcre)
IF(MINGW AND NOT PCRE_STATIC)
IF(NON_STANDARD_LIB_PREFIX)
SET_TARGET_PROPERTIES(pcrecpp
PROPERTIES PREFIX ""
)
ENDIF(NON_STANDARD_LIB_PREFIX)
IF(NON_STANDARD_LIB_SUFFIX)
SET_TARGET_PROPERTIES(pcrecpp
PROPERTIES SUFFIX "-0.dll"
)
ENDIF(NON_STANDARD_LIB_SUFFIX)
ENDIF(MINGW AND NOT PCRE_STATIC)
ENDIF(PCRE_BUILD_PCRECPP)
# Executables
# Removed by PH (2008-01-23) because pcredemo shouldn't really be built
# automatically, and it gave trouble in some environments anyway.
# ADD_EXECUTABLE(pcredemo pcredemo.c)
# TARGET_LINK_LIBRARIES(pcredemo pcreposix)
# IF(NOT BUILD_SHARED_LIBS)
# # make sure to not use declspec(dllimport) in static mode on windows
# SET_TARGET_PROPERTIES(pcredemo PROPERTIES COMPILE_FLAGS "-DPCRE_STATIC")
# ENDIF(NOT BUILD_SHARED_LIBS)
IF(PCRE_BUILD_PCREGREP)
ADD_EXECUTABLE(pcregrep pcregrep.c)
SET(targets ${targets} pcregrep)
TARGET_LINK_LIBRARIES(pcregrep pcreposix ${PCREGREP_LIBS})
ENDIF(PCRE_BUILD_PCREGREP)
# Testing
IF(PCRE_BUILD_TESTS)
  ENABLE_TESTING()

  # pcretest is always built when tests are enabled.
  ADD_EXECUTABLE(pcretest pcretest.c)
  SET(targets ${targets} pcretest)
  TARGET_LINK_LIBRARIES(pcretest pcreposix ${PCRETEST_LIBS})

  # The C++ unit test programs exist only when the C++ wrapper is built.
  IF(PCRE_BUILD_PCRECPP)
    ADD_EXECUTABLE(pcrecpp_unittest pcrecpp_unittest.cc)
    SET(targets ${targets} pcrecpp_unittest)
    TARGET_LINK_LIBRARIES(pcrecpp_unittest pcrecpp)
    # This condition used to test NON_STANDARD_LIB_NAMES, which this file
    # never defines; the option that controls the "lib" prefix is
    # NON_STANDARD_LIB_PREFIX.
    IF(MINGW AND NON_STANDARD_LIB_PREFIX AND NOT PCRE_STATIC)
      SET_TARGET_PROPERTIES(pcrecpp
                            PROPERTIES PREFIX ""
      )
    ENDIF(MINGW AND NON_STANDARD_LIB_PREFIX AND NOT PCRE_STATIC)

    ADD_EXECUTABLE(pcre_scanner_unittest pcre_scanner_unittest.cc)
    SET(targets ${targets} pcre_scanner_unittest)
    TARGET_LINK_LIBRARIES(pcre_scanner_unittest pcrecpp)

    ADD_EXECUTABLE(pcre_stringpiece_unittest pcre_stringpiece_unittest.cc)
    SET(targets ${targets} pcre_stringpiece_unittest)
    TARGET_LINK_LIBRARIES(pcre_stringpiece_unittest pcrecpp)
  ENDIF(PCRE_BUILD_PCRECPP)

  GET_TARGET_PROPERTY(PCREGREP_EXE pcregrep DEBUG_LOCATION)
  GET_TARGET_PROPERTY(PCRETEST_EXE pcretest DEBUG_LOCATION)

  # Write out a CTest configuration file that sets some needed environment
  # variables for the test scripts.
  #
  FILE(WRITE ${PROJECT_BINARY_DIR}/CTestCustom.ctest
"# This is a generated file.
SET(ENV{srcdir} ${PROJECT_SOURCE_DIR})
SET(ENV{pcregrep} ${PCREGREP_EXE})
SET(ENV{pcretest} ${PCRETEST_EXE})
")

  IF(UNIX)
    ADD_TEST(pcre_test ${PROJECT_SOURCE_DIR}/RunTest)
    ADD_TEST(pcre_grep_test ${PROJECT_SOURCE_DIR}/RunGrepTest)
  ENDIF(UNIX)
  IF(WIN32)
    ADD_TEST(pcre_test cmd /C ${PROJECT_SOURCE_DIR}/RunTest.bat)
  ENDIF(WIN32)

  # Register the C++ unit tests only when their executables were declared
  # above; otherwise the properties would be looked up on, and tests added
  # for, targets that do not exist.
  IF(PCRE_BUILD_PCRECPP)
    GET_TARGET_PROPERTY(PCRECPP_UNITTEST_EXE
                        pcrecpp_unittest
                        DEBUG_LOCATION)

    GET_TARGET_PROPERTY(PCRE_SCANNER_UNITTEST_EXE
                        pcre_scanner_unittest
                        DEBUG_LOCATION)

    GET_TARGET_PROPERTY(PCRE_STRINGPIECE_UNITTEST_EXE
                        pcre_stringpiece_unittest
                        DEBUG_LOCATION)

    ADD_TEST(pcrecpp_test ${PCRECPP_UNITTEST_EXE})
    ADD_TEST(pcre_scanner_test ${PCRE_SCANNER_UNITTEST_EXE})
    ADD_TEST(pcre_stringpiece_test ${PCRE_STRINGPIECE_UNITTEST_EXE})
  ENDIF(PCRE_BUILD_PCRECPP)
ENDIF(PCRE_BUILD_TESTS)
# Installation
SET(CMAKE_INSTALL_ALWAYS 1)
INSTALL(TARGETS ${targets}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)
INSTALL(FILES ${PCRE_HEADERS} ${PCREPOSIX_HEADERS} DESTINATION include)
FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)
# Install the C++ headers when the C++ wrapper is built; otherwise drop the
# C++ man page from the list of section 3 pages that is installed below.
IF(PCRE_BUILD_PCRECPP)
  INSTALL(FILES ${PCRECPP_HEADERS} DESTINATION include)
ELSE(PCRE_BUILD_PCRECPP)
  # Remove pcrecpp.3
  # Build the filtered list incrementally, starting from an empty one. Each
  # kept entry must be appended to man3_new itself: appending to the full
  # man3 list would put pcrecpp.3 straight back in.
  SET(man3_new)
  FOREACH(man ${man3})
    GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
    IF(NOT man_tmp STREQUAL "pcrecpp.3")
      SET(man3_new ${man3_new} ${man})
    ENDIF(NOT man_tmp STREQUAL "pcrecpp.3")
  ENDFOREACH(man ${man3})
  SET(man3 ${man3_new})
ENDIF(PCRE_BUILD_PCRECPP)
INSTALL(FILES ${man1} DESTINATION man/man1)
INSTALL(FILES ${man3} DESTINATION man/man3)
INSTALL(FILES ${html} DESTINATION share/doc/pcre/html)
# help, only for nice output
IF(BUILD_SHARED_LIBS)
SET(BUILD_STATIC_LIBS OFF)
ELSE(BUILD_SHARED_LIBS)
SET(BUILD_STATIC_LIBS ON)
ENDIF(BUILD_SHARED_LIBS)
# Print a summary of the chosen configuration (can be switched off with the
# PCRE_SHOW_REPORT option).
IF(PCRE_SHOW_REPORT)
  STRING(TOUPPER "${CMAKE_BUILD_TYPE}" buildtype)
  # A separating space is needed only when the generic flags are non-empty
  # and the build-type specific flags are appended after them.
  IF (CMAKE_C_FLAGS)
    SET(cfsp " ")
  ENDIF(CMAKE_C_FLAGS)
  IF (CMAKE_CXX_FLAGS)
    SET(cxxfsp " ")
  ENDIF(CMAKE_CXX_FLAGS)
  # UTF-8 support is compiled in when it is requested directly or implied by
  # Unicode property support; report that effective state rather than the
  # Unicode property option alone.
  SET(utf8_enabled OFF)
  IF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
    SET(utf8_enabled ON)
  ENDIF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
  MESSAGE(STATUS "")
  MESSAGE(STATUS "")
  MESSAGE(STATUS "PCRE configuration summary:")
  MESSAGE(STATUS "")
  MESSAGE(STATUS " Install prefix .................. : ${CMAKE_INSTALL_PREFIX}")
  MESSAGE(STATUS " C compiler ...................... : ${CMAKE_C_COMPILER}")
  MESSAGE(STATUS " C++ compiler .................... : ${CMAKE_CXX_COMPILER}")
  MESSAGE(STATUS " C compiler flags ................ : ${CMAKE_C_FLAGS}${cfsp}${CMAKE_C_FLAGS_${buildtype}}")
  MESSAGE(STATUS " C++ compiler flags .............. : ${CMAKE_CXX_FLAGS}${cxxfsp}${CMAKE_CXX_FLAGS_${buildtype}}")
  MESSAGE(STATUS "")
  MESSAGE(STATUS " Build C++ library ............... : ${PCRE_BUILD_PCRECPP}")
  MESSAGE(STATUS " Enable UTF-8 support ............ : ${utf8_enabled}")
  MESSAGE(STATUS " Unicode properties .............. : ${PCRE_SUPPORT_UNICODE_PROPERTIES}")
  MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE_NEWLINE}")
  MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE_SUPPORT_BSR_ANYCRLF}")
  MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE_EBCDIC}")
  MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE_REBUILD_CHARTABLES}")
  MESSAGE(STATUS " No stack recursion .............. : ${PCRE_NO_RECURSE}")
  MESSAGE(STATUS " POSIX mem threshold ............. : ${PCRE_POSIX_MALLOC_THRESHOLD}")
  MESSAGE(STATUS " Internal link size .............. : ${PCRE_LINK_SIZE}")
  MESSAGE(STATUS " Match limit ..................... : ${PCRE_MATCH_LIMIT}")
  MESSAGE(STATUS " Match limit recursion ........... : ${PCRE_MATCH_LIMIT_RECURSION}")
  MESSAGE(STATUS " Build shared libs ............... : ${BUILD_SHARED_LIBS}")
  MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}")
  MESSAGE(STATUS " Build pcregrep .................. : ${PCRE_BUILD_PCREGREP}")
  MESSAGE(STATUS " Build tests (implies pcretest) .. : ${PCRE_BUILD_TESTS}")
  IF(ZLIB_FOUND)
    MESSAGE(STATUS " Link pcregrep with libz ......... : ${PCRE_SUPPORT_LIBZ}")
  ELSE(ZLIB_FOUND)
    MESSAGE(STATUS " Link pcregrep with libz ......... : None" )
  ENDIF(ZLIB_FOUND)
  IF(BZIP2_FOUND)
    MESSAGE(STATUS " Link pcregrep with libbz2 ....... : ${PCRE_SUPPORT_LIBBZ2}")
  ELSE(BZIP2_FOUND)
    MESSAGE(STATUS " Link pcregrep with libbz2 ....... : None" )
  ENDIF(BZIP2_FOUND)
  IF(NOT PCRE_SUPPORT_LIBREADLINE)
    MESSAGE(STATUS " Link pcretest with libreadline .. : None" )
  ELSE(NOT PCRE_SUPPORT_LIBREADLINE)
    MESSAGE(STATUS " Link pcretest with libreadline .. : ${PCRE_SUPPORT_LIBREADLINE}")
  ENDIF(NOT PCRE_SUPPORT_LIBREADLINE)
  # The dll naming options exist only for shared mingw builds.
  IF(MINGW AND NOT PCRE_STATIC)
    MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")
    MESSAGE(STATUS " Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}")
  ENDIF(MINGW AND NOT PCRE_STATIC)
  MESSAGE(STATUS "")
ENDIF(PCRE_SHOW_REPORT)
# end CMakeLists.txt

View File

@ -1,5 +0,0 @@
PCRE LICENCE
Please see the file LICENCE in the PCRE distribution for licensing details.
End

File diff suppressed because it is too large Load Diff

View File

@ -1,113 +0,0 @@
#! /usr/bin/perl -w
# Script to take the output of nroff -man and remove all the backspacing and
# the page footers and the screen commands etc so that it is more usefully
# readable online. In fact, in the latest nroff, intermediate footers don't
# seem to be generated any more.
# State carried from one input line to the next:
# $blankcount - number of blank lines seen since the last output that have
# not themselves been output yet
# $lastwascut - 1 immediately after a chunk of lines has been cut out
# $firstheader - 1 until the first header line has been printed
$blankcount = 0;
$lastwascut = 0;
$firstheader = 1;
# Input on STDIN; output to STDOUT.
while (<STDIN>)
{
s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
s/.\x8//g; # Remove "char, backspace"
# Handle header lines. Retain only the first one we encounter, but remove
# the blank line that follows. Any others (e.g. at end of document) and the
# following blank line are dropped.
# A header line is recognised by the same PCREname(1) or PCREname(3) text
# appearing at both ends of the line.
if (/^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/)
{
if ($firstheader)
{
$firstheader = 0;
print;
$lastprinted = $_;
$lastwascut = 0;
}
$_=<STDIN>; # Remove a blank that follows
next;
}
# Count runs of empty lines
if (/^\s*$/)
{
$blankcount++;
$lastwascut = 0;
next;
}
# If a chunk of lines has been cut out (page footer) and the next line
# has a different indentation, put back one blank line.
if ($lastwascut && $blankcount < 1 && defined($lastprinted))
{
# Compare the leading whitespace of the last printed line and of this one.
# NOTE(review): $a and $b are the variables sort() uses; no sort is visible
# in this script, so this looks harmless - confirm if a sort is ever added.
($a) = $lastprinted =~ /^(\s*)/;
($b) = $_ =~ /^(\s*)/;
$blankcount++ if ($a ne $b);
}
# We get here only when we have a non-blank line in hand. If it was preceded
# by 3 or more blank lines, read the next 3 lines and see if they are blank.
# If so, remove all 7 lines, and remember that we have just done a cut.
if ($blankcount >= 3)
{
# Read the three lookahead lines, cleaning them up in the same way as the
# main input line; a read past end of input is treated as an empty line.
for ($i = 0; $i < 3; $i++)
{
$next[$i] = <STDIN>;
$next[$i] = "" if !defined $next[$i];
$next[$i] =~ s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
$next[$i] =~ s/.\x8//g; # Remove "char, backspace"
}
# Cut out chunks of the form <3 blanks><non-blank><3 blanks>
if ($next[0] =~ /^\s*$/ &&
$next[1] =~ /^\s*$/ &&
$next[2] =~ /^\s*$/)
{
$blankcount -= 3;
$lastwascut = 1;
}
# Otherwise output the saved blanks, the current, and the next three
# lines. Remember the last printed line.
else
{
for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
print;
for ($i = 0; $i < 3; $i++)
{
# NOTE(review): the same substitution was already applied to $next[$i] when
# it was read above, so this one looks redundant - confirm before removing.
$next[$i] =~ s/.\x8//g;
print $next[$i];
# NOTE(review): this records the current line ($_), not $next[$i], on every
# pass; confirm that is what "the last printed line" is meant to be.
$lastprinted = $_;
}
$lastwascut = 0;
$blankcount = 0;
}
}
# This non-blank line is not preceded by 3 or more blank lines. Output
# any blanks there are, and the line. Remember it. Force two blank lines
# before headings.
else
{
# A heading here is a line starting in column one, other than the
# "Last updated" and "Copyright" lines; nothing is forced before the
# very first printed line.
$blankcount = 2 if /^\S/ && !/^Last updated/ && !/^Copyright/ &&
defined($lastprinted);
for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
print;
$lastprinted = $_;
$lastwascut = 0;
$blankcount = 0;
}
}
# End

View File

@ -1,35 +0,0 @@
#!/usr/bin/perl
# This is a script for removing trailing whitespace from lines in files that
# are listed on the command line.
# This subroutine does the work for one file. The whole file is read into
# memory, trailing whitespace is removed from every newline-terminated line,
# and the file is rewritten only if at least one line actually changed.
#
# Argument: the name of the file to process.
# Dies if the file cannot be opened for input or output, or if the rewritten
# contents cannot be flushed to it.
sub detrail {
    my ($file) = @_;
    my $changed = 0;

    # Three-argument open with a lexical handle: the name is used literally,
    # so leading or trailing whitespace and characters such as '>' or '|' in
    # it are never interpreted as part of an open mode.
    open(my $in, '<', $file) || die "Can't open $file for input";
    my @lines = <$in>;
    close($in);

    # s/// returns true only when it changed the line.
    foreach (@lines) {
        $changed = 1 if s/\s+\n$/\n/;
    }

    if ($changed) {
        open(my $out, '>', $file) || die "Can't open $file for output";
        print $out @lines;
        # Buffered write errors only surface when the handle is closed.
        close($out) || die "Can't finish writing $file";
    }
}
# This is the main program: every file named on the command line is
# processed in turn.
$, = ""; # Output field separator
foreach my $name (@ARGV) {
    detrail($name);
}
# End

View File

@ -1,418 +0,0 @@
Technical Notes about PCRE
--------------------------
These are very rough technical notes that record potentially useful information
about PCRE internals.
Historical note 1
-----------------
Many years ago I implemented some regular expression functions to an algorithm
suggested by Martin Richards. These were not Unix-like in form, and were quite
restricted in what they could do by comparison with Perl. The interesting part
about the algorithm was that the amount of space required to hold the compiled
form of an expression was known in advance. The code to apply an expression did
not operate by backtracking, as the original Henry Spencer code and current
Perl code does, but instead checked all possibilities simultaneously by keeping
a list of current states and checking all of them as it advanced through the
subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
algorithm", though it was not a traditional Finite State Machine (FSM). When
the pattern was all used up, all remaining states were possible matches, and
the one matching the longest subset of the subject string was chosen. This did
not necessarily maximize the individual wild portions of the pattern, as is
expected in Unix and Perl-style regular expressions.
Historical note 2
-----------------
By contrast, the code originally written by Henry Spencer (which was
subsequently heavily modified for Perl) compiles the expression twice: once in
a dummy mode in order to find out how much store will be needed, and then for
real. (The Perl version probably doesn't do this any more; I'm talking about
the original library.) The execution function operates by backtracking and
maximizing (or, optionally, minimizing in Perl) the amount of the subject that
matches individual wild portions of the pattern. This is an "NFA algorithm" in
Friedl's terminology.
OK, here's the real stuff
-------------------------
For the set of functions that form the "basic" PCRE library (which are
unrelated to those mentioned above), I tried at first to invent an algorithm
that used an amount of store bounded by a multiple of the number of characters
in the pattern, to save on compiling time. However, because of the greater
complexity in Perl regular expressions, I couldn't do this. In any case, a
first pass through the pattern is helpful for other reasons.
Computing the memory requirement: how it was
--------------------------------------------
Up to and including release 6.7, PCRE worked by running a very degenerate first
pass to calculate a maximum store size, and then a second pass to do the real
compile - which might use a bit less than the predicted amount of memory. The
idea was that this would turn out faster than the Henry Spencer code because
the first pass is degenerate and the second pass can just store stuff straight
into the vector, which it knows is big enough.
Computing the memory requirement: how it is
-------------------------------------------
By the time I was working on a potential 6.8 release, the degenerate first pass
had become very complicated and hard to maintain. Indeed one of the early
things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then
I had a flash of inspiration as to how I could run the real compile function in
a "fake" mode that enables it to compute how much memory it would need, while
actually only ever using a few hundred bytes of working memory, and without too
many tests of the mode that might slow it down. So I re-factored the compiling
functions to work this way. This got rid of about 600 lines of source. It
should make future maintenance and development easier. As this was such a major
change, I never released 6.8, instead upping the number to 7.0 (other quite
major changes are also present in the 7.0 release).
A side effect of this work is that the previous limit of 200 on the nesting
depth of parentheses was removed. However, there is a downside: pcre_compile()
runs more slowly than before (30% or more, depending on the pattern) because it
is doing a full analysis of the pattern. My hope is that this is not a big
issue.
Traditional matching function
-----------------------------
The "traditional", and original, matching function is called pcre_exec(), and
it implements an NFA algorithm, similar to the original Henry Spencer algorithm
and the way that Perl works. Not surprising, since it is intended to be as
compatible with Perl as possible. This is the function most users of PCRE will
use most of the time.
Supplementary matching function
-------------------------------
From PCRE 6.0, there is also a supplementary matching function called
pcre_dfa_exec(). This implements a DFA matching algorithm that searches
simultaneously for all possible matches that start at one point in the subject
string. (Going back to my roots: see Historical Note 1 above.) This function
interprets the same compiled pattern data as pcre_exec(); however, not all the
facilities are available, and those that are do not always work in quite the
same way. See the user documentation for details.
The algorithm that is used for pcre_dfa_exec() is not a traditional FSM,
because it may have a number of states active at one time. More work would be
needed at compile time to produce a traditional FSM where only one state is
ever active at once. I believe some other regex matchers work this way.
Format of compiled patterns
---------------------------
The compiled form of a pattern is a vector of bytes, containing items of
variable length. The first byte in an item is an opcode, and the length of the
item is either implicit in the opcode or contained in the data bytes that
follow it.
In many cases below LINK_SIZE data values are specified for offsets within the
compiled pattern. The default value for LINK_SIZE is 2, but PCRE can be
compiled to use 3-byte or 4-byte values for these offsets (impairing the
performance). This is necessary only when patterns whose compiled length is
greater than 64K are going to be processed. In this description, we assume the
"normal" compilation options. Data values that are counts (e.g. for
quantifiers) are always just two bytes long.
A list of the opcodes follows:
Opcodes with no following data
------------------------------
These items are all just one byte long
OP_END end of pattern
OP_ANY match any one character other than newline
OP_ALLANY match any one character, including newline
OP_ANYBYTE match any single byte, even in UTF-8 mode
OP_SOD match start of data: \A
OP_SOM start of match (subject + offset): \G
OP_SET_SOM set start of match (\K)
OP_CIRC ^ (start of data, or after \n in multiline)
OP_NOT_WORD_BOUNDARY \B
OP_WORD_BOUNDARY \b
OP_NOT_DIGIT \D
OP_DIGIT \d
OP_NOT_HSPACE \H
OP_HSPACE \h
OP_NOT_WHITESPACE \S
OP_WHITESPACE \s
OP_NOT_VSPACE \V
OP_VSPACE \v
OP_NOT_WORDCHAR \W
OP_WORDCHAR \w
OP_EODN match end of data or \n at end: \Z
OP_EOD match end of data: \z
OP_DOLL $ (end of data, or before \n in multiline)
OP_EXTUNI match an extended Unicode character
OP_ANYNL match any Unicode newline sequence
OP_ACCEPT )
OP_COMMIT )
OP_FAIL ) These are Perl 5.10's "backtracking
OP_PRUNE ) control verbs".
OP_SKIP )
OP_THEN )
Repeating single characters
---------------------------
The common repeats (*, +, ?) when applied to a single character use the
following opcodes:
OP_STAR
OP_MINSTAR
OP_POSSTAR
OP_PLUS
OP_MINPLUS
OP_POSPLUS
OP_QUERY
OP_MINQUERY
OP_POSQUERY
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
Those with "MIN" in their name are the minimizing versions. Those with "POS" in
their names are possessive versions. Each is followed by the character that is
to be repeated. Other repeats make use of
OP_UPTO
OP_MINUPTO
OP_POSUPTO
OP_EXACT
which are followed by a two-byte count (most significant first) and the
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
Repeating character types
-------------------------
Repeats of things like \d are done exactly as for single characters, except
that instead of a character, the opcode for the type is stored in the data
byte. The opcodes are:
OP_TYPESTAR
OP_TYPEMINSTAR
OP_TYPEPOSSTAR
OP_TYPEPLUS
OP_TYPEMINPLUS
OP_TYPEPOSPLUS
OP_TYPEQUERY
OP_TYPEMINQUERY
OP_TYPEPOSQUERY
OP_TYPEUPTO
OP_TYPEMINUPTO
OP_TYPEPOSUPTO
OP_TYPEEXACT
Match by Unicode property
-------------------------
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
character by testing its Unicode property (the \p and \P escape sequences).
Each is followed by two bytes that encode the desired property as a type and a
value.
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
three bytes: OP_PROP or OP_NOTPROP and then the desired property type and
value.
Matching literal characters
---------------------------
The OP_CHAR opcode is followed by a single character that is to be matched
casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
character may be more than one byte long. (Earlier versions of PCRE used
multi-character strings, but this was changed to allow some new features to be
added.)
Character classes
-----------------
If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
class, and OP_NOT for a negative one (that is, for something like [^a]).
However, in UTF-8 mode, the use of OP_NOT applies only to characters with
values < 128, because OP_NOT is confined to single bytes.
Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
negated, single-character class. The normal ones (OP_STAR etc.) are used for a
repeated positive single-character class.
When there's more than one character in a class and all the characters are less
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
one. In either case, the opcode is followed by a 32-byte bit map containing a 1
bit for every character that is acceptable. The bits are counted from the least
significant end of each byte.
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
subject characters with values greater than 255 can be handled correctly. For
OP_CLASS they don't match, whereas for OP_NCLASS they do.
For classes containing characters with values > 255, OP_XCLASS is used. It
optionally uses a bit map (if any characters lie within it), followed by a list
of pairs and single characters. There is a flag character that indicates
whether it's a positive or a negative class.
Back references
---------------
OP_REF is followed by two bytes containing the reference number.
Repeating character classes and back references
-----------------------------------------------
Single-character classes are handled specially (see above). This section
applies to OP_CLASS and OP_REF. In both cases, the repeat information follows
the base item. The matching code looks at the following opcode to see if it is
one of
OP_CRSTAR
OP_CRMINSTAR
OP_CRPLUS
OP_CRMINPLUS
OP_CRQUERY
OP_CRMINQUERY
OP_CRRANGE
OP_CRMINRANGE
All but the last two are just single-byte items. The others are followed by
four bytes of data, comprising the minimum and maximum repeat counts. There are
no special possessive opcodes for these repeats; a possessive repeat is
compiled into an atomic group.
Brackets and alternation
------------------------
A pair of non-capturing (round) brackets is wrapped round each expression at
compile time, so alternation always happens in the context of brackets.
[Note for North Americans: "bracket" to some English speakers, including
myself, can be round, square, curly, or pointy. Hence this usage.]
Non-capturing brackets use the opcode OP_BRA. Originally PCRE was limited to 99
capturing brackets and it used a different opcode for each one. From release
3.5, the limit was removed by putting the bracket number into the data for
higher-numbered brackets. From release 7.0 all capturing brackets are handled
this way, using the single opcode OP_CBRA.
A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
next alternative OP_ALT or, if there aren't any branches, to the matching
OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
the next one, or to the OP_KET opcode. For capturing brackets, the bracket
number immediately follows the offset, always as a 2-byte item.
OP_KET is used for subpatterns that do not repeat indefinitely, while
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
positive number) the offset back to the matching bracket opcode.
If a subpattern is quantified such that it is permitted to match zero times, it
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
single-byte opcodes that tell the matcher that skipping the following
subpattern entirely is a valid branch. In the case of the first two, not
skipping the pattern is also valid (greedy and non-greedy). The third is used
when a pattern has the quantifier {0,0}. It cannot be entirely discarded,
because it may be called as a subroutine from elsewhere in the regex.
A subpattern with an indefinite maximum repetition is replicated in the
compiled data its minimum number of times (or once with OP_BRAZERO if the
minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX
as appropriate.
A subpattern with a bounded maximum repetition is replicated in a nested
fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
before each replication after the minimum, so that, for example, (abc){2,5} is
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?, except that each bracketed group
has the same number.
When a repeated subpattern has an unbounded upper limit, it is checked to see
whether it could match an empty string. If this is the case, the opcode in the
final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher
that it needs to check for matching an empty string when it hits OP_KETRMIN or
OP_KETRMAX, and if so, to break the loop.
Assertions
----------
Forward assertions are just like other subpatterns, but starting with one of
the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
is OP_REVERSE, followed by a two byte count of the number of characters to move
back the pointer in the subject string. When operating in UTF-8 mode, the count
is a character count rather than a byte count. A separate count is present in
each alternative of a lookbehind assertion, allowing them to have different
fixed lengths.
Once-only (atomic) subpatterns
------------------------------
These are also just like other subpatterns, but they start with the opcode
OP_ONCE. The check for matching an empty string in an unbounded repeat is
handled entirely at runtime, so there is just this one opcode.
Conditional subpatterns
-----------------------
These are like other subpatterns, but they start with the opcode OP_COND, or
OP_SCOND for one that might match an empty string in an unbounded repeat. If
the condition is a back reference, this is stored at the start of the
subpattern using the opcode OP_CREF followed by two bytes containing the
reference number. If the condition is "in recursion" (coded as "(?(R)"), or "in
recursion of group x" (coded as "(?(Rx)"), the group number is stored at the
start of the subpattern using the opcode OP_RREF, and a value of zero for "the
whole pattern". For a DEFINE condition, just the single byte OP_DEF is used (it
has no associated data). Otherwise, a conditional subpattern always starts with
one of the assertions.
Recursion
---------
Recursion either matches the current regex, or some subexpression. The opcode
OP_RECURSE is followed by a value which is the offset to the starting bracket
from the start of the whole pattern. From release 6.5, OP_RECURSE is
automatically wrapped inside OP_ONCE brackets (because otherwise some patterns
broke it). OP_RECURSE is also used for "subroutine" calls, even though they
are not strictly a recursion.
Callout
-------
OP_CALLOUT is followed by one byte of data that holds a callout number in the
range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
cases there follows a two-byte value giving the offset in the pattern to the
start of the following item, and another two-byte item giving the length of the
next item.
Changing options
----------------
If any of the /i, /m, or /s options are changed within a pattern, an OP_OPT
opcode is compiled, followed by one byte containing the new settings of these
flags. If there are several alternatives, there is an occurrence of OP_OPT at
the start of all those following the first options change, to set appropriate
options for the start of the alternative. Immediately after the end of the
group there is another such item to reset the flags to their previous values. A
change of flag right at the very start of the pattern can be handled entirely
at compile time, and so does not cause anything to be put into the compiled
data.
Philip Hazel
April 2008

View File

@ -1,291 +0,0 @@
Installation Instructions
*************************
Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
2006, 2007, 2008 Free Software Foundation, Inc.
This file is free documentation; the Free Software Foundation gives
unlimited permission to copy, distribute and modify it.
Basic Installation
==================
Briefly, the shell commands `./configure; make; make install' should
configure, build, and install this package. The following
more-detailed instructions are generic; see the `README' file for
instructions specific to this package.
The `configure' shell script attempts to guess correct values for
various system-dependent variables used during compilation. It uses
those values to create a `Makefile' in each directory of the package.
It may also create one or more `.h' files containing system-dependent
definitions. Finally, it creates a shell script `config.status' that
you can run in the future to recreate the current configuration, and a
file `config.log' containing compiler output (useful mainly for
debugging `configure').
It can also use an optional file (typically called `config.cache'
and enabled with `--cache-file=config.cache' or simply `-C') that saves
the results of its tests to speed up reconfiguring. Caching is
disabled by default to prevent problems with accidental use of stale
cache files.
If you need to do unusual things to compile the package, please try
to figure out how `configure' could check whether to do them, and mail
diffs or instructions to the address given in the `README' so they can
be considered for the next release. If you are using the cache, and at
some point `config.cache' contains results you don't want to keep, you
may remove or edit it.
The file `configure.ac' (or `configure.in') is used to create
`configure' by a program called `autoconf'. You need `configure.ac' if
you want to change it or regenerate `configure' using a newer version
of `autoconf'.
The simplest way to compile this package is:
1. `cd' to the directory containing the package's source code and type
`./configure' to configure the package for your system.
Running `configure' might take a while. While running, it prints
some messages telling which features it is checking for.
2. Type `make' to compile the package.
3. Optionally, type `make check' to run any self-tests that come with
the package.
4. Type `make install' to install the programs and any data files and
documentation.
5. You can remove the program binaries and object files from the
source code directory by typing `make clean'. To also remove the
files that `configure' created (so you can compile the package for
a different kind of computer), type `make distclean'. There is
also a `make maintainer-clean' target, but that is intended mainly
for the package's developers. If you use it, you may have to get
all sorts of other programs in order to regenerate files that came
with the distribution.
6. Often, you can also type `make uninstall' to remove the installed
files again.
Compilers and Options
=====================
Some systems require unusual options for compilation or linking that
the `configure' script does not know about. Run `./configure --help'
for details on some of the pertinent environment variables.
You can give `configure' initial values for configuration parameters
by setting variables in the command line or in the environment. Here
is an example:
./configure CC=c99 CFLAGS=-g LIBS=-lposix
*Note Defining Variables::, for more details.
Compiling For Multiple Architectures
====================================
You can compile the package for more than one kind of computer at the
same time, by placing the object files for each architecture in their
own directory. To do this, you can use GNU `make'. `cd' to the
directory where you want the object files and executables to go and run
the `configure' script. `configure' automatically checks for the
source code in the directory that `configure' is in and in `..'.
With a non-GNU `make', it is safer to compile the package for one
architecture at a time in the source code directory. After you have
installed the package for one architecture, use `make distclean' before
reconfiguring for another architecture.
On MacOS X 10.5 and later systems, you can create libraries and
executables that work on multiple system types--known as "fat" or
"universal" binaries--by specifying multiple `-arch' options to the
compiler but only a single `-arch' option to the preprocessor. Like
this:
./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
CPP="gcc -E" CXXCPP="g++ -E"
This is not guaranteed to produce working output in all cases, you
may have to build one architecture at a time and combine the results
using the `lipo' tool if you have problems.
Installation Names
==================
By default, `make install' installs the package's commands under
`/usr/local/bin', include files under `/usr/local/include', etc. You
can specify an installation prefix other than `/usr/local' by giving
`configure' the option `--prefix=PREFIX'.
You can specify separate installation prefixes for
architecture-specific files and architecture-independent files. If you
pass the option `--exec-prefix=PREFIX' to `configure', the package uses
PREFIX as the prefix for installing programs and libraries.
Documentation and other data files still use the regular prefix.
In addition, if you use an unusual directory layout you can give
options like `--bindir=DIR' to specify different values for particular
kinds of files. Run `configure --help' for a list of the directories
you can set and what kinds of files go in them.
If the package supports it, you can cause programs to be installed
with an extra prefix or suffix on their names by giving `configure' the
option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
Optional Features
=================
Some packages pay attention to `--enable-FEATURE' options to
`configure', where FEATURE indicates an optional part of the package.
They may also pay attention to `--with-PACKAGE' options, where PACKAGE
is something like `gnu-as' or `x' (for the X Window System). The
`README' should mention any `--enable-' and `--with-' options that the
package recognizes.
For packages that use the X Window System, `configure' can usually
find the X include and library files automatically, but if it doesn't,
you can use the `configure' options `--x-includes=DIR' and
`--x-libraries=DIR' to specify their locations.
Particular systems
==================
On HP-UX, the default C compiler is not ANSI C compatible. If GNU
CC is not installed, it is recommended to use the following options in
order to use an ANSI C compiler:
./configure CC="cc -Ae"
and if that doesn't work, install pre-built binaries of GCC for HP-UX.
On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
parse its `<wchar.h>' header file. The option `-nodtk' can be used as
a workaround. If GNU CC is not installed, it is therefore recommended
to try
./configure CC="cc"
and if that doesn't work, try
./configure CC="cc -nodtk"
Specifying the System Type
==========================
There may be some features `configure' cannot figure out
automatically, but needs to determine by the type of machine the package
will run on. Usually, assuming the package is built to be run on the
_same_ architectures, `configure' can figure that out, but if it prints
a message saying it cannot guess the machine type, give it the
`--build=TYPE' option. TYPE can either be a short name for the system
type, such as `sun4', or a canonical name which has the form:
CPU-COMPANY-SYSTEM
where SYSTEM can have one of these forms:
OS KERNEL-OS
See the file `config.sub' for the possible values of each field. If
`config.sub' isn't included in this package, then this package doesn't
need to know the machine type.
If you are _building_ compiler tools for cross-compiling, you should
use the option `--target=TYPE' to select the type of system they will
produce code for.
If you want to _use_ a cross compiler, that generates code for a
platform different from the build platform, you should specify the
"host" platform (i.e., that on which the generated programs will
eventually be run) with `--host=TYPE'.
Sharing Defaults
================
If you want to set default values for `configure' scripts to share,
you can create a site shell script called `config.site' that gives
default values for variables like `CC', `cache_file', and `prefix'.
`configure' looks for `PREFIX/share/config.site' if it exists, then
`PREFIX/etc/config.site' if it exists. Or, you can set the
`CONFIG_SITE' environment variable to the location of the site script.
A warning: not all `configure' scripts look for a site script.
Defining Variables
==================
Variables not defined in a site shell script can be set in the
environment passed to `configure'. However, some packages may run
configure again during the build, and the customized values of these
variables may be lost. In order to avoid this problem, you should set
them in the `configure' command line, using `VAR=value'. For example:
./configure CC=/usr/local2/bin/gcc
causes the specified `gcc' to be used as the C compiler (unless it is
overridden in the site shell script).
Unfortunately, this technique does not work for `CONFIG_SHELL' due to
an Autoconf bug. Until the bug is fixed you can use this workaround:
CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
`configure' Invocation
======================
`configure' recognizes the following options to control how it
operates.
`--help'
`-h'
Print a summary of all of the options to `configure', and exit.
`--help=short'
`--help=recursive'
Print a summary of the options unique to this package's
`configure', and exit. The `short' variant lists options used
only in the top level, while the `recursive' variant lists options
also present in any nested packages.
`--version'
`-V'
Print the version of Autoconf used to generate the `configure'
script, and exit.
`--cache-file=FILE'
Enable the cache: use and save the results of the tests in FILE,
traditionally `config.cache'. FILE defaults to `/dev/null' to
disable caching.
`--config-cache'
`-C'
Alias for `--cache-file=config.cache'.
`--quiet'
`--silent'
`-q'
Do not print messages saying which checks are being made. To
suppress all normal output, redirect it to `/dev/null' (any error
messages will still be shown).
`--srcdir=DIR'
Look for the package's source code in directory DIR. Usually
`configure' can determine that directory automatically.
`--prefix=DIR'
Use DIR as the installation prefix. *Note Installation Names::
for more details, including other options available for fine-tuning
the installation locations.
`--no-create'
`-n'
Run the configure checks, but stop before creating any output
files.
`configure' also accepts some other, not widely useful, options. Run
`configure --help' for more details.

View File

@ -1,68 +0,0 @@
PCRE LICENCE
------------
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.
The basic library functions are written in C and are freestanding. Also
included in the distribution is a set of C++ wrapper functions.
THE BASIC LIBRARY FUNCTIONS
---------------------------
Written by: Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
Copyright (c) 1997-2009 University of Cambridge
All rights reserved.
THE C++ WRAPPER FUNCTIONS
-------------------------
Contributed by: Google Inc.
Copyright (c) 2007-2008, Google Inc.
All rights reserved.
THE "BSD" LICENCE
-----------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the name of Google
Inc. nor the names of their contributors may be used to endorse or
promote products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
End

View File

@ -1,334 +0,0 @@
## Process this file with automake to produce Makefile.in.
# The HTML page for the C++ wrapper is shipped in the distribution but is not
# installed (dist_noinst_ prefix).
pcrecpp_html = doc/html/pcrecpp.html
dist_noinst_DATA = $(pcrecpp_html)
# The variables below are deliberately initialised empty so that the sections
# further down can append to them with '+='.
# The Libtool libraries to install. We'll add to this later.
lib_LTLIBRARIES =
# Unit tests you want to run when people type 'make check'.
# TESTS is for binary unit tests, check_SCRIPTS for script-based tests
TESTS =
check_SCRIPTS =
dist_noinst_SCRIPTS =
# Some of the binaries we make are to be installed, and others are
# (non-user-visible) helper programs needed to build libpcre.
bin_PROGRAMS =
noinst_PROGRAMS =
# Additional files to delete on 'make clean' and 'make maintainer-clean'.
CLEANFILES =
MAINTAINERCLEANFILES =
# Additional files to bundle with the distribution, over and above what
# the Autotools include by default.
EXTRA_DIST =
# Maintenance notes that ship with the tarball.
EXTRA_DIST += doc/perltest.txt \
              NON-UNIX-USE \
              HACKING

# Scripts and templates used when preparing a release.
EXTRA_DIST += PrepareRelease \
              CleanTxt \
              Detrail \
              132html \
              doc/index.html.src

# Files for building with Virtual Pascal.
EXTRA_DIST += makevp.bat \
              makevp_c.txt \
              makevp_l.txt \
              pcregexp.pas

# Ready-to-use versions of pcre.h and config.h, distributed for people who
# build PCRE by hand without the Autotools support.
EXTRA_DIST += pcre.h.generic \
              config.h.generic
# Refresh the distributed, ready-to-use copy of pcre.h whenever configure.ac
# changes. The old copy is removed first and the current pcre.h is copied
# over it with its timestamp and mode preserved (cp -p).
# NOTE: recipe lines must begin with a TAB character.
pcre.h.generic: configure.ac
	rm -f $@
	cp -p pcre.h $@

# The generic header is regenerated by the rule above, so it is only removed
# by 'make maintainer-clean'.
MAINTAINERCLEANFILES += pcre.h.generic
# These are the header files we'll install. We do not distribute pcre.h because
# it is generated from pcre.h.in.
nodist_include_HEADERS = \
pcre.h
include_HEADERS = \
pcreposix.h
# These additional headers will be installed if C++ support is enabled. We
# do not distribute pcrecpparg.h or pcre_stringpiece.h, as these are generated
# from corresponding .h.in files (which we do distribute).
if WITH_PCRE_CPP
nodist_include_HEADERS += \
pcrecpparg.h \
pcre_stringpiece.h
include_HEADERS += \
pcrecpp.h \
pcre_scanner.h
endif # WITH_PCRE_CPP
# The pcre-config script is installed into $(bindir).
bin_SCRIPTS = pcre-config
## ---------------------------------------------------------------
## The dftables program is used to rebuild character tables before compiling
## PCRE, if --enable-rebuild-chartables is specified. It is not a user-visible
## program. The default (when --enable-rebuild-chartables is not specified) is
## to copy a distributed set of tables that are defined for ASCII code. In this
## case, dftables is not needed.
##
## NOTE: the recipe lines below must begin with a TAB character.

if WITH_REBUILD_CHARTABLES

noinst_PROGRAMS += dftables
dftables_SOURCES = dftables.c

# Run the freshly built helper; it writes the tables to the file named by $@.
pcre_chartables.c: dftables$(EXEEXT)
	./dftables$(EXEEXT) $@

else

# Use the distributed default tables. Any previous file or link is removed
# first because a plain 'ln -s' refuses to overwrite an existing file.
pcre_chartables.c: $(srcdir)/pcre_chartables.c.dist
	rm -f $@
	$(LN_S) $(srcdir)/pcre_chartables.c.dist $@

endif # WITH_REBUILD_CHARTABLES
## The core PCRE library.
lib_LTLIBRARIES += libpcre.la

libpcre_la_SOURCES = \
  pcre_compile.c \
  pcre_config.c \
  pcre_dfa_exec.c \
  pcre_exec.c \
  pcre_fullinfo.c \
  pcre_get.c \
  pcre_globals.c \
  pcre_info.c \
  pcre_internal.h \
  pcre_maketables.c \
  pcre_newline.c \
  pcre_ord2utf8.c \
  pcre_refcount.c \
  pcre_study.c \
  pcre_tables.c \
  pcre_try_flipped.c \
  pcre_ucd.c \
  pcre_valid_utf8.c \
  pcre_version.c \
  pcre_xclass.c \
  ucp.h

## pcre_chartables.c is produced during the build, so it is compiled into the
## library but never placed in the distribution.
nodist_libpcre_la_SOURCES = pcre_chartables.c

libpcre_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS)

# pcre_printint.src is #included by some source files, so it has to be
# distributed. pcre_chartables.c.dist is the stock version of
# pcre_chartables.c, used unless --enable-rebuild-chartables is specified.
EXTRA_DIST += pcre_printint.src pcre_chartables.c.dist

CLEANFILES += pcre_chartables.c
## The POSIX regex API wrapper; it is linked against the core library.
lib_LTLIBRARIES += libpcreposix.la

libpcreposix_la_SOURCES = pcreposix.c
libpcreposix_la_LIBADD = libpcre.la
libpcreposix_la_LDFLAGS = $(EXTRA_LIBPCREPOSIX_LDFLAGS)
## The optional C++ wrapper library and its unit tests.
if WITH_PCRE_CPP

lib_LTLIBRARIES += libpcrecpp.la

libpcrecpp_la_SOURCES = \
  pcrecpp_internal.h \
  pcrecpp.cc \
  pcre_scanner.cc \
  pcre_stringpiece.cc
libpcrecpp_la_LIBADD = libpcre.la
libpcrecpp_la_LDFLAGS = $(EXTRA_LIBPCRECPP_LDFLAGS)

## The three unit tests are built as non-installed programs and run by
## 'make check'; each one links against the C++ wrapper library.
TESTS += \
  pcrecpp_unittest \
  pcre_scanner_unittest \
  pcre_stringpiece_unittest
noinst_PROGRAMS += \
  pcrecpp_unittest \
  pcre_scanner_unittest \
  pcre_stringpiece_unittest

pcrecpp_unittest_SOURCES = pcrecpp_unittest.cc
pcrecpp_unittest_LDADD = libpcrecpp.la

pcre_scanner_unittest_SOURCES = pcre_scanner_unittest.cc
pcre_scanner_unittest_LDADD = libpcrecpp.la

pcre_stringpiece_unittest_SOURCES = pcre_stringpiece_unittest.cc
pcre_stringpiece_unittest_LDADD = libpcrecpp.la

endif # WITH_PCRE_CPP
## The main test suite.
# A unit test consists of a binary and a driver script that runs the binary
# in various ways. The binaries are installed in case people find them useful.

# pcretest, driven by RunTest.
bin_PROGRAMS += pcretest
pcretest_SOURCES = pcretest.c
pcretest_LDADD = libpcreposix.la $(LIBREADLINE)
TESTS += RunTest
dist_noinst_SCRIPTS += RunTest
EXTRA_DIST += RunTest.bat

# pcregrep, driven by RunGrepTest.
bin_PROGRAMS += pcregrep
pcregrep_SOURCES = pcregrep.c
pcregrep_LDADD = libpcreposix.la $(LIBZ) $(LIBBZ2)
TESTS += RunGrepTest
dist_noinst_SCRIPTS += RunGrepTest

# Input and expected-output files for the test scripts, plus perltest.pl.
EXTRA_DIST += \
  testdata/grepinput \
  testdata/grepinput8 \
  testdata/grepinputv \
  testdata/grepinputx \
  testdata/greplist \
  testdata/grepoutput \
  testdata/grepoutput8 \
  testdata/grepoutputN \
  testdata/testinput1 \
  testdata/testinput2 \
  testdata/testinput3 \
  testdata/testinput4 \
  testdata/testinput5 \
  testdata/testinput6 \
  testdata/testinput7 \
  testdata/testinput8 \
  testdata/testinput9 \
  testdata/testinput10 \
  testdata/testoutput1 \
  testdata/testoutput2 \
  testdata/testoutput3 \
  testdata/testoutput4 \
  testdata/testoutput5 \
  testdata/testoutput6 \
  testdata/testoutput7 \
  testdata/testoutput8 \
  testdata/testoutput9 \
  testdata/testoutput10 \
  testdata/wintestinput3 \
  testdata/wintestoutput3 \
  perltest.pl

# Additional files removed by 'make clean'.
CLEANFILES += \
  testsavedregex \
  teststderr \
  testtry \
  testNinput
# PCRE demonstration program. No longer built automatically. The point is that
# the users should build it themselves. So just distribute the source.
# noinst_PROGRAMS += pcredemo
# pcredemo_SOURCES = pcredemo.c
# pcredemo_LDADD = libpcre.la
EXTRA_DIST += pcredemo.c
## Utility rules, documentation, etc.
# A compatibility line: the old build system worked with 'make test', so
# 'test' simply depends on 'check'. The trailing ';' gives the rule an empty
# recipe.
test: check ;
# A PCRE user submitted the following addition, saying that it "will allow
# anyone using the 'mingw32' compiler to simply type 'make pcre.dll' and get a
# nice DLL for Windows use".
#
# DLL_OBJS lists the object files that are linked into pcre.dll by the rule
# below.
DLL_OBJS= pcre_compile.o pcre_config.o \
  pcre_dfa_exec.o pcre_exec.o pcre_fullinfo.o pcre_get.o \
  pcre_globals.o pcre_info.o pcre_maketables.o \
  pcre_newline.o pcre_ord2utf8.o pcre_refcount.o \
  pcre_study.o pcre_tables.o pcre_try_flipped.o \
  pcre_ucd.o pcre_valid_utf8.o pcre_version.o \
  pcre_chartables.o \
  pcre_xclass.o

# Link the objects into a shared library, passing --strip-all and
# --export-all-symbols through to the linker.
# NOTE: the recipe line must begin with a TAB character.
pcre.dll: $(DLL_OBJS)
	$(CC) -shared -o pcre.dll -Wl,"--strip-all" -Wl,"--export-all-symbols" $(DLL_OBJS)
# We have .pc files for pkg-config users. They are installed into
# $(libdir)/pkgconfig; the C++ one is added only when the C++ wrapper is built.
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = libpcre.pc
if WITH_PCRE_CPP
pkgconfig_DATA += libpcrecpp.pc
endif
## Manual pages that are always distributed and installed.
dist_man_MANS = \
  doc/pcre.3 \
  doc/pcre-config.1 \
  doc/pcre_compile.3 \
  doc/pcre_compile2.3 \
  doc/pcre_config.3 \
  doc/pcre_copy_named_substring.3 \
  doc/pcre_copy_substring.3 \
  doc/pcre_dfa_exec.3 \
  doc/pcre_exec.3 \
  doc/pcre_free_substring.3 \
  doc/pcre_free_substring_list.3 \
  doc/pcre_fullinfo.3 \
  doc/pcre_get_named_substring.3 \
  doc/pcre_get_stringnumber.3 \
  doc/pcre_get_stringtable_entries.3 \
  doc/pcre_get_substring.3 \
  doc/pcre_get_substring_list.3 \
  doc/pcre_info.3 \
  doc/pcre_maketables.3 \
  doc/pcre_refcount.3 \
  doc/pcre_study.3 \
  doc/pcre_version.3 \
  doc/pcreapi.3 \
  doc/pcrebuild.3 \
  doc/pcrecallout.3 \
  doc/pcrecompat.3 \
  doc/pcregrep.1 \
  doc/pcrematching.3 \
  doc/pcrepartial.3 \
  doc/pcrepattern.3 \
  doc/pcreperform.3 \
  doc/pcreposix.3 \
  doc/pcreprecompile.3 \
  doc/pcresample.3 \
  doc/pcrestack.3 \
  doc/pcresyntax.3 \
  doc/pcretest.1

## The C++ wrapper page is always distributed, but it is installed only when
## the C++ library is being built.
pcrecpp_man = doc/pcrecpp.3
EXTRA_DIST += $(pcrecpp_man)

if WITH_PCRE_CPP
man_MANS = $(pcrecpp_man)
endif
## CMake support: these files are only added to the distribution.
EXTRA_DIST += cmake/COPYING-CMAKE-SCRIPTS \
              cmake/FindPackageHandleStandardArgs.cmake \
              cmake/FindReadline.cmake \
              CMakeLists.txt \
              config-cmake.h.in

## end Makefile.am

View File

@ -1,428 +0,0 @@
News about PCRE releases
------------------------
Release 7.9 11-Apr-09
---------------------
Mostly bugfixes and tidies with just a couple of minor functional additions.
Release 7.8 05-Sep-08
---------------------
More bug fixes, plus a performance improvement in Unicode character property
lookup.
Release 7.7 07-May-08
---------------------
This is once again mainly a bug-fix release, but there are a couple of new
features.
Release 7.6 28-Jan-08
---------------------
The main reason for having this release so soon after 7.5 is because it fixes a
potential buffer overflow problem in pcre_compile() when run in UTF-8 mode. In
addition, the CMake configuration files have been brought up to date.
Release 7.5 10-Jan-08
---------------------
This is mainly a bug-fix release. However the ability to link pcregrep with
libz or libbz2 and the ability to link pcretest with libreadline have been
added. Also the --line-offsets and --file-offsets options were added to
pcregrep.
Release 7.4 21-Sep-07
---------------------
The only change of specification is the addition of options to control whether
\R matches any Unicode line ending (the default) or just CR, LF, and CRLF.
Otherwise, the changes are bug fixes and a refactoring to reduce the number of
relocations needed in a shared library. There have also been some documentation
updates, in particular, some more information about using CMake to build PCRE
has been added to the NON-UNIX-USE file.
Release 7.3 28-Aug-07
---------------------
Most changes are bug fixes. Some that are not:
1. There is some support for Perl 5.10's experimental "backtracking control
verbs" such as (*PRUNE).
2. UTF-8 checking is now as per RFC 3629 instead of RFC 2279; this is more
restrictive in the strings it accepts.
3. Checking for potential integer overflow has been made more dynamic, and as a
consequence there is no longer a hard limit on the size of a subpattern that
has a limited repeat count.
4. When CRLF is a valid line-ending sequence, pcre_exec() and pcre_dfa_exec()
no longer advance by two characters instead of one when an unanchored match
fails at CRLF if there are explicit CR or LF matches within the pattern.
This gets rid of some anomalous effects that previously occurred.
5. Some PCRE-specific settings for varying the newline options at the start of
a pattern have been added.
Release 7.2 19-Jun-07
---------------------
WARNING: saved patterns that were compiled by earlier versions of PCRE must be
recompiled for use with 7.2 (necessitated by the addition of \K, \h, \H, \v,
and \V).
Correction to the notes for 7.1: the note about shared libraries for Windows is
wrong. Previously, three libraries were built, but each could function
independently. For example, the pcreposix library also included all the
functions from the basic pcre library. The change is that the three libraries
are no longer independent. They are like the Unix libraries. To use the
pcreposix functions, for example, you need to link with both the pcreposix and
the basic pcre library.
Some more features from Perl 5.10 have been added:
(?-n) and (?+n) relative references for recursion and subroutines.
(?(-n) and (?(+n) relative references as conditions.
\k{name} and \g{name} are synonyms for \k<name>.
\K to reset the start of the matched string; for example, (foo)\Kbar
matches bar preceded by foo, but only sets bar as the matched string.
(?| introduces a group where the capturing parentheses in each alternative
start from the same number; for example, (?|(abc)|(xyz)) sets capturing
parentheses number 1 in both cases.
\h, \H, \v, \V match horizontal and vertical whitespace, respectively.
Release 7.1 24-Apr-07
---------------------
There is only one new feature in this release: a linebreak setting of
PCRE_NEWLINE_ANYCRLF. It is a cut-down version of PCRE_NEWLINE_ANY, which
recognizes only CRLF, CR, and LF as linebreaks.
A few bugs are fixed (see ChangeLog for details), but the major change is a
complete re-implementation of the build system. This now has full Autotools
support and so is now "standard" in some sense. It should help with compiling
PCRE in a wide variety of environments.
NOTE: when building shared libraries for Windows, three dlls are now built,
called libpcre, libpcreposix, and libpcrecpp. Previously, everything was
included in a single dll.
Another important change is that the dftables auxiliary program is no longer
compiled and run at "make" time by default. Instead, a default set of character
tables (assuming ASCII coding) is used. If you want to use dftables to generate
the character tables as previously, add --enable-rebuild-chartables to the
"configure" command. You must do this if you are compiling PCRE to run on a
system that uses EBCDIC code.
There is a discussion about character tables in the README file. The default is
not to use dftables so that there is no problem when cross-compiling.
Release 7.0 19-Dec-06
---------------------
This release has a new major number because there have been some internal
upheavals to facilitate the addition of new optimizations and other facilities,
and to make subsequent maintenance and extension easier. Compilation is likely
to be a bit slower, but there should be no major effect on runtime performance.
Previously compiled patterns are NOT upwards compatible with this release. If
you have saved compiled patterns from a previous release, you will have to
re-compile them. Important changes that are visible to users are:
1. The Unicode property tables have been updated to Unicode 5.0.0, which adds
some more scripts.
2. The option PCRE_NEWLINE_ANY causes PCRE to recognize any Unicode newline
sequence as a newline.
3. The \R escape matches a single Unicode newline sequence as a single unit.
4. New features that will appear in Perl 5.10 are now in PCRE. These include
alternative Perl syntax for named parentheses, and Perl syntax for
recursion.
5. The C++ wrapper interface has been extended by the addition of a
QuoteMeta function and the ability to allow copy construction and
assignment.
For a complete list of changes, see the ChangeLog file.
Release 6.7 04-Jul-06
---------------------
The main additions to this release are the ability to use the same name for
multiple sets of parentheses, and support for CRLF line endings in both the
library and pcregrep (and in pcretest for testing).
Thanks to Ian Taylor, the stack usage for many kinds of pattern has been
significantly reduced for certain subject strings.
Release 6.5 01-Feb-06
---------------------
Important changes in this release:
1. A number of new features have been added to pcregrep.
2. The Unicode property tables have been updated to Unicode 4.1.0, and the
supported properties have been extended with script names such as "Arabic",
and the derived properties "Any" and "L&". This has necessitated a change to
the internal format of compiled patterns. Any saved compiled patterns that
use \p or \P must be recompiled.
3. The specification of recursion in patterns has been changed so that all
recursive subpatterns are automatically treated as atomic groups. Thus, for
example, (?R) is treated as if it were (?>(?R)). This is necessary because
otherwise there are situations where recursion does not work.
See the ChangeLog for a complete list of changes, which include a number of bug
fixes and tidies.
Release 6.0 07-Jun-05
---------------------
The release number has been increased to 6.0 because of the addition of several
major new pieces of functionality.
A new function, pcre_dfa_exec(), which implements pattern matching using a DFA
algorithm, has been added. This has a number of advantages for certain cases,
though it does run more slowly, and lacks the ability to capture substrings. On
the other hand, it does find all matches, not just the first, and it works
better for partial matching. The pcrematching man page discusses the
differences.
The pcretest program has been enhanced so that it can make use of the new
pcre_dfa_exec() matching function and the extra features it provides.
The distribution now includes a C++ wrapper library. This is built
automatically if a C++ compiler is found. The pcrecpp man page discusses this
interface.
The code itself has been re-organized into many more files, one for each
function, so it no longer requires everything to be linked in when static
linkage is used. As a consequence, some internal functions have had to have
their names exposed. These functions all have names starting with _pcre_. They
are undocumented, and are not intended for use by outside callers.
The pcregrep program has been enhanced with new functionality such as
multiline-matching and options for outputting more matching context. See the
ChangeLog for a complete list of changes to the library and the utility
programs.
Release 5.0 13-Sep-04
---------------------
The licence under which PCRE is released has been changed to the more
conventional "BSD" licence.
In the code, some bugs have been fixed, and there are also some major changes
in this release (which is why I've increased the number to 5.0). Some changes
are internal rearrangements, and some provide a number of new facilities. The
new features are:
1. There's an "automatic callout" feature that inserts callouts before every
item in the regex, and there's a new callout field that gives the position
in the pattern - useful for debugging and tracing.
2. The extra_data structure can now be used to pass in a set of character
tables at exec time. This is useful if compiled regex are saved and re-used
at a later time when the tables may not be at the same address. If the
default internal tables are used, the pointer saved with the compiled
pattern is now set to NULL, which means that you don't need to do anything
special unless you are using custom tables.
3. It is possible, with some restrictions on the content of the regex, to
request "partial" matching. A special return code is given if all of the
subject string matched part of the regex. This could be useful for testing
an input field as it is being typed.
4. There is now some optional support for Unicode character properties, which
means that pattern items such as \p{Lu} and \X can now be used. Only
the general category properties are supported. If PCRE is compiled with this
support, an additional 90K data structure is included, which increases the
size of the library dramatically.
5. There is support for saving compiled patterns and re-using them later.
6. There is support for running regular expressions that were compiled on a
different host with the opposite endianness.
7. The pcretest program has been extended to accommodate the new features.
The main internal rearrangement is that sequences of literal characters are no
longer handled as strings. Instead, each character is handled on its own. This
makes some UTF-8 handling easier, and makes the support of partial matching
possible. Compiled patterns containing long literal strings will be larger as a
result of this change; I hope that performance will not be much affected.
Release 4.5 01-Dec-03
---------------------
Again mainly a bug-fix and tidying release, with only a couple of new features:
1. It's possible now to compile PCRE so that it does not use recursive
function calls when matching. Instead it gets memory from the heap. This slows
things down, but may be necessary on systems with limited stacks.
2. UTF-8 string checking has been tightened to reject overlong sequences and to
check that a starting offset points to the start of a character. Failure of the
latter returns a new error code: PCRE_ERROR_BADUTF8_OFFSET.
3. PCRE can now be compiled for systems that use EBCDIC code.
Release 4.4 21-Aug-03
---------------------
This is mainly a bug-fix and tidying release. The only new feature is that PCRE
checks UTF-8 strings for validity by default. There is an option to suppress
this, just in case anybody wants that teeny extra bit of performance.
Releases 4.1 - 4.3
------------------
Sorry, I forgot about updating the NEWS file for these releases. Please take a
look at ChangeLog.
Release 4.0 17-Feb-03
---------------------
There have been a lot of changes for the 4.0 release, adding additional
functionality and mending bugs. Below is a list of the highlights of the new
functionality. For full details of these features, please consult the
documentation. For a complete list of changes, see the ChangeLog file.
1. Support for Perl's \Q...\E escapes.
2. "Possessive quantifiers" ?+, *+, ++, and {,}+ which come from Sun's Java
package. They provide some syntactic sugar for simple cases of "atomic
grouping".
3. Support for the \G assertion. It is true when the current matching position
is at the start point of the match.
4. A new feature that provides some of the functionality that Perl provides
with (?{...}). The facility is termed a "callout". The way it is done in PCRE
is for the caller to provide an optional function, by setting pcre_callout to
its entry point. To get the function called, the regex must include (?C) at
appropriate points.
5. Support for recursive calls to individual subpatterns. This makes it really
easy to get totally confused.
6. Support for named subpatterns. The Python syntax (?P<name>...) is used to
name a group.
7. Several extensions to UTF-8 support; it is now fairly complete. There is an
option for pcregrep to make it operate in UTF-8 mode.
8. The single man page has been split into a number of separate man pages.
These also give rise to individual HTML pages which are put in a separate
directory. There is an index.html page that lists them all. Some hyperlinking
between the pages has been installed.
Release 3.5 15-Aug-01
---------------------
1. The configuring system has been upgraded to use later versions of autoconf
and libtool. By default it builds both a shared and a static library if the OS
supports it. You can use --disable-shared or --disable-static on the configure
command if you want only one of them.
2. The pcretest utility is now installed along with pcregrep because it is
useful for users (to test regexs) and by doing this, it automatically gets
relinked by libtool. The documentation has been turned into a man page, so
there are now .1, .txt, and .html versions in /doc.
3. Upgrades to pcregrep:
(i) Added long-form option names like gnu grep.
(ii) Added --help to list all options with an explanatory phrase.
(iii) Added -r, --recursive to recurse into sub-directories.
(iv) Added -f, --file to read patterns from a file.
4. Added --enable-newline-is-cr and --enable-newline-is-lf to the configure
script, to force use of CR or LF instead of \n in the source. On non-Unix
systems, the value can be set in config.h.
5. The limit of 200 on non-capturing parentheses is a _nesting_ limit, not an
absolute limit. Changed the text of the error message to make this clear, and
likewise updated the man page.
6. The limit of 99 on the number of capturing subpatterns has been removed.
The new limit is 65535, which I hope will not be a "real" limit.
Release 3.3 01-Aug-00
---------------------
There is some support for UTF-8 character strings. This is incomplete and
experimental. The documentation describes what is and what is not implemented.
Otherwise, this is just a bug-fixing release.
Release 3.0 01-Feb-00
---------------------
1. A "configure" script is now used to configure PCRE for Unix systems. It
builds a Makefile, a config.h file, and the pcre-config script.
2. PCRE is built as a shared library by default.
3. There is support for POSIX classes such as [:alpha:].
5. There is an experimental recursion feature.
----------------------------------------------------------------------------
IMPORTANT FOR THOSE UPGRADING FROM VERSIONS BEFORE 2.00
Please note that there has been a change in the API such that a larger
ovector is required at matching time, to provide some additional workspace.
The new man page has details. This change was necessary in order to support
some of the new functionality in Perl 5.005.
IMPORTANT FOR THOSE UPGRADING FROM VERSION 2.00
Another (I hope this is the last!) change has been made to the API for the
pcre_compile() function. An additional argument has been added to make it
possible to pass over a pointer to character tables built in the current
locale by pcre_maketables(). To use the default tables, this new argument
should be passed as NULL.
IMPORTANT FOR THOSE UPGRADING FROM VERSION 2.05
Yet another (and again I hope this really is the last) change has been made
to the API for the pcre_exec() function. An additional argument has been
added to make it possible to start the match other than at the start of the
subject string. This is important if there are lookbehinds. The new man
page has the details, but if you just want to convert existing programs, all
you need to do is to stick in a new fifth argument to pcre_exec(), with a
value of zero. For example, change
pcre_exec(pattern, extra, subject, length, options, ovec, ovecsize)
to
pcre_exec(pattern, extra, subject, length, 0, options, ovec, ovecsize)
****

View File

@ -1,448 +0,0 @@
Compiling PCRE on non-Unix systems
----------------------------------
This document contains the following sections:
General
Generic instructions for the PCRE C library
The C++ wrapper functions
Building for virtual Pascal
Stack size in Windows environments
Linking programs in Windows environments
Comments about Win32 builds
Building PCRE on Windows with CMake
Use of relative paths with CMake on Windows
Testing with runtest.bat
Building under Windows with BCC5.5
Building PCRE on OpenVMS
GENERAL
I (Philip Hazel) have no experience of Windows or VMS systems and how their
libraries work. The items in the PCRE distribution and Makefile that relate to
anything other than Unix-like systems are untested by me.
There are some other comments and files (including some documentation in CHM
format) in the Contrib directory on the FTP site:
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
If you want to compile PCRE for a non-Unix system (especially for a system that
does not support "configure" and "make" files), note that the basic PCRE
library consists entirely of code written in Standard C, and so should compile
successfully on any system that has a Standard C compiler and library. The C++
wrapper functions are a separate issue (see below).
The PCRE distribution includes a "configure" file for use by the Configure/Make
build system, as found in many Unix-like environments. There is also
support for CMake, which some users prefer, in particular in Windows
environments. There are some instructions for CMake under Windows in the
section entitled "Building PCRE on Windows with CMake" below. CMake can also be used to
build PCRE in Unix-like systems.
GENERIC INSTRUCTIONS FOR THE PCRE C LIBRARY
The following are generic comments about building the PCRE C library "by hand".
(1) Copy or rename the file config.h.generic as config.h, and edit the macro
settings that it contains to whatever is appropriate for your environment.
In particular, if you want to force a specific value for newline, you can
define the NEWLINE macro. When you compile any of the PCRE modules, you
must specify -DHAVE_CONFIG_H to your compiler so that config.h is included
in the sources.
An alternative approach is not to edit config.h, but to use -D on the
compiler command line to make any changes that you need to the
configuration options. In this case -DHAVE_CONFIG_H must not be set.
NOTE: There have been occasions when the way in which certain parameters
in config.h are used has changed between releases. (In the configure/make
world, this is handled automatically.) When upgrading to a new release,
you are strongly advised to review config.h.generic before re-using what
you had previously.
(2) Copy or rename the file pcre.h.generic as pcre.h.
(3) EITHER:
Copy or rename file pcre_chartables.c.dist as pcre_chartables.c.
OR:
Compile dftables.c as a stand-alone program (using -DHAVE_CONFIG_H if
you have set up config.h), and then run it with the single argument
"pcre_chartables.c". This generates a set of standard character tables
and writes them to that file. The tables are generated using the default
C locale for your system. If you want to use a locale that is specified
by LC_xxx environment variables, add the -L option to the dftables
command. You must use this method if you are building on a system that
uses EBCDIC code.
The tables in pcre_chartables.c are defaults. The caller of PCRE can
specify alternative tables at run time.
(4) Ensure that you have the following header files:
pcre_internal.h
ucp.h
(5) Also ensure that you have the following file, which is #included as source
when building a debugging version of PCRE, and is also used by pcretest.
pcre_printint.src
(6) Compile the following source files, setting -DHAVE_CONFIG_H as a compiler
option if you have set up config.h with your configuration, or else use
other -D settings to change the configuration as required.
pcre_chartables.c
pcre_compile.c
pcre_config.c
pcre_dfa_exec.c
pcre_exec.c
pcre_fullinfo.c
pcre_get.c
pcre_globals.c
pcre_info.c
pcre_maketables.c
pcre_newline.c
pcre_ord2utf8.c
pcre_refcount.c
pcre_study.c
pcre_tables.c
pcre_try_flipped.c
pcre_ucd.c
pcre_valid_utf8.c
pcre_version.c
pcre_xclass.c
Make sure that you include -I. in the compiler command (or equivalent for
an unusual compiler) so that all included PCRE header files are first
sought in the current directory. Otherwise you run the risk of picking up
a previously-installed file from somewhere else.
(7) Now link all the compiled code into an object library in whichever form
your system keeps such libraries. This is the basic PCRE C library. If
your system has static and shared libraries, you may have to do this once
for each type.
(8) Similarly, compile pcreposix.c (remembering -DHAVE_CONFIG_H if necessary)
and link the result (on its own) as the pcreposix library.
(9) Compile the test program pcretest.c (again, don't forget -DHAVE_CONFIG_H).
This needs the functions in the pcre and pcreposix libraries when linking.
It also needs the pcre_printint.src source file, which it #includes.
(10) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. Note that the
supplied files are in Unix format, with just LF characters as line
terminators. You may need to edit them to change this if your system uses
a different convention. If you are using Windows, you probably should use
the wintestinput3 file instead of testinput3 (and the corresponding output
file). This is a locale test; wintestinput3 sets the locale to "french"
rather than "fr_FR", and there are some minor output differences.
(11) If you want to use the pcregrep command, compile and link pcregrep.c; it
uses only the basic PCRE library (it does not need the pcreposix library).
THE C++ WRAPPER FUNCTIONS
The PCRE distribution also contains some C++ wrapper functions and tests,
contributed by Google Inc. On a system that can use "configure" and "make",
the functions are automatically built into a library called pcrecpp. It should
be straightforward to compile the .cc files manually on other systems. The
files called xxx_unittest.cc are test programs for each of the corresponding
xxx.cc files.
BUILDING FOR VIRTUAL PASCAL
A script for building PCRE using Borland's C++ compiler for use with VPASCAL
was contributed by Alexander Tokarev. Stefan Weber updated the script and added
additional files. The following files in the distribution are for building PCRE
for use with VP/Borland: makevp_c.txt, makevp_l.txt, makevp.bat, pcregexp.pas.
STACK SIZE IN WINDOWS ENVIRONMENTS
The default processor stack size of 1Mb in some Windows environments is too
small for matching patterns that need much recursion. In particular, test 2 may
fail because of this. Normally, running out of stack causes a crash, but there
have been cases where the test program has just died silently. See your linker
documentation for how to increase stack size if you experience problems. The
Linux default of 8Mb is a reasonable choice for the stack, though even that can
be too small for some pattern/subject combinations.
PCRE has a compile configuration option to disable the use of stack for
recursion so that heap is used instead. However, pattern matching is
significantly slower when this is done. There is more about stack usage in the
"pcrestack" documentation.
LINKING PROGRAMS IN WINDOWS ENVIRONMENTS
If you want to statically link a program against a PCRE library in the form of
a non-dll .a file, you must define PCRE_STATIC before including pcre.h,
otherwise the pcre_malloc() and pcre_free() exported functions will be declared
__declspec(dllimport), with unwanted results.
CALLING CONVENTIONS IN WINDOWS ENVIRONMENTS
It is possible to compile programs to use different calling conventions using
MSVC. Search the web for "calling conventions" for more information. To make it
easier to change the calling convention for the exported functions in the
PCRE library, the macro PCRE_CALL_CONVENTION is present in all the external
definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is
not set, it defaults to empty; the default calling convention is then used
(which is what is wanted most of the time).
COMMENTS ABOUT WIN32 BUILDS (see also "BUILDING PCRE ON WINDOWS WITH CMAKE" below)
There are two ways of building PCRE using the "configure, make, make install"
paradigm on Windows systems: using MinGW or using Cygwin. These are not at all
the same thing; they are completely different from each other. There is also
support for building using CMake, which some users find a more straightforward
way of building PCRE under Windows. However, the tests are not run
automatically when CMake is used.
The MinGW home page (http://www.mingw.org/) says this:
MinGW: A collection of freely available and freely distributable Windows
specific header files and import libraries combined with GNU toolsets that
allow one to produce native Windows programs that do not rely on any
3rd-party C runtime DLLs.
The Cygwin home page (http://www.cygwin.com/) says this:
Cygwin is a Linux-like environment for Windows. It consists of two parts:
. A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing
substantial Linux API functionality
. A collection of tools which provide Linux look and feel.
The Cygwin DLL currently works with all recent, commercially released x86 32
bit and 64 bit versions of Windows, with the exception of Windows CE.
On both MinGW and Cygwin, PCRE should build correctly using:
./configure && make && make install
This should create two libraries called libpcre and libpcreposix, and, if you
have enabled building the C++ wrapper, a third one called libpcrecpp. These are
independent libraries: when you link with libpcreposix or libpcrecpp you must
also link with libpcre, which contains the basic functions. (Some earlier
releases of PCRE included the basic libpcre functions in libpcreposix. This no
longer happens.)
A user submitted a special-purpose patch that makes it easy to create
"pcre.dll" under mingw32 using the "msys" environment. It provides "pcre.dll"
as a special target. If you use this target, no other files are built, and in
particular, the pcretest and pcregrep programs are not built. An example of how
this might be used is:
./configure --enable-utf --disable-cpp CFLAGS="-O3 -s"; make pcre.dll
Using Cygwin's compiler generates libraries and executables that depend on
cygwin1.dll. If a library that is generated this way is distributed,
cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL
licence, this forces not only PCRE to be under the GPL, but also the entire
application. A distributor who wants to keep their own code proprietary must
purchase an appropriate Cygwin licence.
MinGW has no such restrictions. The MinGW compiler generates a library or
executable that can run standalone on Windows without any third party dll or
licensing issues.
But there is more complication:
If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is
to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a
front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's
gcc and MinGW's gcc). So, a user can:
. Build native binaries by using MinGW or by getting Cygwin and using
-mno-cygwin.
. Build binaries that depend on cygwin1.dll by using Cygwin with the normal
compiler flags.
The test files that are supplied with PCRE are in Unix format, with LF
characters as line terminators. It may be necessary to change the line
terminators in order to get some of the tests to work. We hope to improve
things in this area in future.
BUILDING PCRE ON WINDOWS WITH CMAKE
CMake is an alternative build facility that can be used instead of the
traditional Unix "configure". CMake version 2.4.7 supports Borland makefiles,
MinGW makefiles, MSYS makefiles, NMake makefiles, UNIX makefiles, Visual Studio
6, Visual Studio 7, Visual Studio 8, and Watcom W8. The following instructions
were contributed by a PCRE user.
1. Download CMake 2.4.7 or above from http://www.cmake.org/, install and ensure
that cmake\bin is on your path.
2. Unzip (retaining folder structure) the PCRE source tree into a source
directory such as C:\pcre.
3. Create a new, empty build directory: C:\pcre\build\
4. Run CMakeSetup from the Shell environment of your build tool, e.g., Msys
for Msys/MinGW or Visual Studio Command Prompt for VC/VC++
5. Enter C:\pcre\pcre-xx and C:\pcre\build for the source and build
directories, respectively
6. Hit the "Configure" button.
7. Select the particular IDE / build tool that you are using (Visual Studio,
MSYS makefiles, MinGW makefiles, etc.)
8. The GUI will then list several configuration options. This is where you can
enable UTF-8 support, etc.
9. Hit "Configure" again. The adjacent "OK" button should now be active.
10. Hit "OK".
11. The build directory should now contain a usable build system, be it a
solution file for Visual Studio, makefiles for MinGW, etc.
USE OF RELATIVE PATHS WITH CMAKE ON WINDOWS
A PCRE user comments as follows:
I thought that others may want to know the current state of
CMAKE_USE_RELATIVE_PATHS support on Windows.
Here it is:
-- AdditionalIncludeDirectories is only partially modified (only the
first path - see below)
-- Only some of the contained file paths are modified - shown below for
pcre.vcproj
-- It properly modifies
I am sure CMake people can fix that if they want to. Until then one will
need to replace existing absolute paths in project files with relative
paths manually (e.g. from VS) - relative to project file location. I did
just that before being told to try CMAKE_USE_RELATIVE_PATHS. Not a big
deal.
AdditionalIncludeDirectories="E:\builds\pcre\build;E:\builds\pcre\pcre-7.5;"
AdditionalIncludeDirectories=".;E:\builds\pcre\pcre-7.5;"
RelativePath="pcre.h">
RelativePath="pcre_chartables.c">
RelativePath="pcre_chartables.c.rule">
TESTING WITH RUNTEST.BAT
1. Copy RunTest.bat into the directory where pcretest.exe has been created.
2. Edit RunTest.bat and insert a line that identifies the relative location of
the pcre source, e.g.:
set srcdir=..\pcre-7.4-RC3
3. Run RunTest.bat from a command shell environment. Test outputs will
automatically be compared to expected results, and discrepancies will be
identified in the console output.
4. To test pcrecpp, run pcrecpp_unittest.exe, pcre_stringpiece_unittest.exe and
pcre_scanner_unittest.exe.
BUILDING UNDER WINDOWS WITH BCC5.5
Michael Roy sent these comments about building PCRE under Windows with BCC5.5:
Some of the core BCC libraries have a version of PCRE from 1998 built in,
which can lead to pcre_exec() giving an erroneous PCRE_ERROR_NULL from a
version mismatch. I'm including an easy workaround below, if you'd like to
include it in the non-unix instructions:
When linking a project with BCC5.5, pcre.lib must be included before any of
the libraries cw32.lib, cw32i.lib, cw32mt.lib, and cw32mti.lib on the command
line.
BUILDING UNDER WINDOWS CE WITH VISUAL STUDIO 200x
Vincent Richomme sent a zip archive of files to help with this process. They
can be found in the file "pcre-vsbuild.zip" in the Contrib directory of the FTP
site.
BUILDING PCRE ON OPENVMS
Dan Mooney sent the following comments about building PCRE on OpenVMS. They
relate to an older version of PCRE that used fewer source files, so the exact
commands will need changing. See the current list of source files above.
"It was quite easy to compile and link the library. I don't have a formal
make file but the attached file [reproduced below] contains the OpenVMS DCL
commands I used to build the library. I had to add #define
POSIX_MALLOC_THRESHOLD 10 to pcre.h since it was not defined anywhere.
The library was built on:
O/S: HP OpenVMS v7.3-1
Compiler: Compaq C v6.5-001-48BCD
Linker: vA13-01
The test results did not match 100% due to the issues you mention in your
documentation regarding isprint(), iscntrl(), isgraph() and ispunct(). I
modified some of the character tables temporarily and was able to get the
results to match. Tests using the fr locale did not match since I don't have
that locale loaded. The study size was always reported to be 3 less than the
value in the standard test output files."
=========================
$! This DCL procedure builds PCRE on OpenVMS
$!
$! I followed the instructions in the non-unix-use file in the distribution.
$!
$ COMPILE == "CC/LIST/NOMEMBER_ALIGNMENT/PREFIX_LIBRARY_ENTRIES=ALL_ENTRIES"
$ COMPILE DFTABLES.C
$ LINK/EXE=DFTABLES.EXE DFTABLES.OBJ
$ RUN DFTABLES.EXE/OUTPUT=CHARTABLES.C
$ COMPILE MAKETABLES.C
$ COMPILE GET.C
$ COMPILE STUDY.C
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
$! did not seem to be defined anywhere.
$! I edited pcre.h and added #DEFINE SUPPORT_UTF8 to enable UTF8 support.
$ COMPILE PCRE.C
$ LIB/CREATE PCRE MAKETABLES.OBJ, GET.OBJ, STUDY.OBJ, PCRE.OBJ
$! I had to set POSIX_MALLOC_THRESHOLD to 10 in PCRE.H since the symbol
$! did not seem to be defined anywhere.
$ COMPILE PCREPOSIX.C
$ LIB/CREATE PCREPOSIX PCREPOSIX.OBJ
$ COMPILE PCRETEST.C
$ LINK/EXE=PCRETEST.EXE PCRETEST.OBJ, PCRE/LIB, PCREPOSIX/LIB
$! C programs that want access to command line arguments must be
$! defined as a symbol
$ PCRETEST :== "$ SYS$ROADSUSERS:[DMOONEY.REGEXP]PCRETEST.EXE"
$! Arguments must be enclosed in quotes.
$ PCRETEST "-C"
$! Test results:
$!
$! The test results did not match 100%. The functions isprint(), iscntrl(),
$! isgraph() and ispunct() on OpenVMS must not produce the same results
$! as the system that built the test output files provided with the
$! distribution.
$!
$! The study size did not match and was always 3 less on OpenVMS.
$!
$! Locale could not be set to fr
$!
=========================
Last Updated: 17 March 2009
****

View File

@ -1,214 +0,0 @@
#!/bin/sh

# Script to prepare the files for building a PCRE release. It does some
# processing of the documentation, detrails files, and creates pcre.h.generic
# and config.h.generic (for use by builders who can't run ./configure).

# You must run this script before running "make dist". It makes use of the
# following files:

# 132html A Perl script that converts a .1 or .3 man page into HTML. It
# is called from MakeRelease. It "knows" the relevant troff
# constructs that are used in the PCRE man pages.

# CleanTxt A Perl script that cleans up the output of "nroff -man" by
# removing backspaces and other redundant text so as to produce
# a readable .txt file.

# Detrail A Perl script that removes trailing spaces from files.

# doc/index.html.src
# A file that is copied as index.html into the doc/html directory
# when the HTML documentation is built. It works like this so that
# doc/html can be deleted and re-created from scratch.


# First, sort out the documentation. Everything below, up to the matching
# "cd ..", runs inside the doc directory; stop at once if it cannot be
# entered, because the "rm html/*" further down must never run elsewhere.

cd doc || exit 1
echo Processing documentation

# Make Text form of the documentation. It needs some mangling to make it
# tidy for online reading. Concatenate all the .3 stuff, but omit the
# individual function pages.

# The here-document (re)creates pcre.txt with its banner; the loop below
# appends one cleaned-up man page at a time.
cat <<End >pcre.txt
-----------------------------------------------------------------------------
This file contains a concatenation of the PCRE man pages, converted to plain
text format for ease of searching with a text editor, or for use on systems
that do not have a man page processor. The small individual files that give
synopses of each function in the library have not been included. There are
separate text files for the pcregrep and pcretest commands.
-----------------------------------------------------------------------------
End

echo "Making pcre.txt"
for file in pcre pcrebuild pcrematching pcreapi pcrecallout pcrecompat \
            pcrepattern pcresyntax pcrepartial pcreprecompile \
            pcreperform pcreposix pcrecpp pcresample pcrestack ; do
  echo " Processing $file.3"
  nroff -c -man $file.3 >$file.rawtxt
  ../CleanTxt <$file.rawtxt >>pcre.txt
  /bin/rm $file.rawtxt
  echo "------------------------------------------------------------------------------" >>pcre.txt
  # Two spacer lines follow every page except pcresample.
  if [ "$file" != "pcresample" ] ; then
    echo " " >>pcre.txt
    echo " " >>pcre.txt
  fi
done

# The three commands: each section 1 page gets its own .txt file.

for file in pcretest pcregrep pcre-config ; do
  echo Making $file.txt
  nroff -c -man $file.1 >$file.rawtxt
  ../CleanTxt <$file.rawtxt >$file.txt
  /bin/rm $file.rawtxt
done


# Make HTML form of the documentation. The html directory is emptied and
# rebuilt from scratch, starting with the index page.

echo "Making HTML documentation"
/bin/rm html/*
cp index.html.src html/index.html

for file in *.1 ; do
  base=`basename $file .1`
  echo " Making $base.html"
  ../132html -toc $base <$file >html/$base.html
done

# Exclude table of contents for function summaries. It seems that expr
# forces an anchored regex. Also exclude them for small pages that have
# only one section.

for file in *.3 ; do
  base=`basename $file .3`
  toc=-toc
  # Function summary pages are the ones with an underscore in their name.
  if [ `expr $base : '.*_'` -ne 0 ] ; then toc="" ; fi
  if [ "$base" = "pcresample" ] || \
     [ "$base" = "pcrestack" ] || \
     [ "$base" = "pcrecompat" ] || \
     [ "$base" = "pcreperform" ] ; then
    toc=""
  fi
  echo " Making $base.html"
  ../132html $toc $base <$file >html/$base.html
  # Give up on the first page that fails to convert.
  if [ $? != 0 ] ; then exit 1; fi
done

# End of documentation processing

cd ..
echo Documentation done
# These files are detrailed; do not detrail the test data because there may be
# significant trailing spaces. The configure files are also omitted from the
# detrailing.

# The list is a single double-quoted string: every entry ends in a
# backslash-newline so that the shell joins the lines, and the unquoted
# expansion of $files below is then split on the spaces between names.
files="\
  Makefile.am \
  Makefile.in \
  configure.ac \
  README \
  LICENCE \
  COPYING \
  AUTHORS \
  NEWS \
  NON-UNIX-USE \
  INSTALL \
  132html \
  CleanTxt \
  Detrail \
  ChangeLog \
  CMakeLists.txt \
  RunGrepTest \
  RunTest \
  RunTest.bat \
  pcre-config.in \
  libpcre.pc.in \
  libpcrecpp.pc.in \
  config.h.in \
  pcre_printint.src \
  pcre_chartables.c.dist \
  pcredemo.c \
  pcregrep.c \
  pcretest.c \
  dftables.c \
  pcreposix.c \
  pcreposix.h \
  pcre.h.in \
  pcre_internal.h \
  pcre_compile.c \
  pcre_config.c \
  pcre_dfa_exec.c \
  pcre_exec.c \
  pcre_fullinfo.c \
  pcre_get.c \
  pcre_globals.c \
  pcre_info.c \
  pcre_maketables.c \
  pcre_newline.c \
  pcre_ord2utf8.c \
  pcre_refcount.c \
  pcre_study.c \
  pcre_tables.c \
  pcre_try_flipped.c \
  pcre_ucp_searchfuncs.c \
  pcre_valid_utf8.c \
  pcre_version.c \
  pcre_xclass.c \
  pcre_scanner.cc \
  pcre_scanner.h \
  pcre_scanner_unittest.cc \
  pcrecpp.cc \
  pcrecpp.h \
  pcrecpparg.h.in \
  pcrecpp_unittest.cc \
  pcre_stringpiece.cc \
  pcre_stringpiece.h.in \
  pcre_stringpiece_unittest.cc \
  perltest.pl \
  ucp.h \
  ucpinternal.h \
  ucptable.h \
  makevp.bat \
  pcre.def \
  libpcre.def \
  libpcreposix.def"

echo Detrailing
# $files is deliberately left unquoted so that it expands to one argument
# per file name; the two globs add the documentation files.
./Detrail $files doc/p* doc/html/*
echo Doing basic configure to get default pcre.h and config.h
# This is in case the caller has set aliases (as I do - PH)
unset cp ls mv rm
# Stop if configure fails: otherwise the steps below would convert whatever
# stale pcre.h and config.h happen to be lying around.
./configure >/dev/null || exit 1

echo Converting pcre.h and config.h to generic forms
cp -f pcre.h pcre.h.generic

# Copy config.h to config.h.generic, wrapping every "#define NAME ..." line
# (except those whose name starts with PACKAGE) in "#ifndef NAME" / "#endif"
# so that the setting can be overridden with -D on the compiler command line.
# All other lines are copied unchanged. The here-document delimiter is quoted,
# so the shell passes the Perl text through without expanding $1, $! etc.
perl <<'END' || exit 1
  use strict;
  use warnings;

  open(my $in, '<', 'config.h') || die "Can't open config.h: $!\n";
  open(my $out, '>', 'config.h.generic') || die "Can't open config.h.generic: $!\n";
  while (my $line = <$in>)
    {
    if ($line =~ /^#define\s(?!PACKAGE)(\w+)/)
      {
      print {$out} "#ifndef $1\n";
      print {$out} $line;
      print {$out} "#endif\n";
      }
    else
      {
      print {$out} $line;
      }
    }
  close($in);
  # Buffered write errors only surface when the output handle is closed.
  close($out) || die "Can't close config.h.generic: $!\n";
END

echo Done

#End

View File

@ -1,767 +0,0 @@
README file for PCRE (Perl-compatible regular expression library)
-----------------------------------------------------------------
The latest release of PCRE is always available in three alternative formats
from:
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.gz
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.bz2
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.zip
There is a mailing list for discussion about the development of PCRE at
pcre-dev@exim.org
Please read the NEWS file if you are upgrading from a previous release.
The contents of this README file are:
The PCRE APIs
Documentation for PCRE
Contributions by users of PCRE
Building PCRE on non-Unix systems
Building PCRE on Unix-like systems
Retrieving configuration information on Unix-like systems
Shared libraries on Unix-like systems
Cross-compiling on Unix-like systems
Using HP's ANSI C++ compiler (aCC)
Making new tarballs
Testing PCRE
Character tables
File manifest
The PCRE APIs
-------------
PCRE is written in C, and it has its own API. The distribution also includes a
set of C++ wrapper functions (see the pcrecpp man page for details), courtesy
of Google Inc.
In addition, there is a set of C wrapper functions that are based on the POSIX
regular expression API (see the pcreposix man page). These end up in the
library called libpcreposix. Note that this just provides a POSIX calling
interface to PCRE; the regular expressions themselves still follow Perl syntax
and semantics. The POSIX API is restricted, and does not give full access to
all of PCRE's facilities.
The header file for the POSIX-style functions is called pcreposix.h. The
official POSIX name is regex.h, but I did not want to risk possible problems
with existing files of that name by distributing it that way. To use PCRE with
an existing program that uses the POSIX API, pcreposix.h will have to be
renamed or pointed at by a link.
If you are using the POSIX interface to PCRE and there is already a POSIX regex
library installed on your system, as well as worrying about the regex.h header
file (as mentioned above), you must also take care when linking programs to
ensure that they link with PCRE's libpcreposix library. Otherwise they may pick
up the POSIX functions of the same name from the other library.
One way of avoiding this confusion is to compile PCRE with the addition of
-Dregcomp=PCREregcomp (and similarly for the other POSIX functions) to the
compiler flags (CFLAGS if you are using "configure" -- see below). This has the
effect of renaming the functions so that the names no longer clash. Of course,
you have to do the same thing for your applications, or write them using the
new names.
Documentation for PCRE
----------------------
If you install PCRE in the normal way on a Unix-like system, you will end up
with a set of man pages whose names all start with "pcre". The one that is just
called "pcre" lists all the others. In addition to these man pages, the PCRE
documentation is supplied in two other forms:
1. There are files called doc/pcre.txt, doc/pcregrep.txt, and
doc/pcretest.txt in the source distribution. The first of these is a
concatenation of the text forms of all the section 3 man pages except
those that summarize individual functions. The other two are the text
forms of the section 1 man pages for the pcregrep and pcretest commands.
These text forms are provided for ease of scanning with text editors or
similar tools. They are installed in <prefix>/share/doc/pcre, where
<prefix> is the installation prefix (defaulting to /usr/local).
2. A set of files containing all the documentation in HTML form, hyperlinked
in various ways, and rooted in a file called index.html, is distributed in
doc/html and installed in <prefix>/share/doc/pcre/html.
Users of PCRE have contributed files containing the documentation for various
releases in CHM format. These can be found in the Contrib directory of the FTP
site (see next section).
Contributions by users of PCRE
------------------------------
You can find contributions from PCRE users in the directory
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
There is a README file giving brief descriptions of what they are. Some are
complete in themselves; others are pointers to URLs containing relevant files.
Some of this material is likely to be well out-of-date. Several of the earlier
contributions provided support for compiling PCRE on various flavours of
Windows (I myself do not use Windows). Nowadays there is more Windows support
in the standard distribution, so these contributions have been archived.
Building PCRE on non-Unix systems
---------------------------------
For a non-Unix system, please read the comments in the file NON-UNIX-USE,
though if your system supports the use of "configure" and "make" you may be
able to build PCRE in the same way as for Unix-like systems. PCRE can also be
configured in many platform environments using the GUI facility of CMake's
CMakeSetup. It creates Makefiles, solution files, etc.
PCRE has been compiled on many different operating systems. It should be
straightforward to build PCRE on any system that has a Standard C compiler and
library, because it uses only Standard C functions.
Building PCRE on Unix-like systems
----------------------------------
If you are using HP's ANSI C++ compiler (aCC), please see the special note
in the section entitled "Using HP's ANSI C++ compiler (aCC)" below.
The following instructions assume the use of the widely used "configure, make,
make install" process. There is also support for CMake in the PCRE
distribution; there are some comments about using CMake in the NON-UNIX-USE
file, though it can also be used in Unix-like systems.
To build PCRE on a Unix-like system, first run the "configure" command from the
PCRE distribution directory, with your current directory set to the directory
where you want the files to be created. This command is a standard GNU
"autoconf" configuration script, for which generic instructions are supplied in
the file INSTALL.
Most commonly, people build PCRE within its own distribution directory, and in
this case, on many systems, just running "./configure" is sufficient. However,
the usual methods of changing standard defaults are available. For example:
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
specifies that the C compiler should be run with the flags '-O2 -Wall' instead
of the default, and that "make install" should install PCRE under /opt/local
instead of the default /usr/local.
If you want to build in a different directory, just run "configure" with that
directory as current. For example, suppose you have unpacked the PCRE source
into /source/pcre/pcre-xxx, but you want to build it in /build/pcre/pcre-xxx:
cd /build/pcre/pcre-xxx
/source/pcre/pcre-xxx/configure
PCRE is written in C and is normally compiled as a C library. However, it is
possible to build it as a C++ library, though the provided building apparatus
does not have any features to support this.
There are some optional features that can be included or omitted from the PCRE
library. You can read more about them in the pcrebuild man page.
. If you want to suppress the building of the C++ wrapper library, you can add
--disable-cpp to the "configure" command. Otherwise, when "configure" is run,
it will try to find a C++ compiler and C++ header files, and if it succeeds,
it will try to build the C++ wrapper.
. If you want to make use of the support for UTF-8 Unicode character strings in
PCRE, you must add --enable-utf8 to the "configure" command. Without it, the
code for handling UTF-8 is not included in the library. Even when included,
it still has to be enabled by an option at run time. When PCRE is compiled
with this option, its input can only either be ASCII or UTF-8, even when
running on EBCDIC platforms. It is not possible to use both --enable-utf8 and
--enable-ebcdic at the same time.
. If, in addition to support for UTF-8 character strings, you want to include
support for the \P, \p, and \X sequences that recognize Unicode character
properties, you must add --enable-unicode-properties to the "configure"
command. This adds about 30K to the size of the library (in the form of a
property table); only the basic two-letter properties such as Lu are
supported.
. You can build PCRE to recognize either CR or LF or the sequence CRLF or any
of the preceding, or any of the Unicode newline sequences as indicating the
end of a line. Whatever you specify at build time is the default; the caller
of PCRE can change the selection at run time. The default newline indicator
is a single LF character (the Unix standard). You can specify the default
newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
--enable-newline-is-any to the "configure" command, respectively.
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
the standard tests will fail, because the lines in the test files end with
LF. Even if the files are edited to change the line endings, there are likely
to be some failures. With --enable-newline-is-anycrlf or
--enable-newline-is-any, many tests should succeed, but there may be some
failures.
. By default, the sequence \R in a pattern matches any Unicode line ending
sequence. This is independent of the option specifying what PCRE considers to
be the end of a line (see above). However, the caller of PCRE can restrict \R
to match only CR, LF, or CRLF. You can make this the default by adding
--enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
. When called via the POSIX interface, PCRE uses malloc() to get additional
storage for processing capturing parentheses if there are more than 10 of
them in a pattern. You can increase this threshold by setting, for example,
--with-posix-malloc-threshold=20
on the "configure" command.
. PCRE has a counter that can be set to limit the amount of resources it uses.
If the limit is exceeded during a match, the match fails. The default is ten
million. You can change the default by setting, for example,
--with-match-limit=500000
on the "configure" command. This is just the default; individual calls to
pcre_exec() can supply their own value. There is more discussion on the
pcreapi man page.
. There is a separate counter that limits the depth of recursive function calls
during a matching process. This also has a default of ten million, which is
essentially "unlimited". You can change the default by setting, for example,
--with-match-limit-recursion=500000
Recursive function calls use up the runtime stack; running out of stack can
cause programs to crash in strange ways. There is a discussion about stack
sizes in the pcrestack man page.
. The default maximum compiled pattern size is around 64K. You can increase
this by adding --with-link-size=3 to the "configure" command. You can
increase it even more by setting --with-link-size=4, but this is unlikely
ever to be necessary. Increasing the internal link size will reduce
performance.
. You can build PCRE so that its internal match() function that is called from
pcre_exec() does not call itself recursively. Instead, it uses memory blocks
obtained from the heap via the special functions pcre_stack_malloc() and
pcre_stack_free() to save data that would otherwise be saved on the stack. To
build PCRE like this, use
--disable-stack-for-recursion
on the "configure" command. PCRE runs more slowly in this mode, but it may be
necessary in environments with limited stack sizes. This applies only to the
pcre_exec() function; it does not apply to pcre_dfa_exec(), which does not
use deeply nested recursion. There is a discussion about stack sizes in the
pcrestack man page.
. For speed, PCRE uses four tables for manipulating and identifying characters
whose code point values are less than 256. By default, it uses a set of
tables for ASCII encoding that is part of the distribution. If you specify
--enable-rebuild-chartables
a program called dftables is compiled and run in the default C locale when
you obey "make". It builds a source file called pcre_chartables.c. If you do
not specify this option, pcre_chartables.c is created as a copy of
pcre_chartables.c.dist. See "Character tables" below for further information.
. It is possible to compile PCRE for use on systems that use EBCDIC as their
character code (as opposed to ASCII) by specifying
--enable-ebcdic
This automatically implies --enable-rebuild-chartables (see above). However,
when PCRE is built this way, it always operates in EBCDIC. It cannot support
both EBCDIC and UTF-8.
. It is possible to compile pcregrep to use libz and/or libbz2, in order to
read .gz and .bz2 files (respectively), by specifying one or both of
--enable-pcregrep-libz
--enable-pcregrep-libbz2
Of course, the relevant libraries must be installed on your system.
. It is possible to compile pcretest so that it links with the libreadline
library, by specifying
--enable-pcretest-libreadline
If this is done, when pcretest's input is from a terminal, it reads it using
the readline() function. This provides line-editing and history facilities.
Note that libreadline is GPL-licenced, so if you distribute a binary of
pcretest linked in this way, there may be licensing issues.
Setting this option causes the -lreadline option to be added to the pcretest
build. In many operating environments with a system-installed readline
library this is sufficient. However, in some environments (e.g. if an
unmodified distribution version of readline is in use), it may be necessary
to specify something like LIBS="-lncurses" as well. This is because, to quote
the readline INSTALL, "Readline uses the termcap functions, but does not link
with the termcap or curses library itself, allowing applications which link
with readline the to choose an appropriate library." If you get error
messages about missing functions tgetstr, tgetent, tputs, tgetflag, or tgoto,
this is the problem, and linking with the ncurses library should fix it.
The "configure" script builds the following files for the basic C library:
. Makefile is the makefile that builds the library
. config.h contains build-time configuration options for the library
. pcre.h is the public PCRE header file
. pcre-config is a script that shows the settings of "configure" options
. libpcre.pc is data for the pkg-config command
. libtool is a script that builds shared and/or static libraries
. RunTest is a script for running tests on the basic C library
. RunGrepTest is a script for running tests on the pcregrep command
Versions of config.h and pcre.h are distributed in the PCRE tarballs under
the names config.h.generic and pcre.h.generic. These are provided for the
benefit of those who have to build PCRE without the benefit of "configure". If
you use "configure", the .generic versions are not used.
If a C++ compiler is found, the following files are also built:
. libpcrecpp.pc is data for the pkg-config command
. pcrecpparg.h is a header file for programs that call PCRE via the C++ wrapper
. pcre_stringpiece.h is the header for the C++ "stringpiece" functions
The "configure" script also creates config.status, which is an executable
script that can be run to recreate the configuration, and config.log, which
contains compiler output from tests that "configure" runs.
Once "configure" has run, you can run "make". It builds two libraries, called
libpcre and libpcreposix, a test program called pcretest, and the pcregrep
command. If a C++ compiler was found on your system, "make" also builds the C++
wrapper library, which is called libpcrecpp, and some test programs called
pcrecpp_unittest, pcre_scanner_unittest, and pcre_stringpiece_unittest.
Building the C++ wrapper can be disabled by adding --disable-cpp to the
"configure" command.
The command "make check" runs all the appropriate tests. Details of the PCRE
tests are given below in a separate section of this document.
You can use "make install" to install PCRE into live directories on your
system. The following are installed (file names are all relative to the
<prefix> that is set when "configure" is run):
Commands (bin):
pcretest
pcregrep
pcre-config
Libraries (lib):
libpcre
libpcreposix
libpcrecpp (if C++ support is enabled)
Configuration information (lib/pkgconfig):
libpcre.pc
libpcrecpp.pc (if C++ support is enabled)
Header files (include):
pcre.h
pcreposix.h
pcre_scanner.h )
pcre_stringpiece.h ) if C++ support is enabled
pcrecpp.h )
pcrecpparg.h )
Man pages (share/man/man{1,3}):
pcregrep.1
pcretest.1
pcre.3
pcre*.3 (lots more pages, all starting "pcre")
HTML documentation (share/doc/pcre/html):
index.html
*.html (lots more pages, hyperlinked from index.html)
Text file documentation (share/doc/pcre):
AUTHORS
COPYING
ChangeLog
LICENCE
NEWS
README
pcre.txt (a concatenation of the man(3) pages)
pcretest.txt the pcretest man page
pcregrep.txt the pcregrep man page
If you want to remove PCRE from your system, you can run "make uninstall".
This removes all the files that "make install" installed. However, it does not
remove any directories, because these are often shared with other programs.
Retrieving configuration information on Unix-like systems
---------------------------------------------------------
Running "make install" installs the command pcre-config, which can be used to
recall information about the PCRE configuration and installation. For example:
pcre-config --version
prints the version number, and
pcre-config --libs
outputs information about where the library is installed. This command can be
included in makefiles for programs that use PCRE, saving the programmer from
having to remember too many details.
The pkg-config command is another system for saving and retrieving information
about installed libraries. Instead of separate commands for each library, a
single command is used. For example:
pkg-config --cflags pcre
The data is held in *.pc files that are installed in a directory called
<prefix>/lib/pkgconfig.
Shared libraries on Unix-like systems
-------------------------------------
The default distribution builds PCRE as shared libraries and static libraries,
as long as the operating system supports shared libraries. Shared library
support relies on the "libtool" script which is built as part of the
"configure" process.
The libtool script is used to compile and link both shared and static
libraries. They are placed in a subdirectory called .libs when they are newly
built. The programs pcretest and pcregrep are built to use these uninstalled
libraries (by means of wrapper scripts in the case of shared libraries). When
you use "make install" to install shared libraries, pcregrep and pcretest are
automatically re-built to use the newly installed shared libraries before being
installed themselves. However, the versions left in the build directory still
use the uninstalled libraries.
To build PCRE using static libraries only you must use --disable-shared when
configuring it. For example:
./configure --prefix=/usr/gnu --disable-shared
Then run "make" in the usual way. Similarly, you can use --disable-static to
build only shared libraries.
Cross-compiling on Unix-like systems
------------------------------------
You can specify CC and CFLAGS in the normal way to the "configure" command, in
order to cross-compile PCRE for some other host. However, you should NOT
specify --enable-rebuild-chartables, because if you do, the dftables.c source
file is compiled and run on the local host, in order to generate the inbuilt
character tables (the pcre_chartables.c file). This will probably not work,
because dftables.c needs to be compiled with the local compiler, not the cross
compiler.
When --enable-rebuild-chartables is not specified, pcre_chartables.c is created
by making a copy of pcre_chartables.c.dist, which is a default set of tables
that assumes ASCII code. Cross-compiling with the default tables should not be
a problem.
If you need to modify the character tables when cross-compiling, you should
move pcre_chartables.c.dist out of the way, then compile dftables.c by hand and
run it on the local host to make a new version of pcre_chartables.c.dist.
Then when you cross-compile PCRE this new version of the tables will be used.
Using HP's ANSI C++ compiler (aCC)
----------------------------------
Unless C++ support is disabled by specifying the "--disable-cpp" option of the
"configure" script, you must include the "-AA" option in the CXXFLAGS
environment variable in order for the C++ components to compile correctly.
Also, note that the aCC compiler on PA-RISC platforms may have a defect whereby
needed libraries fail to get included when specifying the "-AA" compiler
option. If you experience unresolved symbols when linking the C++ programs,
use the workaround of specifying the following environment variable prior to
running the "configure" script:
CXXLDFLAGS="-lstd_v2 -lCsup_v2"
Making new tarballs
-------------------
The command "make dist" creates three PCRE tarballs, in tar.gz, tar.bz2, and
zip formats. The command "make distcheck" does the same, but then does a trial
build of the new distribution to ensure that it works.
If you have modified any of the man page sources in the doc directory, you
should first run the PrepareRelease script before making a distribution. This
script creates the .txt and HTML forms of the documentation from the man pages.
Testing PCRE
------------
To test the basic PCRE library on a Unix system, run the RunTest script that is
created by the configuring process. There is also a script called RunGrepTest
that tests the options of the pcregrep command. If the C++ wrapper library is
built, three test programs called pcrecpp_unittest, pcre_scanner_unittest, and
pcre_stringpiece_unittest are also built.
Both the scripts and all the program tests are run if you obey "make check" or
"make test". For other systems, see the instructions in NON-UNIX-USE.
The RunTest script runs the pcretest test program (which is documented in its
own man page) on each of the testinput files in the testdata directory in
turn, and compares the output with the contents of the corresponding testoutput
files. A file called testtry is used to hold the main output from pcretest
(testsavedregex is also used as a working file). To run pcretest on just one of
the test files, give its number as an argument to RunTest, for example:
RunTest 2
The first test file can also be fed directly into the perltest.pl script to
check that Perl gives the same results. The only difference you should see is
in the first few lines, where the Perl version is given instead of the PCRE
version.
The second set of tests checks pcre_fullinfo(), pcre_info(), pcre_study(),
pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
detection, and run-time flags that are specific to PCRE, as well as the POSIX
wrapper API. It also uses the debugging flags to check some of the internals of
pcre_compile().
If you build PCRE with a locale setting that is not the standard C locale, the
character tables may be different (see next paragraph). In some cases, this may
cause failures in the second set of tests. For example, in a locale where the
isprint() function yields TRUE for characters in the range 128-255, the use of
[:isascii:] inside a character class defines a different set of characters, and
this shows up in this test as a difference in the compiled code, which is being
listed for checking. Where the comparison test output contains [\x00-\x7f] the
test will contain [\x00-\xff], and similarly in some other cases. This is not a
bug in PCRE.
The third set of tests checks pcre_maketables(), the facility for building a
set of character tables for a specific locale and using them instead of the
default tables. The tests make use of the "fr_FR" (French) locale. Before
running the test, the script checks for the presence of this locale by running
the "locale" command. If that command fails, or if it doesn't include "fr_FR"
in the list of available locales, the third test cannot be run, and a comment
is output to say why. If running this test produces instances of the error
** Failed to set locale "fr_FR"
in the comparison output, it means that locale is not available on your system,
despite being listed by "locale". This does not mean that PCRE is broken.
[If you are trying to run this test on Windows, you may be able to get it to
work by changing "fr_FR" to "french" everywhere it occurs. Alternatively, use
RunTest.bat. The version of RunTest.bat included with PCRE 7.4 and above uses
Windows versions of test 2. More info on using RunTest.bat is included in the
document entitled NON-UNIX-USE.]
The fourth test checks the UTF-8 support. It is not run automatically unless
PCRE is built with UTF-8 support. To do this you must set --enable-utf8 when
running "configure". This file can also be fed directly to the perltest script,
provided you are running Perl 5.8 or higher. (For Perl 5.6, a small patch,
commented in the script, can be used.)
The fifth test checks error handling with UTF-8 encoding, and internal UTF-8
features of PCRE that are not relevant to Perl.
The sixth test checks the support for Unicode character properties. It is not
run automatically unless PCRE is built with Unicode property support. To do
this you must set --enable-unicode-properties when running "configure".
The seventh, eighth, and ninth tests check the pcre_dfa_exec() alternative
matching function, in non-UTF-8 mode, UTF-8 mode, and UTF-8 mode with Unicode
property support, respectively. The eighth and ninth tests are not run
automatically unless PCRE is built with the relevant support.
Character tables
----------------
For speed, PCRE uses four tables for manipulating and identifying characters
whose code point values are less than 256. The final argument of the
pcre_compile() function is a pointer to a block of memory containing the
concatenated tables. A call to pcre_maketables() can be used to generate a set
of tables in the current locale. If the final argument for pcre_compile() is
passed as NULL, a set of default tables that is built into the binary is used.
The source file called pcre_chartables.c contains the default set of tables. By
default, this is created as a copy of pcre_chartables.c.dist, which contains
tables for ASCII coding. However, if --enable-rebuild-chartables is specified
for ./configure, a different version of pcre_chartables.c is built by the
program dftables (compiled from dftables.c), which uses the ANSI C character
handling functions such as isalnum(), isalpha(), isupper(), islower(), etc. to
build the table sources. This means that the default C locale which is set for
your system will control the contents of these default tables. You can change
the default tables by editing pcre_chartables.c and then re-building PCRE. If
you do this, you should take care to ensure that the file does not get
automatically re-generated. The best way to do this is to move
pcre_chartables.c.dist out of the way and replace it with your customized
tables.
When the dftables program is run as a result of --enable-rebuild-chartables,
it uses the default C locale that is set on your system. It does not pay
attention to the LC_xxx environment variables. In other words, it uses the
system's default locale rather than whatever the compiling user happens to have
set. If you really do want to build a source set of character tables in a
locale that is specified by the LC_xxx variables, you can run the dftables
program by hand with the -L option. For example:
./dftables -L pcre_chartables.c.special
The first two 256-byte tables provide lower casing and case flipping functions,
respectively. The next table consists of three 32-byte bit maps which identify
digits, "word" characters, and white space, respectively. These are used when
building 32-byte bit maps that represent character classes for code points less
than 256.
The final 256-byte table has bits indicating various character types, as
follows:
1 white space character
2 letter
4 decimal digit
8 hexadecimal digit
16 alphanumeric or '_'
128 regular expression metacharacter or binary zero
You should not alter the set of characters that contain the 128 bit, as that
will cause PCRE to malfunction.
File manifest
-------------
The distribution should contain the following files:
(A) Source files of the PCRE library functions and their headers:
dftables.c auxiliary program for building pcre_chartables.c
when --enable-rebuild-chartables is specified
pcre_chartables.c.dist a default set of character tables that assume ASCII
coding; used, unless --enable-rebuild-chartables is
specified, by copying to pcre_chartables.c
pcreposix.c )
pcre_compile.c )
pcre_config.c )
pcre_dfa_exec.c )
pcre_exec.c )
pcre_fullinfo.c )
pcre_get.c ) sources for the functions in the library,
pcre_globals.c ) and some internal functions that they use
pcre_info.c )
pcre_maketables.c )
pcre_newline.c )
pcre_ord2utf8.c )
pcre_refcount.c )
pcre_study.c )
pcre_tables.c )
pcre_try_flipped.c )
pcre_ucd.c )
pcre_valid_utf8.c )
pcre_version.c )
pcre_xclass.c )
pcre_printint.src ) debugging function that is #included in pcretest,
) and can also be #included in pcre_compile()
pcre.h.in template for pcre.h when built by "configure"
pcreposix.h header for the external POSIX wrapper API
pcre_internal.h header for internal use
ucp.h header for Unicode property handling
config.h.in template for config.h, which is built by "configure"
pcrecpp.h public header file for the C++ wrapper
pcrecpparg.h.in template for another C++ header file
pcre_scanner.h public header file for C++ scanner functions
pcrecpp.cc )
pcre_scanner.cc ) source for the C++ wrapper library
pcre_stringpiece.h.in template for pcre_stringpiece.h, the header for the
C++ stringpiece functions
pcre_stringpiece.cc source for the C++ stringpiece functions
(B) Source files for programs that use PCRE:
pcredemo.c simple demonstration of coding calls to PCRE
pcregrep.c source of a grep utility that uses PCRE
pcretest.c comprehensive test program
(C) Auxiliary files:
132html script to turn "man" pages into HTML
AUTHORS information about the author of PCRE
ChangeLog log of changes to the code
CleanTxt script to clean nroff output for txt man pages
Detrail script to remove trailing spaces
HACKING some notes about the internals of PCRE
INSTALL generic installation instructions
LICENCE conditions for the use of PCRE
COPYING the same, using GNU's standard name
Makefile.in ) template for Unix Makefile, which is built by
) "configure"
Makefile.am ) the automake input that was used to create
) Makefile.in
NEWS important changes in this release
NON-UNIX-USE notes on building PCRE on non-Unix systems
PrepareRelease script to make preparations for "make dist"
README this file
RunTest a Unix shell script for running tests
RunGrepTest a Unix shell script for pcregrep tests
aclocal.m4 m4 macros (generated by "aclocal")
config.guess ) files used by libtool,
config.sub ) used only when building a shared library
configure a configuring shell script (built by autoconf)
configure.ac ) the autoconf input that was used to build
) "configure" and config.h
depcomp ) script to find program dependencies, generated by
) automake
doc/*.3 man page sources for the PCRE functions
doc/*.1 man page sources for pcregrep and pcretest
doc/index.html.src the base HTML page
doc/html/* HTML documentation
doc/pcre.txt plain text version of the man pages
doc/pcretest.txt plain text documentation of test program
doc/perltest.txt plain text documentation of Perl test program
install-sh a shell script for installing files
libpcre.pc.in template for libpcre.pc for pkg-config
libpcrecpp.pc.in template for libpcrecpp.pc for pkg-config
ltmain.sh file used to build a libtool script
missing ) common stub for a few missing GNU programs while
) installing, generated by automake
mkinstalldirs script for making install directories
perltest.pl Perl test program
pcre-config.in source of script which retains PCRE information
pcrecpp_unittest.cc )
pcre_scanner_unittest.cc ) test programs for the C++ wrapper
pcre_stringpiece_unittest.cc )
testdata/testinput* test data for main library tests
testdata/testoutput* expected test results
testdata/grep* input and output for pcregrep tests
(D) Auxiliary files for cmake support
cmake/COPYING-CMAKE-SCRIPTS
cmake/FindPackageHandleStandardArgs.cmake
cmake/FindReadline.cmake
CMakeLists.txt
config-cmake.h.in
(E) Auxiliary files for VPASCAL
makevp.bat
makevp_c.txt
makevp_l.txt
pcregexp.pas
(F) Auxiliary files for building PCRE "by hand"
pcre.h.generic ) a version of the public PCRE header file
) for use in non-"configure" environments
config.h.generic ) a version of config.h for use in non-"configure"
) environments
(G) Miscellaneous
RunTest.bat a script for running tests under Windows
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 21 March 2009

View File

@ -1,280 +0,0 @@
#! /bin/sh
# pcregrep test driver. The PCRE library proper is covered by the main PCRE
# tests; this script only exercises pcregrep's file handling and options.

# Force the C locale so that sort(1) orders its output predictably.
LC_ALL=C; export LC_ALL

# Absolute path of the pcregrep binary in the current directory. The tests
# below run it from inside $srcdir, so the path must not be relative.
pcregrep=`pwd`/pcregrep

echo " "
echo "Testing pcregrep"
$pcregrep -V

# Command used to compare actual output against the expected output.
cf="diff -ub"

# Optional command prefix; it stays empty unless "valgrind" is passed as an
# argument. Any other argument aborts the run.
valgrind=
for arg
do
  case "$arg" in
    valgrind) valgrind="valgrind -q --leak-check=no" ;;
    *) echo "Unknown argument $arg"
       exit 1 ;;
  esac
done

# When PCRE is built outside the source directory and the script is started
# by "make check", srcdir is already set. Otherwise (unset, or not holding a
# testdata directory) fall back to the current directory. pcregrep is later
# run from inside $srcdir so that the file names it prints are always the
# same.
if [ -z "$srcdir" ] || [ ! -d "$srcdir/testdata" ] ; then
  srcdir=.
fi

# Probe for UTF-8 support: utf8 receives the exit status of searching the
# "pcretest -C" output for the text "No UTF-8 support". The UTF-8 tests
# further down run only when this status is non-zero.
./pcretest -C | ./pcregrep "No UTF-8 support" >/dev/null
utf8=$?
# Main pcregrep tests. Every test writes a header line followed by the
# pcregrep output into the scratch file testtry, which is compared against
# $srcdir/testdata/grepoutput at the end of this section. The header lines are
# part of the compared output, so their wording and dash counts must be left
# exactly as they are. Each command runs in a subshell that first changes to
# $srcdir, so the input files are always named ./testdata/...
#
# Test 1 uses ">" to start a fresh testtry; everything after it appends.
echo "---------------------------- Test 1 ------------------------------" >testtry
(cd $srcdir; $valgrind $pcregrep PATTERN ./testdata/grepinput) >>testtry
echo "---------------------------- Test 2 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep '^PATTERN' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 3 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -in PATTERN ./testdata/grepinput) >>testtry
echo "---------------------------- Test 4 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -ic PATTERN ./testdata/grepinput) >>testtry
echo "---------------------------- Test 5 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -in PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 6 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -inh PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 7 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -il PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 8 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -l PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtry
# Tests 9 and 10 also record the exit status of the subshell as "RC=...".
echo "---------------------------- Test 9 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -q PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtry
echo "RC=$?" >>testtry
echo "---------------------------- Test 10 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -q NEVER-PATTERN ./testdata/grepinput ./testdata/grepinputx) >>testtry
echo "RC=$?" >>testtry
echo "---------------------------- Test 11 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -vn pattern ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 12 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -ix pattern ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 13 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -f./testdata/greplist ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 14 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -w pat ./testdata/grepinput ./testdata/grepinputx) >>testtry
# Tests 15 and 16 append stderr to testtry as well as stdout.
echo "---------------------------- Test 15 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep 'abc^*' ./testdata/grepinput) 2>>testtry >>testtry
echo "---------------------------- Test 16 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep abc ./testdata/grepinput ./testdata/nonexistfile) 2>>testtry >>testtry
echo "---------------------------- Test 17 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -M 'the\noutput' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 18 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -Mn '(the\noutput|dog\.\n--)' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 19 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -Mix 'Pattern' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 20 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -Mixn 'complete pair\nof lines' ./testdata/grepinputx) >>testtry
# Tests 21-31 run the -A, -B and -C options with various counts.
echo "---------------------------- Test 21 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -nA3 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 22 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -nB3 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 23 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -C3 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 24 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -A9 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 25 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -nB9 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 26 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -A9 -B9 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 27 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -A10 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 28 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -nB10 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 29 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -C12 -B10 'four' ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 30 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -inB3 'pattern' ./testdata/grepinput ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 31 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -inA3 'pattern' ./testdata/grepinput ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 32 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -L 'fox' ./testdata/grepinput ./testdata/grepinputx) >>testtry
# Tests 33 and 34 search ./testdata/grepnonexist, first without and then with
# -s; stdout and stderr both go to testtry, followed by the exit status.
echo "---------------------------- Test 33 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep 'fox' ./testdata/grepnonexist) >>testtry 2>&1
echo "RC=$?" >>testtry
echo "---------------------------- Test 34 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -s 'fox' ./testdata/grepnonexist) >>testtry 2>&1
echo "RC=$?" >>testtry
echo "---------------------------- Test 35 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -L -r --include=grepinputx --exclude_dir='^\.' 'fox' ./testdata) >>testtry
echo "RC=$?" >>testtry
# In Test 36 the pcregrep output is piped through sort inside the subshell, so
# the RC recorded afterwards is the status of that pipeline.
echo "---------------------------- Test 36 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -L -r --include=grepinput --exclude 'grepinput$' --exclude_dir='^\.' 'fox' ./testdata | sort) >>testtry
echo "RC=$?" >>testtry
# Test 37 sends stderr to the separate scratch file teststderr, then appends
# it to testtry after the RC line and a "STDERR" separator.
echo "---------------------------- Test 37 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep '^(a+)*\d' ./testdata/grepinput) >>testtry 2>teststderr
echo "RC=$?" >>testtry
echo "======== STDERR ========" >>testtry
cat teststderr >>testtry
echo "---------------------------- Test 38 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep '>\x00<' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 39 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -A1 'before the binary zero' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 40 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -B1 'after the binary zero' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 41 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -B1 -o '\w+ the binary zero' ./testdata/grepinput) >>testtry
# NOTE(review): the label "Test 41" is used a second time here. The header is
# part of the compared output, so presumably testdata/grepoutput carries the
# same repeated label -- confirm before renumbering anything.
echo "---------------------------- Test 41 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -B1 -onH '\w+ the binary zero' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 42 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -on 'before|zero|after' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 43 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -on -e before -e zero -e after ./testdata/grepinput) >>testtry
echo "---------------------------- Test 44 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -on -f ./testdata/greplist -e binary ./testdata/grepinput) >>testtry
echo "---------------------------- Test 45 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -e abc -e '(unclosed' ./testdata/grepinput) 2>>testtry >>testtry
# In Tests 46-48 the quoted -F pattern contains a literal newline, which is
# why each command spans two lines. Do not join or re-indent those lines.
echo "---------------------------- Test 46 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -Fx "AB.VE
elephant" ./testdata/grepinput) >>testtry
echo "---------------------------- Test 47 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -F "AB.VE
elephant" ./testdata/grepinput) >>testtry
echo "---------------------------- Test 48 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -F -e DATA -e "AB.VE
elephant" ./testdata/grepinput) >>testtry
echo "---------------------------- Test 49 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep "^(abc|def|ghi|jkl)" ./testdata/grepinputx) >>testtry
echo "---------------------------- Test 50 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -Mv "brown\sfox" ./testdata/grepinputv) >>testtry
echo "---------------------------- Test 51 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep --colour=always jumps ./testdata/grepinputv) >>testtry
echo "---------------------------- Test 52 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep --file-offsets 'before|zero|after' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 53 ------------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep --line-offsets 'before|zero|after' ./testdata/grepinput) >>testtry
echo "---------------------------- Test 54 -----------------------------" >>testtry
(cd $srcdir; $valgrind $pcregrep -f./testdata/greplist --color=always ./testdata/grepinputx) >>testtry
# Now compare the results. Any difference reported by $cf ends the script
# with exit status 1.
$cf $srcdir/testdata/grepoutput testtry
if [ $? != 0 ] ; then exit 1; fi
# UTF-8 tests. They run only when $utf8 (the exit status saved from the
# "No UTF-8 support" probe near the top of the script) is non-zero; otherwise
# a note is printed and the section is skipped.
if [ $utf8 -ne 0 ] ; then
  echo "Testing pcregrep UTF-8 features"

  # Collect both test outputs in a freshly truncated testtry.
  {
    echo "---------------------------- Test U1 ------------------------------"
    (cd $srcdir; $valgrind $pcregrep -n -u --newline=any "^X" ./testdata/grepinput8)
    echo "---------------------------- Test U2 ------------------------------"
    (cd $srcdir; $valgrind $pcregrep -n -u -C 3 --newline=any "Match" ./testdata/grepinput8)
  } >testtry

  # Any difference from the expected output ends the script with status 1.
  $cf $srcdir/testdata/grepoutput8 testtry || exit 1
else
  echo "Skipping pcregrep UTF-8 tests: no UTF-8 support in PCRE library"
fi
# Newline-setting tests. To keep them working where the native newline is not
# \n, nothing here relies on exported files, whose line endings might have
# been changed: the input file is generated with printf so that its bytes are
# exactly what is wanted.

echo "Testing pcregrep newline settings"
printf "abc\rdef\r\nghi\njkl" >testNinput

# Print the header line for newline test $1, terminated by CR LF. The leading
# hyphen is supplied through %c because a format string that starts with a
# hyphen would be awkward to pass to printf.
newline_header () {
  printf "%c--------------------------- Test %s ------------------------------\r\n" - "$1"
}

# Patterns for the fixed-string tests, built with printf so that they contain
# real CR / CR LF characters.
cr_pattern=`printf 'def\rjkl'`
crlf_pattern=`printf 'xxx\r\njkl'`

# Collect all six test outputs in a freshly truncated testtry.
{
  newline_header N1
  $valgrind $pcregrep -n -N CR "^(abc|def|ghi|jkl)" testNinput
  newline_header N2
  $valgrind $pcregrep -n --newline=crlf "^(abc|def|ghi|jkl)" testNinput
  newline_header N3
  $valgrind $pcregrep -n --newline=cr -F "$cr_pattern" testNinput
  newline_header N4
  $valgrind $pcregrep -n --newline=crlf -F "$crlf_pattern" testNinput
  newline_header N5
  $valgrind $pcregrep -n --newline=any "^(abc|def|ghi|jkl)" testNinput
  newline_header N6
  $valgrind $pcregrep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinput
} >testtry

# Any difference from the expected output ends the script with status 1.
$cf $srcdir/testdata/grepoutputN testtry || exit 1

exit 0

# End

View File

@ -1,208 +0,0 @@
#! /bin/sh
# This file is generated by configure from RunGrepTest.in. Make any changes
# to that file.
# Announce the run and print the version of the pcregrep binary in the
# current directory.
echo "Testing pcregrep"
./pcregrep -V
# Run pcregrep tests. The assumption is that the PCRE tests check the library
# itself. What we are checking here is the file handling and options that are
# supported by pcregrep.
# cf is the command used to compare actual and expected output. valgrind is
# an optional command prefix; it stays empty unless requested as an argument.
cf=diff
valgrind=
# If there is no testdata directory here, create a symbolic link to the one
# in the source tree. @top_srcdir@ is a placeholder filled in when configure
# generates the runnable script.
if [ ! -d testdata ] ; then
ln -s @top_srcdir@/testdata testdata
fi
testdata=./testdata
# The only recognised argument is "valgrind", which makes every pcregrep
# invocation run under valgrind. Anything else is reported and ends the
# script with status 1.
while [ $# -gt 0 ] ; do
case $1 in
valgrind) valgrind="valgrind -q --leak-check=no";;
*) echo "Unknown argument $1"; exit 1;;
esac
shift
done
# Every test writes a banner line to the file "testtry" and then appends the
# pcregrep output for that test. The first banner uses ">" so that a testtry
# file left over from an earlier run is overwritten; everything after that
# appends with ">>". The banners are part of the text that is compared with
# the expected output, so they must not be altered independently of it.
echo "---------------------------- Test 1 ------------------------------" >testtry
$valgrind ./pcregrep PATTERN $testdata/grepinput >>testtry
echo "---------------------------- Test 2 ------------------------------" >>testtry
$valgrind ./pcregrep '^PATTERN' $testdata/grepinput >>testtry
echo "---------------------------- Test 3 ------------------------------" >>testtry
$valgrind ./pcregrep -in PATTERN $testdata/grepinput >>testtry
echo "---------------------------- Test 4 ------------------------------" >>testtry
$valgrind ./pcregrep -ic PATTERN $testdata/grepinput >>testtry
echo "---------------------------- Test 5 ------------------------------" >>testtry
$valgrind ./pcregrep -in PATTERN $testdata/grepinput $testdata/grepinputx >>testtry
echo "---------------------------- Test 6 ------------------------------" >>testtry
$valgrind ./pcregrep -inh PATTERN $testdata/grepinput $testdata/grepinputx >>testtry
echo "---------------------------- Test 7 ------------------------------" >>testtry
$valgrind ./pcregrep -il PATTERN $testdata/grepinput $testdata/grepinputx >>testtry
echo "---------------------------- Test 8 ------------------------------" >>testtry
$valgrind ./pcregrep -l PATTERN $testdata/grepinput $testdata/grepinputx >>testtry
# Tests 9 and 10 additionally record the pcregrep exit status as an "RC="
# line, so the status itself is part of the compared output.
echo "---------------------------- Test 9 ------------------------------" >>testtry
$valgrind ./pcregrep -q PATTERN $testdata/grepinput $testdata/grepinputx >>testtry
echo "RC=$?" >>testtry
echo "---------------------------- Test 10 -----------------------------" >>testtry
$valgrind ./pcregrep -q NEVER-PATTERN $testdata/grepinput $testdata/grepinputx >>testtry
echo "RC=$?" >>testtry
echo "---------------------------- Test 11 -----------------------------" >>testtry
$valgrind ./pcregrep -vn pattern $testdata/grepinputx >>testtry
echo "---------------------------- Test 12 -----------------------------" >>testtry
$valgrind ./pcregrep -ix pattern $testdata/grepinputx >>testtry
echo "---------------------------- Test 13 -----------------------------" >>testtry
$valgrind ./pcregrep -f$testdata/greplist $testdata/grepinputx >>testtry
echo "---------------------------- Test 14 -----------------------------" >>testtry
$valgrind ./pcregrep -w pat $testdata/grepinput $testdata/grepinputx >>testtry
# Tests 15 and 16 append stderr to testtry as well as stdout, so any
# diagnostics pcregrep prints are compared too.
echo "---------------------------- Test 15 -----------------------------" >>testtry
$valgrind ./pcregrep 'abc^*' $testdata/grepinput 2>>testtry >>testtry
echo "---------------------------- Test 16 -----------------------------" >>testtry
$valgrind ./pcregrep abc $testdata/grepinput $testdata/nonexistfile 2>>testtry >>testtry
echo "---------------------------- Test 17 -----------------------------" >>testtry
$valgrind ./pcregrep -M 'the\noutput' $testdata/grepinput >>testtry
echo "---------------------------- Test 18 -----------------------------" >>testtry
$valgrind ./pcregrep -Mn '(the\noutput|dog\.\n--)' $testdata/grepinput >>testtry
echo "---------------------------- Test 19 -----------------------------" >>testtry
$valgrind ./pcregrep -Mix 'Pattern' $testdata/grepinputx >>testtry
echo "---------------------------- Test 20 -----------------------------" >>testtry
$valgrind ./pcregrep -Mixn 'complete pair\nof lines' $testdata/grepinputx >>testtry
# Tests 21 to 31 all search for 'four' or 'pattern' with various combinations
# of the -A, -B and -C options.
echo "---------------------------- Test 21 -----------------------------" >>testtry
$valgrind ./pcregrep -nA3 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 22 -----------------------------" >>testtry
$valgrind ./pcregrep -nB3 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 23 -----------------------------" >>testtry
$valgrind ./pcregrep -C3 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 24 -----------------------------" >>testtry
$valgrind ./pcregrep -A9 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 25 -----------------------------" >>testtry
$valgrind ./pcregrep -nB9 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 26 -----------------------------" >>testtry
$valgrind ./pcregrep -A9 -B9 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 27 -----------------------------" >>testtry
$valgrind ./pcregrep -A10 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 28 -----------------------------" >>testtry
$valgrind ./pcregrep -nB10 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 29 -----------------------------" >>testtry
$valgrind ./pcregrep -C12 -B10 'four' $testdata/grepinputx >>testtry
echo "---------------------------- Test 30 -----------------------------" >>testtry
$valgrind ./pcregrep -inB3 'pattern' $testdata/grepinput $testdata/grepinputx >>testtry
echo "---------------------------- Test 31 -----------------------------" >>testtry
$valgrind ./pcregrep -inA3 'pattern' $testdata/grepinput $testdata/grepinputx >>testtry
echo "---------------------------- Test 32 -----------------------------" >>testtry
$valgrind ./pcregrep -L 'fox' $testdata/grepinput $testdata/grepinputx >>testtry
# Tests 33 and 34 name a file called grepnonexist and fold stderr into
# testtry with 2>&1; tests 33 to 37 all record the exit status.
echo "---------------------------- Test 33 -----------------------------" >>testtry
$valgrind ./pcregrep 'fox' $testdata/grepnonexist >>testtry 2>&1
echo "RC=$?" >>testtry
echo "---------------------------- Test 34 -----------------------------" >>testtry
$valgrind ./pcregrep -s 'fox' $testdata/grepnonexist >>testtry 2>&1
echo "RC=$?" >>testtry
echo "---------------------------- Test 35 -----------------------------" >>testtry
$valgrind ./pcregrep -L -r --include=grepinputx 'fox' $testdata >>testtry
echo "RC=$?" >>testtry
echo "---------------------------- Test 36 -----------------------------" >>testtry
$valgrind ./pcregrep -L -r --include=grepinput --exclude 'grepinput$' 'fox' $testdata >>testtry
echo "RC=$?" >>testtry
# Test 37 keeps stderr apart in the file "teststderr" and appends it after
# the exit status, under a marker line, so that the position of the
# diagnostics in testtry is fixed.
echo "---------------------------- Test 37 -----------------------------" >>testtry
$valgrind ./pcregrep '^(a+)*\d' $testdata/grepinput >>testtry 2>teststderr
echo "RC=$?" >>testtry
echo "======== STDERR ========" >>testtry
cat teststderr >>testtry
# Tests 38 to 44 use patterns that refer to a binary zero or to the text
# around it in grepinput.
echo "---------------------------- Test 38 ------------------------------" >>testtry
$valgrind ./pcregrep '>\x00<' $testdata/grepinput >>testtry
echo "---------------------------- Test 39 ------------------------------" >>testtry
$valgrind ./pcregrep -A1 'before the binary zero' $testdata/grepinput >>testtry
echo "---------------------------- Test 40 ------------------------------" >>testtry
$valgrind ./pcregrep -B1 'after the binary zero' $testdata/grepinput >>testtry
# NOTE(review): the "Test 41" banner is written twice below. The banners are
# part of the text compared with the expected output, so presumably the
# reference file carries the same duplicate - verify before renumbering.
echo "---------------------------- Test 41 ------------------------------" >>testtry
$valgrind ./pcregrep -B1 -o '\w+ the binary zero' $testdata/grepinput >>testtry
echo "---------------------------- Test 41 ------------------------------" >>testtry
$valgrind ./pcregrep -B1 -onH '\w+ the binary zero' $testdata/grepinput >>testtry
echo "---------------------------- Test 42 ------------------------------" >>testtry
$valgrind ./pcregrep -on 'before|zero|after' $testdata/grepinput >>testtry
echo "---------------------------- Test 43 ------------------------------" >>testtry
$valgrind ./pcregrep -on -e before -e zero -e after $testdata/grepinput >>testtry
echo "---------------------------- Test 44 ------------------------------" >>testtry
$valgrind ./pcregrep -on -f $testdata/greplist -e binary $testdata/grepinput >>testtry
# Test 45 appends stderr to testtry as well as stdout.
echo "---------------------------- Test 45 ------------------------------" >>testtry
$valgrind ./pcregrep -e abc -e '(unclosed' $testdata/grepinput 2>>testtry >>testtry
# In tests 46 to 48 the quoted pattern argument contains a literal newline,
# which is why each command is spread over two lines of this script.
echo "---------------------------- Test 46 ------------------------------" >>testtry
$valgrind ./pcregrep -Fx "AB.VE
elephant" $testdata/grepinput >>testtry
echo "---------------------------- Test 47 ------------------------------" >>testtry
$valgrind ./pcregrep -F "AB.VE
elephant" $testdata/grepinput >>testtry
echo "---------------------------- Test 48 ------------------------------" >>testtry
$valgrind ./pcregrep -F -e DATA -e "AB.VE
elephant" $testdata/grepinput >>testtry
# Tests 49 to 53 run against grepinputx with the default, CR and CRLF
# newline settings. Test 53 again passes a pattern with an embedded newline.
echo "---------------------------- Test 49 ------------------------------" >>testtry
$valgrind ./pcregrep "^(abc|def|ghi|jkl)" $testdata/grepinputx >>testtry
echo "---------------------------- Test 50 ------------------------------" >>testtry
$valgrind ./pcregrep -N CR "^(abc|def|ghi|jkl)" $testdata/grepinputx >>testtry
echo "---------------------------- Test 51 ------------------------------" >>testtry
$valgrind ./pcregrep --newline=crlf "^(abc|def|ghi|jkl)" $testdata/grepinputx >>testtry
echo "---------------------------- Test 52 ------------------------------" >>testtry
$valgrind ./pcregrep --newline=cr -F "def jkl" $testdata/grepinputx >>testtry
echo "---------------------------- Test 53 ------------------------------" >>testtry
$valgrind ./pcregrep --newline=crlf -F "xxx
jkl" $testdata/grepinputx >>testtry
# Now compare the results.
# The script exits with status 1 if the comparison command reports a
# difference between testtry and the reference output, and 0 otherwise.
$cf testtry $testdata/grepoutput
if [ $? != 0 ] ; then exit 1; else exit 0; fi
# End

View File

@ -1,292 +0,0 @@
#! /bin/sh
# Run PCRE tests.
# valgrind is an optional command prefix for every pcretest run; it is empty
# unless "valgrind" is given as an argument.
valgrind=
# Set up a suitable "diff" command for comparison. Some systems
# have a diff that lacks a -u option. Try to deal with this.
if diff -u /dev/null /dev/null; then cf="diff -u"; else cf="diff"; fi
# Find the test data
# Use $srcdir/testdata when srcdir is set and names a directory; otherwise
# fall back to a testdata directory relative to the current directory.
testdata=testdata
if [ -n "$srcdir" -a -d "$srcdir" ] ; then
testdata="$srcdir/testdata"
fi
# Find which optional facilities are available
# The link size is taken from the last character of the "Internal link size"
# line printed by "pcretest -C"; the script stops if it is not 2, 3 or 4.
case `./pcretest -C | ./pcregrep 'Internal link size'` in
*2) link_size=2;;
*3) link_size=3;;
*4) link_size=4;;
*) echo "Failed to find internal link size"; exit 1;;
esac
# utf8 and ucp hold the exit status of a search for a "No ... support" line
# in the "pcretest -C" output. The rest of the script treats the value 0 as
# "this facility is NOT available" and any other value as "available".
./pcretest -C | ./pcregrep 'No UTF-8 support' >/dev/null
utf8=$?
./pcretest -C | ./pcregrep 'No Unicode properties support' >/dev/null
ucp=$?
# Select which tests to run; for those that are explicitly requested, check
# that the necessary optional facilities are available.
do1=no
do2=no
do3=no
do4=no
do5=no
do6=no
do7=no
do8=no
do9=no
do10=no
# Each numeric argument 1..10 requests that test; "valgrind" turns on the
# valgrind prefix. Any other argument is reported and ends the script with
# status 1.
while [ $# -gt 0 ] ; do
case $1 in
1) do1=yes;;
2) do2=yes;;
3) do3=yes;;
4) do4=yes;;
5) do5=yes;;
6) do6=yes;;
7) do7=yes;;
8) do8=yes;;
9) do9=yes;;
10) do10=yes;;
valgrind) valgrind="valgrind -q";;
*) echo "Unknown test number $1"; exit 1;;
esac
shift
done
# Reject explicitly requested tests that this build cannot run. utf8 and ucp
# are exit statuses of searches for "No ... support" lines, so the value 0
# means the facility is missing.
# Tests 4, 5 and 8 need UTF-8 support.
if [ $utf8 -eq 0 ] ; then
if [ $do4 = yes ] ; then
echo "Can't run test 4 because UTF-8 support is not configured"
exit 1
fi
if [ $do5 = yes ] ; then
echo "Can't run test 5 because UTF-8 support is not configured"
exit 1
fi
if [ $do8 = yes ] ; then
echo "Can't run test 8 because UTF-8 support is not configured"
exit 1
fi
fi
# Tests 6, 9 and 10 need Unicode property support.
if [ $ucp -eq 0 ] ; then
if [ $do6 = yes ] ; then
echo "Can't run test 6 because Unicode property support is not configured"
exit 1
fi
if [ $do9 = yes ] ; then
echo "Can't run test 9 because Unicode property support is not configured"
exit 1
fi
if [ $do10 = yes ] ; then
echo "Can't run test 10 because Unicode property support is not configured"
exit 1
fi
fi
# Test 10 additionally needs an internal link size of 2.
if [ $link_size -ne 2 ] ; then
if [ $do10 = yes ] ; then
echo "Can't run test 10 because the link size ($link_size) is not 2"
exit 1
fi
fi
# If no specific tests were requested, select all that are relevant.
# Tests 1, 2, 3 and 7 are always selected; the others are selected only when
# the facilities tested on each line below are available.
if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \
$do5 = no -a $do6 = no -a $do7 = no -a $do8 = no -a \
$do9 = no -a $do10 = no ] ; then
do1=yes
do2=yes
do3=yes
if [ $utf8 -ne 0 ] ; then do4=yes; fi
if [ $utf8 -ne 0 ] ; then do5=yes; fi
if [ $utf8 -ne 0 -a $ucp -ne 0 ] ; then do6=yes; fi
do7=yes
if [ $utf8 -ne 0 ] ; then do8=yes; fi
if [ $utf8 -ne 0 -a $ucp -ne 0 ] ; then do9=yes; fi
if [ $link_size -eq 2 -a $ucp -ne 0 ] ; then do10=yes; fi
fi
# Show which release
echo ""
echo PCRE C library tests
./pcretest /dev/null
# Pattern shared by the tests below: pcretest reads the input file and writes
# to the file "testtry"; if pcretest exits with status 0 the result is
# compared with the expected output using $cf. A non-zero pcretest status or
# a reported difference ends the script with status 1.
# Primary test, Perl-compatible
if [ $do1 = yes ] ; then
echo "Test 1: main functionality (Perl compatible)"
$valgrind ./pcretest -q $testdata/testinput1 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput1 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
fi
# PCRE tests that are not Perl-compatible - API & error tests, mostly
if [ $do2 = yes ] ; then
echo "Test 2: API and error handling (not Perl compatible)"
$valgrind ./pcretest -q $testdata/testinput2 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput2 testtry
if [ $? != 0 ] ; then exit 1; fi
else
# pcretest itself failed: print a hint about stack usage before giving up.
echo " "
echo "** Test 2 requires a lot of stack. If it has crashed with a"
echo "** segmentation fault, it may be that you do not have enough"
echo "** stack available by default. Please see the 'pcrestack' man"
echo "** page for a discussion of PCRE's stack usage."
echo " "
exit 1
fi
echo "OK"
fi
# Locale-specific tests, provided that either the "fr_FR" or the "french"
# locale is available. The former is the Unix-like standard; the latter is
# for Windows.
if [ $do3 = yes ] ; then
locale -a | grep '^fr_FR$' >/dev/null
if [ $? -eq 0 ] ; then
locale=fr_FR
infile=$testdata/testinput3
outfile=$testdata/testoutput3
else
locale -a | grep '^french$' >/dev/null
if [ $? -eq 0 ] ; then
# Only "french" is available: make local copies of the input and expected
# output with the locale name fr_FR replaced by french.
locale=french
sed 's/fr_FR/french/' $testdata/testinput3 >test3input
sed 's/fr_FR/french/' $testdata/testoutput3 >test3output
infile=test3input
outfile=test3output
else
# Neither locale was found; an empty value makes the test be skipped below.
locale=
fi
fi
if [ "$locale" != "" ] ; then
echo "Test 3: locale-specific features (using '$locale' locale)"
$valgrind ./pcretest -q $infile testtry
if [ $? = 0 ] ; then
$cf $outfile testtry
# Unlike the other tests, a difference here only produces a warning and the
# script carries on; only a non-zero pcretest status ends the script.
if [ $? != 0 ] ; then
echo " "
echo "Locale test did not run entirely successfully."
echo "This usually means that there is a problem with the locale"
echo "settings rather than a bug in PCRE."
else
echo "OK"
fi
else exit 1
fi
else
echo "Cannot test locale-specific features - neither the 'fr_FR' nor the"
echo "'french' locale exists, or the \"locale\" command is not available"
echo "to check for them."
echo " "
fi
fi
# Each test below has the same shape: pcretest writes its output to the file
# "testtry"; if pcretest exits with status 0 the result is compared with the
# expected output using $cf. A non-zero pcretest status or a reported
# difference ends the script with status 1; otherwise "OK" is printed.
# Additional tests for UTF8 support
if [ $do4 = yes ] ; then
echo "Test 4: UTF-8 support (Perl compatible)"
$valgrind ./pcretest -q $testdata/testinput4 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput4 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
fi
if [ $do5 = yes ] ; then
echo "Test 5: API and internals for UTF-8 support (not Perl compatible)"
$valgrind ./pcretest -q $testdata/testinput5 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput5 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
fi
if [ $do6 = yes ] ; then
echo "Test 6: Unicode property support"
$valgrind ./pcretest -q $testdata/testinput6 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput6 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
fi
# Tests for DFA matching support
# Tests 7 to 9 pass the extra -dfa option to pcretest.
if [ $do7 = yes ] ; then
echo "Test 7: DFA matching"
$valgrind ./pcretest -q -dfa $testdata/testinput7 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput7 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
fi
if [ $do8 = yes ] ; then
echo "Test 8: DFA matching with UTF-8"
$valgrind ./pcretest -q -dfa $testdata/testinput8 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput8 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
fi
if [ $do9 = yes ] ; then
echo "Test 9: DFA matching with Unicode properties"
$valgrind ./pcretest -q -dfa $testdata/testinput9 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput9 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
fi
# Test of internal offsets and code sizes. This test is run only when there
# is Unicode property support and the link size is 2. The actual tests are
# mostly the same as in some of the above, but in this test we inspect some
# offsets and sizes that require a known link size. This is a doublecheck for
# the maintainer, just in case something changes unexpectedly.
if [ $do10 = yes ] ; then
echo "Test 10: Internal offsets and code size tests"
$valgrind ./pcretest -q $testdata/testinput10 testtry
if [ $? = 0 ] ; then
$cf $testdata/testoutput10 testtry
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
fi
# End

View File

@ -1,39 +0,0 @@
@rem This file was contributed by Ralf Junker, and touched up by
@rem Daniel Richard G. Test 10 added by Philip H.
@rem Philip H also changed test 3 to use "wintest" files.
@rem
@rem MS Windows batch file to run pcretest on testfiles with the correct
@rem options.
@rem
@rem Output is written to a subfolder named "testout", which is created if
@rem it does not already exist.
@rem
@rem Two environment variables are used if set: srcdir (defaults to ".")
@rem names the directory that contains "testdata", and pcretest (defaults to
@rem "pcretest") names the test program to run.
setlocal
if [%srcdir%]==[] set srcdir=.
if [%pcretest%]==[] set pcretest=pcretest
if not exist testout md testout
@rem First pass: run every test and store its output in testout. Test 3 uses
@rem the "wintest" input file; the line for the plain testinput3 is disabled.
@rem Tests 7 to 9 are run with the extra -dfa option.
%pcretest% -q %srcdir%\testdata\testinput1 > testout\testoutput1
%pcretest% -q %srcdir%\testdata\testinput2 > testout\testoutput2
@rem %pcretest% -q %srcdir%\testdata\testinput3 > testout\testoutput3
%pcretest% -q %srcdir%\testdata\wintestinput3 > testout\wintestoutput3
%pcretest% -q %srcdir%\testdata\testinput4 > testout\testoutput4
%pcretest% -q %srcdir%\testdata\testinput5 > testout\testoutput5
%pcretest% -q %srcdir%\testdata\testinput6 > testout\testoutput6
%pcretest% -q -dfa %srcdir%\testdata\testinput7 > testout\testoutput7
%pcretest% -q -dfa %srcdir%\testdata\testinput8 > testout\testoutput8
%pcretest% -q -dfa %srcdir%\testdata\testinput9 > testout\testoutput9
%pcretest% -q %srcdir%\testdata\testinput10 > testout\testoutput10
@rem Second pass: compare each expected output file with the file just
@rem produced, using "fc /n".
fc /n %srcdir%\testdata\testoutput1 testout\testoutput1
fc /n %srcdir%\testdata\testoutput2 testout\testoutput2
rem fc /n %srcdir%\testdata\testoutput3 testout\testoutput3
fc /n %srcdir%\testdata\wintestoutput3 testout\wintestoutput3
fc /n %srcdir%\testdata\testoutput4 testout\testoutput4
fc /n %srcdir%\testdata\testoutput5 testout\testoutput5
fc /n %srcdir%\testdata\testoutput6 testout\testoutput6
fc /n %srcdir%\testdata\testoutput7 testout\testoutput7
fc /n %srcdir%\testdata\testoutput8 testout\testoutput8
fc /n %srcdir%\testdata\testoutput9 testout\testoutput9
fc /n %srcdir%\testdata\testoutput10 testout\testoutput10

View File

@ -1,258 +0,0 @@
#! /bin/sh
# This file is generated by configure from RunTest.in. Make any changes
# to that file.
# Run PCRE tests
# cf is the command used to compare actual and expected output. valgrind is
# an optional command prefix; it stays empty unless requested as an argument.
cf=diff
valgrind=
# If there is no testdata directory here, create a symbolic link to the one
# in the source tree. @top_srcdir@ is a placeholder filled in when configure
# generates the runnable script.
if [ ! -d testdata ] ; then
ln -s @top_srcdir@/testdata testdata
fi
testdata=./testdata
# Select which tests to run; if no selection, run all
do1=no
do2=no
do3=no
do4=no
do5=no
do6=no
do7=no
do8=no
do9=no
# Each numeric argument 1..9 requests that test; "valgrind" turns on the
# valgrind prefix. Any other argument is reported and ends the script with
# status 1.
while [ $# -gt 0 ] ; do
case $1 in
1) do1=yes;;
2) do2=yes;;
3) do3=yes;;
4) do4=yes;;
5) do5=yes;;
6) do6=yes;;
7) do7=yes;;
8) do8=yes;;
9) do9=yes;;
valgrind) valgrind="valgrind -q";;
*) echo "Unknown test number $1"; exit 1;;
esac
shift
done
# Reject explicitly requested tests that this build cannot run. @LINK_SIZE@,
# @UTF8@ and @UCP@ are placeholders that are replaced with text when the
# runnable script is generated; the checks below compare that text as
# strings, and an empty string means the setting was not given.
# Tests 2, 5 and 6 need the link size to be unset or exactly -DLINK_SIZE=2.
if [ "@LINK_SIZE@" != "" -a "@LINK_SIZE@" != "-DLINK_SIZE=2" ] ; then
if [ $do2 = yes ] ; then
echo "Can't run test 2 with an internal link size other than 2"
exit 1
fi
if [ $do5 = yes ] ; then
echo "Can't run test 5 with an internal link size other than 2"
exit 1
fi
if [ $do6 = yes ] ; then
echo "Can't run test 6 with an internal link size other than 2"
exit 1
fi
fi
# Tests 4, 5, 6, 8 and 9 need UTF-8 support.
if [ "@UTF8@" = "" ] ; then
if [ $do4 = yes ] ; then
echo "Can't run test 4 because UTF-8 support is not configured"
exit 1
fi
if [ $do5 = yes ] ; then
echo "Can't run test 5 because UTF-8 support is not configured"
exit 1
fi
if [ $do6 = yes ] ; then
echo "Can't run test 6 because UTF-8 support is not configured"
exit 1
fi
if [ $do8 = yes ] ; then
echo "Can't run test 8 because UTF-8 support is not configured"
exit 1
fi
if [ $do9 = yes ] ; then
echo "Can't run test 9 because UTF-8 support is not configured"
exit 1
fi
fi
# Tests 6 and 9 also need Unicode property support.
if [ "@UCP@" = "" ] ; then
if [ $do6 = yes ] ; then
echo "Can't run test 6 because Unicode property support is not configured"
exit 1
fi
if [ $do9 = yes ] ; then
echo "Can't run test 9 because Unicode property support is not configured"
exit 1
fi
fi
# If no test was requested explicitly, select tests 1, 2, 3 and 7
# unconditionally and the others only when the settings tested on each line
# below are non-empty.
if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \
$do5 = no -a $do6 = no -a $do7 = no -a $do8 = no -a \
$do9 = no ] ; then
do1=yes
do2=yes
do3=yes
if [ "@UTF8@" != "" ] ; then do4=yes; fi
if [ "@UTF8@" != "" ] ; then do5=yes; fi
if [ "@UTF8@" != "" -a "@UCP@" != "" ] ; then do6=yes; fi
do7=yes
if [ "@UTF8@" != "" ] ; then do8=yes; fi
if [ "@UTF8@" != "" -a "@UCP@" != "" ] ; then do9=yes; fi
fi
# Show which release
./pcretest /dev/null
# Pattern shared by the tests below: pcretest reads the input file and writes
# to the file "testtry"; if pcretest exits with status 0 the result is
# compared with the expected output using $cf. A non-zero pcretest status or
# a reported difference ends the script with status 1.
# Primary test, Perl-compatible
if [ $do1 = yes ] ; then
echo "Test 1: main functionality (Perl compatible)"
$valgrind ./pcretest -q $testdata/testinput1 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput1
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
echo " "
fi
# PCRE tests that are not Perl-compatible - API & error tests, mostly
# The test is run only when the link size setting is empty or exactly
# -DLINK_SIZE=2; otherwise a "skipped" message is printed instead.
if [ $do2 = yes ] ; then
if [ "@LINK_SIZE@" = "" -o "@LINK_SIZE@" = "-DLINK_SIZE=2" ] ; then
echo "Test 2: API and error handling (not Perl compatible)"
$valgrind ./pcretest -q -i $testdata/testinput2 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput2
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
echo " "
else
echo Test 2 skipped for link size other than 2 \(@LINK_SIZE@\)
echo " "
fi
fi
# Locale-specific tests, provided the "fr_FR" locale is available
if [ $do3 = yes ] ; then
locale -a | grep '^fr_FR$' >/dev/null
if [ $? -eq 0 ] ; then
echo "Test 3: locale-specific features (using 'fr_FR' locale)"
$valgrind ./pcretest -q $testdata/testinput3 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput3
# Unlike the other tests, a difference here only produces a warning and the
# script carries on; only a non-zero pcretest status ends the script.
if [ $? != 0 ] ; then
echo " "
echo "Locale test did not run entirely successfully."
echo "This usually means that there is a problem with the locale"
echo "settings rather than a bug in PCRE."
else
echo "OK"
fi
echo " "
else exit 1
fi
else
echo "Cannot test locale-specific features - 'fr_FR' locale not found,"
echo "or the \"locale\" command is not available to check for it."
echo " "
fi
fi
# Each test below has the same shape: pcretest writes its output to the file
# "testtry"; if pcretest exits with status 0 the result is compared with the
# expected output using $cf. A non-zero pcretest status or a reported
# difference ends the script with status 1; otherwise "OK" is printed.
# Additional tests for UTF8 support
if [ $do4 = yes ] ; then
echo "Test 4: UTF-8 support (Perl compatible)"
$valgrind ./pcretest -q $testdata/testinput4 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput4
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
echo " "
fi
# Tests 5 and 6 are run only when the link size setting is empty or exactly
# -DLINK_SIZE=2; otherwise a "skipped" message is printed instead.
if [ $do5 = yes ] ; then
if [ "@LINK_SIZE@" = "" -o "@LINK_SIZE@" = "-DLINK_SIZE=2" ] ; then
echo "Test 5: API and internals for UTF-8 support (not Perl compatible)"
$valgrind ./pcretest -q $testdata/testinput5 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput5
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
echo " "
else
echo Test 5 skipped for link size other than 2 \(@LINK_SIZE@\)
echo " "
fi
fi
if [ $do6 = yes ] ; then
if [ "@LINK_SIZE@" = "" -o "@LINK_SIZE@" = "-DLINK_SIZE=2" ] ; then
echo "Test 6: Unicode property support"
$valgrind ./pcretest -q $testdata/testinput6 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput6
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
echo " "
else
echo Test 6 skipped for link size other than 2 \(@LINK_SIZE@\)
echo " "
fi
fi
# Tests for DFA matching support
# Tests 7 to 9 pass the extra -dfa option to pcretest.
if [ $do7 = yes ] ; then
echo "Test 7: DFA matching"
$valgrind ./pcretest -q -dfa $testdata/testinput7 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput7
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
echo " "
fi
if [ $do8 = yes ] ; then
echo "Test 8: DFA matching with UTF-8"
$valgrind ./pcretest -q -dfa $testdata/testinput8 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput8
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
echo " "
fi
if [ $do9 = yes ] ; then
echo "Test 9: DFA matching with Unicode properties"
$valgrind ./pcretest -q -dfa $testdata/testinput9 testtry
if [ $? = 0 ] ; then
$cf testtry $testdata/testoutput9
if [ $? != 0 ] ; then exit 1; fi
else exit 1
fi
echo "OK"
echo " "
fi
# End

View File

@ -1,22 +0,0 @@
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,58 +0,0 @@
# FIND_PACKAGE_HANDLE_STANDARD_ARGS(NAME (DEFAULT_MSG|"Custom failure message") VAR1 ... )
#
# Helper macro for FindXXX.cmake modules. It implements the usual handling
# of the REQUIRED and QUIET arguments of FIND_PACKAGE() and sets the variable
# <UPPERCASED_NAME>_FOUND.
#
# VAR1 and any further arguments are the NAMES of variables. The package
# counts as found only if every one of those variables evaluates to true.
#
# Example:
#
#   FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibXml2 DEFAULT_MSG LIBXML2_LIBRARIES LIBXML2_INCLUDE_DIR)
#
# Here LIBXML2_FOUND is set to TRUE if both LIBXML2_LIBRARIES and
# LIBXML2_INCLUDE_DIR are valid, and to FALSE otherwise.
#
# Reporting:
#  - found, not QUIET:      status message "Found <NAME>: <value of VAR1>"
#  - not found, REQUIRED:   FATAL_ERROR with the failure message, whether or
#                           not QUIET was given
#  - not found, otherwise:  status message with the failure message, unless
#                           QUIET was given
#
# If the second argument is DEFAULT_MSG, the failure message is
# "Could not find REQUIRED package <NAME>" when REQUIRED was given and
# "Could not find OPTIONAL package <NAME>" when it was not. Any other second
# argument is used verbatim as the failure message.
macro(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FAIL_MSG _VAR1 )
  # Work out the text to report if the package turns out to be missing.
  if(NOT "${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
    set(_FAIL_MESSAGE "${_FAIL_MSG}")
  else(NOT "${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
    if(${_NAME}_FIND_REQUIRED)
      set(_FAIL_MESSAGE "Could not find REQUIRED package ${_NAME}")
    else(${_NAME}_FIND_REQUIRED)
      set(_FAIL_MESSAGE "Could not find OPTIONAL package ${_NAME}")
    endif(${_NAME}_FIND_REQUIRED)
  endif(NOT "${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")

  string(TOUPPER ${_NAME} _NAME_UPPER)

  # Start out optimistic, then clear the flag for every listed variable that
  # does not evaluate to true. ${_VAR1} expands to a variable NAME, so the
  # if() below tests the value of that variable.
  set(${_NAME_UPPER}_FOUND TRUE)
  if(NOT ${_VAR1})
    set(${_NAME_UPPER}_FOUND FALSE)
  endif(NOT ${_VAR1})
  foreach(_CURRENT_VAR ${ARGN})
    if(NOT ${_CURRENT_VAR})
      set(${_NAME_UPPER}_FOUND FALSE)
    endif(NOT ${_CURRENT_VAR})
  endforeach(_CURRENT_VAR)

  # Report the outcome, honouring REQUIRED and QUIET.
  if(NOT ${_NAME_UPPER}_FOUND)
    if(${_NAME}_FIND_REQUIRED)
      message(FATAL_ERROR "${_FAIL_MESSAGE}")
    else(${_NAME}_FIND_REQUIRED)
      if(NOT ${_NAME}_FIND_QUIETLY)
        message(STATUS "${_FAIL_MESSAGE}")
      endif(NOT ${_NAME}_FIND_QUIETLY)
    endif(${_NAME}_FIND_REQUIRED)
  else(NOT ${_NAME_UPPER}_FOUND)
    if(NOT ${_NAME}_FIND_QUIETLY)
      message(STATUS "Found ${_NAME}: ${${_VAR1}}")
    endif(NOT ${_NAME}_FIND_QUIETLY)
  endif(NOT ${_NAME_UPPER}_FOUND)
endmacro(FIND_PACKAGE_HANDLE_STANDARD_ARGS)

View File

@ -1,29 +0,0 @@
# from http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/FindReadline.cmake
# http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/COPYING-CMAKE-SCRIPTS
# --> BSD licensed
#
# GNU Readline library finder
#
# Variables set by this module:
#   READLINE_INCLUDE_DIR - result of the search for readline/readline.h
#   READLINE_LIBRARY     - result of the search for a library named "readline"
#   READLINE_FOUND       - set directly below when everything is already
#                          known; otherwise left to
#                          FIND_PACKAGE_HANDLE_STANDARD_ARGS
#
# Shortcut: skip the searches when all three variables are already set. This
# module never searches for NCURSES_LIBRARY itself (the only FIND_LIBRARY for
# it is in the commented-out text below), so the shortcut is taken only when
# that variable has been supplied from elsewhere.
if(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)
set(READLINE_FOUND TRUE)
else(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)
# Look for the header; /usr/include/readline is given as an extra place to
# search.
FIND_PATH(READLINE_INCLUDE_DIR readline/readline.h
/usr/include/readline
)
# 2008-04-22 The next clause used to read like this:
#
# FIND_LIBRARY(READLINE_LIBRARY NAMES readline)
# FIND_LIBRARY(NCURSES_LIBRARY NAMES ncurses )
# include(FindPackageHandleStandardArgs)
# FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG NCURSES_LIBRARY READLINE_INCLUDE_DIR READLINE_LIBRARY )
#
# I was advised to modify it such that it will find an ncurses library if
# required, but not if one was explicitly given, that is, it allows the
# default to be overridden. PH
FIND_LIBRARY(READLINE_LIBRARY NAMES readline)
include(FindPackageHandleStandardArgs)
# The package counts as found when both the include directory and the
# library were located.
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG READLINE_INCLUDE_DIR READLINE_LIBRARY )
MARK_AS_ADVANCED(READLINE_INCLUDE_DIR READLINE_LIBRARY)
endif(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)

View File

@ -1,44 +0,0 @@
/* config.h for CMake builds */
#cmakedefine HAVE_DIRENT_H 1
#cmakedefine HAVE_SYS_STAT_H 1
#cmakedefine HAVE_SYS_TYPES_H 1
#cmakedefine HAVE_UNISTD_H 1
#cmakedefine HAVE_WINDOWS_H 1
#cmakedefine HAVE_TYPE_TRAITS_H 1
#cmakedefine HAVE_BITS_TYPE_TRAITS_H 1
#cmakedefine HAVE_BCOPY 1
#cmakedefine HAVE_MEMMOVE 1
#cmakedefine HAVE_STRERROR 1
#cmakedefine HAVE_STRTOLL 1
#cmakedefine HAVE_STRTOQ 1
#cmakedefine HAVE__STRTOI64 1
#cmakedefine PCRE_STATIC 1
#cmakedefine SUPPORT_UTF8 1
#cmakedefine SUPPORT_UCP 1
#cmakedefine EBCDIC 1
#cmakedefine BSR_ANYCRLF 1
#cmakedefine NO_RECURSE 1
#cmakedefine HAVE_LONG_LONG 1
#cmakedefine HAVE_UNSIGNED_LONG_LONG 1
#cmakedefine SUPPORT_LIBBZ2 1
#cmakedefine SUPPORT_LIBZ 1
#cmakedefine SUPPORT_LIBREADLINE 1
#define NEWLINE @NEWLINE@
#define POSIX_MALLOC_THRESHOLD @PCRE_POSIX_MALLOC_THRESHOLD@
#define LINK_SIZE @PCRE_LINK_SIZE@
#define MATCH_LIMIT @PCRE_MATCH_LIMIT@
#define MATCH_LIMIT_RECURSION @PCRE_MATCH_LIMIT_RECURSION@
#define MAX_NAME_SIZE 32
#define MAX_NAME_COUNT 10000
/* end config.h for CMake builds */

View File

@ -1,313 +0,0 @@
/* config.h. Generated from config.h.in by configure. */
/* config.h.in. Generated from configure.ac by autoheader. */
/* On Unix-like systems config.h.in is converted by "configure" into config.h.
Some other environments also support the use of "configure". PCRE is written in
Standard C, but there are a few non-standard things it can cope with, allowing
it to run on SunOS4 and other "close to standard" systems.
If you are going to build PCRE "by hand" on a system without "configure" you
should copy the distributed config.h.generic to config.h, and then set up the
macro definitions the way you need them. You must then add -DHAVE_CONFIG_H to
all of your compile commands, so that config.h is included at the start of
every source.
Alternatively, you can avoid editing by using -D on the compiler command line
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H.
PCRE uses memmove() if HAVE_MEMMOVE is set to 1; otherwise it uses bcopy() if
HAVE_BCOPY is set to 1. If your system has neither bcopy() nor memmove(), set
them both to 0; an emulation function will be used. */
/* By default, the \R escape sequence matches any Unicode line ending
character or sequence of characters. If BSR_ANYCRLF is defined, this is
changed so that backslash-R matches only CR, LF, or CRLF. The build-time
default can be overridden by the user of PCRE at runtime. On systems that
support it, "configure" can be used to override the default. */
/* #undef BSR_ANYCRLF */
/* If you are compiling for a system that uses EBCDIC instead of ASCII
character codes, define this macro as 1. On systems that can use
"configure", this can be done via --enable-ebcdic. PCRE will then assume
that all input strings are in EBCDIC. If you do not define this macro, PCRE
will assume input strings are ASCII or UTF-8 Unicode. It is not possible to
build a version of PCRE that supports both EBCDIC and UTF-8. */
/* #undef EBCDIC */
/* Define to 1 if you have the `bcopy' function. */
#ifndef HAVE_BCOPY
#define HAVE_BCOPY 1
#endif
/* Define to 1 if you have the <bits/type_traits.h> header file. */
/* #undef HAVE_BITS_TYPE_TRAITS_H */
/* Define to 1 if you have the <bzlib.h> header file. */
#ifndef HAVE_BZLIB_H
#define HAVE_BZLIB_H 1
#endif
/* Define to 1 if you have the <dirent.h> header file. */
#ifndef HAVE_DIRENT_H
#define HAVE_DIRENT_H 1
#endif
/* Define to 1 if you have the <dlfcn.h> header file. */
#ifndef HAVE_DLFCN_H
#define HAVE_DLFCN_H 1
#endif
/* Define to 1 if you have the <inttypes.h> header file. */
#ifndef HAVE_INTTYPES_H
#define HAVE_INTTYPES_H 1
#endif
/* Define to 1 if you have the <limits.h> header file. */
#ifndef HAVE_LIMITS_H
#define HAVE_LIMITS_H 1
#endif
/* Define to 1 if the system has the type `long long'. */
#ifndef HAVE_LONG_LONG
#define HAVE_LONG_LONG 1
#endif
/* Define to 1 if you have the `memmove' function. */
#ifndef HAVE_MEMMOVE
#define HAVE_MEMMOVE 1
#endif
/* Define to 1 if you have the <memory.h> header file. */
#ifndef HAVE_MEMORY_H
#define HAVE_MEMORY_H 1
#endif
/* Define to 1 if you have the <readline/history.h> header file. */
#ifndef HAVE_READLINE_HISTORY_H
#define HAVE_READLINE_HISTORY_H 1
#endif
/* Define to 1 if you have the <readline/readline.h> header file. */
#ifndef HAVE_READLINE_READLINE_H
#define HAVE_READLINE_READLINE_H 1
#endif
/* Define to 1 if you have the <stdint.h> header file. */
#ifndef HAVE_STDINT_H
#define HAVE_STDINT_H 1
#endif
/* Define to 1 if you have the <stdlib.h> header file. */
#ifndef HAVE_STDLIB_H
#define HAVE_STDLIB_H 1
#endif
/* Define to 1 if you have the `strerror' function. */
#ifndef HAVE_STRERROR
#define HAVE_STRERROR 1
#endif
/* Define to 1 if you have the <string> header file. */
#ifndef HAVE_STRING
#define HAVE_STRING 1
#endif
/* Define to 1 if you have the <strings.h> header file. */
#ifndef HAVE_STRINGS_H
#define HAVE_STRINGS_H 1
#endif
/* Define to 1 if you have the <string.h> header file. */
#ifndef HAVE_STRING_H
#define HAVE_STRING_H 1
#endif
/* Define to 1 if you have the `strtoll' function. */
/* #undef HAVE_STRTOLL */
/* Define to 1 if you have the `strtoq' function. */
#ifndef HAVE_STRTOQ
#define HAVE_STRTOQ 1
#endif
/* Define to 1 if you have the <sys/stat.h> header file. */
#ifndef HAVE_SYS_STAT_H
#define HAVE_SYS_STAT_H 1
#endif
/* Define to 1 if you have the <sys/types.h> header file. */
#ifndef HAVE_SYS_TYPES_H
#define HAVE_SYS_TYPES_H 1
#endif
/* Define to 1 if you have the <type_traits.h> header file. */
/* #undef HAVE_TYPE_TRAITS_H */
/* Define to 1 if you have the <unistd.h> header file. */
#ifndef HAVE_UNISTD_H
#define HAVE_UNISTD_H 1
#endif
/* Define to 1 if the system has the type `unsigned long long'. */
#ifndef HAVE_UNSIGNED_LONG_LONG
#define HAVE_UNSIGNED_LONG_LONG 1
#endif
/* Define to 1 if you have the <windows.h> header file. */
/* #undef HAVE_WINDOWS_H */
/* Define to 1 if you have the <zlib.h> header file. */
#ifndef HAVE_ZLIB_H
#define HAVE_ZLIB_H 1
#endif
/* Define to 1 if you have the `_strtoi64' function. */
/* #undef HAVE__STRTOI64 */
/* The value of LINK_SIZE determines the number of bytes used to store links
as offsets within the compiled regex. The default is 2, which allows for
compiled patterns up to 64K long. This covers the vast majority of cases.
However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows
for longer patterns in extreme cases. On systems that support it,
"configure" can be used to override this default. */
#ifndef LINK_SIZE
#define LINK_SIZE 2
#endif
/* The value of MATCH_LIMIT determines the default number of times the
internal match() function can be called during a single execution of
pcre_exec(). There is a runtime interface for setting a different limit.
The limit exists in order to catch runaway regular expressions that take
for ever to determine that they do not match. The default is set very large
so that it does not accidentally catch legitimate cases. On systems that
support it, "configure" can be used to override this default. */
#ifndef MATCH_LIMIT
#define MATCH_LIMIT 10000000
#endif
/* The above limit applies to all calls of match(), whether or not they
increase the recursion depth. In some environments it is desirable to limit
the depth of recursive calls of match() more strictly, in order to restrict
the maximum amount of stack (or heap, if NO_RECURSE is defined) that is
used. The value of MATCH_LIMIT_RECURSION applies only to recursive calls of
match(). To have any useful effect, it must be less than the value of
MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. There is
a runtime method for setting a different limit. On systems that support it,
"configure" can be used to override the default. */
#ifndef MATCH_LIMIT_RECURSION
#define MATCH_LIMIT_RECURSION MATCH_LIMIT
#endif
/* This limit is parameterized just in case anybody ever wants to change it.
Care must be taken if it is increased, because it guards against integer
overflow caused by enormously large patterns. */
#ifndef MAX_NAME_COUNT
#define MAX_NAME_COUNT 10000
#endif
/* This limit is parameterized just in case anybody ever wants to change it.
Care must be taken if it is increased, because it guards against integer
overflow caused by enormously large patterns. */
#ifndef MAX_NAME_SIZE
#define MAX_NAME_SIZE 32
#endif
/* The value of NEWLINE determines the newline character sequence. On systems
that support it, "configure" can be used to override the default, which is
10. The possible values are 10 (LF), 13 (CR), 3338 (CRLF), -1 (ANY), or -2
(ANYCRLF). */
#ifndef NEWLINE
#define NEWLINE 10
#endif
/* PCRE uses recursive function calls to handle backtracking while matching.
This can sometimes be a problem on systems that have stacks of limited
size. Define NO_RECURSE to get a version that doesn't use recursion in the
match() function; instead it creates its own stack by steam using
pcre_recurse_malloc() to obtain memory from the heap. For more detail, see
the comments and other stuff just above the match() function. On systems
that support it, "configure" can be used to set this in the Makefile (use
--disable-stack-for-recursion). */
/* #undef NO_RECURSE */
/* Name of package */
#define PACKAGE "pcre"
/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT ""
/* Define to the full name of this package. */
#define PACKAGE_NAME "PCRE"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE 7.9"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre"
/* Define to the version of this package. */
#define PACKAGE_VERSION "7.9"
/* If you are compiling for a system other than a Unix-like system or
Win32, and it needs some magic to be inserted before the definition
of a function that is exported by the library, define this macro to
contain the relevant magic. If you do not define this macro, it
defaults to "extern" for a C compiler and "extern C" for a C++
compiler on non-Win32 systems. This macro appears at the start of
every exported function that is part of the external API. It does
not appear on functions that are "external" in the C sense, but
which are internal to the library. */
/* #undef PCRE_EXP_DEFN */
/* Define if linking statically (TODO: make nice with Libtool) */
/* #undef PCRE_STATIC */
/* When calling PCRE via the POSIX interface, additional working storage is
required for holding the pointers to capturing substrings because PCRE
requires three integers per substring, whereas the POSIX interface provides
only two. If the number of expected substrings is small, the wrapper
function uses space on the stack, because this is faster than using
malloc() for each call. The threshold above which the stack is no longer
used is defined by POSIX_MALLOC_THRESHOLD. On systems that support it,
"configure" can be used to override this default. */
#ifndef POSIX_MALLOC_THRESHOLD
#define POSIX_MALLOC_THRESHOLD 10
#endif
/* Define to 1 if you have the ANSI C header files. */
#ifndef STDC_HEADERS
#define STDC_HEADERS 1
#endif
/* Define to allow pcregrep to be linked with libbz2, so that it is able to
handle .bz2 files. */
/* #undef SUPPORT_LIBBZ2 */
/* Define to allow pcretest to be linked with libreadline. */
/* #undef SUPPORT_LIBREADLINE */
/* Define to allow pcregrep to be linked with libz, so that it is able to
handle .gz files. */
/* #undef SUPPORT_LIBZ */
/* Define to enable support for Unicode properties */
/* #undef SUPPORT_UCP */
/* Define to enable support for the UTF-8 Unicode encoding. This will work
even in an EBCDIC environment, but it is incompatible with the EBCDIC
macro. That is, PCRE can support *either* EBCDIC code *or* ASCII/UTF-8, but
not both at once. */
/* #undef SUPPORT_UTF8 */
/* Version number of package */
#ifndef VERSION
#define VERSION "7.9"
#endif
/* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */
/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */

View File

@ -1,691 +0,0 @@
dnl Process this file with autoconf to produce a configure script.
dnl NOTE FOR MAINTAINERS: Do not use major or minor version numbers with
dnl leading zeros, because they may be treated as octal constants. The
dnl PCRE_PRERELEASE feature is for identifying release candidates. It might
dnl be defined as -RC2, for example. For real releases, it should be defined
dnl empty.
m4_define(pcre_major, [7])
m4_define(pcre_minor, [9])
m4_define(pcre_prerelease, [])
m4_define(pcre_date, [2009-04-11])
# Libtool shared library interface versions (current:revision:age)
m4_define(libpcre_version, [0:1:0])
m4_define(libpcreposix_version, [0:0:0])
m4_define(libpcrecpp_version, [0:0:0])
AC_PREREQ(2.57)
AC_INIT(PCRE, pcre_major.pcre_minor[]pcre_prerelease, , pcre)
AC_CONFIG_SRCDIR([pcre.h.in])
AM_INIT_AUTOMAKE([dist-bzip2 dist-zip])
AC_CONFIG_HEADERS(config.h)
# Append any extra flags supplied through the CONFIGURE_* variables.
CFLAGS="$CFLAGS $CONFIGURE_CFLAGS"
CXXFLAGS="$CXXFLAGS $CONFIGURE_CXXFLAGS"
LDFLAGS="$LDFLAGS $CONFIGURE_LDFLAGS"
# Autoconf picks "-g -O2" for gcc and plain "-g" for any other compiler when
# CFLAGS and CXXFLAGS are not preset, and offers no standard switch for
# dropping the -g, which is not wanted for a production library. So: record
# the values that were in effect before the compiler checks run, let
# AC_PROG_CC and AC_PROG_CXX fill in their defaults, and afterwards strip the
# -g again, but only when nothing had been set beforehand. (PH 02-May-07)
# NOTE(review): the appends above leave at least one space in CFLAGS and
# CXXFLAGS even when every input is empty, so the recorded values never look
# empty and the stripping below appears to be unreachable - confirm whether
# that is intended.
remember_set_CFLAGS="$CFLAGS"
remember_set_CXXFLAGS="$CXXFLAGS"
AC_PROG_CC
AC_PROG_CXX
if test "x$remember_set_CFLAGS" = "x"; then
  case "$CFLAGS" in
    "-g -O2") CFLAGS="-O2" ;;
    "-g") CFLAGS="" ;;
  esac
fi
if test "x$remember_set_CXXFLAGS" = "x"; then
  case "$CXXFLAGS" in
    "-g -O2") CXXFLAGS="-O2" ;;
    "-g") CXXFLAGS="" ;;
  esac
fi
# AC_PROG_CXX will return "g++" even if no c++ compiler is installed.
# Check for that case, and just disable c++ code if g++ doesn't run.
# When the empty test program fails to compile, CXX, CXXCPP and CXXFLAGS are
# all cleared so that later tests on "$CXX" see no C++ toolchain.
AC_LANG_PUSH(C++)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]],[[]])],, [CXX=""; CXXCPP=""; CXXFLAGS=""])
AC_LANG_POP
AC_PROG_INSTALL
AC_LIBTOOL_WIN32_DLL
AC_PROG_LIBTOOL
AC_PROG_LN_S
PCRE_MAJOR="pcre_major"
PCRE_MINOR="pcre_minor"
PCRE_PRERELEASE="pcre_prerelease"
PCRE_DATE="pcre_date"
AC_SUBST(PCRE_MAJOR)
AC_SUBST(PCRE_MINOR)
AC_SUBST(PCRE_PRERELEASE)
AC_SUBST(PCRE_DATE)
# Set a more sensible default value for $(htmldir).
if test "x$htmldir" = 'x${docdir}'
then
htmldir='${docdir}/html'
fi
# Handle --disable-cpp
AC_ARG_ENABLE(cpp,
AS_HELP_STRING([--disable-cpp],
[disable C++ support]),
, enable_cpp=yes)
# Handle --enable-rebuild-chartables
AC_ARG_ENABLE(rebuild-chartables,
AS_HELP_STRING([--enable-rebuild-chartables],
[rebuild character tables in current locale]),
, enable_rebuild_chartables=no)
# Handle --enable-utf8 (disabled by default)
AC_ARG_ENABLE(utf8,
AS_HELP_STRING([--enable-utf8],
[enable UTF-8 support (incompatible with --enable-ebcdic)]),
, enable_utf8=unset)
# Handle --enable-unicode-properties
AC_ARG_ENABLE(unicode-properties,
AS_HELP_STRING([--enable-unicode-properties],
[enable Unicode properties support (implies --enable-utf8)]),
, enable_unicode_properties=no)
# Handle --enable-newline=NL
dnl AC_ARG_ENABLE(newline,
dnl AS_HELP_STRING([--enable-newline=NL],
dnl [use NL as newline (lf, cr, crlf, anycrlf, any; default=lf)]),
dnl , enable_newline=lf)
# Separate newline options. The default is LF. Autoconf runs the
# action-if-given branch for --disable-FEATURE (enableval=no) as well as for
# --enable-FEATURE, so each option only takes effect when it was not
# explicitly disabled. If several are enabled, the one handled last wins.
ac_pcre_newline=lf
AC_ARG_ENABLE(newline-is-cr,
              AS_HELP_STRING([--enable-newline-is-cr],
                             [use CR as newline character]),
              [if test "x$enableval" != "xno"; then ac_pcre_newline=cr; fi])
AC_ARG_ENABLE(newline-is-lf,
              AS_HELP_STRING([--enable-newline-is-lf],
                             [use LF as newline character (default)]),
              [if test "x$enableval" != "xno"; then ac_pcre_newline=lf; fi])
AC_ARG_ENABLE(newline-is-crlf,
              AS_HELP_STRING([--enable-newline-is-crlf],
                             [use CRLF as newline sequence]),
              [if test "x$enableval" != "xno"; then ac_pcre_newline=crlf; fi])
AC_ARG_ENABLE(newline-is-anycrlf,
              AS_HELP_STRING([--enable-newline-is-anycrlf],
                             [use CR, LF, or CRLF as newline sequence]),
              [if test "x$enableval" != "xno"; then ac_pcre_newline=anycrlf; fi])
AC_ARG_ENABLE(newline-is-any,
              AS_HELP_STRING([--enable-newline-is-any],
                             [use any valid Unicode newline sequence]),
              [if test "x$enableval" != "xno"; then ac_pcre_newline=any; fi])
# enable_newline is the name the later checks and the summary read.
enable_newline="$ac_pcre_newline"
# Handle --enable-bsr-anycrlf
AC_ARG_ENABLE(bsr-anycrlf,
AS_HELP_STRING([--enable-bsr-anycrlf],
[\R matches only CR, LF, CRLF by default]),
, enable_bsr_anycrlf=no)
# Handle --enable-ebcdic
AC_ARG_ENABLE(ebcdic,
AS_HELP_STRING([--enable-ebcdic],
[assume EBCDIC coding rather than ASCII; incompatible with --enable-utf8; use only in (uncommon) EBCDIC environments; it implies --enable-rebuild-chartables]),
, enable_ebcdic=no)
# Handle --disable-stack-for-recursion
AC_ARG_ENABLE(stack-for-recursion,
AS_HELP_STRING([--disable-stack-for-recursion],
[don't use stack recursion when matching]),
, enable_stack_for_recursion=yes)
# Handle --enable-pcregrep-libz
AC_ARG_ENABLE(pcregrep-libz,
AS_HELP_STRING([--enable-pcregrep-libz],
[link pcregrep with libz to handle .gz files]),
, enable_pcregrep_libz=no)
# Handle --enable-pcregrep-libbz2
AC_ARG_ENABLE(pcregrep-libbz2,
AS_HELP_STRING([--enable-pcregrep-libbz2],
[link pcregrep with libbz2 to handle .bz2 files]),
, enable_pcregrep_libbz2=no)
# Handle --enable-pcretest-libreadline
AC_ARG_ENABLE(pcretest-libreadline,
AS_HELP_STRING([--enable-pcretest-libreadline],
[link pcretest with libreadline]),
, enable_pcretest_libreadline=no)
# Handle --with-posix-malloc-threshold=NBYTES
AC_ARG_WITH(posix-malloc-threshold,
AS_HELP_STRING([--with-posix-malloc-threshold=NBYTES],
[threshold for POSIX malloc usage (default=10)]),
, with_posix_malloc_threshold=10)
# Handle --with-link-size=N
AC_ARG_WITH(link-size,
AS_HELP_STRING([--with-link-size=N],
[internal link size (2, 3, or 4 allowed; default=2)]),
, with_link_size=2)
# Handle --with-match-limit=N
AC_ARG_WITH(match-limit,
AS_HELP_STRING([--with-match-limit=N],
[default limit on internal looping (default=10000000)]),
, with_match_limit=10000000)
# Handle --with-match-limit-recursion=N
#
# Note: In config.h, the default is to define MATCH_LIMIT_RECURSION
# symbolically as MATCH_LIMIT, which in turn is defined to be some numeric
# value (e.g. 10000000). MATCH_LIMIT_RECURSION can otherwise be set to some
# different numeric value (or even the same numeric value as MATCH_LIMIT,
# though no longer defined in terms of the latter).
#
AC_ARG_WITH(match-limit-recursion,
AS_HELP_STRING([--with-match-limit-recursion=N],
[default limit on internal recursion (default=MATCH_LIMIT)]),
, with_match_limit_recursion=MATCH_LIMIT)
# Make sure that if enable_unicode_properties was set, that UTF-8 support
# is enabled.
#
if test "x$enable_unicode_properties" = "xyes"
then
if test "x$enable_utf8" = "xno"
then
AC_MSG_ERROR([support for Unicode properties requires UTF-8 support])
fi
enable_utf8=yes
fi
if test "x$enable_utf8" = "xunset"
then
enable_utf8=no
fi
# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled.
# Also check that UTF-8 support is not requested, because PCRE cannot handle
# EBCDIC and UTF-8 in the same build. To do so it would need to use different
# character constants depending on the mode.
#
if test "x$enable_ebcdic" = "xyes"
then
enable_rebuild_chartables=yes
if test "x$enable_utf8" = "xyes"
then
AC_MSG_ERROR([support for EBCDIC and UTF-8 cannot be enabled at the same time])
fi
fi
# Convert the newline identifier into the appropriate integer value.
case "$enable_newline" in
lf) ac_pcre_newline_value=10 ;;
cr) ac_pcre_newline_value=13 ;;
crlf) ac_pcre_newline_value=3338 ;;
anycrlf) ac_pcre_newline_value=-2 ;;
any) ac_pcre_newline_value=-1 ;;
*)
AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option])
;;
esac
# Check argument to --with-link-size
case "$with_link_size" in
2|3|4) ;;
*)
AC_MSG_ERROR([invalid argument \"$with_link_size\" to --with-link-size option])
;;
esac
AH_TOP([
/* On Unix-like systems config.h.in is converted by "configure" into config.h.
Some other environments also support the use of "configure". PCRE is written in
Standard C, but there are a few non-standard things it can cope with, allowing
it to run on SunOS4 and other "close to standard" systems.
If you are going to build PCRE "by hand" on a system without "configure" you
should copy the distributed config.h.generic to config.h, and then set up the
macro definitions the way you need them. You must then add -DHAVE_CONFIG_H to
all of your compile commands, so that config.h is included at the start of
every source.
Alternatively, you can avoid editing by using -D on the compiler command line
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H.
PCRE uses memmove() if HAVE_MEMMOVE is set to 1; otherwise it uses bcopy() if
HAVE_BCOPY is set to 1. If your system has neither bcopy() nor memmove(), set
them both to 0; an emulation function will be used. */])
dnl AX_COMPILER_VENDOR
dnl Determine the vendor of the compiler for the current Autoconf language
dnl and cache it in ax_cv_<lang>_compiler_vendor, which stays "unknown" when
dnl nothing matches. Each word in the list below has the form
dnl vendor:macro,macro,... and is turned into a preprocessor expression of
dnl the form defined(A) || defined(B). A test program that only compiles
dnl when that expression is true is tried for each word in turn, and the
dnl first word whose program compiles supplies the vendor name.
AC_DEFUN([AX_COMPILER_VENDOR],
[
AC_CACHE_CHECK([for _AC_LANG compiler vendor], ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor,
[ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor=unknown
# note: don't check for gcc first since some other compilers define __GNUC__
for ventest in intel:__ICC,__ECC,__INTEL_COMPILER ibm:__xlc__,__xlC__,__IBMC__,__IBMCPP__ gnu:__GNUC__ sun:__SUNPRO_C,__SUNPRO_CC hp:__HP_cc,__HP_aCC dec:__DECC,__DECCXX,__DECC_VER,__DECCXX_VER borland:__BORLANDC__,__TURBOC__ comeau:__COMO__ cray:_CRAYC kai:__KCC lcc:__LCC__ metrowerks:__MWERKS__ sgi:__sgi,sgi microsoft:_MSC_VER watcom:__WATCOMC__ portland:__PGI; do
vencpp="defined("`echo $ventest | cut -d: -f2 | sed 's/,/) || defined(/g'`")"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[
#if !($vencpp)
thisisanerror;
#endif
])], [ax_cv_]_AC_LANG_ABBREV[_compiler_vendor=`echo $ventest | cut -d: -f1`; break])
done
])
])
AX_COMPILER_VENDOR
# Enable 64 bit build. The -m64 flag is only added when the C compiler was
# identified as Sun's by the vendor check; for any other vendor the option
# is accepted but changes nothing here.
AC_ARG_ENABLE(64,
              [AS_HELP_STRING([--enable-64],[build with 64 bit support])],
              [enable_64="$enableval"],
              [enable_64="no"])
if test "x${ax_cv_c_compiler_vendor}" = "xsun" ; then
  if test "${enable_64}" = "yes"; then
    CFLAGS="$CFLAGS -m64"
    CXXFLAGS="$CXXFLAGS -m64"
  fi
fi
# Checks for header files.
AC_HEADER_STDC
AC_CHECK_HEADERS(limits.h sys/types.h sys/stat.h dirent.h windows.h)
# The files below are C++ header files.
pcre_have_type_traits="0"
pcre_have_bits_type_traits="0"
if test "x$enable_cpp" = "xyes" -a -n "$CXX"
then
AC_LANG_PUSH(C++)
# Older versions of pcre defined pcrecpp::no_arg, but in new versions
# it's called pcrecpp::RE::no_arg. For backwards ABI compatibility,
# we want to make one an alias for the other. Different systems do
# this in different ways. Some systems, for instance, can do it via
# a linker flag: -alias (for os x 10.5) or -i (for os x <=10.4).
OLD_LDFLAGS="$LDFLAGS"
for flag in "-alias,__ZN7pcrecpp2RE6no_argE,__ZN7pcrecpp6no_argE" \
"-i__ZN7pcrecpp6no_argE:__ZN7pcrecpp2RE6no_argE"; do
AC_MSG_CHECKING([for alias support in the linker])
LDFLAGS="$OLD_LDFLAGS -Wl,$flag"
# We try to run the linker with this new ld flag. If the link fails,
# we give up and remove the new flag from LDFLAGS.
AC_LINK_IFELSE([AC_LANG_PROGRAM([[namespace pcrecpp {
class RE { static int no_arg; };
int RE::no_arg;
}]],
[[]])],
[AC_MSG_RESULT([yes]);
EXTRA_LIBPCRECPP_LDFLAGS="$EXTRA_LIBPCRECPP_LDFLAGS -Wl,$flag";
break;],
AC_MSG_RESULT([no]))
done
LDFLAGS="$OLD_LDFLAGS"
# We could be more clever here, given we're doing AC_SUBST with this
# (eg set a var to be the name of the include file we want). But we're not
# so it's easy to change back to 'regular' autoconf vars if we needed to.
AC_CHECK_HEADERS(string, [pcre_have_cpp_headers="1"],
[pcre_have_cpp_headers="0"])
AC_CHECK_HEADERS(bits/type_traits.h, [pcre_have_bits_type_traits="1"],
[pcre_have_bits_type_traits="0"])
AC_CHECK_HEADERS(type_traits.h, [pcre_have_type_traits="1"],
[pcre_have_type_traits="0"])
AC_LANG_POP
fi
# Using AC_SUBST eliminates the need to include config.h in a public .h file
AC_SUBST(pcre_have_type_traits)
AC_SUBST(pcre_have_bits_type_traits)
# Conditional compilation
AM_CONDITIONAL(WITH_PCRE_CPP, test "x$enable_cpp" = "xyes")
AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes")
# Checks for typedefs, structures, and compiler characteristics.
AC_C_CONST
AC_TYPE_SIZE_T
# Look for a function that converts a string to a long long. The action is
# run for a function that is found and sets the flag; its "break" is meant to
# stop the search at the first hit, so later names in the list are then not
# probed at all.
pcre_have_strotolonglong=0
AC_CHECK_FUNCS(strtoq strtoll _strtoi64, [pcre_have_strotolonglong="1"; break])
# If we can't convert a string to a long long, pretend we don't even
# have a long long.
if test $pcre_have_strotolonglong = "0"; then
pcre_have_long_long="0"
pcre_have_ulong_long="0"
else
AC_CHECK_TYPES([long long],
[pcre_have_long_long="1"],
[pcre_have_long_long="0"])
AC_CHECK_TYPES([unsigned long long],
[pcre_have_ulong_long="1"],
[pcre_have_ulong_long="0"])
fi
# Both results are "0" or "1" strings and are exported as substitution
# variables rather than through config.h.
AC_SUBST(pcre_have_long_long)
AC_SUBST(pcre_have_ulong_long)
# Checks for library functions.
AC_CHECK_FUNCS(bcopy memmove strerror)
# Check for the availability of libz (aka zlib)
AC_CHECK_HEADERS([zlib.h], [HAVE_ZLIB_H=1])
AC_CHECK_LIB([z], [gzopen], [HAVE_LIBZ=1])
# Check for the availability of libbz2
AC_CHECK_HEADERS([bzlib.h], [HAVE_BZLIB_H=1])
AC_CHECK_LIB([bz2], [BZ2_bzopen], [HAVE_LIBBZ2=1])
# Check for the availability of libreadline
AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_H=1])
AC_CHECK_HEADERS([readline/history.h], [HAVE_HISTORY_H=1])
AC_CHECK_LIB([readline], [readline], [HAVE_LIB_READLINE=1])
# This facilitates -ansi builds under Linux
dnl AC_DEFINE([_GNU_SOURCE], [], [Enable GNU extensions in glibc])
if test "x$enable_shared" = "xno" ; then
AC_DEFINE([PCRE_STATIC], [1], [
Define if linking statically (TODO: make nice with Libtool)])
fi
# Here is where pcre specific defines are handled
if test "$enable_utf8" = "yes"; then
AC_DEFINE([SUPPORT_UTF8], [], [
Define to enable support for the UTF-8 Unicode encoding. This will
work even in an EBCDIC environment, but it is incompatible with
the EBCDIC macro. That is, PCRE can support *either* EBCDIC code
*or* ASCII/UTF-8, but not both at once.])
fi
if test "$enable_unicode_properties" = "yes"; then
AC_DEFINE([SUPPORT_UCP], [], [
Define to enable support for Unicode properties])
fi
if test "$enable_stack_for_recursion" = "no"; then
AC_DEFINE([NO_RECURSE], [], [
PCRE uses recursive function calls to handle backtracking while
matching. This can sometimes be a problem on systems that have
stacks of limited size. Define NO_RECURSE to get a version that
doesn't use recursion in the match() function; instead it creates
its own stack by steam using pcre_recurse_malloc() to obtain memory
from the heap. For more detail, see the comments and other stuff
just above the match() function. On systems that support it,
"configure" can be used to set this in the Makefile
(use --disable-stack-for-recursion).])
fi
if test "$enable_pcregrep_libz" = "yes"; then
AC_DEFINE([SUPPORT_LIBZ], [], [
Define to allow pcregrep to be linked with libz, so that it is
able to handle .gz files.])
fi
if test "$enable_pcregrep_libbz2" = "yes"; then
AC_DEFINE([SUPPORT_LIBBZ2], [], [
Define to allow pcregrep to be linked with libbz2, so that it is
able to handle .bz2 files.])
fi
if test "$enable_pcretest_libreadline" = "yes"; then
AC_DEFINE([SUPPORT_LIBREADLINE], [], [
Define to allow pcretest to be linked with libreadline.])
fi
AC_DEFINE_UNQUOTED([NEWLINE], [$ac_pcre_newline_value], [
The value of NEWLINE determines the newline character sequence. On
systems that support it, "configure" can be used to override the
default, which is 10. The possible values are 10 (LF), 13 (CR),
3338 (CRLF), -1 (ANY), or -2 (ANYCRLF).])
if test "$enable_bsr_anycrlf" = "yes"; then
AC_DEFINE([BSR_ANYCRLF], [], [
By default, the \R escape sequence matches any Unicode line ending
character or sequence of characters. If BSR_ANYCRLF is defined, this is
changed so that backslash-R matches only CR, LF, or CRLF. The
build-time default can be overridden by the user of PCRE at runtime. On
systems that support it, "configure" can be used to override the
default.])
fi
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
The value of LINK_SIZE determines the number of bytes used to store
links as offsets within the compiled regex. The default is 2, which
allows for compiled patterns up to 64K long. This covers the vast
majority of cases. However, PCRE can also be compiled to use 3 or 4
bytes instead. This allows for longer patterns in extreme cases. On
systems that support it, "configure" can be used to override this default.])
AC_DEFINE_UNQUOTED([POSIX_MALLOC_THRESHOLD], [$with_posix_malloc_threshold], [
When calling PCRE via the POSIX interface, additional working storage
is required for holding the pointers to capturing substrings because
PCRE requires three integers per substring, whereas the POSIX
interface provides only two. If the number of expected substrings is
small, the wrapper function uses space on the stack, because this is
faster than using malloc() for each call. The threshold above which
the stack is no longer used is defined by POSIX_MALLOC_THRESHOLD. On
systems that support it, "configure" can be used to override this
default.])
AC_DEFINE_UNQUOTED([MATCH_LIMIT], [$with_match_limit], [
The value of MATCH_LIMIT determines the default number of times the
internal match() function can be called during a single execution of
pcre_exec(). There is a runtime interface for setting a different
limit. The limit exists in order to catch runaway regular
expressions that take for ever to determine that they do not match.
The default is set very large so that it does not accidentally catch
legitimate cases. On systems that support it, "configure" can be
used to override this default.])
AC_DEFINE_UNQUOTED([MATCH_LIMIT_RECURSION], [$with_match_limit_recursion], [
The above limit applies to all calls of match(), whether or not they
increase the recursion depth. In some environments it is desirable
to limit the depth of recursive calls of match() more strictly, in
order to restrict the maximum amount of stack (or heap, if
NO_RECURSE is defined) that is used. The value of
MATCH_LIMIT_RECURSION applies only to recursive calls of match(). To
have any useful effect, it must be less than the value of
MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT.
There is a runtime method for setting a different limit. On systems
that support it, "configure" can be used to override the default.])
AC_DEFINE([MAX_NAME_SIZE], [32], [
This limit is parameterized just in case anybody ever wants to
change it. Care must be taken if it is increased, because it guards
against integer overflow caused by enormously large patterns.])
AC_DEFINE([MAX_NAME_COUNT], [10000], [
This limit is parameterized just in case anybody ever wants to
change it. Care must be taken if it is increased, because it guards
against integer overflow caused by enormously large patterns.])
AH_VERBATIM([PCRE_EXP_DEFN], [
/* If you are compiling for a system other than a Unix-like system or
Win32, and it needs some magic to be inserted before the definition
of a function that is exported by the library, define this macro to
contain the relevant magic. If you do not define this macro, it
defaults to "extern" for a C compiler and "extern C" for a C++
compiler on non-Win32 systems. This macro appears at the start of
every exported function that is part of the external API. It does
not appear on functions that are "external" in the C sense, but
which are internal to the library. */
#undef PCRE_EXP_DEFN])
if test "$enable_ebcdic" = "yes"; then
AC_DEFINE_UNQUOTED([EBCDIC], [], [
If you are compiling for a system that uses EBCDIC instead of ASCII
character codes, define this macro as 1. On systems that can use
"configure", this can be done via --enable-ebcdic. PCRE will then
assume that all input strings are in EBCDIC. If you do not define
this macro, PCRE will assume input strings are ASCII or UTF-8 Unicode.
It is not possible to build a version of PCRE that supports both
EBCDIC and UTF-8.])
fi
# Platform specific issues
NO_UNDEFINED=
EXPORT_ALL_SYMBOLS=
case $host_os in
cygwin* | mingw* )
if test X"$enable_shared" = Xyes; then
NO_UNDEFINED="-no-undefined"
EXPORT_ALL_SYMBOLS="-Wl,--export-all-symbols"
fi
;;
esac
# The extra LDFLAGS for each particular library
# (Note: The libpcre*_version bits are m4 variables, assigned above)
EXTRA_LIBPCRE_LDFLAGS="$EXTRA_LIBPCRE_LDFLAGS \
$NO_UNDEFINED -version-info libpcre_version"
EXTRA_LIBPCREPOSIX_LDFLAGS="$EXTRA_LIBPCREPOSIX_LDFLAGS \
$NO_UNDEFINED -version-info libpcreposix_version"
EXTRA_LIBPCRECPP_LDFLAGS="$EXTRA_LIBPCRECPP_LDFLAGS \
$NO_UNDEFINED -version-info libpcrecpp_version \
$EXPORT_ALL_SYMBOLS"
AC_SUBST(EXTRA_LIBPCRE_LDFLAGS)
AC_SUBST(EXTRA_LIBPCREPOSIX_LDFLAGS)
AC_SUBST(EXTRA_LIBPCRECPP_LDFLAGS)
# When we run 'make distcheck', use these arguments.
DISTCHECK_CONFIGURE_FLAGS="--enable-cpp --enable-unicode-properties"
AC_SUBST(DISTCHECK_CONFIGURE_FLAGS)
# Check that, if --enable-pcregrep-libz or --enable-pcregrep-libbz2 is
# specified, the relevant library is available.
if test "$enable_pcregrep_libz" = "yes"; then
if test "$HAVE_ZLIB_H" != "1"; then
echo "** Cannot --enable-pcregrep-libz because zlib.h was not found"
exit 1
fi
if test "$HAVE_LIBZ" != "1"; then
echo "** Cannot --enable-pcregrep-libz because libz was not found"
exit 1
fi
LIBZ="-lz"
fi
AC_SUBST(LIBZ)
if test "$enable_pcregrep_libbz2" = "yes"; then
if test "$HAVE_BZLIB_H" != "1"; then
echo "** Cannot --enable-pcregrep-libbz2 because bzlib.h was not found"
exit 1
fi
if test "$HAVE_LIBBZ2" != "1"; then
echo "** Cannot --enable-pcregrep-libbz2 because libbz2 was not found"
exit 1
fi
LIBBZ2="-lbz2"
fi
AC_SUBST(LIBBZ2)
# Similarly for --enable-pcretest-libreadline: both readline headers and the
# readline library itself must have been found by the earlier probes, which
# set HAVE_READLINE_H, HAVE_HISTORY_H and HAVE_LIB_READLINE to 1 on success.
if test "$enable_pcretest_libreadline" = "yes"; then
  if test "$HAVE_READLINE_H" != "1"; then
    echo "** Cannot --enable-pcretest-libreadline because readline/readline.h was not found."
    exit 1
  fi
  if test "$HAVE_HISTORY_H" != "1"; then
    echo "** Cannot --enable-pcretest-libreadline because readline/history.h was not found."
    exit 1
  fi
  # Fail here rather than at link time when only the headers are installed.
  if test "$HAVE_LIB_READLINE" != "1"; then
    echo "** Cannot --enable-pcretest-libreadline because libreadline was not found."
    exit 1
  fi
  LIBREADLINE="-lreadline"
fi
AC_SUBST(LIBREADLINE)
# Produce these files, in addition to config.h.
AC_CONFIG_FILES(
Makefile
libpcre.pc
libpcrecpp.pc
pcre-config
pcre.h
pcre_stringpiece.h
pcrecpparg.h
)
# Make the generated script files executable.
AC_CONFIG_COMMANDS([script-chmod], [chmod a+x pcre-config])
# Make sure that pcre_chartables.c is removed in case the method for
# creating it was changed by reconfiguration.
AC_CONFIG_COMMANDS([delete-old-chartables], [rm -f pcre_chartables.c])
AC_OUTPUT
# Print out a nice little message after configure is run displaying your
# chosen options.
cat <<EOF
$PACKAGE-$VERSION configuration summary:
Install prefix .................. : ${prefix}
C preprocessor .................. : ${CPP}
C compiler ...................... : ${CC}
C++ preprocessor ................ : ${CXXCPP}
C++ compiler .................... : ${CXX}
Linker .......................... : ${LD}
C preprocessor flags ............ : ${CPPFLAGS}
C compiler flags ................ : ${CFLAGS}
C++ compiler flags .............. : ${CXXFLAGS}
Linker flags .................... : ${LDFLAGS}
Extra libraries ................. : ${LIBS}
Build C++ library ............... : ${enable_cpp}
Enable UTF-8 support ............ : ${enable_utf8}
Unicode properties .............. : ${enable_unicode_properties}
Newline char/sequence ........... : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
EBCDIC coding ................... : ${enable_ebcdic}
Rebuild char tables ............. : ${enable_rebuild_chartables}
Use stack recursion ............. : ${enable_stack_for_recursion}
POSIX mem threshold ............. : ${with_posix_malloc_threshold}
Internal link size .............. : ${with_link_size}
Match limit ..................... : ${with_match_limit}
Match limit recursion ........... : ${with_match_limit_recursion}
Build shared libs ............... : ${enable_shared}
Build static libs ............... : ${enable_static}
Link pcregrep with libz ......... : ${enable_pcregrep_libz}
Link pcregrep with libbz2 ....... : ${enable_pcregrep_libbz2}
Link pcretest with libreadline .. : ${enable_pcretest_libreadline}
EOF
dnl end configure.ac

View File

@ -1,4 +0,0 @@
#! /bin/sh
# Wrapper around the configure script that lives next to this file.
# It forwards the caller's arguments and appends a fixed set of options
# (--disable-shared --with-pic --disable-cpp).

# Directory holding this script; fall back to "." if dirname fails.
# "$0" is quoted so that a path containing whitespace or glob characters
# is passed to dirname as a single argument.
srcpath=$(dirname "$0" 2>/dev/null) || srcpath="."

# Quote the command path for the same reason; "$@" keeps every caller
# argument intact.
"$srcpath/configure" "$@" --disable-shared --with-pic --disable-cpp

View File

@ -1,589 +0,0 @@
#! /bin/sh
# depcomp - compile a program generating dependencies as side-effects
scriptversion=2007-03-29.01
# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007 Free Software
# Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.
# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
case $1 in
'')
echo "$0: No command. Try \`$0 --help' for more information." 1>&2
exit 1;
;;
-h | --h*)
cat <<\EOF
Usage: depcomp [--help] [--version] PROGRAM [ARGS]
Run PROGRAMS ARGS to compile a file, generating dependencies
as side-effects.
Environment variables:
depmode Dependency tracking mode.
source Source file read by `PROGRAMS ARGS'.
object Object file output by `PROGRAMS ARGS'.
DEPDIR directory where to store dependencies.
depfile Dependency file to output.
tmpdepfile Temporary file to use when outputing dependencies.
libtool Whether libtool is used (yes/no).
Report bugs to <bug-automake@gnu.org>.
EOF
exit $?
;;
-v | --v*)
echo "depcomp $scriptversion"
exit $?
;;
esac
if test -z "$depmode" || test -z "$source" || test -z "$object"; then
echo "depcomp: Variables source, object and depmode must be set" 1>&2
exit 1
fi
# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
depfile=${depfile-`echo "$object" |
sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
rm -f "$tmpdepfile"
# Some modes work just like other modes, but use different flags. We
# parameterize here, but still list the modes in the big case below,
# to make depend.m4 easier to write. Note that we *cannot* use a case
# here, because this file can only contain one case statement.
if test "$depmode" = hp; then
# HP compiler uses -M and no extra arg.
gccflag=-M
depmode=gcc
fi
if test "$depmode" = dashXmstdout; then
# This is just like dashmstdout with a different argument.
dashmflag=-xM
depmode=dashmstdout
fi
case "$depmode" in
gcc3)
## gcc 3 implements dependency tracking that does exactly what
## we want. Yay! Note: for some reason libtool 1.4 doesn't like
## it if -MD -MP comes after the -MF stuff. Hmm.
## Unfortunately, FreeBSD c89 acceptance of flags depends upon
## the command line argument order; so add the flags where they
## appear in depend2.am. Note that the slowdown incurred here
## affects only configure: in makefiles, %FASTDEP% shortcuts this.
for arg
do
case $arg in
-c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
*) set fnord "$@" "$arg" ;;
esac
shift # fnord
shift # $arg
done
"$@"
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile"
exit $stat
fi
mv "$tmpdepfile" "$depfile"
;;
gcc)
## There are various ways to get dependency output from gcc. Here's
## why we pick this rather obscure method:
## - Don't want to use -MD because we'd like the dependencies to end
## up in a subdir. Having to rename by hand is ugly.
## (We might end up doing this anyway to support other compilers.)
## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
## -MM, not -M (despite what the docs say).
## - Using -M directly means running the compiler twice (even worse
## than renaming).
if test -z "$gccflag"; then
gccflag=-MD,
fi
"$@" -Wp,"$gccflag$tmpdepfile"
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile"
exit $stat
fi
rm -f "$depfile"
echo "$object : \\" > "$depfile"
alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
## The second -e expression handles DOS-style file names with drive letters.
sed -e 's/^[^:]*: / /' \
-e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
## This next piece of magic avoids the `deleted header file' problem.
## The problem is that when a header file which appears in a .P file
## is deleted, the dependency causes make to die (because there is
## typically no way to rebuild the header). We avoid this by adding
## dummy dependencies for each header file. Too bad gcc doesn't do
## this for us directly.
tr ' ' '
' < "$tmpdepfile" |
## Some versions of gcc put a space before the `:'. On the theory
## that the space means something, we add a space to the output as
## well.
## Some versions of the HPUX 10.20 sed can't process this invocation
## correctly. Breaking it into two sed invocations is a workaround.
sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
rm -f "$tmpdepfile"
;;
hp)
# This case exists only to let depend.m4 do its work. It works by
# looking at the text of this script. This case will never be run,
# since it is checked for above.
exit 1
;;
sgi)
  # Run the compiler with -MDupdate so that it writes dependency
  # information to $tmpdepfile as a side effect.  When invoked through
  # libtool the flag is passed as a single -Wp,... argument.
  if test "$libtool" = yes; then
    "$@" "-Wp,-MDupdate,$tmpdepfile"
  else
    "$@" -MDupdate "$tmpdepfile"
  fi
  stat=$?
  if test $stat -eq 0; then :
  else
    # Compilation failed: discard the partial output and propagate the
    # compiler's exit status.
    rm -f "$tmpdepfile"
    exit $stat
  fi
  rm -f "$depfile"
  if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files
    echo "$object : \\" > "$depfile"
    # Clip off the initial element (the dependent). Don't try to be
    # clever and replace this with sed code, as IRIX sed won't handle
    # lines with more than a fixed number of characters (4096 in
    # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines;
    # the IRIX cc adds comments like `#:fec' to the end of the
    # dependency line.
    # NOTE: "$depfile" is quoted in every redirection below so that a
    # dependency path containing whitespace is not word-split.
    tr ' ' '
' < "$tmpdepfile" \
    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
    tr '
' ' ' >> "$depfile"
    echo >> "$depfile"
    # The second pass generates a dummy entry for each header file.
    tr ' ' '
' < "$tmpdepfile" \
    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
    >> "$depfile"
  else
    # The sourcefile does not contain any dependencies, so just
    # store a dummy comment line, to avoid errors with the Makefile
    # "include basename.Plo" scheme.
    echo "#dummy" > "$depfile"
  fi
  rm -f "$tmpdepfile"
  ;;
aix)
# The C for AIX Compiler uses -M and outputs the dependencies
# in a .u file. In older versions, this file always lives in the
# current directory. Also, the AIX compiler puts `$object:' at the
# start of each line; $object doesn't have directory information.
# Version 6 uses the directory in both cases.
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
test "x$dir" = "x$object" && dir=
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
if test "$libtool" = yes; then
tmpdepfile1=$dir$base.u
tmpdepfile2=$base.u
tmpdepfile3=$dir.libs/$base.u
"$@" -Wc,-M
else
tmpdepfile1=$dir$base.u
tmpdepfile2=$dir$base.u
tmpdepfile3=$dir$base.u
"$@" -M
fi
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
exit $stat
fi
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
do
test -f "$tmpdepfile" && break
done
if test -f "$tmpdepfile"; then
# Each line is of the form `foo.o: dependent.h'.
# Do two passes, one to just change these to
# `$object: dependent.h' and one to simply `dependent.h:'.
sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
# That's a tab and a space in the [].
sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
else
# The sourcefile does not contain any dependencies, so just
# store a dummy comment line, to avoid errors with the Makefile
# "include basename.Plo" scheme.
echo "#dummy" > "$depfile"
fi
rm -f "$tmpdepfile"
;;
icc)
# Intel's C compiler understands `-MD -MF file'. However on
# icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c
# ICC 7.0 will fill foo.d with something like
# foo.o: sub/foo.c
# foo.o: sub/foo.h
# which is wrong. We want:
# sub/foo.o: sub/foo.c
# sub/foo.o: sub/foo.h
# sub/foo.c:
# sub/foo.h:
# ICC 7.1 will output
# foo.o: sub/foo.c sub/foo.h
# and will wrap long lines using \ :
# foo.o: sub/foo.c ... \
# sub/foo.h ... \
# ...
"$@" -MD -MF "$tmpdepfile"
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile"
exit $stat
fi
rm -f "$depfile"
# Each line is of the form `foo.o: dependent.h',
# or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
# Do two passes, one to just change these to
# `$object: dependent.h' and one to simply `dependent.h:'.
sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
# Some versions of the HPUX 10.20 sed can't process this invocation
# correctly. Breaking it into two sed invocations is a workaround.
sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" |
sed -e 's/$/ :/' >> "$depfile"
rm -f "$tmpdepfile"
;;
hp2)
# The "hp" stanza above does not work with aCC (C++) and HP's ia64
# compilers, which have integrated preprocessors. The correct option
# to use with these is +Maked; it writes dependencies to a file named
# 'foo.d', which lands next to the object file, wherever that
# happens to be.
# Much of this is similar to the tru64 case; see comments there.
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
test "x$dir" = "x$object" && dir=
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
if test "$libtool" = yes; then
tmpdepfile1=$dir$base.d
tmpdepfile2=$dir.libs/$base.d
"$@" -Wc,+Maked
else
tmpdepfile1=$dir$base.d
tmpdepfile2=$dir$base.d
"$@" +Maked
fi
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile1" "$tmpdepfile2"
exit $stat
fi
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
do
test -f "$tmpdepfile" && break
done
if test -f "$tmpdepfile"; then
sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
# Add `dependent.h:' lines.
sed -ne '2,${; s/^ *//; s/ \\*$//; s/$/:/; p;}' "$tmpdepfile" >> "$depfile"
else
echo "#dummy" > "$depfile"
fi
rm -f "$tmpdepfile" "$tmpdepfile2"
;;
tru64)
# The Tru64 compiler uses -MD to generate dependencies as a side
# effect. `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'.
# At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
# dependencies in `foo.d' instead, so we check for that too.
# Subdirectories are respected.
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
test "x$dir" = "x$object" && dir=
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
if test "$libtool" = yes; then
# With Tru64 cc, shared objects can also be used to make a
# static library. This mechanism is used in libtool 1.4 series to
# handle both shared and static libraries in a single compilation.
# With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
#
# With libtool 1.5 this exception was removed, and libtool now
# generates 2 separate objects for the 2 libraries. These two
# compilations output dependencies in $dir.libs/$base.o.d and
# in $dir$base.o.d. We have to check for both files, because
# one of the two compilations can be disabled. We should prefer
# $dir$base.o.d over $dir.libs/$base.o.d because the latter is
# automatically cleaned when .libs/ is deleted, while ignoring
# the former would cause a distcleancheck panic.
tmpdepfile1=$dir.libs/$base.lo.d # libtool 1.4
tmpdepfile2=$dir$base.o.d # libtool 1.5
tmpdepfile3=$dir.libs/$base.o.d # libtool 1.5
tmpdepfile4=$dir.libs/$base.d # Compaq CCC V6.2-504
"$@" -Wc,-MD
else
tmpdepfile1=$dir$base.o.d
tmpdepfile2=$dir$base.d
tmpdepfile3=$dir$base.d
tmpdepfile4=$dir$base.d
"$@" -MD
fi
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
exit $stat
fi
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
do
test -f "$tmpdepfile" && break
done
if test -f "$tmpdepfile"; then
sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
# That's a tab and a space in the [].
sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
else
echo "#dummy" > "$depfile"
fi
rm -f "$tmpdepfile"
;;
#nosideeffect)
# This comment above is used by automake to tell side-effect
# dependency tracking mechanisms from slower ones.
dashmstdout)
  # Important note: in order to support this mode, a compiler *must*
  # always write the preprocessed file to stdout, regardless of -o.
  # First do the real compilation; give up immediately if it fails.
  "$@" || exit $?
  # Remove the call to Libtool.
  # "$1" is quoted and X-prefixed so that an empty argument, an argument
  # containing whitespace, or one that looks like a test operator cannot
  # make `test' fail and end the loop early.  The $# guard stops the
  # loop once the arguments are exhausted.
  if test "$libtool" = yes; then
    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
      shift
    done
    shift
  fi
  # Remove `-o $object'.
  # The loop rebuilds the positional parameters in place: every kept
  # argument is appended to the end while the one just examined is
  # shifted off the front.
  IFS=" "
  for arg
  do
    case $arg in
    -o)
      shift
      ;;
    $object)
      shift
      ;;
    *)
      set fnord "$@" "$arg"
      shift # fnord
      shift # $arg
      ;;
    esac
  done
  test -z "$dashmflag" && dashmflag=-M
  # Require at least two characters before searching for `:'
  # in the target name. This is to cope with DOS-style filenames:
  # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise.
  "$@" $dashmflag |
    sed 's:^[ ]*[^: ][^:][^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile"
  rm -f "$depfile"
  cat < "$tmpdepfile" > "$depfile"
  # Append one `file :' line per dependency to $depfile.
  tr ' ' '
' < "$tmpdepfile" | \
## Some versions of the HPUX 10.20 sed can't process this invocation
## correctly. Breaking it into two sed invocations is a workaround.
    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
  rm -f "$tmpdepfile"
  ;;
dashXmstdout)
# This case only exists to satisfy depend.m4. It is never actually
# run, as this mode is specially recognized in the preamble.
exit 1
;;
makedepend)
  # Do the real compilation first; give up immediately if it fails.
  "$@" || exit $?
  # Remove any Libtool call
  # "$1" is quoted and X-prefixed so that an empty argument, an argument
  # containing whitespace, or one that looks like a test operator cannot
  # make `test' fail and end the loop early.  The $# guard stops the
  # loop once the arguments are exhausted.
  if test "$libtool" = yes; then
    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
      shift
    done
    shift
  fi
  # X makedepend
  shift
  # Rebuild the positional parameters, keeping only what makedepend
  # should see.  They are emptied on the first iteration; the `for'
  # word list was already expanded, so iteration is unaffected.
  cleared=no
  for arg in "$@"; do
    case $cleared in
    no)
      set ""; shift
      cleared=yes ;;
    esac
    case "$arg" in
    -D*|-I*)
      set fnord "$@" "$arg"; shift ;;
    # Strip any option that makedepend may not understand. Remove
    # the object too, otherwise makedepend will parse it as a source file.
    -*|$object)
      ;;
    *)
      set fnord "$@" "$arg"; shift ;;
    esac
  done
  # Object suffix (text from the last `.' onward).  "$object" is quoted
  # so that whitespace or glob characters in the path survive the echo.
  obj_suffix=`echo "$object" | sed 's/^.*\././'`
  touch "$tmpdepfile"
  ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
  rm -f "$depfile"
  cat < "$tmpdepfile" > "$depfile"
  # Skip the first two lines of the makedepend output, then append one
  # `file :' line per remaining word to $depfile.
  sed '1,2d' "$tmpdepfile" | tr ' ' '
' | \
## Some versions of the HPUX 10.20 sed can't process this invocation
## correctly. Breaking it into two sed invocations is a workaround.
    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
  rm -f "$tmpdepfile" "$tmpdepfile".bak
  ;;
cpp)
  # Important note: in order to support this mode, a compiler *must*
  # always write the preprocessed file to stdout.
  # First do the real compilation; give up immediately if it fails.
  "$@" || exit $?
  # Remove the call to Libtool.
  # "$1" is quoted and X-prefixed so that an empty argument, an argument
  # containing whitespace, or one that looks like a test operator cannot
  # make `test' fail and end the loop early.  The $# guard stops the
  # loop once the arguments are exhausted.
  if test "$libtool" = yes; then
    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
      shift
    done
    shift
  fi
  # Remove `-o $object'.
  # The loop rebuilds the positional parameters in place: every kept
  # argument is appended to the end while the one just examined is
  # shifted off the front.
  IFS=" "
  for arg
  do
    case $arg in
    -o)
      shift
      ;;
    $object)
      shift
      ;;
    *)
      set fnord "$@" "$arg"
      shift # fnord
      shift # $arg
      ;;
    esac
  done
  # Preprocess with -E and collect the file names from the `# N "file"'
  # and `#line N "file"' markers, one per line with a trailing
  # backslash; the last sed drops the backslash from the final line.
  "$@" -E |
    sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
    -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
    sed '$ s: \\$::' > "$tmpdepfile"
  rm -f "$depfile"
  echo "$object : \\" > "$depfile"
  cat < "$tmpdepfile" >> "$depfile"
  # Append one `file :' line per collected file name.
  sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
  rm -f "$tmpdepfile"
  ;;
msvisualcpp)
# Important note: in order to support this mode, a compiler *must*
# always write the preprocessed file to stdout, regardless of -o,
# because we must use -o when running libtool.
"$@" || exit $?
IFS=" "
for arg
do
case "$arg" in
"-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
set fnord "$@"
shift
shift
;;
*)
set fnord "$@" "$arg"
shift
shift
;;
esac
done
"$@" -E |
sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::echo "`cygpath -u \\"\1\\"`":p' | sort | uniq > "$tmpdepfile"
rm -f "$depfile"
echo "$object : \\" > "$depfile"
. "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s:: \1 \\:p' >> "$depfile"
echo " " >> "$depfile"
. "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::\1\::p' >> "$depfile"
rm -f "$tmpdepfile"
;;
none)
exec "$@"
;;
*)
echo "Unknown depmode $depmode" 1>&2
exit 1
;;
esac
exit 0
# Local Variables:
# mode: shell-script
# sh-indentation: 2
# eval: (add-hook 'write-file-hooks 'time-stamp)
# time-stamp-start: "scriptversion="
# time-stamp-format: "%:y-%02m-%02d.%02H"
# time-stamp-end: "$"
# End:

View File

@ -1,199 +0,0 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This is a freestanding support program to generate a file containing
character tables for PCRE. The tables are built according to the current
locale. Now that pcre_maketables is a function visible to the outside world, we
make use of its code from here in order to be consistent. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <locale.h>
#include "pcre_internal.h"
#define DFTABLES /* pcre_maketables.c notices this */
#include "pcre_maketables.c"
/* Entry point for dftables.

Usage: dftables [-L] <output-file>

Builds the character tables with pcre_maketables() and writes them to the
named file as C source defining _pcre_default_tables[].

Arguments:
  argc, argv   the usual command line; an optional leading "-L" selects the
               locale from the environment, and exactly one filename must
               follow.

Returns:
  0 on success; 1 if the filename is missing, the tables cannot be built,
  the output file cannot be opened, or writing/closing the file fails. */

int main(int argc, char **argv)
{
  FILE *f;
  int i = 1;
  int write_failed;
  const char *outname;
  const unsigned char *tables;
  const unsigned char *base_of_tables;

  /* By default, the default C locale is used rather than what the building
  user happens to have set. However, if the -L option is given, set the locale
  from the LC_xxx environment variables. */

  if (argc > 1 && strcmp(argv[1], "-L") == 0)
    {
    setlocale(LC_ALL, ""); /* Set from environment variables */
    i++;
    }

  if (argc < i + 1)
    {
    fprintf(stderr, "dftables: one filename argument is required\n");
    return 1;
    }

  /* Remember the output name now: i is reused as a loop counter below, and
  argv[1] is "-L" rather than the filename when that option was given. */

  outname = argv[i];

  tables = pcre_maketables();
  if (tables == NULL)
    {
    /* Defensive check; presumably this indicates an allocation failure
    inside pcre_maketables() - confirm against pcre_maketables.c. */
    fprintf(stderr, "dftables: failed to build the character tables\n");
    return 1;
    }
  base_of_tables = tables;

  f = fopen(outname, "wb");
  if (f == NULL)
    {
    fprintf(stderr, "dftables: failed to open %s for writing\n", outname);
    free((void *)base_of_tables);
    return 1;
    }

  /* There are several fprintf() calls here, because gcc in pedantic mode
  complains about the very long string otherwise. */

  fprintf(f,
    "/*************************************************\n"
    "* Perl-Compatible Regular Expressions *\n"
    "*************************************************/\n\n"
    "/* This file was automatically written by the dftables auxiliary\n"
    "program. It contains character tables that are used when no external\n"
    "tables are passed to PCRE by the application that calls it. The tables\n"
    "are used only for characters whose code values are less than 256.\n\n");
  fprintf(f,
    "The following #includes are present because without them gcc 4.x may remove\n"
    "the array definition from the final binary if PCRE is built into a static\n"
    "library and dead code stripping is activated. This leads to link errors.\n"
    "Pulling in the header ensures that the array gets flagged as \"someone\n"
    "outside this compilation unit might reference this\" and so it will always\n"
    "be supplied to the linker. */\n\n"
    "#ifdef HAVE_CONFIG_H\n"
    "#include \"config.h\"\n"
    "#endif\n\n"
    "#include \"pcre_internal.h\"\n\n");
  fprintf(f,
    "const unsigned char _pcre_default_tables[] = {\n\n"
    "/* This table is a lower casing table. */\n\n");

  /* First 256 bytes: emitted in decimal, eight values per line. */

  fprintf(f, " ");
  for (i = 0; i < 256; i++)
    {
    if ((i & 7) == 0 && i != 0) fprintf(f, "\n ");
    fprintf(f, "%3d", *tables++);
    if (i != 255) fprintf(f, ",");
    }
  fprintf(f, ",\n\n");

  /* Next 256 bytes: same layout. */

  fprintf(f, "/* This table is a case flipping table. */\n\n");
  fprintf(f, " ");
  for (i = 0; i < 256; i++)
    {
    if ((i & 7) == 0 && i != 0) fprintf(f, "\n ");
    fprintf(f, "%3d", *tables++);
    if (i != 255) fprintf(f, ",");
    }
  fprintf(f, ",\n\n");

  /* Next cbit_length bytes: emitted in hex, eight per line, with an extra
  blank line after every 32 bytes. */

  fprintf(f,
    "/* This table contains bit maps for various character classes.\n"
    "Each map is 32 bytes long and the bits run from the least\n"
    "significant end of each byte. The classes that have their own\n"
    "maps are: space, xdigit, digit, upper, lower, word, graph\n"
    "print, punct, and cntrl. Other classes are built from combinations. */\n\n");
  fprintf(f, " ");
  for (i = 0; i < cbit_length; i++)
    {
    if ((i & 7) == 0 && i != 0)
      {
      if ((i & 31) == 0) fprintf(f, "\n");
      fprintf(f, "\n ");
      }
    fprintf(f, "0x%02x", *tables++);
    if (i != cbit_length - 1) fprintf(f, ",");
    }
  fprintf(f, ",\n\n");

  /* Final 256 bytes: emitted in hex, eight per line; each completed line is
  followed by a comment showing the range of character codes it covers. */

  fprintf(f,
    "/* This table identifies various classes of character by individual bits:\n"
    " 0x%02x white space character\n"
    " 0x%02x letter\n"
    " 0x%02x decimal digit\n"
    " 0x%02x hexadecimal digit\n"
    " 0x%02x alphanumeric or '_'\n"
    " 0x%02x regular expression metacharacter or binary zero\n*/\n\n",
    ctype_space, ctype_letter, ctype_digit, ctype_xdigit, ctype_word,
    ctype_meta);
  fprintf(f, " ");
  for (i = 0; i < 256; i++)
    {
    if ((i & 7) == 0 && i != 0)
      {
      fprintf(f, " /* ");
      if (isprint(i-8)) fprintf(f, " %c -", i-8);
        else fprintf(f, "%3d-", i-8);
      if (isprint(i-1)) fprintf(f, " %c ", i-1);
        else fprintf(f, "%3d", i-1);
      fprintf(f, " */\n ");
      }
    fprintf(f, "0x%02x", *tables++);
    if (i != 255) fprintf(f, ",");
    }

  /* Here i == 256, so this closes the array and writes the range comment
  for the last line of eight values (248-255). */

  fprintf(f, "};/* ");
  if (isprint(i-8)) fprintf(f, " %c -", i-8);
    else fprintf(f, "%3d-", i-8);
  if (isprint(i-1)) fprintf(f, " %c ", i-1);
    else fprintf(f, "%3d", i-1);
  fprintf(f, " */\n\n/* End of pcre_chartables.c */\n");

  /* Buffered write errors may only surface at fclose(), so check both the
  stream's error flag and the close result before claiming success. */

  write_failed = ferror(f);
  if (fclose(f) != 0) write_failed = 1;
  free((void *)base_of_tables);

  if (write_failed)
    {
    fprintf(stderr, "dftables: error while writing %s\n", outname);
    return 1;
    }
  return 0;
}
/* End of dftables.c */

View File

@ -1,348 +0,0 @@
Technical Notes about PCRE
--------------------------
These are very rough technical notes that record potentially useful information
about PCRE internals.
Historical note 1
-----------------
Many years ago I implemented some regular expression functions to an algorithm
suggested by Martin Richards. These were not Unix-like in form, and were quite
restricted in what they could do by comparison with Perl. The interesting part
about the algorithm was that the amount of space required to hold the compiled
form of an expression was known in advance. The code to apply an expression did
not operate by backtracking, as the original Henry Spencer code and current
Perl code does, but instead checked all possibilities simultaneously by keeping
a list of current states and checking all of them as it advanced through the
subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
algorithm". When the pattern was all used up, all remaining states were
possible matches, and the one matching the longest subset of the subject string
was chosen. This did not necessarily maximize the individual wild portions of
the pattern, as is expected in Unix and Perl-style regular expressions.
Historical note 2
-----------------
By contrast, the code originally written by Henry Spencer (which was
subsequently heavily modified for Perl) compiles the expression twice: once in
a dummy mode in order to find out how much store will be needed, and then for
real. (The Perl version probably doesn't do this any more; I'm talking about
the original library.) The execution function operates by backtracking and
maximizing (or, optionally, minimizing in Perl) the amount of the subject that
matches individual wild portions of the pattern. This is an "NFA algorithm" in
Friedl's terminology.
OK, here's the real stuff
-------------------------
For the set of functions that form the "basic" PCRE library (which are
unrelated to those mentioned above), I tried at first to invent an algorithm
that used an amount of store bounded by a multiple of the number of characters
in the pattern, to save on compiling time. However, because of the greater
complexity in Perl regular expressions, I couldn't do this. In any case, a
first pass through the pattern is needed, for a number of reasons. PCRE works
by running a very degenerate first pass to calculate a maximum store size, and
then a second pass to do the real compile - which may use a bit less than the
predicted amount of store. The idea is that this is going to turn out faster
because the first pass is degenerate and the second pass can just store stuff
straight into the vector, which it knows is big enough. It does make the
compiling functions bigger, of course, but they have become quite big anyway to
handle all the Perl stuff.
Traditional matching function
-----------------------------
The "traditional", and original, matching function is called pcre_exec(), and
it implements an NFA algorithm, similar to the original Henry Spencer algorithm
and the way that Perl works. Not surprising, since it is intended to be as
compatible with Perl as possible. This is the function most users of PCRE will
use most of the time.
Supplementary matching function
-------------------------------
From PCRE 6.0, there is also a supplementary matching function called
pcre_dfa_exec(). This implements a DFA matching algorithm that searches
simultaneously for all possible matches that start at one point in the subject
string. (Going back to my roots: see Historical Note 1 above.) This function
interprets the same compiled pattern data as pcre_exec(); however, not all the
facilities are available, and those that are do not always work in quite the
same way. See the user documentation for details.
Format of compiled patterns
---------------------------
The compiled form of a pattern is a vector of bytes, containing items of
variable length. The first byte in an item is an opcode, and the length of the
item is either implicit in the opcode or contained in the data bytes that
follow it.
In many cases below "two-byte" data values are specified. This is in fact just
a default. PCRE can be compiled to use 3-byte or 4-byte values (impairing the
performance). This is necessary only when patterns whose compiled length is
greater than 64K are going to be processed. In this description, we assume the
"normal" compilation options.
A list of all the opcodes follows:
Opcodes with no following data
------------------------------
These items are all just one byte long
OP_END end of pattern
OP_ANY match any character
OP_ANYBYTE match any single byte, even in UTF-8 mode
OP_SOD match start of data: \A
OP_SOM start of match (subject + offset): \G
OP_CIRC ^ (start of data, or after \n in multiline)
OP_NOT_WORD_BOUNDARY \B
OP_WORD_BOUNDARY \b
OP_NOT_DIGIT \D
OP_DIGIT \d
OP_NOT_WHITESPACE \S
OP_WHITESPACE \s
OP_NOT_WORDCHAR \W
OP_WORDCHAR \w
OP_EODN match end of data or \n at end: \Z
OP_EOD match end of data: \z
OP_DOLL $ (end of data, or before \n in multiline)
OP_EXTUNI match an extended Unicode character
Repeating single characters
---------------------------
The common repeats (*, +, ?) when applied to a single character use the
following opcodes:
OP_STAR
OP_MINSTAR
OP_PLUS
OP_MINPLUS
OP_QUERY
OP_MINQUERY
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
Those with "MIN" in their name are the minimizing versions. Each is followed by
the character that is to be repeated. Other repeats make use of
OP_UPTO
OP_MINUPTO
OP_EXACT
which are followed by a two-byte count (most significant first) and the
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
OP_UPTO (or OP_MINUPTO).
Repeating character types
-------------------------
Repeats of things like \d are done exactly as for single characters, except
that instead of a character, the opcode for the type is stored in the data
byte. The opcodes are:
OP_TYPESTAR
OP_TYPEMINSTAR
OP_TYPEPLUS
OP_TYPEMINPLUS
OP_TYPEQUERY
OP_TYPEMINQUERY
OP_TYPEUPTO
OP_TYPEMINUPTO
OP_TYPEEXACT
Match by Unicode property
-------------------------
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
character by testing its Unicode property (the \p and \P escape sequences).
Each is followed by two bytes that encode the desired property as a type and a
value.
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
three bytes: OP_PROP or OP_NOTPROP and then the desired property type and
value.
Matching literal characters
---------------------------
The OP_CHAR opcode is followed by a single character that is to be matched
casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
character may be more than one byte long. (Earlier versions of PCRE used
multi-character strings, but this was changed to allow some new features to be
added.)
Character classes
-----------------
If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
class, and OP_NOT for a negative one (that is, for something like [^a]).
However, in UTF-8 mode, the use of OP_NOT applies only to characters with
values < 128, because OP_NOT is confined to single bytes.
Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
negated, single-character class. The normal ones (OP_STAR etc.) are used for a
repeated positive single-character class.
When there's more than one character in a class and all the characters are less
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
one. In either case, the opcode is followed by a 32-byte bit map containing a 1
bit for every character that is acceptable. The bits are counted from the least
significant end of each byte.
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
subject characters with values greater than 255 can be handled correctly. For
OP_CLASS they don't match, whereas for OP_NCLASS they do.
For classes containing characters with values > 255, OP_XCLASS is used. It
optionally uses a bit map (if any characters lie within it), followed by a list
of pairs and single characters. There is a flag character that indicates
whether it's a positive or a negative class.
Back references
---------------
OP_REF is followed by two bytes containing the reference number.
Repeating character classes and back references
-----------------------------------------------
Single-character classes are handled specially (see above). This applies to
OP_CLASS and OP_REF. In both cases, the repeat information follows the base
item. The matching code looks at the following opcode to see if it is one of
OP_CRSTAR
OP_CRMINSTAR
OP_CRPLUS
OP_CRMINPLUS
OP_CRQUERY
OP_CRMINQUERY
OP_CRRANGE
OP_CRMINRANGE
All but the last two are just single-byte items. The others are followed by
four bytes of data, comprising the minimum and maximum repeat counts.
Brackets and alternation
------------------------
A pair of non-capturing (round) brackets is wrapped round each expression at
compile time, so alternation always happens in the context of brackets.
Non-capturing brackets use the opcode OP_BRA, while capturing brackets use
OP_BRA+1, OP_BRA+2, etc. [Note for North Americans: "bracket" to some English
speakers, including myself, can be round, square, curly, or pointy. Hence this
usage.]
Originally PCRE was limited to 99 capturing brackets (so as not to use up all
the opcodes). From release 3.5, there is no limit. What happens is that the
first ones, up to EXTRACT_BASIC_MAX are handled with separate opcodes, as
above. If there are more, the opcode is set to EXTRACT_BASIC_MAX+1, and the
first operation in the bracket is OP_BRANUMBER, followed by a 2-byte bracket
number. This opcode is ignored while matching, but is fished out when handling
the bracket itself. (They could have all been done like this, but I was making
minimal changes.)
A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
next alternative OP_ALT or, if there aren't any branches, to the matching
OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
the next one, or to the OP_KET opcode.
OP_KET is used for subpatterns that do not repeat indefinitely, while
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
positive number) the offset back to the matching OP_BRA opcode.
If a subpattern is quantified such that it is permitted to match zero times, it
is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
opcodes which tell the matcher that skipping this subpattern entirely is a
valid branch.
A subpattern with an indefinite maximum repetition is replicated in the
compiled data its minimum number of times (or once with OP_BRAZERO if the
minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX
as appropriate.
A subpattern with a bounded maximum repetition is replicated in a nested
fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
before each replication after the minimum, so that, for example, (abc){2,5} is
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?.
Assertions
----------
Forward assertions are just like other subpatterns, but starting with one of
the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
is OP_REVERSE, followed by a two byte count of the number of characters to move
back the pointer in the subject string. When operating in UTF-8 mode, the count
is a character count rather than a byte count. A separate count is present in
each alternative of a lookbehind assertion, allowing them to have different
fixed lengths.
Once-only subpatterns
---------------------
These are also just like other subpatterns, but they start with the opcode
OP_ONCE.
Conditional subpatterns
-----------------------
These are like other subpatterns, but they start with the opcode OP_COND. If
the condition is a back reference, this is stored at the start of the
subpattern using the opcode OP_CREF followed by two bytes containing the
reference number. If the condition is "in recursion" (coded as "(?(R)"), the
same scheme is used, with a "reference number" of 0xffff. Otherwise, a
conditional subpattern always starts with one of the assertions.
Recursion
---------
Recursion either matches the current regex, or some subexpression. The opcode
OP_RECURSE is followed by a value which is the offset to the starting bracket
from the start of the whole pattern. From release 6.5, OP_RECURSE is
automatically wrapped inside OP_ONCE brackets (because otherwise some patterns
broke it). OP_RECURSE is also used for "subroutine" calls, even though they
are not strictly a recursion.
Callout
-------
OP_CALLOUT is followed by one byte of data that holds a callout number in the
range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
cases there follows a two-byte value giving the offset in the pattern to the
start of the following item, and another two-byte item giving the length of the
next item.
Changing options
----------------
If any of the /i, /m, or /s options are changed within a pattern, an OP_OPT
opcode is compiled, followed by one byte containing the new settings of these
flags. If there are several alternatives, there is an occurrence of OP_OPT at
the start of all those following the first options change, to set appropriate
options for the start of the alternative. Immediately after the end of the
group there is another such item to reset the flags to their previous values. A
change of flag right at the very start of the pattern can be handled entirely
at compile time, and so does not cause anything to be put into the compiled
data.
Philip Hazel
June 2006

View File

@ -1,140 +0,0 @@
<html>
<!-- This is a manually maintained file that is the root of the HTML version of
the PCRE documentation. When the HTML documents are built from the man
page versions, the entire doc/html directory is emptied, this file is then
copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
-->
<head>
<title>PCRE specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>Perl-compatible Regular Expressions (PCRE)</h1>
<p>
The HTML documentation for PCRE comprises the following pages:
</p>
<table>
<tr><td><a href="pcre.html">pcre</a></td>
<td>&nbsp;&nbsp;Introductory page</td></tr>
<tr><td><a href="pcre-config.html">pcre-config</a></td>
<td>&nbsp;&nbsp;Information about the installation configuration</td></tr>
<tr><td><a href="pcreapi.html">pcreapi</a></td>
<td>&nbsp;&nbsp;PCRE's native API</td></tr>
<tr><td><a href="pcrebuild.html">pcrebuild</a></td>
<td>&nbsp;&nbsp;Options for building PCRE</td></tr>
<tr><td><a href="pcrecallout.html">pcrecallout</a></td>
<td>&nbsp;&nbsp;The <i>callout</i> facility</td></tr>
<tr><td><a href="pcrecompat.html">pcrecompat</a></td>
<td>&nbsp;&nbsp;Compatibility with Perl</td></tr>
<tr><td><a href="pcrecpp.html">pcrecpp</a></td>
<td>&nbsp;&nbsp;The C++ wrapper for the PCRE library</td></tr>
<tr><td><a href="pcregrep.html">pcregrep</a></td>
<td>&nbsp;&nbsp;The <b>pcregrep</b> command</td></tr>
<tr><td><a href="pcrematching.html">pcrematching</a></td>
<td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
<tr><td><a href="pcrepartial.html">pcrepartial</a></td>
<td>&nbsp;&nbsp;Using PCRE for partial matching</td></tr>
<tr><td><a href="pcrepattern.html">pcrepattern</a></td>
<td>&nbsp;&nbsp;Specification of the regular expressions supported by PCRE</td></tr>
<tr><td><a href="pcreperform.html">pcreperform</a></td>
<td>&nbsp;&nbsp;Some comments on performance</td></tr>
<tr><td><a href="pcreposix.html">pcreposix</a></td>
<td>&nbsp;&nbsp;The POSIX API to the PCRE library</td></tr>
<tr><td><a href="pcreprecompile.html">pcreprecompile</a></td>
<td>&nbsp;&nbsp;How to save and re-use compiled patterns</td></tr>
<tr><td><a href="pcresample.html">pcresample</a></td>
<td>&nbsp;&nbsp;Description of the sample program</td></tr>
<tr><td><a href="pcrestack.html">pcrestack</a></td>
<td>&nbsp;&nbsp;Discussion of PCRE's stack usage</td></tr>
<tr><td><a href="pcresyntax.html">pcresyntax</a></td>
<td>&nbsp;&nbsp;Syntax quick-reference summary</td></tr>
<tr><td><a href="pcretest.html">pcretest</a></td>
<td>&nbsp;&nbsp;The <b>pcretest</b> command for testing PCRE</td></tr>
</table>
<p>
There are also individual pages that summarize the interface for each function
in the library:
</p>
<table>
<tr><td><a href="pcre_compile.html">pcre_compile</a></td>
<td>&nbsp;&nbsp;Compile a regular expression</td></tr>
<tr><td><a href="pcre_compile2.html">pcre_compile2</a></td>
<td>&nbsp;&nbsp;Compile a regular expression (alternate interface)</td></tr>
<tr><td><a href="pcre_config.html">pcre_config</a></td>
<td>&nbsp;&nbsp;Show build-time configuration options</td></tr>
<tr><td><a href="pcre_copy_named_substring.html">pcre_copy_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into given buffer</td></tr>
<tr><td><a href="pcre_copy_substring.html">pcre_copy_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into given buffer</td></tr>
<tr><td><a href="pcre_dfa_exec.html">pcre_dfa_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(DFA algorithm; <i>not</i> Perl compatible)</td></tr>
<tr><td><a href="pcre_exec.html">pcre_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(Perl compatible)</td></tr>
<tr><td><a href="pcre_free_substring.html">pcre_free_substring</a></td>
<td>&nbsp;&nbsp;Free extracted substring</td></tr>
<tr><td><a href="pcre_free_substring_list.html">pcre_free_substring_list</a></td>
<td>&nbsp;&nbsp;Free list of extracted substrings</td></tr>
<tr><td><a href="pcre_fullinfo.html">pcre_fullinfo</a></td>
<td>&nbsp;&nbsp;Extract information about a pattern</td></tr>
<tr><td><a href="pcre_get_named_substring.html">pcre_get_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into new memory</td></tr>
<tr><td><a href="pcre_get_stringnumber.html">pcre_get_stringnumber</a></td>
<td>&nbsp;&nbsp;Convert captured string name to number</td></tr>
<tr><td><a href="pcre_get_substring.html">pcre_get_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into new memory</td></tr>
<tr><td><a href="pcre_get_substring_list.html">pcre_get_substring_list</a></td>
<td>&nbsp;&nbsp;Extract all substrings into new memory</td></tr>
<tr><td><a href="pcre_info.html">pcre_info</a></td>
<td>&nbsp;&nbsp;Obsolete information extraction function</td></tr>
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
<tr><td><a href="pcre_refcount.html">pcre_refcount</a></td>
<td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
<tr><td><a href="pcre_study.html">pcre_study</a></td>
<td>&nbsp;&nbsp;Study a compiled pattern</td></tr>
<tr><td><a href="pcre_version.html">pcre_version</a></td>
<td>&nbsp;&nbsp;Return PCRE version and release date</td></tr>
</table>
</html>

View File

@ -1,88 +0,0 @@
<html>
<head>
<title>pcre-config specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre-config man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
<li><a name="TOC3" href="#SEC3">OPTIONS</a>
<li><a name="TOC4" href="#SEC4">SEE ALSO</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
<b>pcre-config [--prefix] [--exec-prefix] [--version] [--libs]</b>
<b>[--libs-posix] [--cflags] [--cflags-posix]</b>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
<b>pcre-config</b> returns the configuration of the installed PCRE
libraries and the options required to compile a program to use them.
</P>
<br><a name="SEC3" href="#TOC1">OPTIONS</a><br>
<P>
<b>--prefix</b>
Writes the directory prefix used in the PCRE installation for architecture
independent files (<i>/usr</i> on many systems, <i>/usr/local</i> on some
systems) to the standard output.
</P>
<P>
<b>--exec-prefix</b>
Writes the directory prefix used in the PCRE installation for architecture
dependent files (normally the same as <b>--prefix</b>) to the standard output.
</P>
<P>
<b>--version</b>
Writes the version number of the installed PCRE libraries to the standard
output.
</P>
<P>
<b>--libs</b>
Writes to the standard output the command line options required to link
with PCRE (<b>-lpcre</b> on many systems).
</P>
<P>
<b>--libs-posix</b>
Writes to the standard output the command line options required to link with
the PCRE posix emulation library (<b>-lpcreposix</b> <b>-lpcre</b> on many
systems).
</P>
<P>
<b>--cflags</b>
Writes to the standard output the command line options required to compile
files that use PCRE (this may include some <b>-I</b> options, but is blank on
many systems).
</P>
<P>
<b>--cflags-posix</b>
Writes to the standard output the command line options required to compile
files that use the PCRE posix emulation library (this may include some <b>-I</b>
options, but is blank on many systems).
</P>
<br><a name="SEC4" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcre(3)</b>
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
This manual page was originally written by Mark Baker for the Debian GNU/Linux
system. It has been slightly revised as a generic PCRE man page.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 18 April 2007
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,306 +0,0 @@
<html>
<head>
<title>pcre specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">INTRODUCTION</a>
<li><a name="TOC2" href="#SEC2">USER DOCUMENTATION</a>
<li><a name="TOC3" href="#SEC3">LIMITATIONS</a>
<li><a name="TOC4" href="#SEC4">UTF-8 AND UNICODE PROPERTY SUPPORT</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">INTRODUCTION</a><br>
<P>
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl, with just a few
differences. Certain features that appeared in Python and PCRE before they
appeared in Perl are also available using the Python syntax. There is also some
support for certain .NET and Oniguruma syntax items, and there is an option for
requesting some minor changes that give better JavaScript compatibility.
</P>
<P>
The current implementation of PCRE (release 7.x) corresponds approximately with
Perl 5.10, including support for UTF-8 encoded strings and Unicode general
category properties. However, UTF-8 and Unicode support has to be explicitly
enabled; it is not the default. The Unicode tables correspond to Unicode
release 5.1.
</P>
<P>
In addition to the Perl-compatible matching function, PCRE contains an
alternative matching function that matches the same compiled patterns in a
different way. In certain circumstances, the alternative function has some
advantages. For a discussion of the two matching algorithms, see the
<a href="pcrematching.html"><b>pcrematching</b></a>
page.
</P>
<P>
PCRE is written in C and released as a C library. A number of people have
written wrappers and interfaces of various kinds. In particular, Google Inc.
have provided a comprehensive C++ wrapper. This is now included as part of the
PCRE distribution. The
<a href="pcrecpp.html"><b>pcrecpp</b></a>
page has details of this interface. Other people's contributions can be found
in the <i>Contrib</i> directory at the primary FTP site, which is:
<a href="ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre">ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre</a>
</P>
<P>
Details of exactly which Perl regular expression features are and are not
supported by PCRE are given in separate documents. See the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
and
<a href="pcrecompat.html"><b>pcrecompat</b></a>
pages. There is a syntax summary in the
<a href="pcresyntax.html"><b>pcresyntax</b></a>
page.
</P>
<P>
Some features of PCRE can be included, excluded, or changed when the library is
built. The
<a href="pcre_config.html"><b>pcre_config()</b></a>
function makes it possible for a client to discover which features are
available. The features themselves are described in the
<a href="pcrebuild.html"><b>pcrebuild</b></a>
page. Documentation about building PCRE for various operating systems can be
found in the <b>README</b> file in the source distribution.
</P>
<P>
The library contains a number of undocumented internal functions and data
tables that are used by more than one of the exported external functions, but
which are not intended for use by external callers. Their names all begin with
"_pcre_", which hopefully will not provoke any name clashes. In some
environments, it is possible to control which external symbols are exported
when a shared library is built, and in these cases the undocumented symbols are
not exported.
</P>
<br><a name="SEC2" href="#TOC1">USER DOCUMENTATION</a><br>
<P>
The user documentation for PCRE comprises a number of different sections. In
the "man" format, each of these is a separate "man page". In the HTML format,
each is a separate page, linked from the index page. In the plain text format,
all the sections are concatenated, for ease of searching. The sections are as
follows:
<pre>
pcre this document
pcre-config show PCRE installation configuration information
pcreapi details of PCRE's native C API
pcrebuild options for building PCRE
pcrecallout details of the callout feature
pcrecompat discussion of Perl compatibility
pcrecpp details of the C++ wrapper
pcregrep description of the <b>pcregrep</b> command
pcrematching discussion of the two matching algorithms
pcrepartial details of the partial matching facility
pcrepattern syntax and semantics of supported regular expressions
pcresyntax quick syntax reference
pcreperform discussion of performance issues
pcreposix the POSIX-compatible C API
pcreprecompile details of saving and re-using precompiled patterns
pcresample discussion of the sample program
pcrestack discussion of stack usage
pcretest description of the <b>pcretest</b> testing command
</pre>
In addition, in the "man" and HTML formats, there is a short page for each
C library function, listing its arguments and results.
</P>
<br><a name="SEC3" href="#TOC1">LIMITATIONS</a><br>
<P>
There are some size limitations in PCRE but it is hoped that they will never in
practice be relevant.
</P>
<P>
The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
compiled with the default internal linkage size of 2. If you want to process
regular expressions that are truly enormous, you can compile PCRE with an
internal linkage size of 3 or 4 (see the <b>README</b> file in the source
distribution and the
<a href="pcrebuild.html"><b>pcrebuild</b></a>
documentation for details). In these cases the limit is substantially larger.
However, the speed of execution is slower.
</P>
<P>
All values in repeating quantifiers must be less than 65536.
</P>
<P>
There is no limit to the number of parenthesized subpatterns, but there can be
no more than 65535 capturing subpatterns.
</P>
<P>
The maximum length of name for a named subpattern is 32 characters, and the
maximum number of named subpatterns is 10000.
</P>
<P>
The maximum length of a subject string is the largest positive number that an
integer variable can hold. However, when using the traditional matching
function, PCRE uses recursion to handle subpatterns and indefinite repetition.
This means that the available stack space may limit the size of a subject
string that can be processed by certain patterns. For a discussion of stack
issues, see the
<a href="pcrestack.html"><b>pcrestack</b></a>
documentation.
<a name="utf8support"></a></P>
<br><a name="SEC4" href="#TOC1">UTF-8 AND UNICODE PROPERTY SUPPORT</a><br>
<P>
From release 3.3, PCRE has had some support for character strings encoded in
the UTF-8 format. For release 4.0 this was greatly extended to cover most
common requirements, and in release 5.0 additional support for Unicode general
category properties was added.
</P>
<P>
In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
the code, and, in addition, you must call
<a href="pcre_compile.html"><b>pcre_compile()</b></a>
with the PCRE_UTF8 option flag, or the pattern must start with the sequence
(*UTF8). When either of these is the case, both the pattern and any subject
strings that are matched against it are treated as UTF-8 strings instead of
just strings of bytes.
</P>
<P>
If you compile PCRE with UTF-8 support, but do not use it at run time, the
library will be a bit bigger, but the additional run time overhead is limited
to testing the PCRE_UTF8 flag occasionally, so should not be very big.
</P>
<P>
If PCRE is built with Unicode character property support (which implies UTF-8
support), the escape sequences \p{..}, \P{..}, and \X are supported.
The available properties that can be tested are limited to the general
category properties such as Lu for an upper case letter or Nd for a decimal
number, the Unicode script names such as Arabic or Han, and the derived
properties Any and L&amp;. A full list is given in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation. Only the short names for properties are supported. For example,
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
compatibility with Perl 5.6. PCRE does not support this.
<a name="utf8strings"></a></P>
<br><b>
Validity of UTF-8 strings
</b><br>
<P>
When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
are (by default) checked for validity on entry to the relevant functions. From
release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
themselves derived from the Unicode specification. Earlier releases of PCRE
followed the rules of RFC 2279, which allows the full range of 31-bit values (0
to 0x7FFFFFFF). The current check allows only values in the range U+0 to
U+10FFFF, excluding U+D800 to U+DFFF.
</P>
<P>
The excluded code points are the "Low Surrogate Area" of Unicode, of which the
Unicode Standard says this: "The Low Surrogate Area does not contain any
character assignments, consequently no character code charts or namelists are
provided for this area. Surrogates are reserved for use with UTF-16 and then
must be used in pairs." The code points that are encoded by UTF-16 pairs are
available as independent code points in the UTF-8 encoding. (In other words,
the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
UTF-8.)
</P>
<P>
If an invalid UTF-8 string is passed to PCRE, an error return
(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
your strings are valid, and therefore want to skip these checks in order to
improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
at run time, PCRE assumes that the pattern or subject it is given
(respectively) contains only valid UTF-8 codes. In this case, it does not
diagnose an invalid UTF-8 string.
</P>
<P>
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
happens depends on why the string is invalid. If the string conforms to the
"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
rules of RFC 2279. However, if the string does not even conform to RFC 2279,
the result is undefined. Your program may crash.
</P>
<P>
If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
encoded in a UTF-8-like manner as per the old RFC, you can set
PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
situation, you will have to apply your own validity check.
</P>
<br><b>
General comments about UTF-8 mode
</b><br>
<P>
1. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
UTF-8 character if the value is greater than 127.
</P>
<P>
2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
characters for values greater than \177.
</P>
<P>
3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
bytes, for example: \x{100}{3}.
</P>
<P>
4. The dot metacharacter matches one UTF-8 character instead of a single byte.
</P>
<P>
5. The escape sequence \C can be used to match a single byte in UTF-8 mode,
but its use can lead to some strange effects. This facility is not available in
the alternative matching function, <b>pcre_dfa_exec()</b>.
</P>
<P>
6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
test characters of any code value, but the characters that PCRE recognizes as
digits, spaces, or word characters remain the same set as before, all with
values less than 256. This remains true even when PCRE includes Unicode
property support, because to do otherwise would slow down PCRE in many common
cases. If you really want to test for a wider sense of, say, "digit", you
must use Unicode property tests such as \p{Nd}. Note that this also applies to
\b, because it is defined in terms of \w and \W.
</P>
<P>
7. Similarly, characters that match the POSIX named character classes are all
low-valued characters.
</P>
<P>
8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
(\h, \H, \v, and \V) do match all the appropriate Unicode characters.
</P>
<P>
9. Case-insensitive matching applies only to characters whose values are less
than 128, unless PCRE is built with Unicode property support. Even when Unicode
property support is available, PCRE still uses its own character tables when
checking the case of low-valued characters, so as not to degrade performance.
The Unicode property information is used only for characters with higher
values. Even when Unicode property support is available, PCRE supports
case-insensitive matching only when there is a one-to-one mapping between a
letter's cases. There are a small number of many-to-one mappings in Unicode;
these are not supported by PCRE.
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<P>
Putting an actual email address here seems to have been a spam magnet, so I've
taken it away. If you want to email me, use my two initials, followed by the
two digits 10, at the domain cam.ac.uk.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 11 April 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,89 +0,0 @@
<html>
<head>
<title>pcre_compile specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_compile man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>pcre *pcre_compile(const char *<i>pattern</i>, int <i>options</i>,</b>
<b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>
<b>const unsigned char *<i>tableptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function compiles a regular expression into an internal form. It is the
same as <b>pcre_compile2()</b>, except for the absence of the <i>errorcodeptr</i>
argument. Its arguments are:
<pre>
<i>pattern</i> A zero-terminated string containing the
regular expression to be compiled
<i>options</i> Zero or more option bits
<i>errptr</i> Where to put an error message
<i>erroffset</i> Offset in pattern where error was found
<i>tableptr</i> Pointer to character tables, or NULL to
use the built-in default
</pre>
The option bits are:
<pre>
PCRE_ANCHORED Force pattern anchoring
PCRE_AUTO_CALLOUT Compile automatic callouts
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \R matches all Unicode line endings
PCRE_CASELESS Do caseless matching
PCRE_DOLLAR_ENDONLY $ not to match newline at end
PCRE_DOTALL . matches anything including NL
PCRE_DUPNAMES Allow duplicate names for subpatterns
PCRE_EXTENDED Ignore whitespace and # comments
PCRE_EXTRA PCRE extra features
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
theses (named ones available)
PCRE_UNGREEDY Invert greediness of quantifiers
PCRE_UTF8 Run in UTF-8 mode
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
validity (only relevant if
PCRE_UTF8 is set)
</pre>
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
PCRE_NO_UTF8_CHECK.
</P>
<P>
The yield of the function is a pointer to a private data structure that
contains the compiled pattern, or NULL if an error was detected. Note that
compiling regular expressions with one version of PCRE for use with a different
version is not guaranteed to work and may cause crashes.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,89 +0,0 @@
<html>
<head>
<title>pcre_compile2 specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_compile2 man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>pcre *pcre_compile2(const char *<i>pattern</i>, int <i>options</i>,</b>
<b>int *<i>errorcodeptr</i>,</b>
<b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>
<b>const unsigned char *<i>tableptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function compiles a regular expression into an internal form. It is the
same as <b>pcre_compile()</b>, except for the addition of the <i>errorcodeptr</i>
argument. The arguments are:
</P>
<P>
<pre>
<i>pattern</i> A zero-terminated string containing the
regular expression to be compiled
<i>options</i> Zero or more option bits
<i>errorcodeptr</i> Where to put an error code
<i>errptr</i> Where to put an error message
<i>erroffset</i> Offset in pattern where error was found
<i>tableptr</i> Pointer to character tables, or NULL to
use the built-in default
</pre>
The option bits are:
<pre>
PCRE_ANCHORED Force pattern anchoring
PCRE_AUTO_CALLOUT Compile automatic callouts
PCRE_CASELESS Do caseless matching
PCRE_DOLLAR_ENDONLY $ not to match newline at end
PCRE_DOTALL . matches anything including NL
PCRE_DUPNAMES Allow duplicate names for subpatterns
PCRE_EXTENDED Ignore whitespace and # comments
PCRE_EXTRA PCRE extra features
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
theses (named ones available)
PCRE_UNGREEDY Invert greediness of quantifiers
PCRE_UTF8 Run in UTF-8 mode
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
validity (only relevant if
PCRE_UTF8 is set)
</pre>
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
PCRE_NO_UTF8_CHECK.
</P>
<P>
The yield of the function is a pointer to a private data structure that
contains the compiled pattern, or NULL if an error was detected. Note that
compiling regular expressions with one version of PCRE for use with a different
version is not guaranteed to work and may cause crashes.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,70 +0,0 @@
<html>
<head>
<title>pcre_config specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_config man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_config(int <i>what</i>, void *<i>where</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function makes it possible for a client program to find out which optional
features are available in the version of the PCRE library it is using. Its
arguments are as follows:
<pre>
<i>what</i> A code specifying what information is required
<i>where</i> Points to where to put the data
</pre>
The available codes are:
<pre>
PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
PCRE_CONFIG_MATCH_LIMIT_RECURSION
Internal recursion depth limit
PCRE_CONFIG_NEWLINE Value of the default newline sequence:
13 (0x000d) for CR
10 (0x000a) for LF
3338 (0x0d0a) for CRLF
-2 for ANYCRLF
-1 for ANY
PCRE_CONFIG_BSR Indicates what \R matches by default:
0 all Unicode line endings
1 CR, LF, or CRLF only
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
Threshold of return slots, above
which <b>malloc()</b> is used by
the POSIX API
PCRE_CONFIG_STACKRECURSE Recursion implementation (1=stack 0=heap)
PCRE_CONFIG_UTF8 Availability of UTF-8 support (1=yes 0=no)
PCRE_CONFIG_UNICODE_PROPERTIES
Availability of Unicode property support
(1=yes 0=no)
</pre>
The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,53 +0,0 @@
<html>
<head>
<title>pcre_copy_named_substring specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_copy_named_substring man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_copy_named_substring(const pcre *<i>code</i>,</b>
<b>const char *<i>subject</i>, int *<i>ovector</i>,</b>
<b>int <i>stringcount</i>, const char *<i>stringname</i>,</b>
<b>char *<i>buffer</i>, int <i>buffersize</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This is a convenience function for extracting a captured substring, identified
by name, into a given buffer. The arguments are:
<pre>
<i>code</i> Pattern that was successfully matched
<i>subject</i> Subject that has been successfully matched
<i>ovector</i> Offset vector that <b>pcre_exec()</b> used
<i>stringcount</i> Value returned by <b>pcre_exec()</b>
<i>stringname</i> Name of the required substring
<i>buffer</i> Buffer to receive the string
<i>buffersize</i> Size of buffer
</pre>
The yield is the length of the substring, PCRE_ERROR_NOMEMORY if the buffer was
too small, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,51 +0,0 @@
<html>
<head>
<title>pcre_copy_substring specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_copy_substring man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_copy_substring(const char *<i>subject</i>, int *<i>ovector</i>,</b>
<b>int <i>stringcount</i>, int <i>stringnumber</i>, char *<i>buffer</i>,</b>
<b>int <i>buffersize</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This is a convenience function for extracting a captured substring into a given
buffer. The arguments are:
<pre>
<i>subject</i> Subject that has been successfully matched
<i>ovector</i> Offset vector that <b>pcre_exec()</b> used
<i>stringcount</i> Value returned by <b>pcre_exec()</b>
<i>stringnumber</i> Number of the required substring
<i>buffer</i> Buffer to receive the string
<i>buffersize</i> Size of buffer
</pre>
The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,98 +0,0 @@
<html>
<head>
<title>pcre_dfa_exec specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_dfa_exec man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
<b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>,</b>
<b>int *<i>workspace</i>, int <i>wscount</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function matches a compiled regular expression against a given subject
string, using an alternative matching algorithm that scans the subject string
just once (<i>not</i> Perl-compatible). Note that the main, Perl-compatible,
matching function is <b>pcre_exec()</b>. The arguments for this function are:
<pre>
<i>code</i> Points to the compiled pattern
<i>extra</i> Points to an associated <b>pcre_extra</b> structure,
or is NULL
<i>subject</i> Points to the subject string
<i>length</i> Length of the subject string, in bytes
<i>startoffset</i> Offset in bytes in the subject at which to
start matching
<i>options</i> Option bits
<i>ovector</i> Points to a vector of ints for result offsets
<i>ovecsize</i> Number of elements in the vector
<i>workspace</i> Points to a vector of ints used as working space
<i>wscount</i> Number of elements in the vector
</pre>
The options are:
<pre>
PCRE_ANCHORED Match only at the first position
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \R matches all Unicode line endings
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NOTBOL Subject is not the beginning of a line
PCRE_NOTEOL Subject is not the end of a line
PCRE_NOTEMPTY An empty string is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
validity (only relevant if PCRE_UTF8
was set at compile time)
PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
PCRE_DFA_SHORTEST Return only the shortest match
PCRE_DFA_RESTART This is a restart after a partial match
</pre>
There are restrictions on what may appear in a pattern when using this matching
function. Details are given in the
<a href="pcrematching.html"><b>pcrematching</b></a>
documentation.
</P>
<P>
A <b>pcre_extra</b> structure contains the following fields:
<pre>
<i>flags</i> Bits indicating which fields are set
<i>study_data</i> Opaque data from <b>pcre_study()</b>
<i>match_limit</i> Limit on internal resource use
<i>match_limit_recursion</i> Limit on internal recursion depth
<i>callout_data</i> Opaque data passed back to callouts
<i>tables</i> Points to character tables or is NULL
</pre>
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
PCRE_EXTRA_TABLES. For this matching function, the <i>match_limit</i> and
<i>match_limit_recursion</i> fields are not used, and must not be set.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,91 +0,0 @@
<html>
<head>
<title>pcre_exec specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_exec man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
<b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function matches a compiled regular expression against a given subject
string, using a matching algorithm that is similar to Perl's. It returns
offsets to captured substrings. Its arguments are:
<pre>
<i>code</i> Points to the compiled pattern
<i>extra</i> Points to an associated <b>pcre_extra</b> structure,
or is NULL
<i>subject</i> Points to the subject string
<i>length</i> Length of the subject string, in bytes
<i>startoffset</i> Offset in bytes in the subject at which to
start matching
<i>options</i> Option bits
<i>ovector</i> Points to a vector of ints for result offsets
<i>ovecsize</i> Number of elements in the vector (a multiple of 3)
</pre>
The options are:
<pre>
PCRE_ANCHORED Match only at the first position
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \R matches all Unicode line endings
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NOTBOL Subject is not the beginning of a line
PCRE_NOTEOL Subject is not the end of a line
PCRE_NOTEMPTY An empty string is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
validity (only relevant if PCRE_UTF8
was set at compile time)
PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
</pre>
There are restrictions on what may appear in a pattern when partial matching is
requested. For details, see the
<a href="pcrepartial.html"><b>pcrepartial</b></a>
page.
</P>
<P>
A <b>pcre_extra</b> structure contains the following fields:
<pre>
<i>flags</i> Bits indicating which fields are set
<i>study_data</i> Opaque data from <b>pcre_study()</b>
<i>match_limit</i> Limit on internal resource use
<i>match_limit_recursion</i> Limit on internal recursion depth
<i>callout_data</i> Opaque data passed back to callouts
<i>tables</i> Points to character tables or is NULL
</pre>
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
PCRE_EXTRA_TABLES.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,40 +0,0 @@
<html>
<head>
<title>pcre_free_substring specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_free_substring man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>void pcre_free_substring(const char *<i>stringptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This is a convenience function for freeing the store obtained by a previous
call to <b>pcre_get_substring()</b> or <b>pcre_get_named_substring()</b>. Its
only argument is a pointer to the string.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,40 +0,0 @@
<html>
<head>
<title>pcre_free_substring_list specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_free_substring_list man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>void pcre_free_substring_list(const char **<i>stringptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This is a convenience function for freeing the store obtained by a previous
call to <b>pcre_get_substring_list()</b>. Its only argument is a pointer to the
list of string pointers.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,72 +0,0 @@
<html>
<head>
<title>pcre_fullinfo specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_fullinfo man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_fullinfo(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>int <i>what</i>, void *<i>where</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function returns information about a compiled pattern. Its arguments are:
<pre>
<i>code</i> Compiled regular expression
<i>extra</i> Result of <b>pcre_study()</b> or NULL
<i>what</i> What information is required
<i>where</i> Where to put the information
</pre>
The following information is available:
<pre>
PCRE_INFO_BACKREFMAX Number of highest back reference
PCRE_INFO_CAPTURECOUNT Number of capturing subpatterns
PCRE_INFO_DEFAULT_TABLES Pointer to default tables
PCRE_INFO_FIRSTBYTE Fixed first byte for a match, or
-1 for start of string
or after newline, or
-2 otherwise
PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE_INFO_LASTLITERAL Literal last byte required
PCRE_INFO_NAMECOUNT Number of named subpatterns
PCRE_INFO_NAMEENTRYSIZE Size of name table entry
PCRE_INFO_NAMETABLE Pointer to name table
PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
PCRE_INFO_OPTIONS Option bits used for compilation
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
</pre>
The yield of the function is zero on success or:
<pre>
PCRE_ERROR_NULL the argument <i>code</i> was NULL
the argument <i>where</i> was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
PCRE_ERROR_BADOPTION the value of <i>what</i> was invalid
</PRE>
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,55 +0,0 @@
<html>
<head>
<title>pcre_get_named_substring specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_get_named_substring man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_get_named_substring(const pcre *<i>code</i>,</b>
<b>const char *<i>subject</i>, int *<i>ovector</i>,</b>
<b>int <i>stringcount</i>, const char *<i>stringname</i>,</b>
<b>const char **<i>stringptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This is a convenience function for extracting a captured substring by name. The
arguments are:
<pre>
<i>code</i> Compiled pattern
<i>subject</i> Subject that has been successfully matched
<i>ovector</i> Offset vector that <b>pcre_exec()</b> used
<i>stringcount</i> Value returned by <b>pcre_exec()</b>
<i>stringname</i> Name of the required substring
<i>stringptr</i> Where to put the string pointer
</pre>
The memory in which the substring is placed is obtained by calling
<b>pcre_malloc()</b>. The convenience function <b>pcre_free_substring()</b> can
be used to free it when it is no longer needed. The yield of the function is
the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,49 +0,0 @@
<html>
<head>
<title>pcre_get_stringnumber specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_get_stringnumber man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_get_stringnumber(const pcre *<i>code</i>,</b>
<b>const char *<i>name</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This convenience function finds the number of a named substring capturing
parenthesis in a compiled pattern. Its arguments are:
<pre>
<i>code</i> Compiled regular expression
<i>name</i> Name whose number is required
</pre>
The yield of the function is the number of the parenthesis if the name is
found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
<b>pcre_get_stringnumber()</b>. You can obtain the complete list by calling
<b>pcre_get_stringtable_entries()</b>.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,52 +0,0 @@
<html>
<head>
<title>pcre_get_stringtable_entries specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_get_stringtable_entries man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_get_stringtable_entries(const pcre *<i>code</i>,</b>
<b>const char *<i>name</i>, char **<i>first</i>, char **<i>last</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This convenience function finds, for a compiled pattern, the first and last
entries for a given name in the table that translates capturing parenthesis
names into numbers. When names are required to be unique (PCRE_DUPNAMES is
<i>not</i> set), it is usually easier to use <b>pcre_get_stringnumber()</b>
instead.
<pre>
<i>code</i> Compiled regular expression
<i>name</i> Name whose entries are required
<i>first</i> Where to return a pointer to the first entry
<i>last</i> Where to return a pointer to the last entry
</pre>
The yield of the function is the length of each entry, or
PCRE_ERROR_NOSUBSTRING if none are found.
</P>
<P>
There is a complete description of the PCRE native API, including the format of
the table entries, in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page, and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,53 +0,0 @@
<html>
<head>
<title>pcre_get_substring specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_get_substring man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_get_substring(const char *<i>subject</i>, int *<i>ovector</i>,</b>
<b>int <i>stringcount</i>, int <i>stringnumber</i>,</b>
<b>const char **<i>stringptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This is a convenience function for extracting a captured substring. The
arguments are:
<pre>
<i>subject</i> Subject that has been successfully matched
<i>ovector</i> Offset vector that <b>pcre_exec()</b> used
<i>stringcount</i> Value returned by <b>pcre_exec()</b>
<i>stringnumber</i> Number of the required substring
<i>stringptr</i> Where to put the string pointer
</pre>
The memory in which the substring is placed is obtained by calling
<b>pcre_malloc()</b>. The convenience function <b>pcre_free_substring()</b> can
be used to free it when it is no longer needed. The yield of the function is
the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,53 +0,0 @@
<html>
<head>
<title>pcre_get_substring_list specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_get_substring_list man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_get_substring_list(const char *<i>subject</i>,</b>
<b>int *<i>ovector</i>, int <i>stringcount</i>, const char ***<i>listptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This is a convenience function for extracting a list of all the captured
substrings. The arguments are:
<pre>
<i>subject</i> Subject that has been successfully matched
<i>ovector</i> Offset vector that <b>pcre_exec()</b> used
<i>stringcount</i> Value returned by <b>pcre_exec()</b>
<i>listptr</i> Where to put a pointer to the list
</pre>
The memory in which the substrings and the list are placed is obtained by
calling <b>pcre_malloc()</b>. The convenience function
<b>pcre_free_substring_list()</b> can be used to free it when it is no longer
needed. A pointer to a list of pointers is put in the variable whose address is
in <i>listptr</i>. The list is terminated by a NULL pointer. The yield of the
function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
not be obtained.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,39 +0,0 @@
<html>
<head>
<title>pcre_info specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_info man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_info(const pcre *<i>code</i>, int *<i>optptr</i>, int</b>
<b>*<i>firstcharptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function is obsolete. You should be using <b>pcre_fullinfo()</b> instead.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,42 +0,0 @@
<html>
<head>
<title>pcre_maketables specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_maketables man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>const unsigned char *pcre_maketables(void);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function builds a set of character tables for character values less than
256. These can be passed to <b>pcre_compile()</b> to override PCRE's internal,
built-in tables (which were made by <b>pcre_maketables()</b> when PCRE was
compiled). You might want to do this if you are using a non-standard locale.
The function yields a pointer to the tables.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,45 +0,0 @@
<html>
<head>
<title>pcre_refcount specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_refcount man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>int pcre_refcount(pcre *<i>code</i>, int <i>adjust</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function is used to maintain a reference count inside a data block that
contains a compiled pattern. Its arguments are:
<pre>
<i>code</i> Compiled regular expression
<i>adjust</i> Adjustment to reference value
</pre>
The yield of the function is the adjusted reference value, which is constrained
to lie between 0 and 65535.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,56 +0,0 @@
<html>
<head>
<title>pcre_study specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_study man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>pcre_extra *pcre_study(const pcre *<i>code</i>, int <i>options</i>,</b>
<b>const char **<i>errptr</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function studies a compiled pattern, to see if additional information can
be extracted that might speed up matching. Its arguments are:
<pre>
<i>code</i> A compiled regular expression
<i>options</i> Options for <b>pcre_study()</b>
<i>errptr</i> Where to put an error message
</pre>
If the function succeeds, it returns a value that can be passed to
<b>pcre_exec()</b> via its <i>extra</i> argument.
</P>
<P>
If the function returns NULL, either it could not find any additional
information, or there was an error. You can tell the difference by looking at
the error value. It is NULL in the first case.
</P>
<P>
There are currently no options defined; the value of the second argument should
always be zero.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,39 +0,0 @@
<html>
<head>
<title>pcre_version specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre_version man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
SYNOPSIS
</b><br>
<P>
<b>#include &#60;pcre.h&#62;</b>
</P>
<P>
<b>char *pcre_version(void);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
This function returns a character string that gives the version number of the
PCRE library and the date of its release.
</P>
<P>
There is a complete description of the PCRE native API in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

File diff suppressed because it is too large Load Diff

View File

@ -1,348 +0,0 @@
<html>
<head>
<title>pcrebuild specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcrebuild man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">PCRE BUILD-TIME OPTIONS</a>
<li><a name="TOC2" href="#SEC2">C++ SUPPORT</a>
<li><a name="TOC3" href="#SEC3">UTF-8 SUPPORT</a>
<li><a name="TOC4" href="#SEC4">UNICODE CHARACTER PROPERTY SUPPORT</a>
<li><a name="TOC5" href="#SEC5">CODE VALUE OF NEWLINE</a>
<li><a name="TOC6" href="#SEC6">WHAT \R MATCHES</a>
<li><a name="TOC7" href="#SEC7">BUILDING SHARED AND STATIC LIBRARIES</a>
<li><a name="TOC8" href="#SEC8">POSIX MALLOC USAGE</a>
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
<li><a name="TOC11" href="#SEC11">LIMITING PCRE RESOURCE USAGE</a>
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
<li><a name="TOC14" href="#SEC14">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
<li><a name="TOC15" href="#SEC15">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a>
<li><a name="TOC16" href="#SEC16">SEE ALSO</a>
<li><a name="TOC17" href="#SEC17">AUTHOR</a>
<li><a name="TOC18" href="#SEC18">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE BUILD-TIME OPTIONS</a><br>
<P>
This document describes the optional features of PCRE that can be selected when
the library is compiled. It assumes use of the <b>configure</b> script, where
the optional features are selected or deselected by providing options to
<b>configure</b> before running the <b>make</b> command. However, the same
options can be selected in both Unix-like and non-Unix-like environments using
the GUI facility of <b>CMakeSetup</b> if you are using <b>CMake</b> instead of
<b>configure</b> to build PCRE.
</P>
<P>
The complete list of options for <b>configure</b> (which includes the standard
ones such as the selection of the installation directory) can be obtained by
running
<pre>
./configure --help
</pre>
The following sections include descriptions of options whose names begin with
--enable or --disable. These settings specify changes to the defaults for the
<b>configure</b> command. Because of the way that <b>configure</b> works,
--enable and --disable always come in pairs, so the complementary option always
exists as well, but as it specifies the default, it is not described.
</P>
<br><a name="SEC2" href="#TOC1">C++ SUPPORT</a><br>
<P>
By default, the <b>configure</b> script will search for a C++ compiler and C++
header files. If it finds them, it automatically builds the C++ wrapper library
for PCRE. You can disable this by adding
<pre>
--disable-cpp
</pre>
to the <b>configure</b> command.
</P>
<br><a name="SEC3" href="#TOC1">UTF-8 SUPPORT</a><br>
<P>
To build PCRE with support for UTF-8 Unicode character strings, add
<pre>
--enable-utf8
</pre>
to the <b>configure</b> command. Of itself, this does not make PCRE treat
strings as UTF-8. As well as compiling PCRE with this option, you also have
to set the PCRE_UTF8 option when you call the <b>pcre_compile()</b>
function.
</P>
<P>
If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE expects
its input to be either ASCII or UTF-8 (depending on the runtime option). It is
not possible to support both EBCDIC and UTF-8 codes in the same version of the
library. Consequently, --enable-utf8 and --enable-ebcdic are mutually
exclusive.
</P>
<br><a name="SEC4" href="#TOC1">UNICODE CHARACTER PROPERTY SUPPORT</a><br>
<P>
UTF-8 support allows PCRE to process character values greater than 255 in the
strings that it handles. On its own, however, it does not provide any
facilities for accessing the properties of such characters. If you want to be
able to use the pattern escapes \P, \p, and \X, which refer to Unicode
character properties, you must add
<pre>
--enable-unicode-properties
</pre>
to the <b>configure</b> command. This implies UTF-8 support, even if you have
not explicitly requested it.
</P>
<P>
Including Unicode property support adds around 30K of tables to the PCRE
library. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
supported. Details are given in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation.
</P>
<br><a name="SEC5" href="#TOC1">CODE VALUE OF NEWLINE</a><br>
<P>
By default, PCRE interprets the linefeed (LF) character as indicating the end
of a line. This is the normal newline character on Unix-like systems. You can
compile PCRE to use carriage return (CR) instead, by adding
<pre>
--enable-newline-is-cr
</pre>
to the <b>configure</b> command. There is also a --enable-newline-is-lf option,
which explicitly specifies linefeed as the newline character.
<br>
<br>
Alternatively, you can specify that line endings are to be indicated by the two
character sequence CRLF. If you want this, add
<pre>
--enable-newline-is-crlf
</pre>
to the <b>configure</b> command. There is a fourth option, specified by
<pre>
--enable-newline-is-anycrlf
</pre>
which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
indicating a line ending. Finally, a fifth option, specified by
<pre>
--enable-newline-is-any
</pre>
causes PCRE to recognize any Unicode newline sequence.
</P>
<P>
Whatever line ending convention is selected when PCRE is built can be
overridden when the library functions are called. At build time it is
conventional to use the standard for your operating system.
</P>
<br><a name="SEC6" href="#TOC1">WHAT \R MATCHES</a><br>
<P>
By default, the sequence \R in a pattern matches any Unicode newline sequence,
whatever has been selected as the line ending sequence. If you specify
<pre>
--enable-bsr-anycrlf
</pre>
the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
selected when PCRE is built can be overridden when the library functions are
called.
</P>
<br><a name="SEC7" href="#TOC1">BUILDING SHARED AND STATIC LIBRARIES</a><br>
<P>
The PCRE building process uses <b>libtool</b> to build both shared and static
Unix libraries by default. You can suppress one of these by adding one of
<pre>
--disable-shared
--disable-static
</pre>
to the <b>configure</b> command, as required.
</P>
<br><a name="SEC8" href="#TOC1">POSIX MALLOC USAGE</a><br>
<P>
When PCRE is called through the POSIX interface (see the
<a href="pcreposix.html"><b>pcreposix</b></a>
documentation), additional working storage is required for holding the pointers
to capturing substrings, because PCRE requires three integers per substring,
whereas the POSIX interface provides only two. If the number of expected
substrings is small, the wrapper function uses space on the stack, because this
is faster than using <b>malloc()</b> for each call. The default threshold above
which the stack is no longer used is 10; it can be changed by adding a setting
such as
<pre>
--with-posix-malloc-threshold=20
</pre>
to the <b>configure</b> command.
</P>
<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
<P>
Within a compiled pattern, offset values are used to point from one part to
another (for example, from an opening parenthesis to an alternation
metacharacter). By default, two-byte values are used for these offsets, leading
to a maximum size for a compiled pattern of around 64K. This is sufficient to
handle all but the most gigantic patterns. Nevertheless, some people do want to
process enormous patterns, so it is possible to compile PCRE to use three-byte
or four-byte offsets by adding a setting such as
<pre>
--with-link-size=3
</pre>
to the <b>configure</b> command. The value given must be 2, 3, or 4. Using
longer offsets slows down the operation of PCRE because it has to load
additional bytes when handling them.
</P>
<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
<P>
When matching with the <b>pcre_exec()</b> function, PCRE implements backtracking
by making recursive calls to an internal function called <b>match()</b>. In
environments where the size of the stack is limited, this can severely limit
PCRE's operation. (The Unix environment does not usually suffer from this
problem, but it may sometimes be necessary to increase the maximum stack size.
There is a discussion in the
<a href="pcrestack.html"><b>pcrestack</b></a>
documentation.) An alternative approach to recursion that uses memory from the
heap to remember data, instead of using recursive function calls, has been
implemented to work round the problem of limited stack size. If you want to
build a version of PCRE that works this way, add
<pre>
--disable-stack-for-recursion
</pre>
to the <b>configure</b> command. With this configuration, PCRE will use the
<b>pcre_stack_malloc</b> and <b>pcre_stack_free</b> variables to call memory
management functions. By default these point to <b>malloc()</b> and
<b>free()</b>, but you can replace the pointers so that your own functions are
used.
</P>
<P>
Separate functions are provided rather than using <b>pcre_malloc</b> and
<b>pcre_free</b> because the usage is very predictable: the block sizes
requested are always the same, and the blocks are always freed in reverse
order. A calling program might be able to implement optimized functions that
perform better than <b>malloc()</b> and <b>free()</b>. PCRE runs noticeably more
slowly when built in this way. This option affects only the <b>pcre_exec()</b>
function; it is not relevant for the <b>pcre_dfa_exec()</b> function.
</P>
<br><a name="SEC11" href="#TOC1">LIMITING PCRE RESOURCE USAGE</a><br>
<P>
Internally, PCRE has a function called <b>match()</b>, which it calls repeatedly
(sometimes recursively) when matching a pattern with the <b>pcre_exec()</b>
function. By controlling the maximum number of times this function may be
called during a single matching operation, a limit can be placed on the
resources used by a single call to <b>pcre_exec()</b>. The limit can be changed
at run time, as described in the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation. The default is 10 million, but this can be changed by adding a
setting such as
<pre>
--with-match-limit=500000
</pre>
to the <b>configure</b> command. This setting has no effect on the
<b>pcre_dfa_exec()</b> matching function.
</P>
<P>
In some environments it is desirable to limit the depth of recursive calls of
<b>match()</b> more strictly than the total number of calls, in order to
restrict the maximum amount of stack (or heap, if --disable-stack-for-recursion
is specified) that is used. A second limit controls this; it defaults to the
value that is set for --with-match-limit, which imposes no additional
constraints. However, you can set a lower limit by adding, for example,
<pre>
--with-match-limit-recursion=10000
</pre>
to the <b>configure</b> command. This value can also be overridden at run time.
</P>
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
<P>
PCRE uses fixed tables for processing characters whose code values are less
than 256. By default, PCRE is built with a set of tables that are distributed
in the file <i>pcre_chartables.c.dist</i>. These tables are for ASCII codes
only. If you add
<pre>
--enable-rebuild-chartables
</pre>
to the <b>configure</b> command, the distributed tables are no longer used.
Instead, a program called <b>dftables</b> is compiled and run. This outputs the
source for a new set of tables, created in the default locale of your C runtime
system. (This method of replacing the tables does not work if you are cross
compiling, because <b>dftables</b> is run on the local host. If you need to
create alternative tables when cross compiling, you will have to do so "by
hand".)
</P>
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
<P>
PCRE assumes by default that it will run in an environment where the character
code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
most computer operating systems. PCRE can, however, be compiled to run in an
EBCDIC environment by adding
<pre>
--enable-ebcdic
</pre>
to the <b>configure</b> command. This setting implies
--enable-rebuild-chartables. You should only use it if you know that you are in
an EBCDIC environment (for example, an IBM mainframe operating system). The
--enable-ebcdic option is incompatible with --enable-utf8.
</P>
<br><a name="SEC14" href="#TOC1">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
<P>
By default, <b>pcregrep</b> reads all files as plain text. You can build it so
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
them with <b>libz</b> or <b>libbz2</b>, respectively, by adding one or both of
<pre>
--enable-pcregrep-libz
--enable-pcregrep-libbz2
</pre>
to the <b>configure</b> command. These options naturally require that the
relevant libraries are installed on your system. Configuration will fail if
they are not.
</P>
<br><a name="SEC15" href="#TOC1">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a><br>
<P>
If you add
<pre>
--enable-pcretest-libreadline
</pre>
to the <b>configure</b> command, <b>pcretest</b> is linked with the
<b>libreadline</b> library, and when its input is from a terminal, it reads it
using the <b>readline()</b> function. This provides line-editing and history
facilities. Note that <b>libreadline</b> is GPL-licenced, so if you distribute a
binary of <b>pcretest</b> linked in this way, there may be licensing issues.
</P>
<P>
Setting this option causes the <b>-lreadline</b> option to be added to the
<b>pcretest</b> build. In many operating environments with a system-installed
<b>libreadline</b> this is sufficient. However, in some environments (e.g.
if an unmodified distribution version of readline is in use), some extra
configuration may be necessary. The INSTALL file for <b>libreadline</b> says
this:
<pre>
"Readline uses the termcap functions, but does not link with the
termcap or curses library itself, allowing applications which link
with readline the to choose an appropriate library."
</pre>
If your environment has not been set up so that an appropriate library is
automatically included, you may need to add something like
<pre>
LIBS="-lncurses"
</pre>
immediately before the <b>configure</b> command.
</P>
<br><a name="SEC16" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcreapi</b>(3), <b>pcre_config</b>(3).
</P>
<br><a name="SEC17" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC18" href="#TOC1">REVISION</a><br>
<P>
Last updated: 17 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,208 +0,0 @@
<html>
<head>
<title>pcrecallout specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcrecallout man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">PCRE CALLOUTS</a>
<li><a name="TOC2" href="#SEC2">MISSING CALLOUTS</a>
<li><a name="TOC3" href="#SEC3">THE CALLOUT INTERFACE</a>
<li><a name="TOC4" href="#SEC4">RETURN VALUES</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE CALLOUTS</a><br>
<P>
<b>int (*pcre_callout)(pcre_callout_block *);</b>
</P>
<P>
PCRE provides a feature called "callout", which is a means of temporarily
passing control to the caller of PCRE in the middle of pattern matching. The
caller of PCRE provides an external function by putting its entry point in the
global variable <i>pcre_callout</i>. By default, this variable contains NULL,
which disables all calling out.
</P>
<P>
Within a regular expression, (?C) indicates the points at which the external
function is to be called. Different callout points can be identified by putting
a number less than 256 after the letter C. The default value is zero.
For example, this pattern has two callout points:
<pre>
(?C1)abc(?C2)def
</pre>
If the PCRE_AUTO_CALLOUT option bit is set when <b>pcre_compile()</b> is called,
PCRE automatically inserts callouts, all with number 255, before each item in
the pattern. For example, if PCRE_AUTO_CALLOUT is used with the pattern
<pre>
A(\d{2}|--)
</pre>
it is processed as if it were
<br>
<br>
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
<br>
<br>
Notice that there is a callout before and after each parenthesis and
alternation bar. Automatic callouts can be used for tracking the progress of
pattern matching. The
<a href="pcretest.html"><b>pcretest</b></a>
command has an option that sets automatic callouts; when it is used, the output
indicates how the pattern is matched. This is useful information when you are
trying to optimize the performance of a particular pattern.
</P>
<br><a name="SEC2" href="#TOC1">MISSING CALLOUTS</a><br>
<P>
You should be aware that, because of optimizations in the way PCRE matches
patterns by default, callouts sometimes do not happen. For example, if the
pattern is
<pre>
ab(?C4)cd
</pre>
PCRE knows that any matching string must contain the letter "d". If the subject
string is "abyz", the lack of "d" means that matching doesn't ever start, and
the callout is never reached. However, with "abyd", though the result is still
no match, the callout is obeyed.
</P>
<P>
You can disable these optimizations by passing the PCRE_NO_START_OPTIMIZE
option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>. This slows down the
matching process, but does ensure that callouts such as the example above are
obeyed.
</P>
<br><a name="SEC3" href="#TOC1">THE CALLOUT INTERFACE</a><br>
<P>
During matching, when PCRE reaches a callout point, the external function
defined by <i>pcre_callout</i> is called (if it is set). This applies to both
the <b>pcre_exec()</b> and the <b>pcre_dfa_exec()</b> matching functions. The
only argument to the callout function is a pointer to a <b>pcre_callout</b>
block. This structure contains the following fields:
<pre>
int <i>version</i>;
int <i>callout_number</i>;
int *<i>offset_vector</i>;
const char *<i>subject</i>;
int <i>subject_length</i>;
int <i>start_match</i>;
int <i>current_position</i>;
int <i>capture_top</i>;
int <i>capture_last</i>;
void *<i>callout_data</i>;
int <i>pattern_position</i>;
int <i>next_item_length</i>;
</pre>
The <i>version</i> field is an integer containing the version number of the
block format. The initial version was 0; the current version is 1. The version
number will change again in future if additional fields are added, but the
intention is never to remove any of the existing fields.
</P>
<P>
The <i>callout_number</i> field contains the number of the callout, as compiled
into the pattern (that is, the number after ?C for manual callouts, and 255 for
automatically generated callouts).
</P>
<P>
The <i>offset_vector</i> field is a pointer to the vector of offsets that was
passed by the caller to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>. When
<b>pcre_exec()</b> is used, the contents can be inspected in order to extract
substrings that have been matched so far, in the same way as for extracting
substrings after a match has completed. For <b>pcre_dfa_exec()</b> this field is
not useful.
</P>
<P>
The <i>subject</i> and <i>subject_length</i> fields contain copies of the values
that were passed to <b>pcre_exec()</b>.
</P>
<P>
The <i>start_match</i> field normally contains the offset within the subject at
which the current match attempt started. However, if the escape sequence \K
has been encountered, this value is changed to reflect the modified starting
point. If the pattern is not anchored, the callout function may be called
several times from the same point in the pattern for different starting points
in the subject.
</P>
<P>
The <i>current_position</i> field contains the offset within the subject of the
current match pointer.
</P>
<P>
When the <b>pcre_exec()</b> function is used, the <i>capture_top</i> field
contains one more than the number of the highest numbered captured substring so
far. If no substrings have been captured, the value of <i>capture_top</i> is
one. This is always the case when <b>pcre_dfa_exec()</b> is used, because it
does not support captured substrings.
</P>
<P>
The <i>capture_last</i> field contains the number of the most recently captured
substring. If no substrings have been captured, its value is -1. This is always
the case when <b>pcre_dfa_exec()</b> is used.
</P>
<P>
The <i>callout_data</i> field contains a value that is passed to
<b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> specifically so that it can be
passed back in callouts. It is passed in the <i>callout_data</i> field of the
<b>pcre_extra</b> data structure. If no such data was passed, the value of
<i>callout_data</i> in a <b>pcre_callout</b> block is NULL. There is a
description of the <b>pcre_extra</b> structure in the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation.
</P>
<P>
The <i>pattern_position</i> field is present from version 1 of the
<i>pcre_callout</i> structure. It contains the offset to the next item to be
matched in the pattern string.
</P>
<P>
The <i>next_item_length</i> field is present from version 1 of the
<i>pcre_callout</i> structure. It contains the length of the next item to be
matched in the pattern string. When the callout immediately precedes an
alternation bar, a closing parenthesis, or the end of the pattern, the length
is zero. When the callout precedes an opening parenthesis, the length is that
of the entire subpattern.
</P>
<P>
The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
help in distinguishing between different automatic callouts, which all have the
same callout number. However, they are set for all callouts.
</P>
<br><a name="SEC4" href="#TOC1">RETURN VALUES</a><br>
<P>
The external callout function returns an integer to PCRE. If the value is zero,
matching proceeds as normal. If the value is greater than zero, matching fails
at the current point, but the testing of other matching possibilities goes
ahead, just as if a lookahead assertion had failed. If the value is less than
zero, the match is abandoned, and <b>pcre_exec()</b> (or <b>pcre_dfa_exec()</b>)
returns the negative value.
</P>
<P>
Negative values should normally be chosen from the set of PCRE_ERROR_xxx
values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
it will never be used by PCRE itself.
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 15 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,179 +0,0 @@
<html>
<head>
<title>pcrecompat specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcrecompat man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
DIFFERENCES BETWEEN PCRE AND PERL
</b><br>
<P>
This document describes the differences in the ways that PCRE and Perl handle
regular expressions. The differences described here are mainly with respect to
Perl 5.8, though PCRE versions 7.0 and later contain some features that are
expected to be in the forthcoming Perl 5.10.
</P>
<P>
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
it does have are given in the
<a href="pcre.html#utf8support">section on UTF-8 support</a>
in the main
<a href="pcre.html"><b>pcre</b></a>
page.
</P>
<P>
2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
them, but they do not mean what you might think. For example, (?!a){3} does
not assert that the next three characters are not "a". It just asserts that the
next character is not "a" three times.
</P>
<P>
3. Capturing subpatterns that occur inside negative lookahead assertions are
counted, but their entries in the offsets vector are never set. Perl sets its
numerical variables from any such patterns that are matched before the
assertion fails to match something (thereby succeeding), but only if the
negative lookahead assertion contains just one branch.
</P>
<P>
4. Though binary zero characters are supported in the subject string, they are
not allowed in a pattern string because it is passed as a normal C string,
terminated by zero. The escape sequence \0 can be used in the pattern to
represent a binary zero.
</P>
<P>
5. The following Perl escape sequences are not supported: \l, \u, \L,
\U, and \N. In fact these are implemented by Perl's general string-handling
and are not part of its pattern matching engine. If any of these are
encountered by PCRE, an error is generated.
</P>
<P>
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE is
built with Unicode character property support. The properties that can be
tested with \p and \P are limited to the general category properties such as
Lu and Nd, script names such as Greek or Han, and the derived properties Any
and L&.
</P>
<P>
7. PCRE does support the \Q...\E escape for quoting substrings. Characters in
between are treated as literals. This is slightly different from Perl in that $
and @ are also handled as literals inside the quotes. In Perl, they cause
variable interpolation (but of course PCRE does not have variables). Note the
following examples:
<pre>
Pattern PCRE matches Perl matches
\Qabc$xyz\E abc$xyz abc followed by the contents of $xyz
\Qabc\$xyz\E abc\$xyz abc\$xyz
\Qabc\E\$\Qxyz\E abc$xyz abc$xyz
</pre>
The \Q...\E sequence is recognized both inside and outside character classes.
</P>
<P>
8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
constructions. However, there is support for recursive patterns. This is not
available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
feature allows an external function to be called during pattern matching. See
the
<a href="pcrecallout.html"><b>pcrecallout</b></a>
documentation for details.
</P>
<P>
9. Subpatterns that are called recursively or as "subroutines" are always
treated as atomic groups in PCRE. This is like Python, but unlike Perl.
</P>
<P>
10. There are some differences that are concerned with the settings of captured
strings when part of a pattern is repeated. For example, matching "aba" against
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
</P>
<P>
11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
parentheses, PCRE does not set that capture group; this is different to Perl.
</P>
<P>
12. PCRE provides some extensions to the Perl regular expression facilities.
Perl 5.10 will include new features that are not in earlier versions, some of
which (such as named parentheses) have been in PCRE for some time. This list is
with respect to Perl 5.10:
<br>
<br>
(a) Although lookbehind assertions must match fixed length strings, each
alternative branch of a lookbehind assertion can match a different length of
string. Perl requires them all to have the same length.
<br>
<br>
(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
meta-character matches only at the very end of the string.
<br>
<br>
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
(Perl can be made to issue a warning.)
<br>
<br>
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
inverted, that is, by default they are not greedy, but if followed by a
question mark they are.
<br>
<br>
(e) PCRE_ANCHORED can be used at matching time to force a pattern to be tried
only at the first matching position in the subject string.
<br>
<br>
(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAPTURE
options for <b>pcre_exec()</b> have no Perl equivalents.
<br>
<br>
(g) The \R escape sequence can be restricted to match only CR, LF, or CRLF
by the PCRE_BSR_ANYCRLF option.
<br>
<br>
(h) The callout facility is PCRE-specific.
<br>
<br>
(i) The partial matching facility is PCRE-specific.
<br>
<br>
(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
different hosts that have the other endianness.
<br>
<br>
(k) The alternative matching function (<b>pcre_dfa_exec()</b>) matches in a
different way and is not Perl-compatible.
<br>
<br>
(l) PCRE recognizes some special sequences such as (*CR) at the start of
a pattern that set overall options that cannot be changed within the pattern.
</P>
<br><b>
AUTHOR
</b><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 11 September 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,370 +0,0 @@
<html>
<head>
<title>pcrecpp specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcrecpp man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS OF C++ WRAPPER</a>
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
<li><a name="TOC3" href="#SEC3">MATCHING INTERFACE</a>
<li><a name="TOC4" href="#SEC4">QUOTING METACHARACTERS</a>
<li><a name="TOC5" href="#SEC5">PARTIAL MATCHES</a>
<li><a name="TOC6" href="#SEC6">UTF-8 AND THE MATCHING INTERFACE</a>
<li><a name="TOC7" href="#SEC7">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a>
<li><a name="TOC8" href="#SEC8">SCANNING TEXT INCREMENTALLY</a>
<li><a name="TOC9" href="#SEC9">PARSING HEX/OCTAL/C-RADIX NUMBERS</a>
<li><a name="TOC10" href="#SEC10">REPLACING PARTS OF STRINGS</a>
<li><a name="TOC11" href="#SEC11">AUTHOR</a>
<li><a name="TOC12" href="#SEC12">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS OF C++ WRAPPER</a><br>
<P>
<b>#include &#60;pcrecpp.h&#62;</b>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
The C++ wrapper for PCRE was provided by Google Inc. Some additional
functionality was added by Giuseppe Maxia. This brief man page was constructed
from the notes in the <i>pcrecpp.h</i> file, which should be consulted for
further details.
</P>
<br><a name="SEC3" href="#TOC1">MATCHING INTERFACE</a><br>
<P>
The "FullMatch" operation checks that supplied text matches a supplied pattern
exactly. If pointer arguments are supplied, it copies matched sub-strings that
match sub-patterns into them.
<pre>
Example: successful match
pcrecpp::RE re("h.*o");
re.FullMatch("hello");
Example: unsuccessful match (requires full match):
pcrecpp::RE re("e");
!re.FullMatch("hello");
Example: creating a temporary RE object:
pcrecpp::RE("h.*o").FullMatch("hello");
</pre>
You can pass in a "const char*" or a "string" for "text". The examples below
tend to use a const char*. You can, as in the different examples above, store
the RE object explicitly in a variable or use a temporary RE object. The
examples below use one mode or the other arbitrarily. Either could correctly be
used for any of these examples.
</P>
<P>
You must supply extra pointer arguments to extract matched subpieces.
<pre>
Example: extracts "ruby" into "s" and 1234 into "i"
int i;
string s;
pcrecpp::RE re("(\\w+):(\\d+)");
re.FullMatch("ruby:1234", &s, &i);
Example: does not try to extract any extra sub-patterns
re.FullMatch("ruby:1234", &s);
Example: does not try to extract into NULL
re.FullMatch("ruby:1234", NULL, &i);
Example: integer overflow causes failure
!re.FullMatch("ruby:1234567891234", NULL, &i);
Example: fails because there aren't enough sub-patterns:
!pcrecpp::RE("\\w+:\\d+").FullMatch("ruby:1234", &s);
Example: fails because string cannot be stored in integer
!pcrecpp::RE("(.*)").FullMatch("ruby", &i);
</pre>
The provided pointer arguments can be pointers to any scalar numeric
type, or one of:
<pre>
string (matched piece is copied to string)
StringPiece (StringPiece is mutated to point to matched piece)
T (where "bool T::ParseFrom(const char*, int)" exists)
NULL (the corresponding matched sub-pattern is not copied)
</pre>
The function returns true iff all of the following conditions are satisfied:
<pre>
a. "text" matches "pattern" exactly;
b. The number of matched sub-patterns is &#62;= number of supplied
pointers;
c. The "i"th argument has a suitable type for holding the
string captured as the "i"th sub-pattern. If you pass in
void * NULL for the "i"th argument, or a non-void * NULL
of the correct type, or pass fewer arguments than the
number of sub-patterns, "i"th captured sub-pattern is
ignored.
</pre>
CAVEAT: An optional sub-pattern that does not exist in the matched
string is assigned the empty string. Therefore, the following will
return false (because the empty string is not a valid number):
<pre>
int number;
pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
</pre>
The matching interface supports at most 16 arguments per call.
If you need more, consider using the more general interface
<b>pcrecpp::RE::DoMatch</b>. See <b>pcrecpp.h</b> for the signature for
<b>DoMatch</b>.
</P>
<P>
NOTE: Do not use <b>no_arg</b>, which is used internally to mark the end of a
list of optional arguments, as a placeholder for missing arguments, as this can
lead to segfaults.
</P>
<br><a name="SEC4" href="#TOC1">QUOTING METACHARACTERS</a><br>
<P>
You can use the "QuoteMeta" operation to insert backslashes before all
potentially meaningful characters in a string. The returned string, used as a
regular expression, will exactly match the original string.
<pre>
Example:
string quoted = RE::QuoteMeta(unquoted);
</pre>
Note that it's legal to escape a character even if it has no special meaning in
a regular expression -- so this function does that. (This also makes it
identical to the perl function of the same name; see "perldoc -f quotemeta".)
For example, "1.5-2.0?" becomes "1\.5\-2\.0\?".
</P>
<br><a name="SEC5" href="#TOC1">PARTIAL MATCHES</a><br>
<P>
You can use the "PartialMatch" operation when you want the pattern
to match any substring of the text.
<pre>
Example: simple search for a string:
pcrecpp::RE("ell").PartialMatch("hello");
Example: find first number in a string:
int number;
pcrecpp::RE re("(\\d+)");
re.PartialMatch("x*100 + 20", &number);
assert(number == 100);
</PRE>
</P>
<br><a name="SEC6" href="#TOC1">UTF-8 AND THE MATCHING INTERFACE</a><br>
<P>
By default, pattern and text are plain text, one byte per character. The UTF8
flag, passed to the constructor, causes both pattern and string to be treated
as UTF-8 text, still a byte stream but potentially multiple bytes per
character. In practice, the text is likelier to be UTF-8 than the pattern, but
the match returned may depend on the UTF8 flag, so always use it when matching
UTF8 text. For example, "." will match one byte normally but with UTF8 set may
match up to three bytes of a multi-byte character.
<pre>
Example:
pcrecpp::RE_Options options;
options.set_utf8();
pcrecpp::RE re(utf8_pattern, options);
re.FullMatch(utf8_string);
Example: using the convenience function UTF8():
pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8());
re.FullMatch(utf8_string);
</pre>
NOTE: The UTF8 flag is ignored if pcre was not configured with the
<pre>
--enable-utf8 flag.
</PRE>
</P>
<br><a name="SEC7" href="#TOC1">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a><br>
<P>
PCRE defines some modifiers to change the behavior of the regular expression
engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to
pass such modifiers to a RE class. Currently, the following modifiers are
supported:
<pre>
modifier description Perl corresponding
PCRE_CASELESS case insensitive match /i
PCRE_MULTILINE multiple lines match /m
PCRE_DOTALL dot matches newlines /s
PCRE_DOLLAR_ENDONLY $ matches only at end N/A
PCRE_EXTRA strict escape parsing N/A
PCRE_EXTENDED ignore whitespaces /x
PCRE_UTF8 handles UTF8 chars built-in
PCRE_UNGREEDY reverses * and *? N/A
PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*)
</pre>
(*) Both Perl and PCRE allow non capturing parentheses by means of the
"?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
capture, while (ab|cd) does.
</P>
<P>
For a full account of how each modifier works, please check the
PCRE API reference page.
</P>
<P>
For each modifier, there are two member functions whose name is made
out of the modifier in lowercase, without the "PCRE_" prefix. For
instance, PCRE_CASELESS is handled by
<pre>
bool caseless()
</pre>
which returns true if the modifier is set, and
<pre>
RE_Options & set_caseless(bool)
</pre>
which sets or unsets the modifier. Moreover, PCRE_EXTRA_MATCH_LIMIT can be
accessed through the <b>set_match_limit()</b> and <b>match_limit()</b> member
functions. Setting <i>match_limit</i> to a non-zero value will limit the
execution of pcre to keep it from doing bad things like blowing the stack or
taking an eternity to return a result. A value of 5000 is good enough to stop
stack blowup in a 2MB thread stack. Setting <i>match_limit</i> to zero disables
match limiting. Alternatively, you can call <b>match_limit_recursion()</b>
which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much PCRE
recurses. <b>match_limit()</b> limits the number of matches PCRE does;
<b>match_limit_recursion()</b> limits the depth of internal recursion, and
therefore the amount of stack that is used.
</P>
<P>
Normally, to pass one or more modifiers to a RE class, you declare
a <i>RE_Options</i> object, set the appropriate options, and pass this
object to a RE constructor. Example:
<pre>
RE_Options opt;
opt.set_caseless(true);
if (RE("HELLO", opt).PartialMatch("hello world")) ...
</pre>
RE_Options has two constructors. The default constructor takes no arguments and
creates a set of flags that are off by default. The optional parameter
<i>option_flags</i> is to facilitate transfer of legacy code from C programs.
This lets you do
<pre>
RE(pattern,
RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str);
</pre>
However, new code is better off doing
<pre>
RE(pattern,
RE_Options().set_caseless(true).set_multiline(true))
.PartialMatch(str);
</pre>
If you are going to pass one of the most used modifiers, there are some
convenience functions that return a RE_Options class with the
appropriate modifier already set: <b>CASELESS()</b>, <b>UTF8()</b>,
<b>MULTILINE()</b>, <b>DOTALL</b>(), and <b>EXTENDED()</b>.
</P>
<P>
If you need to set several options at once, and you don't want to go through
the pains of declaring a RE_Options object and setting several options, there
is a parallel method that gives you such ability on the fly. You can concatenate
several <b>set_xxxxx()</b> member functions, since each of them returns a
reference to its class object. For example, to pass PCRE_CASELESS,
PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write:
<pre>
RE(" ^ xyz \\s+ .* blah$",
RE_Options()
.set_caseless(true)
.set_extended(true)
.set_multiline(true)).PartialMatch(sometext);
</PRE>
</P>
<br><a name="SEC8" href="#TOC1">SCANNING TEXT INCREMENTALLY</a><br>
<P>
The "Consume" operation may be useful if you want to repeatedly
match regular expressions at the front of a string and skip over
them as they match. This requires use of the "StringPiece" type,
which represents a sub-range of a real string. Like RE, StringPiece
is defined in the pcrecpp namespace.
<pre>
Example: read lines of the form "var = value" from a string.
string contents = ...; // Fill string somehow
pcrecpp::StringPiece input(contents); // Wrap in a StringPiece
</PRE>
</P>
<P>
<pre>
string var;
int value;
pcrecpp::RE re("(\\w+) = (\\d+)\n");
while (re.Consume(&input, &var, &value)) {
...;
}
</pre>
Each successful call to "Consume" will set "var/value", and also
advance "input" so it points past the matched text.
</P>
<P>
The "FindAndConsume" operation is similar to "Consume" but does not
anchor your match at the beginning of the string. For example, you
could extract all words from a string by repeatedly calling
<pre>
pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word)
</PRE>
</P>
<br><a name="SEC9" href="#TOC1">PARSING HEX/OCTAL/C-RADIX NUMBERS</a><br>
<P>
By default, if you pass a pointer to a numeric value, the
corresponding text is interpreted as a base-10 number. You can
instead wrap the pointer with a call to one of the operators Hex(),
Octal(), or CRadix() to interpret the text in another base. The
CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
prefixes, but defaults to base-10.
<pre>
Example:
int a, b, c, d;
pcrecpp::RE re("(.*) (.*) (.*) (.*)");
re.FullMatch("100 40 0100 0x40",
pcrecpp::Octal(&a), pcrecpp::Hex(&b),
pcrecpp::CRadix(&c), pcrecpp::CRadix(&d));
</pre>
will leave 64 in a, b, c, and d.
</P>
<br><a name="SEC10" href="#TOC1">REPLACING PARTS OF STRINGS</a><br>
<P>
You can replace the first match of "pattern" in "str" with "rewrite".
Within "rewrite", backslash-escaped digits (\1 to \9) can be
used to insert text matching corresponding parenthesized group
from the pattern. \0 in "rewrite" refers to the entire matching
text. For example:
<pre>
string s = "yabba dabba doo";
pcrecpp::RE("b+").Replace("d", &s);
</pre>
will leave "s" containing "yada dabba doo". The result is true if the pattern
matches and a replacement occurs, false otherwise.
</P>
<P>
<b>GlobalReplace</b> is like <b>Replace</b> except that it replaces all
occurrences of the pattern in the string with the rewrite. Replacements are
not subject to re-matching. For example:
<pre>
string s = "yabba dabba doo";
pcrecpp::RE("b+").GlobalReplace("d", &s);
</pre>
will leave "s" containing "yada dada doo". It returns the number of
replacements made.
</P>
<P>
<b>Extract</b> is like <b>Replace</b>, except that if the pattern matches,
"rewrite" is copied into "out" (an additional argument) with substitutions.
The non-matching portions of "text" are ignored. Returns true iff a match
occurred and the extraction happened successfully; if no match occurs, the
string is left unaffected.
</P>
<br><a name="SEC11" href="#TOC1">AUTHOR</a><br>
<P>
The C++ wrapper was contributed by Google Inc.
<br>
Copyright &copy; 2007 Google Inc.
<br>
</P>
<br><a name="SEC12" href="#TOC1">REVISION</a><br>
<P>
Last updated: 17 March 2009
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,533 +0,0 @@
<html>
<head>
<title>pcregrep specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcregrep man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
<li><a name="TOC3" href="#SEC3">SUPPORT FOR COMPRESSED FILES</a>
<li><a name="TOC4" href="#SEC4">OPTIONS</a>
<li><a name="TOC5" href="#SEC5">ENVIRONMENT VARIABLES</a>
<li><a name="TOC6" href="#SEC6">NEWLINES</a>
<li><a name="TOC7" href="#SEC7">OPTIONS COMPATIBILITY</a>
<li><a name="TOC8" href="#SEC8">OPTIONS WITH DATA</a>
<li><a name="TOC9" href="#SEC9">MATCHING ERRORS</a>
<li><a name="TOC10" href="#SEC10">DIAGNOSTICS</a>
<li><a name="TOC11" href="#SEC11">SEE ALSO</a>
<li><a name="TOC12" href="#SEC12">AUTHOR</a>
<li><a name="TOC13" href="#SEC13">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
<b>pcregrep [options] [long options] [pattern] [path1 path2 ...]</b>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
<b>pcregrep</b> searches files for character patterns, in the same way as other
grep commands do, but it uses the PCRE regular expression library to support
patterns that are compatible with the regular expressions of Perl 5. See
<a href="pcrepattern.html"><b>pcrepattern</b>(3)</a>
for a full description of syntax and semantics of the regular expressions
that PCRE supports.
</P>
<P>
Patterns, whether supplied on the command line or in a separate file, are given
without delimiters. For example:
<pre>
pcregrep Thursday /etc/motd
</pre>
If you attempt to use delimiters (for example, by surrounding a pattern with
slashes, as is common in Perl scripts), they are interpreted as part of the
pattern. Quotes can of course be used to delimit patterns on the command line
because they are interpreted by the shell, and indeed they are required if a
pattern contains white space or shell metacharacters.
</P>
<P>
The first argument that follows any option settings is treated as the single
pattern to be matched when neither <b>-e</b> nor <b>-f</b> is present.
Conversely, when one or both of these options are used to specify patterns, all
arguments are treated as path names. At least one of <b>-e</b>, <b>-f</b>, or an
argument pattern must be provided.
</P>
<P>
If no files are specified, <b>pcregrep</b> reads the standard input. The
standard input can also be referenced by a name consisting of a single hyphen.
For example:
<pre>
pcregrep some-pattern /file1 - /file3
</pre>
By default, each line that matches a pattern is copied to the standard
output, and if there is more than one file, the file name is output at the
start of each line, followed by a colon. However, there are options that can
change how <b>pcregrep</b> behaves. In particular, the <b>-M</b> option makes it
possible to search for patterns that span line boundaries. What defines a line
boundary is controlled by the <b>-N</b> (<b>--newline</b>) option.
</P>
<P>
Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
BUFSIZ is defined in <b>&#60;stdio.h&#62;</b>. When there is more than one pattern
(specified by the use of <b>-e</b> and/or <b>-f</b>), each pattern is applied to
each line in the order in which they are defined, except that all the <b>-e</b>
patterns are tried before the <b>-f</b> patterns.
</P>
<P>
By default, as soon as one pattern matches (or fails to match when <b>-v</b> is
used), no further patterns are considered. However, if <b>--colour</b> (or
<b>--color</b>) is used to colour the matching substrings, or if
<b>--only-matching</b>, <b>--file-offsets</b>, or <b>--line-offsets</b> is used to
output only the part of the line that matched (either shown literally, or as an
offset), scanning resumes immediately following the match, so that further
matches on the same line can be found. If there are multiple patterns, they are
all tried on the remainder of the line, but patterns that follow the one that
matched are not tried on the earlier part of the line.
</P>
<P>
This is the same behaviour as GNU grep, but it does mean that the order in
which multiple patterns are specified can affect the output when one of the
above options is used.
</P>
<P>
Patterns that can match an empty string are accepted, but empty string
matches are not recognized. An example is the pattern "(super)?(man)?", in
which all components are optional. This pattern finds all occurrences of both
"super" and "man"; the output differs from matching with "super|man" when only
the matching substrings are being shown.
</P>
<P>
If the <b>LC_ALL</b> or <b>LC_CTYPE</b> environment variable is set,
<b>pcregrep</b> uses the value to set a locale when calling the PCRE library.
The <b>--locale</b> option can be used to override this.
</P>
<br><a name="SEC3" href="#TOC1">SUPPORT FOR COMPRESSED FILES</a><br>
<P>
It is possible to compile <b>pcregrep</b> so that it uses <b>libz</b> or
<b>libbz2</b> to read files whose names end in <b>.gz</b> or <b>.bz2</b>,
respectively. You can find out whether your binary has support for one or both
of these file types by running it with the <b>--help</b> option. If the
appropriate support is not present, files are treated as plain text. The
standard input is always so treated.
</P>
<br><a name="SEC4" href="#TOC1">OPTIONS</a><br>
<P>
<b>--</b>
This terminates the list of options. It is useful if the next item on the
command line starts with a hyphen but is not an option. This allows for the
processing of patterns and filenames that start with hyphens.
</P>
<P>
<b>-A</b> <i>number</i>, <b>--after-context=</b><i>number</i>
Output <i>number</i> lines of context after each matching line. If filenames
and/or line numbers are being output, a hyphen separator is used instead of a
colon for the context lines. A line containing "--" is output between each
group of lines, unless they are in fact contiguous in the input file. The value
of <i>number</i> is expected to be relatively small. However, <b>pcregrep</b>
guarantees to have up to 8K of following text available for context output.
</P>
<P>
<b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
Output <i>number</i> lines of context before each matching line. If filenames
and/or line numbers are being output, a hyphen separator is used instead of a
colon for the context lines. A line containing "--" is output between each
group of lines, unless they are in fact contiguous in the input file. The value
of <i>number</i> is expected to be relatively small. However, <b>pcregrep</b>
guarantees to have up to 8K of preceding text available for context output.
</P>
<P>
<b>-C</b> <i>number</i>, <b>--context=</b><i>number</i>
Output <i>number</i> lines of context both before and after each matching line.
This is equivalent to setting both <b>-A</b> and <b>-B</b> to the same value.
</P>
<P>
<b>-c</b>, <b>--count</b>
Do not output individual lines; instead just output a count of the number of
lines that would otherwise have been output. If several files are given, a
count is output for each of them. In this mode, the <b>-A</b>, <b>-B</b>, and
<b>-C</b> options are ignored.
</P>
<P>
<b>--colour</b>, <b>--color</b>
If this option is given without any data, it is equivalent to "--colour=auto".
If data is required, it must be given in the same shell item, separated by an
equals sign.
</P>
<P>
<b>--colour=</b><i>value</i>, <b>--color=</b><i>value</i>
This option specifies under what circumstances the parts of a line that matched
a pattern should be coloured in the output. By default, the output is not
coloured. The value (which is optional, see above) may be "never", "always", or
"auto". In the latter case, colouring happens only if the standard output is
connected to a terminal. More resources are used when colouring is enabled,
because <b>pcregrep</b> has to search for all possible matches in a line, not
just one, in order to colour them all.
</P>
<P>
The colour that is used can be specified by setting the environment variable
PCREGREP_COLOUR or PCREGREP_COLOR. The value of this variable should be a
string of two numbers, separated by a semicolon. They are copied directly into
the control string for setting colour on a terminal, so it is your
responsibility to ensure that they make sense. If neither of the environment
variables is set, the default is "1;31", which gives red.
</P>
<P>
<b>-D</b> <i>action</i>, <b>--devices=</b><i>action</i>
If an input path is not a regular file or a directory, "action" specifies how
it is to be processed. Valid values are "read" (the default) or "skip"
(silently skip the path).
</P>
<P>
<b>-d</b> <i>action</i>, <b>--directories=</b><i>action</i>
If an input path is a directory, "action" specifies how it is to be processed.
Valid values are "read" (the default), "recurse" (equivalent to the <b>-r</b>
option), or "skip" (silently skip the path). In the default case, directories
are read as if they were ordinary files. In some operating systems the effect
of reading a directory like this is an immediate end-of-file.
</P>
<P>
<b>-e</b> <i>pattern</i>, <b>--regex=</b><i>pattern</i>, <b>--regexp=</b><i>pattern</i>
Specify a pattern to be matched. This option can be used multiple times in
order to specify several patterns. It can also be used as a way of specifying a
single pattern that starts with a hyphen. When <b>-e</b> is used, no argument
pattern is taken from the command line; all arguments are treated as file
names. There is an overall maximum of 100 patterns. They are applied to each
line in the order in which they are defined until one matches (or fails to
match if <b>-v</b> is used). If <b>-f</b> is used with <b>-e</b>, the command line
patterns are matched first, followed by the patterns from the file, independent
of the order in which these options are specified. Note that multiple use of
<b>-e</b> is not the same as a single pattern with alternatives. For example,
X|Y finds the first character in a line that is X or Y, whereas if the two
patterns are given separately, <b>pcregrep</b> finds X if it is present, even if
it follows Y in the line. It finds Y only if there is no X in the line. This
really matters only if you are using <b>-o</b> to show the part(s) of the line
that matched.
</P>
<P>
<b>--exclude</b>=<i>pattern</i>
When <b>pcregrep</b> is searching the files in a directory as a consequence of
the <b>-r</b> (recursive search) option, any regular files whose names match the
pattern are excluded. Subdirectories are not excluded by this option; they are
searched recursively, subject to the <b>--exclude_dir</b> and
<b>--include_dir</b> options. The pattern is a PCRE regular expression, and is
matched against the final component of the file name (not the entire path). If
a file name matches both <b>--include</b> and <b>--exclude</b>, it is excluded.
There is no short form for this option.
</P>
<P>
<b>--exclude_dir</b>=<i>pattern</i>
When <b>pcregrep</b> is searching the contents of a directory as a consequence
of the <b>-r</b> (recursive search) option, any subdirectories whose names match
the pattern are excluded. (Note that the <b>--exclude</b> option does not affect
subdirectories.) The pattern is a PCRE regular expression, and is matched
against the final component of the name (not the entire path). If a
subdirectory name matches both <b>--include_dir</b> and <b>--exclude_dir</b>, it
is excluded. There is no short form for this option.
</P>
<P>
<b>-F</b>, <b>--fixed-strings</b>
Interpret each pattern as a list of fixed strings, separated by newlines,
instead of as a regular expression. The <b>-w</b> (match as a word) and <b>-x</b>
(match whole line) options can be used with <b>-F</b>. They apply to each of the
fixed strings. A line is selected if any of the fixed strings are found in it
(subject to <b>-w</b> or <b>-x</b>, if present).
</P>
<P>
<b>-f</b> <i>filename</i>, <b>--file=</b><i>filename</i>
Read a number of patterns from the file, one per line, and match them against
each line of input. A data line is output if any of the patterns match it. The
filename can be given as "-" to refer to the standard input. When <b>-f</b> is
used, patterns specified on the command line using <b>-e</b> may also be
present; they are tested before the file's patterns. However, no other pattern
is taken from the command line; all arguments are treated as file names. There
is an overall maximum of 100 patterns. Trailing white space is removed from
each line, and blank lines are ignored. An empty file contains no patterns and
therefore matches nothing. See also the comments about multiple patterns versus
a single pattern with alternatives in the description of <b>-e</b> above.
</P>
<P>
<b>--file-offsets</b>
Instead of showing lines or parts of lines that match, show each match as an
offset from the start of the file and a length, separated by a comma. In this
mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b>
options are ignored. If there is more than one match in a line, each of them is
shown separately. This option is mutually exclusive with <b>--line-offsets</b>
and <b>--only-matching</b>.
</P>
<P>
<b>-H</b>, <b>--with-filename</b>
Force the inclusion of the filename at the start of output lines when searching
a single file. By default, the filename is not shown in this case. For matching
lines, the filename is followed by a colon; for context lines, a hyphen
separator is used. If a line number is also being output, it follows the file
name.
</P>
<P>
<b>-h</b>, <b>--no-filename</b>
Suppress the output filenames when searching multiple files. By default,
filenames are shown when multiple files are searched. For matching lines, the
filename is followed by a colon; for context lines, a hyphen separator is used.
If a line number is also being output, it follows the file name.
</P>
<P>
<b>--help</b>
Output a help message, giving brief details of the command options and file
type support, and then exit.
</P>
<P>
<b>-i</b>, <b>--ignore-case</b>
Ignore upper/lower case distinctions during comparisons.
</P>
<P>
<b>--include</b>=<i>pattern</i>
When <b>pcregrep</b> is searching the files in a directory as a consequence of
the <b>-r</b> (recursive search) option, only those regular files whose names
match the pattern are included. Subdirectories are always included and searched
recursively, subject to the <b>--include_dir</b> and <b>--exclude_dir</b>
options. The pattern is a PCRE regular expression, and is matched against the
final component of the file name (not the entire path). If a file name matches
both <b>--include</b> and <b>--exclude</b>, it is excluded. There is no short
form for this option.
</P>
<P>
<b>--include_dir</b>=<i>pattern</i>
When <b>pcregrep</b> is searching the contents of a directory as a consequence
of the <b>-r</b> (recursive search) option, only those subdirectories whose
names match the pattern are included. (Note that the <b>--include</b> option
does not affect subdirectories.) The pattern is a PCRE regular expression, and
is matched against the final component of the name (not the entire path). If a
subdirectory name matches both <b>--include_dir</b> and <b>--exclude_dir</b>, it
is excluded. There is no short form for this option.
</P>
<P>
<b>-L</b>, <b>--files-without-match</b>
Instead of outputting lines from the files, just output the names of the files
that do not contain any lines that would have been output. Each file name is
output once, on a separate line.
</P>
<P>
<b>-l</b>, <b>--files-with-matches</b>
Instead of outputting lines from the files, just output the names of the files
containing lines that would have been output. Each file name is output
once, on a separate line. Searching stops as soon as a matching line is found
in a file.
</P>
<P>
<b>--label</b>=<i>name</i>
This option supplies a name to be used for the standard input when file names
are being output. If not supplied, "(standard input)" is used. There is no
short form for this option.
</P>
<P>
<b>--line-offsets</b>
Instead of showing lines or parts of lines that match, show each match as a
line number, the offset from the start of the line, and a length. The line
number is terminated by a colon (as usual; see the <b>-n</b> option), and the
offset and length are separated by a comma. In this mode, no context is shown.
That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are ignored. If there is
more than one match in a line, each of them is shown separately. This option is
mutually exclusive with <b>--file-offsets</b> and <b>--only-matching</b>.
</P>
<P>
<b>--locale</b>=<i>locale-name</i>
This option specifies a locale to be used for pattern matching. It overrides
the value in the <b>LC_ALL</b> or <b>LC_CTYPE</b> environment variables. If no
locale is specified, the PCRE library's default (usually the "C" locale) is
used. There is no short form for this option.
</P>
<P>
<b>-M</b>, <b>--multiline</b>
Allow patterns to match more than one line. When this option is given, patterns
may usefully contain literal newline characters and internal occurrences of ^
and $ characters. The output for any one match may consist of more than one
line. When this option is set, the PCRE library is called in "multiline" mode.
There is a limit to the number of lines that can be matched, imposed by the way
that <b>pcregrep</b> buffers the input file as it scans it. However,
<b>pcregrep</b> ensures that at least 8K characters or the rest of the document
(whichever is the shorter) are available for forward matching, and similarly
the previous 8K characters (or all the previous characters, if fewer than 8K)
are guaranteed to be available for lookbehind assertions.
</P>
<P>
<b>-N</b> <i>newline-type</i>, <b>--newline=</b><i>newline-type</i>
The PCRE library supports five different conventions for indicating
the ends of lines. They are the single-character sequences CR (carriage return)
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
which recognizes any of the preceding three types, and an "any" convention, in
which any Unicode line ending sequence is assumed to end a line. The Unicode
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
PS (paragraph separator, U+2029).
<br>
<br>
When the PCRE library is built, a default line-ending sequence is specified.
This is normally the standard sequence for the operating system. Unless
otherwise specified by this option, <b>pcregrep</b> uses the library's default.
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
makes it possible to use <b>pcregrep</b> on files that have come from other
environments without having to modify their line endings. If the data that is
being scanned does not agree with the convention set by this option,
<b>pcregrep</b> may behave in strange ways.
</P>
<P>
<b>-n</b>, <b>--line-number</b>
Precede each output line by its line number in the file, followed by a colon
for matching lines or a hyphen for context lines. If the filename is also being
output, it precedes the line number. This option is forced if
<b>--line-offsets</b> is used.
</P>
<P>
<b>-o</b>, <b>--only-matching</b>
Show only the part of the line that matched a pattern. In this mode, no
context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are
ignored. If there is more than one match in a line, each of them is shown
separately. If <b>-o</b> is combined with <b>-v</b> (invert the sense of the
match to find non-matching lines), no output is generated, but the return code
is set appropriately. This option is mutually exclusive with
<b>--file-offsets</b> and <b>--line-offsets</b>.
</P>
<P>
<b>-q</b>, <b>--quiet</b>
Work quietly, that is, display nothing except error messages. The exit
status indicates whether or not any matches were found.
</P>
<P>
<b>-r</b>, <b>--recursive</b>
If any given path is a directory, recursively scan the files it contains,
taking note of any <b>--include</b> and <b>--exclude</b> settings. By default, a
directory is read as a normal file; in some operating systems this gives an
immediate end-of-file. This option is a shorthand for setting the <b>-d</b>
option to "recurse".
</P>
<P>
<b>-s</b>, <b>--no-messages</b>
Suppress error messages about non-existent or unreadable files. Such files are
quietly skipped. However, the return code is still 2, even if matches were
found in other files.
</P>
<P>
<b>-u</b>, <b>--utf-8</b>
Operate in UTF-8 mode. This option is available only if PCRE has been compiled
with UTF-8 support. Both patterns and subject lines must be valid strings of
UTF-8 characters.
</P>
<P>
<b>-V</b>, <b>--version</b>
Write the version numbers of <b>pcregrep</b> and the PCRE library that is being
used to the standard error stream.
</P>
<P>
<b>-v</b>, <b>--invert-match</b>
Invert the sense of the match, so that lines which do <i>not</i> match any of
the patterns are the ones that are found.
</P>
<P>
<b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
Force the patterns to match only whole words. This is equivalent to having \b
at the start and end of the pattern.
</P>
<P>
<b>-x</b>, <b>--line-regex</b>, <b>--line-regexp</b>
Force the patterns to be anchored (each must start matching at the beginning of
a line) and in addition, require them to match entire lines. This is
equivalent to having ^ and $ characters at the start and end of each
alternative branch in every pattern.
</P>
<br><a name="SEC5" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
<P>
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
order, for a locale. The first one that is set is used. This can be overridden
by the <b>--locale</b> option. If no locale is set, the PCRE library's default
(usually the "C" locale) is used.
</P>
<br><a name="SEC6" href="#TOC1">NEWLINES</a><br>
<P>
The <b>-N</b> (<b>--newline</b>) option allows <b>pcregrep</b> to scan files with
different newline conventions from the default. However, the setting of this
option does not affect the way in which <b>pcregrep</b> writes information to
the standard error and output streams. It uses the string "\n" in C
<b>printf()</b> calls to indicate newlines, relying on the C I/O library to
convert this to an appropriate sequence if the output is sent to a file.
</P>
<br><a name="SEC7" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
<P>
The majority of short and long forms of <b>pcregrep</b>'s options are the same
as in the GNU <b>grep</b> program. Any long option of the form
<b>--xxx-regexp</b> (GNU terminology) is also available as <b>--xxx-regex</b>
(PCRE terminology). However, the <b>--locale</b>, <b>-M</b>, <b>--multiline</b>,
<b>-u</b>, and <b>--utf-8</b> options are specific to <b>pcregrep</b>.
</P>
<br><a name="SEC8" href="#TOC1">OPTIONS WITH DATA</a><br>
<P>
There are four different ways in which an option with data can be specified.
If a short form option is used, the data may follow immediately, or in the next
command line item. For example:
<pre>
-f/some/file
-f /some/file
</pre>
If a long form option is used, the data may appear in the same command line
item, separated by an equals character, or (with one exception) it may appear
in the next command line item. For example:
<pre>
--file=/some/file
--file /some/file
</pre>
Note, however, that if you want to supply a file name beginning with ~ as data
in a shell command, and have the shell expand ~ to a home directory, you must
separate the file name from the option, because the shell does not treat ~
specially unless it is at the start of an item.
</P>
<P>
The exception to the above is the <b>--colour</b> (or <b>--color</b>) option,
for which the data is optional. If this option does have data, it must be given
in the first form, using an equals character. Otherwise it will be assumed that
it has no data.
</P>
<br><a name="SEC9" href="#TOC1">MATCHING ERRORS</a><br>
<P>
It is possible to supply a regular expression that takes a very long time to
fail to match certain lines. Such patterns normally involve nested indefinite
repeats, for example: (a+)*\d when matched against a line of a's with no final
digit. The PCRE matching function has a resource limit that causes it to abort
in these circumstances. If this happens, <b>pcregrep</b> outputs an error
message and the line that caused the problem to the standard error stream. If
there are more than 20 such errors, <b>pcregrep</b> gives up.
</P>
<br><a name="SEC10" href="#TOC1">DIAGNOSTICS</a><br>
<P>
Exit status is 0 if any matches were found, 1 if no matches were found, and 2
for syntax errors and non-existent or inaccessible files (even if matches were
found in other files) or too many matching errors. Using the <b>-s</b> option to
suppress error messages about inaccessible files does not affect the return
code.
</P>
<br><a name="SEC11" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcrepattern</b>(3), <b>pcretest</b>(1).
</P>
<br><a name="SEC12" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
<P>
Last updated: 01 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,224 +0,0 @@
<html>
<head>
<title>pcrematching specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcrematching man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">PCRE MATCHING ALGORITHMS</a>
<li><a name="TOC2" href="#SEC2">REGULAR EXPRESSIONS AS TREES</a>
<li><a name="TOC3" href="#SEC3">THE STANDARD MATCHING ALGORITHM</a>
<li><a name="TOC4" href="#SEC4">THE ALTERNATIVE MATCHING ALGORITHM</a>
<li><a name="TOC5" href="#SEC5">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a>
<li><a name="TOC6" href="#SEC6">DISADVANTAGES OF THE ALTERNATIVE ALGORITHM</a>
<li><a name="TOC7" href="#SEC7">AUTHOR</a>
<li><a name="TOC8" href="#SEC8">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE MATCHING ALGORITHMS</a><br>
<P>
This document describes the two different algorithms that are available in PCRE
for matching a compiled regular expression against a given subject string. The
"standard" algorithm is the one provided by the <b>pcre_exec()</b> function.
This works in the same way as Perl's matching function, and provides a
Perl-compatible matching operation.
</P>
<P>
An alternative algorithm is provided by the <b>pcre_dfa_exec()</b> function;
this operates in a different way, and is not Perl-compatible. It has advantages
and disadvantages compared with the standard algorithm, and these are described
below.
</P>
<P>
When there is only one possible way in which a given subject string can match a
pattern, the two algorithms give the same answer. A difference arises, however,
when there are multiple possibilities. For example, if the pattern
<pre>
^&#60;.*&#62;
</pre>
is matched against the string
<pre>
&#60;something&#62; &#60;something else&#62; &#60;something further&#62;
</pre>
there are three possible answers. The standard algorithm finds only one of
them, whereas the alternative algorithm finds all three.
</P>
<br><a name="SEC2" href="#TOC1">REGULAR EXPRESSIONS AS TREES</a><br>
<P>
The set of strings that are matched by a regular expression can be represented
as a tree structure. An unlimited repetition in the pattern makes the tree of
infinite size, but it is still a tree. Matching the pattern to a given subject
string (from a given starting point) can be thought of as a search of the tree.
There are two ways to search a tree: depth-first and breadth-first, and these
correspond to the two matching algorithms provided by PCRE.
</P>
<br><a name="SEC3" href="#TOC1">THE STANDARD MATCHING ALGORITHM</a><br>
<P>
In the terminology of Jeffrey Friedl's book "Mastering Regular
Expressions", the standard algorithm is an "NFA algorithm". It conducts a
depth-first search of the pattern tree. That is, it proceeds along a single
path through the tree, checking that the subject matches what is required. When
there is a mismatch, the algorithm tries any alternatives at the current point,
and if they all fail, it backs up to the previous branch point in the tree, and
tries the next alternative branch at that level. This often involves backing up
(moving to the left) in the subject string as well. The order in which
repetition branches are tried is controlled by the greedy or ungreedy nature of
the quantifier.
</P>
<P>
If a leaf node is reached, a matching string has been found, and at that point
the algorithm stops. Thus, if there is more than one possible match, this
algorithm returns the first one that it finds. Whether this is the shortest,
the longest, or some intermediate length depends on the way the greedy and
ungreedy repetition quantifiers are specified in the pattern.
</P>
<P>
Because it ends up with a single path through the tree, it is relatively
straightforward for this algorithm to keep track of the substrings that are
matched by portions of the pattern in parentheses. This provides support for
capturing parentheses and back references.
</P>
<br><a name="SEC4" href="#TOC1">THE ALTERNATIVE MATCHING ALGORITHM</a><br>
<P>
This algorithm conducts a breadth-first search of the tree. Starting from the
first matching point in the subject, it scans the subject string from left to
right, once, character by character, and as it does this, it remembers all the
paths through the tree that represent valid matches. In Friedl's terminology,
this is a kind of "DFA algorithm", though it is not implemented as a
traditional finite state machine (it keeps multiple states active
simultaneously).
</P>
<P>
The scan continues until either the end of the subject is reached, or there are
no more unterminated paths. At this point, terminated paths represent the
different matching possibilities (if there are none, the match has failed).
Thus, if there is more than one possible match, this algorithm finds all of
them, and in particular, it finds the longest. In PCRE, there is an option to
stop the algorithm after the first match (which is necessarily the shortest)
has been found.
</P>
<P>
Note that all the matches that are found start at the same point in the
subject. If the pattern
<pre>
cat(er(pillar)?)
</pre>
is matched against the string "the caterpillar catchment", the result will be
the three strings "cat", "cater", and "caterpillar" that start at the fourth
character of the subject. The algorithm does not automatically move on to find
matches that start at later positions.
</P>
<P>
There are a number of features of PCRE regular expressions that are not
supported by the alternative matching algorithm. They are as follows:
</P>
<P>
1. Because the algorithm finds all possible matches, the greedy or ungreedy
nature of repetition quantifiers is not relevant. Greedy and ungreedy
quantifiers are treated in exactly the same way. However, possessive
quantifiers can make a difference when what follows could also match what is
quantified, for example in a pattern like this:
<pre>
^a++\w!
</pre>
This pattern matches "aaab!" but not "aaa!", which would be matched by a
non-possessive quantifier. Similarly, if an atomic group is present, it is
matched as if it were a standalone pattern at the current point, and the
longest match is then "locked in" for the rest of the overall pattern.
</P>
<P>
2. When dealing with multiple paths through the tree simultaneously, it is not
straightforward to keep track of captured substrings for the different matching
possibilities, and PCRE's implementation of this algorithm does not attempt to
do this. This means that no captured substrings are available.
</P>
<P>
3. Because no substrings are captured, back references within the pattern are
not supported, and cause errors if encountered.
</P>
<P>
4. For the same reason, conditional expressions that use a backreference as the
condition or test for a specific group recursion are not supported.
</P>
<P>
5. Because many paths through the tree may be active, the \K escape sequence,
which resets the start of the match when encountered (but may be on some paths
and not on others), is not supported. It causes an error if encountered.
</P>
<P>
6. Callouts are supported, but the value of the <i>capture_top</i> field is
always 1, and the value of the <i>capture_last</i> field is always -1.
</P>
<P>
7. The \C escape sequence, which (in the standard algorithm) matches a single
byte, even in UTF-8 mode, is not supported because the alternative algorithm
moves through the subject string one character at a time, for all active paths
through the tree.
</P>
<P>
8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
supported. (*FAIL) is supported, and behaves like a failing negative assertion.
</P>
<br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
<P>
Using the alternative matching algorithm provides the following advantages:
</P>
<P>
1. All possible matches (at a single point in the subject) are automatically
found, and in particular, the longest match is found. To find more than one
match using the standard algorithm, you have to do kludgy things with
callouts.
</P>
<P>
2. There is much better support for partial matching. The restrictions on the
content of the pattern that apply when using the standard algorithm for partial
matching do not apply to the alternative algorithm. For non-anchored patterns,
the starting position of a partial match is available.
</P>
<P>
3. Because the alternative algorithm scans the subject string just once, and
never needs to backtrack, it is possible to pass very long subject strings to
the matching function in several pieces, checking for partial matching each
time.
</P>
<br><a name="SEC6" href="#TOC1">DISADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
<P>
The alternative algorithm suffers from a number of disadvantages:
</P>
<P>
1. It is substantially slower than the standard algorithm. This is partly
because it has to search for all possible matches, but is also because it is
less susceptible to optimization.
</P>
<P>
2. Capturing parentheses and back references are not supported.
</P>
<P>
3. Although atomic groups are supported, their use does not provide the
performance advantage that it does for the standard algorithm.
</P>
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
<P>
Last updated: 19 April 2008
<br>
Copyright &copy; 1997-2008 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,242 +0,0 @@
<html>
<head>
<title>pcrepartial specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcrepartial man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">PARTIAL MATCHING IN PCRE</a>
<li><a name="TOC2" href="#SEC2">RESTRICTED PATTERNS FOR PCRE_PARTIAL</a>
<li><a name="TOC3" href="#SEC3">EXAMPLE OF PARTIAL MATCHING USING PCRETEST</a>
<li><a name="TOC4" href="#SEC4">MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PARTIAL MATCHING IN PCRE</a><br>
<P>
In normal use of PCRE, if the subject string that is passed to
<b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> matches as far as it goes, but is
too short to match the entire pattern, PCRE_ERROR_NOMATCH is returned. There
are circumstances where it might be helpful to distinguish this case from other
cases in which there is no match.
</P>
<P>
Consider, for example, an application where a human is required to type in data
for a field with specific formatting requirements. An example might be a date
in the form <i>ddmmmyy</i>, defined by this pattern:
<pre>
^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
</pre>
If the application sees the user's keystrokes one by one, and can check that
what has been typed so far is potentially valid, it is able to raise an error
as soon as a mistake is made, possibly beeping and not reflecting the
character that has been typed. This immediate feedback is likely to be a better
user interface than a check that is delayed until the entire string has been
entered.
</P>
<P>
PCRE supports the concept of partial matching by means of the PCRE_PARTIAL
option, which can be set when calling <b>pcre_exec()</b> or
<b>pcre_dfa_exec()</b>. When this flag is set for <b>pcre_exec()</b>, the return
code PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if at any time
during the matching process the last part of the subject string matched part of
the pattern. Unfortunately, for non-anchored matching, it is not possible to
obtain the position of the start of the partial match. No captured data is set
when PCRE_ERROR_PARTIAL is returned.
</P>
<P>
When PCRE_PARTIAL is set for <b>pcre_dfa_exec()</b>, the return code
PCRE_ERROR_NOMATCH is converted into PCRE_ERROR_PARTIAL if the end of the
subject is reached, there have been no complete matches, but there is still at
least one matching possibility. The portion of the string that provided the
partial match is set as the first matching string.
</P>
<P>
Using PCRE_PARTIAL disables one of PCRE's optimizations. PCRE remembers the
last literal byte in a pattern, and abandons matching immediately if such a
byte is not present in the subject string. This optimization cannot be used
for a subject string that might match only partially.
</P>
<br><a name="SEC2" href="#TOC1">RESTRICTED PATTERNS FOR PCRE_PARTIAL</a><br>
<P>
Because of the way certain internal optimizations are implemented in the
<b>pcre_exec()</b> function, the PCRE_PARTIAL option cannot be used with all
patterns. These restrictions do not apply when <b>pcre_dfa_exec()</b> is used.
For <b>pcre_exec()</b>, repeated single characters such as
<pre>
a{2,4}
</pre>
and repeated single metasequences such as
<pre>
\d+
</pre>
are not permitted if the maximum number of occurrences is greater than one.
Optional items such as \d? (where the maximum is one) are permitted.
Quantifiers with any values are permitted after parentheses, so the invalid
examples above can be coded thus:
<pre>
(a){2,4}
(\d)+
</pre>
These constructions run more slowly, but for the kinds of application that are
envisaged for this facility, this is not felt to be a major restriction.
</P>
<P>
If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
<b>pcre_exec()</b> returns the error code PCRE_ERROR_BADPARTIAL (-13).
You can use the PCRE_INFO_OKPARTIAL call to <b>pcre_fullinfo()</b> to find out
if a compiled pattern can be used for partial matching.
</P>
<br><a name="SEC3" href="#TOC1">EXAMPLE OF PARTIAL MATCHING USING PCRETEST</a><br>
<P>
If the escape sequence \P is present in a <b>pcretest</b> data line, the
PCRE_PARTIAL flag is used for the match. Here is a run of <b>pcretest</b> that
uses the date example quoted above:
<pre>
re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data&#62; 25jun04\P
0: 25jun04
1: jun
data&#62; 25dec3\P
Partial match
data&#62; 3ju\P
Partial match
data&#62; 3juj\P
No match
data&#62; j\P
No match
</pre>
The first data string is matched completely, so <b>pcretest</b> shows the
matched substrings. The remaining four strings do not match the complete
pattern, but the first two are partial matches. The same test, using
<b>pcre_dfa_exec()</b> matching (by means of the \D escape sequence), produces
the following output:
<pre>
re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data&#62; 25jun04\P\D
0: 25jun04
data&#62; 23dec3\P\D
Partial match: 23dec3
data&#62; 3ju\P\D
Partial match: 3ju
data&#62; 3juj\P\D
No match
data&#62; j\P\D
No match
</pre>
Notice that in this case the portion of the string that was matched is made
available.
</P>
<br><a name="SEC4" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()</a><br>
<P>
When a partial match has been found using <b>pcre_dfa_exec()</b>, it is possible
to continue the match by providing additional subject data and calling
<b>pcre_dfa_exec()</b> again with the same compiled regular expression, this
time setting the PCRE_DFA_RESTART option. You must also pass the same working
space as before, because this is where details of the previous partial match
are stored. Here is an example using <b>pcretest</b>, using the \R escape
sequence to set the PCRE_DFA_RESTART option (\P and \D are as above):
<pre>
re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data&#62; 23ja\P\D
Partial match: 23ja
data&#62; n05\R\D
0: n05
</pre>
The first call has "23ja" as the subject, and requests partial matching; the
second call has "n05" as the subject for the continued (restarted) match.
Notice that when the match is complete, only the last part is shown; PCRE does
not retain the previously partially-matched string. It is up to the calling
program to do that if it needs to.
</P>
<P>
You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
over multiple segments. This facility can be used to pass very long subject
strings to <b>pcre_dfa_exec()</b>. However, some care is needed for certain
types of pattern.
</P>
<P>
1. If the pattern contains tests for the beginning or end of a line, you need
to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropriate, when the
subject string for any call does not contain the beginning or end of a line.
</P>
<P>
2. If the pattern contains backward assertions (including \b or \B), you need
to arrange for some overlap in the subject strings to allow for this. For
example, you could pass the subject in chunks that are 500 bytes long, but in
a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
bytes at the start of the buffer.
</P>
<P>
3. Matching a subject string that is split into multiple segments does not
always produce exactly the same result as matching over one single long string.
The difference arises when there are multiple matching possibilities, because a
partial match result is given only when there are no completed matches in a
call to <b>pcre_dfa_exec()</b>. This means that as soon as the shortest match has
been found, continuation to a new subject segment is no longer possible.
Consider this <b>pcretest</b> example:
<pre>
re&#62; /dog(sbody)?/
data&#62; do\P\D
Partial match: do
data&#62; gsb\R\P\D
0: g
data&#62; dogsbody\D
0: dogsbody
1: dog
</pre>
The pattern matches the words "dog" or "dogsbody". When the subject is
presented in several parts ("do" and "gsb" being the first two) the match stops
when "dog" has been found, and it is not possible to continue. On the other
hand, if "dogsbody" is presented as a single string, both matches are found.
</P>
<P>
Because of this phenomenon, it does not usually make sense to end a pattern
that is going to be matched in this way with a variable repeat.
</P>
<P>
4. Patterns that contain alternatives at the top level which do not all
start with the same pattern item may not work as expected. For example,
consider this pattern:
<pre>
1234|3789
</pre>
If the first part of the subject is "ABC123", a partial match of the first
alternative is found at offset 3. There is no partial match for the second
alternative, because such a match does not start at the same point in the
subject string. Attempting to continue with the string "789" does not yield a
match because only those alternatives that match at one point in the subject
are remembered. The problem arises because the start of the second alternative
matches within the first alternative. There is no problem with anchored
patterns or patterns such as:
<pre>
1234|ABCD
</pre>
where no string can be a partial match for both alternatives.
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 04 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

File diff suppressed because it is too large Load Diff

View File

@ -1,173 +0,0 @@
<html>
<head>
<title>pcreperform specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcreperform man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
PCRE PERFORMANCE
</b><br>
<P>
Two aspects of performance are discussed below: memory usage and processing
time. The way you express your pattern as a regular expression can affect both
of them.
</P>
<br><b>
MEMORY USAGE
</b><br>
<P>
Patterns are compiled by PCRE into a reasonably efficient byte code, so that
most simple patterns do not use much memory. However, there is one case where
memory usage can be unexpectedly large. When a parenthesized subpattern has a
quantifier with a minimum greater than 1 and/or a limited maximum, the whole
subpattern is repeated in the compiled code. For example, the pattern
<pre>
(abc|def){2,4}
</pre>
is compiled as if it were
<pre>
(abc|def)(abc|def)((abc|def)(abc|def)?)?
</pre>
(Technical aside: It is done this way so that backtrack points within each of
the repetitions can be independently maintained.)
</P>
<P>
For regular expressions whose quantifiers use only small numbers, this is not
usually a problem. However, if the numbers are large, and particularly if such
repetitions are nested, the memory usage can become an embarrassment. For
example, the very simple pattern
<pre>
((ab){1,1000}c){1,3}
</pre>
uses 51K bytes when compiled. When PCRE is compiled with its default internal
pointer size of two bytes, the size limit on a compiled pattern is 64K, and
this is reached with the above pattern if the outer repetition is increased
from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
handle larger compiled patterns, but it is better to try to rewrite your
pattern to use less memory if you can.
</P>
<P>
One way of reducing the memory usage for such patterns is to make use of PCRE's
<a href="pcrepattern.html#subpatternsassubroutines">"subroutine"</a>
facility. Re-writing the above pattern as
<pre>
((ab)(?2){0,999}c)(?1){0,2}
</pre>
reduces the memory requirements to 18K, and indeed it remains under 20K even
with the outer repetition increased to 100. However, this pattern is not
exactly equivalent, because the "subroutine" calls are treated as
<a href="pcrepattern.html#atomicgroup">atomic groups</a>
into which there can be no backtracking if there is a subsequent matching
failure. Therefore, PCRE cannot do this kind of rewriting automatically.
Furthermore, there is a noticeable loss of speed when executing the modified
pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
speed is acceptable, this kind of rewriting will allow you to process patterns
that PCRE cannot otherwise handle.
</P>
<br><b>
PROCESSING TIME
</b><br>
<P>
Certain items in regular expression patterns are processed more efficiently
than others. It is more efficient to use a character class like [aeiou] than a
set of single-character alternatives such as (a|e|i|o|u). In general, the
simplest construction that provides the required behaviour is usually the most
efficient. Jeffrey Friedl's book contains a lot of useful general discussion
about optimizing regular expressions for efficient performance. This document
contains a few observations about PCRE.
</P>
<P>
Using Unicode character properties (the \p, \P, and \X escapes) is slow,
because PCRE has to scan a structure that contains data for over fifteen
thousand characters whenever it needs a character's property. If you can find
an alternative pattern that does not use character properties, it will probably
be faster.
</P>
<P>
When a pattern begins with .* not in parentheses, or in parentheses that are
not the subject of a backreference, and the PCRE_DOTALL option is set, the
pattern is implicitly anchored by PCRE, since it can match only at the start of
a subject string. However, if PCRE_DOTALL is not set, PCRE cannot make this
optimization, because the . metacharacter does not then match a newline, and if
the subject string contains newlines, the pattern may match from the character
immediately following one of them instead of from the very start. For example,
the pattern
<pre>
.*second
</pre>
matches the subject "first\nand second" (where \n stands for a newline
character), with the match starting at the seventh character. In order to do
this, PCRE has to retry the match starting after every newline in the subject.
</P>
<P>
If you are using such a pattern with subject strings that do not contain
newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
the pattern with ^.* or ^.*? to indicate explicit anchoring. That saves PCRE
from having to scan along the subject looking for a newline to restart at.
</P>
<P>
Beware of patterns that contain nested indefinite repeats. These can take a
long time to run when applied to a string that does not match. Consider the
pattern fragment
<pre>
^(a+)*
</pre>
This can match "aaaa" in 16 different ways, and this number increases very
rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
times, and for each of those cases other than 0 or 4, the + repeats can match
different numbers of times.) When the remainder of the pattern is such that the
entire match is going to fail, PCRE has in principle to try every possible
variation, and this can take an extremely long time, even for relatively short
strings.
</P>
<P>
An optimization catches some of the more simple cases such as
<pre>
(a+)*b
</pre>
where a literal character follows. Before embarking on the standard matching
procedure, PCRE checks that there is a "b" later in the subject string, and if
there is not, it fails the match immediately. However, when there is no
following literal this optimization cannot be used. You can see the difference
by comparing the behaviour of
<pre>
(a+)*\d
</pre>
with the pattern above. The pattern above (ending in the literal "b") gives a
failure almost instantly when applied to a whole line of "a" characters,
whereas the pattern ending in \d takes an appreciable time with strings longer
than about 20 characters.
</P>
<P>
In many cases, the solution to this kind of performance issue is to use an
atomic group or a possessive quantifier.
</P>
<br><b>
AUTHOR
</b><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 06 March 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,266 +0,0 @@
<html>
<head>
<title>pcreposix specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcreposix man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS OF POSIX API</a>
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
<li><a name="TOC3" href="#SEC3">COMPILING A PATTERN</a>
<li><a name="TOC4" href="#SEC4">MATCHING NEWLINE CHARACTERS</a>
<li><a name="TOC5" href="#SEC5">MATCHING A PATTERN</a>
<li><a name="TOC6" href="#SEC6">ERROR MESSAGES</a>
<li><a name="TOC7" href="#SEC7">MEMORY USAGE</a>
<li><a name="TOC8" href="#SEC8">AUTHOR</a>
<li><a name="TOC9" href="#SEC9">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS OF POSIX API</a><br>
<P>
<b>#include &#60;pcreposix.h&#62;</b>
</P>
<P>
<b>int regcomp(regex_t *<i>preg</i>, const char *<i>pattern</i>,</b>
<b>int <i>cflags</i>);</b>
</P>
<P>
<b>int regexec(regex_t *<i>preg</i>, const char *<i>string</i>,</b>
<b>size_t <i>nmatch</i>, regmatch_t <i>pmatch</i>[], int <i>eflags</i>);</b>
</P>
<P>
<b>size_t regerror(int <i>errcode</i>, const regex_t *<i>preg</i>,</b>
<b>char *<i>errbuf</i>, size_t <i>errbuf_size</i>);</b>
</P>
<P>
<b>void regfree(regex_t *<i>preg</i>);</b>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
This set of functions provides a POSIX-style API to the PCRE regular expression
package. See the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation for a description of PCRE's native API, which contains much
additional functionality.
</P>
<P>
The functions described here are just wrapper functions that ultimately call
the PCRE native API. Their prototypes are defined in the <b>pcreposix.h</b>
header file, and on Unix systems the library itself is called
<b>pcreposix.a</b>, so can be accessed by adding <b>-lpcreposix</b> to the
command for linking an application that uses them. Because the POSIX functions
call the native ones, it is also necessary to add <b>-lpcre</b>.
</P>
<P>
I have implemented only those POSIX option bits that can be reasonably mapped
to PCRE native options. In addition, the option REG_EXTENDED is defined with
the value zero. This has no effect, but since programs that are written to the
POSIX interface often use it, this makes it easier to slot in PCRE as a
replacement library. Other POSIX options are not even defined.
</P>
<P>
When PCRE is called via these functions, it is only the API that is POSIX-like
in style. The syntax and semantics of the regular expressions themselves are
still those of Perl, subject to the setting of various PCRE options, as
described below. "POSIX-like in style" means that the API approximates to the
POSIX definition; it is not fully POSIX-compatible, and in multi-byte encoding
domains it is probably even less compatible.
</P>
<P>
The header for these functions is supplied as <b>pcreposix.h</b> to avoid any
potential clash with other POSIX libraries. It can, of course, be renamed or
aliased as <b>regex.h</b>, which is the "correct" name. It provides two
structure types, <i>regex_t</i> for compiled internal forms, and
<i>regmatch_t</i> for returning captured substrings. It also defines some
constants whose names start with "REG_"; these are used for setting options and
identifying error codes.
</P>
<P>
</P>
<br><a name="SEC3" href="#TOC1">COMPILING A PATTERN</a><br>
<P>
The function <b>regcomp()</b> is called to compile a pattern into an
internal form. The pattern is a C string terminated by a binary zero, and
is passed in the argument <i>pattern</i>. The <i>preg</i> argument is a pointer
to a <b>regex_t</b> structure that is used as a base for storing information
about the compiled regular expression.
</P>
<P>
The argument <i>cflags</i> is either zero, or contains one or more of the bits
defined by the following macros:
<pre>
REG_DOTALL
</pre>
The PCRE_DOTALL option is set when the regular expression is passed for
compilation to the native function. Note that REG_DOTALL is not part of the
POSIX standard.
<pre>
REG_ICASE
</pre>
The PCRE_CASELESS option is set when the regular expression is passed for
compilation to the native function.
<pre>
REG_NEWLINE
</pre>
The PCRE_MULTILINE option is set when the regular expression is passed for
compilation to the native function. Note that this does <i>not</i> mimic the
defined POSIX behaviour for REG_NEWLINE (see the following section).
<pre>
REG_NOSUB
</pre>
The PCRE_NO_AUTO_CAPTURE option is set when the regular expression is passed
for compilation to the native function. In addition, when a pattern that is
compiled with this flag is passed to <b>regexec()</b> for matching, the
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings
are returned.
<pre>
REG_UTF8
</pre>
The PCRE_UTF8 option is set when the regular expression is passed for
compilation to the native function. This causes the pattern itself and all data
strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF8
is not part of the POSIX standard.
</P>
<P>
In the absence of these flags, no options are passed to the native function.
This means that the regex is compiled with PCRE default semantics. In
particular, the way it handles newline characters in the subject string is the
Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
<i>some</i> of the effects specified for REG_NEWLINE. It does not affect the way
newlines are matched by . (they aren't) or by a negative class such as [^a]
(they are).
</P>
<P>
The yield of <b>regcomp()</b> is zero on success, and non-zero otherwise. The
<i>preg</i> structure is filled in on success, and one member of the structure
is public: <i>re_nsub</i> contains the number of capturing subpatterns in
the regular expression. Various error codes are defined in the header file.
</P>
<br><a name="SEC4" href="#TOC1">MATCHING NEWLINE CHARACTERS</a><br>
<P>
This area is not simple, because POSIX and Perl take different views of things.
It is not possible to get PCRE to obey POSIX semantics, but then PCRE was never
intended to be a POSIX engine. The following table lists the different
possibilities for matching newline characters in PCRE:
<pre>
Default Change with
. matches newline no PCRE_DOTALL
newline matches [^a] yes not changeable
$ matches \n at end yes PCRE_DOLLAR_ENDONLY
$ matches \n in middle no PCRE_MULTILINE
^ matches \n in middle no PCRE_MULTILINE
</pre>
This is the equivalent table for POSIX:
<pre>
Default Change with
. matches newline yes REG_NEWLINE
newline matches [^a] yes REG_NEWLINE
$ matches \n at end no REG_NEWLINE
$ matches \n in middle no REG_NEWLINE
^ matches \n in middle no REG_NEWLINE
</pre>
PCRE's behaviour is the same as Perl's, except that there is no equivalent for
PCRE_DOLLAR_ENDONLY in Perl. In both PCRE and Perl, there is no way to stop
newline from matching [^a].
</P>
<P>
The default POSIX newline handling can be obtained by setting PCRE_DOTALL and
PCRE_DOLLAR_ENDONLY, but there is no way to make PCRE behave exactly as for the
REG_NEWLINE action.
</P>
<br><a name="SEC5" href="#TOC1">MATCHING A PATTERN</a><br>
<P>
The function <b>regexec()</b> is called to match a compiled pattern <i>preg</i>
against a given <i>string</i>, which is by default terminated by a zero byte
(but see REG_STARTEND below), subject to the options in <i>eflags</i>. These can
be:
<pre>
REG_NOTBOL
</pre>
The PCRE_NOTBOL option is set when calling the underlying PCRE matching
function.
<pre>
REG_NOTEMPTY
</pre>
The PCRE_NOTEMPTY option is set when calling the underlying PCRE matching
function. Note that REG_NOTEMPTY is not part of the POSIX standard. However,
setting this option can give more POSIX-like behaviour in some situations.
<pre>
REG_NOTEOL
</pre>
The PCRE_NOTEOL option is set when calling the underlying PCRE matching
function.
<pre>
REG_STARTEND
</pre>
The string is considered to start at <i>string</i> + <i>pmatch[0].rm_so</i> and
to have a terminating NUL located at <i>string</i> + <i>pmatch[0].rm_eo</i>
(there need not actually be a NUL at that location), regardless of the value of
<i>nmatch</i>. This is a BSD extension, compatible with but not specified by
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
intended to be portable to other systems. Note that a non-zero <i>rm_so</i> does
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
how it is matched.
</P>
<P>
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
<b>regexec()</b> are ignored.
</P>
<P>
Otherwise, the portion of the string that was matched, and also any captured
substrings, are returned via the <i>pmatch</i> argument, which points to an
array of <i>nmatch</i> structures of type <i>regmatch_t</i>, containing the
members <i>rm_so</i> and <i>rm_eo</i>. These contain the offset to the first
character of each substring and the offset to the first character after the end
of each substring, respectively. The 0th element of the vector relates to the
entire portion of <i>string</i> that was matched; subsequent elements relate to
the capturing subpatterns of the regular expression. Unused entries in the
array have both structure members set to -1.
</P>
<P>
A successful match yields a zero return; various error codes are defined in the
header file, of which REG_NOMATCH is the "expected" failure code.
</P>
<br><a name="SEC6" href="#TOC1">ERROR MESSAGES</a><br>
<P>
The <b>regerror()</b> function maps a non-zero errorcode from either
<b>regcomp()</b> or <b>regexec()</b> to a printable message. If <i>preg</i> is not
NULL, the error should have arisen from the use of that structure. A message
terminated by a binary zero is placed in <i>errbuf</i>. The length of the
message, including the zero, is limited to <i>errbuf_size</i>. The yield of the
function is the size of buffer needed to hold the whole message.
</P>
<br><a name="SEC7" href="#TOC1">MEMORY USAGE</a><br>
<P>
Compiling a regular expression causes memory to be allocated and associated
with the <i>preg</i> structure. The function <b>regfree()</b> frees all such
memory, after which <i>preg</i> may no longer be used as a compiled expression.
</P>
<br><a name="SEC8" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
<P>
Last updated: 11 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,148 +0,0 @@
<html>
<head>
<title>pcreprecompile specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcreprecompile man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SAVING AND RE-USING PRECOMPILED PCRE PATTERNS</a>
<li><a name="TOC2" href="#SEC2">SAVING A COMPILED PATTERN</a>
<li><a name="TOC3" href="#SEC3">RE-USING A PRECOMPILED PATTERN</a>
<li><a name="TOC4" href="#SEC4">COMPATIBILITY WITH DIFFERENT PCRE RELEASES</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE PATTERNS</a><br>
<P>
If you are running an application that uses a large number of regular
expression patterns, it may be useful to store them in a precompiled form
instead of having to compile them every time the application is run.
If you are not using any private character tables (see the
<a href="pcre_maketables.html"><b>pcre_maketables()</b></a>
documentation), this is relatively straightforward. If you are using private
tables, it is a little bit more complicated.
</P>
<P>
If you save compiled patterns to a file, you can copy them to a different host
and run them there. This works even if the new host has the opposite endianness
to the one on which the patterns were compiled. There may be a small
performance penalty, but it should be insignificant. However, compiling regular
expressions with one version of PCRE for use with a different version is not
guaranteed to work and may cause crashes.
</P>
<br><a name="SEC2" href="#TOC1">SAVING A COMPILED PATTERN</a><br>
<P>
The value returned by <b>pcre_compile()</b> points to a single block of memory
that holds the compiled pattern and associated data. You can find the length of
this block in bytes by calling <b>pcre_fullinfo()</b> with an argument of
PCRE_INFO_SIZE. You can then save the data in any appropriate manner. Here is
sample code that compiles a pattern and writes it to a file. It assumes that
the variable <i>fd</i> refers to a file that is open for output:
<pre>
int erroroffset, rc, size;
char *error;
pcre *re;
re = pcre_compile("my pattern", 0, &error, &erroroffset, NULL);
if (re == NULL) { ... handle errors ... }
rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size);
if (rc &#60; 0) { ... handle errors ... }
rc = fwrite(re, 1, size, fd);
if (rc != size) { ... handle errors ... }
</pre>
In this example, the bytes that comprise the compiled pattern are copied
exactly. Note that this is binary data that may contain any of the 256 possible
byte values. On systems that make a distinction between binary and non-binary
data, be sure that the file is opened for binary output.
</P>
<P>
If you want to write more than one pattern to a file, you will have to devise a
way of separating them. For binary data, preceding each pattern with its length
is probably the most straightforward approach. Another possibility is to write
out the data in hexadecimal instead of binary, one pattern to a line.
</P>
<P>
Saving compiled patterns in a file is only one possible way of storing them for
later use. They could equally well be saved in a database, or in the memory of
some daemon process that passes them via sockets to the processes that want
them.
</P>
<P>
If the pattern has been studied, it is also possible to save the study data in
a similar way to the compiled pattern itself. When studying generates
additional information, <b>pcre_study()</b> returns a pointer to a
<b>pcre_extra</b> data block. Its format is defined in the
<a href="pcreapi.html#extradata">section on matching a pattern</a>
in the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation. The <i>study_data</i> field points to the binary study data, and
this is what you must save (not the <b>pcre_extra</b> block itself). The length
of the study data can be obtained by calling <b>pcre_fullinfo()</b> with an
argument of PCRE_INFO_STUDYSIZE. Remember to check that <b>pcre_study()</b> did
return a non-NULL value before trying to save the study data.
</P>
<br><a name="SEC3" href="#TOC1">RE-USING A PRECOMPILED PATTERN</a><br>
<P>
Re-using a precompiled pattern is straightforward. Having reloaded it into main
memory, you pass its pointer to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> in
the usual way. This should work even on another host, and even if that host has
the opposite endianness to the one where the pattern was compiled.
</P>
<P>
However, if you passed a pointer to custom character tables when the pattern
was compiled (the <i>tableptr</i> argument of <b>pcre_compile()</b>), you must
now pass a similar pointer to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>,
because the value saved with the compiled pattern will obviously be nonsense. A
field in a <b>pcre_extra</b> block is used to pass this data, as described in
the
<a href="pcreapi.html#extradata">section on matching a pattern</a>
in the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation.
</P>
<P>
If you did not provide custom character tables when the pattern was compiled,
the pointer in the compiled pattern is NULL, which causes <b>pcre_exec()</b> to
use PCRE's internal tables. Thus, you do not need to take any special action at
run time in this case.
</P>
<P>
If you saved study data with the compiled pattern, you need to create your own
<b>pcre_extra</b> data block and set the <i>study_data</i> field to point to the
reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA bit in the
<i>flags</i> field to indicate that study data is present. Then pass the
<b>pcre_extra</b> block to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> in the
usual way.
</P>
<br><a name="SEC4" href="#TOC1">COMPATIBILITY WITH DIFFERENT PCRE RELEASES</a><br>
<P>
In general, it is safest to recompile all saved patterns when you update to a
new PCRE release, though not all updates actually require this. Recompiling is
definitely needed for release 7.2.
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 13 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,96 +0,0 @@
<html>
<head>
<title>pcresample specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcresample man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
PCRE SAMPLE PROGRAM
</b><br>
<P>
A simple, complete demonstration program, to get you started with using PCRE,
is supplied in the file <i>pcredemo.c</i> in the PCRE distribution.
</P>
<P>
The program compiles the regular expression that is its first argument, and
matches it against the subject string in its second argument. No PCRE options
are set, and default character tables are used. If matching succeeds, the
program outputs the portion of the subject that matched, together with the
contents of any captured substrings.
</P>
<P>
If the -g option is given on the command line, the program then goes on to
check for further matches of the same regular expression in the same subject
string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on.
</P>
<P>
If PCRE is installed in the standard include and library directories for your
system, you should be able to compile the demonstration program using this
command:
<pre>
gcc -o pcredemo pcredemo.c -lpcre
</pre>
If PCRE is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE installed in
<i>/usr/local</i>, you can compile the demonstration program using a command
like this:
<pre>
gcc -o pcredemo -I/usr/local/include pcredemo.c -L/usr/local/lib -lpcre
</pre>
Once you have compiled the demonstration program, you can run simple tests like
this:
<pre>
./pcredemo 'cat|dog' 'the cat sat on the mat'
./pcredemo -g 'cat|dog' 'the dog sat on the cat'
</pre>
Note that there is a much more comprehensive test program, called
<a href="pcretest.html"><b>pcretest</b>,</a>
which supports many more facilities for testing regular expressions and the
PCRE library. The <b>pcredemo</b> program is provided as a simple coding
example.
</P>
<P>
On some operating systems (e.g. Solaris), when PCRE is not installed in the
standard library directory, you may get an error like this when you try to run
<b>pcredemo</b>:
<pre>
ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
</pre>
This is caused by the way shared library support works on those systems. You
need to add
<pre>
-R/usr/local/lib
</pre>
(for example) to the compile command to get round this problem.
</P>
<br><b>
AUTHOR
</b><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 23 January 2008
<br>
Copyright &copy; 1997-2008 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,172 +0,0 @@
<html>
<head>
<title>pcrestack specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcrestack man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<br><b>
PCRE DISCUSSION OF STACK USAGE
</b><br>
<P>
When you call <b>pcre_exec()</b>, it makes use of an internal function called
<b>match()</b>. This calls itself recursively at branch points in the pattern,
in order to remember the state of the match so that it can back up and try a
different alternative if the first one fails. As matching proceeds deeper and
deeper into the tree of possibilities, the recursion depth increases.
</P>
<P>
Not all calls of <b>match()</b> increase the recursion depth; for an item such
as a* it may be called several times at the same level, after matching
different numbers of a's. Furthermore, in a number of cases where the result of
the recursive call would immediately be passed back as the result of the
current call (a "tail recursion"), the function is just restarted instead.
</P>
<P>
The <b>pcre_dfa_exec()</b> function operates in an entirely different way, and
hardly uses recursion at all. The limit on its complexity is the amount of
workspace it is given. The comments that follow do NOT apply to
<b>pcre_dfa_exec()</b>; they are relevant only for <b>pcre_exec()</b>.
</P>
<P>
You can set limits on the number of times that <b>match()</b> is called, both in
total and recursively. If the limit is exceeded, an error occurs. For details,
see the
<a href="pcreapi.html#extradata">section on extra data for <b>pcre_exec()</b></a>
in the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation.
</P>
<P>
Each time that <b>match()</b> is actually called recursively, it uses memory
from the process stack. For certain kinds of pattern and data, very large
amounts of stack may be needed, despite the recognition of "tail recursion".
You can often reduce the amount of recursion, and therefore the amount of stack
used, by modifying the pattern that is being matched. Consider, for example,
this pattern:
<pre>
([^&#60;]|&#60;(?!inet))+
</pre>
It matches from wherever it starts until it encounters "&#60;inet" or the end of
the data, and is the kind of pattern that might be used when processing an XML
file. Each iteration of the outer parentheses matches either one character that
is not "&#60;" or a "&#60;" that is not followed by "inet". However, each time a
parenthesis is processed, a recursion occurs, so this formulation uses a stack
frame for each matched character. For a long string, a lot of stack is
required. Consider now this rewritten pattern, which matches exactly the same
strings:
<pre>
([^&#60;]++|&#60;(?!inet))+
</pre>
This uses very much less stack, because runs of characters that do not contain
"&#60;" are "swallowed" in one item inside the parentheses. Recursion happens only
when a "&#60;" character that is not followed by "inet" is encountered (and we
assume this is relatively rare). A possessive quantifier is used to stop any
backtracking into the runs of non-"&#60;" characters, but that is not related to
stack usage.
</P>
<P>
This example shows that one way of avoiding stack problems when matching long
subject strings is to write repeated parenthesized subpatterns to match more
than one character whenever possible.
</P>
<br><b>
Compiling PCRE to use heap instead of stack
</b><br>
<P>
In environments where stack memory is constrained, you might want to compile
PCRE to use heap memory instead of stack for remembering back-up points. This
makes it run a lot more slowly, however. Details of how to do this are given in
the
<a href="pcrebuild.html"><b>pcrebuild</b></a>
documentation. When built in this way, instead of using the stack, PCRE obtains
and frees memory by calling the functions that are pointed to by the
<b>pcre_stack_malloc</b> and <b>pcre_stack_free</b> variables. By default, these
point to <b>malloc()</b> and <b>free()</b>, but you can replace the pointers to
cause PCRE to use your own functions. Since the block sizes are always the
same, and are always freed in reverse order, it may be possible to implement
customized memory handlers that are more efficient than the standard functions.
</P>
<br><b>
Limiting PCRE's stack usage
</b><br>
<P>
PCRE has an internal counter that can be used to limit the depth of recursion,
and thus cause <b>pcre_exec()</b> to give an error code before it runs out of
stack. By default, the limit is very large, and unlikely ever to operate. It
can be changed when PCRE is built, and it can also be set when
<b>pcre_exec()</b> is called. For details of these interfaces, see the
<a href="pcrebuild.html"><b>pcrebuild</b></a>
and
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation.
</P>
<P>
As a very rough rule of thumb, you should reckon on about 500 bytes per
recursion. Thus, if you want to limit your stack usage to 8Mb, you
should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
support around 128000 recursions. The <b>pcretest</b> test program has a command
line option (<b>-S</b>) that can be used to increase the size of its stack.
</P>
<br><b>
Changing stack size in Unix-like systems
</b><br>
<P>
In Unix-like environments, there is not often a problem with the stack unless
very long strings are involved, though the default limit on stack size varies
from system to system. Values from 8Mb to 64Mb are common. You can find your
default limit by running the command:
<pre>
ulimit -s
</pre>
Unfortunately, the effect of running out of stack is often SIGSEGV, though
sometimes a more explicit error message is given. You can normally increase the
limit on stack size by code such as this:
<pre>
struct rlimit rlim;
getrlimit(RLIMIT_STACK, &rlim);
rlim.rlim_cur = 100*1024*1024;
setrlimit(RLIMIT_STACK, &rlim);
</pre>
This reads the current limits (soft and hard) using <b>getrlimit()</b>, then
attempts to increase the soft limit to 100Mb using <b>setrlimit()</b>. You must
do this before calling <b>pcre_exec()</b>.
</P>
<br><b>
Changing stack size in Mac OS X
</b><br>
<P>
Using <b>setrlimit()</b>, as described above, should also work on Mac OS X. It
is also possible to set a stack size when linking a program. There is a
discussion about stack sizes in Mac OS X at this web site:
<a href="http://developer.apple.com/qa/qa2005/qa1419.html">http://developer.apple.com/qa/qa2005/qa1419.html.</a>
</P>
<br><b>
AUTHOR
</b><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 09 July 2008
<br>
Copyright &copy; 1997-2008 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,473 +0,0 @@
<html>
<head>
<title>pcresyntax specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcresyntax man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">PCRE REGULAR EXPRESSION SYNTAX SUMMARY</a>
<li><a name="TOC2" href="#SEC2">QUOTING</a>
<li><a name="TOC3" href="#SEC3">CHARACTERS</a>
<li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
<li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTY CODES FOR \p and \P</a>
<li><a name="TOC6" href="#SEC6">SCRIPT NAMES FOR \p AND \P</a>
<li><a name="TOC7" href="#SEC7">CHARACTER CLASSES</a>
<li><a name="TOC8" href="#SEC8">QUANTIFIERS</a>
<li><a name="TOC9" href="#SEC9">ANCHORS AND SIMPLE ASSERTIONS</a>
<li><a name="TOC10" href="#SEC10">MATCH POINT RESET</a>
<li><a name="TOC11" href="#SEC11">ALTERNATION</a>
<li><a name="TOC12" href="#SEC12">CAPTURING</a>
<li><a name="TOC13" href="#SEC13">ATOMIC GROUPS</a>
<li><a name="TOC14" href="#SEC14">COMMENT</a>
<li><a name="TOC15" href="#SEC15">OPTION SETTING</a>
<li><a name="TOC16" href="#SEC16">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
<li><a name="TOC17" href="#SEC17">BACKREFERENCES</a>
<li><a name="TOC18" href="#SEC18">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
<li><a name="TOC19" href="#SEC19">CONDITIONAL PATTERNS</a>
<li><a name="TOC20" href="#SEC20">BACKTRACKING CONTROL</a>
<li><a name="TOC21" href="#SEC21">NEWLINE CONVENTIONS</a>
<li><a name="TOC22" href="#SEC22">WHAT \R MATCHES</a>
<li><a name="TOC23" href="#SEC23">CALLOUTS</a>
<li><a name="TOC24" href="#SEC24">SEE ALSO</a>
<li><a name="TOC25" href="#SEC25">AUTHOR</a>
<li><a name="TOC26" href="#SEC26">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
<P>
The full syntax and semantics of the regular expressions that are supported by
PCRE are described in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation. This document contains just a quick-reference summary of the
syntax.
</P>
<br><a name="SEC2" href="#TOC1">QUOTING</a><br>
<P>
<pre>
\x where x is non-alphanumeric is a literal x
\Q...\E treat enclosed characters as literal
</PRE>
</P>
<br><a name="SEC3" href="#TOC1">CHARACTERS</a><br>
<P>
<pre>
\a alarm, that is, the BEL character (hex 07)
\cx "control-x", where x is any character
\e escape (hex 1B)
\f formfeed (hex 0C)
\n newline (hex 0A)
\r carriage return (hex 0D)
\t tab (hex 09)
\ddd character with octal code ddd, or backreference
\xhh character with hex code hh
\x{hhh..} character with hex code hhh..
</PRE>
</P>
<br><a name="SEC4" href="#TOC1">CHARACTER TYPES</a><br>
<P>
<pre>
. any character except newline;
in dotall mode, any character whatsoever
\C one byte, even in UTF-8 mode (best avoided)
\d a decimal digit
\D a character that is not a decimal digit
\h a horizontal whitespace character
\H a character that is not a horizontal whitespace character
\p{<i>xx</i>} a character with the <i>xx</i> property
\P{<i>xx</i>} a character without the <i>xx</i> property
\R a newline sequence
\s a whitespace character
\S a character that is not a whitespace character
\v a vertical whitespace character
\V a character that is not a vertical whitespace character
\w a "word" character
\W a "non-word" character
\X an extended Unicode sequence
</pre>
In PCRE, \d, \D, \s, \S, \w, and \W recognize only ASCII characters.
</P>
<br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTY CODES FOR \p and \P</a><br>
<P>
<pre>
C Other
Cc Control
Cf Format
Cn Unassigned
Co Private use
Cs Surrogate
L Letter
Ll Lower case letter
Lm Modifier letter
Lo Other letter
Lt Title case letter
Lu Upper case letter
L& Ll, Lu, or Lt
M Mark
Mc Spacing mark
Me Enclosing mark
Mn Non-spacing mark
N Number
Nd Decimal number
Nl Letter number
No Other number
P Punctuation
Pc Connector punctuation
Pd Dash punctuation
Pe Close punctuation
Pf Final punctuation
Pi Initial punctuation
Po Other punctuation
Ps Open punctuation
S Symbol
Sc Currency symbol
Sk Modifier symbol
Sm Mathematical symbol
So Other symbol
Z Separator
Zl Line separator
Zp Paragraph separator
Zs Space separator
</PRE>
</P>
<br><a name="SEC6" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
<P>
Arabic,
Armenian,
Balinese,
Bengali,
Bopomofo,
Braille,
Buginese,
Buhid,
Canadian_Aboriginal,
Carian,
Cham,
Cherokee,
Common,
Coptic,
Cuneiform,
Cypriot,
Cyrillic,
Deseret,
Devanagari,
Ethiopic,
Georgian,
Glagolitic,
Gothic,
Greek,
Gujarati,
Gurmukhi,
Han,
Hangul,
Hanunoo,
Hebrew,
Hiragana,
Inherited,
Kannada,
Katakana,
Kayah_Li,
Kharoshthi,
Khmer,
Lao,
Latin,
Lepcha,
Limbu,
Linear_B,
Lycian,
Lydian,
Malayalam,
Mongolian,
Myanmar,
New_Tai_Lue,
Nko,
Ogham,
Old_Italic,
Old_Persian,
Ol_Chiki,
Oriya,
Osmanya,
Phags_Pa,
Phoenician,
Rejang,
Runic,
Saurashtra,
Shavian,
Sinhala,
Sundanese,
Syloti_Nagri,
Syriac,
Tagalog,
Tagbanwa,
Tai_Le,
Tamil,
Telugu,
Thaana,
Thai,
Tibetan,
Tifinagh,
Ugaritic,
Vai,
Yi.
</P>
<br><a name="SEC7" href="#TOC1">CHARACTER CLASSES</a><br>
<P>
<pre>
[...] positive character class
[^...] negative character class
[x-y] range (can be used for hex characters)
[[:xxx:]] positive POSIX named set
[[:^xxx:]] negative POSIX named set
alnum alphanumeric
alpha alphabetic
ascii 0-127
blank space or tab
cntrl control character
digit decimal digit
graph printing, excluding space
lower lower case letter
print printing, including space
punct printing, excluding alphanumeric
space whitespace
upper upper case letter
word same as \w
xdigit hexadecimal digit
</pre>
In PCRE, POSIX character set names recognize only ASCII characters. You can use
\Q...\E inside a character class.
</P>
<br><a name="SEC8" href="#TOC1">QUANTIFIERS</a><br>
<P>
<pre>
? 0 or 1, greedy
?+ 0 or 1, possessive
?? 0 or 1, lazy
* 0 or more, greedy
*+ 0 or more, possessive
*? 0 or more, lazy
+ 1 or more, greedy
++ 1 or more, possessive
+? 1 or more, lazy
{n} exactly n
{n,m} at least n, no more than m, greedy
{n,m}+ at least n, no more than m, possessive
{n,m}? at least n, no more than m, lazy
{n,} n or more, greedy
{n,}+ n or more, possessive
{n,}? n or more, lazy
</PRE>
</P>
<br><a name="SEC9" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
<P>
<pre>
\b word boundary (only ASCII letters recognized)
\B not a word boundary
^ start of subject
also after internal newline in multiline mode
\A start of subject
$ end of subject
also before newline at end of subject
also before internal newline in multiline mode
\Z end of subject
also before newline at end of subject
\z end of subject
\G first matching position in subject
</PRE>
</P>
<br><a name="SEC10" href="#TOC1">MATCH POINT RESET</a><br>
<P>
<pre>
\K reset start of match
</PRE>
</P>
<br><a name="SEC11" href="#TOC1">ALTERNATION</a><br>
<P>
<pre>
expr|expr|expr...
</PRE>
</P>
<br><a name="SEC12" href="#TOC1">CAPTURING</a><br>
<P>
<pre>
(...) capturing group
(?&#60;name&#62;...) named capturing group (Perl)
(?'name'...) named capturing group (Perl)
(?P&#60;name&#62;...) named capturing group (Python)
(?:...) non-capturing group
(?|...) non-capturing group; reset group numbers for
capturing groups in each alternative
</PRE>
</P>
<br><a name="SEC13" href="#TOC1">ATOMIC GROUPS</a><br>
<P>
<pre>
(?&#62;...) atomic, non-capturing group
</PRE>
</P>
<br><a name="SEC14" href="#TOC1">COMMENT</a><br>
<P>
<pre>
(?#....) comment (not nestable)
</PRE>
</P>
<br><a name="SEC15" href="#TOC1">OPTION SETTING</a><br>
<P>
<pre>
(?i) caseless
(?J) allow duplicate names
(?m) multiline
(?s) single line (dotall)
(?U) default ungreedy (lazy)
(?x) extended (ignore white space)
(?-...) unset option(s)
</pre>
The following is recognized only at the start of a pattern or after one of the
newline-setting options with similar syntax:
<pre>
(*UTF8) set UTF-8 mode
</PRE>
</P>
<br><a name="SEC16" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
<P>
<pre>
(?=...) positive look ahead
(?!...) negative look ahead
(?&#60;=...) positive look behind
(?&#60;!...) negative look behind
</pre>
Each top-level branch of a look behind must be of a fixed length.
</P>
<br><a name="SEC17" href="#TOC1">BACKREFERENCES</a><br>
<P>
<pre>
\n reference by number (can be ambiguous)
\gn reference by number
\g{n} reference by number
\g{-n} relative reference by number
\k&#60;name&#62; reference by name (Perl)
\k'name' reference by name (Perl)
\g{name} reference by name (Perl)
\k{name} reference by name (.NET)
(?P=name) reference by name (Python)
</PRE>
</P>
<br><a name="SEC18" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
<P>
<pre>
(?R) recurse whole pattern
(?n) call subpattern by absolute number
(?+n) call subpattern by relative number
(?-n) call subpattern by relative number
(?&name) call subpattern by name (Perl)
(?P&#62;name) call subpattern by name (Python)
\g&#60;name&#62; call subpattern by name (Oniguruma)
\g'name' call subpattern by name (Oniguruma)
\g&#60;n&#62; call subpattern by absolute number (Oniguruma)
\g'n' call subpattern by absolute number (Oniguruma)
\g&#60;+n&#62; call subpattern by relative number (PCRE extension)
\g'+n' call subpattern by relative number (PCRE extension)
\g&#60;-n&#62; call subpattern by relative number (PCRE extension)
\g'-n' call subpattern by relative number (PCRE extension)
</PRE>
</P>
<br><a name="SEC19" href="#TOC1">CONDITIONAL PATTERNS</a><br>
<P>
<pre>
(?(condition)yes-pattern)
(?(condition)yes-pattern|no-pattern)
(?(n)... absolute reference condition
(?(+n)... relative reference condition
(?(-n)... relative reference condition
(?(&#60;name&#62;)... named reference condition (Perl)
(?('name')... named reference condition (Perl)
(?(name)... named reference condition (PCRE)
(?(R)... overall recursion condition
(?(Rn)... specific group recursion condition
(?(R&name)... specific recursion condition
(?(DEFINE)... define subpattern for reference
(?(assert)... assertion condition
</PRE>
</P>
<br><a name="SEC20" href="#TOC1">BACKTRACKING CONTROL</a><br>
<P>
The following act immediately they are reached:
<pre>
(*ACCEPT) force successful match
(*FAIL) force backtrack; synonym (*F)
</pre>
The following act only when a subsequent match failure causes a backtrack to
reach them. They all force a match failure, but they differ in what happens
afterwards. Those that advance the start-of-match point do so only if the
pattern is not anchored.
<pre>
(*COMMIT) overall failure, no advance of starting point
(*PRUNE) advance to next starting character
(*SKIP) advance start to current matching position
(*THEN) local failure, backtrack to next alternation
</PRE>
</P>
<br><a name="SEC21" href="#TOC1">NEWLINE CONVENTIONS</a><br>
<P>
These are recognized only at the very start of the pattern or after a
(*BSR_...) or (*UTF8) option.
<pre>
(*CR) carriage return only
(*LF) linefeed only
(*CRLF) carriage return followed by linefeed
(*ANYCRLF) all three of the above
(*ANY) any Unicode newline sequence
</PRE>
</P>
<br><a name="SEC22" href="#TOC1">WHAT \R MATCHES</a><br>
<P>
These are recognized only at the very start of the pattern or after a
(*...) option that sets the newline convention or UTF-8 mode.
<pre>
(*BSR_ANYCRLF) CR, LF, or CRLF
(*BSR_UNICODE) any Unicode newline sequence
</PRE>
</P>
<br><a name="SEC23" href="#TOC1">CALLOUTS</a><br>
<P>
<pre>
(?C) callout
(?Cn) callout with data n
</PRE>
</P>
<br><a name="SEC24" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcrepattern</b>(3), <b>pcreapi</b>(3), <b>pcrecallout</b>(3),
<b>pcrematching</b>(3), <b>pcre</b>(3).
</P>
<br><a name="SEC25" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
<P>
Last updated: 11 April 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,712 +0,0 @@
<html>
<head>
<title>pcretest specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcretest man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
<li><a name="TOC2" href="#SEC2">OPTIONS</a>
<li><a name="TOC3" href="#SEC3">DESCRIPTION</a>
<li><a name="TOC4" href="#SEC4">PATTERN MODIFIERS</a>
<li><a name="TOC5" href="#SEC5">DATA LINES</a>
<li><a name="TOC6" href="#SEC6">THE ALTERNATIVE MATCHING FUNCTION</a>
<li><a name="TOC7" href="#SEC7">DEFAULT OUTPUT FROM PCRETEST</a>
<li><a name="TOC8" href="#SEC8">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a>
<li><a name="TOC9" href="#SEC9">RESTARTING AFTER A PARTIAL MATCH</a>
<li><a name="TOC10" href="#SEC10">CALLOUTS</a>
<li><a name="TOC11" href="#SEC11">NON-PRINTING CHARACTERS</a>
<li><a name="TOC12" href="#SEC12">SAVING AND RELOADING COMPILED PATTERNS</a>
<li><a name="TOC13" href="#SEC13">SEE ALSO</a>
<li><a name="TOC14" href="#SEC14">AUTHOR</a>
<li><a name="TOC15" href="#SEC15">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
<b>pcretest [options] [source] [destination]</b>
<br>
<br>
<b>pcretest</b> was written as a test program for the PCRE regular expression
library itself, but it can also be used for experimenting with regular
expressions. This document describes the features of the test program; for
details of the regular expressions themselves, see the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation. For details of the PCRE library function calls and their
options, see the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation.
</P>
<br><a name="SEC2" href="#TOC1">OPTIONS</a><br>
<P>
<b>-b</b>
Behave as if each regex has the <b>/B</b> (show bytecode) modifier; the internal
form is output after compilation.
</P>
<P>
<b>-C</b>
Output the version number of the PCRE library, and all available information
about the optional features that are included, and then exit.
</P>
<P>
<b>-d</b>
Behave as if each regex has the <b>/D</b> (debug) modifier; the internal
form and information about the compiled pattern is output after compilation;
<b>-d</b> is equivalent to <b>-b -i</b>.
</P>
<P>
<b>-dfa</b>
Behave as if each data line contains the \D escape sequence; this causes the
alternative matching function, <b>pcre_dfa_exec()</b>, to be used instead of the
standard <b>pcre_exec()</b> function (more detail is given below).
</P>
<P>
<b>-help</b>
Output a brief summary of these options and then exit.
</P>
<P>
<b>-i</b>
Behave as if each regex has the <b>/I</b> modifier; information about the
compiled pattern is given after compilation.
</P>
<P>
<b>-M</b>
Behave as if each data line contains the \M escape sequence; this causes
PCRE to discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings by
calling <b>pcre_exec()</b> repeatedly with different limits.
</P>
<P>
<b>-m</b>
Output the size of each compiled pattern after it has been compiled. This is
equivalent to adding <b>/M</b> to each regular expression. For compatibility
with earlier versions of pcretest, <b>-s</b> is a synonym for <b>-m</b>.
</P>
<P>
<b>-o</b> <i>osize</i>
Set the number of elements in the output vector that is used when calling
<b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> to be <i>osize</i>. The default value
is 45, which is enough for 14 capturing subexpressions for <b>pcre_exec()</b> or
22 different matches for <b>pcre_dfa_exec()</b>. The vector size can be
changed for individual matching calls by including \O in the data line (see
below).
</P>
<P>
<b>-p</b>
Behave as if each regex has the <b>/P</b> modifier; the POSIX wrapper API is
used to call PCRE. None of the other options has any effect when <b>-p</b> is
set.
</P>
<P>
<b>-q</b>
Do not output the version number of <b>pcretest</b> at the start of execution.
</P>
<P>
<b>-S</b> <i>size</i>
On Unix-like systems, set the size of the runtime stack to <i>size</i>
megabytes.
</P>
<P>
<b>-t</b>
Run each compile, study, and match many times with a timer, and output
resulting time per compile or match (in milliseconds). Do not set <b>-m</b> with
<b>-t</b>, because you will then get the size output a zillion times, and the
timing will be distorted. You can control the number of iterations that are
used for timing by following <b>-t</b> with a number (as a separate item on the
command line). For example, "-t 1000" would iterate 1000 times. The default is
to iterate 500000 times.
</P>
<P>
<b>-tm</b>
This is like <b>-t</b> except that it times only the matching phase, not the
compile or study phases.
</P>
<br><a name="SEC3" href="#TOC1">DESCRIPTION</a><br>
<P>
If <b>pcretest</b> is given two filename arguments, it reads from the first and
writes to the second. If it is given only one filename argument, it reads from
that file and writes to stdout. Otherwise, it reads from stdin and writes to
stdout, and prompts for each line of input, using "re&#62;" to prompt for regular
expressions, and "data&#62;" to prompt for data lines.
</P>
<P>
When <b>pcretest</b> is built, a configuration option can specify that it should
be linked with the <b>libreadline</b> library. When this is done, if the input
is from a terminal, it is read using the <b>readline()</b> function. This
provides line-editing and history facilities. The output from the <b>-help</b>
option states whether or not <b>readline()</b> will be used.
</P>
<P>
The program handles any number of sets of input on a single input file. Each
set starts with a regular expression, and continues with any number of data
lines to be matched against the pattern.
</P>
<P>
Each data line is matched separately and independently. If you want to do
multi-line matches, you have to use the \n escape sequence (or \r or \r\n,
etc., depending on the newline setting) in a single line of input to encode the
newline sequences. There is no limit on the length of data lines; the input
buffer is automatically extended if it is too small.
</P>
<P>
An empty line signals the end of the data lines, at which point a new regular
expression is read. The regular expressions are given enclosed in any
non-alphanumeric delimiters other than backslash, for example:
<pre>
/(a|bc)x+yz/
</pre>
White space before the initial delimiter is ignored. A regular expression may
be continued over several input lines, in which case the newline characters are
included within it. It is possible to include the delimiter within the pattern
by escaping it, for example
<pre>
/abc\/def/
</pre>
If you do so, the escape and the delimiter form part of the pattern, but since
delimiters are always non-alphanumeric, this does not affect its interpretation.
If the terminating delimiter is immediately followed by a backslash, for
example,
<pre>
/abc/\
</pre>
then a backslash is added to the end of the pattern. This is done to provide a
way of testing the error condition that arises if a pattern finishes with a
backslash, because
<pre>
/abc\/
</pre>
is interpreted as the first line of a pattern that starts with "abc/", causing
pcretest to read the next line as a continuation of the regular expression.
</P>
<br><a name="SEC4" href="#TOC1">PATTERN MODIFIERS</a><br>
<P>
A pattern may be followed by any number of modifiers, which are mostly single
characters. Following Perl usage, these are referred to below as, for example,
"the <b>/i</b> modifier", even though the delimiter of the pattern need not
always be a slash, and no slash is used when writing modifiers. Whitespace may
appear between the final pattern delimiter and the first modifier, and between
the modifiers themselves.
</P>
<P>
The <b>/i</b>, <b>/m</b>, <b>/s</b>, and <b>/x</b> modifiers set the PCRE_CASELESS,
PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED options, respectively, when
<b>pcre_compile()</b> is called. These four modifier letters have the same
effect as they do in Perl. For example:
<pre>
/caseless/i
</pre>
The following table shows additional modifiers for setting PCRE options that do
not correspond to anything in Perl:
<pre>
<b>/A</b> PCRE_ANCHORED
<b>/C</b> PCRE_AUTO_CALLOUT
<b>/E</b> PCRE_DOLLAR_ENDONLY
<b>/f</b> PCRE_FIRSTLINE
<b>/J</b> PCRE_DUPNAMES
<b>/N</b> PCRE_NO_AUTO_CAPTURE
<b>/U</b> PCRE_UNGREEDY
<b>/X</b> PCRE_EXTRA
<b>/&#60;JS&#62;</b> PCRE_JAVASCRIPT_COMPAT
<b>/&#60;cr&#62;</b> PCRE_NEWLINE_CR
<b>/&#60;lf&#62;</b> PCRE_NEWLINE_LF
<b>/&#60;crlf&#62;</b> PCRE_NEWLINE_CRLF
<b>/&#60;anycrlf&#62;</b> PCRE_NEWLINE_ANYCRLF
<b>/&#60;any&#62;</b> PCRE_NEWLINE_ANY
<b>/&#60;bsr_anycrlf&#62;</b> PCRE_BSR_ANYCRLF
<b>/&#60;bsr_unicode&#62;</b> PCRE_BSR_UNICODE
</pre>
Those specifying line ending sequences are literal strings as shown, but the
letters can be in either case. This example sets multiline matching with CRLF
as the line ending sequence:
<pre>
/^abc/m&#60;crlf&#62;
</pre>
Details of the meanings of these PCRE options are given in the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation.
</P>
<br><b>
Finding all matches in a string
</b><br>
<P>
Searching for all possible matches within each subject string can be requested
by the <b>/g</b> or <b>/G</b> modifier. After finding a match, PCRE is called
again to search the remainder of the subject string. The difference between
<b>/g</b> and <b>/G</b> is that the former uses the <i>startoffset</i> argument to
<b>pcre_exec()</b> to start searching at a new point within the entire string
(which is in effect what Perl does), whereas the latter passes over a shortened
substring. This makes a difference to the matching process if the pattern
begins with a lookbehind assertion (including \b or \B).
</P>
<P>
If any call to <b>pcre_exec()</b> in a <b>/g</b> or <b>/G</b> sequence matches an
empty string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
flags set in order to search for another, non-empty, match at the same point.
If this second match fails, the start offset is advanced by one, and the normal
match is retried. This imitates the way Perl handles such cases when using the
<b>/g</b> modifier or the <b>split()</b> function.
</P>
<br><b>
Other modifiers
</b><br>
<P>
There are yet more modifiers for controlling the way <b>pcretest</b>
operates.
</P>
<P>
The <b>/+</b> modifier requests that as well as outputting the substring that
matched the entire pattern, pcretest should in addition output the remainder of
the subject string. This is useful for tests where the subject contains
multiple copies of the same substring.
</P>
<P>
The <b>/B</b> modifier is a debugging feature. It requests that <b>pcretest</b>
output a representation of the compiled byte code after compilation. Normally
this information contains length and offset values; however, if <b>/Z</b> is
also present, this data is replaced by spaces. This is a special feature for
use in the automatic test scripts; it ensures that the same output is generated
for different internal link sizes.
</P>
<P>
The <b>/L</b> modifier must be followed directly by the name of a locale, for
example,
<pre>
/pattern/Lfr_FR
</pre>
For this reason, it must be the last modifier. The given locale is set,
<b>pcre_maketables()</b> is called to build a set of character tables for the
locale, and this is then passed to <b>pcre_compile()</b> when compiling the
regular expression. Without an <b>/L</b> modifier, NULL is passed as the tables
pointer; that is, <b>/L</b> applies only to the expression on which it appears.
</P>
<P>
The <b>/I</b> modifier requests that <b>pcretest</b> output information about the
compiled pattern (whether it is anchored, has a fixed first character, and
so on). It does this by calling <b>pcre_fullinfo()</b> after compiling a
pattern. If the pattern is studied, the results of that are also output.
</P>
<P>
The <b>/D</b> modifier is a PCRE debugging feature, and is equivalent to
<b>/BI</b>, that is, both the <b>/B</b> and the <b>/I</b> modifiers.
</P>
<P>
The <b>/F</b> modifier causes <b>pcretest</b> to flip the byte order of the
fields in the compiled pattern that contain 2-byte and 4-byte numbers. This
facility is for testing the feature in PCRE that allows it to execute patterns
that were compiled on a host with a different endianness. This feature is not
available when the POSIX interface to PCRE is being used, that is, when the
<b>/P</b> pattern modifier is specified. See also the section about saving and
reloading compiled patterns below.
</P>
<P>
The <b>/S</b> modifier causes <b>pcre_study()</b> to be called after the
expression has been compiled, and the results used when the expression is
matched.
</P>
<P>
The <b>/M</b> modifier causes the size of memory block used to hold the compiled
pattern to be output.
</P>
<P>
The <b>/P</b> modifier causes <b>pcretest</b> to call PCRE via the POSIX wrapper
API rather than its native API. When this is done, all other modifiers except
<b>/i</b>, <b>/m</b>, and <b>/+</b> are ignored. REG_ICASE is set if <b>/i</b> is
present, and REG_NEWLINE is set if <b>/m</b> is present. The wrapper functions
force PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
</P>
<P>
The <b>/8</b> modifier causes <b>pcretest</b> to call PCRE with the PCRE_UTF8
option set. This turns on support for UTF-8 character handling in PCRE,
provided that it was compiled with this support enabled. This modifier also
causes any non-printing characters in output strings to be printed using the
\x{hh...} notation if they are valid UTF-8 sequences.
</P>
<P>
If the <b>/?</b> modifier is used with <b>/8</b>, it causes <b>pcretest</b> to
call <b>pcre_compile()</b> with the PCRE_NO_UTF8_CHECK option, to suppress the
checking of the string for UTF-8 validity.
</P>
<br><a name="SEC5" href="#TOC1">DATA LINES</a><br>
<P>
Before each data line is passed to <b>pcre_exec()</b>, leading and trailing
whitespace is removed, and it is then scanned for \ escapes. Some of these are
pretty esoteric features, intended for checking out some of the more
complicated features of PCRE. If you are just testing "ordinary" regular
expressions, you probably don't need any of these. The following escapes are
recognized:
<pre>
\a alarm (BEL, \x07)
\b backspace (\x08)
\e escape (\x1b)
\f formfeed (\x0c)
\n newline (\x0a)
\qdd set the PCRE_MATCH_LIMIT limit to dd (any number of digits)
\r carriage return (\x0d)
\t tab (\x09)
\v vertical tab (\x0b)
\nnn octal character (up to 3 octal digits)
\xhh hexadecimal character (up to 2 hex digits)
\x{hh...} hexadecimal character, any number of digits in UTF-8 mode
\A pass the PCRE_ANCHORED option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\B pass the PCRE_NOTBOL option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\Cdd call pcre_copy_substring() for substring dd after a successful match (number less than 32)
\Cname call pcre_copy_named_substring() for substring "name" after a successful match (name termin-
ated by next non-alphanumeric character)
\C+ show the current captured substrings at callout time
\C- do not supply a callout function
\C!n return 1 instead of 0 when callout number n is reached
\C!n!m return 1 instead of 0 when callout number n is reached for the mth time
\C*n pass the number n (may be negative) as callout data; this is used as the callout return value
\D use the <b>pcre_dfa_exec()</b> match function
\F only shortest match for <b>pcre_dfa_exec()</b>
\Gdd call pcre_get_substring() for substring dd after a successful match (number less than 32)
\Gname call pcre_get_named_substring() for substring "name" after a successful match (name termin-
ated by next non-alphanumeric character)
\L call pcre_get_substringlist() after a successful match
\M discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings
\N pass the PCRE_NOTEMPTY option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\Odd set the size of the output vector passed to <b>pcre_exec()</b> to dd (any number of digits)
\P pass the PCRE_PARTIAL option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\Qdd set the PCRE_MATCH_LIMIT_RECURSION limit to dd (any number of digits)
\R pass the PCRE_DFA_RESTART option to <b>pcre_dfa_exec()</b>
\S output details of memory get/free calls during matching
\Z pass the PCRE_NOTEOL option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\? pass the PCRE_NO_UTF8_CHECK option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#62;dd start the match at offset dd (any number of digits);
this sets the <i>startoffset</i> argument for <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;cr&#62; pass the PCRE_NEWLINE_CR option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;lf&#62; pass the PCRE_NEWLINE_LF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;crlf&#62; pass the PCRE_NEWLINE_CRLF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;anycrlf&#62; pass the PCRE_NEWLINE_ANYCRLF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;any&#62; pass the PCRE_NEWLINE_ANY option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
</pre>
The escapes that specify line ending sequences are literal strings, exactly as
shown. No more than one newline setting should be present in any data line.
</P>
<P>
A backslash followed by anything else just escapes the anything else. If
the very last character is a backslash, it is ignored. This gives a way of
passing an empty line as data, since a real empty line terminates the data
input.
</P>
<P>
If \M is present, <b>pcretest</b> calls <b>pcre_exec()</b> several times, with
different values in the <i>match_limit</i> and <i>match_limit_recursion</i>
fields of the <b>pcre_extra</b> data structure, until it finds the minimum
numbers for each parameter that allow <b>pcre_exec()</b> to complete. The
<i>match_limit</i> number is a measure of the amount of backtracking that takes
place, and checking it out can be instructive. For most simple matches, the
number is quite small, but for patterns with very large numbers of matching
possibilities, it can become large very quickly with increasing length of
subject string. The <i>match_limit_recursion</i> number is a measure of how much
stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is needed
to complete the match attempt.
</P>
<P>
When \O is used, the value specified may be higher or lower than the size set
by the <b>-o</b> command line option (or defaulted to 45); \O applies only to
the call of <b>pcre_exec()</b> for the line in which it appears.
</P>
<P>
If the <b>/P</b> modifier was present on the pattern, causing the POSIX wrapper
API to be used, the only option-setting sequences that have any effect are \B
and \Z, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
<b>regexec()</b>.
</P>
<P>
The use of \x{hh...} to represent UTF-8 characters is not dependent on the use
of the <b>/8</b> modifier on the pattern. It is recognized always. There may be
any number of hexadecimal digits inside the braces. The result is from one to
six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
valid Unicode code points, or indeed valid UTF-8 characters according to the
later rules in RFC 3629.
</P>
<br><a name="SEC6" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
<P>
By default, <b>pcretest</b> uses the standard PCRE matching function,
<b>pcre_exec()</b> to match each data line. From release 6.0, PCRE supports an
alternative matching function, <b>pcre_dfa_exec()</b>, which operates in a
different way, and has some restrictions. The differences between the two
functions are described in the
<a href="pcrematching.html"><b>pcrematching</b></a>
documentation.
</P>
<P>
If a data line contains the \D escape sequence, or if the command line
contains the <b>-dfa</b> option, the alternative matching function is called.
This function finds all possible matches at a given point. If, however, the \F
escape sequence is present in the data line, it stops after the first match is
found. This is always the shortest possible match.
</P>
<br><a name="SEC7" href="#TOC1">DEFAULT OUTPUT FROM PCRETEST</a><br>
<P>
This section describes the output when the normal matching function,
<b>pcre_exec()</b>, is being used.
</P>
<P>
When a match succeeds, pcretest outputs the list of captured substrings that
<b>pcre_exec()</b> returns, starting with number 0 for the string that matched
the whole pattern. Otherwise, it outputs "No match" or "Partial match"
when <b>pcre_exec()</b> returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PARTIAL,
respectively, and otherwise the PCRE negative error number. Here is an example
of an interactive <b>pcretest</b> run.
<pre>
$ pcretest
PCRE version 7.0 30-Nov-2006
re&#62; /^abc(\d+)/
data&#62; abc123
0: abc123
1: 123
data&#62; xyz
No match
</pre>
Note that unset capturing substrings that are not followed by one that is set
are not returned by <b>pcre_exec()</b>, and are not shown by <b>pcretest</b>. In
the following example, there are two capturing substrings, but when the first
data line is matched, the second, unset substring is not shown. An "internal"
unset substring is shown as "&#60;unset&#62;", as for the second data line.
<pre>
re&#62; /(a)|(b)/
data&#62; a
0: a
1: a
data&#62; b
0: b
1: &#60;unset&#62;
2: b
</pre>
If the strings contain any non-printing characters, they are output as \x
escapes, or as \x{...} escapes if the <b>/8</b> modifier was present on the
pattern. See below for the definition of non-printing characters. If the
pattern has the <b>/+</b> modifier, the output for substring 0 is followed by
the rest of the subject string, identified by "0+" like this:
<pre>
re&#62; /cat/+
data&#62; cataract
0: cat
0+ aract
</pre>
If the pattern has the <b>/g</b> or <b>/G</b> modifier, the results of successive
matching attempts are output in sequence, like this:
<pre>
re&#62; /\Bi(\w\w)/g
data&#62; Mississippi
0: iss
1: ss
0: iss
1: ss
0: ipp
1: pp
</pre>
"No match" is output only if the first match attempt fails.
</P>
<P>
If any of the sequences <b>\C</b>, <b>\G</b>, or <b>\L</b> are present in a
data line that is successfully matched, the substrings extracted by the
convenience functions are output with C, G, or L after the string number
instead of a colon. This is in addition to the normal full list. The string
length (that is, the return from the extraction function) is given in
parentheses after each string for <b>\C</b> and <b>\G</b>.
</P>
<P>
Note that whereas patterns can be continued over several lines (a plain "&#62;"
prompt is used for continuations), data lines may not. However newlines can be
included in data by means of the \n escape (or \r, \r\n, etc., depending on
the newline sequence setting).
</P>
<br><a name="SEC8" href="#TOC1">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a><br>
<P>
When the alternative matching function, <b>pcre_dfa_exec()</b>, is used (by
means of the \D escape sequence or the <b>-dfa</b> command line option), the
output consists of a list of all the matches that start at the first point in
the subject where there is at least one match. For example:
<pre>
re&#62; /(tang|tangerine|tan)/
data&#62; yellow tangerine\D
0: tangerine
1: tang
2: tan
</pre>
(Using the normal matching function on this data finds only "tang".) The
longest matching string is always given first (and numbered zero).
</P>
<P>
If <b>/g</b> is present on the pattern, the search for further matches resumes
at the end of the longest match. For example:
<pre>
re&#62; /(tang|tangerine|tan)/g
data&#62; yellow tangerine and tangy sultana\D
0: tangerine
1: tang
2: tan
0: tang
1: tan
0: tan
</pre>
Since the matching function does not support substring capture, the escape
sequences that are concerned with captured substrings are not relevant.
</P>
<br><a name="SEC9" href="#TOC1">RESTARTING AFTER A PARTIAL MATCH</a><br>
<P>
When the alternative matching function has given the PCRE_ERROR_PARTIAL return,
indicating that the subject partially matched the pattern, you can restart the
match with additional subject data by means of the \R escape sequence. For
example:
<pre>
re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data&#62; 23ja\P\D
Partial match: 23ja
data&#62; n05\R\D
0: n05
</pre>
For further information about partial matching, see the
<a href="pcrepartial.html"><b>pcrepartial</b></a>
documentation.
</P>
<br><a name="SEC10" href="#TOC1">CALLOUTS</a><br>
<P>
If the pattern contains any callout requests, <b>pcretest</b>'s callout function
is called during matching. This works with both matching functions. By default,
the called function displays the callout number, the start and current
positions in the text at the callout time, and the next pattern item to be
tested. For example, the output
<pre>
---&#62;pqrabcdef
0 ^ ^ \d
</pre>
indicates that callout number 0 occurred for a match attempt starting at the
fourth character of the subject string, when the pointer was at the seventh
character of the data, and when the next pattern item was \d. Just one
circumflex is output if the start and current positions are the same.
</P>
<P>
Callouts numbered 255 are assumed to be automatic callouts, inserted as a
result of the <b>/C</b> pattern modifier. In this case, instead of showing the
callout number, the offset in the pattern, preceded by a plus, is output. For
example:
<pre>
re&#62; /\d?[A-E]\*/C
data&#62; E*
---&#62;E*
+0 ^ \d?
+3 ^ [A-E]
+8 ^^ \*
+10 ^ ^
0: E*
</pre>
The callout function in <b>pcretest</b> returns zero (carry on matching) by
default, but you can use a \C item in a data line (as described above) to
change this.
</P>
<P>
Inserting callouts can be helpful when using <b>pcretest</b> to check
complicated regular expressions. For further information about callouts, see
the
<a href="pcrecallout.html"><b>pcrecallout</b></a>
documentation.
</P>
<br><a name="SEC11" href="#TOC1">NON-PRINTING CHARACTERS</a><br>
<P>
When <b>pcretest</b> is outputting text in the compiled version of a pattern,
bytes other than 32-126 are always treated as non-printing characters and are
therefore shown as hex escapes.
</P>
<P>
When <b>pcretest</b> is outputting text that is a matched part of a subject
string, it behaves in the same way, unless a different locale has been set for
the pattern (using the <b>/L</b> modifier). In this case, the <b>isprint()</b>
function is used to distinguish printing and non-printing characters.
</P>
<br><a name="SEC12" href="#TOC1">SAVING AND RELOADING COMPILED PATTERNS</a><br>
<P>
The facilities described in this section are not available when the POSIX
interface to PCRE is being used, that is, when the <b>/P</b> pattern modifier is
specified.
</P>
<P>
When the POSIX interface is not in use, you can cause <b>pcretest</b> to write a
compiled pattern to a file, by following the modifiers with &#62; and a file name.
For example:
<pre>
/pattern/im &#62;/some/file
</pre>
See the
<a href="pcreprecompile.html"><b>pcreprecompile</b></a>
documentation for a discussion about saving and re-using compiled patterns.
</P>
<P>
The data that is written is binary. The first eight bytes are the length of the
compiled pattern data followed by the length of the optional study data, each
written as four bytes in big-endian order (most significant byte first). If
there is no study data (either the pattern was not studied, or studying did not
return any data), the second length is zero. The lengths are followed by an
exact copy of the compiled pattern. If there is additional study data, this
follows immediately after the compiled pattern. After writing the file,
<b>pcretest</b> expects to read a new pattern.
</P>
<P>
A saved pattern can be reloaded into <b>pcretest</b> by specifying &#60; and a file
name instead of a pattern. The name of the file must not contain a &#60; character,
as otherwise <b>pcretest</b> will interpret the line as a pattern delimited by &#60;
characters.
For example:
<pre>
re&#62; &#60;/some/file
Compiled regex loaded from /some/file
No study data
</pre>
When the pattern has been loaded, <b>pcretest</b> proceeds to read data lines in
the usual way.
</P>
<P>
You can copy a file written by <b>pcretest</b> to a different host and reload it
there, even if the new host has opposite endianness to the one on which the
pattern was compiled. For example, you can compile on an i86 machine and run on
a SPARC machine.
</P>
<P>
File names for saving and reloading can be absolute or relative, but note that
the shell facility of expanding a file name that starts with a tilde (~) is not
available.
</P>
<P>
The ability to save and reload files in <b>pcretest</b> is intended for testing
and experimentation. It is not intended for production use because only a
single pattern can be written to a file. Furthermore, there is no facility for
supplying custom character tables for use with a reloaded pattern. If the
original pattern was compiled with custom tables, an attempt to match a subject
string using a reloaded pattern is likely to cause <b>pcretest</b> to crash.
Finally, if you attempt to load a file that is not in the correct format, the
result is undefined.
</P>
<br><a name="SEC13" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcre</b>(3), <b>pcreapi</b>(3), <b>pcrecallout</b>(3), <b>pcrematching</b>(3),
<b>pcrepartial</b>(3), <b>pcrepattern</b>(3), <b>pcreprecompile</b>(3).
</P>
<br><a name="SEC14" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
<P>
Last updated: 10 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@ -1,140 +0,0 @@
<html>
<!-- This is a manually maintained file that is the root of the HTML version of
the PCRE documentation. When the HTML documents are built from the man
page versions, the entire doc/html directory is emptied, this file is then
copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
-->
<head>
<title>PCRE specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>Perl-compatible Regular Expressions (PCRE)</h1>
<p>
The HTML documentation for PCRE comprises the following pages:
</p>
<table>
<tr><td><a href="pcre.html">pcre</a></td>
<td>&nbsp;&nbsp;Introductory page</td></tr>
<tr><td><a href="pcre-config.html">pcre-config</a></td>
<td>&nbsp;&nbsp;Information about the installation configuration</td></tr>
<tr><td><a href="pcreapi.html">pcreapi</a></td>
<td>&nbsp;&nbsp;PCRE's native API</td></tr>
<tr><td><a href="pcrebuild.html">pcrebuild</a></td>
<td>&nbsp;&nbsp;Options for building PCRE</td></tr>
<tr><td><a href="pcrecallout.html">pcrecallout</a></td>
<td>&nbsp;&nbsp;The <i>callout</i> facility</td></tr>
<tr><td><a href="pcrecompat.html">pcrecompat</a></td>
<td>&nbsp;&nbsp;Compatibility with Perl</td></tr>
<tr><td><a href="pcrecpp.html">pcrecpp</a></td>
<td>&nbsp;&nbsp;The C++ wrapper for the PCRE library</td></tr>
<tr><td><a href="pcregrep.html">pcregrep</a></td>
<td>&nbsp;&nbsp;The <b>pcregrep</b> command</td></tr>
<tr><td><a href="pcrematching.html">pcrematching</a></td>
<td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
<tr><td><a href="pcrepartial.html">pcrepartial</a></td>
<td>&nbsp;&nbsp;Using PCRE for partial matching</td></tr>
<tr><td><a href="pcrepattern.html">pcrepattern</a></td>
<td>&nbsp;&nbsp;Specification of the regular expressions supported by PCRE</td></tr>
<tr><td><a href="pcreperform.html">pcreperform</a></td>
<td>&nbsp;&nbsp;Some comments on performance</td></tr>
<tr><td><a href="pcreposix.html">pcreposix</a></td>
<td>&nbsp;&nbsp;The POSIX API to the PCRE library</td></tr>
<tr><td><a href="pcreprecompile.html">pcreprecompile</a></td>
<td>&nbsp;&nbsp;How to save and re-use compiled patterns</td></tr>
<tr><td><a href="pcresample.html">pcresample</a></td>
<td>&nbsp;&nbsp;Description of the sample program</td></tr>
<tr><td><a href="pcrestack.html">pcrestack</a></td>
<td>&nbsp;&nbsp;Discussion of PCRE's stack usage</td></tr>
<tr><td><a href="pcresyntax.html">pcresyntax</a></td>
<td>&nbsp;&nbsp;Syntax quick-reference summary</td></tr>
<tr><td><a href="pcretest.html">pcretest</a></td>
<td>&nbsp;&nbsp;The <b>pcretest</b> command for testing PCRE</td></tr>
</table>
<p>
There are also individual pages that summarize the interface for each function
in the library:
</p>
<table>
<tr><td><a href="pcre_compile.html">pcre_compile</a></td>
<td>&nbsp;&nbsp;Compile a regular expression</td></tr>
<tr><td><a href="pcre_compile2.html">pcre_compile2</a></td>
<td>&nbsp;&nbsp;Compile a regular expression (alternate interface)</td></tr>
<tr><td><a href="pcre_config.html">pcre_config</a></td>
<td>&nbsp;&nbsp;Show build-time configuration options</td></tr>
<tr><td><a href="pcre_copy_named_substring.html">pcre_copy_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into given buffer</td></tr>
<tr><td><a href="pcre_copy_substring.html">pcre_copy_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into given buffer</td></tr>
<tr><td><a href="pcre_dfa_exec.html">pcre_dfa_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(DFA algorithm; <i>not</i> Perl compatible)</td></tr>
<tr><td><a href="pcre_exec.html">pcre_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(Perl compatible)</td></tr>
<tr><td><a href="pcre_free_substring.html">pcre_free_substring</a></td>
<td>&nbsp;&nbsp;Free extracted substring</td></tr>
<tr><td><a href="pcre_free_substring_list.html">pcre_free_substring_list</a></td>
<td>&nbsp;&nbsp;Free list of extracted substrings</td></tr>
<tr><td><a href="pcre_fullinfo.html">pcre_fullinfo</a></td>
<td>&nbsp;&nbsp;Extract information about a pattern</td></tr>
<tr><td><a href="pcre_get_named_substring.html">pcre_get_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into new memory</td></tr>
<tr><td><a href="pcre_get_stringnumber.html">pcre_get_stringnumber</a></td>
<td>&nbsp;&nbsp;Convert captured string name to number</td></tr>
<tr><td><a href="pcre_get_substring.html">pcre_get_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into new memory</td></tr>
<tr><td><a href="pcre_get_substring_list.html">pcre_get_substring_list</a></td>
<td>&nbsp;&nbsp;Extract all substrings into new memory</td></tr>
<tr><td><a href="pcre_info.html">pcre_info</a></td>
<td>&nbsp;&nbsp;Obsolete information extraction function</td></tr>
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
<tr><td><a href="pcre_refcount.html">pcre_refcount</a></td>
<td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
<tr><td><a href="pcre_study.html">pcre_study</a></td>
<td>&nbsp;&nbsp;Study a compiled pattern</td></tr>
<tr><td><a href="pcre_version.html">pcre_version</a></td>
<td>&nbsp;&nbsp;Return PCRE version and release date</td></tr>
</table>
</html>

View File

@ -1,73 +0,0 @@
.TH PCRE-CONFIG 1
.SH NAME
pcre-config - program to return PCRE configuration
.SH SYNOPSIS
.rs
.sp
.B pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
.ti +5n
.B [--libs-posix] [--cflags] [--cflags-posix]
.
.
.SH DESCRIPTION
.rs
.sp
\fBpcre-config\fP returns the configuration of the installed PCRE
libraries and the options required to compile a program to use them.
.
.
.SH OPTIONS
.rs
.TP 10
\fB--prefix\fP
Writes the directory prefix used in the PCRE installation for architecture
independent files (\fI/usr\fP on many systems, \fI/usr/local\fP on some
systems) to the standard output.
.TP 10
\fB--exec-prefix\fP
Writes the directory prefix used in the PCRE installation for architecture
dependent files (normally the same as \fB--prefix\fP) to the standard output.
.TP 10
\fB--version\fP
Writes the version number of the installed PCRE libraries to the standard
output.
.TP 10
\fB--libs\fP
Writes to the standard output the command line options required to link
with PCRE (\fB-lpcre\fP on many systems).
.TP 10
\fB--libs-posix\fP
Writes to the standard output the command line options required to link with
the PCRE posix emulation library (\fB-lpcreposix\fP \fB-lpcre\fP on many
systems).
.TP 10
\fB--cflags\fP
Writes to the standard output the command line options required to compile
files that use PCRE (this may include some \fB-I\fP options, but is blank on
many systems).
.TP 10
\fB--cflags-posix\fP
Writes to the standard output the command line options required to compile
files that use the PCRE posix emulation library (this may include some \fB-I\fP
options, but is blank on many systems).
.
.
.SH "SEE ALSO"
.rs
.sp
\fBpcre(3)\fP
.
.
.SH AUTHOR
.rs
.sp
This manual page was originally written by Mark Baker for the Debian GNU/Linux
system. It has been slightly revised as a generic PCRE man page.
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 18 April 2007
.fi

View File

@ -1,67 +0,0 @@
PCRE-CONFIG(1) PCRE-CONFIG(1)
NAME
pcre-config - program to return PCRE configuration
SYNOPSIS
pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
[--libs-posix] [--cflags] [--cflags-posix]
DESCRIPTION
pcre-config returns the configuration of the installed PCRE libraries
and the options required to compile a program to use them.
OPTIONS
--prefix Writes the directory prefix used in the PCRE installation for
architecture independent files (/usr on many systems,
/usr/local on some systems) to the standard output.
--exec-prefix
Writes the directory prefix used in the PCRE installation for
architecture dependent files (normally the same as --prefix)
to the standard output.
--version Writes the version number of the installed PCRE libraries to
the standard output.
--libs Writes to the standard output the command line options
required to link with PCRE (-lpcre on many systems).
--libs-posix
Writes to the standard output the command line options
required to link with the PCRE posix emulation library
(-lpcreposix -lpcre on many systems).
--cflags Writes to the standard output the command line options
required to compile files that use PCRE (this may include
some -I options, but is blank on many systems).
--cflags-posix
Writes to the standard output the command line options
required to compile files that use the PCRE posix emulation
library (this may include some -I options, but is blank on
many systems).
SEE ALSO
pcre(3)
AUTHOR
This manual page was originally written by Mark Baker for the Debian
GNU/Linux system. It has been slightly revised as a generic PCRE man
page.
REVISION
Last updated: 18 April 2007

View File

@ -1,296 +0,0 @@
.TH PCRE 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH INTRODUCTION
.rs
.sp
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl, with just a few
differences. Certain features that appeared in Python and PCRE before they
appeared in Perl are also available using the Python syntax. There is also some
support for certain .NET and Oniguruma syntax items, and there is an option for
requesting some minor changes that give better JavaScript compatibility.
.P
The current implementation of PCRE (release 7.x) corresponds approximately with
Perl 5.10, including support for UTF-8 encoded strings and Unicode general
category properties. However, UTF-8 and Unicode support has to be explicitly
enabled; it is not the default. The Unicode tables correspond to Unicode
release 5.1.
.P
In addition to the Perl-compatible matching function, PCRE contains an
alternative matching function that matches the same compiled patterns in a
different way. In certain circumstances, the alternative function has some
advantages. For a discussion of the two matching algorithms, see the
.\" HREF
\fBpcrematching\fP
.\"
page.
.P
PCRE is written in C and released as a C library. A number of people have
written wrappers and interfaces of various kinds. In particular, Google Inc.
have provided a comprehensive C++ wrapper. This is now included as part of the
PCRE distribution. The
.\" HREF
\fBpcrecpp\fP
.\"
page has details of this interface. Other people's contributions can be found
in the \fIContrib\fR directory at the primary FTP site, which is:
.sp
.\" HTML <a href="ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre">
.\" </a>
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
.P
Details of exactly which Perl regular expression features are and are not
supported by PCRE are given in separate documents. See the
.\" HREF
\fBpcrepattern\fR
.\"
and
.\" HREF
\fBpcrecompat\fR
.\"
pages. There is a syntax summary in the
.\" HREF
\fBpcresyntax\fR
.\"
page.
.P
Some features of PCRE can be included, excluded, or changed when the library is
built. The
.\" HREF
\fBpcre_config()\fR
.\"
function makes it possible for a client to discover which features are
available. The features themselves are described in the
.\" HREF
\fBpcrebuild\fP
.\"
page. Documentation about building PCRE for various operating systems can be
found in the \fBREADME\fP file in the source distribution.
.P
The library contains a number of undocumented internal functions and data
tables that are used by more than one of the exported external functions, but
which are not intended for use by external callers. Their names all begin with
"_pcre_", which hopefully will not provoke any name clashes. In some
environments, it is possible to control which external symbols are exported
when a shared library is built, and in these cases the undocumented symbols are
not exported.
.
.
.SH "USER DOCUMENTATION"
.rs
.sp
The user documentation for PCRE comprises a number of different sections. In
the "man" format, each of these is a separate "man page". In the HTML format,
each is a separate page, linked from the index page. In the plain text format,
all the sections are concatenated, for ease of searching. The sections are as
follows:
.sp
pcre this document
pcre-config show PCRE installation configuration information
pcreapi details of PCRE's native C API
pcrebuild options for building PCRE
pcrecallout details of the callout feature
pcrecompat discussion of Perl compatibility
pcrecpp details of the C++ wrapper
pcregrep description of the \fBpcregrep\fP command
pcrematching discussion of the two matching algorithms
pcrepartial details of the partial matching facility
.\" JOIN
pcrepattern syntax and semantics of supported
regular expressions
pcresyntax quick syntax reference
pcreperform discussion of performance issues
pcreposix the POSIX-compatible C API
pcreprecompile details of saving and re-using precompiled patterns
pcresample discussion of the sample program
pcrestack discussion of stack usage
pcretest description of the \fBpcretest\fP testing command
.sp
In addition, in the "man" and HTML formats, there is a short page for each
C library function, listing its arguments and results.
.
.
.SH LIMITATIONS
.rs
.sp
There are some size limitations in PCRE but it is hoped that they will never in
practice be relevant.
.P
The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is
compiled with the default internal linkage size of 2. If you want to process
regular expressions that are truly enormous, you can compile PCRE with an
internal linkage size of 3 or 4 (see the \fBREADME\fP file in the source
distribution and the
.\" HREF
\fBpcrebuild\fP
.\"
documentation for details). In these cases the limit is substantially larger.
However, the speed of execution is slower.
.P
All values in repeating quantifiers must be less than 65536.
.P
There is no limit to the number of parenthesized subpatterns, but there can be
no more than 65535 capturing subpatterns.
.P
The maximum length of name for a named subpattern is 32 characters, and the
maximum number of named subpatterns is 10000.
.P
The maximum length of a subject string is the largest positive number that an
integer variable can hold. However, when using the traditional matching
function, PCRE uses recursion to handle subpatterns and indefinite repetition.
This means that the available stack space may limit the size of a subject
string that can be processed by certain patterns. For a discussion of stack
issues, see the
.\" HREF
\fBpcrestack\fP
.\"
documentation.
.
.\" HTML <a name="utf8support"></a>
.
.
.SH "UTF-8 AND UNICODE PROPERTY SUPPORT"
.rs
.sp
From release 3.3, PCRE has had some support for character strings encoded in
the UTF-8 format. For release 4.0 this was greatly extended to cover most
common requirements, and in release 5.0 additional support for Unicode general
category properties was added.
.P
In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
the code, and, in addition, you must call
.\" HREF
\fBpcre_compile()\fP
.\"
with the PCRE_UTF8 option flag, or the pattern must start with the sequence
(*UTF8). When either of these is the case, both the pattern and any subject
strings that are matched against it are treated as UTF-8 strings instead of
just strings of bytes.
.P
If you compile PCRE with UTF-8 support, but do not use it at run time, the
library will be a bit bigger, but the additional run time overhead is limited
to testing the PCRE_UTF8 flag occasionally, so should not be very big.
.P
If PCRE is built with Unicode character property support (which implies UTF-8
support), the escape sequences \ep{..}, \eP{..}, and \eX are supported.
The available properties that can be tested are limited to the general
category properties such as Lu for an upper case letter or Nd for a decimal
number, the Unicode script names such as Arabic or Han, and the derived
properties Any and L&. A full list is given in the
.\" HREF
\fBpcrepattern\fP
.\"
documentation. Only the short names for properties are supported. For example,
\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
compatibility with Perl 5.6. PCRE does not support this.
.
.\" HTML <a name="utf8strings"></a>
.
.SS "Validity of UTF-8 strings"
.rs
.sp
When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
are (by default) checked for validity on entry to the relevant functions. From
release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
themselves derived from the Unicode specification. Earlier releases of PCRE
followed the rules of RFC 2279, which allows the full range of 31-bit values (0
to 0x7FFFFFFF). The current check allows only values in the range U+0 to
U+10FFFF, excluding U+D800 to U+DFFF.
.P
The excluded code points are the "Low Surrogate Area" of Unicode, of which the
Unicode Standard says this: "The Low Surrogate Area does not contain any
character assignments, consequently no character code charts or namelists are
provided for this area. Surrogates are reserved for use with UTF-16 and then
must be used in pairs." The code points that are encoded by UTF-16 pairs are
available as independent code points in the UTF-8 encoding. (In other words,
the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
UTF-8.)
.P
If an invalid UTF-8 string is passed to PCRE, an error return
(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
your strings are valid, and therefore want to skip these checks in order to
improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
at run time, PCRE assumes that the pattern or subject it is given
(respectively) contains only valid UTF-8 codes. In this case, it does not
diagnose an invalid UTF-8 string.
.P
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
happens depends on why the string is invalid. If the string conforms to the
"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
rules of RFC 2279. However, if the string does not even conform to RFC 2279,
the result is undefined. Your program may crash.
.P
If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
encoded in a UTF-8-like manner as per the old RFC, you can set
PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
situation, you will have to apply your own validity check.
.
.SS "General comments about UTF-8 mode"
.rs
.sp
1. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
UTF-8 character if the value is greater than 127.
.P
2. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
characters for values greater than \e177.
.P
3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
bytes, for example: \ex{100}{3}.
.P
4. The dot metacharacter matches one UTF-8 character instead of a single byte.
.P
5. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
but its use can lead to some strange effects. This facility is not available in
the alternative matching function, \fBpcre_dfa_exec()\fP.
.P
6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
test characters of any code value, but the characters that PCRE recognizes as
digits, spaces, or word characters remain the same set as before, all with
values less than 256. This remains true even when PCRE includes Unicode
property support, because to do otherwise would slow down PCRE in many common
cases. If you really want to test for a wider sense of, say, "digit", you
must use Unicode property tests such as \ep{Nd}. Note that this also applies to
\eb, because it is defined in terms of \ew and \eW.
.P
7. Similarly, characters that match the POSIX named character classes are all
low-valued characters.
.P
8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
(\eh, \eH, \ev, and \eV) do match all the appropriate Unicode characters.
.P
9. Case-insensitive matching applies only to characters whose values are less
than 128, unless PCRE is built with Unicode property support. Even when Unicode
property support is available, PCRE still uses its own character tables when
checking the case of low-valued characters, so as not to degrade performance.
The Unicode property information is used only for characters with higher
values. Even when Unicode property support is available, PCRE supports
case-insensitive matching only when there is a one-to-one mapping between a
letter's cases. There are a small number of many-to-one mappings in Unicode;
these are not supported by PCRE.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.P
Putting an actual email address here seems to have been a spam magnet, so I've
taken it away. If you want to email me, use my two initials, followed by the
two digits 10, at the domain cam.ac.uk.
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 11 April 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

File diff suppressed because it is too large Load Diff

View File

@ -1,77 +0,0 @@
.TH PCRE_COMPILE 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
.ti +5n
.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
.ti +5n
.B const unsigned char *\fItableptr\fP);
.
.SH DESCRIPTION
.rs
.sp
This function compiles a regular expression into an internal form. It is the
same as \fBpcre_compile2()\fP, except for the absence of the \fIerrorcodeptr\fP
argument. Its arguments are:
.sp
\fIpattern\fR A zero-terminated string containing the
regular expression to be compiled
\fIoptions\fR Zero or more option bits
\fIerrptr\fR Where to put an error message
\fIerroffset\fR Offset in pattern where error was found
\fItableptr\fR Pointer to character tables, or NULL to
use the built-in default
.sp
The option bits are:
.sp
PCRE_ANCHORED Force pattern anchoring
PCRE_AUTO_CALLOUT Compile automatic callouts
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \eR matches all Unicode line endings
PCRE_CASELESS Do caseless matching
PCRE_DOLLAR_ENDONLY $ not to match newline at end
PCRE_DOTALL . matches anything including NL
PCRE_DUPNAMES Allow duplicate names for subpatterns
PCRE_EXTENDED Ignore whitespace and # comments
PCRE_EXTRA PCRE extra features
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
theses (named ones available)
PCRE_UNGREEDY Invert greediness of quantifiers
PCRE_UTF8 Run in UTF-8 mode
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
validity (only relevant if
PCRE_UTF8 is set)
.sp
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
PCRE_NO_UTF8_CHECK.
.P
The yield of the function is a pointer to a private data structure that
contains the compiled pattern, or NULL if an error was detected. Note that
compiling regular expressions with one version of PCRE for use with a different
version is not guaranteed to work and may cause crashes.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fR
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fR
.\"
page.

View File

@ -1,77 +0,0 @@
.TH PCRE_COMPILE2 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
.ti +5n
.B int *\fIerrorcodeptr\fP,
.ti +5n
.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
.ti +5n
.B const unsigned char *\fItableptr\fP);
.
.SH DESCRIPTION
.rs
.sp
This function compiles a regular expression into an internal form. It is the
same as \fBpcre_compile()\fP, except for the addition of the \fIerrorcodeptr\fP
argument. The arguments are:
.sp
\fIpattern\fR A zero-terminated string containing the
regular expression to be compiled
\fIoptions\fR Zero or more option bits
\fIerrorcodeptr\fP Where to put an error code
\fIerrptr\fR Where to put an error message
\fIerroffset\fR Offset in pattern where error was found
\fItableptr\fR Pointer to character tables, or NULL to
use the built-in default
.sp
The option bits are:
.sp
PCRE_ANCHORED Force pattern anchoring
PCRE_AUTO_CALLOUT Compile automatic callouts
PCRE_CASELESS Do caseless matching
PCRE_DOLLAR_ENDONLY $ not to match newline at end
PCRE_DOTALL . matches anything including NL
PCRE_DUPNAMES Allow duplicate names for subpatterns
PCRE_EXTENDED Ignore whitespace and # comments
PCRE_EXTRA PCRE extra features
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
theses (named ones available)
PCRE_UNGREEDY Invert greediness of quantifiers
PCRE_UTF8 Run in UTF-8 mode
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
validity (only relevant if
PCRE_UTF8 is set)
.sp
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
PCRE_NO_UTF8_CHECK.
.P
The yield of the function is a pointer to a private data structure that
contains the compiled pattern, or NULL if an error was detected. Note that
compiling regular expressions with one version of PCRE for use with a different
version is not guaranteed to work and may cause crashes.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fR
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fR
.\"
page.

View File

@ -1,57 +0,0 @@
.TH PCRE_CONFIG 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
.
.SH DESCRIPTION
.rs
.sp
This function makes it possible for a client program to find out which optional
features are available in the version of the PCRE library it is using. Its
arguments are as follows:
.sp
\fIwhat\fR A code specifying what information is required
\fIwhere\fR Points to where to put the data
.sp
The available codes are:
.sp
PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
PCRE_CONFIG_MATCH_LIMIT_RECURSION
Internal recursion depth limit
PCRE_CONFIG_NEWLINE Value of the default newline sequence:
13 (0x000d) for CR
10 (0x000a) for LF
3338 (0x0d0a) for CRLF
-2 for ANYCRLF
-1 for ANY
PCRE_CONFIG_BSR Indicates what \eR matches by default:
0 all Unicode line endings
1 CR, LF, or CRLF only
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
Threshold of return slots, above
which \fBmalloc()\fR is used by
the POSIX API
PCRE_CONFIG_STACKRECURSE Recursion implementation (1=stack 0=heap)
PCRE_CONFIG_UTF8 Availability of UTF-8 support (1=yes 0=no)
PCRE_CONFIG_UNICODE_PROPERTIES
Availability of Unicode property support
(1=yes 0=no)
.sp
The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fR
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fR
.\"
page.

View File

@ -1,43 +0,0 @@
.TH PCRE_COPY_NAMED_SUBSTRING 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, const char *\fIstringname\fP,
.ti +5n
.B char *\fIbuffer\fP, int \fIbuffersize\fP);
.
.SH DESCRIPTION
.rs
.sp
This is a convenience function for extracting a captured substring, identified
by name, into a given buffer. The arguments are:
.sp
\fIcode\fP Pattern that was successfully matched
\fIsubject\fP Subject that has been successfully matched
\fIovector\fP Offset vector that \fBpcre_exec()\fP used
\fIstringcount\fP Value returned by \fBpcre_exec()\fP
\fIstringname\fP Name of the required substring
\fIbuffer\fP Buffer to receive the string
\fIbuffersize\fP Size of buffer
.sp
The yield is the length of the substring, PCRE_ERROR_NOMEMORY if the buffer was
too small, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,40 +0,0 @@
.TH PCRE_COPY_SUBSTRING 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
.ti +5n
.B int \fIbuffersize\fP);
.
.SH DESCRIPTION
.rs
.sp
This is a convenience function for extracting a captured substring into a given
buffer. The arguments are:
.sp
\fIsubject\fP Subject that has been successfully matched
\fIovector\fP Offset vector that \fBpcre_exec()\fP used
\fIstringcount\fP Value returned by \fBpcre_exec()\fP
\fIstringnumber\fP Number of the required substring
\fIbuffer\fP Buffer to receive the string
\fIbuffersize\fP Size of buffer
.sp
The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,89 +0,0 @@
.TH PCRE_DFA_EXEC 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
.ti +5n
.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,
.ti +5n
.B int *\fIworkspace\fP, int \fIwscount\fP);
.
.SH DESCRIPTION
.rs
.sp
This function matches a compiled regular expression against a given subject
string, using an alternative matching algorithm that scans the subject string
just once (\fInot\fP Perl-compatible). Note that the main, Perl-compatible,
matching function is \fBpcre_exec()\fP. The arguments for this function are:
.sp
\fIcode\fP Points to the compiled pattern
\fIextra\fP Points to an associated \fBpcre_extra\fP structure,
or is NULL
\fIsubject\fP Points to the subject string
\fIlength\fP Length of the subject string, in bytes
\fIstartoffset\fP Offset in bytes in the subject at which to
start matching
\fIoptions\fP Option bits
\fIovector\fP Points to a vector of ints for result offsets
\fIovecsize\fP Number of elements in the vector
\fIworkspace\fP Points to a vector of ints used as working space
\fIwscount\fP Number of elements in the vector
.sp
The options are:
.sp
PCRE_ANCHORED Match only at the first position
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \eR matches all Unicode line endings
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NOTBOL Subject is not the beginning of a line
PCRE_NOTEOL Subject is not the end of a line
PCRE_NOTEMPTY An empty string is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
validity (only relevant if PCRE_UTF8
was set at compile time)
PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
PCRE_DFA_SHORTEST Return only the shortest match
PCRE_DFA_RESTART This is a restart after a partial match
.sp
There are restrictions on what may appear in a pattern when using this matching
function. Details are given in the
.\" HREF
\fBpcrematching\fP
.\"
documentation.
.P
A \fBpcre_extra\fP structure contains the following fields:
.sp
\fIflags\fP Bits indicating which fields are set
\fIstudy_data\fP Opaque data from \fBpcre_study()\fP
\fImatch_limit\fP Limit on internal resource use
\fImatch_limit_recursion\fP Limit on internal recursion depth
\fIcallout_data\fP Opaque data passed back to callouts
\fItables\fP Points to character tables or is NULL
.sp
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
PCRE_EXTRA_TABLES. For this matching function, the \fImatch_limit\fP and
\fImatch_limit_recursion\fP fields are not used, and must not be set.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,81 +0,0 @@
.TH PCRE_EXEC 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
.ti +5n
.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
.
.SH DESCRIPTION
.rs
.sp
This function matches a compiled regular expression against a given subject
string, using a matching algorithm that is similar to Perl's. It returns
offsets to captured substrings. Its arguments are:
.sp
\fIcode\fP Points to the compiled pattern
\fIextra\fP Points to an associated \fBpcre_extra\fP structure,
or is NULL
\fIsubject\fP Points to the subject string
\fIlength\fP Length of the subject string, in bytes
\fIstartoffset\fP Offset in bytes in the subject at which to
start matching
\fIoptions\fP Option bits
\fIovector\fP Points to a vector of ints for result offsets
\fIovecsize\fP Number of elements in the vector (a multiple of 3)
.sp
The options are:
.sp
PCRE_ANCHORED Match only at the first position
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \eR matches all Unicode line endings
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NOTBOL Subject is not the beginning of a line
PCRE_NOTEOL Subject is not the end of a line
PCRE_NOTEMPTY An empty string is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
validity (only relevant if PCRE_UTF8
was set at compile time)
PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
.sp
There are restrictions on what may appear in a pattern when partial matching is
requested. For details, see the
.\" HREF
\fBpcrepartial\fP
.\"
page.
.P
A \fBpcre_extra\fP structure contains the following fields:
.sp
\fIflags\fP Bits indicating which fields are set
\fIstudy_data\fP Opaque data from \fBpcre_study()\fP
\fImatch_limit\fP Limit on internal resource use
\fImatch_limit_recursion\fP Limit on internal recursion depth
\fIcallout_data\fP Opaque data passed back to callouts
\fItables\fP Points to character tables or is NULL
.sp
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
PCRE_EXTRA_TABLES.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,27 +0,0 @@
.TH PCRE_FREE_SUBSTRING 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B void pcre_free_substring(const char *\fIstringptr\fP);
.
.SH DESCRIPTION
.rs
.sp
This is a convenience function for freeing the store obtained by a previous
call to \fBpcre_get_substring()\fP or \fBpcre_get_named_substring()\fP. Its
only argument is a pointer to the string.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,27 +0,0 @@
.TH PCRE_FREE_SUBSTRING_LIST 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B void pcre_free_substring_list(const char **\fIstringptr\fP);
.
.SH DESCRIPTION
.rs
.sp
This is a convenience function for freeing the store obtained by a previous
call to \fBpcre_get_substring_list()\fP. Its only argument is a pointer to the
list of string pointers.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,59 +0,0 @@
.TH PCRE_FULLINFO 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B int \fIwhat\fP, void *\fIwhere\fP);
.
.SH DESCRIPTION
.rs
.sp
This function returns information about a compiled pattern. Its arguments are:
.sp
\fIcode\fP Compiled regular expression
\fIextra\fP Result of \fBpcre_study()\fP or NULL
\fIwhat\fP What information is required
\fIwhere\fP Where to put the information
.sp
The following information is available:
.sp
PCRE_INFO_BACKREFMAX Number of highest back reference
PCRE_INFO_CAPTURECOUNT Number of capturing subpatterns
PCRE_INFO_DEFAULT_TABLES Pointer to default tables
PCRE_INFO_FIRSTBYTE Fixed first byte for a match, or
-1 for start of string
or after newline, or
-2 otherwise
PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE_INFO_LASTLITERAL Literal last byte required
PCRE_INFO_NAMECOUNT Number of named subpatterns
PCRE_INFO_NAMEENTRYSIZE Size of name table entry
PCRE_INFO_NAMETABLE Pointer to name table
PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
PCRE_INFO_OPTIONS Option bits used for compilation
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
.sp
The yield of the function is zero on success or:
.sp
PCRE_ERROR_NULL the argument \fIcode\fP was NULL
the argument \fIwhere\fP was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,45 +0,0 @@
.TH PCRE_GET_NAMED_SUBSTRING 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_get_named_substring(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, const char *\fIstringname\fP,
.ti +5n
.B const char **\fIstringptr\fP);
.
.SH DESCRIPTION
.rs
.sp
This is a convenience function for extracting a captured substring by name. The
arguments are:
.sp
\fIcode\fP Compiled pattern
\fIsubject\fP Subject that has been successfully matched
\fIovector\fP Offset vector that \fBpcre_exec()\fP used
\fIstringcount\fP Value returned by \fBpcre_exec()\fP
\fIstringname\fP Name of the required substring
\fIstringptr\fP Where to put the string pointer
.sp
The memory in which the substring is placed is obtained by calling
\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
be used to free it when it is no longer needed. The yield of the function is
the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,37 +0,0 @@
.TH PCRE_GET_STRINGNUMBER 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIname\fP);
.
.SH DESCRIPTION
.rs
.sp
This convenience function finds the number of a named substring capturing
parenthesis in a compiled pattern. Its arguments are:
.sp
\fIcode\fP Compiled regular expression
\fIname\fP Name whose number is required
.sp
The yield of the function is the number of the parenthesis if the name is
found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
\fBpcre_get_stringnumber()\fP. You can obtain the complete list by calling
\fBpcre_get_stringtable_entries()\fP.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,40 +0,0 @@
.TH PCRE_GET_STRINGTABLE_ENTRIES 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
.
.SH DESCRIPTION
.rs
.sp
This convenience function finds, for a compiled pattern, the first and last
entries for a given name in the table that translates capturing parenthesis
names into numbers. When names are required to be unique (PCRE_DUPNAMES is
\fInot\fP set), it is usually easier to use \fBpcre_get_stringnumber()\fP
instead.
.sp
\fIcode\fP Compiled regular expression
\fIname\fP Name whose entries are required
\fIfirst\fP Where to return a pointer to the first entry
\fIlast\fP Where to return a pointer to the last entry
.sp
The yield of the function is the length of each entry, or
PCRE_ERROR_NOSUBSTRING if none are found.
.P
There is a complete description of the PCRE native API, including the format of
the table entries, in the
.\" HREF
\fBpcreapi\fP
.\"
page, and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,42 +0,0 @@
.TH PCRE_GET_SUBSTRING 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, int \fIstringnumber\fP,
.ti +5n
.B const char **\fIstringptr\fP);
.
.SH DESCRIPTION
.rs
.sp
This is a convenience function for extracting a captured substring. The
arguments are:
.sp
\fIsubject\fP Subject that has been successfully matched
\fIovector\fP Offset vector that \fBpcre_exec()\fP used
\fIstringcount\fP Value returned by \fBpcre_exec()\fP
\fIstringnumber\fP Number of the required substring
\fIstringptr\fP Where to put the string pointer
.sp
The memory in which the substring is placed is obtained by calling
\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
be used to free it when it is no longer needed. The yield of the function is
the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,41 +0,0 @@
.TH PCRE_GET_SUBSTRING_LIST 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_get_substring_list(const char *\fIsubject\fP,
.ti +5n
.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
.
.SH DESCRIPTION
.rs
.sp
This is a convenience function for extracting a list of all the captured
substrings. The arguments are:
.sp
\fIsubject\fP Subject that has been successfully matched
\fIovector\fP Offset vector that \fBpcre_exec()\fP used
\fIstringcount\fP Value returned by \fBpcre_exec()\fP
\fIlistptr\fP Where to put a pointer to the list
.sp
The memory in which the substrings and the list are placed is obtained by
calling \fBpcre_malloc()\fP. The convenience function
\fBpcre_free_substring_list()\fP can be used to free it when it is no longer
needed. A pointer to a list of pointers is put in the variable whose address is
in \fIlistptr\fP. The list is terminated by a NULL pointer. The yield of the
function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
not be obtained.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,26 +0,0 @@
.TH PCRE_INFO 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
.B *\fIfirstcharptr\fP);
.
.SH DESCRIPTION
.rs
.sp
This function is obsolete. You should be using \fBpcre_fullinfo()\fP instead.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,29 +0,0 @@
.TH PCRE_MAKETABLES 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B const unsigned char *pcre_maketables(void);
.
.SH DESCRIPTION
.rs
.sp
This function builds a set of character tables for character values less than
256. These can be passed to \fBpcre_compile()\fP to override PCRE's internal,
built-in tables (which were made by \fBpcre_maketables()\fP when PCRE was
compiled). You might want to do this if you are using a non-standard locale.
The function yields a pointer to the tables.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,32 +0,0 @@
.TH PCRE_REFCOUNT 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
.
.SH DESCRIPTION
.rs
.sp
This function is used to maintain a reference count inside a data block that
contains a compiled pattern. Its arguments are:
.sp
\fIcode\fP Compiled regular expression
\fIadjust\fP Adjustment to reference value
.sp
The yield of the function is the adjusted reference value, which is constrained
to lie between 0 and 65535.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

View File

@ -1,42 +0,0 @@
.TH PCRE_STUDY 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
.rs
.sp
.B #include <pcre.h>
.PP
.SM
.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
.ti +5n
.B const char **\fIerrptr\fP);
.
.SH DESCRIPTION
.rs
.sp
This function studies a compiled pattern, to see if additional information can
be extracted that might speed up matching. Its arguments are:
.sp
\fIcode\fP A compiled regular expression
\fIoptions\fP Options for \fBpcre_study()\fP
\fIerrptr\fP Where to put an error message
.sp
If the function succeeds, it returns a value that can be passed to
\fBpcre_exec()\fP via its \fIextra\fP argument.
.P
If the function returns NULL, either it could not find any additional
information, or there was an error. You can tell the difference by looking at
the error value. It is NULL in the first case.
.P
There are currently no options defined; the value of the second argument should
always be zero.
.P
There is a complete description of the PCRE native API in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"
page.

Some files were not shown because too many files have changed in this diff Show More