update to pcre 7.9

git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@13706 d0543943-73ff-0310-b7d9-9358b9ac24b2
This commit is contained in:
Michael Jerris
2009-06-08 23:51:30 +00:00
parent a1e5add731
commit f7efdaa901
178 changed files with 43560 additions and 11382 deletions

296
libs/pcre/132html Executable file
View File

@@ -0,0 +1,296 @@
#! /usr/bin/perl -w
# Script to turn PCRE man pages into HTML
# Subroutine to handle font changes and other escapes.
# Takes one line of man-page text and returns a converted copy; the argument
# itself is not modified. "<" and ">" are turned into numeric entities first,
# then \fI...\fR / \fB...\fR (or ...\fP) pairs become <i>/<b> elements, "\e"
# becomes a literal backslash, and "(c)" directly after "Copyright " becomes
# the &copy; entity.
sub do_line {
    my ($text) = @_;
    for ($text) {
        s/</&#60;/g;                        # Deal with < and >
        s/>/&#62;/g;
        s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;     # italic font change
        s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;     # bold font change
        s"\\e"\\"g;                         # escaped backslash
        s/(?<=Copyright )\(c\)/&copy;/g;
    }
    return $text;
}
# Subroutine to ensure not in a paragraph.
# If a paragraph is open, its closing tags are written to TEMP (closing an
# open literal section first). In every case the paragraph, literal and
# "text written" flags are cleared.
sub end_para {
    if ($inpara) {
        if ($inpre) {
            print TEMP "</PRE>\n";
        }
        print TEMP "</P>\n";
    }
    $inpara = 0;
    $inpre = 0;
    $wrotetext = 0;
}
# Subroutine to start a new paragraph.
# Any paragraph that is still open is closed first via end_para(); then the
# opening tag is written to TEMP and the paragraph flag is set.
sub new_para {
    end_para();
    print TEMP "<P>\n";
    $inpara = 1;
}
# Main program

# Conversion state shared with the subroutines above:
#   $innf      - inside a .nf/.fi section
#   $inpara    - a paragraph is currently open on TEMP
#   $inpre     - a literal section is currently open on TEMP
#   $wrotetext - text has been written since the last paragraph was closed
#   $toc       - a table of contents was requested with -toc
#   $ref       - number used for the next TOC/SEC anchor pair
$innf = $inpara = $inpre = $wrotetext = $toc = 0;
$ref = 1;

# Consume leading options; -toc is the only one that has any effect, any
# other argument starting with "-" is silently dropped.
while (@ARGV && $ARGV[0] =~ /^-/) {
    my $option = shift @ARGV;
    $toc = 1 if $option eq "-toc";
}

# Initial output to STDOUT
print <<End ;
<html>
<head>
<title>$ARGV[0] specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$ARGV[0] man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
End

print "<ul>\n" if $toc;

# The page body is collected in a temporary file named after the process id.
open(TEMP, '>', "/tmp/$$") || die "Can't open /tmp/$$ for output\n";
# Read the man page from STDIN one line at a time and write the HTML body to
# TEMP. When -toc was given, each .SH heading additionally writes a table of
# contents entry straight to STDOUT.
while (<STDIN>) {
    # Handle lines beginning with a dot
    if (/^\./) {
        # Some of the PCRE man pages used to contain instances of .br. However,
        # they should have all been removed because they cause trouble in some
        # (other) automated systems that translate man pages to HTML. Complain if
        # we find .br or .in (another macro that is deprecated).
        if (/^\.br/ || /^\.in/) {
            print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
            print STDERR "*** $_\n";
            die "*** Processing abandoned\n";
        }

        # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.
        elsif (/^\.nf/) {
            $innf = 1;
        }
        elsif (/^\.fi/) {
            $innf = 0;
        }

        # Handling .sp is subtle. If it is inside a literal section, do nothing if
        # the next line is a non literal text line; similarly, if not inside a
        # literal section, do nothing if a literal follows. The point being that
        # the <pre> and </pre> that delimit literal sections will do the spacing.
        # Always skip if no previous output.
        elsif (/^\.sp/) {
            if ($wrotetext) {
                $_ = <STDIN>;
                if ($inpre) {
                    print TEMP "\n" if (/^[\s.]/);
                }
                else {
                    print TEMP "<br>\n<br>\n" if (!/^[\s.]/);
                }
                redo;    # Now process the lookahead line we just read
            }
        }
        elsif (/^\.TP/ || /^\.PP/ || /^\.P/) {
            new_para();
        }
        elsif (/^\.SH\s*("?)(.*)\1/) {
            # Ignore the NAME section: discard the line that follows it too
            if ($2 =~ /^NAME\b/) {
                <STDIN>;
                next;
            }
            end_para();
            my($title) = do_line($2);
            if ($toc) {
                # The title is passed as a %s argument rather than being
                # interpolated into the format, so that a "%" in a heading is
                # output literally instead of being taken as a conversion.
                printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
                    $ref, $ref, $title);
                printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
                    $ref, $title);
                $ref++;
            }
            else {
                print TEMP "<br><b>\n$title\n</b><br>\n";
            }
        }
        elsif (/^\.SS\s*("?)(.*)\1/) {
            end_para();
            my($title) = do_line($2);
            print TEMP "<br><b>\n$title\n</b><br>\n";
        }
        elsif (/^\.B\s*(.*)/) {
            new_para() if (!$inpara);
            $_ = do_line($1);
            s/"(.*?)"/$1/g;      # drop the quotes round quoted arguments
            print TEMP "<b>$_</b>\n";
            $wrotetext = 1;
        }
        elsif (/^\.I\s*(.*)/) {
            new_para() if (!$inpara);
            $_ = do_line($1);
            s/"(.*?)"/$1/g;      # drop the quotes round quoted arguments
            print TEMP "<i>$_</i>\n";
            $wrotetext = 1;
        }

        # A comment that starts "HREF" takes the next line as a name that
        # is turned into a hyperlink, using the text given, which might be
        # in a special font. If it ends in () or (digits) or punctuation, they
        # aren't part of the link.
        elsif (/^\.\\"\s*HREF/) {
            $_=<STDIN>;
            chomp;
            $_ = do_line($_);
            $_ =~ s/\s+$//;
            $_ =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/;
            print TEMP "<a href=\"$1.html\">$_</a>\n";
        }

        # A comment that starts "HTML" inserts literal HTML
        elsif (/^\.\\"\s*HTML\s*(.*)/) {
            print TEMP $1;
        }

        # A comment that starts < inserts that HTML at the end of the
        # *next* input line - so as not to get a newline between them.
        elsif (/^\.\\"\s*(<.*>)/) {
            my($markup) = $1;
            $_=<STDIN>;
            chomp;
            $_ = do_line($_);
            $_ =~ s/\s+$//;
            print TEMP "$_$markup\n";
        }

        # A comment that starts JOIN joins the next two lines together, with one
        # space between them. Then that line is processed. This is used in some
        # displays where two lines are needed for the "man" version. JOINSH works
        # the same, except that it assumes this is a shell command, so removes
        # continuation backslashes.
        elsif (/^\.\\"\s*JOIN(SH)?/) {
            my($one,$two);
            $one = <STDIN>;
            $two = <STDIN>;
            $one =~ s/\s*\\e\s*$// if (defined($1));    # $1 is set only for JOINSH
            chomp($one);
            $two =~ s/^\s+//;
            $_ = "$one $two";
            redo;    # Process the joined lines
        }

        # Ignore anything not recognized
        next;
    }

    # Line does not begin with a dot. Replace blank lines with new paragraphs
    if (/^\s*$/) {
        end_para() if ($wrotetext);
        next;
    }

    # Convert fonts changes and output an ordinary line. Ensure that indented
    # lines are marked as literal.
    $_ = do_line($_);
    new_para() if (!$inpara);
    if (/^\s/) {
        if (!$inpre) {
            print TEMP "<pre>\n";
            $inpre = 1;
        }
    }
    elsif ($inpre) {
        print TEMP "</pre>\n";
        $inpre = 0;
    }

    # Add <br> to the end of a non-literal line if we are within .nf/.fi
    $_ .= "<br>\n" if (!$inpre && $innf);
    print TEMP;
    $wrotetext = 1;
}
# The TOC, if present, will have been written - terminate it
print "</ul>\n" if $toc;

# Copy the remainder to the standard output: finish writing the temporary
# file, then read it back and echo it line by line.
close(TEMP);
open(TEMP, '<', "/tmp/$$") || die "Can't open /tmp/$$ for input\n";
while (<TEMP>) {
    print;
}

# Closing link back to the index page
print <<End ;
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
End

# The temporary file is no longer needed
close(TEMP);
unlink("/tmp/$$");
# End

View File

@@ -6,9 +6,9 @@ Email local part: ph10
Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Cambridge, England.
Copyright (c) 1997-2006 University of Cambridge
Copyright (c) 1997-2009 University of Cambridge
All rights reserved
@@ -17,7 +17,7 @@ THE C++ WRAPPER LIBRARY
Written by: Google Inc.
Copyright (c) 2006 Google Inc
Copyright (c) 2007-2008 Google Inc
All rights reserved
####

View File

@@ -1,17 +1,578 @@
cmake_minimum_required(VERSION 2.6)
# CMakeLists.txt
#
#
# This file allows building PCRE with the CMake configuration and build
# tool. Download CMake in source or binary form from http://www.cmake.org/
#
# Original listfile by Christian Ehrlicher <Ch.Ehrlicher@gmx.de>
# Refined and expanded by Daniel Richard G. <skunk@iSKUNK.ORG>
# 2007-09-14 mod by Sheri so 7.4 supported configuration options can be entered
# 2007-09-19 Adjusted by PH to retain previous default settings
# 2007-12-26 (a) On UNIX, use names libpcre instead of just pcre
# (b) Ensure pcretest and pcregrep link with the local library,
# not a previously-installed one.
# (c) Add PCRE_SUPPORT_LIBREADLINE, PCRE_SUPPORT_LIBZ, and
# PCRE_SUPPORT_LIBBZ2.
# 2008-01-20 Brought up to date to include several new features by Christian
# Ehrlicher.
# 2008-01-22 Sheri added options for backward compatibility of library names
# when building with minGW:
# if "ON", NON_STANDARD_LIB_PREFIX causes shared libraries to
# be built without "lib" as prefix. (The libraries will be named
# pcre.dll, pcreposix.dll and pcrecpp.dll).
# if "ON", NON_STANDARD_LIB_SUFFIX causes shared libraries to
# be built with suffix of "-0.dll". (The libraries will be named
# libpcre-0.dll, libpcreposix-0.dll and libpcrecpp-0.dll - same names
# built by default with Configure and Make.
# 2008-01-23 PH removed the automatic build of pcredemo.
# 2008-04-22 PH modified READLINE support so it finds NCURSES when needed.
# 2008-07-03 PH updated for revised UCP property support (change of files)
# 2009-03-23 PH applied Steven Van Ingelgem's patch to change the name
# CMAKE_BINARY_DIR to PROJECT_BINARY_DIR so that it works when PCRE
# is included within another project.
# 2009-03-23 PH applied a modified version of Steven Van Ingelgem's patches to
# add options to stop the building of pcregrep and the tests, and
# to disable the final configuration report.
# 2009-04-11 PH applied Christian Ehrlicher's patch to show compiler flags that
# are set by specifying a release type.
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/libs/pcre/include/ ${CMAKE_SOURCE_DIR}/libs/pcre/)
PROJECT(PCRE C CXX)
SET ( pcre_SRCS pcre_compile.c pcre_tables.c pcre_config.c pcre_try_flipped.c pcre_dfa_exec.c pcre_ucp_searchfuncs.c pcre_exec.c pcre_valid_utf8.c pcre_fullinfo.c pcre_version.c dftables.c pcre_get.c pcre_xclass.c pcre_globals.c pcre_info.c pcrecpp.h pcre_internal.h pcre_maketables.c pcrecpparg.h pcre_ord2utf8.c pcredemo.c pcre_refcount.c pcregrep.c pcreposix.c pcre_scanner.h pcreposix.h pcre_scanner_unittest.cc pcretest.c pcre_stringpiece.h pcre_stringpiece.h.in ucp.h pcre.h ucpinternal.h pcre_chartables.c pcre.h )
CMAKE_MINIMUM_REQUIRED(VERSION 2.4.6)
SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake
# external packages
FIND_PACKAGE( BZip2 )
FIND_PACKAGE( ZLIB )
FIND_PACKAGE( Readline )
# Configuration checks
INCLUDE(CheckIncludeFile)
INCLUDE(CheckIncludeFileCXX)
INCLUDE(CheckFunctionExists)
INCLUDE(CheckTypeSize)
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H)
CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
CHECK_INCLUDE_FILE_CXX(type_traits.h HAVE_TYPE_TRAITS_H)
CHECK_INCLUDE_FILE_CXX(bits/type_traits.h HAVE_BITS_TYPE_TRAITS_H)
CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY)
CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE)
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR)
CHECK_FUNCTION_EXISTS(strtoll HAVE_STRTOLL)
CHECK_FUNCTION_EXISTS(strtoq HAVE_STRTOQ)
CHECK_FUNCTION_EXISTS(_strtoi64 HAVE__STRTOI64)
CHECK_TYPE_SIZE("long long" LONG_LONG)
CHECK_TYPE_SIZE("unsigned long long" UNSIGNED_LONG_LONG)
# User-configurable options
#
# (Note: CMakeSetup displays these in alphabetical order, regardless of
# the order we use here)
SET(BUILD_SHARED_LIBS OFF CACHE BOOL
"Build shared libraries instead of static ones.")
OPTION(PCRE_BUILD_PCRECPP "Build the PCRE C++ library (pcrecpp)." ON)
SET(PCRE_EBCDIC OFF CACHE BOOL
"Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems)")
SET(PCRE_LINK_SIZE "2" CACHE STRING
"Internal link size (2, 3 or 4 allowed). See LINK_SIZE in config.h.in for details.")
SET(PCRE_MATCH_LIMIT "10000000" CACHE STRING
"Default limit on internal looping. See MATCH_LIMIT in config.h.in for details.")
SET(PCRE_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING
"Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.")
SET(PCRE_NEWLINE "LF" CACHE STRING
"What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).")
SET(PCRE_NO_RECURSE OFF CACHE BOOL
"If ON, then don't use stack recursion when matching. See NO_RECURSE in config.h.in for details.")
SET(PCRE_POSIX_MALLOC_THRESHOLD "10" CACHE STRING
"Threshold for malloc() usage. See POSIX_MALLOC_THRESHOLD in config.h.in for details.")
SET(PCRE_SUPPORT_UNICODE_PROPERTIES OFF CACHE BOOL
"Enable support for Unicode properties. (If set, UTF-8 support will be enabled as well)")
SET(PCRE_SUPPORT_UTF8 OFF CACHE BOOL
"Enable support for the Unicode UTF-8 encoding.")
SET(PCRE_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
OPTION(PCRE_SHOW_REPORT "Show the final configuration report" ON)
OPTION(PCRE_BUILD_PCREGREP "Build pcregrep" ON)
OPTION(PCRE_BUILD_TESTS "Build the tests" ON)
IF (PCRE_BUILD_TESTS)
IF (NOT PCRE_BUILD_PCREGREP)
MESSAGE(STATUS "** Building tests requires pcregrep: PCRE_BUILD_PCREGREP forced ON")
SET(PCRE_BUILD_PCREGREP ON)
ENDIF(NOT PCRE_BUILD_PCREGREP)
ENDIF(PCRE_BUILD_TESTS)
IF (MINGW)
OPTION(NON_STANDARD_LIB_PREFIX
"ON=Shared libraries built in mingw will be named pcre.dll, etc., instead of libpcre.dll, etc."
OFF)
OPTION(NON_STANDARD_LIB_SUFFIX
"ON=Shared libraries built in mingw will be named libpcre-0.dll, etc., instead of libpcre.dll, etc."
OFF)
ENDIF(MINGW)
# bzip2 lib
IF(BZIP2_FOUND)
OPTION (PCRE_SUPPORT_LIBBZ2 "Enable support for linking pcregrep with libbz2." ON)
ENDIF(BZIP2_FOUND)
IF(PCRE_SUPPORT_LIBBZ2)
INCLUDE_DIRECTORIES(${BZIP2_INCLUDE_DIR})
ENDIF(PCRE_SUPPORT_LIBBZ2)
# zlib
IF(ZLIB_FOUND)
OPTION (PCRE_SUPPORT_LIBZ "Enable support for linking pcregrep with libz." ON)
ENDIF(ZLIB_FOUND)
IF(PCRE_SUPPORT_LIBZ)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
ENDIF(PCRE_SUPPORT_LIBZ)
# readline lib
IF(READLINE_FOUND)
OPTION (PCRE_SUPPORT_LIBREADLINE "Enable support for linking pcretest with libreadline." ON)
ENDIF(READLINE_FOUND)
IF(PCRE_SUPPORT_LIBREADLINE)
INCLUDE_DIRECTORIES(${READLINE_INCLUDE_DIR})
ENDIF(PCRE_SUPPORT_LIBREADLINE)
# Prepare build configuration
SET(pcre_have_type_traits 0)
SET(pcre_have_bits_type_traits 0)
IF(HAVE_TYPE_TRAITS_H)
SET(pcre_have_type_traits 1)
ENDIF(HAVE_TYPE_TRAITS_H)
IF(HAVE_BITS_TYPE_TRAITS_H)
SET(pcre_have_bits_type_traits 1)
ENDIF(HAVE_BITS_TYPE_TRAITS_H)
SET(pcre_have_long_long 0)
SET(pcre_have_ulong_long 0)
IF(HAVE_LONG_LONG)
SET(pcre_have_long_long 1)
ENDIF(HAVE_LONG_LONG)
IF(HAVE_UNSIGNED_LONG_LONG)
SET(pcre_have_ulong_long 1)
ENDIF(HAVE_UNSIGNED_LONG_LONG)
IF(NOT BUILD_SHARED_LIBS)
SET(PCRE_STATIC 1)
ENDIF(NOT BUILD_SHARED_LIBS)
IF(PCRE_SUPPORT_BSR_ANYCRLF)
SET(BSR_ANYCRLF 1)
ENDIF(PCRE_SUPPORT_BSR_ANYCRLF)
IF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
SET(SUPPORT_UTF8 1)
ENDIF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
IF(PCRE_SUPPORT_UNICODE_PROPERTIES)
SET(SUPPORT_UCP 1)
ENDIF(PCRE_SUPPORT_UNICODE_PROPERTIES)
# This next one used to contain
# SET(PCRETEST_LIBS ${READLINE_LIBRARY})
# but I was advised to add the NCURSES test as well, along with
# some modifications to cmake/FindReadline.cmake which should
# make it possible to override the default if necessary. PH
IF(PCRE_SUPPORT_LIBREADLINE)
SET(SUPPORT_LIBREADLINE 1)
SET(PCRETEST_LIBS ${READLINE_LIBRARY} ${NCURSES_LIBRARY})
ENDIF(PCRE_SUPPORT_LIBREADLINE)
IF(PCRE_SUPPORT_LIBZ)
SET(SUPPORT_LIBZ 1)
SET(PCREGREP_LIBS ${PCREGREP_LIBS} ${ZLIB_LIBRARIES})
ENDIF(PCRE_SUPPORT_LIBZ)
IF(PCRE_SUPPORT_LIBBZ2)
SET(SUPPORT_LIBBZ2 1)
SET(PCREGREP_LIBS ${PCREGREP_LIBS} ${BZIP2_LIBRARIES})
ENDIF(PCRE_SUPPORT_LIBBZ2)
SET(NEWLINE "")
IF(PCRE_NEWLINE STREQUAL "LF")
SET(NEWLINE "10")
ENDIF(PCRE_NEWLINE STREQUAL "LF")
IF(PCRE_NEWLINE STREQUAL "CR")
SET(NEWLINE "13")
ENDIF(PCRE_NEWLINE STREQUAL "CR")
IF(PCRE_NEWLINE STREQUAL "CRLF")
SET(NEWLINE "3338")
ENDIF(PCRE_NEWLINE STREQUAL "CRLF")
IF(PCRE_NEWLINE STREQUAL "ANY")
SET(NEWLINE "-1")
ENDIF(PCRE_NEWLINE STREQUAL "ANY")
IF(PCRE_NEWLINE STREQUAL "ANYCRLF")
SET(NEWLINE "-2")
ENDIF(PCRE_NEWLINE STREQUAL "ANYCRLF")
IF(NEWLINE STREQUAL "")
MESSAGE(FATAL_ERROR "The PCRE_NEWLINE variable must be set to one of the following values: \"LF\", \"CR\", \"CRLF\", \"ANY\", \"ANYCRLF\".")
ENDIF(NEWLINE STREQUAL "")
IF(PCRE_EBCDIC)
SET(EBCDIC 1)
ENDIF(PCRE_EBCDIC)
IF(PCRE_NO_RECURSE)
SET(NO_RECURSE 1)
ENDIF(PCRE_NO_RECURSE)
# Output files
CONFIGURE_FILE(config-cmake.h.in
${PROJECT_BINARY_DIR}/config.h
@ONLY)
CONFIGURE_FILE(pcre.h.generic
${PROJECT_BINARY_DIR}/pcre.h
COPYONLY)
# What about pcre-config and libpcre.pc?
IF(PCRE_BUILD_PCRECPP)
CONFIGURE_FILE(pcre_stringpiece.h.in
${PROJECT_BINARY_DIR}/pcre_stringpiece.h
@ONLY)
CONFIGURE_FILE(pcrecpparg.h.in
${PROJECT_BINARY_DIR}/pcrecpparg.h
@ONLY)
ENDIF(PCRE_BUILD_PCRECPP)
# Character table generation
OPTION(PCRE_REBUILD_CHARTABLES "Rebuild char tables" OFF)
IF(PCRE_REBUILD_CHARTABLES)
ADD_EXECUTABLE(dftables dftables.c)
GET_TARGET_PROPERTY(DFTABLES_EXE dftables LOCATION)
ADD_CUSTOM_COMMAND(
COMMENT "Generating character tables (pcre_chartables.c) for current locale"
DEPENDS dftables
COMMAND ${DFTABLES_EXE}
ARGS ${PROJECT_BINARY_DIR}/pcre_chartables.c
OUTPUT ${PROJECT_BINARY_DIR}/pcre_chartables.c
)
ELSE(PCRE_REBUILD_CHARTABLES)
CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/pcre_chartables.c.dist
${PROJECT_BINARY_DIR}/pcre_chartables.c
COPYONLY)
ENDIF(PCRE_REBUILD_CHARTABLES)
# Source code
SET(PCRE_HEADERS ${PROJECT_BINARY_DIR}/pcre.h)
SET(PCRE_SOURCES
${PROJECT_BINARY_DIR}/pcre_chartables.c
pcre_compile.c
pcre_config.c
pcre_dfa_exec.c
pcre_exec.c
pcre_fullinfo.c
pcre_get.c
pcre_globals.c
pcre_info.c
pcre_newline.c
pcre_maketables.c
pcre_ord2utf8.c
pcre_refcount.c
pcre_study.c
pcre_tables.c
pcre_try_flipped.c
pcre_ucd.c
pcre_valid_utf8.c
pcre_version.c
pcre_xclass.c
)
SET(PCREPOSIX_HEADERS pcreposix.h)
SET(PCREPOSIX_SOURCES pcreposix.c)
SET(PCRECPP_HEADERS
pcrecpp.h
pcre_scanner.h
${PROJECT_BINARY_DIR}/pcrecpparg.h
${PROJECT_BINARY_DIR}/pcre_stringpiece.h
)
SET(PCRECPP_SOURCES
pcrecpp.cc
pcre_scanner.cc
pcre_stringpiece.cc
)
# Build setup
ADD_DEFINITIONS(-DHAVE_CONFIG_H)
IF(MSVC)
ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE)
ENDIF(MSVC)
SET(CMAKE_INCLUDE_CURRENT_DIR 1)
# needed to make sure to not link debug libs
# against release libs and vice versa
IF(WIN32)
SET(CMAKE_DEBUG_POSTFIX "d")
ENDIF(WIN32)
SET(targets)
# Libraries
# pcre
ADD_LIBRARY(pcre ${PCRE_HEADERS} ${PCRE_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET(targets ${targets} pcre)
ADD_LIBRARY(pcreposix ${PCREPOSIX_HEADERS} ${PCREPOSIX_SOURCES})
SET(targets ${targets} pcreposix)
TARGET_LINK_LIBRARIES(pcreposix pcre)
IF(MINGW AND NOT PCRE_STATIC)
IF(NON_STANDARD_LIB_PREFIX)
SET_TARGET_PROPERTIES(pcre pcreposix
PROPERTIES PREFIX ""
)
ENDIF(NON_STANDARD_LIB_PREFIX)
IF(NON_STANDARD_LIB_SUFFIX)
SET_TARGET_PROPERTIES(pcre pcreposix
PROPERTIES SUFFIX "-0.dll"
)
ENDIF(NON_STANDARD_LIB_SUFFIX)
ENDIF(MINGW AND NOT PCRE_STATIC)
# pcrecpp
IF(PCRE_BUILD_PCRECPP)
ADD_LIBRARY(pcrecpp ${PCRECPP_HEADERS} ${PCRECPP_SOURCES})
SET(targets ${targets} pcrecpp)
TARGET_LINK_LIBRARIES(pcrecpp pcre)
IF(MINGW AND NOT PCRE_STATIC)
IF(NON_STANDARD_LIB_PREFIX)
SET_TARGET_PROPERTIES(pcrecpp
PROPERTIES PREFIX ""
)
ENDIF(NON_STANDARD_LIB_PREFIX)
IF(NON_STANDARD_LIB_SUFFIX)
SET_TARGET_PROPERTIES(pcrecpp
PROPERTIES SUFFIX "-0.dll"
)
ENDIF(NON_STANDARD_LIB_SUFFIX)
ENDIF(MINGW AND NOT PCRE_STATIC)
ENDIF(PCRE_BUILD_PCRECPP)
ADD_LIBRARY(pcre STATIC ${pcre_SRCS})
# Executables
# Removed by PH (2008-01-23) because pcredemo shouldn't really be built
# automatically, and it gave trouble in some environments anyway.
# ADD_EXECUTABLE(pcredemo pcredemo.c)
# TARGET_LINK_LIBRARIES(pcredemo pcreposix)
# IF(NOT BUILD_SHARED_LIBS)
# # make sure to not use declspec(dllimport) in static mode on windows
# SET_TARGET_PROPERTIES(pcredemo PROPERTIES COMPILE_FLAGS "-DPCRE_STATIC")
# ENDIF(NOT BUILD_SHARED_LIBS)
IF(PCRE_BUILD_PCREGREP)
ADD_EXECUTABLE(pcregrep pcregrep.c)
SET(targets ${targets} pcregrep)
TARGET_LINK_LIBRARIES(pcregrep pcreposix ${PCREGREP_LIBS})
ENDIF(PCRE_BUILD_PCREGREP)
# Testing
#
# Builds pcretest (and, when the C++ wrapper is enabled, the three C++ unit
# test programs) and registers them with CTest.
IF(PCRE_BUILD_TESTS)
  ENABLE_TESTING()

  ADD_EXECUTABLE(pcretest pcretest.c)
  SET(targets ${targets} pcretest)
  TARGET_LINK_LIBRARIES(pcretest pcreposix ${PCRETEST_LIBS})

  IF(PCRE_BUILD_PCRECPP)
    ADD_EXECUTABLE(pcrecpp_unittest pcrecpp_unittest.cc)
    SET(targets ${targets} pcrecpp_unittest)
    TARGET_LINK_LIBRARIES(pcrecpp_unittest pcrecpp)
    # NOTE(review): no OPTION named NON_STANDARD_LIB_NAMES is visible in this
    # file (only NON_STANDARD_LIB_PREFIX and NON_STANDARD_LIB_SUFFIX are), so
    # this condition looks like a leftover - confirm before relying on it.
    IF(MINGW AND NON_STANDARD_LIB_NAMES AND NOT PCRE_STATIC)
      SET_TARGET_PROPERTIES(pcrecpp
                            PROPERTIES PREFIX ""
      )
    ENDIF(MINGW AND NON_STANDARD_LIB_NAMES AND NOT PCRE_STATIC)

    ADD_EXECUTABLE(pcre_scanner_unittest pcre_scanner_unittest.cc)
    SET(targets ${targets} pcre_scanner_unittest)
    TARGET_LINK_LIBRARIES(pcre_scanner_unittest pcrecpp)

    ADD_EXECUTABLE(pcre_stringpiece_unittest pcre_stringpiece_unittest.cc)
    SET(targets ${targets} pcre_stringpiece_unittest)
    TARGET_LINK_LIBRARIES(pcre_stringpiece_unittest pcrecpp)
  ENDIF(PCRE_BUILD_PCRECPP)

  GET_TARGET_PROPERTY(PCREGREP_EXE pcregrep DEBUG_LOCATION)
  GET_TARGET_PROPERTY(PCRETEST_EXE pcretest DEBUG_LOCATION)

  # Write out a CTest configuration file that sets some needed environment
  # variables for the test scripts.
  #
  FILE(WRITE ${PROJECT_BINARY_DIR}/CTestCustom.ctest
"# This is a generated file.
SET(ENV{srcdir} ${PROJECT_SOURCE_DIR})
SET(ENV{pcregrep} ${PCREGREP_EXE})
SET(ENV{pcretest} ${PCRETEST_EXE})
")

  IF(UNIX)
    ADD_TEST(pcre_test ${PROJECT_SOURCE_DIR}/RunTest)
    ADD_TEST(pcre_grep_test ${PROJECT_SOURCE_DIR}/RunGrepTest)
  ENDIF(UNIX)
  IF(WIN32)
    ADD_TEST(pcre_test cmd /C ${PROJECT_SOURCE_DIR}/RunTest.bat)
  ENDIF(WIN32)

  # The C++ unit test targets exist only when the C++ wrapper is built, so
  # their locations are queried and their tests registered only in that case.
  IF(PCRE_BUILD_PCRECPP)
    GET_TARGET_PROPERTY(PCRECPP_UNITTEST_EXE
                        pcrecpp_unittest
                        DEBUG_LOCATION)

    GET_TARGET_PROPERTY(PCRE_SCANNER_UNITTEST_EXE
                        pcre_scanner_unittest
                        DEBUG_LOCATION)

    GET_TARGET_PROPERTY(PCRE_STRINGPIECE_UNITTEST_EXE
                        pcre_stringpiece_unittest
                        DEBUG_LOCATION)

    ADD_TEST(pcrecpp_test ${PCRECPP_UNITTEST_EXE})
    ADD_TEST(pcre_scanner_test ${PCRE_SCANNER_UNITTEST_EXE})
    ADD_TEST(pcre_stringpiece_test ${PCRE_STRINGPIECE_UNITTEST_EXE})
  ENDIF(PCRE_BUILD_PCRECPP)
ENDIF(PCRE_BUILD_TESTS)
# Installation
SET(CMAKE_INSTALL_ALWAYS 1)
INSTALL(TARGETS ${targets}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)
INSTALL(FILES ${PCRE_HEADERS} ${PCREPOSIX_HEADERS} DESTINATION include)
FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)
IF(PCRE_BUILD_PCRECPP)
  INSTALL(FILES ${PCRECPP_HEADERS} DESTINATION include)
ELSE(PCRE_BUILD_PCRECPP)
  # Remove pcrecpp.3: the C++ wrapper is not being built, so its man page
  # must not be installed. The filtered list is built up in man3_new, which
  # starts out empty; every page except pcrecpp.3 is appended to it, and the
  # result then replaces man3.
  SET(man3_new)
  FOREACH(man ${man3})
    GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
    IF(NOT man_tmp STREQUAL "pcrecpp.3")
      SET(man3_new ${man3_new} ${man})
    ENDIF(NOT man_tmp STREQUAL "pcrecpp.3")
  ENDFOREACH(man ${man3})
  SET(man3 ${man3_new})
ENDIF(PCRE_BUILD_PCRECPP)
INSTALL(FILES ${man1} DESTINATION man/man1)
INSTALL(FILES ${man3} DESTINATION man/man3)
INSTALL(FILES ${html} DESTINATION share/doc/pcre/html)
# help, only for nice output
IF(BUILD_SHARED_LIBS)
SET(BUILD_STATIC_LIBS OFF)
ELSE(BUILD_SHARED_LIBS)
SET(BUILD_STATIC_LIBS ON)
ENDIF(BUILD_SHARED_LIBS)
# Final configuration report: prints the chosen settings as STATUS messages.
IF(PCRE_SHOW_REPORT)
  # Upper-cased build type, used to pick CMAKE_<LANG>_FLAGS_<TYPE> below
  STRING(TOUPPER "${CMAKE_BUILD_TYPE}" buildtype)
  # Separator between the general and per-build-type flags; set only when
  # there are general flags to separate from.
  IF (CMAKE_C_FLAGS)
    SET(cfsp " ")
  ENDIF(CMAKE_C_FLAGS)
  IF (CMAKE_CXX_FLAGS)
    SET(cxxfsp " ")
  ENDIF(CMAKE_CXX_FLAGS)
  MESSAGE(STATUS "")
  MESSAGE(STATUS "")
  MESSAGE(STATUS "PCRE configuration summary:")
  MESSAGE(STATUS "")
  MESSAGE(STATUS " Install prefix .................. : ${CMAKE_INSTALL_PREFIX}")
  MESSAGE(STATUS " C compiler ...................... : ${CMAKE_C_COMPILER}")
  MESSAGE(STATUS " C++ compiler .................... : ${CMAKE_CXX_COMPILER}")
  MESSAGE(STATUS " C compiler flags ................ : ${CMAKE_C_FLAGS}${cfsp}${CMAKE_C_FLAGS_${buildtype}}")
  MESSAGE(STATUS " C++ compiler flags .............. : ${CMAKE_CXX_FLAGS}${cxxfsp}${CMAKE_CXX_FLAGS_${buildtype}}")
  MESSAGE(STATUS "")
  MESSAGE(STATUS " Build C++ library ............... : ${PCRE_BUILD_PCRECPP}")
  # Report the UTF-8 option itself, not the Unicode properties option that
  # is shown on the next line.
  MESSAGE(STATUS " Enable UTF-8 support ............ : ${PCRE_SUPPORT_UTF8}")
  MESSAGE(STATUS " Unicode properties .............. : ${PCRE_SUPPORT_UNICODE_PROPERTIES}")
  MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE_NEWLINE}")
  MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE_SUPPORT_BSR_ANYCRLF}")
  MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE_EBCDIC}")
  MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE_REBUILD_CHARTABLES}")
  MESSAGE(STATUS " No stack recursion .............. : ${PCRE_NO_RECURSE}")
  MESSAGE(STATUS " POSIX mem threshold ............. : ${PCRE_POSIX_MALLOC_THRESHOLD}")
  MESSAGE(STATUS " Internal link size .............. : ${PCRE_LINK_SIZE}")
  MESSAGE(STATUS " Match limit ..................... : ${PCRE_MATCH_LIMIT}")
  MESSAGE(STATUS " Match limit recursion ........... : ${PCRE_MATCH_LIMIT_RECURSION}")
  MESSAGE(STATUS " Build shared libs ............... : ${BUILD_SHARED_LIBS}")
  MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}")
  MESSAGE(STATUS " Build pcregrep .................. : ${PCRE_BUILD_PCREGREP}")
  MESSAGE(STATUS " Build tests (implies pcretest) .. : ${PCRE_BUILD_TESTS}")
  IF(ZLIB_FOUND)
    MESSAGE(STATUS " Link pcregrep with libz ......... : ${PCRE_SUPPORT_LIBZ}")
  ELSE(ZLIB_FOUND)
    MESSAGE(STATUS " Link pcregrep with libz ......... : None" )
  ENDIF(ZLIB_FOUND)
  IF(BZIP2_FOUND)
    MESSAGE(STATUS " Link pcregrep with libbz2 ....... : ${PCRE_SUPPORT_LIBBZ2}")
  ELSE(BZIP2_FOUND)
    MESSAGE(STATUS " Link pcregrep with libbz2 ....... : None" )
  ENDIF(BZIP2_FOUND)
  IF(NOT PCRE_SUPPORT_LIBREADLINE)
    MESSAGE(STATUS " Link pcretest with libreadline .. : None" )
  ELSE(NOT PCRE_SUPPORT_LIBREADLINE)
    MESSAGE(STATUS " Link pcretest with libreadline .. : ${PCRE_SUPPORT_LIBREADLINE}")
  ENDIF(NOT PCRE_SUPPORT_LIBREADLINE)
  # The dll naming options are only offered for shared MinGW builds
  IF(MINGW AND NOT PCRE_STATIC)
    MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")
    MESSAGE(STATUS " Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}")
  ENDIF(MINGW AND NOT PCRE_STATIC)
  MESSAGE(STATUS "")
ENDIF(PCRE_SHOW_REPORT)
# end CMakeLists.txt

View File

@@ -1,68 +1,5 @@
PCRE LICENCE
------------
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Release 6 of PCRE is distributed under the terms of the "BSD" licence, as
specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.
The basic library functions are written in C and are freestanding. Also
included in the distribution is a set of C++ wrapper functions.
THE BASIC LIBRARY FUNCTIONS
---------------------------
Written by: Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2006 University of Cambridge
All rights reserved.
THE C++ WRAPPER FUNCTIONS
-------------------------
Contributed by: Google Inc.
Copyright (c) 2006, Google Inc.
All rights reserved.
THE "BSD" LICENCE
-----------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the name of Google
Inc. nor the names of their contributors may be used to endorse or
promote products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
Please see the file LICENCE in the PCRE distribution for licensing details.
End

File diff suppressed because it is too large Load Diff

113
libs/pcre/CleanTxt Executable file
View File

@@ -0,0 +1,113 @@
#! /usr/bin/perl -w
# Script to take the output of nroff -man and remove all the backspacing and
# the page footers and the screen commands etc so that it is more usefully
# readable online. In fact, in the latest nroff, intermediate footers don't
# seem to be generated any more.
# State carried from one input line to the next:
#   $blankcount  - blank lines read but not yet output
#   $lastwascut  - set when the most recent action was cutting out a chunk
#   $firstheader - true until the first header line has been printed
# $lastprinted (first assigned inside the loop) holds a previously printed
# line, used for the indentation comparison after a cut.
$blankcount = 0;
$lastwascut = 0;
$firstheader = 1;
# Input on STDIN; output to STDOUT.
while (<STDIN>)
{
s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
s/.\x8//g; # Remove "char, backspace"
# Handle header lines. Retain only the first one we encounter, but remove
# the blank line that follows. Any others (e.g. at end of document) and the
# following blank line are dropped.
if (/^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/)
{
if ($firstheader)
{
$firstheader = 0;
print;
$lastprinted = $_;
$lastwascut = 0;
}
$_=<STDIN>; # Remove a blank that follows
next;
}
# Count runs of empty lines
if (/^\s*$/)
{
$blankcount++;
$lastwascut = 0;
next;
}
# If a chunk of lines has been cut out (page footer) and the next line
# has a different indentation, put back one blank line.
if ($lastwascut && $blankcount < 1 && defined($lastprinted))
{
# Compare the leading whitespace of the remembered line and the current
# line. $a and $b here are plain package variables used as temporaries.
($a) = $lastprinted =~ /^(\s*)/;
($b) = $_ =~ /^(\s*)/;
$blankcount++ if ($a ne $b);
}
# We get here only when we have a non-blank line in hand. If it was preceded
# by 3 or more blank lines, read the next 3 lines and see if they are blank.
# If so, remove all 7 lines, and remember that we have just done a cut.
if ($blankcount >= 3)
{
for ($i = 0; $i < 3; $i++)
{
$next[$i] = <STDIN>;
# At end of input the missing lines are treated as empty strings
$next[$i] = "" if !defined $next[$i];
$next[$i] =~ s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
$next[$i] =~ s/.\x8//g; # Remove "char, backspace"
}
# Cut out chunks of the form <3 blanks><non-blank><3 blanks>
if ($next[0] =~ /^\s*$/ &&
$next[1] =~ /^\s*$/ &&
$next[2] =~ /^\s*$/)
{
# Only three of the pending blanks are consumed by the cut; any beyond
# that stay counted for later output.
$blankcount -= 3;
$lastwascut = 1;
}
# Otherwise output the saved blanks, the current, and the next three
# lines. Remember the last printed line.
else
{
for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
print;
for ($i = 0; $i < 3; $i++)
{
# Backspace sequences were already stripped when these lines were read
$next[$i] =~ s/.\x8//g;
print $next[$i];
# NOTE(review): this stores the current line ($_), not $next[$i], although
# the $next lines are printed after it - confirm whether that is intended.
$lastprinted = $_;
}
$lastwascut = 0;
$blankcount = 0;
}
}
# This non-blank line is not preceded by 3 or more blank lines. Output
# any blanks there are, and the line. Remember it. Force two blank lines
# before headings.
else
{
# A heading here is a line starting in column one, other than the
# "Last updated" and "Copyright" lines, once something has been printed.
$blankcount = 2 if /^\S/ && !/^Last updated/ && !/^Copyright/ &&
defined($lastprinted);
for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
print;
$lastprinted = $_;
$lastwascut = 0;
$blankcount = 0;
}
}
# End

35
libs/pcre/Detrail Executable file
View File

@@ -0,0 +1,35 @@
#!/usr/bin/perl
# This is a script for removing trailing whitespace from lines in files that
# are listed on the command line.
# This subroutine does the work for one file.
# The file named by the first argument is read completely, whitespace that
# precedes the newline at the end of any line is removed, and the file is
# rewritten only if at least one line changed. A final line that has no
# newline is left as it is. Dies, including the system error text, if the
# file cannot be opened for reading or writing, or if the rewritten file
# cannot be closed.
sub detrail {
    my ($file) = @_;

    # Three-argument open with lexical handles: the name is always treated as
    # a plain path, whatever characters it starts or ends with.
    open(my $in, '<', $file) or die "Can't open $file for input: $!";
    my @lines = <$in>;
    close($in);

    my $changed = 0;
    foreach (@lines) {
        $changed = 1 if s/\s+\n$/\n/;
    }

    if ($changed) {
        open(my $out, '>', $file) or die "Can't open $file for output: $!";
        print {$out} @lines;
        # Buffered write errors surface at close, so check it
        close($out) or die "Can't close $file after writing: $!";
    }
    return;
}
# This is the main program: every command-line argument is taken as the name
# of a file to be cleaned, in the order given.
$, = ""; # Output field separator
foreach my $path (@ARGV) {
    detrail($path);
}
# End

418
libs/pcre/HACKING Normal file
View File

@@ -0,0 +1,418 @@
Technical Notes about PCRE
--------------------------
These are very rough technical notes that record potentially useful information
about PCRE internals.
Historical note 1
-----------------
Many years ago I implemented some regular expression functions to an algorithm
suggested by Martin Richards. These were not Unix-like in form, and were quite
restricted in what they could do by comparison with Perl. The interesting part
about the algorithm was that the amount of space required to hold the compiled
form of an expression was known in advance. The code to apply an expression did
not operate by backtracking, as the original Henry Spencer code and current
Perl code does, but instead checked all possibilities simultaneously by keeping
a list of current states and checking all of them as it advanced through the
subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
algorithm", though it was not a traditional Finite State Machine (FSM). When
the pattern was all used up, all remaining states were possible matches, and
the one matching the longest subset of the subject string was chosen. This did
not necessarily maximize the individual wild portions of the pattern, as is
expected in Unix and Perl-style regular expressions.
Historical note 2
-----------------
By contrast, the code originally written by Henry Spencer (which was
subsequently heavily modified for Perl) compiles the expression twice: once in
a dummy mode in order to find out how much store will be needed, and then for
real. (The Perl version probably doesn't do this any more; I'm talking about
the original library.) The execution function operates by backtracking and
maximizing (or, optionally, minimizing in Perl) the amount of the subject that
matches individual wild portions of the pattern. This is an "NFA algorithm" in
Friedl's terminology.
OK, here's the real stuff
-------------------------
For the set of functions that form the "basic" PCRE library (which are
unrelated to those mentioned above), I tried at first to invent an algorithm
that used an amount of store bounded by a multiple of the number of characters
in the pattern, to save on compiling time. However, because of the greater
complexity in Perl regular expressions, I couldn't do this. In any case, a
first pass through the pattern is helpful for other reasons.
Computing the memory requirement: how it was
--------------------------------------------
Up to and including release 6.7, PCRE worked by running a very degenerate first
pass to calculate a maximum store size, and then a second pass to do the real
compile - which might use a bit less than the predicted amount of memory. The
idea was that this would turn out faster than the Henry Spencer code because
the first pass is degenerate and the second pass can just store stuff straight
into the vector, which it knows is big enough.
Computing the memory requirement: how it is
-------------------------------------------
By the time I was working on a potential 6.8 release, the degenerate first pass
had become very complicated and hard to maintain. Indeed one of the early
things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then
I had a flash of inspiration as to how I could run the real compile function in
a "fake" mode that enables it to compute how much memory it would need, while
actually only ever using a few hundred bytes of working memory, and without too
many tests of the mode that might slow it down. So I re-factored the compiling
functions to work this way. This got rid of about 600 lines of source. It
should make future maintenance and development easier. As this was such a major
change, I never released 6.8, instead upping the number to 7.0 (other quite
major changes are also present in the 7.0 release).
A side effect of this work is that the previous limit of 200 on the nesting
depth of parentheses was removed. However, there is a downside: pcre_compile()
runs more slowly than before (30% or more, depending on the pattern) because it
is doing a full analysis of the pattern. My hope is that this is not a big
issue.
Traditional matching function
-----------------------------
The "traditional", and original, matching function is called pcre_exec(), and
it implements an NFA algorithm, similar to the original Henry Spencer algorithm
and the way that Perl works. Not surprising, since it is intended to be as
compatible with Perl as possible. This is the function most users of PCRE will
use most of the time.
Supplementary matching function
-------------------------------
From PCRE 6.0, there is also a supplementary matching function called
pcre_dfa_exec(). This implements a DFA matching algorithm that searches
simultaneously for all possible matches that start at one point in the subject
string. (Going back to my roots: see Historical Note 1 above.) This function
interprets the same compiled pattern data as pcre_exec(); however, not all the
facilities are available, and those that are do not always work in quite the
same way. See the user documentation for details.
The algorithm that is used for pcre_dfa_exec() is not a traditional FSM,
because it may have a number of states active at one time. More work would be
needed at compile time to produce a traditional FSM where only one state is
ever active at once. I believe some other regex matchers work this way.
Format of compiled patterns
---------------------------
The compiled form of a pattern is a vector of bytes, containing items of
variable length. The first byte in an item is an opcode, and the length of the
item is either implicit in the opcode or contained in the data bytes that
follow it.
In many cases below LINK_SIZE data values are specified for offsets within the
compiled pattern. The default value for LINK_SIZE is 2, but PCRE can be
compiled to use 3-byte or 4-byte values for these offsets (impairing the
performance). This is necessary only when patterns whose compiled length is
greater than 64K are going to be processed. In this description, we assume the
"normal" compilation options. Data values that are counts (e.g. for
quantifiers) are always just two bytes long.
A list of the opcodes follows:
Opcodes with no following data
------------------------------
These items are all just one byte long
OP_END end of pattern
OP_ANY match any one character other than newline
OP_ALLANY match any one character, including newline
OP_ANYBYTE match any single byte, even in UTF-8 mode
OP_SOD match start of data: \A
OP_SOM start of match (subject + offset): \G
OP_SET_SOM set start of match (\K)
OP_CIRC ^ (start of data, or after \n in multiline)
OP_NOT_WORD_BOUNDARY \B
OP_WORD_BOUNDARY \b
OP_NOT_DIGIT \D
OP_DIGIT \d
OP_NOT_HSPACE \H
OP_HSPACE \h
OP_NOT_WHITESPACE \S
OP_WHITESPACE \s
OP_NOT_VSPACE \V
OP_VSPACE \v
OP_NOT_WORDCHAR \W
OP_WORDCHAR \w
OP_EODN match end of data or \n at end: \Z
OP_EOD match end of data: \z
OP_DOLL $ (end of data, or before \n in multiline)
OP_EXTUNI match an extended Unicode character
OP_ANYNL match any Unicode newline sequence
OP_ACCEPT )
OP_COMMIT )
OP_FAIL ) These are Perl 5.10's "backtracking
OP_PRUNE ) control verbs".
OP_SKIP )
OP_THEN )
Repeating single characters
---------------------------
The common repeats (*, +, ?) when applied to a single character use the
following opcodes:
OP_STAR
OP_MINSTAR
OP_POSSTAR
OP_PLUS
OP_MINPLUS
OP_POSPLUS
OP_QUERY
OP_MINQUERY
OP_POSQUERY
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
Those with "MIN" in their name are the minimizing versions. Those with "POS" in
their names are possessive versions. Each is followed by the character that is
to be repeated. Other repeats make use of
OP_UPTO
OP_MINUPTO
OP_POSUPTO
OP_EXACT
which are followed by a two-byte count (most significant first) and the
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
Repeating character types
-------------------------
Repeats of things like \d are done exactly as for single characters, except
that instead of a character, the opcode for the type is stored in the data
byte. The opcodes are:
OP_TYPESTAR
OP_TYPEMINSTAR
OP_TYPEPOSSTAR
OP_TYPEPLUS
OP_TYPEMINPLUS
OP_TYPEPOSPLUS
OP_TYPEQUERY
OP_TYPEMINQUERY
OP_TYPEPOSQUERY
OP_TYPEUPTO
OP_TYPEMINUPTO
OP_TYPEPOSUPTO
OP_TYPEEXACT
Match by Unicode property
-------------------------
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
character by testing its Unicode property (the \p and \P escape sequences).
Each is followed by two bytes that encode the desired property as a type and a
value.
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
three bytes: OP_PROP or OP_NOTPROP and then the desired property type and
value.
Matching literal characters
---------------------------
The OP_CHAR opcode is followed by a single character that is to be matched
casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
character may be more than one byte long. (Earlier versions of PCRE used
multi-character strings, but this was changed to allow some new features to be
added.)
Character classes
-----------------
If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
class, and OP_NOT for a negative one (that is, for something like [^a]).
However, in UTF-8 mode, the use of OP_NOT applies only to characters with
values < 128, because OP_NOT is confined to single bytes.
Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
negated, single-character class. The normal ones (OP_STAR etc.) are used for a
repeated positive single-character class.
When there's more than one character in a class and all the characters are less
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
one. In either case, the opcode is followed by a 32-byte bit map containing a 1
bit for every character that is acceptable. The bits are counted from the least
significant end of each byte.
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
subject characters with values greater than 255 can be handled correctly. For
OP_CLASS they don't match, whereas for OP_NCLASS they do.
For classes containing characters with values > 255, OP_XCLASS is used. It
optionally uses a bit map (if any characters lie within it), followed by a list
of pairs and single characters. There is a flag character that indicates
whether it's a positive or a negative class.
Back references
---------------
OP_REF is followed by two bytes containing the reference number.
Repeating character classes and back references
-----------------------------------------------
Single-character classes are handled specially (see above). This section
applies to OP_CLASS and OP_REF. In both cases, the repeat information follows
the base item. The matching code looks at the following opcode to see if it is
one of
OP_CRSTAR
OP_CRMINSTAR
OP_CRPLUS
OP_CRMINPLUS
OP_CRQUERY
OP_CRMINQUERY
OP_CRRANGE
OP_CRMINRANGE
All but the last two are just single-byte items. The others are followed by
four bytes of data, comprising the minimum and maximum repeat counts. There are
no special possessive opcodes for these repeats; a possessive repeat is
compiled into an atomic group.
Brackets and alternation
------------------------
A pair of non-capturing (round) brackets is wrapped round each expression at
compile time, so alternation always happens in the context of brackets.
[Note for North Americans: "bracket" to some English speakers, including
myself, can be round, square, curly, or pointy. Hence this usage.]
Non-capturing brackets use the opcode OP_BRA. Originally PCRE was limited to 99
capturing brackets and it used a different opcode for each one. From release
3.5, the limit was removed by putting the bracket number into the data for
higher-numbered brackets. From release 7.0 all capturing brackets are handled
this way, using the single opcode OP_CBRA.
A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
next alternative OP_ALT or, if there aren't any branches, to the matching
OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
the next one, or to the OP_KET opcode. For capturing brackets, the bracket
number immediately follows the offset, always as a 2-byte item.
OP_KET is used for subpatterns that do not repeat indefinitely, while
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
positive number) the offset back to the matching bracket opcode.
If a subpattern is quantified such that it is permitted to match zero times, it
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
single-byte opcodes that tell the matcher that skipping the following
subpattern entirely is a valid branch. In the case of the first two, not
skipping the pattern is also valid (greedy and non-greedy). The third is used
when a pattern has the quantifier {0,0}. It cannot be entirely discarded,
because it may be called as a subroutine from elsewhere in the regex.
A subpattern with an indefinite maximum repetition is replicated in the
compiled data its minimum number of times (or once with OP_BRAZERO if the
minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX
as appropriate.
A subpattern with a bounded maximum repetition is replicated in a nested
fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
before each replication after the minimum, so that, for example, (abc){2,5} is
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?, except that each bracketed group
has the same number.
When a repeated subpattern has an unbounded upper limit, it is checked to see
whether it could match an empty string. If this is the case, the opcode in the
final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher
that it needs to check for matching an empty string when it hits OP_KETRMIN or
OP_KETRMAX, and if so, to break the loop.
Assertions
----------
Forward assertions are just like other subpatterns, but starting with one of
the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
is OP_REVERSE, followed by a two byte count of the number of characters to move
back the pointer in the subject string. When operating in UTF-8 mode, the count
is a character count rather than a byte count. A separate count is present in
each alternative of a lookbehind assertion, allowing them to have different
fixed lengths.
Once-only (atomic) subpatterns
------------------------------
These are also just like other subpatterns, but they start with the opcode
OP_ONCE. The check for matching an empty string in an unbounded repeat is
handled entirely at runtime, so there is just this one opcode.
Conditional subpatterns
-----------------------
These are like other subpatterns, but they start with the opcode OP_COND, or
OP_SCOND for one that might match an empty string in an unbounded repeat. If
the condition is a back reference, this is stored at the start of the
subpattern using the opcode OP_CREF followed by two bytes containing the
reference number. If the condition is "in recursion" (coded as "(?(R)"), or "in
recursion of group x" (coded as "(?(Rx)"), the group number is stored at the
start of the subpattern using the opcode OP_RREF, and a value of zero for "the
whole pattern". For a DEFINE condition, just the single byte OP_DEF is used (it
has no associated data). Otherwise, a conditional subpattern always starts with
one of the assertions.
Recursion
---------
Recursion either matches the current regex, or some subexpression. The opcode
OP_RECURSE is followed by a value which is the offset to the starting bracket
from the start of the whole pattern. From release 6.5, OP_RECURSE is
automatically wrapped inside OP_ONCE brackets (because otherwise some patterns
broke it). OP_RECURSE is also used for "subroutine" calls, even though they
are not strictly a recursion.
Callout
-------
OP_CALLOUT is followed by one byte of data that holds a callout number in the
range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
cases there follows a two-byte value giving the offset in the pattern to the
start of the following item, and another two-byte item giving the length of the
next item.
Changing options
----------------
If any of the /i, /m, or /s options are changed within a pattern, an OP_OPT
opcode is compiled, followed by one byte containing the new settings of these
flags. If there are several alternatives, there is an occurrence of OP_OPT at
the start of all those following the first options change, to set appropriate
options for the start of the alternative. Immediately after the end of the
group there is another such item to reset the flags to their previous values. A
change of flag right at the very start of the pattern can be handled entirely
at compile time, and so does not cause anything to be put into the compiled
data.
Philip Hazel
April 2008

View File

@@ -1,41 +1,54 @@
Installation Instructions
*************************
Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
2006, 2007, 2008 Free Software Foundation, Inc.
This file is free documentation; the Free Software Foundation gives
unlimited permission to copy, distribute and modify it.
Basic Installation
==================
These are generic installation instructions that apply to systems that
can run the `configure' shell script - Unix systems and any that imitate
it. They are not specific to PCRE. There are PCRE-specific instructions
for non-Unix systems in the file NON-UNIX-USE.
Briefly, the shell commands `./configure; make; make install' should
configure, build, and install this package. The following
more-detailed instructions are generic; see the `README' file for
instructions specific to this package.
The `configure' shell script attempts to guess correct values for
various system-dependent variables used during compilation. It uses
those values to create a `Makefile' in each directory of the package.
It may also create one or more `.h' files containing system-dependent
definitions. Finally, it creates a shell script `config.status' that
you can run in the future to recreate the current configuration, a file
`config.cache' that saves the results of its tests to speed up
reconfiguring, and a file `config.log' containing compiler output
(useful mainly for debugging `configure').
you can run in the future to recreate the current configuration, and a
file `config.log' containing compiler output (useful mainly for
debugging `configure').
It can also use an optional file (typically called `config.cache'
and enabled with `--cache-file=config.cache' or simply `-C') that saves
the results of its tests to speed up reconfiguring. Caching is
disabled by default to prevent problems with accidental use of stale
cache files.
If you need to do unusual things to compile the package, please try
to figure out how `configure' could check whether to do them, and mail
diffs or instructions to the address given in the `README' so they can
be considered for the next release. If at some point `config.cache'
contains results you don't want to keep, you may remove or edit it.
be considered for the next release. If you are using the cache, and at
some point `config.cache' contains results you don't want to keep, you
may remove or edit it.
The file `configure.in' is used to create `configure' by a program
called `autoconf'. You only need `configure.in' if you want to change
it or regenerate `configure' using a newer version of `autoconf'.
The file `configure.ac' (or `configure.in') is used to create
`configure' by a program called `autoconf'. You need `configure.ac' if
you want to change it or regenerate `configure' using a newer version
of `autoconf'.
The simplest way to compile this package is:
1. `cd' to the directory containing the package's source code and type
`./configure' to configure the package for your system. If you're
using `csh' on an old version of System V, you might need to type
`sh ./configure' instead to prevent `csh' from trying to execute
`configure' itself.
`./configure' to configure the package for your system.
Running `configure' takes awhile. While running, it prints some
messages telling which features it is checking for.
Running `configure' might take a while. While running, it prints
some messages telling which features it is checking for.
2. Type `make' to compile the package.
@@ -54,52 +67,69 @@ The simplest way to compile this package is:
all sorts of other programs in order to regenerate files that came
with the distribution.
6. Often, you can also type `make uninstall' to remove the installed
files again.
Compilers and Options
=====================
Some systems require unusual options for compilation or linking that
the `configure' script does not know about. You can give `configure'
initial values for variables by setting them in the environment. Using
a Bourne-compatible shell, you can do that on the command line like
this:
CC=c89 CFLAGS=-O2 LIBS=-lposix ./configure
the `configure' script does not know about. Run `./configure --help'
for details on some of the pertinent environment variables.
Or on systems that have the `env' program, you can do it like this:
env CPPFLAGS=-I/usr/local/include LDFLAGS=-s ./configure
You can give `configure' initial values for configuration parameters
by setting variables in the command line or in the environment. Here
is an example:
./configure CC=c99 CFLAGS=-g LIBS=-lposix
*Note Defining Variables::, for more details.
Compiling For Multiple Architectures
====================================
You can compile the package for more than one kind of computer at the
same time, by placing the object files for each architecture in their
own directory. To do this, you must use a version of `make' that
supports the `VPATH' variable, such as GNU `make'. `cd' to the
own directory. To do this, you can use GNU `make'. `cd' to the
directory where you want the object files and executables to go and run
the `configure' script. `configure' automatically checks for the
source code in the directory that `configure' is in and in `..'.
If you have to use a `make' that does not supports the `VPATH'
variable, you have to compile the package for one architecture at a time
in the source code directory. After you have installed the package for
one architecture, use `make distclean' before reconfiguring for another
architecture.
With a non-GNU `make', it is safer to compile the package for one
architecture at a time in the source code directory. After you have
installed the package for one architecture, use `make distclean' before
reconfiguring for another architecture.
On MacOS X 10.5 and later systems, you can create libraries and
executables that work on multiple system types--known as "fat" or
"universal" binaries--by specifying multiple `-arch' options to the
compiler but only a single `-arch' option to the preprocessor. Like
this:
./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
CPP="gcc -E" CXXCPP="g++ -E"
This is not guaranteed to produce working output in all cases, you
may have to build one architecture at a time and combine the results
using the `lipo' tool if you have problems.
Installation Names
==================
By default, `make install' will install the package's files in
`/usr/local/bin', `/usr/local/man', etc. You can specify an
installation prefix other than `/usr/local' by giving `configure' the
option `--prefix=PATH'.
By default, `make install' installs the package's commands under
`/usr/local/bin', include files under `/usr/local/include', etc. You
can specify an installation prefix other than `/usr/local' by giving
`configure' the option `--prefix=PREFIX'.
You can specify separate installation prefixes for
architecture-specific files and architecture-independent files. If you
give `configure' the option `--exec-prefix=PATH', the package will use
PATH as the prefix for installing programs and libraries.
Documentation and other data files will still use the regular prefix.
pass the option `--exec-prefix=PREFIX' to `configure', the package uses
PREFIX as the prefix for installing programs and libraries.
Documentation and other data files still use the regular prefix.
In addition, if you use an unusual directory layout you can give
options like `--bindir=PATH' to specify different values for particular
options like `--bindir=DIR' to specify different values for particular
kinds of files. Run `configure --help' for a list of the directories
you can set and what kinds of files go in them.
@@ -122,25 +152,57 @@ find the X include and library files automatically, but if it doesn't,
you can use the `configure' options `--x-includes=DIR' and
`--x-libraries=DIR' to specify their locations.
Particular systems
==================
On HP-UX, the default C compiler is not ANSI C compatible. If GNU
CC is not installed, it is recommended to use the following options in
order to use an ANSI C compiler:
./configure CC="cc -Ae"
and if that doesn't work, install pre-built binaries of GCC for HP-UX.
On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
parse its `<wchar.h>' header file. The option `-nodtk' can be used as
a workaround. If GNU CC is not installed, it is therefore recommended
to try
./configure CC="cc"
and if that doesn't work, try
./configure CC="cc -nodtk"
Specifying the System Type
==========================
There may be some features `configure' can not figure out
automatically, but needs to determine by the type of host the package
will run on. Usually `configure' can figure that out, but if it prints
a message saying it can not guess the host type, give it the
`--host=TYPE' option. TYPE can either be a short name for the system
type, such as `sun4', or a canonical name with three fields:
There may be some features `configure' cannot figure out
automatically, but needs to determine by the type of machine the package
will run on. Usually, assuming the package is built to be run on the
_same_ architectures, `configure' can figure that out, but if it prints
a message saying it cannot guess the machine type, give it the
`--build=TYPE' option. TYPE can either be a short name for the system
type, such as `sun4', or a canonical name which has the form:
CPU-COMPANY-SYSTEM
See the file `config.sub' for the possible values of each field. If
`config.sub' isn't included in this package, then this package doesn't
need to know the host type.
where SYSTEM can have one of these forms:
If you are building compiler tools for cross-compiling, you can also
use the `--target=TYPE' option to select the type of system they will
produce code for and the `--build=TYPE' option to select the type of
system on which you are compiling the package.
OS KERNEL-OS
See the file `config.sub' for the possible values of each field. If
`config.sub' isn't included in this package, then this package doesn't
need to know the machine type.
If you are _building_ compiler tools for cross-compiling, you should
use the option `--target=TYPE' to select the type of system they will
produce code for.
If you want to _use_ a cross compiler, that generates code for a
platform different from the build platform, you should specify the
"host" platform (i.e., that on which the generated programs will
eventually be run) with `--host=TYPE'.
Sharing Defaults
================
@@ -153,19 +215,55 @@ default values for variables like `CC', `cache_file', and `prefix'.
`CONFIG_SITE' environment variable to the location of the site script.
A warning: not all `configure' scripts look for a site script.
Operation Controls
Defining Variables
==================
Variables not defined in a site shell script can be set in the
environment passed to `configure'. However, some packages may run
configure again during the build, and the customized values of these
variables may be lost. In order to avoid this problem, you should set
them in the `configure' command line, using `VAR=value'. For example:
./configure CC=/usr/local2/bin/gcc
causes the specified `gcc' to be used as the C compiler (unless it is
overridden in the site shell script).
Unfortunately, this technique does not work for `CONFIG_SHELL' due to
an Autoconf bug. Until the bug is fixed you can use this workaround:
CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
`configure' Invocation
======================
`configure' recognizes the following options to control how it
operates.
`--cache-file=FILE'
Use and save the results of the tests in FILE instead of
`./config.cache'. Set FILE to `/dev/null' to disable caching, for
debugging `configure'.
`--help'
Print a summary of the options to `configure', and exit.
`-h'
Print a summary of all of the options to `configure', and exit.
`--help=short'
`--help=recursive'
Print a summary of the options unique to this package's
`configure', and exit. The `short' variant lists options used
only in the top level, while the `recursive' variant lists options
also present in any nested packages.
`--version'
`-V'
Print the version of Autoconf used to generate the `configure'
script, and exit.
`--cache-file=FILE'
Enable the cache: use and save the results of the tests in FILE,
traditionally `config.cache'. FILE defaults to `/dev/null' to
disable caching.
`--config-cache'
`-C'
Alias for `--cache-file=config.cache'.
`--quiet'
`--silent'
@@ -178,9 +276,16 @@ operates.
Look for the package's source code in directory DIR. Usually
`configure' can determine that directory automatically.
`--version'
Print the version of Autoconf used to generate the `configure'
script, and exit.
`--prefix=DIR'
Use DIR as the installation prefix. *Note Installation Names::
for more details, including other options available for fine-tuning
the installation locations.
`configure' also accepts some other, not widely useful, options.
`--no-create'
`-n'
Run the configure checks, but stop before creating any output
files.
`configure' also accepts some other, not widely useful, options. Run
`configure --help' for more details.

View File

@@ -4,7 +4,7 @@ PCRE LICENCE
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Release 6 of PCRE is distributed under the terms of the "BSD" licence, as
Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
specified below. The documentation for PCRE, supplied in the "doc"
directory, is distributed under the same terms as the software itself.
@@ -20,9 +20,9 @@ Email local part: ph10
Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Cambridge, England.
Copyright (c) 1997-2006 University of Cambridge
Copyright (c) 1997-2009 University of Cambridge
All rights reserved.
@@ -31,7 +31,7 @@ THE C++ WRAPPER FUNCTIONS
Contributed by: Google Inc.
Copyright (c) 2006, Google Inc.
Copyright (c) 2007-2008, Google Inc.
All rights reserved.

390
libs/pcre/Makefile.am Normal file
View File

@@ -0,0 +1,390 @@
## Process this file with automake to produce Makefile.in.
dist_doc_DATA = \
doc/pcre.txt \
doc/pcre-config.txt \
doc/pcregrep.txt \
doc/pcretest.txt \
AUTHORS \
COPYING \
ChangeLog \
LICENCE \
NEWS \
README
dist_html_DATA = \
doc/html/index.html \
doc/html/pcre.html \
doc/html/pcre-config.html \
doc/html/pcre_compile.html \
doc/html/pcre_compile2.html \
doc/html/pcre_config.html \
doc/html/pcre_copy_named_substring.html \
doc/html/pcre_copy_substring.html \
doc/html/pcre_dfa_exec.html \
doc/html/pcre_exec.html \
doc/html/pcre_free_substring.html \
doc/html/pcre_free_substring_list.html \
doc/html/pcre_fullinfo.html \
doc/html/pcre_get_named_substring.html \
doc/html/pcre_get_stringnumber.html \
doc/html/pcre_get_stringtable_entries.html \
doc/html/pcre_get_substring.html \
doc/html/pcre_get_substring_list.html \
doc/html/pcre_info.html \
doc/html/pcre_maketables.html \
doc/html/pcre_refcount.html \
doc/html/pcre_study.html \
doc/html/pcre_version.html \
doc/html/pcreapi.html \
doc/html/pcrebuild.html \
doc/html/pcrecallout.html \
doc/html/pcrecompat.html \
doc/html/pcregrep.html \
doc/html/pcrematching.html \
doc/html/pcrepartial.html \
doc/html/pcrepattern.html \
doc/html/pcreperform.html \
doc/html/pcreposix.html \
doc/html/pcreprecompile.html \
doc/html/pcresample.html \
doc/html/pcrestack.html \
doc/html/pcresyntax.html \
doc/html/pcretest.html
pcrecpp_html = doc/html/pcrecpp.html
dist_noinst_DATA = $(pcrecpp_html)
if WITH_PCRE_CPP
html_DATA = $(pcrecpp_html)
endif
# The Libtool libraries to install. We'll add to this later.
lib_LTLIBRARIES =
# Unit tests you want to run when people type 'make check'.
# TESTS is for binary unit tests, check_SCRIPTS for script-based tests
TESTS =
check_SCRIPTS =
dist_noinst_SCRIPTS =
# Some of the binaries we make are to be installed, and others are
# (non-user-visible) helper programs needed to build libpcre.
bin_PROGRAMS =
noinst_PROGRAMS =
# Additional files to delete on 'make clean' and 'make maintainer-clean'.
CLEANFILES =
MAINTAINERCLEANFILES =
# Additional files to bundle with the distribution, over and above what
# the Autotools include by default.
EXTRA_DIST =
# These files contain maintenance information
EXTRA_DIST += \
doc/perltest.txt \
NON-UNIX-USE \
HACKING
# These files are used in the preparation of a release
EXTRA_DIST += \
PrepareRelease \
CleanTxt \
Detrail \
132html \
doc/index.html.src
# These files are to do with building for Virtual Pascal
EXTRA_DIST += \
makevp.bat \
makevp_c.txt \
makevp_l.txt \
pcregexp.pas
# These files are usable versions of pcre.h and config.h that are distributed
# for the benefit of people who are building PCRE manually, without the
# Autotools support.
EXTRA_DIST += \
pcre.h.generic \
config.h.generic
# Rebuild pcre.h.generic whenever configure.ac is newer: discard the old copy
# and replace it with a copy of the current pcre.h.
pcre.h.generic: configure.ac
rm -f $@
cp -p pcre.h $@
# The regenerated copy is removed by 'make maintainer-clean'.
MAINTAINERCLEANFILES += pcre.h.generic
# These are the header files we'll install. We do not distribute pcre.h because
# it is generated from pcre.h.in.
nodist_include_HEADERS = \
pcre.h
include_HEADERS = \
pcreposix.h
# These additional headers will be installed if C++ support is enabled. We
# do not distribute pcrecpparg.h or pcre_stringpiece.h, as these are generated
# from corresponding .h.in files (which we do distribute).
if WITH_PCRE_CPP
nodist_include_HEADERS += \
pcrecpparg.h \
pcre_stringpiece.h
include_HEADERS += \
pcrecpp.h \
pcre_scanner.h
endif # WITH_PCRE_CPP
bin_SCRIPTS = pcre-config
## ---------------------------------------------------------------
## The dftables program is used to rebuild character tables before compiling
## PCRE, if --enable-rebuild-chartables is specified. It is not a user-visible
## program. The default (when --enable-rebuild-chartables is not specified) is
## to copy a distributed set of tables that are defined for ASCII code. In this
## case, dftables is not needed.
if WITH_REBUILD_CHARTABLES
# Build the dftables helper (noinst, so it is never installed) and run it with
# the target name as its only argument to write pcre_chartables.c.
noinst_PROGRAMS += dftables
dftables_SOURCES = dftables.c
pcre_chartables.c: dftables$(EXEEXT)
./dftables$(EXEEXT) $@
else
# No rebuild requested: remove any existing pcre_chartables.c and link it to
# the distributed default tables using $(LN_S).
pcre_chartables.c: $(srcdir)/pcre_chartables.c.dist
rm -f $@
$(LN_S) $(srcdir)/pcre_chartables.c.dist $@
endif # WITH_REBUILD_CHARTABLES
## The main pcre library
lib_LTLIBRARIES += libpcre.la
libpcre_la_SOURCES = \
pcre_compile.c \
pcre_config.c \
pcre_dfa_exec.c \
pcre_exec.c \
pcre_fullinfo.c \
pcre_get.c \
pcre_globals.c \
pcre_info.c \
pcre_internal.h \
pcre_maketables.c \
pcre_newline.c \
pcre_ord2utf8.c \
pcre_refcount.c \
pcre_study.c \
pcre_tables.c \
pcre_try_flipped.c \
pcre_ucd.c \
pcre_valid_utf8.c \
pcre_version.c \
pcre_xclass.c \
ucp.h
## This file is generated as part of the building process, so don't distribute.
nodist_libpcre_la_SOURCES = \
pcre_chartables.c
# The pcre_printint.src file is #included by some source files, so it must be
# distributed. The pcre_chartables.c.dist file is the default version of
# pcre_chartables.c, used unless --enable-rebuild-chartables is specified.
EXTRA_DIST += pcre_printint.src pcre_chartables.c.dist
libpcre_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS)
CLEANFILES += pcre_chartables.c
## A version of the main pcre library that has a posix re API.
lib_LTLIBRARIES += libpcreposix.la
libpcreposix_la_SOURCES = \
pcreposix.c
libpcreposix_la_LDFLAGS = $(EXTRA_LIBPCREPOSIX_LDFLAGS)
libpcreposix_la_LIBADD = libpcre.la
## There's a C++ library as well.
if WITH_PCRE_CPP
lib_LTLIBRARIES += libpcrecpp.la
libpcrecpp_la_SOURCES = \
pcrecpp_internal.h \
pcrecpp.cc \
pcre_scanner.cc \
pcre_stringpiece.cc
libpcrecpp_la_LDFLAGS = $(EXTRA_LIBPCRECPP_LDFLAGS)
libpcrecpp_la_LIBADD = libpcre.la
TESTS += pcrecpp_unittest
noinst_PROGRAMS += pcrecpp_unittest
pcrecpp_unittest_SOURCES = pcrecpp_unittest.cc
pcrecpp_unittest_LDADD = libpcrecpp.la
TESTS += pcre_scanner_unittest
noinst_PROGRAMS += pcre_scanner_unittest
pcre_scanner_unittest_SOURCES = pcre_scanner_unittest.cc
pcre_scanner_unittest_LDADD = libpcrecpp.la
TESTS += pcre_stringpiece_unittest
noinst_PROGRAMS += pcre_stringpiece_unittest
pcre_stringpiece_unittest_SOURCES = pcre_stringpiece_unittest.cc
pcre_stringpiece_unittest_LDADD = libpcrecpp.la
endif # WITH_PCRE_CPP
## The main unit tests
# Each unit test is a binary plus a script that runs that binary in various
# ways. We install these test binaries in case folks find it helpful.
TESTS += RunTest
dist_noinst_SCRIPTS += RunTest
EXTRA_DIST += RunTest.bat
bin_PROGRAMS += pcretest
pcretest_SOURCES = pcretest.c
pcretest_LDADD = libpcreposix.la $(LIBREADLINE)
TESTS += RunGrepTest
dist_noinst_SCRIPTS += RunGrepTest
bin_PROGRAMS += pcregrep
pcregrep_SOURCES = pcregrep.c
pcregrep_LDADD = libpcreposix.la $(LIBZ) $(LIBBZ2)
EXTRA_DIST += \
testdata/grepinput \
testdata/grepinput8 \
testdata/grepinputv \
testdata/grepinputx \
testdata/greplist \
testdata/grepoutput \
testdata/grepoutput8 \
testdata/grepoutputN \
testdata/testinput1 \
testdata/testinput2 \
testdata/testinput3 \
testdata/testinput4 \
testdata/testinput5 \
testdata/testinput6 \
testdata/testinput7 \
testdata/testinput8 \
testdata/testinput9 \
testdata/testinput10 \
testdata/testoutput1 \
testdata/testoutput2 \
testdata/testoutput3 \
testdata/testoutput4 \
testdata/testoutput5 \
testdata/testoutput6 \
testdata/testoutput7 \
testdata/testoutput8 \
testdata/testoutput9 \
testdata/testoutput10 \
testdata/wintestinput3 \
testdata/wintestoutput3 \
perltest.pl
CLEANFILES += \
testsavedregex \
teststderr \
testtry \
testNinput
# PCRE demonstration program. No longer built automatically. The point is that
# the users should build it themselves. So just distribute the source.
# noinst_PROGRAMS += pcredemo
# pcredemo_SOURCES = pcredemo.c
# pcredemo_LDADD = libpcre.la
EXTRA_DIST += pcredemo.c
## Utility rules, documentation, etc.
# A compatibility line, the old build system worked with 'make test'
test: check ;
# A PCRE user submitted the following addition, saying that it "will allow
# anyone using the 'mingw32' compiler to simply type 'make pcre.dll' and get a
# nice DLL for Windows use". (It is used by the pcre.dll target.)
DLL_OBJS= pcre_compile.o pcre_config.o \
pcre_dfa_exec.o pcre_exec.o pcre_fullinfo.o pcre_get.o \
pcre_globals.o pcre_info.o pcre_maketables.o \
pcre_newline.o pcre_ord2utf8.o pcre_refcount.o \
pcre_study.o pcre_tables.o pcre_try_flipped.o \
pcre_ucd.o pcre_valid_utf8.o pcre_version.o \
pcre_chartables.o \
pcre_xclass.o
# A PCRE user submitted the following addition, saying that it "will allow
# anyone using the 'mingw32' compiler to simply type 'make pcre.dll' and get a
# nice DLL for Windows use".
# Link every object in DLL_OBJS into a shared pcre.dll. The -Wl options pass
# --strip-all and --export-all-symbols through to the linker.
pcre.dll: $(DLL_OBJS)
$(CC) -shared -o pcre.dll -Wl,"--strip-all" -Wl,"--export-all-symbols" $(DLL_OBJS)
# We have .pc files for pkg-config users.
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = libpcre.pc
if WITH_PCRE_CPP
pkgconfig_DATA += libpcrecpp.pc
endif
dist_man_MANS = \
doc/pcre.3 \
doc/pcre-config.1 \
doc/pcre_compile.3 \
doc/pcre_compile2.3 \
doc/pcre_config.3 \
doc/pcre_copy_named_substring.3 \
doc/pcre_copy_substring.3 \
doc/pcre_dfa_exec.3 \
doc/pcre_exec.3 \
doc/pcre_free_substring.3 \
doc/pcre_free_substring_list.3 \
doc/pcre_fullinfo.3 \
doc/pcre_get_named_substring.3 \
doc/pcre_get_stringnumber.3 \
doc/pcre_get_stringtable_entries.3 \
doc/pcre_get_substring.3 \
doc/pcre_get_substring_list.3 \
doc/pcre_info.3 \
doc/pcre_maketables.3 \
doc/pcre_refcount.3 \
doc/pcre_study.3 \
doc/pcre_version.3 \
doc/pcreapi.3 \
doc/pcrebuild.3 \
doc/pcrecallout.3 \
doc/pcrecompat.3 \
doc/pcregrep.1 \
doc/pcrematching.3 \
doc/pcrepartial.3 \
doc/pcrepattern.3 \
doc/pcreperform.3 \
doc/pcreposix.3 \
doc/pcreprecompile.3 \
doc/pcresample.3 \
doc/pcrestack.3 \
doc/pcresyntax.3 \
doc/pcretest.1
pcrecpp_man = doc/pcrecpp.3
EXTRA_DIST += $(pcrecpp_man)
if WITH_PCRE_CPP
man_MANS = $(pcrecpp_man)
endif
## CMake support
EXTRA_DIST += \
cmake/COPYING-CMAKE-SCRIPTS \
cmake/FindPackageHandleStandardArgs.cmake \
cmake/FindReadline.cmake \
CMakeLists.txt \
config-cmake.h.in
## end Makefile.am

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,168 @@
News about PCRE releases
------------------------
Release 7.9 11-Apr-09
---------------------
Mostly bugfixes and tidies with just a couple of minor functional additions.
Release 7.8 05-Sep-08
---------------------
More bug fixes, plus a performance improvement in Unicode character property
lookup.
Release 7.7 07-May-08
---------------------
This is once again mainly a bug-fix release, but there are a couple of new
features.
Release 7.6 28-Jan-08
---------------------
The main reason for having this release so soon after 7.5 is because it fixes a
potential buffer overflow problem in pcre_compile() when run in UTF-8 mode. In
addition, the CMake configuration files have been brought up to date.
Release 7.5 10-Jan-08
---------------------
This is mainly a bug-fix release. However the ability to link pcregrep with
libz or libbz2 and the ability to link pcretest with libreadline have been
added. Also the --line-offsets and --file-offsets options were added to
pcregrep.
Release 7.4 21-Sep-07
---------------------
The only change of specification is the addition of options to control whether
\R matches any Unicode line ending (the default) or just CR, LF, and CRLF.
Otherwise, the changes are bug fixes and a refactoring to reduce the number of
relocations needed in a shared library. There have also been some documentation
updates, in particular, some more information about using CMake to build PCRE
has been added to the NON-UNIX-USE file.
Release 7.3 28-Aug-07
---------------------
Most changes are bug fixes. Some that are not:
1. There is some support for Perl 5.10's experimental "backtracking control
verbs" such as (*PRUNE).
2. UTF-8 checking is now as per RFC 3629 instead of RFC 2279; this is more
restrictive in the strings it accepts.
3. Checking for potential integer overflow has been made more dynamic, and as a
consequence there is no longer a hard limit on the size of a subpattern that
has a limited repeat count.
4. When CRLF is a valid line-ending sequence, pcre_exec() and pcre_dfa_exec()
no longer advance by two characters instead of one when an unanchored match
fails at CRLF if there are explicit CR or LF matches within the pattern.
This gets rid of some anomalous effects that previously occurred.
5. Some PCRE-specific settings for varying the newline options at the start of
a pattern have been added.
Release 7.2 19-Jun-07
---------------------
WARNING: saved patterns that were compiled by earlier versions of PCRE must be
recompiled for use with 7.2 (necessitated by the addition of \K, \h, \H, \v,
and \V).
Correction to the notes for 7.1: the note about shared libraries for Windows is
wrong. Previously, three libraries were built, but each could function
independently. For example, the pcreposix library also included all the
functions from the basic pcre library. The change is that the three libraries
are no longer independent. They are like the Unix libraries. To use the
pcreposix functions, for example, you need to link with both the pcreposix and
the basic pcre library.
Some more features from Perl 5.10 have been added:
(?-n) and (?+n) relative references for recursion and subroutines.
(?(-n) and (?(+n) relative references as conditions.
\k{name} and \g{name} are synonyms for \k<name>.
\K to reset the start of the matched string; for example, (foo)\Kbar
matches bar preceded by foo, but only sets bar as the matched string.
(?| introduces a group where the capturing parentheses in each alternative
start from the same number; for example, (?|(abc)|(xyz)) sets capturing
parentheses number 1 in both cases.
\h, \H, \v, \V match horizontal and vertical whitespace, respectively.
Release 7.1 24-Apr-07
---------------------
There is only one new feature in this release: a linebreak setting of
PCRE_NEWLINE_ANYCRLF. It is a cut-down version of PCRE_NEWLINE_ANY, which
recognizes only CRLF, CR, and LF as linebreaks.
A few bugs are fixed (see ChangeLog for details), but the major change is a
complete re-implementation of the build system. This now has full Autotools
support and so is now "standard" in some sense. It should help with compiling
PCRE in a wide variety of environments.
NOTE: when building shared libraries for Windows, three dlls are now built,
called libpcre, libpcreposix, and libpcrecpp. Previously, everything was
included in a single dll.
Another important change is that the dftables auxiliary program is no longer
compiled and run at "make" time by default. Instead, a default set of character
tables (assuming ASCII coding) is used. If you want to use dftables to generate
the character tables as previously, add --enable-rebuild-chartables to the
"configure" command. You must do this if you are compiling PCRE to run on a
system that uses EBCDIC code.
There is a discussion about character tables in the README file. The default is
not to use dftables so that there is no problem when cross-compiling.
Release 7.0 19-Dec-06
---------------------
This release has a new major number because there have been some internal
upheavals to facilitate the addition of new optimizations and other facilities,
and to make subsequent maintenance and extension easier. Compilation is likely
to be a bit slower, but there should be no major effect on runtime performance.
Previously compiled patterns are NOT upwards compatible with this release. If
you have saved compiled patterns from a previous release, you will have to
re-compile them. Important changes that are visible to users are:
1. The Unicode property tables have been updated to Unicode 5.0.0, which adds
some more scripts.
2. The option PCRE_NEWLINE_ANY causes PCRE to recognize any Unicode newline
sequence as a newline.
3. The \R escape matches a single Unicode newline sequence as a single unit.
4. New features that will appear in Perl 5.10 are now in PCRE. These include
alternative Perl syntax for named parentheses, and Perl syntax for
recursion.
5. The C++ wrapper interface has been extended by the addition of a
QuoteMeta function and the ability to allow copy construction and
assignment.
For a complete list of changes, see the ChangeLog file.
Release 6.7 04-Jul-06
---------------------

View File

@@ -1,127 +1,154 @@
Compiling PCRE on non-Unix systems
----------------------------------
See below for comments on Cygwin or MinGW and OpenVMS usage. I (Philip Hazel)
have no knowledge of Windows or VMS systems and how their libraries work. The
items in the PCRE Makefile that relate to anything other than Unix-like systems
have been contributed by PCRE users. There are some other comments and files in
the Contrib directory on the ftp site that you may find useful. See
This document contains the following sections:
General
Generic instructions for the PCRE C library
The C++ wrapper functions
Building for virtual Pascal
Stack size in Windows environments
Linking programs in Windows environments
Comments about Win32 builds
Building PCRE on Windows with CMake
Use of relative paths with CMake on Windows
Testing with runtest.bat
Building under Windows with BCC5.5
Building PCRE on OpenVMS
GENERAL
I (Philip Hazel) have no experience of Windows or VMS systems and how their
libraries work. The items in the PCRE distribution and Makefile that relate to
anything other than Unix-like systems are untested by me.
There are some other comments and files (including some documentation in CHM
format) in the Contrib directory on the FTP site:
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
If you want to compile PCRE for a non-Unix system (or perhaps, more strictly,
for a system that does not support "configure" and "make" files), note that
the basic PCRE library consists entirely of code written in Standard C, and so
should compile successfully on any system that has a Standard C compiler and
library. The C++ wrapper functions are a separate issue (see below).
If you want to compile PCRE for a non-Unix system (especially for a system that
does not support "configure" and "make" files), note that the basic PCRE
library consists entirely of code written in Standard C, and so should compile
successfully on any system that has a Standard C compiler and library. The C++
wrapper functions are a separate issue (see below).
The PCRE distribution includes a "configure" file for use by the Configure/Make
build system, as found in many Unix-like environments. There is also support
for CMake, which some users prefer, in particular in Windows
environments. There are some instructions for CMake under Windows in the
section entitled "Building PCRE with CMake" below. CMake can also be used to
build PCRE in Unix-like systems.
GENERIC INSTRUCTIONS FOR THE C LIBRARY
GENERIC INSTRUCTIONS FOR THE PCRE C LIBRARY
The following are generic comments about building PCRE. The interspersed
indented commands are suggestions from Mark Tetrode as to which commands you
might use on a Windows system to build a static library.
The following are generic comments about building the PCRE C library "by hand".
(1) Copy or rename the file config.in as config.h, and change the macros that
define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
Unfortunately, because of the way Unix autoconf works, the default setting has
to be 0. You may also want to make changes to other macros in config.h. In
particular, if you want to force a specific value for newline, you can define
the NEWLINE macro. The default is to use '\n', thereby using whatever value
your compiler gives to '\n'.
(1) Copy or rename the file config.h.generic as config.h, and edit the macro
settings that it contains to whatever is appropriate for your environment.
In particular, if you want to force a specific value for newline, you can
define the NEWLINE macro. When you compile any of the PCRE modules, you
must specify -DHAVE_CONFIG_H to your compiler so that config.h is included
in the sources.
rem Mark Tetrode's commands
copy config.in config.h
rem Use write, because notepad cannot handle UNIX files. Change values.
write config.h
An alternative approach is not to edit config.h, but to use -D on the
compiler command line to make any changes that you need to the
configuration options. In this case -DHAVE_CONFIG_H must not be set.
(2) Compile dftables.c as a stand-alone program, and then run it with
the single argument "pcre_chartables.c". This generates a set of standard
character tables and writes them to that file.
NOTE: There have been occasions when the way in which certain parameters
in config.h are used has changed between releases. (In the configure/make
world, this is handled automatically.) When upgrading to a new release,
you are strongly advised to review config.h.generic before re-using what
you had previously.
rem Mark Tetrode's commands
rem Compile & run
cl -DSUPPORT_UTF8 -DSUPPORT_UCP dftables.c
dftables.exe pcre_chartables.c
(2) Copy or rename the file pcre.h.generic as pcre.h.
(3) Compile the following source files:
(3) EITHER:
Copy or rename file pcre_chartables.c.dist as pcre_chartables.c.
pcre_chartables.c
pcre_compile.c
pcre_config.c
pcre_dfa_exec.c
pcre_exec.c
pcre_fullinfo.c
pcre_get.c
pcre_globals.c
pcre_info.c
pcre_maketables.c
pcre_ord2utf8.c
pcre_refcount.c
pcre_study.c
pcre_tables.c
pcre_try_flipped.c
pcre_ucp_searchfuncs.c
pcre_valid_utf8.c
pcre_version.c
pcre_xclass.c
OR:
Compile dftables.c as a stand-alone program (using -DHAVE_CONFIG_H if
you have set up config.h), and then run it with the single argument
"pcre_chartables.c". This generates a set of standard character tables
and writes them to that file. The tables are generated using the default
C locale for your system. If you want to use a locale that is specified
by LC_xxx environment variables, add the -L option to the dftables
command. You must use this method if you are building on a system that
uses EBCDIC code.
and link them all together into an object library in whichever form your system
keeps such libraries. This is the pcre C library. If your system has static and
shared libraries, you may have to do this once for each type.
The tables in pcre_chartables.c are defaults. The caller of PCRE can
specify alternative tables at run time.
rem These comments are out-of-date, referring to a previous release which
rem had fewer source files. Replace with the file names from above.
rem Mark Tetrode's commands, for a static library
rem Compile & lib
cl -DSUPPORT_UTF8 -DSUPPORT_UCP -DPOSIX_MALLOC_THRESHOLD=10 /c maketables.c get.c study.c pcre.c
lib /OUT:pcre.lib maketables.obj get.obj study.obj pcre.obj
(4) Ensure that you have the following header files:
(4) Similarly, compile pcreposix.c and link it (on its own) as the pcreposix
library.
pcre_internal.h
ucp.h
rem Mark Tetrode's commands, for a static library
rem Compile & lib
cl -DSUPPORT_UTF8 -DSUPPORT_UCP -DPOSIX_MALLOC_THRESHOLD=10 /c pcreposix.c
lib /OUT:pcreposix.lib pcreposix.obj
(5) Also ensure that you have the following file, which is #included as source
when building a debugging version of PCRE, and is also used by pcretest.
(5) Compile the test program pcretest.c. This needs the functions in the
pcre and pcreposix libraries when linking.
pcre_printint.src
rem Mark Tetrode's commands
rem compile & link
cl /F0x400000 pcretest.c pcre.lib pcreposix.lib
(6) Compile the following source files, setting -DHAVE_CONFIG_H as a compiler
option if you have set up config.h with your configuration, or else use
other -D settings to change the configuration as required.
(6) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. You must use the
-i option when checking testinput2. Note that the supplied files are in Unix
format, with just LF characters as line terminators. You may need to edit them
to change this if your system uses a different convention.
pcre_chartables.c
pcre_compile.c
pcre_config.c
pcre_dfa_exec.c
pcre_exec.c
pcre_fullinfo.c
pcre_get.c
pcre_globals.c
pcre_info.c
pcre_maketables.c
pcre_newline.c
pcre_ord2utf8.c
pcre_refcount.c
pcre_study.c
pcre_tables.c
pcre_try_flipped.c
pcre_ucd.c
pcre_valid_utf8.c
pcre_version.c
pcre_xclass.c
rem Mark Tetrode's commands
pcretest testdata\testinput1 testdata\myoutput1
windiff testdata\testoutput1 testdata\myoutput1
pcretest -i testdata\testinput2 testdata\myoutput2
windiff testdata\testoutput2 testdata\myoutput2
pcretest testdata\testinput3 testdata\myoutput3
windiff testdata\testoutput3 testdata\myoutput3
pcretest testdata\testinput4 testdata\myoutput4
windiff testdata\testoutput4 testdata\myoutput4
pcretest testdata\testinput5 testdata\myoutput5
windiff testdata\testoutput5 testdata\myoutput5
pcretest testdata\testinput6 testdata\myoutput6
windiff testdata\testoutput6 testdata\myoutput6
Make sure that you include -I. in the compiler command (or equivalent for
an unusual compiler) so that all included PCRE header files are first
sought in the current directory. Otherwise you run the risk of picking up
a previously-installed file from somewhere else.
Note that there are now three more tests (7, 8, 9) that did not exist when Mark
wrote those comments. They test the new pcre_dfa_exec() function.
(7) Now link all the compiled code into an object library in whichever form
your system keeps such libraries. This is the basic PCRE C library. If
your system has static and shared libraries, you may have to do this once
for each type.
(7) If you want to use the pcregrep command, compile and link pcregrep.c; it
uses only the basic PCRE library.
(8) Similarly, compile pcreposix.c (remembering -DHAVE_CONFIG_H if necessary)
and link the result (on its own) as the pcreposix library.
(9) Compile the test program pcretest.c (again, don't forget -DHAVE_CONFIG_H).
This needs the functions in the pcre and pcreposix libraries when linking.
It also needs the pcre_printint.src source file, which it #includes.
(10) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. Note that the
supplied files are in Unix format, with just LF characters as line
terminators. You may need to edit them to change this if your system uses
a different convention. If you are using Windows, you probably should use
the wintestinput3 file instead of testinput3 (and the corresponding output
file). This is a locale test; wintestinput3 sets the locale to "french"
rather than "fr_FR", and there are some minor output differences.
(11) If you want to use the pcregrep command, compile and link pcregrep.c; it
uses only the basic PCRE library (it does not need the pcreposix library).
THE C++ WRAPPER FUNCTIONS
The PCRE distribution now contains some C++ wrapper functions and tests,
The PCRE distribution also contains some C++ wrapper functions and tests,
contributed by Google Inc. On a system that can use "configure" and "make",
the functions are automatically built into a library called pcrecpp. It should
be straightforward to compile the .cc files manually on other systems. The
@@ -129,77 +156,228 @@ files called xxx_unittest.cc are test programs for each of the corresponding
xxx.cc files.
FURTHER REMARKS
If you have a system without "configure" but where you can use a Makefile, edit
Makefile.in to create Makefile, substituting suitable values for the variables
at the head of the file.
Some help in building a Win32 DLL of PCRE in GnuWin32 environments was
contributed by Paul Sokolovsky. These environments are Mingw32
(http://www.xraylith.wisc.edu/~khan/software/gnu-win32/) and CygWin
(http://sourceware.cygnus.com/cygwin/). Paul comments:
For CygWin, set CFLAGS=-mno-cygwin, and do 'make dll'. You'll get
pcre.dll (containing pcreposix also), libpcre.dll.a, and dynamically
linked pgrep and pcretest. If you have /bin/sh, run RunTest (three
main test go ok, locale not supported).
Changes to do MinGW with autoconf 2.50 were supplied by Fred Cox
<sailorFred@yahoo.com>, who comments as follows:
If you are using the PCRE DLL, the normal Unix style configure && make &&
make check && make install should just work[*]. If you want to statically
link against the .a file, you must define PCRE_STATIC before including
pcre.h, otherwise the pcre_malloc and pcre_free exported functions will be
declared __declspec(dllimport), with hilarious results. See the configure.in
and pcretest.c for how it is done for the static test.
Also, there will only be a libpcre.la, not a libpcreposix.la, as you
would expect from the Unix version. The single DLL includes the pcreposix
interface.
[*] But note that the supplied test files are in Unix format, with just LF
characters as line terminators. You will have to edit them to change to CR LF
terminators.
BUILDING FOR VIRTUAL PASCAL
A script for building PCRE using Borland's C++ compiler for use with VPASCAL
was contributed by Alexander Tokarev. It is called makevp.bat.
was contributed by Alexander Tokarev. Stefan Weber updated the script and added
additional files. The following files in the distribution are for building PCRE
for use with VP/Borland: makevp_c.txt, makevp_l.txt, makevp.bat, pcregexp.pas.
These are some further comments about Win32 builds from Mark Evans. They
were contributed before Fred Cox's changes were made, so it is possible that
they may no longer be relevant.
"The documentation for Win32 builds is a bit shy. Under MSVC6 I
followed their instructions to the letter, but there were still
some things missing.
STACK SIZE IN WINDOWS ENVIRONMENTS
(1) Must #define STATIC for entire project if linking statically.
(I see no reason to use DLLs for code this compact.) This of
course is a project setting in MSVC under Preprocessor.
The default processor stack size of 1Mb in some Windows environments is too
small for matching patterns that need much recursion. In particular, test 2 may
fail because of this. Normally, running out of stack causes a crash, but there
have been cases where the test program has just died silently. See your linker
documentation for how to increase stack size if you experience problems. The
Linux default of 8Mb is a reasonable choice for the stack, though even that can
be too small for some pattern/subject combinations.
(2) Missing some #ifdefs relating to the function pointers
pcre_malloc and pcre_free. See my solution below. (The stubs
may not be mandatory but they made me feel better.)"
PCRE has a compile configuration option to disable the use of stack for
recursion so that heap is used instead. However, pattern matching is
significantly slower when this is done. There is more about stack usage in the
"pcrestack" documentation.
=========================
#ifdef _WIN32
#include <malloc.h>
void* malloc_stub(size_t N)
{ return malloc(N); }
void free_stub(void* p)
{ free(p); }
void *(*pcre_malloc)(size_t) = &malloc_stub;
void (*pcre_free)(void *) = &free_stub;
LINKING PROGRAMS IN WINDOWS ENVIRONMENTS
#else
If you want to statically link a program against a PCRE library in the form of
a non-dll .a file, you must define PCRE_STATIC before including pcre.h,
otherwise the pcre_malloc() and pcre_free() exported functions will be declared
__declspec(dllimport), with unwanted results.
void *(*pcre_malloc)(size_t) = malloc;
void (*pcre_free)(void *) = free;
#endif
=========================
CALLING CONVENTIONS IN WINDOWS ENVIRONMENTS
It is possible to compile programs to use different calling conventions using
MSVC. Search the web for "calling conventions" for more information. To make it
easier to change the calling convention for the exported functions in the
PCRE library, the macro PCRE_CALL_CONVENTION is present in all the external
definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is
not set, it defaults to empty; the default calling convention is then used
(which is what is wanted most of the time).
COMMENTS ABOUT WIN32 BUILDS (see also "BUILDING PCRE WITH CMAKE" below)
There are two ways of building PCRE using the "configure, make, make install"
paradigm on Windows systems: using MinGW or using Cygwin. These are not at all
the same thing; they are completely different from each other. There is also
support for building using CMake, which some users find a more straightforward
way of building PCRE under Windows. However, the tests are not run
automatically when CMake is used.
The MinGW home page (http://www.mingw.org/) says this:
MinGW: A collection of freely available and freely distributable Windows
specific header files and import libraries combined with GNU toolsets that
allow one to produce native Windows programs that do not rely on any
3rd-party C runtime DLLs.
The Cygwin home page (http://www.cygwin.com/) says this:
Cygwin is a Linux-like environment for Windows. It consists of two parts:
. A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing
substantial Linux API functionality
. A collection of tools which provide Linux look and feel.
The Cygwin DLL currently works with all recent, commercially released x86 32
bit and 64 bit versions of Windows, with the exception of Windows CE.
On both MinGW and Cygwin, PCRE should build correctly using:
./configure && make && make install
This should create two libraries called libpcre and libpcreposix, and, if you
have enabled building the C++ wrapper, a third one called libpcrecpp. These are
independent libraries: when you link with libpcreposix or libpcrecpp you must
also link with libpcre, which contains the basic functions. (Some earlier
releases of PCRE included the basic libpcre functions in libpcreposix. This no
longer happens.)
A user submitted a special-purpose patch that makes it easy to create
"pcre.dll" under mingw32 using the "msys" environment. It provides "pcre.dll"
as a special target. If you use this target, no other files are built, and in
particular, the pcretest and pcregrep programs are not built. An example of how
this might be used is:
./configure --enable-utf8 --disable-cpp CFLAGS="-O3 -s"; make pcre.dll
Using Cygwin's compiler generates libraries and executables that depend on
cygwin1.dll. If a library that is generated this way is distributed,
cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL
licence, this forces not only PCRE to be under the GPL, but also the entire
application. A distributor who wants to keep their own code proprietary must
purchase an appropriate Cygwin licence.
MinGW has no such restrictions. The MinGW compiler generates a library or
executable that can run standalone on Windows without any third party dll or
licensing issues.
But there is more complication:
If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is
to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a
front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's
gcc and MinGW's gcc). So, a user can:
. Build native binaries by using MinGW or by getting Cygwin and using
-mno-cygwin.
. Build binaries that depend on cygwin1.dll by using Cygwin with the normal
compiler flags.
The test files that are supplied with PCRE are in Unix format, with LF
characters as line terminators. It may be necessary to change the line
terminators in order to get some of the tests to work. We hope to improve
things in this area in future.
BUILDING PCRE ON WINDOWS WITH CMAKE
CMake is an alternative build facility that can be used instead of the
traditional Unix "configure". CMake version 2.4.7 supports Borland makefiles,
MinGW makefiles, MSYS makefiles, NMake makefiles, UNIX makefiles, Visual Studio
6, Visual Studio 7, Visual Studio 8, and Watcom W8. The following instructions
were contributed by a PCRE user.
1. Download CMake 2.4.7 or above from http://www.cmake.org/, install and ensure
that cmake\bin is on your path.
2. Unzip (retaining folder structure) the PCRE source tree into a source
directory such as C:\pcre.
3. Create a new, empty build directory: C:\pcre\build\
4. Run CMakeSetup from the Shell environment of your build tool, e.g., Msys
for Msys/MinGW or Visual Studio Command Prompt for VC/VC++
5. Enter C:\pcre\pcre-xx and C:\pcre\build for the source and build
directories, respectively
6. Hit the "Configure" button.
7. Select the particular IDE / build tool that you are using (Visual Studio,
MSYS makefiles, MinGW makefiles, etc.)
8. The GUI will then list several configuration options. This is where you can
enable UTF-8 support, etc.
9. Hit "Configure" again. The adjacent "OK" button should now be active.
10. Hit "OK".
11. The build directory should now contain a usable build system, be it a
solution file for Visual Studio, makefiles for MinGW, etc.
USE OF RELATIVE PATHS WITH CMAKE ON WINDOWS
A PCRE user comments as follows:
I thought that others may want to know the current state of
CMAKE_USE_RELATIVE_PATHS support on Windows.
Here it is:
-- AdditionalIncludeDirectories is only partially modified (only the
first path - see below)
-- Only some of the contained file paths are modified - shown below for
pcre.vcproj
-- It properly modifies
I am sure CMake people can fix that if they want to. Until then one will
need to replace existing absolute paths in project files with relative
paths manually (e.g. from VS) - relative to project file location. I did
just that before being told to try CMAKE_USE_RELATIVE_PATHS. Not a big
deal.
AdditionalIncludeDirectories="E:\builds\pcre\build;E:\builds\pcre\pcre-7.5;"
AdditionalIncludeDirectories=".;E:\builds\pcre\pcre-7.5;"
RelativePath="pcre.h">
RelativePath="pcre_chartables.c">
RelativePath="pcre_chartables.c.rule">
TESTING WITH RUNTEST.BAT
1. Copy RunTest.bat into the directory where pcretest.exe has been created.
2. Edit RunTest.bat and insert a line that identifies the relative location of
the pcre source, e.g.:
set srcdir=..\pcre-7.4-RC3
3. Run RunTest.bat from a command shell environment. Test outputs will
automatically be compared to expected results, and discrepancies will be
identified in the console output.
4. To test pcrecpp, run pcrecpp_unittest.exe, pcre_stringpiece_unittest.exe and
pcre_scanner_unittest.exe.
BUILDING UNDER WINDOWS WITH BCC5.5
Michael Roy sent these comments about building PCRE under Windows with BCC5.5:
Some of the core BCC libraries have a version of PCRE from 1998 built in,
which can lead to pcre_exec() giving an erroneous PCRE_ERROR_NULL from a
version mismatch. I'm including an easy workaround below, if you'd like to
include it in the non-unix instructions:
When linking a project with BCC5.5, pcre.lib must be included before any of
the libraries cw32.lib, cw32i.lib, cw32mt.lib, and cw32mti.lib on the command
line.
BUILDING UNDER WINDOWS CE WITH VISUAL STUDIO 200x
Vincent Richomme sent a zip archive of files to help with this process. They
can be found in the file "pcre-vsbuild.zip" in the Contrib directory of the FTP
site.
BUILDING PCRE ON OPENVMS
@@ -266,4 +444,5 @@ $! Locale could not be set to fr
$!
=========================
Last Updated: 17 March 2009
****

214
libs/pcre/PrepareRelease Executable file
View File

@@ -0,0 +1,214 @@
#! /bin/sh
# Script to prepare the files for building a PCRE release. It does some
# processing of the documentation, detrails files, and creates pcre.h.generic
# and config.h.generic (for use by builders who can't run ./configure).
# You must run this script before running "make dist". It makes use of the
# following files:
# 132html     A Perl script that converts a .1 or .3 man page into HTML. It
#             is called from this script. It "knows" the relevant troff
#             constructs that are used in the PCRE man pages.
# CleanTxt    A Perl script that cleans up the output of "nroff -man" by
#             removing backspaces and other redundant text so as to produce
#             a readable .txt file.
# Detrail     A Perl script that removes trailing spaces from files.
# doc/index.html.src
#             A file that is copied as index.html into the doc/html directory
#             when the HTML documentation is built. It works like this so that
#             doc/html can be deleted and re-created from scratch.
# First, sort out the documentation. Give up at once if the doc directory is
# missing: the commands that follow delete and overwrite files relative to the
# current directory, so they must not run anywhere else.
cd doc || exit 1
echo Processing documentation
# Build the plain-text documentation. Each section 3 man page is rendered with
# nroff, tidied by CleanTxt for online reading, and appended to pcre.txt. The
# pages for individual functions are deliberately left out of the list.
cat >pcre.txt <<End
-----------------------------------------------------------------------------
This file contains a concatenation of the PCRE man pages, converted to plain
text format for ease of searching with a text editor, or for use on systems
that do not have a man page processor. The small individual files that give
synopses of each function in the library have not been included. There are
separate text files for the pcregrep and pcretest commands.
-----------------------------------------------------------------------------
End
echo "Making pcre.txt"
for page in \
    pcre pcrebuild pcrematching pcreapi pcrecallout pcrecompat \
    pcrepattern pcresyntax pcrepartial pcreprecompile \
    pcreperform pcreposix pcrecpp pcresample pcrestack
do
  raw=$page.rawtxt
  echo " Processing $page.3"
  nroff -c -man $page.3 >$raw
  # Everything inside the braces is appended to pcre.txt: the cleaned page, a
  # separator rule, and two spacer lines for every page except pcresample.
  # NOTE(review): pcresample is not the last page in the list (pcrestack is),
  # so the exemption may be out of date - confirm before changing it.
  {
    ../CleanTxt <$raw
    echo "------------------------------------------------------------------------------"
    case $page in
      pcresample)
        ;;
      *)
        echo " "
        echo " "
        ;;
    esac
  } >>pcre.txt
  /bin/rm $raw
done
# The three commands each get a text file of their own.
for cmd in pcretest pcregrep pcre-config
do
  raw=$cmd.rawtxt
  echo Making $cmd.txt
  nroff -c -man $cmd.1 >$raw
  ../CleanTxt <$raw >$cmd.txt
  /bin/rm $raw
done
# Make HTML form of the documentation. The html directory is emptied and then
# rebuilt from scratch, starting with the index page.
echo "Making HTML documentation"
/bin/rm html/*
cp index.html.src html/index.html
# The command (section 1) pages always get a table of contents. Stop at the
# first conversion failure, as the section 3 loop below does, so that a broken
# page cannot end up in a release unnoticed.
for file in *.1 ; do
  base=`basename $file .1`
  echo " Making $base.html"
  ../132html -toc $base <$file >html/$base.html
  if [ $? != 0 ] ; then exit 1; fi
done
# Exclude table of contents for function summaries. It seems that expr
# forces an anchored regex. Also exclude them for small pages that have
# only one section.
for file in *.3 ; do
  base=`basename $file .3`
  toc=-toc
  # expr prints the length of the match, which is non-zero for any page name
  # that contains an underscore.
  if [ `expr $base : '.*_'` -ne 0 ] ; then toc="" ; fi
  if [ "$base" = "pcresample" ] || \
     [ "$base" = "pcrestack" ] || \
     [ "$base" = "pcrecompat" ] || \
     [ "$base" = "pcreperform" ] ; then
    toc=""
  fi
  echo " Making $base.html"
  # $toc is left unquoted on purpose: when it is empty it must vanish from the
  # command line instead of being passed as an empty argument.
  ../132html $toc $base <$file >html/$base.html
  if [ $? != 0 ] ; then exit 1; fi
done
# End of documentation processing
cd ..
echo Documentation done
# These files are detrailed; do not detrail the test data because there may be
# significant trailing spaces. The configure files are also omitted from the
# detrailing.
# Every entry ends with a backslash so that the list is a single line of
# space-separated names; $files is expanded unquoted below so that the shell
# splits it into one argument per file.
files="\
  Makefile.am \
  Makefile.in \
  configure.ac \
  README \
  LICENCE \
  COPYING \
  AUTHORS \
  NEWS \
  NON-UNIX-USE \
  INSTALL \
  132html \
  CleanTxt \
  Detrail \
  ChangeLog \
  CMakeLists.txt \
  RunGrepTest \
  RunTest \
  RunTest.bat \
  pcre-config.in \
  libpcre.pc.in \
  libpcrecpp.pc.in \
  config.h.in \
  pcre_printint.src \
  pcre_chartables.c.dist \
  pcredemo.c \
  pcregrep.c \
  pcretest.c \
  dftables.c \
  pcreposix.c \
  pcreposix.h \
  pcre.h.in \
  pcre_internal.h \
  pcre_compile.c \
  pcre_config.c \
  pcre_dfa_exec.c \
  pcre_exec.c \
  pcre_fullinfo.c \
  pcre_get.c \
  pcre_globals.c \
  pcre_info.c \
  pcre_maketables.c \
  pcre_newline.c \
  pcre_ord2utf8.c \
  pcre_refcount.c \
  pcre_study.c \
  pcre_tables.c \
  pcre_try_flipped.c \
  pcre_ucp_searchfuncs.c \
  pcre_valid_utf8.c \
  pcre_version.c \
  pcre_xclass.c \
  pcre_scanner.cc \
  pcre_scanner.h \
  pcre_scanner_unittest.cc \
  pcrecpp.cc \
  pcrecpp.h \
  pcrecpparg.h.in \
  pcrecpp_unittest.cc \
  pcre_stringpiece.cc \
  pcre_stringpiece.h.in \
  pcre_stringpiece_unittest.cc \
  perltest.pl \
  ucp.h \
  ucpinternal.h \
  ucptable.h \
  makevp.bat \
  pcre.def \
  libpcre.def \
  libpcreposix.def"
echo Detrailing
./Detrail $files doc/p* doc/html/*
echo Doing basic configure to get default pcre.h and config.h
# This is in case the caller has set aliases (as I do - PH)
# NOTE(review): POSIX "unset" removes variables and shell functions, not
# aliases - confirm that this does what is intended.
unset cp ls mv rm
# Stop if configure fails; otherwise a missing or stale pcre.h and config.h
# would be turned into the generic files.
./configure >/dev/null || exit 1
echo Converting pcre.h and config.h to generic forms
cp -f pcre.h pcre.h.generic
# config.h.generic is a copy of config.h in which every #define, except those
# whose name starts with PACKAGE, is wrapped in #ifndef/#endif so that a
# builder can override the value from the compiler command line. The
# here-document delimiter is quoted so that the shell leaves the Perl
# variables alone.
perl <<'END' || exit 1
  open(my $in, '<', 'config.h') || die "Can't open config.h: $!\n";
  open(my $out, '>', 'config.h.generic') || die "Can't open config.h.generic: $!\n";
  while (<$in>)
    {
    if (/^#define\s(?!PACKAGE)(\w+)/)
      {
      print $out "#ifndef $1\n";
      print $out $_;
      print $out "#endif\n";
      }
    else
      {
      print $out $_;
      }
    }
  close $in;
  # Buffered write errors only show up when the handle is closed.
  close($out) || die "Can't close config.h.generic: $!\n";
END
echo Done
#End

View File

@@ -1,55 +1,93 @@
README file for PCRE (Perl-compatible regular expression library)
-----------------------------------------------------------------
The latest release of PCRE is always available from
The latest release of PCRE is always available in three alternative formats
from:
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.gz
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.bz2
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.zip
There is a mailing list for discussion about the development of PCRE at
pcre-dev@exim.org
Please read the NEWS file if you are upgrading from a previous release.
The contents of this README file are:
The PCRE APIs
Documentation for PCRE
Contributions by users of PCRE
Building PCRE on non-Unix systems
Building PCRE on Unix-like systems
Retrieving configuration information on Unix-like systems
Shared libraries on Unix-like systems
Cross-compiling on Unix-like systems
Using HP's ANSI C++ compiler (aCC)
Making new tarballs
Testing PCRE
Character tables
File manifest
The PCRE APIs
-------------
PCRE is written in C, and it has its own API. The distribution now includes a
set of C++ wrapper functions, courtesy of Google Inc. (see the pcrecpp man page
for details).
PCRE is written in C, and it has its own API. The distribution also includes a
set of C++ wrapper functions (see the pcrecpp man page for details), courtesy
of Google Inc.
Also included are a set of C wrapper functions that are based on the POSIX
API. These end up in the library called libpcreposix. Note that this just
provides a POSIX calling interface to PCRE: the regular expressions themselves
still follow Perl syntax and semantics. The header file for the POSIX-style
functions is called pcreposix.h. The official POSIX name is regex.h, but I
didn't want to risk possible problems with existing files of that name by
distributing it that way. To use it with an existing program that uses the
POSIX API, it will have to be renamed or pointed at by a link.
In addition, there is a set of C wrapper functions that are based on the POSIX
regular expression API (see the pcreposix man page). These end up in the
library called libpcreposix. Note that this just provides a POSIX calling
interface to PCRE; the regular expressions themselves still follow Perl syntax
and semantics. The POSIX API is restricted, and does not give full access to
all of PCRE's facilities.
The header file for the POSIX-style functions is called pcreposix.h. The
official POSIX name is regex.h, but I did not want to risk possible problems
with existing files of that name by distributing it that way. To use PCRE with
an existing program that uses the POSIX API, pcreposix.h will have to be
renamed or pointed at by a link.
If you are using the POSIX interface to PCRE and there is already a POSIX regex
library installed on your system, you must take care when linking programs to
library installed on your system, as well as worrying about the regex.h header
file (as mentioned above), you must also take care when linking programs to
ensure that they link with PCRE's libpcreposix library. Otherwise they may pick
up the "real" POSIX functions of the same name.
up the POSIX functions of the same name from the other library.
One way of avoiding this confusion is to compile PCRE with the addition of
-Dregcomp=PCREregcomp (and similarly for the other POSIX functions) to the
compiler flags (CFLAGS if you are using "configure" -- see below). This has the
effect of renaming the functions so that the names no longer clash. Of course,
you have to do the same thing for your applications, or write them using the
new names.
Documentation for PCRE
----------------------
If you install PCRE in the normal way, you will end up with an installed set of
man pages whose names all start with "pcre". The one that is just called "pcre"
lists all the others. In addition to these man pages, the PCRE documentation is
supplied in two other forms; however, as there is no standard place to install
them, they are left in the doc directory of the unpacked source distribution.
These forms are:
If you install PCRE in the normal way on a Unix-like system, you will end up
with a set of man pages whose names all start with "pcre". The one that is just
called "pcre" lists all the others. In addition to these man pages, the PCRE
documentation is supplied in two other forms:
1. Files called doc/pcre.txt, doc/pcregrep.txt, and doc/pcretest.txt. The
first of these is a concatenation of the text forms of all the section 3
man pages except those that summarize individual functions. The other two
are the text forms of the section 1 man pages for the pcregrep and
pcretest commands. Text forms are provided for ease of scanning with text
editors or similar tools.
1. There are files called doc/pcre.txt, doc/pcregrep.txt, and
doc/pcretest.txt in the source distribution. The first of these is a
concatenation of the text forms of all the section 3 man pages except
those that summarize individual functions. The other two are the text
forms of the section 1 man pages for the pcregrep and pcretest commands.
These text forms are provided for ease of scanning with text editors or
similar tools. They are installed in <prefix>/share/doc/pcre, where
<prefix> is the installation prefix (defaulting to /usr/local).
2. A subdirectory called doc/html contains all the documentation in HTML
form, hyperlinked in various ways, and rooted in a file called
doc/index.html.
2. A set of files containing all the documentation in HTML form, hyperlinked
in various ways, and rooted in a file called index.html, is distributed in
doc/html and installed in <prefix>/share/doc/pcre/html.
Users of PCRE have contributed files containing the documentation for various
releases in CHM format. These can be found in the Contrib directory of the FTP
site (see next section).
Contributions by users of PCRE
@@ -59,27 +97,48 @@ You can find contributions from PCRE users in the directory
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
where there is also a README file giving brief descriptions of what they are.
Several of them provide support for compiling PCRE on various flavours of
Windows systems (I myself do not use Windows). Some are complete in themselves;
others are pointers to URLs containing relevant files.
There is a README file giving brief descriptions of what they are. Some are
complete in themselves; others are pointers to URLs containing relevant files.
Some of this material is likely to be well out-of-date. Several of the earlier
contributions provided support for compiling PCRE on various flavours of
Windows (I myself do not use Windows). Nowadays there is more Windows support
in the standard distribution, so these contributions have been archived.
Building PCRE on a Unix-like system
-----------------------------------
Building PCRE on non-Unix systems
---------------------------------
For a non-Unix system, please read the comments in the file NON-UNIX-USE,
though if your system supports the use of "configure" and "make" you may be
able to build PCRE in the same way as for Unix-like systems. PCRE can also be
configured in many platform environments using the GUI facility of CMake's
CMakeSetup. It creates Makefiles, solution files, etc.
PCRE has been compiled on many different operating systems. It should be
straightforward to build PCRE on any system that has a Standard C compiler and
library, because it uses only Standard C functions.
Building PCRE on Unix-like systems
----------------------------------
If you are using HP's ANSI C++ compiler (aCC), please see the special note
in the section entitled "Using HP's ANSI C++ compiler (aCC)" below.
The following instructions assume the use of the widely used "configure, make,
make install" process. There is also support for CMake in the PCRE
distribution; there are some comments about using CMake in the NON-UNIX-USE
file, though it can also be used in Unix-like systems.
To build PCRE on a Unix-like system, first run the "configure" command from the
PCRE distribution directory, with your current directory set to the directory
where you want the files to be created. This command is a standard GNU
"autoconf" configuration script, for which generic instructions are supplied in
INSTALL.
the file INSTALL.
Most commonly, people build PCRE within its own distribution directory, and in
this case, on many systems, just running "./configure" is sufficient, but the
usual methods of changing standard defaults are available. For example:
this case, on many systems, just running "./configure" is sufficient. However,
the usual methods of changing standard defaults are available. For example:
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
@@ -103,13 +162,16 @@ library. You can read more about them in the pcrebuild man page.
. If you want to suppress the building of the C++ wrapper library, you can add
--disable-cpp to the "configure" command. Otherwise, when "configure" is run,
will try to find a C++ compiler and C++ header files, and if it succeeds, it
will try to build the C++ wrapper.
it will try to find a C++ compiler and C++ header files, and if it succeeds,
it will try to build the C++ wrapper.
. If you want to make use of the support for UTF-8 character strings in PCRE,
you must add --enable-utf8 to the "configure" command. Without it, the code
for handling UTF-8 is not included in the library. (Even when included, it
still has to be enabled by an option at run time.)
. If you want to make use of the support for UTF-8 Unicode character strings in
PCRE, you must add --enable-utf8 to the "configure" command. Without it, the
code for handling UTF-8 is not included in the library. Even when included,
it still has to be enabled by an option at run time. When PCRE is compiled
with this option, its input can only be either ASCII or UTF-8, even when
running on EBCDIC platforms. It is not possible to use both --enable-utf8 and
--enable-ebcdic at the same time.
. If, in addition to support for UTF-8 character strings, you want to include
support for the \P, \p, and \X sequences that recognize Unicode character
@@ -118,17 +180,31 @@ library. You can read more about them in the pcrebuild man page.
property table); only the basic two-letter properties such as Lu are
supported.
. You can build PCRE to recognize either CR or LF or the sequence CRLF as
indicating the end of a line. Whatever you specify at build time is the
default; the caller of PCRE can change the selection at run time. The default
newline indicator is a single LF character (the Unix standard). You can
specify the default newline indicator by adding --newline-is-cr or
--newline-is-lf or --newline-is-crlf to the "configure" command,
respectively.
. You can build PCRE to recognize either CR or LF or the sequence CRLF or any
of the preceding, or any of the Unicode newline sequences as indicating the
end of a line. Whatever you specify at build time is the default; the caller
of PCRE can change the selection at run time. The default newline indicator
is a single LF character (the Unix standard). You can specify the default
newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
--enable-newline-is-any to the "configure" command, respectively.
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
the standard tests will fail, because the lines in the test files end with
LF. Even if the files are edited to change the line endings, there are likely
to be some failures. With --enable-newline-is-anycrlf or
--enable-newline-is-any, many tests should succeed, but there may be some
failures.
. By default, the sequence \R in a pattern matches any Unicode line ending
sequence. This is independent of the option specifying what PCRE considers to
be the end of a line (see above). However, the caller of PCRE can restrict \R
to match only CR, LF, or CRLF. You can make this the default by adding
--enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
. When called via the POSIX interface, PCRE uses malloc() to get additional
storage for processing capturing parentheses if there are more than 10 of
them. You can increase this threshold by setting, for example,
them in a pattern. You can increase this threshold by setting, for example,
--with-posix-malloc-threshold=20
@@ -141,8 +217,8 @@ library. You can read more about them in the pcrebuild man page.
--with-match-limit=500000
on the "configure" command. This is just the default; individual calls to
pcre_exec() can supply their own value. There is discussion on the pcreapi
man page.
pcre_exec() can supply their own value. There is more discussion on the
pcreapi man page.
. There is a separate counter that limits the depth of recursive function calls
during a matching process. This also has a default of ten million, which is
@@ -157,37 +233,92 @@ library. You can read more about them in the pcrebuild man page.
. The default maximum compiled pattern size is around 64K. You can increase
this by adding --with-link-size=3 to the "configure" command. You can
increase it even more by setting --with-link-size=4, but this is unlikely
ever to be necessary. If you build PCRE with an increased link size, test 2
(and 5 if you are using UTF-8) will fail. Part of the output of these tests
is a representation of the compiled pattern, and this changes with the link
size.
ever to be necessary. Increasing the internal link size will reduce
performance.
. You can build PCRE so that its internal match() function that is called from
pcre_exec() does not call itself recursively. Instead, it uses blocks of data
from the heap via special functions pcre_stack_malloc() and pcre_stack_free()
to save data that would otherwise be saved on the stack. To build PCRE like
this, use
pcre_exec() does not call itself recursively. Instead, it uses memory blocks
obtained from the heap via the special functions pcre_stack_malloc() and
pcre_stack_free() to save data that would otherwise be saved on the stack. To
build PCRE like this, use
--disable-stack-for-recursion
on the "configure" command. PCRE runs more slowly in this mode, but it may be
necessary in environments with limited stack sizes. This applies only to the
pcre_exec() function; it does not apply to pcre_dfa_exec(), which does not
use deeply nested recursion.
use deeply nested recursion. There is a discussion about stack sizes in the
pcrestack man page.
The "configure" script builds eight files for the basic C library:
. For speed, PCRE uses four tables for manipulating and identifying characters
whose code point values are less than 256. By default, it uses a set of
tables for ASCII encoding that is part of the distribution. If you specify
--enable-rebuild-chartables
a program called dftables is compiled and run in the default C locale when
you obey "make". It builds a source file called pcre_chartables.c. If you do
not specify this option, pcre_chartables.c is created as a copy of
pcre_chartables.c.dist. See "Character tables" below for further information.
. It is possible to compile PCRE for use on systems that use EBCDIC as their
character code (as opposed to ASCII) by specifying
--enable-ebcdic
This automatically implies --enable-rebuild-chartables (see above). However,
when PCRE is built this way, it always operates in EBCDIC. It cannot support
both EBCDIC and UTF-8.
. It is possible to compile pcregrep to use libz and/or libbz2, in order to
read .gz and .bz2 files (respectively), by specifying one or both of
--enable-pcregrep-libz
--enable-pcregrep-libbz2
Of course, the relevant libraries must be installed on your system.
. It is possible to compile pcretest so that it links with the libreadline
library, by specifying
--enable-pcretest-libreadline
If this is done, when pcretest's input is from a terminal, it reads it using
the readline() function. This provides line-editing and history facilities.
Note that libreadline is GPL-licenced, so if you distribute a binary of
pcretest linked in this way, there may be licensing issues.
Setting this option causes the -lreadline option to be added to the pcretest
build. In many operating environments with a system-installed readline
library this is sufficient. However, in some environments (e.g. if an
unmodified distribution version of readline is in use), it may be necessary
to specify something like LIBS="-lncurses" as well. This is because, to quote
the readline INSTALL, "Readline uses the termcap functions, but does not link
with the termcap or curses library itself, allowing applications which link
with readline the to choose an appropriate library." If you get error
messages about missing functions tgetstr, tgetent, tputs, tgetflag, or tgoto,
this is the problem, and linking with the ncurses library should fix it.
The "configure" script builds the following files for the basic C library:
. Makefile is the makefile that builds the library
. config.h contains build-time configuration options for the library
. pcre.h is the public PCRE header file
. pcre-config is a script that shows the settings of "configure" options
. libpcre.pc is data for the pkg-config command
. libtool is a script that builds shared and/or static libraries
. RunTest is a script for running tests on the library
. RunTest is a script for running tests on the basic C library
. RunGrepTest is a script for running tests on the pcregrep command
In addition, if a C++ compiler is found, the following are also built:
Versions of config.h and pcre.h are distributed in the PCRE tarballs under
the names config.h.generic and pcre.h.generic. These are provided for the
benefit of those who have to build PCRE without the benefit of "configure". If
you use "configure", the .generic versions are not used.
. pcrecpp.h is the header file for programs that call PCRE via the C++ wrapper
If a C++ compiler is found, the following files are also built:
. libpcrecpp.pc is data for the pkg-config command
. pcrecpparg.h is a header file for programs that call PCRE via the C++ wrapper
. pcre_stringpiece.h is the header for the C++ "stringpiece" functions
The "configure" script also creates config.status, which is an executable
@@ -196,17 +327,61 @@ contains compiler output from tests that "configure" runs.
Once "configure" has run, you can run "make". It builds two libraries, called
libpcre and libpcreposix, a test program called pcretest, and the pcregrep
command. If a C++ compiler was found on your system, it also builds the C++
command. If a C++ compiler was found on your system, "make" also builds the C++
wrapper library, which is called libpcrecpp, and some test programs called
pcrecpp_unittest, pcre_scanner_unittest, and pcre_stringpiece_unittest.
Building the C++ wrapper can be disabled by adding --disable-cpp to the
"configure" command.
The command "make test" runs all the appropriate tests. Details of the PCRE
tests are given in a separate section of this document, below.
The command "make check" runs all the appropriate tests. Details of the PCRE
tests are given below in a separate section of this document.
You can use "make install" to copy the libraries, the public header files
pcre.h, pcreposix.h, pcrecpp.h, and pcre_stringpiece.h (the last two only if
the C++ wrapper was built), and the man pages to appropriate live directories
on your system, in the normal way.
You can use "make install" to install PCRE into live directories on your
system. The following are installed (file names are all relative to the
<prefix> that is set when "configure" is run):
Commands (bin):
pcretest
pcregrep
pcre-config
Libraries (lib):
libpcre
libpcreposix
libpcrecpp (if C++ support is enabled)
Configuration information (lib/pkgconfig):
libpcre.pc
libpcrecpp.pc (if C++ support is enabled)
Header files (include):
pcre.h
pcreposix.h
pcre_scanner.h )
pcre_stringpiece.h ) if C++ support is enabled
pcrecpp.h )
pcrecpparg.h )
Man pages (share/man/man{1,3}):
pcregrep.1
pcretest.1
pcre.3
pcre*.3 (lots more pages, all starting "pcre")
HTML documentation (share/doc/pcre/html):
index.html
*.html (lots more pages, hyperlinked from index.html)
Text file documentation (share/doc/pcre):
AUTHORS
COPYING
ChangeLog
LICENCE
NEWS
README
pcre.txt (a concatenation of the man(3) pages)
pcretest.txt the pcretest man page
pcregrep.txt the pcregrep man page
If you want to remove PCRE from your system, you can run "make uninstall".
This removes all the files that "make install" installed. However, it does not
@@ -216,9 +391,8 @@ remove any directories, because these are often shared with other programs.
Retrieving configuration information on Unix-like systems
---------------------------------------------------------
Running "make install" also installs the command pcre-config, which can be used
to recall information about the PCRE configuration and installation. For
example:
Running "make install" installs the command pcre-config, which can be used to
recall information about the PCRE configuration and installation. For example:
pcre-config --version
@@ -237,7 +411,7 @@ single command is used. For example:
pkg-config --cflags pcre
The data is held in *.pc files that are installed in a directory called
pkgconfig.
<prefix>/lib/pkgconfig.
Shared libraries on Unix-like systems
@@ -254,7 +428,7 @@ built. The programs pcretest and pcregrep are built to use these uninstalled
libraries (by means of wrapper scripts in the case of shared libraries). When
you use "make install" to install shared libraries, pcregrep and pcretest are
automatically re-built to use the newly installed shared libraries before being
installed themselves. However, the versions left in the source directory still
installed themselves. However, the versions left in the build directory still
use the uninstalled libraries.
To build PCRE using static libraries only you must use --disable-shared when
@@ -266,25 +440,33 @@ Then run "make" in the usual way. Similarly, you can use --disable-static to
build only shared libraries.
Cross-compiling on a Unix-like system
-------------------------------------
Cross-compiling on Unix-like systems
------------------------------------
You can specify CC and CFLAGS in the normal way to the "configure" command, in
order to cross-compile PCRE for some other host. However, during the building
process, the dftables.c source file is compiled *and run* on the local host, in
order to generate the default character tables (the chartables.c file). It
therefore needs to be compiled with the local compiler, not the cross compiler.
You can do this by specifying CC_FOR_BUILD (and if necessary CFLAGS_FOR_BUILD;
there are also CXX_FOR_BUILD and CXXFLAGS_FOR_BUILD for the C++ wrapper)
when calling the "configure" command. If they are not specified, they default
to the values of CC and CFLAGS.
order to cross-compile PCRE for some other host. However, you should NOT
specify --enable-rebuild-chartables, because if you do, the dftables.c source
file is compiled and run on the local host, in order to generate the inbuilt
character tables (the pcre_chartables.c file). This will probably not work,
because dftables.c needs to be compiled with the local compiler, not the cross
compiler.
When --enable-rebuild-chartables is not specified, pcre_chartables.c is created
by making a copy of pcre_chartables.c.dist, which is a default set of tables
that assumes ASCII code. Cross-compiling with the default tables should not be
a problem.
If you need to modify the character tables when cross-compiling, you should
move pcre_chartables.c.dist out of the way, then compile dftables.c by hand and
run it on the local host to make a new version of pcre_chartables.c.dist.
Then when you cross-compile PCRE this new version of the tables will be used.
Using HP's ANSI C++ compiler (aCC)
----------------------------------
Unless C++ support is disabled by specifiying the "--disable-cpp" option of the
"configure" script, you *must* include the "-AA" option in the CXXFLAGS
Unless C++ support is disabled by specifying the "--disable-cpp" option of the
"configure" script, you must include the "-AA" option in the CXXFLAGS
environment variable in order for the C++ components to compile correctly.
Also, note that the aCC compiler on PA-RISC platforms may have a defect whereby
@@ -296,49 +478,48 @@ running the "configure" script:
CXXLDFLAGS="-lstd_v2 -lCsup_v2"
Building on non-Unix systems
----------------------------
Making new tarballs
-------------------
For a non-Unix system, read the comments in the file NON-UNIX-USE, though if
the system supports the use of "configure" and "make" you may be able to build
PCRE in the same way as for Unix systems.
The command "make dist" creates three PCRE tarballs, in tar.gz, tar.bz2, and
zip formats. The command "make distcheck" does the same, but then does a trial
build of the new distribution to ensure that it works.
PCRE has been compiled on Windows systems and on Macintoshes, but I don't know
the details because I don't use those systems. It should be straightforward to
build PCRE on any system that has a Standard C compiler, because it uses only
Standard C functions.
If you have modified any of the man page sources in the doc directory, you
should first run the PrepareRelease script before making a distribution. This
script creates the .txt and HTML forms of the documentation from the man pages.
Testing PCRE
------------
To test PCRE on a Unix system, run the RunTest script that is created by the
configuring process. There is also a script called RunGrepTest that tests the
options of the pcregrep command. If the C++ wrapper library is build, three
test programs called pcrecpp_unittest, pcre_scanner_unittest, and
pcre_stringpiece_unittest are provided.
To test the basic PCRE library on a Unix system, run the RunTest script that is
created by the configuring process. There is also a script called RunGrepTest
that tests the options of the pcregrep command. If the C++ wrapper library is
built, three test programs called pcrecpp_unittest, pcre_scanner_unittest, and
pcre_stringpiece_unittest are also built.
Both the scripts and all the program tests are run if you obey "make runtest",
"make check", or "make test". For other systems, see the instructions in
NON-UNIX-USE.
Both the scripts and all the program tests are run if you obey "make check" or
"make test". For other systems, see the instructions in NON-UNIX-USE.
The RunTest script runs the pcretest test program (which is documented in its
own man page) on each of the testinput files (in the testdata directory) in
own man page) on each of the testinput files in the testdata directory in
turn, and compares the output with the contents of the corresponding testoutput
file. A file called testtry is used to hold the main output from pcretest
files. A file called testtry is used to hold the main output from pcretest
(testsavedregex is also used as a working file). To run pcretest on just one of
the test files, give its number as an argument to RunTest, for example:
RunTest 2
The first file can also be fed directly into the perltest script to check that
Perl gives the same results. The only difference you should see is in the first
few lines, where the Perl version is given instead of the PCRE version.
The first test file can also be fed directly into the perltest.pl script to
check that Perl gives the same results. The only difference you should see is
in the first few lines, where the Perl version is given instead of the PCRE
version.
The second set of tests check pcre_fullinfo(), pcre_info(), pcre_study(),
pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
detection, and run-time flags that are specific to PCRE, as well as the POSIX
wrapper API. It also uses the debugging flag to check some of the internals of
wrapper API. It also uses the debugging flags to check some of the internals of
pcre_compile().
If you build PCRE with a locale setting that is not the standard C locale, the
@@ -364,6 +545,12 @@ is output to say why. If running this test produces instances of the error
in the comparison output, it means that locale is not available on your system,
despite being listed by "locale". This does not mean that PCRE is broken.
[If you are trying to run this test on Windows, you may be able to get it to
work by changing "fr_FR" to "french" everywhere it occurs. Alternatively, use
RunTest.bat. The version of RunTest.bat included with PCRE 7.4 and above uses
Windows versions of test 3. More info on using RunTest.bat is included in the
document entitled NON-UNIX-USE.]
The fourth test checks the UTF-8 support. It is not run automatically unless
PCRE is built with UTF-8 support. To do this you must set --enable-utf8 when
running "configure". This file can also be fed directly to the perltest script,
@@ -373,8 +560,8 @@ commented in the script, can be be used.)
The fifth test checks error handling with UTF-8 encoding, and internal UTF-8
features of PCRE that are not relevant to Perl.
The sixth and test checks the support for Unicode character properties. It it
not run automatically unless PCRE is built with Unicode property support. To to
The sixth test checks the support for Unicode character properties. It is not
run automatically unless PCRE is built with Unicode property support. To do
this you must set --enable-unicode-properties when running "configure".
The seventh, eighth, and ninth tests check the pcre_dfa_exec() alternative
@@ -386,27 +573,42 @@ automatically unless PCRE is build with the relevant support.
Character tables
----------------
PCRE uses four tables for manipulating and identifying characters whose values
are less than 256. The final argument of the pcre_compile() function is a
pointer to a block of memory containing the concatenated tables. A call to
pcre_maketables() can be used to generate a set of tables in the current
locale. If the final argument for pcre_compile() is passed as NULL, a set of
default tables that is built into the binary is used.
For speed, PCRE uses four tables for manipulating and identifying characters
whose code point values are less than 256. The final argument of the
pcre_compile() function is a pointer to a block of memory containing the
concatenated tables. A call to pcre_maketables() can be used to generate a set
of tables in the current locale. If the final argument for pcre_compile() is
passed as NULL, a set of default tables that is built into the binary is used.
The source file called chartables.c contains the default set of tables. This is
not supplied in the distribution, but is built by the program dftables
(compiled from dftables.c), which uses the ANSI C character handling functions
such as isalnum(), isalpha(), isupper(), islower(), etc. to build the table
sources. This means that the default C locale which is set for your system will
control the contents of these default tables. You can change the default tables
by editing chartables.c and then re-building PCRE. If you do this, you should
probably also edit Makefile to ensure that the file doesn't ever get
re-generated.
The source file called pcre_chartables.c contains the default set of tables. By
default, this is created as a copy of pcre_chartables.c.dist, which contains
tables for ASCII coding. However, if --enable-rebuild-chartables is specified
for ./configure, a different version of pcre_chartables.c is built by the
program dftables (compiled from dftables.c), which uses the ANSI C character
handling functions such as isalnum(), isalpha(), isupper(), islower(), etc. to
build the table sources. This means that the default C locale which is set for
your system will control the contents of these default tables. You can change
the default tables by editing pcre_chartables.c and then re-building PCRE. If
you do this, you should take care to ensure that the file does not get
automatically re-generated. The best way to do this is to move
pcre_chartables.c.dist out of the way and replace it with your customized
tables.
When the dftables program is run as a result of --enable-rebuild-chartables,
it uses the default C locale that is set on your system. It does not pay
attention to the LC_xxx environment variables. In other words, it uses the
system's default locale rather than whatever the compiling user happens to have
set. If you really do want to build a source set of character tables in a
locale that is specified by the LC_xxx variables, you can run the dftables
program by hand with the -L option. For example:
./dftables -L pcre_chartables.c.special
The first two 256-byte tables provide lower casing and case flipping functions,
respectively. The next table consists of three 32-byte bit maps which identify
digits, "word" characters, and white space, respectively. These are used when
building 32-byte bit maps that represent character classes.
building 32-byte bit maps that represent character classes for code points less
than 256.
The final 256-byte table has bits indicating various character types, as
follows:
@@ -422,107 +624,144 @@ You should not alter the set of characters that contain the 128 bit, as that
will cause PCRE to malfunction.
Manifest
--------
File manifest
-------------
The distribution should contain the following files:
(A) The actual source files of the PCRE library functions and their
headers:
(A) Source files of the PCRE library functions and their headers:
dftables.c auxiliary program for building chartables.c
dftables.c auxiliary program for building pcre_chartables.c
when --enable-rebuild-chartables is specified
pcreposix.c )
pcre_compile.c )
pcre_config.c )
pcre_dfa_exec.c )
pcre_exec.c )
pcre_fullinfo.c )
pcre_get.c ) sources for the functions in the library,
pcre_globals.c ) and some internal functions that they use
pcre_info.c )
pcre_maketables.c )
pcre_ord2utf8.c )
pcre_refcount.c )
pcre_study.c )
pcre_tables.c )
pcre_try_flipped.c )
pcre_ucp_searchfuncs.c)
pcre_valid_utf8.c )
pcre_version.c )
pcre_xclass.c )
ucptable.c )
pcre_chartables.c.dist a default set of character tables that assume ASCII
coding; used, unless --enable-rebuild-chartables is
specified, by copying to pcre_chartables.c
pcre_printint.src ) debugging function that is #included in pcretest, and
) can also be #included in pcre_compile()
pcreposix.c )
pcre_compile.c )
pcre_config.c )
pcre_dfa_exec.c )
pcre_exec.c )
pcre_fullinfo.c )
pcre_get.c ) sources for the functions in the library,
pcre_globals.c ) and some internal functions that they use
pcre_info.c )
pcre_maketables.c )
pcre_newline.c )
pcre_ord2utf8.c )
pcre_refcount.c )
pcre_study.c )
pcre_tables.c )
pcre_try_flipped.c )
pcre_ucd.c )
pcre_valid_utf8.c )
pcre_version.c )
pcre_xclass.c )
pcre_printint.src ) debugging function that is #included in pcretest,
) and can also be #included in pcre_compile()
pcre.h.in template for pcre.h when built by "configure"
pcreposix.h header for the external POSIX wrapper API
pcre_internal.h header for internal use
ucp.h header for Unicode property handling
pcre.h the public PCRE header file
pcreposix.h header for the external POSIX wrapper API
pcre_internal.h header for internal use
ucp.h ) headers concerned with
ucpinternal.h ) Unicode property handling
config.in template for config.h, which is built by configure
config.h.in template for config.h, which is built by "configure"
pcrecpp.h the header file for the C++ wrapper
pcrecpparg.h.in "source" for another C++ header file
pcrecpp.cc )
pcre_scanner.cc ) source for the C++ wrapper library
pcrecpp.h public header file for the C++ wrapper
pcrecpparg.h.in template for another C++ header file
pcre_scanner.h public header file for C++ scanner functions
pcrecpp.cc )
pcre_scanner.cc ) source for the C++ wrapper library
pcre_stringpiece.h.in "source" for pcre_stringpiece.h, the header for the
C++ stringpiece functions
pcre_stringpiece.cc source for the C++ stringpiece functions
pcre_stringpiece.h.in template for pcre_stringpiece.h, the header for the
C++ stringpiece functions
pcre_stringpiece.cc source for the C++ stringpiece functions
(B) Auxiliary files:
(B) Source files for programs that use PCRE:
AUTHORS information about the author of PCRE
ChangeLog log of changes to the code
INSTALL generic installation instructions
LICENCE conditions for the use of PCRE
COPYING the same, using GNU's standard name
Makefile.in template for Unix Makefile, which is built by configure
NEWS important changes in this release
NON-UNIX-USE notes on building PCRE on non-Unix systems
README this file
RunTest.in template for a Unix shell script for running tests
RunGrepTest.in template for a Unix shell script for pcregrep tests
config.guess ) files used by libtool,
config.sub ) used only when building a shared library
config.h.in "source" for the config.h header file
configure a configuring shell script (built by autoconf)
configure.ac the autoconf input used to build configure
doc/Tech.Notes notes on the encoding
doc/*.3 man page sources for the PCRE functions
doc/*.1 man page sources for pcregrep and pcretest
doc/html/* HTML documentation
doc/pcre.txt plain text version of the man pages
doc/pcretest.txt plain text documentation of test program
doc/perltest.txt plain text documentation of Perl test program
install-sh a shell script for installing files
libpcre.pc.in "source" for libpcre.pc for pkg-config
ltmain.sh file used to build a libtool script
mkinstalldirs script for making install directories
pcretest.c comprehensive test program
pcredemo.c simple demonstration of coding calls to PCRE
perltest Perl test program
pcregrep.c source of a grep utility that uses PCRE
pcre-config.in source of script which retains PCRE information
pcrecpp_unittest.c )
pcre_scanner_unittest.c ) test programs for the C++ wrapper
pcre_stringpiece_unittest.c )
testdata/testinput* test data for main library tests
testdata/testoutput* expected test results
testdata/grep* input and output for pcregrep tests
pcredemo.c simple demonstration of coding calls to PCRE
pcregrep.c source of a grep utility that uses PCRE
pcretest.c comprehensive test program
(C) Auxiliary files for Win32 DLL
(C) Auxiliary files:
libpcre.def
libpcreposix.def
132html script to turn "man" pages into HTML
AUTHORS information about the author of PCRE
ChangeLog log of changes to the code
CleanTxt script to clean nroff output for txt man pages
Detrail script to remove trailing spaces
HACKING some notes about the internals of PCRE
INSTALL generic installation instructions
LICENCE conditions for the use of PCRE
COPYING the same, using GNU's standard name
Makefile.in ) template for Unix Makefile, which is built by
) "configure"
Makefile.am ) the automake input that was used to create
) Makefile.in
NEWS important changes in this release
NON-UNIX-USE notes on building PCRE on non-Unix systems
PrepareRelease script to make preparations for "make dist"
README this file
RunTest a Unix shell script for running tests
RunGrepTest a Unix shell script for pcregrep tests
aclocal.m4 m4 macros (generated by "aclocal")
config.guess ) files used by libtool,
config.sub ) used only when building a shared library
configure a configuring shell script (built by autoconf)
configure.ac ) the autoconf input that was used to build
) "configure" and config.h
depcomp ) script to find program dependencies, generated by
) automake
doc/*.3 man page sources for the PCRE functions
doc/*.1 man page sources for pcregrep and pcretest
doc/index.html.src the base HTML page
doc/html/* HTML documentation
doc/pcre.txt plain text version of the man pages
doc/pcretest.txt plain text documentation of test program
doc/perltest.txt plain text documentation of Perl test program
install-sh a shell script for installing files
libpcre.pc.in template for libpcre.pc for pkg-config
libpcrecpp.pc.in template for libpcrecpp.pc for pkg-config
ltmain.sh file used to build a libtool script
missing ) common stub for a few missing GNU programs while
) installing, generated by automake
mkinstalldirs script for making install directories
perltest.pl Perl test program
pcre-config.in source of script which retains PCRE information
pcrecpp_unittest.cc )
pcre_scanner_unittest.cc ) test programs for the C++ wrapper
pcre_stringpiece_unittest.cc )
testdata/testinput* test data for main library tests
testdata/testoutput* expected test results
testdata/grep* input and output for pcregrep tests
(D) Auxiliary file for VPASCAL
(D) Auxiliary files for cmake support
cmake/COPYING-CMAKE-SCRIPTS
cmake/FindPackageHandleStandardArgs.cmake
cmake/FindReadline.cmake
CMakeLists.txt
config-cmake.h.in
(E) Auxiliary files for VPASCAL
makevp.bat
makevp_c.txt
makevp_l.txt
pcregexp.pas
(F) Auxiliary files for building PCRE "by hand"
pcre.h.generic ) a version of the public PCRE header file
) for use in non-"configure" environments
config.h.generic ) a version of config.h for use in non-"configure"
) environments
(G) Miscellaneous
RunTest.bat a script for running tests under Windows
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
June 2006
Last updated: 21 March 2009

39
libs/pcre/RunTest.bat Normal file
View File

@@ -0,0 +1,39 @@
@rem This file was contributed by Ralf Junker, and touched up by
@rem Daniel Richard G. Test 10 added by Philip H.
@rem Philip H also changed test 3 to use "wintest" files.
@rem
@rem MS Windows batch file to run pcretest on testfiles with the correct
@rem options.
@rem
@rem Output is written to a subfolder named "testout", which is created if
@rem it does not already exist.
@rem
@rem The srcdir and pcretest variables may be set before this script is run;
@rem when they are empty they default to "." and "pcretest" respectively.
setlocal
if [%srcdir%]==[] set srcdir=.
if [%pcretest%]==[] set pcretest=pcretest
if not exist testout md testout
@rem Run pcretest on each test input, writing the result into testout.
@rem Test 3 is run from the "wintest" input instead of the standard one
@rem (the standard invocation is left commented out). Tests 7, 8 and 9 are
@rem run with the -dfa option.
%pcretest% -q %srcdir%\testdata\testinput1 > testout\testoutput1
%pcretest% -q %srcdir%\testdata\testinput2 > testout\testoutput2
@rem %pcretest% -q %srcdir%\testdata\testinput3 > testout\testoutput3
%pcretest% -q %srcdir%\testdata\wintestinput3 > testout\wintestoutput3
%pcretest% -q %srcdir%\testdata\testinput4 > testout\testoutput4
%pcretest% -q %srcdir%\testdata\testinput5 > testout\testoutput5
%pcretest% -q %srcdir%\testdata\testinput6 > testout\testoutput6
%pcretest% -q -dfa %srcdir%\testdata\testinput7 > testout\testoutput7
%pcretest% -q -dfa %srcdir%\testdata\testinput8 > testout\testoutput8
%pcretest% -q -dfa %srcdir%\testdata\testinput9 > testout\testoutput9
%pcretest% -q %srcdir%\testdata\testinput10 > testout\testoutput10
@rem Compare each generated output file with the expected output file of the
@rem same name in testdata. The comparison for the standard test 3 output is
@rem commented out to match the run step above.
fc /n %srcdir%\testdata\testoutput1 testout\testoutput1
fc /n %srcdir%\testdata\testoutput2 testout\testoutput2
rem fc /n %srcdir%\testdata\testoutput3 testout\testoutput3
fc /n %srcdir%\testdata\wintestoutput3 testout\wintestoutput3
fc /n %srcdir%\testdata\testoutput4 testout\testoutput4
fc /n %srcdir%\testdata\testoutput5 testout\testoutput5
fc /n %srcdir%\testdata\testoutput6 testout\testoutput6
fc /n %srcdir%\testdata\testoutput7 testout\testoutput7
fc /n %srcdir%\testdata\testoutput8 testout\testoutput8
fc /n %srcdir%\testdata\testoutput9 testout\testoutput9
fc /n %srcdir%\testdata\testoutput10 testout\testoutput10

View File

@@ -0,0 +1,22 @@
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,58 @@
# FIND_PACKAGE_HANDLE_STANDARD_ARGS(NAME (DEFAULT_MSG|"Custom failure message") VAR1 ... )
# This macro is intended to be used in FindXXX.cmake modules files.
# It handles the REQUIRED and QUIET argument to FIND_PACKAGE() and
# it also sets the <UPPERCASED_NAME>_FOUND variable.
# The package is found if all variables listed are TRUE.
# Example:
#
# FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibXml2 DEFAULT_MSG LIBXML2_LIBRARIES LIBXML2_INCLUDE_DIR)
#
# LibXml2 is considered to be found, if both LIBXML2_LIBRARIES and
# LIBXML2_INCLUDE_DIR are valid. Then also LIBXML2_FOUND is set to TRUE.
# If it is not found and REQUIRED was used, it fails with FATAL_ERROR,
# regardless of whether QUIET was used or not.
# If it is found, the location is reported using the VAR1 argument, so
# here a message "Found LibXml2: /usr/lib/libxml2.so" will be printed out.
# If the second argument is DEFAULT_MSG, the message in the failure case will
# be "Could not find REQUIRED package LibXml2" when REQUIRED was used, and
# "Could not find OPTIONAL package LibXml2" otherwise. If you don't like
# this message you can specify your own custom failure message there.
MACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FAIL_MSG _VAR1 )
# Choose the failure message: a built-in one (worded according to whether
# the package was REQUIRED) or the caller's custom text.
IF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
IF (${_NAME}_FIND_REQUIRED)
SET(_FAIL_MESSAGE "Could not find REQUIRED package ${_NAME}")
ELSE (${_NAME}_FIND_REQUIRED)
SET(_FAIL_MESSAGE "Could not find OPTIONAL package ${_NAME}")
ENDIF (${_NAME}_FIND_REQUIRED)
ELSE("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
SET(_FAIL_MESSAGE "${_FAIL_MSG}")
ENDIF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
# The result variable is named after the upper-cased package name.
STRING(TOUPPER ${_NAME} _NAME_UPPER)
# Start from "found" and clear the flag if VAR1 or any of the additional
# variables passed after it (ARGN) is false.
SET(${_NAME_UPPER}_FOUND TRUE)
IF(NOT ${_VAR1})
SET(${_NAME_UPPER}_FOUND FALSE)
ENDIF(NOT ${_VAR1})
FOREACH(_CURRENT_VAR ${ARGN})
IF(NOT ${_CURRENT_VAR})
SET(${_NAME_UPPER}_FOUND FALSE)
ENDIF(NOT ${_CURRENT_VAR})
ENDFOREACH(_CURRENT_VAR)
# Report the outcome. Success and optional-failure messages are suppressed
# when <NAME>_FIND_QUIETLY is set; a REQUIRED failure is always fatal.
IF (${_NAME_UPPER}_FOUND)
IF (NOT ${_NAME}_FIND_QUIETLY)
MESSAGE(STATUS "Found ${_NAME}: ${${_VAR1}}")
ENDIF (NOT ${_NAME}_FIND_QUIETLY)
ELSE (${_NAME_UPPER}_FOUND)
IF (${_NAME}_FIND_REQUIRED)
MESSAGE(FATAL_ERROR "${_FAIL_MESSAGE}")
ELSE (${_NAME}_FIND_REQUIRED)
IF (NOT ${_NAME}_FIND_QUIETLY)
MESSAGE(STATUS "${_FAIL_MESSAGE}")
ENDIF (NOT ${_NAME}_FIND_QUIETLY)
ENDIF (${_NAME}_FIND_REQUIRED)
ENDIF (${_NAME_UPPER}_FOUND)
ENDMACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS)

View File

@@ -0,0 +1,29 @@
# from http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/FindReadline.cmake
# http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/COPYING-CMAKE-SCRIPTS
# --> BSD licensed
#
# GNU Readline library finder
#
# Sets READLINE_INCLUDE_DIR and READLINE_LIBRARY via FIND_PATH/FIND_LIBRARY,
# and lets FIND_PACKAGE_HANDLE_STANDARD_ARGS derive READLINE_FOUND from them.

# Short-circuit: if all three variables are already set, skip the search and
# just mark readline as found.
# NOTE(review): the search branch below no longer looks for NCURSES_LIBRARY,
# so this shortcut is only taken when NCURSES_LIBRARY was supplied from
# elsewhere -- confirm that this is intended.
if(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)
set(READLINE_FOUND TRUE)
else(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)
# Locate readline/readline.h, with /usr/include/readline as an extra
# search location.
FIND_PATH(READLINE_INCLUDE_DIR readline/readline.h
/usr/include/readline
)
# 2008-04-22 The next clause used to read like this:
#
# FIND_LIBRARY(READLINE_LIBRARY NAMES readline)
# FIND_LIBRARY(NCURSES_LIBRARY NAMES ncurses )
# include(FindPackageHandleStandardArgs)
# FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG NCURSES_LIBRARY READLINE_INCLUDE_DIR READLINE_LIBRARY )
#
# I was advised to modify it such that it will find an ncurses library if
# required, but not if one was explicitly given, that is, it allows the
# default to be overridden. PH
FIND_LIBRARY(READLINE_LIBRARY NAMES readline)
include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG READLINE_INCLUDE_DIR READLINE_LIBRARY )
# Hide the two result variables from the non-advanced cache view.
MARK_AS_ADVANCED(READLINE_INCLUDE_DIR READLINE_LIBRARY)
endif(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)

View File

@@ -0,0 +1,44 @@
/* config.h for CMake builds */
/* Template for config.h. Each #cmakedefine line below is expanded by CMake's
   configure_file() into either a #define or a commented-out #undef, depending
   on whether the CMake variable of the same name is set. */
/* Availability of system header files. */
#cmakedefine HAVE_DIRENT_H 1
#cmakedefine HAVE_SYS_STAT_H 1
#cmakedefine HAVE_SYS_TYPES_H 1
#cmakedefine HAVE_UNISTD_H 1
#cmakedefine HAVE_WINDOWS_H 1
#cmakedefine HAVE_TYPE_TRAITS_H 1
#cmakedefine HAVE_BITS_TYPE_TRAITS_H 1
/* Availability of library functions. */
#cmakedefine HAVE_BCOPY 1
#cmakedefine HAVE_MEMMOVE 1
#cmakedefine HAVE_STRERROR 1
#cmakedefine HAVE_STRTOLL 1
#cmakedefine HAVE_STRTOQ 1
#cmakedefine HAVE__STRTOI64 1
/* Build-time feature selections for the library. */
#cmakedefine PCRE_STATIC 1
#cmakedefine SUPPORT_UTF8 1
#cmakedefine SUPPORT_UCP 1
#cmakedefine EBCDIC 1
#cmakedefine BSR_ANYCRLF 1
#cmakedefine NO_RECURSE 1
/* Availability of the long long integer types. */
#cmakedefine HAVE_LONG_LONG 1
#cmakedefine HAVE_UNSIGNED_LONG_LONG 1
/* Optional external library support (presumably for pcregrep and pcretest;
   confirm against CMakeLists.txt). */
#cmakedefine SUPPORT_LIBBZ2 1
#cmakedefine SUPPORT_LIBZ 1
#cmakedefine SUPPORT_LIBREADLINE 1
/* Tunable values: each @VAR@ placeholder is replaced with the value of the
   corresponding CMake variable when this template is configured. */
#define NEWLINE @NEWLINE@
#define POSIX_MALLOC_THRESHOLD @PCRE_POSIX_MALLOC_THRESHOLD@
#define LINK_SIZE @PCRE_LINK_SIZE@
#define MATCH_LIMIT @PCRE_MATCH_LIMIT@
#define MATCH_LIMIT_RECURSION @PCRE_MATCH_LIMIT_RECURSION@
/* These two limits are fixed in this template rather than substituted. */
#define MAX_NAME_SIZE 32
#define MAX_NAME_COUNT 10000
/* end config.h for CMake builds */

313
libs/pcre/config.h.generic Normal file
View File

@@ -0,0 +1,313 @@
/* config.h. Generated from config.h.in by configure. */
/* config.h.in. Generated from configure.ac by autoheader. */
/* On Unix-like systems config.h.in is converted by "configure" into config.h.
Some other environments also support the use of "configure". PCRE is written in
Standard C, but there are a few non-standard things it can cope with, allowing
it to run on SunOS4 and other "close to standard" systems.
If you are going to build PCRE "by hand" on a system without "configure" you
should copy the distributed config.h.generic to config.h, and then set up the
macro definitions the way you need them. You must then add -DHAVE_CONFIG_H to
all of your compile commands, so that config.h is included at the start of
every source.
Alternatively, you can avoid editing by using -D on the compiler command line
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H.
PCRE uses memmove() if HAVE_MEMMOVE is set to 1; otherwise it uses bcopy() if
HAVE_BCOPY is set to 1. If your system has neither bcopy() nor memmove(), set
them both to 0; an emulation function will be used. */
/* By default, the \R escape sequence matches any Unicode line ending
character or sequence of characters. If BSR_ANYCRLF is defined, this is
changed so that backslash-R matches only CR, LF, or CRLF. The build- time
default can be overridden by the user of PCRE at runtime. On systems that
support it, "configure" can be used to override the default. */
/* #undef BSR_ANYCRLF */
/* If you are compiling for a system that uses EBCDIC instead of ASCII
character codes, define this macro as 1. On systems that can use
"configure", this can be done via --enable-ebcdic. PCRE will then assume
that all input strings are in EBCDIC. If you do not define this macro, PCRE
will assume input strings are ASCII or UTF-8 Unicode. It is not possible to
build a version of PCRE that supports both EBCDIC and UTF-8. */
/* #undef EBCDIC */
/* Define to 1 if you have the `bcopy' function. */
#ifndef HAVE_BCOPY
#define HAVE_BCOPY 1
#endif
/* Define to 1 if you have the <bits/type_traits.h> header file. */
/* #undef HAVE_BITS_TYPE_TRAITS_H */
/* Define to 1 if you have the <bzlib.h> header file. */
#ifndef HAVE_BZLIB_H
#define HAVE_BZLIB_H 1
#endif
/* Define to 1 if you have the <dirent.h> header file. */
#ifndef HAVE_DIRENT_H
#define HAVE_DIRENT_H 1
#endif
/* Define to 1 if you have the <dlfcn.h> header file. */
#ifndef HAVE_DLFCN_H
#define HAVE_DLFCN_H 1
#endif
/* Define to 1 if you have the <inttypes.h> header file. */
#ifndef HAVE_INTTYPES_H
#define HAVE_INTTYPES_H 1
#endif
/* Define to 1 if you have the <limits.h> header file. */
#ifndef HAVE_LIMITS_H
#define HAVE_LIMITS_H 1
#endif
/* Define to 1 if the system has the type `long long'. */
#ifndef HAVE_LONG_LONG
#define HAVE_LONG_LONG 1
#endif
/* Define to 1 if you have the `memmove' function. */
#ifndef HAVE_MEMMOVE
#define HAVE_MEMMOVE 1
#endif
/* Define to 1 if you have the <memory.h> header file. */
#ifndef HAVE_MEMORY_H
#define HAVE_MEMORY_H 1
#endif
/* Define to 1 if you have the <readline/history.h> header file. */
#ifndef HAVE_READLINE_HISTORY_H
#define HAVE_READLINE_HISTORY_H 1
#endif
/* Define to 1 if you have the <readline/readline.h> header file. */
#ifndef HAVE_READLINE_READLINE_H
#define HAVE_READLINE_READLINE_H 1
#endif
/* Define to 1 if you have the <stdint.h> header file. */
#ifndef HAVE_STDINT_H
#define HAVE_STDINT_H 1
#endif
/* Define to 1 if you have the <stdlib.h> header file. */
#ifndef HAVE_STDLIB_H
#define HAVE_STDLIB_H 1
#endif
/* Define to 1 if you have the `strerror' function. */
#ifndef HAVE_STRERROR
#define HAVE_STRERROR 1
#endif
/* Define to 1 if you have the <string> header file. */
#ifndef HAVE_STRING
#define HAVE_STRING 1
#endif
/* Define to 1 if you have the <strings.h> header file. */
#ifndef HAVE_STRINGS_H
#define HAVE_STRINGS_H 1
#endif
/* Define to 1 if you have the <string.h> header file. */
#ifndef HAVE_STRING_H
#define HAVE_STRING_H 1
#endif
/* Define to 1 if you have the `strtoll' function. */
/* #undef HAVE_STRTOLL */
/* Define to 1 if you have the `strtoq' function. */
#ifndef HAVE_STRTOQ
#define HAVE_STRTOQ 1
#endif
/* Define to 1 if you have the <sys/stat.h> header file. */
#ifndef HAVE_SYS_STAT_H
#define HAVE_SYS_STAT_H 1
#endif
/* Define to 1 if you have the <sys/types.h> header file. */
#ifndef HAVE_SYS_TYPES_H
#define HAVE_SYS_TYPES_H 1
#endif
/* Define to 1 if you have the <type_traits.h> header file. */
/* #undef HAVE_TYPE_TRAITS_H */
/* Define to 1 if you have the <unistd.h> header file. */
#ifndef HAVE_UNISTD_H
#define HAVE_UNISTD_H 1
#endif
/* Define to 1 if the system has the type `unsigned long long'. */
#ifndef HAVE_UNSIGNED_LONG_LONG
#define HAVE_UNSIGNED_LONG_LONG 1
#endif
/* Define to 1 if you have the <windows.h> header file. */
/* #undef HAVE_WINDOWS_H */
/* Define to 1 if you have the <zlib.h> header file. */
#ifndef HAVE_ZLIB_H
#define HAVE_ZLIB_H 1
#endif
/* Define to 1 if you have the `_strtoi64' function. */
/* #undef HAVE__STRTOI64 */
/* The value of LINK_SIZE determines the number of bytes used to store links
as offsets within the compiled regex. The default is 2, which allows for
compiled patterns up to 64K long. This covers the vast majority of cases.
However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows
for longer patterns in extreme cases. On systems that support it,
"configure" can be used to override this default. */
#ifndef LINK_SIZE
#define LINK_SIZE 2
#endif
/* The value of MATCH_LIMIT determines the default number of times the
internal match() function can be called during a single execution of
pcre_exec(). There is a runtime interface for setting a different limit.
The limit exists in order to catch runaway regular expressions that take
for ever to determine that they do not match. The default is set very large
so that it does not accidentally catch legitimate cases. On systems that
support it, "configure" can be used to override this default. */
#ifndef MATCH_LIMIT
#define MATCH_LIMIT 10000000
#endif
/* The above limit applies to all calls of match(), whether or not they
increase the recursion depth. In some environments it is desirable to limit
the depth of recursive calls of match() more strictly, in order to restrict
the maximum amount of stack (or heap, if NO_RECURSE is defined) that is
used. The value of MATCH_LIMIT_RECURSION applies only to recursive calls of
match(). To have any useful effect, it must be less than the value of
MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. There is
a runtime method for setting a different limit. On systems that support it,
"configure" can be used to override the default. */
#ifndef MATCH_LIMIT_RECURSION
#define MATCH_LIMIT_RECURSION MATCH_LIMIT
#endif
/* This limit is parameterized just in case anybody ever wants to change it.
Care must be taken if it is increased, because it guards against integer
overflow caused by enormously large patterns. */
#ifndef MAX_NAME_COUNT
#define MAX_NAME_COUNT 10000
#endif
/* This limit is parameterized just in case anybody ever wants to change it.
Care must be taken if it is increased, because it guards against integer
overflow caused by enormously large patterns. */
#ifndef MAX_NAME_SIZE
#define MAX_NAME_SIZE 32
#endif
/* The value of NEWLINE determines the newline character sequence. On systems
that support it, "configure" can be used to override the default, which is
10. The possible values are 10 (LF), 13 (CR), 3338 (CRLF), -1 (ANY), or -2
(ANYCRLF). */
#ifndef NEWLINE
#define NEWLINE 10
#endif
/* PCRE uses recursive function calls to handle backtracking while matching.
This can sometimes be a problem on systems that have stacks of limited
size. Define NO_RECURSE to get a version that doesn't use recursion in the
match() function; instead it creates its own stack by steam using
pcre_recurse_malloc() to obtain memory from the heap. For more detail, see
the comments and other stuff just above the match() function. On systems
that support it, "configure" can be used to set this in the Makefile (use
--disable-stack-for-recursion). */
/* #undef NO_RECURSE */
/* Name of package */
#define PACKAGE "pcre"
/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT ""
/* Define to the full name of this package. */
#define PACKAGE_NAME "PCRE"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE 7.9"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre"
/* Define to the version of this package. */
#define PACKAGE_VERSION "7.9"
/* If you are compiling for a system other than a Unix-like system or
Win32, and it needs some magic to be inserted before the definition
of a function that is exported by the library, define this macro to
contain the relevant magic. If you do not define this macro, it
defaults to "extern" for a C compiler and "extern C" for a C++
compiler on non-Win32 systems. This macro appears at the start of
every exported function that is part of the external API. It does
not appear on functions that are "external" in the C sense, but
which are internal to the library. */
/* #undef PCRE_EXP_DEFN */
/* Define if linking statically (TODO: make nice with Libtool) */
/* #undef PCRE_STATIC */
/* When calling PCRE via the POSIX interface, additional working storage is
required for holding the pointers to capturing substrings because PCRE
requires three integers per substring, whereas the POSIX interface provides
only two. If the number of expected substrings is small, the wrapper
function uses space on the stack, because this is faster than using
malloc() for each call. The threshold above which the stack is no longer
used is defined by POSIX_MALLOC_THRESHOLD. On systems that support it,
"configure" can be used to override this default. */
#ifndef POSIX_MALLOC_THRESHOLD
#define POSIX_MALLOC_THRESHOLD 10
#endif
/* Define to 1 if you have the ANSI C header files. */
#ifndef STDC_HEADERS
#define STDC_HEADERS 1
#endif
/* Define to allow pcregrep to be linked with libbz2, so that it is able to
handle .bz2 files. */
/* #undef SUPPORT_LIBBZ2 */
/* Define to allow pcretest to be linked with libreadline. */
/* #undef SUPPORT_LIBREADLINE */
/* Define to allow pcregrep to be linked with libz, so that it is able to
handle .gz files. */
/* #undef SUPPORT_LIBZ */
/* Define to enable support for Unicode properties */
/* #undef SUPPORT_UCP */
/* Define to enable support for the UTF-8 Unicode encoding. This will work
even in an EBCDIC environment, but it is incompatible with the EBCDIC
macro. That is, PCRE can support *either* EBCDIC code *or* ASCII/UTF-8, but
not both at once. */
/* #undef SUPPORT_UTF8 */
/* Version number of package */
#ifndef VERSION
#define VERSION "7.9"
#endif
/* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */
/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */

View File

@@ -1,143 +1,248 @@
/* config.h.in. Generated from configure.ac by autoheader. */
/* On Unix-like systems config.in is converted by "configure" into config.h.
/* On Unix-like systems config.h.in is converted by "configure" into config.h.
Some other environments also support the use of "configure". PCRE is written in
Standard C, but there are a few non-standard things it can cope with, allowing
it to run on SunOS4 and other "close to standard" systems.
On a non-Unix-like system you should just copy this file into config.h, and set
up the macros the way you need them. You should normally change the definitions
of HAVE_STRERROR and HAVE_MEMMOVE to 1. Unfortunately, because of the way
autoconf works, these cannot be made the defaults. If your system has bcopy()
and not memmove(), change the definition of HAVE_BCOPY instead of HAVE_MEMMOVE.
If your system has neither bcopy() nor memmove(), leave them both as 0; an
emulation function will be used. */
If you are going to build PCRE "by hand" on a system without "configure" you
should copy the distributed config.h.generic to config.h, and then set up the
macro definitions the way you need them. You must then add -DHAVE_CONFIG_H to
all of your compile commands, so that config.h is included at the start of
every source.
Alternatively, you can avoid editing by using -D on the compiler command line
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H.
PCRE uses memmove() if HAVE_MEMMOVE is set to 1; otherwise it uses bcopy() if
HAVE_BCOPY is set to 1. If your system has neither bcopy() nor memmove(), set
them both to 0; an emulation function will be used. */
/* By default, the \R escape sequence matches any Unicode line ending
character or sequence of characters. If BSR_ANYCRLF is defined, this is
changed so that backslash-R matches only CR, LF, or CRLF. The build-time
default can be overridden by the user of PCRE at runtime. On systems that
support it, "configure" can be used to override the default. */
#undef BSR_ANYCRLF
/* If you are compiling for a system that uses EBCDIC instead of ASCII
character codes, define this macro as 1. On systems that can use "configure",
this can be done via --enable-ebcdic. */
character codes, define this macro as 1. On systems that can use
"configure", this can be done via --enable-ebcdic. PCRE will then assume
that all input strings are in EBCDIC. If you do not define this macro, PCRE
will assume input strings are ASCII or UTF-8 Unicode. It is not possible to
build a version of PCRE that supports both EBCDIC and UTF-8. */
#undef EBCDIC
#ifndef EBCDIC
#define EBCDIC 0
#endif
/* Define to 1 if you have the `bcopy' function. */
#undef HAVE_BCOPY
/* If you are compiling for a system other than a Unix-like system or Win32,
and it needs some magic to be inserted before the definition of a function that
is exported by the library, define this macro to contain the relevant magic. If
you do not define this macro, it defaults to "extern" for a C compiler and
"extern C" for a C++ compiler on non-Win32 systems. This macro appears at the
start of every exported function that is part of the external API. It does not
appear on functions that are "external" in the C sense, but which are internal
to the library. */
/* Define to 1 if you have the <bits/type_traits.h> header file. */
#undef HAVE_BITS_TYPE_TRAITS_H
/* #define PCRE_DATA_SCOPE */
/* Define to 1 if you have the <bzlib.h> header file. */
#undef HAVE_BZLIB_H
/* Define the following macro to empty if the "const" keyword does not work. */
/* Define to 1 if you have the <dirent.h> header file. */
#undef HAVE_DIRENT_H
#undef const
/* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H
/* Define the following macro to "unsigned" if <stddef.h> does not define
size_t. */
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
#undef size_t
/* Define to 1 if you have the <limits.h> header file. */
#undef HAVE_LIMITS_H
/* The following two definitions are mainly for the benefit of SunOS4, which
does not have the strerror() or memmove() functions that should be present in
all Standard C libraries. The macros HAVE_STRERROR and HAVE_MEMMOVE should
normally be defined with the value 1 for other systems, but unfortunately we
cannot make this the default because "configure" files generated by autoconf
will only change 0 to 1; they won't change 1 to 0 if the functions are not
found. */
/* Define to 1 if the system has the type `long long'. */
#undef HAVE_LONG_LONG
#define HAVE_STRERROR 0
#define HAVE_MEMMOVE 0
/* Define to 1 if you have the `memmove' function. */
#undef HAVE_MEMMOVE
/* There are some non-Unix-like systems that don't even have bcopy(). If this
macro is false, an emulation is used. If HAVE_MEMMOVE is set to 1, the value of
HAVE_BCOPY is not relevant. */
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
#define HAVE_BCOPY 0
/* Define to 1 if you have the <readline/history.h> header file. */
#undef HAVE_READLINE_HISTORY_H
/* The value of NEWLINE determines the newline character. The default is to
leave it up to the compiler, but some sites want to force a particular value.
On Unix-like systems, "configure" can be used to override this default. */
/* Define to 1 if you have the <readline/readline.h> header file. */
#undef HAVE_READLINE_READLINE_H
#ifndef NEWLINE
#define NEWLINE '\n'
#endif
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* The value of LINK_SIZE determines the number of bytes used to store links as
offsets within the compiled regex. The default is 2, which allows for compiled
patterns up to 64K long. This covers the vast majority of cases. However, PCRE
can also be compiled to use 3 or 4 bytes instead. This allows for longer
patterns in extreme cases. On systems that support it, "configure" can be used
to override this default. */
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
#ifndef LINK_SIZE
#define LINK_SIZE 2
#endif
/* Define to 1 if you have the `strerror' function. */
#undef HAVE_STRERROR
/* When calling PCRE via the POSIX interface, additional working storage is
required for holding the pointers to capturing substrings because PCRE requires
three integers per substring, whereas the POSIX interface provides only two. If
the number of expected substrings is small, the wrapper function uses space on
the stack, because this is faster than using malloc() for each call. The
threshold above which the stack is no longer used is defined by POSIX_MALLOC_
THRESHOLD. On systems that support it, "configure" can be used to override this
default. */
/* Define to 1 if you have the <string> header file. */
#undef HAVE_STRING
#ifndef POSIX_MALLOC_THRESHOLD
#define POSIX_MALLOC_THRESHOLD 10
#endif
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* PCRE uses recursive function calls to handle backtracking while matching.
This can sometimes be a problem on systems that have stacks of limited size.
Define NO_RECURSE to get a version that doesn't use recursion in the match()
function; instead it creates its own stack by steam using pcre_recurse_malloc()
to obtain memory from the heap. For more detail, see the comments and other
stuff just above the match() function. On systems that support it, "configure"
can be used to set this in the Makefile (use --disable-stack-for-recursion). */
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* #define NO_RECURSE */
/* Define to 1 if you have the `strtoll' function. */
#undef HAVE_STRTOLL
/* The value of MATCH_LIMIT determines the default number of times the internal
match() function can be called during a single execution of pcre_exec(). There
is a runtime interface for setting a different limit. The limit exists in order
to catch runaway regular expressions that take for ever to determine that they
do not match. The default is set very large so that it does not accidentally
catch legitimate cases. On systems that support it, "configure" can be used to
override this default. */
/* Define to 1 if you have the `strtoq' function. */
#undef HAVE_STRTOQ
#ifndef MATCH_LIMIT
#define MATCH_LIMIT 10000000
#endif
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <type_traits.h> header file. */
#undef HAVE_TYPE_TRAITS_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Define to 1 if the system has the type `unsigned long long'. */
#undef HAVE_UNSIGNED_LONG_LONG
/* Define to 1 if you have the <windows.h> header file. */
#undef HAVE_WINDOWS_H
/* Define to 1 if you have the <zlib.h> header file. */
#undef HAVE_ZLIB_H
/* Define to 1 if you have the `_strtoi64' function. */
#undef HAVE__STRTOI64
/* The value of LINK_SIZE determines the number of bytes used to store links
as offsets within the compiled regex. The default is 2, which allows for
compiled patterns up to 64K long. This covers the vast majority of cases.
However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows
for longer patterns in extreme cases. On systems that support it,
"configure" can be used to override this default. */
#undef LINK_SIZE
/* The value of MATCH_LIMIT determines the default number of times the
internal match() function can be called during a single execution of
pcre_exec(). There is a runtime interface for setting a different limit.
The limit exists in order to catch runaway regular expressions that take
for ever to determine that they do not match. The default is set very large
so that it does not accidentally catch legitimate cases. On systems that
support it, "configure" can be used to override this default. */
#undef MATCH_LIMIT
/* The above limit applies to all calls of match(), whether or not they
increase the recursion depth. In some environments it is desirable to limit the
depth of recursive calls of match() more strictly, in order to restrict the
maximum amount of stack (or heap, if NO_RECURSE is defined) that is used. The
value of MATCH_LIMIT_RECURSION applies only to recursive calls of match(). To
have any useful effect, it must be less than the value of MATCH_LIMIT. There is
a runtime method for setting a different limit. On systems that support it,
"configure" can be used to override this default. */
increase the recursion depth. In some environments it is desirable to limit
the depth of recursive calls of match() more strictly, in order to restrict
the maximum amount of stack (or heap, if NO_RECURSE is defined) that is
used. The value of MATCH_LIMIT_RECURSION applies only to recursive calls of
match(). To have any useful effect, it must be less than the value of
MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. There is
a runtime method for setting a different limit. On systems that support it,
"configure" can be used to override the default. */
#undef MATCH_LIMIT_RECURSION
#ifndef MATCH_LIMIT_RECURSION
#define MATCH_LIMIT_RECURSION MATCH_LIMIT
#endif
/* This limit is parameterized just in case anybody ever wants to change it.
Care must be taken if it is increased, because it guards against integer
overflow caused by enormously large patterns. */
#undef MAX_NAME_COUNT
/* These three limits are parameterized just in case anybody ever wants to
change them. Care must be taken if they are increased, because they guard
against integer overflow caused by enormously large patterns. */
/* This limit is parameterized just in case anybody ever wants to change it.
Care must be taken if it is increased, because it guards against integer
overflow caused by enormously large patterns. */
#undef MAX_NAME_SIZE
#ifndef MAX_NAME_SIZE
#define MAX_NAME_SIZE 32
#endif
/* The value of NEWLINE determines the newline character sequence. On systems
that support it, "configure" can be used to override the default, which is
10. The possible values are 10 (LF), 13 (CR), 3338 (CRLF), -1 (ANY), or -2
(ANYCRLF). */
#undef NEWLINE
#ifndef MAX_NAME_COUNT
#define MAX_NAME_COUNT 10000
#endif
/* PCRE uses recursive function calls to handle backtracking while matching.
This can sometimes be a problem on systems that have stacks of limited
size. Define NO_RECURSE to get a version that doesn't use recursion in the
match() function; instead it creates its own stack by steam using
pcre_recurse_malloc() to obtain memory from the heap. For more detail, see
the comments and other stuff just above the match() function. On systems
that support it, "configure" can be used to set this in the Makefile (use
--disable-stack-for-recursion). */
#undef NO_RECURSE
#ifndef MAX_DUPLENGTH
#define MAX_DUPLENGTH 30000
#endif
/* Name of package */
#undef PACKAGE
/* End */
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* If you are compiling for a system other than a Unix-like system or
Win32, and it needs some magic to be inserted before the definition
of a function that is exported by the library, define this macro to
contain the relevant magic. If you do not define this macro, it
defaults to "extern" for a C compiler and "extern C" for a C++
compiler on non-Win32 systems. This macro appears at the start of
every exported function that is part of the external API. It does
not appear on functions that are "external" in the C sense, but
which are internal to the library. */
#undef PCRE_EXP_DEFN
/* Define if linking statically (TODO: make nice with Libtool) */
#undef PCRE_STATIC
/* When calling PCRE via the POSIX interface, additional working storage is
required for holding the pointers to capturing substrings because PCRE
requires three integers per substring, whereas the POSIX interface provides
only two. If the number of expected substrings is small, the wrapper
function uses space on the stack, because this is faster than using
malloc() for each call. The threshold above which the stack is no longer
used is defined by POSIX_MALLOC_THRESHOLD. On systems that support it,
"configure" can be used to override this default. */
#undef POSIX_MALLOC_THRESHOLD
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS
/* Define to allow pcregrep to be linked with libbz2, so that it is able to
handle .bz2 files. */
#undef SUPPORT_LIBBZ2
/* Define to allow pcretest to be linked with libreadline. */
#undef SUPPORT_LIBREADLINE
/* Define to allow pcregrep to be linked with libz, so that it is able to
handle .gz files. */
#undef SUPPORT_LIBZ
/* Define to enable support for Unicode properties */
#undef SUPPORT_UCP
/* Define to enable support for the UTF-8 Unicode encoding. This will work
even in an EBCDIC environment, but it is incompatible with the EBCDIC
macro. That is, PCRE can support *either* EBCDIC code *or* ASCII/UTF-8, but
not both at once. */
#undef SUPPORT_UTF8
/* Version number of package */
#undef VERSION
/* Define to empty if `const' does not conform to ANSI C. */
#undef const
/* Define to `unsigned int' if <sys/types.h> does not define. */
#undef size_t

View File

@@ -1,91 +1,282 @@
dnl Process this file with autoconf to produce a configure script.
dnl This configure.in file has been hacked around quite a lot as a result of
dnl patches that various people have sent to me (PH). Sometimes the information
dnl I get is contradictory. I've tried to put in comments that explain things,
dnl but in some cases the information is second-hand and I have no way of
dnl verifying it. I am not an autoconf or libtool expert!
dnl NOTE FOR MAINTAINERS: Do not use major or minor version numbers with
dnl leading zeros, because they may be treated as octal constants. The
dnl PCRE_PRERELEASE feature is for identifying release candidates. It might
dnl be defined as -RC2, for example. For real releases, it should be defined
dnl empty.
dnl This is required at the start; the name is the name of a file
dnl it should be seeing, to verify it is in the same directory.
m4_define(pcre_major, [7])
m4_define(pcre_minor, [9])
m4_define(pcre_prerelease, [])
m4_define(pcre_date, [2009-04-11])
AC_INIT(dftables.c)
AC_CONFIG_SRCDIR([pcre.h])
dnl A safety precaution
# Libtool shared library interface versions (current:revision:age)
m4_define(libpcre_version, [0:1:0])
m4_define(libpcreposix_version, [0:0:0])
m4_define(libpcrecpp_version, [0:0:0])
AC_PREREQ(2.57)
AC_INIT(PCRE, pcre_major.pcre_minor[]pcre_prerelease, , pcre)
AC_CONFIG_SRCDIR([pcre.h.in])
AM_INIT_AUTOMAKE([dist-bzip2 dist-zip])
AC_CONFIG_HEADERS(config.h)
dnl Arrange to build config.h from config.h.in.
dnl Manual says this macro should come right after AC_INIT.
AC_CONFIG_HEADER(config.h)
# The default CFLAGS and CXXFLAGS in Autoconf are "-g -O2" for gcc and just
# "-g" for any other compiler. There doesn't seem to be a standard way of
# getting rid of the -g (which I don't think is needed for a production
# library). This fudge seems to achieve the necessary. First, we remember the
# externally set values of CFLAGS and CXXFLAGS. Then call the AC_PROG_CC and
# AC_PROG_CXX macros to find the compilers - if CFLAGS and CXXFLAGS are not
# set, they will be set to Autoconf's defaults. Afterwards, if the original
# values were not set, remove the -g from the Autoconf defaults.
# (PH 02-May-07)
dnl Default values for miscellaneous macros
POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=10
dnl Provide versioning information for libtool shared libraries that
dnl are built by default on Unix systems.
PCRE_LIB_VERSION=0:1:0
PCRE_POSIXLIB_VERSION=0:0:0
PCRE_CPPLIB_VERSION=0:0:0
dnl Find the PCRE version from the pcre.h file. The PCRE_VERSION variable is
dnl substituted in pcre-config.in.
PCRE_MAJOR=`grep '#define PCRE_MAJOR' ${srcdir}/pcre.h | cut -c 29-`
PCRE_MINOR=`grep '#define PCRE_MINOR' ${srcdir}/pcre.h | cut -c 29-`
PCRE_PRERELEASE=`grep '#define PCRE_PRERELEASE' ${srcdir}/pcre.h | cut -c 29-`
PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}${PCRE_PRERELEASE}
dnl Handle --disable-cpp
AC_ARG_ENABLE(cpp,
[ --disable-cpp disable C++ support],
want_cpp="$enableval", want_cpp=yes)
dnl Checks for programs.
remember_set_CFLAGS="$CFLAGS"
remember_set_CXXFLAGS="$CXXFLAGS"
AC_PROG_CC
dnl Test for C++ for the C++ wrapper libpcrecpp. It seems, however, that
dnl AC_PROG_CXX will set $CXX to "g++" when no C++ compiler is installed, even
dnl though that is completely bogus. (This may happen only on certain systems
dnl with certain versions of autoconf, of course.) An attempt to include this
dnl test inside a check for want_cpp was criticized by a libtool expert, who
dnl tells me that it isn't allowed.
AC_PROG_CXX
dnl The icc compiler has the same options as gcc, so let the rest of the
dnl configure script think it has gcc when setting up options etc.
dnl This is a nasty hack which no longer seems necessary with the update
dnl to the latest libtool files, so I have commented it out.
dnl
dnl if test "$CC" = "icc" ; then GCC=yes ; fi
if test "x$remember_set_CFLAGS" = "x"
then
if test "$CFLAGS" = "-g -O2"
then
CFLAGS="-O2"
elif test "$CFLAGS" = "-g"
then
CFLAGS=""
fi
fi
if test "x$remember_set_CXXFLAGS" = "x"
then
if test "$CXXFLAGS" = "-g -O2"
then
CXXFLAGS="-O2"
elif test "$CXXFLAGS" = "-g"
then
CXXFLAGS=""
fi
fi
# AC_PROG_CXX will return "g++" even if no c++ compiler is installed.
# Check for that case, and just disable c++ code if g++ doesn't run.
AC_LANG_PUSH(C++)
AC_COMPILE_IFELSE(AC_LANG_PROGRAM([],[]),, CXX=""; CXXCP=""; CXXFLAGS="")
AC_LANG_POP
AC_PROG_INSTALL
AC_LIBTOOL_WIN32_DLL
AC_PROG_LIBTOOL
AC_PROG_LN_S
dnl We need to find a compiler for compiling a program to run on the local host
dnl while building. It needs to be different from CC when cross-compiling.
dnl There is a macro called AC_PROG_CC_FOR_BUILD in the GNU archive for
dnl figuring this out automatically. Unfortunately, it does not work with the
dnl latest versions of autoconf. So for the moment, we just default to the
dnl same values as the "main" compiler. People who are cross-compiling will
dnl just have to adjust the Makefile by hand or set these values when they
dnl run "configure".
PCRE_MAJOR="pcre_major"
PCRE_MINOR="pcre_minor"
PCRE_PRERELEASE="pcre_prerelease"
PCRE_DATE="pcre_date"
# Defaults for the tools that compile helper programs run on the build host.
# Each *_FOR_BUILD variable keeps a value already supplied by the caller and
# otherwise falls back to the matching make variable, which is left unexpanded
# (single-quoted) so that make resolves it later.
CC_FOR_BUILD=${CC_FOR_BUILD:-'$(CC)'}
CXX_FOR_BUILD=${CXX_FOR_BUILD:-'$(CXX)'}
CFLAGS_FOR_BUILD=${CFLAGS_FOR_BUILD:-'$(CFLAGS)'}
# Default from CPPFLAGS_FOR_BUILD itself. Testing CFLAGS_FOR_BUILD here would
# always succeed, because the line above has just given it a value; that would
# copy the C flags into the preprocessor flags, discard any value the caller
# supplied, and never reach the $(CPPFLAGS) fallback.
CPPFLAGS_FOR_BUILD=${CPPFLAGS_FOR_BUILD:-'$(CPPFLAGS)'}
CXXFLAGS_FOR_BUILD=${CXXFLAGS_FOR_BUILD:-'$(CXXFLAGS)'}
BUILD_EXEEXT=${BUILD_EXEEXT:-'$(EXEEXT)'}
BUILD_OBJEXT=${BUILD_OBJEXT:-'$(OBJEXT)'}
AC_SUBST(PCRE_MAJOR)
AC_SUBST(PCRE_MINOR)
AC_SUBST(PCRE_PRERELEASE)
AC_SUBST(PCRE_DATE)
# Set a more sensible default value for $(htmldir).
if test "x$htmldir" = 'x${docdir}'
then
htmldir='${docdir}/html'
fi
# Handle --disable-cpp
AC_ARG_ENABLE(cpp,
AS_HELP_STRING([--disable-cpp],
[disable C++ support]),
, enable_cpp=yes)
# Handle --enable-rebuild-chartables
AC_ARG_ENABLE(rebuild-chartables,
AS_HELP_STRING([--enable-rebuild-chartables],
[rebuild character tables in current locale]),
, enable_rebuild_chartables=no)
# Handle --enable-utf8 (disabled by default)
AC_ARG_ENABLE(utf8,
AS_HELP_STRING([--enable-utf8],
[enable UTF-8 support (incompatible with --enable-ebcdic)]),
, enable_utf8=unset)
# Handle --enable-unicode-properties
AC_ARG_ENABLE(unicode-properties,
AS_HELP_STRING([--enable-unicode-properties],
[enable Unicode properties support (implies --enable-utf8)]),
, enable_unicode_properties=no)
# Handle --enable-newline=NL
dnl AC_ARG_ENABLE(newline,
dnl AS_HELP_STRING([--enable-newline=NL],
dnl [use NL as newline (lf, cr, crlf, anycrlf, any; default=lf)]),
dnl , enable_newline=lf)
# Separate newline options
ac_pcre_newline=lf
AC_ARG_ENABLE(newline-is-cr,
AS_HELP_STRING([--enable-newline-is-cr],
[use CR as newline character]),
ac_pcre_newline=cr)
AC_ARG_ENABLE(newline-is-lf,
AS_HELP_STRING([--enable-newline-is-lf],
[use LF as newline character (default)]),
ac_pcre_newline=lf)
AC_ARG_ENABLE(newline-is-crlf,
AS_HELP_STRING([--enable-newline-is-crlf],
[use CRLF as newline sequence]),
ac_pcre_newline=crlf)
AC_ARG_ENABLE(newline-is-anycrlf,
AS_HELP_STRING([--enable-newline-is-anycrlf],
[use CR, LF, or CRLF as newline sequence]),
ac_pcre_newline=anycrlf)
AC_ARG_ENABLE(newline-is-any,
AS_HELP_STRING([--enable-newline-is-any],
[use any valid Unicode newline sequence]),
ac_pcre_newline=any)
enable_newline="$ac_pcre_newline"
# Handle --enable-bsr-anycrlf
AC_ARG_ENABLE(bsr-anycrlf,
AS_HELP_STRING([--enable-bsr-anycrlf],
[\R matches only CR, LF, CRLF by default]),
, enable_bsr_anycrlf=no)
# Handle --enable-ebcdic
AC_ARG_ENABLE(ebcdic,
AS_HELP_STRING([--enable-ebcdic],
[assume EBCDIC coding rather than ASCII; incompatible with --enable-utf8; use only in (uncommon) EBCDIC environments; it implies --enable-rebuild-chartables]),
, enable_ebcdic=no)
# Handle --disable-stack-for-recursion
AC_ARG_ENABLE(stack-for-recursion,
AS_HELP_STRING([--disable-stack-for-recursion],
[don't use stack recursion when matching]),
, enable_stack_for_recursion=yes)
# Handle --enable-pcregrep-libz
AC_ARG_ENABLE(pcregrep-libz,
AS_HELP_STRING([--enable-pcregrep-libz],
[link pcregrep with libz to handle .gz files]),
, enable_pcregrep_libz=no)
# Handle --enable-pcregrep-libbz2
AC_ARG_ENABLE(pcregrep-libbz2,
AS_HELP_STRING([--enable-pcregrep-libbz2],
[link pcregrep with libbz2 to handle .bz2 files]),
, enable_pcregrep_libbz2=no)
# Handle --enable-pcretest-libreadline
AC_ARG_ENABLE(pcretest-libreadline,
AS_HELP_STRING([--enable-pcretest-libreadline],
[link pcretest with libreadline]),
, enable_pcretest_libreadline=no)
# Handle --with-posix-malloc-threshold=NBYTES
AC_ARG_WITH(posix-malloc-threshold,
AS_HELP_STRING([--with-posix-malloc-threshold=NBYTES],
[threshold for POSIX malloc usage (default=10)]),
, with_posix_malloc_threshold=10)
# Handle --with-link-size=N
AC_ARG_WITH(link-size,
AS_HELP_STRING([--with-link-size=N],
[internal link size (2, 3, or 4 allowed; default=2)]),
, with_link_size=2)
# Handle --with-match-limit=N
AC_ARG_WITH(match-limit,
AS_HELP_STRING([--with-match-limit=N],
[default limit on internal looping (default=10000000)]),
, with_match_limit=10000000)
# Handle --with-match-limit-recursion=N
#
# Note: In config.h, the default is to define MATCH_LIMIT_RECURSION
# symbolically as MATCH_LIMIT, which in turn is defined to be some numeric
# value (e.g. 10000000). MATCH_LIMIT_RECURSION can otherwise be set to some
# different numeric value (or even the same numeric value as MATCH_LIMIT,
# though no longer defined in terms of the latter).
#
AC_ARG_WITH(match-limit-recursion,
AS_HELP_STRING([--with-match-limit-recursion=N],
[default limit on internal recursion (default=MATCH_LIMIT)]),
, with_match_limit_recursion=MATCH_LIMIT)
# Make sure that if enable_unicode_properties was set, that UTF-8 support
# is enabled.
#
if test "x$enable_unicode_properties" = "xyes"
then
if test "x$enable_utf8" = "xno"
then
AC_MSG_ERROR([support for Unicode properties requires UTF-8 support])
fi
enable_utf8=yes
fi
if test "x$enable_utf8" = "xunset"
then
enable_utf8=no
fi
# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled.
# Also check that UTF-8 support is not requested, because PCRE cannot handle
# EBCDIC and UTF-8 in the same build. To do so it would need to use different
# character constants depending on the mode.
#
if test "x$enable_ebcdic" = "xyes"
then
enable_rebuild_chartables=yes
if test "x$enable_utf8" = "xyes"
then
AC_MSG_ERROR([support for EBCDIC and UTF-8 cannot be enabled at the same time])
fi
fi
# Convert the newline identifier into the appropriate integer value.
case "$enable_newline" in
lf) ac_pcre_newline_value=10 ;;
cr) ac_pcre_newline_value=13 ;;
crlf) ac_pcre_newline_value=3338 ;;
anycrlf) ac_pcre_newline_value=-2 ;;
any) ac_pcre_newline_value=-1 ;;
*)
AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option])
;;
esac
# Check argument to --with-link-size
case "$with_link_size" in
2|3|4) ;;
*)
AC_MSG_ERROR([invalid argument \"$with_link_size\" to --with-link-size option])
;;
esac
AH_TOP([
/* On Unix-like systems config.h.in is converted by "configure" into config.h.
Some other environments also support the use of "configure". PCRE is written in
Standard C, but there are a few non-standard things it can cope with, allowing
it to run on SunOS4 and other "close to standard" systems.
If you are going to build PCRE "by hand" on a system without "configure" you
should copy the distributed config.h.generic to config.h, and then set up the
macro definitions the way you need them. You must then add -DHAVE_CONFIG_H to
all of your compile commands, so that config.h is included at the start of
every source.
Alternatively, you can avoid editing by using -D on the compiler command line
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H.
PCRE uses memmove() if HAVE_MEMMOVE is set to 1; otherwise it uses bcopy() if
HAVE_BCOPY is set to 1. If your system has neither bcopy() nor memmove(), set
them both to 0; an emulation function will be used. */])
AC_DEFUN([AX_COMPILER_VENDOR],
[
@@ -116,217 +307,381 @@ if test "x${ax_cv_c_compiler_vendor}" = "xsun" ; then
fi
fi
dnl Checks for header files.
# Checks for header files.
AC_HEADER_STDC
AC_CHECK_HEADERS(limits.h)
AC_CHECK_HEADERS(limits.h sys/types.h sys/stat.h dirent.h windows.h)
dnl The files below are C++ header files. One person told me (PH) that
dnl AC_LANG_CPLUSPLUS unsets CXX if it was explicitly set to something which
dnl doesn't work. However, this doesn't always seem to be the case.
if test "x$want_cpp" = "xyes" -a -n "$CXX"
# The files below are C++ header files.
pcre_have_type_traits="0"
pcre_have_bits_type_traits="0"
if test "x$enable_cpp" = "xyes" -a -n "$CXX"
then
AC_LANG_SAVE
AC_LANG_CPLUSPLUS
AC_LANG_PUSH(C++)
dnl We could be more clever here, given we're doing AC_SUBST with this
dnl (eg set a var to be the name of the include file we want). But we're not
dnl so it's easy to change back to 'regular' autoconf vars if we needed to.
# Older versions of pcre defined pcrecpp::no_arg, but in new versions
# it's called pcrecpp::RE::no_arg. For backwards ABI compatibility,
# we want to make one an alias for the other. Different systems do
# this in different ways. Some systems, for instance, can do it via
# a linker flag: -alias (for os x 10.5) or -i (for os x <=10.4).
OLD_LDFLAGS="$LDFLAGS"
for flag in "-alias,__ZN7pcrecpp2RE6no_argE,__ZN7pcrecpp6no_argE" \
"-i__ZN7pcrecpp6no_argE:__ZN7pcrecpp2RE6no_argE"; do
AC_MSG_CHECKING([for alias support in the linker])
LDFLAGS="$OLD_LDFLAGS -Wl,$flag"
# We try to run the linker with this new ld flag. If the link fails,
# we give up and remove the new flag from LDFLAGS.
AC_LINK_IFELSE(AC_LANG_PROGRAM([namespace pcrecpp {
class RE { static int no_arg; };
int RE::no_arg;
}],
[]),
[AC_MSG_RESULT([yes]);
EXTRA_LIBPCRECPP_LDFLAGS="$EXTRA_LIBPCRECPP_LDFLAGS -Wl,$flag";
break;],
AC_MSG_RESULT([no]))
done
LDFLAGS="$OLD_LDFLAGS"
# We could be more clever here, given we're doing AC_SUBST with this
# (eg set a var to be the name of the include file we want). But we're not
# so it's easy to change back to 'regular' autoconf vars if we needed to.
AC_CHECK_HEADERS(string, [pcre_have_cpp_headers="1"],
[pcre_have_cpp_headers="0"])
AC_CHECK_HEADERS(bits/type_traits.h, [pcre_have_bits_type_traits="1"],
[pcre_have_bits_type_traits="0"])
AC_CHECK_HEADERS(type_traits.h, [pcre_have_type_traits="1"],
[pcre_have_type_traits="0"])
dnl Using AC_SUBST eliminates the need to include config.h in a public .h file
AC_SUBST(pcre_have_bits_type_traits)
AC_LANG_POP
fi
# Using AC_SUBST eliminates the need to include config.h in a public .h file
AC_SUBST(pcre_have_type_traits)
AC_LANG_RESTORE
fi
AC_SUBST(pcre_have_bits_type_traits)
dnl From the above, we now have enough info to know if C++ is fully installed
if test "x$want_cpp" = "xyes" -a -n "$CXX" -a "$pcre_have_cpp_headers" = 1; then
MAYBE_CPP_TARGETS='$(CPP_TARGETS)'
HAVE_CPP=
else
MAYBE_CPP_TARGETS=
HAVE_CPP="#"
fi
AC_SUBST(MAYBE_CPP_TARGETS)
AC_SUBST(HAVE_CPP)
# Conditional compilation
AM_CONDITIONAL(WITH_PCRE_CPP, test "x$enable_cpp" = "xyes")
AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes")
dnl Checks for typedefs, structures, and compiler characteristics.
# Checks for typedefs, structures, and compiler characteristics.
AC_C_CONST
AC_TYPE_SIZE_T
AC_CHECK_TYPES([long long], [pcre_have_long_long="1"], [pcre_have_long_long="0"])
AC_CHECK_TYPES([unsigned long long], [pcre_have_ulong_long="1"], [pcre_have_ulong_long="0"])
pcre_have_strotolonglong=0
AC_CHECK_FUNCS(strtoq strtoll _strtoi64, [pcre_have_strotolonglong="1"; break])
# If we can't convert a string to a long long, pretend we don't even
# have a long long.
if test $pcre_have_strotolonglong = "0"; then
pcre_have_long_long="0"
pcre_have_ulong_long="0"
else
AC_CHECK_TYPES([long long],
[pcre_have_long_long="1"],
[pcre_have_long_long="0"])
AC_CHECK_TYPES([unsigned long long],
[pcre_have_ulong_long="1"],
[pcre_have_ulong_long="0"])
fi
AC_SUBST(pcre_have_long_long)
AC_SUBST(pcre_have_ulong_long)
dnl Checks for library functions.
# Checks for library functions.
AC_CHECK_FUNCS(bcopy memmove strerror strtoq strtoll)
AC_CHECK_FUNCS(bcopy memmove strerror)
dnl Handle --enable-utf8
# Check for the availability of libz (aka zlib)
AC_ARG_ENABLE(utf8,
[ --enable-utf8 enable UTF8 support],
if test "$enableval" = "yes"; then
UTF8=-DSUPPORT_UTF8
fi
)
AC_CHECK_HEADERS([zlib.h], [HAVE_ZLIB_H=1])
AC_CHECK_LIB([z], [gzopen], [HAVE_LIBZ=1])
dnl Handle --enable-unicode-properties
# Check for the availability of libbz2
AC_ARG_ENABLE(unicode-properties,
[ --enable-unicode-properties enable Unicode properties support],
if test "$enableval" = "yes"; then
UCP=-DSUPPORT_UCP
fi
)
AC_CHECK_HEADERS([bzlib.h], [HAVE_BZLIB_H=1])
AC_CHECK_LIB([bz2], [BZ2_bzopen], [HAVE_LIBBZ2=1])
dnl Handle --enable-newline-is-cr
# Check for the availability of libreadline
AC_ARG_ENABLE(newline-is-cr,
[ --enable-newline-is-cr use CR as the newline character],
if test "$enableval" = "yes"; then
NEWLINE=-DNEWLINE=13
fi
)
AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_H=1])
AC_CHECK_HEADERS([readline/history.h], [HAVE_HISTORY_H=1])
AC_CHECK_LIB([readline], [readline], [HAVE_LIB_READLINE=1])
dnl Handle --enable-newline-is-lf
AC_ARG_ENABLE(newline-is-lf,
[ --enable-newline-is-lf use LF as the newline character],
if test "$enableval" = "yes"; then
NEWLINE=-DNEWLINE=10
fi
)
dnl Handle --enable-newline-is-crlf
AC_ARG_ENABLE(newline-is-crlf,
[ --enable-newline-is-crlf use CRLF as the newline sequence],
if test "$enableval" = "yes"; then
NEWLINE=-DNEWLINE=3338
fi
)
dnl Handle --enable-ebcdic
AC_ARG_ENABLE(ebcdic,
[ --enable-ebcdic assume EBCDIC coding rather than ASCII],
if test "$enableval" == "yes"; then
EBCDIC=-DEBCDIC=1
fi
)
dnl Handle --disable-stack-for-recursion
AC_ARG_ENABLE(stack-for-recursion,
[ --disable-stack-for-recursion disable use of stack recursion when matching],
if test "$enableval" = "no"; then
NO_RECURSE=-DNO_RECURSE
fi
)
dnl There doesn't seem to be a straightforward way of having parameters
dnl that set values, other than fudging the --with thing. So that's what
dnl I've done.
dnl Handle --with-posix-malloc-threshold=n
AC_ARG_WITH(posix-malloc-threshold,
[ --with-posix-malloc-threshold=10 threshold for POSIX malloc usage],
POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=$withval
)
dnl Handle --with-link-size=n
AC_ARG_WITH(link-size,
[ --with-link-size=2 internal link size (2, 3, or 4 allowed)],
LINK_SIZE=-DLINK_SIZE=$withval
)
dnl Handle --with-match-limit=n
AC_ARG_WITH(match-limit,
[ --with-match-limit=10000000 default limit on internal looping],
MATCH_LIMIT=-DMATCH_LIMIT=$withval
)
dnl Handle --with-match-limit_recursion=n
AC_ARG_WITH(match-limit-recursion,
[ --with-match-limit-recursion=10000000 default limit on internal recursion],
MATCH_LIMIT_RECURSION=-DMATCH_LIMIT_RECURSION=$withval
)
dnl Unicode character property support implies UTF-8 support
if test "$UCP" != "" ; then
UTF8=-DSUPPORT_UTF8
fi
dnl "Export" these variables
AC_SUBST(BUILD_EXEEXT)
AC_SUBST(BUILD_OBJEXT)
AC_SUBST(CC_FOR_BUILD)
AC_SUBST(CXX_FOR_BUILD)
AC_SUBST(CFLAGS_FOR_BUILD)
AC_SUBST(CXXFLAGS_FOR_BUILD)
AC_SUBST(CXXLDFLAGS)
AC_SUBST(EBCDIC)
AC_SUBST(HAVE_MEMMOVE)
AC_SUBST(HAVE_STRERROR)
AC_SUBST(LINK_SIZE)
AC_SUBST(MATCH_LIMIT)
AC_SUBST(MATCH_LIMIT_RECURSION)
AC_SUBST(NEWLINE)
AC_SUBST(NO_RECURSE)
AC_SUBST(PCRE_LIB_VERSION)
AC_SUBST(PCRE_POSIXLIB_VERSION)
AC_SUBST(PCRE_CPPLIB_VERSION)
AC_SUBST(PCRE_VERSION)
AC_SUBST(POSIX_MALLOC_THRESHOLD)
AC_SUBST(UCP)
AC_SUBST(UTF8)
dnl Stuff to make MinGW work better. Special treatment is no longer
dnl needed for Cygwin.
case $host_os in
mingw* )
POSIX_OBJ=pcreposix.o
POSIX_LOBJ=pcreposix.lo
POSIX_LIB=
ON_WINDOWS=
NOT_ON_WINDOWS="#"
WIN_PREFIX=
;;
* )
ON_WINDOWS="#"
NOT_ON_WINDOWS=
POSIX_OBJ=
POSIX_LOBJ=
POSIX_LIB=libpcreposix.la
WIN_PREFIX=
;;
esac
AC_SUBST(WIN_PREFIX)
AC_SUBST(ON_WINDOWS)
AC_SUBST(NOT_ON_WINDOWS)
AC_SUBST(POSIX_OBJ)
AC_SUBST(POSIX_LOBJ)
AC_SUBST(POSIX_LIB)
# This facilitates -ansi builds under Linux
dnl AC_DEFINE([_GNU_SOURCE], [], [Enable GNU extensions in glibc])
if test "x$enable_shared" = "xno" ; then
AC_DEFINE([PCRE_STATIC],[1],[to link statically])
AC_DEFINE([PCRE_STATIC], [1], [
Define if linking statically (TODO: make nice with Libtool)])
fi
dnl This must be last; it determines what files are written as well as config.h
AC_OUTPUT(Makefile pcre-config:pcre-config.in libpcre.pc:libpcre.pc.in pcrecpparg.h:pcrecpparg.h.in pcre_stringpiece.h:pcre_stringpiece.h.in RunGrepTest:RunGrepTest.in RunTest:RunTest.in,[chmod a+x RunTest RunGrepTest pcre-config])
# Here is where pcre specific defines are handled
if test "$enable_utf8" = "yes"; then
AC_DEFINE([SUPPORT_UTF8], [], [
Define to enable support for the UTF-8 Unicode encoding. This will
work even in an EBCDIC environment, but it is incompatible with
the EBCDIC macro. That is, PCRE can support *either* EBCDIC code
*or* ASCII/UTF-8, but not both at once.])
fi
if test "$enable_unicode_properties" = "yes"; then
AC_DEFINE([SUPPORT_UCP], [], [
Define to enable support for Unicode properties])
fi
if test "$enable_stack_for_recursion" = "no"; then
AC_DEFINE([NO_RECURSE], [], [
PCRE uses recursive function calls to handle backtracking while
matching. This can sometimes be a problem on systems that have
stacks of limited size. Define NO_RECURSE to get a version that
doesn't use recursion in the match() function; instead it creates
its own stack by steam using pcre_recurse_malloc() to obtain memory
from the heap. For more detail, see the comments and other stuff
just above the match() function. On systems that support it,
"configure" can be used to set this in the Makefile
(use --disable-stack-for-recursion).])
fi
if test "$enable_pcregrep_libz" = "yes"; then
AC_DEFINE([SUPPORT_LIBZ], [], [
Define to allow pcregrep to be linked with libz, so that it is
able to handle .gz files.])
fi
if test "$enable_pcregrep_libbz2" = "yes"; then
AC_DEFINE([SUPPORT_LIBBZ2], [], [
Define to allow pcregrep to be linked with libbz2, so that it is
able to handle .bz2 files.])
fi
if test "$enable_pcretest_libreadline" = "yes"; then
AC_DEFINE([SUPPORT_LIBREADLINE], [], [
Define to allow pcretest to be linked with libreadline.])
fi
AC_DEFINE_UNQUOTED([NEWLINE], [$ac_pcre_newline_value], [
The value of NEWLINE determines the newline character sequence. On
systems that support it, "configure" can be used to override the
default, which is 10. The possible values are 10 (LF), 13 (CR),
3338 (CRLF), -1 (ANY), or -2 (ANYCRLF).])
if test "$enable_bsr_anycrlf" = "yes"; then
AC_DEFINE([BSR_ANYCRLF], [], [
By default, the \R escape sequence matches any Unicode line ending
character or sequence of characters. If BSR_ANYCRLF is defined, this is
changed so that backslash-R matches only CR, LF, or CRLF. The build-
time default can be overridden by the user of PCRE at runtime. On
systems that support it, "configure" can be used to override the
default.])
fi
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
The value of LINK_SIZE determines the number of bytes used to store
links as offsets within the compiled regex. The default is 2, which
allows for compiled patterns up to 64K long. This covers the vast
majority of cases. However, PCRE can also be compiled to use 3 or 4
bytes instead. This allows for longer patterns in extreme cases. On
systems that support it, "configure" can be used to override this default.])
AC_DEFINE_UNQUOTED([POSIX_MALLOC_THRESHOLD], [$with_posix_malloc_threshold], [
When calling PCRE via the POSIX interface, additional working storage
is required for holding the pointers to capturing substrings because
PCRE requires three integers per substring, whereas the POSIX
interface provides only two. If the number of expected substrings is
small, the wrapper function uses space on the stack, because this is
faster than using malloc() for each call. The threshold above which
the stack is no longer used is defined by POSIX_MALLOC_THRESHOLD. On
systems that support it, "configure" can be used to override this
default.])
AC_DEFINE_UNQUOTED([MATCH_LIMIT], [$with_match_limit], [
The value of MATCH_LIMIT determines the default number of times the
internal match() function can be called during a single execution of
pcre_exec(). There is a runtime interface for setting a different
limit. The limit exists in order to catch runaway regular
expressions that take for ever to determine that they do not match.
The default is set very large so that it does not accidentally catch
legitimate cases. On systems that support it, "configure" can be
used to override this default.])
AC_DEFINE_UNQUOTED([MATCH_LIMIT_RECURSION], [$with_match_limit_recursion], [
The above limit applies to all calls of match(), whether or not they
increase the recursion depth. In some environments it is desirable
to limit the depth of recursive calls of match() more strictly, in
order to restrict the maximum amount of stack (or heap, if
NO_RECURSE is defined) that is used. The value of
MATCH_LIMIT_RECURSION applies only to recursive calls of match(). To
have any useful effect, it must be less than the value of
MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT.
There is a runtime method for setting a different limit. On systems
that support it, "configure" can be used to override the default.])
AC_DEFINE([MAX_NAME_SIZE], [32], [
This limit is parameterized just in case anybody ever wants to
change it. Care must be taken if it is increased, because it guards
against integer overflow caused by enormously large patterns.])
AC_DEFINE([MAX_NAME_COUNT], [10000], [
This limit is parameterized just in case anybody ever wants to
change it. Care must be taken if it is increased, because it guards
against integer overflow caused by enormously large patterns.])
AH_VERBATIM([PCRE_EXP_DEFN], [
/* If you are compiling for a system other than a Unix-like system or
Win32, and it needs some magic to be inserted before the definition
of a function that is exported by the library, define this macro to
contain the relevant magic. If you do not define this macro, it
defaults to "extern" for a C compiler and "extern C" for a C++
compiler on non-Win32 systems. This macro appears at the start of
every exported function that is part of the external API. It does
not appear on functions that are "external" in the C sense, but
which are internal to the library. */
#undef PCRE_EXP_DEFN])
if test "$enable_ebcdic" = "yes"; then
AC_DEFINE_UNQUOTED([EBCDIC], [], [
If you are compiling for a system that uses EBCDIC instead of ASCII
character codes, define this macro as 1. On systems that can use
"configure", this can be done via --enable-ebcdic. PCRE will then
assume that all input strings are in EBCDIC. If you do not define
this macro, PCRE will assume input strings are ASCII or UTF-8 Unicode.
It is not possible to build a version of PCRE that supports both
EBCDIC and UTF-8.])
fi
# Platform specific issues
NO_UNDEFINED=
EXPORT_ALL_SYMBOLS=
case $host_os in
cygwin* | mingw* )
if test X"$enable_shared" = Xyes; then
NO_UNDEFINED="-no-undefined"
EXPORT_ALL_SYMBOLS="-Wl,--export-all-symbols"
fi
;;
esac
# The extra LDFLAGS for each particular library
# (Note: The libpcre*_version bits are m4 variables, assigned above)
EXTRA_LIBPCRE_LDFLAGS="$EXTRA_LIBPCRE_LDFLAGS \
$NO_UNDEFINED -version-info libpcre_version"
EXTRA_LIBPCREPOSIX_LDFLAGS="$EXTRA_LIBPCREPOSIX_LDFLAGS \
$NO_UNDEFINED -version-info libpcreposix_version"
EXTRA_LIBPCRECPP_LDFLAGS="$EXTRA_LIBPCRECPP_LDFLAGS \
$NO_UNDEFINED -version-info libpcrecpp_version \
$EXPORT_ALL_SYMBOLS"
AC_SUBST(EXTRA_LIBPCRE_LDFLAGS)
AC_SUBST(EXTRA_LIBPCREPOSIX_LDFLAGS)
AC_SUBST(EXTRA_LIBPCRECPP_LDFLAGS)
# When we run 'make distcheck', use these arguments.
DISTCHECK_CONFIGURE_FLAGS="--enable-cpp --enable-unicode-properties"
AC_SUBST(DISTCHECK_CONFIGURE_FLAGS)
# Check that, if --enable-pcregrep-libz or --enable-pcregrep-libbz2 is
# specified, the relevant library is available.
if test "$enable_pcregrep_libz" = "yes"; then
if test "$HAVE_ZLIB_H" != "1"; then
echo "** Cannot --enable-pcregrep-libz because zlib.h was not found"
exit 1
fi
if test "$HAVE_LIBZ" != "1"; then
echo "** Cannot --enable-pcregrep-libz because libz was not found"
exit 1
fi
LIBZ="-lz"
fi
AC_SUBST(LIBZ)
if test "$enable_pcregrep_libbz2" = "yes"; then
if test "$HAVE_BZLIB_H" != "1"; then
echo "** Cannot --enable-pcregrep-libbz2 because bzlib.h was not found"
exit 1
fi
if test "$HAVE_LIBBZ2" != "1"; then
echo "** Cannot --enable-pcregrep-libbz2 because libbz2 was not found"
exit 1
fi
LIBBZ2="-lbz2"
fi
AC_SUBST(LIBBZ2)
# Similarly for --enable-pcretest-libreadline: refuse the option unless both
# readline headers and the readline library were found. The HAVE_* shell
# variables tested here are set to 1 by the header and library probes
# earlier in this script and are otherwise unset.
if test "$enable_pcretest_libreadline" = "yes"; then
  if test "$HAVE_READLINE_H" != "1"; then
    echo "** Cannot --enable-pcretest-libreadline because readline/readline.h was not found."
    exit 1
  fi
  if test "$HAVE_HISTORY_H" != "1"; then
    echo "** Cannot --enable-pcretest-libreadline because readline/history.h was not found."
    exit 1
  fi
  # Match the libz and libbz2 checks: headers alone are not enough, because
  # -lreadline is added to the link line below and the link would fail later
  # if the library is missing.
  if test "$HAVE_LIB_READLINE" != "1"; then
    echo "** Cannot --enable-pcretest-libreadline because libreadline was not found."
    exit 1
  fi
  LIBREADLINE="-lreadline"
fi
AC_SUBST(LIBREADLINE)
# Produce these files, in addition to config.h.
AC_CONFIG_FILES(
Makefile
libpcre.pc
libpcrecpp.pc
pcre-config
pcre.h
pcre_stringpiece.h
pcrecpparg.h
)
# Make the generated script files executable.
AC_CONFIG_COMMANDS([script-chmod], [chmod a+x pcre-config])
# Make sure that pcre_chartables.c is removed in case the method for
# creating it was changed by reconfiguration.
AC_CONFIG_COMMANDS([delete-old-chartables], [rm -f pcre_chartables.c])
AC_OUTPUT
# Print out a nice little message after configure is run displaying your
# chosen options.
cat <<EOF
$PACKAGE-$VERSION configuration summary:
Install prefix .................. : ${prefix}
C preprocessor .................. : ${CPP}
C compiler ...................... : ${CC}
C++ preprocessor ................ : ${CXXCPP}
C++ compiler .................... : ${CXX}
Linker .......................... : ${LD}
C preprocessor flags ............ : ${CPPFLAGS}
C compiler flags ................ : ${CFLAGS}
C++ compiler flags .............. : ${CXXFLAGS}
Linker flags .................... : ${LDFLAGS}
Extra libraries ................. : ${LIBS}
Build C++ library ............... : ${enable_cpp}
Enable UTF-8 support ............ : ${enable_utf8}
Unicode properties .............. : ${enable_unicode_properties}
Newline char/sequence ........... : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
EBCDIC coding ................... : ${enable_ebcdic}
Rebuild char tables ............. : ${enable_rebuild_chartables}
Use stack recursion ............. : ${enable_stack_for_recursion}
POSIX mem threshold ............. : ${with_posix_malloc_threshold}
Internal link size .............. : ${with_link_size}
Match limit ..................... : ${with_match_limit}
Match limit recursion ........... : ${with_match_limit_recursion}
Build shared libs ............... : ${enable_shared}
Build static libs ............... : ${enable_static}
Link pcregrep with libz ......... : ${enable_pcregrep_libz}
Link pcregrep with libbz2 ....... : ${enable_pcregrep_libbz2}
Link pcretest with libreadline .. : ${enable_pcretest_libreadline}
EOF
dnl end configure.ac

589
libs/pcre/depcomp Executable file
View File

@@ -0,0 +1,589 @@
#! /bin/sh
# depcomp - compile a program generating dependencies as side-effects
scriptversion=2007-03-29.01
# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007 Free Software
# Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.
# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
case $1 in
'')
echo "$0: No command. Try \`$0 --help' for more information." 1>&2
exit 1;
;;
-h | --h*)
cat <<\EOF
Usage: depcomp [--help] [--version] PROGRAM [ARGS]
Run PROGRAMS ARGS to compile a file, generating dependencies
as side-effects.
Environment variables:
depmode Dependency tracking mode.
source Source file read by `PROGRAMS ARGS'.
object Object file output by `PROGRAMS ARGS'.
DEPDIR directory where to store dependencies.
depfile Dependency file to output.
tmpdepfile Temporary file to use when outputing dependencies.
libtool Whether libtool is used (yes/no).
Report bugs to <bug-automake@gnu.org>.
EOF
exit $?
;;
-v | --v*)
echo "depcomp $scriptversion"
exit $?
;;
esac
if test -z "$depmode" || test -z "$source" || test -z "$object"; then
echo "depcomp: Variables source, object and depmode must be set" 1>&2
exit 1
fi
# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
depfile=${depfile-`echo "$object" |
sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
rm -f "$tmpdepfile"
# Some modes work just like other modes, but use different flags. We
# parameterize here, but still list the modes in the big case below,
# to make depend.m4 easier to write. Note that we *cannot* use a case
# here, because this file can only contain one case statement.
if test "$depmode" = hp; then
# HP compiler uses -M and no extra arg.
gccflag=-M
depmode=gcc
fi
if test "$depmode" = dashXmstdout; then
# This is just like dashmstdout with a different argument.
dashmflag=-xM
depmode=dashmstdout
fi
case "$depmode" in
gcc3)
  ## gcc 3 can write the dependency file itself, so this mode only has to
  ## splice the dependency flags into the compile command and run it.
  ## The flags go immediately before -c, which is where depend2.am puts
  ## them: libtool 1.4 objects when -MD -MP comes after -MF, and FreeBSD
  ## c89 accepts or rejects flags depending on argument order.
  ## The loop below costs time only in configure; in makefiles the
  ## %FASTDEP% shortcut bypasses this script.
  for arg
  do
    # Rebuild "$@" one argument at a time: append the (possibly expanded)
    # argument at the end, then drop the placeholder word and the original
    # copy of the argument from the front.
    if test "x$arg" = "x-c"; then
      set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg"
    else
      set fnord "$@" "$arg"
    fi
    shift # fnord
    shift # $arg
  done

  # Run the compiler; on failure discard the partial dependency output and
  # propagate the compiler's exit status.
  "$@"
  stat=$?
  if test $stat -ne 0; then
    rm -f "$tmpdepfile"
    exit $stat
  fi
  mv "$tmpdepfile" "$depfile"
  ;;
gcc)
## There are various ways to get dependency output from gcc. Here's
## why we pick this rather obscure method:
## - Don't want to use -MD because we'd like the dependencies to end
## up in a subdir. Having to rename by hand is ugly.
## (We might end up doing this anyway to support other compilers.)
## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
## -MM, not -M (despite what the docs say).
## - Using -M directly means running the compiler twice (even worse
## than renaming).
if test -z "$gccflag"; then
gccflag=-MD,
fi
"$@" -Wp,"$gccflag$tmpdepfile"
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile"
exit $stat
fi
rm -f "$depfile"
echo "$object : \\" > "$depfile"
alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
## The second -e expression handles DOS-style file names with drive letters.
sed -e 's/^[^:]*: / /' \
-e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
## This next piece of magic avoids the `deleted header file' problem.
## The problem is that when a header file which appears in a .P file
## is deleted, the dependency causes make to die (because there is
## typically no way to rebuild the header). We avoid this by adding
## dummy dependencies for each header file. Too bad gcc doesn't do
## this for us directly.
tr ' ' '
' < "$tmpdepfile" |
## Some versions of gcc put a space before the `:'. On the theory
## that the space means something, we add a space to the output as
## well.
## Some versions of the HPUX 10.20 sed can't process this invocation
## correctly. Breaking it into two sed invocations is a workaround.
sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
rm -f "$tmpdepfile"
;;
hp)
# This case exists only to let depend.m4 do its work. It works by
# looking at the text of this script. This case will never be run,
# since it is checked for above.
exit 1
;;
sgi)
  # Run the compiler with -MDupdate so that it writes dependencies to
  # $tmpdepfile as a side effect; under libtool the flag and file name are
  # passed together as a single -Wp,... argument.
  if test "$libtool" = yes; then
    "$@" "-Wp,-MDupdate,$tmpdepfile"
  else
    "$@" -MDupdate "$tmpdepfile"
  fi
  stat=$?
  if test $stat -eq 0; then :
  else
    rm -f "$tmpdepfile"
    exit $stat
  fi
  rm -f "$depfile"

  if test -f "$tmpdepfile"; then  # yes, the sourcefile depend on other files
    echo "$object : \\" > "$depfile"

    # Clip off the initial element (the dependent). Don't try to be
    # clever and replace this with sed code, as IRIX sed won't handle
    # lines with more than a fixed number of characters (4096 in
    # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines;
    # the IRIX cc adds comments like `#:fec' to the end of the
    # dependency line.
    #
    # Every redirection target is quoted, like the other uses of $depfile
    # in this script, so that a path containing whitespace or glob
    # characters is still written to as a single file.
    tr ' ' '
' < "$tmpdepfile" \
    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
    tr '
' ' ' >> "$depfile"
    echo >> "$depfile"

    # The second pass generates a dummy entry for each header file.
    tr ' ' '
' < "$tmpdepfile" \
    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
    >> "$depfile"
  else
    # The sourcefile does not contain any dependencies, so just
    # store a dummy comment line, to avoid errors with the Makefile
    # "include basename.Plo" scheme.
    echo "#dummy" > "$depfile"
  fi
  rm -f "$tmpdepfile"
  ;;
aix)
# The C for AIX Compiler uses -M and outputs the dependencies
# in a .u file. In older versions, this file always lives in the
# current directory. Also, the AIX compiler puts `$object:' at the
# start of each line; $object doesn't have directory information.
# Version 6 uses the directory in both cases.
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
test "x$dir" = "x$object" && dir=
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
if test "$libtool" = yes; then
tmpdepfile1=$dir$base.u
tmpdepfile2=$base.u
tmpdepfile3=$dir.libs/$base.u
"$@" -Wc,-M
else
tmpdepfile1=$dir$base.u
tmpdepfile2=$dir$base.u
tmpdepfile3=$dir$base.u
"$@" -M
fi
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
exit $stat
fi
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
do
test -f "$tmpdepfile" && break
done
if test -f "$tmpdepfile"; then
# Each line is of the form `foo.o: dependent.h'.
# Do two passes, one to just change these to
# `$object: dependent.h' and one to simply `dependent.h:'.
sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
# That's a tab and a space in the [].
sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
else
# The sourcefile does not contain any dependencies, so just
# store a dummy comment line, to avoid errors with the Makefile
# "include basename.Plo" scheme.
echo "#dummy" > "$depfile"
fi
rm -f "$tmpdepfile"
;;
icc)
# Intel's C compiler understands `-MD -MF file'. However on
# icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c
# ICC 7.0 will fill foo.d with something like
# foo.o: sub/foo.c
# foo.o: sub/foo.h
# which is wrong. We want:
# sub/foo.o: sub/foo.c
# sub/foo.o: sub/foo.h
# sub/foo.c:
# sub/foo.h:
# ICC 7.1 will output
# foo.o: sub/foo.c sub/foo.h
# and will wrap long lines using \ :
# foo.o: sub/foo.c ... \
# sub/foo.h ... \
# ...
"$@" -MD -MF "$tmpdepfile"
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile"
exit $stat
fi
rm -f "$depfile"
# Each line is of the form `foo.o: dependent.h',
# or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
# Do two passes, one to just change these to
# `$object: dependent.h' and one to simply `dependent.h:'.
sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
# Some versions of the HPUX 10.20 sed can't process this invocation
# correctly. Breaking it into two sed invocations is a workaround.
sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" |
sed -e 's/$/ :/' >> "$depfile"
rm -f "$tmpdepfile"
;;
hp2)
# The "hp" stanza above does not work with aCC (C++) and HP's ia64
# compilers, which have integrated preprocessors. The correct option
# to use with these is +Maked; it writes dependencies to a file named
# 'foo.d', which lands next to the object file, wherever that
# happens to be.
# Much of this is similar to the tru64 case; see comments there.
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
test "x$dir" = "x$object" && dir=
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
if test "$libtool" = yes; then
tmpdepfile1=$dir$base.d
tmpdepfile2=$dir.libs/$base.d
"$@" -Wc,+Maked
else
tmpdepfile1=$dir$base.d
tmpdepfile2=$dir$base.d
"$@" +Maked
fi
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile1" "$tmpdepfile2"
exit $stat
fi
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
do
test -f "$tmpdepfile" && break
done
if test -f "$tmpdepfile"; then
sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
# Add `dependent.h:' lines.
sed -ne '2,${; s/^ *//; s/ \\*$//; s/$/:/; p;}' "$tmpdepfile" >> "$depfile"
else
echo "#dummy" > "$depfile"
fi
rm -f "$tmpdepfile" "$tmpdepfile2"
;;
tru64)
# The Tru64 compiler uses -MD to generate dependencies as a side
# effect. `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'.
# At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
# dependencies in `foo.d' instead, so we check for that too.
# Subdirectories are respected.
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
test "x$dir" = "x$object" && dir=
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
if test "$libtool" = yes; then
# With Tru64 cc, shared objects can also be used to make a
# static library. This mechanism is used in libtool 1.4 series to
# handle both shared and static libraries in a single compilation.
# With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
#
# With libtool 1.5 this exception was removed, and libtool now
# generates 2 separate objects for the 2 libraries. These two
# compilations output dependencies in $dir.libs/$base.o.d and
# in $dir$base.o.d. We have to check for both files, because
# one of the two compilations can be disabled. We should prefer
# $dir$base.o.d over $dir.libs/$base.o.d because the latter is
# automatically cleaned when .libs/ is deleted, while ignoring
# the former would cause a distcleancheck panic.
tmpdepfile1=$dir.libs/$base.lo.d # libtool 1.4
tmpdepfile2=$dir$base.o.d # libtool 1.5
tmpdepfile3=$dir.libs/$base.o.d # libtool 1.5
tmpdepfile4=$dir.libs/$base.d # Compaq CCC V6.2-504
"$@" -Wc,-MD
else
tmpdepfile1=$dir$base.o.d
tmpdepfile2=$dir$base.d
tmpdepfile3=$dir$base.d
tmpdepfile4=$dir$base.d
"$@" -MD
fi
stat=$?
if test $stat -eq 0; then :
else
rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
exit $stat
fi
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
do
test -f "$tmpdepfile" && break
done
if test -f "$tmpdepfile"; then
sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
# That's a tab and a space in the [].
sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
else
echo "#dummy" > "$depfile"
fi
rm -f "$tmpdepfile"
;;
#nosideeffect)
# This comment above is used by automake to tell side-effect
# dependency tracking mechanisms from slower ones.
dashmstdout)
  # Dependency extraction for compilers that print dependencies on stdout
  # when given a flag ($dashmflag, defaulting to -M below).
  #
  # Important note: in order to support this mode, a compiler *must*
  # always write the preprocessed file to stdout, regardless of -o.

  # Run the real compilation first; propagate its failure unchanged.
  "$@" || exit $?

  # Remove the call to Libtool.
  if test "$libtool" = yes; then
    # "$1" is quoted and X-prefixed so that an argument containing
    # whitespace, glob characters or a test(1) operator cannot break the
    # comparison; the $# guard stops the loop if the marker is never found.
    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
      shift
    done
    shift
  fi

  # Remove `-o $object'.
  # Each iteration drops the current first positional parameter; every
  # argument other than -o and $object is re-appended at the end, so the
  # list is rebuilt in its original order.
  IFS=" "
  for arg
  do
    case $arg in
    -o)
      shift
      ;;
    $object)
      shift
      ;;
    *)
      set fnord "$@" "$arg"
      shift # fnord
      shift # $arg
      ;;
    esac
  done

  test -z "$dashmflag" && dashmflag=-M
  # Require at least two characters before searching for `:'
  # in the target name. This is to cope with DOS-style filenames:
  # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise.
  "$@" $dashmflag |
    sed 's:^[ ]*[^: ][^:][^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile"
  rm -f "$depfile"
  cat < "$tmpdepfile" > "$depfile"
  # Second pass: put each word on its own line, drop continuation-only,
  # empty and target lines, and append every remaining name as a rule
  # with no prerequisites ("name :").
  # The closing quote of the tr argument must stay at the start of the line:
  # the quoted string is exactly one newline character.
  tr ' ' '
' < "$tmpdepfile" | \
## Some versions of the HPUX 10.20 sed can't process this invocation
## correctly. Breaking it into two sed invocations is a workaround.
    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
  rm -f "$tmpdepfile"
  ;;
dashXmstdout)
# This case only exists to satisfy depend.m4. It is never actually
# run, as this mode is specially recognized in the preamble.
# If control does reach this branch anyway, the script fails with status 1.
exit 1
;;
makedepend)
  # Dependency extraction through an external makedepend program
  # (overridable via $MAKEDEPEND).

  # Run the real compilation first; propagate its failure unchanged.
  "$@" || exit $?

  # Remove any Libtool call
  if test "$libtool" = yes; then
    # "$1" is quoted and X-prefixed so that an argument containing
    # whitespace, glob characters or a test(1) operator cannot break the
    # comparison; the $# guard stops the loop if the marker is never found.
    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
      shift
    done
    shift
  fi

  # X makedepend
  shift
  # The word list of the loop below is expanded once, before the first
  # iteration; the first iteration then empties the positional parameters
  # so that they can be rebuilt from the arguments that are kept.
  cleared=no
  for arg in "$@"; do
    case $cleared in
    no)
      set ""; shift
      cleared=yes ;;
    esac
    case "$arg" in
    -D*|-I*)
      set fnord "$@" "$arg"; shift ;;
    # Strip any option that makedepend may not understand. Remove
    # the object too, otherwise makedepend will parse it as a source file.
    -*|$object)
      ;;
    *)
      set fnord "$@" "$arg"; shift ;;
    esac
  done

  # Keep only the suffix of the object name (from its last dot). $object is
  # quoted so that whitespace or glob characters in the path reach sed
  # unchanged instead of being split or expanded by the shell.
  obj_suffix=`echo "$object" | sed 's/^.*\././'`
  touch "$tmpdepfile"
  ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
  rm -f "$depfile"
  cat < "$tmpdepfile" > "$depfile"
  # Second pass: skip the first two lines of the makedepend output, put each
  # word on its own line, drop continuation-only, empty and target lines, and
  # append every remaining name as a rule with no prerequisites ("name :").
  # The closing quote of the tr argument must stay at the start of the line:
  # the quoted string is exactly one newline character.
  sed '1,2d' "$tmpdepfile" | tr ' ' '
' | \
## Some versions of the HPUX 10.20 sed can't process this invocation
## correctly. Breaking it into two sed invocations is a workaround.
    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
  rm -f "$tmpdepfile" "$tmpdepfile".bak
  ;;
cpp)
  # Dependency extraction from preprocessor output: the command is re-run
  # with -E and the file names in its `# N "file"' / `#line N "file"'
  # markers are collected.
  #
  # Important note: in order to support this mode, a compiler *must*
  # always write the preprocessed file to stdout.

  # Run the real compilation first; propagate its failure unchanged.
  "$@" || exit $?

  # Remove the call to Libtool.
  if test "$libtool" = yes; then
    # "$1" is quoted and X-prefixed so that an argument containing
    # whitespace, glob characters or a test(1) operator cannot break the
    # comparison; the $# guard stops the loop if the marker is never found.
    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
      shift
    done
    shift
  fi

  # Remove `-o $object'.
  # Each iteration drops the current first positional parameter; every
  # argument other than -o and $object is re-appended at the end, so the
  # list is rebuilt in its original order.
  IFS=" "
  for arg
  do
    case $arg in
    -o)
      shift
      ;;
    $object)
      shift
      ;;
    *)
      set fnord "$@" "$arg"
      shift # fnord
      shift # $arg
      ;;
    esac
  done

  # Turn every line marker into a ` file \' continuation line, then strip
  # the trailing ` \' from the very last line.
  "$@" -E |
    sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
      -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
    sed '$ s: \\$::' > "$tmpdepfile"
  rm -f "$depfile"
  echo "$object : \\" > "$depfile"
  cat < "$tmpdepfile" >> "$depfile"
  # Second pass: append every collected name as a rule with no
  # prerequisites ("name :").
  sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
  rm -f "$tmpdepfile"
  ;;
msvisualcpp)
# Dependency extraction from the `#line N "file"' markers that the command
# prints when re-run with -E; file names are converted with `cygpath -u'.
#
# Important note: in order to support this mode, a compiler *must*
# always write the preprocessed file to stdout, regardless of -o,
# because we must use -o when running libtool.
# Run the real compilation first; propagate its failure unchanged.
"$@" || exit $?
IFS=" "
# Rebuild the argument list in its original order, dropping the
# -Gm/-Gi/-ZI options (and their /-prefixed spellings).
# NOTE(review): presumably these options are incompatible with the -E run
# below; the reason is not stated here -- confirm.
for arg
do
case "$arg" in
"-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
set fnord "$@"
shift
shift
;;
*)
set fnord "$@" "$arg"
shift
shift
;;
esac
done
# Each #line marker becomes an `echo "`cygpath -u "file"`"' command; the
# sorted, de-duplicated commands form a small shell script in $tmpdepfile.
"$@" -E |
sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::echo "`cygpath -u \\"\1\\"`":p' | sort | uniq > "$tmpdepfile"
rm -f "$depfile"
echo "$object : \\" > "$depfile"
# First pass: source the generated script to print the converted paths,
# backslash-escape their spaces and emit them as continuation lines.
. "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s:: \1 \\:p' >> "$depfile"
# Line that ends the continuation started above.
echo " " >> "$depfile"
# Second pass: source the script again and emit every path as a rule with
# no prerequisites ("path:").
. "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::\1\::p' >> "$depfile"
rm -f "$tmpdepfile"
;;
none)
# No dependency file is produced in this mode: the script replaces itself
# with the wrapped command, whose exit status becomes the script's.
exec "$@"
;;
*)
  # No branch above matched $depmode: report it on stderr and fail.
  echo "Unknown depmode $depmode" >&2
  exit 1
  ;;
esac
exit 0
# Local Variables:
# mode: shell-script
# sh-indentation: 2
# eval: (add-hook 'write-file-hooks 'time-stamp)
# time-stamp-start: "scriptversion="
# time-stamp-format: "%:y-%02m-%02d.%02H"
# time-stamp-end: "$"
# End:

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2006 University of Cambridge
Copyright (c) 1997-2008 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -38,14 +38,19 @@ POSSIBILITY OF SUCH DAMAGE.
*/
/* This is a freestanding support program to generate a file containing default
character tables for PCRE. The tables are built according to the default C
/* This is a freestanding support program to generate a file containing
character tables for PCRE. The tables are built according to the current
locale. Now that pcre_maketables is a function visible to the outside world, we
make use of its code from here in order to be consistent. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <locale.h>
#include "pcre_internal.h"
@@ -55,38 +60,60 @@ make use of its code from here in order to be consistent. */
int main(int argc, char **argv)
{
int i;
FILE *f;
const unsigned char *tables = pcre_maketables();
const unsigned char *base_of_tables = tables;
int i = 1;
const unsigned char *tables;
const unsigned char *base_of_tables;
if (argc != 2)
/* By default, the default C locale is used rather than what the building user
happens to have set. However, if the -L option is given, set the locale from
the LC_xxx environment variables. */
if (argc > 1 && strcmp(argv[1], "-L") == 0)
{
setlocale(LC_ALL, ""); /* Set from environment variables */
i++;
}
if (argc < i + 1)
{
fprintf(stderr, "dftables: one filename argument is required\n");
return 1;
}
f = fopen(argv[1], "wb");
tables = pcre_maketables();
base_of_tables = tables;
f = fopen(argv[i], "wb");
if (f == NULL)
  {
  /* Report the name that was actually passed to fopen(). That is argv[i],
  which is argv[2] rather than argv[1] when the -L option was given. */
  fprintf(stderr, "dftables: failed to open %s for writing\n", argv[i]);
  return 1;
  }
/* There are two fprintf() calls here, because gcc in pedantic mode complains
about the very long string otherwise. */
/* There are several fprintf() calls here, because gcc in pedantic mode
complains about the very long string otherwise. */
fprintf(f,
"/*************************************************\n"
"* Perl-Compatible Regular Expressions *\n"
"*************************************************/\n\n"
"/* This file is automatically written by the dftables auxiliary \n"
"program. If you edit it by hand, you might like to edit the Makefile to \n"
"prevent its ever being regenerated.\n\n");
"/* This file was automatically written by the dftables auxiliary\n"
"program. It contains character tables that are used when no external\n"
"tables are passed to PCRE by the application that calls it. The tables\n"
"are used only for characters whose code values are less than 256.\n\n");
fprintf(f,
"The following #includes are present because without them gcc 4.x may remove\n"
"the array definition from the final binary if PCRE is built into a static\n"
"library and dead code stripping is activated. This leads to link errors.\n"
"Pulling in the header ensures that the array gets flagged as \"someone\n"
"outside this compilation unit might reference this\" and so it will always\n"
"be supplied to the linker. */\n\n"
"#ifdef HAVE_CONFIG_H\n"
"#include \"config.h\"\n"
"#endif\n\n"
"#include \"pcre_internal.h\"\n\n");
fprintf(f,
"This file contains the default tables for characters with codes less than\n"
"128 (ASCII characters). These tables are used when no external tables are\n"
"passed to PCRE. */\n\n"
"const unsigned char _pcre_default_tables[] = {\n\n"
"/* This table is a lower casing table. */\n\n");
@@ -162,7 +189,7 @@ if (isprint(i-8)) fprintf(f, " %c -", i-8);
else fprintf(f, "%3d-", i-8);
if (isprint(i-1)) fprintf(f, " %c ", i-1);
else fprintf(f, "%3d", i-1);
fprintf(f, " */\n\n/* End of chartables.c */\n");
fprintf(f, " */\n\n/* End of pcre_chartables.c */\n");
fclose(f);
free((void *)base_of_tables);

View File

@@ -1,4 +1,10 @@
<html>
<!-- This is a manually maintained file that is the root of the HTML version of
the PCRE documentation. When the HTML documents are built from the man
page versions, the entire doc/html directory is emptied, this file is then
copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
-->
<head>
<title>PCRE specification</title>
</head>
@@ -12,6 +18,9 @@ The HTML documentation for PCRE comprises the following pages:
<tr><td><a href="pcre.html">pcre</a></td>
<td>&nbsp;&nbsp;Introductory page</td></tr>
<tr><td><a href="pcre-config.html">pcre-config</a></td>
<td>&nbsp;&nbsp;Information about the installation configuration</td></tr>
<tr><td><a href="pcreapi.html">pcreapi</a></td>
<td>&nbsp;&nbsp;PCRE's native API</td></tr>
@@ -54,6 +63,9 @@ The HTML documentation for PCRE comprises the following pages:
<tr><td><a href="pcrestack.html">pcrestack</a></td>
<td>&nbsp;&nbsp;Discussion of PCRE's stack usage</td></tr>
<tr><td><a href="pcresyntax.html">pcresyntax</a></td>
<td>&nbsp;&nbsp;Syntax quick-reference summary</td></tr>
<tr><td><a href="pcretest.html">pcretest</a></td>
<td>&nbsp;&nbsp;The <b>pcretest</b> command for testing PCRE</td></tr>
</table>

View File

@@ -0,0 +1,88 @@
<html>
<head>
<title>pcre-config specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre-config man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
<li><a name="TOC3" href="#SEC3">OPTIONS</a>
<li><a name="TOC4" href="#SEC4">SEE ALSO</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
<b>pcre-config [--prefix] [--exec-prefix] [--version] [--libs]</b>
<b>[--libs-posix] [--cflags] [--cflags-posix]</b>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
<b>pcre-config</b> returns the configuration of the installed PCRE
libraries and the options required to compile a program to use them.
</P>
<br><a name="SEC3" href="#TOC1">OPTIONS</a><br>
<P>
<b>--prefix</b>
Writes the directory prefix used in the PCRE installation for architecture
independent files (<i>/usr</i> on many systems, <i>/usr/local</i> on some
systems) to the standard output.
</P>
<P>
<b>--exec-prefix</b>
Writes the directory prefix used in the PCRE installation for architecture
dependent files (normally the same as <b>--prefix</b>) to the standard output.
</P>
<P>
<b>--version</b>
Writes the version number of the installed PCRE libraries to the standard
output.
</P>
<P>
<b>--libs</b>
Writes to the standard output the command line options required to link
with PCRE (<b>-lpcre</b> on many systems).
</P>
<P>
<b>--libs-posix</b>
Writes to the standard output the command line options required to link with
the PCRE posix emulation library (<b>-lpcreposix</b> <b>-lpcre</b> on many
systems).
</P>
<P>
<b>--cflags</b>
Writes to the standard output the command line options required to compile
files that use PCRE (this may include some <b>-I</b> options, but is blank on
many systems).
</P>
<P>
<b>--cflags-posix</b>
Writes to the standard output the command line options required to compile
files that use the PCRE posix emulation library (this may include some <b>-I</b>
options, but is blank on many systems).
</P>
<br><a name="SEC4" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcre(3)</b>
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
This manual page was originally written by Mark Baker for the Debian GNU/Linux
system. It has been slightly revised as a generic PCRE man page.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 18 April 2007
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -18,18 +18,26 @@ man page, in case the conversion went wrong.
<li><a name="TOC3" href="#SEC3">LIMITATIONS</a>
<li><a name="TOC4" href="#SEC4">UTF-8 AND UNICODE PROPERTY SUPPORT</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">INTRODUCTION</a><br>
<P>
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl, with just a few
differences. The current implementation of PCRE (release 6.x) corresponds
approximately with Perl 5.8, including support for UTF-8 encoded strings and
Unicode general category properties. However, this support has to be explicitly
enabled; it is not the default.
differences. Certain features that appeared in Python and PCRE before they
appeared in Perl are also available using the Python syntax. There is also some
support for certain .NET and Oniguruma syntax items, and there is an option for
requesting some minor changes that give better JavaScript compatibility.
</P>
<P>
In addition to the Perl-compatible matching function, PCRE also contains an
The current implementation of PCRE (release 7.x) corresponds approximately with
Perl 5.10, including support for UTF-8 encoded strings and Unicode general
category properties. However, UTF-8 and Unicode support has to be explicitly
enabled; it is not the default. The Unicode tables correspond to Unicode
release 5.1.
</P>
<P>
In addition to the Perl-compatible matching function, PCRE contains an
alternative matching function that matches the same compiled patterns in a
different way. In certain circumstances, the alternative function has some
advantages. For a discussion of the two matching algorithms, see the
@@ -52,7 +60,9 @@ supported by PCRE are given in separate documents. See the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
and
<a href="pcrecompat.html"><b>pcrecompat</b></a>
pages.
pages. There is a syntax summary in the
<a href="pcresyntax.html"><b>pcresyntax</b></a>
page.
</P>
<P>
Some features of PCRE can be included, excluded, or changed when the library is
@@ -82,6 +92,7 @@ all the sections are concatenated, for ease of searching. The sections are as
follows:
<pre>
pcre this document
pcre-config show PCRE installation configuration information
pcreapi details of PCRE's native C API
pcrebuild options for building PCRE
pcrecallout details of the callout feature
@@ -91,6 +102,7 @@ follows:
pcrematching discussion of the two matching algorithms
pcrepartial details of the partial matching facility
pcrepattern syntax and semantics of supported regular expressions
pcresyntax quick syntax reference
pcreperform discussion of performance issues
pcreposix the POSIX-compatible C API
pcreprecompile details of saving and re-using precompiled patterns
@@ -114,21 +126,18 @@ internal linkage size of 3 or 4 (see the <b>README</b> file in the source
distribution and the
<a href="pcrebuild.html"><b>pcrebuild</b></a>
documentation for details). In these cases the limit is substantially larger.
However, the speed of execution will be slower.
However, the speed of execution is slower.
</P>
<P>
All values in repeating quantifiers must be less than 65536. The maximum
compiled length of subpattern with an explicit repeat count is 30000 bytes. The
maximum number of capturing subpatterns is 65535.
All values in repeating quantifiers must be less than 65536.
</P>
<P>
There is no limit to the number of non-capturing subpatterns, but the maximum
depth of nesting of all kinds of parenthesized subpattern, including capturing
subpatterns, assertions, and other types of subpattern, is 200.
There is no limit to the number of parenthesized subpatterns, but there can be
no more than 65535 capturing subpatterns.
</P>
<P>
The maximum length of name for a named subpattern is 32, and the maximum number
of named subpatterns is 10000.
The maximum length of name for a named subpattern is 32 characters, and the
maximum number of named subpatterns is 10000.
</P>
<P>
The maximum length of a subject string is the largest positive number that an
@@ -151,14 +160,15 @@ category properties was added.
In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
the code, and, in addition, you must call
<a href="pcre_compile.html"><b>pcre_compile()</b></a>
with the PCRE_UTF8 option flag. When you do this, both the pattern and any
subject strings that are matched against it are treated as UTF-8 strings
instead of just strings of bytes.
with the PCRE_UTF8 option flag, or the pattern must start with the sequence
(*UTF8). When either of these is the case, both the pattern and any subject
strings that are matched against it are treated as UTF-8 strings instead of
just strings of bytes.
</P>
<P>
If you compile PCRE with UTF-8 support, but do not use it at run time, the
library will be a bit bigger, but the additional run time overhead is limited
to testing the PCRE_UTF8 flag in several places, so should not be very large.
to testing the PCRE_UTF8 flag occasionally, so should not be very big.
</P>
<P>
If PCRE is built with Unicode character property support (which implies UTF-8
@@ -172,56 +182,95 @@ documentation. Only the short names for properties are supported. For example,
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
compatibility with Perl 5.6. PCRE does not support this.
<a name="utf8strings"></a></P>
<br><b>
Validity of UTF-8 strings
</b><br>
<P>
When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
are (by default) checked for validity on entry to the relevant functions. From
release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
themselves derived from the Unicode specification. Earlier releases of PCRE
followed the rules of RFC 2279, which allows the full range of 31-bit values (0
to 0x7FFFFFFF). The current check allows only values in the range U+0 to
U+10FFFF, excluding U+D800 to U+DFFF.
</P>
<P>
The following comments apply when PCRE is running in UTF-8 mode:
The excluded code points are the "Low Surrogate Area" of Unicode, of which the
Unicode Standard says this: "The Low Surrogate Area does not contain any
character assignments, consequently no character code charts or namelists are
provided for this area. Surrogates are reserved for use with UTF-16 and then
must be used in pairs." The code points that are encoded by UTF-16 pairs are
available as independent code points in the UTF-8 encoding. (In other words,
the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
UTF-8.)
</P>
<P>
1. When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
are checked for validity on entry to the relevant functions. If an invalid
UTF-8 string is passed, an error return is given. In some situations, you may
already know that your strings are valid, and therefore want to skip these
checks in order to improve performance. If you set the PCRE_NO_UTF8_CHECK flag
at compile time or at run time, PCRE assumes that the pattern or subject it
is given (respectively) contains only valid UTF-8 codes. In this case, it does
not diagnose an invalid UTF-8 string. If you pass an invalid UTF-8 string to
PCRE when PCRE_NO_UTF8_CHECK is set, the results are undefined. Your program
may crash.
If an invalid UTF-8 string is passed to PCRE, an error return
(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
your strings are valid, and therefore want to skip these checks in order to
improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
at run time, PCRE assumes that the pattern or subject it is given
(respectively) contains only valid UTF-8 codes. In this case, it does not
diagnose an invalid UTF-8 string.
</P>
<P>
2. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
happens depends on why the string is invalid. If the string conforms to the
"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
rules of RFC 2279. However, if the string does not even conform to RFC 2279,
the result is undefined. Your program may crash.
</P>
<P>
If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
encoded in a UTF-8-like manner as per the old RFC, you can set
PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
situation, you will have to apply your own validity check.
</P>
<br><b>
General comments about UTF-8 mode
</b><br>
<P>
1. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
UTF-8 character if the value is greater than 127.
</P>
<P>
3. Octal numbers up to \777 are recognized, and match two-byte UTF-8
2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
characters for values greater than \177.
</P>
<P>
4. Repeat quantifiers apply to complete UTF-8 characters, not to individual
3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
bytes, for example: \x{100}{3}.
</P>
<P>
5. The dot metacharacter matches one UTF-8 character instead of a single byte.
4. The dot metacharacter matches one UTF-8 character instead of a single byte.
</P>
<P>
6. The escape sequence \C can be used to match a single byte in UTF-8 mode,
5. The escape sequence \C can be used to match a single byte in UTF-8 mode,
but its use can lead to some strange effects. This facility is not available in
the alternative matching function, <b>pcre_dfa_exec()</b>.
</P>
<P>
7. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
test characters of any code value, but the characters that PCRE recognizes as
digits, spaces, or word characters remain the same set as before, all with
values less than 256. This remains true even when PCRE includes Unicode
property support, because to do otherwise would slow down PCRE in many common
cases. If you really want to test for a wider sense of, say, "digit", you
must use Unicode property tests such as \p{Nd}.
must use Unicode property tests such as \p{Nd}. Note that this also applies to
\b, because it is defined in terms of \w and \W.
</P>
<P>
8. Similarly, characters that match the POSIX named character classes are all
7. Similarly, characters that match the POSIX named character classes are all
low-valued characters.
</P>
<P>
8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
(\h, \H, \v, and \V) do match all the appropriate Unicode characters.
</P>
<P>
9. Case-insensitive matching applies only to characters whose values are less
than 128, unless PCRE is built with Unicode property support. Even when Unicode
property support is available, PCRE still uses its own character tables when
@@ -236,17 +285,22 @@ these are not supported by PCRE.
<P>
Philip Hazel
<br>
University Computing Service,
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
Cambridge CB2 3QG, England.
</P>
<P>
Putting an actual email address here seems to have been a spam magnet, so I've
taken it away. If you want to email me, use my initial and surname, separated
by a dot, at the domain ucs.cam.ac.uk.
Last updated: 05 June 2006
taken it away. If you want to email me, use my two initials, followed by the
two digits 10, at the domain cam.ac.uk.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 11 April 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -27,8 +27,9 @@ SYNOPSIS
DESCRIPTION
</b><br>
<P>
This function compiles a regular expression into an internal form. Its
arguments are:
This function compiles a regular expression into an internal form. It is the
same as <b>pcre_compile2()</b>, except for the absence of the <i>errorcodeptr</i>
argument. Its arguments are:
<pre>
<i>pattern</i> A zero-terminated string containing the
regular expression to be compiled
@@ -40,34 +41,42 @@ arguments are:
</pre>
The option bits are:
<pre>
PCRE_ANCHORED Force pattern anchoring
PCRE_AUTO_CALLOUT Compile automatic callouts
PCRE_CASELESS Do caseless matching
PCRE_DOLLAR_ENDONLY $ not to match newline at end
PCRE_DOTALL . matches anything including NL
PCRE_DUPNAMES Allow duplicate names for subpatterns
PCRE_EXTENDED Ignore whitespace and # comments
PCRE_EXTRA PCRE extra features
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
theses (named ones available)
PCRE_UNGREEDY Invert greediness of quantifiers
PCRE_UTF8 Run in UTF-8 mode
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
validity (only relevant if
PCRE_UTF8 is set)
PCRE_ANCHORED Force pattern anchoring
PCRE_AUTO_CALLOUT Compile automatic callouts
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \R matches all Unicode line endings
PCRE_CASELESS Do caseless matching
PCRE_DOLLAR_ENDONLY $ not to match newline at end
PCRE_DOTALL . matches anything including NL
PCRE_DUPNAMES Allow duplicate names for subpatterns
PCRE_EXTENDED Ignore whitespace and # comments
PCRE_EXTRA PCRE extra features
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
theses (named ones available)
PCRE_UNGREEDY Invert greediness of quantifiers
PCRE_UTF8 Run in UTF-8 mode
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
validity (only relevant if
PCRE_UTF8 is set)
</pre>
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
PCRE_NO_UTF8_CHECK.
</P>
<P>
The yield of the function is a pointer to a private data structure that
contains the compiled pattern, or NULL if an error was detected.
contains the compiled pattern, or NULL if an error was detected. Note that
compiling regular expressions with one version of PCRE for use with a different
version is not guaranteed to work and may cause crashes.
</P>
<P>
There is a complete description of the PCRE native API in the

View File

@@ -56,6 +56,8 @@ The option bits are:
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
@@ -72,7 +74,9 @@ PCRE_NO_UTF8_CHECK.
</P>
<P>
The yield of the function is a pointer to a private data structure that
contains the compiled pattern, or NULL if an error was detected.
contains the compiled pattern, or NULL if an error was detected. Note that
compiling regular expressions with one version of PCRE for use with a different
version is not guaranteed to work and may cause crashes.
</P>
<P>
There is a complete description of the PCRE native API in the

View File

@@ -38,7 +38,15 @@ The available codes are:
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
PCRE_CONFIG_MATCH_LIMIT_RECURSION
Internal recursion depth limit
PCRE_CONFIG_NEWLINE Value of the newline sequence
PCRE_CONFIG_NEWLINE Value of the default newline sequence:
13 (0x000d) for CR
10 (0x000a) for LF
3338 (0x0d0a) for CRLF
-2 for ANYCRLF
-1 for ANY
PCRE_CONFIG_BSR Indicates what \R matches by default:
0 all Unicode line endings
1 CR, LF, or CRLF only
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
Threshold of return slots, above
which <b>malloc()</b> is used by

View File

@@ -37,7 +37,7 @@ buffer. The arguments are:
<i>buffer</i> Buffer to receive the string
<i>buffersize</i> Size of buffer
</pre>
The yield is the legnth of the string, PCRE_ERROR_NOMEMORY if the buffer was
The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
</P>
<P>

View File

@@ -29,9 +29,9 @@ DESCRIPTION
</b><br>
<P>
This function matches a compiled regular expression against a given subject
string, using a DFA matching algorithm (<i>not</i> Perl-compatible). Note that
the main, Perl-compatible, matching function is <b>pcre_exec()</b>. The
arguments for this function are:
string, using an alternative matching algorithm that scans the subject string
just once (<i>not</i> Perl-compatible). Note that the main, Perl-compatible,
matching function is <b>pcre_exec()</b>. The arguments for this function are:
<pre>
<i>code</i> Points to the compiled pattern
<i>extra</i> Points to an associated <b>pcre_extra</b> structure,
@@ -49,12 +49,17 @@ arguments for this function are:
The options are:
<pre>
PCRE_ANCHORED Match only at the first position
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \R matches all Unicode line endings
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NOTBOL Subject is not the beginning of a line
PCRE_NOTEOL Subject is not the end of a line
PCRE_NOTEMPTY An empty string is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
validity (only relevant if PCRE_UTF8
was set at compile time)
@@ -62,8 +67,8 @@ The options are:
PCRE_DFA_SHORTEST Return only the shortest match
PCRE_DFA_RESTART This is a restart after a partial match
</pre>
There are restrictions on what may appear in a pattern when matching using the
DFA algorithm is requested. Details are given in the
There are restrictions on what may appear in a pattern when using this matching
function. Details are given in the
<a href="pcrematching.html"><b>pcrematching</b></a>
documentation.
</P>
@@ -79,7 +84,7 @@ A <b>pcre_extra</b> structure contains the following fields:
</pre>
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
PCRE_EXTRA_TABLES. For DFA matching, the <i>match_limit</i> and
PCRE_EXTRA_TABLES. For this matching function, the <i>match_limit</i> and
<i>match_limit_recursion</i> fields are not used, and must not be set.
</P>
<P>

View File

@@ -45,19 +45,26 @@ offsets to captured substrings. Its arguments are:
The options are:
<pre>
PCRE_ANCHORED Match only at the first position
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \R matches all Unicode line endings
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NOTBOL Subject is not the beginning of a line
PCRE_NOTEOL Subject is not the end of a line
PCRE_NOTEMPTY An empty string is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
validity (only relevant if PCRE_UTF8
was set at compile time)
PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
</pre>
There are restrictions on what may appear in a pattern when partial matching is
requested.
requested. For details, see the
<a href="pcrepartial.html"><b>pcrepartial</b></a>
page.
</P>
<P>
A <b>pcre_extra</b> structure contains the following fields:

View File

@@ -42,13 +42,14 @@ The following information is available:
-1 for start of string
or after newline, or
-2 otherwise
PCRE_INFO_FIRSTTABLE Table of first bytes
(after studying)
PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE_INFO_LASTLITERAL Literal last byte required
PCRE_INFO_NAMECOUNT Number of named subpatterns
PCRE_INFO_NAMEENTRYSIZE Size of name table entry
PCRE_INFO_NAMETABLE Pointer to name table
PCRE_INFO_OPTIONS Options used for compilation
PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
PCRE_INFO_OPTIONS Option bits used for compilation
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
</pre>

View File

@@ -39,9 +39,10 @@ arguments are:
<i>stringptr</i> Where to put the string pointer
</pre>
The memory in which the substring is placed is obtained by calling
<b>pcre_malloc()</b>. The yield of the function is the length of the extracted
substring, PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained, or
PCRE_ERROR_NOSUBSTRING if the string name is invalid.
<b>pcre_malloc()</b>. The convenience function <b>pcre_free_substring()</b> can
be used to free it when it is no longer needed. The yield of the function is
the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
</P>
<P>
There is a complete description of the PCRE native API in the

View File

@@ -33,7 +33,10 @@ parenthesis in a compiled pattern. Its arguments are:
<i>name</i> Name whose number is required
</pre>
The yield of the function is the number of the parenthesis if the name is
found, or PCRE_ERROR_NOSUBSTRING otherwise.
found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
<b>pcre_get_stringnumber()</b>. You can obtain the complete list by calling
<b>pcre_get_stringtable_entries()</b>.
</P>
<P>
There is a complete description of the PCRE native API in the

View File

@@ -44,7 +44,7 @@ PCRE_ERROR_NOSUBSTRING if none are found.
There is a complete description of the PCRE native API, including the format of
the table entries, in the
<a href="pcreapi.html"><b>pcreapi</b></a>
page and a description of the POSIX API in the
page, and a description of the POSIX API in the
<a href="pcreposix.html"><b>pcreposix</b></a>
page.
<p>

View File

@@ -37,9 +37,10 @@ arguments are:
<i>stringptr</i> Where to put the string pointer
</pre>
The memory in which the substring is placed is obtained by calling
<b>pcre_malloc()</b>. The yield of the function is the length of the substring,
PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained, or
PCRE_ERROR_NOSUBSTRING if the string number is invalid.
<b>pcre_malloc()</b>. The convenience function <b>pcre_free_substring()</b> can
be used to free it when it is no longer needed. The yield of the function is
the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
</P>
<P>
There is a complete description of the PCRE native API in the

View File

@@ -35,10 +35,12 @@ substrings. The arguments are:
<i>listptr</i> Where to put a pointer to the list
</pre>
The memory in which the substrings and the list are placed is obtained by
calling <b>pcre_malloc()</b>. A pointer to a list of pointers is put in
the variable whose address is in <i>listptr</i>. The list is terminated by a
NULL pointer. The yield of the function is zero on success or
PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained.
calling <b>pcre_malloc()</b>. The convenience function
<b>pcre_free_substring_list()</b> can be used to free it when it is no longer
needed. A pointer to a list of pointers is put in the variable whose address is
in <i>listptr</i>. The list is terminated by a NULL pointer. The yield of the
function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
not be obtained.
</P>
<P>
There is a complete description of the PCRE native API in the

View File

@@ -32,6 +32,9 @@ man page, in case the conversion went wrong.
<li><a name="TOC17" href="#SEC17">DUPLICATE SUBPATTERN NAMES</a>
<li><a name="TOC18" href="#SEC18">FINDING ALL POSSIBLE MATCHES</a>
<li><a name="TOC19" href="#SEC19">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
<li><a name="TOC20" href="#SEC20">SEE ALSO</a>
<li><a name="TOC21" href="#SEC21">AUTHOR</a>
<li><a name="TOC22" href="#SEC22">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE NATIVE API</a><br>
<P>
@@ -140,8 +143,8 @@ man page, in case the conversion went wrong.
</P>
<br><a name="SEC2" href="#TOC1">PCRE API OVERVIEW</a><br>
<P>
PCRE has its own native API, which is described in this document. There is
also a set of wrapper functions that correspond to the POSIX regular expression
PCRE has its own native API, which is described in this document. There are
also some wrapper functions that correspond to the POSIX regular expression
API. These are described in the
<a href="pcreposix.html"><b>pcreposix</b></a>
documentation. Both of these APIs define a set of C function calls. A C++
@@ -164,15 +167,15 @@ in a Perl-compatible manner. A sample program that demonstrates the simplest
way of using them is provided in the file called <i>pcredemo.c</i> in the source
distribution. The
<a href="pcresample.html"><b>pcresample</b></a>
documentation describes how to run it.
documentation describes how to compile and run it.
</P>
<P>
A second matching function, <b>pcre_dfa_exec()</b>, which is not
Perl-compatible, is also provided. This uses a different algorithm for the
matching. The alternative algorithm finds all possible matches (at a given
point in the subject). However, this algorithm does not return captured
substrings. A description of the two matching algorithms and their advantages
and disadvantages is given in the
point in the subject), and scans the subject just once. However, this algorithm
does not return captured substrings. A description of the two matching
algorithms and their advantages and disadvantages is given in the
<a href="pcrematching.html"><b>pcrematching</b></a>
documentation.
</P>
@@ -240,19 +243,45 @@ by the caller to a "callout" function, which PCRE will then call at specified
points during a matching operation. Details are given in the
<a href="pcrecallout.html"><b>pcrecallout</b></a>
documentation.
</P>
<a name="newlines"></a></P>
<br><a name="SEC3" href="#TOC1">NEWLINES</a><br>
<P>
PCRE supports three different conventions for indicating line breaks in
strings: a single CR character, a single LF character, or the two-character
sequence CRLF. All three are used as "standard" by different operating systems.
When PCRE is built, a default can be specified. The default default is LF,
which is the Unix standard. When PCRE is run, the default can be overridden,
either when a pattern is compiled, or when it is matched.
<br>
<br>
PCRE supports five different conventions for indicating line breaks in
strings: a single CR (carriage return) character, a single LF (linefeed)
character, the two-character sequence CRLF, any of the three preceding, or any
Unicode newline sequence. The Unicode newline sequences are the three just
mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
(paragraph separator, U+2029).
</P>
<P>
Each of the first three conventions is used by at least one operating system as
its standard newline sequence. When PCRE is built, a default can be specified.
The default default is LF, which is the Unix standard. When PCRE is run, the
default can be overridden, either when a pattern is compiled, or when it is
matched.
</P>
<P>
At compile time, the newline convention can be specified by the <i>options</i>
argument of <b>pcre_compile()</b>, or it can be specified by special text at the
start of the pattern itself; this overrides any other settings. See the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
page for details of the special character sequences.
</P>
<P>
In the PCRE documentation the word "newline" is used to mean "the character or
pair of characters that indicate a line break".
pair of characters that indicate a line break". The choice of newline
convention affects the handling of the dot, circumflex, and dollar
metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
recognized line ending sequence, the match position advancement for a
non-anchored pattern. There is more detail about this in the
<a href="#execoptions">section on <b>pcre_exec()</b> options</a>
below.
</P>
<P>
The choice of newline convention does not affect the interpretation of
the \n or \r escape sequences, nor does it affect what \R matches, which is
controlled in a similar way, but by separate options.
</P>
<br><a name="SEC4" href="#TOC1">MULTITHREADING</a><br>
<P>
@@ -271,7 +300,9 @@ The compiled form of a regular expression can be saved and re-used at a later
time, possibly by a different program, and even on a host other than the one on
which it was compiled. Details are given in the
<a href="pcreprecompile.html"><b>pcreprecompile</b></a>
documentation.
documentation. However, compiling a regular expression with one version of PCRE
for use with a different version is not guaranteed to work and may cause
crashes.
</P>
<br><a name="SEC6" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>
<P>
@@ -301,9 +332,18 @@ properties is available; otherwise it is set to zero.
PCRE_CONFIG_NEWLINE
</pre>
The output is an integer whose value specifies the default character sequence
that is recognized as meaning "newline". The three values that are supported
are: 10 for LF, 13 for CR, and 3338 for CRLF. The default should normally be
the standard sequence for your operating system.
that is recognized as meaning "newline". The five values that are supported
are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY.
Though they are derived from ASCII, the same values are returned in EBCDIC
environments. The default should normally correspond to the standard sequence
for your operating system.
<pre>
PCRE_CONFIG_BSR
</pre>
The output is an integer whose value indicates what character sequences the \R
escape sequence matches by default. A value of 0 means that \R matches any
Unicode line ending sequence; a value of 1 means that \R matches only CR, LF,
or CRLF. The default can be overridden when a pattern is compiled or matched.
<pre>
PCRE_CONFIG_LINK_SIZE
</pre>
@@ -323,13 +363,13 @@ documentation.
<pre>
PCRE_CONFIG_MATCH_LIMIT
</pre>
The output is an integer that gives the default limit for the number of
The output is a long integer that gives the default limit for the number of
internal matching function calls in a <b>pcre_exec()</b> execution. Further
details are given with <b>pcre_exec()</b> below.
<pre>
PCRE_CONFIG_MATCH_LIMIT_RECURSION
</pre>
The output is an integer that gives the default limit for the depth of
The output is a long integer that gives the default limit for the depth of
recursion when calling the internal matching function in a <b>pcre_exec()</b>
execution. Further details are given with <b>pcre_exec()</b> below.
<pre>
@@ -374,16 +414,17 @@ fully relocatable, because it may contain a copy of the <i>tableptr</i>
argument, which is an address (see below).
</P>
<P>
The <i>options</i> argument contains independent bits that affect the
The <i>options</i> argument contains various bit settings that affect the
compilation. It should be zero if no options are required. The available
options are described below. Some of them, in particular, those that are
compatible with Perl, can also be set and unset from within the pattern (see
the detailed description in the
options are described below. Some of them (in particular, those that are
compatible with Perl, but also some others) can also be set and unset from
within the pattern (see the detailed description in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation). For these options, the contents of the <i>options</i> argument
specifies their initial settings at the start of compilation and execution. The
PCRE_ANCHORED and PCRE_NEWLINE_<i>xxx</i> options can be set at the time of
matching as well as at compile time.
documentation). For those options that can be different in different parts of
the pattern, the contents of the <i>options</i> argument specify their initial
settings at the start of compilation and execution. The PCRE_ANCHORED and
PCRE_NEWLINE_<i>xxx</i> options can be set at the time of matching as well as at
compile time.
</P>
<P>
If <i>errptr</i> is NULL, <b>pcre_compile()</b> returns NULL immediately.
@@ -439,6 +480,15 @@ all with number 255, before each pattern item. For discussion of the callout
facility, see the
<a href="pcrecallout.html"><b>pcrecallout</b></a>
documentation.
<pre>
PCRE_BSR_ANYCRLF
PCRE_BSR_UNICODE
</pre>
These options (which are mutually exclusive) control what the \R escape
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
match any Unicode newline sequence. The default is specified when PCRE is
built. It can be overridden from within the pattern, or by setting an option
when a compiled pattern is matched.
<pre>
PCRE_CASELESS
</pre>
@@ -467,8 +517,8 @@ If this bit is set, a dot metacharater in the pattern matches all characters,
including those that indicate newline. Without it, a dot does not match when
the current position is at a newline. This option is equivalent to Perl's /s
option, and it can be changed within a pattern by a (?s) option setting. A
negative class such as [^a] always matches newlines, independent of the setting
of this option.
negative class such as [^a] always matches newline characters, independent of
the setting of this option.
<pre>
PCRE_DUPNAMES
</pre>
@@ -510,6 +560,22 @@ this option. It can also be set by a (?X) option setting within a pattern.
If this option is set, an unanchored pattern is required to match before or at
the first newline in the subject string, though the matched text may continue
over the newline.
<pre>
PCRE_JAVASCRIPT_COMPAT
</pre>
If this option is set, PCRE's behaviour is changed in some ways so that it is
compatible with JavaScript rather than Perl. The changes are as follows:
</P>
<P>
(1) A lone closing square bracket in a pattern causes a compile-time error,
because this is illegal in JavaScript (by default it is treated as a data
character). Thus, the pattern AB]CD becomes illegal when this option is set.
</P>
<P>
(2) At run time, a back reference to an unset subpattern group matches an empty
string (by default this causes the current matching alternative to fail). A
pattern such as (\1)(a) succeeds when this option is set (assuming it can find
an "a" in the subject), whereas it fails by default, for Perl compatibility.
<pre>
PCRE_MULTILINE
</pre>
@@ -531,19 +597,40 @@ occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
PCRE_NEWLINE_CR
PCRE_NEWLINE_LF
PCRE_NEWLINE_CRLF
PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
</pre>
These options override the default newline definition that was chosen when PCRE
was built. Setting the first or the second specifies that a newline is
indicated by a single character (CR or LF, respectively). Setting both of them
specifies that a newline is indicated by the two-character CRLF sequence. For
convenience, PCRE_NEWLINE_CRLF is defined to contain both bits. The only time
that a line break is relevant when compiling a pattern is if PCRE_EXTENDED is
set, and an unescaped # outside a character class is encountered. This
indicates a comment that lasts until after the next newline.
indicated by a single character (CR or LF, respectively). Setting
PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
that any Unicode newline sequence should be recognized. The Unicode newline
sequences are the three just mentioned, plus the single characters VT (vertical
tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
separator, U+2028), and PS (paragraph separator, U+2029). The last two are
recognized only in UTF-8 mode.
</P>
<P>
The newline option set at compile time becomes the default that is used for
<b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, but it can be overridden.
The newline setting in the options word uses three bits that are treated
as a number, giving eight possibilities. Currently only six are used (default
plus the five values above). This means that if you set more than one newline
option, the combination may or may not be sensible. For example,
PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
other combinations may yield unused numbers and cause an error.
</P>
<P>
The only time that a line break is specially recognized when compiling a
pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
class is encountered. This indicates a comment that lasts until after the next
line break sequence. In other circumstances, line break sequences are treated
as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated
as whitespace characters and are therefore ignored.
</P>
<P>
The newline option that is set at compile time becomes the default that is used
for <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, but it can be overridden.
<pre>
PCRE_NO_AUTO_CAPTURE
</pre>
@@ -574,20 +661,24 @@ page.
PCRE_NO_UTF8_CHECK
</pre>
When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
automatically checked. If an invalid UTF-8 sequence of bytes is found,
<b>pcre_compile()</b> returns an error. If you already know that your pattern is
valid, and you want to skip this check for performance reasons, you can set the
PCRE_NO_UTF8_CHECK option. When it is set, the effect of passing an invalid
UTF-8 string as a pattern is undefined. It may cause your program to crash.
Note that this option can also be passed to <b>pcre_exec()</b> and
<b>pcre_dfa_exec()</b>, to suppress the UTF-8 validity checking of subject
strings.
automatically checked. There is a discussion about the
<a href="pcre.html#utf8strings">validity of UTF-8 strings</a>
in the main
<a href="pcre.html"><b>pcre</b></a>
page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_compile()</b>
returns an error. If you already know that your pattern is valid, and you want
to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK
option. When it is set, the effect of passing an invalid UTF-8 string as a
pattern is undefined. It may cause your program to crash. Note that this option
can also be passed to <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, to suppress
the UTF-8 validity checking of subject strings.
</P>
<br><a name="SEC8" href="#TOC1">COMPILATION ERROR CODES</a><br>
<P>
The following table lists the error codes that may be returned by
<b>pcre_compile2()</b>, along with the error messages that may be returned by
both compiling functions.
both compiling functions. As PCRE has developed, some error codes have fallen
out of use. To avoid confusion, they have not been re-used.
<pre>
0 no error
1 \ at end of pattern
@@ -599,17 +690,17 @@ both compiling functions.
7 invalid escape sequence in character class
8 range out of order in character class
9 nothing to repeat
10 operand of unlimited repeat could match the empty string
10 [this code is not in use]
11 internal error: unexpected repeat
12 unrecognized character after (?
12 unrecognized character after (? or (?-
13 POSIX named classes are supported only within a class
14 missing )
15 reference to non-existent subpattern
16 erroffset passed as NULL
17 unknown option bit(s) set
18 missing ) after comment
19 parentheses nested too deeply
20 regular expression too large
19 [this code is not in use]
20 regular expression is too large
21 failed to get memory
22 unmatched parentheses
23 internal error: code overflow
@@ -618,11 +709,11 @@ both compiling functions.
26 malformed number or name after (?(
27 conditional group contains more than two branches
28 assertion expected after (?(
29 (?R or (?digits must be followed by )
29 (?R or (?[+-]digits must be followed by )
30 unknown POSIX class name
31 POSIX collating elements are not supported
32 this version of PCRE is not compiled with PCRE_UTF8 support
33 spare error
33 [this code is not in use]
34 character value in \x{...} sequence is too large
35 invalid condition (?(0)
36 \C not allowed in lookbehind assertion
@@ -631,17 +722,33 @@ both compiling functions.
39 closing ) for (?C expected
40 recursive call could loop indefinitely
41 unrecognized character after (?P
42 syntax error after (?P
42 syntax error in subpattern name (missing terminator)
43 two named subpatterns have the same name
44 invalid UTF-8 string
45 support for \P, \p, and \X has not been compiled
46 malformed \P or \p sequence
47 unknown property name after \P or \p
48 subpattern name is too long (maximum 32 characters)
49 too many named subpatterns (maximum 10,000)
50 repeated subpattern is too long
49 too many named subpatterns (maximum 10000)
50 [this code is not in use]
51 octal value is greater than \377 (not in UTF-8 mode)
</PRE>
52 internal error: overran compiling workspace
53 internal error: previously-checked referenced subpattern not found
54 DEFINE group contains more than one branch
55 repeating a DEFINE group is not allowed
56 inconsistent NEWLINE options
57 \g is not followed by a braced, angle-bracketed, or quoted
name/number or by a plain number
58 a numbered reference must not be zero
59 (*VERB) with an argument is not supported
60 (*VERB) not recognized
61 number is too big
62 subpattern name expected
63 digit expected after (?+
64 ] is an invalid data character in JavaScript compatibility mode
</pre>
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
be used if the limits were changed when PCRE was built.
</P>
<br><a name="SEC9" href="#TOC1">STUDYING A PATTERN</a><br>
<P>
@@ -698,20 +805,27 @@ bytes is created.
<a name="localesupport"></a></P>
<br><a name="SEC10" href="#TOC1">LOCALE SUPPORT</a><br>
<P>
PCRE handles caseless matching, and determines whether characters are letters
PCRE handles caseless matching, and determines whether characters are letters,
digits, or whatever, by reference to a set of tables, indexed by character
value. When running in UTF-8 mode, this applies only to characters with codes
less than 128. Higher-valued codes never match escapes such as \w or \d, but
can be tested with \p if PCRE is built with Unicode character property
support. The use of locales with Unicode is discouraged.
support. The use of locales with Unicode is discouraged. If you are handling
characters with codes greater than 128, you should either use UTF-8 and
Unicode, or use locales, but not try to mix the two.
</P>
<P>
An internal set of tables is created in the default C locale when PCRE is
built. This is used when the final argument of <b>pcre_compile()</b> is NULL,
and is sufficient for many applications. An alternative set of tables can,
however, be supplied. These may be created in a different locale from the
default. As more and more applications change to using Unicode, the need for
this locale support is expected to die away.
PCRE contains an internal set of tables that are used when the final argument
of <b>pcre_compile()</b> is NULL. These are sufficient for many applications.
Normally, the internal tables recognize only ASCII characters. However, when
PCRE is built, it is possible to cause the internal tables to be rebuilt in the
default "C" locale of the local system, which may cause them to be different.
</P>
<P>
The internal tables can always be overridden by tables supplied by the
application that calls PCRE. These may be created in a different locale from
the default. As more and more applications change to using Unicode, the need
for this locale support is expected to die away.
</P>
<P>
External tables are built by calling the <b>pcre_maketables()</b> function,
@@ -725,6 +839,10 @@ the following code could be used:
tables = pcre_maketables();
re = pcre_compile(..., tables);
</pre>
The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
are using Windows, the name for the French locale is "french".
</P>
<P>
When <b>pcre_maketables()</b> runs, the tables are built in memory that is
obtained via <b>pcre_malloc</b>. It is the caller's responsibility to ensure
that the memory containing the tables remains available for as long as it is
@@ -810,7 +928,7 @@ still recognized for backwards compatibility.)
</P>
<P>
If there is a fixed first byte, for example, from a pattern such as
(cat|cow|coyote). Otherwise, if either
(cat|cow|coyote), its value is returned. Otherwise, if either
<br>
<br>
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
@@ -831,6 +949,18 @@ If the pattern was studied, and this resulted in the construction of a 256-bit
table indicating a fixed set of bytes for the first byte in any matching
string, a pointer to the table is returned. Otherwise NULL is returned. The
fourth argument should point to an <b>unsigned char *</b> variable.
<pre>
PCRE_INFO_HASCRORLF
</pre>
Return 1 if the pattern contains any explicit matches for CR or LF characters,
otherwise 0. The fourth argument should point to an <b>int</b> variable. An
explicit match is either a literal CR or LF character, or \r or \n.
<pre>
PCRE_INFO_JCHANGED
</pre>
Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
0. The fourth argument should point to an <b>int</b> variable. (?J) and
(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
<pre>
PCRE_INFO_LASTLITERAL
</pre>
@@ -868,7 +998,7 @@ alphabetical order. When PCRE_DUPNAMES is set, duplicate names are in order of
their parentheses numbers. For example, consider the following pattern (assume
PCRE_EXTENDED is set, so white space - including newlines - is ignored):
<pre>
(?P&#60;date&#62; (?P&#60;year&#62;(\d\d)?\d\d) - (?P&#60;month&#62;\d\d) - (?P&#60;day&#62;\d\d) )
(?&#60;date&#62; (?&#60;year&#62;(\d\d)?\d\d) - (?&#60;month&#62;\d\d) - (?&#60;day&#62;\d\d) )
</pre>
There are four named subpatterns, so the table has four entries, and each entry
in the table is eight bytes long. The table is as follows, with non-printing
@@ -882,13 +1012,24 @@ bytes shows in hexadecimal, and undefined bytes shown as ??:
When writing code to extract data from named subpatterns using the
name-to-number map, remember that the length of the entries is likely to be
different for each compiled pattern.
<pre>
PCRE_INFO_OKPARTIAL
</pre>
Return 1 if the pattern can be used for partial matching, otherwise 0. The
fourth argument should point to an <b>int</b> variable. The
<a href="pcrepartial.html"><b>pcrepartial</b></a>
documentation lists the restrictions that apply to patterns when partial
matching is used.
<pre>
PCRE_INFO_OPTIONS
</pre>
Return a copy of the options with which the pattern was compiled. The fourth
argument should point to an <b>unsigned long int</b> variable. These option bits
are those specified in the call to <b>pcre_compile()</b>, modified by any
top-level option settings within the pattern itself.
top-level option settings at the start of the pattern itself. In other words,
they are the options that will be in force when matching starts. For example,
if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
</P>
<P>
A pattern is automatically anchored by PCRE if all of its top-level
@@ -1097,14 +1238,15 @@ the external tables might be at a different address when <b>pcre_exec()</b> is
called. See the
<a href="pcreprecompile.html"><b>pcreprecompile</b></a>
documentation for a discussion of saving compiled patterns for later use.
</P>
<a name="execoptions"></a></P>
<br><b>
Option bits for <b>pcre_exec()</b>
</b><br>
<P>
The unused bits of the <i>options</i> argument for <b>pcre_exec()</b> must be
zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_<i>xxx</i>,
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE,
PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
<pre>
PCRE_ANCHORED
</pre>
@@ -1112,15 +1254,52 @@ The PCRE_ANCHORED option limits <b>pcre_exec()</b> to matching at the first
matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
to be anchored by virtue of its contents, it cannot be made unanchored at
matching time.
<pre>
PCRE_BSR_ANYCRLF
PCRE_BSR_UNICODE
</pre>
These options (which are mutually exclusive) control what the \R escape
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
match any Unicode newline sequence. These options override the choice that was
made or defaulted when the pattern was compiled.
<pre>
PCRE_NEWLINE_CR
PCRE_NEWLINE_LF
PCRE_NEWLINE_CRLF
PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
</pre>
These options override the newline definition that was chosen or defaulted when
the pattern was compiled. For details, see the description <b>pcre_compile()</b>
above. During matching, the newline choice affects the behaviour of the dot,
circumflex, and dollar metacharacters.
the pattern was compiled. For details, see the description of
<b>pcre_compile()</b> above. During matching, the newline choice affects the
behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
the way the match position is advanced after a match failure for an unanchored
pattern.
</P>
<P>
When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
match attempt for an unanchored pattern fails when the current position is at a
CRLF sequence, and the pattern contains no explicit matches for CR or LF
characters, the match position is advanced by two characters instead of one, in
other words, to after the CRLF.
</P>
<P>
The above rule is a compromise that makes the most common cases work as
expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
set), it does not match the string "\r\nA" because, after failing at the
start, it skips both the CR and the LF before retrying. However, the pattern
[\r\n]A does match that string, because it contains an explicit CR or LF
reference, and so advances only by one character after the first failure.
</P>
<P>
An explicit match for CR or LF is either a literal appearance of one of those
characters, or one of the \r or \n escape sequences. Implicit matches such as
[^X] do not count, nor does \s (which includes CR and LF in the characters
that it matches).
</P>
<P>
Notwithstanding the above, anomalous effects may still occur when CRLF is a
valid newline sequence and explicit \r or \n escapes appear in the pattern.
<pre>
PCRE_NOTBOL
</pre>
@@ -1158,15 +1337,30 @@ matching a null string by first trying the match again at the same offset with
PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the
starting offset (see below) and trying an ordinary match again. There is some
code that demonstrates how to do this in the <i>pcredemo.c</i> sample program.
<pre>
PCRE_NO_START_OPTIMIZE
</pre>
There are a number of optimizations that <b>pcre_exec()</b> uses at the start of
a match, in order to speed up the process. For example, if it is known that a
match must start with a specific character, it searches the subject for that
character, and fails immediately if it cannot find it, without actually running
the main matching function. When callouts are in use, these optimizations can
cause them to be skipped. This option disables the "start-up" optimizations,
causing performance to suffer, but ensuring that the callouts do occur.
<pre>
PCRE_NO_UTF8_CHECK
</pre>
When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
string is automatically checked when <b>pcre_exec()</b> is subsequently called.
The value of <i>startoffset</i> is also checked to ensure that it points to the
start of a UTF-8 character. If an invalid UTF-8 sequence of bytes is found,
<b>pcre_exec()</b> returns the error PCRE_ERROR_BADUTF8. If <i>startoffset</i>
contains an invalid value, PCRE_ERROR_BADUTF8_OFFSET is returned.
start of a UTF-8 character. There is a discussion about the validity of UTF-8
strings in the
<a href="pcre.html#utf8strings">section on UTF-8 support</a>
in the main
<a href="pcre.html"><b>pcre</b></a>
page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_exec()</b> returns
the error PCRE_ERROR_BADUTF8. If <i>startoffset</i> contains an invalid value,
PCRE_ERROR_BADUTF8_OFFSET is returned.
</P>
<P>
If you already know that your subject is valid, and you want to skip these
@@ -1196,11 +1390,11 @@ The string to be matched by <b>pcre_exec()</b>
</b><br>
<P>
The subject string is passed to <b>pcre_exec()</b> as a pointer in
<i>subject</i>, a length in <i>length</i>, and a starting byte offset in
<i>startoffset</i>. In UTF-8 mode, the byte offset must point to the start of a
UTF-8 character. Unlike the pattern string, the subject may contain binary zero
bytes. When the starting offset is zero, the search for a match starts at the
beginning of the subject, and this is by far the most common case.
<i>subject</i>, a length (in bytes) in <i>length</i>, and a starting byte offset
in <i>startoffset</i>. In UTF-8 mode, the byte offset must point to the start of
a UTF-8 character. Unlike the pattern string, the subject may contain binary
zero bytes. When the starting offset is zero, the search for a match starts at
the beginning of the subject, and this is by far the most common case.
</P>
<P>
A non-zero starting offset is useful when searching for another match in the
@@ -1238,32 +1432,36 @@ a fragment of a pattern that picks out a substring. PCRE supports several other
kinds of parenthesized subpattern that do not cause substrings to be captured.
</P>
<P>
Captured substrings are returned to the caller via a vector of integer offsets
whose address is passed in <i>ovector</i>. The number of elements in the vector
is passed in <i>ovecsize</i>, which must be a non-negative number. <b>Note</b>:
this argument is NOT the size of <i>ovector</i> in bytes.
Captured substrings are returned to the caller via a vector of integers whose
address is passed in <i>ovector</i>. The number of elements in the vector is
passed in <i>ovecsize</i>, which must be a non-negative number. <b>Note</b>: this
argument is NOT the size of <i>ovector</i> in bytes.
</P>
<P>
The first two-thirds of the vector is used to pass back captured substrings,
each substring using a pair of integers. The remaining third of the vector is
used as workspace by <b>pcre_exec()</b> while matching capturing subpatterns,
and is not available for passing back information. The length passed in
and is not available for passing back information. The number passed in
<i>ovecsize</i> should always be a multiple of three. If it is not, it is
rounded down.
</P>
<P>
When a match is successful, information about captured substrings is returned
in pairs of integers, starting at the beginning of <i>ovector</i>, and
continuing up to two-thirds of its length at the most. The first element of a
pair is set to the offset of the first character in a substring, and the second
is set to the offset of the first character after the end of a substring. The
first pair, <i>ovector[0]</i> and <i>ovector[1]</i>, identify the portion of the
subject string matched by the entire pattern. The next pair is used for the
first capturing subpattern, and so on. The value returned by <b>pcre_exec()</b>
is one more than the highest numbered pair that has been set. For example, if
two substrings have been captured, the returned value is 3. If there are no
capturing subpatterns, the return value from a successful match is 1,
indicating that just the first pair of offsets has been set.
continuing up to two-thirds of its length at the most. The first element of
each pair is set to the byte offset of the first character in a substring, and
the second is set to the byte offset of the first character after the end of a
substring. <b>Note</b>: these values are always byte offsets, even in UTF-8
mode. They are not character counts.
</P>
<P>
The first pair of integers, <i>ovector[0]</i> and <i>ovector[1]</i>, identify the
portion of the subject string matched by the entire pattern. The next pair is
used for the first capturing subpattern, and so on. The value returned by
<b>pcre_exec()</b> is one more than the highest numbered pair that has been set.
For example, if two substrings have been captured, the returned value is 3. If
there are no capturing subpatterns, the return value from a successful match is
1, indicating that just the first pair of offsets has been set.
</P>
<P>
If a capturing subpattern is matched repeatedly, it is the last portion of the
@@ -1272,8 +1470,8 @@ string that it matched that is returned.
<P>
If the vector is too small to hold all the captured substring offsets, it is
used as far as possible (up to two-thirds of its length), and the function
returns a value of zero. In particular, if the substring offsets are not of
interest, <b>pcre_exec()</b> may be called with <i>ovector</i> passed as NULL and
returns a value of zero. If the substring offsets are not of interest,
<b>pcre_exec()</b> may be called with <i>ovector</i> passed as NULL and
<i>ovecsize</i> as zero. However, if the pattern contains back references and
the <i>ovector</i> is not big enough to remember the related substrings, PCRE
has to get additional memory for use during matching. Thus it is usually
@@ -1334,7 +1532,7 @@ compiled in an environment of one endianness is run in an environment with the
other endianness. This is the error that PCRE gives when the magic number is
not present.
<pre>
PCRE_ERROR_UNKNOWN_NODE (-5)
PCRE_ERROR_UNKNOWN_OPCODE (-5)
</pre>
While running the pattern match, an unknown item was encountered in the
compiled pattern. This error could be caused by a bug in PCRE or by overwriting
@@ -1359,12 +1557,6 @@ below). It is never returned by <b>pcre_exec()</b>.
The backtracking limit, as specified by the <i>match_limit</i> field in a
<b>pcre_extra</b> structure (or defaulted) was reached. See the description
above.
<pre>
PCRE_ERROR_RECURSIONLIMIT (-21)
</pre>
The internal recursion limit, as specified by the <i>match_limit_recursion</i>
field in a <b>pcre_extra</b> structure (or defaulted) was reached. See the
description above.
<pre>
PCRE_ERROR_CALLOUT (-9)
</pre>
@@ -1403,6 +1595,19 @@ in PCRE or by overwriting of the compiled pattern.
PCRE_ERROR_BADCOUNT (-15)
</pre>
This error is given if the value of the <i>ovecsize</i> argument is negative.
<pre>
PCRE_ERROR_RECURSIONLIMIT (-21)
</pre>
The internal recursion limit, as specified by the <i>match_limit_recursion</i>
field in a <b>pcre_extra</b> structure (or defaulted) was reached. See the
description above.
<pre>
PCRE_ERROR_BADNEWLINE (-23)
</pre>
An invalid combination of PCRE_NEWLINE_<i>xxx</i> options was given.
</P>
<P>
Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>.
</P>
<br><a name="SEC15" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
<P>
@@ -1457,7 +1662,7 @@ the string is placed in <i>buffer</i>, whose length is given by
<i>buffersize</i>, while for <b>pcre_get_substring()</b> a new block of memory is
obtained via <b>pcre_malloc</b>, and its address is returned via
<i>stringptr</i>. The yield of the function is the length of the string, not
including the terminating zero, or one of
including the terminating zero, or one of these error codes:
<pre>
PCRE_ERROR_NOMEMORY (-6)
</pre>
@@ -1474,7 +1679,7 @@ and builds a list of pointers to them. All this is done in a single block of
memory that is obtained via <b>pcre_malloc</b>. The address of the memory block
is returned via <i>listptr</i>, which is also the start of the list of string
pointers. The end of the list is marked by a NULL pointer. The yield of the
function is zero if all went well, or
function is zero if all went well, or the error code
<pre>
PCRE_ERROR_NOMEMORY (-6)
</pre>
@@ -1520,7 +1725,7 @@ provided.
To extract a substring by name, you first have to find the associated number.
For example, for this pattern
<pre>
(a+)b(?P&#60;xxx&#62;\d+)...
(a+)b(?&#60;xxx&#62;\d+)...
</pre>
the number of the subpattern called "xxx" is 2. If the name is known to be
unique (PCRE_DUPNAMES was not set), you can find the number from the name by
@@ -1548,8 +1753,15 @@ translation table.
</P>
<P>
These functions call <b>pcre_get_stringnumber()</b>, and if it succeeds, they
then call <i>pcre_copy_substring()</i> or <i>pcre_get_substring()</i>, as
appropriate.
then call <b>pcre_copy_substring()</b> or <b>pcre_get_substring()</b>, as
appropriate. <b>NOTE:</b> If PCRE_DUPNAMES is set and there are duplicate names,
the behaviour may not be what you want (see the next section).
</P>
<P>
<b>Warning:</b> If the pattern uses the "(?|" feature to set up multiple
subpatterns with the same number, you cannot use names to distinguish them,
because names are not included in the compiled code. The matching process uses
only numbers.
</P>
<br><a name="SEC17" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
<P>
@@ -1562,23 +1774,27 @@ are not required to be unique. Normally, patterns with duplicate names are such
that in any one match, only one of the named subpatterns participates. An
example is shown in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation. When duplicates are present, <b>pcre_copy_named_substring()</b>
and <b>pcre_get_named_substring()</b> return the first substring corresponding
to the given name that is set. If none are set, an empty string is returned.
The <b>pcre_get_stringnumber()</b> function returns one of the numbers that are
associated with the name, but it is not defined which it is.
<br>
<br>
documentation.
</P>
<P>
When duplicates are present, <b>pcre_copy_named_substring()</b> and
<b>pcre_get_named_substring()</b> return the first substring corresponding to
the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
returned; no data is returned. The <b>pcre_get_stringnumber()</b> function
returns one of the numbers that are associated with the name, but it is not
defined which it is.
</P>
<P>
If you want to get full details of all captured substrings for a given name,
you must use the <b>pcre_get_stringtable_entries()</b> function. The first
argument is the compiled pattern, and the second is the name. The third and
fourth are pointers to variables which are updated by the function. After it
has run, they point to the first and last entries in the name-to-number table
for the given name. The function itself returns the length of each entry, or
PCRE_ERROR_NOSUBSTRING if there are none. The format of the table is described
above in the section entitled <i>Information about a pattern</i>. Given all the
relevant entries for the name, you can extract each of their numbers, and hence
the captured data, if any.
PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
described above in the section entitled <i>Information about a pattern</i>.
Given all the relevant entries for the name, you can extract each of their
numbers, and hence the captured data, if any.
</P>
<br><a name="SEC18" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br>
<P>
@@ -1608,11 +1824,12 @@ will yield PCRE_ERROR_NOMATCH.
</P>
<P>
The function <b>pcre_dfa_exec()</b> is called to match a subject string against
a compiled pattern, using a "DFA" matching algorithm. This has different
characteristics to the normal algorithm, and is not compatible with Perl. Some
of the features of PCRE patterns are not supported. Nevertheless, there are
times when this kind of matching can be useful. For a discussion of the two
matching algorithms, see the
a compiled pattern, using a matching algorithm that scans the subject string
just once, and does not backtrack. This has different characteristics to the
normal algorithm, and is not compatible with Perl. Some of the features of PCRE
patterns are not supported. Nevertheless, there are times when this kind of
matching can be useful. For a discussion of the two matching algorithms, see
the
<a href="pcrematching.html"><b>pcrematching</b></a>
documentation.
</P>
@@ -1671,9 +1888,9 @@ matching string.
PCRE_DFA_SHORTEST
</pre>
Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
soon as it has found one match. Because of the way the DFA algorithm works,
this is necessarily the shortest possible match at the first possible matching
point in the subject string.
soon as it has found one match. Because of the way the alternative algorithm
works, this is necessarily the shortest possible match at the first possible
matching point in the subject string.
<pre>
PCRE_DFA_RESTART
</pre>
@@ -1711,10 +1928,10 @@ the three matched strings are
On success, the yield of the function is a number greater than zero, which is
the number of matched substrings. The substrings themselves are returned in
<i>ovector</i>. Each string uses two elements; the first is the offset to the
start, and the second is the offset to the end. All the strings have the same
start offset. (Space could have been saved by giving this only once, but it was
decided to retain some compatibility with the way <b>pcre_exec()</b> returns
data, even though the meaning of the strings is different.)
start, and the second is the offset to the end. In fact, all the strings have
the same start offset. (Space could have been saved by giving this only once,
but it was decided to retain some compatibility with the way <b>pcre_exec()</b>
returns data, even though the meaning of the strings is different.)
</P>
<P>
The strings are returned in reverse order of length; that is, the longest
@@ -1740,8 +1957,9 @@ that it does not support, for instance, the use of \C or a back reference.
<pre>
PCRE_ERROR_DFA_UCOND (-17)
</pre>
This return is given if <b>pcre_dfa_exec()</b> encounters a condition item in a
pattern that uses a back reference for the condition. This is not supported.
This return is given if <b>pcre_dfa_exec()</b> encounters a condition item that
uses a back reference for the condition, or a test for recursion in a specific
group. These are not supported.
<pre>
PCRE_ERROR_DFA_UMLIMIT (-18)
</pre>
@@ -1761,10 +1979,27 @@ recursively, using private vectors for <i>ovector</i> and <i>workspace</i>. This
error is given if the output vector is not large enough. This should be
extremely rare, as a vector of size 1000 is used.
</P>
<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br>
<P>
Last updated: 08 June 2006
<b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3),
<b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3),
<b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3).
</P>
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
<P>
Last updated: 11 April 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -18,26 +18,39 @@ man page, in case the conversion went wrong.
<li><a name="TOC3" href="#SEC3">UTF-8 SUPPORT</a>
<li><a name="TOC4" href="#SEC4">UNICODE CHARACTER PROPERTY SUPPORT</a>
<li><a name="TOC5" href="#SEC5">CODE VALUE OF NEWLINE</a>
<li><a name="TOC6" href="#SEC6">BUILDING SHARED AND STATIC LIBRARIES</a>
<li><a name="TOC7" href="#SEC7">POSIX MALLOC USAGE</a>
<li><a name="TOC8" href="#SEC8">HANDLING VERY LARGE PATTERNS</a>
<li><a name="TOC9" href="#SEC9">AVOIDING EXCESSIVE STACK USAGE</a>
<li><a name="TOC10" href="#SEC10">LIMITING PCRE RESOURCE USAGE</a>
<li><a name="TOC11" href="#SEC11">USING EBCDIC CODE</a>
<li><a name="TOC6" href="#SEC6">WHAT \R MATCHES</a>
<li><a name="TOC7" href="#SEC7">BUILDING SHARED AND STATIC LIBRARIES</a>
<li><a name="TOC8" href="#SEC8">POSIX MALLOC USAGE</a>
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
<li><a name="TOC11" href="#SEC11">LIMITING PCRE RESOURCE USAGE</a>
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
<li><a name="TOC14" href="#SEC14">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
<li><a name="TOC15" href="#SEC15">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a>
<li><a name="TOC16" href="#SEC16">SEE ALSO</a>
<li><a name="TOC17" href="#SEC17">AUTHOR</a>
<li><a name="TOC18" href="#SEC18">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE BUILD-TIME OPTIONS</a><br>
<P>
This document describes the optional features of PCRE that can be selected when
the library is compiled. They are all selected, or deselected, by providing
options to the <b>configure</b> script that is run before the <b>make</b>
command. The complete list of options for <b>configure</b> (which includes the
standard ones such as the selection of the installation directory) can be
obtained by running
the library is compiled. It assumes use of the <b>configure</b> script, where
the optional features are selected or deselected by providing options to
<b>configure</b> before running the <b>make</b> command. However, the same
options can be selected in both Unix-like and non-Unix-like environments using
the GUI facility of <b>CMakeSetup</b> if you are using <b>CMake</b> instead of
<b>configure</b> to build PCRE.
</P>
<P>
The complete list of options for <b>configure</b> (which includes the standard
ones such as the selection of the installation directory) can be obtained by
running
<pre>
./configure --help
</pre>
The following sections describe certain options whose names begin with --enable
or --disable. These settings specify changes to the defaults for the
The following sections include descriptions of options whose names begin with
--enable or --disable. These settings specify changes to the defaults for the
<b>configure</b> command. Because of the way that <b>configure</b> works,
--enable and --disable always come in pairs, so the complementary option always
exists as well, but as it specifies the default, it is not described.
@@ -54,7 +67,7 @@ to the <b>configure</b> command.
</P>
<br><a name="SEC3" href="#TOC1">UTF-8 SUPPORT</a><br>
<P>
To build PCRE with support for UTF-8 character strings, add
To build PCRE with support for UTF-8 Unicode character strings, add
<pre>
--enable-utf8
</pre>
@@ -63,6 +76,13 @@ strings as UTF-8. As well as compiling PCRE with this option, you also have
to set the PCRE_UTF8 option when you call the <b>pcre_compile()</b>
function.
</P>
<P>
If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE expects
its input to be either ASCII or UTF-8 (depending on the runtime option). It is
not possible to support both EBCDIC and UTF-8 codes in the same version of the
library. Consequently, --enable-utf8 and --enable-ebcdic are mutually
exclusive.
</P>
<br><a name="SEC4" href="#TOC1">UNICODE CHARACTER PROPERTY SUPPORT</a><br>
<P>
UTF-8 support allows PCRE to process character values greater than 255 in the
@@ -77,17 +97,17 @@ to the <b>configure</b> command. This implies UTF-8 support, even if you have
not explicitly requested it.
</P>
<P>
Including Unicode property support adds around 90K of tables to the PCRE
library, approximately doubling its size. Only the general category properties
such as <i>Lu</i> and <i>Nd</i> are supported. Details are given in the
Including Unicode property support adds around 30K of tables to the PCRE
library. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
supported. Details are given in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation.
</P>
<br><a name="SEC5" href="#TOC1">CODE VALUE OF NEWLINE</a><br>
<P>
By default, PCRE interprets character 10 (linefeed, LF) as indicating the end
By default, PCRE interprets the linefeed (LF) character as indicating the end
of a line. This is the normal newline character on Unix-like systems. You can
compile PCRE to use character 13 (carriage return, CR) instead, by adding
compile PCRE to use carriage return (CR) instead, by adding
<pre>
--enable-newline-is-cr
</pre>
@@ -100,11 +120,34 @@ character sequence CRLF. If you want this, add
<pre>
--enable-newline-is-crlf
</pre>
to the <b>configure</b> command. Whatever line ending convention is selected
when PCRE is built can be overridden when the library functions are called. At
build time it is conventional to use the standard for your operating system.
to the <b>configure</b> command. There is a fourth option, specified by
<pre>
--enable-newline-is-anycrlf
</pre>
which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
indicating a line ending. Finally, a fifth option, specified by
<pre>
--enable-newline-is-any
</pre>
causes PCRE to recognize any Unicode newline sequence.
</P>
<br><a name="SEC6" href="#TOC1">BUILDING SHARED AND STATIC LIBRARIES</a><br>
<P>
Whatever line ending convention is selected when PCRE is built can be
overridden when the library functions are called. At build time it is
conventional to use the standard for your operating system.
</P>
<br><a name="SEC6" href="#TOC1">WHAT \R MATCHES</a><br>
<P>
By default, the sequence \R in a pattern matches any Unicode newline sequence,
whatever has been selected as the line ending sequence. If you specify
<pre>
--enable-bsr-anycrlf
</pre>
the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
selected when PCRE is built can be overridden when the library functions are
called.
</P>
<br><a name="SEC7" href="#TOC1">BUILDING SHARED AND STATIC LIBRARIES</a><br>
<P>
The PCRE building process uses <b>libtool</b> to build both shared and static
Unix libraries by default. You can suppress one of these by adding one of
@@ -114,7 +157,7 @@ Unix libraries by default. You can suppress one of these by adding one of
</pre>
to the <b>configure</b> command, as required.
</P>
<br><a name="SEC7" href="#TOC1">POSIX MALLOC USAGE</a><br>
<br><a name="SEC8" href="#TOC1">POSIX MALLOC USAGE</a><br>
<P>
When PCRE is called through the POSIX interface (see the
<a href="pcreposix.html"><b>pcreposix</b></a>
@@ -130,7 +173,7 @@ such as
</pre>
to the <b>configure</b> command.
</P>
<br><a name="SEC8" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
<P>
Within a compiled pattern, offset values are used to point from one part to
another (for example, from an opening parenthesis to an alternation
@@ -146,12 +189,7 @@ to the <b>configure</b> command. The value given must be 2, 3, or 4. Using
longer offsets slows down the operation of PCRE because it has to load
additional bytes when handling them.
</P>
<P>
If you build PCRE with an increased link size, test 2 (and test 5 if you are
using UTF-8) will fail. Part of the output of these tests is a representation
of the compiled pattern, and this changes with the link size.
</P>
<br><a name="SEC9" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
<P>
When matching with the <b>pcre_exec()</b> function, PCRE implements backtracking
by making recursive calls to an internal function called <b>match()</b>. In
@@ -169,15 +207,20 @@ build a version of PCRE that works this way, add
</pre>
to the <b>configure</b> command. With this configuration, PCRE will use the
<b>pcre_stack_malloc</b> and <b>pcre_stack_free</b> variables to call memory
management functions. Separate functions are provided because the usage is very
predictable: the block sizes requested are always the same, and the blocks are
always freed in reverse order. A calling program might be able to implement
optimized functions that perform better than the standard <b>malloc()</b> and
<b>free()</b> functions. PCRE runs noticeably more slowly when built in this
way. This option affects only the <b>pcre_exec()</b> function; it is not
relevant for the <b>pcre_dfa_exec()</b> function.
management functions. By default these point to <b>malloc()</b> and
<b>free()</b>, but you can replace the pointers so that your own functions are
used.
</P>
<br><a name="SEC10" href="#TOC1">LIMITING PCRE RESOURCE USAGE</a><br>
<P>
Separate functions are provided rather than using <b>pcre_malloc</b> and
<b>pcre_free</b> because the usage is very predictable: the block sizes
requested are always the same, and the blocks are always freed in reverse
order. A calling program might be able to implement optimized functions that
perform better than <b>malloc()</b> and <b>free()</b>. PCRE runs noticeably more
slowly when built in this way. This option affects only the <b>pcre_exec()</b>
function; it is not relevant for the <b>pcre_dfa_exec()</b> function.
</P>
<br><a name="SEC11" href="#TOC1">LIMITING PCRE RESOURCE USAGE</a><br>
<P>
Internally, PCRE has a function called <b>match()</b>, which it calls repeatedly
(sometimes recursively) when matching a pattern with the <b>pcre_exec()</b>
@@ -206,20 +249,100 @@ constraints. However, you can set a lower limit by adding, for example,
</pre>
to the <b>configure</b> command. This value can also be overridden at run time.
</P>
<br><a name="SEC11" href="#TOC1">USING EBCDIC CODE</a><br>
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
<P>
PCRE uses fixed tables for processing characters whose code values are less
than 256. By default, PCRE is built with a set of tables that are distributed
in the file <i>pcre_chartables.c.dist</i>. These tables are for ASCII codes
only. If you add
<pre>
--enable-rebuild-chartables
</pre>
to the <b>configure</b> command, the distributed tables are no longer used.
Instead, a program called <b>dftables</b> is compiled and run. This outputs the
source for a new set of tables, created in the default locale of your C runtime
system. (This method of replacing the tables does not work if you are cross
compiling, because <b>dftables</b> is run on the local host. If you need to
create alternative tables when cross compiling, you will have to do so "by
hand".)
</P>
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
<P>
PCRE assumes by default that it will run in an environment where the character
code is ASCII (or Unicode, which is a superset of ASCII). PCRE can, however, be
compiled to run in an EBCDIC environment by adding
code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
most computer operating systems. PCRE can, however, be compiled to run in an
EBCDIC environment by adding
<pre>
--enable-ebcdic
</pre>
to the <b>configure</b> command.
to the <b>configure</b> command. This setting implies
--enable-rebuild-chartables. You should only use it if you know that you are in
an EBCDIC environment (for example, an IBM mainframe operating system). The
--enable-ebcdic option is incompatible with --enable-utf8.
</P>
<br><a name="SEC14" href="#TOC1">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
<P>
By default, <b>pcregrep</b> reads all files as plain text. You can build it so
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
them with <b>libz</b> or <b>libbz2</b>, respectively, by adding one or both of
<pre>
--enable-pcregrep-libz
--enable-pcregrep-libbz2
</pre>
to the <b>configure</b> command. These options naturally require that the
relevant libraries are installed on your system. Configuration will fail if
they are not.
</P>
<br><a name="SEC15" href="#TOC1">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a><br>
<P>
If you add
<pre>
--enable-pcretest-libreadline
</pre>
to the <b>configure</b> command, <b>pcretest</b> is linked with the
<b>libreadline</b> library, and when its input is from a terminal, it reads it
using the <b>readline()</b> function. This provides line-editing and history
facilities. Note that <b>libreadline</b> is GPL-licenced, so if you distribute a
binary of <b>pcretest</b> linked in this way, there may be licensing issues.
</P>
<P>
Last updated: 06 June 2006
Setting this option causes the <b>-lreadline</b> option to be added to the
<b>pcretest</b> build. In many operating environments with a system-installed
<b>libreadline</b> this is sufficient. However, in some environments (e.g.
if an unmodified distribution version of readline is in use), some extra
configuration may be necessary. The INSTALL file for <b>libreadline</b> says
this:
<pre>
"Readline uses the termcap functions, but does not link with the
termcap or curses library itself, allowing applications which link
with readline the to choose an appropriate library."
</pre>
If your environment has not been set up so that an appropriate library is
automatically included, you may need to add something like
<pre>
LIBS="-lncurses"
</pre>
immediately before the <b>configure</b> command.
</P>
<br><a name="SEC16" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcreapi</b>(3), <b>pcre_config</b>(3).
</P>
<br><a name="SEC17" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC18" href="#TOC1">REVISION</a><br>
<P>
Last updated: 17 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -17,6 +17,8 @@ man page, in case the conversion went wrong.
<li><a name="TOC2" href="#SEC2">MISSING CALLOUTS</a>
<li><a name="TOC3" href="#SEC3">THE CALLOUT INTERFACE</a>
<li><a name="TOC4" href="#SEC4">RETURN VALUES</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE CALLOUTS</a><br>
<P>
@@ -35,7 +37,7 @@ function is to be called. Different callout points can be identified by putting
a number less than 256 after the letter C. The default value is zero.
For example, this pattern has two callout points:
<pre>
(?C1)\deabc(?C2)def
(?C1)abc(?C2)def
</pre>
If the PCRE_AUTO_CALLOUT option bit is set when <b>pcre_compile()</b> is called,
PCRE automatically inserts callouts, all with number 255, before each item in
@@ -60,7 +62,8 @@ trying to optimize the performance of a particular pattern.
<br><a name="SEC2" href="#TOC1">MISSING CALLOUTS</a><br>
<P>
You should be aware that, because of optimizations in the way PCRE matches
patterns, callouts sometimes do not happen. For example, if the pattern is
patterns by default, callouts sometimes do not happen. For example, if the
pattern is
<pre>
ab(?C4)cd
</pre>
@@ -69,6 +72,12 @@ string is "abyz", the lack of "d" means that matching doesn't ever start, and
the callout is never reached. However, with "abyd", though the result is still
no match, the callout is obeyed.
</P>
<P>
You can disable these optimizations by passing the PCRE_NO_START_OPTIMIZE
option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>. This slows down the
matching process, but does ensure that callouts such as the example above are
obeyed.
</P>
<br><a name="SEC3" href="#TOC1">THE CALLOUT INTERFACE</a><br>
<P>
During matching, when PCRE reaches a callout point, the external function
@@ -113,10 +122,12 @@ The <i>subject</i> and <i>subject_length</i> fields contain copies of the values
that were passed to <b>pcre_exec()</b>.
</P>
<P>
The <i>start_match</i> field contains the offset within the subject at which the
current match attempt started. If the pattern is not anchored, the callout
function may be called several times from the same point in the pattern for
different starting points in the subject.
The <i>start_match</i> field normally contains the offset within the subject at
which the current match attempt started. However, if the escape sequence \K
has been encountered, this value is changed to reflect the modified starting
point. If the pattern is not anchored, the callout function may be called
several times from the same point in the pattern for different starting points
in the subject.
</P>
<P>
The <i>current_position</i> field contains the offset within the subject of the
@@ -177,10 +188,21 @@ values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
it will never be used by PCRE itself.
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
Last updated: 28 February 2005
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 15 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
Copyright &copy; 1997-2005 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -17,8 +17,9 @@ DIFFERENCES BETWEEN PCRE AND PERL
</b><br>
<P>
This document describes the differences in the ways that PCRE and Perl handle
regular expressions. The differences described here are with respect to Perl
5.8.
regular expressions. The differences described here are mainly with respect to
Perl 5.8, though PCRE versions 7.0 and later contain some features that are
expected to be in the forthcoming Perl 5.10.
</P>
<P>
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
@@ -76,20 +77,34 @@ following examples:
The \Q...\E sequence is recognized both inside and outside character classes.
</P>
<P>
8. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
constructions. However, there is support for recursive patterns using the
non-Perl items (?R), (?number), and (?P&#62;name). Also, the PCRE "callout" feature
allows an external function to be called during pattern matching. See the
8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
constructions. However, there is support for recursive patterns. This is not
available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
feature allows an external function to be called during pattern matching. See
the
<a href="pcrecallout.html"><b>pcrecallout</b></a>
documentation for details.
</P>
<P>
9. There are some differences that are concerned with the settings of captured
9. Subpatterns that are called recursively or as "subroutines" are always
treated as atomic groups in PCRE. This is like Python, but unlike Perl.
</P>
<P>
10. There are some differences that are concerned with the settings of captured
strings when part of a pattern is repeated. For example, matching "aba" against
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
</P>
<P>
10. PCRE provides some extensions to the Perl regular expression facilities:
11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
parentheses, PCRE does not set that capture group; this is different to Perl.
</P>
<P>
12. PCRE provides some extensions to the Perl regular expression facilities.
Perl 5.10 will include new features that are not in earlier versions, some of
which (such as named parentheses) have been in PCRE for some time. This list is
with respect to Perl 5.10:
<br>
<br>
(a) Although lookbehind assertions must match fixed length strings, each
@@ -102,8 +117,8 @@ meta-character matches only at the very end of the string.
<br>
<br>
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
meaning is faulted. Otherwise, like Perl, the backslash is ignored. (Perl can
be made to issue a warning.)
meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
(Perl can be made to issue a warning.)
<br>
<br>
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
@@ -119,38 +134,46 @@ only at the first matching position in the subject string.
options for <b>pcre_exec()</b> have no Perl equivalents.
<br>
<br>
(g) The (?R), (?number), and (?P&#62;name) constructs allows for recursive pattern
matching (Perl can do this using the (?p{code}) construct, which PCRE cannot
support.)
(g) The \R escape sequence can be restricted to match only CR, LF, or CRLF
by the PCRE_BSR_ANYCRLF option.
<br>
<br>
(h) PCRE supports named capturing substrings, using the Python syntax.
(h) The callout facility is PCRE-specific.
<br>
<br>
(i) PCRE supports the possessive quantifier "++" syntax, taken from Sun's Java
package.
(i) The partial matching facility is PCRE-specific.
<br>
<br>
(j) The (R) condition, for testing recursion, is a PCRE extension.
<br>
<br>
(k) The callout facility is PCRE-specific.
<br>
<br>
(l) The partial matching facility is PCRE-specific.
<br>
<br>
(m) Patterns compiled by PCRE can be saved and re-used at a later time, even on
(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
different hosts that have the other endianness.
<br>
<br>
(n) The alternative matching function (<b>pcre_dfa_exec()</b>) matches in a
(k) The alternative matching function (<b>pcre_dfa_exec()</b>) matches in a
different way and is not Perl-compatible.
</P>
<P>
Last updated: 06 June 2006
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<br>
(l) PCRE recognizes some special sequences such as (*CR) at the start of
a pattern that set overall options that cannot be changed within the pattern.
</P>
<br><b>
AUTHOR
</b><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 11 September 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -16,20 +16,20 @@ man page, in case the conversion went wrong.
<li><a name="TOC1" href="#SEC1">SYNOPSIS OF C++ WRAPPER</a>
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
<li><a name="TOC3" href="#SEC3">MATCHING INTERFACE</a>
<li><a name="TOC4" href="#SEC4">PARTIAL MATCHES</a>
<li><a name="TOC5" href="#SEC5">UTF-8 AND THE MATCHING INTERFACE</a>
<li><a name="TOC6" href="#SEC6">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a>
<li><a name="TOC7" href="#SEC7">SCANNING TEXT INCREMENTALLY</a>
<li><a name="TOC8" href="#SEC8">PARSING HEX/OCTAL/C-RADIX NUMBERS</a>
<li><a name="TOC9" href="#SEC9">REPLACING PARTS OF STRINGS</a>
<li><a name="TOC10" href="#SEC10">AUTHOR</a>
<li><a name="TOC4" href="#SEC4">QUOTING METACHARACTERS</a>
<li><a name="TOC5" href="#SEC5">PARTIAL MATCHES</a>
<li><a name="TOC6" href="#SEC6">UTF-8 AND THE MATCHING INTERFACE</a>
<li><a name="TOC7" href="#SEC7">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a>
<li><a name="TOC8" href="#SEC8">SCANNING TEXT INCREMENTALLY</a>
<li><a name="TOC9" href="#SEC9">PARSING HEX/OCTAL/C-RADIX NUMBERS</a>
<li><a name="TOC10" href="#SEC10">REPLACING PARTS OF STRINGS</a>
<li><a name="TOC11" href="#SEC11">AUTHOR</a>
<li><a name="TOC12" href="#SEC12">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS OF C++ WRAPPER</a><br>
<P>
<b>#include &#60;pcrecpp.h&#62;</b>
</P>
<P>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
The C++ wrapper for PCRE was provided by Google Inc. Some additional
@@ -101,16 +101,43 @@ The function returns true iff all of the following conditions are satisfied:
c. The "i"th argument has a suitable type for holding the
string captured as the "i"th sub-pattern. If you pass in
NULL for the "i"th argument, or pass fewer arguments than
void * NULL for the "i"th argument, or a non-void * NULL
of the correct type, or pass fewer arguments than the
number of sub-patterns, "i"th captured sub-pattern is
ignored.
</pre>
CAVEAT: An optional sub-pattern that does not exist in the matched
string is assigned the empty string. Therefore, the following will
return false (because the empty string is not a valid number):
<pre>
int number;
pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
</pre>
The matching interface supports at most 16 arguments per call.
If you need more, consider using the more general interface
<b>pcrecpp::RE::DoMatch</b>. See <b>pcrecpp.h</b> for the signature for
<b>DoMatch</b>.
</P>
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHES</a><br>
<P>
NOTE: Do not use <b>no_arg</b>, which is used internally to mark the end of a
list of optional arguments, as a placeholder for missing arguments, as this can
lead to segfaults.
</P>
<br><a name="SEC4" href="#TOC1">QUOTING METACHARACTERS</a><br>
<P>
You can use the "QuoteMeta" operation to insert backslashes before all
potentially meaningful characters in a string. The returned string, used as a
regular expression, will exactly match the original string.
<pre>
Example:
string quoted = RE::QuoteMeta(unquoted);
</pre>
Note that it's legal to escape a character even if it has no special meaning in
a regular expression -- so this function does that. (This also makes it
identical to the perl function of the same name; see "perldoc -f quotemeta".)
For example, "1.5-2.0?" becomes "1\.5\-2\.0\?".
</P>
<br><a name="SEC5" href="#TOC1">PARTIAL MATCHES</a><br>
<P>
You can use the "PartialMatch" operation when you want the pattern
to match any substring of the text.
@@ -125,7 +152,7 @@ to match any substring of the text.
assert(number == 100);
</PRE>
</P>
<br><a name="SEC5" href="#TOC1">UTF-8 AND THE MATCHING INTERFACE</a><br>
<br><a name="SEC6" href="#TOC1">UTF-8 AND THE MATCHING INTERFACE</a><br>
<P>
By default, pattern and text are plain text, one byte per character. The UTF8
flag, passed to the constructor, causes both pattern and string to be treated
@@ -150,7 +177,7 @@ NOTE: The UTF8 flag is ignored if pcre was not configured with the
--enable-utf8 flag.
</PRE>
</P>
<br><a name="SEC6" href="#TOC1">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a><br>
<br><a name="SEC7" href="#TOC1">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a><br>
<P>
PCRE defines some modifiers to change the behavior of the regular expression
engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to
@@ -244,7 +271,7 @@ PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write:
</PRE>
</P>
<br><a name="SEC7" href="#TOC1">SCANNING TEXT INCREMENTALLY</a><br>
<br><a name="SEC8" href="#TOC1">SCANNING TEXT INCREMENTALLY</a><br>
<P>
The "Consume" operation may be useful if you want to repeatedly
match regular expressions at the front of a string and skip over
@@ -277,7 +304,7 @@ could extract all words from a string by repeatedly calling
pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word)
</PRE>
</P>
<br><a name="SEC8" href="#TOC1">PARSING HEX/OCTAL/C-RADIX NUMBERS</a><br>
<br><a name="SEC9" href="#TOC1">PARSING HEX/OCTAL/C-RADIX NUMBERS</a><br>
<P>
By default, if you pass a pointer to a numeric value, the
corresponding text is interpreted as a base-10 number. You can
@@ -295,7 +322,7 @@ prefixes, but defaults to base-10.
</pre>
will leave 64 in a, b, c, and d.
</P>
<br><a name="SEC9" href="#TOC1">REPLACING PARTS OF STRINGS</a><br>
<br><a name="SEC10" href="#TOC1">REPLACING PARTS OF STRINGS</a><br>
<P>
You can replace the first match of "pattern" in "str" with "rewrite".
Within "rewrite", backslash-escaped digits (\1 to \9) can be
@@ -327,11 +354,17 @@ The non-matching portions of "text" are ignored. Returns true iff a match
occurred and the extraction happened successfully; if no match occurs, the
string is left unaffected.
</P>
<br><a name="SEC10" href="#TOC1">AUTHOR</a><br>
<br><a name="SEC11" href="#TOC1">AUTHOR</a><br>
<P>
The C++ wrapper was contributed by Google Inc.
<br>
Copyright &copy; 2005 Google Inc.
Copyright &copy; 2007 Google Inc.
<br>
</P>
<br><a name="SEC12" href="#TOC1">REVISION</a><br>
<P>
Last updated: 17 March 2009
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -15,14 +15,17 @@ man page, in case the conversion went wrong.
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
<li><a name="TOC3" href="#SEC3">OPTIONS</a>
<li><a name="TOC4" href="#SEC4">ENVIRONMENT VARIABLES</a>
<li><a name="TOC5" href="#SEC5">NEWLINES</a>
<li><a name="TOC6" href="#SEC6">OPTIONS COMPATIBILITY</a>
<li><a name="TOC7" href="#SEC7">OPTIONS WITH DATA</a>
<li><a name="TOC8" href="#SEC8">MATCHING ERRORS</a>
<li><a name="TOC9" href="#SEC9">DIAGNOSTICS</a>
<li><a name="TOC10" href="#SEC10">AUTHOR</a>
<li><a name="TOC3" href="#SEC3">SUPPORT FOR COMPRESSED FILES</a>
<li><a name="TOC4" href="#SEC4">OPTIONS</a>
<li><a name="TOC5" href="#SEC5">ENVIRONMENT VARIABLES</a>
<li><a name="TOC6" href="#SEC6">NEWLINES</a>
<li><a name="TOC7" href="#SEC7">OPTIONS COMPATIBILITY</a>
<li><a name="TOC8" href="#SEC8">OPTIONS WITH DATA</a>
<li><a name="TOC9" href="#SEC9">MATCHING ERRORS</a>
<li><a name="TOC10" href="#SEC10">DIAGNOSTICS</a>
<li><a name="TOC11" href="#SEC11">SEE ALSO</a>
<li><a name="TOC12" href="#SEC12">AUTHOR</a>
<li><a name="TOC13" href="#SEC13">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
@@ -33,9 +36,9 @@ man page, in case the conversion went wrong.
<b>pcregrep</b> searches files for character patterns, in the same way as other
grep commands do, but it uses the PCRE regular expression library to support
patterns that are compatible with the regular expressions of Perl 5. See
<a href="pcrepattern.html"><b>pcrepattern</b></a>
for a full description of syntax and semantics of the regular expressions that
PCRE supports.
<a href="pcrepattern.html"><b>pcrepattern</b>(3)</a>
for a full description of syntax and semantics of the regular expressions
that PCRE supports.
</P>
<P>
Patterns, whether supplied on the command line or in a separate file, are given
@@ -45,9 +48,9 @@ without delimiters. For example:
</pre>
If you attempt to use delimiters (for example, by surrounding a pattern with
slashes, as is common in Perl scripts), they are interpreted as part of the
pattern. Quotes can of course be used on the command line because they are
interpreted by the shell, and indeed they are required if a pattern contains
white space or shell metacharacters.
pattern. Quotes can of course be used to delimit patterns on the command line
because they are interpreted by the shell, and indeed they are required if a
pattern contains white space or shell metacharacters.
</P>
<P>
The first argument that follows any option settings is treated as the single
@@ -63,23 +66,58 @@ For example:
<pre>
pcregrep some-pattern /file1 - /file3
</pre>
By default, each line that matches the pattern is copied to the standard
By default, each line that matches a pattern is copied to the standard
output, and if there is more than one file, the file name is output at the
start of each line. However, there are options that can change how
<b>pcregrep</b> behaves. In particular, the <b>-M</b> option makes it possible to
search for patterns that span line boundaries. What defines a line boundary is
controlled by the <b>-N</b> (<b>--newline</b>) option.
start of each line, followed by a colon. However, there are options that can
change how <b>pcregrep</b> behaves. In particular, the <b>-M</b> option makes it
possible to search for patterns that span line boundaries. What defines a line
boundary is controlled by the <b>-N</b> (<b>--newline</b>) option.
</P>
<P>
Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
BUFSIZ is defined in <b>&#60;stdio.h&#62;</b>.
BUFSIZ is defined in <b>&#60;stdio.h&#62;</b>. When there is more than one pattern
(specified by the use of <b>-e</b> and/or <b>-f</b>), each pattern is applied to
each line in the order in which they are defined, except that all the <b>-e</b>
patterns are tried before the <b>-f</b> patterns.
</P>
<P>
By default, as soon as one pattern matches (or fails to match when <b>-v</b> is
used), no further patterns are considered. However, if <b>--colour</b> (or
<b>--color</b>) is used to colour the matching substrings, or if
<b>--only-matching</b>, <b>--file-offsets</b>, or <b>--line-offsets</b> is used to
output only the part of the line that matched (either shown literally, or as an
offset), scanning resumes immediately following the match, so that further
matches on the same line can be found. If there are multiple patterns, they are
all tried on the remainder of the line, but patterns that follow the one that
matched are not tried on the earlier part of the line.
</P>
<P>
This is the same behaviour as GNU grep, but it does mean that the order in
which multiple patterns are specified can affect the output when one of the
above options is used.
</P>
<P>
Patterns that can match an empty string are accepted, but empty string
matches are not recognized. An example is the pattern "(super)?(man)?", in
which all components are optional. This pattern finds all occurrences of both
"super" and "man"; the output differs from matching with "super|man" when only
the matching substrings are being shown.
</P>
<P>
If the <b>LC_ALL</b> or <b>LC_CTYPE</b> environment variable is set,
<b>pcregrep</b> uses the value to set a locale when calling the PCRE library.
The <b>--locale</b> option can be used to override this.
</P>
<br><a name="SEC3" href="#TOC1">OPTIONS</a><br>
<br><a name="SEC3" href="#TOC1">SUPPORT FOR COMPRESSED FILES</a><br>
<P>
It is possible to compile <b>pcregrep</b> so that it uses <b>libz</b> or
<b>libbz2</b> to read files whose names end in <b>.gz</b> or <b>.bz2</b>,
respectively. You can find out whether your binary has support for one or both
of these file types by running it with the <b>--help</b> option. If the
appropriate support is not present, files are treated as plain text. The
standard input is always so treated.
</P>
<br><a name="SEC4" href="#TOC1">OPTIONS</a><br>
<P>
<b>--</b>
This terminates the list of options. It is useful if the next item on the
@@ -124,16 +162,21 @@ equals sign.
</P>
<P>
<b>--colour=</b><i>value</i>, <b>--color=</b><i>value</i>
This option specifies under what circumstances the part of a line that matched
a pattern should be coloured in the output. The value may be "never" (the
default), "always", or "auto". In the latter case, colouring happens only if
the standard output is connected to a terminal. The colour can be specified by
setting the environment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
of this variable should be a string of two numbers, separated by a semicolon.
They are copied directly into the control string for setting colour on a
terminal, so it is your responsibility to ensure that they make sense. If
neither of the environment variables is set, the default is "1;31", which gives
red.
This option specifies under what circumstances the parts of a line that matched
a pattern should be coloured in the output. By default, the output is not
coloured. The value (which is optional, see above) may be "never", "always", or
"auto". In the latter case, colouring happens only if the standard output is
connected to a terminal. More resources are used when colouring is enabled,
because <b>pcregrep</b> has to search for all possible matches in a line, not
just one, in order to colour them all.
</P>
<P>
The colour that is used can be specified by setting the environment variable
PCREGREP_COLOUR or PCREGREP_COLOR. The value of this variable should be a
string of two numbers, separated by a semicolon. They are copied directly into
the control string for setting colour on a terminal, so it is your
responsibility to ensure that they make sense. If neither of the environment
variables is set, the default is "1;31", which gives red.
</P>
<P>
<b>-D</b> <i>action</i>, <b>--devices=</b><i>action</i>
@@ -150,30 +193,43 @@ are read as if they were ordinary files. In some operating systems the effect
of reading a directory like this is an immediate end-of-file.
</P>
<P>
<b>-e</b> <i>pattern</i>, <b>--regex=</b><i>pattern</i>,
<b>--regexp=</b><i>pattern</i> Specify a pattern to be matched. This option can
be used multiple times in order to specify several patterns. It can also be
used as a way of specifying a single pattern that starts with a hyphen. When
<b>-e</b> is used, no argument pattern is taken from the command line; all
arguments are treated as file names. There is an overall maximum of 100
patterns. They are applied to each line in the order in which they are defined
until one matches (or fails to match if <b>-v</b> is used). If <b>-f</b> is used
with <b>-e</b>, the command line patterns are matched first, followed by the
patterns from the file, independent of the order in which these options are
specified. Note that multiple use of <b>-e</b> is not the same as a single
pattern with alternatives. For example, X|Y finds the first character in a line
that is X or Y, whereas if the two patterns are given separately,
<b>pcregrep</b> finds X if it is present, even if it follows Y in the line. It
finds Y only if there is no X in the line. This really matters only if you are
using <b>-o</b> to show the portion of the line that matched.
<b>-e</b> <i>pattern</i>, <b>--regex=</b><i>pattern</i>, <b>--regexp=</b><i>pattern</i>
Specify a pattern to be matched. This option can be used multiple times in
order to specify several patterns. It can also be used as a way of specifying a
single pattern that starts with a hyphen. When <b>-e</b> is used, no argument
pattern is taken from the command line; all arguments are treated as file
names. There is an overall maximum of 100 patterns. They are applied to each
line in the order in which they are defined until one matches (or fails to
match if <b>-v</b> is used). If <b>-f</b> is used with <b>-e</b>, the command line
patterns are matched first, followed by the patterns from the file, independent
of the order in which these options are specified. Note that multiple use of
<b>-e</b> is not the same as a single pattern with alternatives. For example,
X|Y finds the first character in a line that is X or Y, whereas if the two
patterns are given separately, <b>pcregrep</b> finds X if it is present, even if
it follows Y in the line. It finds Y only if there is no X in the line. This
really matters only if you are using <b>-o</b> to show the part(s) of the line
that matched.
</P>
<P>
<b>--exclude</b>=<i>pattern</i>
When <b>pcregrep</b> is searching the files in a directory as a consequence of
the <b>-r</b> (recursive search) option, any files whose names match the pattern
are excluded. The pattern is a PCRE regular expression. If a file name matches
both <b>--include</b> and <b>--exclude</b>, it is excluded. There is no short
form for this option.
the <b>-r</b> (recursive search) option, any regular files whose names match the
pattern are excluded. Subdirectories are not excluded by this option; they are
searched recursively, subject to the <b>--exclude_dir</b> and
<b>--include_dir</b> options. The pattern is a PCRE regular expression, and is
matched against the final component of the file name (not the entire path). If
a file name matches both <b>--include</b> and <b>--exclude</b>, it is excluded.
There is no short form for this option.
</P>
<P>
<b>--exclude_dir</b>=<i>pattern</i>
When <b>pcregrep</b> is searching the contents of a directory as a consequence
of the <b>-r</b> (recursive search) option, any subdirectories whose names match
the pattern are excluded. (Note that the <b>--exclude</b> option does not affect
subdirectories.) The pattern is a PCRE regular expression, and is matched
against the final component of the name (not the entire path). If a
subdirectory name matches both <b>--include_dir</b> and <b>--exclude_dir</b>, it
is excluded. There is no short form for this option.
</P>
<P>
<b>-F</b>, <b>--fixed-strings</b>
@@ -193,27 +249,37 @@ present; they are tested before the file's patterns. However, no other pattern
is taken from the command line; all arguments are treated as file names. There
is an overall maximum of 100 patterns. Trailing white space is removed from
each line, and blank lines are ignored. An empty file contains no patterns and
therefore matches nothing.
therefore matches nothing. See also the comments about multiple patterns versus
a single pattern with alternatives in the description of <b>-e</b> above.
</P>
<P>
<b>--file-offsets</b>
Instead of showing lines or parts of lines that match, show each match as an
offset from the start of the file and a length, separated by a comma. In this
mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b>
options are ignored. If there is more than one match in a line, each of them is
shown separately. This option is mutually exclusive with <b>--line-offsets</b>
and <b>--only-matching</b>.
</P>
<P>
<b>-H</b>, <b>--with-filename</b>
Force the inclusion of the filename at the start of output lines when searching
a single file. By default, the filename is not shown in this case. For matching
lines, the filename is followed by a colon and a space; for context lines, a
hyphen separator is used. If a line number is also being output, it follows the
file name without a space.
lines, the filename is followed by a colon; for context lines, a hyphen
separator is used. If a line number is also being output, it follows the file
name.
</P>
<P>
<b>-h</b>, <b>--no-filename</b>
Suppress the output filenames when searching multiple files. By default,
filenames are shown when multiple files are searched. For matching lines, the
filename is followed by a colon and a space; for context lines, a hyphen
separator is used. If a line number is also being output, it follows the file
name without a space.
filename is followed by a colon; for context lines, a hyphen separator is used.
If a line number is also being output, it follows the file name.
</P>
<P>
<b>--help</b>
Output a brief help message and exit.
Output a help message, giving brief details of the command options and file
type support, and then exit.
</P>
<P>
<b>-i</b>, <b>--ignore-case</b>
@@ -222,10 +288,23 @@ Ignore upper/lower case distinctions during comparisons.
<P>
<b>--include</b>=<i>pattern</i>
When <b>pcregrep</b> is searching the files in a directory as a consequence of
the <b>-r</b> (recursive search) option, only those files whose names match the
pattern are included. The pattern is a PCRE regular expression. If a file name
matches both <b>--include</b> and <b>--exclude</b>, it is excluded. There is no
short form for this option.
the <b>-r</b> (recursive search) option, only those regular files whose names
match the pattern are included. Subdirectories are always included and searched
recursively, subject to the <b>--include_dir</b> and <b>--exclude_dir</b>
options. The pattern is a PCRE regular expression, and is matched against the
final component of the file name (not the entire path). If a file name matches
both <b>--include</b> and <b>--exclude</b>, it is excluded. There is no short
form for this option.
</P>
<P>
<b>--include_dir</b>=<i>pattern</i>
When <b>pcregrep</b> is searching the contents of a directory as a consequence
of the <b>-r</b> (recursive search) option, only those subdirectories whose
names match the pattern are included. (Note that the <b>--include</b> option
does not affect subdirectories.) The pattern is a PCRE regular expression, and
is matched against the final component of the name (not the entire path). If a
subdirectory name matches both <b>--include_dir</b> and <b>--exclude_dir</b>, it
is excluded. There is no short form for this option.
</P>
<P>
<b>-L</b>, <b>--files-without-match</b>
@@ -247,6 +326,16 @@ are being output. If not supplied, "(standard input)" is used. There is no
short form for this option.
</P>
<P>
<b>--line-offsets</b>
Instead of showing lines or parts of lines that match, show each match as a
line number, the offset from the start of the line, and a length. The line
number is terminated by a colon (as usual; see the <b>-n</b> option), and the
offset and length are separated by a comma. In this mode, no context is shown.
That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are ignored. If there is
more than one match in a line, each of them is shown separately. This option is
mutually exclusive with <b>--file-offsets</b> and <b>--only-matching</b>.
</P>
<P>
<b>--locale</b>=<i>locale-name</i>
This option specifies a locale to be used for pattern matching. It overrides
the value in the <b>LC_ALL</b> or <b>LC_CTYPE</b> environment variables. If no
@@ -268,28 +357,41 @@ are guaranteed to be available for lookbehind assertions.
</P>
<P>
<b>-N</b> <i>newline-type</i>, <b>--newline=</b><i>newline-type</i>
The PCRE library supports three different character sequences for indicating
The PCRE library supports five different conventions for indicating
the ends of lines. They are the single-character sequences CR (carriage return)
and LF (linefeed), and the two-character sequence CR, LF. When the library is
built, a default line-ending sequence is specified. This is normally the
standard sequence for the operating system. Unless otherwise specified by this
option, <b>pcregrep</b> uses the default. The possible values for this option
are CR, LF, or CRLF. This makes it possible to use <b>pcregrep</b> on files that
have come from other environments without having to modify their line endings.
If the data that is being scanned does not agree with the convention set by
this option, <b>pcregrep</b> may behave in strange ways.
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
which recognizes any of the preceding three types, and an "any" convention, in
which any Unicode line ending sequence is assumed to end a line. The Unicode
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
PS (paragraph separator, U+2029).
<br>
<br>
When the PCRE library is built, a default line-ending sequence is specified.
This is normally the standard sequence for the operating system. Unless
otherwise specified by this option, <b>pcregrep</b> uses the library's default.
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
makes it possible to use <b>pcregrep</b> on files that have come from other
environments without having to modify their line endings. If the data that is
being scanned does not agree with the convention set by this option,
<b>pcregrep</b> may behave in strange ways.
</P>
<P>
<b>-n</b>, <b>--line-number</b>
Precede each output line by its line number in the file, followed by a colon
and a space for matching lines or a hyphen and a space for context lines. If
the filename is also being output, it precedes the line number.
for matching lines or a hyphen for context lines. If the filename is also being
output, it precedes the line number. This option is forced if
<b>--line-offsets</b> is used.
</P>
<P>
<b>-o</b>, <b>--only-matching</b>
Show only the part of the line that matched a pattern. In this mode, no
context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are
ignored.
ignored. If there is more than one match in a line, each of them is shown
separately. If <b>-o</b> is combined with <b>-v</b> (invert the sense of the
match to find non-matching lines), no output is generated, but the return code
is set appropriately. This option is mutually exclusive with
<b>--file-offsets</b> and <b>--line-offsets</b>.
</P>
<P>
<b>-q</b>, <b>--quiet</b>
@@ -332,20 +434,20 @@ Force the patterns to match only whole words. This is equivalent to having \b
at the start and end of the pattern.
</P>
<P>
<b>-x</b>, <b>--line-regex</b>, \fP--line-regexp\fP
<b>-x</b>, <b>--line-regex</b>, <b>--line-regexp</b>
Force the patterns to be anchored (each must start matching at the beginning of
a line) and in addition, require them to match entire lines. This is
equivalent to having ^ and $ characters at the start and end of each
alternative branch in every pattern.
</P>
<br><a name="SEC4" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
<br><a name="SEC5" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
<P>
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
order, for a locale. The first one that is set is used. This can be overridden
by the <b>--locale</b> option. If no locale is set, the PCRE library's default
(usually the "C" locale) is used.
</P>
<br><a name="SEC5" href="#TOC1">NEWLINES</a><br>
<br><a name="SEC6" href="#TOC1">NEWLINES</a><br>
<P>
The <b>-N</b> (<b>--newline</b>) option allows <b>pcregrep</b> to scan files with
different newline conventions from the default. However, the setting of this
@@ -354,7 +456,7 @@ the standard error and output streams. It uses the string "\n" in C
<b>printf()</b> calls to indicate newlines, relying on the C I/O library to
convert this to an appropriate sequence if the output is sent to a file.
</P>
<br><a name="SEC6" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
<br><a name="SEC7" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
<P>
The majority of short and long forms of <b>pcregrep</b>'s options are the same
as in the GNU <b>grep</b> program. Any long option of the form
@@ -362,7 +464,7 @@ as in the GNU <b>grep</b> program. Any long option of the form
(PCRE terminology). However, the <b>--locale</b>, <b>-M</b>, <b>--multiline</b>,
<b>-u</b>, and <b>--utf-8</b> options are specific to <b>pcregrep</b>.
</P>
<br><a name="SEC7" href="#TOC1">OPTIONS WITH DATA</a><br>
<br><a name="SEC8" href="#TOC1">OPTIONS WITH DATA</a><br>
<P>
There are four different ways in which an option with data can be specified.
If a short form option is used, the data may follow immediately, or in the next
@@ -389,7 +491,7 @@ for which the data is optional. If this option does have data, it must be given
in the first form, using an equals character. Otherwise it will be assumed that
it has no data.
</P>
<br><a name="SEC8" href="#TOC1">MATCHING ERRORS</a><br>
<br><a name="SEC9" href="#TOC1">MATCHING ERRORS</a><br>
<P>
It is possible to supply a regular expression that takes a very long time to
fail to match certain lines. Such patterns normally involve nested indefinite
@@ -399,7 +501,7 @@ in these circumstances. If this happens, <b>pcregrep</b> outputs an error
message and the line that caused the problem to the standard error stream. If
there are more than 20 such errors, <b>pcregrep</b> gives up.
</P>
<br><a name="SEC9" href="#TOC1">DIAGNOSTICS</a><br>
<br><a name="SEC10" href="#TOC1">DIAGNOSTICS</a><br>
<P>
Exit status is 0 if any matches were found, 1 if no matches were found, and 2
for syntax errors and non-existent or inaccessible files (even if matches were
@@ -407,18 +509,25 @@ found in other files) or too many matching errors. Using the <b>-s</b> option to
suppress error messages about inaccessible files does not affect the return
code.
</P>
<br><a name="SEC10" href="#TOC1">AUTHOR</a><br>
<br><a name="SEC11" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcrepattern</b>(3), <b>pcretest</b>(1).
</P>
<br><a name="SEC12" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QG, England.
</P>
<P>
Last updated: 06 June 2006
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
<P>
Last updated: 01 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -16,9 +16,11 @@ man page, in case the conversion went wrong.
<li><a name="TOC1" href="#SEC1">PCRE MATCHING ALGORITHMS</a>
<li><a name="TOC2" href="#SEC2">REGULAR EXPRESSIONS AS TREES</a>
<li><a name="TOC3" href="#SEC3">THE STANDARD MATCHING ALGORITHM</a>
<li><a name="TOC4" href="#SEC4">THE DFA MATCHING ALGORITHM</a>
<li><a name="TOC5" href="#SEC5">ADVANTAGES OF THE DFA ALGORITHM</a>
<li><a name="TOC6" href="#SEC6">DISADVANTAGES OF THE DFA ALGORITHM</a>
<li><a name="TOC4" href="#SEC4">THE ALTERNATIVE MATCHING ALGORITHM</a>
<li><a name="TOC5" href="#SEC5">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a>
<li><a name="TOC6" href="#SEC6">DISADVANTAGES OF THE ALTERNATIVE ALGORITHM</a>
<li><a name="TOC7" href="#SEC7">AUTHOR</a>
<li><a name="TOC8" href="#SEC8">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE MATCHING ALGORITHMS</a><br>
<P>
@@ -46,7 +48,7 @@ is matched against the string
&#60;something&#62; &#60;something else&#62; &#60;something further&#62;
</pre>
there are three possible answers. The standard algorithm finds only one of
them, whereas the DFA algorithm finds all three.
them, whereas the alternative algorithm finds all three.
</P>
<br><a name="SEC2" href="#TOC1">REGULAR EXPRESSIONS AS TREES</a><br>
<P>
@@ -59,8 +61,8 @@ correspond to the two matching algorithms provided by PCRE.
</P>
<br><a name="SEC3" href="#TOC1">THE STANDARD MATCHING ALGORITHM</a><br>
<P>
In the terminology of Jeffrey Friedl's book \fIMastering Regular
Expressions\fP, the standard algorithm is an "NFA algorithm". It conducts a
In the terminology of Jeffrey Friedl's book "Mastering Regular
Expressions", the standard algorithm is an "NFA algorithm". It conducts a
depth-first search of the pattern tree. That is, it proceeds along a single
path through the tree, checking that the subject matches what is required. When
there is a mismatch, the algorithm tries any alternatives at the current point,
@@ -83,14 +85,15 @@ straightforward for this algorithm to keep track of the substrings that are
matched by portions of the pattern in parentheses. This provides support for
capturing parentheses and back references.
</P>
<br><a name="SEC4" href="#TOC1">THE DFA MATCHING ALGORITHM</a><br>
<br><a name="SEC4" href="#TOC1">THE ALTERNATIVE MATCHING ALGORITHM</a><br>
<P>
DFA stands for "deterministic finite automaton", but you do not need to
understand the origins of that name. This algorithm conducts a breadth-first
search of the tree. Starting from the first matching point in the subject, it
scans the subject string from left to right, once, character by character, and
as it does this, it remembers all the paths through the tree that represent
valid matches.
This algorithm conducts a breadth-first search of the tree. Starting from the
first matching point in the subject, it scans the subject string from left to
right, once, character by character, and as it does this, it remembers all the
paths through the tree that represent valid matches. In Friedl's terminology,
this is a kind of "DFA algorithm", though it is not implemented as a
traditional finite state machine (it keeps multiple states active
simultaneously).
</P>
<P>
The scan continues until either the end of the subject is reached, or there are
@@ -114,12 +117,21 @@ matches that start at later positions.
</P>
<P>
There are a number of features of PCRE regular expressions that are not
supported by the DFA matching algorithm. They are as follows:
supported by the alternative matching algorithm. They are as follows:
</P>
<P>
1. Because the algorithm finds all possible matches, the greedy or ungreedy
nature of repetition quantifiers is not relevant. Greedy and ungreedy
quantifiers are treated in exactly the same way.
quantifiers are treated in exactly the same way. However, possessive
quantifiers can make a difference when what follows could also match what is
quantified, for example in a pattern like this:
<pre>
^a++\w!
</pre>
This pattern matches "aaab!" but not "aaa!", which would be matched by a
non-possessive quantifier. Similarly, if an atomic group is present, it is
matched as if it were a standalone pattern at the current point, and the
longest match is then "locked in" for the rest of the overall pattern.
</P>
<P>
2. When dealing with multiple paths through the tree simultaneously, it is not
@@ -133,22 +145,30 @@ not supported, and cause errors if encountered.
</P>
<P>
4. For the same reason, conditional expressions that use a backreference as the
condition are not supported.
condition or test for a specific group recursion are not supported.
</P>
<P>
5. Callouts are supported, but the value of the <i>capture_top</i> field is
5. Because many paths through the tree may be active, the \K escape sequence,
which resets the start of the match when encountered (but may be on some paths
and not on others), is not supported. It causes an error if encountered.
</P>
<P>
6. Callouts are supported, but the value of the <i>capture_top</i> field is
always 1, and the value of the <i>capture_last</i> field is always -1.
</P>
<P>
6.
The \C escape sequence, which (in the standard algorithm) matches a single
byte, even in UTF-8 mode, is not supported because the DFA algorithm moves
through the subject string one character at a time, for all active paths
7. The \C escape sequence, which (in the standard algorithm) matches a single
byte, even in UTF-8 mode, is not supported because the alternative algorithm
moves through the subject string one character at a time, for all active paths
through the tree.
</P>
<br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE DFA ALGORITHM</a><br>
<P>
Using the DFA matching algorithm provides the following advantages:
8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
supported. (*FAIL) is supported, and behaves like a failing negative assertion.
</P>
<br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
<P>
Using the alternative matching algorithm provides the following advantages:
</P>
<P>
1. All possible matches (at a single point in the subject) are automatically
@@ -159,17 +179,18 @@ callouts.
<P>
2. There is much better support for partial matching. The restrictions on the
content of the pattern that apply when using the standard algorithm for partial
matching do not apply to the DFA algorithm. For non-anchored patterns, the
starting position of a partial match is available.
matching do not apply to the alternative algorithm. For non-anchored patterns,
the starting position of a partial match is available.
</P>
<P>
3. Because the DFA algorithm scans the subject string just once, and never
needs to backtrack, it is possible to pass very long subject strings to the
matching function in several pieces, checking for partial matching each time.
3. Because the alternative algorithm scans the subject string just once, and
never needs to backtrack, it is possible to pass very long subject strings to
the matching function in several pieces, checking for partial matching each
time.
</P>
<br><a name="SEC6" href="#TOC1">DISADVANTAGES OF THE DFA ALGORITHM</a><br>
<br><a name="SEC6" href="#TOC1">DISADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
<P>
The DFA algorithm suffers from a number of disadvantages:
The alternative algorithm suffers from a number of disadvantages:
</P>
<P>
1. It is substantially slower than the standard algorithm. This is partly
@@ -180,13 +201,24 @@ less susceptible to optimization.
2. Capturing parentheses and back references are not supported.
</P>
<P>
3. The "atomic group" feature of PCRE regular expressions is supported, but
does not provide the advantage that it does for the standard algorithm.
3. Although atomic groups are supported, their use does not provide the
performance advantage that it does for the standard algorithm.
</P>
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
<P>
Last updated: 06 June 2006
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
<P>
Last updated: 19 April 2008
<br>
Copyright &copy; 1997-2008 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -17,6 +17,8 @@ man page, in case the conversion went wrong.
<li><a name="TOC2" href="#SEC2">RESTRICTED PATTERNS FOR PCRE_PARTIAL</a>
<li><a name="TOC3" href="#SEC3">EXAMPLE OF PARTIAL MATCHING USING PCRETEST</a>
<li><a name="TOC4" href="#SEC4">MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PARTIAL MATCHING IN PCRE</a><br>
<P>
@@ -90,6 +92,8 @@ envisaged for this facility, this is not felt to be a major restriction.
<P>
If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
<b>pcre_exec()</b> returns the error code PCRE_ERROR_BADPARTIAL (-13).
You can use the PCRE_INFO_OKPARTIAL call to <b>pcre_fullinfo()</b> to find out
if a compiled pattern can be used for partial matching.
</P>
<br><a name="SEC3" href="#TOC1">EXAMPLE OF PARTIAL MATCHING USING PCRETEST</a><br>
<P>
@@ -112,8 +116,9 @@ uses the date example quoted above:
</pre>
The first data string is matched completely, so <b>pcretest</b> shows the
matched substrings. The remaining four strings do not match the complete
pattern, but the first two are partial matches. The same test, using DFA
matching (by means of the \D escape sequence), produces the following output:
pattern, but the first two are partial matches. The same test, using
<b>pcre_dfa_exec()</b> matching (by means of the \D escape sequence), produces
the following output:
<pre>
re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data&#62; 25jun04\P\D
@@ -134,11 +139,11 @@ available.
<P>
When a partial match has been found using <b>pcre_dfa_exec()</b>, it is possible
to continue the match by providing additional subject data and calling
<b>pcre_dfa_exec()</b> again with the PCRE_DFA_RESTART option and the same
working space (where details of the previous partial match are stored). Here is
an example using <b>pcretest</b>, where the \R escape sequence sets the
PCRE_DFA_RESTART option and the \D escape sequence requests the use of
<b>pcre_dfa_exec()</b>:
<b>pcre_dfa_exec()</b> again with the same compiled regular expression, this
time setting the PCRE_DFA_RESTART option. You must also pass the same working
space as before, because this is where details of the previous partial match
are stored. Here is an example using <b>pcretest</b>, using the \R escape
sequence to set the PCRE_DFA_RESTART option (\P and \D are as above):
<pre>
re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data&#62; 23ja\P\D
@@ -153,9 +158,10 @@ not retain the previously partially-matched string. It is up to the calling
program to do that if it needs to.
</P>
<P>
This facility can be used to pass very long subject strings to
<b>pcre_dfa_exec()</b>. However, some care is needed for certain types of
pattern.
You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
over multiple segments. This facility can be used to pass very long subject
strings to <b>pcre_dfa_exec()</b>. However, some care is needed for certain
types of pattern.
</P>
<P>
1. If the pattern contains tests for the beginning or end of a line, you need
@@ -165,7 +171,7 @@ subject string for any call does not contain the beginning or end of a line.
<P>
2. If the pattern contains backward assertions (including \b or \B), you need
to arrange for some overlap in the subject strings to allow for this. For
example, you could pass the subject in chunks that were 500 bytes long, but in
example, you could pass the subject in chunks that are 500 bytes long, but in
a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
bytes at the start of the buffer.
</P>
@@ -174,7 +180,7 @@ bytes at the start of the buffer.
always produce exactly the same result as matching over one single long string.
The difference arises when there are multiple matching possibilities, because a
partial match result is given only when there are no completed matches in a
call to fBpcre_dfa_exec()\fP. This means that as soon as the shortest match has
call to <b>pcre_dfa_exec()</b>. This means that as soon as the shortest match has
been found, continuation to a new subject segment is no longer possible.
Consider this <b>pcretest</b> example:
<pre>
@@ -216,10 +222,21 @@ patterns or patterns such as:
</pre>
where no string can be a partial match for both alternatives.
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
Last updated: 16 January 2006
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 04 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

File diff suppressed because it is too large Load Diff

View File

@@ -16,13 +16,73 @@ man page, in case the conversion went wrong.
PCRE PERFORMANCE
</b><br>
<P>
Certain items that may appear in regular expression patterns are more efficient
Two aspects of performance are discussed below: memory usage and processing
time. The way you express your pattern as a regular expression can affect both
of them.
</P>
<br><b>
MEMORY USAGE
</b><br>
<P>
Patterns are compiled by PCRE into a reasonably efficient byte code, so that
most simple patterns do not use much memory. However, there is one case where
memory usage can be unexpectedly large. When a parenthesized subpattern has a
quantifier with a minimum greater than 1 and/or a limited maximum, the whole
subpattern is repeated in the compiled code. For example, the pattern
<pre>
(abc|def){2,4}
</pre>
is compiled as if it were
<pre>
(abc|def)(abc|def)((abc|def)(abc|def)?)?
</pre>
(Technical aside: It is done this way so that backtrack points within each of
the repetitions can be independently maintained.)
</P>
<P>
For regular expressions whose quantifiers use only small numbers, this is not
usually a problem. However, if the numbers are large, and particularly if such
repetitions are nested, the memory usage can become an embarrassment. For
example, the very simple pattern
<pre>
((ab){1,1000}c){1,3}
</pre>
uses 51K bytes when compiled. When PCRE is compiled with its default internal
pointer size of two bytes, the size limit on a compiled pattern is 64K, and
this is reached with the above pattern if the outer repetition is increased
from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
handle larger compiled patterns, but it is better to try to rewrite your
pattern to use less memory if you can.
</P>
<P>
One way of reducing the memory usage for such patterns is to make use of PCRE's
<a href="pcrepattern.html#subpatternsassubroutines">"subroutine"</a>
facility. Re-writing the above pattern as
<pre>
((ab)(?2){0,999}c)(?1){0,2}
</pre>
reduces the memory requirements to 18K, and indeed it remains under 20K even
with the outer repetition increased to 100. However, this pattern is not
exactly equivalent, because the "subroutine" calls are treated as
<a href="pcrepattern.html#atomicgroup">atomic groups</a>
into which there can be no backtracking if there is a subsequent matching
failure. Therefore, PCRE cannot do this kind of rewriting automatically.
Furthermore, there is a noticeable loss of speed when executing the modified
pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
speed is acceptable, this kind of rewriting will allow you to process patterns
that PCRE cannot otherwise handle.
</P>
<br><b>
PROCESSING TIME
</b><br>
<P>
Certain items in regular expression patterns are processed more efficiently
than others. It is more efficient to use a character class like [aeiou] than a
set of alternatives such as (a|e|i|o|u). In general, the simplest construction
that provides the required behaviour is usually the most efficient. Jeffrey
Friedl's book contains a lot of useful general discussion about optimizing
regular expressions for efficient performance. This document contains a few
observations about PCRE.
set of single-character alternatives such as (a|e|i|o|u). In general, the
simplest construction that provides the required behaviour is usually the most
efficient. Jeffrey Friedl's book contains a lot of useful general discussion
about optimizing regular expressions for efficient performance. This document
contains a few observations about PCRE.
</P>
<P>
Using Unicode character properties (the \p, \P, and \X escapes) is slow,
@@ -58,14 +118,15 @@ Beware of patterns that contain nested indefinite repeats. These can take a
long time to run when applied to a string that does not match. Consider the
pattern fragment
<pre>
(a+)*
^(a+)*
</pre>
This can match "aaaa" in 33 different ways, and this number increases very
This can match "aaaa" in 16 different ways, and this number increases very
rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
times, and for each of those cases other than 0, the + repeats can match
times, and for each of those cases other than 0 or 4, the + repeats can match
different numbers of times.) When the remainder of the pattern is such that the
entire match is going to fail, PCRE has in principle to try every possible
variation, and this can take an extremely long time.
variation, and this can take an extremely long time, even for relatively short
strings.
</P>
<P>
An optimization catches some of the more simple cases such as
@@ -88,10 +149,25 @@ appreciable time with strings longer than about 20 characters.
In many cases, the solution to this kind of performance issue is to use an
atomic group or a possessive quantifier.
</P>
<br><b>
AUTHOR
</b><br>
<P>
Last updated: 28 February 2005
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 06 March 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
Copyright &copy; 1997-2005 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -21,6 +21,7 @@ man page, in case the conversion went wrong.
<li><a name="TOC6" href="#SEC6">ERROR MESSAGES</a>
<li><a name="TOC7" href="#SEC7">MEMORY USAGE</a>
<li><a name="TOC8" href="#SEC8">AUTHOR</a>
<li><a name="TOC9" href="#SEC9">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS OF POSIX API</a><br>
<P>
@@ -58,11 +59,11 @@ command for linking an application that uses them. Because the POSIX functions
call the native ones, it is also necessary to add <b>-lpcre</b>.
</P>
<P>
I have implemented only those option bits that can be reasonably mapped to PCRE
native options. In addition, the option REG_EXTENDED is defined with the value
zero. This has no effect, but since programs that are written to the POSIX
interface often use it, this makes it easier to slot in PCRE as a replacement
library. Other POSIX options are not even defined.
I have implemented only those POSIX option bits that can be reasonably mapped
to PCRE native options. In addition, the option REG_EXTENDED is defined with
the value zero. This has no effect, but since programs that are written to the
POSIX interface often use it, this makes it easier to slot in PCRE as a
replacement library. Other POSIX options are not even defined.
</P>
<P>
When PCRE is called via these functions, it is only the API that is POSIX-like
@@ -179,18 +180,36 @@ REG_NEWLINE action.
<br><a name="SEC5" href="#TOC1">MATCHING A PATTERN</a><br>
<P>
The function <b>regexec()</b> is called to match a compiled pattern <i>preg</i>
against a given <i>string</i>, which is terminated by a zero byte, subject to
the options in <i>eflags</i>. These can be:
against a given <i>string</i>, which is by default terminated by a zero byte
(but see REG_STARTEND below), subject to the options in <i>eflags</i>. These can
be:
<pre>
REG_NOTBOL
</pre>
The PCRE_NOTBOL option is set when calling the underlying PCRE matching
function.
<pre>
REG_NOTEMPTY
</pre>
The PCRE_NOTEMPTY option is set when calling the underlying PCRE matching
function. Note that REG_NOTEMPTY is not part of the POSIX standard. However,
setting this option can give more POSIX-like behaviour in some situations.
<pre>
REG_NOTEOL
</pre>
The PCRE_NOTEOL option is set when calling the underlying PCRE matching
function.
<pre>
REG_STARTEND
</pre>
The string is considered to start at <i>string</i> + <i>pmatch[0].rm_so</i> and
to have a terminating NUL located at <i>string</i> + <i>pmatch[0].rm_eo</i>
(there need not actually be a NUL at that location), regardless of the value of
<i>nmatch</i>. This is a BSD extension, compatible with but not specified by
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
intended to be portable to other systems. Note that a non-zero <i>rm_so</i> does
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
how it is matched.
</P>
<P>
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
@@ -231,14 +250,17 @@ memory, after which <i>preg</i> may no longer be used as a compiled expression.
<P>
Philip Hazel
<br>
University Computing Service,
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
Cambridge CB2 3QG, England.
</P>
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
<P>
Last updated: 16 January 2006
Last updated: 11 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -17,6 +17,8 @@ man page, in case the conversion went wrong.
<li><a name="TOC2" href="#SEC2">SAVING A COMPILED PATTERN</a>
<li><a name="TOC3" href="#SEC3">RE-USING A PRECOMPILED PATTERN</a>
<li><a name="TOC4" href="#SEC4">COMPATIBILITY WITH DIFFERENT PCRE RELEASES</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE PATTERNS</a><br>
<P>
@@ -32,7 +34,9 @@ tables, it is a little bit more complicated.
If you save compiled patterns to a file, you can copy them to a different host
and run them there. This works even if the new host has the opposite endianness
to the one on which the patterns were compiled. There may be a small
performance penalty, but it should be insignificant.
performance penalty, but it should be insignificant. However, compiling regular
expressions with one version of PCRE for use with a different version is not
guaranteed to work and may cause crashes.
</P>
<br><a name="SEC2" href="#TOC1">SAVING A COMPILED PATTERN</a><br>
<P>
@@ -120,21 +124,25 @@ usual way.
</P>
<br><a name="SEC4" href="#TOC1">COMPATIBILITY WITH DIFFERENT PCRE RELEASES</a><br>
<P>
The layout of the control block that is at the start of the data that makes up
a compiled pattern was changed for release 5.0. If you have any saved patterns
that were compiled with previous releases (not a facility that was previously
advertised), you will have to recompile them for release 5.0. However, from now
on, it should be possible to make changes in a compatible manner.
In general, it is safest to recompile all saved patterns when you update to a
new PCRE release, though not all updates actually require this. Recompiling is
definitely needed for release 7.2.
</P>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
Notwithstanding the above, if you have any saved patterns in UTF-8 mode that
use \p or \P that were compiled with any release up to and including 6.4, you
will have to recompile them for release 6.5 and above.
</P>
<P>
Last updated: 01 February 2006
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 13 June 2007
<br>
Copyright &copy; 1997-2007 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -72,10 +72,25 @@ need to add
</pre>
(for example) to the compile command to get round this problem.
</P>
<br><b>
AUTHOR
</b><br>
<P>
Last updated: 09 September 2004
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 23 January 2008
<br>
Copyright &copy; 1997-2008 University of Cambridge.
<br>
Copyright &copy; 1997-2004 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -63,7 +63,7 @@ frame for each matched character. For a long string, a lot of stack is
required. Consider now this rewritten pattern, which matches exactly the same
strings:
<pre>
([^&#60;]++|&#60;(?!inet))
([^&#60;]++|&#60;(?!inet))+
</pre>
This uses very much less stack, because runs of characters that do not contain
"&#60;" are "swallowed" in one item inside the parentheses. Recursion happens only
@@ -73,33 +73,30 @@ backtracking into the runs of non-"&#60;" characters, but that is not related to
stack usage.
</P>
<P>
This example shows that one way of avoiding stack problems when matching long
subject strings is to write repeated parenthesized subpatterns to match more
than one character whenever possible.
</P>
<br><b>
Compiling PCRE to use heap instead of stack
</b><br>
<P>
In environments where stack memory is constrained, you might want to compile
PCRE to use heap memory instead of stack for remembering back-up points. This
makes it run a lot more slowly, however. Details of how to do this are given in
the
<a href="pcrebuild.html"><b>pcrebuild</b></a>
documentation.
</P>
<P>
In Unix-like environments, there is not often a problem with the stack, though
the default limit on stack size varies from system to system. Values from 8Mb
to 64Mb are common. You can find your default limit by running the command:
<pre>
ulimit -s
</pre>
The effect of running out of stack is often SIGSEGV, though sometimes an error
message is given. You can normally increase the limit on stack size by code
such as this:
<pre>
struct rlimit rlim;
getrlimit(RLIMIT_STACK, &rlim);
rlim.rlim_cur = 100*1024*1024;
setrlimit(RLIMIT_STACK, &rlim);
</pre>
This reads the current limits (soft and hard) using <b>getrlimit()</b>, then
attempts to increase the soft limit to 100Mb using <b>setrlimit()</b>. You must
do this before calling <b>pcre_exec()</b>.
documentation. When built in this way, instead of using the stack, PCRE obtains
and frees memory by calling the functions that are pointed to by the
<b>pcre_stack_malloc</b> and <b>pcre_stack_free</b> variables. By default, these
point to <b>malloc()</b> and <b>free()</b>, but you can replace the pointers to
cause PCRE to use your own functions. Since the block sizes are always the
same, and are always freed in reverse order, it may be possible to implement
customized memory handlers that are more efficient than the standard functions.
</P>
<br><b>
Limiting PCRE's stack usage
</b><br>
<P>
PCRE has an internal counter that can be used to limit the depth of recursion,
and thus cause <b>pcre_exec()</b> to give an error code before it runs out of
@@ -116,12 +113,60 @@ As a very rough rule of thumb, you should reckon on about 500 bytes per
recursion. Thus, if you want to limit your stack usage to 8Mb, you
should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
support around 128000 recursions. The <b>pcretest</b> test program has a command
line option (<b>-S</b>) that can be used to increase its stack.
line option (<b>-S</b>) that can be used to increase the size of its stack.
</P>
<br><b>
Changing stack size in Unix-like systems
</b><br>
<P>
Last updated: 29 June 2006
In Unix-like environments, there is not often a problem with the stack unless
very long strings are involved, though the default limit on stack size varies
from system to system. Values from 8Mb to 64Mb are common. You can find your
default limit by running the command:
<pre>
ulimit -s
</pre>
Unfortunately, the effect of running out of stack is often SIGSEGV, though
sometimes a more explicit error message is given. You can normally increase the
limit on stack size by code such as this:
<pre>
struct rlimit rlim;
getrlimit(RLIMIT_STACK, &rlim);
rlim.rlim_cur = 100*1024*1024;
setrlimit(RLIMIT_STACK, &rlim);
</pre>
This reads the current limits (soft and hard) using <b>getrlimit()</b>, then
attempts to increase the soft limit to 100Mb using <b>setrlimit()</b>. You must
do this before calling <b>pcre_exec()</b>.
</P>
<br><b>
Changing stack size in Mac OS X
</b><br>
<P>
Using <b>setrlimit()</b>, as described above, should also work on Mac OS X. It
is also possible to set a stack size when linking a program. There is a
discussion about stack sizes in Mac OS X at this web site:
<a href="http://developer.apple.com/qa/qa2005/qa1419.html">http://developer.apple.com/qa/qa2005/qa1419.html</a>.
</P>
<br><b>
AUTHOR
</b><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 09 July 2008
<br>
Copyright &copy; 1997-2008 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -0,0 +1,473 @@
<html>
<head>
<title>pcresyntax specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcresyntax man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">PCRE REGULAR EXPRESSION SYNTAX SUMMARY</a>
<li><a name="TOC2" href="#SEC2">QUOTING</a>
<li><a name="TOC3" href="#SEC3">CHARACTERS</a>
<li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
<li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTY CODES FOR \p and \P</a>
<li><a name="TOC6" href="#SEC6">SCRIPT NAMES FOR \p AND \P</a>
<li><a name="TOC7" href="#SEC7">CHARACTER CLASSES</a>
<li><a name="TOC8" href="#SEC8">QUANTIFIERS</a>
<li><a name="TOC9" href="#SEC9">ANCHORS AND SIMPLE ASSERTIONS</a>
<li><a name="TOC10" href="#SEC10">MATCH POINT RESET</a>
<li><a name="TOC11" href="#SEC11">ALTERNATION</a>
<li><a name="TOC12" href="#SEC12">CAPTURING</a>
<li><a name="TOC13" href="#SEC13">ATOMIC GROUPS</a>
<li><a name="TOC14" href="#SEC14">COMMENT</a>
<li><a name="TOC15" href="#SEC15">OPTION SETTING</a>
<li><a name="TOC16" href="#SEC16">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
<li><a name="TOC17" href="#SEC17">BACKREFERENCES</a>
<li><a name="TOC18" href="#SEC18">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
<li><a name="TOC19" href="#SEC19">CONDITIONAL PATTERNS</a>
<li><a name="TOC20" href="#SEC20">BACKTRACKING CONTROL</a>
<li><a name="TOC21" href="#SEC21">NEWLINE CONVENTIONS</a>
<li><a name="TOC22" href="#SEC22">WHAT \R MATCHES</a>
<li><a name="TOC23" href="#SEC23">CALLOUTS</a>
<li><a name="TOC24" href="#SEC24">SEE ALSO</a>
<li><a name="TOC25" href="#SEC25">AUTHOR</a>
<li><a name="TOC26" href="#SEC26">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
<P>
The full syntax and semantics of the regular expressions that are supported by
PCRE are described in the
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation. This document contains just a quick-reference summary of the
syntax.
</P>
<br><a name="SEC2" href="#TOC1">QUOTING</a><br>
<P>
<pre>
\x where x is non-alphanumeric is a literal x
\Q...\E treat enclosed characters as literal
</PRE>
</P>
<br><a name="SEC3" href="#TOC1">CHARACTERS</a><br>
<P>
<pre>
\a alarm, that is, the BEL character (hex 07)
\cx "control-x", where x is any character
\e escape (hex 1B)
\f formfeed (hex 0C)
\n newline (hex 0A)
\r carriage return (hex 0D)
\t tab (hex 09)
\ddd character with octal code ddd, or backreference
\xhh character with hex code hh
\x{hhh..} character with hex code hhh..
</PRE>
</P>
<br><a name="SEC4" href="#TOC1">CHARACTER TYPES</a><br>
<P>
<pre>
. any character except newline;
in dotall mode, any character whatsoever
\C one byte, even in UTF-8 mode (best avoided)
\d a decimal digit
\D a character that is not a decimal digit
\h a horizontal whitespace character
\H a character that is not a horizontal whitespace character
\p{<i>xx</i>} a character with the <i>xx</i> property
\P{<i>xx</i>} a character without the <i>xx</i> property
\R a newline sequence
\s a whitespace character
\S a character that is not a whitespace character
\v a vertical whitespace character
\V a character that is not a vertical whitespace character
\w a "word" character
\W a "non-word" character
\X an extended Unicode sequence
</pre>
In PCRE, \d, \D, \s, \S, \w, and \W recognize only ASCII characters.
</P>
<br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTY CODES FOR \p and \P</a><br>
<P>
<pre>
C Other
Cc Control
Cf Format
Cn Unassigned
Co Private use
Cs Surrogate
L Letter
Ll Lower case letter
Lm Modifier letter
Lo Other letter
Lt Title case letter
Lu Upper case letter
L& Ll, Lu, or Lt
M Mark
Mc Spacing mark
Me Enclosing mark
Mn Non-spacing mark
N Number
Nd Decimal number
Nl Letter number
No Other number
P Punctuation
Pc Connector punctuation
Pd Dash punctuation
Pe Close punctuation
Pf Final punctuation
Pi Initial punctuation
Po Other punctuation
Ps Open punctuation
S Symbol
Sc Currency symbol
Sk Modifier symbol
Sm Mathematical symbol
So Other symbol
Z Separator
Zl Line separator
Zp Paragraph separator
Zs Space separator
</PRE>
</P>
<br><a name="SEC6" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
<P>
Arabic,
Armenian,
Balinese,
Bengali,
Bopomofo,
Braille,
Buginese,
Buhid,
Canadian_Aboriginal,
Carian,
Cham,
Cherokee,
Common,
Coptic,
Cuneiform,
Cypriot,
Cyrillic,
Deseret,
Devanagari,
Ethiopic,
Georgian,
Glagolitic,
Gothic,
Greek,
Gujarati,
Gurmukhi,
Han,
Hangul,
Hanunoo,
Hebrew,
Hiragana,
Inherited,
Kannada,
Katakana,
Kayah_Li,
Kharoshthi,
Khmer,
Lao,
Latin,
Lepcha,
Limbu,
Linear_B,
Lycian,
Lydian,
Malayalam,
Mongolian,
Myanmar,
New_Tai_Lue,
Nko,
Ogham,
Old_Italic,
Old_Persian,
Ol_Chiki,
Oriya,
Osmanya,
Phags_Pa,
Phoenician,
Rejang,
Runic,
Saurashtra,
Shavian,
Sinhala,
Sundanese,
Syloti_Nagri,
Syriac,
Tagalog,
Tagbanwa,
Tai_Le,
Tamil,
Telugu,
Thaana,
Thai,
Tibetan,
Tifinagh,
Ugaritic,
Vai,
Yi.
</P>
<br><a name="SEC7" href="#TOC1">CHARACTER CLASSES</a><br>
<P>
<pre>
[...] positive character class
[^...] negative character class
[x-y] range (can be used for hex characters)
[[:xxx:]] positive POSIX named set
[[:^xxx:]] negative POSIX named set
alnum alphanumeric
alpha alphabetic
ascii 0-127
blank space or tab
cntrl control character
digit decimal digit
graph printing, excluding space
lower lower case letter
print printing, including space
punct printing, excluding alphanumeric
space whitespace
upper upper case letter
word same as \w
xdigit hexadecimal digit
</pre>
In PCRE, POSIX character set names recognize only ASCII characters. You can use
\Q...\E inside a character class.
</P>
<br><a name="SEC8" href="#TOC1">QUANTIFIERS</a><br>
<P>
<pre>
? 0 or 1, greedy
?+ 0 or 1, possessive
?? 0 or 1, lazy
* 0 or more, greedy
*+ 0 or more, possessive
*? 0 or more, lazy
+ 1 or more, greedy
++ 1 or more, possessive
+? 1 or more, lazy
{n} exactly n
{n,m} at least n, no more than m, greedy
{n,m}+ at least n, no more than m, possessive
{n,m}? at least n, no more than m, lazy
{n,} n or more, greedy
{n,}+ n or more, possessive
{n,}? n or more, lazy
</PRE>
</P>
<br><a name="SEC9" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
<P>
<pre>
\b word boundary (only ASCII letters recognized)
\B not a word boundary
^ start of subject
also after internal newline in multiline mode
\A start of subject
$ end of subject
also before newline at end of subject
also before internal newline in multiline mode
\Z end of subject
also before newline at end of subject
\z end of subject
\G first matching position in subject
</PRE>
</P>
<br><a name="SEC10" href="#TOC1">MATCH POINT RESET</a><br>
<P>
<pre>
\K reset start of match
</PRE>
</P>
<br><a name="SEC11" href="#TOC1">ALTERNATION</a><br>
<P>
<pre>
expr|expr|expr...
</PRE>
</P>
<br><a name="SEC12" href="#TOC1">CAPTURING</a><br>
<P>
<pre>
(...) capturing group
(?&#60;name&#62;...) named capturing group (Perl)
(?'name'...) named capturing group (Perl)
(?P&#60;name&#62;...) named capturing group (Python)
(?:...) non-capturing group
(?|...) non-capturing group; reset group numbers for
capturing groups in each alternative
</PRE>
</P>
<br><a name="SEC13" href="#TOC1">ATOMIC GROUPS</a><br>
<P>
<pre>
(?&#62;...) atomic, non-capturing group
</PRE>
</P>
<br><a name="SEC14" href="#TOC1">COMMENT</a><br>
<P>
<pre>
(?#....) comment (not nestable)
</PRE>
</P>
<br><a name="SEC15" href="#TOC1">OPTION SETTING</a><br>
<P>
<pre>
(?i) caseless
(?J) allow duplicate names
(?m) multiline
(?s) single line (dotall)
(?U) default ungreedy (lazy)
(?x) extended (ignore white space)
(?-...) unset option(s)
</pre>
The following is recognized only at the start of a pattern or after one of the
newline-setting options with similar syntax:
<pre>
(*UTF8) set UTF-8 mode
</PRE>
</P>
<br><a name="SEC16" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
<P>
<pre>
(?=...) positive look ahead
(?!...) negative look ahead
(?&#60;=...) positive look behind
(?&#60;!...) negative look behind
</pre>
Each top-level branch of a look behind must be of a fixed length.
</P>
<br><a name="SEC17" href="#TOC1">BACKREFERENCES</a><br>
<P>
<pre>
\n reference by number (can be ambiguous)
\gn reference by number
\g{n} reference by number
\g{-n} relative reference by number
\k&#60;name&#62; reference by name (Perl)
\k'name' reference by name (Perl)
\g{name} reference by name (Perl)
\k{name} reference by name (.NET)
(?P=name) reference by name (Python)
</PRE>
</P>
<br><a name="SEC18" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
<P>
<pre>
(?R) recurse whole pattern
(?n) call subpattern by absolute number
(?+n) call subpattern by relative number
(?-n) call subpattern by relative number
(?&name) call subpattern by name (Perl)
(?P&#62;name) call subpattern by name (Python)
\g&#60;name&#62; call subpattern by name (Oniguruma)
\g'name' call subpattern by name (Oniguruma)
\g&#60;n&#62; call subpattern by absolute number (Oniguruma)
\g'n' call subpattern by absolute number (Oniguruma)
\g&#60;+n&#62; call subpattern by relative number (PCRE extension)
\g'+n' call subpattern by relative number (PCRE extension)
\g&#60;-n&#62; call subpattern by relative number (PCRE extension)
\g'-n' call subpattern by relative number (PCRE extension)
</PRE>
</P>
<br><a name="SEC19" href="#TOC1">CONDITIONAL PATTERNS</a><br>
<P>
<pre>
(?(condition)yes-pattern)
(?(condition)yes-pattern|no-pattern)
(?(n)... absolute reference condition
(?(+n)... relative reference condition
(?(-n)... relative reference condition
(?(&#60;name&#62;)... named reference condition (Perl)
(?('name')... named reference condition (Perl)
(?(name)... named reference condition (PCRE)
(?(R)... overall recursion condition
(?(Rn)... specific group recursion condition
(?(R&name)... specific recursion condition
(?(DEFINE)... define subpattern for reference
(?(assert)... assertion condition
</PRE>
</P>
<br><a name="SEC20" href="#TOC1">BACKTRACKING CONTROL</a><br>
<P>
The following act immediately they are reached:
<pre>
(*ACCEPT) force successful match
(*FAIL) force backtrack; synonym (*F)
</pre>
The following act only when a subsequent match failure causes a backtrack to
reach them. They all force a match failure, but they differ in what happens
afterwards. Those that advance the start-of-match point do so only if the
pattern is not anchored.
<pre>
(*COMMIT) overall failure, no advance of starting point
(*PRUNE) advance to next starting character
(*SKIP) advance start to current matching position
(*THEN) local failure, backtrack to next alternation
</PRE>
</P>
<br><a name="SEC21" href="#TOC1">NEWLINE CONVENTIONS</a><br>
<P>
These are recognized only at the very start of the pattern or after a
(*BSR_...) or (*UTF8) option.
<pre>
(*CR) carriage return only
(*LF) linefeed only
(*CRLF) carriage return followed by linefeed
(*ANYCRLF) all three of the above
(*ANY) any Unicode newline sequence
</PRE>
</P>
<br><a name="SEC22" href="#TOC1">WHAT \R MATCHES</a><br>
<P>
These are recognized only at the very start of the pattern or after a
(*...) option that sets the newline convention or UTF-8 mode.
<pre>
(*BSR_ANYCRLF) CR, LF, or CRLF
(*BSR_UNICODE) any Unicode newline sequence
</PRE>
</P>
<br><a name="SEC23" href="#TOC1">CALLOUTS</a><br>
<P>
<pre>
(?C) callout
(?Cn) callout with data n
</PRE>
</P>
<br><a name="SEC24" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcrepattern</b>(3), <b>pcreapi</b>(3), <b>pcrecallout</b>(3),
<b>pcrematching</b>(3), <b>pcre</b>(3).
</P>
<br><a name="SEC25" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
<P>
Last updated: 11 April 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -23,8 +23,11 @@ man page, in case the conversion went wrong.
<li><a name="TOC8" href="#SEC8">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a>
<li><a name="TOC9" href="#SEC9">RESTARTING AFTER A PARTIAL MATCH</a>
<li><a name="TOC10" href="#SEC10">CALLOUTS</a>
<li><a name="TOC11" href="#SEC11">SAVING AND RELOADING COMPILED PATTERNS</a>
<li><a name="TOC12" href="#SEC12">AUTHOR</a>
<li><a name="TOC11" href="#SEC11">NON-PRINTING CHARACTERS</a>
<li><a name="TOC12" href="#SEC12">SAVING AND RELOADING COMPILED PATTERNS</a>
<li><a name="TOC13" href="#SEC13">SEE ALSO</a>
<li><a name="TOC14" href="#SEC14">AUTHOR</a>
<li><a name="TOC15" href="#SEC15">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
@@ -43,6 +46,11 @@ documentation.
</P>
<br><a name="SEC2" href="#TOC1">OPTIONS</a><br>
<P>
<b>-b</b>
Behave as if each regex has the <b>/B</b> (show bytecode) modifier; the internal
form is output after compilation.
</P>
<P>
<b>-C</b>
Output the version number of the PCRE library, and all available information
about the optional features that are included, and then exit.
@@ -50,7 +58,8 @@ about the optional features that are included, and then exit.
<P>
<b>-d</b>
Behave as if each regex has the <b>/D</b> (debug) modifier; the internal
form is output after compilation.
form and information about the compiled pattern is output after compilation;
<b>-d</b> is equivalent to <b>-b -i</b>.
</P>
<P>
<b>-dfa</b>
@@ -59,11 +68,21 @@ alternative matching function, <b>pcre_dfa_exec()</b>, to be used instead of the
standard <b>pcre_exec()</b> function (more detail is given below).
</P>
<P>
<b>-help</b>
Output a brief summary of these options and then exit.
</P>
<P>
<b>-i</b>
Behave as if each regex has the <b>/I</b> modifier; information about the
compiled pattern is given after compilation.
</P>
<P>
<b>-M</b>
Behave as if each data line contains the \M escape sequence; this causes
PCRE to discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings by
calling <b>pcre_exec()</b> repeatedly with different limits.
</P>
<P>
<b>-m</b>
Output the size of each compiled pattern after it has been compiled. This is
equivalent to adding <b>/M</b> to each regular expression. For compatibility
@@ -72,9 +91,11 @@ with earlier versions of pcretest, <b>-s</b> is a synonym for <b>-m</b>.
<P>
<b>-o</b> <i>osize</i>
Set the number of elements in the output vector that is used when calling
<b>pcre_exec()</b> to be <i>osize</i>. The default value is 45, which is enough
for 14 capturing subexpressions. The vector size can be changed for individual
matching calls by including \O in the data line (see below).
<b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> to be <i>osize</i>. The default value
is 45, which is enough for 14 capturing subexpressions for <b>pcre_exec()</b> or
22 different matches for <b>pcre_dfa_exec()</b>. The vector size can be
changed for individual matching calls by including \O in the data line (see
below).
</P>
<P>
<b>-p</b>
@@ -96,7 +117,15 @@ megabytes.
Run each compile, study, and match many times with a timer, and output
resulting time per compile or match (in milliseconds). Do not set <b>-m</b> with
<b>-t</b>, because you will then get the size output a zillion times, and the
timing will be distorted.
timing will be distorted. You can control the number of iterations that are
used for timing by following <b>-t</b> with a number (as a separate item on the
command line). For example, "-t 1000" would iterate 1000 times. The default is
to iterate 500000 times.
</P>
<P>
<b>-tm</b>
This is like <b>-t</b> except that it times only the matching phase, not the
compile or study phases.
</P>
<br><a name="SEC3" href="#TOC1">DESCRIPTION</a><br>
<P>
@@ -107,6 +136,13 @@ stdout, and prompts for each line of input, using "re&#62;" to prompt for regula
expressions, and "data&#62;" to prompt for data lines.
</P>
<P>
When <b>pcretest</b> is built, a configuration option can specify that it should
be linked with the <b>libreadline</b> library. When this is done, if the input
is from a terminal, it is read using the <b>readline()</b> function. This
provides line-editing and history facilities. The output from the <b>-help</b>
option states whether or not <b>readline()</b> will be used.
</P>
<P>
The program handles any number of sets of input on a single input file. Each
set starts with a regular expression, and continues with any number of data
lines to be matched against the pattern.
@@ -114,8 +150,8 @@ lines to be matched against the pattern.
<P>
Each data line is matched separately and independently. If you want to do
multi-line matches, you have to use the \n escape sequence (or \r or \r\n,
depending on the newline setting) in a single line of input to encode the
newline characters. There is no limit on the length of data lines; the input
etc., depending on the newline setting) in a single line of input to encode the
newline sequences. There is no limit on the length of data lines; the input
buffer is automatically extended if it is too small.
</P>
<P>
@@ -168,20 +204,30 @@ effect as they do in Perl. For example:
The following table shows additional modifiers for setting PCRE options that do
not correspond to anything in Perl:
<pre>
<b>/A</b> PCRE_ANCHORED
<b>/C</b> PCRE_AUTO_CALLOUT
<b>/E</b> PCRE_DOLLAR_ENDONLY
<b>/f</b> PCRE_FIRSTLINE
<b>/J</b> PCRE_DUPNAMES
<b>/N</b> PCRE_NO_AUTO_CAPTURE
<b>/U</b> PCRE_UNGREEDY
<b>/X</b> PCRE_EXTRA
<b>/&#60;cr&#62;</b> PCRE_NEWLINE_CR
<b>/&#60;lf&#62;</b> PCRE_NEWLINE_LF
<b>/&#60;crlf&#62;</b> PCRE_NEWLINE_CRLF
<b>/A</b> PCRE_ANCHORED
<b>/C</b> PCRE_AUTO_CALLOUT
<b>/E</b> PCRE_DOLLAR_ENDONLY
<b>/f</b> PCRE_FIRSTLINE
<b>/J</b> PCRE_DUPNAMES
<b>/N</b> PCRE_NO_AUTO_CAPTURE
<b>/U</b> PCRE_UNGREEDY
<b>/X</b> PCRE_EXTRA
<b>/&#60;JS&#62;</b> PCRE_JAVASCRIPT_COMPAT
<b>/&#60;cr&#62;</b> PCRE_NEWLINE_CR
<b>/&#60;lf&#62;</b> PCRE_NEWLINE_LF
<b>/&#60;crlf&#62;</b> PCRE_NEWLINE_CRLF
<b>/&#60;anycrlf&#62;</b> PCRE_NEWLINE_ANYCRLF
<b>/&#60;any&#62;</b> PCRE_NEWLINE_ANY
<b>/&#60;bsr_anycrlf&#62;</b> PCRE_BSR_ANYCRLF
<b>/&#60;bsr_unicode&#62;</b> PCRE_BSR_UNICODE
</pre>
Those specifying line endings are literal strings as shown. Details of the
meanings of these PCRE options are given in the
Those specifying line ending sequences are literal strings as shown, but the
letters can be in either case. This example sets multiline matching with CRLF
as the line ending sequence:
<pre>
/^abc/m&#60;crlf&#62;
</pre>
Details of the meanings of these PCRE options are given in the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation.
</P>
@@ -220,6 +266,14 @@ the subject string. This is useful for tests where the subject contains
multiple copies of the same substring.
</P>
<P>
The <b>/B</b> modifier is a debugging feature. It requests that <b>pcretest</b>
output a representation of the compiled byte code after compilation. Normally
this information contains length and offset values; however, if <b>/Z</b> is
also present, this data is replaced by spaces. This is a special feature for
use in the automatic test scripts; it ensures that the same output is generated
for different internal link sizes.
</P>
<P>
The <b>/L</b> modifier must be followed directly by the name of a locale, for
example,
<pre>
@@ -238,10 +292,8 @@ so on). It does this by calling <b>pcre_fullinfo()</b> after compiling a
pattern. If the pattern is studied, the results of that are also output.
</P>
<P>
The <b>/D</b> modifier is a PCRE debugging feature, which also assumes <b>/I</b>.
It causes the internal form of compiled regular expressions to be output after
compilation. If the pattern was studied, the information returned is also
output.
The <b>/D</b> modifier is a PCRE debugging feature, and is equivalent to
<b>/BI</b>, that is, both the <b>/B</b> and the <b>/I</b> modifiers.
</P>
<P>
The <b>/F</b> modifier causes <b>pcretest</b> to flip the byte order of the
@@ -289,15 +341,15 @@ complicated features of PCRE. If you are just testing "ordinary" regular
expressions, you probably don't need any of these. The following escapes are
recognized:
<pre>
\a alarm (= BEL)
\b backspace
\e escape
\f formfeed
\n newline
\a alarm (BEL, \x07)
\b backspace (\x08)
\e escape (\x1b)
\f formfeed (\x0c)
\n newline (\x0a)
\qdd set the PCRE_MATCH_LIMIT limit to dd (any number of digits)
\r carriage return
\t tab
\v vertical tab
\r carriage return (\x0d)
\t tab (\x09)
\v vertical tab (\x0b)
\nnn octal character (up to 3 octal digits)
\xhh hexadecimal character (up to 2 hex digits)
\x{hh...} hexadecimal character, any number of digits in UTF-8 mode
@@ -331,11 +383,17 @@ recognized:
\&#60;cr&#62; pass the PCRE_NEWLINE_CR option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;lf&#62; pass the PCRE_NEWLINE_LF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;crlf&#62; pass the PCRE_NEWLINE_CRLF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;anycrlf&#62; pass the PCRE_NEWLINE_ANYCRLF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
\&#60;any&#62; pass the PCRE_NEWLINE_ANY option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
</pre>
The escapes that specify line endings are literal strings, exactly as shown.
A backslash followed by anything else just escapes the anything else. If the
very last character is a backslash, it is ignored. This gives a way of passing
an empty line as data, since a real empty line terminates the data input.
The escapes that specify line ending sequences are literal strings, exactly as
shown. No more than one newline setting should be present in any data line.
</P>
<P>
A backslash followed by anything else just escapes the anything else. If
the very last character is a backslash, it is ignored. This gives a way of
passing an empty line as data, since a real empty line terminates the data
input.
</P>
<P>
If \M is present, <b>pcretest</b> calls <b>pcre_exec()</b> several times, with
@@ -365,7 +423,10 @@ and \Z, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
The use of \x{hh...} to represent UTF-8 characters is not dependent on the use
of the <b>/8</b> modifier on the pattern. It is recognized always. There may be
any number of hexadecimal digits inside the braces. The result is from one to
six bytes, encoded according to the UTF-8 rules.
six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
valid Unicode code points, or indeed valid UTF-8 characters according to the
later rules in RFC 3629.
</P>
<br><a name="SEC6" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
<P>
@@ -398,7 +459,7 @@ respectively, and otherwise the PCRE negative error number. Here is an example
of an interactive <b>pcretest</b> run.
<pre>
$ pcretest
PCRE version 5.00 07-Sep-2004
PCRE version 7.0 30-Nov-2006
re&#62; /^abc(\d+)/
data&#62; abc123
@@ -407,11 +468,26 @@ of an interactive <b>pcretest</b> run.
data&#62; xyz
No match
</pre>
Note that unset capturing substrings that are not followed by one that is set
are not returned by <b>pcre_exec()</b>, and are not shown by <b>pcretest</b>. In
the following example, there are two capturing substrings, but when the first
data line is matched, the second, unset substring is not shown. An "internal"
unset substring is shown as "&#60;unset&#62;", as for the second data line.
<pre>
re&#62; /(a)|(b)/
data&#62; a
0: a
1: a
data&#62; b
0: b
1: &#60;unset&#62;
2: b
</pre>
If the strings contain any non-printing characters, they are output as \0x
escapes, or as \x{...} escapes if the <b>/8</b> modifier was present on the
pattern. If the pattern has the <b>/+</b> modifier, the output for substring 0
is followed by the rest of the subject string, identified by "0+" like
this:
pattern. See below for the definition of non-printing characters. If the
pattern has the <b>/+</b> modifier, the output for substring 0 is followed by
the rest of the subject string, identified by "0+" like this:
<pre>
re&#62; /cat/+
data&#62; cataract
@@ -441,10 +517,10 @@ length (that is, the return from the extraction function) is given in
parentheses after each string for <b>\C</b> and <b>\G</b>.
</P>
<P>
Note that while patterns can be continued over several lines (a plain "&#62;"
Note that whereas patterns can be continued over several lines (a plain "&#62;"
prompt is used for continuations), data lines may not. However newlines can be
included in data by means of the \n escape (or \r or \r\n for those newline
settings).
included in data by means of the \n escape (or \r, \r\n, etc., depending on
the newline sequence setting).
</P>
<br><a name="SEC8" href="#TOC1">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a><br>
<P>
@@ -463,7 +539,7 @@ the subject where there is at least one match. For example:
longest matching string is always given first (and numbered zero).
</P>
<P>
If \fB/g\P is present on the pattern, the search for further matches resumes
If <b>/g</b> is present on the pattern, the search for further matches resumes
at the end of the longest match. For example:
<pre>
re&#62; /(tang|tangerine|tan)/g
@@ -537,7 +613,19 @@ the
<a href="pcrecallout.html"><b>pcrecallout</b></a>
documentation.
</P>
<br><a name="SEC11" href="#TOC1">SAVING AND RELOADING COMPILED PATTERNS</a><br>
<br><a name="SEC11" href="#TOC1">NON-PRINTING CHARACTERS</a><br>
<P>
When <b>pcretest</b> is outputting text in the compiled version of a pattern,
bytes other than 32-126 are always treated as non-printing characters and are
therefore shown as hex escapes.
</P>
<P>
When <b>pcretest</b> is outputting text that is a matched part of a subject
string, it behaves in the same way, unless a different locale has been set for
the pattern (using the <b>/L</b> modifier). In this case, the <b>isprint()</b>
function is used to distinguish printing and non-printing characters.
</P>
<br><a name="SEC12" href="#TOC1">SAVING AND RELOADING COMPILED PATTERNS</a><br>
<P>
The facilities described in this section are not available when the POSIX
interface to PCRE is being used, that is, when the <b>/P</b> pattern modifier is
@@ -599,18 +687,26 @@ string using a reloaded pattern is likely to cause <b>pcretest</b> to crash.
Finally, if you attempt to load a file that is not in the correct format, the
result is undefined.
</P>
<br><a name="SEC12" href="#TOC1">AUTHOR</a><br>
<br><a name="SEC13" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcre</b>(3), <b>pcreapi</b>(3), <b>pcrecallout</b>(3), <b>pcrematching</b>(3),
<b>pcrepartial</b>(3), <b>pcrepattern</b>(3), <b>pcreprecompile</b>(3).
</P>
<br><a name="SEC14" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service,
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
Cambridge CB2 3QG, England.
</P>
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
<P>
Last updated: 29 June 2006
Last updated: 10 March 2009
<br>
Copyright &copy; 1997-2009 University of Cambridge.
<br>
Copyright &copy; 1997-2006 University of Cambridge.
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>

View File

@@ -0,0 +1,140 @@
<html>
<!-- This is a manually maintained file that is the root of the HTML version of
the PCRE documentation. When the HTML documents are built from the man
page versions, the entire doc/html directory is emptied, this file is then
copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
-->
<head>
<title>PCRE specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>Perl-compatible Regular Expressions (PCRE)</h1>
<p>
The HTML documentation for PCRE comprises the following pages:
</p>
<table>
<tr><td><a href="pcre.html">pcre</a></td>
<td>&nbsp;&nbsp;Introductory page</td></tr>
<tr><td><a href="pcre-config.html">pcre-config</a></td>
<td>&nbsp;&nbsp;Information about the installation configuration</td></tr>
<tr><td><a href="pcreapi.html">pcreapi</a></td>
<td>&nbsp;&nbsp;PCRE's native API</td></tr>
<tr><td><a href="pcrebuild.html">pcrebuild</a></td>
<td>&nbsp;&nbsp;Options for building PCRE</td></tr>
<tr><td><a href="pcrecallout.html">pcrecallout</a></td>
<td>&nbsp;&nbsp;The <i>callout</i> facility</td></tr>
<tr><td><a href="pcrecompat.html">pcrecompat</a></td>
<td>&nbsp;&nbsp;Compatibility with Perl</td></tr>
<tr><td><a href="pcrecpp.html">pcrecpp</a></td>
<td>&nbsp;&nbsp;The C++ wrapper for the PCRE library</td></tr>
<tr><td><a href="pcregrep.html">pcregrep</a></td>
<td>&nbsp;&nbsp;The <b>pcregrep</b> command</td></tr>
<tr><td><a href="pcrematching.html">pcrematching</a></td>
<td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
<tr><td><a href="pcrepartial.html">pcrepartial</a></td>
<td>&nbsp;&nbsp;Using PCRE for partial matching</td></tr>
<tr><td><a href="pcrepattern.html">pcrepattern</a></td>
<td>&nbsp;&nbsp;Specification of the regular expressions supported by PCRE</td></tr>
<tr><td><a href="pcreperform.html">pcreperform</a></td>
<td>&nbsp;&nbsp;Some comments on performance</td></tr>
<tr><td><a href="pcreposix.html">pcreposix</a></td>
<td>&nbsp;&nbsp;The POSIX API to the PCRE library</td></tr>
<tr><td><a href="pcreprecompile.html">pcreprecompile</a></td>
<td>&nbsp;&nbsp;How to save and re-use compiled patterns</td></tr>
<tr><td><a href="pcresample.html">pcresample</a></td>
<td>&nbsp;&nbsp;Description of the sample program</td></tr>
<tr><td><a href="pcrestack.html">pcrestack</a></td>
<td>&nbsp;&nbsp;Discussion of PCRE's stack usage</td></tr>
<tr><td><a href="pcresyntax.html">pcresyntax</a></td>
<td>&nbsp;&nbsp;Syntax quick-reference summary</td></tr>
<tr><td><a href="pcretest.html">pcretest</a></td>
<td>&nbsp;&nbsp;The <b>pcretest</b> command for testing PCRE</td></tr>
</table>
<p>
There are also individual pages that summarize the interface for each function
in the library:
</p>
<table>
<tr><td><a href="pcre_compile.html">pcre_compile</a></td>
<td>&nbsp;&nbsp;Compile a regular expression</td></tr>
<tr><td><a href="pcre_compile2.html">pcre_compile2</a></td>
<td>&nbsp;&nbsp;Compile a regular expression (alternate interface)</td></tr>
<tr><td><a href="pcre_config.html">pcre_config</a></td>
<td>&nbsp;&nbsp;Show build-time configuration options</td></tr>
<tr><td><a href="pcre_copy_named_substring.html">pcre_copy_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into given buffer</td></tr>
<tr><td><a href="pcre_copy_substring.html">pcre_copy_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into given buffer</td></tr>
<tr><td><a href="pcre_dfa_exec.html">pcre_dfa_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(DFA algorithm; <i>not</i> Perl compatible)</td></tr>
<tr><td><a href="pcre_exec.html">pcre_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(Perl compatible)</td></tr>
<tr><td><a href="pcre_free_substring.html">pcre_free_substring</a></td>
<td>&nbsp;&nbsp;Free extracted substring</td></tr>
<tr><td><a href="pcre_free_substring_list.html">pcre_free_substring_list</a></td>
<td>&nbsp;&nbsp;Free list of extracted substrings</td></tr>
<tr><td><a href="pcre_fullinfo.html">pcre_fullinfo</a></td>
<td>&nbsp;&nbsp;Extract information about a pattern</td></tr>
<tr><td><a href="pcre_get_named_substring.html">pcre_get_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into new memory</td></tr>
<tr><td><a href="pcre_get_stringnumber.html">pcre_get_stringnumber</a></td>
<td>&nbsp;&nbsp;Convert captured string name to number</td></tr>
<tr><td><a href="pcre_get_substring.html">pcre_get_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into new memory</td></tr>
<tr><td><a href="pcre_get_substring_list.html">pcre_get_substring_list</a></td>
<td>&nbsp;&nbsp;Extract all substrings into new memory</td></tr>
<tr><td><a href="pcre_info.html">pcre_info</a></td>
<td>&nbsp;&nbsp;Obsolete information extraction function</td></tr>
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
<tr><td><a href="pcre_refcount.html">pcre_refcount</a></td>
<td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
<tr><td><a href="pcre_study.html">pcre_study</a></td>
<td>&nbsp;&nbsp;Study a compiled pattern</td></tr>
<tr><td><a href="pcre_version.html">pcre_version</a></td>
<td>&nbsp;&nbsp;Return PCRE version and release date</td></tr>
</table>
</html>

View File

@@ -0,0 +1,73 @@
.TH PCRE-CONFIG 1
.SH NAME
pcre-config - program to return PCRE configuration
.SH SYNOPSIS
.rs
.sp
.B pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
.ti +5n
.B [--libs-posix] [--cflags] [--cflags-posix]
.
.
.SH DESCRIPTION
.rs
.sp
\fBpcre-config\fP returns the configuration of the installed PCRE
libraries and the options required to compile a program to use them.
.
.
.SH OPTIONS
.rs
.TP 10
\fB--prefix\fP
Writes the directory prefix used in the PCRE installation for architecture
independent files (\fI/usr\fP on many systems, \fI/usr/local\fP on some
systems) to the standard output.
.TP 10
\fB--exec-prefix\fP
Writes the directory prefix used in the PCRE installation for architecture
dependent files (normally the same as \fB--prefix\fP) to the standard output.
.TP 10
\fB--version\fP
Writes the version number of the installed PCRE libraries to the standard
output.
.TP 10
\fB--libs\fP
Writes to the standard output the command line options required to link
with PCRE (\fB-lpcre\fP on many systems).
.TP 10
\fB--libs-posix\fP
Writes to the standard output the command line options required to link with
the PCRE posix emulation library (\fB-lpcreposix\fP \fB-lpcre\fP on many
systems).
.TP 10
\fB--cflags\fP
Writes to the standard output the command line options required to compile
files that use PCRE (this may include some \fB-I\fP options, but is blank on
many systems).
.TP 10
\fB--cflags-posix\fP
Writes to the standard output the command line options required to compile
files that use the PCRE posix emulation library (this may include some \fB-I\fP
options, but is blank on many systems).
.
.
.SH "SEE ALSO"
.rs
.sp
\fBpcre(3)\fP
.
.
.SH AUTHOR
.rs
.sp
This manual page was originally written by Mark Baker for the Debian GNU/Linux
system. It has been slightly revised as a generic PCRE man page.
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 18 April 2007
.fi

View File

@@ -0,0 +1,67 @@
PCRE-CONFIG(1) PCRE-CONFIG(1)
NAME
pcre-config - program to return PCRE configuration
SYNOPSIS
pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
[--libs-posix] [--cflags] [--cflags-posix]
DESCRIPTION
pcre-config returns the configuration of the installed PCRE libraries
and the options required to compile a program to use them.
OPTIONS
--prefix Writes the directory prefix used in the PCRE installation for
architecture independent files (/usr on many systems,
/usr/local on some systems) to the standard output.
--exec-prefix
Writes the directory prefix used in the PCRE installation for
architecture dependent files (normally the same as --prefix)
to the standard output.
--version Writes the version number of the installed PCRE libraries to
the standard output.
--libs Writes to the standard output the command line options
required to link with PCRE (-lpcre on many systems).
--libs-posix
Writes to the standard output the command line options
required to link with the PCRE posix emulation library
(-lpcreposix -lpcre on many systems).
--cflags Writes to the standard output the command line options
required to compile files that use PCRE (this may include
some -I options, but is blank on many systems).
--cflags-posix
Writes to the standard output the command line options
required to compile files that use the PCRE posix emulation
library (this may include some -I options, but is blank on
many systems).
SEE ALSO
pcre(3)
AUTHOR
This manual page was originally written by Mark Baker for the Debian
GNU/Linux system. It has been slightly revised as a generic PCRE man
page.
REVISION
Last updated: 18 April 2007

View File

@@ -6,12 +6,18 @@ PCRE - Perl-compatible regular expressions
.sp
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl, with just a few
differences. The current implementation of PCRE (release 6.x) corresponds
approximately with Perl 5.8, including support for UTF-8 encoded strings and
Unicode general category properties. However, this support has to be explicitly
enabled; it is not the default.
differences. Certain features that appeared in Python and PCRE before they
appeared in Perl are also available using the Python syntax. There is also some
support for certain .NET and Oniguruma syntax items, and there is an option for
requesting some minor changes that give better JavaScript compatibility.
.P
In addition to the Perl-compatible matching function, PCRE also contains an
The current implementation of PCRE (release 7.x) corresponds approximately with
Perl 5.10, including support for UTF-8 encoded strings and Unicode general
category properties. However, UTF-8 and Unicode support has to be explicitly
enabled; it is not the default. The Unicode tables correspond to Unicode
release 5.1.
.P
In addition to the Perl-compatible matching function, PCRE contains an
alternative matching function that matches the same compiled patterns in a
different way. In certain circumstances, the alternative function has some
advantages. For a discussion of the two matching algorithms, see the
@@ -43,7 +49,11 @@ and
.\" HREF
\fBpcrecompat\fR
.\"
pages.
pages. There is a syntax summary in the
.\" HREF
\fBpcresyntax\fR
.\"
page.
.P
Some features of PCRE can be included, excluded, or changed when the library is
built. The
@@ -77,6 +87,7 @@ all the sections are concatenated, for ease of searching. The sections are as
follows:
.sp
pcre this document
pcre-config show PCRE installation configuration information
pcreapi details of PCRE's native C API
pcrebuild options for building PCRE
pcrecallout details of the callout feature
@@ -88,6 +99,7 @@ follows:
.\" JOIN
pcrepattern syntax and semantics of supported
regular expressions
pcresyntax quick syntax reference
pcreperform discussion of performance issues
pcreposix the POSIX-compatible C API
pcreprecompile details of saving and re-using precompiled patterns
@@ -114,18 +126,15 @@ distribution and the
\fBpcrebuild\fP
.\"
documentation for details). In these cases the limit is substantially larger.
However, the speed of execution will be slower.
However, the speed of execution is slower.
.P
All values in repeating quantifiers must be less than 65536. The maximum
compiled length of subpattern with an explicit repeat count is 30000 bytes. The
maximum number of capturing subpatterns is 65535.
All values in repeating quantifiers must be less than 65536.
.P
There is no limit to the number of non-capturing subpatterns, but the maximum
depth of nesting of all kinds of parenthesized subpattern, including capturing
subpatterns, assertions, and other types of subpattern, is 200.
There is no limit to the number of parenthesized subpatterns, but there can be
no more than 65535 capturing subpatterns.
.P
The maximum length of name for a named subpattern is 32, and the maximum number
of named subpatterns is 10000.
The maximum length of a name for a named subpattern is 32 characters, and the
maximum number of named subpatterns is 10000.
.P
The maximum length of a subject string is the largest positive number that an
integer variable can hold. However, when using the traditional matching
@@ -137,7 +146,7 @@ issues, see the
\fBpcrestack\fP
.\"
documentation.
.sp
.
.\" HTML <a name="utf8support"></a>
.
.
@@ -154,13 +163,14 @@ the code, and, in addition, you must call
.\" HREF
\fBpcre_compile()\fP
.\"
with the PCRE_UTF8 option flag. When you do this, both the pattern and any
subject strings that are matched against it are treated as UTF-8 strings
instead of just strings of bytes.
with the PCRE_UTF8 option flag, or the pattern must start with the sequence
(*UTF8). When either of these is the case, both the pattern and any subject
strings that are matched against it are treated as UTF-8 strings instead of
just strings of bytes.
.P
If you compile PCRE with UTF-8 support, but do not use it at run time, the
library will be a bit bigger, but the additional run time overhead is limited
to testing the PCRE_UTF8 flag in several places, so should not be very large.
to testing the PCRE_UTF8 flag occasionally, so should not be very big.
.P
If PCRE is built with Unicode character property support (which implies UTF-8
support), the escape sequences \ep{..}, \eP{..}, and \eX are supported.
@@ -175,46 +185,83 @@ documentation. Only the short names for properties are supported. For example,
\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
compatibility with Perl 5.6. PCRE does not support this.
.
.\" HTML <a name="utf8strings"></a>
.
.SS "Validity of UTF-8 strings"
.rs
.sp
When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
are (by default) checked for validity on entry to the relevant functions. From
release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
themselves derived from the Unicode specification. Earlier releases of PCRE
followed the rules of RFC 2279, which allows the full range of 31-bit values (0
to 0x7FFFFFFF). The current check allows only values in the range U+0 to
U+10FFFF, excluding U+D800 to U+DFFF.
.P
The following comments apply when PCRE is running in UTF-8 mode:
The excluded code points are the "Low Surrogate Area" of Unicode, of which the
Unicode Standard says this: "The Low Surrogate Area does not contain any
character assignments, consequently no character code charts or namelists are
provided for this area. Surrogates are reserved for use with UTF-16 and then
must be used in pairs." The code points that are encoded by UTF-16 pairs are
available as independent code points in the UTF-8 encoding. (In other words,
the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
UTF-8.)
.P
1. When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
are checked for validity on entry to the relevant functions. If an invalid
UTF-8 string is passed, an error return is given. In some situations, you may
already know that your strings are valid, and therefore want to skip these
checks in order to improve performance. If you set the PCRE_NO_UTF8_CHECK flag
at compile time or at run time, PCRE assumes that the pattern or subject it
is given (respectively) contains only valid UTF-8 codes. In this case, it does
not diagnose an invalid UTF-8 string. If you pass an invalid UTF-8 string to
PCRE when PCRE_NO_UTF8_CHECK is set, the results are undefined. Your program
may crash.
If an invalid UTF-8 string is passed to PCRE, an error return
(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
your strings are valid, and therefore want to skip these checks in order to
improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
at run time, PCRE assumes that the pattern or subject it is given
(respectively) contains only valid UTF-8 codes. In this case, it does not
diagnose an invalid UTF-8 string.
.P
2. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
happens depends on why the string is invalid. If the string conforms to the
"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
rules of RFC 2279. However, if the string does not even conform to RFC 2279,
the result is undefined. Your program may crash.
.P
If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
encoded in a UTF-8-like manner as per the old RFC, you can set
PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
situation, you will have to apply your own validity check.
.
.SS "General comments about UTF-8 mode"
.rs
.sp
1. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
UTF-8 character if the value is greater than 127.
.P
3. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
2. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
characters for values greater than \e177.
.P
4. Repeat quantifiers apply to complete UTF-8 characters, not to individual
3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
bytes, for example: \ex{100}{3}.
.P
5. The dot metacharacter matches one UTF-8 character instead of a single byte.
4. The dot metacharacter matches one UTF-8 character instead of a single byte.
.P
6. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
5. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
but its use can lead to some strange effects. This facility is not available in
the alternative matching function, \fBpcre_dfa_exec()\fP.
.P
7. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
test characters of any code value, but the characters that PCRE recognizes as
digits, spaces, or word characters remain the same set as before, all with
values less than 256. This remains true even when PCRE includes Unicode
property support, because to do otherwise would slow down PCRE in many common
cases. If you really want to test for a wider sense of, say, "digit", you
must use Unicode property tests such as \ep{Nd}.
must use Unicode property tests such as \ep{Nd}. Note that this also applies to
\eb, because it is defined in terms of \ew and \eW.
.P
8. Similarly, characters that match the POSIX named character classes are all
7. Similarly, characters that match the POSIX named character classes are all
low-valued characters.
.P
8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
(\eh, \eH, \ev, and \eV) do match all the appropriate Unicode characters.
.P
9. Case-insensitive matching applies only to characters whose values are less
than 128, unless PCRE is built with Unicode property support. Even when Unicode
property support is available, PCRE still uses its own character tables when
@@ -225,20 +272,25 @@ case-insensitive matching only when there is a one-to-one mapping between a
letter's cases. There are a small number of many-to-one mappings in Unicode;
these are not supported by PCRE.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
.br
University Computing Service,
.br
Cambridge CB2 3QG, England.
University Computing Service
Cambridge CB2 3QH, England.
.fi
.P
Putting an actual email address here seems to have been a spam magnet, so I've
taken it away. If you want to email me, use my initial and surname, separated
by a dot, at the domain ucs.cam.ac.uk.
taken it away. If you want to email me, use my two initials, followed by the
two digits 10, at the domain cam.ac.uk.
.
.
.SH REVISION
.rs
.sp
.in 0
Last updated: 05 June 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
.nf
Last updated: 11 April 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

File diff suppressed because it is too large Load Diff

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
.ti +5n
.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
@@ -17,8 +16,9 @@ PCRE - Perl-compatible regular expressions
.SH DESCRIPTION
.rs
.sp
This function compiles a regular expression into an internal form. Its
arguments are:
This function compiles a regular expression into an internal form. It is the
same as \fBpcre_compile2()\fP, except for the absence of the \fIerrorcodeptr\fP
argument. Its arguments are:
.sp
\fIpattern\fR A zero-terminated string containing the
regular expression to be compiled
@@ -30,33 +30,41 @@ arguments are:
.sp
The option bits are:
.sp
PCRE_ANCHORED Force pattern anchoring
PCRE_AUTO_CALLOUT Compile automatic callouts
PCRE_CASELESS Do caseless matching
PCRE_DOLLAR_ENDONLY $ not to match newline at end
PCRE_DOTALL . matches anything including NL
PCRE_DUPNAMES Allow duplicate names for subpatterns
PCRE_EXTENDED Ignore whitespace and # comments
PCRE_EXTRA PCRE extra features
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
theses (named ones available)
PCRE_UNGREEDY Invert greediness of quantifiers
PCRE_UTF8 Run in UTF-8 mode
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
validity (only relevant if
PCRE_UTF8 is set)
PCRE_ANCHORED Force pattern anchoring
PCRE_AUTO_CALLOUT Compile automatic callouts
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \eR matches all Unicode line endings
PCRE_CASELESS Do caseless matching
PCRE_DOLLAR_ENDONLY $ not to match newline at end
PCRE_DOTALL . matches anything including NL
PCRE_DUPNAMES Allow duplicate names for subpatterns
PCRE_EXTENDED Ignore whitespace and # comments
PCRE_EXTRA PCRE extra features
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
theses (named ones available)
PCRE_UNGREEDY Invert greediness of quantifiers
PCRE_UTF8 Run in UTF-8 mode
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
validity (only relevant if
PCRE_UTF8 is set)
.sp
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
PCRE_NO_UTF8_CHECK.
.P
The yield of the function is a pointer to a private data structure that
contains the compiled pattern, or NULL if an error was detected.
contains the compiled pattern, or NULL if an error was detected. Note that
compiling regular expressions with one version of PCRE for use with a different
version is not guaranteed to work and may cause crashes.
.P
There is a complete description of the PCRE native API in the
.\" HREF

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
.ti +5n
.B int *\fIerrorcodeptr\fP,
@@ -46,6 +45,8 @@ The option bits are:
(not much use currently)
PCRE_FIRSTLINE Force matching to be before newline
PCRE_MULTILINE ^ and $ match newlines within data
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
@@ -61,7 +62,9 @@ PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
PCRE_NO_UTF8_CHECK.
.P
The yield of the function is a pointer to a private data structure that
contains the compiled pattern, or NULL if an error was detected.
contains the compiled pattern, or NULL if an error was detected. Note that
compiling regular expressions with one version of PCRE for use with a different
version is not guaranteed to work and may cause crashes.
.P
There is a complete description of the PCRE native API in the
.\" HREF

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
.
.SH DESCRIPTION
@@ -26,7 +25,15 @@ The available codes are:
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
PCRE_CONFIG_MATCH_LIMIT_RECURSION
Internal recursion depth limit
PCRE_CONFIG_NEWLINE Value of the newline sequence
PCRE_CONFIG_NEWLINE Value of the default newline sequence:
13 (0x000d) for CR
10 (0x000a) for LF
3338 (0x0d0a) for CRLF
-2 for ANYCRLF
-1 for ANY
PCRE_CONFIG_BSR Indicates what \eR matches by default:
0 all Unicode line endings
1 CR, LF, or CRLF only
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
Threshold of return slots, above
which \fBmalloc()\fR is used by

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIsubject\fP, int *\fIovector\fP,

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
@@ -27,7 +26,7 @@ buffer. The arguments are:
\fIbuffer\fP Buffer to receive the string
\fIbuffersize\fP Size of buffer
.sp
The yield is the legnth of the string, PCRE_ERROR_NOMEMORY if the buffer was
The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
.P
There is a complete description of the PCRE native API in the

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
@@ -20,9 +19,9 @@ PCRE - Perl-compatible regular expressions
.rs
.sp
This function matches a compiled regular expression against a given subject
string, using a DFA matching algorithm (\fInot\fP Perl-compatible). Note that
the main, Perl-compatible, matching function is \fBpcre_exec()\fP. The
arguments for this function are:
string, using an alternative matching algorithm that scans the subject string
just once (\fInot\fP Perl-compatible). Note that the main, Perl-compatible,
matching function is \fBpcre_exec()\fP. The arguments for this function are:
.sp
\fIcode\fP Points to the compiled pattern
\fIextra\fP Points to an associated \fBpcre_extra\fP structure,
@@ -40,12 +39,17 @@ arguments for this function are:
The options are:
.sp
PCRE_ANCHORED Match only at the first position
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \eR matches all Unicode line endings
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NOTBOL Subject is not the beginning of a line
PCRE_NOTEOL Subject is not the end of a line
PCRE_NOTEMPTY An empty string is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
validity (only relevant if PCRE_UTF8
was set at compile time)
@@ -53,8 +57,8 @@ The options are:
PCRE_DFA_SHORTEST Return only the shortest match
PCRE_DFA_RESTART This is a restart after a partial match
.sp
There are restrictions on what may appear in a pattern when matching using the
DFA algorithm is requested. Details are given in the
There are restrictions on what may appear in a pattern when using this matching
function. Details are given in the
.\" HREF
\fBpcrematching\fP
.\"
@@ -71,7 +75,7 @@ A \fBpcre_extra\fP structure contains the following fields:
.sp
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
PCRE_EXTRA_TABLES. For DFA matching, the \fImatch_limit\fP and
PCRE_EXTRA_TABLES. For this matching function, the \fImatch_limit\fP and
\fImatch_limit_recursion\fP fields are not used, and must not be set.
.P
There is a complete description of the PCRE native API in the

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
@@ -35,19 +34,28 @@ offsets to captured substrings. Its arguments are:
The options are:
.sp
PCRE_ANCHORED Match only at the first position
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
PCRE_BSR_UNICODE \eR matches all Unicode line endings
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
PCRE_NEWLINE_CR Set CR as the newline sequence
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
PCRE_NEWLINE_LF Set LF as the newline sequence
PCRE_NOTBOL Subject is not the beginning of a line
PCRE_NOTEOL Subject is not the end of a line
PCRE_NOTEMPTY An empty string is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
validity (only relevant if PCRE_UTF8
was set at compile time)
PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
.sp
There are restrictions on what may appear in a pattern when partial matching is
requested.
requested. For details, see the
.\" HREF
\fBpcrepartial\fP
.\"
page.
.P
A \fBpcre_extra\fP structure contains the following fields:
.sp

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B void pcre_free_substring(const char *\fIstringptr\fP);
.
.SH DESCRIPTION

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B void pcre_free_substring_list(const char **\fIstringptr\fP);
.
.SH DESCRIPTION

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B int \fIwhat\fP, void *\fIwhere\fP);
@@ -31,13 +30,14 @@ The following information is available:
-1 for start of string
or after newline, or
-2 otherwise
PCRE_INFO_FIRSTTABLE Table of first bytes
(after studying)
PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE_INFO_LASTLITERAL Literal last byte required
PCRE_INFO_NAMECOUNT Number of named subpatterns
PCRE_INFO_NAMEENTRYSIZE Size of name table entry
PCRE_INFO_NAMETABLE Pointer to name table
PCRE_INFO_OPTIONS Options used for compilation
PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
PCRE_INFO_OPTIONS Option bits used for compilation
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
.sp

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_get_named_substring(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIsubject\fP, int *\fIovector\fP,
@@ -30,9 +29,10 @@ arguments are:
\fIstringptr\fP Where to put the string pointer
.sp
The memory in which the substring is placed is obtained by calling
\fBpcre_malloc()\fP. The yield of the function is the length of the extracted
substring, PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained, or
PCRE_ERROR_NOSUBSTRING if the string name is invalid.
\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
be used to free it when it is no longer needed. The yield of the function is
the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
.P
There is a complete description of the PCRE native API in the
.\" HREF

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIname\fP);
@@ -22,7 +21,10 @@ parenthesis in a compiled pattern. Its arguments are:
\fIname\fP Name whose number is required
.sp
The yield of the function is the number of the parenthesis if the name is
found, or PCRE_ERROR_NOSUBSTRING otherwise.
found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
\fBpcre_get_stringnumber()\fP. You can obtain the complete list by calling
\fBpcre_get_stringtable_entries()\fP.
.P
There is a complete description of the PCRE native API in the
.\" HREF

View File

@@ -1,4 +1,4 @@
.TH PCRE_GET_STRINGNUMBER 3
.TH PCRE_GET_STRINGTABLE_ENTRIES 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
@@ -34,7 +33,7 @@ the table entries, in the
.\" HREF
\fBpcreapi\fP
.\"
page and a description of the POSIX API in the
page, and a description of the POSIX API in the
.\" HREF
\fBpcreposix\fP
.\"

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, int \fIstringnumber\fP,
@@ -27,9 +26,10 @@ arguments are:
\fIstringptr\fP Where to put the string pointer
.sp
The memory in which the substring is placed is obtained by calling
\fBpcre_malloc()\fP. The yield of the function is the length of the substring,
PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained, or
PCRE_ERROR_NOSUBSTRING if the string number is invalid.
\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
be used to free it when it is no longer needed. The yield of the function is
the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
.P
There is a complete description of the PCRE native API in the
.\" HREF

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_get_substring_list(const char *\fIsubject\fP,
.ti +5n
.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
@@ -24,10 +23,12 @@ substrings. The arguments are:
\fIlistptr\fP Where to put a pointer to the list
.sp
The memory in which the substrings and the list are placed is obtained by
calling \fBpcre_malloc()\fP. A pointer to a list of pointers is put in
the variable whose address is in \fIlistptr\fP. The list is terminated by a
NULL pointer. The yield of the function is zero on success or
PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained.
calling \fBpcre_malloc()\fP. The convenience function
\fBpcre_free_substring_list()\fP can be used to free it when it is no longer
needed. A pointer to a list of pointers is put in the variable whose address is
in \fIlistptr\fP. The list is terminated by a NULL pointer. The yield of the
function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
not be obtained.
.P
There is a complete description of the PCRE native API in the
.\" HREF

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
.B *\fIfirstcharptr\fP);
.

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B const unsigned char *pcre_maketables(void);
.
.SH DESCRIPTION

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
.
.SH DESCRIPTION

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
.ti +5n
.B const char **\fIerrptr\fP);

View File

@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B char *pcre_version(void);
.
.SH DESCRIPTION

View File

@@ -7,14 +7,12 @@ PCRE - Perl-compatible regular expressions
.B #include <pcre.h>
.PP
.SM
.br
.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
.ti +5n
.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
.ti +5n
.B const unsigned char *\fItableptr\fP);
.PP
.br
.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
.ti +5n
.B int *\fIerrorcodeptr\fP,
@@ -23,19 +21,16 @@ PCRE - Perl-compatible regular expressions
.ti +5n
.B const unsigned char *\fItableptr\fP);
.PP
.br
.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
.ti +5n
.B const char **\fIerrptr\fP);
.PP
.br
.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
.ti +5n
.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
.PP
.br
.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
@@ -44,7 +39,6 @@ PCRE - Perl-compatible regular expressions
.ti +5n
.B int *\fIworkspace\fP, int \fIwscount\fP);
.PP
.br
.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIsubject\fP, int *\fIovector\fP,
@@ -53,14 +47,12 @@ PCRE - Perl-compatible regular expressions
.ti +5n
.B char *\fIbuffer\fP, int \fIbuffersize\fP);
.PP
.br
.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
.ti +5n
.B int \fIbuffersize\fP);
.PP
.br
.B int pcre_get_named_substring(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIsubject\fP, int *\fIovector\fP,
@@ -69,76 +61,59 @@ PCRE - Perl-compatible regular expressions
.ti +5n
.B const char **\fIstringptr\fP);
.PP
.br
.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIname\fP);
.PP
.br
.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
.PP
.br
.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, int \fIstringnumber\fP,
.ti +5n
.B const char **\fIstringptr\fP);
.PP
.br
.B int pcre_get_substring_list(const char *\fIsubject\fP,
.ti +5n
.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
.PP
.br
.B void pcre_free_substring(const char *\fIstringptr\fP);
.PP
.br
.B void pcre_free_substring_list(const char **\fIstringptr\fP);
.PP
.br
.B const unsigned char *pcre_maketables(void);
.PP
.br
.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
.ti +5n
.B int \fIwhat\fP, void *\fIwhere\fP);
.PP
.br
.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
.B *\fIfirstcharptr\fP);
.PP
.br
.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
.PP
.br
.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
.PP
.br
.B char *pcre_version(void);
.PP
.br
.B void *(*pcre_malloc)(size_t);
.PP
.br
.B void (*pcre_free)(void *);
.PP
.br
.B void *(*pcre_stack_malloc)(size_t);
.PP
.br
.B void (*pcre_stack_free)(void *);
.PP
.br
.B int (*pcre_callout)(pcre_callout_block *);
.
.
.SH "PCRE API OVERVIEW"
.rs
.sp
PCRE has its own native API, which is described in this document. There is
also a set of wrapper functions that correspond to the POSIX regular expression
PCRE has its own native API, which is described in this document. There are
also some wrapper functions that correspond to the POSIX regular expression
API. These are described in the
.\" HREF
\fBpcreposix\fP
@@ -165,14 +140,14 @@ distribution. The
.\" HREF
\fBpcresample\fP
.\"
documentation describes how to run it.
documentation describes how to compile and run it.
.P
A second matching function, \fBpcre_dfa_exec()\fP, which is not
Perl-compatible, is also provided. This uses a different algorithm for the
matching. The alternative algorithm finds all possible matches (at a given
point in the subject). However, this algorithm does not return captured
substrings. A description of the two matching algorithms and their advantages
and disadvantages is given in the
point in the subject), and scans the subject just once. However, this algorithm
does not return captured substrings. A description of the two matching
algorithms and their advantages and disadvantages is given in the
.\" HREF
\fBpcrematching\fP
.\"
@@ -243,16 +218,47 @@ points during a matching operation. Details are given in the
documentation.
.
.
.\" HTML <a name="newlines"></a>
.SH NEWLINES
PCRE supports three different conventions for indicating line breaks in
strings: a single CR character, a single LF character, or the two-character
sequence CRLF. All three are used as "standard" by different operating systems.
When PCRE is built, a default can be specified. The default default is LF,
which is the Unix standard. When PCRE is run, the default can be overridden,
either when a pattern is compiled, or when it is matched.
.rs
.sp
PCRE supports five different conventions for indicating line breaks in
strings: a single CR (carriage return) character, a single LF (linefeed)
character, the two-character sequence CRLF, any of the three preceding, or any
Unicode newline sequence. The Unicode newline sequences are the three just
mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
(paragraph separator, U+2029).
.P
Each of the first three conventions is used by at least one operating system as
its standard newline sequence. When PCRE is built, a default can be specified.
The default default is LF, which is the Unix standard. When PCRE is run, the
default can be overridden, either when a pattern is compiled, or when it is
matched.
.P
At compile time, the newline convention can be specified by the \fIoptions\fP
argument of \fBpcre_compile()\fP, or it can be specified by special text at the
start of the pattern itself; this overrides any other settings. See the
.\" HREF
\fBpcrepattern\fP
.\"
page for details of the special character sequences.
.P
In the PCRE documentation the word "newline" is used to mean "the character or
pair of characters that indicate a line break".
pair of characters that indicate a line break". The choice of newline
convention affects the handling of the dot, circumflex, and dollar
metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
recognized line ending sequence, the match position advancement for a
non-anchored pattern. There is more detail about this in the
.\" HTML <a href="#execoptions">
.\" </a>
section on \fBpcre_exec()\fP options
.\"
below.
.P
The choice of newline convention does not affect the interpretation of
the \en or \er escape sequences, nor does it affect what \eR matches, which is
controlled in a similar way, but by separate options.
.
.
.SH MULTITHREADING
@@ -276,7 +282,9 @@ which it was compiled. Details are given in the
.\" HREF
\fBpcreprecompile\fP
.\"
documentation.
documentation. However, compiling a regular expression with one version of PCRE
for use with a different version is not guaranteed to work and may cause
crashes.
.
.
.SH "CHECKING BUILD-TIME OPTIONS"
@@ -308,9 +316,18 @@ properties is available; otherwise it is set to zero.
PCRE_CONFIG_NEWLINE
.sp
The output is an integer whose value specifies the default character sequence
that is recognized as meaning "newline". The three values that are supported
are: 10 for LF, 13 for CR, and 3338 for CRLF. The default should normally be
the standard sequence for your operating system.
that is recognized as meaning "newline". The five values that are supported
are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY.
Though they are derived from ASCII, the same values are returned in EBCDIC
environments. The default should normally correspond to the standard sequence
for your operating system.
.sp
PCRE_CONFIG_BSR
.sp
The output is an integer whose value indicates what character sequences the \eR
escape sequence matches by default. A value of 0 means that \eR matches any
Unicode line ending sequence; a value of 1 means that \eR matches only CR, LF,
or CRLF. The default can be overridden when a pattern is compiled or matched.
.sp
PCRE_CONFIG_LINK_SIZE
.sp
@@ -332,13 +349,13 @@ documentation.
.sp
PCRE_CONFIG_MATCH_LIMIT
.sp
The output is an integer that gives the default limit for the number of
The output is a long integer that gives the default limit for the number of
internal matching function calls in a \fBpcre_exec()\fP execution. Further
details are given with \fBpcre_exec()\fP below.
.sp
PCRE_CONFIG_MATCH_LIMIT_RECURSION
.sp
The output is an integer that gives the default limit for the depth of
The output is a long integer that gives the default limit for the depth of
recursion when calling the internal matching function in a \fBpcre_exec()\fP
execution. Further details are given with \fBpcre_exec()\fP below.
.sp
@@ -387,18 +404,19 @@ depend on memory location, the complete \fBpcre\fP data block is not
fully relocatable, because it may contain a copy of the \fItableptr\fP
argument, which is an address (see below).
.P
The \fIoptions\fP argument contains independent bits that affect the
The \fIoptions\fP argument contains various bit settings that affect the
compilation. It should be zero if no options are required. The available
options are described below. Some of them, in particular, those that are
compatible with Perl, can also be set and unset from within the pattern (see
the detailed description in the
options are described below. Some of them (in particular, those that are
compatible with Perl, but also some others) can also be set and unset from
within the pattern (see the detailed description in the
.\" HREF
\fBpcrepattern\fP
.\"
documentation). For these options, the contents of the \fIoptions\fP argument
specifies their initial settings at the start of compilation and execution. The
PCRE_ANCHORED and PCRE_NEWLINE_\fIxxx\fP options can be set at the time of
matching as well as at compile time.
documentation). For those options that can be different in different parts of
the pattern, the contents of the \fIoptions\fP argument specify their initial
settings at the start of compilation and execution. The PCRE_ANCHORED and
PCRE_NEWLINE_\fIxxx\fP options can be set at the time of matching as well as at
compile time.
.P
If \fIerrptr\fP is NULL, \fBpcre_compile()\fP returns NULL immediately.
Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fP returns
@@ -452,6 +470,15 @@ facility, see the
\fBpcrecallout\fP
.\"
documentation.
.sp
PCRE_BSR_ANYCRLF
PCRE_BSR_UNICODE
.sp
These options (which are mutually exclusive) control what the \eR escape
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
match any Unicode newline sequence. The default is specified when PCRE is
built. It can be overridden from within the pattern, or by setting an option
when a compiled pattern is matched.
.sp
PCRE_CASELESS
.sp
@@ -480,8 +507,8 @@ If this bit is set, a dot metacharater in the pattern matches all characters,
including those that indicate newline. Without it, a dot does not match when
the current position is at a newline. This option is equivalent to Perl's /s
option, and it can be changed within a pattern by a (?s) option setting. A
negative class such as [^a] always matches newlines, independent of the setting
of this option.
negative class such as [^a] always matches newline characters, independent of
the setting of this option.
.sp
PCRE_DUPNAMES
.sp
@@ -524,6 +551,20 @@ this option. It can also be set by a (?X) option setting within a pattern.
If this option is set, an unanchored pattern is required to match before or at
the first newline in the subject string, though the matched text may continue
over the newline.
.sp
PCRE_JAVASCRIPT_COMPAT
.sp
If this option is set, PCRE's behaviour is changed in some ways so that it is
compatible with JavaScript rather than Perl. The changes are as follows:
.P
(1) A lone closing square bracket in a pattern causes a compile-time error,
because this is illegal in JavaScript (by default it is treated as a data
character). Thus, the pattern AB]CD becomes illegal when this option is set.
.P
(2) At run time, a back reference to an unset subpattern group matches an empty
string (by default this causes the current matching alternative to fail). A
pattern such as (\e1)(a) succeeds when this option is set (assuming it can find
an "a" in the subject), whereas it fails by default, for Perl compatibility.
.sp
PCRE_MULTILINE
.sp
@@ -544,18 +585,37 @@ occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
PCRE_NEWLINE_CR
PCRE_NEWLINE_LF
PCRE_NEWLINE_CRLF
PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
.sp
These options override the default newline definition that was chosen when PCRE
was built. Setting the first or the second specifies that a newline is
indicated by a single character (CR or LF, respectively). Setting both of them
specifies that a newline is indicated by the two-character CRLF sequence. For
convenience, PCRE_NEWLINE_CRLF is defined to contain both bits. The only time
that a line break is relevant when compiling a pattern is if PCRE_EXTENDED is
set, and an unescaped # outside a character class is encountered. This
indicates a comment that lasts until after the next newline.
indicated by a single character (CR or LF, respectively). Setting
PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
that any Unicode newline sequence should be recognized. The Unicode newline
sequences are the three just mentioned, plus the single characters VT (vertical
tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
separator, U+2028), and PS (paragraph separator, U+2029). The last two are
recognized only in UTF-8 mode.
.P
The newline option set at compile time becomes the default that is used for
\fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, but it can be overridden.
The newline setting in the options word uses three bits that are treated
as a number, giving eight possibilities. Currently only six are used (default
plus the five values above). This means that if you set more than one newline
option, the combination may or may not be sensible. For example,
PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
other combinations may yield unused numbers and cause an error.
.P
The only time that a line break is specially recognized when compiling a
pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
class is encountered. This indicates a comment that lasts until after the next
line break sequence. In other circumstances, line break sequences are treated
as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated
as whitespace characters and are therefore ignored.
.P
The newline option that is set at compile time becomes the default that is used
for \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, but it can be overridden.
.sp
PCRE_NO_AUTO_CAPTURE
.sp
@@ -591,14 +651,22 @@ page.
PCRE_NO_UTF8_CHECK
.sp
When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
automatically checked. If an invalid UTF-8 sequence of bytes is found,
\fBpcre_compile()\fP returns an error. If you already know that your pattern is
valid, and you want to skip this check for performance reasons, you can set the
PCRE_NO_UTF8_CHECK option. When it is set, the effect of passing an invalid
UTF-8 string as a pattern is undefined. It may cause your program to crash.
Note that this option can also be passed to \fBpcre_exec()\fP and
\fBpcre_dfa_exec()\fP, to suppress the UTF-8 validity checking of subject
strings.
automatically checked. There is a discussion about the
.\" HTML <a href="pcre.html#utf8strings">
.\" </a>
validity of UTF-8 strings
.\"
in the main
.\" HREF
\fBpcre\fP
.\"
page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_compile()\fP
returns an error. If you already know that your pattern is valid, and you want
to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK
option. When it is set, the effect of passing an invalid UTF-8 string as a
pattern is undefined. It may cause your program to crash. Note that this option
can also be passed to \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, to suppress
the UTF-8 validity checking of subject strings.
.
.
.SH "COMPILATION ERROR CODES"
@@ -606,7 +674,8 @@ strings.
.sp
The following table lists the error codes that may be returned by
\fBpcre_compile2()\fP, along with the error messages that may be returned by
both compiling functions.
both compiling functions. As PCRE has developed, some error codes have fallen
out of use. To avoid confusion, they have not been re-used.
.sp
0 no error
1 \e at end of pattern
@@ -618,17 +687,17 @@ both compiling functions.
7 invalid escape sequence in character class
8 range out of order in character class
9 nothing to repeat
10 operand of unlimited repeat could match the empty string
10 [this code is not in use]
11 internal error: unexpected repeat
12 unrecognized character after (?
12 unrecognized character after (? or (?-
13 POSIX named classes are supported only within a class
14 missing )
15 reference to non-existent subpattern
16 erroffset passed as NULL
17 unknown option bit(s) set
18 missing ) after comment
19 parentheses nested too deeply
20 regular expression too large
19 [this code is not in use]
20 regular expression is too large
21 failed to get memory
22 unmatched parentheses
23 internal error: code overflow
@@ -637,11 +706,11 @@ both compiling functions.
26 malformed number or name after (?(
27 conditional group contains more than two branches
28 assertion expected after (?(
29 (?R or (?digits must be followed by )
29 (?R or (?[+-]digits must be followed by )
30 unknown POSIX class name
31 POSIX collating elements are not supported
32 this version of PCRE is not compiled with PCRE_UTF8 support
33 spare error
33 [this code is not in use]
34 character value in \ex{...} sequence is too large
35 invalid condition (?(0)
36 \eC not allowed in lookbehind assertion
@@ -650,16 +719,33 @@ both compiling functions.
39 closing ) for (?C expected
40 recursive call could loop indefinitely
41 unrecognized character after (?P
42 syntax error after (?P
42 syntax error in subpattern name (missing terminator)
43 two named subpatterns have the same name
44 invalid UTF-8 string
45 support for \eP, \ep, and \eX has not been compiled
46 malformed \eP or \ep sequence
47 unknown property name after \eP or \ep
48 subpattern name is too long (maximum 32 characters)
49 too many named subpatterns (maximum 10,000)
50 repeated subpattern is too long
49 too many named subpatterns (maximum 10000)
50 [this code is not in use]
51 octal value is greater than \e377 (not in UTF-8 mode)
52 internal error: overran compiling workspace
53 internal error: previously-checked referenced subpattern not found
54 DEFINE group contains more than one branch
55 repeating a DEFINE group is not allowed
56 inconsistent NEWLINE options
57 \eg is not followed by a braced, angle-bracketed, or quoted
name/number or by a plain number
58 a numbered reference must not be zero
59 (*VERB) with an argument is not supported
60 (*VERB) not recognized
61 number is too big
62 subpattern name expected
63 digit expected after (?+
64 ] is an invalid data character in JavaScript compatibility mode
.sp
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
be used if the limits were changed when PCRE was built.
.
.
.SH "STUDYING A PATTERN"
@@ -719,19 +805,25 @@ bytes is created.
.SH "LOCALE SUPPORT"
.rs
.sp
PCRE handles caseless matching, and determines whether characters are letters
PCRE handles caseless matching, and determines whether characters are letters,
digits, or whatever, by reference to a set of tables, indexed by character
value. When running in UTF-8 mode, this applies only to characters with codes
less than 128. Higher-valued codes never match escapes such as \ew or \ed, but
can be tested with \ep if PCRE is built with Unicode character property
support. The use of locales with Unicode is discouraged.
support. The use of locales with Unicode is discouraged. If you are handling
characters with codes greater than 127, you should either use UTF-8 and
Unicode, or use locales, but not try to mix the two.
.P
An internal set of tables is created in the default C locale when PCRE is
built. This is used when the final argument of \fBpcre_compile()\fP is NULL,
and is sufficient for many applications. An alternative set of tables can,
however, be supplied. These may be created in a different locale from the
default. As more and more applications change to using Unicode, the need for
this locale support is expected to die away.
PCRE contains an internal set of tables that are used when the final argument
of \fBpcre_compile()\fP is NULL. These are sufficient for many applications.
Normally, the internal tables recognize only ASCII characters. However, when
PCRE is built, it is possible to cause the internal tables to be rebuilt in the
default "C" locale of the local system, which may cause them to be different.
.P
The internal tables can always be overridden by tables supplied by the
application that calls PCRE. These may be created in a different locale from
the default. As more and more applications change to using Unicode, the need
for this locale support is expected to die away.
.P
External tables are built by calling the \fBpcre_maketables()\fP function,
which has no arguments, in the relevant locale. The result can then be passed
@@ -744,6 +836,9 @@ the following code could be used:
tables = pcre_maketables();
re = pcre_compile(..., tables);
.sp
The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
are using Windows, the name for the French locale is "french".
.P
When \fBpcre_maketables()\fP runs, the tables are built in memory that is
obtained via \fBpcre_malloc\fP. It is the caller's responsibility to ensure
that the memory containing the tables remains available for as long as it is
@@ -827,7 +922,7 @@ variable. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name is
still recognized for backwards compatibility.)
.P
If there is a fixed first byte, for example, from a pattern such as
(cat|cow|coyote). Otherwise, if either
(cat|cow|coyote), its value is returned. Otherwise, if either
.sp
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
starts with "^", or
@@ -845,6 +940,18 @@ If the pattern was studied, and this resulted in the construction of a 256-bit
table indicating a fixed set of bytes for the first byte in any matching
string, a pointer to the table is returned. Otherwise NULL is returned. The
fourth argument should point to an \fBunsigned char *\fP variable.
.sp
PCRE_INFO_HASCRORLF
.sp
Return 1 if the pattern contains any explicit matches for CR or LF characters,
otherwise 0. The fourth argument should point to an \fBint\fP variable. An
explicit match is either a literal CR or LF character, or \er or \en.
.sp
PCRE_INFO_JCHANGED
.sp
Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
0. The fourth argument should point to an \fBint\fP variable. (?J) and
(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
.sp
PCRE_INFO_LASTLITERAL
.sp
@@ -882,8 +989,8 @@ their parentheses numbers. For example, consider the following pattern (assume
PCRE_EXTENDED is set, so white space - including newlines - is ignored):
.sp
.\" JOIN
(?P<date> (?P<year>(\ed\ed)?\ed\ed) -
(?P<month>\ed\ed) - (?P<day>\ed\ed) )
(?<date> (?<year>(\ed\ed)?\ed\ed) -
(?<month>\ed\ed) - (?<day>\ed\ed) )
.sp
There are four named subpatterns, so the table has four entries, and each entry
in the table is eight bytes long. The table is as follows, with non-printing
@@ -897,13 +1004,26 @@ bytes shows in hexadecimal, and undefined bytes shown as ??:
When writing code to extract data from named subpatterns using the
name-to-number map, remember that the length of the entries is likely to be
different for each compiled pattern.
.sp
PCRE_INFO_OKPARTIAL
.sp
Return 1 if the pattern can be used for partial matching, otherwise 0. The
fourth argument should point to an \fBint\fP variable. The
.\" HREF
\fBpcrepartial\fP
.\"
documentation lists the restrictions that apply to patterns when partial
matching is used.
.sp
PCRE_INFO_OPTIONS
.sp
Return a copy of the options with which the pattern was compiled. The fourth
argument should point to an \fBunsigned long int\fP variable. These option bits
are those specified in the call to \fBpcre_compile()\fP, modified by any
top-level option settings within the pattern itself.
top-level option settings at the start of the pattern itself. In other words,
they are the options that will be in force when matching starts. For example,
if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
.P
A pattern is automatically anchored by PCRE if all of its top-level
alternatives begin with one of the following:
@@ -1114,12 +1234,14 @@ called. See the
.\"
documentation for a discussion of saving compiled patterns for later use.
.
.\" HTML <a name="execoptions"></a>
.SS "Option bits for \fBpcre_exec()\fP"
.rs
.sp
The unused bits of the \fIoptions\fP argument for \fBpcre_exec()\fP must be
zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP,
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE,
PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
.sp
PCRE_ANCHORED
.sp
@@ -1127,15 +1249,48 @@ The PCRE_ANCHORED option limits \fBpcre_exec()\fP to matching at the first
matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
to be anchored by virtue of its contents, it cannot be made unanchored at
matching time.
.sp
PCRE_BSR_ANYCRLF
PCRE_BSR_UNICODE
.sp
These options (which are mutually exclusive) control what the \eR escape
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
match any Unicode newline sequence. These options override the choice that was
made or defaulted when the pattern was compiled.
.sp
PCRE_NEWLINE_CR
PCRE_NEWLINE_LF
PCRE_NEWLINE_CRLF
PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
.sp
These options override the newline definition that was chosen or defaulted when
the pattern was compiled. For details, see the description \fBpcre_compile()\fP
above. During matching, the newline choice affects the behaviour of the dot,
circumflex, and dollar metacharacters.
the pattern was compiled. For details, see the description of
\fBpcre_compile()\fP above. During matching, the newline choice affects the
behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
the way the match position is advanced after a match failure for an unanchored
pattern.
.P
When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
match attempt for an unanchored pattern fails when the current position is at a
CRLF sequence, and the pattern contains no explicit matches for CR or LF
characters, the match position is advanced by two characters instead of one, in
other words, to after the CRLF.
.P
The above rule is a compromise that makes the most common cases work as
expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
set), it does not match the string "\er\enA" because, after failing at the
start, it skips both the CR and the LF before retrying. However, the pattern
[\er\en]A does match that string, because it contains an explicit CR or LF
reference, and so advances only by one character after the first failure.
.P
An explicit match for CR or LF is either a literal appearance of one of those
characters, or one of the \er or \en escape sequences. Implicit matches such as
[^X] do not count, nor does \es (which includes CR and LF in the characters
that it matches).
.P
Notwithstanding the above, anomalous effects may still occur when CRLF is a
valid newline sequence and explicit \er or \en escapes appear in the pattern.
.sp
PCRE_NOTBOL
.sp
@@ -1172,15 +1327,35 @@ matching a null string by first trying the match again at the same offset with
PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the
starting offset (see below) and trying an ordinary match again. There is some
code that demonstrates how to do this in the \fIpcredemo.c\fP sample program.
.sp
PCRE_NO_START_OPTIMIZE
.sp
There are a number of optimizations that \fBpcre_exec()\fP uses at the start of
a match, in order to speed up the process. For example, if it is known that a
match must start with a specific character, it searches the subject for that
character, and fails immediately if it cannot find it, without actually running
the main matching function. When callouts are in use, these optimizations can
cause them to be skipped. This option disables the "start-up" optimizations,
causing performance to suffer, but ensuring that the callouts do occur.
.sp
PCRE_NO_UTF8_CHECK
.sp
When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
string is automatically checked when \fBpcre_exec()\fP is subsequently called.
The value of \fIstartoffset\fP is also checked to ensure that it points to the
start of a UTF-8 character. If an invalid UTF-8 sequence of bytes is found,
\fBpcre_exec()\fP returns the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP
contains an invalid value, PCRE_ERROR_BADUTF8_OFFSET is returned.
start of a UTF-8 character. There is a discussion about the validity of UTF-8
strings in the
.\" HTML <a href="pcre.html#utf8strings">
.\" </a>
section on UTF-8 support
.\"
in the main
.\" HREF
\fBpcre\fP
.\"
page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_exec()\fP returns
the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP contains an invalid value,
PCRE_ERROR_BADUTF8_OFFSET is returned.
.P
If you already know that your subject is valid, and you want to skip these
checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
@@ -1210,11 +1385,11 @@ documentation.
.rs
.sp
The subject string is passed to \fBpcre_exec()\fP as a pointer in
\fIsubject\fP, a length in \fIlength\fP, and a starting byte offset in
\fIstartoffset\fP. In UTF-8 mode, the byte offset must point to the start of a
UTF-8 character. Unlike the pattern string, the subject may contain binary zero
bytes. When the starting offset is zero, the search for a match starts at the
beginning of the subject, and this is by far the most common case.
\fIsubject\fP, a length (in bytes) in \fIlength\fP, and a starting byte offset
in \fIstartoffset\fP. In UTF-8 mode, the byte offset must point to the start of
a UTF-8 character. Unlike the pattern string, the subject may contain binary
zero bytes. When the starting offset is zero, the search for a match starts at
the beginning of the subject, and this is by far the most common case.
.P
A non-zero starting offset is useful when searching for another match in the
same subject by calling \fBpcre_exec()\fP again after a previous success.
@@ -1248,38 +1423,41 @@ pattern. Following the usage in Jeffrey Friedl's book, this is called
a fragment of a pattern that picks out a substring. PCRE supports several other
kinds of parenthesized subpattern that do not cause substrings to be captured.
.P
Captured substrings are returned to the caller via a vector of integer offsets
whose address is passed in \fIovector\fP. The number of elements in the vector
is passed in \fIovecsize\fP, which must be a non-negative number. \fBNote\fP:
this argument is NOT the size of \fIovector\fP in bytes.
Captured substrings are returned to the caller via a vector of integers whose
address is passed in \fIovector\fP. The number of elements in the vector is
passed in \fIovecsize\fP, which must be a non-negative number. \fBNote\fP: this
argument is NOT the size of \fIovector\fP in bytes.
.P
The first two-thirds of the vector is used to pass back captured substrings,
each substring using a pair of integers. The remaining third of the vector is
used as workspace by \fBpcre_exec()\fP while matching capturing subpatterns,
and is not available for passing back information. The length passed in
and is not available for passing back information. The number passed in
\fIovecsize\fP should always be a multiple of three. If it is not, it is
rounded down.
.P
When a match is successful, information about captured substrings is returned
in pairs of integers, starting at the beginning of \fIovector\fP, and
continuing up to two-thirds of its length at the most. The first element of a
pair is set to the offset of the first character in a substring, and the second
is set to the offset of the first character after the end of a substring. The
first pair, \fIovector[0]\fP and \fIovector[1]\fP, identify the portion of the
subject string matched by the entire pattern. The next pair is used for the
first capturing subpattern, and so on. The value returned by \fBpcre_exec()\fP
is one more than the highest numbered pair that has been set. For example, if
two substrings have been captured, the returned value is 3. If there are no
capturing subpatterns, the return value from a successful match is 1,
indicating that just the first pair of offsets has been set.
continuing up to two-thirds of its length at the most. The first element of
each pair is set to the byte offset of the first character in a substring, and
the second is set to the byte offset of the first character after the end of a
substring. \fBNote\fP: these values are always byte offsets, even in UTF-8
mode. They are not character counts.
.P
The first pair of integers, \fIovector[0]\fP and \fIovector[1]\fP, identify the
portion of the subject string matched by the entire pattern. The next pair is
used for the first capturing subpattern, and so on. The value returned by
\fBpcre_exec()\fP is one more than the highest numbered pair that has been set.
For example, if two substrings have been captured, the returned value is 3. If
there are no capturing subpatterns, the return value from a successful match is
1, indicating that just the first pair of offsets has been set.
.P
If a capturing subpattern is matched repeatedly, it is the last portion of the
string that it matched that is returned.
.P
If the vector is too small to hold all the captured substring offsets, it is
used as far as possible (up to two-thirds of its length), and the function
returns a value of zero. In particular, if the substring offsets are not of
interest, \fBpcre_exec()\fP may be called with \fIovector\fP passed as NULL and
returns a value of zero. If the substring offsets are not of interest,
\fBpcre_exec()\fP may be called with \fIovector\fP passed as NULL and
\fIovecsize\fP as zero. However, if the pattern contains back references and
the \fIovector\fP is not big enough to remember the related substrings, PCRE
has to get additional memory for use during matching. Thus it is usually
@@ -1336,7 +1514,7 @@ compiled in an environment of one endianness is run in an environment with the
other endianness. This is the error that PCRE gives when the magic number is
not present.
.sp
PCRE_ERROR_UNKNOWN_NODE (-5)
PCRE_ERROR_UNKNOWN_OPCODE (-5)
.sp
While running the pattern match, an unknown item was encountered in the
compiled pattern. This error could be caused by a bug in PCRE or by overwriting
@@ -1361,12 +1539,6 @@ below). It is never returned by \fBpcre_exec()\fP.
The backtracking limit, as specified by the \fImatch_limit\fP field in a
\fBpcre_extra\fP structure (or defaulted) was reached. See the description
above.
.sp
PCRE_ERROR_RECURSIONLIMIT (-21)
.sp
The internal recursion limit, as specified by the \fImatch_limit_recursion\fP
field in a \fBpcre_extra\fP structure (or defaulted) was reached. See the
description above.
.sp
PCRE_ERROR_CALLOUT (-9)
.sp
@@ -1411,6 +1583,18 @@ in PCRE or by overwriting of the compiled pattern.
PCRE_ERROR_BADCOUNT (-15)
.sp
This error is given if the value of the \fIovecsize\fP argument is negative.
.sp
PCRE_ERROR_RECURSIONLIMIT (-21)
.sp
The internal recursion limit, as specified by the \fImatch_limit_recursion\fP
field in a \fBpcre_extra\fP structure (or defaulted) was reached. See the
description above.
.sp
PCRE_ERROR_BADNEWLINE (-23)
.sp
An invalid combination of PCRE_NEWLINE_\fIxxx\fP options was given.
.P
Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
.
.
.SH "EXTRACTING CAPTURED SUBSTRINGS BY NUMBER"
@@ -1422,14 +1606,12 @@ This error is given if the value of the \fIovecsize\fP argument is negative.
.ti +5n
.B int \fIbuffersize\fP);
.PP
.br
.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
.ti +5n
.B int \fIstringcount\fP, int \fIstringnumber\fP,
.ti +5n
.B const char **\fIstringptr\fP);
.PP
.br
.B int pcre_get_substring_list(const char *\fIsubject\fP,
.ti +5n
.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
@@ -1468,7 +1650,7 @@ the string is placed in \fIbuffer\fP, whose length is given by
\fIbuffersize\fP, while for \fBpcre_get_substring()\fP a new block of memory is
obtained via \fBpcre_malloc\fP, and its address is returned via
\fIstringptr\fP. The yield of the function is the length of the string, not
including the terminating zero, or one of
including the terminating zero, or one of these error codes:
.sp
PCRE_ERROR_NOMEMORY (-6)
.sp
@@ -1484,7 +1666,7 @@ and builds a list of pointers to them. All this is done in a single block of
memory that is obtained via \fBpcre_malloc\fP. The address of the memory block
is returned via \fIlistptr\fP, which is also the start of the list of string
pointers. The end of the list is marked by a NULL pointer. The yield of the
function is zero if all went well, or
function is zero if all went well, or the error code
.sp
PCRE_ERROR_NOMEMORY (-6)
.sp
@@ -1515,7 +1697,6 @@ provided.
.ti +5n
.B const char *\fIname\fP);
.PP
.br
.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIsubject\fP, int *\fIovector\fP,
@@ -1524,7 +1705,6 @@ provided.
.ti +5n
.B char *\fIbuffer\fP, int \fIbuffersize\fP);
.PP
.br
.B int pcre_get_named_substring(const pcre *\fIcode\fP,
.ti +5n
.B const char *\fIsubject\fP, int *\fIovector\fP,
@@ -1536,7 +1716,7 @@ provided.
To extract a substring by name, you first have to find the associated number.
For example, for this pattern
.sp
(a+)b(?P<xxx>\ed+)...
(a+)b(?<xxx>\ed+)...
.sp
the number of the subpattern called "xxx" is 2. If the name is known to be
unique (PCRE_DUPNAMES was not set), you can find the number from the name by
@@ -1560,9 +1740,14 @@ pattern. This is needed in order to gain access to the name-to-number
translation table.
.P
These functions call \fBpcre_get_stringnumber()\fP, and if it succeeds, they
then call \fIpcre_copy_substring()\fP or \fIpcre_get_substring()\fP, as
appropriate.
.
then call \fBpcre_copy_substring()\fP or \fBpcre_get_substring()\fP, as
appropriate. \fBNOTE:\fP If PCRE_DUPNAMES is set and there are duplicate names,
the behaviour may not be what you want (see the next section).
.P
\fBWarning:\fP If the pattern uses the "(?|" feature to set up multiple
subpatterns with the same number, you cannot use names to distinguish them,
because names are not included in the compiled code. The matching process uses
only numbers.
.
.SH "DUPLICATE SUBPATTERN NAMES"
.rs
@@ -1578,22 +1763,25 @@ example is shown in the
.\" HREF
\fBpcrepattern\fP
.\"
documentation. When duplicates are present, \fBpcre_copy_named_substring()\fP
and \fBpcre_get_named_substring()\fP return the first substring corresponding
to the given name that is set. If none are set, an empty string is returned.
The \fBpcre_get_stringnumber()\fP function returns one of the numbers that are
associated with the name, but it is not defined which it is.
.sp
documentation.
.P
When duplicates are present, \fBpcre_copy_named_substring()\fP and
\fBpcre_get_named_substring()\fP return the first substring corresponding to
the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
returned; no data is returned. The \fBpcre_get_stringnumber()\fP function
returns one of the numbers that are associated with the name, but it is not
defined which it is.
.P
If you want to get full details of all captured substrings for a given name,
you must use the \fBpcre_get_stringtable_entries()\fP function. The first
argument is the compiled pattern, and the second is the name. The third and
fourth are pointers to variables which are updated by the function. After it
has run, they point to the first and last entries in the name-to-number table
for the given name. The function itself returns the length of each entry, or
PCRE_ERROR_NOSUBSTRING if there are none. The format of the table is described
above in the section entitled \fIInformation about a pattern\fP. Given all the
relevant entries for the name, you can extract each of their numbers, and hence
the captured data, if any.
PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
described above in the section entitled \fIInformation about a pattern\fP.
Given all the relevant entries for the name, you can extract each of their
numbers, and hence the captured data, if any.
.
.
.SH "FINDING ALL POSSIBLE MATCHES"
@@ -1631,11 +1819,12 @@ will yield PCRE_ERROR_NOMATCH.
.B int *\fIworkspace\fP, int \fIwscount\fP);
.P
The function \fBpcre_dfa_exec()\fP is called to match a subject string against
a compiled pattern, using a "DFA" matching algorithm. This has different
characteristics to the normal algorithm, and is not compatible with Perl. Some
of the features of PCRE patterns are not supported. Nevertheless, there are
times when this kind of matching can be useful. For a discussion of the two
matching algorithms, see the
a compiled pattern, using a matching algorithm that scans the subject string
just once, and does not backtrack. This has different characteristics to the
normal algorithm, and is not compatible with Perl. Some of the features of PCRE
patterns are not supported. Nevertheless, there are times when this kind of
matching can be useful. For a discussion of the two matching algorithms, see
the
.\" HREF
\fBpcrematching\fP
.\"
@@ -1691,9 +1880,9 @@ matching string.
PCRE_DFA_SHORTEST
.sp
Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
soon as it has found one match. Because of the way the DFA algorithm works,
this is necessarily the shortest possible match at the first possible matching
point in the subject string.
soon as it has found one match. Because of the way the alternative algorithm
works, this is necessarily the shortest possible match at the first possible
matching point in the subject string.
.sp
PCRE_DFA_RESTART
.sp
@@ -1732,10 +1921,10 @@ the three matched strings are
On success, the yield of the function is a number greater than zero, which is
the number of matched substrings. The substrings themselves are returned in
\fIovector\fP. Each string uses two elements; the first is the offset to the
start, and the second is the offset to the end. All the strings have the same
start offset. (Space could have been saved by giving this only once, but it was
decided to retain some compatibility with the way \fBpcre_exec()\fP returns
data, even though the meaning of the strings is different.)
start, and the second is the offset to the end. In fact, all the strings have
the same start offset. (Space could have been saved by giving this only once,
but it was decided to retain some compatibility with the way \fBpcre_exec()\fP
returns data, even though the meaning of the strings is different.)
.P
The strings are returned in reverse order of length; that is, the longest
matching string is given first. If there were too many matches to fit into
@@ -1762,8 +1951,9 @@ that it does not support, for instance, the use of \eC or a back reference.
.sp
PCRE_ERROR_DFA_UCOND (-17)
.sp
This return is given if \fBpcre_dfa_exec()\fP encounters a condition item in a
pattern that uses a back reference for the condition. This is not supported.
This return is given if \fBpcre_dfa_exec()\fP encounters a condition item that
uses a back reference for the condition, or a test for recursion in a specific
group. These are not supported.
.sp
PCRE_ERROR_DFA_UMLIMIT (-18)
.sp
@@ -1782,8 +1972,30 @@ When a recursive subpattern is processed, the matching function calls itself
recursively, using private vectors for \fIovector\fP and \fIworkspace\fP. This
error is given if the output vector is not large enough. This should be
extremely rare, as a vector of size 1000 is used.
.P
.in 0
Last updated: 08 June 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
.
.
.SH "SEE ALSO"
.rs
.sp
\fBpcrebuild\fP(3), \fBpcrecallout\fP(3), \fBpcrecpp\fP(3),
\fBpcrematching\fP(3), \fBpcrepartial\fP(3), \fBpcreposix\fP(3),
\fBpcreprecompile\fP(3), \fBpcresample\fP(3), \fBpcrestack\fP(3).
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 11 April 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

View File

@@ -5,16 +5,21 @@ PCRE - Perl-compatible regular expressions
.rs
.sp
This document describes the optional features of PCRE that can be selected when
the library is compiled. They are all selected, or deselected, by providing
options to the \fBconfigure\fP script that is run before the \fBmake\fP
command. The complete list of options for \fBconfigure\fP (which includes the
standard ones such as the selection of the installation directory) can be
obtained by running
the library is compiled. It assumes use of the \fBconfigure\fP script, where
the optional features are selected or deselected by providing options to
\fBconfigure\fP before running the \fBmake\fP command. However, the same
options can be selected in both Unix-like and non-Unix-like environments using
the GUI facility of \fBCMakeSetup\fP if you are using \fBCMake\fP instead of
\fBconfigure\fP to build PCRE.
.P
The complete list of options for \fBconfigure\fP (which includes the standard
ones such as the selection of the installation directory) can be obtained by
running
.sp
./configure --help
.sp
The following sections describe certain options whose names begin with --enable
or --disable. These settings specify changes to the defaults for the
The following sections include descriptions of options whose names begin with
--enable or --disable. These settings specify changes to the defaults for the
\fBconfigure\fP command. Because of the way that \fBconfigure\fP works,
--enable and --disable always come in pairs, so the complementary option always
exists as well, but as it specifies the default, it is not described.
@@ -33,7 +38,7 @@ to the \fBconfigure\fP command.
.SH "UTF-8 SUPPORT"
.rs
.sp
To build PCRE with support for UTF-8 character strings, add
To build PCRE with support for UTF-8 Unicode character strings, add
.sp
--enable-utf8
.sp
@@ -41,6 +46,12 @@ to the \fBconfigure\fP command. Of itself, this does not make PCRE treat
strings as UTF-8. As well as compiling PCRE with this option, you also have
to set the PCRE_UTF8 option when you call the \fBpcre_compile()\fP
function.
.P
If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE expects
its input to be either ASCII or UTF-8 (depending on the runtime option). It is
not possible to support both EBCDIC and UTF-8 codes in the same version of the
library. Consequently, --enable-utf8 and --enable-ebcdic are mutually
exclusive.
.
.SH "UNICODE CHARACTER PROPERTY SUPPORT"
.rs
@@ -56,9 +67,9 @@ character properties, you must add
to the \fBconfigure\fP command. This implies UTF-8 support, even if you have
not explicitly requested it.
.P
Including Unicode property support adds around 90K of tables to the PCRE
library, approximately doubling its size. Only the general category properties
such as \fILu\fP and \fINd\fP are supported. Details are given in the
Including Unicode property support adds around 30K of tables to the PCRE
library. Only the general category properties such as \fILu\fP and \fINd\fP are
supported. Details are given in the
.\" HREF
\fBpcrepattern\fP
.\"
@@ -67,9 +78,9 @@ documentation.
.SH "CODE VALUE OF NEWLINE"
.rs
.sp
By default, PCRE interprets character 10 (linefeed, LF) as indicating the end
By default, PCRE interprets the linefeed (LF) character as indicating the end
of a line. This is the normal newline character on Unix-like systems. You can
compile PCRE to use character 13 (carriage return, CR) instead, by adding
compile PCRE to use carriage return (CR) instead, by adding
.sp
--enable-newline-is-cr
.sp
@@ -81,9 +92,32 @@ character sequence CRLF. If you want this, add
.sp
--enable-newline-is-crlf
.sp
to the \fBconfigure\fP command. Whatever line ending convention is selected
when PCRE is built can be overridden when the library functions are called. At
build time it is conventional to use the standard for your operating system.
to the \fBconfigure\fP command. There is a fourth option, specified by
.sp
--enable-newline-is-anycrlf
.sp
which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
indicating a line ending. Finally, a fifth option, specified by
.sp
--enable-newline-is-any
.sp
causes PCRE to recognize any Unicode newline sequence.
.P
Whatever line ending convention is selected when PCRE is built can be
overridden when the library functions are called. At build time it is
conventional to use the standard for your operating system.
.
.SH "WHAT \eR MATCHES"
.rs
.sp
By default, the sequence \eR in a pattern matches any Unicode newline sequence,
whatever has been selected as the line ending sequence. If you specify
.sp
--enable-bsr-anycrlf
.sp
the default is changed so that \eR matches only CR, LF, or CRLF. Whatever is
selected when PCRE is built can be overridden when the library functions are
called.
.
.SH "BUILDING SHARED AND STATIC LIBRARIES"
.rs
@@ -131,10 +165,6 @@ or four-byte offsets by adding a setting such as
to the \fBconfigure\fP command. The value given must be 2, 3, or 4. Using
longer offsets slows down the operation of PCRE because it has to load
additional bytes when handling them.
.P
If you build PCRE with an increased link size, test 2 (and test 5 if you are
using UTF-8) will fail. Part of the output of these tests is a representation
of the compiled pattern, and this changes with the link size.
.
.SH "AVOIDING EXCESSIVE STACK USAGE"
.rs
@@ -157,13 +187,17 @@ build a version of PCRE that works this way, add
.sp
to the \fBconfigure\fP command. With this configuration, PCRE will use the
\fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP variables to call memory
management functions. Separate functions are provided because the usage is very
predictable: the block sizes requested are always the same, and the blocks are
always freed in reverse order. A calling program might be able to implement
optimized functions that perform better than the standard \fBmalloc()\fP and
\fBfree()\fP functions. PCRE runs noticeably more slowly when built in this
way. This option affects only the \fBpcre_exec()\fP function; it is not
relevant for the \fBpcre_dfa_exec()\fP function.
management functions. By default these point to \fBmalloc()\fP and
\fBfree()\fP, but you can replace the pointers so that your own functions are
used.
.P
Separate functions are provided rather than using \fBpcre_malloc\fP and
\fBpcre_free\fP because the usage is very predictable: the block sizes
requested are always the same, and the blocks are always freed in reverse
order. A calling program might be able to implement optimized functions that
perform better than \fBmalloc()\fP and \fBfree()\fP. PCRE runs noticeably more
slowly when built in this way. This option affects only the \fBpcre_exec()\fP
function; it is not relevant for the \fBpcre_dfa_exec()\fP function.
.
.SH "LIMITING PCRE RESOURCE USAGE"
.rs
@@ -196,18 +230,105 @@ constraints. However, you can set a lower limit by adding, for example,
.sp
to the \fBconfigure\fP command. This value can also be overridden at run time.
.
.SH "CREATING CHARACTER TABLES AT BUILD TIME"
.rs
.sp
PCRE uses fixed tables for processing characters whose code values are less
than 256. By default, PCRE is built with a set of tables that are distributed
in the file \fIpcre_chartables.c.dist\fP. These tables are for ASCII codes
only. If you add
.sp
--enable-rebuild-chartables
.sp
to the \fBconfigure\fP command, the distributed tables are no longer used.
Instead, a program called \fBdftables\fP is compiled and run. This outputs the
source for a new set of tables, created in the default locale of your C runtime
system. (This method of replacing the tables does not work if you are cross
compiling, because \fBdftables\fP is run on the local host. If you need to
create alternative tables when cross compiling, you will have to do so "by
hand".)
.
.SH "USING EBCDIC CODE"
.rs
.sp
PCRE assumes by default that it will run in an environment where the character
code is ASCII (or Unicode, which is a superset of ASCII). PCRE can, however, be
compiled to run in an EBCDIC environment by adding
code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
most computer operating systems. PCRE can, however, be compiled to run in an
EBCDIC environment by adding
.sp
--enable-ebcdic
.sp
to the \fBconfigure\fP command.
to the \fBconfigure\fP command. This setting implies
--enable-rebuild-chartables. You should only use it if you know that you are in
an EBCDIC environment (for example, an IBM mainframe operating system). The
--enable-ebcdic option is incompatible with --enable-utf8.
.
.SH "PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT"
.rs
.sp
By default, \fBpcregrep\fP reads all files as plain text. You can build it so
that it recognizes files whose names end in \fB.gz\fP or \fB.bz2\fP, and reads
them with \fBlibz\fP or \fBlibbz2\fP, respectively, by adding one or both of
.sp
--enable-pcregrep-libz
--enable-pcregrep-libbz2
.sp
to the \fBconfigure\fP command. These options naturally require that the
relevant libraries are installed on your system. Configuration will fail if
they are not.
.
.SH "PCRETEST OPTION FOR LIBREADLINE SUPPORT"
.rs
.sp
If you add
.sp
--enable-pcretest-libreadline
.sp
to the \fBconfigure\fP command, \fBpcretest\fP is linked with the
\fBlibreadline\fP library, and when its input is from a terminal, it reads it
using the \fBreadline()\fP function. This provides line-editing and history
facilities. Note that \fBlibreadline\fP is GPL-licenced, so if you distribute a
binary of \fBpcretest\fP linked in this way, there may be licensing issues.
.P
.in 0
Last updated: 06 June 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
Setting this option causes the \fB-lreadline\fP option to be added to the
\fBpcretest\fP build. In many operating environments with a system-installed
\fBlibreadline\fP this is sufficient. However, in some environments (e.g.
if an unmodified distribution version of readline is in use), some extra
configuration may be necessary. The INSTALL file for \fBlibreadline\fP says
this:
.sp
"Readline uses the termcap functions, but does not link with the
termcap or curses library itself, allowing applications which link
with readline the to choose an appropriate library."
.sp
If your environment has not been set up so that an appropriate library is
automatically included, you may need to add something like
.sp
  LIBS="-lncurses"
.sp
immediately before the \fBconfigure\fP command.
.
.
.SH "SEE ALSO"
.rs
.sp
\fBpcreapi\fP(3), \fBpcre_config\fP(3).
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 17 March 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

View File

@@ -17,7 +17,7 @@ function is to be called. Different callout points can be identified by putting
a number less than 256 after the letter C. The default value is zero.
For example, this pattern has two callout points:
.sp
(?C1)\deabc(?C2)def
(?C1)abc(?C2)def
.sp
If the PCRE_AUTO_CALLOUT option bit is set when \fBpcre_compile()\fP is called,
PCRE automatically inserts callouts, all with number 255, before each item in
@@ -44,7 +44,8 @@ trying to optimize the performance of a particular pattern.
.rs
.sp
You should be aware that, because of optimizations in the way PCRE matches
patterns, callouts sometimes do not happen. For example, if the pattern is
patterns by default, callouts sometimes do not happen. For example, if the
pattern is
.sp
ab(?C4)cd
.sp
@@ -52,6 +53,11 @@ PCRE knows that any matching string must contain the letter "d". If the subject
string is "abyz", the lack of "d" means that matching doesn't ever start, and
the callout is never reached. However, with "abyd", though the result is still
no match, the callout is obeyed.
.P
You can disable these optimizations by passing the PCRE_NO_START_OPTIMIZE
option to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP. This slows down the
matching process, but does ensure that callouts such as the example above are
obeyed.
.
.
.SH "THE CALLOUT INTERFACE"
@@ -95,10 +101,12 @@ not useful.
The \fIsubject\fP and \fIsubject_length\fP fields contain copies of the values
that were passed to \fBpcre_exec()\fP.
.P
The \fIstart_match\fP field contains the offset within the subject at which the
current match attempt started. If the pattern is not anchored, the callout
function may be called several times from the same point in the pattern for
different starting points in the subject.
The \fIstart_match\fP field normally contains the offset within the subject at
which the current match attempt started. However, if the escape sequence \eK
has been encountered, this value is changed to reflect the modified starting
point. If the pattern is not anchored, the callout function may be called
several times from the same point in the pattern for different starting points
in the subject.
.P
The \fIcurrent_position\fP field contains the offset within the subject of the
current match pointer.
@@ -154,8 +162,22 @@ Negative values should normally be chosen from the set of PCRE_ERROR_xxx
values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
it will never be used by PCRE itself.
.P
.in 0
Last updated: 28 February 2005
.br
Copyright (c) 1997-2005 University of Cambridge.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 15 March 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

View File

@@ -5,8 +5,9 @@ PCRE - Perl-compatible regular expressions
.rs
.sp
This document describes the differences in the ways that PCRE and Perl handle
regular expressions. The differences described here are with respect to Perl
5.8.
regular expressions. The differences described here are mainly with respect to
Perl 5.8, though PCRE versions 7.0 and later contain some features that are
expected to be in the forthcoming Perl 5.10.
.P
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
it does have are given in the
@@ -63,20 +64,32 @@ following examples:
.sp
The \eQ...\eE sequence is recognized both inside and outside character classes.
.P
8. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
constructions. However, there is support for recursive patterns using the
non-Perl items (?R), (?number), and (?P>name). Also, the PCRE "callout" feature
allows an external function to be called during pattern matching. See the
8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
constructions. However, there is support for recursive patterns. This is not
available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
feature allows an external function to be called during pattern matching. See
the
.\" HREF
\fBpcrecallout\fP
.\"
documentation for details.
.P
9. There are some differences that are concerned with the settings of captured
9. Subpatterns that are called recursively or as "subroutines" are always
treated as atomic groups in PCRE. This is like Python, but unlike Perl.
.P
10. There are some differences that are concerned with the settings of captured
strings when part of a pattern is repeated. For example, matching "aba" against
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
.P
10. PCRE provides some extensions to the Perl regular expression facilities:
11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
parentheses, PCRE does not set that capture group; this is different to Perl.
.P
12. PCRE provides some extensions to the Perl regular expression facilities.
Perl 5.10 will include new features that are not in earlier versions, some of
which (such as named parentheses) have been in PCRE for some time. This list is
with respect to Perl 5.10:
.sp
(a) Although lookbehind assertions must match fixed length strings, each
alternative branch of a lookbehind assertion can match a different length of
@@ -86,8 +99,8 @@ string. Perl requires them all to have the same length.
meta-character matches only at the very end of the string.
.sp
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
meaning is faulted. Otherwise, like Perl, the backslash is ignored. (Perl can
be made to issue a warning.)
meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
(Perl can be made to issue a warning.)
.sp
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
inverted, that is, by default they are not greedy, but if followed by a
@@ -99,28 +112,37 @@ only at the first matching position in the subject string.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAPTURE
options for \fBpcre_exec()\fP have no Perl equivalents.
.sp
(g) The (?R), (?number), and (?P>name) constructs allow for recursive pattern
matching (Perl can do this using the (?p{code}) construct, which PCRE cannot
support.)
(g) The \eR escape sequence can be restricted to match only CR, LF, or CRLF
by the PCRE_BSR_ANYCRLF option.
.sp
(h) PCRE supports named capturing substrings, using the Python syntax.
(h) The callout facility is PCRE-specific.
.sp
(i) PCRE supports the possessive quantifier "++" syntax, taken from Sun's Java
package.
(i) The partial matching facility is PCRE-specific.
.sp
(j) The (R) condition, for testing recursion, is a PCRE extension.
.sp
(k) The callout facility is PCRE-specific.
.sp
(l) The partial matching facility is PCRE-specific.
.sp
(m) Patterns compiled by PCRE can be saved and re-used at a later time, even on
(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
different hosts that have the other endianness.
.sp
(n) The alternative matching function (\fBpcre_dfa_exec()\fP) matches in a
(k) The alternative matching function (\fBpcre_dfa_exec()\fP) matches in a
different way and is not Perl-compatible.
.P
.in 0
Last updated: 06 June 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
.sp
(l) PCRE recognizes some special sequences such as (*CR) at the start of
a pattern that set overall options that cannot be changed within the pattern.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 11 September 2007
Copyright (c) 1997-2007 University of Cambridge.
.fi

View File

@@ -5,9 +5,7 @@ PCRE - Perl-compatible regular expressions.
.rs
.sp
.B #include <pcrecpp.h>
.PP
.SM
.br
.
.SH DESCRIPTION
.rs
.sp
@@ -81,14 +79,42 @@ The function returns true iff all of the following conditions are satisfied:
.sp
c. The "i"th argument has a suitable type for holding the
string captured as the "i"th sub-pattern. If you pass in
NULL for the "i"th argument, or pass fewer arguments than
void * NULL for the "i"th argument, or a non-void * NULL
of the correct type, or pass fewer arguments than the
number of sub-patterns, the "i"th captured sub-pattern is
ignored.
.sp
CAVEAT: An optional sub-pattern that does not exist in the matched
string is assigned the empty string. Therefore, the following will
return false (because the empty string is not a valid number):
.sp
int number;
pcrecpp::RE::FullMatch("abc", "[a-z]+(\e\ed+)?", &number);
.sp
The matching interface supports at most 16 arguments per call.
If you need more, consider using the more general interface
\fBpcrecpp::RE::DoMatch\fP. See \fBpcrecpp.h\fP for the signature for
\fBDoMatch\fP.
.P
NOTE: Do not use \fBno_arg\fP, which is used internally to mark the end of a
list of optional arguments, as a placeholder for missing arguments, as this can
lead to segfaults.
.
.
.SH "QUOTING METACHARACTERS"
.rs
.sp
You can use the "QuoteMeta" operation to insert backslashes before all
potentially meaningful characters in a string. The returned string, used as a
regular expression, will exactly match the original string.
.sp
Example:
string quoted = RE::QuoteMeta(unquoted);
.sp
Note that it's legal to escape a character even if it has no special meaning in
a regular expression -- so this function does that. (This also makes it
identical to the perl function of the same name; see "perldoc -f quotemeta".)
For example, "1.5-2.0?" becomes "1\e.5\e-2\e.0\e?".
.
.SH "PARTIAL MATCHES"
.rs
@@ -307,6 +333,15 @@ string is left unaffected.
.SH AUTHOR
.rs
.sp
.nf
The C++ wrapper was contributed by Google Inc.
.br
Copyright (c) 2005 Google Inc.
Copyright (c) 2007 Google Inc.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 17 March 2009
.fi

View File

@@ -11,10 +11,10 @@ pcregrep - a grep with Perl-compatible regular expressions.
grep commands do, but it uses the PCRE regular expression library to support
patterns that are compatible with the regular expressions of Perl 5. See
.\" HREF
\fBpcrepattern\fP
\fBpcrepattern\fP(3)
.\"
for a full description of syntax and semantics of the regular expressions that
PCRE supports.
for a full description of syntax and semantics of the regular expressions
that PCRE supports.
.P
Patterns, whether supplied on the command line or in a separate file, are given
without delimiters. For example:
@@ -23,9 +23,9 @@ without delimiters. For example:
.sp
If you attempt to use delimiters (for example, by surrounding a pattern with
slashes, as is common in Perl scripts), they are interpreted as part of the
pattern. Quotes can of course be used on the command line because they are
interpreted by the shell, and indeed they are required if a pattern contains
white space or shell metacharacters.
pattern. Quotes can of course be used to delimit patterns on the command line
because they are interpreted by the shell, and indeed they are required if a
pattern contains white space or shell metacharacters.
.P
The first argument that follows any option settings is treated as the single
pattern to be matched when neither \fB-e\fP nor \fB-f\fP is present.
@@ -39,20 +39,53 @@ For example:
.sp
pcregrep some-pattern /file1 - /file3
.sp
By default, each line that matches the pattern is copied to the standard
By default, each line that matches a pattern is copied to the standard
output, and if there is more than one file, the file name is output at the
start of each line. However, there are options that can change how
\fBpcregrep\fP behaves. In particular, the \fB-M\fP option makes it possible to
search for patterns that span line boundaries. What defines a line boundary is
controlled by the \fB-N\fP (\fB--newline\fP) option.
start of each line, followed by a colon. However, there are options that can
change how \fBpcregrep\fP behaves. In particular, the \fB-M\fP option makes it
possible to search for patterns that span line boundaries. What defines a line
boundary is controlled by the \fB-N\fP (\fB--newline\fP) option.
.P
Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
BUFSIZ is defined in \fB<stdio.h>\fP.
BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
(specified by the use of \fB-e\fP and/or \fB-f\fP), each pattern is applied to
each line in the order in which they are defined, except that all the \fB-e\fP
patterns are tried before the \fB-f\fP patterns.
.P
By default, as soon as one pattern matches (or fails to match when \fB-v\fP is
used), no further patterns are considered. However, if \fB--colour\fP (or
\fB--color\fP) is used to colour the matching substrings, or if
\fB--only-matching\fP, \fB--file-offsets\fP, or \fB--line-offsets\fP is used to
output only the part of the line that matched (either shown literally, or as an
offset), scanning resumes immediately following the match, so that further
matches on the same line can be found. If there are multiple patterns, they are
all tried on the remainder of the line, but patterns that follow the one that
matched are not tried on the earlier part of the line.
.P
This is the same behaviour as GNU grep, but it does mean that the order in
which multiple patterns are specified can affect the output when one of the
above options is used.
.P
Patterns that can match an empty string are accepted, but empty string
matches are not recognized. An example is the pattern "(super)?(man)?", in
which all components are optional. This pattern finds all occurrences of both
"super" and "man"; the output differs from matching with "super|man" when only
the matching substrings are being shown.
.P
If the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variable is set,
\fBpcregrep\fP uses the value to set a locale when calling the PCRE library.
The \fB--locale\fP option can be used to override this.
.
.SH "SUPPORT FOR COMPRESSED FILES"
.rs
.sp
It is possible to compile \fBpcregrep\fP so that it uses \fBlibz\fP or
\fBlibbz2\fP to read files whose names end in \fB.gz\fP or \fB.bz2\fP,
respectively. You can find out whether your binary has support for one or both
of these file types by running it with the \fB--help\fP option. If the
appropriate support is not present, files are treated as plain text. The
standard input is always so treated.
.
.SH OPTIONS
.rs
.TP 10
@@ -93,16 +126,20 @@ If data is required, it must be given in the same shell item, separated by an
equals sign.
.TP
\fB--colour=\fP\fIvalue\fP, \fB--color=\fP\fIvalue\fP
This option specifies under what circumstances the part of a line that matched
a pattern should be coloured in the output. The value may be "never" (the
default), "always", or "auto". In the latter case, colouring happens only if
the standard output is connected to a terminal. The colour can be specified by
setting the environment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
of this variable should be a string of two numbers, separated by a semicolon.
They are copied directly into the control string for setting colour on a
terminal, so it is your responsibility to ensure that they make sense. If
neither of the environment variables is set, the default is "1;31", which gives
red.
This option specifies under what circumstances the parts of a line that matched
a pattern should be coloured in the output. By default, the output is not
coloured. The value (which is optional, see above) may be "never", "always", or
"auto". In the latter case, colouring happens only if the standard output is
connected to a terminal. More resources are used when colouring is enabled,
because \fBpcregrep\fP has to search for all possible matches in a line, not
just one, in order to colour them all.
The colour that is used can be specified by setting the environment variable
PCREGREP_COLOUR or PCREGREP_COLOR. The value of this variable should be a
string of two numbers, separated by a semicolon. They are copied directly into
the control string for setting colour on a terminal, so it is your
responsibility to ensure that they make sense. If neither of the environment
variables is set, the default is "1;31", which gives red.
.TP
\fB-D\fP \fIaction\fP, \fB--devices=\fP\fIaction\fP
If an input path is not a regular file or a directory, "action" specifies how
@@ -116,29 +153,41 @@ option), or "skip" (silently skip the path). In the default case, directories
are read as if they were ordinary files. In some operating systems the effect
of reading a directory like this is an immediate end-of-file.
.TP
\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP,
\fB--regexp=\fP\fIpattern\fP Specify a pattern to be matched. This option can
be used multiple times in order to specify several patterns. It can also be
used as a way of specifying a single pattern that starts with a hyphen. When
\fB-e\fP is used, no argument pattern is taken from the command line; all
arguments are treated as file names. There is an overall maximum of 100
patterns. They are applied to each line in the order in which they are defined
until one matches (or fails to match if \fB-v\fP is used). If \fB-f\fP is used
with \fB-e\fP, the command line patterns are matched first, followed by the
patterns from the file, independent of the order in which these options are
specified. Note that multiple use of \fB-e\fP is not the same as a single
pattern with alternatives. For example, X|Y finds the first character in a line
that is X or Y, whereas if the two patterns are given separately,
\fBpcregrep\fP finds X if it is present, even if it follows Y in the line. It
finds Y only if there is no X in the line. This really matters only if you are
using \fB-o\fP to show the portion of the line that matched.
\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP
Specify a pattern to be matched. This option can be used multiple times in
order to specify several patterns. It can also be used as a way of specifying a
single pattern that starts with a hyphen. When \fB-e\fP is used, no argument
pattern is taken from the command line; all arguments are treated as file
names. There is an overall maximum of 100 patterns. They are applied to each
line in the order in which they are defined until one matches (or fails to
match if \fB-v\fP is used). If \fB-f\fP is used with \fB-e\fP, the command line
patterns are matched first, followed by the patterns from the file, independent
of the order in which these options are specified. Note that multiple use of
\fB-e\fP is not the same as a single pattern with alternatives. For example,
X|Y finds the first character in a line that is X or Y, whereas if the two
patterns are given separately, \fBpcregrep\fP finds X if it is present, even if
it follows Y in the line. It finds Y only if there is no X in the line. This
really matters only if you are using \fB-o\fP to show the part(s) of the line
that matched.
.TP
\fB--exclude\fP=\fIpattern\fP
When \fBpcregrep\fP is searching the files in a directory as a consequence of
the \fB-r\fP (recursive search) option, any files whose names match the pattern
are excluded. The pattern is a PCRE regular expression. If a file name matches
both \fB--include\fP and \fB--exclude\fP, it is excluded. There is no short
form for this option.
the \fB-r\fP (recursive search) option, any regular files whose names match the
pattern are excluded. Subdirectories are not excluded by this option; they are
searched recursively, subject to the \fB--exclude_dir\fP and
\fB--include_dir\fP options. The pattern is a PCRE regular expression, and is
matched against the final component of the file name (not the entire path). If
a file name matches both \fB--include\fP and \fB--exclude\fP, it is excluded.
There is no short form for this option.
.TP
\fB--exclude_dir\fP=\fIpattern\fP
When \fBpcregrep\fP is searching the contents of a directory as a consequence
of the \fB-r\fP (recursive search) option, any subdirectories whose names match
the pattern are excluded. (Note that the \fB--exclude\fP option does not affect
subdirectories.) The pattern is a PCRE regular expression, and is matched
against the final component of the name (not the entire path). If a
subdirectory name matches both \fB--include_dir\fP and \fB--exclude_dir\fP, it
is excluded. There is no short form for this option.
.TP
\fB-F\fP, \fB--fixed-strings\fP
Interpret each pattern as a list of fixed strings, separated by newlines,
@@ -156,34 +205,55 @@ present; they are tested before the file's patterns. However, no other pattern
is taken from the command line; all arguments are treated as file names. There
is an overall maximum of 100 patterns. Trailing white space is removed from
each line, and blank lines are ignored. An empty file contains no patterns and
therefore matches nothing.
therefore matches nothing. See also the comments about multiple patterns versus
a single pattern with alternatives in the description of \fB-e\fP above.
.TP
\fB--file-offsets\fP
Instead of showing lines or parts of lines that match, show each match as an
offset from the start of the file and a length, separated by a comma. In this
mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP
options are ignored. If there is more than one match in a line, each of them is
shown separately. This option is mutually exclusive with \fB--line-offsets\fP
and \fB--only-matching\fP.
.TP
\fB-H\fP, \fB--with-filename\fP
Force the inclusion of the filename at the start of output lines when searching
a single file. By default, the filename is not shown in this case. For matching
lines, the filename is followed by a colon and a space; for context lines, a
hyphen separator is used. If a line number is also being output, it follows the
file name without a space.
lines, the filename is followed by a colon; for context lines, a hyphen
separator is used. If a line number is also being output, it follows the file
name.
.TP
\fB-h\fP, \fB--no-filename\fP
Suppress the output filenames when searching multiple files. By default,
filenames are shown when multiple files are searched. For matching lines, the
filename is followed by a colon and a space; for context lines, a hyphen
separator is used. If a line number is also being output, it follows the file
name without a space.
filename is followed by a colon; for context lines, a hyphen separator is used.
If a line number is also being output, it follows the file name.
.TP
\fB--help\fP
Output a brief help message and exit.
Output a help message, giving brief details of the command options and file
type support, and then exit.
.TP
\fB-i\fP, \fB--ignore-case\fP
Ignore upper/lower case distinctions during comparisons.
.TP
\fB--include\fP=\fIpattern\fP
When \fBpcregrep\fP is searching the files in a directory as a consequence of
the \fB-r\fP (recursive search) option, only those files whose names match the
pattern are included. The pattern is a PCRE regular expression. If a file name
matches both \fB--include\fP and \fB--exclude\fP, it is excluded. There is no
short form for this option.
the \fB-r\fP (recursive search) option, only those regular files whose names
match the pattern are included. Subdirectories are always included and searched
recursively, subject to the \fB--include_dir\fP and \fB--exclude_dir\fP
options. The pattern is a PCRE regular expression, and is matched against the
final component of the file name (not the entire path). If a file name matches
both \fB--include\fP and \fB--exclude\fP, it is excluded. There is no short
form for this option.
.TP
\fB--include_dir\fP=\fIpattern\fP
When \fBpcregrep\fP is searching the contents of a directory as a consequence
of the \fB-r\fP (recursive search) option, only those subdirectories whose
names match the pattern are included. (Note that the \fB--include\fP option
does not affect subdirectories.) The pattern is a PCRE regular expression, and
is matched against the final component of the name (not the entire path). If a
subdirectory name matches both \fB--include_dir\fP and \fB--exclude_dir\fP, it
is excluded. There is no short form for this option.
.TP
\fB-L\fP, \fB--files-without-match\fP
Instead of outputting lines from the files, just output the names of the files
@@ -201,6 +271,15 @@ This option supplies a name to be used for the standard input when file names
are being output. If not supplied, "(standard input)" is used. There is no
short form for this option.
.TP
\fB--line-offsets\fP
Instead of showing lines or parts of lines that match, show each match as a
line number, the offset from the start of the line, and a length. The line
number is terminated by a colon (as usual; see the \fB-n\fP option), and the
offset and length are separated by a comma. In this mode, no context is shown.
That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is
more than one match in a line, each of them is shown separately. This option is
mutually exclusive with \fB--file-offsets\fP and \fB--only-matching\fP.
.TP
\fB--locale\fP=\fIlocale-name\fP
This option specifies a locale to be used for pattern matching. It overrides
the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
@@ -220,26 +299,38 @@ the previous 8K characters (or all the previous characters, if fewer than 8K)
are guaranteed to be available for lookbehind assertions.
.TP
\fB-N\fP \fInewline-type\fP, \fB--newline=\fP\fInewline-type\fP
The PCRE library supports three different character sequences for indicating
The PCRE library supports five different conventions for indicating
the ends of lines. They are the single-character sequences CR (carriage return)
and LF (linefeed), and the two-character sequence CR, LF. When the library is
built, a default line-ending sequence is specified. This is normally the
standard sequence for the operating system. Unless otherwise specified by this
option, \fBpcregrep\fP uses the default. The possible values for this option
are CR, LF, or CRLF. This makes it possible to use \fBpcregrep\fP on files that
have come from other environments without having to modify their line endings.
If the data that is being scanned does not agree with the convention set by
this option, \fBpcregrep\fP may behave in strange ways.
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
which recognizes any of the preceding three types, and an "any" convention, in
which any Unicode line ending sequence is assumed to end a line. The Unicode
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
PS (paragraph separator, U+2029).
.sp
When the PCRE library is built, a default line-ending sequence is specified.
This is normally the standard sequence for the operating system. Unless
otherwise specified by this option, \fBpcregrep\fP uses the library's default.
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
makes it possible to use \fBpcregrep\fP on files that have come from other
environments without having to modify their line endings. If the data that is
being scanned does not agree with the convention set by this option,
\fBpcregrep\fP may behave in strange ways.
.TP
\fB-n\fP, \fB--line-number\fP
Precede each output line by its line number in the file, followed by a colon
and a space for matching lines or a hyphen and a space for context lines. If
the filename is also being output, it precedes the line number.
for matching lines or a hyphen for context lines. If the filename is also being
output, it precedes the line number. This option is forced if
\fB--line-offsets\fP is used.
.TP
\fB-o\fP, \fB--only-matching\fP
Show only the part of the line that matched a pattern. In this mode, no
context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are
ignored.
ignored. If there is more than one match in a line, each of them is shown
separately. If \fB-o\fP is combined with \fB-v\fP (invert the sense of the
match to find non-matching lines), no output is generated, but the return code
is set appropriately. This option is mutually exclusive with
\fB--file-offsets\fP and \fB--line-offsets\fP.
.TP
\fB-q\fP, \fB--quiet\fP
Work quietly, that is, display nothing except error messages. The exit
@@ -274,7 +365,7 @@ the patterns are the ones that are found.
Force the patterns to match only whole words. This is equivalent to having \eb
at the start and end of the pattern.
.TP
\fB-x\fP, \fB--line-regex\fP, \fP--line-regexp\fP
\fB-x\fP, \fB--line-regex\fP, \fB--line-regexp\fP
Force the patterns to be anchored (each must start matching at the beginning of
a line) and in addition, require them to match entire lines. This is
equivalent to having ^ and $ characters at the start and end of each
@@ -339,7 +430,7 @@ in the first form, using an equals character. Otherwise it will be assumed that
it has no data.
.
.
.SH MATCHING ERRORS
.SH "MATCHING ERRORS"
.rs
.sp
It is possible to supply a regular expression that takes a very long time to
@@ -361,16 +452,26 @@ suppress error messages about inaccessible files does not affect the return
code.
.
.
.SH "SEE ALSO"
.rs
.sp
\fBpcrepattern\fP(3), \fBpcretest\fP(1).
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
.br
University Computing Service
.br
Cambridge CB2 3QG, England.
.P
.in 0
Last updated: 06 June 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 01 March 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

View File

@@ -14,8 +14,8 @@ DESCRIPTION
pcregrep searches files for character patterns, in the same way as
other grep commands do, but it uses the PCRE regular expression library
to support patterns that are compatible with the regular expressions of
Perl 5. See pcrepattern for a full description of syntax and semantics
of the regular expressions that PCRE supports.
Perl 5. See pcrepattern(3) for a full description of syntax and seman-
tics of the regular expressions that PCRE supports.
Patterns, whether supplied on the command line or in a separate file,
are given without delimiters. For example:
@@ -24,37 +24,72 @@ DESCRIPTION
If you attempt to use delimiters (for example, by surrounding a pattern
with slashes, as is common in Perl scripts), they are interpreted as
part of the pattern. Quotes can of course be used on the command line
because they are interpreted by the shell, and indeed they are required
if a pattern contains white space or shell metacharacters.
part of the pattern. Quotes can of course be used to delimit patterns
on the command line because they are interpreted by the shell, and
indeed they are required if a pattern contains white space or shell
metacharacters.
The first argument that follows any option settings is treated as the
single pattern to be matched when neither -e nor -f is present. Con-
versely, when one or both of these options are used to specify pat-
The first argument that follows any option settings is treated as the
single pattern to be matched when neither -e nor -f is present. Con-
versely, when one or both of these options are used to specify pat-
terns, all arguments are treated as path names. At least one of -e, -f,
or an argument pattern must be provided.
If no files are specified, pcregrep reads the standard input. The stan-
dard input can also be referenced by a name consisting of a single
dard input can also be referenced by a name consisting of a single
hyphen. For example:
pcregrep some-pattern /file1 - /file3
By default, each line that matches the pattern is copied to the stan-
dard output, and if there is more than one file, the file name is out-
put at the start of each line. However, there are options that can
change how pcregrep behaves. In particular, the -M option makes it pos-
sible to search for patterns that span line boundaries. What defines a
line boundary is controlled by the -N (--newline) option.
By default, each line that matches a pattern is copied to the standard
output, and if there is more than one file, the file name is output at
the start of each line, followed by a colon. However, there are options
that can change how pcregrep behaves. In particular, the -M option
makes it possible to search for patterns that span line boundaries.
What defines a line boundary is controlled by the -N (--newline)
option.
Patterns are limited to 8K or BUFSIZ characters, whichever is the
greater. BUFSIZ is defined in <stdio.h>.
greater. BUFSIZ is defined in <stdio.h>. When there is more than one
pattern (specified by the use of -e and/or -f), each pattern is applied
to each line in the order in which they are defined, except that all
the -e patterns are tried before the -f patterns.
If the LC_ALL or LC_CTYPE environment variable is set, pcregrep uses
the value to set a locale when calling the PCRE library. The --locale
By default, as soon as one pattern matches (or fails to match when -v
is used), no further patterns are considered. However, if --colour (or
--color) is used to colour the matching substrings, or if --only-match-
ing, --file-offsets, or --line-offsets is used to output only the part
of the line that matched (either shown literally, or as an offset),
scanning resumes immediately following the match, so that further
matches on the same line can be found. If there are multiple patterns,
they are all tried on the remainder of the line, but patterns that fol-
low the one that matched are not tried on the earlier part of the line.
This is the same behaviour as GNU grep, but it does mean that the order
in which multiple patterns are specified can affect the output when one
of the above options is used.
Patterns that can match an empty string are accepted, but empty string
matches are not recognized. An example is the pattern "(super)?(man)?",
in which all components are optional. This pattern finds all occur-
rences of both "super" and "man"; the output differs from matching with
"super|man" when only the matching substrings are being shown.
If the LC_ALL or LC_CTYPE environment variable is set, pcregrep uses
the value to set a locale when calling the PCRE library. The --locale
option can be used to override this.
SUPPORT FOR COMPRESSED FILES
It is possible to compile pcregrep so that it uses libz or libbz2 to
read files whose names end in .gz or .bz2, respectively. You can find
out whether your binary has support for one or both of these file types
by running it with the --help option. If the appropriate support is not
present, files are treated as plain text. The standard input is always
so treated.
OPTIONS
-- This terminates the list of options. It is useful if the next
@@ -99,110 +134,156 @@ OPTIONS
the same shell item, separated by an equals sign.
--colour=value, --color=value
This option specifies under what circumstances the part of a
This option specifies under what circumstances the parts of a
line that matched a pattern should be coloured in the output.
The value may be "never" (the default), "always", or "auto".
In the latter case, colouring happens only if the standard
output is connected to a terminal. The colour can be speci-
fied by setting the environment variable PCREGREP_COLOUR or
PCREGREP_COLOR. The value of this variable should be a string
of two numbers, separated by a semicolon. They are copied
directly into the control string for setting colour on a ter-
minal, so it is your responsibility to ensure that they make
sense. If neither of the environment variables is set, the
default is "1;31", which gives red.
By default, the output is not coloured. The value (which is
optional, see above) may be "never", "always", or "auto". In
the latter case, colouring happens only if the standard out-
put is connected to a terminal. More resources are used when
colouring is enabled, because pcregrep has to search for all
possible matches in a line, not just one, in order to colour
them all.
The colour that is used can be specified by setting the envi-
ronment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
of this variable should be a string of two numbers, separated
by a semicolon. They are copied directly into the control
string for setting colour on a terminal, so it is your
responsibility to ensure that they make sense. If neither of
the environment variables is set, the default is "1;31",
which gives red.
-D action, --devices=action
If an input path is not a regular file or a directory,
"action" specifies how it is to be processed. Valid values
are "read" (the default) or "skip" (silently skip the path).
are "read" (the default) or "skip" (silently skip the path).
-d action, --directories=action
If an input path is a directory, "action" specifies how it is
to be processed. Valid values are "read" (the default),
"recurse" (equivalent to the -r option), or "skip" (silently
skip the path). In the default case, directories are read as
if they were ordinary files. In some operating systems the
effect of reading a directory like this is an immediate end-
to be processed. Valid values are "read" (the default),
"recurse" (equivalent to the -r option), or "skip" (silently
skip the path). In the default case, directories are read as
if they were ordinary files. In some operating systems the
effect of reading a directory like this is an immediate end-
of-file.
-e pattern, --regex=pattern,
--regexp=pattern Specify a pattern to be matched. This option
can be used multiple times in order to specify several pat-
terns. It can also be used as a way of specifying a single
pattern that starts with a hyphen. When -e is used, no argu-
ment pattern is taken from the command line; all arguments
are treated as file names. There is an overall maximum of 100
patterns. They are applied to each line in the order in which
they are defined until one matches (or fails to match if -v
is used). If -f is used with -e, the command line patterns
are matched first, followed by the patterns from the file,
independent of the order in which these options are speci-
fied. Note that multiple use of -e is not the same as a sin-
gle pattern with alternatives. For example, X|Y finds the
first character in a line that is X or Y, whereas if the two
patterns are given separately, pcregrep finds X if it is
present, even if it follows Y in the line. It finds Y only if
there is no X in the line. This really matters only if you
are using -o to show the portion of the line that matched.
-e pattern, --regex=pattern, --regexp=pattern
Specify a pattern to be matched. This option can be used mul-
tiple times in order to specify several patterns. It can also
be used as a way of specifying a single pattern that starts
with a hyphen. When -e is used, no argument pattern is taken
from the command line; all arguments are treated as file
names. There is an overall maximum of 100 patterns. They are
applied to each line in the order in which they are defined
until one matches (or fails to match if -v is used). If -f is
used with -e, the command line patterns are matched first,
followed by the patterns from the file, independent of the
order in which these options are specified. Note that multi-
ple use of -e is not the same as a single pattern with alter-
natives. For example, X|Y finds the first character in a line
that is X or Y, whereas if the two patterns are given sepa-
rately, pcregrep finds X if it is present, even if it follows
Y in the line. It finds Y only if there is no X in the line.
This really matters only if you are using -o to show the
part(s) of the line that matched.
--exclude=pattern
When pcregrep is searching the files in a directory as a con-
sequence of the -r (recursive search) option, any files whose
names match the pattern are excluded. The pattern is a PCRE
regular expression. If a file name matches both --include and
--exclude, it is excluded. There is no short form for this
sequence of the -r (recursive search) option, any regular
files whose names match the pattern are excluded. Subdirecto-
ries are not excluded by this option; they are searched
recursively, subject to the --exclude_dir and --include_dir
options. The pattern is a PCRE regular expression, and is
matched against the final component of the file name (not the
entire path). If a file name matches both --include and
--exclude, it is excluded. There is no short form for this
option.
--exclude_dir=pattern
When pcregrep is searching the contents of a directory as a
consequence of the -r (recursive search) option, any subdi-
rectories whose names match the pattern are excluded. (Note
that the --exclude option does not affect subdirectories.)
The pattern is a PCRE regular expression, and is matched
against the final component of the name (not the entire
path). If a subdirectory name matches both --include_dir and
--exclude_dir, it is excluded. There is no short form for
this option.
-F, --fixed-strings
Interpret each pattern as a list of fixed strings, separated
by newlines, instead of as a regular expression. The -w
(match as a word) and -x (match whole line) options can be
Interpret each pattern as a list of fixed strings, separated
by newlines, instead of as a regular expression. The -w
(match as a word) and -x (match whole line) options can be
used with -F. They apply to each of the fixed strings. A line
is selected if any of the fixed strings are found in it (sub-
ject to -w or -x, if present).
-f filename, --file=filename
Read a number of patterns from the file, one per line, and
match them against each line of input. A data line is output
Read a number of patterns from the file, one per line, and
match them against each line of input. A data line is output
if any of the patterns match it. The filename can be given as
"-" to refer to the standard input. When -f is used, patterns
specified on the command line using -e may also be present;
specified on the command line using -e may also be present;
they are tested before the file's patterns. However, no other
pattern is taken from the command line; all arguments are
treated as file names. There is an overall maximum of 100
pattern is taken from the command line; all arguments are
treated as file names. There is an overall maximum of 100
patterns. Trailing white space is removed from each line, and
blank lines are ignored. An empty file contains no patterns
and therefore matches nothing.
blank lines are ignored. An empty file contains no patterns
and therefore matches nothing. See also the comments about
multiple patterns versus a single pattern with alternatives
in the description of -e above.
--file-offsets
Instead of showing lines or parts of lines that match, show
each match as an offset from the start of the file and a
length, separated by a comma. In this mode, no context is
shown. That is, the -A, -B, and -C options are ignored. If
there is more than one match in a line, each of them is shown
separately. This option is mutually exclusive with --line-
offsets and --only-matching.
-H, --with-filename
Force the inclusion of the filename at the start of output
lines when searching a single file. By default, the filename
is not shown in this case. For matching lines, the filename
is followed by a colon and a space; for context lines, a
hyphen separator is used. If a line number is also being out-
put, it follows the file name without a space.
Force the inclusion of the filename at the start of output
lines when searching a single file. By default, the filename
is not shown in this case. For matching lines, the filename
is followed by a colon; for context lines, a hyphen separator
is used. If a line number is also being output, it follows
the file name.
-h, --no-filename
Suppress the output filenames when searching multiple files.
By default, filenames are shown when multiple files are
searched. For matching lines, the filename is followed by a
colon and a space; for context lines, a hyphen separator is
used. If a line number is also being output, it follows the
file name without a space.
Suppress the output filenames when searching multiple files.
By default, filenames are shown when multiple files are
searched. For matching lines, the filename is followed by a
colon; for context lines, a hyphen separator is used. If a
line number is also being output, it follows the file name.
--help Output a brief help message and exit.
--help Output a help message, giving brief details of the command
options and file type support, and then exit.
-i, --ignore-case
Ignore upper/lower case distinctions during comparisons.
--include=pattern
When pcregrep is searching the files in a directory as a con-
sequence of the -r (recursive search) option, only those
files whose names match the pattern are included. The pattern
is a PCRE regular expression. If a file name matches both
--include and --exclude, it is excluded. There is no short
form for this option.
sequence of the -r (recursive search) option, only those reg-
ular files whose names match the pattern are included. Subdi-
rectories are always included and searched recursively, sub-
ject to the --include_dir and --exclude_dir options. The pat-
tern is a PCRE regular expression, and is matched against the
final component of the file name (not the entire path). If a
file name matches both --include and --exclude, it is
excluded. There is no short form for this option.
--include_dir=pattern
When pcregrep is searching the contents of a directory as a
consequence of the -r (recursive search) option, only those
subdirectories whose names match the pattern are included.
(Note that the --include option does not affect subdirecto-
ries.) The pattern is a PCRE regular expression, and is
matched against the final component of the name (not the
entire path). If a subdirectory name matches both
--include_dir and --exclude_dir, it is excluded. There is no
short form for this option.
-L, --files-without-match
Instead of outputting lines from the files, just output the
@@ -222,6 +303,17 @@ OPTIONS
when file names are being output. If not supplied, "(standard
input)" is used. There is no short form for this option.
--line-offsets
Instead of showing lines or parts of lines that match, show
each match as a line number, the offset from the start of the
line, and a length. The line number is terminated by a colon
(as usual; see the -n option), and the offset and length are
separated by a comma. In this mode, no context is shown.
That is, the -A, -B, and -C options are ignored. If there is
more than one match in a line, each of them is shown sepa-
rately. This option is mutually exclusive with --file-offsets
and --only-matching.
--locale=locale-name
This option specifies a locale to be used for pattern match-
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
@@ -245,60 +337,73 @@ OPTIONS
lookbehind assertions.
-N newline-type, --newline=newline-type
The PCRE library supports three different character sequences
for indicating the ends of lines. They are the single-charac-
ter sequences CR (carriage return) and LF (linefeed), and the
two-character sequence CR, LF. When the library is built, a
default line-ending sequence is specified. This is normally
the standard sequence for the operating system. Unless other-
wise specified by this option, pcregrep uses the default. The
possible values for this option are CR, LF, or CRLF. This
makes it possible to use pcregrep on files that have come
from other environments without having to modify their line
endings. If the data that is being scanned does not agree
with the convention set by this option, pcregrep may behave
in strange ways.
The PCRE library supports five different conventions for
indicating the ends of lines. They are the single-character
sequences CR (carriage return) and LF (linefeed), the two-
character sequence CRLF, an "anycrlf" convention, which rec-
ognizes any of the preceding three types, and an "any" con-
vention, in which any Unicode line ending sequence is assumed
to end a line. The Unicode sequences are the three just men-
tioned, plus VT (vertical tab, U+000B), FF (formfeed,
U+000C), NEL (next line, U+0085), LS (line separator,
U+2028), and PS (paragraph separator, U+2029).
When the PCRE library is built, a default line-ending
sequence is specified. This is normally the standard
sequence for the operating system. Unless otherwise specified
by this option, pcregrep uses the library's default. The
possible values for this option are CR, LF, CRLF, ANYCRLF, or
ANY. This makes it possible to use pcregrep on files that
have come from other environments without having to modify
their line endings. If the data that is being scanned does
not agree with the convention set by this option, pcregrep
may behave in strange ways.
-n, --line-number
Precede each output line by its line number in the file, fol-
lowed by a colon and a space for matching lines or a hyphen
and a space for context lines. If the filename is also being
output, it precedes the line number.
lowed by a colon for matching lines or a hyphen for context
lines. If the filename is also being output, it precedes the
line number. This option is forced if --line-offsets is used.
-o, --only-matching
Show only the part of the line that matched a pattern. In
this mode, no context is shown. That is, the -A, -B, and -C
options are ignored.
options are ignored. If there is more than one match in a
line, each of them is shown separately. If -o is combined
with -v (invert the sense of the match to find non-matching
lines), no output is generated, but the return code is set
appropriately. This option is mutually exclusive with --file-
offsets and --line-offsets.
-q, --quiet
Work quietly, that is, display nothing except error messages.
The exit status indicates whether or not any matches were
The exit status indicates whether or not any matches were
found.
-r, --recursive
If any given path is a directory, recursively scan the files
it contains, taking note of any --include and --exclude set-
tings. By default, a directory is read as a normal file; in
some operating systems this gives an immediate end-of-file.
This option is a shorthand for setting the -d option to
If any given path is a directory, recursively scan the files
it contains, taking note of any --include and --exclude set-
tings. By default, a directory is read as a normal file; in
some operating systems this gives an immediate end-of-file.
This option is a shorthand for setting the -d option to
"recurse".
-s, --no-messages
Suppress error messages about non-existent or unreadable
files. Such files are quietly skipped. However, the return
Suppress error messages about non-existent or unreadable
files. Such files are quietly skipped. However, the return
code is still 2, even if matches were found in other files.
-u, --utf-8
Operate in UTF-8 mode. This option is available only if PCRE
has been compiled with UTF-8 support. Both patterns and sub-
Operate in UTF-8 mode. This option is available only if PCRE
has been compiled with UTF-8 support. Both patterns and sub-
ject lines must be valid strings of UTF-8 characters.
-V, --version
Write the version numbers of pcregrep and the PCRE library
Write the version numbers of pcregrep and the PCRE library
that is being used to the standard error stream.
-v, --invert-match
Invert the sense of the match, so that lines which do not
Invert the sense of the match, so that lines which do not
match any of the patterns are the ones that are found.
-w, --word-regex, --word-regexp
@@ -306,61 +411,61 @@ OPTIONS
lent to having \b at the start and end of the pattern.
-x, --line-regex, --line-regexp
Force the patterns to be anchored (each must start matching
at the beginning of a line) and in addition, require them to
match entire lines. This is equivalent to having ^ and $
Force the patterns to be anchored (each must start matching
at the beginning of a line) and in addition, require them to
match entire lines. This is equivalent to having ^ and $
characters at the start and end of each alternative branch in
every pattern.
ENVIRONMENT VARIABLES
The environment variables LC_ALL and LC_CTYPE are examined, in that
order, for a locale. The first one that is set is used. This can be
overridden by the --locale option. If no locale is set, the PCRE
The environment variables LC_ALL and LC_CTYPE are examined, in that
order, for a locale. The first one that is set is used. This can be
overridden by the --locale option. If no locale is set, the PCRE
library's default (usually the "C" locale) is used.
NEWLINES
The -N (--newline) option allows pcregrep to scan files with different
newline conventions from the default. However, the setting of this
option does not affect the way in which pcregrep writes information to
the standard error and output streams. It uses the string "\n" in C
printf() calls to indicate newlines, relying on the C I/O library to
convert this to an appropriate sequence if the output is sent to a
The -N (--newline) option allows pcregrep to scan files with different
newline conventions from the default. However, the setting of this
option does not affect the way in which pcregrep writes information to
the standard error and output streams. It uses the string "\n" in C
printf() calls to indicate newlines, relying on the C I/O library to
convert this to an appropriate sequence if the output is sent to a
file.
OPTIONS COMPATIBILITY
The majority of short and long forms of pcregrep's options are the same
as in the GNU grep program. Any long option of the form --xxx-regexp
(GNU terminology) is also available as --xxx-regex (PCRE terminology).
However, the --locale, -M, --multiline, -u, and --utf-8 options are
as in the GNU grep program. Any long option of the form --xxx-regexp
(GNU terminology) is also available as --xxx-regex (PCRE terminology).
However, the --locale, -M, --multiline, -u, and --utf-8 options are
specific to pcregrep.
OPTIONS WITH DATA
There are four different ways in which an option with data can be spec-
ified. If a short form option is used, the data may follow immedi-
ified. If a short form option is used, the data may follow immedi-
ately, or in the next command line item. For example:
-f/some/file
-f /some/file
If a long form option is used, the data may appear in the same command
If a long form option is used, the data may appear in the same command
line item, separated by an equals character, or (with one exception) it
may appear in the next command line item. For example:
--file=/some/file
--file /some/file
Note, however, that if you want to supply a file name beginning with ~
as data in a shell command, and have the shell expand ~ to a home
Note, however, that if you want to supply a file name beginning with ~
as data in a shell command, and have the shell expand ~ to a home
directory, you must separate the file name from the option, because the
shell does not treat ~ specially unless it is at the start of an item.
shell does not treat ~ specially unless it is at the start of an item.
The exception to the above is the --colour (or --color) option, for
which the data is optional. If this option does have data, it must be
@@ -389,11 +494,19 @@ DIAGNOSTICS
not affect the return code.
SEE ALSO
pcrepattern(3), pcretest(1).
AUTHOR
Philip Hazel
University Computing Service
Cambridge CB2 3QG, England.
Cambridge CB2 3QH, England.
Last updated: 06 June 2006
Copyright (c) 1997-2006 University of Cambridge.
REVISION
Last updated: 01 March 2009
Copyright (c) 1997-2009 University of Cambridge.

View File

@@ -26,7 +26,7 @@ is matched against the string
<something> <something else> <something further>
.sp
there are three possible answers. The standard algorithm finds only one of
them, whereas the DFA algorithm finds all three.
them, whereas the alternative algorithm finds all three.
.
.SH "REGULAR EXPRESSIONS AS TREES"
.rs
@@ -41,8 +41,8 @@ correspond to the two matching algorithms provided by PCRE.
.SH "THE STANDARD MATCHING ALGORITHM"
.rs
.sp
In the terminology of Jeffrey Friedl's book \fIMastering Regular
Expressions\fP, the standard algorithm is an "NFA algorithm". It conducts a
In the terminology of Jeffrey Friedl's book "Mastering Regular
Expressions", the standard algorithm is an "NFA algorithm". It conducts a
depth-first search of the pattern tree. That is, it proceeds along a single
path through the tree, checking that the subject matches what is required. When
there is a mismatch, the algorithm tries any alternatives at the current point,
@@ -63,15 +63,16 @@ straightforward for this algorithm to keep track of the substrings that are
matched by portions of the pattern in parentheses. This provides support for
capturing parentheses and back references.
.
.SH "THE DFA MATCHING ALGORITHM"
.SH "THE ALTERNATIVE MATCHING ALGORITHM"
.rs
.sp
DFA stands for "deterministic finite automaton", but you do not need to
understand the origins of that name. This algorithm conducts a breadth-first
search of the tree. Starting from the first matching point in the subject, it
scans the subject string from left to right, once, character by character, and
as it does this, it remembers all the paths through the tree that represent
valid matches.
This algorithm conducts a breadth-first search of the tree. Starting from the
first matching point in the subject, it scans the subject string from left to
right, once, character by character, and as it does this, it remembers all the
paths through the tree that represent valid matches. In Friedl's terminology,
this is a kind of "DFA algorithm", though it is not implemented as a
traditional finite state machine (it keeps multiple states active
simultaneously).
.P
The scan continues until either the end of the subject is reached, or there are
no more unterminated paths. At this point, terminated paths represent the
@@ -92,11 +93,20 @@ character of the subject. The algorithm does not automatically move on to find
matches that start at later positions.
.P
There are a number of features of PCRE regular expressions that are not
supported by the DFA matching algorithm. They are as follows:
supported by the alternative matching algorithm. They are as follows:
.P
1. Because the algorithm finds all possible matches, the greedy or ungreedy
nature of repetition quantifiers is not relevant. Greedy and ungreedy
quantifiers are treated in exactly the same way.
quantifiers are treated in exactly the same way. However, possessive
quantifiers can make a difference when what follows could also match what is
quantified, for example in a pattern like this:
.sp
^a++\ew!
.sp
This pattern matches "aaab!" but not "aaa!", which would be matched by a
non-possessive quantifier. Similarly, if an atomic group is present, it is
matched as if it were a standalone pattern at the current point, and the
longest match is then "locked in" for the rest of the overall pattern.
.P
2. When dealing with multiple paths through the tree simultaneously, it is not
straightforward to keep track of captured substrings for the different matching
@@ -107,21 +117,27 @@ do this. This means that no captured substrings are available.
not supported, and cause errors if encountered.
.P
4. For the same reason, conditional expressions that use a backreference as the
condition are not supported.
condition or test for a specific group recursion are not supported.
.P
5. Callouts are supported, but the value of the \fIcapture_top\fP field is
5. Because many paths through the tree may be active, the \eK escape sequence,
which resets the start of the match when encountered (but may be on some paths
and not on others), is not supported. It causes an error if encountered.
.P
6. Callouts are supported, but the value of the \fIcapture_top\fP field is
always 1, and the value of the \fIcapture_last\fP field is always -1.
.P
6.
The \eC escape sequence, which (in the standard algorithm) matches a single
byte, even in UTF-8 mode, is not supported because the DFA algorithm moves
through the subject string one character at a time, for all active paths
7. The \eC escape sequence, which (in the standard algorithm) matches a single
byte, even in UTF-8 mode, is not supported because the alternative algorithm
moves through the subject string one character at a time, for all active paths
through the tree.
.P
8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
supported. (*FAIL) is supported, and behaves like a failing negative assertion.
.
.SH "ADVANTAGES OF THE DFA ALGORITHM"
.SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
.rs
.sp
Using the DFA matching algorithm provides the following advantages:
Using the alternative matching algorithm provides the following advantages:
.P
1. All possible matches (at a single point in the subject) are automatically
found, and in particular, the longest match is found. To find more than one
@@ -130,17 +146,18 @@ callouts.
.P
2. There is much better support for partial matching. The restrictions on the
content of the pattern that apply when using the standard algorithm for partial
matching do not apply to the DFA algorithm. For non-anchored patterns, the
starting position of a partial match is available.
matching do not apply to the alternative algorithm. For non-anchored patterns,
the starting position of a partial match is available.
.P
3. Because the DFA algorithm scans the subject string just once, and never
needs to backtrack, it is possible to pass very long subject strings to the
matching function in several pieces, checking for partial matching each time.
3. Because the alternative algorithm scans the subject string just once, and
never needs to backtrack, it is possible to pass very long subject strings to
the matching function in several pieces, checking for partial matching each
time.
.
.SH "DISADVANTAGES OF THE DFA ALGORITHM"
.SH "DISADVANTAGES OF THE ALTERNATIVE ALGORITHM"
.rs
.sp
The DFA algorithm suffers from a number of disadvantages:
The alternative algorithm suffers from a number of disadvantages:
.P
1. It is substantially slower than the standard algorithm. This is partly
because it has to search for all possible matches, but is also because it is
@@ -148,10 +165,24 @@ less susceptible to optimization.
.P
2. Capturing parentheses and back references are not supported.
.P
3. The "atomic group" feature of PCRE regular expressions is supported, but
does not provide the advantage that it does for the standard algorithm.
.P
.in 0
Last updated: 06 June 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
3. Although atomic groups are supported, their use does not provide the
performance advantage that it does for the standard algorithm.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 19 April 2008
Copyright (c) 1997-2008 University of Cambridge.
.fi

View File

@@ -71,6 +71,8 @@ envisaged for this facility, this is not felt to be a major restriction.
.P
If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
\fBpcre_exec()\fP returns the error code PCRE_ERROR_BADPARTIAL (-13).
You can use the PCRE_INFO_OKPARTIAL call to \fBpcre_fullinfo()\fP to find out
if a compiled pattern can be used for partial matching.
.
.
.SH "EXAMPLE OF PARTIAL MATCHING USING PCRETEST"
@@ -95,10 +97,11 @@ uses the date example quoted above:
.sp
The first data string is matched completely, so \fBpcretest\fP shows the
matched substrings. The remaining four strings do not match the complete
pattern, but the first two are partial matches. The same test, using DFA
matching (by means of the \eD escape sequence), produces the following output:
pattern, but the first two are partial matches. The same test, using
\fBpcre_dfa_exec()\fP matching (by means of the \eD escape sequence), produces
the following output:
.sp
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
data> 25jun04\eP\eD
0: 25jun04
data> 23dec3\eP\eD
@@ -119,13 +122,13 @@ available.
.sp
When a partial match has been found using \fBpcre_dfa_exec()\fP, it is possible
to continue the match by providing additional subject data and calling
\fBpcre_dfa_exec()\fP again with the PCRE_DFA_RESTART option and the same
working space (where details of the previous partial match are stored). Here is
an example using \fBpcretest\fP, where the \eR escape sequence sets the
PCRE_DFA_RESTART option and the \eD escape sequence requests the use of
\fBpcre_dfa_exec()\fP:
\fBpcre_dfa_exec()\fP again with the same compiled regular expression, this
time setting the PCRE_DFA_RESTART option. You must also pass the same working
space as before, because this is where details of the previous partial match
are stored. Here is an example using \fBpcretest\fP, using the \eR escape
sequence to set the PCRE_DFA_RESTART option (\eP and \eD are as above):
.sp
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
data> 23ja\eP\eD
Partial match: 23ja
data> n05\eR\eD
@@ -137,9 +140,10 @@ Notice that when the match is complete, only the last part is shown; PCRE does
not retain the previously partially-matched string. It is up to the calling
program to do that if it needs to.
.P
This facility can be used to pass very long subject strings to
\fBpcre_dfa_exec()\fP. However, some care is needed for certain types of
pattern.
You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
over multiple segments. This facility can be used to pass very long subject
strings to \fBpcre_dfa_exec()\fP. However, some care is needed for certain
types of pattern.
.P
1. If the pattern contains tests for the beginning or end of a line, you need
to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropriate, when the
@@ -147,7 +151,7 @@ subject string for any call does not contain the beginning or end of a line.
.P
2. If the pattern contains backward assertions (including \eb or \eB), you need
to arrange for some overlap in the subject strings to allow for this. For
example, you could pass the subject in chunks that were 500 bytes long, but in
example, you could pass the subject in chunks that are 500 bytes long, but in
a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
bytes at the start of the buffer.
.P
@@ -155,7 +159,7 @@ bytes at the start of the buffer.
always produce exactly the same result as matching over one single long string.
The difference arises when there are multiple matching possibilities, because a
partial match result is given only when there are no completed matches in a
call to fBpcre_dfa_exec()\fP. This means that as soon as the shortest match has
call to \fBpcre_dfa_exec()\fP. This means that as soon as the shortest match has
been found, continuation to a new subject segment is no longer possible.
Consider this \fBpcretest\fP example:
.sp
@@ -196,8 +200,20 @@ patterns or patterns such as:
where no string can be a partial match for both alternatives.
.
.
.P
.in 0
Last updated: 16 January 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 04 June 2007
Copyright (c) 1997-2007 University of Cambridge.
.fi

File diff suppressed because it is too large Load Diff

View File

@@ -4,13 +4,75 @@ PCRE - Perl-compatible regular expressions
.SH "PCRE PERFORMANCE"
.rs
.sp
Certain items that may appear in regular expression patterns are more efficient
Two aspects of performance are discussed below: memory usage and processing
time. The way you express your pattern as a regular expression can affect both
of them.
.
.SH "MEMORY USAGE"
.rs
.sp
Patterns are compiled by PCRE into a reasonably efficient byte code, so that
most simple patterns do not use much memory. However, there is one case where
memory usage can be unexpectedly large. When a parenthesized subpattern has a
quantifier with a minimum greater than 1 and/or a limited maximum, the whole
subpattern is repeated in the compiled code. For example, the pattern
.sp
(abc|def){2,4}
.sp
is compiled as if it were
.sp
(abc|def)(abc|def)((abc|def)(abc|def)?)?
.sp
(Technical aside: It is done this way so that backtrack points within each of
the repetitions can be independently maintained.)
.P
For regular expressions whose quantifiers use only small numbers, this is not
usually a problem. However, if the numbers are large, and particularly if such
repetitions are nested, the memory usage can become an embarrassment. For
example, the very simple pattern
.sp
((ab){1,1000}c){1,3}
.sp
uses 51K bytes when compiled. When PCRE is compiled with its default internal
pointer size of two bytes, the size limit on a compiled pattern is 64K, and
this is reached with the above pattern if the outer repetition is increased
from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
handle larger compiled patterns, but it is better to try to rewrite your
pattern to use less memory if you can.
.P
One way of reducing the memory usage for such patterns is to make use of PCRE's
.\" HTML <a href="pcrepattern.html#subpatternsassubroutines">
.\" </a>
"subroutine"
.\"
facility. Rewriting the above pattern as
.sp
((ab)(?2){0,999}c)(?1){0,2}
.sp
reduces the memory requirements to 18K, and indeed it remains under 20K even
with the outer repetition increased to 100. However, this pattern is not
exactly equivalent, because the "subroutine" calls are treated as
.\" HTML <a href="pcrepattern.html#atomicgroup">
.\" </a>
atomic groups
.\"
into which there can be no backtracking if there is a subsequent matching
failure. Therefore, PCRE cannot do this kind of rewriting automatically.
Furthermore, there is a noticeable loss of speed when executing the modified
pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
speed is acceptable, this kind of rewriting will allow you to process patterns
that PCRE cannot otherwise handle.
.
.SH "PROCESSING TIME"
.rs
.sp
Certain items in regular expression patterns are processed more efficiently
than others. It is more efficient to use a character class like [aeiou] than a
set of alternatives such as (a|e|i|o|u). In general, the simplest construction
that provides the required behaviour is usually the most efficient. Jeffrey
Friedl's book contains a lot of useful general discussion about optimizing
regular expressions for efficient performance. This document contains a few
observations about PCRE.
set of single-character alternatives such as (a|e|i|o|u). In general, the
simplest construction that provides the required behaviour is usually the most
efficient. Jeffrey Friedl's book contains a lot of useful general discussion
about optimizing regular expressions for efficient performance. This document
contains a few observations about PCRE.
.P
Using Unicode character properties (the \ep, \eP, and \eX escapes) is slow,
because PCRE has to scan a structure that contains data for over fifteen
@@ -42,14 +104,15 @@ Beware of patterns that contain nested indefinite repeats. These can take a
long time to run when applied to a string that does not match. Consider the
pattern fragment
.sp
(a+)*
^(a+)*
.sp
This can match "aaaa" in 33 different ways, and this number increases very
This can match "aaaa" in 16 different ways, and this number increases very
rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
times, and for each of those cases other than 0, the + repeats can match
times, and for each of those cases other than 0 or 4, the + repeats can match
different numbers of times.) When the remainder of the pattern is such that the
entire match is going to fail, PCRE has in principle to try every possible
variation, and this can take an extremely long time.
variation, and this can take an extremely long time, even for relatively short
strings.
.P
An optimization catches some of the more simple cases such as
.sp
@@ -69,8 +132,22 @@ appreciable time with strings longer than about 20 characters.
.P
In many cases, the solution to this kind of performance issue is to use an
atomic group or a possessive quantifier.
.P
.in 0
Last updated: 28 February 2005
.br
Copyright (c) 1997-2005 University of Cambridge.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 06 March 2007
Copyright (c) 1997-2007 University of Cambridge.
.fi

View File

@@ -7,22 +7,18 @@ PCRE - Perl-compatible regular expressions.
.B #include <pcreposix.h>
.PP
.SM
.br
.B int regcomp(regex_t *\fIpreg\fP, const char *\fIpattern\fP,
.ti +5n
.B int \fIcflags\fP);
.PP
.br
.B int regexec(regex_t *\fIpreg\fP, const char *\fIstring\fP,
.ti +5n
.B size_t \fInmatch\fP, regmatch_t \fIpmatch\fP[], int \fIeflags\fP);
.PP
.br
.B size_t regerror(int \fIerrcode\fP, const regex_t *\fIpreg\fP,
.ti +5n
.B char *\fIerrbuf\fP, size_t \fIerrbuf_size\fP);
.PP
.br
.B void regfree(regex_t *\fIpreg\fP);
.
.SH DESCRIPTION
@@ -43,11 +39,11 @@ header file, and on Unix systems the library itself is called
command for linking an application that uses them. Because the POSIX functions
call the native ones, it is also necessary to add \fB-lpcre\fP.
.P
I have implemented only those option bits that can be reasonably mapped to PCRE
native options. In addition, the option REG_EXTENDED is defined with the value
zero. This has no effect, but since programs that are written to the POSIX
interface often use it, this makes it easier to slot in PCRE as a replacement
library. Other POSIX options are not even defined.
I have implemented only those POSIX option bits that can be reasonably mapped
to PCRE native options. In addition, the option REG_EXTENDED is defined with
the value zero. This has no effect, but since programs that are written to the
POSIX interface often use it, this makes it easier to slot in PCRE as a
replacement library. Other POSIX options are not even defined.
.P
When PCRE is called via these functions, it is only the API that is POSIX-like
in style. The syntax and semantics of the regular expressions themselves are
@@ -161,18 +157,36 @@ REG_NEWLINE action.
.rs
.sp
The function \fBregexec()\fP is called to match a compiled pattern \fIpreg\fP
against a given \fIstring\fP, which is terminated by a zero byte, subject to
the options in \fIeflags\fP. These can be:
against a given \fIstring\fP, which is by default terminated by a zero byte
(but see REG_STARTEND below), subject to the options in \fIeflags\fP. These can
be:
.sp
REG_NOTBOL
.sp
The PCRE_NOTBOL option is set when calling the underlying PCRE matching
function.
.sp
REG_NOTEMPTY
.sp
The PCRE_NOTEMPTY option is set when calling the underlying PCRE matching
function. Note that REG_NOTEMPTY is not part of the POSIX standard. However,
setting this option can give more POSIX-like behaviour in some situations.
.sp
REG_NOTEOL
.sp
The PCRE_NOTEOL option is set when calling the underlying PCRE matching
function.
.sp
REG_STARTEND
.sp
The string is considered to start at \fIstring\fP + \fIpmatch[0].rm_so\fP and
to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
(there need not actually be a NUL at that location), regardless of the value of
\fInmatch\fP. This is a BSD extension, compatible with but not specified by
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
how it is matched.
.P
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of
@@ -214,13 +228,17 @@ memory, after which \fIpreg\fP may no longer be used as a compiled expression.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
.br
University Computing Service,
.br
Cambridge CB2 3QG, England.
.P
.in 0
Last updated: 16 January 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 11 March 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

View File

@@ -17,7 +17,9 @@ tables, it is a little bit more complicated.
If you save compiled patterns to a file, you can copy them to a different host
and run them there. This works even if the new host has the opposite endianness
to the one on which the patterns were compiled. There may be a small
performance penalty, but it should be insignificant.
performance penalty, but it should be insignificant. However, compiling regular
expressions with one version of PCRE for use with a different version is not
guaranteed to work and may cause crashes.
.
.
.SH "SAVING A COMPILED PATTERN"
@@ -115,17 +117,26 @@ usual way.
.SH "COMPATIBILITY WITH DIFFERENT PCRE RELEASES"
.rs
.sp
The layout of the control block that is at the start of the data that makes up
a compiled pattern was changed for release 5.0. If you have any saved patterns
that were compiled with previous releases (not a facility that was previously
advertised), you will have to recompile them for release 5.0. However, from now
on, it should be possible to make changes in a compatible manner.
.P
Notwithstanding the above, if you have any saved patterns in UTF-8 mode that
use \ep or \eP that were compiled with any release up to and including 6.4, you
will have to recompile them for release 6.5 and above.
.P
.in 0
Last updated: 01 February 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
In general, it is safest to recompile all saved patterns when you update to a
new PCRE release, though not all updates actually require this. Recompiling is
definitely needed for release 7.2.
.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 13 June 2007
Copyright (c) 1997-2007 University of Cambridge.
.fi

View File

@@ -59,8 +59,22 @@ need to add
-R/usr/local/lib
.sp
(for example) to the compile command to get round this problem.
.P
.in 0
Last updated: 09 September 2004
.br
Copyright (c) 1997-2004 University of Cambridge.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 23 January 2008
Copyright (c) 1997-2008 University of Cambridge.
.fi

View File

@@ -52,7 +52,7 @@ frame for each matched character. For a long string, a lot of stack is
required. Consider now this rewritten pattern, which matches exactly the same
strings:
.sp
([^<]++|<(?!inet))
([^<]++|<(?!inet))+
.sp
This uses very much less stack, because runs of characters that do not contain
"<" are "swallowed" in one item inside the parentheses. Recursion happens only
@@ -61,6 +61,13 @@ assume this is relatively rare). A possessive quantifier is used to stop any
backtracking into the runs of non-"<" characters, but that is not related to
stack usage.
.P
This example shows that one way of avoiding stack problems when matching long
subject strings is to write repeated parenthesized subpatterns to match more
than one character whenever possible.
.
.SS "Compiling PCRE to use heap instead of stack"
.rs
.sp
In environments where stack memory is constrained, you might want to compile
PCRE to use heap memory instead of stack for remembering back-up points. This
makes it run a lot more slowly, however. Details of how to do this are given in
@@ -68,27 +75,17 @@ the
.\" HREF
\fBpcrebuild\fP
.\"
documentation.
.P
In Unix-like environments, there is not often a problem with the stack, though
the default limit on stack size varies from system to system. Values from 8Mb
to 64Mb are common. You can find your default limit by running the command:
documentation. When built in this way, instead of using the stack, PCRE obtains
and frees memory by calling the functions that are pointed to by the
\fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP variables. By default, these
point to \fBmalloc()\fP and \fBfree()\fP, but you can replace the pointers to
cause PCRE to use your own functions. Since the block sizes are always the
same, and are always freed in reverse order, it may be possible to implement
customized memory handlers that are more efficient than the standard functions.
.
.SS "Limiting PCRE's stack usage"
.rs
.sp
ulimit -s
.sp
The effect of running out of stack is often SIGSEGV, though sometimes an error
message is given. You can normally increase the limit on stack size by code
such as this:
.sp
struct rlimit rlim;
getrlimit(RLIMIT_STACK, &rlim);
rlim.rlim_cur = 100*1024*1024;
setrlimit(RLIMIT_STACK, &rlim);
.sp
This reads the current limits (soft and hard) using \fBgetrlimit()\fP, then
attempts to increase the soft limit to 100Mb using \fBsetrlimit()\fP. You must
do this before calling \fBpcre_exec()\fP.
.P
PCRE has an internal counter that can be used to limit the depth of recursion,
and thus cause \fBpcre_exec()\fP to give an error code before it runs out of
stack. By default, the limit is very large, and unlikely ever to operate. It
@@ -107,9 +104,57 @@ As a very rough rule of thumb, you should reckon on about 500 bytes per
recursion. Thus, if you want to limit your stack usage to 8Mb, you
should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
support around 128000 recursions. The \fBpcretest\fP test program has a command
line option (\fB-S\fP) that can be used to increase its stack.
.P
.in 0
Last updated: 29 June 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
line option (\fB-S\fP) that can be used to increase the size of its stack.
.
.SS "Changing stack size in Unix-like systems"
.rs
.sp
In Unix-like environments, there is not often a problem with the stack unless
very long strings are involved, though the default limit on stack size varies
from system to system. Values from 8Mb to 64Mb are common. You can find your
default limit by running the command:
.sp
ulimit -s
.sp
Unfortunately, the effect of running out of stack is often SIGSEGV, though
sometimes a more explicit error message is given. You can normally increase the
limit on stack size by code such as this:
.sp
struct rlimit rlim;
getrlimit(RLIMIT_STACK, &rlim);
rlim.rlim_cur = 100*1024*1024;
setrlimit(RLIMIT_STACK, &rlim);
.sp
This reads the current limits (soft and hard) using \fBgetrlimit()\fP, then
attempts to increase the soft limit to 100Mb using \fBsetrlimit()\fP. You must
do this before calling \fBpcre_exec()\fP.
.
.SS "Changing stack size in Mac OS X"
.rs
.sp
Using \fBsetrlimit()\fP, as described above, should also work on Mac OS X. It
is also possible to set a stack size when linking a program. There is a
discussion about stack sizes in Mac OS X at this web site:
.\" HTML <a href="http://developer.apple.com/qa/qa2005/qa1419.html">
.\" </a>
http://developer.apple.com/qa/qa2005/qa1419.html.
.\"
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 09 July 2008
Copyright (c) 1997-2008 University of Cambridge.
.fi

449
libs/pcre/doc/pcresyntax.3 Normal file
View File

@@ -0,0 +1,449 @@
.TH PCRESYNTAX 3
.SH NAME
PCRE - Perl-compatible regular expressions
.SH "PCRE REGULAR EXPRESSION SYNTAX SUMMARY"
.rs
.sp
The full syntax and semantics of the regular expressions that are supported by
PCRE are described in the
.\" HREF
\fBpcrepattern\fP
.\"
documentation. This document contains just a quick-reference summary of the
syntax.
.
.
.SH "QUOTING"
.rs
.sp
\ex where x is non-alphanumeric is a literal x
\eQ...\eE treat enclosed characters as literal
.
.
.SH "CHARACTERS"
.rs
.sp
\ea alarm, that is, the BEL character (hex 07)
\ecx "control-x", where x is any character
\ee escape (hex 1B)
\ef formfeed (hex 0C)
\en newline (hex 0A)
\er carriage return (hex 0D)
\et tab (hex 09)
\eddd character with octal code ddd, or backreference
\exhh character with hex code hh
\ex{hhh..} character with hex code hhh..
.
.
.SH "CHARACTER TYPES"
.rs
.sp
. any character except newline;
in dotall mode, any character whatsoever
\eC one byte, even in UTF-8 mode (best avoided)
\ed a decimal digit
\eD a character that is not a decimal digit
\eh a horizontal whitespace character
\eH a character that is not a horizontal whitespace character
\ep{\fIxx\fP} a character with the \fIxx\fP property
\eP{\fIxx\fP} a character without the \fIxx\fP property
\eR a newline sequence
\es a whitespace character
\eS a character that is not a whitespace character
\ev a vertical whitespace character
\eV a character that is not a vertical whitespace character
\ew a "word" character
\eW a "non-word" character
\eX an extended Unicode sequence
.sp
In PCRE, \ed, \eD, \es, \eS, \ew, and \eW recognize only ASCII characters.
.
.
.SH "GENERAL CATEGORY PROPERTY CODES FOR \ep and \eP"
.rs
.sp
C Other
Cc Control
Cf Format
Cn Unassigned
Co Private use
Cs Surrogate
.sp
L Letter
Ll Lower case letter
Lm Modifier letter
Lo Other letter
Lt Title case letter
Lu Upper case letter
L& Ll, Lu, or Lt
.sp
M Mark
Mc Spacing mark
Me Enclosing mark
Mn Non-spacing mark
.sp
N Number
Nd Decimal number
Nl Letter number
No Other number
.sp
P Punctuation
Pc Connector punctuation
Pd Dash punctuation
Pe Close punctuation
Pf Final punctuation
Pi Initial punctuation
Po Other punctuation
Ps Open punctuation
.sp
S Symbol
Sc Currency symbol
Sk Modifier symbol
Sm Mathematical symbol
So Other symbol
.sp
Z Separator
Zl Line separator
Zp Paragraph separator
Zs Space separator
.
.
.SH "SCRIPT NAMES FOR \ep AND \eP"
.rs
.sp
Arabic,
Armenian,
Balinese,
Bengali,
Bopomofo,
Braille,
Buginese,
Buhid,
Canadian_Aboriginal,
Carian,
Cham,
Cherokee,
Common,
Coptic,
Cuneiform,
Cypriot,
Cyrillic,
Deseret,
Devanagari,
Ethiopic,
Georgian,
Glagolitic,
Gothic,
Greek,
Gujarati,
Gurmukhi,
Han,
Hangul,
Hanunoo,
Hebrew,
Hiragana,
Inherited,
Kannada,
Katakana,
Kayah_Li,
Kharoshthi,
Khmer,
Lao,
Latin,
Lepcha,
Limbu,
Linear_B,
Lycian,
Lydian,
Malayalam,
Mongolian,
Myanmar,
New_Tai_Lue,
Nko,
Ogham,
Old_Italic,
Old_Persian,
Ol_Chiki,
Oriya,
Osmanya,
Phags_Pa,
Phoenician,
Rejang,
Runic,
Saurashtra,
Shavian,
Sinhala,
Sundanese,
Syloti_Nagri,
Syriac,
Tagalog,
Tagbanwa,
Tai_Le,
Tamil,
Telugu,
Thaana,
Thai,
Tibetan,
Tifinagh,
Ugaritic,
Vai,
Yi.
.
.
.SH "CHARACTER CLASSES"
.rs
.sp
[...] positive character class
[^...] negative character class
[x-y] range (can be used for hex characters)
[[:xxx:]] positive POSIX named set
[[:^xxx:]] negative POSIX named set
.sp
alnum alphanumeric
alpha alphabetic
ascii 0-127
blank space or tab
cntrl control character
digit decimal digit
graph printing, excluding space
lower lower case letter
print printing, including space
punct printing, excluding alphanumeric
space whitespace
upper upper case letter
word same as \ew
xdigit hexadecimal digit
.sp
In PCRE, POSIX character set names recognize only ASCII characters. You can use
\eQ...\eE inside a character class.
.
.
.SH "QUANTIFIERS"
.rs
.sp
? 0 or 1, greedy
?+ 0 or 1, possessive
?? 0 or 1, lazy
* 0 or more, greedy
*+ 0 or more, possessive
*? 0 or more, lazy
+ 1 or more, greedy
++ 1 or more, possessive
+? 1 or more, lazy
{n} exactly n
{n,m} at least n, no more than m, greedy
{n,m}+ at least n, no more than m, possessive
{n,m}? at least n, no more than m, lazy
{n,} n or more, greedy
{n,}+ n or more, possessive
{n,}? n or more, lazy
.
.
.SH "ANCHORS AND SIMPLE ASSERTIONS"
.rs
.sp
\eb word boundary (only ASCII letters recognized)
\eB not a word boundary
^ start of subject
also after internal newline in multiline mode
\eA start of subject
$ end of subject
also before newline at end of subject
also before internal newline in multiline mode
\eZ end of subject
also before newline at end of subject
\ez end of subject
\eG first matching position in subject
.
.
.SH "MATCH POINT RESET"
.rs
.sp
\eK reset start of match
.
.
.SH "ALTERNATION"
.rs
.sp
expr|expr|expr...
.
.
.SH "CAPTURING"
.rs
.sp
(...) capturing group
(?<name>...) named capturing group (Perl)
(?'name'...) named capturing group (Perl)
(?P<name>...) named capturing group (Python)
(?:...) non-capturing group
(?|...) non-capturing group; reset group numbers for
capturing groups in each alternative
.
.
.SH "ATOMIC GROUPS"
.rs
.sp
(?>...) atomic, non-capturing group
.
.
.
.
.SH "COMMENT"
.rs
.sp
(?#....) comment (not nestable)
.
.
.SH "OPTION SETTING"
.rs
.sp
(?i) caseless
(?J) allow duplicate names
(?m) multiline
(?s) single line (dotall)
(?U) default ungreedy (lazy)
(?x) extended (ignore white space)
(?-...) unset option(s)
.sp
The following is recognized only at the start of a pattern or after one of the
newline-setting options with similar syntax:
.sp
(*UTF8) set UTF-8 mode
.
.
.SH "LOOKAHEAD AND LOOKBEHIND ASSERTIONS"
.rs
.sp
(?=...) positive look ahead
(?!...) negative look ahead
(?<=...) positive look behind
(?<!...) negative look behind
.sp
Each top-level branch of a look behind must be of a fixed length.
.
.
.SH "BACKREFERENCES"
.rs
.sp
\en reference by number (can be ambiguous)
\egn reference by number
\eg{n} reference by number
\eg{-n} relative reference by number
\ek<name> reference by name (Perl)
\ek'name' reference by name (Perl)
\eg{name} reference by name (Perl)
\ek{name} reference by name (.NET)
(?P=name) reference by name (Python)
.
.
.SH "SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)"
.rs
.sp
(?R) recurse whole pattern
(?n) call subpattern by absolute number
(?+n) call subpattern by relative number
(?-n) call subpattern by relative number
(?&name) call subpattern by name (Perl)
(?P>name) call subpattern by name (Python)
\eg<name> call subpattern by name (Oniguruma)
\eg'name' call subpattern by name (Oniguruma)
\eg<n> call subpattern by absolute number (Oniguruma)
\eg'n' call subpattern by absolute number (Oniguruma)
\eg<+n> call subpattern by relative number (PCRE extension)
\eg'+n' call subpattern by relative number (PCRE extension)
\eg<-n> call subpattern by relative number (PCRE extension)
\eg'-n' call subpattern by relative number (PCRE extension)
.
.
.SH "CONDITIONAL PATTERNS"
.rs
.sp
(?(condition)yes-pattern)
(?(condition)yes-pattern|no-pattern)
.sp
(?(n)... absolute reference condition
(?(+n)... relative reference condition
(?(-n)... relative reference condition
(?(<name>)... named reference condition (Perl)
(?('name')... named reference condition (Perl)
(?(name)... named reference condition (PCRE)
(?(R)... overall recursion condition
(?(Rn)... specific group recursion condition
(?(R&name)... specific recursion condition
(?(DEFINE)... define subpattern for reference
(?(assert)... assertion condition
.
.
.SH "BACKTRACKING CONTROL"
.rs
.sp
The following act immediately they are reached:
.sp
(*ACCEPT) force successful match
(*FAIL) force backtrack; synonym (*F)
.sp
The following act only when a subsequent match failure causes a backtrack to
reach them. They all force a match failure, but they differ in what happens
afterwards. Those that advance the start-of-match point do so only if the
pattern is not anchored.
.sp
(*COMMIT) overall failure, no advance of starting point
(*PRUNE) advance to next starting character
(*SKIP) advance start to current matching position
(*THEN) local failure, backtrack to next alternation
.
.
.SH "NEWLINE CONVENTIONS"
.rs
.sp
These are recognized only at the very start of the pattern or after a
(*BSR_...) or (*UTF8) option.
.sp
(*CR) carriage return only
(*LF) linefeed only
(*CRLF) carriage return followed by linefeed
(*ANYCRLF) all three of the above
(*ANY) any Unicode newline sequence
.
.
.SH "WHAT \eR MATCHES"
.rs
.sp
These are recognized only at the very start of the pattern or after a
(*...) option that sets the newline convention or UTF-8 mode.
.sp
(*BSR_ANYCRLF) CR, LF, or CRLF
(*BSR_UNICODE) any Unicode newline sequence
.
.
.SH "CALLOUTS"
.rs
.sp
(?C) callout
(?Cn) callout with data n
.
.
.SH "SEE ALSO"
.rs
.sp
\fBpcrepattern\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3),
\fBpcrematching\fP(3), \fBpcre\fP(3).
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 11 April 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

View File

@@ -24,23 +24,36 @@ documentation.
.SH OPTIONS
.rs
.TP 10
\fB-b\fP
Behave as if each regex has the \fB/B\fP (show bytecode) modifier; the internal
form is output after compilation.
.TP 10
\fB-C\fP
Output the version number of the PCRE library, and all available information
about the optional features that are included, and then exit.
.TP 10
\fB-d\fP
Behave as if each regex has the \fB/D\fP (debug) modifier; the internal
form is output after compilation.
form and information about the compiled pattern is output after compilation;
\fB-d\fP is equivalent to \fB-b -i\fP.
.TP 10
\fB-dfa\fP
Behave as if each data line contains the \eD escape sequence; this causes the
alternative matching function, \fBpcre_dfa_exec()\fP, to be used instead of the
standard \fBpcre_exec()\fP function (more detail is given below).
.TP 10
\fB-help\fP
Output a brief summary of these options and then exit.
.TP 10
\fB-i\fP
Behave as if each regex has the \fB/I\fP modifier; information about the
compiled pattern is given after compilation.
.TP 10
\fB-M\fP
Behave as if each data line contains the \eM escape sequence; this causes
PCRE to discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings by
calling \fBpcre_exec()\fP repeatedly with different limits.
.TP 10
\fB-m\fP
Output the size of each compiled pattern after it has been compiled. This is
equivalent to adding \fB/M\fP to each regular expression. For compatibility
@@ -48,9 +61,11 @@ with earlier versions of pcretest, \fB-s\fP is a synonym for \fB-m\fP.
.TP 10
\fB-o\fP \fIosize\fP
Set the number of elements in the output vector that is used when calling
\fBpcre_exec()\fP to be \fIosize\fP. The default value is 45, which is enough
for 14 capturing subexpressions. The vector size can be changed for individual
matching calls by including \eO in the data line (see below).
\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP to be \fIosize\fP. The default value
is 45, which is enough for 14 capturing subexpressions for \fBpcre_exec()\fP or
22 different matches for \fBpcre_dfa_exec()\fP. The vector size can be
changed for individual matching calls by including \eO in the data line (see
below).
.TP 10
\fB-p\fP
Behave as if each regex has the \fB/P\fP modifier; the POSIX wrapper API is
@@ -68,7 +83,14 @@ megabytes.
Run each compile, study, and match many times with a timer, and output
resulting time per compile or match (in milliseconds). Do not set \fB-m\fP with
\fB-t\fP, because you will then get the size output a zillion times, and the
timing will be distorted.
timing will be distorted. You can control the number of iterations that are
used for timing by following \fB-t\fP with a number (as a separate item on the
command line). For example, "-t 1000" would iterate 1000 times. The default is
to iterate 500000 times.
.TP 10
\fB-tm\fP
This is like \fB-t\fP except that it times only the matching phase, not the
compile or study phases.
.
.
.SH DESCRIPTION
@@ -80,14 +102,20 @@ that file and writes to stdout. Otherwise, it reads from stdin and writes to
stdout, and prompts for each line of input, using "re>" to prompt for regular
expressions, and "data>" to prompt for data lines.
.P
When \fBpcretest\fP is built, a configuration option can specify that it should
be linked with the \fBlibreadline\fP library. When this is done, if the input
is from a terminal, it is read using the \fBreadline()\fP function. This
provides line-editing and history facilities. The output from the \fB-help\fP
option states whether or not \fBreadline()\fP will be used.
.P
The program handles any number of sets of input on a single input file. Each
set starts with a regular expression, and continues with any number of data
lines to be matched against the pattern.
.P
Each data line is matched separately and independently. If you want to do
multi-line matches, you have to use the \en escape sequence (or \er or \er\en,
depending on the newline setting) in a single line of input to encode the
newline characters. There is no limit on the length of data lines; the input
etc., depending on the newline setting) in a single line of input to encode the
newline sequences. There is no limit on the length of data lines; the input
buffer is automatically extended if it is too small.
.P
An empty line signals the end of the data lines, at which point a new regular
@@ -140,20 +168,30 @@ effect as they do in Perl. For example:
The following table shows additional modifiers for setting PCRE options that do
not correspond to anything in Perl:
.sp
\fB/A\fP PCRE_ANCHORED
\fB/C\fP PCRE_AUTO_CALLOUT
\fB/E\fP PCRE_DOLLAR_ENDONLY
\fB/f\fP PCRE_FIRSTLINE
\fB/J\fP PCRE_DUPNAMES
\fB/N\fP PCRE_NO_AUTO_CAPTURE
\fB/U\fP PCRE_UNGREEDY
\fB/X\fP PCRE_EXTRA
\fB/<cr>\fP PCRE_NEWLINE_CR
\fB/<lf>\fP PCRE_NEWLINE_LF
\fB/<crlf>\fP PCRE_NEWLINE_CRLF
\fB/A\fP PCRE_ANCHORED
\fB/C\fP PCRE_AUTO_CALLOUT
\fB/E\fP PCRE_DOLLAR_ENDONLY
\fB/f\fP PCRE_FIRSTLINE
\fB/J\fP PCRE_DUPNAMES
\fB/N\fP PCRE_NO_AUTO_CAPTURE
\fB/U\fP PCRE_UNGREEDY
\fB/X\fP PCRE_EXTRA
\fB/<JS>\fP PCRE_JAVASCRIPT_COMPAT
\fB/<cr>\fP PCRE_NEWLINE_CR
\fB/<lf>\fP PCRE_NEWLINE_LF
\fB/<crlf>\fP PCRE_NEWLINE_CRLF
\fB/<anycrlf>\fP PCRE_NEWLINE_ANYCRLF
\fB/<any>\fP PCRE_NEWLINE_ANY
\fB/<bsr_anycrlf>\fP PCRE_BSR_ANYCRLF
\fB/<bsr_unicode>\fP PCRE_BSR_UNICODE
.sp
Those specifying line endings are literal strings as shown. Details of the
meanings of these PCRE options are given in the
Those specifying line ending sequences are literal strings as shown, but the
letters can be in either case. This example sets multiline matching with CRLF
as the line ending sequence:
.sp
/^abc/m<crlf>
.sp
Details of the meanings of these PCRE options are given in the
.\" HREF
\fBpcreapi\fP
.\"
@@ -191,6 +229,13 @@ matched the entire pattern, pcretest should in addition output the remainder of
the subject string. This is useful for tests where the subject contains
multiple copies of the same substring.
.P
The \fB/B\fP modifier is a debugging feature. It requests that \fBpcretest\fP
output a representation of the compiled byte code after compilation. Normally
this information contains length and offset values; however, if \fB/Z\fP is
also present, this data is replaced by spaces. This is a special feature for
use in the automatic test scripts; it ensures that the same output is generated
for different internal link sizes.
.P
The \fB/L\fP modifier must be followed directly by the name of a locale, for
example,
.sp
@@ -207,10 +252,8 @@ compiled pattern (whether it is anchored, has a fixed first character, and
so on). It does this by calling \fBpcre_fullinfo()\fP after compiling a
pattern. If the pattern is studied, the results of that are also output.
.P
The \fB/D\fP modifier is a PCRE debugging feature, which also assumes \fB/I\fP.
It causes the internal form of compiled regular expressions to be output after
compilation. If the pattern was studied, the information returned is also
output.
The \fB/D\fP modifier is a PCRE debugging feature, and is equivalent to
\fB/BI\fP, that is, both the \fB/B\fP and the \fB/I\fP modifiers.
.P
The \fB/F\fP modifier causes \fBpcretest\fP to flip the byte order of the
fields in the compiled pattern that contain 2-byte and 4-byte numbers. This
@@ -254,17 +297,17 @@ complicated features of PCRE. If you are just testing "ordinary" regular
expressions, you probably don't need any of these. The following escapes are
recognized:
.sp
\ea alarm (= BEL)
\eb backspace
\ee escape
\ef formfeed
\en newline
\ea alarm (BEL, \ex07)
\eb backspace (\ex08)
\ee escape (\ex1b)
\ef formfeed (\ex0c)
\en newline (\ex0a)
.\" JOIN
\eqdd set the PCRE_MATCH_LIMIT limit to dd
(any number of digits)
\er carriage return
\et tab
\ev vertical tab
\er carriage return (\ex0d)
\et tab (\ex09)
\ev vertical tab (\ex0b)
\ennn octal character (up to 3 octal digits)
\exhh hexadecimal character (up to 2 hex digits)
.\" JOIN
@@ -344,11 +387,20 @@ recognized:
.\" JOIN
\e<crlf> pass the PCRE_NEWLINE_CRLF option to \fBpcre_exec()\fP
or \fBpcre_dfa_exec()\fP
.\" JOIN
\e<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to \fBpcre_exec()\fP
or \fBpcre_dfa_exec()\fP
.\" JOIN
\e<any> pass the PCRE_NEWLINE_ANY option to \fBpcre_exec()\fP
or \fBpcre_dfa_exec()\fP
.sp
The escapes that specify line endings are literal strings, exactly as shown.
A backslash followed by anything else just escapes the anything else. If the
very last character is a backslash, it is ignored. This gives a way of passing
an empty line as data, since a real empty line terminates the data input.
The escapes that specify line ending sequences are literal strings, exactly as
shown. No more than one newline setting should be present in any data line.
.P
A backslash followed by anything else just escapes the anything else. If
the very last character is a backslash, it is ignored. This gives a way of
passing an empty line as data, since a real empty line terminates the data
input.
.P
If \eM is present, \fBpcretest\fP calls \fBpcre_exec()\fP several times, with
different values in the \fImatch_limit\fP and \fImatch_limit_recursion\fP
@@ -374,7 +426,10 @@ and \eZ, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
The use of \ex{hh...} to represent UTF-8 characters is not dependent on the use
of the \fB/8\fP modifier on the pattern. It is recognized always. There may be
any number of hexadecimal digits inside the braces. The result is from one to
six bytes, encoded according to the UTF-8 rules.
six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
valid Unicode code points, or indeed valid UTF-8 characters according to the
later rules in RFC 3629.
.
.
.SH "THE ALTERNATIVE MATCHING FUNCTION"
@@ -411,7 +466,7 @@ respectively, and otherwise the PCRE negative error number. Here is an example
of an interactive \fBpcretest\fP run.
.sp
$ pcretest
PCRE version 5.00 07-Sep-2004
PCRE version 7.0 30-Nov-2006
.sp
re> /^abc(\ed+)/
data> abc123
@@ -420,11 +475,26 @@ of an interactive \fBpcretest\fP run.
data> xyz
No match
.sp
Note that unset capturing substrings that are not followed by one that is set
are not returned by \fBpcre_exec()\fP, and are not shown by \fBpcretest\fP. In
the following example, there are two capturing substrings, but when the first
data line is matched, the second, unset substring is not shown. An "internal"
unset substring is shown as "<unset>", as for the second data line.
.sp
re> /(a)|(b)/
data> a
0: a
1: a
data> b
0: b
1: <unset>
2: b
.sp
If the strings contain any non-printing characters, they are output as \e0x
escapes, or as \ex{...} escapes if the \fB/8\fP modifier was present on the
pattern. If the pattern has the \fB/+\fP modifier, the output for substring 0
is followed by the the rest of the subject string, identified by "0+" like
this:
pattern. See below for the definition of non-printing characters. If the
pattern has the \fB/+\fP modifier, the output for substring 0 is followed by
the rest of the subject string, identified by "0+" like this:
.sp
re> /cat/+
data> cataract
@@ -452,10 +522,11 @@ instead of a colon. This is in addition to the normal full list. The string
length (that is, the return from the extraction function) is given in
parentheses after each string for \fB\eC\fP and \fB\eG\fP.
.P
Note that while patterns can be continued over several lines (a plain ">"
Note that whereas patterns can be continued over several lines (a plain ">"
prompt is used for continuations), data lines may not. However newlines can be
included in data by means of the \en escape (or \er or \er\en for those newline
settings).
included in data by means of the \en escape (or \er, \er\en, etc., depending on
the newline sequence setting).
.
.
.
.SH "OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION"
@@ -475,7 +546,7 @@ the subject where there is at least one match. For example:
(Using the normal matching function on this data finds only "tang".) The
longest matching string is always given first (and numbered zero).
.P
If \fB/g\P is present on the pattern, the search for further matches resumes
If \fB/g\fP is present on the pattern, the search for further matches resumes
at the end of the longest match. For example:
.sp
re> /(tang|tangerine|tan)/g
@@ -499,7 +570,7 @@ indicating that the subject partially matched the pattern, you can restart the
match with additional subject data by means of the \eR escape sequence. For
example:
.sp
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
data> 23ja\eP\eD
Partial match: 23ja
data> n05\eR\eD
@@ -556,6 +627,21 @@ the
documentation.
.
.
.
.SH "NON-PRINTING CHARACTERS"
.rs
.sp
When \fBpcretest\fP is outputting text in the compiled version of a pattern,
bytes other than 32-126 are always treated as non-printing characters and are
therefore shown as hex escapes.
.P
When \fBpcretest\fP is outputting text that is a matched part of a subject
string, it behaves in the same way, unless a different locale has been set for
the pattern (using the \fB/L\fP modifier). In this case, the \fBisprint()\fP
function is used to distinguish printing and non-printing characters.
.
.
.
.SH "SAVING AND RELOADING COMPILED PATTERNS"
.rs
.sp
@@ -616,16 +702,27 @@ Finally, if you attempt to load a file that is not in the correct format, the
result is undefined.
.
.
.SH "SEE ALSO"
.rs
.sp
\fBpcre\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3), \fBpcrematching\fP(3),
\fBpcrepartial\fP(3), \fBpcrepattern\fP(3), \fBpcreprecompile\fP(3).
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
.br
University Computing Service,
.br
Cambridge CB2 3QG, England.
.P
.in 0
Last updated: 29 June 2006
.br
Copyright (c) 1997-2006 University of Cambridge.
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 10 March 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi

View File

@@ -19,67 +19,93 @@ SYNOPSIS
OPTIONS
-b Behave as if each regex has the /B (show bytecode) modifier;
the internal form is output after compilation.
-C Output the version number of the PCRE library, and all avail-
able information about the optional features that are
able information about the optional features that are
included, and then exit.
-d Behave as if each regex has the /D (debug) modifier; the
internal form is output after compilation.
-d Behave as if each regex has the /D (debug) modifier; the
internal form and information about the compiled pattern is
output after compilation; -d is equivalent to -b -i.
-dfa Behave as if each data line contains the \D escape sequence;
this causes the alternative matching function,
pcre_dfa_exec(), to be used instead of the standard
pcre_exec() function (more detail is given below).
-help Output a brief summary of these options and then exit.
-i Behave as if each regex has the /I modifier; information
about the compiled pattern is given after compilation.
-m Output the size of each compiled pattern after it has been
compiled. This is equivalent to adding /M to each regular
expression. For compatibility with earlier versions of
-M Behave as if each data line contains the \M escape sequence;
this causes PCRE to discover the minimum MATCH_LIMIT and
MATCH_LIMIT_RECURSION settings by calling pcre_exec() repeat-
edly with different limits.
-m Output the size of each compiled pattern after it has been
compiled. This is equivalent to adding /M to each regular
expression. For compatibility with earlier versions of
pcretest, -s is a synonym for -m.
-o osize Set the number of elements in the output vector that is used
when calling pcre_exec() to be osize. The default value is
45, which is enough for 14 capturing subexpressions. The vec-
tor size can be changed for individual matching calls by
including \O in the data line (see below).
-o osize Set the number of elements in the output vector that is used
when calling pcre_exec() or pcre_dfa_exec() to be osize. The
default value is 45, which is enough for 14 capturing subex-
pressions for pcre_exec() or 22 different matches for
pcre_dfa_exec(). The vector size can be changed for individ-
ual matching calls by including \O in the data line (see
below).
-p Behave as if each regex has the /P modifier; the POSIX wrap-
per API is used to call PCRE. None of the other options has
-p Behave as if each regex has the /P modifier; the POSIX wrap-
per API is used to call PCRE. None of the other options has
any effect when -p is set.
-q Do not output the version number of pcretest at the start of
-q Do not output the version number of pcretest at the start of
execution.
-S size On Unix-like systems, set the size of the runtime stack to
-S size On Unix-like systems, set the size of the runtime stack to
size megabytes.
-t Run each compile, study, and match many times with a timer,
and output resulting time per compile or match (in millisec-
onds). Do not set -m with -t, because you will then get the
size output a zillion times, and the timing will be dis-
torted.
-t Run each compile, study, and match many times with a timer,
and output resulting time per compile or match (in millisec-
onds). Do not set -m with -t, because you will then get the
size output a zillion times, and the timing will be dis-
torted. You can control the number of iterations that are
used for timing by following -t with a number (as a separate
item on the command line). For example, "-t 1000" would iter-
ate 1000 times. The default is to iterate 500000 times.
-tm This is like -t except that it times only the matching phase,
not the compile or study phases.
DESCRIPTION
If pcretest is given two filename arguments, it reads from the first
If pcretest is given two filename arguments, it reads from the first
and writes to the second. If it is given only one filename argument, it
reads from that file and writes to stdout. Otherwise, it reads from
stdin and writes to stdout, and prompts for each line of input, using
reads from that file and writes to stdout. Otherwise, it reads from
stdin and writes to stdout, and prompts for each line of input, using
"re>" to prompt for regular expressions, and "data>" to prompt for data
lines.
When pcretest is built, a configuration option can specify that it
should be linked with the libreadline library. When this is done, if
the input is from a terminal, it is read using the readline() function.
This provides line-editing and history facilities. The output from the
-help option states whether or not readline() will be used.
The program handles any number of sets of input on a single input file.
Each set starts with a regular expression, and continues with any num-
Each set starts with a regular expression, and continues with any num-
ber of data lines to be matched against the pattern.
Each data line is matched separately and independently. If you want to
Each data line is matched separately and independently. If you want to
do multi-line matches, you have to use the \n escape sequence (or \r or
\r\n, depending on the newline setting) in a single line of input to
encode the newline characters. There is no limit on the length of data
lines; the input buffer is automatically extended if it is too small.
\r\n, etc., depending on the newline setting) in a single line of input
to encode the newline sequences. There is no limit on the length of
data lines; the input buffer is automatically extended if it is too
small.
An empty line signals the end of the data lines, at which point a new
regular expression is read. The regular expressions are given enclosed
@@ -131,39 +157,49 @@ PATTERN MODIFIERS
The following table shows additional modifiers for setting PCRE options
that do not correspond to anything in Perl:
/A PCRE_ANCHORED
/C PCRE_AUTO_CALLOUT
/E PCRE_DOLLAR_ENDONLY
/f PCRE_FIRSTLINE
/J PCRE_DUPNAMES
/N PCRE_NO_AUTO_CAPTURE
/U PCRE_UNGREEDY
/X PCRE_EXTRA
/<cr> PCRE_NEWLINE_CR
/<lf> PCRE_NEWLINE_LF
/<crlf> PCRE_NEWLINE_CRLF
/A PCRE_ANCHORED
/C PCRE_AUTO_CALLOUT
/E PCRE_DOLLAR_ENDONLY
/f PCRE_FIRSTLINE
/J PCRE_DUPNAMES
/N PCRE_NO_AUTO_CAPTURE
/U PCRE_UNGREEDY
/X PCRE_EXTRA
/<JS> PCRE_JAVASCRIPT_COMPAT
/<cr> PCRE_NEWLINE_CR
/<lf> PCRE_NEWLINE_LF
/<crlf> PCRE_NEWLINE_CRLF
/<anycrlf> PCRE_NEWLINE_ANYCRLF
/<any> PCRE_NEWLINE_ANY
/<bsr_anycrlf> PCRE_BSR_ANYCRLF
/<bsr_unicode> PCRE_BSR_UNICODE
Those specifying line endings are literal strings as shown. Details of
the meanings of these PCRE options are given in the pcreapi documenta-
tion.
Those specifying line ending sequences are literal strings as shown,
but the letters can be in either case. This example sets multiline
matching with CRLF as the line ending sequence:
/^abc/m<crlf>
Details of the meanings of these PCRE options are given in the pcreapi
documentation.
Finding all matches in a string
Searching for all possible matches within each subject string can be
requested by the /g or /G modifier. After finding a match, PCRE is
Searching for all possible matches within each subject string can be
requested by the /g or /G modifier. After finding a match, PCRE is
called again to search the remainder of the subject string. The differ-
ence between /g and /G is that the former uses the startoffset argument
to pcre_exec() to start searching at a new point within the entire
string (which is in effect what Perl does), whereas the latter passes
over a shortened substring. This makes a difference to the matching
to pcre_exec() to start searching at a new point within the entire
string (which is in effect what Perl does), whereas the latter passes
over a shortened substring. This makes a difference to the matching
process if the pattern begins with a lookbehind assertion (including \b
or \B).
If any call to pcre_exec() in a /g or /G sequence matches an empty
string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
flags set in order to search for another, non-empty, match at the same
point. If this second match fails, the start offset is advanced by
one, and the normal match is retried. This imitates the way Perl han-
If any call to pcre_exec() in a /g or /G sequence matches an empty
string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
flags set in order to search for another, non-empty, match at the same
point. If this second match fails, the start offset is advanced by
one, and the normal match is retried. This imitates the way Perl han-
dles such cases when using the /g modifier or the split() function.
Other modifiers
@@ -175,38 +211,43 @@ PATTERN MODIFIERS
remainder of the subject string. This is useful for tests where the
subject contains multiple copies of the same substring.
The /L modifier must be followed directly by the name of a locale, for
The /B modifier is a debugging feature. It requests that pcretest out-
put a representation of the compiled byte code after compilation. Nor-
mally this information contains length and offset values; however, if
/Z is also present, this data is replaced by spaces. This is a special
feature for use in the automatic test scripts; it ensures that the same
output is generated for different internal link sizes.
The /L modifier must be followed directly by the name of a locale, for
example,
/pattern/Lfr_FR
For this reason, it must be the last modifier. The given locale is set,
pcre_maketables() is called to build a set of character tables for the
locale, and this is then passed to pcre_compile() when compiling the
regular expression. Without an /L modifier, NULL is passed as the
tables pointer; that is, /L applies only to the expression on which it
pcre_maketables() is called to build a set of character tables for the
locale, and this is then passed to pcre_compile() when compiling the
regular expression. Without an /L modifier, NULL is passed as the
tables pointer; that is, /L applies only to the expression on which it
appears.
The /I modifier requests that pcretest output information about the
compiled pattern (whether it is anchored, has a fixed first character,
and so on). It does this by calling pcre_fullinfo() after compiling a
pattern. If the pattern is studied, the results of that are also out-
The /I modifier requests that pcretest output information about the
compiled pattern (whether it is anchored, has a fixed first character,
and so on). It does this by calling pcre_fullinfo() after compiling a
pattern. If the pattern is studied, the results of that are also out-
put.
The /D modifier is a PCRE debugging feature, which also assumes /I. It
causes the internal form of compiled regular expressions to be output
after compilation. If the pattern was studied, the information returned
is also output.
The /D modifier is a PCRE debugging feature, and is equivalent to /BI,
that is, both the /B and the /I modifiers.
The /F modifier causes pcretest to flip the byte order of the fields in
the compiled pattern that contain 2-byte and 4-byte numbers. This
facility is for testing the feature in PCRE that allows it to execute
the compiled pattern that contain 2-byte and 4-byte numbers. This
facility is for testing the feature in PCRE that allows it to execute
patterns that were compiled on a host with a different endianness. This
feature is not available when the POSIX interface to PCRE is being
used, that is, when the /P pattern modifier is specified. See also the
feature is not available when the POSIX interface to PCRE is being
used, that is, when the /P pattern modifier is specified. See also the
section about saving and reloading compiled patterns below.
The /S modifier causes pcre_study() to be called after the expression
The /S modifier causes pcre_study() to be called after the expression
has been compiled, and the results used when the expression is matched.
The /M modifier causes the size of memory block used to hold the com-
@@ -216,38 +257,38 @@ PATTERN MODIFIERS
rather than its native API. When this is done, all other modifiers
except /i, /m, and /+ are ignored. REG_ICASE is set if /i is present,
and REG_NEWLINE is set if /m is present. The wrapper functions force
PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8 option
set. This turns on support for UTF-8 character handling in PCRE, pro-
vided that it was compiled with this support enabled. This modifier
The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8 option
set. This turns on support for UTF-8 character handling in PCRE, pro-
vided that it was compiled with this support enabled. This modifier
also causes any non-printing characters in output strings to be printed
using the \x{hh...} notation if they are valid UTF-8 sequences.
If the /? modifier is used with /8, it causes pcretest to call
pcre_compile() with the PCRE_NO_UTF8_CHECK option, to suppress the
If the /? modifier is used with /8, it causes pcretest to call
pcre_compile() with the PCRE_NO_UTF8_CHECK option, to suppress the
checking of the string for UTF-8 validity.
DATA LINES
Before each data line is passed to pcre_exec(), leading and trailing
whitespace is removed, and it is then scanned for \ escapes. Some of
these are pretty esoteric features, intended for checking out some of
the more complicated features of PCRE. If you are just testing "ordi-
nary" regular expressions, you probably don't need any of these. The
Before each data line is passed to pcre_exec(), leading and trailing
whitespace is removed, and it is then scanned for \ escapes. Some of
these are pretty esoteric features, intended for checking out some of
the more complicated features of PCRE. If you are just testing "ordi-
nary" regular expressions, you probably don't need any of these. The
following escapes are recognized:
\a alarm (= BEL)
\b backspace
\e escape
\f formfeed
\n newline
\a alarm (BEL, \x07)
\b backspace (\x08)
\e escape (\x1b)
\f formfeed (\x0c)
\n newline (\x0a)
\qdd set the PCRE_MATCH_LIMIT limit to dd
(any number of digits)
\r carriage return
\t tab
\v vertical tab
\r carriage return (\x0d)
\t tab (\x09)
\v vertical tab (\x0b)
\nnn octal character (up to 3 octal digits)
\xhh hexadecimal character (up to 2 hex digits)
\x{hh...} hexadecimal character, any number of digits
@@ -304,12 +345,19 @@ DATA LINES
or pcre_dfa_exec()
\<crlf> pass the PCRE_NEWLINE_CRLF option to pcre_exec()
or pcre_dfa_exec()
\<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to pcre_exec()
or pcre_dfa_exec()
\<any> pass the PCRE_NEWLINE_ANY option to pcre_exec()
or pcre_dfa_exec()
The escapes that specify line endings are literal strings, exactly as
shown. A backslash followed by anything else just escapes the anything
else. If the very last character is a backslash, it is ignored. This
gives a way of passing an empty line as data, since a real empty line
terminates the data input.
The escapes that specify line ending sequences are literal strings,
exactly as shown. No more than one newline setting should be present in
any data line.
A backslash followed by anything else just escapes the anything else.
If the very last character is a backslash, it is ignored. This gives a
way of passing an empty line as data, since a real empty line termi-
nates the data input.
If \M is present, pcretest calls pcre_exec() several times, with dif-
ferent values in the match_limit and match_limit_recursion fields of
@@ -335,38 +383,42 @@ DATA LINES
The use of \x{hh...} to represent UTF-8 characters is not dependent on
the use of the /8 modifier on the pattern. It is recognized always.
There may be any number of hexadecimal digits inside the braces. The
result is from one to six bytes, encoded according to the UTF-8 rules.
result is from one to six bytes, encoded according to the original
UTF-8 rules of RFC 2279. This allows for values in the range 0 to
0x7FFFFFFF. Note that not all of those are valid Unicode code points,
or indeed valid UTF-8 characters according to the later rules in RFC
3629.
THE ALTERNATIVE MATCHING FUNCTION
By default, pcretest uses the standard PCRE matching function,
By default, pcretest uses the standard PCRE matching function,
pcre_exec() to match each data line. From release 6.0, PCRE supports an
alternative matching function, pcre_dfa_test(), which operates in a
different way, and has some restrictions. The differences between the
alternative matching function, pcre_dfa_exec(), which operates in a
different way, and has some restrictions. The differences between the
two functions are described in the pcrematching documentation.
If a data line contains the \D escape sequence, or if the command line
contains the -dfa option, the alternative matching function is called.
If a data line contains the \D escape sequence, or if the command line
contains the -dfa option, the alternative matching function is called.
This function finds all possible matches at a given point. If, however,
the \F escape sequence is present in the data line, it stops after the
the \F escape sequence is present in the data line, it stops after the
first match is found. This is always the shortest possible match.
DEFAULT OUTPUT FROM PCRETEST
This section describes the output when the normal matching function,
This section describes the output when the normal matching function,
pcre_exec(), is being used.
When a match succeeds, pcretest outputs the list of captured substrings
that pcre_exec() returns, starting with number 0 for the string that
that pcre_exec() returns, starting with number 0 for the string that
matched the whole pattern. Otherwise, it outputs "No match" or "Partial
match" when pcre_exec() returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PAR-
TIAL, respectively, and otherwise the PCRE negative error number. Here
match" when pcre_exec() returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PAR-
TIAL, respectively, and otherwise the PCRE negative error number. Here
is an example of an interactive pcretest run.
$ pcretest
PCRE version 5.00 07-Sep-2004
PCRE version 7.0 30-Nov-2006
re> /^abc(\d+)/
data> abc123
@@ -375,18 +427,35 @@ DEFAULT OUTPUT FROM PCRETEST
data> xyz
No match
Note that unset capturing substrings that are not followed by one that
is set are not returned by pcre_exec(), and are not shown by pcretest.
In the following example, there are two capturing substrings, but when
the first data line is matched, the second, unset substring is not
shown. An "internal" unset substring is shown as "<unset>", as for the
second data line.
re> /(a)|(b)/
data> a
0: a
1: a
data> b
0: b
1: <unset>
2: b
If the strings contain any non-printing characters, they are output as
\0x escapes, or as \x{...} escapes if the /8 modifier was present on
the pattern. If the pattern has the /+ modifier, the output for sub-
string 0 is followed by the the rest of the subject string, identified
by "0+" like this:
the pattern. See below for the definition of non-printing characters.
If the pattern has the /+ modifier, the output for substring 0 is fol-
lowed by the rest of the subject string, identified by "0+" like
this:
re> /cat/+
data> cataract
0: cat
0+ aract
If the pattern has the /g or /G modifier, the results of successive
If the pattern has the /g or /G modifier, the results of successive
matching attempts are output in sequence, like this:
re> /\Bi(\w\w)/g
@@ -400,24 +469,24 @@ DEFAULT OUTPUT FROM PCRETEST
"No match" is output only if the first match attempt fails.
If any of the sequences \C, \G, or \L are present in a data line that
is successfully matched, the substrings extracted by the convenience
If any of the sequences \C, \G, or \L are present in a data line that
is successfully matched, the substrings extracted by the convenience
functions are output with C, G, or L after the string number instead of
a colon. This is in addition to the normal full list. The string length
(that is, the return from the extraction function) is given in paren-
(that is, the return from the extraction function) is given in paren-
theses after each string for \C and \G.
Note that while patterns can be continued over several lines (a plain
Note that whereas patterns can be continued over several lines (a plain
">" prompt is used for continuations), data lines may not. However new-
lines can be included in data by means of the \n escape (or \r or \r\n
for those newline settings).
lines can be included in data by means of the \n escape (or \r, \r\n,
etc., depending on the newline sequence setting).
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
When the alternative matching function, pcre_dfa_exec(), is used (by
means of the \D escape sequence or the -dfa command line option), the
output consists of a list of all the matches that start at the first
When the alternative matching function, pcre_dfa_exec(), is used (by
means of the \D escape sequence or the -dfa command line option), the
output consists of a list of all the matches that start at the first
point in the subject where there is at least one match. For example:
re> /(tang|tangerine|tan)/
@@ -426,11 +495,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
1: tang
2: tan
(Using the normal matching function on this data finds only "tang".)
The longest matching string is always given first (and numbered zero).
(Using the normal matching function on this data finds only "tang".)
The longest matching string is always given first (and numbered zero).
If /gP is present on the pattern, the search for further matches
resumes at the end of the longest match. For example:
If /g is present on the pattern, the search for further matches resumes
at the end of the longest match. For example:
re> /(tang|tangerine|tan)/g
data> yellow tangerine and tangy sultana\D
@@ -453,7 +522,7 @@ RESTARTING AFTER A PARTIAL MATCH
can restart the match with additional subject data by means of the \R
escape sequence. For example:
re> /^?(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)$/
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data> 23ja\P\D
Partial match: 23ja
data> n05\R\D
@@ -503,67 +572,88 @@ CALLOUTS
the pcrecallout documentation.
NON-PRINTING CHARACTERS
When pcretest is outputting text in the compiled version of a pattern,
bytes other than 32-126 are always treated as non-printing characters
and are therefore shown as hex escapes.
When pcretest is outputting text that is a matched part of a subject
string, it behaves in the same way, unless a different locale has been
set for the pattern (using the /L modifier). In this case, the
isprint() function is used to distinguish printing and non-printing characters.
SAVING AND RELOADING COMPILED PATTERNS
The facilities described in this section are not available when the
The facilities described in this section are not available when the
POSIX interface to PCRE is being used, that is, when the /P pattern mod-
ifier is specified.
When the POSIX interface is not in use, you can cause pcretest to write
a compiled pattern to a file, by following the modifiers with > and a
a compiled pattern to a file, by following the modifiers with > and a
file name. For example:
/pattern/im >/some/file
See the pcreprecompile documentation for a discussion about saving and
See the pcreprecompile documentation for a discussion about saving and
re-using compiled patterns.
The data that is written is binary. The first eight bytes are the
length of the compiled pattern data followed by the length of the
optional study data, each written as four bytes in big-endian order
(most significant byte first). If there is no study data (either the
The data that is written is binary. The first eight bytes are the
length of the compiled pattern data followed by the length of the
optional study data, each written as four bytes in big-endian order
(most significant byte first). If there is no study data (either the
pattern was not studied, or studying did not return any data), the sec-
ond length is zero. The lengths are followed by an exact copy of the
ond length is zero. The lengths are followed by an exact copy of the
compiled pattern. If there is additional study data, this follows imme-
diately after the compiled pattern. After writing the file, pcretest
diately after the compiled pattern. After writing the file, pcretest
expects to read a new pattern.
A saved pattern can be reloaded into pcretest by specifying < and a file
name instead of a pattern. The name of the file must not contain a <
character, as otherwise pcretest will interpret the line as a pattern
name instead of a pattern. The name of the file must not contain a <
character, as otherwise pcretest will interpret the line as a pattern
delimited by < characters. For example:
re> </some/file
Compiled regex loaded from /some/file
No study data
When the pattern has been loaded, pcretest proceeds to read data lines
When the pattern has been loaded, pcretest proceeds to read data lines
in the usual way.
You can copy a file written by pcretest to a different host and reload
it there, even if the new host has opposite endianness to the one on
which the pattern was compiled. For example, you can compile on an i86
You can copy a file written by pcretest to a different host and reload
it there, even if the new host has opposite endianness to the one on
which the pattern was compiled. For example, you can compile on an i86
machine and run on a SPARC machine.
File names for saving and reloading can be absolute or relative, but
note that the shell facility of expanding a file name that starts with
File names for saving and reloading can be absolute or relative, but
note that the shell facility of expanding a file name that starts with
a tilde (~) is not available.
The ability to save and reload files in pcretest is intended for test-
ing and experimentation. It is not intended for production use because
only a single pattern can be written to a file. Furthermore, there is
no facility for supplying custom character tables for use with a
reloaded pattern. If the original pattern was compiled with custom
tables, an attempt to match a subject string using a reloaded pattern
is likely to cause pcretest to crash. Finally, if you attempt to load
The ability to save and reload files in pcretest is intended for test-
ing and experimentation. It is not intended for production use because
only a single pattern can be written to a file. Furthermore, there is
no facility for supplying custom character tables for use with a
reloaded pattern. If the original pattern was compiled with custom
tables, an attempt to match a subject string using a reloaded pattern
is likely to cause pcretest to crash. Finally, if you attempt to load
a file that is not in the correct format, the result is undefined.
SEE ALSO
pcre(3), pcreapi(3), pcrecallout(3), pcrematching(3), pcrepartial(3),
pcrepattern(3), pcreprecompile(3).
AUTHOR
Philip Hazel
University Computing Service,
Cambridge CB2 3QG, England.
University Computing Service
Cambridge CB2 3QH, England.
Last updated: 29 June 2006
Copyright (c) 1997-2006 University of Cambridge.
REVISION
Last updated: 10 March 2009
Copyright (c) 1997-2009 University of Cambridge.

Some files were not shown because too many files have changed in this diff Show More