diff --git a/etc/pacman.d/mirrorlist.clang64 b/etc/pacman.d/mirrorlist.clang64 index e0415b88ee2..e503f17d512 100644 --- a/etc/pacman.d/mirrorlist.clang64 +++ b/etc/pacman.d/mirrorlist.clang64 @@ -22,7 +22,7 @@ Server = https://repo.extreme-ix.org/msys2/mingw/clang64/ Server = https://mirror.clarkson.edu/msys2/mingw/clang64/ Server = https://quantum-mirror.hu/mirrors/pub/msys2/mingw/clang64/ Server = https://mirror.archlinux.tw/MSYS2/mingw/clang64/ -Server = https://fastmirror.pp.ua/msys2/mingw/clang64/ +Server = https://distrohub.kyiv.ua/msys2/mingw/clang64/ ## Tier 2 Server = https://ftp.cc.uoc.gr/mirrors/msys2/mingw/clang64/ diff --git a/etc/pacman.d/mirrorlist.mingw b/etc/pacman.d/mirrorlist.mingw index 42335b930cd..3fcf936a8e6 100644 --- a/etc/pacman.d/mirrorlist.mingw +++ b/etc/pacman.d/mirrorlist.mingw @@ -22,7 +22,7 @@ Server = https://repo.extreme-ix.org/msys2/mingw/$repo/ Server = https://mirror.clarkson.edu/msys2/mingw/$repo/ Server = https://quantum-mirror.hu/mirrors/pub/msys2/mingw/$repo/ Server = https://mirror.archlinux.tw/MSYS2/mingw/$repo/ -Server = https://fastmirror.pp.ua/msys2/mingw/$repo/ +Server = https://distrohub.kyiv.ua/msys2/mingw/$repo/ ## Tier 2 Server = https://ftp.cc.uoc.gr/mirrors/msys2/mingw/$repo/ diff --git a/etc/pacman.d/mirrorlist.mingw32 b/etc/pacman.d/mirrorlist.mingw32 index 0d7d124a75f..f4d3f891e02 100644 --- a/etc/pacman.d/mirrorlist.mingw32 +++ b/etc/pacman.d/mirrorlist.mingw32 @@ -22,7 +22,7 @@ Server = https://repo.extreme-ix.org/msys2/mingw/i686/ Server = https://mirror.clarkson.edu/msys2/mingw/i686/ Server = https://quantum-mirror.hu/mirrors/pub/msys2/mingw/i686/ Server = https://mirror.archlinux.tw/MSYS2/mingw/i686/ -Server = https://fastmirror.pp.ua/msys2/mingw/i686/ +Server = https://distrohub.kyiv.ua/msys2/mingw/i686/ ## Tier 2 Server = https://ftp.cc.uoc.gr/mirrors/msys2/mingw/i686/ diff --git a/etc/pacman.d/mirrorlist.mingw64 b/etc/pacman.d/mirrorlist.mingw64 index e8b7123266b..6fedad9359a 100644 --- 
a/etc/pacman.d/mirrorlist.mingw64 +++ b/etc/pacman.d/mirrorlist.mingw64 @@ -22,7 +22,7 @@ Server = https://repo.extreme-ix.org/msys2/mingw/x86_64/ Server = https://mirror.clarkson.edu/msys2/mingw/x86_64/ Server = https://quantum-mirror.hu/mirrors/pub/msys2/mingw/x86_64/ Server = https://mirror.archlinux.tw/MSYS2/mingw/x86_64/ -Server = https://fastmirror.pp.ua/msys2/mingw/x86_64/ +Server = https://distrohub.kyiv.ua/msys2/mingw/x86_64/ ## Tier 2 Server = https://ftp.cc.uoc.gr/mirrors/msys2/mingw/x86_64/ diff --git a/etc/pacman.d/mirrorlist.msys b/etc/pacman.d/mirrorlist.msys index f9e9a9f0712..f19f990ccf1 100644 --- a/etc/pacman.d/mirrorlist.msys +++ b/etc/pacman.d/mirrorlist.msys @@ -22,7 +22,7 @@ Server = https://repo.extreme-ix.org/msys2/msys/$arch/ Server = https://mirror.clarkson.edu/msys2/msys/$arch/ Server = https://quantum-mirror.hu/mirrors/pub/msys2/msys/$arch/ Server = https://mirror.archlinux.tw/MSYS2/msys/$arch/ -Server = https://fastmirror.pp.ua/msys2/msys/$arch/ +Server = https://distrohub.kyiv.ua/msys2/msys/$arch/ ## Tier 2 Server = https://ftp.cc.uoc.gr/mirrors/msys2/msys/$arch/ diff --git a/etc/pacman.d/mirrorlist.ucrt64 b/etc/pacman.d/mirrorlist.ucrt64 index efa5f06b6d2..cdb24ff343d 100644 --- a/etc/pacman.d/mirrorlist.ucrt64 +++ b/etc/pacman.d/mirrorlist.ucrt64 @@ -22,7 +22,7 @@ Server = https://repo.extreme-ix.org/msys2/mingw/ucrt64/ Server = https://mirror.clarkson.edu/msys2/mingw/ucrt64/ Server = https://quantum-mirror.hu/mirrors/pub/msys2/mingw/ucrt64/ Server = https://mirror.archlinux.tw/MSYS2/mingw/ucrt64/ -Server = https://fastmirror.pp.ua/msys2/mingw/ucrt64/ +Server = https://distrohub.kyiv.ua/msys2/mingw/ucrt64/ ## Tier 2 Server = https://ftp.cc.uoc.gr/mirrors/msys2/mingw/ucrt64/ diff --git a/mingw32/bin/libpcre2-16-0.dll b/mingw32/bin/libpcre2-16-0.dll index 37234d9fa39..1d0e77540da 100644 Binary files a/mingw32/bin/libpcre2-16-0.dll and b/mingw32/bin/libpcre2-16-0.dll differ diff --git a/mingw32/bin/libpcre2-32-0.dll 
b/mingw32/bin/libpcre2-32-0.dll index 28280be3f20..cad82fd325a 100644 Binary files a/mingw32/bin/libpcre2-32-0.dll and b/mingw32/bin/libpcre2-32-0.dll differ diff --git a/mingw32/bin/libpcre2-8-0.dll b/mingw32/bin/libpcre2-8-0.dll index c64040b03de..55756fd93e1 100644 Binary files a/mingw32/bin/libpcre2-8-0.dll and b/mingw32/bin/libpcre2-8-0.dll differ diff --git a/mingw32/bin/libpcre2-posix-3.dll b/mingw32/bin/libpcre2-posix-3.dll index c42404d864a..0bc35172fba 100644 Binary files a/mingw32/bin/libpcre2-posix-3.dll and b/mingw32/bin/libpcre2-posix-3.dll differ diff --git a/mingw32/bin/pcre2-config b/mingw32/bin/pcre2-config index 26cb0225d00..7effa1f1567 100644 --- a/mingw32/bin/pcre2-config +++ b/mingw32/bin/pcre2-config @@ -66,7 +66,7 @@ while test $# -gt 0; do echo $exec_prefix ;; --version) - echo 10.44 + echo 10.45 ;; --cflags) if test ${prefix}/include != /usr/include ; then diff --git a/mingw32/bin/pcre2grep.exe b/mingw32/bin/pcre2grep.exe index c82da7bbc61..d7850586b14 100644 Binary files a/mingw32/bin/pcre2grep.exe and b/mingw32/bin/pcre2grep.exe differ diff --git a/mingw32/bin/pcre2test.exe b/mingw32/bin/pcre2test.exe index 7db8dfdb626..59711e4b067 100644 Binary files a/mingw32/bin/pcre2test.exe and b/mingw32/bin/pcre2test.exe differ diff --git a/mingw32/include/pcre2.h b/mingw32/include/pcre2.h index a322d9f2d56..061f3db0a76 100644 --- a/mingw32/include/pcre2.h +++ b/mingw32/include/pcre2.h @@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE. /* The current PCRE version information. */ #define PCRE2_MAJOR 10 -#define PCRE2_MINOR 44 +#define PCRE2_MINOR 45 #define PCRE2_PRERELEASE -#define PCRE2_DATE 2024-06-07 +#define PCRE2_DATE 2025-02-05 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. 
When building PCRE2, the appropriate @@ -143,6 +143,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTENDED_MORE 0x01000000u /* C */ #define PCRE2_LITERAL 0x02000000u /* C */ #define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */ +#define PCRE2_ALT_EXTENDED_CLASS 0x08000000u /* C */ /* An additional compile options word is available in the compile context. */ @@ -159,6 +160,10 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */ #define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ #define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */ +#define PCRE2_EXTRA_PYTHON_OCTAL 0x00002000u /* C */ +#define PCRE2_EXTRA_NO_BS0 0x00004000u /* C */ +#define PCRE2_EXTRA_NEVER_CALLOUT 0x00008000u /* C */ +#define PCRE2_EXTRA_TURKISH_CASING 0x00010000u /* C */ /* These are for pcre2_jit_compile(). */ @@ -166,6 +171,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_JIT_PARTIAL_SOFT 0x00000002u #define PCRE2_JIT_PARTIAL_HARD 0x00000004u #define PCRE2_JIT_INVALID_UTF 0x00000100u +#define PCRE2_JIT_TEST_ALLOC 0x00000200u /* These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and pcre2_substitute(). Some are allowed only for one of the functions, and in @@ -318,9 +324,25 @@ pcre2_pattern_convert(). 
*/ #define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN 195 #define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196 #define PCRE2_ERROR_TOO_MANY_CAPTURES 197 -#define PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED 198 +#define PCRE2_ERROR_MISSING_OCTAL_DIGIT 198 #define PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND 199 - +#define PCRE2_ERROR_MAX_VAR_LOOKBEHIND_EXCEEDED 200 +#define PCRE2_ERROR_PATTERN_COMPILED_SIZE_TOO_BIG 201 +#define PCRE2_ERROR_OVERSIZE_PYTHON_OCTAL 202 +#define PCRE2_ERROR_CALLOUT_CALLER_DISABLED 203 +#define PCRE2_ERROR_EXTRA_CASING_REQUIRES_UNICODE 204 +#define PCRE2_ERROR_TURKISH_CASING_REQUIRES_UTF 205 +#define PCRE2_ERROR_EXTRA_CASING_INCOMPATIBLE 206 +#define PCRE2_ERROR_ECLASS_NEST_TOO_DEEP 207 +#define PCRE2_ERROR_ECLASS_INVALID_OPERATOR 208 +#define PCRE2_ERROR_ECLASS_UNEXPECTED_OPERATOR 209 +#define PCRE2_ERROR_ECLASS_EXPECTED_OPERAND 210 +#define PCRE2_ERROR_ECLASS_MIXED_OPERATORS 211 +#define PCRE2_ERROR_ECLASS_HINT_SQUARE_BRACKET 212 +#define PCRE2_ERROR_PERL_ECLASS_UNEXPECTED_EXPR 213 +#define PCRE2_ERROR_PERL_ECLASS_EMPTY_EXPR 214 +#define PCRE2_ERROR_PERL_ECLASS_MISSING_CLOSE 215 +#define PCRE2_ERROR_PERL_ECLASS_UNEXPECTED_CHAR 216 /* "Expected" matching error codes: no match and partial match. */ @@ -407,6 +429,9 @@ released, the numbers must not be changed. */ #define PCRE2_ERROR_INTERNAL_DUPMATCH (-65) #define PCRE2_ERROR_DFA_UINVALID_UTF (-66) #define PCRE2_ERROR_INVALIDOFFSET (-67) +#define PCRE2_ERROR_JIT_UNSUPPORTED (-68) +#define PCRE2_ERROR_REPLACECASE (-69) +#define PCRE2_ERROR_TOOLARGEREPLACE (-70) /* Request types for pcre2_pattern_info() */ @@ -460,6 +485,30 @@ released, the numbers must not be changed. */ #define PCRE2_CONFIG_COMPILED_WIDTHS 14 #define PCRE2_CONFIG_TABLES_LENGTH 15 +/* Optimization directives for pcre2_set_optimize(). +For binary compatibility, only add to this list; do not renumber. 
*/ + +#define PCRE2_OPTIMIZATION_NONE 0 +#define PCRE2_OPTIMIZATION_FULL 1 + +#define PCRE2_AUTO_POSSESS 64 +#define PCRE2_AUTO_POSSESS_OFF 65 +#define PCRE2_DOTSTAR_ANCHOR 66 +#define PCRE2_DOTSTAR_ANCHOR_OFF 67 +#define PCRE2_START_OPTIMIZE 68 +#define PCRE2_START_OPTIMIZE_OFF 69 + +/* Types used in pcre2_set_substitute_case_callout(). + +PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the +callout to indicate that the case of the entire callout input should be +case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that +only the first character or glyph should be transformed to Unicode titlecase, +and the rest to lowercase. */ + +#define PCRE2_SUBSTITUTE_CASE_LOWER 1 +#define PCRE2_SUBSTITUTE_CASE_UPPER 2 +#define PCRE2_SUBSTITUTE_CASE_TITLE_FIRST 3 /* Types for code units in patterns and subject strings. */ @@ -613,7 +662,9 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ - int (*)(uint32_t, void *), void *); + int (*)(uint32_t, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_optimize(pcre2_compile_context *, uint32_t); #define PCRE2_MATCH_CONTEXT_FUNCTIONS \ PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \ @@ -628,6 +679,11 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_substitute_callout(pcre2_match_context *, \ int (*)(pcre2_substitute_callout_block *, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_substitute_case_callout(pcre2_match_context *, \ + PCRE2_SIZE (*)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, \ + void *), \ + void *); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ @@ -740,6 +796,7 @@ PCRE2_EXP_DECL void 
PCRE2_CALL_CONVENTION \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **); + /* Functions for serializing / deserializing compiled patterns. */ #define PCRE2_SERIALIZE_FUNCTIONS \ @@ -907,7 +964,9 @@ pcre2_compile are called by application code. */ #define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) #define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) #define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) +#define pcre2_set_optimize PCRE2_SUFFIX(pcre2_set_optimize_) #define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_) +#define pcre2_set_substitute_case_callout PCRE2_SUFFIX(pcre2_set_substitute_case_callout_) #define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) #define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_) #define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_) diff --git a/mingw32/lib/libpcre2-16.a b/mingw32/lib/libpcre2-16.a index cfdc77bb4db..7640d9d81df 100644 Binary files a/mingw32/lib/libpcre2-16.a and b/mingw32/lib/libpcre2-16.a differ diff --git a/mingw32/lib/libpcre2-16.dll.a b/mingw32/lib/libpcre2-16.dll.a index 9ab6587e968..25e24424a10 100644 Binary files a/mingw32/lib/libpcre2-16.dll.a and b/mingw32/lib/libpcre2-16.dll.a differ diff --git a/mingw32/lib/libpcre2-32.a b/mingw32/lib/libpcre2-32.a index bb6f8e1de79..6815057a087 100644 Binary files a/mingw32/lib/libpcre2-32.a and b/mingw32/lib/libpcre2-32.a differ diff --git a/mingw32/lib/libpcre2-32.dll.a b/mingw32/lib/libpcre2-32.dll.a index b6c0e43e8fb..358f44e6957 100644 Binary files a/mingw32/lib/libpcre2-32.dll.a and b/mingw32/lib/libpcre2-32.dll.a differ diff --git a/mingw32/lib/libpcre2-8.a b/mingw32/lib/libpcre2-8.a index 22091c0cb9f..a38763249d4 100644 Binary files a/mingw32/lib/libpcre2-8.a and b/mingw32/lib/libpcre2-8.a differ diff --git a/mingw32/lib/libpcre2-8.dll.a 
b/mingw32/lib/libpcre2-8.dll.a index 5778d31eb33..a9b99ce25cb 100644 Binary files a/mingw32/lib/libpcre2-8.dll.a and b/mingw32/lib/libpcre2-8.dll.a differ diff --git a/mingw32/lib/libpcre2-posix.a b/mingw32/lib/libpcre2-posix.a index 812185e8eda..642626d15d9 100644 Binary files a/mingw32/lib/libpcre2-posix.a and b/mingw32/lib/libpcre2-posix.a differ diff --git a/mingw32/lib/libpcre2-posix.dll.a b/mingw32/lib/libpcre2-posix.dll.a index 1deda1e6896..799032a89ad 100644 Binary files a/mingw32/lib/libpcre2-posix.dll.a and b/mingw32/lib/libpcre2-posix.dll.a differ diff --git a/mingw32/lib/pkgconfig/libpcre2-16.pc b/mingw32/lib/pkgconfig/libpcre2-16.pc index 81c173bc6cd..e7a8a185ba5 100644 --- a/mingw32/lib/pkgconfig/libpcre2-16.pc +++ b/mingw32/lib/pkgconfig/libpcre2-16.pc @@ -7,7 +7,7 @@ includedir=${prefix}/include Name: libpcre2-16 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 16 bit character support -Version: 10.44 +Version: 10.45 Libs: -L${libdir} -lpcre2-16 Libs.private: Cflags: -I${includedir} diff --git a/mingw32/lib/pkgconfig/libpcre2-32.pc b/mingw32/lib/pkgconfig/libpcre2-32.pc index 8e9bc5a1f30..f9cc708948a 100644 --- a/mingw32/lib/pkgconfig/libpcre2-32.pc +++ b/mingw32/lib/pkgconfig/libpcre2-32.pc @@ -7,7 +7,7 @@ includedir=${prefix}/include Name: libpcre2-32 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 32 bit character support -Version: 10.44 +Version: 10.45 Libs: -L${libdir} -lpcre2-32 Libs.private: Cflags: -I${includedir} diff --git a/mingw32/lib/pkgconfig/libpcre2-8.pc b/mingw32/lib/pkgconfig/libpcre2-8.pc index 8908ddd5ae8..691c56274a2 100644 --- a/mingw32/lib/pkgconfig/libpcre2-8.pc +++ b/mingw32/lib/pkgconfig/libpcre2-8.pc @@ -7,7 +7,7 @@ includedir=${prefix}/include Name: libpcre2-8 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 8 bit character support -Version: 10.44 +Version: 10.45 Libs: -L${libdir} -lpcre2-8 Libs.private: Cflags: 
-I${includedir} diff --git a/mingw32/lib/pkgconfig/libpcre2-posix.pc b/mingw32/lib/pkgconfig/libpcre2-posix.pc index 2e79b37bb8e..0ffe710ce1e 100644 --- a/mingw32/lib/pkgconfig/libpcre2-posix.pc +++ b/mingw32/lib/pkgconfig/libpcre2-posix.pc @@ -7,7 +7,7 @@ includedir=${prefix}/include Name: libpcre2-posix Description: Posix compatible interface to libpcre2-8 -Version: 10.44 +Version: 10.45 Libs: -L${libdir} -lpcre2-posix Cflags: -I${includedir} -DPCRE2POSIX_SHARED Requires.private: libpcre2-8 diff --git a/mingw32/share/doc/pcre2/AUTHORS b/mingw32/share/doc/pcre2/AUTHORS deleted file mode 100644 index 9669f7755ad..00000000000 --- a/mingw32/share/doc/pcre2/AUTHORS +++ /dev/null @@ -1,36 +0,0 @@ -THE MAIN PCRE2 LIBRARY CODE ---------------------------- - -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com - -Retired from University of Cambridge Computing Service, -Cambridge, England. - -Copyright (c) 1997-2024 University of Cambridge -All rights reserved - - -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Emain domain: freemail.hu - -Copyright(c) 2010-2024 Zoltan Herczeg -All rights reserved. - - -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Emain domain: freemail.hu - -Copyright(c) 2009-2024 Zoltan Herczeg -All rights reserved. - -#### diff --git a/mingw32/share/doc/pcre2/AUTHORS.md b/mingw32/share/doc/pcre2/AUTHORS.md new file mode 100644 index 00000000000..708fc2325ce --- /dev/null +++ b/mingw32/share/doc/pcre2/AUTHORS.md @@ -0,0 +1,200 @@ +PCRE2 Authorship and Contributors +================================= + +COPYRIGHT +--------- + +Please see the file [LICENCE](./LICENCE.md) in the PCRE2 distribution for +copyright details. + + +MAINTAINERS +----------- + +The PCRE and PCRE2 libraries were authored and maintained by Philip Hazel. 
+ +Since 2024, the contributors with administrator access to the project are now +Nicholas Wilson and Zoltán Herczeg. See the file [SECURITY](./SECURITY.md) for +GPG keys. + +Both administrators are volunteers acting in a personal capacity. + + + + + + + + + + + + + + + + + + +
NameRole
+ + Nicholas Wilson
+ `nicholas@nicholaswilson.me.uk`
+ Currently of Microsoft Research Cambridge, UK + +
+ + * General project administration & maintenance + * Release management + * Code maintenance + +
+ + Zoltán Herczeg
+ `hzmester@freemail.hu`
+ Currently of the University of Szeged, Hungary + +
+ + * Code maintenance + * Ownership of `sljit` and PCRE2's JIT + +
+ + +CONTRIBUTORS +------------ + +Many others have participated and contributed to PCRE2 over its history. + +The maintainers are grateful for all contributions and participation over the +years. We apologise for any names we have forgotten. + +We are especially grateful to Philip Hazel, creator of PCRE and PCRE2, and +maintainer from 1997 to 2024. + +All names listed alphabetically. + +### Contributors to PCRE2 + +This list includes names up until the PCRE2 10.44 release. New names will be +added from the Git history on each release. + + Scott Bell + Carlo Marcelo Arenas Belón + Edward Betts + Jan-Willem Blokland + Ross Burton + Dmitry Cherniachenko + Alexey Chupahin + Jessica Clarke + Alejandro Colomar + Jeremie Courreges-Anglas + Addison Crump + Alex Dowad + Daniel Engberg + Daniel Richard G + David Gaussmann + Andrey Gorbachev + Jordan Griege + Jason Hood + Bumsu Hyeon + Roy Ivy + Martin Joerg + Guillem Jover + Ralf Junker + Ayesh Karunaratne + Michael Kaufmann + Yunho Kim + Joshua Kinard + David Korczynski + Uwe Korn + Jonas Kvinge + Kristian Larsson + Kai Lu + Behzod Mansurov + B. Scott Michel + Nathan Moinvaziri + Mike Munday + Marc Mutz + Fabio Pagani + Christian Persch + Tristan Ross + William A Rowe Jr + David Seifert + Yaakov Selkowitz + Rich Siegel + Karl Skomski + Maciej Sroczyński + Wolfgang Stöggl + Thomas Tempelmann + Greg Thain + Lucas Trzesniewski + Theodore Tsirpanis + Matthew Vernon + Rémi Verschelde + Thomas Voss + Ezekiel Warren + Carl Weaver + Chris Wilson + Amin Yahyaabadi + Joe Zhang + +### Contributors to PCRE1 + +These people contributed either by sending patches or reporting serious issues. 
+ + Irfan Adilovic + Alexander Barkov + Daniel Bergström + David Burgess + Ross Burton + David Byron + Fred Cox + Christian Ehrlicher + Tom Fortmann + Lionel Fourquaux + Mike Frysinger + Daniel Richard G + Dair Gran + "Graycode" (Red Hat Product Security) + Viktor Griph + Wen Guanxing + Robin Houston + Martin Jerabek + Peter Kankowski + Stephen Kelly + Yunho Kim + Joshua Kinard + Carsten Klein + Evgeny Kotkov + Ronald Landheer-Cieslak + Alan Lehotsky + Dmitry V. Levin + Nuno Lopes + Kai Lu + Giuseppe Maxia + Dan Mooney + Marc Mutz + Markus Oberhumer + Sheri Pierce + Petr Pisar + Ari Pollak + Bob Rossi + Ruiger Rill + Michael Shigorin + Rich Siegel + Craig Silverstein (C++ wrapper) + Karl Skomski + Paul Sokolovsky + Stan Switzer + Ian Taylor + Mark Tetrode + Jeff Trawick + Steven Van Ingelgem + Lawrence Velazquez + Jiong Wang + Stefan Weber + Chris Wilson + +Thanks go to Jeffrey Friedl for testing and debugging assistance. diff --git a/mingw32/share/doc/pcre2/ChangeLog b/mingw32/share/doc/pcre2/ChangeLog index ea228c193f7..5217d078599 100644 --- a/mingw32/share/doc/pcre2/ChangeLog +++ b/mingw32/share/doc/pcre2/ChangeLog @@ -4,6 +4,194 @@ Change Log for PCRE2 Before the move to GitHub, this was the only record of changes to PCRE2. Now there is also the log of commit messages. +Internal changes which are not visible to clients of the library are mostly not +listed here. + +Version 10.45 05-February-2025 +------------------------------ + +1. (#418) Change 6 of 10.44 broke 32-bit tests because pcre2test's reporting of +memory size was changed to the entire compiled data block, instead of just the +pattern and tables data, so as to align with the new length restriction. +Because the block's header contains pointers, this meant the pcre2test output +was different in 32-bit mode. A patch by Carlo reverts to the previous state +and makes sure that any limit set by pcre2_set_max_pattern_compiled_length() +also avoids the internal struct overhead. + +2. 
(#416, #622) Updates to build.zig. + +3. (#427, et al.) Various fixes to pacify static analyzers. + +4. (#428) Add --posix-pattern-file to pcre2grep to allow processing of empty +patterns through the -f option, as well as patterns that end in space +characters, for compatibility with other grep tools. + +5. (4fa5b8bd) Fix a bug in the fuzz support quantifier-limiting code. It ignores +strings of more than 5 digits because they are necessarily numbers greater than +65535, the largest legal quantifier. However, it wasn't ignoring non-significant +leading zeros. + +6. (6d82f0cd) The case-independent processing of the letter-matching Unicode +properties Ll, Lt, and Lu has been changed to match Perl (which changed a while +ago). When caseless matching is in force, all three of these properties are now +treated as Lc (cased letter). + +7. (#433) The pcre2_jit_compile() function was updated by the addition of a new +option PCRE2_JIT_TEST_ALLOC which, if called with a NULL first argument, tests +not only the availability of JIT, but also its ability to allocate executable +memory. Update pcre2test to use this support to extend the -C option. + +8. (75b1025a) The code for parsing Unicode property descriptions for \p and \P +has been changed as follows: + + . White space etc. before ^ in a negated value such as \p{ ^L } was not being + ignored. + + . The code wouldn't have worked if PCRE2 was compiled for UTF-8 support + within an EBCDIC environment. Possibly nobody does this any more, but it + should now work. + + . The documentation of the syntax of what can follow \p and \P has been + updated. + +9. (1c24ba01) There was an error in the table of lengths for parsed items for +the OPTIONS item, but fortuitously it could never have actually bitten. While +fixing this, some other code that could never be obeyed was discovered and +removed. + +10. 
(674b6640) Removed some incorrect optimization code from DFA matching that +has been there since PCRE1, but has just been found to cause a no match return +instead of a partial match in some cases. It involves partial matching when (*F) +is present so is unlikely to have actually affected anyone. + +11. (b0f4ac17) Tidy the wording and formatting of some pcre2test error messages +concerned with bad modifiers. Also restrict single-letter modifier sequences to +the first item in a modifier list, as documented and always intended. + +12. (1415565c) An iterator at the end of many assertions can always be +auto-possessified, but not at the end of variable-length lookbehinds. There was +a bug in the code that checks for such a lookbehind; it was looking only at the +first branch, which is wrong because some branches can be fixed length when +others are not, for example (?<=AB|CD?). Now all branches are checked for +variability. + +13. (ead08288) Matching with pcre2_match() could give an incorrect result if a +variable-length lookbehind was used as the condition in a conditional group. +The condition could erroneously be treated as true if a branch matched but +overran the current position. This bug was in the interpreter only; matching +with JIT was correct. + +14. (#443) Split out the sljit sub-project into a "Git submodule". Git users +must now run `git submodule init; git submodule update` after a Git checkout, or +the build will fail due to missing files in deps/sljit. + +15. (#441) Add a new error code (PCRE2_ERROR_JIT_UNSUPPORTED) which is yielded +for unsupported jit features. + +16. (#444) Fix bug in 'first code unit' and 'last code unit' optimization +combined with lookahead assertions. + +17. (#445, #447, #449, #451, #452, #459, #563) Add a new feature called scan +substring. This feature is a new type of assertion which matches the content of +a capturing block to a sub-pattern. + +18. 
(#450) Improvements to 'first code unit' / 'starting code units' +optimisation. + +19. (#455) Many, many improvements to the JIT compiler. + +20. Item 43 of 10.43 was incomplete because it addressed only \z and not \Z, +which was still misbehaving when matching fragments inside invalid UTF strings. + +21. (d29e7290) Octal escapes of the form \045 or \111 were not being recognized +in substitution strings, and if encountered gave an error, though the \o{...} +form was recognized. This bug is now fixed. + +22. (#463, #487) Fix 1 byte out-of-bounds read when parsing malformed limits +(e.g. LIMIT_HEAP) + +23. Many improvements to test infrastructure. Many more platforms and +configurations are now run in Continuous Integration, and all the platforms now +run the full test suite, rather than a partial subset. + +24. (#475) Implement title casing in substitution strings using Perl syntax. + +25. (#478, #504) Disallow \x if not followed by { or a hex digit. + +26. (#473) Implements Python-style backrefs in substitutions. + +27. (#472) Fix error reporting for certain over-large octal escapes. + +28. (#482) Fix parsing of named captures in replacement strings, allowing +non-ASCII capture names to be used. + +29. (#477, #474, #488, #494, #496, #506, #508, #511, #518, #524, #540) Many +improvements to parsing and optimising of character classes. + +30. (#483, #498) Add support for \g and $ to replacement strings. + +31. (#470) Add option flags PCRE2_EXTRA_NO_BS0 and PCRE2_EXTRA_PYTHON_OCTAL. + +32. (#471) Add new API function pcre2_set_optimize() for controlling which +optimizations are enabled. + +33. (#491) Adds $& $` $' and $_ to substitution replacements, as well as +interpreting \b and \v as characters. + +34. (#499) Add option PCRE2_EXTRA_NEVER_CALLOUT to disable callouts. + +35. (#503, #513) Update Unicode support to UCD 16. + +36. 
(#512, #618, #638) Add new function pcre2_set_substitute_case_callout() to +allow clients to provide a custom callback with locale-aware case +transformation. + +37. (#516) Fix case-insensitive matching of backreferences when using the +PCRE2_EXTRA_CASELESS_RESTRICT option. + +38. (#519) In pcre2grep, add $& as an alias for $0 + +39. (c9bf8339, #534) Updated perltest.sh to enable locale setting. + +40. (#521) Add support for Turkish I casefolding, using new options +PCRE2_EXTRA_TURKISH_CASING, and added pre-pattern flags (*TURKISH_CASING) and +(*CASELESS_RESTRICT). + +41. (#523, #546, #547) Add support for UTS#18 compatible character classes, +using the new option PCRE2_ALT_EXTENDED_CLASS. This adds '[' as a metacharacter +within character classes and the operators '&&', '--' and '~~', allowing +subtractions and intersections of character classes to be easily expressed. + +42. (#553, #586, #596, #597) Add support for Perl-style extended character +classes, using the syntax (?[...]). This also allows expressing subtractions and +intersections of character classes, but using a different syntax to UTS#18. + +43. (#554) Fixed a bug in JIT affecting greedy bounded repeats. The upper limit +of repeats inside a repeated bracket might be incorrectly checked. + +44. (#556) Fixed a bug in JIT affecting caseful matching of backreferences. When +utf is disabled, and dupnames is enabled, caseless matching was used even +if caseful matching was needed. + +45. (f34fc0a3) Fixed a bug in pcre2grep reported by Alejandro Colomar + (GitHub issue #577). In certain cases, when lines of above and +below context were contiguous, a separator line was incorrectly being inserted. + +46. (#594) Fix a small (one/two byte) out-of-bounds read on invalid UTF-8 input +in pcre2grep. + +47. (#370) Fix the INSTALL_MSVC_PDB CMake flag. + +48. (#366) Install cmake files in prefix/lib/cmake/pcre2 rather than +prefix/cmake. The new CMake flag PCRE2_INSTALL_CMAKEDIR allows customising this +location. 
+ +49. (#624, #626, #628, #632, #639, #641) Reduce code size of generated JIT code +for repeated character classes. + +50. (#623) Update the Bazel build files. + + Version 10.44 07-June-2024 -------------------------- diff --git a/mingw32/share/doc/pcre2/NEWS b/mingw32/share/doc/pcre2/NEWS index 5f8dde35406..4b5ec1e5103 100644 --- a/mingw32/share/doc/pcre2/NEWS +++ b/mingw32/share/doc/pcre2/NEWS @@ -1,6 +1,92 @@ News about PCRE2 releases ------------------------- +Version 10.45 05-February-2025 +------------------------------ + +This is a comparatively large release, incorporating new features, some +bugfixes, and a few changes with slight backwards compatibility implications. +Please see the ChangeLog and Git log for further details. + +Only changes to behaviour, changes to the API, and major changes to the pattern +syntax are described here. + +This release is the first to be available as a (signed) Git tag, or +alternatively as a (signed) tarball of the Git tag. + +This is also the first release to be made by the new maintainers of PCRE2, and +we would like to thank Philip Hazel, creator and maintainer of PCRE and PCRE2. + +* (Git change) The sljit project has been split out into a separate Git + repository. Git users must now run `git submodule init; git submodule update` + after a Git checkout. + +* (Behaviour change) Update Unicode support to UCD 16. + +* (Match behaviour change) Case-insensitive matching of Unicode properties + Ll, Lt, and Lu has been changed to match Perl. Previously, /\p{Ll}/i would + match only lower-case characters (even though case-insensitive matching was + specified). This also affects case-insensitive matching of POSIX classes such + as [:lower:]. + +* (Minor match behaviour change) Case-insensitive matching of backreferences now + respects the PCRE2_EXTRA_CASELESS_RESTRICT option. 
+ +* (Minor pattern syntax change) Parsing of the \x escape is stricter, and is + no longer parsed as an escape for the NUL character if not followed by '{' or + a hexadecimal digit. Use \x00 instead. + +* (Major new feature) Add a new feature called scan substring. This is a new + type of assertion which matches the content of a capturing block to a + sub-pattern. + + Example: to find a word that contains the rare (in English) sequence of + letters "rh" not at the start: + + \b(\w++)(*scan_substring:(1).+rh) + + The first group captures a word which is then scanned by the + (*scan_substring:(1) ... ) assertion, which tests whether the pattern ".+rh" + matches the capture group "(1)". + +* (Major new feature) Add support for UTS#18 compatible character classes, + using the new option PCRE2_ALT_EXTENDED_CLASS. This adds '[' as a + metacharacter within character classes and the operators '&&', '--' and '~~', + allowing subtractions and intersections of character classes to be easily + expressed. + + Example: to match Thai or Greek letters (but not letters or other characters + in those scripts), use [\p{L}&&[\p{Thai}||\p{Greek}]]. + +* (Major new feature) Add support for Perl-style extended character classes, + using the syntax (?[...]). This also allows expressing subtractions and + intersections of character classes, but using a different syntax to UTS#18. + + Example: to match Thai or Greek letters (but not letters or other characters + in those scripts), use (?[\p{L} & (\p{Thai} + \p{Greek})]). + +* (Minor feature) Significant improvements to the character class match engine. + Compiled character classes are now more compact, and have faster matching + for large or complex character sets, using binary search through the set. + +* JIT compilation now fails with the new error code PCRE2_ERROR_JIT_UNSUPPORTED + for patterns which use features not supported by the JIT compiler. 
+ +* (Minor feature) New options PCRE2_EXTRA_NO_BS0 (disallow \0 as an escape for + the NUL character); PCRE2_EXTRA_PYTHON_OCTAL (use Python disambiguation rules + for deciding whether \12 is a backreference or an octal escape); + PCRE2_EXTRA_NEVER_CALLOUT (disable callout syntax entirely); + PCRE2_EXTRA_TURKISH_CASING (use Turkish rules for case-insensitive matching). + +* (Minor feature) Add new API function pcre2_set_optimize() for controlling + which optimizations are enabled. + +* (Minor new features) A variety of extensions have been made to + pcre2_substitute() and its syntax for replacement strings. These now support: + \123 octal escapes; titlecasing \u\L; \1 backreferences; \g<1> and $<NAME> + backreferences; $& $` $' and $_; new function + pcre2_set_substitute_case_callout() to allow locale-aware case transformation. + Version 10.44 07-June-2024 -------------------------- @@ -13,7 +99,7 @@ increased to 128. Some auxiliary files for building under VMS are added. Version 10.43 16-February-2024 ------------------------------ -There are quite a lot of changes in this release (see ChangeLog and git log for +There are quite a lot of changes in this release (see ChangeLog and Git log for a list). Those that are not bugfixes or code tidies are: * The JIT code no longer supports ARMv5 architecture. @@ -52,7 +138,7 @@ a list). Those that are not bugfixes or code tidies are: matches the "fullwidth" versions of hex digits. PCRE2_EXTRA_ASCII_DIGIT can be used to keep it ASCII only. -* Make PCRE2_UCP the default in UTF mode in pcre2grep and add -no_ucp, +* Make PCRE2_UCP the default in UTF mode in pcre2grep and add --no-ucp, --case-restrict and --posix-digit. * Add --group-separator and --no-group-separator to pcre2grep. diff --git a/mingw32/share/doc/pcre2/README b/mingw32/share/doc/pcre2/README index dab5e94210b..5a50f7f11b5 100644 --- a/mingw32/share/doc/pcre2/README +++ b/mingw32/share/doc/pcre2/README @@ -385,7 +385,7 @@ library.
They are also documented in the pcre2build man page. If this is done, when pcre2test's input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. - Note that libreadline is GPL-licenced, so if you distribute a binary of + Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking with libedit (which has a BSD licence) instead. @@ -411,20 +411,19 @@ library. They are also documented in the pcre2build man page. Instead of %td or %zu, %lu is used, with a cast for size_t values. . There is a special option called --enable-fuzz-support for use by people who - want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit - library. If set, it causes an extra library called libpcre2-fuzzsupport.a to - be built, but not installed. This contains a single function called - LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the - length of the string. When called, this function tries to compile the string - as a pattern, and if that succeeds, to match it. This is done both with no - options and with some random options bits that are generated from the string. - Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to - be created. This is normally run under valgrind or used when PCRE2 is - compiled with address sanitizing enabled. It calls the fuzzing function and - outputs information about what it is doing. The input strings are specified - by arguments: if an argument starts with "=" the rest of it is a literal - input string. Otherwise, it is assumed to be a file name, and the contents - of the file are the test string. + want to run fuzzing tests on PCRE2. If set, it causes an extra library + called libpcre2-fuzzsupport.a to be built, but not installed. 
This contains + a single function called LLVMFuzzerTestOneInput() whose arguments are a + pointer to a string and the length of the string. When called, this function + tries to compile the string as a pattern, and if that succeeds, to match + it. This is done both with no options and with some random options bits that + are generated from the string. Setting --enable-fuzz-support also causes an + executable called pcre2fuzzcheck-{8,16,32} to be created. This is normally + run under valgrind or used when PCRE2 is compiled with address sanitizing + enabled. It calls the fuzzing function and outputs information about what it + is doing. The input strings are specified by arguments: if an argument + starts with "=" the rest of it is a literal input string. Otherwise, it is + assumed to be a file name, and the contents of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for @@ -510,6 +509,7 @@ system. The following are installed (file names are all relative to the LICENCE NEWS README + SECURITY pcre2.txt (a concatenation of the man(3) pages) pcre2test.txt the pcre2test man page pcre2grep.txt the pcre2grep man page @@ -607,8 +607,9 @@ zip formats. The command "make distcheck" does the same, but then does a trial build of the new distribution to ensure that it works. If you have modified any of the man page sources in the doc directory, you -should first run the PrepareRelease script before making a distribution. This -script creates the .txt and HTML forms of the documentation from the man pages. +should first run the maint/PrepareRelease script before making a distribution. +This script creates the .txt and HTML forms of the documentation from the man +pages. Testing PCRE2 @@ -822,37 +823,38 @@ The distribution should contain the files listed below. 
ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_chkdint.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_ucptables.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) + src/pcre2posix.c ) + src/pcre2_auto_possess.c ) + src/pcre2_chkdint.c ) + src/pcre2_compile.c ) + src/pcre2_compile_class.c ) + src/pcre2_config.c ) + src/pcre2_context.c ) + src/pcre2_convert.c ) + src/pcre2_dfa_match.c ) + src/pcre2_error.c ) + src/pcre2_extuni.c ) + src/pcre2_find_bracket.c ) + src/pcre2_jit_compile.c ) + src/pcre2_jit_match.c ) sources for the functions in the library, + src/pcre2_jit_misc.c ) and some internal functions that they use + src/pcre2_maketables.c ) + src/pcre2_match.c ) + src/pcre2_match_data.c ) + src/pcre2_newline.c ) + src/pcre2_ord2utf.c ) + src/pcre2_pattern_info.c ) + src/pcre2_script_run.c ) + src/pcre2_serialize.c ) + src/pcre2_string_utils.c ) + src/pcre2_study.c ) + src/pcre2_substitute.c ) + src/pcre2_substring.c ) + src/pcre2_tables.c ) + src/pcre2_ucd.c ) + src/pcre2_ucptables.c ) + src/pcre2_valid_utf.c ) + src/pcre2_xclass.c ) src/pcre2_printint.c debugging function that is used by pcre2test, src/pcre2_fuzzsupport.c function for (optional) fuzzing support @@ -860,13 
+862,16 @@ The distribution should contain the files listed below. src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API + src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header + src/pcre2_jit_char_inc.h header used by JIT src/pcre2_jit_neon_inc.h header used by JIT src/pcre2_jit_simd_inc.h header used by JIT src/pcre2_ucp.h header for Unicode property handling + src/pcre2_util.h header for internal utils - sljit/* source files for the JIT compiler + deps/sljit/sljit_src/* source files for the JIT compiler (B) Source files for programs that use PCRE2: @@ -878,48 +883,49 @@ The distribution should contain the files listed below. (C) Auxiliary files: - 132html script to turn "man" pages into HTML - AUTHORS information about the author of PCRE2 + AUTHORS.md information about the authors of PCRE2 ChangeLog log of changes to the code - CleanTxt script to clean nroff output for txt man pages - Detrail script to remove trailing spaces HACKING some notes about the internals of PCRE2 INSTALL generic installation instructions - LICENCE conditions for the use of PCRE2 + LICENCE.md conditions for the use of PCRE2 COPYING the same, using GNU's standard name + SECURITY.md information on reporting vulnerabilities Makefile.in ) template for Unix Makefile, which is built by ) "configure" Makefile.am ) the automake input that was used to create ) Makefile.in NEWS important changes in this release NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools - PrepareRelease script to make preparations for "make dist" README this file RunTest a Unix shell script for running tests RunGrepTest a Unix shell script for pcre2grep tests + RunTest.bat a Windows batch file for running tests + RunGrepTest.bat a Windows batch file for pcre2grep tests aclocal.m4 m4 
macros (generated by "aclocal") - config.guess ) files used by libtool, - config.sub ) used only when building a shared library + m4/* m4 macros (used by autoconf) configure a configuring shell script (built by autoconf) configure.ac ) the autoconf input that was used to build ) "configure" and config.h - depcomp ) script to find program dependencies, generated by - ) automake doc/*.3 man page sources for PCRE2 doc/*.1 man page sources for pcre2grep and pcre2test - doc/index.html.src the base HTML page doc/html/* HTML documentation doc/pcre2.txt plain text version of the man pages + doc/pcre2-config.txt plain text documentation of pcre2-config script + doc/pcre2grep.txt plain text documentation of grep utility program doc/pcre2test.txt plain text documentation of test program - install-sh a shell script for installing files libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config - ltmain.sh file used to build a libtool script - missing ) common stub for a few missing GNU programs while - ) installing, generated by automake - mkinstalldirs script for making install directories + ar-lib ) + config.guess ) + config.sub ) + depcomp ) helper tools generated by libtool and + compile ) automake, used internally by ./configure + install-sh ) + ltmain.sh ) + missing ) + test-driver ) perltest.sh Script for running a Perl test program pcre2-config.in source of script which retains PCRE2 information testdata/testinput* test data for main library tests @@ -927,12 +933,13 @@ The distribution should contain the files listed below. 
testdata/grep* input and output for pcre2grep tests testdata/* other supporting test files -(D) Auxiliary files for cmake support +(D) Auxiliary files for CMake support cmake/COPYING-CMAKE-SCRIPTS - cmake/FindPackageHandleStandardArgs.cmake cmake/FindEditline.cmake cmake/FindReadline.cmake + cmake/pcre2-config-version.cmake.in + cmake/pcre2-config.cmake.in CMakeLists.txt config-cmake.h.in @@ -943,14 +950,21 @@ The distribution should contain the files listed below. src/config.h.generic ) a version of config.h for use in non-"configure" ) environments -(F) Auxiliary files for building PCRE2 under OpenVMS +(F) Auxiliary files for building PCRE2 using other build systems + + BUILD.bazel ) + MODULE.bazel ) files used by the Bazel build system + WORKSPACE.bazel ) + build.zig file used by zig's build system + +(G) Auxiliary files for building PCRE2 under OpenVMS vms/configure.com ) vms/openvms_readme.txt ) These files were contributed by a PCRE2 user. vms/pcre2.h_patch ) vms/stdint.h ) -Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com -Last updated: 15 April 2024 +============================== +Last updated: 18 December 2024 +============================== + diff --git a/mingw32/share/doc/pcre2/SECURITY.md b/mingw32/share/doc/pcre2/SECURITY.md new file mode 100644 index 00000000000..1e3a05b9aef --- /dev/null +++ b/mingw32/share/doc/pcre2/SECURITY.md @@ -0,0 +1,44 @@ +# Security policies + +## Release security + +The PCRE2 project provides source-only releases, with no binaries. + +These source releases can be downloaded from the +[GitHub Releases](https://github.com/PCRE2Project/pcre2/releases) page. Each +release file is GPG-signed. 
+ +* Releases up to and including 10.44 are signed by Philip Hazel (GPG key: + 45F68D54BBE23FB3039B46E59766E084FB0F43D8) +* Releases from 10.45 onwards will be signed by Nicholas Wilson (GPG key: + A95536204A3BB489715231282A98E77EB6F24CA8, cross-signed by Philip + Hazel's key for release continuity) + +From releases 10.45 onwards, the source code will additionally be provided via +Git checkout of the (GPG-signed) release tag. + +Please contact the maintainers for any queries about release integrity or the +project's supply-chain. + +## Reporting vulnerabilities + +The PCRE2 project prioritises security. We appreciate third-party testing and +security research, and would be grateful if you could responsibly disclose your +findings to us. We will make every effort to acknowledge your contributions. + +To report a security issue, please use the GitHub Security Advisory +["Report a Vulnerability"](https://github.com/PCRE2Project/pcre2/security/advisories/new) +tab. (Alternatively, if you prefer you may send a GPG-encrypted email to one of +the maintainers.) + +### Timeline + +As a very small volunteer team, we cannot guarantee rapid response, but would +aim to respond within 1 week, or perhaps 2 during holidays. + +### Response procedure + +PCRE2 has never previously made a rapid or embargoed release in response to a +security incident. We would work with security managers from trusted downstream +distributors, such as major Linux distributions, before disclosing the +vulnerability publicly. diff --git a/mingw32/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt b/mingw32/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt index 851976ae238..bb687f7d040 100644 --- a/mingw32/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt +++ b/mingw32/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt @@ -105,6 +105,7 @@ example. pcre2_chkdint.c pcre2_chartables.c pcre2_compile.c + pcre2_compile_class.c pcre2_config.c pcre2_context.c pcre2_convert.c @@ -138,7 +139,7 @@ example. 
Note that you must compile pcre2_jit_compile.c, even if you have not defined SUPPORT_JIT in src/config.h, because when JIT support is not configured, dummy functions are compiled. When JIT support IS configured, - pcre2_jit_compile.c #includes other files from the sljit subdirectory, + pcre2_jit_compile.c #includes other files from the sljit dependency, all of whose names begin with "sljit". It also #includes src/pcre2_jit_match.c and src/pcre2_jit_misc.c, so you should not compile those yourself. @@ -301,56 +302,66 @@ Borland, Msys, MinGW, NMake, and Unix. If possible, use short paths with no spaces in the names for your CMake installation and your PCRE2 source and build directories. -The following instructions were contributed by a PCRE1 user, but they should -also work for PCRE2. If they are not followed exactly, errors may occur. In the -event that errors do occur, it is recommended that you delete the CMake cache -before attempting to repeat the CMake build process. In the CMake GUI, the -cache can be deleted by selecting "File > Delete Cache". +If you are using CMake and encounter errors, deleting the CMake cache and +restarting from a fresh build may fix the error. In the CMake GUI, the cache can +be deleted by selecting "File > Delete Cache"; or the folder "CMakeCache" can +be deleted. -1. Install the latest CMake version available from http://www.cmake.org/, and - ensure that cmake\bin is on your path. +1. Install the latest CMake version available from http://www.cmake.org/, and + ensure that cmake\bin is on your path. -2. Unzip (retaining folder structure) the PCRE2 source tree into a source - directory such as C:\pcre2. You should ensure your local date and time - is not earlier than the file dates in your source dir if the release is - very new. +2. Unzip (retaining folder structure) the PCRE2 source tree into a source + directory such as C:\pcre2. 
You should ensure your local date and time + is not earlier than the file dates in your source dir if the release is + very new. -3. Create a new, empty build directory, preferably a subdirectory of the - source dir. For example, C:\pcre2\pcre2-xx\build. +3. Create a new, empty build directory, preferably a subdirectory of the + source dir. For example, C:\pcre2\pcre2-xx\build. -4. Run cmake-gui from the Shell environment of your build tool, for example, - Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try - to start Cmake from the Windows Start menu, as this can lead to errors. +4. Run CMake. -5. Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and - build directories, respectively. + - Using the CLI, simply run `cmake ..` inside the `build/` directory. You can + use the `ccmake` ncurses GUI to select and configure PCRE2 features. -6. Hit the "Configure" button. + - Using the CMake GUI: -7. Select the particular IDE / build tool that you are using (Visual - Studio, MSYS makefiles, MinGW makefiles, etc.) + a) Run cmake-gui from the Shell environment of your build tool, for + example, Msys for Msys/MinGW or Visual Studio Command Prompt for + VC/VC++. -8. The GUI will then list several configuration options. This is where - you can disable Unicode support or select other PCRE2 optional features. + b) Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and + build directories, respectively. -9. Hit "Configure" again. The adjacent "Generate" button should now be - active. + c) Press the "Configure" button. -10. Hit "Generate". + d) Select the particular IDE / build tool that you are using (Visual + Studio, MSYS makefiles, MinGW makefiles, etc.) -11. The build directory should now contain a usable build system, be it a - solution file for Visual Studio, makefiles for MinGW, etc. Exit from - cmake-gui and use the generated build system with your compiler or IDE. 
- E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 - solution, select the desired configuration (Debug, or Release, etc.) and - build the ALL_BUILD project. + e) The GUI will then list several configuration options. This is where + you can disable Unicode support or select other PCRE2 optional features. -12. If during configuration with cmake-gui you've elected to build the test - programs, you can execute them by building the test project. E.g., for - MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The - most recent build configuration is targeted by the tests. A summary of - test results is presented. Complete test output is subsequently - available for review in Testing\Temporary under your build dir. + f) Press "Configure" again. The adjacent "Generate" button should now be + active. + + g) Press "Generate". + +5. The build directory should now contain a usable build system, be it a + solution file for Visual Studio, makefiles for MinGW, etc. Exit from + cmake-gui and use the generated build system with your compiler or IDE. + E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 + solution, select the desired configuration (Debug, or Release, etc.) and + build the ALL_BUILD project. + + Regardless of build system used, `cmake --build .` will build it. + +6. If during configuration with cmake-gui you've elected to build the test + programs, you can execute them by building the test project. E.g., for + MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The + most recent build configuration is targeted by the tests. A summary of + test results is presented. Complete test output is subsequently + available for review in Testing\Temporary under your build dir. + + Regardless of build system used, `ctest` will run the tests. BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO @@ -425,6 +436,7 @@ OpenVMS. They are in the "vms" directory in the distribution tarball. 
Please read the file called vms/openvms_readme.txt. The pcre2test and pcre2grep programs contain some VMS-specific code. -=========================== -Last Updated: 16 April 2024 -=========================== +============================== +Last updated: 26 December 2024 +============================== + diff --git a/mingw32/share/doc/pcre2/html/README.txt b/mingw32/share/doc/pcre2/html/README.txt index dab5e94210b..5a50f7f11b5 100644 --- a/mingw32/share/doc/pcre2/html/README.txt +++ b/mingw32/share/doc/pcre2/html/README.txt @@ -385,7 +385,7 @@ library. They are also documented in the pcre2build man page. If this is done, when pcre2test's input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. - Note that libreadline is GPL-licenced, so if you distribute a binary of + Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking with libedit (which has a BSD licence) instead. @@ -411,20 +411,19 @@ library. They are also documented in the pcre2build man page. Instead of %td or %zu, %lu is used, with a cast for size_t values. . There is a special option called --enable-fuzz-support for use by people who - want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit - library. If set, it causes an extra library called libpcre2-fuzzsupport.a to - be built, but not installed. This contains a single function called - LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the - length of the string. When called, this function tries to compile the string - as a pattern, and if that succeeds, to match it. This is done both with no - options and with some random options bits that are generated from the string. - Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to - be created. 
This is normally run under valgrind or used when PCRE2 is - compiled with address sanitizing enabled. It calls the fuzzing function and - outputs information about what it is doing. The input strings are specified - by arguments: if an argument starts with "=" the rest of it is a literal - input string. Otherwise, it is assumed to be a file name, and the contents - of the file are the test string. + want to run fuzzing tests on PCRE2. If set, it causes an extra library + called libpcre2-fuzzsupport.a to be built, but not installed. This contains + a single function called LLVMFuzzerTestOneInput() whose arguments are a + pointer to a string and the length of the string. When called, this function + tries to compile the string as a pattern, and if that succeeds, to match + it. This is done both with no options and with some random options bits that + are generated from the string. Setting --enable-fuzz-support also causes an + executable called pcre2fuzzcheck-{8,16,32} to be created. This is normally + run under valgrind or used when PCRE2 is compiled with address sanitizing + enabled. It calls the fuzzing function and outputs information about what it + is doing. The input strings are specified by arguments: if an argument + starts with "=" the rest of it is a literal input string. Otherwise, it is + assumed to be a file name, and the contents of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for @@ -510,6 +509,7 @@ system. The following are installed (file names are all relative to the LICENCE NEWS README + SECURITY pcre2.txt (a concatenation of the man(3) pages) pcre2test.txt the pcre2test man page pcre2grep.txt the pcre2grep man page @@ -607,8 +607,9 @@ zip formats. The command "make distcheck" does the same, but then does a trial build of the new distribution to ensure that it works. 
If you have modified any of the man page sources in the doc directory, you -should first run the PrepareRelease script before making a distribution. This -script creates the .txt and HTML forms of the documentation from the man pages. +should first run the maint/PrepareRelease script before making a distribution. +This script creates the .txt and HTML forms of the documentation from the man +pages. Testing PCRE2 @@ -822,37 +823,38 @@ The distribution should contain the files listed below. ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_chkdint.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_ucptables.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) + src/pcre2posix.c ) + src/pcre2_auto_possess.c ) + src/pcre2_chkdint.c ) + src/pcre2_compile.c ) + src/pcre2_compile_class.c ) + src/pcre2_config.c ) + src/pcre2_context.c ) + src/pcre2_convert.c ) + src/pcre2_dfa_match.c ) + src/pcre2_error.c ) + src/pcre2_extuni.c ) + src/pcre2_find_bracket.c ) + src/pcre2_jit_compile.c ) + src/pcre2_jit_match.c ) sources for the functions in the library, + src/pcre2_jit_misc.c ) and some internal functions that they use + src/pcre2_maketables.c ) + src/pcre2_match.c ) + src/pcre2_match_data.c ) + 
src/pcre2_newline.c ) + src/pcre2_ord2utf.c ) + src/pcre2_pattern_info.c ) + src/pcre2_script_run.c ) + src/pcre2_serialize.c ) + src/pcre2_string_utils.c ) + src/pcre2_study.c ) + src/pcre2_substitute.c ) + src/pcre2_substring.c ) + src/pcre2_tables.c ) + src/pcre2_ucd.c ) + src/pcre2_ucptables.c ) + src/pcre2_valid_utf.c ) + src/pcre2_xclass.c ) src/pcre2_printint.c debugging function that is used by pcre2test, src/pcre2_fuzzsupport.c function for (optional) fuzzing support @@ -860,13 +862,16 @@ The distribution should contain the files listed below. src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API + src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header + src/pcre2_jit_char_inc.h header used by JIT src/pcre2_jit_neon_inc.h header used by JIT src/pcre2_jit_simd_inc.h header used by JIT src/pcre2_ucp.h header for Unicode property handling + src/pcre2_util.h header for internal utils - sljit/* source files for the JIT compiler + deps/sljit/sljit_src/* source files for the JIT compiler (B) Source files for programs that use PCRE2: @@ -878,48 +883,49 @@ The distribution should contain the files listed below. 
(C) Auxiliary files: - 132html script to turn "man" pages into HTML - AUTHORS information about the author of PCRE2 + AUTHORS.md information about the authors of PCRE2 ChangeLog log of changes to the code - CleanTxt script to clean nroff output for txt man pages - Detrail script to remove trailing spaces HACKING some notes about the internals of PCRE2 INSTALL generic installation instructions - LICENCE conditions for the use of PCRE2 + LICENCE.md conditions for the use of PCRE2 COPYING the same, using GNU's standard name + SECURITY.md information on reporting vulnerabilities Makefile.in ) template for Unix Makefile, which is built by ) "configure" Makefile.am ) the automake input that was used to create ) Makefile.in NEWS important changes in this release NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools - PrepareRelease script to make preparations for "make dist" README this file RunTest a Unix shell script for running tests RunGrepTest a Unix shell script for pcre2grep tests + RunTest.bat a Windows batch file for running tests + RunGrepTest.bat a Windows batch file for pcre2grep tests aclocal.m4 m4 macros (generated by "aclocal") - config.guess ) files used by libtool, - config.sub ) used only when building a shared library + m4/* m4 macros (used by autoconf) configure a configuring shell script (built by autoconf) configure.ac ) the autoconf input that was used to build ) "configure" and config.h - depcomp ) script to find program dependencies, generated by - ) automake doc/*.3 man page sources for PCRE2 doc/*.1 man page sources for pcre2grep and pcre2test - doc/index.html.src the base HTML page doc/html/* HTML documentation doc/pcre2.txt plain text version of the man pages + doc/pcre2-config.txt plain text documentation of pcre2-config script + doc/pcre2grep.txt plain text documentation of grep utility program doc/pcre2test.txt plain text documentation of test program - install-sh a shell script for installing files libpcre2-8.pc.in template 
for libpcre2-8.pc for pkg-config libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config - ltmain.sh file used to build a libtool script - missing ) common stub for a few missing GNU programs while - ) installing, generated by automake - mkinstalldirs script for making install directories + ar-lib ) + config.guess ) + config.sub ) + depcomp ) helper tools generated by libtool and + compile ) automake, used internally by ./configure + install-sh ) + ltmain.sh ) + missing ) + test-driver ) perltest.sh Script for running a Perl test program pcre2-config.in source of script which retains PCRE2 information testdata/testinput* test data for main library tests @@ -927,12 +933,13 @@ The distribution should contain the files listed below. testdata/grep* input and output for pcre2grep tests testdata/* other supporting test files -(D) Auxiliary files for cmake support +(D) Auxiliary files for CMake support cmake/COPYING-CMAKE-SCRIPTS - cmake/FindPackageHandleStandardArgs.cmake cmake/FindEditline.cmake cmake/FindReadline.cmake + cmake/pcre2-config-version.cmake.in + cmake/pcre2-config.cmake.in CMakeLists.txt config-cmake.h.in @@ -943,14 +950,21 @@ The distribution should contain the files listed below. src/config.h.generic ) a version of config.h for use in non-"configure" ) environments -(F) Auxiliary files for building PCRE2 under OpenVMS +(F) Auxiliary files for building PCRE2 using other build systems + + BUILD.bazel ) + MODULE.bazel ) files used by the Bazel build system + WORKSPACE.bazel ) + build.zig file used by zig's build system + +(G) Auxiliary files for building PCRE2 under OpenVMS vms/configure.com ) vms/openvms_readme.txt ) These files were contributed by a PCRE2 user. 
vms/pcre2.h_patch ) vms/stdint.h ) -Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com -Last updated: 15 April 2024 +============================== +Last updated: 18 December 2024 +============================== + diff --git a/mingw32/share/doc/pcre2/html/index.html b/mingw32/share/doc/pcre2/html/index.html index e4dc78620fd..2d81b678fef 100644 --- a/mingw32/share/doc/pcre2/html/index.html +++ b/mingw32/share/doc/pcre2/html/index.html @@ -267,6 +267,9 @@

Perl-compatible Regular Expressions (revised API: PCRE2)

pcre2_set_offset_limit   Set the offset limit +pcre2_set_optimize +   Set an optimization directive + pcre2_set_parens_nest_limit   Set the parentheses nesting limit @@ -276,6 +279,12 @@

Perl-compatible Regular Expressions (revised API: PCRE2)

pcre2_set_recursion_memory_management   Obsolete function that (from 10.30 onwards) does nothing +pcre2_set_substitute_callout +   Set a substitution callout function + +pcre2_set_substitute_case_callout +   Set a substitution case callout function + pcre2_substitute   Match a compiled pattern to a subject string and do substitutions diff --git a/mingw32/share/doc/pcre2/html/pcre2.html b/mingw32/share/doc/pcre2/html/pcre2.html index 4cb83dc184b..e72b6b1cb1d 100644 --- a/mingw32/share/doc/pcre2/html/pcre2.html +++ b/mingw32/share/doc/pcre2/html/pcre2.html @@ -16,7 +16,7 @@

pcre2 man page

  • INTRODUCTION
  • SECURITY CONSIDERATIONS
  • USER DOCUMENTATION -
  • AUTHOR +
  • AUTHORS
  • REVISION
    INTRODUCTION
    @@ -190,22 +190,22 @@

    pcre2 man page

    In the "man" and HTML formats, there is also a short page for each C library function, listing its arguments and results.

    -
    AUTHOR
    +
    AUTHORS

    -Philip Hazel -
    -Retired from University Computing Service -
    -Cambridge, England. -
    +The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Herczeg. +

    +

    +PCRE2 was written by Philip Hazel, of the University Computing Service, +Cambridge, England. Many others have also contributed.

    -Putting an actual email address here is a spam magnet. If you want to email me, -use my two names separated by a dot at gmail.com. +To contact the maintainers, please use the GitHub issues tracker or PCRE2 +mailing list, as described at the project page: +https://github.com/PCRE2Project/pcre2


    REVISION

    -Last updated: 27 August 2021 +Last updated: 18 December 2024
    Copyright © 1997-2021 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2_compile.html b/mingw32/share/doc/pcre2/html/pcre2_compile.html index f0080eabe45..ee933f38983 100644 --- a/mingw32/share/doc/pcre2/html/pcre2_compile.html +++ b/mingw32/share/doc/pcre2/html/pcre2_compile.html @@ -57,6 +57,7 @@

    pcre2_compile man page

    PCRE2_ALLOW_EMPTY_CLASS Allow empty classes PCRE2_ALT_BSUX Alternative handling of \u, \U, and \x PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode + PCRE2_ALT_EXTENDED_CLASS Alternative extended character class syntax PCRE2_ALT_VERBNAMES Process backslashes in verb names PCRE2_AUTO_CALLOUT Compile automatic callouts PCRE2_CASELESS Do caseless matching diff --git a/mingw32/share/doc/pcre2/html/pcre2_jit_compile.html b/mingw32/share/doc/pcre2/html/pcre2_jit_compile.html index 873d0ddefc6..791dd0c3d78 100644 --- a/mingw32/share/doc/pcre2/html/pcre2_jit_compile.html +++ b/mingw32/share/doc/pcre2/html/pcre2_jit_compile.html @@ -33,9 +33,18 @@

    pcre2_jit_compile man page

    documentation.

    -The first argument is a pointer that was returned by a successful call to -pcre2_compile(), and the second must contain one or more of the following -bits: +The availability of JIT support can be tested by calling +pcre2_jit_compile() with a single option PCRE2_JIT_TEST_ALLOC (the +code argument is ignored, so a NULL value is accepted). Such a call +returns zero if JIT is available and has a working allocator. Otherwise +it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate +executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not +compiled. +

    +

    +Otherwise, the first argument must be a pointer that was returned by a +successful call to pcre2_compile(), and the second must contain one or +more of the following bits:

       PCRE2_JIT_COMPLETE      compile code for full matching
       PCRE2_JIT_PARTIAL_SOFT  compile code for soft partial matching
    @@ -46,11 +55,13 @@ 

    pcre2_jit_compile man page

    option is deprecated and may be removed in the future.

    -The yield of the function is 0 for success, or a negative error code otherwise. -In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or -if an unknown bit is set in options. The function can also return -PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the -compiler, even if it was because of a system security restriction. +The yield of the function when called with any of the three options above is 0 +for success, or a negative error code otherwise. In particular, +PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or if an unknown +bit is set in options. The function can also return PCRE2_ERROR_NOMEMORY +if JIT is unable to allocate executable memory for the compiler, even if it was +because of a system security restriction. In a few cases, the function may +return with PCRE2_ERROR_JIT_UNSUPPORTED for unsupported features.

    There is a complete description of the PCRE2 native API in the diff --git a/mingw32/share/doc/pcre2/html/pcre2_set_compile_extra_options.html b/mingw32/share/doc/pcre2/html/pcre2_set_compile_extra_options.html index 4924ed79b5e..cb62022a22e 100644 --- a/mingw32/share/doc/pcre2/html/pcre2_set_compile_extra_options.html +++ b/mingw32/share/doc/pcre2/html/pcre2_set_compile_extra_options.html @@ -43,6 +43,10 @@

    pcre2_set_compile_extra_options man page

    PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines PCRE2_EXTRA_MATCH_WORD Pattern matches "words" + PCRE2_EXTRA_NEVER_CALLOUT Disallow callouts in pattern + PCRE2_EXTRA_NO_BS0 Disallow \0 (but not \00 or \000) + PCRE2_EXTRA_PYTHON_OCTAL Use Python rules for octal + PCRE2_EXTRA_TURKISH_CASING Use Turkish I case folding
    There is a complete description of the PCRE2 native API in the pcre2api diff --git a/mingw32/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html b/mingw32/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html index ab570cf60d1..a40f41e450c 100644 --- a/mingw32/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html +++ b/mingw32/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html @@ -27,9 +27,9 @@

    pcre2_set_max_pattern_compiled_length man page


    This function sets, in a compile context, the maximum size (in bytes) for the -memory needed to hold the compiled version of a pattern that is compiled with -this context. The result is always zero. If a pattern that is passed to -pcre2_compile() with this context needs more memory, an error is +memory needed to hold the compiled version of a pattern that is using this +context. The result is always zero. If a pattern that is passed to +pcre2_compile() referencing this context needs more memory, an error is generated. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited.

    diff --git a/mingw32/share/doc/pcre2/html/pcre2_set_optimize.html b/mingw32/share/doc/pcre2/html/pcre2_set_optimize.html new file mode 100644 index 00000000000..47caeb267ae --- /dev/null +++ b/mingw32/share/doc/pcre2/html/pcre2_set_optimize.html @@ -0,0 +1,57 @@ + + +pcre2_set_optimize specification + + +

    pcre2_set_optimize man page

    +

    +Return to the PCRE2 index page. +

    +

    +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
    +
    +SYNOPSIS +
    +

    +#include <pcre2.h> +

    +

    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); +

    +
    +DESCRIPTION +
    +

    +This function controls which performance optimizations will be applied +by pcre2_compile(). It can be called multiple times with the same compile +context; the effects are cumulative, with the effects of later calls taking +precedence over earlier ones. +

    +

    +The result is zero for success, PCRE2_ERROR_NULL if ccontext is NULL, +or PCRE2_ERROR_BADOPTION if directive is unknown. The latter could be +useful to detect if a certain optimization is available. +

    +

    +The possible values for the directive parameter are: +

    +  PCRE2_OPTIMIZATION_FULL   Enable all optimizations (default)
    +  PCRE2_OPTIMIZATION_NONE   Disable all optimizations
    +  PCRE2_AUTO_POSSESS        Enable auto-possessification
    +  PCRE2_AUTO_POSSESS_OFF    Disable auto-possessification
    +  PCRE2_DOTSTAR_ANCHOR      Enable implicit dotstar anchoring
    +  PCRE2_DOTSTAR_ANCHOR_OFF  Disable implicit dotstar anchoring
    +  PCRE2_START_OPTIMIZE      Enable start-up optimizations at match time
    +  PCRE2_START_OPTIMIZE_OFF  Disable start-up optimizations at match time
    +
    +There is a complete description of the PCRE2 native API, including detailed +descriptions of directive parameter values, in the +pcre2api +page. +

    +Return to the PCRE2 index page. +

    diff --git a/mingw32/share/doc/pcre2/html/pcre2_set_substitute_callout.html b/mingw32/share/doc/pcre2/html/pcre2_set_substitute_callout.html index 7ae3a398d79..8640728fdc4 100644 --- a/mingw32/share/doc/pcre2/html/pcre2_set_substitute_callout.html +++ b/mingw32/share/doc/pcre2/html/pcre2_set_substitute_callout.html @@ -20,7 +20,7 @@

    pcre2_set_substitute_callout man page

    int pcre2_set_substitute_callout(pcre2_match_context *mcontext, - int (*callout_function)(pcre2_substitute_callout_block *), + int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data);


    diff --git a/mingw32/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html b/mingw32/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html new file mode 100644 index 00000000000..ab506879f1f --- /dev/null +++ b/mingw32/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html @@ -0,0 +1,45 @@ + + +pcre2_set_substitute_case_callout specification + + +

    pcre2_set_substitute_case_callout man page

    +

    +Return to the PCRE2 index page. +

    +

    +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
    +
    +SYNOPSIS +
    +

    +#include <pcre2.h> +

    +

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +

    +
    +DESCRIPTION +
    +

    +This function sets the substitute case callout fields in a match context (the +first argument). The second argument specifies a callout function, and the third +argument is an opaque data item that is passed to it. The result of this +function is always zero. +

    +

    +There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +

    +Return to the PCRE2 index page. +

    diff --git a/mingw32/share/doc/pcre2/html/pcre2api.html b/mingw32/share/doc/pcre2/html/pcre2api.html index 6b60ee9fa7a..079cf176daa 100644 --- a/mingw32/share/doc/pcre2/html/pcre2api.html +++ b/mingw32/share/doc/pcre2/html/pcre2api.html @@ -179,6 +179,10 @@

    pcre2api man page


    int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); +
    +
    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive);


    PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

    @@ -203,6 +207,13 @@

    pcre2api man page

    void *callout_data);

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);
    @@ -808,6 +819,7 @@

    pcre2api man page

    The compile time nested parentheses limit The maximum length of the pattern string The extra options bits (none set by default) + Which performance optimizations the compiler should apply A compile context is also required if you are using custom memory management. If none of these apply, just pass NULL as the context argument of @@ -952,6 +964,110 @@

    pcre2api man page

    nesting, and the second is user data that is set up by the last argument of pcre2_set_compile_recursion_guard(). The callout function should return zero if all is well, or non-zero to force an error. +
    +
    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); +
    +
    +PCRE2 can apply various performance optimizations during compilation, in order +to make matching faster. For example, the compiler might convert some regex +constructs into an equivalent construct which pcre2_match() can execute +faster. By default, all available optimizations are enabled. However, in rare +cases, one might wish to disable specific optimizations. For example, if it is +known that some optimizations cannot benefit a certain regex, it might be +desirable to disable them, in order to speed up compilation. +

    +

    +The permitted values of directive are as follows: +

    +  PCRE2_OPTIMIZATION_FULL
    +
    +Enable all optional performance optimizations. This is the default value. +
    +  PCRE2_OPTIMIZATION_NONE
    +
    +Disable all optional performance optimizations. +
    +  PCRE2_AUTO_POSSESS
    +  PCRE2_AUTO_POSSESS_OFF
    +
    +Enable/disable "auto-possessification" of variable quantifiers such as * and +. +This optimization, for example, turns a+b into a++b in order to avoid +backtracks into a+ that can never be successful. However, if callouts are in +use, auto-possessification means that some callouts are never taken. You can +disable this optimization if you want the matching functions to do a full, +unoptimized search and run all the callouts. +
    +  PCRE2_DOTSTAR_ANCHOR
    +  PCRE2_DOTSTAR_ANCHOR_OFF
    +
    +Enable/disable an optimization that is applied when .* is the first significant +item in a top-level branch of a pattern, and all the other branches also start +with .* or with \A or \G or ^. Such a pattern is automatically anchored if +PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any +^ items. Otherwise, the fact that any match must start either at the start of +the subject or following a newline is remembered. Like other optimizations, +this can cause callouts to be skipped. +

    +

    +Dotstar anchor optimization is automatically disabled for .* if it is inside an +atomic group or a capture group that is the subject of a backreference, or if +the pattern contains (*PRUNE) or (*SKIP). +

    +  PCRE2_START_OPTIMIZE
    +  PCRE2_START_OPTIMIZE_OFF
    +
    +Enable/disable optimizations which cause matching functions to scan the subject +string for specific code unit values before attempting a match. For example, if +it is known that an unanchored match must start with a specific value, the +matching code searches the subject for that value, and fails immediately if it +cannot find it, without actually running the main matching function. This means +that a special item such as (*COMMIT) at the start of a pattern is not +considered until after a suitable starting point for the match has been found. +Also, when callouts or (*MARK) items are in use, these "start-up" optimizations +can cause them to be skipped if the pattern is never actually used. The start-up +optimizations are in effect a pre-scan of the subject that takes place before +the pattern is run. +

    +

    +Disabling start-up optimizations ensures that in cases where the result is "no +match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) are +considered at every possible starting position in the subject string. +

    +

    +Disabling start-up optimizations may change the outcome of a matching operation. +Consider the pattern +

    +  (*COMMIT)ABC
    +
    +When this is compiled, PCRE2 records the fact that a match must start with the +character "A". Suppose the subject string is "DEFABC". The start-up +optimization scans along the subject, finds "A" and runs the first match +attempt from there. The (*COMMIT) item means that the pattern must match the +current starting position, which in this case, it does. However, if the same +match is run without start-up optimizations, the initial scan along the subject +string does not happen. The first match attempt is run starting from "D" and +when this fails, (*COMMIT) prevents any further matches being tried, so the +overall result is "no match". +

    +

    +Another start-up optimization makes use of a minimum length for a matching +subject, which is recorded when possible. Consider the pattern +

    +  (*MARK:1)B(*MARK:2)(X|Y)
    +
    +The minimum length for a match is two characters. If the subject is "XXBB", the +"starting character" optimization skips "XX", then tries to match "BB", which +is long enough. In the process, (*MARK:2) is encountered and remembered. When +the match attempt fails, the next "B" is found, but there is only one character +left, so there are no more attempts, and "no match" is returned with the "last +mark seen" set to "2". Without start-up optimizations, however, matches are +tried at every possible starting position, including at the end of the subject, +where (*MARK:1) is encountered, but there is no "B", so the "last mark seen" +that is returned is "1". In this case, the optimizations do not affect the +overall match result, which is still "no match", but they do affect the +auxiliary information that is returned.


    The match context @@ -1011,6 +1127,19 @@

    pcre2api man page

    below.

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    +This sets up a callout function for PCRE2 to call when performing case +transformations inside pcre2_substitute(). Details are given in the +section entitled "Creating a new string with substitutions" +below. +
    +
    int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);
    @@ -1228,7 +1357,10 @@

    pcre2api man page

    The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee that JIT will be used for -any given match. See the +any given match, and neither does it guarantee that JIT will actually be able +to function, because it may not be able to allocate executable memory in some +environments. There is a special call to pcre2_jit_compile() that can be +used to check this. See the pcre2jit documentation for more details.
    @@ -1431,7 +1563,7 @@ 

    pcre2api man page

    error has occurred.

    -There are nearly 100 positive error codes that pcre2_compile() may return +There are over 100 positive error codes that pcre2_compile() may return if it finds an error in the pattern. There are also some negative error codes that are used for invalid UTF strings when validity checking is in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and @@ -1539,6 +1671,16 @@

    pcre2api man page

    end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a terminating newline, you must set PCRE2_ALT_CIRCUMFLEX. +
    +  PCRE2_ALT_EXTENDED_CLASS
    +
    +Alters the parsing of character classes to follow the extended syntax +described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact +on the behaviour of the Perl-specific "(?[...])" syntax for extended classes, +but instead enables the alternative syntax of extended class behaviour inside +ordinary "[...]" character classes. See the +pcre2pattern +documentation for details of the character classes supported.
       PCRE2_ALT_VERBNAMES
     
    @@ -1569,16 +1711,31 @@

    pcre2api man page

    changed within a pattern by a (?i) option setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all characters with more than one other case, and for all characters whose code points are greater than -U+007F. Note that there are two ASCII characters, K and S, that, in addition to +U+007F. +

    +

    +Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long S) respectively. If you do not want this case equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT.

    +One language family, Turkish and Azeri, has its own case-insensitivity rules, +which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the +behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 +(small dotless i) characters. +

    +

    For lower valued characters with only one other case, a lookup table is used for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used for all code points less than 256, and higher code points (available only in 16-bit or 32-bit mode) are treated as not having another case. +

    +

    +From release 10.45 PCRE2_CASELESS also affects what some of the letter-related +Unicode property escapes (\p and \P) match. The properties Lu (upper case +letter), Ll (lower case letter), and Lt (title case letter) are all treated as +LC (cased letter) when PCRE2_CASELESS is set.

       PCRE2_DOLLAR_ENDONLY
     
    @@ -1775,7 +1932,7 @@

    pcre2api man page

    for the PCRE2_UCP option below. In particular, it prevents the creator of the pattern from enabling this facility by starting the pattern with (*UCP). This option may be useful in applications that process patterns from external -sources. The option combination PCRE_UCP and PCRE_NEVER_UCP causes an error. +sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error.
       PCRE2_NEVER_UTF
     
    @@ -1798,85 +1955,57 @@

    pcre2api man page

       PCRE2_NO_AUTO_POSSESS
     
    -If this option is set, it disables "auto-possessification", which is an -optimization that, for example, turns a+b into a++b in order to avoid +If this (deprecated) option is set, it disables "auto-possessification", which +is an optimization that, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can set this option if you want the matching functions to do a full unoptimized search and run all the callouts, but it is mainly provided for testing purposes. +

    +

    +If a compile context is available, it is recommended to use +pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather +than the compile option PCRE2_NO_AUTO_POSSESS. Note that PCRE2_NO_AUTO_POSSESS +takes precedence over the pcre2_set_optimize() optimization directives +PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF.

       PCRE2_NO_DOTSTAR_ANCHOR
     
    -If this option is set, it disables an optimization that is applied when .* is -the first significant item in a top-level branch of a pattern, and all the -other branches also start with .* or with \A or \G or ^. The optimization is -automatically disabled for .* if it is inside an atomic group or a capture -group that is the subject of a backreference, or if the pattern contains -(*PRUNE) or (*SKIP). When the optimization is not disabled, such a pattern is -automatically anchored if PCRE2_DOTALL is set for all the .* items and -PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match -must start either at the start of the subject or following a newline is +If this (deprecated) option is set, it disables an optimization that is applied +when .* is the first significant item in a top-level branch of a pattern, and +all the other branches also start with .* or with \A or \G or ^. The +optimization is automatically disabled for .* if it is inside an atomic group +or a capture group that is the subject of a backreference, or if the pattern +contains (*PRUNE) or (*SKIP). When the optimization is not disabled, such a +pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items +and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any +match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. +(If a compile context is available, it is recommended to use +pcre2_set_optimize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF +instead.)
       PCRE2_NO_START_OPTIMIZE
     
    This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of the JIT -compiler. +compiler. Setting this option is equivalent to calling pcre2_set_optimize() +with the directive parameter set to PCRE2_START_OPTIMIZE_OFF.

    There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without -actually running the main matching function. This means that a special item -such as (*COMMIT) at the start of a pattern is not considered until after a -suitable starting point for the match has been found. Also, when callouts or -(*MARK) items are in use, these "start-up" optimizations can cause them to be -skipped if the pattern is never actually used. The start-up optimizations are +actually running the main matching function. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run.

    -The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, -possibly causing performance to suffer, but ensuring that in cases where the -result is "no match", the callouts do occur, and that items such as (*COMMIT) -and (*MARK) are considered at every possible starting position in the subject -string. -

    -

    -Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation. -Consider the pattern -

    -  (*COMMIT)ABC
    -
    -When this is compiled, PCRE2 records the fact that a match must start with the -character "A". Suppose the subject string is "DEFABC". The start-up -optimization scans along the subject, finds "A" and runs the first match -attempt from there. The (*COMMIT) item means that the pattern must match the -current starting position, which in this case, it does. However, if the same -match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the -subject string does not happen. The first match attempt is run starting from -"D" and when this fails, (*COMMIT) prevents any further matches being tried, so -the overall result is "no match". -

    -

    -As another start-up optimization makes use of a minimum length for a matching -subject, which is recorded when possible. Consider the pattern -

    -  (*MARK:1)B(*MARK:2)(X|Y)
    -
    -The minimum length for a match is two characters. If the subject is "XXBB", the -"starting character" optimization skips "XX", then tries to match "BB", which -is long enough. In the process, (*MARK:2) is encountered and remembered. When -the match attempt fails, the next "B" is found, but there is only one character -left, so there are no more attempts, and "no match" is returned with the "last -mark seen" set to "2". If NO_START_OPTIMIZE is set, however, matches are tried -at every possible starting position, including at the end of the subject, where -(*MARK:1) is encountered, but there is no "B", so the "last mark seen" that is -returned is "1". In this case, the optimizations do not affect the overall -match result, which is still "no match", but they do affect the auxiliary -information that is returned. +Disabling the start-up optimizations may cause performance to suffer. However, +this may be desirable for patterns which contain callouts or items such as +(*COMMIT) and (*MARK). See the above description of PCRE2_START_OPTIMIZE_OFF +for further details.
       PCRE2_NO_UTF_CHECK
     
    @@ -1931,9 +2060,16 @@

    pcre2api man page

    upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode support (which is the default). -The PCRE2_EXTRA_CASELESS_RESTRICT option (see below) restricts caseless +

    +

    +The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless matching such that ASCII characters match only ASCII characters and non-ASCII -characters match only non-ASCII characters. +characters match only non-ASCII characters. The PCRE2_EXTRA_TURKISH_CASING option +(see above) alters the matching of the 'i' characters to follow their behaviour +in Turkish and Azeri languages. For further details on +PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the +pcre2unicode +page.

       PCRE2_UNGREEDY
     
    @@ -2070,7 +2206,8 @@

    pcre2api man page

    ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must either be ASCII or non-ASCII. The option -can be changed with a pattern by the (?r) option setting. +can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option +settings.
       PCRE2_EXTRA_ESCAPED_CR_IS_LF
     
    @@ -2097,6 +2234,34 @@

    pcre2api man page

    at the start of the compiled pattern and ")\b" at the end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. +
    +  PCRE2_EXTRA_NO_BS0
    +
    +If this option is set (note that its final character is the digit 0) it locks +out the use of the sequence \0 unless at least one more octal digit follows. +
    +  PCRE2_EXTRA_PYTHON_OCTAL
    +
    +If this option is set, PCRE2 follows Python's rules for interpreting octal +escape sequences. The rules for handling sequences such as \14, which could +be an octal number or a back reference, are different. Details are given in the +pcre2pattern +documentation. +
    +  PCRE2_EXTRA_NEVER_CALLOUT
    +
    +If this option is set, PCRE2 treats callouts in the pattern as a syntax error, +returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application +knows that a callout will not be provided to pcre2_match(), so that +callouts in the pattern are not silently ignored. +
    +  PCRE2_EXTRA_TURKISH_CASING
    +
    +This option alters case-equivalence of the 'i' letters to follow the +alphabet used by Turkish and Azeri languages. The option can be changed within +a pattern by the (*TURKISH_CASING) start-of-pattern setting. Either the UTF or +UCP options must be set. In the 8-bit library, UTF must be set. This option +cannot be combined with PCRE2_EXTRA_CASELESS_RESTRICT.


    JUST-IN-TIME (JIT) COMPILATION

    @@ -2303,6 +2468,7 @@

    pcre2api man page

    PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set + Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF
    For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. @@ -3646,9 +3812,10 @@

    pcre2api man page

    too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (without, of course, writing anything) -in order to compute the size of buffer that is needed. This value is passed -back via the outlengthptr variable, with the result of the function still -being PCRE2_ERROR_NOMEMORY. +in order to compute the size of buffer that is needed, which will include the +extra space for the terminating NUL. This value is passed back via the +outlengthptr variable, with the result of the function still being +PCRE2_ERROR_NOMEMORY.

    Passing a buffer size of zero is a permitted way of finding out how much memory @@ -3667,18 +3834,26 @@

    pcre2api man page

    in any way. By default, however, a dollar character is an escape character that can specify the insertion of characters from capture groups and names from (*MARK) or other control verbs in the pattern. Dollar is the only escape -character (backslash is treated as literal). The following forms are always +character (backslash is treated as literal). The following forms are recognized:
       $$                  insert a dollar character
    -  $<n> or ${<n>}      insert the contents of group <n>
    +  $n or ${n}          insert the contents of group n
    +  $0 or $&            insert the entire matched substring
    +  $`                  insert the substring that precedes the match
    +  $'                  insert the substring that follows the match
    +  $_                  insert the entire input string
       $*MARK or ${*MARK}  insert a control verb name
     
    -Either a group number or a group name can be given for <n>. Curly brackets are -required only if the following character would be interpreted as part of the -number or name. The number may be zero to include the entire matched string. -For example, if the pattern a(b)c is matched with "=abc=" and the replacement -string "+$1$0$1+", the result is "=+babcb+=". +Either a group number or a group name can be given for n, for example $2 or +$NAME. Curly brackets are required only if the following character would be +interpreted as part of the number or name. The number may be zero to include +the entire matched string. For example, if the pattern a(b)c is matched with +"=abc=" and the replacement string "+$1$0$1+", the result is "=+babcb+=". +

    +

    +The JavaScript form $<name>, where the angle brackets are part of the syntax, +is also recognized for group names, but not for group numbers or *MARK.

    $*MARK inserts the name from the last encountered backtracking control verb on @@ -3732,28 +3907,53 @@

    pcre2api man page

    PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the replacement string. Without this option, only the dollar character is special, and only the group insertion forms listed above are valid. When -PCRE2_SUBSTITUTE_EXTENDED is set, two things change: +PCRE2_SUBSTITUTE_EXTENDED is set, several things change:

    Firstly, backslash in a replacement string is interpreted as an escape -character. The usual forms such as \n or \x{ddd} can be used to specify -particular character codes, and backslash followed by any non-alphanumeric -character quotes that character. Extended quoting can be coded using \Q...\E, -exactly as in pattern strings. +character. The usual forms such as \x{ddd} can be used to specify particular +character codes, and backslash followed by any non-alphanumeric character +quotes that character. Extended quoting can be coded using \Q...\E, exactly +as in pattern strings. The escapes \b and \v are interpreted as the +characters backspace and vertical tab, respectively. +

    +

    +The interpretation of backslash followed by one or more digits is the same as +in a pattern, which in Perl has some ambiguities. Details are given in the +pcre2pattern +page. +

    +

    +The Python form \g<n>, where the angle brackets are part of the syntax and n +is either a group name or number, is recognized as an alternative way of +inserting the contents of a group, for example \g<3>.

    There are also four escape sequences for forcing the case of inserted letters. -The insertion mechanism has three states: no case forcing, force upper case, -and force lower case. The escape sequences change the current state: \U and -\L change to upper or lower case forcing, respectively, and \E (when not -terminating a \Q quoted sequence) reverts to no case forcing. The sequences -\u and \l force the next character (if it is a letter) to upper or lower -case, respectively, and then the state automatically reverts to no case -forcing. Case forcing applies to all inserted characters, including those from -capture groups and letters within \Q...\E quoted sequences. If either -PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode +Case forcing applies to all inserted characters, including those from capture +groups and letters within \Q...\E quoted sequences. The insertion mechanism +has three states: no case forcing, force upper case, and force lower case. The +escape sequences change the current state: \U and \L change to upper or lower +case forcing, respectively, and \E (when not terminating a \Q quoted +sequence) reverts to no case forcing. The sequences \u and \l force the next +character (if it is a letter) to upper or lower case, respectively, and then +the state automatically reverts to no case forcing. +

    +

    +However, if \u is immediately followed by \L or \l is immediately followed +by \U, the next character's case is forced by the first escape sequence, and +subsequent characters by the second. This provides a "title casing" facility +that can be applied to group captures. For example, if group 1 has captured +"heLLo", the replacement string "\u\L$1" becomes "Hello". +

    +

    +If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater -than 127. +than 127. However, only simple case folding, as determined by the Unicode file +CaseFolding.txt, is supported. PCRE2 does not support language-specific +special casing rules such as using different lower case Greek sigmas in the +middle and ends of words (as defined in the Unicode file +SpecialCasing.txt).

    Note that case forcing sequences such as \U...\E do not nest. For example, @@ -3762,20 +3962,20 @@

    pcre2api man page

    not apply to replacement strings.

    -The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more +The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash:

    -  ${<n>:-<string>}
    -  ${<n>:+<string1>:<string2>}
    +  ${n:-string}
    +  ${n:+string1:string2}
     
    -As before, <n> may be a group number or a name. The first form specifies a -default value. If group <n> is set, its value is inserted; if not, <string> is -expanded and the result inserted. The second form specifies strings that are -expanded and inserted when group <n> is set or unset, respectively. The first -form is just a convenient shorthand for +As in the simple case, n may be a group number or a name. The first form +specifies a default value. If group n is set, its value is inserted; if +not, the string is expanded and the result inserted. The second form specifies +strings that are expanded and inserted when group n is set or unset, +respectively. The first form is just a convenient shorthand for
    -  ${<n>:+${<n>}:<string>}
    +  ${n:+${n}:string}
     
    Backslash can be used to escape colons and closing curly brackets in the replacement strings. A change of the case forcing state within a replacement @@ -3852,9 +4052,18 @@

    pcre2api man page

    The pcre2_set_substitution_callout() function can be used to specify a callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution has -been processed, but it can cause the replacement not to happen. The callout -function is not called for simulated substitutions that happen as a result of -the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. +been processed, but it can cause the replacement not to happen. +

    +

    +The callout function is not called for simulated substitutions that happen as a +result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In this mode, when +substitution processing exceeds the buffer space provided by the caller, +processing continues by counting code units. The simulation is unable to +populate the callout block, and so the simulation is pessimistic about the +required buffer size. Whichever is larger of accepted or rejected substitution +is reported as the required size. Therefore, the returned buffer length may be +an overestimate (without a substitution callout, it is normally an exact +measurement).

    The first argument of the callout function is a pointer to a substitute callout @@ -3903,6 +4112,107 @@

    pcre2api man page

    output and the call to pcre2_substitute() exits, returning the number of matches so far.

    +
    +Substitution case callouts +
    +

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    +The pcre2_set_substitute_case_callout() function can be used to specify +a callout function for pcre2_substitute() to use when performing case +transformations. This does not affect any case insensitivity behaviour when +performing a match, but only the user-visible transformations performed when +processing a substitution such as: +

    +    pcre2_substitute(..., "\\U$1", ...)
    +
    +

    +

    +The default case transformations applied by PCRE2 are reasonably complete, and, +in UTF or UCP mode, perform the simple locale-invariant case transformations as +specified by Unicode. This is suitable for the internal (invisible) +case-equivalence procedures used during pattern matching, but an application +may wish to use more sophisticated locale-aware processing for the user-visible +substitution transformations. +

    +

    +One example implementation of the callout_function using the ICU +library would be: +
    +
    +

    +    PCRE2_SIZE
    +    icu_case_callout(
    +      PCRE2_SPTR input, PCRE2_SIZE input_len,
    +      PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
    +      int to_case, void *data_ptr)
    +    {
    +      UErrorCode err = U_ZERO_ERROR;
    +      int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER
    +        ? u_strToLower(output, output_cap, input, input_len, NULL, &err)
    +        : to_case == PCRE2_SUBSTITUTE_CASE_UPPER
    +        ? u_strToUpper(output, output_cap, input, input_len, NULL, &err)
    +        : u_strToTitle(output, output_cap, input, input_len, &first_char_only,
    +                       NULL, &err);
    +      if (U_FAILURE(err)) return (~(PCRE2_SIZE)0);
    +      return r;
    +    }
    +
    +

    +

    +The first and second arguments of the case callout function are the Unicode +string to transform. +

    +

    +The third and fourth arguments are the output buffer and its capacity. +

    +

    +The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, +PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. +PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the +callout to indicate that the case of the entire callout input should be +case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that +only the first character or glyph should be transformed to Unicode titlecase +and the rest to Unicode lowercase (note that titlecasing sometimes uses Unicode +properties to titlecase each word in a string; but PCRE2 is requesting that only +the single leading character is to be titlecased). +

    +

    +The sixth argument is the callout_data supplied to +pcre2_set_substitute_case_callout(). +

    +

    +The resulting string in the destination buffer may be larger or smaller than the +input, if the casing rules merge or split characters. The return value is the +length required for the output string. If a buffer of sufficient size was +provided to the callout, then the result must be written to the buffer and the +number of code units returned. If the result does not fit in the provided +buffer, then the required capacity must be returned and PCRE2 will not make use +of the output buffer. PCRE2 provides input and output buffers which overlap, so +the callout must support this by suitable internal buffering. +

    +

    +Alternatively, if the callout wishes to indicate an error, then it may return +(~(PCRE2_SIZE)0). In this case pcre2_substitute() will immediately fail with +error PCRE2_ERROR_REPLACECASE. +

    +

    +When a case callout is combined with the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH +option, there are situations when pcre2_substitute() will return an +underestimate of the required buffer size. If you call pcre2_substitute() once +with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the input buffer is too small for +the replacement string to be constructed, then instead of calling the case +callout, pcre2_substitute() will make an estimate of the required buffer size. +The second call should also pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that +second call is not guaranteed to succeed either, if the case callout requires +more buffer space than expected. The caller must make repeated attempts in a +loop. +


    DUPLICATE CAPTURE GROUP NAMES

    int pcre2_substring_nametable_scan(const pcre2_code *code, @@ -4177,7 +4487,7 @@

    pcre2api man page


    REVISION

    -Last updated: 24 April 2024 +Last updated: 26 December 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2build.html b/mingw32/share/doc/pcre2/html/pcre2build.html index d4b0d336b08..f4e127f14ca 100644 --- a/mingw32/share/doc/pcre2/html/pcre2build.html +++ b/mingw32/share/doc/pcre2/html/pcre2build.html @@ -643,7 +643,7 @@

    pcre2build man page


    REVISION

    -Last updated: 15 April 2024 +Last updated: 16 April 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2compat.html b/mingw32/share/doc/pcre2/html/pcre2compat.html index d60182ed48a..5f7e280d34f 100644 --- a/mingw32/share/doc/pcre2/html/pcre2compat.html +++ b/mingw32/share/doc/pcre2/html/pcre2compat.html @@ -71,7 +71,7 @@

    pcre2compat man page

    7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties such as Lu and -Nd, the derived properties Any and LC (synonym L&), script names such as Greek +Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use is limited. See the @@ -99,7 +99,12 @@

    pcre2compat man page

    \Q\\E \ \\E The \Q...\E sequence is recognized both inside and outside character classes -by both PCRE2 and Perl. +by both PCRE2 and Perl. Another difference from Perl is that any appearance of +\Q or \E inside what might otherwise be a quantifier causes PCRE2 not to +recognize the sequence as a quantifier. Perl recognizes a quantifier if +(redundantly) either of the numbers is inside \Q...\E, but not if the +separating comma is. When not recognized as a quantifier a sequence such as +{\Q1\E,2} is treated as the literal string "{1,2}".

    9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) @@ -120,7 +125,9 @@

    pcre2compat man page

    not always the case in Perl. In particular, if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any | characters. Note that such groups are -processed as anchored at the point where they are tested. +processed as anchored at the point where they are tested. PCRE2 also confines +all control verbs within atomic assertions, again including (*THEN) in +assertions with only one branch.

    12. If a pattern contains more than one backtracking control verb, the first @@ -159,11 +166,11 @@

    pcre2compat man page

    certainly user mistakes.

    -17. In PCRE2, the upper/lower case character properties Lu and Ll are not -affected when case-independent matching is specified. For example, \p{Lu} -always matches an upper case letter. I think Perl has changed in this respect; -in the release at the time of writing (5.38), \p{Lu} and \p{Ll} match all -letters, regardless of case, when case independence is specified. +17. In PCRE2, until release 10.45, the upper/lower case character properties Lu +and Ll were not affected when case-independent matching was specified. Perl has +changed in this respect, and PCRE2 has now changed to match. When caseless +matching is in force, Lu, Ll, and Lt (title case) are all treated as Lc (cased +letter).

    18. From release 5.32.0, Perl locks out the use of \K in lookaround @@ -231,6 +238,10 @@

    pcre2compat man page

    numbers such as +2 and -4 in all three cases. Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions. +
    +
    +(m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 extension +that is not available in Perl.

    20. Perl has different limits than PCRE2. See the @@ -252,6 +263,18 @@

    pcre2compat man page

    /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject.

    +

    +23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl tries to +recover and prints a warning if the problem was that an invalid hexadecimal +digit was found; since PCRE2 doesn't have warnings, it returns an error instead. +Additionally, Perl accepts \x{} and generates NUL, unlike PCRE2. +

    +

    +24. From release 10.45, PCRE2 gives an error if \x is not followed by a +hexadecimal digit or a curly bracket. It used to interpret this as the NUL +character. Perl still generates NUL, but warns when in warning mode in most +cases. +


    AUTHOR
    @@ -267,9 +290,9 @@

    pcre2compat man page

    REVISION

    -Last updated: 30 November 2023 +Last updated: 02 October 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/mingw32/share/doc/pcre2/html/pcre2convert.html b/mingw32/share/doc/pcre2/html/pcre2convert.html index 6b9fea5575e..57e8989fb4a 100644 --- a/mingw32/share/doc/pcre2/html/pcre2convert.html +++ b/mingw32/share/doc/pcre2/html/pcre2convert.html @@ -182,7 +182,7 @@

    pcre2convert man page


    REVISION

    -Last updated: 28 June 2018 +Last updated: 14 November 2023
    Copyright © 1997-2018 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2grep.html b/mingw32/share/doc/pcre2/html/pcre2grep.html index bd12246ae99..66c56029698 100644 --- a/mingw32/share/doc/pcre2/html/pcre2grep.html +++ b/mingw32/share/doc/pcre2/html/pcre2grep.html @@ -391,9 +391,10 @@

    pcre2grep man page

    command line, no delimiters should be used. What constitutes a newline when reading the file is the operating system's default interpretation of \n. The --newline option has no effect on this option. Trailing white space is -removed from each line, and blank lines are ignored. An empty file contains no +removed from each line, and blank lines are ignored unless the +--posix-pattern-file option is also provided. An empty file contains no patterns and therefore matches nothing. Patterns read from a file in this way -may contain binary zeros, which are treated as ordinary data characters. +may contain binary zeros, which are treated as ordinary character literals.

    If this option is given more than once, all the specified files are read. A @@ -723,9 +724,9 @@

    pcre2grep man page



    $<digits> or ${<digits>} is replaced by the captured substring of the given -decimal number; zero substitutes the whole match. If the number is greater than -the number of capturing substrings, or if the capture is unset, the replacement -is empty. +decimal number; $& (or the legacy $0) substitutes the whole match. If the +number is greater than the number of capturing substrings, or if the capture +is unset, the replacement is empty.

    $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by @@ -808,6 +809,15 @@

    pcre2grep man page

    allowing \w to match Unicode letters and digits.

    +--posix-pattern-file +When patterns are provided with the -f option, do not trim trailing +spaces or ignore empty lines in a similar way to other grep tools. To keep +the behaviour consistent with older versions, if the pattern read was +terminated with CRLF (as character literals) then both characters won't be +included as part of it, so if you really need to have a pattern ending in '\r', +use an escape sequence or provide it by a different method. +

    +

    -q, --quiet Work quietly, that is, display nothing except error messages. The exit status indicates whether or not any matches were found. @@ -993,7 +1003,7 @@

    pcre2grep man page

    callout facility. However, this support can be completely or partially disabled when pcre2grep is built. You can find out whether your binary has support for callouts by running it with the --help option. If callout support is -completely disabled, all callouts in patterns are ignored by pcre2grep. +completely disabled, callouts in patterns are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored.

    @@ -1015,9 +1025,9 @@

    pcre2grep man page

    zero-terminated string, which means it should not contain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the --output (-O) option (see -above). However, $0 cannot be used to insert a matched substring because the -match is still in progress. Instead, the single character '0' is inserted. Any -syntax errors in the string (for example, a dollar not followed by another +above). However, $0 or $& cannot be used to insert a matched substring because +the match is still in progress. Instead, the single character '0' is inserted. +Any syntax errors in the string (for example, a dollar not followed by another character) causes the callout to be ignored. No terminator is added to the output string, so if you want a newline, you must include it explicitly using the escape $n. For example: @@ -1047,9 +1057,9 @@

    pcre2grep man page

    Any substring (including the executable name) may contain escape sequences started by a dollar character. These are the same as for the --output -(-O) option documented above, except that $0 cannot insert the matched -string because the match is still in progress. Instead, the character '0' -is inserted. If you need a literal dollar or pipe character in any +(-O) option documented above, except that $0 or $& cannot insert the +matched string because the match is still in progress. Instead, the character +'0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. Here is an example:
       echo -e "abcde\n12345" | pcre2grep \
    @@ -1116,7 +1126,7 @@ 

    pcre2grep man page


    REVISION

    -Last updated: 22 December 2023 +Last updated: 04 February 2025
    Copyright © 1997-2023 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2jit.html b/mingw32/share/doc/pcre2/html/pcre2jit.html index d97d8003ccb..6835cd8898a 100644 --- a/mingw32/share/doc/pcre2/html/pcre2jit.html +++ b/mingw32/share/doc/pcre2/html/pcre2jit.html @@ -64,7 +64,7 @@

    pcre2jit man page

    If --enable-jit is set on an unsupported platform, compilation fails.

    -A client program can tell if JIT support is available by calling +A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular @@ -72,11 +72,19 @@

    pcre2jit man page

    items that are not supported by JIT (see below). Another reason is that in some environments JIT is unable to get -memory in which to build its compiled code. The only guarantee from +executable memory in which to build its compiled code. The only guarantee from pcre2_config() is that if it returns zero, JIT will definitely not be used.

    +As of release 10.45 there is a more informative way to test for JIT support. If +pcre2_jit_compile() is called with the single option PCRE2_JIT_TEST_ALLOC +it returns zero if JIT is available and has a working allocator. Otherwise it +returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable +memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled. The +code argument is ignored, so it can be a NULL value. +

    +

    A simple program does not need to check availability in order to use JIT when possible. The API is implemented in a way that falls back to the interpretive code if JIT is not available or cannot be used for a given match. For programs @@ -126,7 +134,8 @@

    pcre2jit man page

    PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial matching. If pcre2_jit_compile() is called with no option bits set, it immediately -returns zero. This is an alternative way of testing whether JIT is available. +returns zero. This is an alternative way of testing whether JIT support has +been compiled.

    At present, it is not possible to free JIT compiled code except when the entire @@ -487,7 +496,7 @@

    pcre2jit man page


    REVISION

    -Last updated: 21 February 2024 +Last updated: 22 August 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2limits.html b/mingw32/share/doc/pcre2/html/pcre2limits.html index 8152ed22d71..514c50b2396 100644 --- a/mingw32/share/doc/pcre2/html/pcre2limits.html +++ b/mingw32/share/doc/pcre2/html/pcre2limits.html @@ -96,7 +96,7 @@

    pcre2limits man page

    REVISION

    -Last updated: August 2023 +Last updated: 16 August 2023
    Copyright © 1997-2023 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2matching.html b/mingw32/share/doc/pcre2/html/pcre2matching.html index 3b8b629380c..4d0232507b6 100644 --- a/mingw32/share/doc/pcre2/html/pcre2matching.html +++ b/mingw32/share/doc/pcre2/html/pcre2matching.html @@ -27,7 +27,7 @@

    pcre2matching man page

    This document describes the two different algorithms that are available in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() -function. This works in the same as Perl's matching function, and provide a +function. This works in the same way as Perl's matching function, and provides a Perl-compatible matching operation. The just-in-time (JIT) optimization that is described in the pcre2jit @@ -42,7 +42,7 @@

    pcre2matching man page

    When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, -when there are multiple possibilities. For example, if the pattern +when there are multiple possibilities. For example, if the anchored pattern

       ^<.*>
     
    @@ -115,9 +115,9 @@

    pcre2matching man page

    Note that the size of vector needed to contain all the results depends on the -number of simultaneous matches, not on the number of parentheses in the -pattern. Using pcre2_match_data_create_from_pattern() to create the match -data block is therefore not advisable when doing DFA matching. +number of simultaneous matches, not on the number of capturing parentheses in +the pattern. Using pcre2_match_data_create_from_pattern() to create the +match data block is therefore not advisable when doing DFA matching.

    Note also that all the matches that are found start at the same point in the @@ -166,37 +166,43 @@

    pcre2matching man page

    do this. This means that no captured substrings are available.

    -3. Because no substrings are captured, backreferences within the pattern are -not supported. -

    -

    -4. For the same reason, conditional expressions that use a backreference as the -condition or test for a specific group recursion are not supported. -

    -

    -5. Again for the same reason, script runs are not supported. +3. Because no substrings are captured, a number of related features are not +available: +
    +
    +(a) Backreferences; +
    +
    +(b) Conditional expressions that use a backreference as the condition or test +for a specific group recursion; +
    +
    +(c) Script runs; +
    +
    +(d) Scan substring assertions.

    -6. Because many paths through the tree may be active, the \K escape sequence, +4. Because many paths through the tree may be active, the \K escape sequence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported.

    -7. Callouts are supported, but the value of the capture_top field is +5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0.

    -8. The \C escape sequence, which (in the standard algorithm) always matches a -single code unit, even in a UTF mode, is not supported in these modes, because +6. The \C escape sequence, which (in the standard algorithm) always matches a +single code unit, even in a UTF mode, is not supported in UTF modes because the alternative algorithm moves through the subject string one character (not code unit) at a time, for all active paths through the tree.

    -9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not +7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion.

    -10. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not +8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not supported by pcre2_dfa_match().


    ADVANTAGES OF THE ALTERNATIVE ALGORITHM
    @@ -223,15 +229,18 @@

    pcre2matching man page

    less susceptible to optimization.

    -2. Capturing parentheses, backreferences, script runs, and matching within -invalid UTF string are not supported. +2. Capturing parentheses and other features such as backreferences that rely on +them are not supported. +

    +

    +3. Matching within invalid UTF strings is not supported.

    -3. Although atomic groups are supported, their use does not provide the +4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm.

    -4. JIT optimization is not supported. +5. JIT optimization is not supported.


    AUTHOR

    @@ -244,7 +253,7 @@

    pcre2matching man page


    REVISION

    -Last updated: 19 January 2024 +Last updated: 30 August 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2partial.html b/mingw32/share/doc/pcre2/html/pcre2partial.html index 64116c4f20f..067064d90a1 100644 --- a/mingw32/share/doc/pcre2/html/pcre2partial.html +++ b/mingw32/share/doc/pcre2/html/pcre2partial.html @@ -399,7 +399,7 @@

    pcre2partial man page


    REVISION

    -Last updated: 04 September 2019 +Last updated: 27 November 2024
    Copyright © 1997-2019 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2pattern.html b/mingw32/share/doc/pcre2/html/pcre2pattern.html index cf50c1a1095..84eb0aa17c5 100644 --- a/mingw32/share/doc/pcre2/html/pcre2pattern.html +++ b/mingw32/share/doc/pcre2/html/pcre2pattern.html @@ -14,37 +14,41 @@

    pcre2pattern man page



    PCRE2 REGULAR EXPRESSION DETAILS

    @@ -52,9 +56,11 @@

    pcre2pattern man page

    are described in detail below. There is a quick-reference syntax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. -PCRE2 also supports some alternative regular expression syntax (which does not -conflict with the Perl syntax) in order to provide some compatibility with -regular expressions in Python, .NET, and Oniguruma. +PCRE2 also supports some alternative regular expression syntax that does not +conflict with the Perl syntax in order to provide some compatibility with +regular expressions in Python, .NET, and Oniguruma. There are in addition some +options that enable alternative syntax and semantics that are not the same as +in Perl.

    Perl's regular expressions are described in its own documentation, and regular @@ -74,7 +80,19 @@

    pcre2pattern man page

    pcre2matching page.

    -
    SPECIAL START-OF-PATTERN ITEMS
    +
    EBCDIC CHARACTER CODES
    +

    +Most computers use ASCII or Unicode for encoding characters, and PCRE2 assumes +this by default. However, it can be compiled to run in an environment that uses +the EBCDIC code, which is the case for some IBM mainframe operating systems. In +the sections below, character code values are ASCII or Unicode; in an EBCDIC +environment these characters may have different code values, and there are no +code points greater than 255. Differences in behaviour when PCRE2 is running in +an EBCDIC environment are described in the section +"EBCDIC environments" +below, which you can ignore unless you really are in an EBCDIC environment. +

    +
    SPECIAL START-OF-PATTERN ITEMS

    A number of options that can be passed to pcre2_compile() can also be set by special items at the start of a pattern. These are not Perl-compatible, but @@ -141,7 +159,8 @@

    pcre2pattern man page


    If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting -the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making quantifiers +the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_optimize() with +a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from making quantifiers possessive when what follows cannot match the repeated item. For example, by default a+b is treated as a++b. For more details, see the pcre2api @@ -152,8 +171,9 @@

    pcre2pattern man page


    If a pattern starts with (*NO_START_OPT), it has the same effect as setting the -PCRE2_NO_START_OPTIMIZE option. This disables several optimizations for quickly -reaching "no match" results. For more details, see the +PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_optimize() with +a PCRE2_START_OPTIMIZE_OFF directive. This disables several optimizations for +quickly reaching "no match" results. For more details, see the pcre2api documentation.

    @@ -162,7 +182,8 @@

    pcre2pattern man page


    If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect as -setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimizations that +setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_optimize() +with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables optimizations that apply to patterns whose top-level branches all start with .* (match any number of arbitrary characters). For more details, see the pcre2api @@ -275,14 +296,6 @@

    pcre2pattern man page

    (*BSR_ANYCRLF). For completeness, (*BSR_UNICODE) is also recognized, corresponding to PCRE2_BSR_UNICODE.

    -
    EBCDIC CHARACTER CODES
    -

    -PCRE2 can be compiled to run in an environment that uses EBCDIC as its -character code instead of ASCII or Unicode (typically a mainframe system). In -the sections below, character code values are ASCII or Unicode; in an EBCDIC -environment these characters may have different code values, and there are no -code points greater than 255. -


    CHARACTERS AND METACHARACTERS

    A regular expression is a pattern that is matched against a subject string from @@ -298,7 +311,10 @@

    pcre2pattern man page

    equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to -pcre2_compile() or set by (?r) within the pattern). +pcre2_compile() or set by (*CASELESS_RESTRICT) or (?r) within the +pattern). If the PCRE2_EXTRA_TURKISH_CASING option is in force (either passed +to pcre2_compile() or set by (*TURKISH_CASING) within the pattern), then +the 'i' letters are matched according to Turkish and Azeri languages.

    The power of regular expressions comes from the ability to include wild cards, @@ -346,7 +362,7 @@

    pcre2pattern man page

    If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or -between a # outside a character class and the next newline, inclusive, are +between a # outside a character class and the next newline, inclusive, is ignored. An escaping backslash can be used to include a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are @@ -404,6 +420,14 @@

    pcre2pattern man page

    the pattern (that is, \E is assumed at the end). If the isolated \Q is inside a character class, this causes an error, because the character class is then not terminated by a closing square bracket. +

    +

    +Another difference from Perl is that any appearance of \Q or \E inside what +might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a +quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers +is inside \Q...\E, but not if the separating comma is. When not recognized as +a quantifier a sequence such as {\Q1\E,2} is treated as the literal string +"{1,2}".


    Non-printing characters @@ -424,17 +448,28 @@

    pcre2pattern man page

    \r carriage return (hex 0D) (but see below) \t tab (hex 09) \0dd character with octal code 0dd - \ddd character with octal code ddd, or backreference + \ddd character with octal code ddd, or back reference \o{ddd..} character with octal code ddd.. \xhh character with hex code hh \x{hhh..} character with hex code hhh.. \N{U+hhh..} character with Unicode hex code point hhh.. -By default, after \x that is not followed by {, from zero to two hexadecimal -digits are read (letters can be in upper or lower case). Any number of -hexadecimal digits may appear between \x{ and }. If a character other than a -hexadecimal digit appears between \x{ and }, or if there is no terminating }, -an error occurs. +A description of how back references work is given +later, +following the discussion of +parenthesized groups. +

    +

    +By default, after \x that is not followed by {, one or two hexadecimal +digits are read (letters can be in upper or lower case). If the character that +follows \x is neither { nor a hexadecimal digit, an error occurs. This is +different from Perl's default behaviour, which generates a NUL character, but +is in line with the behaviour of Perl's 'strict' mode in re. +

    +

    +Any number of hexadecimal digits may appear between \x{ and }. If a character +other than a hexadecimal digit appears between \x{ and }, or if there is no +terminating }, an error occurs.

    Characters whose code points are less than 256 can be defined by either of the @@ -481,69 +516,54 @@

    pcre2pattern man page

    a compile-time error occurs.

    -When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, -\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c -escape is processed as specified for Perl in the perlebcdic document. The -only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], -^, _, or ?. Any other character provokes a compile-time error. The sequence -\c@ encodes character code 0; after \c the letters (in either case) encode -characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 -(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +For differences in the way some escapes behave in EBCDIC environments, +see section +"EBCDIC environments" +below.

    +
    +Octal escapes and back references +

    -Thus, apart from \c?, these escapes generate the same character code values as -they do in an ASCII environment, though the meanings of the values mostly -differ. For example, \cG always generates code value 7, which is BEL in ASCII -but DEL in EBCDIC. +The escape \o must be followed by a sequence of octal digits, enclosed in +braces. An error occurs if this is not the case. This escape provides a way of +specifying character code points as octal numbers greater than 0777, and it +also allows octal numbers and backreferences to be unambiguously distinguished.

    -The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but -because 127 is not a control character in EBCDIC, Perl makes it generate the -APC character. Unfortunately, there are several variants of EBCDIC. In most of -them the APC character has the value 255 (hex FF), but in the one Perl calls -POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC -values, PCRE2 makes \c? generate 95; otherwise it generates 255. +If braces are not used, after \0 up to two further octal digits are read. +However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one more octal digit +must follow \0 (use \00 to generate a NUL character). Make sure you supply +two digits after the initial zero if the pattern character that follows is +itself an octal digit.

    -After \0 up to two further octal digits are read. If there are fewer than two -digits, just those that are present are used. Thus the sequence \0\x\015 -specifies two binary zeros followed by a CR character (code value 13). Make -sure you supply two digits after the initial zero if the pattern character that -follows is itself an octal digit. +Inside a character class, when a backslash is followed by any octal digit, up +to three octal digits are read to generate a code point. Any subsequent digits +stand for themselves. The sequences \8 and \9 are treated as the literal +characters "8" and "9".

    -The escape \o must be followed by a sequence of octal digits, enclosed in -braces. An error occurs if this is not the case. This escape is a recent -addition to Perl; it provides way of specifying character code points as octal -numbers greater than 0777, and it also allows octal numbers and backreferences -to be unambiguously specified. +Outside a character class, Perl's handling of a backslash followed by a digit +other than 0 is complicated by ambiguity, and Perl has changed over time, +causing PCRE2 also to change. From PCRE2 release 10.45 there is an option +called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use Python's unambiguous +rules. The next two subsections describe the two sets of rules.

    For greater clarity and unambiguity, it is best to avoid following \ by a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical -character code points, and \g{...} to specify backreferences. The following -paragraphs describe the old, ambiguous syntax. -

    -

    -The handling of a backslash followed by a digit other than 0 is complicated, -and Perl has changed over time, causing PCRE2 also to change. -

    -

    -Outside a character class, PCRE2 reads the digit and any following digits as a -decimal number. If the number is less than 10, begins with the digit 8 or 9, or -if there are at least that many previous capture groups in the expression, the -entire sequence is taken as a backreference. A description of how this -works is given -later, -following the discussion of -parenthesized groups. -Otherwise, up to three octal digits are read to form a character code. +character code points, and \g{...} to specify backreferences.

    +
    +Perl rules for non-class backslash 1-9 +

    -Inside a character class, PCRE2 handles \8 and \9 as the literal characters -"8" and "9", and otherwise reads up to three octal digits following the -backslash, using them to generate a data character. Any subsequent digits stand -for themselves. For example, outside a character class: +All the digits that follow the backslash are read as a decimal number. If the +number is less than 10, begins with the digit 8 or 9, or if there are at least +that many previous capture groups in the expression, the entire sequence is +taken as a back reference. Otherwise, up to three octal digits are read to form +a character code. For example:

       \040   is another way of writing an ASCII space
       \40    is the same, provided there are fewer than 40 previous capture groups
    @@ -560,6 +580,19 @@ 

    pcre2pattern man page

    digits are ever read.


    +Python rules for non-class backslash 1-9 +
    +

    +If there are at least three octal digits after the backslash, exactly three are +read as an octal code point number, but the value must be no greater than +\377, even in modes where higher code point values are supported. Any +subsequent digits stand for themselves. If there are fewer than three octal +digits, the sequence is taken as a decimal back reference. Thus, for example, +\12 is always a back reference, independent of how many captures there are in +the pattern. An error is generated for a reference to a non-existent capturing +group. +

    +
    Constraints on character values

    @@ -805,7 +838,7 @@

    pcre2pattern man page

    sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing characters whose code points are -less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points +less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Unknown script and with an unassigned type.

    @@ -823,12 +856,33 @@

    pcre2pattern man page

    \P{xx} a character without the xx property \X a Unicode extended grapheme cluster
    -The property names represented by xx above are not case-sensitive, and in -accordance with Unicode's "loose matching" rules, spaces, hyphens, and -underscores are ignored. There is support for Unicode script names, Unicode -general category properties, "Any", which matches any character (including -newline), Bidi_Class, a number of binary (yes/no) properties, and some special -PCRE2 properties (described +For compatibility with Perl, negation can be specified by including a +circumflex between the opening brace and the property. For example, \p{^Lu} is +the same as \P{Lu}. +

    +

    +In accordance with Unicode's "loose matching" rules, ASCII white space +characters, hyphens, and underscores are ignored in the properties represented +by xx above. As well as the space character, ASCII white space can be +tab, linefeed, vertical tab, formfeed, or carriage return. +

    +

    +Some properties are specified as a name only; others as a name and a value, +separated by a colon or an equals sign. The names and values consist of ASCII +letters and digits (with one Perl-specific exception, see below). They are not +case sensitive. Note, however, that the escapes themselves, \p and \P, +are case sensitive. There are abbreviations for many names. The following +examples are all equivalent: +

    +  \p{bidiclass=al}
    +  \p{BC=al}
    +  \p{ Bidi_Class : AL }
    +  \p{ Bi-di class = Al }
    +  \P{ ^ Bi-di class = Al }
    +
    +There is support for Unicode script names, Unicode general category properties, +"Any", which matches any character (including newline), Bidi_Class, a number of +binary (yes/no) properties, and some special PCRE2 properties (described below). Certain other Perl properties such as "InMusicalSymbols" are not supported by PCRE2. Note that \P{Any} does not match any characters, so always causes a @@ -844,10 +898,11 @@

    pcre2pattern man page

    example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and "script extensions" for the -property types are recognized, and a equals sign is an alternative to the -colon. If a script name is given without a property type, for example, -\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this -interpretation at release 5.26 and PCRE2 changed at release 10.40. +property types are recognized and, as for all property specifications, an +equals sign is an alternative to the colon. If a script name is given without a +property type, for example, \p{Adlam}, it is treated as \p{scx:Adlam}. Perl +changed to this interpretation at release 5.26 and PCRE2 changed at release +10.40.

    Unassigned characters (and in non-UTF 32-bit mode, characters with code points @@ -865,15 +920,10 @@

    pcre2pattern man page


    Each character has exactly one Unicode general category property, specified by -a two-letter abbreviation. For compatibility with Perl, negation can be -specified by including a circumflex between the opening brace and the property -name. For example, \p{^Lu} is the same as \P{Lu}. -

    -

    -If only one letter is specified with \p or \P, it includes all the general -category properties that start with that letter. In this case, in the absence -of negation, the curly brackets in the escape sequence are optional; these two -examples have the same effect: +a two-letter abbreviation. If only one letter is specified with \p or \P, it +includes all the general category properties that start with that letter. In +this case, in the absence of negation, the curly brackets in the escape +sequence are optional; these two examples have the same effect:

       \p{L}
       \pL
    @@ -888,6 +938,7 @@ 

    pcre2pattern man page

    Cs Surrogate L Letter + Lc Cased letter Ll Lower case letter Lm Modifier letter Lo Other letter @@ -924,9 +975,13 @@

    pcre2pattern man page

    Zp Paragraph separator Zs Space separator
    -The special property LC, which has the synonym L&, is also supported: it -matches a character that has the Lu, Ll, or Lt property, in other words, a -letter that is not classified as a modifier or "other". +Perl originally used the name L& for the Lc property. This is still supported +by Perl, but discouraged. PCRE2 also still supports it. This property matches +any character that has the Lu, Ll, or Lt property, in other words, any letter +that is not classified as a modifier or "other". From release 10.45 of PCRE2 +the properties Lu, Ll, and Lt are all treated as Lc when case-independent +matching is set by the PCRE2_CASELESS option or (?i) within the pattern. The +other properties are not affected by caseless matching.

    The Cs (Surrogate) property applies only to characters whose code points are in @@ -948,11 +1003,6 @@

    pcre2pattern man page

    Instead, this property is assumed for any code point that is not in the Unicode table.

    -

    -Specifying caseless matching does not affect these escape sequences. For -example, \p{Lu} always matches only upper case letters. This is different from -the behaviour of current versions of Perl. -


    Binary (yes/no) properties for \p and \P
    @@ -997,10 +1047,11 @@

    pcre2pattern man page

    RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space -An equals sign may be used instead of a colon. The class names are -case-insensitive; only the short names listed above are recognized. +As in all property specifications, an equals sign may be used instead of a +colon and the class names are case-insensitive. Only the short names listed +above are recognized; PCRE2 does not at present support any long alternatives.


    Extended grapheme clusters @@ -1073,11 +1124,11 @@

    pcre2pattern man page

    Xan matches characters that have either the L (letter) or the N (number) property. Xps matches the characters tab, linefeed, vertical tab, form feed, or -carriage return, and any other character that has the Z (separator) property. -Xsp is the same as Xps; in PCRE1 it used to exclude vertical tab, for Perl -compatibility, but Perl changed. Xwd matches the same characters as Xan, plus -those that match Mn (non-spacing mark) or Pc (connector punctuation, which -includes underscore). +carriage return, and any other character that has the Z (separator) property +(this includes the space character). Xsp is the same as Xps; in PCRE1 it used +to exclude vertical tab, for Perl compatibility, but Perl changed. Xwd matches +the same characters as Xan, plus those that match Mn (non-spacing mark) or Pc +(connector punctuation, which includes underscore).

    There is another non-standard property, Xuc, which matches any character that @@ -1389,13 +1440,12 @@

    pcre2pattern man page

    character, or escape it with a backslash.

    -For example, the character class [aeiou] matches any lower case vowel, while -[^aeiou] matches any character that is not a lower case vowel. Note that a -circumflex is just a convenient notation for specifying the characters that -are in the class by enumerating those that are not. A class that starts with a -circumflex is not an assertion; it still consumes a character from the subject -string, and therefore it fails if the current pointer is at the end of the -string. +For example, the character class [aeiou] matches any lower case English vowel, +whereas [^aeiou] matches all other characters. Note that a circumflex is just a +convenient notation for specifying the characters that are in the class by +enumerating those that are not. A class that starts with a circumflex is not an +assertion; it still consumes a character from the subject string, and therefore +it fails to match if the current pointer is at the end of the string.

    Characters in a class may be specified by their code points using \o, \x, or @@ -1405,7 +1455,10 @@

    pcre2pattern man page

    match "A", whereas a caseful version would. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) -respectively when either PCRE2_UTF or PCRE2_UCP is set. +respectively when either PCRE2_UTF or PCRE2_UCP is set. If you do not want +these ASCII/non-ASCII case equivalences, you can suppress them by setting +PCRE2_EXTRA_CASELESS_RESTRICT, either as an option in a compile context, or by +including (*CASELESS_RESTRICT) or (?r) within a pattern.

    Characters that might indicate line breaks are never treated in any special way @@ -1437,6 +1490,12 @@

    pcre2pattern man page

    b to d, a hyphen character, or z.

    +There is some special treatment for alphabetic ranges in EBCDIC environments; +see the section +"EBCDIC environments" +below. +

    +

    Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d or \H. However, unless the hyphen is the last character in the class, Perl outputs a @@ -1448,9 +1507,9 @@

    pcre2pattern man page

    range. A pattern such as [W-]46] is interpreted as a class of two characters ("W" and "-") followed by a literal string "46]", so it would match "W46]" or "-46]". However, if the "]" is escaped with a backslash it is interpreted as -the end of range, so [W-\]46] is interpreted as a class containing a range -followed by two other characters. The octal or hexadecimal representation of -"]" can also be used to end a range. +the end of a range, so [W-\]46] is interpreted as a class containing a range +and two other characters. The octal or hexadecimal representation of "]" can +also be used to end a range.

    Ranges normally include all code points between the start and end characters, @@ -1463,15 +1522,6 @@

    pcre2pattern man page

    surrogates, are always permitted.

    -There is a special case in EBCDIC environments for ranges whose end points are -both specified as literal letters in the same case. For compatibility with -Perl, EBCDIC code points within the range that are not letters are omitted. For -example, [h-k] matches only four characters, even though the codes for h and k -are 0x88 and 0x92, a range of 11 code points. However, if the range is -specified numerically, for example, [\x88-\x92] or [h-\x92], all code points -are included. -

    -

    If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character @@ -1487,18 +1537,132 @@

    pcre2pattern man page

    something AND NOT ...".

    -The only metacharacters that are recognized in character classes are backslash, -hyphen (only where it can be interpreted as specifying a range), circumflex -(only at the start), opening square bracket (only when it can be interpreted as -introducing a POSIX class name, or for a special compatibility feature - see -the next two sections), and the terminating closing square bracket. However, -escaping other non-alphanumeric characters does no harm. +The metacharacters that are recognized in character classes are backslash, +hyphen (when it can be interpreted as specifying a range), circumflex +(only at the start), and the terminating closing square bracket. An opening +square bracket is also special when it can be interpreted as introducing a +POSIX class (see +"POSIX character classes" +below), or a special compatibility feature (see +"Compatibility feature for word boundaries" +below). Escaping any non-alphanumeric character in a class turns it into a +literal, whether or not it would otherwise be a metacharacter. +

    +
    PERL EXTENDED CHARACTER CLASSES
    +

    +From release 10.45 PCRE2 supports Perl's (?[...]) extended character class +syntax. This can be used to perform set operations such as intersection on +character classes. +

    +

    +The syntax permitted within (?[...]) is quite different to ordinary character +classes. Inside the extended class, there is an expression syntax consisting of +"atoms", operators, and ordinary parentheses "()" used for grouping. Such +classes always have the Perl /xx modifier (PCRE2 option PCRE2_EXTENDED_MORE) +turned on within them. This means that literal space and tab characters are +ignored everywhere in the class. +

    +

    +The allowed atoms are individual characters specified by escape sequences such +as \n or \x{123}, character types such as \d, POSIX classes such as +[:alpha:], and nested ordinary (non-extended) character classes. For example, +in (?[\d & [...]]) the nested class [...] follows the usual rules for ordinary +character classes, in which parentheses are not metacharacters, and character +literals and ranges are permitted. +

    +

    +Character literals and ranges may not appear outside a nested ordinary +character class because they are not atoms in the extended syntax. The extended +syntax does not introduce any additional escape sequences, so (?[\y]) is an +unknown escape, as it would be in [\y]. +

    +

    +In the extended syntax, ^ does not negate a class (except within an +ordinary class nested inside an extended class); it is instead a binary +operator. +

    +

    +The binary operators are "&" (intersection), "|" or "+" (union), "-" +(subtraction) and "^" (symmetric difference). These are left-associative and +"&" has higher (tighter) precedence, while the others have equal lower +precedence. The one prefix unary operator is "!" (complement), with highest +precedence. +

    +
    UTS#18 EXTENDED CHARACTER CLASSES
    +

    +The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's (?[...]) +syntax, allowing instead extended class behaviour inside ordinary [...] +character classes. This altered syntax for [...] classes is loosely described +by the Unicode standard UTS#18. The PCRE2_ALT_EXTENDED_CLASS option does not +prevent use of (?[...]) classes; it just changes the meaning of all +[...] classes that are not nested inside a Perl (?[...]) class. +

    +

    +Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is a +character class with two literal characters "a" and "[", but in UTS#18 extended +classes the "[" character becomes an additional metacharacter within classes, +denoting the start of a nested class, so a literal "[" must be escaped as "\[". +

    +

    +Secondly, within the UTS#18 extended syntax, there are operators "||", "&&", +"--" and "~~" which denote character class union, intersection, subtraction, +and symmetric difference respectively. In standard Perl syntax, these would +simply be needlessly-repeated literals (except for "--" which could be the +start or end of a range). In UTS#18 extended classes these operators can be used +in constructs such as [\p{L}--[QW]] for "Unicode letters, other than Q and W". +A literal "-" at the start or end of a range must be escaped, so while "[--1]" +in Perl syntax is the range from hyphen to "1", it must be escaped as "[\--1]" +in UTS#18 extended classes. +

    +

    +Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option to +ignore space and tab characters is not automatically enabled for UTS#18 +extended classes, but it is honoured if set. +

    +

    +Extended UTS#18 classes can be nested, and nested classes are themselves +extended classes (unlike Perl, where nested classes must be simple classes). +For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any letter that is in +the Thai or Greek scripts. Note that this means that no special grouping +characters (such as the parentheses used in Perl's (?[...]) class syntax) are +needed. +

    +

    +Individual class items (literal characters, literal ranges, properties such as +\d or \p{...}, and nested classes) can be combined by juxtaposition or by an +operator. Juxtaposition is the implicit union operator, and binds more tightly +than any explicit operator. Thus a sequence of literals and/or ranges behaves +as if it is enclosed in square brackets. For example, [A-Z0-9&&[^E8]] is the +same as [[A-Z0-9]&&[^E8]], which matches any upper case alphanumeric character +except "E" or "8". +

    +

    +Precedence between the explicit operators is not defined, so mixing operators +is a syntax error. For example, [A&&B--C] is an error, but [A&&[B--C]] is +valid.

    -
    POSIX CHARACTER CLASSES
    +

    +This is an emerging syntax which is being adopted gradually across the regex +ecosystem: for example JavaScript adopted the "/v" flag in ECMAScript 2024; +Python's "re" module reserves the syntax for future use with a FutureWarning +for unescaped use of "[" as a literal within character classes. Due to UTS#18 +providing insufficient guidance, engines interpret the syntax differently. +Rust's "regex" crate and Python's "regex" PyPI module both implement UTS#18 +extended classes, but with slight incompatibilities ([A||B&&C] is parsed as +[A||[B&&C]] in Python's "regex" but as [[A||B]&&C] in Rust's "regex"). +

    +

    +PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v flag, so +that all the UTS#18 extended classes accepted as valid by PCRE2 have the +property that they are interpreted either with the same behaviour, or as +invalid, by all other major engines. Please file an issue if you are aware of +cross-engine differences in behaviour between PCRE2 and another major engine. +

    +
    POSIX CHARACTER CLASSES

    Perl supports the POSIX notation for character classes. This uses names enclosed by [: and :] within the enclosing square brackets. PCRE2 also supports -this notation. For example, +this notation, in both ordinary and extended classes. For example,

       [01[:alpha:]%]
     
    @@ -1584,7 +1748,7 @@

    pcre2pattern man page

    [:xdigit:] In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This -is a change that was made in PCRE release 10.43 for Perl compatibility. +is a change that was made in PCRE2 release 10.43 for Perl compatibility.

    The other POSIX classes are unchanged by PCRE2_UCP, and match only characters @@ -1597,8 +1761,8 @@

    pcre2pattern man page

    (?aT) and (?-aT). The PCRE2_EXTRA_ASCII_POSIX option disables UCP processing for all POSIX classes, including [:digit:] and [:xdigit:]. Within a pattern, (?aP) and (?-aP) set and unset both these options for consistency. -

    -
    COMPATIBILITY FEATURE FOR WORD BOUNDARIES
    +

    +
    COMPATIBILITY FEATURE FOR WORD BOUNDARIES

    In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of @@ -1619,7 +1783,7 @@

    pcre2pattern man page

    PCRE2_UCP option changes the meaning of \w (and therefore \b) by default, so it also affects these POSIX sequences.

    -
    VERTICAL BAR
    +
    VERTICAL BAR

    Vertical bar characters are used to separate alternative patterns. For example, the pattern @@ -1634,7 +1798,7 @@

    pcre2pattern man page

    "succeeds" means matching the rest of the main pattern as well as the alternative in the group.

    -
    INTERNAL OPTION SETTING
    +
    INTERNAL OPTION SETTING

    The settings of several options can be changed within a pattern by a sequence of letters enclosed between "(?" and ")". The following are Perl-compatible, @@ -1732,7 +1896,7 @@

    pcre2pattern man page

    the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, which lock out the use of the (*UTF) and (*UCP) sequences.

    -
    GROUPS
    +
    GROUPS

    Groups are delimited by parentheses (round brackets), which can be nested. Turning part of a pattern into a group does two things: @@ -1788,7 +1952,7 @@

    pcre2pattern man page

    reached, an option setting in one branch does affect subsequent branches, so the above patterns match "SUNDAY" as well as "Saturday".

    -
    DUPLICATE GROUP NUMBERS
    +
    DUPLICATE GROUP NUMBERS

    Perl 5.10 introduced a feature whereby each alternative in a group uses the same numbers for its capturing parentheses. Such a group starts with (?| and is @@ -1834,7 +1998,7 @@

    pcre2pattern man page

    An alternative approach to using this "branch reset" feature is to use duplicate named groups, as described in the next section.

    -
    NAMED CAPTURE GROUPS
    +
    NAMED CAPTURE GROUPS

    Identifying capture groups by number is simple, but it can be very hard to keep track of the numbers in complicated patterns. Furthermore, if an expression is @@ -1954,7 +2118,7 @@

    pcre2pattern man page

    pcre2api documentation.

    -
    REPETITION
    +
    REPETITION

    Repetition is specified by quantifiers, which may follow any one of these items: @@ -2118,8 +2282,9 @@

    pcre2pattern man page

    (?>.*?a)b It matches "ab" in the subject "aab". The use of the backtracking control verbs -(*PRUNE) and (*SKIP) also disable this optimization, and there is an option, -PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. +(*PRUNE) and (*SKIP) also disable this optimization. To do so explicitly, +either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, or call +pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive.

    When a capture group is repeated, the value captured is the substring that @@ -2135,7 +2300,7 @@

    pcre2pattern man page

    matches "aba" the value of the second captured substring is "b".

    -
    ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
    +
    ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS

    With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") repetition, failure of what follows normally causes the repeated item to be @@ -2216,8 +2381,9 @@

    pcre2pattern man page

    PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. -This feature can be disabled by the PCRE2_NO_AUTOPOSSESS option, or starting -the pattern with (*NO_AUTO_POSSESS). +This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, by calling +pcre2_set_optimize() with a PCRE2_AUTO_POSSESS_OFF directive, or by +starting the pattern with (*NO_AUTO_POSSESS).

    When a pattern contains an unlimited repeat inside a group that can itself be @@ -2245,7 +2411,7 @@

    pcre2pattern man page

    sequences of non-digits cannot be broken, and failure happens quickly.

    -
    BACKREFERENCES
    +
    BACKREFERENCES

    Outside a character class, a backslash followed by a digit greater than 0 (and possibly further digits) is a backreference to a capture group earlier (that @@ -2383,23 +2549,32 @@

    pcre2pattern man page

    This restriction no longer applies, and backtracking into such groups can occur as normal.

    -
    ASSERTIONS
    +
    ASSERTIONS

    -An assertion is a test on the characters following or preceding the current -matching point that does not consume any characters. The simple assertions -coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described +An assertion is a test that does not consume any characters. The test must +succeed for the match to continue. The simple assertions coded as \b, \B, +\A, \G, \Z, \z, ^ and $ are described above.

    -More complicated assertions are coded as parenthesized groups. There are two -kinds: those that look ahead of the current position in the subject string, and -those that look behind it, and in each case an assertion may be positive (must -match for the assertion to be true) or negative (must not match for the -assertion to be true). An assertion group is matched in the normal way, -and if it is true, matching continues after it, but with the matching position +More complicated assertions are coded as parenthesized groups. If matching such +a group succeeds, matching continues after it, but with the matching position in the subject string reset to what it was before the assertion was processed.

    +A special kind of assertion, called a "scan substring" assertion, matches a +subpattern against a previously captured substring. This is described in the +section entitled +"Scan substring assertions" +below. It is a PCRE2 extension, not compatible with Perl. +

    +

    +The other group-based assertions are of two kinds: those that look ahead of the +current position in the subject string, and those that look behind it, and in +each case an assertion may be positive (must match for the assertion to be +true) or negative (must not match for the assertion to be true). +

    +

    The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no backtracking into the assertion. However, there are some cases where non-atomic assertions can be @@ -2624,7 +2799,7 @@

    pcre2pattern man page

    is another pattern that matches "foo" preceded by three digits and any three characters that are not "999".

    -
    NON-ATOMIC ASSERTIONS
    +
    NON-ATOMIC ASSERTIONS

    Traditional lookaround assertions are atomic. That is, if an assertion is true, but there is a subsequent matching failure, there is no backtracking into the @@ -2683,8 +2858,67 @@

    pcre2pattern man page

    that assertions that appear as conditions for conditional groups (see below) must be atomic. +

    +
    SCAN SUBSTRING ASSERTIONS
    +

    +A special kind of assertion, not compatible with Perl, makes it possible to +check the contents of a captured substring by matching it with a subpattern. +Because this involves capturing, this feature is not supported by +pcre2_dfa_match(). +

    +

    +A scan substring assertion starts with the sequence (*scan_substring: or +(*scs: which is followed by a list of substring numbers (absolute or relative) +and/or substring names enclosed in single quotes or angle brackets, all within +parentheses. The rest of the item is the subpattern that is applied to the +substring, as shown in these examples: +

    +  (*scan_substring:(1)...)
    +  (*scs:(-2)...)
    +  (*scs:('AB')...)
    +  (*scs:(1,'AB',-2)...)
    +
    +The list of groups is checked in the order they are given, and it is the +contents of the first one that is found to be set that are scanned. When +PCRE2_DUPNAMES is set and there are ambiguous group names, all groups with the +same name are checked in numerical order. A scan substring assertion fails if +none of the groups it references have been set.

    -
    SCRIPT RUNS
    +

    +The pattern match on the substring is always anchored, that is, it must match +from the start of the substring. There is no "bumpalong" if it does not match +at the start. The end of the subject is temporarily reset to be the end of the +substring, so \Z, \z, and $ will match there. However, the start of the +subject is not reset. This means that ^ matches only if the substring is +actually at the start of the main subject, but it also means that lookbehind +assertions into what precedes the substring are possible. +

    +

    +Here is a very simple example: find a word that contains the rare (in English) +sequence of letters "rh" not at the start: +

    +  \b(\w++)(*scs:(1).+rh)
    +
    +The first group captures a word which is then scanned by the second group. +This example does not actually need this heavyweight feature; the same match +can be achieved with: +
    +  \b\w+?rh\w*\b
    +
    +When things are more complicated, however, scanning a captured substring can be +a useful way to describe the required match. For example, there is a rather +complicated pattern in the PCRE2 test data that checks an entire subject string +for a palindrome, that is, the sequence of letters is the same in both +directions. Suppose you want to search for individual words of two or more +characters such as "level" that are palindromes: +
    +  (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...)
    +
    +Within a substring scanning subpattern, references to other groups work as +normal. Capturing groups may appear, and will retain their values during +ongoing matching if the assertion succeeds. +

    +
    SCRIPT RUNS

    In concept, a script run is a sequence of characters that are all from the same Unicode script such as Latin or Greek. However, because some scripts are @@ -2746,7 +2980,7 @@

    pcre2pattern man page

    should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking.

    -
    CONDITIONAL GROUPS
    +
    CONDITIONAL GROUPS

    It is possible to cause the matching process to obey a pattern fragment conditionally or to choose between two alternative fragments, depending on @@ -2947,13 +3181,13 @@

    pcre2pattern man page

    assertion, whether it succeeds or fails. (Compare non-conditional assertions, for which captures are retained only for positive assertions that succeed.)

    -
    COMMENTS
    +
    COMMENTS

    There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related characters such as -(?: or a group name or number. The characters that make up a comment play -no part in the pattern matching. +(?: or a group name or number or a Unicode property name. The characters that +make up a comment play no part in the pattern matching.

    The sequence (?# marks the start of a comment that continues up to the next @@ -2977,7 +3211,7 @@

    pcre2pattern man page

    it does not terminate the comment. Only an actual character with the code value 0x0a (the default newline) does so.

    -
    RECURSIVE PATTERNS
    +
    RECURSIVE PATTERNS

    Consider the problem of matching a string in parentheses, allowing for unlimited nested parentheses. Without the use of recursion, the best that can @@ -3165,7 +3399,7 @@

    pcre2pattern man page

    "b" and so the whole match succeeds. This match used to fail in Perl, but in later versions (I tried 5.024) it now works.

    -
    GROUPS AS SUBROUTINES
    +
    GROUPS AS SUBROUTINES

    If the syntax for a recursive group call (either by number or by name) is used outside the parentheses to which it refers, it operates a bit like a subroutine @@ -3213,7 +3447,7 @@

    pcre2pattern man page

    "Backtracking verbs in subroutines" below.

    -
    ONIGURUMA SUBROUTINE SYNTAX
    +
    ONIGURUMA SUBROUTINE SYNTAX

    For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative @@ -3231,7 +3465,7 @@

    pcre2pattern man page

    Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not synonymous. The former is a backreference; the latter is a subroutine call.

    -
    CALLOUTS
    +
    CALLOUTS

    Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl code to be obeyed in the middle of matching a regular expression. This makes it @@ -3244,7 +3478,9 @@

    pcre2pattern man page

    function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). If no match context is passed, or if the callout -entry point is set to NULL, callouts are disabled. +entry point is set to NULL, callout points will be passed over silently during +matching. To disallow callouts in the pattern syntax, you may use the +PCRE2_EXTRA_NEVER_CALLOUT option.

    Within a regular expression, (?C<arg>) indicates a point at which the external @@ -3307,7 +3543,7 @@

    pcre2pattern man page

    The doubling is removed before the string is passed to the callout function.

    -
    BACKTRACKING CONTROL
    +
    BACKTRACKING CONTROL

    There are a number of special "Backtracking Control Verbs" (to use Perl's terminology) that modify the behaviour of backtracking during matching. They @@ -3347,8 +3583,8 @@

    pcre2pattern man page

    Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the traditional matching -function, because that uses a backtracking algorithm. With the exception of -(*FAIL), which behaves like a failing negative assertion, the backtracking +function or JIT, because they use backtracking algorithms. With the exception +of (*FAIL), which behaves like a failing negative assertion, the backtracking control verbs cause an error if encountered by the DFA matching function.

    @@ -3369,7 +3605,8 @@

    pcre2pattern man page

    present. When one of these optimizations bypasses the running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option -when calling pcre2_compile(), or by starting the pattern with +when calling pcre2_compile(), by calling pcre2_set_optimize() with a +PCRE2_START_OPTIMIZE_OFF directive, or by starting the pattern with (*NO_START_OPT). There is more discussion of this option in the section entitled "Compiling a pattern" @@ -3502,7 +3739,8 @@

    pcre2pattern man page

    If you are interested in (*MARK) values after failed matches, you should -probably set the PCRE2_NO_START_OPTIMIZE option +probably either set the PCRE2_NO_START_OPTIMIZE option or call +pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see above) to ensure that the match is always attempted.

    @@ -3514,9 +3752,9 @@

    pcre2pattern man page

    with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, backtracking cannot pass to the left of the verb. However, when one of these verbs appears inside an -atomic group or in a lookaround assertion that is true, its effect is confined -to that group, because once the group has been matched, there is never any -backtracking into it. Backtracking from beyond an assertion or an atomic group +atomic group or in an atomic lookaround assertion that is true, its effect is +confined to that group, because once the group has been matched, there is never +any backtracking into it. Backtracking from beyond an atomic assertion or group ignores the entire group, and seeks a preceding backtracking point.

    @@ -3782,9 +4020,11 @@

    pcre2pattern man page

    assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.

    -PCRE2 now supports non-atomic positive assertions, as described in the section -entitled +PCRE2 now supports non-atomic positive assertions and also "scan substring" +assertions, as described in the sections entitled "Non-atomic assertions" +and +"Scan substring assertions" above. These assertions must be standalone (not used as conditions). They are not Perl-compatible. For these assertions, a later backtrack does jump back into the assertion, and therefore verbs such as (*COMMIT) can be triggered by @@ -3793,7 +4033,8 @@

    pcre2pattern man page

    The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion to be false, -and a negative assertion to be true. +and a negative assertion to be true. This behaviour differs from Perl when the +assertion has only one branch.

    The other backtracking verbs are not treated specially if they appear in a @@ -3829,13 +4070,57 @@

    pcre2pattern man page

    enclosing group that has alternatives (its normal behaviour). However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. +

    +
    EBCDIC ENVIRONMENTS
    +

    +Differences in the way PCRE2 behaves when it is running in an EBCDIC environment +are covered in this section. +

    +
    +Escape sequences +
    +

    +When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, +\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c +escape is processed as specified for Perl in the perlebcdic document. The +only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], +^, _, or ?. Any other character provokes a compile-time error. The sequence +\c@ encodes character code 0; after \c the letters (in either case) encode +characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 +(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +

    +

    +Thus, apart from \c?, these escapes generate the same character code values as +they do in an ASCII or Unicode environment, though the meanings of the values +mostly differ. For example, \cG always generates code value 7, which is BEL in +ASCII but DEL in EBCDIC. +

    +

    +The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but +because 127 is not a control character in EBCDIC, Perl makes it generate the +APC character. Unfortunately, there are several variants of EBCDIC. In most of +them the APC character has the value 255 (hex FF), but in the one Perl calls +POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC +values, PCRE2 makes \c? generate 95; otherwise it generates 255. +

    +
    +Character classes +
    +

    +In character classes there is a special case in EBCDIC environments for ranges +whose end points are both specified as literal letters in the same case. For +compatibility with Perl, EBCDIC code points within the range that are not +letters are omitted. For example, [h-k] matches only four characters, even +though the EBCDIC codes for h and k are 0x88 and 0x92, a range of 11 code +points. However, if the range is specified numerically, for example, +[\x88-\x92] or [h-\x92], all code points are included.

    -
    SEE ALSO
    +
    SEE ALSO

    pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3).

    -
    AUTHOR
    +
    AUTHOR

    Philip Hazel
    @@ -3844,9 +4129,9 @@

    pcre2pattern man page

    Cambridge, England.

    -
    REVISION
    +
    REVISION

    -Last updated: 04 June 2024 +Last updated: 27 November 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2perform.html b/mingw32/share/doc/pcre2/html/pcre2perform.html index 55fdf202fc4..b595119ba88 100644 --- a/mingw32/share/doc/pcre2/html/pcre2perform.html +++ b/mingw32/share/doc/pcre2/html/pcre2perform.html @@ -271,7 +271,7 @@

    pcre2perform man page


    REVISION

    -Last updated: 27 July 2022 +Last updated: 06 December 2022
    Copyright © 1997-2022 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2posix.html b/mingw32/share/doc/pcre2/html/pcre2posix.html index 6e7abd932ab..bc60c3b798c 100644 --- a/mingw32/share/doc/pcre2/html/pcre2posix.html +++ b/mingw32/share/doc/pcre2/html/pcre2posix.html @@ -171,7 +171,7 @@

    pcre2posix man page

    When a pattern that is compiled with this flag is passed to pcre2_regexec() for matching, the nmatch and pmatch arguments -are ignored, and no captured strings are returned. Versions of the PCRE library +are ignored, and no captured strings are returned. Versions of the PCRE2 library prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens because it disables the use of backreferences.
    @@ -370,7 +370,7 @@ 

    pcre2posix man page


    REVISION

    -Last updated: 19 January 2024 +Last updated: 27 November 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2sample.html b/mingw32/share/doc/pcre2/html/pcre2sample.html index 345df031131..0903f04f99b 100644 --- a/mingw32/share/doc/pcre2/html/pcre2sample.html +++ b/mingw32/share/doc/pcre2/html/pcre2sample.html @@ -101,7 +101,7 @@

    pcre2sample man page

    REVISION

    -Last updated: 02 February 2016 +Last updated: 14 November 2023
    Copyright © 1997-2016 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2serialize.html b/mingw32/share/doc/pcre2/html/pcre2serialize.html index 19418a83b21..d189bde2b63 100644 --- a/mingw32/share/doc/pcre2/html/pcre2serialize.html +++ b/mingw32/share/doc/pcre2/html/pcre2serialize.html @@ -203,7 +203,7 @@

    pcre2serialize man page


    REVISION

    -Last updated: 27 June 2018 +Last updated: 19 January 2024
    Copyright © 1997-2018 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2syntax.html b/mingw32/share/doc/pcre2/html/pcre2syntax.html index 1c0ccb003e2..46da3d71fcc 100644 --- a/mingw32/share/doc/pcre2/html/pcre2syntax.html +++ b/mingw32/share/doc/pcre2/html/pcre2syntax.html @@ -24,34 +24,41 @@

    pcre2syntax man page

  • SCRIPT MATCHING WITH \p AND \P
  • THE BIDI_CLASS PROPERTY FOR \p AND \P
  • CHARACTER CLASSES -
  • QUANTIFIERS -
  • ANCHORS AND SIMPLE ASSERTIONS -
  • REPORTED MATCH POINT SETTING -
  • ALTERNATION -
  • CAPTURING -
  • ATOMIC GROUPS -
  • COMMENT -
  • OPTION SETTING -
  • NEWLINE CONVENTION -
  • WHAT \R MATCHES -
  • LOOKAHEAD AND LOOKBEHIND ASSERTIONS -
  • NON-ATOMIC LOOKAROUND ASSERTIONS -
  • SCRIPT RUNS -
  • BACKREFERENCES -
  • SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) -
  • CONDITIONAL PATTERNS -
  • BACKTRACKING CONTROL -
  • CALLOUTS -
  • SEE ALSO -
  • AUTHOR -
  • REVISION +
  • PERL EXTENDED CHARACTER CLASSES +
  • QUANTIFIERS +
  • ANCHORS AND SIMPLE ASSERTIONS +
  • REPORTED MATCH POINT SETTING +
  • ALTERNATION +
  • CAPTURING +
  • ATOMIC GROUPS +
  • COMMENT +
  • OPTION SETTING +
  • NEWLINE CONVENTION +
  • WHAT \R MATCHES +
  • LOOKAHEAD AND LOOKBEHIND ASSERTIONS +
  • NON-ATOMIC LOOKAROUND ASSERTIONS +
  • SUBSTRING SCAN ASSERTION +
  • SCRIPT RUNS +
  • BACKREFERENCES +
  • SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) +
  • CONDITIONAL PATTERNS +
  • BACKTRACKING CONTROL +
  • CALLOUTS +
  • REPLACEMENT STRINGS +
  • SEE ALSO +
  • AUTHOR +
  • REVISION
    PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY

    -The full syntax and semantics of the regular expressions that are supported by -PCRE2 are described in the +The full syntax and semantics of the regular expression patterns that are +supported by PCRE2 are described in the pcre2pattern -documentation. This document contains a quick-reference summary of the syntax. +documentation. This document contains a quick-reference summary of the pattern +syntax followed by the syntax of replacement strings in the substitution function. +The full description of the latter is in the +pcre2api +documentation.


    QUOTING

    @@ -60,7 +67,10 @@

    pcre2syntax man page

    \Q...\E treat enclosed characters as literal
  • Note that white space inside \Q...\E is always treated as literal, even if -PCRE2_EXTENDED is set, causing most other white space to be ignored. +PCRE2_EXTENDED is set, causing most other white space to be ignored. Note also +that PCRE2's handling of \Q...\E has some differences from Perl's. See the +pcre2pattern +documentation for details.


    BRACED ITEMS

    @@ -91,6 +101,11 @@

    pcre2syntax man page

    \xhh character with hex code hh \x{hh..} character with hex code hh.. +\N{U+hh..} is synonymous with \x{hh..} but is not supported in environments +that use EBCDIC code (mainly IBM mainframes). Note that \N not followed by an +opening curly bracket has a different meaning (see below). +

    +

    If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized:

    @@ -98,7 +113,7 @@ 

    pcre2syntax man page

    \uhhhh character with hex code hhhh \u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX
    -When \x is not followed by {, from zero to two hexadecimal digits are read, +When \x is not followed by {, one or two hexadecimal digits are read, but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise it matches a literal "x". Likewise, if \u (in ALT_BSUX mode) is not followed by four hexadecimal digits @@ -112,9 +127,7 @@

    pcre2syntax man page

    in the pcre2pattern documentation, where details of escape processing in EBCDIC environments are -also given. \N{U+hh..} is synonymous with \x{hh..} in PCRE2 but is not -supported in EBCDIC environments. Note that \N not followed by an opening -curly bracket has a different meaning (see below). +also given.


    CHARACTER TYPES

    @@ -154,8 +167,9 @@

    pcre2syntax man page

    Property descriptions in \p and \P are matched caselessly; hyphens, -underscores, and white space are ignored, in accordance with Unicode's "loose -matching" rules. +underscores, and ASCII white space characters are ignored, in accordance with +Unicode's "loose matching" rules. For example, \p{Bidi_Class=al} is the same +as \p{ bidi class = AL }.


    GENERAL CATEGORY PROPERTIES FOR \p and \P

    @@ -168,13 +182,13 @@

    pcre2syntax man page

    Cs Surrogate L Letter + Lc Cased letter, the union of Ll, Lu, and Lt + L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter - Lc Ll, Lu, or Lt - L& Ll, Lu, or Lt M Mark Mc Spacing mark @@ -205,7 +219,9 @@

    pcre2syntax man page

    Zl Line separator Zp Paragraph separator Zs Space separator - + +From release 10.45, when caseless matching is set, Ll, Lu, and Lt are all +equivalent to Lc.


    PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P

    @@ -268,7 +284,7 @@

    pcre2syntax man page

    RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space


    CHARACTER CLASSES
    @@ -299,7 +315,45 @@

    pcre2syntax man page

    but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class.

    -
    QUANTIFIERS
    +

    +When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes may be +used, allowing nested character classes, combined using set operators. +

    +  [x&&[^y]]   UTS#18 extended character class
    +
    +  x||y        set union (OR)
    +  x&&y        set intersection (AND)
    +  x--y        set difference (AND NOT)
    +  x~~y        set symmetric difference (XOR)
    +
    +
    +

    +
    PERL EXTENDED CHARACTER CLASSES
    +

    +

    +  (?[...])                Perl extended character class
    +  (?[\p{Thai} & \p{Nd}])  operators; whitespace ignored
    +  (?[(x - y) & z])        parentheses for grouping
    +
    +  (?[ [^3] & \p{Nd} ])    [...] is a nested ordinary class
    +  (?[ [:alpha:] - [z] ])  POSIX set is allowed outside [...]
    +  (?[ \d - [3] ])         backslash-escaped set is allowed outside [...]
    +  (?[ !\n & [:ascii:] ])  backslash-escaped character is allowed outside [...]
    +                      all other characters or ranges must be enclosed in [...]
    +
    +  x|y, x+y                set union (OR)
    +  x&y                     set intersection (AND)
    +  x-y                     set difference (AND NOT)
    +  x^y                     set symmetric difference (XOR)
    +  !x                      set complement (NOT)
    +
    +Inside a Perl extended character class, [...] switches mode to be interpreted +as an ordinary character class. Outside of a nested [...], the only items +permitted are backslash-escapes, POSIX sets, operators, and parentheses. Inside +a nested ordinary class, ^ has its usual meaning (inverts the class when used +as the first character); outside of a nested class, ^ is the XOR operator. +

    +
    QUANTIFIERS

       ?           0 or 1, greedy
    @@ -323,7 +377,7 @@ 

    pcre2syntax man page

    {,m}? zero up to m, lazy

    -
    ANCHORS AND SIMPLE ASSERTIONS
    +
    ANCHORS AND SIMPLE ASSERTIONS

       \b          word boundary
    @@ -341,7 +395,7 @@ 

    pcre2syntax man page

    \G first matching position in subject

    -
    REPORTED MATCH POINT SETTING
    +
    REPORTED MATCH POINT SETTING

       \K          set reported start of match
    @@ -351,13 +405,13 @@ 

    pcre2syntax man page

    option is set, the previous behaviour is re-enabled. When this option is set, \K is honoured in positive assertions, but ignored in negative ones.

    -
    ALTERNATION
    +
    ALTERNATION

       expr|expr|expr...
     

    -
    CAPTURING
    +
    CAPTURING

       (...)           capture group
    @@ -372,20 +426,20 @@ 

    pcre2syntax man page

    in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In both cases, a name must not start with a digit.

    -
    ATOMIC GROUPS
    +
    ATOMIC GROUPS

       (?>...)         atomic non-capture group
       (*atomic:...)   atomic non-capture group
     

    -
    COMMENT
    +
    COMMENT

       (?#....)        comment (not nestable)
     

    -
    OPTION SETTING
    +
    OPTION SETTING

    Changes of these options within a group are automatically cancelled at the end of the group. @@ -409,7 +463,7 @@

    pcre2syntax man page

    (?^) unset imnrsx options
    (?aP) implies (?aT) as well, though this has no additional effect. However, it -means that (?-aP) is really (?-PT) which disables all ASCII restrictions for +means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes.

    @@ -421,20 +475,22 @@

    pcre2syntax man page

    The following are recognized only at the very start of a pattern or after one -of the newline or \R options with similar syntax. More than one of them may -appear. For the first three, d is a decimal number. -

    -  (*LIMIT_DEPTH=d) set the backtracking limit to d
    -  (*LIMIT_HEAP=d)  set the heap size limit to d * 1024 bytes
    -  (*LIMIT_MATCH=d) set the match limit to d
    -  (*NOTEMPTY)      set PCRE2_NOTEMPTY when matching
    -  (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching
    -  (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS)
    +of the newline or \R sequences or options with similar syntax. More than one
    +of them may appear. For the first three, d is a decimal number.
    +
    +  (*LIMIT_DEPTH=d)     set the backtracking limit to d
    +  (*LIMIT_HEAP=d)      set the heap size limit to d * 1024 bytes
    +  (*LIMIT_MATCH=d)     set the match limit to d
    +  (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching
    +  (*NOTEMPTY)          set PCRE2_NOTEMPTY when matching
    +  (*NOTEMPTY_ATSTART)  set PCRE2_NOTEMPTY_ATSTART when matching
    +  (*NO_AUTO_POSSESS)   no auto-possessification (PCRE2_NO_AUTO_POSSESS)
       (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
    -  (*NO_JIT)       disable JIT optimization
    -  (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE)
    -  (*UTF)          set appropriate UTF mode for the library in use
    -  (*UCP)          set PCRE2_UCP (use Unicode properties for \d etc)
    +  (*NO_JIT)            disable JIT optimization
    +  (*NO_START_OPT)      no start-match optimization (PCRE2_NO_START_OPTIMIZE)
    +  (*TURKISH_CASING)    set PCRE2_EXTRA_TURKISH_CASING when matching
    +  (*UTF)               set appropriate UTF mode for the library in use
    +  (*UCP)               set PCRE2_UCP (use Unicode properties for \d etc)
     
    Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or pcre2_dfa_match(), @@ -442,7 +498,7 @@

    pcre2syntax man page

    application can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.

    -
    NEWLINE CONVENTION
    +
    NEWLINE CONVENTION

    These are recognized only at the very start of the pattern or after option settings with a similar syntax. @@ -455,7 +511,7 @@

    pcre2syntax man page

    (*NUL) the NUL character (binary zero)

    -
    WHAT \R MATCHES
    +
    WHAT \R MATCHES

    These are recognized only at the very start of the pattern or after option setting with a similar syntax. @@ -464,7 +520,7 @@

    pcre2syntax man page

    (*BSR_UNICODE) any Unicode newline sequence

    -
    LOOKAHEAD AND LOOKBEHIND ASSERTIONS
    +
    LOOKAHEAD AND LOOKBEHIND ASSERTIONS

       (?=...)                     )
    @@ -490,7 +546,7 @@ 

    pcre2syntax man page

    (ultimate default 255). If every branch matches a fixed number of characters, the limit for each branch is 65535 characters.

    -
    NON-ATOMIC LOOKAROUND ASSERTIONS
    +
    NON-ATOMIC LOOKAROUND ASSERTIONS

    These assertions are specific to PCRE2 and are not Perl-compatible.

    @@ -503,7 +559,24 @@ 

    pcre2syntax man page

    (*non_atomic_positive_lookbehind:...) )

    -
    SCRIPT RUNS
    +
    SUBSTRING SCAN ASSERTION
    +

    +This feature is not Perl-compatible. +

    +  (*scan_substring:(grouplist)...)  scan captured substring
    +  (*scs:(grouplist)...)             scan captured substring
    +
    +The comma-separated list may identify groups in any of the following ways: +
    +  n       absolute reference
    +  +n      relative reference
    +  -n      relative reference
    +  <name>  name
    +  'name'  name
    +
    +
    +

    +
    SCRIPT RUNS

       (*script_run:...)           ) script run, can be backtracked into
    @@ -513,7 +586,7 @@ 

    pcre2syntax man page

    (*asr:...) )

    -
    BACKREFERENCES
    +
    BACKREFERENCES

       \n              reference by number (can be ambiguous)
    @@ -530,7 +603,7 @@ 

    pcre2syntax man page

    (?P=name) reference by name (Python)

    -
    SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
    +
    SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)

       (?R)            recurse whole pattern
    @@ -549,7 +622,7 @@ 

    pcre2syntax man page

    \g'-n' call subroutine by relative number (PCRE2 extension)

    -
    CONDITIONAL PATTERNS
    +
    CONDITIONAL PATTERNS

       (?(condition)yes-pattern)
    @@ -572,7 +645,7 @@ 

    pcre2syntax man page

    conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists.

    -
    BACKTRACKING CONTROL
    +
    BACKTRACKING CONTROL

    All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the name is mandatory, for the others it is optional. (*SKIP) changes its behaviour @@ -599,7 +672,7 @@

    pcre2syntax man page

    The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call.

    -
    CALLOUTS
    +
    CALLOUTS

       (?C)            callout (assumed number 0)
    @@ -610,12 +683,58 @@ 

    pcre2syntax man page

    start and the end), and the starting delimiter { matched with the ending delimiter }. To encode the ending delimiter within the string, double it.

    -
    SEE ALSO
    +
    REPLACEMENT STRINGS
    +

    +If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for +pcre2_substitute() is not interpreted. Otherwise, by default, the only +special character is the dollar character in one of the following forms: +

    +  $$                  insert a dollar character
    +  $n or ${n}          insert the contents of group n
    +  $<name>             insert the contents of named group
    +  $0 or $&            insert the entire matched substring
    +  $`                  insert the substring that precedes the match
    +  $'                  insert the substring that follows the match
    +  $_                  insert the entire input string
    +  $*MARK or ${*MARK}  insert a control verb name
    +
    +For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is set, +there is additional interpretation: +

    +

    +1. Backslash is an escape character, and the forms described in "ESCAPED +CHARACTERS" above are recognized. Also: +

    +  \Q...\E   can be used to suppress interpretation
    +  \l        force the next character to lower case
    +  \u        force the next character to upper case
    +  \L        force subsequent characters to lower case
    +  \U        force subsequent characters to upper case
    +  \u\L      force next character to upper case, then all lower
    +  \l\U      force next character to lower case, then all upper
    +  \E        end \L or \U case forcing
    +  \b        backspace character (note: as in character class in pattern)
    +  \v        vertical tab character (note: not the same as in a pattern)
    +
    +2. The Python form \g<n>, where the angle brackets are part of the syntax and +n is either a group name or a number, is recognized as an alternative way +of inserting the contents of a group, for example \g<3>. +

    +

    +3. Capture substitution supports the following additional forms: +

    +  ${n:-string}             default for unset group
    +  ${n:+string1:string2}    values for set/unset group
    +
    +The substitution strings themselves are expanded. Backslash can be used to +escape colons and closing curly brackets. +

    +
    SEE ALSO

    pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3).

    -
    AUTHOR
    +
    AUTHOR

    Philip Hazel
    @@ -624,11 +743,11 @@

    pcre2syntax man page

    Cambridge, England.

    -
    REVISION
    +
    REVISION

    -Last updated: 12 October 2023 +Last updated: 27 November 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/mingw32/share/doc/pcre2/html/pcre2test.html b/mingw32/share/doc/pcre2/html/pcre2test.html index 6cc3cc317ff..db9073f0e60 100644 --- a/mingw32/share/doc/pcre2/html/pcre2test.html +++ b/mingw32/share/doc/pcre2/html/pcre2test.html @@ -105,8 +105,8 @@

    pcre2test man page

    When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that are passed -to the library. For subject lines, backslash escapes can be used. In addition, -when the utf modifier (see +to the library. For subject lines and some patterns, backslash escapes can be +used. In addition, when the utf modifier (see "Setting compilation options" below) is set, the pattern and any following subject lines are interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. @@ -125,9 +125,8 @@

    pcre2test man page

    than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte in UTF-8) -0x80000000 is added to the character's value. This is the only way of passing -such code points in a pattern string. For subject strings, using an escape -sequence is preferable. +0x80000000 is added to the character's value. For subject strings, using an +escape sequence is preferable.


    COMMAND LINE OPTIONS

    @@ -178,8 +177,8 @@

    pcre2test man page

    following options output the value and set the exit code as indicated:
       ebcdic-nl  the code for LF (= NL) in an EBCDIC environment:
    -               0x15 or 0x25
    -               0 if used in an ASCII environment
    +               either 0x15 or 0x25
    +               0 if used in an ASCII/Unicode environment
                    exit code is always 0
       linksize   the configured internal link size (2, 3, or 4)
                    exit code is set to the link size
    @@ -201,6 +200,16 @@ 

    pcre2test man page

    pcre2-8 the 8-bit library was built unicode Unicode support is available
    +Note that the availability of JIT support in the library does not guarantee +that it can actually be used because in some environments it is unable to +allocate executable memory. The option "jitusable" gives more detailed +information. It returns one of the following values: +
    +  0  JIT is available and usable
    +  1  JIT is available but cannot allocate executable memory
    +  2  JIT is not available
    +  3  Unexpected return from test call to pcre2_jit_compile()
    +
    If an unknown option is given, an error message is output; the exit code is 0.

    @@ -527,39 +536,48 @@

    pcre2test man page

    subject_literal modifier was set for the pattern. The following provide a means of encoding non-printing characters in a visible way:
    -  \a         alarm (BEL, \x07)
    -  \b         backspace (\x08)
    -  \e         escape (\x27)
    -  \f         form feed (\x0c)
    -  \n         newline (\x0a)
    -  \r         carriage return (\x0d)
    -  \t         tab (\x09)
    -  \v         vertical tab (\x0b)
    -  \nnn       octal character (up to 3 octal digits); always
    -               a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode
    -  \o{dd...}  octal character (any number of octal digits}
    -  \xhh       hexadecimal byte (up to 2 hex digits)
    -  \x{hh...}  hexadecimal character (any number of hex digits)
    +  \a          alarm (BEL, \x07)
    +  \b          backspace (\x08)
    +  \e          escape (\x1b)
    +  \f          form feed (\x0c)
    +  \n          newline (\x0a)
    +  \N{U+hh...} Unicode character (any number of hex digits)
    +  \r          carriage return (\x0d)
    +  \t          tab (\x09)
    +  \v          vertical tab (\x0b)
    +  \ddd        octal number (up to 3 octal digits); represent a single
    +                code point unless larger than 255 with the 8-bit library
    +  \o{dd...}   octal number (any number of octal digits) representing a
    +                character in UTF mode or a code point
    +  \xhh        hexadecimal byte (up to 2 hex digits)
    +  \x{hh...}   hexadecimal number (up to 8 hex digits) representing a
    +                character in UTF mode or a code point
     
    -The use of \x{hh...} is not dependent on the use of the utf modifier on -the pattern. It is recognized always. There may be any number of hexadecimal -digits inside the braces; invalid values provoke error messages. +Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf +modifier on the pattern. It is always recognized. There may be any number of +hexadecimal digits inside the braces; invalid values provoke error messages, +but when using \N{U+hh...} with some invalid Unicode characters they will +be accepted with a warning instead.

    -Note that \xhh specifies one byte rather than one character in UTF-8 mode; -this makes it possible to construct invalid UTF-8 sequences for testing -purposes. On the other hand, \x{hh} is interpreted as a UTF-8 character in -UTF-8 mode, generating more than one byte if the value is greater than 127. -When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte -for values less than 256, and causes an error for greater values. +Note that even in UTF-8 mode, \xhh (and depending on how large, \ddd) +describe one byte rather than one character; this makes it possible to +construct invalid UTF-8 sequences for testing purposes. On the other hand, +\x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating +more than one byte if the value is greater than 127. To avoid the ambiguity +it is preferred to use \N{U+hh...} when describing characters. When testing +the 8-bit library not in UTF-8 mode, \x{hh} generates one byte for values +that could fit in it, and causes an error for greater values.

    -In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it -possible to construct invalid UTF-16 sequences for testing purposes. +When testing the 16-bit library, not in UTF-16 mode, all 4-digit \x{hhhh} +values are accepted. This makes it possible to construct invalid UTF-16 +sequences for testing purposes.

    -In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This makes it -possible to construct invalid UTF-32 sequences for testing purposes. +When testing the 32-bit library, not in UTF-32 mode, all 4- to 8-digit \x{...} +values are accepted. This makes it possible to construct invalid UTF-32 +sequences for testing purposes.

    There is a special backslash sequence that specifies replication of one or more @@ -625,6 +643,7 @@

    pcre2test man page

    allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options @@ -653,13 +672,17 @@

    pcre2test man page

    match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C + never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK + python_octal set PCRE2_EXTRA_PYTHON_OCTAL + turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT @@ -671,6 +694,23 @@

    pcre2test man page

    brackets. Setting utf in 16-bit or 32-bit mode also causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. +
    +
    +The following modifiers enable or disable performance optimizations by +calling pcre2_set_optimize() before invoking the regex compiler. +
    +      optimization_full      enable all optional optimizations
    +      optimization_none      disable all optional optimizations
    +      auto_possess           auto-possessify variable quantifiers
    +      auto_possess_off       don't auto-possessify variable quantifiers
    +      dotstar_anchor         anchor patterns starting with .*
    +      dotstar_anchor_off     don't anchor patterns starting with .*
    +      start_optimize         enable pre-scan of subject string
    +      start_optimize_off     disable pre-scan of subject string
    +
    +See the +pcre2_set_optimize +documentation for details on these optimizations.


    Setting compilation controls @@ -680,14 +720,15 @@

    pcre2test man page

    about the pattern. There are single-letter abbreviations for some that are heavily used in the test files.
    -      bsr=[anycrlf|unicode]     specify \R handling
       /B  bincode                   show binary code without lengths
    +      bsr=[anycrlf|unicode]     specify \R handling
           callout_info              show callout information
           convert=<options>         request foreign pattern conversion
           convert_glob_escape=c     set glob escape character
           convert_glob_separator=c  set glob separator character
           convert_length            set convert buffer length
           debug                     same as info,fullbincode
    +      expand                    expand repetition syntax in pattern
           framesize                 show matching frame size
           fullbincode               show binary code with lengths
       /I  info                      show info about compiled pattern
    @@ -709,6 +750,7 @@ 

    pcre2test man page

    posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack + pushtablescopy push a copy with tables onto the stack stackguard=<number> test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables @@ -1128,6 +1170,7 @@

    pcre2test man page

    replace=<string> specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts + substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED @@ -1217,10 +1260,11 @@

    pcre2test man page

    The following modifiers set options for pcre2_match() or pcre2_dfa_match(). See -pcreapi +pcre2api for a description of their effects.

           anchored                   set PCRE2_ANCHORED
    +      copy_matched_subject       set PCRE2_COPY_MATCHED_SUBJECT
           endanchored                set PCRE2_ENDANCHORED
           dfa_restart                set PCRE2_DFA_RESTART
           dfa_shortest               set PCRE2_DFA_SHORTEST
    @@ -1271,8 +1315,8 @@ 

    pcre2test man page

    aftertext show text after match allaftertext show text after captures allcaptures show all captures - allvector show the entire ovector allusedtext show all consulted text (non-JIT only) + allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data=<n> set a value to pass via callouts @@ -1306,7 +1350,8 @@

    pcre2test man page

    startchar show startchar when relevant startoffset=<n> same as offset=<n> substitute_callout use substitution callouts - substitute_extedded use PCRE2_SUBSTITUTE_EXTENDED + substitute_case_callout use substitution case callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH @@ -1592,6 +1637,21 @@

    pcre2test man page

    or stop is supported, which is sufficient for testing that the feature works.


    +Testing substitute case callouts +
    +

    +If the substitute_case_callout modifier is set, a substitution +case callout function is set up. The callout function is called for each +substituted chunk which is to be case-transformed. +

    +

    +The callout function passed is a fixed function with implementation for certain +behaviours: inputs which shrink when case-transformed; inputs which grow; inputs +with distinct upper/lower/titlecase forms. The characters which are not +special-cased for testing purposes are left unmodified, as if they are caseless +characters. +

    +
    Setting the JIT stack size

    @@ -2204,7 +2264,7 @@

    pcre2test man page


    REVISION

    -Last updated: 24 April 2024 +Last updated: 26 December 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw32/share/doc/pcre2/html/pcre2unicode.html b/mingw32/share/doc/pcre2/html/pcre2unicode.html index 6f0972e706a..5b425329fac 100644 --- a/mingw32/share/doc/pcre2/html/pcre2unicode.html +++ b/mingw32/share/doc/pcre2/html/pcre2unicode.html @@ -53,7 +53,7 @@

    pcre2unicode man page

    The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal number, the derived properties -Any and LC (synonym L&), the Unicode script names such as Arabic or Han, +Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties.

    @@ -157,6 +157,40 @@

    pcre2unicode man page

    counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT option. When this is set, all characters in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. +
    +    Without PCRE2_EXTRA_CASELESS_RESTRICT:
    +      'k' = 'K' = U+212A (Kelvin sign)
    +      's' = 'S' = U+017F (long S)
    +    With PCRE2_EXTRA_CASELESS_RESTRICT:
    +      'k' = 'K'
    +      U+212A (Kelvin sign)  only case-equivalent to itself
    +      's' = 'S'
    +      U+017F (long S)       only case-equivalent to itself
    +
    +

    +

    +One language family, Turkish and Azeri, has its own case-insensitivity rules, +which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the +behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 +(small dotless i) characters. +

    +    Without PCRE2_EXTRA_TURKISH_CASING:
    +      'i' = 'I'
    +      U+0130 (capital I with dot above)  only case-equivalent to itself
    +      U+0131 (small dotless i)           only case-equivalent to itself
    +    With PCRE2_EXTRA_TURKISH_CASING:
    +      'i' = U+0130 (capital I with dot above)
    +      U+0131 (small dotless i) = 'I'
    +
    +

    +

    +It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and +PCRE2_EXTRA_TURKISH_CASING together. +

    +

    +From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower +case), and Lt (title case) are all treated as Lc (cased letter) when caseless +matching is set by the PCRE2_CASELESS option or (?i) within the pattern.


    SCRIPT RUNS @@ -513,9 +547,9 @@

    pcre2unicode man page

    REVISION

    -Last updated: 12 October 2023 +Last updated: 27 November 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/mingw32/share/doc/pcre2/pcre2-config.txt b/mingw32/share/doc/pcre2/pcre2-config.txt index dc8cf8f7ed4..c87de589db7 100644 --- a/mingw32/share/doc/pcre2/pcre2-config.txt +++ b/mingw32/share/doc/pcre2/pcre2-config.txt @@ -1,4 +1,3 @@ - PCRE2-CONFIG(1) General Commands Manual PCRE2-CONFIG(1) @@ -82,4 +81,4 @@ REVISION Last updated: 28 September 2014 -PCRE2 10.00 28 September 2014 PCRE2-CONFIG(1) +PCRE2 10.45 28 September 2014 PCRE2-CONFIG(1) diff --git a/mingw32/share/doc/pcre2/pcre2.txt b/mingw32/share/doc/pcre2/pcre2.txt index 85eead6e61f..38e86d6e6a3 100644 --- a/mingw32/share/doc/pcre2/pcre2.txt +++ b/mingw32/share/doc/pcre2/pcre2.txt @@ -8,7 +8,6 @@ pcre2test commands. ----------------------------------------------------------------------------- - PCRE2(3) Library Functions Manual PCRE2(3) @@ -171,27 +170,29 @@ USER DOCUMENTATION library function, listing its arguments and results. -AUTHOR +AUTHORS - Philip Hazel - Retired from University Computing Service - Cambridge, England. + The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Her- + czeg. + + PCRE2 was written by Philip Hazel, of the University Computing Service, + Cambridge, England. Many others have also contributed. - Putting an actual email address here is a spam magnet. If you want to - email me, use my two names separated by a dot at gmail.com. + To contact the maintainers, please use the GitHub issues tracker or + PCRE2 mailing list, as described at the project page: + https://github.com/PCRE2Project/pcre2 REVISION - Last updated: 27 August 2021 + Last updated: 18 December 2024 Copyright (c) 1997-2021 University of Cambridge. 
-PCRE2 10.38 27 August 2021 PCRE2(3) +PCRE2 10.45 18 December 2024 PCRE2(3) ------------------------------------------------------------------------------ - PCRE2API(3) Library Functions Manual PCRE2API(3) @@ -298,6 +299,9 @@ PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); + int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); + PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS @@ -317,6 +321,12 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value); @@ -858,6 +868,7 @@ PCRE2 CONTEXTS The compile time nested parentheses limit The maximum length of the pattern string The extra options bits (none set by default) + Which performance optimizations the compiler should apply A compile context is also required if you are using custom memory man- agement. If none of these apply, just pass NULL as the context argu- @@ -980,6 +991,110 @@ PCRE2 CONTEXTS ment of pcre2_set_compile_recursion_guard(). The callout function should return zero if all is well, or non-zero to force an error. + int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); + + PCRE2 can apply various performance optimizations during compilation, + in order to make matching faster. For example, the compiler might con- + vert some regex constructs into an equivalent construct which + pcre2_match() can execute faster. By default, all available optimiza- + tions are enabled. However, in rare cases, one might wish to disable + specific optimizations. 
For example, if it is known that some optimiza- + tions cannot benefit a certain regex, it might be desirable to disable + them, in order to speed up compilation. + + The permitted values of directive are as follows: + + PCRE2_OPTIMIZATION_FULL + + Enable all optional performance optimizations. This is the default + value. + + PCRE2_OPTIMIZATION_NONE + + Disable all optional performance optimizations. + + PCRE2_AUTO_POSSESS + PCRE2_AUTO_POSSESS_OFF + + Enable/disable "auto-possessification" of variable quantifiers such as + * and +. This optimization, for example, turns a+b into a++b in order + to avoid backtracks into a+ that can never be successful. However, if + callouts are in use, auto-possessification means that some callouts are + never taken. You can disable this optimization if you want the matching + functions to do a full, unoptimized search and run all the callouts. + + PCRE2_DOTSTAR_ANCHOR + PCRE2_DOTSTAR_ANCHOR_OFF + + Enable/disable an optimization that is applied when .* is the first + significant item in a top-level branch of a pattern, and all the other + branches also start with .* or with \A or \G or ^. Such a pattern is + automatically anchored if PCRE2_DOTALL is set for all the .* items and + PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that + any match must start either at the start of the subject or following a + newline is remembered. Like other optimizations, this can cause call- + outs to be skipped. + + Dotstar anchor optimization is automatically disabled for .* if it is + inside an atomic group or a capture group that is the subject of a + backreference, or if the pattern contains (*PRUNE) or (*SKIP). + + PCRE2_START_OPTIMIZE + PCRE2_START_OPTIMIZE_OFF + + Enable/disable optimizations which cause matching functions to scan the + subject string for specific code unit values before attempting a match. 
+ For example, if it is known that an unanchored match must start with a + specific value, the matching code searches the subject for that value, + and fails immediately if it cannot find it, without actually running + the main matching function. This means that a special item such as + (*COMMIT) at the start of a pattern is not considered until after a + suitable starting point for the match has been found. Also, when call- + outs or (*MARK) items are in use, these "start-up" optimizations can + cause them to be skipped if the pattern is never actually used. The + start-up optimizations are in effect a pre-scan of the subject that + takes place before the pattern is run. + + Disabling start-up optimizations ensures that in cases where the result + is "no match", the callouts do occur, and that items such as (*COMMIT) + and (*MARK) are considered at every possible starting position in the + subject string. + + Disabling start-up optimizations may change the outcome of a matching + operation. Consider the pattern + + (*COMMIT)ABC + + When this is compiled, PCRE2 records the fact that a match must start + with the character "A". Suppose the subject string is "DEFABC". The + start-up optimization scans along the subject, finds "A" and runs the + first match attempt from there. The (*COMMIT) item means that the pat- + tern must match the current starting position, which in this case, it + does. However, if the same match is run without start-up optimizations, + the initial scan along the subject string does not happen. The first + match attempt is run starting from "D" and when this fails, (*COMMIT) + prevents any further matches being tried, so the overall result is "no + match". + + Another start-up optimization makes use of a minimum length for a + matching subject, which is recorded when possible. Consider the pattern + + (*MARK:1)B(*MARK:2)(X|Y) + + The minimum length for a match is two characters. 
If the subject is + "XXBB", the "starting character" optimization skips "XX", then tries to + match "BB", which is long enough. In the process, (*MARK:2) is encoun- + tered and remembered. When the match attempt fails, the next "B" is + found, but there is only one character left, so there are no more at- + tempts, and "no match" is returned with the "last mark seen" set to + "2". Without start-up optimizations, however, matches are tried at + every possible starting position, including at the end of the subject, + where (*MARK:1) is encountered, but there is no "B", so the "last mark + seen" that is returned is "1". In this case, the optimizations do not + affect the overall match result, which is still "no match", but they do + affect the auxiliary information that is returned. + The match context A match context is required if you want to: @@ -1025,6 +1140,16 @@ PCRE2 CONTEXTS tion made by pcre2_substitute(). Details are given in the section enti- tled "Creating a new string with substitutions" below. + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + + This sets up a callout function for PCRE2 to call when performing case + transformations inside pcre2_substitute(). Details are given in the + section entitled "Creating a new string with substitutions" below. + int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value); @@ -1224,8 +1349,11 @@ CHECKING BUILD-TIME OPTIONS The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee - that JIT will be used for any given match. See the pcre2jit documenta- - tion for more details. 
+ that JIT will be used for any given match, and neither does it guaran- + tee that JIT will actually be able to function, because it may not be + able to allocate executable memory in some environments. There is a + special call to pcre2_jit_compile() that can be used to check this. See + the pcre2jit documentation for more details. PCRE2_CONFIG_JITTARGET @@ -1413,10 +1541,10 @@ COMPILING A PATTERN spectively, when pcre2_compile() returns NULL because a compilation er- ror has occurred. - There are nearly 100 positive error codes that pcre2_compile() may re- - turn if it finds an error in the pattern. There are also some negative - error codes that are used for invalid UTF strings when validity check- - ing is in force. These are the same as given by pcre2_match() and + There are over 100 positive error codes that pcre2_compile() may return + if it finds an error in the pattern. There are also some negative error + codes that are used for invalid UTF strings when validity checking is + in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and are described in the pcre2unicode documentation. There is no separate documentation for the positive error codes, be- cause the textual error messages that are obtained by calling the @@ -1511,39 +1639,56 @@ COMPILING A PATTERN Perl. If you want a multiline circumflex also to match after a termi- nating newline, you must set PCRE2_ALT_CIRCUMFLEX. + PCRE2_ALT_EXTENDED_CLASS + + Alters the parsing of character classes to follow the extended syntax + described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no + impact on the behaviour of the Perl-specific "(?[...])" syntax for ex- + tended classes, but instead enables the alternative syntax of extended + class behaviour inside ordinary "[...]" character classes. See the + pcre2pattern documentation for details of the character classes sup- + ported. 
+ PCRE2_ALT_VERBNAMES - By default, for compatibility with Perl, the name in any verb sequence - such as (*MARK:NAME) is any sequence of characters that does not in- - clude a closing parenthesis. The name is not processed in any way, and - it is not possible to include a closing parenthesis in the name. How- - ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash pro- - cessing is applied to verb names and only an unescaped closing paren- - thesis terminates the name. A closing parenthesis can be included in a - name either as \) or between \Q and \E. If the PCRE2_EXTENDED or - PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped - whitespace in verb names is skipped and #-comments are recognized, ex- + By default, for compatibility with Perl, the name in any verb sequence + such as (*MARK:NAME) is any sequence of characters that does not in- + clude a closing parenthesis. The name is not processed in any way, and + it is not possible to include a closing parenthesis in the name. How- + ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash pro- + cessing is applied to verb names and only an unescaped closing paren- + thesis terminates the name. A closing parenthesis can be included in a + name either as \) or between \Q and \E. If the PCRE2_EXTENDED or + PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped + whitespace in verb names is skipped and #-comments are recognized, ex- actly as in the rest of the pattern. PCRE2_AUTO_CALLOUT - If this bit is set, pcre2_compile() automatically inserts callout - items, all with number 255, before each pattern item, except immedi- - ately before or after an explicit callout in the pattern. For discus- + If this bit is set, pcre2_compile() automatically inserts callout + items, all with number 255, before each pattern item, except immedi- + ately before or after an explicit callout in the pattern. 
For discus- sion of the callout facility, see the pcre2callout documentation. PCRE2_CASELESS - If this bit is set, letters in the pattern match both upper and lower - case letters in the subject. It is equivalent to Perl's /i option, and - it can be changed within a pattern by a (?i) option setting. If either - PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all - characters with more than one other case, and for all characters whose - code points are greater than U+007F. Note that there are two ASCII - characters, K and S, that, in addition to their lower case ASCII equiv- - alents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long - S) respectively. If you do not want this case equivalence, you can sup- - press it by setting PCRE2_EXTRA_CASELESS_RESTRICT. + If this bit is set, letters in the pattern match both upper and lower + case letters in the subject. It is equivalent to Perl's /i option, and + it can be changed within a pattern by a (?i) option setting. If either + PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all + characters with more than one other case, and for all characters whose + code points are greater than U+007F. + + Note that there are two ASCII characters, K and S, that, in addition to + their lower case ASCII equivalents, are case-equivalent with U+212A + (Kelvin sign) and U+017F (long S) respectively. If you do not want this + case equivalence, you can suppress it by setting PCRE2_EXTRA_CASE- + LESS_RESTRICT. + + One language family, Turkish and Azeri, has its own case-insensitivity + rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. + This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot + above), and U+0131 (small dotless i) characters. For lower valued characters with only one other case, a lookup table is used for speed. 
When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup @@ -1551,201 +1696,206 @@ COMPILING A PATTERN (available only in 16-bit or 32-bit mode) are treated as not having an- other case. + From release 10.45 PCRE2_CASELESS also affects what some of the letter- + related Unicode property escapes (\p and \P) match. The properties Lu + (upper case letter), Ll (lower case letter), and Lt (title case letter) + are all treated as LC (cased letter) when PCRE2_CASELESS is set. + PCRE2_DOLLAR_ENDONLY - If this bit is set, a dollar metacharacter in the pattern matches only - at the end of the subject string. Without this option, a dollar also - matches immediately before a newline at the end of the string (but not - before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored - if PCRE2_MULTILINE is set. There is no equivalent to this option in + If this bit is set, a dollar metacharacter in the pattern matches only + at the end of the subject string. Without this option, a dollar also + matches immediately before a newline at the end of the string (but not + before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored + if PCRE2_MULTILINE is set. There is no equivalent to this option in Perl, and no way to set it within a pattern. PCRE2_DOTALL - If this bit is set, a dot metacharacter in the pattern matches any - character, including one that indicates a newline. However, it only + If this bit is set, a dot metacharacter in the pattern matches any + character, including one that indicates a newline. However, it only ever matches one character, even if newlines are coded as CRLF. Without this option, a dot does not match when the current position in the sub- - ject is at a newline. This option is equivalent to Perl's /s option, + ject is at a newline. This option is equivalent to Perl's /s option, and it can be changed within a pattern by a (?s) option setting. 
A neg- - ative class such as [^a] always matches newline characters, and the \N - escape sequence always matches a non-newline character, independent of + ative class such as [^a] always matches newline characters, and the \N + escape sequence always matches a non-newline character, independent of the setting of PCRE2_DOTALL. PCRE2_DUPNAMES - If this bit is set, names used to identify capture groups need not be - unique. This can be helpful for certain types of pattern when it is - known that only one instance of the named group can ever be matched. - There are more details of named capture groups below; see also the + If this bit is set, names used to identify capture groups need not be + unique. This can be helpful for certain types of pattern when it is + known that only one instance of the named group can ever be matched. + There are more details of named capture groups below; see also the pcre2pattern documentation. PCRE2_ENDANCHORED - If this bit is set, the end of any pattern match must be right at the + If this bit is set, the end of any pattern match must be right at the end of the string being searched (the "subject string"). If the pattern match succeeds by reaching (*ACCEPT), but does not reach the end of the - subject, the match fails at the current starting point. For unanchored - patterns, a new match is then tried at the next starting point. How- + subject, the match fails at the current starting point. For unanchored + patterns, a new match is then tried at the next starting point. How- ever, if the match succeeds by reaching the end of the pattern, but not - the end of the subject, backtracking occurs and an alternative match + the end of the subject, backtracking occurs and an alternative match may be found. Consider these two patterns: .(*ACCEPT)|.. .|.. - If matched against "abc" with PCRE2_ENDANCHORED set, the first matches - "c" whereas the second matches "bc". 
The effect of PCRE2_ENDANCHORED - can also be achieved by appropriate constructs in the pattern itself, + If matched against "abc" with PCRE2_ENDANCHORED set, the first matches + "c" whereas the second matches "bc". The effect of PCRE2_ENDANCHORED + can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl. For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only - to the first (that is, the longest) matched string. Other parallel - matches, which are necessarily substrings of the first one, must obvi- + to the first (that is, the longest) matched string. Other parallel + matches, which are necessarily substrings of the first one, must obvi- ously end before the end of the subject. PCRE2_EXTENDED - If this bit is set, most white space characters in the pattern are to- - tally ignored except when escaped, inside a character class, or inside - a \Q...\E sequence. However, white space is not allowed within se- - quences such as (?> that introduce various parenthesized groups, nor - within numerical quantifiers such as {1,3}. Ignorable white space is - permitted between an item and a following quantifier and between a - quantifier and a following + that indicates possessiveness. PCRE2_EX- - TENDED is equivalent to Perl's /x option, and it can be changed within + If this bit is set, most white space characters in the pattern are to- + tally ignored except when escaped, inside a character class, or inside + a \Q...\E sequence. However, white space is not allowed within se- + quences such as (?> that introduce various parenthesized groups, nor + within numerical quantifiers such as {1,3}. Ignorable white space is + permitted between an item and a following quantifier and between a + quantifier and a following + that indicates possessiveness. PCRE2_EX- + TENDED is equivalent to Perl's /x option, and it can be changed within a pattern by a (?x) option setting. 
- When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recog- - nizes as white space only those characters with code points less than + When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recog- + nizes as white space only those characters with code points less than 256 that are flagged as white space in its low-character table. The ta- ble is normally created by pcre2_maketables(), which uses the isspace() - function to identify space characters. In most ASCII environments, the - relevant characters are those with code points 0x0009 (tab), 0x000A - (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage + function to identify space characters. In most ASCII environments, the + relevant characters are those with code points 0x0009 (tab), 0x000A + (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage return), and 0x0020 (space). When PCRE2 is compiled with Unicode support, in addition to these char- - acters, five more Unicode "Pattern White Space" characters are recog- + acters, five more Unicode "Pattern White Space" characters are recog- nized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to- - right mark), U+200F (right-to-left mark), U+2028 (line separator), and - U+2029 (paragraph separator). This set of characters is the same as - recognized by Perl's /x option. Note that the horizontal and vertical - space characters that are matched by the \h and \v escapes in patterns + right mark), U+200F (right-to-left mark), U+2028 (line separator), and + U+2029 (paragraph separator). This set of characters is the same as + recognized by Perl's /x option. Note that the horizontal and vertical + space characters that are matched by the \h and \v escapes in patterns are a much bigger set. 
- As well as ignoring most white space, PCRE2_EXTENDED also causes char- - acters between an unescaped # outside a character class and the next - newline, inclusive, to be ignored, which makes it possible to include + As well as ignoring most white space, PCRE2_EXTENDED also causes char- + acters between an unescaped # outside a character class and the next + newline, inclusive, to be ignored, which makes it possible to include comments inside complicated patterns. Note that the end of this type of - comment is a literal newline sequence in the pattern; escape sequences + comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count. Which characters are interpreted as newlines can be specified by a set- - ting in the compile context that is passed to pcre2_compile() or by a - special sequence at the start of the pattern, as described in the sec- - tion entitled "Newline conventions" in the pcre2pattern documentation. + ting in the compile context that is passed to pcre2_compile() or by a + special sequence at the start of the pattern, as described in the sec- + tion entitled "Newline conventions" in the pcre2pattern documentation. A default is defined when PCRE2 is built. PCRE2_EXTENDED_MORE - This option has the effect of PCRE2_EXTENDED, but, in addition, un- - escaped space and horizontal tab characters are ignored inside a char- - acter class. Note: only these two characters are ignored, not the full - set of pattern white space characters that are ignored outside a char- - acter class. PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, + This option has the effect of PCRE2_EXTENDED, but, in addition, un- + escaped space and horizontal tab characters are ignored inside a char- + acter class. Note: only these two characters are ignored, not the full + set of pattern white space characters that are ignored outside a char- + acter class. 
PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, and it can be changed within a pattern by a (?xx) option setting. PCRE2_FIRSTLINE If this option is set, the start of an unanchored pattern match must be - before or at the first newline in the subject string following the - start of matching, though the matched text may continue over the new- + before or at the first newline in the subject string following the + start of matching, though the matched text may continue over the new- line. If startoffset is non-zero, the limiting newline is not necessar- - ily the first newline in the subject. For example, if the subject + ily the first newline in the subject. For example, if the subject string is "abc\nxyz" (where \n represents a single-character newline) a - pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is - greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more - general limiting facility. If PCRE2_FIRSTLINE is set with an offset - limit, a match must occur in the first line and also within the offset + pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is + greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more + general limiting facility. If PCRE2_FIRSTLINE is set with an offset + limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used. This option has no effect for anchored patterns. PCRE2_LITERAL If this option is set, all meta-characters in the pattern are disabled, - and it is treated as a literal string. Matching literal strings with a + and it is treated as a literal string. Matching literal strings with a regular expression engine is not the most efficient way of doing it. If - you are doing a lot of literal matching and are worried about effi- + you are doing a lot of literal matching and are worried about effi- ciency, you should consider using other approaches. 
The only other main options that are allowed with PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_MATCH_INVALID_UTF, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, - PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EX- + PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EX- TRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an error. PCRE2_MATCH_INVALID_UTF - This option forces PCRE2_UTF (see below) and also enables support for - matching by pcre2_match() in subject strings that contain invalid UTF - sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries - process strings as sequences of uint16_t or uint32_t code points. They + This option forces PCRE2_UTF (see below) and also enables support for + matching by pcre2_match() in subject strings that contain invalid UTF + sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries + process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes un- - less such sequences are suitably aligned. This facility is not sup- - ported for DFA matching. For details, see the pcre2unicode documenta- + less such sequences are suitably aligned. This facility is not sup- + ported for DFA matching. For details, see the pcre2unicode documenta- tion. PCRE2_MATCH_UNSET_BACKREF - If this option is set, a backreference to an unset capture group - matches an empty string (by default this causes the current matching + If this option is set, a backreference to an unset capture group + matches an empty string (by default this causes the current matching alternative to fail). A pattern such as (\1)(a) succeeds when this op- - tion is set (assuming it can find an "a" in the subject), whereas it - fails by default, for Perl compatibility. 
Setting this option makes + tion is set (assuming it can find an "a" in the subject), whereas it + fails by default, for Perl compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka JavaScript). PCRE2_MULTILINE - By default, for the purposes of matching "start of line" and "end of - line", PCRE2 treats the subject string as consisting of a single line - of characters, even if it actually contains newlines. The "start of - line" metacharacter (^) matches only at the start of the string, and - the "end of line" metacharacter ($) matches only at the end of the - string, or before a terminating newline (except when PCRE2_DOLLAR_EN- + By default, for the purposes of matching "start of line" and "end of + line", PCRE2 treats the subject string as consisting of a single line + of characters, even if it actually contains newlines. The "start of + line" metacharacter (^) matches only at the start of the string, and + the "end of line" metacharacter ($) matches only at the end of the + string, or before a terminating newline (except when PCRE2_DOLLAR_EN- DONLY is set). Note, however, that unless PCRE2_DOTALL is set, the "any - character" metacharacter (.) does not match at a newline. This behav- + character" metacharacter (.) does not match at a newline. This behav- iour (for ^, $, and dot) is the same as Perl. - When PCRE2_MULTILINE it is set, the "start of line" and "end of line" - constructs match immediately following or immediately before internal - newlines in the subject string, respectively, as well as at the very - start and end. This is equivalent to Perl's /m option, and it can be + When PCRE2_MULTILINE is set, the "start of line" and "end of line" + constructs match immediately following or immediately before internal + newlines in the subject string, respectively, as well as at the very + start and end. This is equivalent to Perl's /m option, and it can be changed within a pattern by a (?m) option setting.
Note that the "start of line" metacharacter does not match after a newline at the end of the - subject, for compatibility with Perl. However, you can change this by - setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a - subject string, or no occurrences of ^ or $ in a pattern, setting + subject, for compatibility with Perl. However, you can change this by + setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a + subject string, or no occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect. PCRE2_NEVER_BACKSLASH_C - This option locks out the use of \C in the pattern that is being com- - piled. This escape can cause unpredictable behaviour in UTF-8 or - UTF-16 modes, because it may leave the current matching point in the + This option locks out the use of \C in the pattern that is being com- + piled. This escape can cause unpredictable behaviour in UTF-8 or + UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in ap- plications that process patterns from external sources. Note that there is also a build-time option that permanently locks out the use of \C. PCRE2_NEVER_UCP - This option locks out the use of Unicode properties for handling \B, + This option locks out the use of Unicode properties for handling \B, \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as - described for the PCRE2_UCP option below. In particular, it prevents - the creator of the pattern from enabling this facility by starting the - pattern with (*UCP). This option may be useful in applications that - process patterns from external sources. The option combination PCRE_UCP - and PCRE_NEVER_UCP causes an error. + described for the PCRE2_UCP option below. In particular, it prevents + the creator of the pattern from enabling this facility by starting the + pattern with (*UCP). 
This option may be useful in applications that + process patterns from external sources. The option combination + PCRE2_UCP and PCRE2_NEVER_UCP causes an error. PCRE2_NEVER_UTF @@ -1769,86 +1919,56 @@ COMPILING A PATTERN PCRE2_NO_AUTO_POSSESS - If this option is set, it disables "auto-possessification", which is an - optimization that, for example, turns a+b into a++b in order to avoid - backtracks into a+ that can never be successful. However, if callouts - are in use, auto-possessification means that some callouts are never - taken. You can set this option if you want the matching functions to do - a full unoptimized search and run all the callouts, but it is mainly - provided for testing purposes. + If this (deprecated) option is set, it disables "auto-possessifica- + tion", which is an optimization that, for example, turns a+b into a++b + in order to avoid backtracks into a+ that can never be successful. How- + ever, if callouts are in use, auto-possessification means that some + callouts are never taken. You can set this option if you want the + matching functions to do a full unoptimized search and run all the + callouts, but it is mainly provided for testing purposes. + + If a compile context is available, it is recommended to use + pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather + than the compile option PCRE2_NO_AUTO_POSSESS. Note that + PCRE2_NO_AUTO_POSSESS takes precedence over the pcre2_set_optimize() + optimization directives PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF. PCRE2_NO_DOTSTAR_ANCHOR - If this option is set, it disables an optimization that is applied when - .* is the first significant item in a top-level branch of a pattern, - and all the other branches also start with .* or with \A or \G or ^. - The optimization is automatically disabled for .* if it is inside an - atomic group or a capture group that is the subject of a backreference, - or if the pattern contains (*PRUNE) or (*SKIP). 
When the optimization - is not disabled, such a pattern is automatically anchored if + If this (deprecated) option is set, it disables an optimization that is + applied when .* is the first significant item in a top-level branch of + a pattern, and all the other branches also start with .* or with \A or + \G or ^. The optimization is automatically disabled for .* if it is in- + side an atomic group or a capture group that is the subject of a back- + reference, or if the pattern contains (*PRUNE) or (*SKIP). When the op- + timization is not disabled, such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set - for any ^ items. Otherwise, the fact that any match must start either - at the start of the subject or following a newline is remembered. Like - other optimizations, this can cause callouts to be skipped. + for any ^ items. Otherwise, the fact that any match must start either + at the start of the subject or following a newline is remembered. Like + other optimizations, this can cause callouts to be skipped. (If a com- + pile context is available, it is recommended to use pcre2_set_opti- + mize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF instead.) PCRE2_NO_START_OPTIMIZE This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of - the JIT compiler. + the JIT compiler. Setting this option is equivalent to calling + pcre2_set_optimize() with the directive parameter set to + PCRE2_START_OPTIMIZE_OFF. There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails imme- diately if it cannot find it, without actually running the main match- - ing function. 
This means that a special item such as (*COMMIT) at the - start of a pattern is not considered until after a suitable starting - point for the match has been found. Also, when callouts or (*MARK) - items are in use, these "start-up" optimizations can cause them to be - skipped if the pattern is never actually used. The start-up optimiza- - tions are in effect a pre-scan of the subject that takes place before - the pattern is run. - - The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, - possibly causing performance to suffer, but ensuring that in cases - where the result is "no match", the callouts do occur, and that items - such as (*COMMIT) and (*MARK) are considered at every possible starting - position in the subject string. - - Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching - operation. Consider the pattern + ing function. The start-up optimizations are in effect a pre-scan of + the subject that takes place before the pattern is run. - (*COMMIT)ABC - - When this is compiled, PCRE2 records the fact that a match must start - with the character "A". Suppose the subject string is "DEFABC". The - start-up optimization scans along the subject, finds "A" and runs the - first match attempt from there. The (*COMMIT) item means that the pat- - tern must match the current starting position, which in this case, it - does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE - set, the initial scan along the subject string does not happen. The - first match attempt is run starting from "D" and when this fails, - (*COMMIT) prevents any further matches being tried, so the overall re- - sult is "no match". - - As another start-up optimization makes use of a minimum length for a - matching subject, which is recorded when possible. Consider the pattern - - (*MARK:1)B(*MARK:2)(X|Y) - - The minimum length for a match is two characters. 
If the subject is - "XXBB", the "starting character" optimization skips "XX", then tries to - match "BB", which is long enough. In the process, (*MARK:2) is encoun- - tered and remembered. When the match attempt fails, the next "B" is - found, but there is only one character left, so there are no more at- - tempts, and "no match" is returned with the "last mark seen" set to - "2". If NO_START_OPTIMIZE is set, however, matches are tried at every - possible starting position, including at the end of the subject, where - (*MARK:1) is encountered, but there is no "B", so the "last mark seen" - that is returned is "1". In this case, the optimizations do not affect - the overall match result, which is still "no match", but they do affect - the auxiliary information that is returned. + Disabling the start-up optimizations may cause performance to suffer. + However, this may be desirable for patterns which contain callouts or + items such as (*COMMIT) and (*MARK). See the above description of + PCRE2_START_OPTIMIZE_OFF for further details. PCRE2_NO_UTF_CHECK @@ -1892,41 +2012,46 @@ COMPILING A PATTERN ties for upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode - support (which is the default). The PCRE2_EXTRA_CASELESS_RESTRICT op- - tion (see below) restricts caseless matching such that ASCII characters - match only ASCII characters and non-ASCII characters match only non- - ASCII characters. + support (which is the default). + + The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless + matching such that ASCII characters match only ASCII characters and + non-ASCII characters match only non-ASCII characters. The PCRE2_EX- + TRA_TURKISH_CASING option (see above) alters the matching of the 'i' + characters to follow their behaviour in Turkish and Azeri languages. 
+ For further details on PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EX- + TRA_TURKISH_CASING, see the pcre2unicode page. PCRE2_UNGREEDY - This option inverts the "greediness" of the quantifiers so that they - are not greedy by default, but become greedy if followed by "?". It is - not compatible with Perl. It can also be set by a (?U) option setting + This option inverts the "greediness" of the quantifiers so that they + are not greedy by default, but become greedy if followed by "?". It is + not compatible with Perl. It can also be set by a (?U) option setting within the pattern. PCRE2_USE_OFFSET_LIMIT This option must be set for pcre2_compile() if pcre2_set_offset_limit() - is going to be used to set a non-default offset limit in a match con- - text for matches that use this pattern. An error is generated if an - offset limit is set without this option. For more details, see the de- - scription of pcre2_set_offset_limit() in the section that describes + is going to be used to set a non-default offset limit in a match con- + text for matches that use this pattern. An error is generated if an + offset limit is set without this option. For more details, see the de- + scription of pcre2_set_offset_limit() in the section that describes match contexts. See also the PCRE2_FIRSTLINE option above. PCRE2_UTF - This option causes PCRE2 to regard both the pattern and the subject - strings that are subsequently processed as strings of UTF characters - instead of single-code-unit strings. It is available when PCRE2 is - built to include Unicode support (which is the default). If Unicode + This option causes PCRE2 to regard both the pattern and the subject + strings that are subsequently processed as strings of UTF characters + instead of single-code-unit strings. It is available when PCRE2 is + built to include Unicode support (which is the default). If Unicode support is not available, the use of this option provokes an error. 
De- - tails of how PCRE2_UTF changes the behaviour of PCRE2 are given in the + tails of how PCRE2_UTF changes the behaviour of PCRE2 are given in the pcre2unicode page. In particular, note that it changes the way PCRE2_CASELESS works. Extra compile options - The option bits that can be set in a compile context by calling the + The option bits that can be set in a compile context by calling the pcre2_set_compile_extra_options() function are as follows: PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK @@ -1938,102 +2063,102 @@ COMPILING A PATTERN PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES - This option applies when compiling a pattern in UTF-8 or UTF-32 mode. - It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode + This option applies when compiling a pattern in UTF-8 or UTF-32 mode. + It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs - in UTF-16 to encode code points with values in the range 0x10000 to - 0x10ffff. The surrogates cannot therefore be represented in UTF-16. + in UTF-16 to encode code points with values in the range 0x10000 to + 0x10ffff. The surrogates cannot therefore be represented in UTF-16. They can be represented in UTF-8 and UTF-32, but are defined as invalid - code points, and cause errors if encountered in a UTF-8 or UTF-32 + code points, and cause errors if encountered in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2. - These values also cause errors if encountered in escape sequences such + These values also cause errors if encountered in escape sequences such as \x{d912} within a pattern. However, it seems that some applications, when using PCRE2 to check for unwanted characters in UTF-8 strings, ex- - plicitly test for the surrogates using escape sequences. The - PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be- + plicitly test for the surrogates using escape sequences. 
The + PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be- cause it applies only to the testing of input strings for UTF validity. - If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro- - gate code point values in UTF-8 and UTF-32 patterns no longer provoke - errors and are incorporated in the compiled pattern. However, they can - only match subject characters if the matching function is called with + If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro- + gate code point values in UTF-8 and UTF-32 patterns no longer provoke + errors and are incorporated in the compiled pattern. However, they can + only match subject characters if the matching function is called with PCRE2_NO_UTF_CHECK set. PCRE2_EXTRA_ALT_BSUX - The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and - \x in the way that ECMAscript (aka JavaScript) does. Additional func- + The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and + \x in the way that ECMAscript (aka JavaScript) does. Additional func- tionality was defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has - the effect of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} + the effect of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal character code, where hhh.. is any number of hexadeci- mal digits. PCRE2_EXTRA_ASCII_BSD - This option forces \d to match only ASCII digits, even when PCRE2_UCP - is set. It can be changed within a pattern by means of the (?aD) op- + This option forces \d to match only ASCII digits, even when PCRE2_UCP + is set. It can be changed within a pattern by means of the (?aD) op- tion setting. PCRE2_EXTRA_ASCII_BSS - This option forces \s to match only ASCII space characters, even when - PCRE2_UCP is set. It can be changed within a pattern by means of the + This option forces \s to match only ASCII space characters, even when + PCRE2_UCP is set. 
It can be changed within a pattern by means of the (?aS) option setting. PCRE2_EXTRA_ASCII_BSW - This option forces \w to match only ASCII word characters, even when - PCRE2_UCP is set. It can be changed within a pattern by means of the + This option forces \w to match only ASCII word characters, even when + PCRE2_UCP is set. It can be changed within a pattern by means of the (?aW) option setting. PCRE2_EXTRA_ASCII_DIGIT This option forces the POSIX character classes [:digit:] and [:xdigit:] - to match only ASCII digits, even when PCRE2_UCP is set. It can be + to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aT) option setting. PCRE2_EXTRA_ASCII_POSIX This option forces all the POSIX character classes, including [:digit:] - and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is - set. It can be changed within a pattern by means of the (?aP) option - setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order + and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is + set. It can be changed within a pattern by means of the (?aP) option + setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets all ASCII restrictions for POSIX classes. PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL - This is a dangerous option. Use with care. By default, an unrecognized - escape such as \j or a malformed one such as \x{2z} causes a compile- + This is a dangerous option. Use with care. By default, an unrecognized + escape such as \j or a malformed one such as \x{2z} causes a compile- time error when detected by pcre2_compile(). Perl is somewhat inconsis- - tent in handling such items: for example, \j is treated as a literal - "j", and non-hexadecimal digits in \x{} are just ignored, though warn- - ings are given in both cases if Perl's warning switch is enabled. 
How- - ever, a malformed octal number after \o{ always causes an error in + tent in handling such items: for example, \j is treated as a literal + "j", and non-hexadecimal digits in \x{} are just ignored, though warn- + ings are given in both cases if Perl's warning switch is enabled. How- + ever, a malformed octal number after \o{ always causes an error in Perl. - If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to - pcre2_compile(), all unrecognized or malformed escape sequences are - treated as single-character escapes. For example, \j is a literal "j" - and \x{2z} is treated as the literal string "x{2z}". Setting this op- + If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to + pcre2_compile(), all unrecognized or malformed escape sequences are + treated as single-character escapes. For example, \j is a literal "j" + and \x{2z} is treated as the literal string "x{2z}". Setting this op- tion means that typos in patterns may go undetected and have unexpected - results. Also note that a sequence such as [\N{] is interpreted as a - malformed attempt at [\N{...}] and so is treated as [N{] whereas [\N] + results. Also note that a sequence such as [\N{] is interpreted as a + malformed attempt at [\N{...}] and so is treated as [N{] whereas [\N] gives an error because an unqualified \N is a valid escape sequence but - is not supported in a character class. To reiterate: this is a danger- + is not supported in a character class. To reiterate: this is a danger- ous option. Use with great care. PCRE2_EXTRA_CASELESS_RESTRICT - When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows + When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode rules, which allow for more than two cases per character. There are two case-equivalent character sets that contain both ASCII and non- ASCII characters. 
The ASCII letter S is case-equivalent to U+017f (long - S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). - This option disables recognition of case-equivalences that cross the + S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). + This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must ei- - ther be ASCII or non-ASCII. The option can be changed with a pattern by - the (?r) option setting. + ther be ASCII or non-ASCII. The option can be changed within a pattern + by the (*CASELESS_RESTRICT) or (?r) option settings. PCRE2_EXTRA_ESCAPED_CR_IS_LF @@ -2062,6 +2187,36 @@ COMPILING A PATTERN end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. + PCRE2_EXTRA_NO_BS0 + + If this option is set (note that its final character is the digit 0) it + locks out the use of the sequence \0 unless at least one more octal + digit follows. + + PCRE2_EXTRA_PYTHON_OCTAL + + If this option is set, PCRE2 follows Python's rules for interpreting + octal escape sequences. The rules for handling sequences such as \14, + which could be an octal number or a back reference are different. De- + tails are given in the pcre2pattern documentation. + + PCRE2_EXTRA_NEVER_CALLOUT + + If this option is set, PCRE2 treats callouts in the pattern as a syntax + error, returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if + the application knows that a callout will not be provided to + pcre2_match(), so that callouts in the pattern are not silently ig- + nored. + + PCRE2_EXTRA_TURKISH_CASING + + This option alters case-equivalence of the 'i' letters to follow the + alphabet used by Turkish and Azeri languages. The option can be changed + within a pattern by the (*TURKISH_CASING) start-of-pattern setting. Ei- + ther the UTF or UCP options must be set. In the 8-bit library, UTF must + be set. 
This option cannot be combined with PCRE2_EXTRA_CASELESS_RE- + STRICT. + JUST-IN-TIME (JIT) COMPILATION @@ -2255,6 +2410,7 @@ INFORMATION ABOUT A COMPILED PATTERN PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set + Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. @@ -3520,9 +3676,9 @@ CREATING A NEW STRING WITH SUBSTITUTIONS ORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (with- out, of course, writing anything) in order to compute the size of - buffer that is needed. This value is passed back via the outlengthptr - variable, with the result of the function still being PCRE2_ER- - ROR_NOMEMORY. + buffer that is needed, which will include the extra space for the ter- + minating NUL. This value is passed back via the outlengthptr variable, + with the result of the function still being PCRE2_ERROR_NOMEMORY. Passing a buffer size of zero is a permitted way of finding out how much memory is needed for given substitution. However, this does mean @@ -3541,24 +3697,32 @@ CREATING A NEW STRING WITH SUBSTITUTIONS cape character that can specify the insertion of characters from cap- ture groups and names from (*MARK) or other control verbs in the pat- tern. Dollar is the only escape character (backslash is treated as lit- - eral). The following forms are always recognized: + eral). 
The following forms are recognized: $$ insert a dollar character - $<n> or ${<n>} insert the contents of group <n> + $n or ${n} insert the contents of group n + $0 or $& insert the entire matched substring + $` insert the substring that precedes the match + $' insert the substring that follows the match + $_ insert the entire input string $*MARK or ${*MARK} insert a control verb name - Either a group number or a group name can be given for <n>. Curly - brackets are required only if the following character would be inter- - preted as part of the number or name. The number may be zero to include - the entire matched string. For example, if the pattern a(b)c is - matched with "=abc=" and the replacement string "+$1$0$1+", the result - is "=+babcb+=". + Either a group number or a group name can be given for n, for example + $2 or $NAME. Curly brackets are required only if the following charac- + ter would be interpreted as part of the number or name. The number may + be zero to include the entire matched string. For example, if the pat- + tern a(b)c is matched with "=abc=" and the replacement string + "+$1$0$1+", the result is "=+babcb+=". + + The JavaScript form $<name>, where the angle brackets are part of the + syntax, is also recognized for group names, but not for group numbers + or *MARK. - $*MARK inserts the name from the last encountered backtracking control - verb on the matching path that has a name. (*MARK) must always include - a name, but the other verbs need not. For example, in the case of + $*MARK inserts the name from the last encountered backtracking control + verb on the matching path that has a name. (*MARK) must always include + a name, but the other verbs need not. For example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B) - the relevant name is "B". This facility can be used to perform simple + the relevant name is "B".
This facility can be used to perform simple simultaneous substitutions, as this pcre2test example shows: /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} @@ -3566,15 +3730,15 @@ CREATING A NEW STRING WITH SUBSTITUTIONS 2: pear orange PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject - string, replacing every matching substring. If this option is not set, - only the first matching substring is replaced. The search for matches - takes place in the original subject string (that is, previous replace- - ments do not affect it). Iteration is implemented by advancing the - startoffset value for each search, which is always passed the entire + string, replacing every matching substring. If this option is not set, + only the first matching substring is replaced. The search for matches + takes place in the original subject string (that is, previous replace- + ments do not affect it). Iteration is implemented by advancing the + startoffset value for each search, which is always passed the entire subject string. If an offset limit is set in the match context, search- ing stops when that limit is reached. - You can restrict the effect of a global substitution to a portion of + You can restrict the effect of a global substitution to a portion of the subject string by setting either or both of startoffset and an off- set limit. Here is a pcre2test example: @@ -3582,73 +3746,95 @@ CREATING A NEW STRING WITH SUBSTITUTIONS ABC ABC ABC ABC\=offset=3,offset_limit=12 2: ABC A!C A!C ABC - When continuing with global substitutions after matching a substring + When continuing with global substitutions after matching a substring with zero length, an attempt to find a non-empty match at the same off- set is performed. If this is not successful, the offset is advanced by one character except when CRLF is a valid newline sequence and the next - two characters are CR, LF. In this case, the offset is advanced by two + two characters are CR, LF. 
In this case, the offset is advanced by two characters. PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do not appear in the pattern to be treated as unset groups. This option - should be used with care, because it means that a typo in a group name + should be used with care, because it means that a typo in a group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING error. PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (including un- - known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated - as empty strings when inserted as described above. If this option is + known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated + as empty strings when inserted as described above. If this option is not set, an attempt to insert an unset group causes the PCRE2_ERROR_UN- - SET error. This option does not influence the extended substitution + SET error. This option does not influence the extended substitution syntax described below. - PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the - replacement string. Without this option, only the dollar character is - special, and only the group insertion forms listed above are valid. - When PCRE2_SUBSTITUTE_EXTENDED is set, two things change: + PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the + replacement string. Without this option, only the dollar character is + special, and only the group insertion forms listed above are valid. + When PCRE2_SUBSTITUTE_EXTENDED is set, several things change: + + Firstly, backslash in a replacement string is interpreted as an escape + character. The usual forms such as \x{ddd} can be used to specify par- + ticular character codes, and backslash followed by any non-alphanumeric + character quotes that character. Extended quoting can be coded using + \Q...\E, exactly as in pattern strings. The escapes \b and \v are in- + terpreted as the characters backspace and vertical tab, respectively. 
- Firstly, backslash in a replacement string is interpreted as an escape - character. The usual forms such as \n or \x{ddd} can be used to specify - particular character codes, and backslash followed by any non-alphanu- - meric character quotes that character. Extended quoting can be coded - using \Q...\E, exactly as in pattern strings. + The interpretation of backslash followed by one or more digits is the + same as in a pattern, which in Perl has some ambiguities. Details are + given in the pcre2pattern page. + + The Python form \g<n>, where the angle brackets are part of the syntax + and n is either a group name or number, is recognized as an alternative + way of inserting the contents of a group, for example \g<3>. There are also four escape sequences for forcing the case of inserted - letters. The insertion mechanism has three states: no case forcing, - force upper case, and force lower case. The escape sequences change the - current state: \U and \L change to upper or lower case forcing, respec- - tively, and \E (when not terminating a \Q quoted sequence) reverts to - no case forcing. The sequences \u and \l force the next character (if - it is a letter) to upper or lower case, respectively, and then the - state automatically reverts to no case forcing. Case forcing applies to - all inserted characters, including those from capture groups and let- - ters within \Q...\E quoted sequences. If either PCRE2_UTF or PCRE2_UCP - was set when the pattern was compiled, Unicode properties are used for - case forcing characters whose code points are greater than 127. + letters. Case forcing applies to all inserted characters, including + those from capture groups and letters within \Q...\E quoted sequences. + The insertion mechanism has three states: no case forcing, force upper + case, and force lower case.
The escape sequences change the current + state: \U and \L change to upper or lower case forcing, respectively, + and \E (when not terminating a \Q quoted sequence) reverts to no case + forcing. The sequences \u and \l force the next character (if it is a + letter) to upper or lower case, respectively, and then the state auto- + matically reverts to no case forcing. + + However, if \u is immediately followed by \L or \l is immediately fol- + lowed by \U, the next character's case is forced by the first escape + sequence, and subsequent characters by the second. This provides a "ti- + tle casing" facility that can be applied to group captures. For exam- + ple, if group 1 has captured "heLLo", the replacement string "\u\L$1" + becomes "Hello". + + If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, + Unicode properties are used for case forcing characters whose code + points are greater than 127. However, only simple case folding, as de- + termined by the Unicode file CaseFolding.txt is supported. PCRE2 does + not support language-specific special casing rules such as using dif- + ferent lower case Greek sigmas in the middle and ends of words (as de- + fined in the Unicode file SpecialCasing.txt). Note that case forcing sequences such as \U...\E do not nest. For exam- ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EX- TRA_ALT_BSUX options do not apply to replacement strings. - The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more + The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash: - ${:-} - ${:+:} + ${n:-string} + ${n:+string1:string2} - As before, may be a group number or a name. The first form speci- - fies a default value. If group is set, its value is inserted; if - not, is expanded and the result inserted. 
The second form - specifies strings that are expanded and inserted when group is set - or unset, respectively. The first form is just a convenient shorthand - for + As in the simple case, n may be a group number or a name. The first + form specifies a default value. If group n is set, its value is in- + serted; if not, the string is expanded and the result inserted. The + second form specifies strings that are expanded and inserted when group + n is set or unset, respectively. The first form is just a convenient + shorthand for - ${:+${}:} + ${n:+${n}:string} - Backslash can be used to escape colons and closing curly brackets in - the replacement strings. A change of the case forcing state within a - replacement string remains in force afterwards, as shown in this + Backslash can be used to escape colons and closing curly brackets in + the replacement strings. A change of the case forcing state within a + replacement string remains in force afterwards, as shown in this pcre2test example: /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo @@ -3657,8 +3843,8 @@ CREATING A NEW STRING WITH SUBSTITUTIONS somebody 1: HELLO - The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended - substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- + The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended + substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- known groups in the extended syntax forms to be treated as unset. If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, @@ -3667,39 +3853,39 @@ CREATING A NEW STRING WITH SUBSTITUTIONS Substitution errors - In the event of an error, pcre2_substitute() returns a negative error - code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors + In the event of an error, pcre2_substitute() returns a negative error + code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from pcre2_match() are passed straight back. 
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser- tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ- - ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) - when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- + ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) + when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- SET_EMPTY is not set. - PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big + PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size - of buffer that is needed is returned via outlengthptr. Note that this + of buffer that is needed is returned via outlengthptr. Note that this does not happen by default. PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the - match_data argument is NULL or if the subject or replacement arguments - are NULL. For backward compatibility reasons an exception is made for + match_data argument is NULL or if the subject or replacement arguments + are NULL. For backward compatibility reasons an exception is made for the replacement argument if the rlength argument is also 0. 
- PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in - the replacement string, with more particular errors being PCRE2_ER- + PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in + the replacement string, with more particular errors being PCRE2_ER- ROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE - (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax - error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN + (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax + error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started or the match started earlier - than the current position in the subject, which can happen if \K is + than the current position in the subject, which can happen if \K is used in an assertion). As for all PCRE2 errors, a text message that describes the error can be - obtained by calling the pcre2_get_error_message() function (see "Ob- + obtained by calling the pcre2_get_error_message() function (see "Ob- taining a textual error message" above). Substitution callouts @@ -3708,12 +3894,20 @@ CREATING A NEW STRING WITH SUBSTITUTIONS int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); - The pcre2_set_substitution_callout() function can be used to specify a - callout function for pcre2_substitute(). This information is passed in + The pcre2_set_substitution_callout() function can be used to specify a + callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution - has been processed, but it can cause the replacement not to happen. The - callout function is not called for simulated substitutions that happen - as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. + has been processed, but it can cause the replacement not to happen. 
+ + The callout function is not called for simulated substitutions that + happen as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In + this mode, when substitution processing exceeds the buffer space pro- + vided by the caller, processing continues by counting code units. The + simulation is unable to populate the callout block, and so the simula- + tion is pessimistic about the required buffer size. Whichever is larger + of accepted or rejected substitution is reported as the required size. + Therefore, the returned buffer length may be an overestimate (without a + substitution callout, it is normally an exact measurement). The first argument of the callout function is a pointer to a substitute callout block structure, which contains the following fields, not nec- @@ -3757,62 +3951,149 @@ CREATING A NEW STRING WITH SUBSTITUTIONS to the output and the call to pcre2_substitute() exits, returning the number of matches so far. + Substitution case callouts + + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + + The pcre2_set_substitute_case_callout() function can be used to spec- + ify a callout function for pcre2_substitute() to use when performing + case transformations. This does not affect any case insensitivity be- + haviour when performing a match, but only the user-visible transforma- + tions performed when processing a substitution such as: + + pcre2_substitute(..., "\\U$1", ...) + + The default case transformations applied by PCRE2 are reasonably com- + plete, and, in UTF or UCP mode, perform the simple locale-invariant + case transformations as specified by Unicode. This is suitable for the + internal (invisible) case-equivalence procedures used during pattern + matching, but an application may wish to use more sophisticated locale- + aware processing for the user-visible substitution transformations.
+ + One example implementation of the callout_function using the ICU li- + brary would be: + + PCRE2_SIZE + icu_case_callout( + PCRE2_SPTR input, PCRE2_SIZE input_len, + PCRE2_UCHAR *output, PCRE2_SIZE output_cap, + int to_case, void *data_ptr) + { + UErrorCode err = U_ZERO_ERROR; + int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER + ? u_strToLower(output, output_cap, input, input_len, NULL, &err) + : to_case == PCRE2_SUBSTITUTE_CASE_UPPER + ? u_strToUpper(output, output_cap, input, input_len, NULL, &err) + : u_strToTitle(output, output_cap, input, input_len, &first_char_only, + NULL, &err); + if (U_FAILURE(err)) return (~(PCRE2_SIZE)0); + return r; + } + + The first and second arguments of the case callout function are the + Unicode string to transform. + + The third and fourth arguments are the output buffer and its capacity. + + The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, + PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. + PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed + to the callout to indicate that the case of the entire callout input + should be case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed + to indicate that only the first character or glyph should be trans- + formed to Unicode titlecase and the rest to Unicode lowercase (note + that titlecasing sometimes uses Unicode properties to titlecase each + word in a string; but PCRE2 is requesting that only the single leading + character is to be titlecased). + + The sixth argument is the callout_data supplied to pcre2_set_substi- + tute_case_callout(). + + The resulting string in the destination buffer may be larger or smaller + than the input, if the casing rules merge or split characters. The re- + turn value is the length required for the output string. If a buffer of + sufficient size was provided to the callout, then the result must be + written to the buffer and the number of code units returned. 
If the re- + sult does not fit in the provided buffer, then the required capacity + must be returned and PCRE2 will not make use of the output buffer. + PCRE2 provides input and output buffers which overlap, so the callout + must support this by suitable internal buffering. + + Alternatively, if the callout wishes to indicate an error, then it may + return (~(PCRE2_SIZE)0). In this case pcre2_substitute() will immedi- + ately fail with error PCRE2_ERROR_REPLACECASE. + + When a case callout is combined with the PCRE2_SUBSTITUTE_OVER- + FLOW_LENGTH option, there are situations when pcre2_substitute() will + return an underestimate of the required buffer size. If you call + pcre2_substitute() once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the + input buffer is too small for the replacement string to be constructed, + then instead of calling the case callout, pcre2_substitute() will make + an estimate of the required buffer size. The second call should also + pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that second call is not + guaranteed to succeed either, if the case callout requires more buffer + space than expected. The caller must make repeated attempts in a loop. + DUPLICATE CAPTURE GROUP NAMES int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); - When a pattern is compiled with the PCRE2_DUPNAMES option, names for - capture groups are not required to be unique. Duplicate names are al- - ways allowed for groups with the same number, created by using the (?| + When a pattern is compiled with the PCRE2_DUPNAMES option, names for + capture groups are not required to be unique. Duplicate names are al- + ways allowed for groups with the same number, created by using the (?| feature. Indeed, if such groups are named, they are required to use the same names. - Normally, patterns that use duplicate names are such that in any one - match, only one of each set of identically-named groups participates. 
+ Normally, patterns that use duplicate names are such that in any one + match, only one of each set of identically-named groups participates. An example is shown in the pcre2pattern documentation. - When duplicates are present, pcre2_substring_copy_byname() and - pcre2_substring_get_byname() return the first substring corresponding - to the given name that is set. Only if none are set is PCRE2_ERROR_UN- - SET is returned. The pcre2_substring_number_from_name() function re- - turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate + When duplicates are present, pcre2_substring_copy_byname() and + pcre2_substring_get_byname() return the first substring corresponding + to the given name that is set. Only if none are set is PCRE2_ERROR_UN- + SET returned. The pcre2_substring_number_from_name() function re- + turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names. - If you want to get full details of all captured substrings for a given - name, you must use the pcre2_substring_nametable_scan() function. The - first argument is the compiled pattern, and the second is the name. If - the third and fourth arguments are NULL, the function returns a group + If you want to get full details of all captured substrings for a given + name, you must use the pcre2_substring_nametable_scan() function. The + first argument is the compiled pattern, and the second is the name. If + the third and fourth arguments are NULL, the function returns a group number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise. When the third and fourth arguments are not NULL, they must be pointers - to variables that are updated by the function. After it has run, they + to variables that are updated by the function. After it has run, they point to the first and last entries in the name-to-number table for the - given name, and the function returns the length of each entry in code - units.
In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are + given name, and the function returns the length of each entry in code + units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. The format of the name table is described above in the section entitled - Information about a pattern. Given all the relevant entries for the - name, you can extract each of their numbers, and hence the captured + Information about a pattern. Given all the relevant entries for the + name, you can extract each of their numbers, and hence the captured data. FINDING ALL POSSIBLE MATCHES AT ONE POSITION - The traditional matching function uses a similar algorithm to Perl, - which stops when it finds the first match at a given point in the sub- + The traditional matching function uses a similar algorithm to Perl, + which stops when it finds the first match at a given point in the sub- ject. If you want to find all possible matches, or the longest possible - match at a given position, consider using the alternative matching - function (see below) instead. If you cannot use the alternative func- + match at a given position, consider using the alternative matching + function (see below) instead. If you cannot use the alternative func- tion, you can kludge it up by making use of the callout facility, which is described in the pcre2callout documentation. What you have to do is to insert a callout right at the end of the pat- - tern. When your callout function is called, extract and save the cur- - rent matched substring. Then return 1, which forces pcre2_match() to - backtrack and try other alternatives. Ultimately, when it runs out of + tern. When your callout function is called, extract and save the cur- + rent matched substring. Then return 1, which forces pcre2_match() to + backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH. 
@@ -3824,27 +4105,27 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount); - The function pcre2_dfa_match() is called to match a subject string - against a compiled pattern, using a matching algorithm that scans the + The function pcre2_dfa_match() is called to match a subject string + against a compiled pattern, using a matching algorithm that scans the subject string just once (not counting lookaround assertions), and does - not backtrack (except when processing lookaround assertions). This has - different characteristics to the normal algorithm, and is not compati- - ble with Perl. Some of the features of PCRE2 patterns are not sup- + not backtrack (except when processing lookaround assertions). This has + different characteristics to the normal algorithm, and is not compati- + ble with Perl. Some of the features of PCRE2 patterns are not sup- ported. Nevertheless, there are times when this kind of matching can be - useful. For a discussion of the two matching algorithms, and a list of + useful. For a discussion of the two matching algorithms, and a list of features that pcre2_dfa_match() does not support, see the pcre2matching documentation. - The arguments for the pcre2_dfa_match() function are the same as for + The arguments for the pcre2_dfa_match() function are the same as for pcre2_match(), plus two extras. The ovector within the match data block is used in a different way, and this is described below. The other com- - mon arguments are used in the same way as for pcre2_match(), so their + mon arguments are used in the same way as for pcre2_match(), so their description is not repeated here. - The two additional arguments provide workspace for the function. The - workspace vector should contain at least 20 elements. It is used for - keeping track of multiple paths through the pattern tree. 
More work- - space is needed for patterns and subjects where there are a lot of po- + The two additional arguments provide workspace for the function. The + workspace vector should contain at least 20 elements. It is used for + keeping track of multiple paths through the pattern tree. More work- + space is needed for patterns and subjects where there are a lot of po- tential matches. Here is an example of a simple call to pcre2_dfa_match(): @@ -3864,45 +4145,45 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION Option bits for pcre2_dfa_match() - The unused bits of the options argument for pcre2_dfa_match() must be - zero. The only bits that may be set are PCRE2_ANCHORED, - PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NO- + The unused bits of the options argument for pcre2_dfa_match() must be + zero. The only bits that may be set are PCRE2_ANCHORED, + PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NO- TEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, - PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and - PCRE2_DFA_RESTART. All but the last four of these are exactly the same + PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and + PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for pcre2_match(), so their description is not repeated here. PCRE2_PARTIAL_HARD PCRE2_PARTIAL_SOFT - These have the same general effect as they do for pcre2_match(), but - the details are slightly different. When PCRE2_PARTIAL_HARD is set for - pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the + These have the same general effect as they do for pcre2_match(), but + the details are slightly different. When PCRE2_PARTIAL_HARD is set for + pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the subject is reached and there is still at least one matching possibility that requires additional characters. 
This happens even if some complete - matches have already been found. When PCRE2_PARTIAL_SOFT is set, the - return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL - if the end of the subject is reached, there have been no complete + matches have already been found. When PCRE2_PARTIAL_SOFT is set, the + return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL + if the end of the subject is reached, there have been no complete matches, but there is still at least one matching possibility. The por- - tion of the string that was inspected when the longest partial match + tion of the string that was inspected when the longest partial match was found is set as the first matching string in both cases. There is a - more detailed discussion of partial and multi-segment matching, with + more detailed discussion of partial and multi-segment matching, with examples, in the pcre2partial documentation. PCRE2_DFA_SHORTEST - Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to + Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as soon as it has found one match. Because of the way the alterna- - tive algorithm works, this is necessarily the shortest possible match + tive algorithm works, this is necessarily the shortest possible match at the first possible matching point in the subject string. PCRE2_DFA_RESTART - When pcre2_dfa_match() returns a partial match, it is possible to call + When pcre2_dfa_match() returns a partial match, it is possible to call it again, with additional subject characters, and have it continue with the same match. The PCRE2_DFA_RESTART option requests this action; when - it is set, the workspace and wscount options must reference the same - vector as before because data about the match so far is left in them + it is set, the workspace and wscount options must reference the same + vector as before because data about the match so far is left in them after a partial match. 
There is more discussion of this facility in the pcre2partial documentation. @@ -3910,8 +4191,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION When pcre2_dfa_match() succeeds, it may have matched more than one sub- string in the subject. Note, however, that all the matches from one run - of the function start at the same point in the subject. The shorter - matches are all initial substrings of the longer matches. For example, + of the function start at the same point in the subject. The shorter + matches are all initial substrings of the longer matches. For example, if the pattern <.*> @@ -3926,80 +4207,80 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION - On success, the yield of the function is a number greater than zero, - which is the number of matched substrings. The offsets of the sub- - strings are returned in the ovector, and can be extracted by number in - the same way as for pcre2_match(), but the numbers bear no relation to - any capture groups that may exist in the pattern, because DFA matching + On success, the yield of the function is a number greater than zero, + which is the number of matched substrings. The offsets of the sub- + strings are returned in the ovector, and can be extracted by number in + the same way as for pcre2_match(), but the numbers bear no relation to + any capture groups that may exist in the pattern, because DFA matching does not support capturing. - Calls to the convenience functions that extract substrings by name re- + Calls to the convenience functions that extract substrings by name re- turn the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used af- - ter a DFA match. The convenience functions that extract substrings by + ter a DFA match. The convenience functions that extract substrings by number never return PCRE2_ERROR_NOSUBSTRING. - The matched strings are stored in the ovector in reverse order of - length; that is, the longest matching string is first. 
If there were - too many matches to fit into the ovector, the yield of the function is + The matched strings are stored in the ovector in reverse order of + length; that is, the longest matching string is first. If there were + too many matches to fit into the ovector, the yield of the function is zero, and the vector is filled with the longest matches. - NOTE: PCRE2's "auto-possessification" optimization usually applies to - character repeats at the end of a pattern (as well as internally). For - example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA - matching, this means that only one possible match is found. If you re- + NOTE: PCRE2's "auto-possessification" optimization usually applies to + character repeats at the end of a pattern (as well as internally). For + example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA + matching, this means that only one possible match is found. If you re- ally do want multiple matches in such cases, either use an ungreedy re- - peat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when com- + peat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when com- piling. Error returns from pcre2_dfa_match() The pcre2_dfa_match() function returns a negative number when it fails. - Many of the errors are the same as for pcre2_match(), as described + Many of the errors are the same as for pcre2_match(), as described above. There are in addition the following errors that are specific to pcre2_dfa_match(): PCRE2_ERROR_DFA_UITEM - This return is given if pcre2_dfa_match() encounters an item in the - pattern that it does not support, for instance, the use of \C in a UTF + This return is given if pcre2_dfa_match() encounters an item in the + pattern that it does not support, for instance, the use of \C in a UTF mode or a backreference. 
PCRE2_ERROR_DFA_UCOND - This return is given if pcre2_dfa_match() encounters a condition item + This return is given if pcre2_dfa_match() encounters a condition item that uses a backreference for the condition, or a test for recursion in a specific capture group. These are not supported. PCRE2_ERROR_DFA_UINVALID_UTF - This return is given if pcre2_dfa_match() is called for a pattern that - was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for + This return is given if pcre2_dfa_match() is called for a pattern that + was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for DFA matching. PCRE2_ERROR_DFA_WSSIZE - This return is given if pcre2_dfa_match() runs out of space in the + This return is given if pcre2_dfa_match() runs out of space in the workspace vector. PCRE2_ERROR_DFA_RECURSE When a recursion or subroutine call is processed, the matching function - calls itself recursively, using private memory for the ovector and - workspace. This error is given if the internal ovector is not large - enough. This should be extremely rare, as a vector of size 1000 is + calls itself recursively, using private memory for the ovector and + workspace. This error is given if the internal ovector is not large + enough. This should be extremely rare, as a vector of size 1000 is used. PCRE2_ERROR_DFA_BADRESTART - When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, - some plausibility checks are made on the contents of the workspace, - which should contain data about the previous partial match. If any of + When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, + some plausibility checks are made on the contents of the workspace, + which should contain data about the previous partial match. If any of these checks fail, this error is given. 
SEE ALSO - pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), + pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3). @@ -4012,15 +4293,14 @@ AUTHOR REVISION - Last updated: 24 April 2024 + Last updated: 26 December 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 24 April 2024 PCRE2API(3) +PCRE2 10.45 26 December 2024 PCRE2API(3) ------------------------------------------------------------------------------ - PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3) @@ -4639,15 +4919,14 @@ AUTHOR REVISION - Last updated: 15 April 2024 + Last updated: 16 April 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 15 April 2024 PCRE2BUILD(3) +PCRE2 10.45 16 April 2024 PCRE2BUILD(3) ------------------------------------------------------------------------------ - PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3) @@ -5077,11 +5356,10 @@ REVISION Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 19 January 2024 PCRE2CALLOUT(3) +PCRE2 10.45 19 January 2024 PCRE2CALLOUT(3) ------------------------------------------------------------------------------ - PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3) @@ -5140,7 +5418,7 @@ DIFFERENCES BETWEEN PCRE2 AND PERL 7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties - such as Lu and Nd, the derived properties Any and LC (synonym L&), + such as Lu and Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) prop- erty, but in PCRE2 its use is limited. 
See the pcre2pattern documenta- @@ -5167,118 +5445,128 @@ DIFFERENCES BETWEEN PCRE2 AND PERL \Q\\E \ \\E The \Q...\E sequence is recognized both inside and outside character - classes by both PCRE2 and Perl. - - 9. Fairly obviously, PCRE2 does not support the (?{code}) and + classes by both PCRE2 and Perl. Another difference from Perl is that + any appearance of \Q or \E inside what might otherwise be a quantifier + causes PCRE2 not to recognize the sequence as a quantifier. Perl recog- + nizes a quantifier if (redundantly) either of the numbers is inside + \Q...\E, but not if the separating comma is. When not recognized as a + quantifier a sequence such as {\Q1\E,2} is treated as the literal + string "{1,2}". + + 9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. See the pcre2callout documentation for details. - 10. Subroutine calls (whether recursive or not) were treated as atomic - groups up to PCRE2 release 10.23, but from release 10.30 this changed, + 10. Subroutine calls (whether recursive or not) were treated as atomic + groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl. - 11. In PCRE2, if any of the backtracking control verbs are used in a - group that is called as a subroutine (whether or not recursively), - their effect is confined to that group; it does not extend to the sur- - rounding pattern. This is not always the case in Perl. In particular, - if (*THEN) is present in a group that is called as a subroutine, its + 11. In PCRE2, if any of the backtracking control verbs are used in a + group that is called as a subroutine (whether or not recursively), + their effect is confined to that group; it does not extend to the sur- + rounding pattern. This is not always the case in Perl. 
In particular, + if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any - | characters. Note that such groups are processed as anchored at the - point where they are tested. - - 12. If a pattern contains more than one backtracking control verb, the - first one that is backtracked onto acts. For example, in the pattern - A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure + | characters. Note that such groups are processed as anchored at the + point where they are tested. PCRE2 also confines all control verbs + within atomic assertions, again including (*THEN) in assertions with + only one branch. + + 12. If a pattern contains more than one backtracking control verb, the + first one that is backtracked onto acts. For example, in the pattern + A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs. - 13. There are some differences that are concerned with the settings of - captured strings when part of a pattern is repeated. For example, - matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- + 13. There are some differences that are concerned with the settings of + captured strings when part of a pattern is repeated. For example, + matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- set, but in PCRE2 it is set to "b". - 14. PCRE2's handling of duplicate capture group numbers and names is - not as general as Perl's. This is a consequence of the fact the PCRE2 - works internally just with numbers, using an external table to trans- - late between numbers and names. In particular, a pattern such as - (?|(?A)|(?B)), where the two capture groups have the same number - but different names, is not supported, and causes an error at compile + 14. 
PCRE2's handling of duplicate capture group numbers and names is + not as general as Perl's. This is a consequence of the fact that PCRE2 + works internally just with numbers, using an external table to trans- + late between numbers and names. In particular, a pattern such as + (?|(?<a>A)|(?<b>B)), where the two capture groups have the same number + but different names, is not supported, and causes an error at compile time. If it were allowed, it would not be possible to distinguish which - group matched, because both names map to capture group number 1. To + group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time. 15. Perl used to recognize comments in some places that PCRE2 does not, - for example, between the ( and ? at the start of a group. If the /x - modifier is set, Perl allowed white space between ( and ? though the - latest Perls give an error (for a while it was just deprecated). There + for example, between the ( and ? at the start of a group. If the /x + modifier is set, Perl allowed white space between ( and ? though the + latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently. - 16. Perl, when in warning mode, gives warnings for character classes - such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- + 16. Perl, when in warning mode, gives warnings for character classes + such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- als. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes. - 17. In PCRE2, the upper/lower case character properties Lu and Ll are - not affected when case-independent matching is specified. For example, - \p{Lu} always matches an upper case letter.
I think Perl has changed in - this respect; in the release at the time of writing (5.38), \p{Lu} and - \p{Ll} match all letters, regardless of case, when case independence is - specified. + 17. In PCRE2, until release 10.45, the upper/lower case character prop- + erties Lu and Ll were not affected when case-independent matching was + specified. Perl has changed in this respect, and PCRE2 has now changed + to match. When caseless matching is in force, Lu, Ll, and Lt (title + case) are all treated as Lc (cased letter). 18. From release 5.32.0, Perl locks out the use of \K in lookaround as- - sertions. From release 10.38 PCRE2 does the same by default. However, - there is an option for re-enabling the previous behaviour. When this - option is set, \K is acted on when it occurs in positive assertions, + sertions. From release 10.38 PCRE2 does the same by default. However, + there is an option for re-enabling the previous behaviour. When this + option is set, \K is acted on when it occurs in positive assertions, but is ignored in negative assertions. - 19. PCRE2 provides some extensions to the Perl regular expression fa- - cilities. Perl 5.10 included new features that were not in earlier - versions of Perl, some of which (such as named parentheses) were in + 19. PCRE2 provides some extensions to the Perl regular expression fa- + cilities. Perl 5.10 included new features that were not in earlier + versions of Perl, some of which (such as named parentheses) were in PCRE2 for some time before. This list is with respect to Perl 5.38: - (a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the + (a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ meta-character matches only at the very end of the string. - (b) A backslash followed by a letter with no special meaning is + (b) A backslash followed by a letter with no special meaning is faulted. (Perl can be made to issue a warning.) 
- (c) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- + (c) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- fiers is inverted, that is, by default they are not greedy, but if fol- lowed by a question mark they are. - (d) PCRE2_ANCHORED can be used at matching time to force a pattern to + (d) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string. - (e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and + (e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART options have no Perl equivalents. - (f) The \R escape sequence can be restricted to match only CR, LF, or + (f) The \R escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE2_BSR_ANYCRLF option. - (g) The callout facility is PCRE2-specific. Perl supports codeblocks + (g) The callout facility is PCRE2-specific. Perl supports codeblocks and variable interpolation, but not general hooks on every match. (h) The partial matching facility is PCRE2-specific. - (i) The alternative matching function (pcre2_dfa_match() matches in a + (i) The alternative matching function (pcre2_dfa_match()) matches in a different way and is not Perl-compatible. - (j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) - at the start of a pattern. These set overall options that cannot be + (j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) + at the start of a pattern. These set overall options that cannot be changed within the pattern. - (k) PCRE2 supports non-atomic positive lookaround assertions. This is + (k) PCRE2 supports non-atomic positive lookaround assertions. This is an extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic.
- (l) There are three syntactical items in patterns that can refer to a - capturing group by number: back references such as \g{2}, subroutine - calls such as (?3), and condition references such as (?(4)...). PCRE2 - supports relative group numbers such as +2 and -4 in all three cases. - Perl supports both plus and minus for subroutine calls, but only minus + (l) There are three syntactical items in patterns that can refer to a + capturing group by number: back references such as \g{2}, subroutine + calls such as (?3), and condition references such as (?(4)...). PCRE2 + supports relative group numbers such as +2 and -4 in all three cases. + Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions. + (m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 ex- + tension that is not available in Perl. + 20. Perl has different limits than PCRE2. See the pcre2limit documenta- tion for details. Perl went with 5.10 from recursion to iteration keep- ing the intermediate matches on the heap, which is ~10% slower but does @@ -5297,6 +5585,17 @@ DIFFERENCES BETWEEN PCRE2 AND PERL ple is /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject. + 23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl + tries to recover and prints a warning if the problem was that an in- + valid hexadecimal digit was found; since PCRE2 doesn't have warnings, it + returns an error instead. Additionally, Perl accepts \x{} and gener- + ates NUL, unlike PCRE2. + + 24. From release 10.45, PCRE2 gives an error if \x is not followed by a + hexadecimal digit or a curly bracket. It used to interpret this as the + NUL character. Perl still generates NUL, but warns when in warning mode + in most cases. + AUTHOR @@ -5307,15 +5606,14 @@ AUTHOR REVISION - Last updated: 30 November 2023 - Copyright (c) 1997-2023 University of Cambridge.
+ Last updated: 02 October 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 30 November 2023 PCRE2COMPAT(3) +PCRE2 10.45 02 October 2024 PCRE2COMPAT(3) ------------------------------------------------------------------------------ - PCRE2JIT(3) Library Functions Manual PCRE2JIT(3) @@ -5359,146 +5657,155 @@ AVAILABILITY OF JIT SUPPORT If --enable-jit is set on an unsupported platform, compilation fails. - A client program can tell if JIT support is available by calling + A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular match. One reason for this is that there are a number of op- tions and pattern items that are not supported by JIT (see below). An- - other reason is that in some environments JIT is unable to get memory - in which to build its compiled code. The only guarantee from pcre2_con- - fig() is that if it returns zero, JIT will definitely not be used. - - A simple program does not need to check availability in order to use - JIT when possible. The API is implemented in a way that falls back to - the interpretive code if JIT is not available or cannot be used for a - given match. For programs that need the best possible performance, + other reason is that in some environments JIT is unable to get exe- + cutable memory in which to build its compiled code. The only guarantee + from pcre2_config() is that if it returns zero, JIT will definitely not + be used. + + As of release 10.45 there is a more informative way to test for JIT + support. If pcre2_jit_compile() is called with the single option + PCRE2_JIT_TEST_ALLOC it returns zero if JIT is available and has a + working allocator.
Otherwise it returns PCRE2_ERROR_NOMEMORY if JIT is + available but cannot allocate executable memory, or PCRE2_ERROR_JIT_UN- + SUPPORTED if JIT support is not compiled. The code argument is ignored, + so it can be a NULL value. + + A simple program does not need to check availability in order to use + JIT when possible. The API is implemented in a way that falls back to + the interpretive code if JIT is not available or cannot be used for a + given match. For programs that need the best possible performance, there is a "fast path" API that is JIT-specific. SIMPLE USE OF JIT - To make use of the JIT support in the simplest way, all you have to do - is to call pcre2_jit_compile() after successfully compiling a pattern + To make use of the JIT support in the simplest way, all you have to do + is to call pcre2_jit_compile() after successfully compiling a pattern with pcre2_compile(). This function has two arguments: the first is the - compiled pattern pointer that was returned by pcre2_compile(), and the - second is zero or more of the following option bits: PCRE2_JIT_COM- + compiled pattern pointer that was returned by pcre2_compile(), and the + second is zero or more of the following option bits: PCRE2_JIT_COM- PLETE, PCRE2_JIT_PARTIAL_HARD, or PCRE2_JIT_PARTIAL_SOFT. - If JIT support is not available, a call to pcre2_jit_compile() does - nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled + If JIT support is not available, a call to pcre2_jit_compile() does + nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled pattern is passed to the JIT compiler, which turns it into machine code that executes much faster than the normal interpretive code, but yields - exactly the same results. The returned value from pcre2_jit_compile() + exactly the same results. The returned value from pcre2_jit_compile() is zero on success, or a negative error code. 
- There is a limit to the size of pattern that JIT supports, imposed by - the size of machine stack that it uses. The exact rules are not docu- + There is a limit to the size of pattern that JIT supports, imposed by + the size of machine stack that it uses. The exact rules are not docu- mented because they may change at any time, in particular, when new op- - timizations are introduced. If a pattern is too big, a call to + timizations are introduced. If a pattern is too big, a call to pcre2_jit_compile() returns PCRE2_ERROR_NOMEMORY. - PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for com- - plete matches. If you want to run partial matches using the PCRE2_PAR- - TIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should - set one or both of the other options as well as, or instead of + PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for com- + plete matches. If you want to run partial matches using the PCRE2_PAR- + TIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should + set one or both of the other options as well as, or instead of PCRE2_JIT_COMPLETE. The JIT compiler generates different optimized code - for each of the three modes (normal, soft partial, hard partial). When - pcre2_match() is called, the appropriate code is run if it is avail- + for each of the three modes (normal, soft partial, hard partial). When + pcre2_match() is called, the appropriate code is run if it is avail- able. Otherwise, the pattern is matched using interpretive code. - You can call pcre2_jit_compile() multiple times for the same compiled - pattern. It does nothing if it has previously compiled code for any of - the option bits. For example, you can call it once with PCRE2_JIT_COM- - PLETE and (perhaps later, when you find you need partial matching) - again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it + You can call pcre2_jit_compile() multiple times for the same compiled + pattern. 
It does nothing if it has previously compiled code for any of + the option bits. For example, you can call it once with PCRE2_JIT_COM- + PLETE and (perhaps later, when you find you need partial matching) + again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial match- ing. If pcre2_jit_compile() is called with no option bits set, it imme- diately returns zero. This is an alternative way of testing whether JIT - is available. + support has been compiled. - At present, it is not possible to free JIT compiled code except when + At present, it is not possible to free JIT compiled code except when the entire compiled pattern is freed by calling pcre2_code_free(). - In some circumstances you may need to call additional functions. These - are described in the section entitled "Controlling the JIT stack" be- + In some circumstances you may need to call additional functions. These + are described in the section entitled "Controlling the JIT stack" be- low. There are some pcre2_match() options that are not supported by JIT, and - there are also some pattern items that JIT cannot handle. Details are - given below. In both cases, matching automatically falls back to the - interpretive code. If you want to know whether JIT was actually used - for a particular match, you should arrange for a JIT callback function - to be set up as described in the section entitled "Controlling the JIT - stack" below, even if you do not need to supply a non-default JIT + there are also some pattern items that JIT cannot handle. Details are + given below. In both cases, matching automatically falls back to the + interpretive code. If you want to know whether JIT was actually used + for a particular match, you should arrange for a JIT callback function + to be set up as described in the section entitled "Controlling the JIT + stack" below, even if you do not need to supply a non-default JIT stack. 
Such a callback function is called whenever JIT code is about to - be obeyed. If the match-time options are not right for JIT execution, + be obeyed. If the match-time options are not right for JIT execution, the callback function is not obeyed. - If the JIT compiler finds an unsupported item, no JIT data is gener- + If the JIT compiler finds an unsupported item, no JIT data is gener- ated. You can find out if JIT compilation was successful for a compiled pattern by calling pcre2_pattern_info() with the PCRE2_INFO_JITSIZE op- - tion. A non-zero result means that JIT compilation was successful. A + tion. A non-zero result means that JIT compilation was successful. A result of 0 means that JIT support is not available, or the pattern was - not processed by pcre2_jit_compile(), or the JIT compiler was not able - to handle the pattern. Successful JIT compilation does not, however, - guarantee the use of JIT at match time because there are some match + not processed by pcre2_jit_compile(), or the JIT compiler was not able + to handle the pattern. Successful JIT compilation does not, however, + guarantee the use of JIT at match time because there are some match time options that are not supported by JIT. MATCHING SUBJECTS CONTAINING INVALID UTF - When a pattern is compiled with the PCRE2_UTF option, subject strings - are normally expected to be a valid sequence of UTF code units. By de- - fault, this is checked at the start of matching and an error is gener- - ated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be + When a pattern is compiled with the PCRE2_UTF option, subject strings + are normally expected to be a valid sequence of UTF code units. By de- + fault, this is checked at the start of matching and an error is gener- + ated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be passed to pcre2_match() to skip the check (for improved performance) if - you are sure that a subject string is valid. 
If this option is used - with an invalid string, the result is undefined. The calling program + you are sure that a subject string is valid. If this option is used + with an invalid string, the result is undefined. The calling program may crash or loop or otherwise misbehave. - However, a way of running matches on strings that may contain invalid - UTF sequences is available. Calling pcre2_compile() with the - PCRE2_MATCH_INVALID_UTF option has two effects: it tells the inter- - preter in pcre2_match() to support invalid UTF, and, if pcre2_jit_com- - pile() is subsequently called, the compiled JIT code also supports in- - valid UTF. Details of how this support works, in both the JIT and the + However, a way of running matches on strings that may contain invalid + UTF sequences is available. Calling pcre2_compile() with the + PCRE2_MATCH_INVALID_UTF option has two effects: it tells the inter- + preter in pcre2_match() to support invalid UTF, and, if pcre2_jit_com- + pile() is subsequently called, the compiled JIT code also supports in- + valid UTF. Details of how this support works, in both the JIT and the interpretive cases, is given in the pcre2unicode documentation. There is also an obsolete option for pcre2_jit_compile() called PCRE2_JIT_INVALID_UTF, which currently exists only for backward compat- - ibility. It is superseded by the pcre2_compile() option + ibility. It is superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF and should no longer be used. It may be removed in future. UNSUPPORTED OPTIONS AND PATTERN ITEMS - The pcre2_match() options that are supported for JIT matching are + The pcre2_match() options that are supported for JIT matching are PCRE2_COPY_MATCHED_SUBJECT, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, - PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and - PCRE2_PARTIAL_SOFT. 
The PCRE2_ANCHORED and PCRE2_ENDANCHORED options + PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and + PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED and PCRE2_ENDANCHORED options are not supported at match time. - If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the + If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the use of JIT, forcing matching by the interpreter code. - The only unsupported pattern items are \C (match a single data unit) - when running in a UTF mode, and a callout immediately before an asser- + The only unsupported pattern items are \C (match a single data unit) + when running in a UTF mode, and a callout immediately before an asser- tion condition in a conditional group. RETURN VALUES FROM JIT MATCHING - When a pattern is matched using JIT, the return values are the same as - those given by the interpretive pcre2_match() code, with the addition - of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the - memory used for the JIT stack was insufficient. See "Controlling the + When a pattern is matched using JIT, the return values are the same as + those given by the interpretive pcre2_match() code, with the addition + of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the + memory used for the JIT stack was insufficient. See "Controlling the JIT stack" below for a discussion of JIT stack usage. - The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if - searching a very large pattern tree goes on for too long, as it is in - the same circumstance when JIT is not used, but the details of exactly + The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if + searching a very large pattern tree goes on for too long, as it is in + the same circumstance when JIT is not used, but the details of exactly what is counted are not the same. The PCRE2_ERROR_DEPTHLIMIT error code is never returned when JIT matching is used. 
@@ -5506,25 +5813,25 @@ RETURN VALUES FROM JIT MATCHING CONTROLLING THE JIT STACK When the compiled JIT code runs, it needs a block of memory to use as a - stack. By default, it uses 32KiB on the machine stack. However, some - large or complicated patterns need more than this. The error PCRE2_ER- + stack. By default, it uses 32KiB on the machine stack. However, some + large or complicated patterns need more than this. The error PCRE2_ER- ROR_JIT_STACKLIMIT is given when there is not enough stack. Three func- tions are provided for managing blocks of memory for use as JIT stacks. - There is further discussion about the use of JIT stacks in the section + There is further discussion about the use of JIT stacks in the section entitled "JIT stack FAQ" below. - The pcre2_jit_stack_create() function creates a JIT stack. Its argu- - ments are a starting size, a maximum size, and a general context (for - memory allocation functions, or NULL for standard memory allocation). + The pcre2_jit_stack_create() function creates a JIT stack. Its argu- + ments are a starting size, a maximum size, and a general context (for + memory allocation functions, or NULL for standard memory allocation). It returns a pointer to an opaque structure of type pcre2_jit_stack, or - NULL if there is an error. The pcre2_jit_stack_free() function is used + NULL if there is an error. The pcre2_jit_stack_free() function is used to free a stack that is no longer needed. If its argument is NULL, this - function returns immediately, without doing anything. (For the techni- - cally minded: the address space is allocated by mmap or VirtualAlloc.) - A maximum stack size of 512KiB to 1MiB should be more than enough for + function returns immediately, without doing anything. (For the techni- + cally minded: the address space is allocated by mmap or VirtualAlloc.) + A maximum stack size of 512KiB to 1MiB should be more than enough for any pattern. 
- The pcre2_jit_stack_assign() function specifies which stack JIT code + The pcre2_jit_stack_assign() function specifies which stack JIT code should use. Its arguments are as follows: pcre2_match_context *mcontext @@ -5534,7 +5841,7 @@ CONTROLLING THE JIT STACK The first argument is a pointer to a match context. When this is subse- quently passed to a matching function, its information determines which JIT stack is used. If this argument is NULL, the function returns imme- - diately, without doing anything. There are three cases for the values + diately, without doing anything. There are three cases for the values of the other two options: (1) If callback is NULL and data is NULL, an internal 32KiB block @@ -5552,34 +5859,34 @@ CONTROLLING THE JIT STACK return value must be a valid JIT stack, the result of calling pcre2_jit_stack_create(). - A callback function is obeyed whenever JIT code is about to be run; it + A callback function is obeyed whenever JIT code is about to be run; it is not obeyed when pcre2_match() is called with options that are incom- - patible for JIT matching. A callback function can therefore be used to - determine whether a match operation was executed by JIT or by the in- + patible for JIT matching. A callback function can therefore be used to + determine whether a match operation was executed by JIT or by the in- terpreter. You may safely use the same JIT stack for more than one pattern (either - by assigning directly or by callback), as long as the patterns are + by assigning directly or by callback), as long as the patterns are matched sequentially in the same thread. 
Currently, the only way to set - up non-sequential matches in one thread is to use callouts: if a call- - out function starts another match, that match must use a different JIT + up non-sequential matches in one thread is to use callouts: if a call- + out function starts another match, that match must use a different JIT stack to the one used for currently suspended match(es). - In a multithread application, if you do not specify a JIT stack, or if - you assign or pass back NULL from a callback, that is thread-safe, be- - cause each thread has its own machine stack. However, if you assign or + In a multithread application, if you do not specify a JIT stack, or if + you assign or pass back NULL from a callback, that is thread-safe, be- + cause each thread has its own machine stack. However, if you assign or pass back a non-NULL JIT stack, this must be a different stack for each thread so that the application is thread-safe. - Strictly speaking, even more is allowed. You can assign the same non- - NULL stack to a match context that is used by any number of patterns, - as long as they are not used for matching by multiple threads at the - same time. For example, you could use the same stack in all compiled - patterns, with a global mutex in the callback to wait until the stack + Strictly speaking, even more is allowed. You can assign the same non- + NULL stack to a match context that is used by any number of patterns, + as long as they are not used for matching by multiple threads at the + same time. For example, you could use the same stack in all compiled + patterns, with a global mutex in the callback to wait until the stack is available for use. However, this is an inefficient solution, and not recommended. 
- This is a suggestion for how a multithreaded program that needs to set + This is a suggestion for how a multithreaded program that needs to set up non-default JIT stacks might operate: During thread initialization @@ -5591,7 +5898,7 @@ CONTROLLING THE JIT STACK Use a one-line callback function return thread_local_var - All the functions described in this section do nothing if JIT is not + All the functions described in this section do nothing if JIT is not available. @@ -5600,20 +5907,20 @@ JIT STACK FAQ (1) Why do we need JIT stacks? PCRE2 (and JIT) is a recursive, depth-first engine, so it needs a stack - where the local data of the current node is pushed before checking its + where the local data of the current node is pushed before checking its child nodes. Allocating real machine stack on some platforms is diffi- cult. For example, the stack chain needs to be updated every time if we - extend the stack on PowerPC. Although it is possible, its updating + extend the stack on PowerPC. Although it is possible, its updating time overhead decreases performance. So we do the recursion in memory. (2) Why don't we simply allocate blocks of memory with malloc()? - Modern operating systems have a nice feature: they can reserve an ad- + Modern operating systems have a nice feature: they can reserve an ad- dress space instead of allocating memory. We can safely allocate memory pages inside this address space, so the stack could grow without moving - memory data (this is important because of pointers). Thus we can allo- - cate 1MiB address space, and use only a single memory page (usually - 4KiB) if that is enough. However, we can still grow up to 1MiB anytime + memory data (this is important because of pointers). Thus we can allo- + cate 1MiB address space, and use only a single memory page (usually + 4KiB) if that is enough. However, we can still grow up to 1MiB anytime if needed. (3) Who "owns" a JIT stack? 
@@ -5621,8 +5928,8 @@ JIT STACK FAQ The owner of the stack is the user program, not the JIT studied pattern or anything else. The user program must ensure that if a stack is being used by pcre2_match(), (that is, it is assigned to a match context that - is passed to the pattern currently running), that stack must not be - used by any other threads (to avoid overwriting the same memory area). + is passed to the pattern currently running), that stack must not be + used by any other threads (to avoid overwriting the same memory area). The best practice for multithreaded programs is to allocate a stack for each thread, and return this stack through the JIT callback function. @@ -5630,36 +5937,36 @@ JIT STACK FAQ You can free a JIT stack at any time, as long as it will not be used by pcre2_match() again. When you assign the stack to a match context, only - a pointer is set. There is no reference counting or any other magic. + a pointer is set. There is no reference counting or any other magic. You can free compiled patterns, contexts, and stacks in any order, any- - time. Just do not call pcre2_match() with a match context pointing to + time. Just do not call pcre2_match() with a match context pointing to an already freed stack, as that will cause SEGFAULT. (Also, do not free - a stack currently used by pcre2_match() in another thread). You can - also replace the stack in a context at any time when it is not in use. + a stack currently used by pcre2_match() in another thread). You can + also replace the stack in a context at any time when it is not in use. You should free the previous stack before assigning a replacement. - (5) Should I allocate/free a stack every time before/after calling + (5) Should I allocate/free a stack every time before/after calling pcre2_match()? - No, because this is too costly in terms of resources. However, you - could implement some clever idea which release the stack if it is not - used in let's say two minutes. 
The JIT callback can help to achieve + No, because this is too costly in terms of resources. However, you + could implement some clever idea which releases the stack if it is not + used in let's say two minutes. The JIT callback can help to achieve this without keeping a list of patterns. - (6) OK, the stack is for long term memory allocation. But what happens - if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB + (6) OK, the stack is for long term memory allocation. But what happens + if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB kept until the stack is freed? Especially on embedded systems, it might be a good idea to release mem- - ory sometimes without freeing the stack. There is no API for this at - the moment. Probably a function call which returns with the currently - allocated memory for any stack and another which allows releasing mem- + ory sometimes without freeing the stack. There is no API for this at + the moment. Probably a function call which returns with the currently + allocated memory for any stack and another which allows releasing mem- ory (shrinking the stack) would be a good idea if someone needs this. (7) This is too much of a headache. Isn't there any better solution for JIT stack handling? - No, thanks to Windows. If POSIX threads were used everywhere, we could + No, thanks to Windows. If POSIX threads were used everywhere, we could throw out this complicated API. @@ -5668,18 +5975,18 @@ FREEING JIT SPECULATIVE MEMORY void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); The JIT executable allocator does not free all memory when it is possi- - ble.
It expects new allocations, and keeps some free memory around to + improve allocation speed. However, in low memory conditions, it might + be better to free all possible memory. You can cause this to happen by + calling pcre2_jit_free_unused_memory(). Its argument is a general con- text, for custom memory management, or NULL for standard memory manage- ment. EXAMPLE CODE - This is a single-threaded example that specifies a JIT stack without - using a callback. A real program should include error checking after + This is a single-threaded example that specifies a JIT stack without + using a callback. A real program should include error checking after all the function calls. int rc; @@ -5707,36 +6014,36 @@ EXAMPLE CODE JIT FAST PATH API Because the API described above falls back to interpreted matching when - JIT is not available, it is convenient for programs that are written + JIT is not available, it is convenient for programs that are written for general use in many environments. However, calling JIT via pcre2_match() does have a performance impact. Programs that are written - for use where JIT is known to be available, and which need the best - possible performance, can instead use a "fast path" API to call JIT - matching directly instead of calling pcre2_match() (obviously only for + for use where JIT is known to be available, and which need the best + possible performance, can instead use a "fast path" API to call JIT + matching directly instead of calling pcre2_match() (obviously only for patterns that have been successfully processed by pcre2_jit_compile()). - The fast path function is called pcre2_jit_match(), and it takes ex- - actly the same arguments as pcre2_match(). However, the subject string - must be specified with a length; PCRE2_ZERO_TERMINATED is not sup- + The fast path function is called pcre2_jit_match(), and it takes ex- + actly the same arguments as pcre2_match(). 
However, the subject string + must be specified with a length; PCRE2_ZERO_TERMINATED is not sup- ported. Unsupported option bits (for example, PCRE2_ANCHORED and - PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The re- - turn values are also the same as for pcre2_match(), plus PCRE2_ER- + PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The re- + turn values are also the same as for pcre2_match(), plus PCRE2_ER- ROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. - When you call pcre2_match(), as well as testing for invalid options, a + When you call pcre2_match(), as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For exam- - ple, if the subject pointer is NULL but the length is non-zero, an im- - mediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF + ple, if the subject pointer is NULL but the length is non-zero, an im- + mediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the interests of speed, these - checks do not happen on the JIT fast path. If invalid UTF data is - passed when PCRE2_MATCH_INVALID_UTF was not set for pcre2_compile(), - the result is undefined. The program may crash or loop or give wrong - results. In the absence of PCRE2_MATCH_INVALID_UTF you should call - pcre2_jit_match() in UTF mode only if you are sure the subject is + checks do not happen on the JIT fast path. If invalid UTF data is + passed when PCRE2_MATCH_INVALID_UTF was not set for pcre2_compile(), + the result is undefined. The program may crash or loop or give wrong + results. In the absence of PCRE2_MATCH_INVALID_UTF you should call + pcre2_jit_match() in UTF mode only if you are sure the subject is valid. - Bypassing the sanity checks and the pcre2_match() wrapping can give + Bypassing the sanity checks and the pcre2_match() wrapping can give speedups of more than 10%. 
@@ -5754,15 +6061,14 @@ AUTHOR REVISION - Last updated: 21 February 2024 + Last updated: 22 August 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 21 February 2024 PCRE2JIT(3) +PCRE2 10.45 22 August 2024 PCRE2JIT(3) ------------------------------------------------------------------------------ - PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3) @@ -5838,15 +6144,14 @@ AUTHOR REVISION - Last updated: August 2023 + Last updated: 16 August 2023 Copyright (c) 1997-2023 University of Cambridge. -PCRE2 10.43 1 August 2023 PCRE2LIMITS(3) +PCRE2 10.45 16 August 2023 PCRE2LIMITS(3) ------------------------------------------------------------------------------ - PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3) @@ -5860,7 +6165,7 @@ PCRE2 MATCHING ALGORITHMS in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() function. This works in the same as Perl's matching func- - tion, and provide a Perl-compatible matching operation. The just-in- + tion, and provides a Perl-compatible matching operation. The just-in- time (JIT) optimization that is described in the pcre2jit documentation is compatible with this function. @@ -5872,7 +6177,7 @@ PCRE2 MATCHING ALGORITHMS When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, when there are multiple possibilities. For example, if - the pattern + the anchored pattern ^<.*> @@ -5948,83 +6253,86 @@ THE ALTERNATIVE MATCHING ALGORITHM first match (which is necessarily the shortest) is found. Note that the size of vector needed to contain all the results depends - on the number of simultaneous matches, not on the number of parentheses - in the pattern. Using pcre2_match_data_create_from_pattern() to create - the match data block is therefore not advisable when doing DFA match- - ing. 
+ on the number of simultaneous matches, not on the number of capturing + parentheses in the pattern. Using pcre2_match_data_create_from_pat- + tern() to create the match data block is therefore not advisable when + doing DFA matching. - Note also that all the matches that are found start at the same point + Note also that all the matches that are found start at the same point in the subject. If the pattern cat(er(pillar)?)? - is matched against the string "the caterpillar catchment", the result - is the three strings "caterpillar", "cater", and "cat" that start at - the fifth character of the subject. The algorithm does not automati- + is matched against the string "the caterpillar catchment", the result + is the three strings "caterpillar", "cater", and "cat" that start at + the fifth character of the subject. The algorithm does not automati- cally move on to find matches that start at later positions. PCRE2's "auto-possessification" optimization usually applies to charac- - ter repeats at the end of a pattern (as well as internally). For exam- + ter repeats at the end of a pattern (as well as internally). For exam- ple, the pattern "a\d+" is compiled as if it were "a\d++" because there - is no point even considering the possibility of backtracking into the - repeated digits. For DFA matching, this means that only one possible - match is found. If you really do want multiple matches in such cases, - either use an ungreedy repeat ("a\d+?") or set the PCRE2_NO_AUTO_POS- + is no point even considering the possibility of backtracking into the + repeated digits. For DFA matching, this means that only one possible + match is found. If you really do want multiple matches in such cases, + either use an ungreedy repeat ("a\d+?") or set the PCRE2_NO_AUTO_POS- SESS option when compiling. 
- There are a number of features of PCRE2 regular expressions that are - not supported or behave differently in the alternative matching func- + There are a number of features of PCRE2 regular expressions that are + not supported or behave differently in the alternative matching func- tion. Those that are not supported cause an error if encountered. - 1. Because the algorithm finds all possible matches, the greedy or un- - greedy nature of repetition quantifiers is not relevant (though it may - affect auto-possessification, as just described). During matching, - greedy and ungreedy quantifiers are treated in exactly the same way. + 1. Because the algorithm finds all possible matches, the greedy or un- + greedy nature of repetition quantifiers is not relevant (though it may + affect auto-possessification, as just described). During matching, + greedy and ungreedy quantifiers are treated in exactly the same way. However, possessive quantifiers can make a difference when what follows - could also match what is quantified, for example in a pattern like + could also match what is quantified, for example in a pattern like this: ^a++\w! - This pattern matches "aaab!" but not "aaa!", which would be matched by - a non-possessive quantifier. Similarly, if an atomic group is present, - it is matched as if it were a standalone pattern at the current point, - and the longest match is then "locked in" for the rest of the overall + This pattern matches "aaab!" but not "aaa!", which would be matched by + a non-possessive quantifier. Similarly, if an atomic group is present, + it is matched as if it were a standalone pattern at the current point, + and the longest match is then "locked in" for the rest of the overall pattern. 2. 
When dealing with multiple paths through the tree simultaneously, it - is not straightforward to keep track of captured substrings for the - different matching possibilities, and PCRE2's implementation of this + is not straightforward to keep track of captured substrings for the + different matching possibilities, and PCRE2's implementation of this algorithm does not attempt to do this. This means that no captured sub- strings are available. - 3. Because no substrings are captured, backreferences within the pat- - tern are not supported. + 3. Because no substrings are captured, a number of related features are + not available: - 4. For the same reason, conditional expressions that use a backrefer- - ence as the condition or test for a specific group recursion are not - supported. + (a) Backreferences; - 5. Again for the same reason, script runs are not supported. + (b) Conditional expressions that use a backreference as the condition + or test for a specific group recursion; - 6. Because many paths through the tree may be active, the \K escape se- - quence, which resets the start of the match when encountered (but may + (c) Script runs; + + (d) Scan substring assertions. + + 4. Because many paths through the tree may be active, the \K escape se- + quence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported. - 7. Callouts are supported, but the value of the capture_top field is + 5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0. - 8. The \C escape sequence, which (in the standard algorithm) always - matches a single code unit, even in a UTF mode, is not supported in - these modes, because the alternative algorithm moves through the sub- - ject string one character (not code unit) at a time, for all active - paths through the tree. + 6. 
The \C escape sequence, which (in the standard algorithm) always + matches a single code unit, even in a UTF mode, is not supported in UTF + modes because the alternative algorithm moves through the subject + string one character (not code unit) at a time, for all active paths + through the tree. - 9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) + 7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion. - 10. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not sup- + 8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not sup- ported by pcre2_dfa_match(). @@ -6049,13 +6357,15 @@ DISADVANTAGES OF THE ALTERNATIVE ALGORITHM partly because it has to search for all possible matches, but is also because it is less susceptible to optimization. - 2. Capturing parentheses, backreferences, script runs, and matching - within invalid UTF string are not supported. + 2. Capturing parentheses and other features such as backreferences that + rely on them are not supported. - 3. Although atomic groups are supported, their use does not provide the + 3. Matching within invalid UTF strings is not supported. + + 4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm. - 4. JIT optimization is not supported. + 5. JIT optimization is not supported. AUTHOR @@ -6067,20 +6377,19 @@ AUTHOR REVISION - Last updated: 19 January 2024 + Last updated: 30 August 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE2 10.43 19 January 2024 PCRE2MATCHING(3) +PCRE2 10.45 30 August 2024 PCRE2MATCHING(3) ------------------------------------------------------------------------------ - PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3) NAME - PCRE2 - Perl-compatible regular expressions + PCRE2 - Perl-compatible regular expressions (revised API) PARTIAL MATCHING IN PCRE2 @@ -6451,15 +6760,14 @@ AUTHOR REVISION - Last updated: 04 September 2019 + Last updated: 27 November 2024 Copyright (c) 1997-2019 University of Cambridge. -PCRE2 10.34 04 September 2019 PCRE2PARTIAL(3) +PCRE2 10.45 27 November 2024 PCRE2PARTIAL(3) ------------------------------------------------------------------------------ - PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3) @@ -6473,9 +6781,11 @@ PCRE2 REGULAR EXPRESSION DETAILS by PCRE2 are described in detail below. There is a quick-reference syn- tax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. PCRE2 also supports some alterna- - tive regular expression syntax (which does not conflict with the Perl - syntax) in order to provide some compatibility with regular expressions - in Python, .NET, and Oniguruma. + tive regular expression syntax that does not conflict with the Perl + syntax in order to provide some compatibility with regular expressions + in Python, .NET, and Oniguruma. There are in addition some options that + enable alternative syntax and semantics that are not the same as in + Perl. Perl's regular expressions are described in its own documentation, and regular expressions in general are covered in a number of books, some @@ -6494,82 +6804,98 @@ PCRE2 REGULAR EXPRESSION DETAILS tion, are discussed in the pcre2matching page. +EBCDIC CHARACTER CODES + + Most computers use ASCII or Unicode for encoding characters, and PCRE2 + assumes this by default. 
However, it can be compiled to run in an envi- + ronment that uses the EBCDIC code, which is the case for some IBM main- + frame operating systems. In the sections below, character code values + are ASCII or Unicode; in an EBCDIC environment these characters may + have different code values, and there are no code points greater than + 255. Differences in behaviour when PCRE2 is running in an EBCDIC envi- + ronment are described in the section "EBCDIC environments" below, which + you can ignore unless you really are in an EBCDIC environment. + + SPECIAL START-OF-PATTERN ITEMS - A number of options that can be passed to pcre2_compile() can also be + A number of options that can be passed to pcre2_compile() can also be set by special items at the start of a pattern. These are not Perl-com- - patible, but are provided to make these options accessible to pattern - writers who are not able to change the program that processes the pat- - tern. Any number of these items may appear, but they must all be to- - gether right at the start of the pattern string, and the letters must + patible, but are provided to make these options accessible to pattern + writers who are not able to change the program that processes the pat- + tern. Any number of these items may appear, but they must all be to- + gether right at the start of the pattern string, and the letters must be in upper case. UTF support In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 - can be specified for the 32-bit library, in which case it constrains - the character values to valid Unicode code points. To process UTF - strings, PCRE2 must be built to include Unicode support (which is the - default). 
When using UTF strings you must either call the compiling - function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF - options, or the pattern must start with the special sequence (*UTF), - which is equivalent to setting the relevant PCRE2_UTF. How setting a + can be specified for the 32-bit library, in which case it constrains + the character values to valid Unicode code points. To process UTF + strings, PCRE2 must be built to include Unicode support (which is the + default). When using UTF strings you must either call the compiling + function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF + options, or the pattern must start with the special sequence (*UTF), + which is equivalent to setting the relevant PCRE2_UTF. How setting a UTF mode affects pattern matching is mentioned in several places below. There is also a summary of features in the pcre2unicode page. Some applications that allow their users to supply patterns may wish to - restrict them to non-UTF data for security reasons. If the - PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not al- + restrict them to non-UTF data for security reasons. If the + PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not al- lowed, and its appearance in a pattern causes an error. Unicode property support - Another special sequence that may appear at the start of a pattern is - (*UCP). This has the same effect as setting the PCRE2_UCP option: it - causes sequences such as \d and \w to use Unicode properties to deter- + Another special sequence that may appear at the start of a pattern is + (*UCP). This has the same effect as setting the PCRE2_UCP option: it + causes sequences such as \d and \w to use Unicode properties to deter- mine character types, instead of recognizing only characters with codes less than 256 via a lookup table. 
It also causes upper/lower casing op- - erations to use Unicode properties for characters with code points - greater than 127, even when UTF is not set. These behaviours can be - changed within the pattern; see the section entitled "Internal Option + erations to use Unicode properties for characters with code points + greater than 127, even when UTF is not set. These behaviours can be + changed within the pattern; see the section entitled "Internal Option Setting" below. Some applications that allow their users to supply patterns may wish to - restrict them for security reasons. If the PCRE2_NEVER_UCP option is + restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to pcre2_compile(), (*UCP) is not allowed, and its appearance in a pattern causes an error. Locking out empty string matching Starting a pattern with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) has the same - effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option + effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option to whichever matching function is subsequently called to match the pat- - tern. These options lock out the matching of empty strings, either en- + tern. These options lock out the matching of empty strings, either en- tirely, or only at the start of the subject. Disabling auto-possessification - If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as - setting the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making - quantifiers possessive when what follows cannot match the repeated - item. For example, by default a+b is treated as a++b. For more details, - see the pcre2api documentation. + If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as + setting the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_opti- + mize() with a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from + making quantifiers possessive when what follows cannot match the re- + peated item.
For example, by default a+b is treated as a++b. For more + details, see the pcre2api documentation. Disabling start-up optimizations - If a pattern starts with (*NO_START_OPT), it has the same effect as - setting the PCRE2_NO_START_OPTIMIZE option. This disables several opti- - mizations for quickly reaching "no match" results. For more details, - see the pcre2api documentation. + If a pattern starts with (*NO_START_OPT), it has the same effect as + setting the PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_opti- + mize() with a PCRE2_START_OPTIMIZE_OFF directive. This disables several + optimizations for quickly reaching "no match" results. For more de- + tails, see the pcre2api documentation. Disabling automatic anchoring If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect - as setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimiza- - tions that apply to patterns whose top-level branches all start with .* - (match any number of arbitrary characters). For more details, see the - pcre2api documentation. + as setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_op- + timize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables opti- + mizations that apply to patterns whose top-level branches all start + with .* (match any number of arbitrary characters). For more details, + see the pcre2api documentation. Disabling JIT compilation @@ -6666,33 +6992,27 @@ SPECIAL START-OF-PATTERN ITEMS CODE) is also recognized, corresponding to PCRE2_BSR_UNICODE. -EBCDIC CHARACTER CODES - - PCRE2 can be compiled to run in an environment that uses EBCDIC as its - character code instead of ASCII or Unicode (typically a mainframe sys- - tem). In the sections below, character code values are ASCII or Uni- - code; in an EBCDIC environment these characters may have different code - values, and there are no code points greater than 255. 
- - CHARACTERS AND METACHARACTERS - A regular expression is a pattern that is matched against a subject - string from left to right. Most characters stand for themselves in a - pattern, and match the corresponding characters in the subject. As a + A regular expression is a pattern that is matched against a subject + string from left to right. Most characters stand for themselves in a + pattern, and match the corresponding characters in the subject. As a trivial example, the pattern The quick brown fox matches a portion of a subject string that is identical to itself. When - caseless matching is specified (the PCRE2_CASELESS option or (?i) - within the pattern), letters are matched independently of case. Note - that there are two ASCII characters, K and S, that, in addition to - their lower case ASCII equivalents, are case-equivalent with Unicode - U+212A (Kelvin sign) and U+017F (long S) respectively when either + caseless matching is specified (the PCRE2_CASELESS option or (?i) + within the pattern), letters are matched independently of case. Note + that there are two ASCII characters, K and S, that, in addition to + their lower case ASCII equivalents, are case-equivalent with Unicode + U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT - option is in force (either passed to pcre2_compile() or set by (?r) - within the pattern). + option is in force (either passed to pcre2_compile() or set by (*CASE- + LESS_RESTRICT) or (?r) within the pattern). If the PCRE2_EXTRA_TURK- + ISH_CASING option is in force (either passed to pcre2_compile() or set + by (*TURKISH_CASING) within the pattern), then the 'i' letters are + matched according to Turkish and Azeri languages. The power of regular expressions comes from the ability to include wild cards, character classes, alternatives, and repetitions in the pattern. 
@@ -6739,7 +7059,7 @@ CHARACTERS AND METACHARACTERS If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or between a # outside a character class and the next new- - line, inclusive, are ignored. An escaping backslash can be used to in- + line, inclusive, is ignored. An escaping backslash can be used to in- clude a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are ignored inside a @@ -6797,6 +7117,13 @@ BACKSLASH error, because the character class is then not terminated by a closing square bracket. + Another difference from Perl is that any appearance of \Q or \E inside + what might otherwise be a quantifier causes PCRE2 not to recognize the + sequence as a quantifier. Perl recognizes a quantifier if (redundantly) + either of the numbers is inside \Q...\E, but not if the separating + comma is. When not recognized as a quantifier a sequence such as + {\Q1\E,2} is treated as the literal string "{1,2}". + Non-printing characters A second use of backslash provides a way of encoding non-printing char- @@ -6815,115 +7142,107 @@ BACKSLASH \r carriage return (hex 0D) (but see below) \t tab (hex 09) \0dd character with octal code 0dd - \ddd character with octal code ddd, or backreference + \ddd character with octal code ddd, or back reference \o{ddd..} character with octal code ddd.. \xhh character with hex code hh \x{hhh..} character with hex code hhh.. \N{U+hhh..} character with Unicode hex code point hhh.. - By default, after \x that is not followed by {, from zero to two hexa- - decimal digits are read (letters can be in upper or lower case). Any - number of hexadecimal digits may appear between \x{ and }. If a charac- - ter other than a hexadecimal digit appears between \x{ and }, or if - there is no terminating }, an error occurs. 
+ A description of how back references work is given later, following the + discussion of parenthesized groups. + + By default, after \x that is not followed by {, one or two hexadecimal + digits are read (letters can be in upper or lower case). If the charac- + ter that follows \x is neither { nor a hexadecimal digit, an error oc- + curs. This is different from Perl's default behaviour, which generates + a NUL character, but is in line with the behaviour of Perl's 'strict' + mode in re. + + Any number of hexadecimal digits may appear between \x{ and }. If a + character other than a hexadecimal digit appears between \x{ and }, or + if there is no terminating }, an error occurs. Characters whose code points are less than 256 can be defined by either of the two syntaxes for \x or by an octal sequence. There is no differ- ence in the way they are handled. For example, \xdc is exactly the same - as \x{dc} or \334. However, using the braced versions does make such + as \x{dc} or \334. However, using the braced versions does make such sequences easier to read. - Support is available for some ECMAScript (aka JavaScript) escape se- + Support is available for some ECMAScript (aka JavaScript) escape se- quences via two compile-time options. If PCRE2_ALT_BSUX is set, the se- - quence \x followed by { is not recognized. Only if \x is followed by - two hexadecimal digits is it recognized as a character escape. Other- - wise it is interpreted as a literal "x" character. In this mode, sup- - port for code points greater than 256 is provided by \u, which must be - followed by four hexadecimal digits; otherwise it is interpreted as a + quence \x followed by { is not recognized. Only if \x is followed by + two hexadecimal digits is it recognized as a character escape. Other- + wise it is interpreted as a literal "x" character. 
In this mode, sup- + port for code points greater than 256 is provided by \u, which must be + followed by four hexadecimal digits; otherwise it is interpreted as a literal "u" character. - PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in ad- + PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in ad- dition, \u{hhh..} is recognized as the character specified by hexadeci- mal code point. There may be any number of hexadecimal digits, but un- - like other places that also use curly brackets, spaces are not allowed - and would result in the string being interpreted as a literal. This + like other places that also use curly brackets, spaces are not allowed + and would result in the string being interpreted as a literal. This syntax is from ECMAScript 6. - The \N{U+hhh..} escape sequence is recognized only when PCRE2 is oper- - ating in UTF mode. Perl also uses \N{name} to specify characters by - Unicode name; PCRE2 does not support this. Note that when \N is not + The \N{U+hhh..} escape sequence is recognized only when PCRE2 is oper- + ating in UTF mode. Perl also uses \N{name} to specify characters by + Unicode name; PCRE2 does not support this. Note that when \N is not followed by an opening brace (curly bracket) it has an entirely differ- ent meaning, matching any character that is not a newline. - There are some legacy applications where the escape sequence \r is ex- - pected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option - is set, \r in a pattern is converted to \n so that it matches a LF + There are some legacy applications where the escape sequence \r is ex- + pected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option + is set, \r in a pattern is converted to \n so that it matches a LF (linefeed) instead of a CR (carriage return) character. - An error occurs if \c is not followed by a character whose ASCII code - point is in the range 32 to 126. 
The precise effect of \cx is as fol- - lows: if x is a lower case letter, it is converted to upper case. Then + An error occurs if \c is not followed by a character whose ASCII code + point is in the range 32 to 126. The precise effect of \cx is as fol- + lows: if x is a lower case letter, it is converted to upper case. Then bit 6 of the character (hex 40) is inverted. Thus \cA to \cZ become hex - 01 to hex 1A (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and - \c; becomes hex 7B (; is 3B). If the code unit following \c has a code + 01 to hex 1A (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and + \c; becomes hex 7B (; is 3B). If the code unit following \c has a code point less than 32 or greater than 126, a compile-time error occurs. - When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. - \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. - The \c escape is processed as specified for Perl in the perlebcdic doc- - ument. The only characters that are allowed after \c are A-Z, a-z, or - one of @, [, \, ], ^, _, or ?. Any other character provokes a compile- - time error. The sequence \c@ encodes character code 0; after \c the - letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, - \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \c? be- - comes either 255 (hex FF) or 95 (hex 5F). + For differences in the way some escapes behave in EBCDIC environments, + see section "EBCDIC environments" below. - Thus, apart from \c?, these escapes generate the same character code - values as they do in an ASCII environment, though the meanings of the - values mostly differ. For example, \cG always generates code value 7, - which is BEL in ASCII but DEL in EBCDIC. + Octal escapes and back references - The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, - but because 127 is not a control character in EBCDIC, Perl makes it - generate the APC character. 
Unfortunately, there are several variants - of EBCDIC. In most of them the APC character has the value 255 (hex - FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If - certain other characters have POSIX-BC values, PCRE2 makes \c? generate - 95; otherwise it generates 255. + The escape \o must be followed by a sequence of octal digits, enclosed + in braces. An error occurs if this is not the case. This escape pro- + vides a way of specifying character code points as octal numbers + greater than 0777, and it also allows octal numbers and backreferences + to be unambiguously distinguished. - After \0 up to two further octal digits are read. If there are fewer - than two digits, just those that are present are used. Thus the se- - quence \0\x\015 specifies two binary zeros followed by a CR character - (code value 13). Make sure you supply two digits after the initial zero - if the pattern character that follows is itself an octal digit. + If braces are not used, after \0 up to two further octal digits are + read. However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one + more octal digit must follow \0 (use \00 to generate a NUL character). + Make sure you supply two digits after the initial zero if the pattern + character that follows is itself an octal digit. - The escape \o must be followed by a sequence of octal digits, enclosed - in braces. An error occurs if this is not the case. This escape is a - recent addition to Perl; it provides way of specifying character code - points as octal numbers greater than 0777, and it also allows octal - numbers and backreferences to be unambiguously specified. + Inside a character class, when a backslash is followed by any octal + digit, up to three octal digits are read to generate a code point. Any + subsequent digits stand for themselves. The sequences \8 and \9 are + treated as the literal characters "8" and "9". 
+ + Outside a character class, Perl's handling of a backslash followed by a + digit other than 0 is complicated by ambiguity, and Perl has changed + over time, causing PCRE2 also to change. From PCRE2 release 10.45 there + is an option called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use + Python's unambiguous rules. The next two subsections describe the two + sets of rules. For greater clarity and unambiguity, it is best to avoid following \ by - a digit greater than zero. Instead, use \o{...} or \x{...} to specify + a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical character code points, and \g{...} to specify backreferences. - The following paragraphs describe the old, ambiguous syntax. - - The handling of a backslash followed by a digit other than 0 is compli- - cated, and Perl has changed over time, causing PCRE2 also to change. - Outside a character class, PCRE2 reads the digit and any following dig- - its as a decimal number. If the number is less than 10, begins with the - digit 8 or 9, or if there are at least that many previous capture - groups in the expression, the entire sequence is taken as a backrefer- - ence. A description of how this works is given later, following the - discussion of parenthesized groups. Otherwise, up to three octal dig- - its are read to form a character code. + Perl rules for non-class backslash 1-9 - Inside a character class, PCRE2 handles \8 and \9 as the literal char- - acters "8" and "9", and otherwise reads up to three octal digits fol- - lowing the backslash, using them to generate a data character. Any sub- - sequent digits stand for themselves. For example, outside a character - class: + All the digits that follow the backslash are read as a decimal number. + If the number is less than 10, begins with the digit 8 or 9, or if + there are at least that many previous capture groups in the expression, + the entire sequence is taken as a back reference. 
Otherwise, up to + three octal digits are read to form a character code. For example: \040 is another way of writing an ASCII space \40 is the same, provided there are fewer than 40 @@ -6939,10 +7258,21 @@ BACKSLASH the value 255 (decimal) \81 is always a backreference - Note that octal values of 100 or greater that are specified using this - syntax must not be introduced by a leading zero, because no more than + Note that octal values of 100 or greater that are specified using this + syntax must not be introduced by a leading zero, because no more than three octal digits are ever read. + Python rules for non-class backslash 1-9 + + If there are at least three octal digits after the backslash, exactly + three are read as an octal code point number, but the value must be no + greater than \377, even in modes where higher code point values are + supported. Any subsequent digits stand for themselves. If there are + fewer than three octal digits, the sequence is taken as a decimal back + reference. Thus, for example, \12 is always a back reference, indepen- + dent of how many captures there are in the pattern. An error is gener- + ated for a reference to a non-existent capturing group. + Constraints on character values Characters that are specified using octal or hexadecimal numbers are @@ -7161,7 +7491,7 @@ BACKSLASH tional escape sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing charac- - ters whose code points are less than U+0100 and U+10000, respectively. + ters whose code points are less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Un- known script and with an unassigned type.
@@ -7179,15 +7509,34 @@ BACKSLASH \P{xx} a character without the xx property \X a Unicode extended grapheme cluster - The property names represented by xx above are not case-sensitive, and - in accordance with Unicode's "loose matching" rules, spaces, hyphens, - and underscores are ignored. There is support for Unicode script names, - Unicode general category properties, "Any", which matches any character - (including newline), Bidi_Class, a number of binary (yes/no) proper- - ties, and some special PCRE2 properties (described below). Certain - other Perl properties such as "InMusicalSymbols" are not supported by - PCRE2. Note that \P{Any} does not match any characters, so always - causes a match failure. + For compatibility with Perl, negation can be specified by including a + circumflex between the opening brace and the property. For example, + \p{^Lu} is the same as \P{Lu}. + + In accordance with Unicode's "loose matching" rules, ASCII white space + characters, hyphens, and underscores are ignored in the properties rep- + resented by xx above. As well as the space character, ASCII white space + can be tab, linefeed, vertical tab, formfeed, or carriage return. + + Some properties are specified as a name only; others as a name and a + value, separated by a colon or an equals sign. The names and values + consist of ASCII letters and digits (with one Perl-specific exception, + see below). They are not case sensitive. Note, however, that the es- + capes themselves, \p and \P, are case sensitive. There are abbrevia- + tions for many names. The following examples are all equivalent: + + \p{bidiclass=al} + \p{BC=al} + \p{ Bidi_Class : AL } + \p{ Bi-di class = Al } + \P{ ^ Bi-di class = Al } + + There is support for Unicode script names, Unicode general category + properties, "Any", which matches any character (including newline), + Bidi_Class, a number of binary (yes/no) properties, and some special + PCRE2 properties (described below). 
Certain other Perl properties such + as "InMusicalSymbols" are not supported by PCRE2. Note that \P{Any} + does not match any characters, so always causes a match failure. Script properties for \p and \P @@ -7197,15 +7546,15 @@ BACKSLASH Adlam script as an example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and - "script extensions" for the property types are recognized, and a equals - sign is an alternative to the colon. If a script name is given without - a property type, for example, \p{Adlam}, it is treated as \p{scx:Ad- - lam}. Perl changed to this interpretation at release 5.26 and PCRE2 - changed at release 10.40. + "script extensions" for the property types are recognized and, as for + all property specifications, an equals sign is an alternative to the + colon. If a script name is given without a property type, for example, + \p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this inter- + pretation at release 5.26 and PCRE2 changed at release 10.40. Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others - that are not part of an identified script are lumped together as "Com- + that are not part of an identified script are lumped together as "Com- mon". The current list of recognized script names and their 4-character abbreviations can be obtained by running this command: @@ -7215,15 +7564,11 @@ BACKSLASH The general category property for \p and \P Each character has exactly one Unicode general category property, spec- - ified by a two-letter abbreviation. For compatibility with Perl, nega- - tion can be specified by including a circumflex between the opening - brace and the property name. For example, \p{^Lu} is the same as - \P{Lu}. 
- - If only one letter is specified with \p or \P, it includes all the gen- - eral category properties that start with that letter. In this case, in - the absence of negation, the curly brackets in the escape sequence are - optional; these two examples have the same effect: + ified by a two-letter abbreviation. If only one letter is specified + with \p or \P, it includes all the general category properties that + start with that letter. In this case, in the absence of negation, the + curly brackets in the escape sequence are optional; these two examples + have the same effect: \p{L} \pL @@ -7238,6 +7583,7 @@ BACKSLASH Cs Surrogate L Letter + Lc Cased letter Ll Lower case letter Lm Modifier letter Lo Other letter @@ -7274,35 +7620,36 @@ BACKSLASH Zp Paragraph separator Zs Space separator - The special property LC, which has the synonym L&, is also supported: - it matches a character that has the Lu, Ll, or Lt property, in other - words, a letter that is not classified as a modifier or "other". - - The Cs (Surrogate) property applies only to characters whose code - points are in the range U+D800 to U+DFFF. These characters are no dif- - ferent to any other character when PCRE2 is not in UTF mode (using the - 16-bit or 32-bit library). However, they are not valid in Unicode + Perl originally used the name L& for the Lc property. This is still + supported by Perl, but discouraged. PCRE2 also still supports it. This + property matches any character that has the Lu, Ll, or Lt property, in + other words, any letter that is not classified as a modifier or + "other". From release 10.45 of PCRE2 the properties Lu, Ll, and Lt are + all treated as Lc when case-independent matching is set by the + PCRE2_CASELESS option or (?i) within the pattern. The other properties + are not affected by caseless matching. + + The Cs (Surrogate) property applies only to characters whose code + points are in the range U+D800 to U+DFFF. 
These characters are no dif- + ferent to any other character when PCRE2 is not in UTF mode (using the + 16-bit or 32-bit library). However, they are not valid in Unicode strings and so cannot be tested by PCRE2 in UTF mode, unless UTF valid- - ity checking has been turned off (see the discussion of + ity checking has been turned off (see the discussion of PCRE2_NO_UTF_CHECK in the pcre2api page). - The long synonyms for property names that Perl supports (such as - \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix + The long synonyms for property names that Perl supports (such as + \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is". No character that is in the Unicode table has the Cn (unassigned) prop- erty. Instead, this property is assumed for any code point that is not in the Unicode table. - Specifying caseless matching does not affect these escape sequences. - For example, \p{Lu} always matches only upper case letters. This is - different from the behaviour of current versions of Perl. - Binary (yes/no) properties for \p and \P - Unicode defines a number of binary properties, that is, properties - whose only values are true or false. You can obtain a list of those - that are recognized by \p and \P, along with their abbreviations, by + Unicode defines a number of binary properties, that is, properties + whose only values are true or false. You can obtain a list of those + that are recognized by \p and \P, along with their abbreviations, by running this command: pcre2test -LP @@ -7337,63 +7684,65 @@ BACKSLASH RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space - An equals sign may be used instead of a colon. The class names are - case-insensitive; only the short names listed above are recognized. + As in all property specifications, an equals sign may be used instead + of a colon and the class names are case-insensitive. 
Only the short + names listed above are recognized; PCRE2 does not at present support + any long alternatives. Extended grapheme clusters - The \X escape matches any number of Unicode characters that form an + The \X escape matches any number of Unicode characters that form an "extended grapheme cluster", and treats the sequence as an atomic group - (see below). Unicode supports various kinds of composite character by - giving each character a grapheme breaking property, and having rules + (see below). Unicode supports various kinds of composite character by + giving each character a grapheme breaking property, and having rules that use these properties to define the boundaries of extended grapheme - clusters. The rules are defined in Unicode Standard Annex 29, "Unicode - Text Segmentation". Unicode 11.0.0 abandoned the use of some previous - properties that had been used for emojis. Instead it introduced vari- - ous emoji-specific properties. PCRE2 uses only the Extended Picto- + clusters. The rules are defined in Unicode Standard Annex 29, "Unicode + Text Segmentation". Unicode 11.0.0 abandoned the use of some previous + properties that had been used for emojis. Instead it introduced vari- + ous emoji-specific properties. PCRE2 uses only the Extended Picto- graphic property. - \X always matches at least one character. Then it decides whether to + \X always matches at least one character. Then it decides whether to add additional characters according to the following rules for ending a cluster: 1. End at the end of the subject string. - 2. Do not end between CR and LF; otherwise end after any control char- + 2. Do not end between CR and LF; otherwise end after any control char- acter. - 3. Do not break Hangul (a Korean script) syllable sequences. Hangul - characters are of five types: L, V, T, LV, and LVT. 
An L character may - be followed by an L, V, LV, or LVT character; an LV or V character may - be followed by a V or T character; an LVT or T character may be fol- + 3. Do not break Hangul (a Korean script) syllable sequences. Hangul + characters are of five types: L, V, T, LV, and LVT. An L character may + be followed by an L, V, LV, or LVT character; an LV or V character may + be followed by a V or T character; an LVT or T character may be fol- lowed only by a T character. 4. Do not end before extending characters or spacing marks or the zero- - width joiner (ZWJ) character. Characters with the "mark" property al- + width joiner (ZWJ) character. Characters with the "mark" property al- ways have the "extend" grapheme breaking property. 5. Do not end after prepend characters. - 6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width - joiner) sequences. An emoji ZWJ sequence consists of a character with - the Extended_Pictographic property, optionally followed by one or more - characters with the Extend property, followed by the ZWJ character, + 6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width + joiner) sequences. An emoji ZWJ sequence consists of a character with + the Extended_Pictographic property, optionally followed by one or more + characters with the Extend property, followed by the ZWJ character, followed by another Extended_Pictographic character. - 7. Do not break within emoji flag sequences. That is, do not break be- - tween regional indicator (RI) characters if there are an odd number of + 7. Do not break within emoji flag sequences. That is, do not break be- + tween regional indicator (RI) characters if there are an odd number of RI characters before the break point. 8. Otherwise, end the cluster. 
PCRE2's additional properties - As well as the standard Unicode properties described above, PCRE2 sup- + As well as the standard Unicode properties described above, PCRE2 sup- ports four more that make it possible to convert traditional escape se- - quences such as \w and \s to use Unicode properties. PCRE2 uses these - non-standard, non-Perl properties internally when PCRE2_UCP is set. + quences such as \w and \s to use Unicode properties. PCRE2 uses these + non-standard, non-Perl properties internally when PCRE2_UCP is set. However, they may also be used explicitly. These properties are: Xan Any alphanumeric character @@ -7401,73 +7750,74 @@ BACKSLASH Xsp Any Perl space character Xwd Any Perl "word" character - Xan matches characters that have either the L (letter) or the N (num- - ber) property. Xps matches the characters tab, linefeed, vertical tab, - form feed, or carriage return, and any other character that has the Z - (separator) property. Xsp is the same as Xps; in PCRE1 it used to ex- - clude vertical tab, for Perl compatibility, but Perl changed. Xwd - matches the same characters as Xan, plus those that match Mn (non-spac- - ing mark) or Pc (connector punctuation, which includes underscore). - - There is another non-standard property, Xuc, which matches any charac- - ter that can be represented by a Universal Character Name in C++ and - other programming languages. These are the characters $, @, ` (grave - accent), and all characters with Unicode code points greater than or - equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that - most base (ASCII) characters are excluded. (Universal Character Names - are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. + Xan matches characters that have either the L (letter) or the N (num- + ber) property. 
Xps matches the characters tab, linefeed, vertical tab, + form feed, or carriage return, and any other character that has the Z + (separator) property (this includes the space character). Xsp is the + same as Xps; in PCRE1 it used to exclude vertical tab, for Perl compat- + ibility, but Perl changed. Xwd matches the same characters as Xan, plus + those that match Mn (non-spacing mark) or Pc (connector punctuation, + which includes underscore). + + There is another non-standard property, Xuc, which matches any charac- + ter that can be represented by a Universal Character Name in C++ and + other programming languages. These are the characters $, @, ` (grave + accent), and all characters with Unicode code points greater than or + equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that + most base (ASCII) characters are excluded. (Universal Character Names + are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. Note that the Xuc property does not match these sequences but the char- acters that they represent.) Resetting the match start - In normal use, the escape sequence \K causes any previously matched + In normal use, the escape sequence \K causes any previously matched characters not to be included in the final matched sequence that is re- turned. For example, the pattern: foo\Kbar - matches "foobar", but reports that it has matched "bar". \K does not + matches "foobar", but reports that it has matched "bar". \K does not interact with anchoring in any way. The pattern: ^foo\Kbar - matches only when the subject begins with "foobar" (in single line - mode), though it again reports the matched string as "bar". This fea- - ture is similar to a lookbehind assertion (described below), but the + matches only when the subject begins with "foobar" (in single line + mode), though it again reports the matched string as "bar". 
This fea- + ture is similar to a lookbehind assertion (described below), but the part of the pattern that precedes \K is not constrained to match a lim- - ited number of characters, as is required for a lookbehind assertion. - The use of \K does not interfere with the setting of captured sub- + ited number of characters, as is required for a lookbehind assertion. + The use of \K does not interfere with the setting of captured sub- strings. For example, when the pattern (foo)\Kbar matches "foobar", the first substring is still set to "foo". - From version 5.32.0 Perl forbids the use of \K in lookaround asser- - tions. From release 10.38 PCRE2 also forbids this by default. However, - the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling - pcre2_compile() to re-enable the previous behaviour. When this option + From version 5.32.0 Perl forbids the use of \K in lookaround asser- + tions. From release 10.38 PCRE2 also forbids this by default. However, + the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling + pcre2_compile() to re-enable the previous behaviour. When this option is set, \K is acted upon when it occurs inside positive assertions, but - is ignored in negative assertions. Note that when a pattern such as - (?=ab\K) matches, the reported start of the match can be greater than - the end of the match. Using \K in a lookbehind assertion at the start - of a pattern can also lead to odd effects. For example, consider this + is ignored in negative assertions. Note that when a pattern such as + (?=ab\K) matches, the reported start of the match can be greater than + the end of the match. Using \K in a lookbehind assertion at the start + of a pattern can also lead to odd effects. 
For example, consider this pattern: (?<=\Kfoo)bar - If the subject is "foobar", a call to pcre2_match() with a starting - offset of 3 succeeds and reports the matching string as "foobar", that - is, the start of the reported match is earlier than where the match + If the subject is "foobar", a call to pcre2_match() with a starting + offset of 3 succeeds and reports the matching string as "foobar", that + is, the start of the reported match is earlier than where the match started. Simple assertions - The final use of backslash is for certain simple assertions. An asser- - tion specifies a condition that has to be met at a particular point in - a match, without consuming any characters from the subject string. The - use of groups for more complicated assertions is described below. The + The final use of backslash is for certain simple assertions. An asser- + tion specifies a condition that has to be met at a particular point in + a match, without consuming any characters from the subject string. The + use of groups for more complicated assertions is described below. The backslashed assertions are: \b matches at a word boundary @@ -7478,193 +7828,193 @@ BACKSLASH \z matches only at the end of the subject \G matches at the first matching position in the subject - Inside a character class, \b has a different meaning; it matches the - backspace character. If any other of these assertions appears in a + Inside a character class, \b has a different meaning; it matches the + backspace character. If any other of these assertions appears in a character class, an "invalid escape sequence" error is generated. - A word boundary is a position in the subject string where the current - character and the previous character do not both match \w or \W (i.e. - one matches \w and the other matches \W), or the start or end of the - string if the first or last character matches \w, respectively. 
When - PCRE2 is built with Unicode support, the meanings of \w and \W can be + A word boundary is a position in the subject string where the current + character and the previous character do not both match \w or \W (i.e. + one matches \w and the other matches \W), or the start or end of the + string if the first or last character matches \w, respectively. When + PCRE2 is built with Unicode support, the meanings of \w and \W can be changed by setting the PCRE2_UCP option. When this is done, it also af- - fects \b and \B. Neither PCRE2 nor Perl has a separate "start of word" - or "end of word" metasequence. However, whatever follows \b normally - determines which it is. For example, the fragment \ba matches "a" at + fects \b and \B. Neither PCRE2 nor Perl has a separate "start of word" + or "end of word" metasequence. However, whatever follows \b normally + determines which it is. For example, the fragment \ba matches "a" at the start of a word. - The \A, \Z, and \z assertions differ from the traditional circumflex + The \A, \Z, and \z assertions differ from the traditional circumflex and dollar (described in the next section) in that they only ever match - at the very start and end of the subject string, whatever options are - set. Thus, they are independent of multiline mode. These three asser- - tions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, - which affect only the behaviour of the circumflex and dollar metachar- - acters. However, if the startoffset argument of pcre2_match() is non- - zero, indicating that matching is to start at a point other than the - beginning of the subject, \A can never match. The difference between - \Z and \z is that \Z matches before a newline at the end of the string + at the very start and end of the subject string, whatever options are + set. Thus, they are independent of multiline mode. 
These three asser- + tions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, + which affect only the behaviour of the circumflex and dollar metachar- + acters. However, if the startoffset argument of pcre2_match() is non- + zero, indicating that matching is to start at a point other than the + beginning of the subject, \A can never match. The difference between + \Z and \z is that \Z matches before a newline at the end of the string as well as at the very end, whereas \z matches only at the end. - The \G assertion is true only when the current matching position is at - the start point of the matching process, as specified by the startoff- - set argument of pcre2_match(). It differs from \A when the value of - startoffset is non-zero. By calling pcre2_match() multiple times with - appropriate arguments, you can mimic Perl's /g option, and it is in + The \G assertion is true only when the current matching position is at + the start point of the matching process, as specified by the startoff- + set argument of pcre2_match(). It differs from \A when the value of + startoffset is non-zero. By calling pcre2_match() multiple times with + appropriate arguments, you can mimic Perl's /g option, and it is in this kind of implementation where \G can be useful. - Note, however, that PCRE2's implementation of \G, being true at the - starting character of the matching process, is subtly different from - Perl's, which defines it as true at the end of the previous match. In - Perl, these can be different when the previously matched string was + Note, however, that PCRE2's implementation of \G, being true at the + starting character of the matching process, is subtly different from + Perl's, which defines it as true at the end of the previous match. In + Perl, these can be different when the previously matched string was empty. Because PCRE2 does just one match at a time, it cannot reproduce this behaviour. 
- If all the alternatives of a pattern begin with \G, the expression is + If all the alternatives of a pattern begin with \G, the expression is anchored to the starting match position, and the "anchored" flag is set in the compiled regular expression. CIRCUMFLEX AND DOLLAR - The circumflex and dollar metacharacters are zero-width assertions. - That is, they test for a particular condition being true without con- + The circumflex and dollar metacharacters are zero-width assertions. + That is, they test for a particular condition being true without con- suming any characters from the subject string. These two metacharacters - are concerned with matching the starts and ends of lines. If the new- - line convention is set so that only the two-character sequence CRLF is - recognized as a newline, isolated CR and LF characters are treated as + are concerned with matching the starts and ends of lines. If the new- + line convention is set so that only the two-character sequence CRLF is + recognized as a newline, isolated CR and LF characters are treated as ordinary data characters, and are not recognized as newlines. Outside a character class, in the default matching mode, the circumflex - character is an assertion that is true only if the current matching - point is at the start of the subject string. If the startoffset argu- - ment of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circum- - flex can never match if the PCRE2_MULTILINE option is unset. Inside a - character class, circumflex has an entirely different meaning (see be- + character is an assertion that is true only if the current matching + point is at the start of the subject string. If the startoffset argu- + ment of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circum- + flex can never match if the PCRE2_MULTILINE option is unset. Inside a + character class, circumflex has an entirely different meaning (see be- low). 
- Circumflex need not be the first character of the pattern if a number - of alternatives are involved, but it should be the first thing in each - alternative in which it appears if the pattern is ever to match that - branch. If all possible alternatives start with a circumflex, that is, - if the pattern is constrained to match only at the start of the sub- - ject, it is said to be an "anchored" pattern. (There are also other + Circumflex need not be the first character of the pattern if a number + of alternatives are involved, but it should be the first thing in each + alternative in which it appears if the pattern is ever to match that + branch. If all possible alternatives start with a circumflex, that is, + if the pattern is constrained to match only at the start of the sub- + ject, it is said to be an "anchored" pattern. (There are also other constructs that can cause a pattern to be anchored.) - The dollar character is an assertion that is true only if the current - matching point is at the end of the subject string, or immediately be- - fore a newline at the end of the string (by default), unless PCRE2_NO- - TEOL is set. Note, however, that it does not actually match the new- - line. Dollar need not be the last character of the pattern if a number - of alternatives are involved, but it should be the last item in any - branch in which it appears. Dollar has no special meaning in a charac- + The dollar character is an assertion that is true only if the current + matching point is at the end of the subject string, or immediately be- + fore a newline at the end of the string (by default), unless PCRE2_NO- + TEOL is set. Note, however, that it does not actually match the new- + line. Dollar need not be the last character of the pattern if a number + of alternatives are involved, but it should be the last item in any + branch in which it appears. Dollar has no special meaning in a charac- ter class. 
- The meaning of dollar can be changed so that it matches only at the - very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at + The meaning of dollar can be changed so that it matches only at the + very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This does not affect the \Z assertion. The meanings of the circumflex and dollar metacharacters are changed if - the PCRE2_MULTILINE option is set. When this is the case, a dollar - character matches before any newlines in the string, as well as at the - very end, and a circumflex matches immediately after internal newlines - as well as at the start of the subject string. It does not match after - a newline that ends the string, for compatibility with Perl. However, + the PCRE2_MULTILINE option is set. When this is the case, a dollar + character matches before any newlines in the string, as well as at the + very end, and a circumflex matches immediately after internal newlines + as well as at the start of the subject string. It does not match after + a newline that ends the string, for compatibility with Perl. However, this can be changed by setting the PCRE2_ALT_CIRCUMFLEX option. - For example, the pattern /^abc$/ matches the subject string "def\nabc" - (where \n represents a newline) in multiline mode, but not otherwise. - Consequently, patterns that are anchored in single line mode because - all branches start with ^ are not anchored in multiline mode, and a - match for circumflex is possible when the startoffset argument of - pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored + For example, the pattern /^abc$/ matches the subject string "def\nabc" + (where \n represents a newline) in multiline mode, but not otherwise. 
+ Consequently, patterns that are anchored in single line mode because + all branches start with ^ are not anchored in multiline mode, and a + match for circumflex is possible when the startoffset argument of + pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. - When the newline convention (see "Newline conventions" below) recog- - nizes the two-character sequence CRLF as a newline, this is preferred, - even if the single characters CR and LF are also recognized as new- - lines. For example, if the newline convention is "any", a multiline - mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather - than after CR, even though CR on its own is a valid newline. (It also + When the newline convention (see "Newline conventions" below) recog- + nizes the two-character sequence CRLF as a newline, this is preferred, + even if the single characters CR and LF are also recognized as new- + lines. For example, if the newline convention is "any", a multiline + mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather + than after CR, even though CR on its own is a valid newline. (It also matches at the very start of the string, of course.) - Note that the sequences \A, \Z, and \z can be used to match the start - and end of the subject in both modes, and if all branches of a pattern - start with \A it is always anchored, whether or not PCRE2_MULTILINE is + Note that the sequences \A, \Z, and \z can be used to match the start + and end of the subject in both modes, and if all branches of a pattern + start with \A it is always anchored, whether or not PCRE2_MULTILINE is set. FULL STOP (PERIOD, DOT) AND \N Outside a character class, a dot in the pattern matches any one charac- - ter in the subject string except (by default) a character that signi- + ter in the subject string except (by default) a character that signi- fies the end of a line. 
One or more characters may be specified as line terminators (see "Newline conventions" above). - Dot never matches a single line-ending character. When the two-charac- - ter sequence CRLF is the only line ending, dot does not match CR if it - is immediately followed by LF, but otherwise it matches all characters - (including isolated CRs and LFs). When ANYCRLF is selected for line - endings, no occurrences of CR of LF match dot. When all Unicode line + Dot never matches a single line-ending character. When the two-charac- + ter sequence CRLF is the only line ending, dot does not match CR if it + is immediately followed by LF, but otherwise it matches all characters + (including isolated CRs and LFs). When ANYCRLF is selected for line + endings, no occurrences of CR or LF match dot. When all Unicode line endings are being recognized, dot does not match CR or LF or any of the other line ending characters. - The behaviour of dot with regard to newlines can be changed. If the - PCRE2_DOTALL option is set, a dot matches any one character, without - exception. If the two-character sequence CRLF is present in the sub- + The behaviour of dot with regard to newlines can be changed. If the + PCRE2_DOTALL option is set, a dot matches any one character, without + exception. If the two-character sequence CRLF is present in the sub- ject string, it takes two dots to match it. - The handling of dot is entirely independent of the handling of circum- - flex and dollar, the only relationship being that they both involve + The handling of dot is entirely independent of the handling of circum- + flex and dollar, the only relationship being that they both involve newlines. Dot has no special meaning in a character class. - The escape sequence \N when not followed by an opening brace behaves - like a dot, except that it is not affected by the PCRE2_DOTALL option.
- In other words, it matches any character except one that signifies the + The escape sequence \N when not followed by an opening brace behaves + like a dot, except that it is not affected by the PCRE2_DOTALL option. + In other words, it matches any character except one that signifies the end of a line. When \N is followed by an opening brace it has a different meaning. See - the section entitled "Non-printing characters" above for details. Perl - also uses \N{name} to specify characters by Unicode name; PCRE2 does + the section entitled "Non-printing characters" above for details. Perl + also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this. MATCHING A SINGLE CODE UNIT - Outside a character class, the escape sequence \C matches any one code - unit, whether or not a UTF mode is set. In the 8-bit library, one code - unit is one byte; in the 16-bit library it is a 16-bit unit; in the - 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches - line-ending characters. The feature is provided in Perl in order to + Outside a character class, the escape sequence \C matches any one code + unit, whether or not a UTF mode is set. In the 8-bit library, one code + unit is one byte; in the 16-bit library it is a 16-bit unit; in the + 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches + line-ending characters. The feature is provided in Perl in order to match individual bytes in UTF-8 mode, but it is unclear how it can use- fully be used. - Because \C breaks up characters into individual code units, matching - one unit with \C in UTF-8 or UTF-16 mode means that the rest of the + Because \C breaks up characters into individual code units, matching + one unit with \C in UTF-8 or UTF-16 mode means that the rest of the string may start with a malformed UTF character. 
This has undefined re- sults, because PCRE2 assumes that it is matching character by character in a valid UTF string (by default it checks the subject string's valid- - ity at the start of processing unless the PCRE2_NO_UTF_CHECK or + ity at the start of processing unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used). - An application can lock out the use of \C by setting the - PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also + An application can lock out the use of \C by setting the + PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to build PCRE2 with the use of \C permanently disabled. - PCRE2 does not allow \C to appear in lookbehind assertions (described - below) in UTF-8 or UTF-16 modes, because this would make it impossible - to calculate the length of the lookbehind. Neither the alternative + PCRE2 does not allow \C to appear in lookbehind assertions (described + below) in UTF-8 or UTF-16 modes, because this would make it impossible + to calculate the length of the lookbehind. Neither the alternative matching function pcre2_dfa_match() nor the JIT optimizer support \C in these UTF modes. The former gives a match-time error; the latter fails to optimize and so the match is always run using the interpreter. - In the 32-bit library, however, \C is always supported (when not ex- - plicitly locked out) because it always matches a single code unit, + In the 32-bit library, however, \C is always supported (when not ex- + plicitly locked out) because it always matches a single code unit, whether or not UTF-32 is specified. In general, the \C escape sequence is best avoided. 
However, one way of - using it that avoids the problem of malformed UTF-8 or UTF-16 charac- - ters is to use a lookahead to check the length of the next character, - as in this pattern, which could be used with a UTF-8 string (ignore + using it that avoids the problem of malformed UTF-8 or UTF-16 charac- + ters is to use a lookahead to check the length of the next character, + as in this pattern, which could be used with a UTF-8 string (ignore white space and line breaks): (?| (?=[\x00-\x7f])(\C) | @@ -7672,11 +8022,11 @@ MATCHING A SINGLE CODE UNIT (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) | (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C)) - In this example, a group that starts with (?| resets the capturing - parentheses numbers in each alternative (see "Duplicate Group Numbers" + In this example, a group that starts with (?| resets the capturing + parentheses numbers in each alternative (see "Duplicate Group Numbers" below). The assertions at the start of each branch check the next UTF-8 - character for values whose encoding uses 1, 2, 3, or 4 bytes, respec- - tively. The character's individual bytes are then captured by the ap- + character for values whose encoding uses 1, 2, 3, or 4 bytes, respec- + tively. The character's individual bytes are then captured by the ap- propriate number of \C groups. @@ -7684,27 +8034,27 @@ SQUARE BRACKETS AND CHARACTER CLASSES An opening square bracket introduces a character class, terminated by a closing square bracket. A closing square bracket on its own is not spe- - cial by default. If a closing square bracket is required as a member + cial by default. If a closing square bracket is required as a member of the class, it should be the first data character in the class (after - an initial circumflex, if present) or escaped with a backslash. This - means that, by default, an empty class cannot be defined. 
However, if - the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at + an initial circumflex, if present) or escaped with a backslash. This + means that, by default, an empty class cannot be defined. However, if + the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at the start does end the (empty) class. - A character class matches a single character in the subject. A matched + A character class matches a single character in the subject. A matched character must be in the set of characters defined by the class, unless - the first character in the class definition is a circumflex, in which + the first character in the class definition is a circumflex, in which case the subject character must not be in the set defined by the class. - If a circumflex is actually required as a member of the class, ensure + If a circumflex is actually required as a member of the class, ensure it is not the first character, or escape it with a backslash. - For example, the character class [aeiou] matches any lower case vowel, - while [^aeiou] matches any character that is not a lower case vowel. - Note that a circumflex is just a convenient notation for specifying the - characters that are in the class by enumerating those that are not. A - class that starts with a circumflex is not an assertion; it still con- - sumes a character from the subject string, and therefore it fails if - the current pointer is at the end of the string. + For example, the character class [aeiou] matches any lower case English + vowel, whereas [^aeiou] matches all other characters. Note that a cir- + cumflex is just a convenient notation for specifying the characters + that are in the class by enumerating those that are not. A class that + starts with a circumflex is not an assertion; it still consumes a char- + acter from the subject string, and therefore it fails to match if the + current pointer is at the end of the string. 
Characters in a class may be specified by their code points using \o, \x, or \N{U+hh..} in the usual way. When caseless matching is set, any @@ -7714,7 +8064,10 @@ SQUARE BRACKETS AND CHARACTER CLASSES would. Note that there are two ASCII characters, K and S, that, in ad- dition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when ei- - ther PCRE2_UTF or PCRE2_UCP is set. + ther PCRE2_UTF or PCRE2_UCP is set. If you do not want these ASCII/non- + ASCII case equivalences, you can suppress them by setting PCRE2_EX- + TRA_CASELESS_RESTRICT, either as an option in a compile context, or by + including (*CASELESS_RESTRICT) or (?r) within a pattern. Characters that might indicate line breaks are never treated in any special way when matching character classes, whatever line-ending se- @@ -7743,67 +8096,171 @@ SQUARE BRACKETS AND CHARACTER CLASSES last character in the class, or immediately after a range. For example, [b-d-z] matches letters in the range b to d, a hyphen character, or z. + There is some special treatment for alphabetic ranges in EBCDIC envi- + ronments; see the section "EBCDIC environments" below. + Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d - or \H. However, unless the hyphen is the last character in the class, - Perl outputs a warning in its warning mode, as this is most likely a - user error. As PCRE2 has no facility for warning, an error is given in + or \H. However, unless the hyphen is the last character in the class, + Perl outputs a warning in its warning mode, as this is most likely a + user error. As PCRE2 has no facility for warning, an error is given in these cases. It is not possible to have the literal character "]" as the end charac- - ter of a range. 
A pattern such as [W-]46] is interpreted as a class of - two characters ("W" and "-") followed by a literal string "46]", so it - would match "W46]" or "-46]". However, if the "]" is escaped with a - backslash it is interpreted as the end of range, so [W-\]46] is inter- - preted as a class containing a range followed by two other characters. - The octal or hexadecimal representation of "]" can also be used to end - a range. + ter of a range. A pattern such as [W-]46] is interpreted as a class of + two characters ("W" and "-") followed by a literal string "46]", so it + would match "W46]" or "-46]". However, if the "]" is escaped with a + backslash it is interpreted as the end of a range, so [W-\]46] is in- + terpreted as a class containing a range and two other characters. The + octal or hexadecimal representation of "]" can also be used to end a + range. Ranges normally include all code points between the start and end char- - acters, inclusive. They can also be used for code points specified nu- - merically, for example [\000-\037]. Ranges can include any characters - that are valid for the current mode. In any UTF mode, the so-called - "surrogate" characters (those whose code points lie between 0xd800 and - 0xdfff inclusive) may not be specified explicitly by default (the - PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). How- + acters, inclusive. They can also be used for code points specified nu- + merically, for example [\000-\037]. Ranges can include any characters + that are valid for the current mode. In any UTF mode, the so-called + "surrogate" characters (those whose code points lie between 0xd800 and + 0xdfff inclusive) may not be specified explicitly by default (the + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). How- ever, ranges such as [\x{d7ff}-\x{e000}], which include the surrogates, are always permitted. 
- There is a special case in EBCDIC environments for ranges whose end - points are both specified as literal letters in the same case. For com- - patibility with Perl, EBCDIC code points within the range that are not - letters are omitted. For example, [h-k] matches only four characters, - even though the codes for h and k are 0x88 and 0x92, a range of 11 code - points. However, if the range is specified numerically, for example, - [\x88-\x92] or [h-\x92], all code points are included. - If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent - to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if - character tables for a French locale are in use, [\xc8-\xcb] matches + to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if + character tables for a French locale are in use, [\xc8-\xcb] matches accented E characters in both cases. - A circumflex can conveniently be used with the upper case character - types to specify a more restricted set of characters than the matching - lower case type. For example, the class [^\W_] matches any letter or + A circumflex can conveniently be used with the upper case character + types to specify a more restricted set of characters than the matching + lower case type. For example, the class [^\W_] matches any letter or digit, but not underscore, whereas [\w] includes underscore. A positive character class should be read as "something OR something OR ..." and a negative class as "NOT something AND NOT something AND NOT ...". - The only metacharacters that are recognized in character classes are - backslash, hyphen (only where it can be interpreted as specifying a - range), circumflex (only at the start), opening square bracket (only - when it can be interpreted as introducing a POSIX class name, or for a - special compatibility feature - see the next two sections), and the - terminating closing square bracket. 
However, escaping other non-al- - phanumeric characters does no harm. + The metacharacters that are recognized in character classes are back- + slash, hyphen (when it can be interpreted as specifying a range), cir- + cumflex (only at the start), and the terminating closing square + bracket. An opening square bracket is also special when it can be in- + terpreted as introducing a POSIX class (see "Posix character classes" + below), or a special compatibility feature (see "Compatibility feature + for word boundaries" below). Escaping any non-alphanumeric character in + a class turns it into a literal, whether or not it would otherwise be a + metacharacter. + + +PERL EXTENDED CHARACTER CLASSES + + From release 10.45 PCRE2 supports Perl's (?[...]) extended character + class syntax. This can be used to perform set operations such as inter- + section on character classes. + + The syntax permitted within (?[...]) is quite different to ordinary + character classes. Inside the extended class, there is an expression + syntax consisting of "atoms", operators, and ordinary parentheses "()" + used for grouping. Such classes always have the Perl /xx modifier + (PCRE2 option PCRE2_EXTENDED_MORE) turned on within them. This means + that literal space and tab characters are ignored everywhere in the + class. + + The allowed atoms are individual characters specified by escape se- + quences such as \n or \x{123}, character types such as \d, POSIX + classes such as [:alpha:], and nested ordinary (non-extended) character + classes. For example, in (?[\d & [...]]) the nested class [...] follows + the usual rules for ordinary character classes, in which parentheses + are not metacharacters, and character literals and ranges are permit- + ted. + + Character literals and ranges may not appear outside a nested ordinary + character class because they are not atoms in the extended syntax.
The + extended syntax does not introduce any additional escape sequences, so + (?[\y]) is an unknown escape, as it would be in [\y]. + + In the extended syntax, ^ does not negate a class (except within an or- + dinary class nested inside an extended class); it is instead a binary + operator. + + The binary operators are "&" (intersection), "|" or "+" (union), "-" + (subtraction) and "^" (symmetric difference). These are left-associa- + tive and "&" has higher (tighter) precedence, while the others have + equal lower precedence. The one prefix unary operator is "!" (comple- + ment), with highest precedence. + + +UTS#18 EXTENDED CHARACTER CLASSES + + The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's + (?[...]) syntax, allowing instead extended class behaviour inside or- + dinary [...] character classes. This altered syntax for [...] classes + is loosely described by the Unicode standard UTS#18. The PCRE2_ALT_EX- + TENDED_CLASS option does not prevent use of (?[...]) classes; it just + changes the meaning of all [...] classes that are not nested inside a + Perl (?[...]) class. + + Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is + a character class with two literal characters "a" and "[", but in + UTS#18 extended classes the "[" character becomes an additional + metacharacter within classes, denoting the start of a nested class, so + a literal "[" must be escaped as "\[". + + Secondly, within the UTS#18 extended syntax, there are operators "||", + "&&", "--" and "~~" which denote character class union, intersection, + subtraction, and symmetric difference respectively. In standard Perl + syntax, these would simply be needlessly-repeated literals (except for + "--" which could be the start or end of a range). In UTS#18 extended + classes these operators can be used in constructs such as [\p{L}--[QW]] + for "Unicode letters, other than Q and W". 
A literal "-" at the start + or end of a range must be escaped, so while "[--1]" in Perl syntax is + the range from hyphen to "1", it must be escaped as "[\--1]" in UTS#18 + extended classes. + + Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option + to ignore space and tab characters is not automatically enabled for + UTS#18 extended classes, but it is honoured if set. + + Extended UTS#18 classes can be nested, and nested classes are them- + selves extended classes (unlike Perl, where nested classes must be sim- + ple classes). For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any + letter that is in the Thai or Greek scripts. Note that this means that + no special grouping characters (such as the parentheses used in Perl's + (?[...]) class syntax) are needed. + + Individual class items (literal characters, literal ranges, properties + such as \d or \p{...}, and nested classes) can be combined by juxtapo- + sition or by an operator. Juxtaposition is the implicit union operator, + and binds more tightly than any explicit operator. Thus a sequence of + literals and/or ranges behaves as if it is enclosed in square brackets. + For example, [A-Z0-9&&[^E8]] is the same as [[A-Z0-9]&&[^E8]], which + matches any upper case alphanumeric character except "E" or "8". + + Precedence between the explicit operators is not defined, so mixing op- + erators is a syntax error. For example, [A&&B--C] is an error, but + [A&&[B--C]] is valid. + + This is an emerging syntax which is being adopted gradually across the + regex ecosystem: for example JavaScript adopted the "/v" flag in EC- + MAScript 2024; Python's "re" module reserves the syntax for future use + with a FutureWarning for unescaped use of "[" as a literal within char- + acter classes. Due to UTS#18 providing insufficient guidance, engines + interpret the syntax differently. 
Rust's "regex" crate and Python's + "regex" PyPI module both implement UTS#18 extended classes, but with + slight incompatibilities ([A||B&&C] is parsed as [A||[B&&C]] in + Python's "regex" but as [[A||B]&&C] in Rust's "regex"). + + PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v + flag, so that all the UTS#18 extended classes accepted as valid by + PCRE2 have the property that they are interpreted either with the same + behaviour, or as invalid, by all other major engines. Please file an + issue if you are aware of cross-engine differences in behaviour between + PCRE2 and another major engine. POSIX CHARACTER CLASSES Perl supports the POSIX notation for character classes. This uses names - enclosed by [: and :] within the enclosing square brackets. PCRE2 also - supports this notation. For example, + enclosed by [: and :] within the enclosing square brackets. PCRE2 also + supports this notation, in both ordinary and extended classes. For ex- + ample, [01[:alpha:]%] @@ -7883,7 +8340,7 @@ POSIX CHARACTER CLASSES In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This is a change that - was made in PCRE release 10.43 for Perl compatibility. + was made in PCRE2 release 10.43 for Perl compatibility. The other POSIX classes are unchanged by PCRE2_UCP, and match only characters with code points less than 256. @@ -8391,17 +8848,18 @@ REPETITION (?>.*?a)b It matches "ab" in the subject "aab". The use of the backtracking con- - trol verbs (*PRUNE) and (*SKIP) also disable this optimization, and - there is an option, PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. + trol verbs (*PRUNE) and (*SKIP) also disable this optimization. To do + so explicitly, either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, + or call pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive.
- When a capture group is repeated, the value captured is the substring + When a capture group is repeated, the value captured is the substring that matched the final iteration. For example, after (tweedle[dume]{3}\s*)+ has matched "tweedledum tweedledee" the value of the captured substring - is "tweedledee". However, if there are nested capture groups, the cor- - responding captured values may have been set in previous iterations. + is "tweedledee". However, if there are nested capture groups, the cor- + responding captured values may have been set in previous iterations. For example, after (a|(b))+ @@ -8411,57 +8869,57 @@ REPETITION ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS - With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") - repetition, failure of what follows normally causes the repeated item - to be re-evaluated to see if a different number of repeats allows the - rest of the pattern to match. Sometimes it is useful to prevent this, - either to change the nature of the match, or to cause it fail earlier - than it otherwise might, when the author of the pattern knows there is + With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") + repetition, failure of what follows normally causes the repeated item + to be re-evaluated to see if a different number of repeats allows the + rest of the pattern to match. Sometimes it is useful to prevent this, + either to change the nature of the match, or to cause it fail earlier + than it otherwise might, when the author of the pattern knows there is no point in carrying on. - Consider, for example, the pattern \d+foo when applied to the subject + Consider, for example, the pattern \d+foo when applied to the subject line 123456bar After matching all 6 digits and then failing to match "foo", the normal - action of the matcher is to try again with only 5 digits matching the - \d+ item, and then with 4, and so on, before ultimately failing. 
- "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides + action of the matcher is to try again with only 5 digits matching the + \d+ item, and then with 4, and so on, before ultimately failing. + "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides the means for specifying that once a group has matched, it is not to be re-evaluated in this way. - If we use atomic grouping for the previous example, the matcher gives - up immediately on failing to match "foo" the first time. The notation + If we use atomic grouping for the previous example, the matcher gives + up immediately on failing to match "foo" the first time. The notation is a kind of special parenthesis, starting with (?> as in this example: (?>\d+)foo - Perl 5.28 introduced an experimental alphabetic form starting with (* + Perl 5.28 introduced an experimental alphabetic form starting with (* which may be easier to remember: (*atomic:\d+)foo - This kind of parenthesized group "locks up" the part of the pattern it + This kind of parenthesized group "locks up" the part of the pattern it contains once it has matched, and a failure further into the pattern is - prevented from backtracking into it. Backtracking past it to previous + prevented from backtracking into it. Backtracking past it to previous items, however, works as normal. An alternative description is that a group of this type matches exactly - the string of characters that an identical standalone pattern would + the string of characters that an identical standalone pattern would match, if anchored at the current point in the subject string. - Atomic groups are not capture groups. Simple cases such as the above - example can be thought of as a maximizing repeat that must swallow - everything it can. So, while both \d+ and \d+? are prepared to adjust - the number of digits they match in order to make the rest of the pat- + Atomic groups are not capture groups. 
Simple cases such as the above + example can be thought of as a maximizing repeat that must swallow + everything it can. So, while both \d+ and \d+? are prepared to adjust + the number of digits they match in order to make the rest of the pat- tern match, (?>\d+) can only match an entire sequence of digits. - Atomic groups in general can of course contain arbitrarily complicated + Atomic groups in general can of course contain arbitrarily complicated expressions, and can be nested. However, when the contents of an atomic - group is just a single repeated item, as in the example above, a sim- - pler notation, called a "possessive quantifier" can be used. This con- - sists of an additional + character following a quantifier. Using this + group is just a single repeated item, as in the example above, a sim- + pler notation, called a "possessive quantifier" can be used. This con- + sists of an additional + character following a quantifier. Using this notation, the previous example can be rewritten as \d++foo @@ -8471,24 +8929,26 @@ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS (abc|xyz){2,3}+ - Possessive quantifiers are always greedy; the setting of the PCRE2_UN- - GREEDY option is ignored. They are a convenient notation for the sim- - pler forms of atomic group. However, there is no difference in the - meaning of a possessive quantifier and the equivalent atomic group, - though there may be a performance difference; possessive quantifiers + Possessive quantifiers are always greedy; the setting of the PCRE2_UN- + GREEDY option is ignored. They are a convenient notation for the sim- + pler forms of atomic group. However, there is no difference in the + meaning of a possessive quantifier and the equivalent atomic group, + though there may be a performance difference; possessive quantifiers should be slightly faster. - The possessive quantifier syntax is an extension to the Perl 5.8 syn- - tax. 
Jeffrey Friedl originated the idea (and the name) in the first + The possessive quantifier syntax is an extension to the Perl 5.8 syn- + tax. Jeffrey Friedl originated the idea (and the name) in the first edition of his book. Mike McCloskey liked it, so implemented it when he - built Sun's Java package, and PCRE1 copied it from there. It found its + built Sun's Java package, and PCRE1 copied it from there. It found its way into Perl at release 5.10. - PCRE2 has an optimization that automatically "possessifies" certain - simple pattern constructs. For example, the sequence A+B is treated as - A++B because there is no point in backtracking into a sequence of A's - when B must follow. This feature can be disabled by the PCRE2_NO_AUTO- - POSSESS option, or starting the pattern with (*NO_AUTO_POSSESS). + PCRE2 has an optimization that automatically "possessifies" certain + simple pattern constructs. For example, the sequence A+B is treated as + A++B because there is no point in backtracking into a sequence of A's + when B must follow. This feature can be disabled by the + PCRE2_NO_AUTO_POSSESS option, by calling pcre2_set_optimize() with a + PCRE2_AUTO_POSSESS_OFF directive, or by starting the pattern with + (*NO_AUTO_POSSESS). When a pattern contains an unlimited repeat inside a group that can it- self be repeated an unlimited number of times, the use of an atomic @@ -8649,19 +9109,25 @@ BACKREFERENCES ASSERTIONS - An assertion is a test on the characters following or preceding the - current matching point that does not consume any characters. The simple - assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described - above. + An assertion is a test that does not consume any characters. The test + must succeed for the match to continue. The simple assertions coded as + \b, \B, \A, \G, \Z, \z, ^ and $ are described above. + + More complicated assertions are coded as parenthesized groups. 
If + matching such a group succeeds, matching continues after it, but with + the matching position in the subject string reset to what it was before + the assertion was processed. + + A special kind of assertion, called a "scan substring" assertion, + matches a subpattern against a previously captured substring. This is + described in the section entitled "Scan substring assertions" below. It + is a PCRE2 extension, not compatible with Perl. - More complicated assertions are coded as parenthesized groups. There - are two kinds: those that look ahead of the current position in the - subject string, and those that look behind it, and in each case an as- - sertion may be positive (must match for the assertion to be true) or - negative (must not match for the assertion to be true). An assertion - group is matched in the normal way, and if it is true, matching contin- - ues after it, but with the matching position in the subject string re- - set to what it was before the assertion was processed. + The other group-based assertions are of two kinds: those that look ahead + of the current position in the subject string, and those that look be- + hind it, and in each case an assertion may be positive (must match for + the assertion to be true) or negative (must not match for the assertion + to be true). The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no back- @@ -8928,6 +9394,66 @@ NON-ATOMIC ASSERTIONS groups (see below) must be atomic. +SCAN SUBSTRING ASSERTIONS + + A special kind of assertion, not compatible with Perl, makes it possi- + ble to check the contents of a captured substring by matching it with a + subpattern. Because this involves capturing, this feature is not sup- + ported by pcre2_dfa_match().
+ + A scan substring assertion starts with the sequence (*scan_substring: + or (*scs: which is followed by a list of substring numbers (absolute or + relative) and/or substring names enclosed in single quotes or angle + brackets, all within parentheses. The rest of the item is the subpat- + tern that is applied to the substring, as shown in these examples: + + (*scan_substring:(1)...) + (*scs:(-2)...) + (*scs:('AB')...) + (*scs:(1,'AB',-2)...) + + The list of groups is checked in the order they are given, and it is + the contents of the first one that is found to be set that are scanned. + When PCRE2_DUPNAMES is set and there are ambiguous group names, all + groups with the same name are checked in numerical order. A scan sub- + string assertion fails if none of the groups it references have been + set. + + The pattern match on the substring is always anchored, that is, it must + match from the start of the substring. There is no "bumpalong" if it + does not match at the start. The end of the subject is temporarily re- + set to be the end of the substring, so \Z, \z, and $ will match there. + However, the start of the subject is not reset. This means that ^ + matches only if the substring is actually at the start of the main sub- + ject, but it also means that lookbehind assertions into what precedes + the substring are possible. + + Here is a very simple example: find a word that contains the rare (in + English) sequence of letters "rh" not at the start: + + \b(\w++)(*scs:(1).+rh) + + The first group captures a word which is then scanned by the second + group. This example does not actually need this heavyweight feature; + the same match can be achieved with: + + \b\w+?rh\w*\b + + When things are more complicated, however, scanning a captured sub- + string can be a useful way to describe the required match. 
For example, + there is a rather complicated pattern in the PCRE2 test data that + checks an entire subject string for a palindrome, that is, the sequence + of letters is the same in both directions. Suppose you want to search + for individual words of two or more characters such as "level" that are + palindromes: + + (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...) + + Within a substring scanning subpattern, references to other groups work + as normal. Capturing groups may appear, and will retain their values + during ongoing matching if the assertion succeeds. + + SCRIPT RUNS In concept, a script run is a sequence of characters that are all from @@ -9175,8 +9701,9 @@ COMMENTS There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related - characters such as (?: or a group name or number. The characters that - make up a comment play no part in the pattern matching. + characters such as (?: or a group name or number or a Unicode property + name. The characters that make up a comment play no part in the pattern + matching. The sequence (?# marks the start of a comment that continues up to the next closing parenthesis. Nested parentheses are not permitted. If the @@ -9459,8 +9986,9 @@ CALLOUTS provides an external function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). If no match context is - passed, or if the callout entry point is set to NULL, callouts are dis- - abled. + passed, or if the callout entry point is set to NULL, callout points + will be passed over silently during matching. To disallow callouts in + the pattern syntax, you may use the PCRE2_EXTRA_NEVER_CALLOUT option. Within a regular expression, (?C) indicates a point at which the external function is to be called.
There are two kinds of callout: @@ -9555,10 +10083,10 @@ BACKTRACKING CONTROL Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the tra- - ditional matching function, because that uses a backtracking algorithm. - With the exception of (*FAIL), which behaves like a failing negative - assertion, the backtracking control verbs cause an error if encountered - by the DFA matching function. + ditional matching function or JIT, because they use backtracking algo- + rithms. With the exception of (*FAIL), which behaves like a failing + negative assertion, the backtracking control verbs cause an error if + encountered by the DFA matching function. The behaviour of these verbs in repeated groups, assertions, and in capture groups called as subroutines (whether or not recursively) is @@ -9573,11 +10101,12 @@ BACKTRACKING CONTROL running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option when calling pcre2_com- - pile(), or by starting the pattern with (*NO_START_OPT). There is more - discussion of this option in the section entitled "Compiling a pattern" - in the pcre2api documentation. + pile(), by calling pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF + directive, or by starting the pattern with (*NO_START_OPT). There is + more discussion of this option in the section entitled "Compiling a + pattern" in the pcre2api documentation. - Experiments with Perl suggest that it too has similar optimizations, + Experiments with Perl suggest that it too has similar optimizations, and like PCRE2, turning them off can change the result of a match. Verbs that act immediately @@ -9586,77 +10115,77 @@ BACKTRACKING CONTROL (*ACCEPT) or (*ACCEPT:NAME) - This verb causes the match to end successfully, skipping the remainder - of the pattern. 
However, when it is inside a capture group that is + This verb causes the match to end successfully, skipping the remainder + of the pattern. However, when it is inside a capture group that is called as a subroutine, only that group is ended successfully. Matching then continues at the outer level. If (*ACCEPT) in triggered in a posi- - tive assertion, the assertion succeeds; in a negative assertion, the + tive assertion, the assertion succeeds; in a negative assertion, the assertion fails. - If (*ACCEPT) is inside capturing parentheses, the data so far is cap- + If (*ACCEPT) is inside capturing parentheses, the data so far is cap- tured. For example: A((?:A|B(*ACCEPT)|C)D) - This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- + This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- tured by the outer parentheses. - (*ACCEPT) is the only backtracking verb that is allowed to be quanti- - fied because an ungreedy quantification with a minimum of zero acts + (*ACCEPT) is the only backtracking verb that is allowed to be quanti- + fied because an ungreedy quantification with a minimum of zero acts only when a backtrack happens. Consider, for example, (A(*ACCEPT)??B)C - where A, B, and C may be complex expressions. After matching "A", the - matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) - is triggered and the match succeeds. In both cases, all but C is cap- - tured. Whereas (*COMMIT) (see below) means "fail on backtrack", a re- + where A, B, and C may be complex expressions. After matching "A", the + matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) + is triggered and the match succeeds. In both cases, all but C is cap- + tured. Whereas (*COMMIT) (see below) means "fail on backtrack", a re- peated (*ACCEPT) of this type means "succeed on backtrack". 
- Warning: (*ACCEPT) should not be used within a script run group, be- - cause it causes an immediate exit from the group, bypassing the script + Warning: (*ACCEPT) should not be used within a script run group, be- + cause it causes an immediate exit from the group, bypassing the script run checking. (*FAIL) or (*FAIL:NAME) - This verb causes a matching failure, forcing backtracking to occur. It - may be abbreviated to (*F). It is equivalent to (?!) but easier to + This verb causes a matching failure, forcing backtracking to occur. It + may be abbreviated to (*F). It is equivalent to (?!) but easier to read. The Perl documentation notes that it is probably useful only when combined with (?{}) or (??{}). Those are, of course, Perl features that - are not present in PCRE2. The nearest equivalent is the callout fea- + are not present in PCRE2. The nearest equivalent is the callout fea- ture, as for example in this pattern: a+(?C)(*FAIL) - A match with the string "aaaa" always fails, but the callout is taken + A match with the string "aaaa" always fails, but the callout is taken before each backtrack happens (in this example, 10 times). - (*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*AC- - CEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is + (*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*AC- + CEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before the verb acts. Recording which path was taken - There is one verb whose main purpose is to track how a match was ar- - rived at, though it also has a secondary use in conjunction with ad- + There is one verb whose main purpose is to track how a match was ar- + rived at, though it also has a secondary use in conjunction with ad- vancing the match starting point (see (*SKIP) below). (*MARK:NAME) or (*:NAME) - A name is always required with this verb. For all the other backtrack- + A name is always required with this verb. 
For all the other backtrack- ing control verbs, a NAME argument is optional. - When a match succeeds, the name of the last-encountered mark name on + When a match succeeds, the name of the last-encountered mark name on the matching path is passed back to the caller as described in the sec- tion entitled "Other information about the match" in the pcre2api docu- - mentation. This applies to all instances of (*MARK) and other verbs, + mentation. This applies to all instances of (*MARK) and other verbs, including those inside assertions and atomic groups. However, there are - differences in those cases when (*MARK) is used in conjunction with + differences in those cases when (*MARK) is used in conjunction with (*SKIP) as described below. - The mark name that was last encountered on the matching path is passed - back. A verb without a NAME argument is ignored for this purpose. Here - is an example of pcre2test output, where the "mark" modifier requests + The mark name that was last encountered on the matching path is passed + back. A verb without a NAME argument is ignored for this purpose. Here + is an example of pcre2test output, where the "mark" modifier requests the retrieval and outputting of (*MARK) data: re> /X(*MARK:A)Y|X(*MARK:B)Z/mark @@ -9668,30 +10197,31 @@ BACKTRACKING CONTROL MK: B The (*MARK) name is tagged with "MK:" in this output, and in this exam- - ple it indicates which of the two alternatives matched. This is a more - efficient way of obtaining this information than putting each alterna- + ple it indicates which of the two alternatives matched. This is a more + efficient way of obtaining this information than putting each alterna- tive in its own capturing parentheses. 
- If a verb with a name is encountered in a positive assertion that is - true, the name is recorded and passed back if it is the last-encoun- + If a verb with a name is encountered in a positive assertion that is + true, the name is recorded and passed back if it is the last-encoun- tered. This does not happen for negative assertions or failing positive assertions. - After a partial match or a failed match, the last encountered name in + After a partial match or a failed match, the last encountered name in the entire match process is returned. For example: re> /X(*MARK:A)Y|X(*MARK:B)Z/mark data> XP No match, mark = B - Note that in this unanchored example the mark is retained from the + Note that in this unanchored example the mark is retained from the match attempt that started at the letter "X" in the subject. Subsequent match attempts starting at "P" and then with an empty string do not get as far as the (*MARK) item, but nevertheless do not reset it. - If you are interested in (*MARK) values after failed matches, you - should probably set the PCRE2_NO_START_OPTIMIZE option (see above) to - ensure that the match is always attempted. + If you are interested in (*MARK) values after failed matches, you + should probably either set the PCRE2_NO_START_OPTIMIZE option or call + pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see + above) to ensure that the match is always attempted. Verbs that act after backtracking @@ -9699,11 +10229,11 @@ BACKTRACKING CONTROL tinues with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, back- tracking cannot pass to the left of the verb. However, when one of - these verbs appears inside an atomic group or in a lookaround assertion - that is true, its effect is confined to that group, because once the - group has been matched, there is never any backtracking into it. 
Back- - tracking from beyond an assertion or an atomic group ignores the entire - group, and seeks a preceding backtracking point. + these verbs appears inside an atomic group or in an atomic lookaround + assertion that is true, its effect is confined to that group, because + once the group has been matched, there is never any backtracking into + it. Backtracking from beyond an atomic assertion or group ignores the + entire group, and seeks a preceding backtracking point. These verbs differ in exactly what kind of failure occurs when back- tracking reaches them. The behaviour described below is what happens @@ -9960,21 +10490,23 @@ BACKTRACKING CONTROL (*MARK) name that is set in an assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern. - PCRE2 now supports non-atomic positive assertions, as described in the - section entitled "Non-atomic assertions" above. These assertions must - be standalone (not used as conditions). They are not Perl-compatible. - For these assertions, a later backtrack does jump back into the asser- - tion, and therefore verbs such as (*COMMIT) can be triggered by back- - tracks from later in the pattern. + PCRE2 now supports non-atomic positive assertions and also "scan sub- + string" assertions, as described in the sections entitled "Non-atomic + assertions" and "Scan substring assertions" above. These assertions + must be standalone (not used as conditions). They are not Perl-compati- + ble. For these assertions, a later backtrack does jump back into the + assertion, and therefore verbs such as (*COMMIT) can be triggered by + backtracks from later in the pattern. The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion - to be false, and a negative assertion to be true. + to be false, and a negative assertion to be true. This behaviour dif- + fers from Perl when the assertion has only one branch. 
- The other backtracking verbs are not treated specially if they appear - in a standalone positive assertion. In a conditional positive asser- + The other backtracking verbs are not treated specially if they appear + in a standalone positive assertion. In a conditional positive asser- tion, backtracking (from within the assertion) into (*COMMIT), (*SKIP), - or (*PRUNE) causes the condition to be false. However, for both stand- + or (*PRUNE) causes the condition to be false. However, for both stand- alone and conditional negative assertions, backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes the assertion to be true, without consider- ing any further alternative branches. @@ -9984,26 +10516,68 @@ BACKTRACKING CONTROL These behaviours occur whether or not the group is called recursively. (*ACCEPT) in a group called as a subroutine causes the subroutine match - to succeed without any further processing. Matching then continues af- - ter the subroutine call. Perl documents this behaviour. Perl's treat- + to succeed without any further processing. Matching then continues af- + ter the subroutine call. Perl documents this behaviour. Perl's treat- ment of the other verbs in subroutines is different in some cases. - (*FAIL) in a group called as a subroutine has its normal effect: it + (*FAIL) in a group called as a subroutine has its normal effect: it forces an immediate backtrack. - (*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail - when triggered by being backtracked to in a group called as a subrou- + (*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail + when triggered by being backtracked to in a group called as a subrou- tine. There is then a backtrack at the outer level. (*THEN), when triggered, skips to the next alternative in the innermost - enclosing group that has alternatives (its normal behaviour). However, + enclosing group that has alternatives (its normal behaviour). 
However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. +EBCDIC ENVIRONMENTS + + Differences in the way PCRE2 behaves when it is running in an EBCDIC en- + vironment are covered in this section. + + Escape sequences + + When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. + \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. + The \c escape is processed as specified for Perl in the perlebcdic doc- + ument. The only characters that are allowed after \c are A-Z, a-z, or + one of @, [, \, ], ^, _, or ?. Any other character provokes a compile- + time error. The sequence \c@ encodes character code 0; after \c the + letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, + \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \c? be- + comes either 255 (hex FF) or 95 (hex 5F). + + Thus, apart from \c?, these escapes generate the same character code + values as they do in an ASCII or Unicode environment, though the mean- + ings of the values mostly differ. For example, \cG always generates + code value 7, which is BEL in ASCII but DEL in EBCDIC. + + The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, + but because 127 is not a control character in EBCDIC, Perl makes it + generate the APC character. Unfortunately, there are several variants + of EBCDIC. In most of them the APC character has the value 255 (hex + FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If + certain other characters have POSIX-BC values, PCRE2 makes \c? generate + 95; otherwise it generates 255. + + Character classes + + In character classes there is a special case in EBCDIC environments for + ranges whose end points are both specified as literal letters in the + same case. For compatibility with Perl, EBCDIC code points within the + range that are not letters are omitted.
For example, [h-k] matches only + four characters, even though the EBCDIC codes for h and k are 0x88 and + 0x92, a range of 11 code points. However, if the range is specified nu- + merically, for example, [\x88-\x92] or [h-\x92], all code points are + included. + + SEE ALSO - pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), + pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3). @@ -10016,15 +10590,14 @@ AUTHOR REVISION - Last updated: 04 June 2024 + Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 04 June 2024 PCRE2PATTERN(3) +PCRE2 10.45 27 November 2024 PCRE2PATTERN(3) ------------------------------------------------------------------------------ - PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3) @@ -10272,15 +10845,14 @@ AUTHOR REVISION - Last updated: 27 July 2022 + Last updated: 06 December 2022 Copyright (c) 1997-2022 University of Cambridge. -PCRE2 10.41 27 July 2022 PCRE2PERFORM(3) +PCRE2 10.45 06 December 2022 PCRE2PERFORM(3) ------------------------------------------------------------------------------ - PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3) @@ -10431,7 +11003,7 @@ COMPILING A PATTERN When a pattern that is compiled with this flag is passed to pcre2_regexec() for matching, the nmatch and pmatch arguments are ig- - nored, and no captured strings are returned. Versions of the PCRE li- + nored, and no captured strings are returned. Versions of the PCRE2 li- brary prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile op- tion, but this no longer happens because it disables the use of back- references. @@ -10631,15 +11203,14 @@ AUTHOR REVISION - Last updated: 19 January 2024 + Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE2 10.43 19 January 2024 PCRE2POSIX(3) +PCRE2 10.45 27 November 2024 PCRE2POSIX(3) ------------------------------------------------------------------------------ - PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3) @@ -10725,13 +11296,12 @@ AUTHOR REVISION - Last updated: 02 February 2016 + Last updated: 14 November 2023 Copyright (c) 1997-2016 University of Cambridge. -PCRE2 10.22 02 February 2016 PCRE2SAMPLE(3) +PCRE2 10.45 14 November 2023 PCRE2SAMPLE(3) ------------------------------------------------------------------------------ - PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3) @@ -10917,15 +11487,14 @@ AUTHOR REVISION - Last updated: 27 June 2018 + Last updated: 19 January 2024 Copyright (c) 1997-2018 University of Cambridge. -PCRE2 10.32 27 June 2018 PCRE2SERIALIZE(3) +PCRE2 10.45 19 January 2024 PCRE2SERIALIZE(3) ------------------------------------------------------------------------------ - PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3) @@ -10935,9 +11504,11 @@ NAME PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY - The full syntax and semantics of the regular expressions that are sup- - ported by PCRE2 are described in the pcre2pattern documentation. This - document contains a quick-reference summary of the syntax. + The full syntax and semantics of the regular expression patterns that + are supported by PCRE2 are described in the pcre2pattern documentation. + This document contains a quick-reference summary of the pattern syntax + followed by the syntax of replacement strings in substitution function. + The full description of the latter is in the pcre2api documentation. QUOTING @@ -10947,22 +11518,24 @@ QUOTING Note that white space inside \Q...\E is always treated as literal, even if PCRE2_EXTENDED is set, causing most other white space to be ignored. + Note also that PCRE2's handling of \Q...\E has some differences from + Perl's. See the pcre2pattern documentation for details. 
BRACED ITEMS - With one exception, wherever brace characters { and } are required to - enclose data for constructions such as \g{2} or \k{name}, space and/or - horizontal tab characters that follow { or precede } are allowed and + With one exception, wherever brace characters { and } are required to + enclose data for constructions such as \g{2} or \k{name}, space and/or + horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear before or - after the comma. The exception is \u{...} which is not Perl-compatible + after the comma. The exception is \u{...} which is not Perl-compatible and is recognized only when PCRE2_EXTRA_ALT_BSUX is set. This is an EC- MAScript compatibility feature, and follows ECMAScript's behaviour. ESCAPED CHARACTERS - This table applies to ASCII and Unicode environments. An unrecognized + This table applies to ASCII and Unicode environments. An unrecognized escape sequence causes an error. \a alarm, that is, the BEL character (hex 07) @@ -10979,6 +11552,11 @@ ESCAPED CHARACTERS \xhh character with hex code hh \x{hh..} character with hex code hh.. + \N{U+hh..} is synonymous with \x{hh..} but is not supported in environ- + ments that use EBCDIC code (mainly IBM mainframes). Note that \N not + followed by an opening curly bracket has a different meaning (see be- + low). + If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized: @@ -10986,20 +11564,17 @@ ESCAPED CHARACTERS \uhhhh character with hex code hhhh \u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX - When \x is not followed by {, from zero to two hexadecimal digits are - read, but in ALT_BSUX mode \x must be followed by two hexadecimal dig- - its to be recognized as a hexadecimal escape; otherwise it matches a - literal "x". 
Likewise, if \u (in ALT_BSUX mode) is not followed by - four hexadecimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex - digits in curly brackets, it matches a literal "u". + When \x is not followed by {, one or two hexadecimal digits are read, + but in ALT_BSUX mode \x must be followed by two hexadecimal digits to + be recognized as a hexadecimal escape; otherwise it matches a literal + "x". Likewise, if \u (in ALT_BSUX mode) is not followed by four hexa- + decimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in + curly brackets, it matches a literal "u". Note that \0dd is always an octal code. The treatment of backslash fol- - lowed by a non-zero digit is complicated; for details see the section - "Non-printing characters" in the pcre2pattern documentation, where de- - tails of escape processing in EBCDIC environments are also given. - \N{U+hh..} is synonymous with \x{hh..} in PCRE2 but is not supported in - EBCDIC environments. Note that \N not followed by an opening curly - bracket has a different meaning (see below). + lowed by a non-zero digit is complicated; for details see the section + "Non-printing characters" in the pcre2pattern documentation, where de- + tails of escape processing in EBCDIC environments are also given. CHARACTER TYPES @@ -11023,23 +11598,24 @@ CHARACTER TYPES \W a "non-word" character \X a Unicode extended grapheme cluster - \C is dangerous because it may leave the current matching point in the + \C is dangerous because it may leave the current matching point in the middle of a UTF-8 or UTF-16 character. The application can lock out the - use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also + use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2 with the use of \C permanently disabled. 
- By default, \d, \s, and \w match only ASCII characters, even in UTF-8 + By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit libraries. However, if locale-specific - matching is happening, \s and \w may also match characters with code + matching is happening, \s and \w may also match characters with code points in the range 128-255. If the PCRE2_UCP option is set, the behav- iour of these escape sequences is changed to use Unicode properties and - they match many more characters, but there are some option settings - that can restrict individual sequences to matching only ASCII charac- + they match many more characters, but there are some option settings + that can restrict individual sequences to matching only ASCII charac- ters. Property descriptions in \p and \P are matched caselessly; hyphens, un- - derscores, and white space are ignored, in accordance with Unicode's - "loose matching" rules. + derscores, and ASCII white space characters are ignored, in accordance + with Unicode's "loose matching" rules. For example, \p{Bidi_Class=al} + is the same as \p{ bidi class = AL }. GENERAL CATEGORY PROPERTIES FOR \p and \P @@ -11052,13 +11628,13 @@ GENERAL CATEGORY PROPERTIES FOR \p and \P Cs Surrogate L Letter + Lc Cased letter, the union of Ll, Lu, and Lt + L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter - Lc Ll, Lu, or Lt - L& Ll, Lu, or Lt M Mark Mc Spacing mark @@ -11090,6 +11666,9 @@ GENERAL CATEGORY PROPERTIES FOR \p and \P Zp Paragraph separator Zs Space separator + From release 10.45, when caseless matching is set, Ll, Lu, and Lt are + all equivalent to Lc. + PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P @@ -11106,9 +11685,9 @@ PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P BINARY PROPERTIES FOR \p AND \P - Unicode defines a number of binary properties, that is, properties - whose only values are true or false. 
You can obtain a list of those - that are recognized by \p and \P, along with their abbreviations, by + Unicode defines a number of binary properties, that is, properties + whose only values are true or false. You can obtain a list of those + that are recognized by \p and \P, along with their abbreviations, by running this command: pcre2test -LP @@ -11116,8 +11695,8 @@ BINARY PROPERTIES FOR \p AND \P SCRIPT MATCHING WITH \p AND \P - Many script names and their 4-letter abbreviations are recognized in - \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P + Many script names and their 4-letter abbreviations are recognized in + \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of course). You can obtain a list of these scripts by running this com- mand: @@ -11153,7 +11732,7 @@ THE BIDI_CLASS PROPERTY FOR \p AND \P RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space CHARACTER CLASSES @@ -11179,10 +11758,50 @@ CHARACTER CLASSES word same as \w xdigit hexadecimal digit - In PCRE2, POSIX character set names recognize only ASCII characters by - default, but some of them use Unicode properties if PCRE2_UCP is set. + In PCRE2, POSIX character set names recognize only ASCII characters by + default, but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class. + When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes + may be used, allowing nested character classes, combined using set op- + erators. + + [x&&[^y]] UTS#18 extended character class + + x||y set union (OR) + x&&y set intersection (AND) + x--y set difference (AND NOT) + x~~y set symmetric difference (XOR) + + +PERL EXTENDED CHARACTER CLASSES + + (?[...]) Perl extended character class + (?[\p{Thai} & \p{Nd}]) operators; whitespace ignored + (?[(x - y) & z]) parentheses for grouping + + (?[ [^3] & \p{Nd} ]) [...] 
is a nested ordinary class + (?[ [:alpha:] - [z] ]) POSIX set is allowed outside [...] + (?[ \d - [3] ]) backslash-escaped set is allowed outside + [...] + (?[ !\n & [:ascii:] ]) backslash-escaped character is allowed out- + side [...] + all other characters or ranges must be enclosed + in [...] + + x|y, x+y set union (OR) + x&y set intersection (AND) + x-y set difference (AND NOT) + x^y set symmetric difference (XOR) + !x set complement (NOT) + + Inside a Perl extended character class, [...] switches mode to be in- + terpreted as an ordinary character class. Outside of a nested [...], + the only items permitted are backslash-escapes, POSIX sets, operators, + and parentheses. Inside a nested ordinary class, ^ has its usual mean- + ing (inverts the class when used as the first character); outside of a + nested class, ^ is the XOR operator. + QUANTIFIERS @@ -11289,7 +11908,7 @@ OPTION SETTING (?^) unset imnrsx options (?aP) implies (?aT) as well, though this has no additional effect. How- - ever, it means that (?-aP) is really (?-PT) which disables all ASCII + ever, it means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes. Unsetting x or xx unsets both. Several options may be set at once, and @@ -11299,20 +11918,25 @@ OPTION SETTING capture group, for example (?i:...). The following are recognized only at the very start of a pattern or af- - ter one of the newline or \R options with similar syntax. More than one - of them may appear. For the first three, d is a decimal number. - - (*LIMIT_DEPTH=d) set the backtracking limit to d - (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes - (*LIMIT_MATCH=d) set the match limit to d - (*NOTEMPTY) set PCRE2_NOTEMPTY when matching - (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching - (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) + ter one of the newline or \R sequences or options with similar syntax. + More than one of them may appear. 
For the first three, d is a decimal + number. + + (*LIMIT_DEPTH=d) set the backtracking limit to d + (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes + (*LIMIT_MATCH=d) set the match limit to d + (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching + (*NOTEMPTY) set PCRE2_NOTEMPTY when matching + (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching + (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) - (*NO_JIT) disable JIT optimization - (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) - (*UTF) set appropriate UTF mode for the library in use - (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) + (*NO_JIT) disable JIT optimization + (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OP- + TIMIZE) + (*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching + (*UTF) set appropriate UTF mode for the library in use + (*UCP) set PCRE2_UCP (use Unicode properties for \d + etc) Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or @@ -11383,6 +12007,22 @@ NON-ATOMIC LOOKAROUND ASSERTIONS (*non_atomic_positive_lookbehind:...) ) +SUBSTRING SCAN ASSERTION + This feature is not Perl-compatible. + + (*scan_substring:(grouplist)...) scan captured substring + (*scs:(grouplist)...) scan captured substring + + The comma-separated list may identify groups in any of the following + ways: + + n absolute reference + +n relative reference + -n relative reference + <name> name + 'name' name + + SCRIPT RUNS (*script_run:...) ) script run, can be backtracked into @@ -11444,16 +12084,16 @@ CONDITIONAL PATTERNS (?(VERSION[>]=n.m) test PCRE2 version (?(assert) assertion condition - Note the ambiguity of (?(R) and (?(Rn) which might be named reference - conditions or recursion tests.
Such a condition is interpreted as a + Note the ambiguity of (?(R) and (?(Rn) which might be named reference + conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists. BACKTRACKING CONTROL - All backtracking control verbs may be in the form (*VERB:NAME). For - (*MARK) the name is mandatory, for the others it is optional. (*SKIP) - changes its behaviour if :NAME is present. The others just set a name + All backtracking control verbs may be in the form (*VERB:NAME). For + (*MARK) the name is mandatory, for the others it is optional. (*SKIP) + changes its behaviour if :NAME is present. The others just set a name for passing back to the caller, but this is not a name that (*SKIP) can see. The following act immediately they are reached: @@ -11461,7 +12101,7 @@ BACKTRACKING CONTROL (*FAIL) force backtrack; synonym (*F) (*MARK:NAME) set name to be passed back; synonym (*:NAME) - The following act only when a subsequent match failure causes a back- + The following act only when a subsequent match failure causes a back- track to reach them. They all force a match failure, but they differ in what happens afterwards. Those that advance the start-of-match point do so only if the pattern is not anchored. @@ -11473,7 +12113,7 @@ BACKTRACKING CONTROL (*MARK:NAME); if not found, the (*SKIP) is ignored (*THEN) local failure, backtrack to next alternation - The effect of one of these verbs in a group called as a subroutine is + The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call. @@ -11484,14 +12124,61 @@ CALLOUTS (?C"text") callout with string data The allowed string delimiters are ` ' " ^ % # $ (which are the same for - the start and the end), and the starting delimiter { matched with the - ending delimiter }. To encode the ending delimiter within the string, + the start and the end), and the starting delimiter { matched with the + ending delimiter }. 
To encode the ending delimiter within the string, double it. +REPLACEMENT STRINGS + + If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for + pcre2_substitute() is not interpreted. Otherwise, by default, the only + special character is the dollar character in one of the following + forms: + + $$ insert a dollar character + $n or ${n} insert the contents of group n + $<name> insert the contents of named group + $0 or $& insert the entire matched substring + $` insert the substring that precedes the match + $' insert the substring that follows the match + $_ insert the entire input string + $*MARK or ${*MARK} insert a control verb name + + For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is + set, there is additional interpretation: + + 1. Backslash is an escape character, and the forms described in "ES- + CAPED CHARACTERS" above are recognized. Also: + + \Q...\E can be used to suppress interpretation + \l force the next character to lower case + \u force the next character to upper case + \L force subsequent characters to lower case + \U force subsequent characters to upper case + \u\L force next character to upper case, then all lower + \l\U force next character to lower case, then all upper + \E end \L or \U case forcing + \b backspace character (note: as in character class in pat- + tern) + \v vertical tab character (note: not the same as in a pattern) + + 2. The Python form \g<n>, where the angle brackets are part of the syn- + tax and n is either a group name or a number, is recognized as an al- + ternative way of inserting the contents of a group, for example \g<3>. + + 3. Capture substitution supports the following additional forms: + + ${n:-string} default for unset group + ${n:+string1:string2} values for set/unset group + + The substitution strings themselves are expanded. Backslash can be used + to escape colons and closing curly brackets.
+ + SEE ALSO - pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), + pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3). @@ -11504,20 +12191,19 @@ AUTHOR REVISION - Last updated: 12 October 2023 - Copyright (c) 1997-2023 University of Cambridge. + Last updated: 27 November 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 12 October 2023 PCRE2SYNTAX(3) +PCRE2 10.45 27 November 2024 PCRE2SYNTAX(3) ------------------------------------------------------------------------------ - PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3) NAME - PCRE - Perl-compatible regular expressions (revised API) + PCRE2 - Perl-compatible regular expressions (revised API) UNICODE AND UTF SUPPORT @@ -11554,7 +12240,7 @@ UNICODE PROPERTY SUPPORT ting. The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal num- - ber, the derived properties Any and LC (synonym L&), the Unicode script + ber, the derived properties Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties. @@ -11647,173 +12333,203 @@ UNICODE CASE-EQUIVALENCE in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. + Without PCRE2_EXTRA_CASELESS_RESTRICT: + 'k' = 'K' = U+212A (Kelvin sign) + 's' = 'S' = U+017F (long S) + With PCRE2_EXTRA_CASELESS_RESTRICT: + 'k' = 'K' + U+212A (Kelvin sign) only case-equivalent to itself + 's' = 'S' + U+017F (long S) only case-equivalent to itself + + One language family, Turkish and Azeri, has its own case-insensitivity + rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. + This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot + above), and U+0131 (small dotless i) characters. 
+ + Without PCRE2_EXTRA_TURKISH_CASING: + 'i' = 'I' + U+0130 (capital I with dot above) only case-equivalent to itself + U+0131 (small dotless i) only case-equivalent to itself + With PCRE2_EXTRA_TURKISH_CASING: + 'i' = U+0130 (capital I with dot above) + U+0131 (small dotless i) = 'I' + + It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and + PCRE2_EXTRA_TURKISH_CASING together. + + From release 10.45 the Unicode letter properties Lu (upper case), Ll + (lower case), and Lt (title case) are all treated as Lc (cased letter) + when caseless matching is set by the PCRE2_CASELESS option or (?i) + within the pattern. + SCRIPT RUNS - The pattern constructs (*script_run:...) and (*atomic_script_run:...), - with synonyms (*sr:...) and (*asr:...), verify that the string matched - within the parentheses is a script run. In concept, a script run is a - sequence of characters that are all from the same Unicode script. How- + The pattern constructs (*script_run:...) and (*atomic_script_run:...), + with synonyms (*sr:...) and (*asr:...), verify that the string matched + within the parentheses is a script run. In concept, a script run is a + sequence of characters that are all from the same Unicode script. How- ever, because some scripts are commonly used together, and because some - diacritical and other marks are used with multiple scripts, it is not + diacritical and other marks are used with multiple scripts, it is not that simple. Every Unicode character has a Script property, mostly with a value cor- - responding to the name of a script, such as Latin, Greek, or Cyrillic. + responding to the name of a script, such as Latin, Greek, or Cyrillic. There are also three special values: "Unknown" is used for code points that have not been assigned, and also - for the surrogate code points. 
In the PCRE2 32-bit library, characters - whose code points are greater than the Unicode maximum (U+10FFFF), - which are accessible only in non-UTF mode, are assigned the Unknown + for the surrogate code points. In the PCRE2 32-bit library, characters + whose code points are greater than the Unicode maximum (U+10FFFF), + which are accessible only in non-UTF mode, are assigned the Unknown script. - "Common" is used for characters that are used with many scripts. These - include punctuation, emoji, mathematical, musical, and currency sym- + "Common" is used for characters that are used with many scripts. These + include punctuation, emoji, mathematical, musical, and currency sym- bols, and the ASCII digits 0 to 9. - "Inherited" is used for characters such as diacritical marks that mod- + "Inherited" is used for characters such as diacritical marks that mod- ify a previous character. These are considered to take on the script of the character that they modify. - Some Inherited characters are used with many scripts, but many of them - are only normally used with a small number of scripts. For example, + Some Inherited characters are used with many scripts, but many of them + are only normally used with a small number of scripts. For example, U+102E0 (Coptic Epact thousands mark) is used only with Arabic and Cop- - tic. In order to make it possible to check this, a Unicode property + tic. In order to make it possible to check this, a Unicode property called Script Extension exists. Its value is a list of scripts that ap- ply to the character. For the majority of characters, the list contains - just one script, the same one as the Script property. However, for - characters such as U+102E0 more than one Script is listed. There are - also some Common characters that have a single, non-Common script in + just one script, the same one as the Script property. However, for + characters such as U+102E0 more than one Script is listed. 
There are + also some Common characters that have a single, non-Common script in their Script Extension list. The next section describes the basic rules for deciding whether a given - string of characters is a script run. Note, however, that there are - some special cases involving the Chinese Han script, and an additional - constraint for decimal digits. These are covered in subsequent sec- + string of characters is a script run. Note, however, that there are + some special cases involving the Chinese Han script, and an additional + constraint for decimal digits. These are covered in subsequent sec- tions. Basic script run rules A string that is less than two characters long is a script run. This is - the only case in which an Unknown character can be part of a script - run. Longer strings are checked using only the Script Extensions prop- + the only case in which an Unknown character can be part of a script + run. Longer strings are checked using only the Script Extensions prop- erty, not the basic Script property. - If a character's Script Extension property is the single value "Inher- + If a character's Script Extension property is the single value "Inher- ited", it is always accepted as part of a script run. This is also true - for the property "Common", subject to the checking of decimal digits + for the property "Common", subject to the checking of decimal digits described below. All the remaining characters in a script run must have - at least one script in common in their Script Extension lists. In set- + at least one script in common in their Script Extension lists. In set- theoretic terminology, the intersection of all the sets of scripts must not be empty. - A simple example is an Internet name such as "google.com". The letters + A simple example is an Internet name such as "google.com". The letters are all in the Latin script, and the dot is Common, so this string is a script run. 
However, the Cyrillic letter "o" looks exactly the same as - the Latin "o"; a string that looks the same, but with Cyrillic "o"s is + the Latin "o"; a string that looks the same, but with Cyrillic "o"s is not a script run. - More interesting examples involve characters with more than one script + More interesting examples involve characters with more than one script in their Script Extension. Consider the following characters: U+060C Arabic comma U+06D4 Arabic full stop - The first has the Script Extension list Arabic, Hanifi Rohingya, Syr- - iac, and Thaana; the second has just Arabic and Hanifi Rohingya. Both - of them could appear in script runs of either Arabic or Hanifi Ro- - hingya. The first could also appear in Syriac or Thaana script runs, + The first has the Script Extension list Arabic, Hanifi Rohingya, Syr- + iac, and Thaana; the second has just Arabic and Hanifi Rohingya. Both + of them could appear in script runs of either Arabic or Hanifi Ro- + hingya. The first could also appear in Syriac or Thaana script runs, but the second could not. The Chinese Han script - The Chinese Han script is commonly used in conjunction with other - scripts for writing certain languages. Japanese uses the Hiragana and - Katakana scripts together with Han; Korean uses Hangul and Han; Tai- - wanese Mandarin uses Bopomofo and Han. These three combinations are - treated as special cases when checking script runs and are, in effect, - "virtual scripts". Thus, a script run may contain a mixture of Hira- - gana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture - of Bopomofo and Han, but not, for example, a mixture of Hangul and - Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Stan- - dard 39 ("Unicode Security Mechanisms", http://unicode.org/re- + The Chinese Han script is commonly used in conjunction with other + scripts for writing certain languages. 
Japanese uses the Hiragana and + Katakana scripts together with Han; Korean uses Hangul and Han; Tai- + wanese Mandarin uses Bopomofo and Han. These three combinations are + treated as special cases when checking script runs and are, in effect, + "virtual scripts". Thus, a script run may contain a mixture of Hira- + gana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture + of Bopomofo and Han, but not, for example, a mixture of Hangul and + Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Stan- + dard 39 ("Unicode Security Mechanisms", http://unicode.org/re- + ports/tr39/) in allowing such mixtures. Decimal digits - Unicode contains many sets of 10 decimal digits in different scripts, - and some scripts (including the Common script) contain more than one - set. Some of these decimal digits them are visually indistinguishable - from the common ASCII digits. In addition to the script checking de- - scribed above, if a script run contains any decimal digits, they must + Unicode contains many sets of 10 decimal digits in different scripts, + and some scripts (including the Common script) contain more than one + set. Some of these decimal digits are visually indistinguishable + from the common ASCII digits. In addition to the script checking de- + scribed above, if a script run contains any decimal digits, they must all come from the same set of 10 adjacent characters. VALIDITY OF UTF STRINGS - When the PCRE2_UTF option is set, the strings passed as patterns and + When the PCRE2_UTF option is set, the strings passed as patterns and subjects are (by default) checked for validity on entry to the relevant functions. If an invalid UTF string is passed, a negative error code is - returned.
The code unit offset to the offending character can be ex- + tracted from the match data block by calling pcre2_get_startchar(), which is used for this purpose after a UTF error. - In some situations, you may already know that your strings are valid, - and therefore want to skip these checks in order to improve perfor- - mance, for example in the case of a long subject string that is being - scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com- - pile time or at match time, PCRE2 assumes that the pattern or subject + In some situations, you may already know that your strings are valid, + and therefore want to skip these checks in order to improve perfor- + mance, for example in the case of a long subject string that is being + scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com- + pile time or at match time, PCRE2 assumes that the pattern or subject it is given (respectively) contains only valid UTF code unit sequences. - If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the - result is undefined and your program may crash or loop indefinitely or - give incorrect results. There is, however, one mode of matching that - can handle invalid UTF subject strings. This is enabled by passing - PCRE2_MATCH_INVALID_UTF to pcre2_compile() and is discussed below in - the next section. The rest of this section covers the case when + If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the + result is undefined and your program may crash or loop indefinitely or + give incorrect results. There is, however, one mode of matching that + can handle invalid UTF subject strings. This is enabled by passing + PCRE2_MATCH_INVALID_UTF to pcre2_compile() and is discussed below in + the next section. The rest of this section covers the case when PCRE2_MATCH_INVALID_UTF is not set. - Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF - check for the pattern; it does not also apply to subject strings. 
If - you want to disable the check for a subject string you must pass this + Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF + check for the pattern; it does not also apply to subject strings. If + you want to disable the check for a subject string you must pass this same option to pcre2_match() or pcre2_dfa_match(). UTF-16 and UTF-32 strings can indicate their endianness by special code - knows as a byte-order mark (BOM). The PCRE2 functions do not handle + known as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order. - Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any + Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any other processing takes place. In the case of pcre2_match() and - pcre2_dfa_match() calls with a non-zero starting offset, the check is + pcre2_dfa_match() calls with a non-zero starting offset, the check is applied only to that part of the subject that could be inspected during - matching, and there is a check that the starting offset points to the - first code unit of a character or to the end of the subject. If there - are no lookbehind assertions in the pattern, the check starts at the - starting offset. Otherwise, it starts at the length of the longest - lookbehind before the starting offset, or at the start of the subject - if there are not that many characters before the starting offset. Note + matching, and there is a check that the starting offset points to the + first code unit of a character or to the end of the subject. If there + are no lookbehind assertions in the pattern, the check starts at the + starting offset. Otherwise, it starts at the length of the longest + lookbehind before the starting offset, or at the start of the subject + if there are not that many characters before the starting offset. Note that the sequences \b and \B are one-character lookbehinds.
- In addition to checking the format of the string, there is a check to + In addition to checking the format of the string, there is a check to ensure that all code points lie in the range U+0 to U+10FFFF, excluding - the surrogate area. The so-called "non-character" code points are not + the surrogate area. The so-called "non-character" code points are not excluded because Unicode corrigendum #9 makes it clear that they should not be. - Characters in the "Surrogate Area" of Unicode are reserved for use by - UTF-16, where they are used in pairs to encode code points with values - greater than 0xFFFF. The code points that are encoded by UTF-16 pairs - are available independently in the UTF-8 and UTF-32 encodings. (In - other words, the whole surrogate thing is a fudge for UTF-16 which un- + Characters in the "Surrogate Area" of Unicode are reserved for use by + UTF-16, where they are used in pairs to encode code points with values + greater than 0xFFFF. The code points that are encoded by UTF-16 pairs + are available independently in the UTF-8 and UTF-32 encodings. (In + other words, the whole surrogate thing is a fudge for UTF-16 which un- fortunately messes up UTF-8 and UTF-32.) - Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error - that is given if an escape sequence for an invalid Unicode code point - is encountered in the pattern. If you want to allow escape sequences - such as \x{d800} (a surrogate code point) you can set the PCRE2_EX- - TRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible - only in UTF-8 and UTF-32 modes, because these values are not repre- + Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error + that is given if an escape sequence for an invalid Unicode code point + is encountered in the pattern. If you want to allow escape sequences + such as \x{d800} (a surrogate code point) you can set the PCRE2_EX- + TRA_ALLOW_SURROGATE_ESCAPES extra option. 
However, this is possible + only in UTF-8 and UTF-32 modes, because these values are not repre- sentable in UTF-16. Errors in UTF-8 strings @@ -11826,10 +12542,10 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR4 PCRE2_ERROR_UTF8_ERR5 - The string ends with a truncated UTF-8 character; the code specifies - how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 - characters to be no longer than 4 bytes, the encoding scheme (origi- - nally defined by RFC 2279) allows for up to 6 bytes, and this is + The string ends with a truncated UTF-8 character; the code specifies + how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 + characters to be no longer than 4 bytes, the encoding scheme (origi- + nally defined by RFC 2279) allows for up to 6 bytes, and this is checked first; hence the possibility of 4 or 5 missing bytes. PCRE2_ERROR_UTF8_ERR6 @@ -11839,13 +12555,13 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR10 The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of - the character do not have the binary value 0b10 (that is, either the + the character do not have the binary value 0b10 (that is, either the most significant bit is 0, or the next bit is 1). PCRE2_ERROR_UTF8_ERR11 PCRE2_ERROR_UTF8_ERR12 - A character that is valid by the RFC 2279 rules is either 5 or 6 bytes + A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; these code points are excluded by RFC 3629. PCRE2_ERROR_UTF8_ERR13 @@ -11855,8 +12571,8 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR14 - A 3-byte character has a value in the range 0xd800 to 0xdfff; this - range of code points are reserved by RFC 3629 for use with UTF-16, and + A 3-byte character has a value in the range 0xd800 to 0xdfff; this + range of code points are reserved by RFC 3629 for use with UTF-16, and so are excluded from UTF-8. 
PCRE2_ERROR_UTF8_ERR15 @@ -11865,26 +12581,26 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR18 PCRE2_ERROR_UTF8_ERR19 - A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes - for a value that can be represented by fewer bytes, which is invalid. - For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- + A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes + for a value that can be represented by fewer bytes, which is invalid. + For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- rect coding uses just one byte. PCRE2_ERROR_UTF8_ERR20 The two most significant bits of the first byte of a character have the - binary value 0b10 (that is, the most significant bit is 1 and the sec- - ond is 0). Such a byte can only validly occur as the second or subse- + binary value 0b10 (that is, the most significant bit is 1 and the sec- + ond is 0). Such a byte can only validly occur as the second or subse- quent byte of a multi-byte character. PCRE2_ERROR_UTF8_ERR21 - The first byte of a character has the value 0xfe or 0xff. These values + The first byte of a character has the value 0xfe or 0xff. These values can never occur in a valid UTF-8 string. Errors in UTF-16 strings - The following negative error codes are given for invalid UTF-16 + The following negative error codes are given for invalid UTF-16 strings: PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string @@ -11894,7 +12610,7 @@ VALIDITY OF UTF STRINGS Errors in UTF-32 strings - The following negative error codes are given for invalid UTF-32 + The following negative error codes are given for invalid UTF-32 strings: PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) @@ -11904,60 +12620,60 @@ VALIDITY OF UTF STRINGS MATCHING IN INVALID UTF STRINGS You can run pattern matches on subject strings that may contain invalid - UTF sequences if you call pcre2_compile() with the PCRE2_MATCH_IN- - VALID_UTF option. 
This is supported by pcre2_match(), including JIT + UTF sequences if you call pcre2_compile() with the PCRE2_MATCH_IN- + VALID_UTF option. This is supported by pcre2_match(), including JIT matching, but not by pcre2_dfa_match(). When PCRE2_MATCH_INVALID_UTF is - set, it forces PCRE2_UTF to be set as well. Note, however, that the + set, it forces PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a valid UTF string. - If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile, - and you are not certain that your subject strings are valid UTF se- - quences, you should not make use of the JIT "fast path" function - pcre2_jit_match() because it bypasses sanity checks, including the one - for UTF validity. An invalid string may cause undefined behaviour, in- + If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile, + and you are not certain that your subject strings are valid UTF se- + quences, you should not make use of the JIT "fast path" function + pcre2_jit_match() because it bypasses sanity checks, including the one + for UTF validity. An invalid string may cause undefined behaviour, in- cluding looping, crashing, or giving the wrong answer. - Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() - generates, but if pcre2_jit_compile() is subsequently called, it does + Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() + generates, but if pcre2_jit_compile() is subsequently called, it does generate different code. If JIT is not used, the option affects the be- haviour of the interpretive code in pcre2_match(). When PCRE2_MATCH_IN- - VALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at + VALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time. - In this mode, an invalid code unit sequence in the subject never - matches any pattern item. It does not match dot, it does not match - \p{Any}, it does not even match negative items such as [^X]. 
A lookbe- - hind assertion fails if it encounters an invalid sequence while moving - the current point backwards. In other words, an invalid UTF code unit + In this mode, an invalid code unit sequence in the subject never + matches any pattern item. It does not match dot, it does not match + \p{Any}, it does not even match negative items such as [^X]. A lookbe- + hind assertion fails if it encounters an invalid sequence while moving + the current point backwards. In other words, an invalid UTF code unit sequence acts as a barrier which no match can cross. You can also think of this as the subject being split up into fragments - of valid UTF, delimited internally by invalid code unit sequences. The - pattern is matched fragment by fragment. The result of a successful - match, however, is given as code unit offsets in the entire subject + of valid UTF, delimited internally by invalid code unit sequences. The + pattern is matched fragment by fragment. The result of a successful + match, however, is given as code unit offsets in the entire subject string in the usual way. There are a few points to consider: - The internal boundaries are not interpreted as the beginnings or ends - of lines and so do not match circumflex or dollar characters in the + The internal boundaries are not interpreted as the beginnings or ends + of lines and so do not match circumflex or dollar characters in the pattern. - If pcre2_match() is called with an offset that points to an invalid - UTF-sequence, that sequence is skipped, and the match starts at the + If pcre2_match() is called with an offset that points to an invalid + UTF-sequence, that sequence is skipped, and the match starts at the next valid UTF character, or the end of the subject. At internal fragment boundaries, \b and \B behave in the same way as at - the beginning and end of the subject. 
For example, a sequence such as - \bWORD\b would match an instance of WORD that is surrounded by invalid + the beginning and end of the subject. For example, a sequence such as + \bWORD\b would match an instance of WORD that is surrounded by invalid UTF code units. - Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbi- - trary data, knowing that any matched strings that are returned are + Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbi- + trary data, knowing that any matched strings that are returned are valid UTF. This can be useful when searching for UTF text in executable or other binary files. - Note, however, that the 16-bit and 32-bit PCRE2 libraries process - strings as sequences of uint16_t or uint32_t code points. They cannot - find valid UTF sequences within an arbitrary string of bytes unless + Note, however, that the 16-bit and 32-bit PCRE2 libraries process + strings as sequences of uint16_t or uint32_t code points. They cannot + find valid UTF sequences within an arbitrary string of bytes unless such sequences are suitably aligned. @@ -11970,11 +12686,11 @@ AUTHOR REVISION - Last updated: 12 October 2023 - Copyright (c) 1997-2023 University of Cambridge. + Last updated: 27 November 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 04 February 2023 PCRE2UNICODE(3) +PCRE2 10.45 27 November 2024 PCRE2UNICODE(3) ------------------------------------------------------------------------------ diff --git a/mingw32/share/doc/pcre2/pcre2grep.txt b/mingw32/share/doc/pcre2/pcre2grep.txt index 7914c450fcb..9e07a5a7dac 100644 --- a/mingw32/share/doc/pcre2/pcre2grep.txt +++ b/mingw32/share/doc/pcre2/pcre2grep.txt @@ -1,4 +1,3 @@ - PCRE2GREP(1) General Commands Manual PCRE2GREP(1) @@ -366,139 +365,140 @@ OPTIONS used. What constitutes a newline when reading the file is the operating system's default interpretation of \n. The --new- line option has no effect on this option. 
Trailing white - space is removed from each line, and blank lines are ignored. - An empty file contains no patterns and therefore matches - nothing. Patterns read from a file in this way may contain - binary zeros, which are treated as ordinary data characters. - - If this option is given more than once, all the specified - files are read. A data line is output if any of the patterns - match it. A file name can be given as "-" to refer to the - standard input. When -f is used, patterns specified on the - command line using -e may also be present; they are matched + space is removed from each line, and blank lines are ignored + unless the --posix-pattern-file option is also provided. An + empty file contains no patterns and therefore matches noth- + ing. Patterns read from a file in this way may contain binary + zeros, which are treated as ordinary character literals. + + If this option is given more than once, all the specified + files are read. A data line is output if any of the patterns + match it. A file name can be given as "-" to refer to the + standard input. When -f is used, patterns specified on the + command line using -e may also be present; they are matched before the file's patterns. However, no pattern is taken from - the command line; all arguments are treated as the names of + the command line; all arguments are treated as the names of paths to be searched. --file-list=filename - Read a list of files and/or directories that are to be + Read a list of files and/or directories that are to be scanned from the given file, one per line. What constitutes a - newline when reading the file is the operating system's de- - fault. Trailing white space is removed from each line, and + newline when reading the file is the operating system's de- + fault. Trailing white space is removed from each line, and blank lines are ignored. These paths are processed before any - that are listed on the command line. 
The file name can be - given as "-" to refer to the standard input. If --file and - --file-list are both specified as "-", patterns are read - first. This is useful only when the standard input is a ter- - minal, from which further lines (the list of files) can be + that are listed on the command line. The file name can be + given as "-" to refer to the standard input. If --file and + --file-list are both specified as "-", patterns are read + first. This is useful only when the standard input is a ter- + minal, from which further lines (the list of files) can be read after an end-of-file indication. If this option is given more than once, all the specified files are read. --file-offsets - Instead of showing lines or parts of lines that match, show - each match as an offset from the start of the file and a - length, separated by a comma. In this mode, --colour has no - effect, and no context is shown. That is, the -A, -B, and -C - options are ignored. If there is more than one match in a - line, each of them is shown separately. This option is mutu- - ally exclusive with --output, --line-offsets, and --only- + Instead of showing lines or parts of lines that match, show + each match as an offset from the start of the file and a + length, separated by a comma. In this mode, --colour has no + effect, and no context is shown. That is, the -A, -B, and -C + options are ignored. If there is more than one match in a + line, each of them is shown separately. This option is mutu- + ally exclusive with --output, --line-offsets, and --only- matching. --group-separator=text Output this text string instead of two hyphens between groups - of lines when -A, -B, or -C is in use. See also --no-group- + of lines when -A, -B, or -C is in use. See also --no-group- separator. -H, --with-filename - Force the inclusion of the file name at the start of output + Force the inclusion of the file name at the start of output lines when searching a single file. 
The file name is not nor- - mally shown in this case. By default, for matching lines, - the file name is followed by a colon; for context lines, a + mally shown in this case. By default, for matching lines, + the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name. When the -M option causes a - pattern to match more than one line, only the first is pre- - ceded by the file name. This option overrides any previous + pattern to match more than one line, only the first is pre- + ceded by the file name. This option overrides any previous -h, -l, or -L options. -h, --no-filename Suppress the output file names when searching multiple files. File names are normally shown when multiple files are - searched. By default, for matching lines, the file name is + searched. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a - zero byte. If a line number is also being output, it follows + zero byte. If a line number is also being output, it follows the file name. This option overrides any previous -H, -L, or -l options. --heap-limit=number See --match-limit below. - --help Output a help message, giving brief details of the command - options and file type support, and then exit. Anything else + --help Output a help message, giving brief details of the command + options and file type support, and then exit. Anything else on the command line is ignored. - -I Ignore binary files. This is equivalent to --binary- + -I Ignore binary files. This is equivalent to --binary- files=without-match. -i, --ignore-case - Ignore upper/lower case distinctions when pattern matching. + Ignore upper/lower case distinctions when pattern matching. 
This applies when matching path names for inclusion or exclu- sion as well as when matching lines in files. --include=pattern - If any --include patterns are specified, the only files that + If any --include patterns are specified, the only files that are processed are those whose names match one of the patterns - and do not match an --exclude pattern. This option does not - affect directories, but it applies to all files, whether - listed on the command line, obtained from --file-list, or by - scanning a directory. The pattern is a PCRE2 regular expres- - sion, and is matched against the final component of the file - name, not the entire path. The -F, -w, and -x options do not - apply to this pattern. The option may be given any number of - times. If a file name matches both an --include and an --ex- - clude pattern, it is excluded. There is no short form for + and do not match an --exclude pattern. This option does not + affect directories, but it applies to all files, whether + listed on the command line, obtained from --file-list, or by + scanning a directory. The pattern is a PCRE2 regular expres- + sion, and is matched against the final component of the file + name, not the entire path. The -F, -w, and -x options do not + apply to this pattern. The option may be given any number of + times. If a file name matches both an --include and an --ex- + clude pattern, it is excluded. There is no short form for this option. --include-from=filename - Treat each non-empty line of the file as the data for an + Treat each non-empty line of the file as the data for an --include option. What constitutes a newline for this purpose - is the operating system's default. The --newline option has + is the operating system's default. The --newline option has no effect on this option. This option may be given any number of times; all the files are read. 
--include-dir=pattern - If any --include-dir patterns are specified, the only direc- - tories that are processed are those whose names match one of - the patterns and do not match an --exclude-dir pattern. This - applies to all directories, whether listed on the command - line, obtained from --file-list, or by scanning a parent di- - rectory. The pattern is a PCRE2 regular expression, and is - matched against the final component of the directory name, - not the entire path. The -F, -w, and -x options do not apply + If any --include-dir patterns are specified, the only direc- + tories that are processed are those whose names match one of + the patterns and do not match an --exclude-dir pattern. This + applies to all directories, whether listed on the command + line, obtained from --file-list, or by scanning a parent di- + rectory. The pattern is a PCRE2 regular expression, and is + matched against the final component of the directory name, + not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times. - If a directory matches both --include-dir and --exclude-dir, + If a directory matches both --include-dir and --exclude-dir, it is excluded. There is no short form for this option. -L, --files-without-match - Instead of outputting lines from the files, just output the - names of the files that do not contain any lines that would - have been output. Each file name is output once, on a sepa- - rate line by default, but if the -Z option is set, they are - separated by zero bytes instead of newlines. This option + Instead of outputting lines from the files, just output the + names of the files that do not contain any lines that would + have been output. Each file name is output once, on a sepa- + rate line by default, but if the -Z option is set, they are + separated by zero bytes instead of newlines. This option overrides any previous -H, -h, or -l options. 
-l, --files-with-matches - Instead of outputting lines from the files, just output the + Instead of outputting lines from the files, just output the names of the files containing lines that would have been out- - put. Each file name is output once, on a separate line, but + put. Each file name is output once, on a separate line, but if the -Z option is set, they are separated by zero bytes in- - stead of newlines. Searching normally stops as soon as a - matching line is found in a file. However, if the -c (count) - option is also used, matching continues in order to obtain - the correct count, and those files that have at least one - match are listed along with their counts. Using this option - with -c is a way of suppressing the listing of files with no + stead of newlines. Searching normally stops as soon as a + matching line is found in a file. However, if the -c (count) + option is also used, matching continues in order to obtain + the correct count, and those files that have at least one + match are listed along with their counts. Using this option + with -c is a way of suppressing the listing of files with no matches that occurs with -c on its own. This option overrides any previous -H, -h, or -L options. @@ -508,130 +508,130 @@ OPTIONS input)" is used. There is no short form for this option. --line-buffered - When this option is given, non-compressed input is read and - processed line by line, and the output is flushed after each - write. By default, input is read in large chunks, unless - pcre2grep can determine that it is reading from a terminal, + When this option is given, non-compressed input is read and + processed line by line, and the output is flushed after each + write. By default, input is read in large chunks, unless + pcre2grep can determine that it is reading from a terminal, which is currently possible only in Unix-like environments or Windows. Output to terminal is normally automatically flushed - by the operating system. 
This option can be useful when the - input or output is attached to a pipe and you do not want - pcre2grep to buffer up large amounts of data. However, its - use will affect performance, and the -M (multiline) option - ceases to work. When input is from a compressed .gz or .bz2 + by the operating system. This option can be useful when the + input or output is attached to a pipe and you do not want + pcre2grep to buffer up large amounts of data. However, its + use will affect performance, and the -M (multiline) option + ceases to work. When input is from a compressed .gz or .bz2 file, --line-buffered is ignored. --line-offsets - Instead of showing lines or parts of lines that match, show + Instead of showing lines or parts of lines that match, show each match as a line number, the offset from the start of the - line, and a length. The line number is terminated by a colon - (as usual; see the -n option), and the offset and length are - separated by a comma. In this mode, --colour has no effect, - and no context is shown. That is, the -A, -B, and -C options - are ignored. If there is more than one match in a line, each - of them is shown separately. This option is mutually exclu- + line, and a length. The line number is terminated by a colon + (as usual; see the -n option), and the offset and length are + separated by a comma. In this mode, --colour has no effect, + and no context is shown. That is, the -A, -B, and -C options + are ignored. If there is more than one match in a line, each + of them is shown separately. This option is mutually exclu- sive with --output, --file-offsets, and --only-matching. --locale=locale-name - This option specifies a locale to be used for pattern match- - ing. It overrides the value in the LC_ALL or LC_CTYPE envi- - ronment variables. If no locale is specified, the PCRE2 li- + This option specifies a locale to be used for pattern match- + ing. It overrides the value in the LC_ALL or LC_CTYPE envi- + ronment variables. 
If no locale is specified, the PCRE2 li- brary's default (usually the "C" locale) is used. There is no short form for this option. -M, --multiline - Allow patterns to match more than one line. When this option - is set, the PCRE2 library is called in "multiline" mode, and - a match is allowed to continue past the end of the initial + Allow patterns to match more than one line. When this option + is set, the PCRE2 library is called in "multiline" mode, and + a match is allowed to continue past the end of the initial line and onto one or more subsequent lines. - Patterns used with -M may usefully contain literal newline - characters and internal occurrences of ^ and $ characters, - because in multiline mode these can match at internal new- - lines. Because pcre2grep is scanning multiple lines, the \Z - and \z assertions match only at the end of the last line in + Patterns used with -M may usefully contain literal newline + characters and internal occurrences of ^ and $ characters, + because in multiline mode these can match at internal new- + lines. Because pcre2grep is scanning multiple lines, the \Z + and \z assertions match only at the end of the last line in the file. The \A assertion matches at the start of the first - line of a match. This can be any line in the file; it is not + line of a match. This can be any line in the file; it is not anchored to the first line. - The output for a successful match may consist of more than - one line. The first line is the line in which the match - started, and the last line is the line in which the match - ended. If the matched string ends with a newline sequence, - the output ends at the end of that line. If -v is set, none - of the lines in a multi-line match are output. Once a match - has been handled, scanning restarts at the beginning of the + The output for a successful match may consist of more than + one line. 
The first line is the line in which the match + started, and the last line is the line in which the match + ended. If the matched string ends with a newline sequence, + the output ends at the end of that line. If -v is set, none + of the lines in a multi-line match are output. Once a match + has been handled, scanning restarts at the beginning of the line after the one in which the match ended. - The newline sequence that separates multiple lines must be - matched as part of the pattern. For example, to find the - phrase "regular expression" in a file where "regular" might - be at the end of a line and "expression" at the start of the + The newline sequence that separates multiple lines must be + matched as part of the pattern. For example, to find the + phrase "regular expression" in a file where "regular" might + be at the end of a line and "expression" at the start of the next line, you could use this command: pcre2grep -M 'regular\s+expression' The \s escape sequence matches any white space character, in- - cluding newlines, and is followed by + so as to match trail- - ing white space on the first line as well as possibly han- + cluding newlines, and is followed by + so as to match trail- + ing white space on the first line as well as possibly han- dling a two-character newline sequence. - There is a limit to the number of lines that can be matched, - imposed by the way that pcre2grep buffers the input file as - it scans it. With a sufficiently large processing buffer, + There is a limit to the number of lines that can be matched, + imposed by the way that pcre2grep buffers the input file as + it scans it. With a sufficiently large processing buffer, this should not be a problem. - The -M option does not work when input is read line by line + The -M option does not work when input is read line by line (see --line-buffered.) -m number, --max-count=number - Stop processing after finding number matching lines, or non- - matching lines if -v is also set. 
Any trailing context lines - are output after the final match. In multiline mode, each - multiline match counts as just one line for this purpose. If - this limit is reached when reading the standard input from a + Stop processing after finding number matching lines, or non- + matching lines if -v is also set. Any trailing context lines + are output after the final match. In multiline mode, each + multiline match counts as just one line for this purpose. If + this limit is reached when reading the standard input from a regular file, the file is left positioned just after the last - matching line. If -c is also set, the count that is output - is never greater than number. This option has no effect if + matching line. If -c is also set, the count that is output + is never greater than number. This option has no effect if used with -L, -l, or -q, or when just checking for a match in a binary file. --match-limit=number - Processing some regular expression patterns may take a very + Processing some regular expression patterns may take a very long time to search for all possible matching strings. Others - may require a very large amount of memory. There are three + may require a very large amount of memory. There are three options that set resource limits for matching. The --match-limit option provides a means of limiting comput- - ing resource usage when processing patterns that are not go- + ing resource usage when processing patterns that are not go- ing to match, but which have a very large number of possibil- ities in their search trees. The classic example is a pattern - that uses nested unlimited repeats. Internally, PCRE2 has a - counter that is incremented each time around its main pro- - cessing loop. If the value set by --match-limit is reached, + that uses nested unlimited repeats. Internally, PCRE2 has a + counter that is incremented each time around its main pro- + cessing loop. If the value set by --match-limit is reached, an error occurs. 
- The --heap-limit option specifies, as a number of kibibytes + The --heap-limit option specifies, as a number of kibibytes (units of 1024 bytes), the maximum amount of heap memory that may be used for matching. - The --depth-limit option limits the depth of nested back- + The --depth-limit option limits the depth of nested back- tracking points, which indirectly limits the amount of memory that is used. The amount of memory needed for each backtrack- - ing point depends on the number of capturing parentheses in + ing point depends on the number of capturing parentheses in the pattern, so the amount of memory that is used before this - limit acts varies from pattern to pattern. This limit is of + limit acts varies from pattern to pattern. This limit is of use only if it is set smaller than --match-limit. - There are no short forms for these options. The default lim- - its can be set when the PCRE2 library is compiled; if they - are not specified, the defaults are very large and so effec- + There are no short forms for these options. The default lim- + its can be set when the PCRE2 library is compiled; if they + are not specified, the defaults are very large and so effec- tively unlimited. --max-buffer-size=number - This limits the expansion of the processing buffer, whose - initial size can be set by --buffer-size. The maximum buffer - size is silently forced to be no smaller than the starting + This limits the expansion of the processing buffer, whose + initial size can be set by --buffer-size. The maximum buffer + size is silently forced to be no smaller than the starting buffer size. -N newline-type, --newline=newline-type @@ -640,72 +640,72 @@ OPTIONS pcre2grep -N CRLF 'some pattern' - The newline type may be specified in upper, lower, or mixed - case. If the newline type is NUL, lines are separated by bi- - nary zero characters. 
The other types are the single-charac- - ter sequences CR (carriage return) and LF (linefeed), the - two-character sequence CRLF, an "anycrlf" type, which recog- - nizes any of the preceding three types, and an "any" type, - for which any Unicode line ending sequence is assumed to end - a line. The Unicode sequences are the three just mentioned, - plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL - (next line, U+0085), LS (line separator, U+2028), and PS + The newline type may be specified in upper, lower, or mixed + case. If the newline type is NUL, lines are separated by bi- + nary zero characters. The other types are the single-charac- + ter sequences CR (carriage return) and LF (linefeed), the + two-character sequence CRLF, an "anycrlf" type, which recog- + nizes any of the preceding three types, and an "any" type, + for which any Unicode line ending sequence is assumed to end + a line. The Unicode sequences are the three just mentioned, + plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL + (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). - When the PCRE2 library is built, a default line-ending se- - quence is specified. This is normally the standard sequence - for the operating system. Unless otherwise specified by this + When the PCRE2 library is built, a default line-ending se- + quence is specified. This is normally the standard sequence + for the operating system. Unless otherwise specified by this option, pcre2grep uses the library's default. - This option makes it possible to use pcre2grep to scan files + This option makes it possible to use pcre2grep to scan files that have come from other environments without having to mod- - ify their line endings. If the data that is being scanned - does not agree with the convention set by this option, - pcre2grep may behave in strange ways. 
Note that this option - does not apply to files specified by the -f, --exclude-from, - or --include-from options, which are expected to use the op- + ify their line endings. If the data that is being scanned + does not agree with the convention set by this option, + pcre2grep may behave in strange ways. Note that this option + does not apply to files specified by the -f, --exclude-from, + or --include-from options, which are expected to use the op- erating system's standard newline sequence. -n, --line-number Precede each output line by its line number in the file, fol- - lowed by a colon for matching lines or a hyphen for context + lowed by a colon for matching lines or a hyphen for context lines. If the file name is also being output, it precedes the - line number. When the -M option causes a pattern to match - more than one line, only the first is preceded by its line + line number. When the -M option causes a pattern to match + more than one line, only the first is preceded by its line number. This option is forced if --line-offsets is used. --no-group-separator - Do not output a separator between groups of lines when -A, + Do not output a separator between groups of lines when -A, -B, or -C is in use. The default is to output a line contain- ing two hyphens. See also --group-separator. - --no-jit If the PCRE2 library is built with support for just-in-time + --no-jit If the PCRE2 library is built with support for just-in-time compiling (which speeds up matching), pcre2grep automatically makes use of this, unless it was explicitly disabled at build - time. This option can be used to disable the use of JIT at + time. This option can be used to disable the use of JIT at run time. It is provided for testing and working around prob- lems. It should never be needed in normal use. 
-O text, --output=text - When there is a match, instead of outputting the line that - matched, output just the text specified in this option, fol- - lowed by an operating-system standard newline. In this mode, - --colour has no effect, and no context is shown. That is, - the -A, -B, and -C options are ignored. The --newline option - has no effect on this option, which is mutually exclusive + When there is a match, instead of outputting the line that + matched, output just the text specified in this option, fol- + lowed by an operating-system standard newline. In this mode, + --colour has no effect, and no context is shown. That is, + the -A, -B, and -C options are ignored. The --newline option + has no effect on this option, which is mutually exclusive with --only-matching, --file-offsets, and --line-offsets. - However, like --only-matching, if there is more than one + However, like --only-matching, if there is more than one match in a line, each of them causes a line of output. Escape sequences starting with a dollar character may be used to insert the contents of the matched part of the line and/or captured substrings into the text. - $<digits> or ${<digits>} is replaced by the captured sub- - string of the given decimal number; zero substitutes the - whole match. If the number is greater than the number of cap- - turing substrings, or if the capture is unset, the replace- - ment is empty. + $<digits> or ${<digits>} is replaced by the captured sub- + string of the given decimal number; $& (or the legacy $0) + substitutes the whole match. If the number is greater than + the number of capturing substrings, or if the capture is un- + set, the replacement is empty. $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by newline; $r by carriage return; $t by tab; @@ -787,93 +787,103 @@ OPTIONS mode, the sequence (?aP) restricts [:word:] to ASCII letters, while allowing \w to match Unicode letters and digits.
+ --posix-pattern-file + When patterns are provided with the -f option, do not trim + trailing spaces or ignore empty lines, in a similar way to + other grep tools. To keep the behaviour consistent with older + versions, if the pattern read was terminated with CRLF (as + character literals) then both characters won't be included as + part of it, so if you really need to have a pattern ending in + '\r', use an escape sequence or provide it by a different + method. + -q, --quiet Work quietly, that is, display nothing except error messages. - The exit status indicates whether or not any matches were + The exit status indicates whether or not any matches were found. -r, --recursive - If any given path is a directory, recursively scan the files - it contains, taking note of any --include and --exclude set- - tings. By default, a directory is read as a normal file; in - some operating systems this gives an immediate end-of-file. - This option is a shorthand for setting the -d option to "re- + If any given path is a directory, recursively scan the files + it contains, taking note of any --include and --exclude set- + tings. By default, a directory is read as a normal file; in + some operating systems this gives an immediate end-of-file. + This option is a shorthand for setting the -d option to "re- curse". --recursion-limit=number - This is an obsolete synonym for --depth-limit. See --match- + This is an obsolete synonym for --depth-limit. See --match- limit above for details. -s, --no-messages - Suppress error messages about non-existent or unreadable - files. Such files are quietly skipped. However, the return + Suppress error messages about non-existent or unreadable + files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. -t, --total-count - This option is useful when scanning more than one file.
If - used on its own, -t suppresses all output except for a grand - total number of matching lines (or non-matching lines if -v + This option is useful when scanning more than one file. If + used on its own, -t suppresses all output except for a grand + total number of matching lines (or non-matching lines if -v is used) in all the files. If -t is used with -c, a grand to- - tal is output except when the previous output is just one - line. In other words, it is not output when just one file's - count is listed. If file names are being output, the grand - total is preceded by "TOTAL:". Otherwise, it appears as just - another number. The -t option is ignored when used with -L - (list files without matches), because the grand total would + tal is output except when the previous output is just one + line. In other words, it is not output when just one file's + count is listed. If file names are being output, the grand + total is preceded by "TOTAL:". Otherwise, it appears as just + another number. The -t option is ignored when used with -L + (list files without matches), because the grand total would always be zero. -u, --utf Operate in UTF/Unicode mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (in- - cluding those for any --exclude and --include options) and - all lines that are scanned must be valid strings of UTF-8 + cluding those for any --exclude and --include options) and + all lines that are scanned must be valid strings of UTF-8 characters. If an invalid UTF-8 string is encountered, an er- ror occurs. -U, --utf-allow-invalid - As --utf, but in addition subject lines may contain invalid - UTF-8 code unit sequences. These can never form part of any - pattern match. Patterns themselves, however, must still be + As --utf, but in addition subject lines may contain invalid + UTF-8 code unit sequences. These can never form part of any + pattern match. Patterns themselves, however, must still be valid UTF-8 strings. 
This facility allows valid UTF-8 strings to be sought within arbitrary byte sequences in executable or - other binary files. For more details about matching in non- + other binary files. For more details about matching in non- valid UTF-8 strings, see the pcre2unicode(3) documentation. -V, --version - Write the version numbers of pcre2grep and the PCRE2 library - to the standard output and then exit. Anything else on the + Write the version numbers of pcre2grep and the PCRE2 library + to the standard output and then exit. Anything else on the command line is ignored. -v, --invert-match - Invert the sense of the match, so that lines which do not - match any of the patterns are the ones that are found. When - this option is set, options such as --only-matching and - --output, which specify parts of a match that are to be out- + Invert the sense of the match, so that lines which do not + match any of the patterns are the ones that are found. When + this option is set, options such as --only-matching and + --output, which specify parts of a match that are to be out- put, are ignored. -w, --word-regex, --word-regexp Force the patterns only to match "words". That is, there must - be a word boundary at the start and end of each matched - string. This is equivalent to having "\b(?:" at the start of - each pattern, and ")\b" at the end. This option applies only - to the patterns that are matched against the contents of - files; it does not apply to patterns specified by any of the + be a word boundary at the start and end of each matched + string. This is equivalent to having "\b(?:" at the start of + each pattern, and ")\b" at the end. This option applies only + to the patterns that are matched against the contents of + files; it does not apply to patterns specified by any of the --include or --exclude options. 
-x, --line-regex, --line-regexp - Force the patterns to start matching only at the beginnings - of lines, and in addition, require them to match entire + Force the patterns to start matching only at the beginnings + of lines, and in addition, require them to match entire lines. In multiline mode the match may be more than one line. This is equivalent to having "^(?:" at the start of each pat- - tern and ")$" at the end. This option applies only to the - patterns that are matched against the contents of files; it - does not apply to patterns specified by any of the --include + tern and ")$" at the end. This option applies only to the + patterns that are matched against the contents of files; it + does not apply to patterns specified by any of the --include or --exclude options. -Z, --null - Terminate file names in the regular output with a zero byte - (the NUL character) instead of what would normally appear. - This is useful when file names contain unusual characters - such as colons, hyphens, or even newlines. The option does + Terminate file names in the regular output with a zero byte + (the NUL character) instead of what would normally appear. + This is useful when file names contain unusual characters + such as colons, hyphens, or even newlines. The option does not apply to file names in error messages. @@ -887,90 +897,90 @@ ENVIRONMENT VARIABLES NEWLINES - The -N (--newline) option allows pcre2grep to scan files with newline - conventions that differ from the default. This option affects only the - way scanned files are processed. It does not affect the interpretation - of files specified by the -f, --file-list, --exclude-from, or --in- + The -N (--newline) option allows pcre2grep to scan files with newline + conventions that differ from the default. This option affects only the + way scanned files are processed. It does not affect the interpretation + of files specified by the -f, --file-list, --exclude-from, or --in- clude-from options.
- Any parts of the scanned input files that are written to the standard - output are copied with whatever newline sequences they have in the in- - put. However, if the final line of a file is output, and it does not - end with a newline sequence, a newline sequence is added. If the new- - line setting is CR, LF, CRLF or NUL, that line ending is output; for + Any parts of the scanned input files that are written to the standard + output are copied with whatever newline sequences they have in the in- + put. However, if the final line of a file is output, and it does not + end with a newline sequence, a newline sequence is added. If the new- + line setting is CR, LF, CRLF or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a single NL is used. - The newline setting does not affect the way in which pcre2grep writes - newlines in informational messages to the standard output and error - streams. Under Windows, the standard output is set to be binary, so - that "\r\n" at the ends of output lines that are copied from the input - is not converted to "\r\r\n" by the C I/O library. This means that any - messages written to the standard output must end with "\r\n". For all - other operating systems, and for all messages to the standard error + The newline setting does not affect the way in which pcre2grep writes + newlines in informational messages to the standard output and error + streams. Under Windows, the standard output is set to be binary, so + that "\r\n" at the ends of output lines that are copied from the input + is not converted to "\r\r\n" by the C I/O library. This means that any + messages written to the standard output must end with "\r\n". For all + other operating systems, and for all messages to the standard error stream, "\n" is used. OPTIONS COMPATIBILITY WITH GNU GREP Many of the short and long forms of pcre2grep's options are the same as - in the GNU grep program. 
Any long option of the form --xxx-regexp (GNU - terminology) is also available as --xxx-regex (PCRE2 terminology). - However, the --case-restrict, --depth-limit, -E, --file-list, --file- + in the GNU grep program. Any long option of the form --xxx-regexp (GNU + terminology) is also available as --xxx-regex (PCRE2 terminology). + However, the --case-restrict, --depth-limit, -E, --file-list, --file- offsets, --heap-limit, --include-dir, --line-offsets, --locale, - --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- - tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are + --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- + tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are specific to pcre2grep, as is the use of the --only-matching option with a capturing parentheses number. - Although most of the common options work the same way, a few are dif- - ferent in pcre2grep. For example, the --include option's argument is a + Although most of the common options work the same way, a few are dif- + ferent in pcre2grep. For example, the --include option's argument is a glob for GNU grep, but in pcre2grep it is a regular expression to which - the -i option applies. If both the -c and -l options are given, GNU - grep lists only file names, without counts, but pcre2grep gives the + the -i option applies. If both the -c and -l options are given, GNU + grep lists only file names, without counts, but pcre2grep gives the counts as well. OPTIONS WITH DATA There are four different ways in which an option with data can be spec- - ified. If a short form option is used, the data may follow immedi- + ified. If a short form option is used, the data may follow immedi- ately, or (with one exception) in the next command line item. For exam- ple: -f/some/file -f /some/file - The exception is the -o option, which may appear with or without data. 
- Because of this, if data is present, it must follow immediately in the + The exception is the -o option, which may appear with or without data. + Because of this, if data is present, it must follow immediately in the same item, for example -o3. - If a long form option is used, the data may appear in the same command - line item, separated by an equals character, or (with two exceptions) + If a long form option is used, the data may appear in the same command + line item, separated by an equals character, or (with two exceptions) it may appear in the next command line item. For example: --file=/some/file --file /some/file - Note, however, that if you want to supply a file name beginning with ~ - as data in a shell command, and have the shell expand ~ to a home di- - rectory, you must separate the file name from the option, because the + Note, however, that if you want to supply a file name beginning with ~ + as data in a shell command, and have the shell expand ~ to a home di- + rectory, you must separate the file name from the option, because the shell does not treat ~ specially unless it is at the start of an item. - The exceptions to the above are the --colour (or --color) and --only- - matching options, for which the data is optional. If one of these op- - tions does have data, it must be given in the first form, using an + The exceptions to the above are the --colour (or --color) and --only- + matching options, for which the data is optional. If one of these op- + tions does have data, it must be given in the first form, using an equals character. Otherwise pcre2grep will assume that it has no data. USING PCRE2'S CALLOUT FACILITY - pcre2grep has, by default, support for calling external programs or - scripts or echoing specific strings during matching by making use of - PCRE2's callout facility. However, this support can be completely or - partially disabled when pcre2grep is built. 
You can find out whether - your binary has support for callouts by running it with the --help op- - tion. If callout support is completely disabled, all callouts in pat- - terns are ignored by pcre2grep. If the facility is partially disabled, + pcre2grep has, by default, support for calling external programs or + scripts or echoing specific strings during matching by making use of + PCRE2's callout facility. However, this support can be completely or + partially disabled when pcre2grep is built. You can find out whether + your binary has support for callouts by running it with the --help op- + tion. If callout support is completely disabled, callouts in patterns + are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored. @@ -988,13 +998,13 @@ USING PCRE2'S CALLOUT FACILITY processed as a zero-terminated string, which means it should not con- tain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the - --output (-O) option (see above). However, $0 cannot be used to insert - a matched substring because the match is still in progress. Instead, - the single character '0' is inserted. Any syntax errors in the string - (for example, a dollar not followed by another character) causes the - callout to be ignored. No terminator is added to the output string, so - if you want a newline, you must include it explicitly using the escape - $n. For example: + --output (-O) option (see above). However, $0 or $& cannot be used to + insert a matched substring because the match is still in progress. In- + stead, the single character '0' is inserted. Any syntax errors in the + string (for example, a dollar not followed by another character) causes + the callout to be ignored. No terminator is added to the output string, + so if you want a newline, you must include it explicitly using the es- + cape $n. 
For example: pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' @@ -1018,10 +1028,11 @@ USING PCRE2'S CALLOUT FACILITY Any substring (including the executable name) may contain escape se- quences started by a dollar character. These are the same as for the - --output (-O) option documented above, except that $0 cannot insert the - matched string because the match is still in progress. Instead, the - character '0' is inserted. If you need a literal dollar or pipe charac- - ter in any substring, use $$ or $| respectively. Here is an example: + --output (-O) option documented above, except that $0 or $& cannot in- + sert the matched string because the match is still in progress. In- + stead, the character '0' is inserted. If you need a literal dollar or + pipe character in any substring, use $$ or $| respectively. Here is an + example: echo -e "abcde\n12345" | pcre2grep \ '(?x)(.)(..(.)) @@ -1034,43 +1045,43 @@ USING PCRE2'S CALLOUT FACILITY Arg1: [1] [234] [4] Arg2: |1| () 12345 - The parameters for the system call that is used to run the program or + The parameters for the system call that is used to run the program or script are zero-terminated strings. This means that binary zero charac- - ters in the callout argument will cause premature termination of their - substrings, and therefore should not be present. Any syntax errors in - the string (for example, a dollar not followed by another character) + ters in the callout argument will cause premature termination of their + substrings, and therefore should not be present. Any syntax errors in + the string (for example, a dollar not followed by another character) causes the callout to be ignored. If running the program fails for any - reason (including the non-existence of the executable), a local match- + reason (including the non-existence of the executable), a local match- ing failure occurs and the matcher backtracks in the normal way. 
MATCHING ERRORS - It is possible to supply a regular expression that takes a very long - time to fail to match certain lines. Such patterns normally involve - nested indefinite repeats, for example: (a+)*\d when matched against a - line of a's with no final digit. The PCRE2 matching function has a re- - source limit that causes it to abort in these circumstances. If this - happens, pcre2grep outputs an error message and the line that caused - the problem to the standard error stream. If there are more than 20 + It is possible to supply a regular expression that takes a very long + time to fail to match certain lines. Such patterns normally involve + nested indefinite repeats, for example: (a+)*\d when matched against a + line of a's with no final digit. The PCRE2 matching function has a re- + source limit that causes it to abort in these circumstances. If this + happens, pcre2grep outputs an error message and the line that caused + the problem to the standard error stream. If there are more than 20 such errors, pcre2grep gives up. - The --match-limit option of pcre2grep can be used to set the overall - resource limit. There are also other limits that affect the amount of - memory used during matching; see the discussion of --heap-limit and + The --match-limit option of pcre2grep can be used to set the overall + resource limit. There are also other limits that affect the amount of + memory used during matching; see the discussion of --heap-limit and --depth-limit above. DIAGNOSTICS Exit status is 0 if any matches were found, 1 if no matches were found, - and 2 for syntax errors, overlong lines, non-existent or inaccessible - files (even if matches were found in other files) or too many matching + and 2 for syntax errors, overlong lines, non-existent or inaccessible + files (even if matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessi- ble files does not affect the return code. 
- When run under VMS, the return code is placed in the symbol - PCRE2GREP_RC because VMS does not distinguish between exit(0) and + When run under VMS, the return code is placed in the symbol + PCRE2GREP_RC because VMS does not distinguish between exit(0) and exit(1). @@ -1088,8 +1099,8 @@ AUTHOR REVISION - Last updated: 22 December 2023 + Last updated: 04 February 2025 Copyright (c) 1997-2023 University of Cambridge. -PCRE2 10.43 22 December 2023 PCRE2GREP(1) +PCRE2 10.45 04 February 2025 PCRE2GREP(1) diff --git a/mingw32/share/doc/pcre2/pcre2test.txt b/mingw32/share/doc/pcre2/pcre2test.txt index ddb491d7e7c..b6574b2ea1b 100644 --- a/mingw32/share/doc/pcre2/pcre2test.txt +++ b/mingw32/share/doc/pcre2/pcre2test.txt @@ -1,4 +1,3 @@ - PCRE2TEST(1) General Commands Manual PCRE2TEST(1) @@ -72,26 +71,25 @@ INPUT ENCODING When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that - are passed to the library. For subject lines, backslash escapes can be - used. In addition, when the utf modifier (see "Setting compilation op- - tions" below) is set, the pattern and any following subject lines are - interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as ap- - propriate. - - For non-UTF testing of wide characters, the utf8_input modifier can be - used. This is mutually exclusive with utf, and is allowed only in - 16-bit or 32-bit mode. It causes the pattern and following subject - lines to be treated as UTF-8 according to the original definition (RFC + are passed to the library. For subject lines and some patterns, back- + slash escapes can be used. In addition, when the utf modifier (see + "Setting compilation options" below) is set, the pattern and any fol- + lowing subject lines are interpreted as UTF-8 strings and translated to + UTF-16 or UTF-32 as appropriate. + + For non-UTF testing of wide characters, the utf8_input modifier can be + used. 
This is mutually exclusive with utf, and is allowed only in + 16-bit or 32-bit mode. It causes the pattern and following subject + lines to be treated as UTF-8 according to the original definition (RFC 2279), which allows for character values up to 0x7fffffff. Each charac- - ter is placed in one 16-bit or 32-bit code unit (in the 16-bit case, + ter is placed in one 16-bit or 32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error to occur). - UTF-8 (in its original definition) is not capable of encoding values - greater than 0x7fffffff, but such values can be handled by the 32-bit + UTF-8 (in its original definition) is not capable of encoding values + greater than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte - in UTF-8) 0x80000000 is added to the character's value. This is the - only way of passing such code points in a pattern string. For subject + in UTF-8) 0x80000000 is added to the character's value. For subject strings, using an escape sequence is preferable. @@ -135,8 +133,8 @@ COMMAND LINE OPTIONS the exit code as indicated: ebcdic-nl the code for LF (= NL) in an EBCDIC environment: - 0x15 or 0x25 - 0 if used in an ASCII environment + either 0x15 or 0x25 + 0 if used in an ASCII/Unicode environment exit code is always 0 linksize the configured internal link size (2, 3, or 4) exit code is set to the link size @@ -158,56 +156,67 @@ COMMAND LINE OPTIONS pcre2-8 the 8-bit library was built unicode Unicode support is available - If an unknown option is given, an error message is output; + Note that the availability of JIT support in the library does + not guarantee that it can actually be used because in some + environments it is unable to allocate executable memory. The + option "jitusable" gives more detailed information. 
It re- + turns one of the following values: + + 0 JIT is available and usable + 1 JIT is available but cannot allocate executable memory + 2 JIT is not available + 3 Unexpected return from test call to pcre2_jit_compile() + + If an unknown option is given, an error message is output; the exit code is 0. - -d Behave as if each pattern has the debug modifier; the inter- + -d Behave as if each pattern has the debug modifier; the inter- nal form and information about the compiled pattern is output after compilation; -d is equivalent to -b -i. -dfa Behave as if each subject line has the dfa modifier; matching - is done using the pcre2_dfa_match() function instead of the + is done using the pcre2_dfa_match() function instead of the default pcre2_match(). -error number[,number,...] - Call pcre2_get_error_message() for each of the error numbers - in the comma-separated list, display the resulting messages - on the standard output, then exit with zero exit code. The - numbers may be positive or negative. This is a convenience + Call pcre2_get_error_message() for each of the error numbers + in the comma-separated list, display the resulting messages + on the standard output, then exit with zero exit code. The + numbers may be positive or negative. This is a convenience facility for PCRE2 maintainers. -help Output a brief summary these options and then exit. - -i Behave as if each pattern has the info modifier; information + -i Behave as if each pattern has the info modifier; information about the compiled pattern is given after compilation. - -jit Behave as if each pattern line has the jit modifier; after - successful compilation, each pattern is passed to the just- + -jit Behave as if each pattern line has the jit modifier; after + successful compilation, each pattern is passed to the just- in-time compiler, if available. 
- -jitfast Behave as if each pattern line has the jitfast modifier; af- - ter successful compilation, each pattern is passed to the + -jitfast Behave as if each pattern line has the jitfast modifier; af- + ter successful compilation, each pattern is passed to the just-in-time compiler, if available, and each subject line is passed directly to the JIT matcher via its "fast path". -jitverify - Behave as if each pattern line has the jitverify modifier; - after successful compilation, each pattern is passed to the - just-in-time compiler, if available, and the use of JIT for + Behave as if each pattern line has the jitverify modifier; + after successful compilation, each pattern is passed to the + just-in-time compiler, if available, and the use of JIT for matching is verified. -LM List modifiers: write a list of available pattern and subject - modifiers to the standard output, then exit with zero exit - code. All other options are ignored. If both -C and any -Lx + modifiers to the standard output, then exit with zero exit + code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. - -LP List properties: write a list of recognized Unicode proper- - ties to the standard output, then exit with zero exit code. + -LP List properties: write a list of recognized Unicode proper- + ties to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. -LS List scripts: write a list of recognized Unicode script names - to the standard output, then exit with zero exit code. All + to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. @@ -217,25 +226,25 @@ COMMAND LINE OPTIONS -q Do not output the version number of pcre2test at the start of execution. 
- -S size On Unix-like systems, set the size of the run-time stack to + -S size On Unix-like systems, set the size of the run-time stack to size mebibytes (units of 1024*1024 bytes). -subject modifier-list Behave as if each subject line contains the given modifiers. - -t Run each compile and match many times with a timer, and out- - put the resulting times per compile or match. When JIT is - used, separate times are given for the initial compile and - the JIT compile. You can control the number of iterations - that are used for timing by following -t with a number (as a - separate item on the command line). For example, "-t 1000" + -t Run each compile and match many times with a timer, and out- + put the resulting times per compile or match. When JIT is + used, separate times are given for the initial compile and + the JIT compile. You can control the number of iterations + that are used for timing by following -t with a number (as a + separate item on the command line). For example, "-t 1000" iterates 1000 times. The default is to iterate 500,000 times. -tm This is like -t except that it times only the matching phase, not the compile phase. - -T -TM These behave like -t and -tm, but in addition, at the end of - a run, the total times for all compiles and matches are out- + -T -TM These behave like -t and -tm, but in addition, at the end of + a run, the total times for all compiles and matches are out- put. -version Output the PCRE2 version number and then exit. @@ -243,153 +252,153 @@ COMMAND LINE OPTIONS DESCRIPTION - If pcre2test is given two filename arguments, it reads from the first + If pcre2test is given two filename arguments, it reads from the first and writes to the second. If the first name is "-", input is taken from - the standard input. If pcre2test is given only one argument, it reads + the standard input. If pcre2test is given only one argument, it reads from that file and writes to stdout. Otherwise, it reads from stdin and writes to stdout. 
- When pcre2test is built, a configuration option can specify that it - should be linked with the libreadline or libedit library. When this is - done, if the input is from a terminal, it is read using the readline() + When pcre2test is built, a configuration option can specify that it + should be linked with the libreadline or libedit library. When this is + done, if the input is from a terminal, it is read using the readline() function. This provides line-editing and history facilities. The output from the -help option states whether or not readline() will be used. - The program handles any number of tests, each of which consists of a - set of input lines. Each set starts with a regular expression pattern, + The program handles any number of tests, each of which consists of a + set of input lines. Each set starts with a regular expression pattern, followed by any number of subject lines to be matched against that pat- tern. In between sets of test data, command lines that begin with # may appear. This file format, with some restrictions, can also be processed - by the perltest.sh script that is distributed with PCRE2 as a means of + by the perltest.sh script that is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 and Perl is the same. For a speci- - fication of perltest.sh, see the comments near its beginning. See also + fication of perltest.sh, see the comments near its beginning. See also the #perltest command below. When the input is a terminal, pcre2test prompts for each line of input, - using "re>" to prompt for regular expression patterns, and "data>" to - prompt for subject lines. Command lines starting with # can be entered + using "re>" to prompt for regular expression patterns, and "data>" to + prompt for subject lines. Command lines starting with # can be entered only in response to the "re>" prompt. - Each subject line is matched separately and independently. 
If you want + Each subject line is matched separately and independently. If you want to do multi-line matches, you have to use the \n escape sequence (or \r - or \r\n, etc., depending on the newline setting) in a single line of - input to encode the newline sequences. There is no limit on the length - of subject lines; the input buffer is automatically extended if it is - too small. There are replication features that makes it possible to - generate long repetitive pattern or subject lines without having to + or \r\n, etc., depending on the newline setting) in a single line of + input to encode the newline sequences. There is no limit on the length + of subject lines; the input buffer is automatically extended if it is + too small. There are replication features that makes it possible to + generate long repetitive pattern or subject lines without having to supply them explicitly. - An empty line or the end of the file signals the end of the subject - lines for a test, at which point a new pattern or command line is ex- + An empty line or the end of the file signals the end of the subject + lines for a test, at which point a new pattern or command line is ex- pected if there is still input to be read. COMMAND LINES - In between sets of test data, a line that begins with # is interpreted + In between sets of test data, a line that begins with # is interpreted as a command line. If the first character is followed by white space or - an exclamation mark, the line is treated as a comment, and ignored. + an exclamation mark, the line is treated as a comment, and ignored. Otherwise, the following commands are recognized: #forbid_utf - Subsequent patterns automatically have the PCRE2_NEVER_UTF and - PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF - and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of - patterns. 
This command also forces an error if a subsequent pattern - contains any occurrences of \P, \p, or \X, which are still supported - when PCRE2_UTF is not set, but which require Unicode property support + Subsequent patterns automatically have the PCRE2_NEVER_UTF and + PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF + and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of + patterns. This command also forces an error if a subsequent pattern + contains any occurrences of \P, \p, or \X, which are still supported + when PCRE2_UTF is not set, but which require Unicode property support to be included in the library. - This is a trigger guard that is used in test files to ensure that UTF - or Unicode property tests are not accidentally added to files that are - used when Unicode support is not included in the library. Setting - PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained - by the use of #pattern; the difference is that #forbid_utf cannot be - unset, and the automatic options are not displayed in pattern informa- + This is a trigger guard that is used in test files to ensure that UTF + or Unicode property tests are not accidentally added to files that are + used when Unicode support is not included in the library. Setting + PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained + by the use of #pattern; the difference is that #forbid_utf cannot be + unset, and the automatic options are not displayed in pattern informa- tion, to avoid cluttering up test output. #load This command is used to load a set of precompiled patterns from a file, - as described in the section entitled "Saving and restoring compiled + as described in the section entitled "Saving and restoring compiled patterns" below. #loadtables - This command is used to load a set of binary character tables that can - be accessed by the tables=3 qualifier. 
Such tables can be created by + This command is used to load a set of binary character tables that can + be accessed by the tables=3 qualifier. Such tables can be created by the pcre2_dftables program with the -b option. #newline_default [] - When PCRE2 is built, a default newline convention can be specified. - This determines which characters and/or character pairs are recognized + When PCRE2 is built, a default newline convention can be specified. + This determines which characters and/or character pairs are recognized as indicating a newline in a pattern or subject string. The default can - be overridden when a pattern is compiled. The standard test files con- - tain tests of various newline conventions, but the majority of the - tests expect a single linefeed to be recognized as a newline by de- - fault. Without special action the tests would fail when PCRE2 is com- + be overridden when a pattern is compiled. The standard test files con- + tain tests of various newline conventions, but the majority of the + tests expect a single linefeed to be recognized as a newline by de- + fault. Without special action the tests would fail when PCRE2 is com- piled with either CR or CRLF as the default newline. The #newline_default command specifies a list of newline types that are - acceptable as the default. The types must be one of CR, LF, CRLF, ANY- + acceptable as the default. The types must be one of CR, LF, CRLF, ANY- CRLF, ANY, or NUL (in upper or lower case), for example: #newline_default LF Any anyCRLF If the default newline is in the list, this command has no effect. Oth- - erwise, except when testing the POSIX API, a newline modifier that + erwise, except when testing the POSIX API, a newline modifier that specifies the first newline convention in the list (LF in the above ex- - ample) is added to any pattern that does not already have a newline + ample) is added to any pattern that does not already have a newline modifier. 
If the newline list is empty, the feature is turned off. This command is present in a number of the standard test input files. - When the POSIX API is being tested there is no way to override the de- + When the POSIX API is being tested there is no way to override the de- fault newline convention, though it is possible to set the newline con- - vention from within the pattern. A warning is given if the posix or - posix_nosub modifier is used when #newline_default would set a default + vention from within the pattern. A warning is given if the posix or + posix_nosub modifier is used when #newline_default would set a default for the non-POSIX API. #pattern - This command sets a default modifier list that applies to all subse- + This command sets a default modifier list that applies to all subse- quent patterns. Modifiers on a pattern can change these settings. #perltest - This line is used in test files that can also be processed by perl- - test.sh to confirm that Perl gives the same results as PCRE2. Subse- - quent tests are checked for the use of pcre2test features that are in- + This line is used in test files that can also be processed by perl- + test.sh to confirm that Perl gives the same results as PCRE2. Subse- + quent tests are checked for the use of pcre2test features that are in- compatible with the perltest.sh script. - Patterns must use '/' as their delimiter, and only certain modifiers - are supported. Comment lines, #pattern commands, and #subject commands - that set or unset "mark" are recognized and acted on. The #perltest, - #forbid_utf, and #newline_default commands, which are needed in the + Patterns must use '/' as their delimiter, and only certain modifiers + are supported. Comment lines, #pattern commands, and #subject commands + that set or unset "mark" are recognized and acted on. The #perltest, + #forbid_utf, and #newline_default commands, which are needed in the relevant pcre2test files, are silently ignored. 
All other command lines - are ignored, but give a warning message. The #perltest command helps - detect tests that are accidentally put in the wrong file or use the - wrong delimiter. For more details of the perltest.sh script see the + are ignored, but give a warning message. The #perltest command helps + detect tests that are accidentally put in the wrong file or use the + wrong delimiter. For more details of the perltest.sh script see the comments it contains. #pop [] #popcopy [] - These commands are used to manipulate the stack of compiled patterns, - as described in the section entitled "Saving and restoring compiled + These commands are used to manipulate the stack of compiled patterns, + as described in the section entitled "Saving and restoring compiled patterns" below. #save - This command is used to save a set of compiled patterns to a file, as - described in the section entitled "Saving and restoring compiled pat- + This command is used to save a set of compiled patterns to a file, as + described in the section entitled "Saving and restoring compiled pat- terns" below. #subject - This command sets a default modifier list that applies to all subse- - quent subject lines. Modifiers on a subject line can change these set- + This command sets a default modifier list that applies to all subse- + quent subject lines. Modifiers on a subject line can change these set- tings. @@ -397,47 +406,47 @@ MODIFIER SYNTAX Modifier lists are used with both pattern and subject lines. Items in a list are separated by commas followed by optional white space. Trailing - whitespace in a modifier list is ignored. Some modifiers may be given - for both patterns and subject lines, whereas others are valid only for - one or the other. Each modifier has a long name, for example "an- - chored", and some of them must be followed by an equals sign and a - value, for example, "offset=12". Values cannot contain comma charac- - ters, but may contain spaces. 
Modifiers that do not take values may be + whitespace in a modifier list is ignored. Some modifiers may be given + for both patterns and subject lines, whereas others are valid only for + one or the other. Each modifier has a long name, for example "an- + chored", and some of them must be followed by an equals sign and a + value, for example, "offset=12". Values cannot contain comma charac- + ters, but may contain spaces. Modifiers that do not take values may be preceded by a minus sign to turn off a previous setting. A few of the more common modifiers can also be specified as single let- - ters, for example "i" for "caseless". In documentation, following the + ters, for example "i" for "caseless". In documentation, following the Perl convention, these are written with a slash ("the /i modifier") for - clarity. Abbreviated modifiers must all be concatenated in the first - item of a modifier list. If the first item is not recognized as a long - modifier name, it is interpreted as a sequence of these abbreviations. + clarity. Abbreviated modifiers must all be concatenated in the first + item of a modifier list. If the first item is not recognized as a long + modifier name, it is interpreted as a sequence of these abbreviations. For example: /abc/ig,newline=cr,jit=3 - This is a pattern line whose modifier list starts with two one-letter - modifiers (/i and /g). The lower-case abbreviated modifiers are the + This is a pattern line whose modifier list starts with two one-letter + modifiers (/i and /g). The lower-case abbreviated modifiers are the same as used in Perl. PATTERN SYNTAX - A pattern line must start with one of the following characters (common + A pattern line must start with one of the following characters (common symbols, excluding pattern meta-characters): / ! " ' ` - = _ : ; , % & @ ~ - This is interpreted as the pattern's delimiter. 
A regular expression - may be continued over several input lines, in which case the newline + This is interpreted as the pattern's delimiter. A regular expression + may be continued over several input lines, in which case the newline characters are included within it. It is possible to include the delim- - iter as a literal within the pattern by escaping it with a backslash, + iter as a literal within the pattern by escaping it with a backslash, for example /abc\/def/ - If you do this, the escape and the delimiter form part of the pattern, + If you do this, the escape and the delimiter form part of the pattern, but since the delimiters are all non-alphanumeric, the inclusion of the - backslash does not affect the pattern's interpretation. Note, however, + backslash does not affect the pattern's interpretation. Note, however, that this trick does not work within \Q...\E literal bracketing because the backslash will itself be interpreted as a literal. If the terminat- ing delimiter is immediately followed by a backslash, for example, @@ -445,13 +454,13 @@ PATTERN SYNTAX /abc/\ a backslash is added to the end of the pattern. This is done to provide - a way of testing the error condition that arises if a pattern finishes + a way of testing the error condition that arises if a pattern finishes with a backslash, because /abc\/ - is interpreted as the first line of a pattern that starts with "abc/", - causing pcre2test to read the next line as a continuation of the regu- + is interpreted as the first line of a pattern that starts with "abc/", + causing pcre2test to read the next line as a continuation of the regu- lar expression. A pattern can be followed by a modifier list (details below). 
@@ -460,44 +469,52 @@ PATTERN SYNTAX SUBJECT LINE SYNTAX Before each subject line is passed to pcre2_match(), pcre2_dfa_match(), - or pcre2_jit_match(), leading and trailing white space is removed, and - the line is scanned for backslash escapes, unless the subject_literal - modifier was set for the pattern. The following provide a means of en- + or pcre2_jit_match(), leading and trailing white space is removed, and + the line is scanned for backslash escapes, unless the subject_literal + modifier was set for the pattern. The following provide a means of en- coding non-printing characters in a visible way: - \a alarm (BEL, \x07) - \b backspace (\x08) - \e escape (\x27) - \f form feed (\x0c) - \n newline (\x0a) - \r carriage return (\x0d) - \t tab (\x09) - \v vertical tab (\x0b) - \nnn octal character (up to 3 octal digits); always - a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode - \o{dd...} octal character (any number of octal digits} - \xhh hexadecimal byte (up to 2 hex digits) - \x{hh...} hexadecimal character (any number of hex digits) - - The use of \x{hh...} is not dependent on the use of the utf modifier on - the pattern. It is recognized always. There may be any number of hexa- - decimal digits inside the braces; invalid values provoke error mes- - sages. - - Note that \xhh specifies one byte rather than one character in UTF-8 - mode; this makes it possible to construct invalid UTF-8 sequences for - testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8 - character in UTF-8 mode, generating more than one byte if the value is - greater than 127. 
When testing the 8-bit library not in UTF-8 mode, - \x{hh} generates one byte for values less than 256, and causes an error + \a alarm (BEL, \x07) + \b backspace (\x08) + \e escape (\x1b) + \f form feed (\x0c) + \n newline (\x0a) + \N{U+hh...} unicode character (any number of hex digits) + \r carriage return (\x0d) + \t tab (\x09) + \v vertical tab (\x0b) + \ddd octal number (up to 3 octal digits); represents a single + code point unless larger than 255 with the 8-bit li- + brary + \o{dd...} octal number (any number of octal digits) representing a + character in UTF mode or a code point + \xhh hexadecimal byte (up to 2 hex digits) + \x{hh...} hexadecimal number (up to 8 hex digits) representing a + character in UTF mode or a code point + + Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf + modifier on the pattern. It is always recognized. There may be any num- + ber of hexadecimal digits inside the braces; invalid values provoke er- + ror messages but when using \N{U+hh...} with some invalid unicode char- + acters they will be accepted with a warning instead. + + Note that even in UTF-8 mode, \xhh (and depending on how large, \ddd) + describe one byte rather than one character; this makes it possible to + construct invalid UTF-8 sequences for testing purposes. On the other + hand, \x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only + generating more than one byte if the value is greater than 127. To + avoid the ambiguity it is preferred to use \N{U+hh...} when describing + characters. When testing the 8-bit library not in UTF-8 mode, \x{hh} + generates one byte for values that could fit on it, and causes an error for greater values. - In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it - possible to construct invalid UTF-16 sequences for testing purposes. + When testing the 16-bit library, not in UTF-16 mode, all 4-digit + \x{hhhh} values are accepted.
This makes it possible to construct in- + valid UTF-16 sequences for testing purposes. - In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This - makes it possible to construct invalid UTF-32 sequences for testing - purposes. + When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit + \x{...} values are accepted. This makes it possible to construct in- + valid UTF-32 sequences for testing purposes. There is a special backslash sequence that specifies replication of one or more characters: @@ -561,6 +578,7 @@ PATTERN MODIFIERS allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options @@ -589,13 +607,17 @@ PATTERN MODIFIERS match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C + never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK + python_octal set PCRE2_EXTRA_PYTHON_OCTAL + turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT @@ -608,20 +630,36 @@ PATTERN MODIFIERS causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. + The following modifiers enable or disable performance optimizations by + calling pcre2_set_optimize() before invoking the regex compiler. 
+ + optimization_full enable all optional optimizations + optimization_none disable all optional optimizations + auto_possess auto-possessify variable quantifiers + auto_possess_off don't auto-possessify variable quantifiers + dotstar_anchor anchor patterns starting with .* + dotstar_anchor_off don't anchor patterns starting with .* + start_optimize enable pre-scan of subject string + start_optimize_off disable pre-scan of subject string + + See the pcre2_set_optimize documentation for details on these optimiza- + tions. + Setting compilation controls - The following modifiers affect the compilation process or request in- - formation about the pattern. There are single-letter abbreviations for + The following modifiers affect the compilation process or request in- + formation about the pattern. There are single-letter abbreviations for some that are heavily used in the test files. - bsr=[anycrlf|unicode] specify \R handling /B bincode show binary code without lengths + bsr=[anycrlf|unicode] specify \R handling callout_info show callout information convert= request foreign pattern conversion convert_glob_escape=c set glob escape character convert_glob_separator=c set glob separator character convert_length set convert buffer length debug same as info,fullbincode + expand expand repetition syntax in pattern framesize show matching frame size fullbincode show binary code with lengths /I info show info about compiled pattern @@ -643,6 +681,7 @@ PATTERN MODIFIERS posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack + pushtablescopy push a copy with tables onto the stack stackguard= test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables @@ -653,35 +692,35 @@ PATTERN MODIFIERS Newline and \R handling - The bsr modifier specifies what \R in a pattern should match. If it is - set to "anycrlf", \R matches CR, LF, or CRLF only. 
If it is set to - "unicode", \R matches any Unicode newline sequence. The default can be + The bsr modifier specifies what \R in a pattern should match. If it is + set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to + "unicode", \R matches any Unicode newline sequence. The default can be specified when PCRE2 is built; if it is not, the default is set to Uni- code. - The newline modifier specifies which characters are to be interpreted + The newline modifier specifies which characters are to be interpreted as newlines, both in the pattern and in subject lines. The type must be one of CR, LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case). Information about a pattern - The debug modifier is a shorthand for info,fullbincode, requesting all + The debug modifier is a shorthand for info,fullbincode, requesting all available information. The bincode modifier causes a representation of the compiled code to be - output after compilation. This information does not contain length and + output after compilation. This information does not contain length and offset values, which ensures that the same output is generated for dif- - ferent internal link sizes and different code unit widths. By using - bincode, the same regression tests can be used in different environ- + ferent internal link sizes and different code unit widths. By using + bincode, the same regression tests can be used in different environ- ments. - The fullbincode modifier, by contrast, does include length and offset - values. This is used in a few special tests that run only for specific + The fullbincode modifier, by contrast, does include length and offset + values. This is used in a few special tests that run only for specific code unit widths and link sizes, and is also useful for one-off tests. - The info modifier requests information about the compiled pattern - (whether it is anchored, has a fixed first character, and so on). 
The - information is obtained from the pcre2_pattern_info() function. Here + The info modifier requests information about the compiled pattern + (whether it is anchored, has a fixed first character, and so on). The + information is obtained from the pcre2_pattern_info() function. Here are some typical examples: re> /(?i)(^a|^b)/m,info @@ -699,136 +738,136 @@ PATTERN MODIFIERS Last code unit = 'c' (caseless) Subject length lower bound = 3 - "Compile options" are those specified by modifiers; "overall options" - have added options that are taken or deduced from the pattern. If both - sets of options are the same, just a single "options" line is output; - if there are no options, the line is omitted. "First code unit" is - where any match must start; if there is more than one they are listed - as "starting code units". "Last code unit" is the last literal code - unit that must be present in any match. This is not necessarily the - last character. These lines are omitted if no starting or ending code - units are recorded. The subject length line is omitted when - no_start_optimize is set because the minimum length is not calculated + "Compile options" are those specified by modifiers; "overall options" + have added options that are taken or deduced from the pattern. If both + sets of options are the same, just a single "options" line is output; + if there are no options, the line is omitted. "First code unit" is + where any match must start; if there is more than one they are listed + as "starting code units". "Last code unit" is the last literal code + unit that must be present in any match. This is not necessarily the + last character. These lines are omitted if no starting or ending code + units are recorded. The subject length line is omitted when + no_start_optimize is set because the minimum length is not calculated when it can never be used. 
- The framesize modifier shows the size, in bytes, of each storage frame - used by pcre2_match() for handling backtracking. The size depends on - the number of capturing parentheses in the pattern. A vector of these - frames is used at matching time; its overall size is shown when the + The framesize modifier shows the size, in bytes, of each storage frame + used by pcre2_match() for handling backtracking. The size depends on + the number of capturing parentheses in the pattern. A vector of these + frames is used at matching time; its overall size is shown when the heaframes_size subject modifier is set. - The callout_info modifier requests information about all the callouts + The callout_info modifier requests information about all the callouts in the pattern. A list of them is output at the end of any other infor- mation that is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern. Passing a NULL context - Normally, pcre2test passes a context block to pcre2_compile(). If the - null_context modifier is set, however, NULL is passed. This is for - testing that pcre2_compile() behaves correctly in this case (it uses + Normally, pcre2test passes a context block to pcre2_compile(). If the + null_context modifier is set, however, NULL is passed. This is for + testing that pcre2_compile() behaves correctly in this case (it uses default values). Passing a NULL pattern - The null_pattern modifier is for testing the behaviour of pcre2_com- - pile() when the pattern argument is NULL. The length value passed is + The null_pattern modifier is for testing the behaviour of pcre2_com- + pile() when the pattern argument is NULL. The length value passed is the default PCRE2_ZERO_TERMINATED unless use_length is set. Any length other than zero causes an error. 
Specifying pattern characters in hexadecimal - The hex modifier specifies that the characters of the pattern, except - for substrings enclosed in single or double quotes, are to be inter- - preted as pairs of hexadecimal digits. This feature is provided as a + The hex modifier specifies that the characters of the pattern, except + for substrings enclosed in single or double quotes, are to be inter- + preted as pairs of hexadecimal digits. This feature is provided as a way of creating patterns that contain binary zeros and other non-print- - ing characters. White space is permitted between pairs of digits. For + ing characters. White space is permitted between pairs of digits. For example, this pattern contains three characters: /ab 32 59/hex - Parts of such a pattern are taken literally if quoted. This pattern - contains nine characters, only two of which are specified in hexadeci- + Parts of such a pattern are taken literally if quoted. This pattern + contains nine characters, only two of which are specified in hexadeci- mal: /ab "literal" 32/hex - Either single or double quotes may be used. There is no way of includ- - ing the delimiter within a substring. The hex and expand modifiers are + Either single or double quotes may be used. There is no way of includ- + ing the delimiter within a substring. The hex and expand modifiers are mutually exclusive. Specifying the pattern's length By default, patterns are passed to the compiling functions as zero-ter- - minated strings but can be passed by length instead of being zero-ter- - minated. The use_length modifier causes this to happen. Using a length - happens automatically (whether or not use_length is set) when hex is - set, because patterns specified in hexadecimal may contain binary ze- + minated strings but can be passed by length instead of being zero-ter- + minated. The use_length modifier causes this to happen. 
Using a length + happens automatically (whether or not use_length is set) when hex is + set, because patterns specified in hexadecimal may contain binary ze- ros. If hex or use_length is used with the POSIX wrapper API (see "Using the - POSIX wrapper API" below), the REG_PEND extension is used to pass the + POSIX wrapper API" below), the REG_PEND extension is used to pass the pattern's length. Specifying a maximum for variable lookbehinds - Variable lookbehind assertions are supported only if, for each one, + Variable lookbehind assertions are supported only if, for each one, there is a maximum length (in characters) that it can match. There is a limit on this, whose default can be set at build time, with an ultimate - default of 255. The max_varlookbehind modifier uses the + default of 255. The max_varlookbehind modifier uses the pcre2_set_max_varlookbehind() function to change the limit. Lookbehinds - whose branches each match a fixed length are limited to 65535 charac- + whose branches each match a fixed length are limited to 65535 charac- ters per branch. Specifying wide characters in 16-bit and 32-bit modes In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 - and translated to UTF-16 or UTF-32 when the utf modifier is set. For + and translated to UTF-16 or UTF-32 when the utf modifier is set. For testing the 16-bit and 32-bit libraries in non-UTF mode, the utf8_input - modifier can be used. It is mutually exclusive with utf. Input lines + modifier can be used. It is mutually exclusive with utf. Input lines are interpreted as UTF-8 as a means of specifying wide characters. More details are given in "Input encoding" above. Generating long repetitive patterns - Some tests use long patterns that are very repetitive. Instead of cre- - ating a very long input line for such a pattern, you can use a special - repetition feature, similar to the one described for subject lines - above. 
If the expand modifier is present on a pattern, parts of the + Some tests use long patterns that are very repetitive. Instead of cre- + ating a very long input line for such a pattern, you can use a special + repetition feature, similar to the one described for subject lines + above. If the expand modifier is present on a pattern, parts of the pattern that have the form \[]{} are expanded before the pattern is passed to pcre2_compile(). For exam- ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction - cannot be nested. An initial "\[" sequence is recognized only if "]{" - followed by decimal digits and "}" is found later in the pattern. If + cannot be nested. An initial "\[" sequence is recognized only if "]{" + followed by decimal digits and "}" is found later in the pattern. If not, the characters remain in the pattern unaltered. The expand and hex modifiers are mutually exclusive. - If part of an expanded pattern looks like an expansion, but is really + If part of an expanded pattern looks like an expansion, but is really part of the actual pattern, unwanted expansion can be avoided by giving two values in the quantifier. For example, \[AB]{6000,6000} is not rec- ognized as an expansion item. - If the info modifier is set on an expanded pattern, the result of the + If the info modifier is set on an expanded pattern, the result of the expansion is included in the information that is output. JIT compilation - Just-in-time (JIT) compiling is a heavyweight optimization that can - greatly speed up pattern matching. See the pcre2jit documentation for - details. JIT compiling happens, optionally, after a pattern has been - successfully compiled into an internal form. The JIT compiler converts + Just-in-time (JIT) compiling is a heavyweight optimization that can + greatly speed up pattern matching. See the pcre2jit documentation for + details. JIT compiling happens, optionally, after a pattern has been + successfully compiled into an internal form. 
The JIT compiler converts this to optimized machine code. It needs to know whether the match-time options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, - because different code is generated for the different cases. See the - partial modifier in "Subject Modifiers" below for details of how these + because different code is generated for the different cases. See the + partial modifier in "Subject Modifiers" below for details of how these options are specified for each match attempt. JIT compilation is requested by the jit pattern modifier, which may op- - tionally be followed by an equals sign and a number in the range 0 to - 7. The three bits that make up the number specify which of the three + tionally be followed by an equals sign and a number in the range 0 to + 7. The three bits that make up the number specify which of the three JIT operating modes are to be compiled: 1 compile JIT code for non-partial matching @@ -845,31 +884,31 @@ PATTERN MODIFIERS 6 soft and hard partial matching only 7 all three modes - If no number is given, 7 is assumed. The phrase "partial matching" + If no number is given, 7 is assumed. The phrase "partial matching" means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the - PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- + PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- plete match; the options enable the possibility of a partial match, but - do not require it. Note also that if you request JIT compilation only - for partial matching (for example, jit=2) but do not set the partial - modifier on a subject line, that match will not use JIT code because + do not require it. Note also that if you request JIT compilation only + for partial matching (for example, jit=2) but do not set the partial + modifier on a subject line, that match will not use JIT code because none was compiled for non-partial matching. 
- If JIT compilation is successful, the compiled JIT code will automati- + If JIT compilation is successful, the compiled JIT code will automati- cally be used when an appropriate type of match is run, except when in- - compatible run-time options are specified. For more details, see the - pcre2jit documentation. See also the jitstack modifier below for a way + compatible run-time options are specified. For more details, see the + pcre2jit documentation. See also the jitstack modifier below for a way of setting the size of the JIT stack. - If the jitfast modifier is specified, matching is done using the JIT - "fast path" interface, pcre2_jit_match(), which skips some of the san- - ity checks that are done by pcre2_match(), and of course does not work - when JIT is not supported. If jitfast is specified without jit, jit=7 + If the jitfast modifier is specified, matching is done using the JIT + "fast path" interface, pcre2_jit_match(), which skips some of the san- + ity checks that are done by pcre2_match(), and of course does not work + when JIT is not supported. If jitfast is specified without jit, jit=7 is assumed. - If the jitverify modifier is specified, information about the compiled - pattern shows whether JIT compilation was or was not successful. If - jitverify is specified without jit, jit=7 is assumed. If JIT compila- - tion is successful when jitverify is set, the text "(JIT)" is added to + If the jitverify modifier is specified, information about the compiled + pattern shows whether JIT compilation was or was not successful. If + jitverify is specified without jit, jit=7 is assumed. If JIT compila- + tion is successful when jitverify is set, the text "(JIT)" is added to the first output line after a match or non match when JIT-compiled code was actually used in the match. 
@@ -880,19 +919,19 @@ PATTERN MODIFIERS /pattern/locale=fr_FR The given locale is set, pcre2_maketables() is called to build a set of - character tables for the locale, and this is then passed to pcre2_com- - pile() when compiling the regular expression. The same tables are used - when matching the following subject lines. The locale modifier applies + character tables for the locale, and this is then passed to pcre2_com- + pile() when compiling the regular expression. The same tables are used + when matching the following subject lines. The locale modifier applies only to the pattern on which it appears, but can be given in a #pattern - command if a default is needed. Setting a locale and alternate charac- + command if a default is needed. Setting a locale and alternate charac- ter tables are mutually exclusive. Showing pattern memory The memory modifier causes the size in bytes of the memory used to hold - the compiled pattern to be output. This does not include the size of - the pcre2_code block; it is just the actual compiled data. If the pat- - tern is subsequently passed to the JIT compiler, the size of the JIT + the compiled pattern to be output. This does not include the size of + the pcre2_code block; it is just the actual compiled data. If the pat- + tern is subsequently passed to the JIT compiler, the size of the JIT compiled code is also output. Here is an example: re> /a(b)c/jit,memory @@ -902,34 +941,34 @@ PATTERN MODIFIERS Limiting nested parentheses - The parens_nest_limit modifier sets a limit on the depth of nested - parentheses in a pattern. Breaching the limit causes a compilation er- - ror. The default for the library is set when PCRE2 is built, but - pcre2test sets its own default of 220, which is required for running + The parens_nest_limit modifier sets a limit on the depth of nested + parentheses in a pattern. Breaching the limit causes a compilation er- + ror. 
The default for the library is set when PCRE2 is built, but + pcre2test sets its own default of 220, which is required for running the standard test suite. Limiting the pattern length - The max_pattern_length modifier sets a limit, in code units, to the + The max_pattern_length modifier sets a limit, in code units, to the length of pattern that pcre2_compile() will accept. Breaching the limit - causes a compilation error. The default is the largest number a + causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). Limiting the size of a compiled pattern The max_pattern_compiled_length modifier sets a limit, in bytes, to the amount of memory used by a compiled pattern. Breaching the limit causes - a compilation error. The default is the largest number a PCRE2_SIZE + a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). Using the POSIX wrapper API - The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via - the POSIX wrapper API rather than its native API. When posix_nosub is - used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX - wrapper supports only the 8-bit library. Note that it does not imply + The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via + the POSIX wrapper API rather than its native API. When posix_nosub is + used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX + wrapper supports only the 8-bit library. Note that it does not imply POSIX matching semantics; for more detail see the pcre2posix documenta- - tion. The following pattern modifiers set options for the regcomp() + tion. 
The following pattern modifiers set options for the regcomp() function: caseless REG_ICASE @@ -939,42 +978,42 @@ PATTERN MODIFIERS ucp REG_UCP ) the POSIX standard utf REG_UTF8 ) - The regerror_buffsize modifier specifies a size for the error buffer - that is passed to regerror() in the event of a compilation error. For + The regerror_buffsize modifier specifies a size for the error buffer + that is passed to regerror() in the event of a compilation error. For example: /abc/posix,regerror_buffsize=20 - This provides a means of testing the behaviour of regerror() when the - buffer is too small for the error message. If this modifier has not + This provides a means of testing the behaviour of regerror() when the + buffer is too small for the error message. If this modifier has not been set, a large buffer is used. - The aftertext and allaftertext subject modifiers work as described be- + The aftertext and allaftertext subject modifiers work as described be- low. All other modifiers are either ignored, with a warning message, or cause an error. - The pattern is passed to regcomp() as a zero-terminated string by de- + The pattern is passed to regcomp() as a zero-terminated string by de- fault, but if the use_length or hex modifiers are set, the REG_PEND ex- tension is used to pass it by length. Testing the stack guard feature - The stackguard modifier is used to test the use of pcre2_set_com- - pile_recursion_guard(), a function that is provided to enable stack - availability to be checked during compilation (see the pcre2api docu- - mentation for details). If the number specified by the modifier is + The stackguard modifier is used to test the use of pcre2_set_com- + pile_recursion_guard(), a function that is provided to enable stack + availability to be checked during compilation (see the pcre2api docu- + mentation for details). 
If the number specified by the modifier is greater than zero, pcre2_set_compile_recursion_guard() is called to set - up callback from pcre2_compile() to a local function. The argument it - receives is the current nesting parenthesis depth; if this is greater + up callback from pcre2_compile() to a local function. The argument it + receives is the current nesting parenthesis depth; if this is greater than the value given by the modifier, non-zero is returned, causing the compilation to be aborted. Using alternative character tables - The value specified for the tables modifier must be one of the digits + The value specified for the tables modifier must be one of the digits 0, 1, 2, or 3. It causes a specific set of built-in character tables to - be passed to pcre2_compile(). This is used in the PCRE2 tests to check - behaviour with different character tables. The digit specifies the ta- + be passed to pcre2_compile(). This is used in the PCRE2 tests to check + behaviour with different character tables. The digit specifies the ta- bles as follows: 0 do not pass any special character tables @@ -985,15 +1024,15 @@ PATTERN MODIFIERS In tables 2, some characters whose codes are greater than 128 are iden- tified as letters, digits, spaces, etc. Tables 3 can be used only after - a #loadtables command has loaded them from a binary file. Setting al- + a #loadtables command has loaded them from a binary file. Setting al- ternate character tables and a locale are mutually exclusive. Setting certain match controls The following modifiers are really subject modifiers, and are described - under "Subject Modifiers" below. However, they may be included in a - pattern's modifier list, in which case they are applied to every sub- - ject line that is processed with that pattern. These modifiers do not + under "Subject Modifiers" below. 
However, they may be included in a + pattern's modifier list, in which case they are applied to every sub- + ject line that is processed with that pattern. These modifiers do not affect the compilation process. aftertext show text after match @@ -1009,6 +1048,7 @@ PATTERN MODIFIERS replace= specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts + substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED @@ -1019,39 +1059,39 @@ PATTERN MODIFIERS substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY - These modifiers may not appear in a #pattern command. If you want them + These modifiers may not appear in a #pattern command. If you want them as defaults, set them in a #subject command. Specifying literal subject lines - If the subject_literal modifier is present on a pattern, all the sub- + If the subject_literal modifier is present on a pattern, all the sub- ject lines that it matches are taken as literal strings, with no inter- - pretation of backslashes. It is not possible to set subject modifiers - on such lines, but any that are set as defaults by a #subject command + pretation of backslashes. It is not possible to set subject modifiers + on such lines, but any that are set as defaults by a #subject command are recognized. Saving a compiled pattern - When a pattern with the push modifier is successfully compiled, it is - pushed onto a stack of compiled patterns, and pcre2test expects the - next line to contain a new pattern (or a command) instead of a subject + When a pattern with the push modifier is successfully compiled, it is + pushed onto a stack of compiled patterns, and pcre2test expects the + next line to contain a new pattern (or a command) instead of a subject line. 
This facility is used when saving compiled patterns to a file, as - described in the section entitled "Saving and restoring compiled pat- - terns" below. If pushcopy is used instead of push, a copy of the com- - piled pattern is stacked, leaving the original as current, ready to - match the following input lines. This provides a way of testing the - pcre2_code_copy() function. The push and pushcopy modifiers are in- - compatible with compilation modifiers such as global that act at match + described in the section entitled "Saving and restoring compiled pat- + terns" below. If pushcopy is used instead of push, a copy of the com- + piled pattern is stacked, leaving the original as current, ready to + match the following input lines. This provides a way of testing the + pcre2_code_copy() function. The push and pushcopy modifiers are in- + compatible with compilation modifiers such as global that act at match time. Any that are specified are ignored (for the stacked copy), with a - warning message, except for replace, which causes an error. Note that - jitverify, which is allowed, does not carry through to any subsequent + warning message, except for replace, which causes an error. Note that + jitverify, which is allowed, does not carry through to any subsequent matching that uses a stacked pattern. Testing foreign pattern conversion - The experimental foreign pattern conversion functions in PCRE2 can be - tested by setting the convert modifier. Its argument is a colon-sepa- - rated list of options, which set the equivalent option for the + The experimental foreign pattern conversion functions in PCRE2 can be + tested by setting the convert modifier. Its argument is a colon-sepa- + rated list of options, which set the equivalent option for the pcre2_pattern_convert() function: glob PCRE2_CONVERT_GLOB @@ -1063,19 +1103,19 @@ PATTERN MODIFIERS The "unset" value is useful for turning off a default that has been set by a #pattern command. 
When one of these options is set, the input pat- - tern is passed to pcre2_pattern_convert(). If the conversion is suc- - cessful, the result is reflected in the output and then passed to + tern is passed to pcre2_pattern_convert(). If the conversion is suc- + cessful, the result is reflected in the output and then passed to pcre2_compile(). The normal utf and no_utf_check options, if set, cause - the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be + the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to pcre2_pattern_convert(). By default, the conversion function is allowed to allocate a buffer for - its output. However, if the convert_length modifier is set to a value - greater than zero, pcre2test passes a buffer of the given length. This + its output. However, if the convert_length modifier is set to a value + greater than zero, pcre2test passes a buffer of the given length. This makes it possible to test the length check. - The convert_glob_escape and convert_glob_separator modifiers can be - used to specify the escape and separator characters for glob process- + The convert_glob_escape and convert_glob_separator modifiers can be + used to specify the escape and separator characters for glob process- ing, overriding the defaults, which are operating-system dependent. @@ -1086,10 +1126,11 @@ SUBJECT MODIFIERS Setting match options - The following modifiers set options for pcre2_match() or - pcre2_dfa_match(). See pcreapi for a description of their effects. + The following modifiers set options for pcre2_match() or + pcre2_dfa_match(). See pcre2api for a description of their effects. 
anchored set PCRE2_ANCHORED + copy_matched_subject set PCRE2_COPY_MATCHED_SUBJECT endanchored set PCRE2_ENDANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST @@ -1103,42 +1144,42 @@ SUBJECT MODIFIERS partial_hard (or ph) set PCRE2_PARTIAL_HARD partial_soft (or ps) set PCRE2_PARTIAL_SOFT - The partial matching modifiers are provided with abbreviations because + The partial matching modifiers are provided with abbreviations because they appear frequently in tests. - If the posix or posix_nosub modifier was present on the pattern, caus- + If the posix or posix_nosub modifier was present on the pattern, caus- ing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are notbol, notempty, and noteol, causing REG_NOT- - BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to + BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). The other modifiers are ignored, with a warning message. - There is one additional modifier that can be used with the POSIX wrap- + There is one additional modifier that can be used with the POSIX wrap- per. It is ignored (with a warning) if used for non-POSIX matching. posix_startend=<n>[:<m>] - This causes the subject string to be passed to regexec() using the - REG_STARTEND option, which uses offsets to specify which part of the - string is searched. If only one number is given, the end offset is - passed as the end of the subject string. For more detail of REG_STAR- - TEND, see the pcre2posix documentation. If the subject string contains - binary zeros (coded as escapes such as \x{00} because pcre2test does + This causes the subject string to be passed to regexec() using the + REG_STARTEND option, which uses offsets to specify which part of the + string is searched. If only one number is given, the end offset is + passed as the end of the subject string. For more detail of REG_STAR- + TEND, see the pcre2posix documentation.
If the subject string contains + binary zeros (coded as escapes such as \x{00} because pcre2test does not support actual binary zeros in its input), you must use posix_star- tend to specify its length. Setting match controls - The following modifiers affect the matching process or request addi- - tional information. Some of them may also be specified on a pattern - line (see above), in which case they apply to every subject line that - is matched against that pattern, but can be overridden by modifiers on + The following modifiers affect the matching process or request addi- + tional information. Some of them may also be specified on a pattern + line (see above), in which case they apply to every subject line that + is matched against that pattern, but can be overridden by modifiers on the subject. aftertext show text after match allaftertext show text after captures allcaptures show all captures - allvector show the entire ovector allusedtext show all consulted text (non-JIT only) + allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data=<n> set a value to pass via callouts @@ -1172,7 +1213,8 @@ SUBJECT MODIFIERS startchar show startchar when relevant startoffset=<n> same as offset=<n> substitute_callout use substitution callouts - substitute_extedded use PCRE2_SUBSTITUTE_EXTENDED + substitute_case_callout use substitution case callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH @@ -1184,29 +1226,29 @@ SUBJECT MODIFIERS zero_terminate pass the subject as zero-terminated The effects of these modifiers are described in the following sections. - When matching via the POSIX wrapper API, the aftertext, allaftertext, - and ovector subject modifiers work as described below.
All other modi- + When matching via the POSIX wrapper API, the aftertext, allaftertext, + and ovector subject modifiers work as described below. All other modi- fiers are either ignored, with a warning message, or cause an error. Showing more text - The aftertext modifier requests that as well as outputting the part of + The aftertext modifier requests that as well as outputting the part of the subject string that matched the entire pattern, pcre2test should in addition output the remainder of the subject string. This is useful for tests where the subject contains multiple copies of the same substring. - The allaftertext modifier requests the same action for captured sub- + The allaftertext modifier requests the same action for captured sub- strings as well as the main matched substring. In each case the remain- der is output on the following line with a plus character following the capture number. - The allusedtext modifier requests that all the text that was consulted - during a successful pattern match by the interpreter should be shown, - for both full and partial matches. This feature is not supported for - JIT matching, and if requested with JIT it is ignored (with a warning - message). Setting this modifier affects the output if there is a look- - behind at the start of a match, or, for a complete match, a lookahead + The allusedtext modifier requests that all the text that was consulted + during a successful pattern match by the interpreter should be shown, + for both full and partial matches. This feature is not supported for + JIT matching, and if requested with JIT it is ignored (with a warning + message). Setting this modifier affects the output if there is a look- + behind at the start of a match, or, for a complete match, a lookahead at the end, or if \K is used in the pattern. 
Characters that precede or - follow the start and end of the actual match are indicated in the out- + follow the start and end of the actual match are indicated in the out- put by '<' or '>' characters underneath them. Here is an example: re> /(?<=pqr)abc(?=xyz)/ @@ -1217,16 +1259,16 @@ SUBJECT MODIFIERS Partial match: pqrabcxy <<< - The first, complete match shows that the matched string is "abc", with - the preceding and following strings "pqr" and "xyz" having been con- - sulted during the match (when processing the assertions). The partial + The first, complete match shows that the matched string is "abc", with + the preceding and following strings "pqr" and "xyz" having been con- + sulted during the match (when processing the assertions). The partial match can indicate only the preceding string. - The startchar modifier requests that the starting character for the - match be indicated, if it is different to the start of the matched + The startchar modifier requests that the starting character for the + match be indicated, if it is different to the start of the matched string. The only time when this occurs is when \K has been processed as part of the match. In this situation, the output for the matched string - is displayed from the starting character instead of from the match + is displayed from the starting character instead of from the match point, with circumflex characters under the earlier characters. For ex- ample: @@ -1235,7 +1277,7 @@ SUBJECT MODIFIERS 0: abcxyz ^^^ - Unlike allusedtext, the startchar modifier can be used with JIT. How- + Unlike allusedtext, the startchar modifier can be used with JIT. How- ever, these two modifiers are mutually exclusive. Showing the value of all capture groups @@ -1243,104 +1285,104 @@ SUBJECT MODIFIERS The allcaptures modifier requests that the values of all potential cap- tured parentheses be output after a match. 
By default, only those up to the highest one actually used in the match are output (corresponding to - the return code from pcre2_match()). Groups that did not take part in - the match are output as "<unset>". This modifier is not relevant for - DFA matching (which does no capturing) and does not apply when replace + the return code from pcre2_match()). Groups that did not take part in + the match are output as "<unset>". This modifier is not relevant for + DFA matching (which does no capturing) and does not apply when replace is specified; it is ignored, with a warning message, if present. Showing the entire ovector, for all outcomes The allvector modifier requests that the entire ovector be shown, what- ever the outcome of the match. Compare allcaptures, which shows only up - to the maximum number of capture groups for the pattern, and then only - for a successful complete non-DFA match. This modifier, which acts af- - ter any match result, and also for DFA matching, provides a means of - checking that there are no unexpected modifications to ovector fields. - Before each match attempt, the ovector is filled with a special value, - and if this is found in both elements of a capturing pair, "<un- - changed>" is output. After a successful match, this applies to all - groups after the maximum capture group for the pattern. In other cases - it applies to the entire ovector. After a partial match, the first two - elements are the only ones that should be set. After a DFA match, the - amount of ovector that is used depends on the number of matches that + to the maximum number of capture groups for the pattern, and then only + for a successful complete non-DFA match. This modifier, which acts af- + ter any match result, and also for DFA matching, provides a means of + checking that there are no unexpected modifications to ovector fields. + Before each match attempt, the ovector is filled with a special value, + and if this is found in both elements of a capturing pair, "<un- + changed>" is output.
After a successful match, this applies to all + groups after the maximum capture group for the pattern. In other cases + it applies to the entire ovector. After a partial match, the first two + elements are the only ones that should be set. After a DFA match, the + amount of ovector that is used depends on the number of matches that were found. Testing pattern callouts - A callout function is supplied when pcre2test calls the library match- - ing functions, unless callout_none is specified. Its behaviour can be - controlled by various modifiers listed above whose names begin with - callout_. Details are given in the section entitled "Callouts" below. - Testing callouts from pcre2_substitute() is described separately in + A callout function is supplied when pcre2test calls the library match- + ing functions, unless callout_none is specified. Its behaviour can be + controlled by various modifiers listed above whose names begin with + callout_. Details are given in the section entitled "Callouts" below. + Testing callouts from pcre2_substitute() is described separately in "Testing the substitution function" below. Finding all matches in a string Searching for all possible matches within a subject can be requested by - the global or altglobal modifier. After finding a match, the matching - function is called again to search the remainder of the subject. The - difference between global and altglobal is that the former uses the - start_offset argument to pcre2_match() or pcre2_dfa_match() to start - searching at a new point within the entire string (which is what Perl + the global or altglobal modifier. After finding a match, the matching + function is called again to search the remainder of the subject. 
The + difference between global and altglobal is that the former uses the + start_offset argument to pcre2_match() or pcre2_dfa_match() to start + searching at a new point within the entire string (which is what Perl does), whereas the latter passes over a shortened subject. This makes a difference to the matching process if the pattern begins with a lookbe- hind assertion (including \b or \B). - If an empty string is matched, the next match is done with the + If an empty string is matched, the next match is done with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for another, non-empty, match at the same point in the subject. If this - match fails, the start offset is advanced, and the normal match is re- - tried. This imitates the way Perl handles such cases when using the /g - modifier or the split() function. Normally, the start offset is ad- - vanced by one character, but if the newline convention recognizes CRLF - as a newline, and the current character is CR followed by LF, an ad- + match fails, the start offset is advanced, and the normal match is re- + tried. This imitates the way Perl handles such cases when using the /g + modifier or the split() function. Normally, the start offset is ad- + vanced by one character, but if the newline convention recognizes CRLF + as a newline, and the current character is CR followed by LF, an ad- vance of two characters occurs. Testing substring extraction functions - The copy and get modifiers can be used to test the pcre2_sub- + The copy and get modifiers can be used to test the pcre2_sub- string_copy_xxx() and pcre2_substring_get_xxx() functions. 
They can be given more than once, and each can specify a capture group name or num- ber, for example: abcd\=copy=1,copy=3,get=G1 - If the #subject command is used to set default copy and/or get lists, - these can be unset by specifying a negative number to cancel all num- + If the #subject command is used to set default copy and/or get lists, + these can be unset by specifying a negative number to cancel all num- bered groups and an empty name to cancel all named groups. - The getall modifier tests pcre2_substring_list_get(), which extracts + The getall modifier tests pcre2_substring_list_get(), which extracts all captured substrings. - If the subject line is successfully matched, the substrings extracted - by the convenience functions are output with C, G, or L after the - string number instead of a colon. This is in addition to the normal - full list. The string length (that is, the return from the extraction + If the subject line is successfully matched, the substrings extracted + by the convenience functions are output with C, G, or L after the + string number instead of a colon. This is in addition to the normal + full list. The string length (that is, the return from the extraction function) is given in parentheses after each substring, followed by the name when the extraction was by name. Testing the substitution function - If the replace modifier is set, the pcre2_substitute() function is - called instead of one of the matching functions (or after one call of - pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). Note that re- - placement strings cannot contain commas, because a comma signifies the - end of a modifier. This is not thought to be an issue in a test pro- + If the replace modifier is set, the pcre2_substitute() function is + called instead of one of the matching functions (or after one call of + pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). 
Note that re- + placement strings cannot contain commas, because a comma signifies the + end of a modifier. This is not thought to be an issue in a test pro- gram. - Specifying a completely empty replacement string disables this modi- - fier. However, it is possible to specify an empty replacement by pro- - viding a buffer length, as described below, for an otherwise empty re- + Specifying a completely empty replacement string disables this modi- + fier. However, it is possible to specify an empty replacement by pro- + viding a buffer length, as described below, for an otherwise empty re- placement. - Unlike subject strings, pcre2test does not process replacement strings - for escape sequences. In UTF mode, a replacement string is checked to - see if it is a valid UTF-8 string. If so, it is correctly converted to - a UTF string of the appropriate code unit width. If it is not a valid - UTF-8 string, the individual code units are copied directly. This pro- + Unlike subject strings, pcre2test does not process replacement strings + for escape sequences. In UTF mode, a replacement string is checked to + see if it is a valid UTF-8 string. If so, it is correctly converted to + a UTF string of the appropriate code unit width. If it is not a valid + UTF-8 string, the individual code units are copied directly. This pro- vides a means of passing an invalid UTF-8 string for testing purposes. - The following modifiers set options (in additional to the normal match + The following modifiers set options (in additional to the normal match options) for pcre2_substitute(): global PCRE2_SUBSTITUTE_GLOBAL @@ -1354,8 +1396,8 @@ SUBJECT MODIFIERS See the pcre2api documentation for details of these options. - After a successful substitution, the modified string is output, pre- - ceded by the number of replacements. This may be zero if there were no + After a successful substitution, the modified string is output, pre- + ceded by the number of replacements. 
This may be zero if there were no matches. Here is a simple example of a substitution test: /abc/replace=xxx @@ -1364,12 +1406,12 @@ SUBJECT MODIFIERS =abc=abc=\=global 2: =xxx=xxx= - Subject and replacement strings should be kept relatively short (fewer - than 256 characters) for substitution tests, as fixed-size buffers are - used. To make it easy to test for buffer overflow, if the replacement - string starts with a number in square brackets, that number is passed - to pcre2_substitute() as the size of the output buffer, with the re- - placement string starting at the next character. Here is an example + Subject and replacement strings should be kept relatively short (fewer + than 256 characters) for substitution tests, as fixed-size buffers are + used. To make it easy to test for buffer overflow, if the replacement + string starts with a number in square brackets, that number is passed + to pcre2_substitute() as the size of the output buffer, with the re- + placement string starting at the next character. Here is an example that tests the edge case: /abc/ @@ -1379,12 +1421,12 @@ SUBJECT MODIFIERS Failed: error -47: no more memory The default action of pcre2_substitute() is to return PCRE2_ER- - ROR_NOMEMORY when the output buffer is too small. However, if the - PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi- + ROR_NOMEMORY when the output buffer is too small. However, if the + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi- tute_overflow_length modifier), pcre2_substitute() continues to go - through the motions of matching and substituting (but not doing any - callouts), in order to compute the size of buffer that is required. - When this happens, pcre2test shows the required buffer length (which + through the motions of matching and substituting (but not doing any + callouts), in order to compute the size of buffer that is required. 
+ When this happens, pcre2test shows the required buffer length (which includes space for the trailing zero) as part of the error message. For example: @@ -1393,15 +1435,15 @@ SUBJECT MODIFIERS Failed: error -47: no more memory: 10 code units are needed A replacement string is ignored with POSIX and DFA matching. Specifying - partial matching provokes an error return ("bad option value") from + partial matching provokes an error return ("bad option value") from pcre2_substitute(). Testing substitute callouts If the substitute_callout modifier is set, a substitution callout func- - tion is set up. The null_context modifier must not be set, because the - address of the callout function is passed in a match context. When the - callout function is called (after each substitution), details of the + tion is set up. The null_context modifier must not be set, because the + address of the callout function is passed in a match context. When the + callout function is called (after each substitution), details of the input and output strings are output. For example: /abc/g,replace=<$0>,substitute_callout @@ -1410,19 +1452,19 @@ SUBJECT MODIFIERS 2(1) Old 6 9 "abc" New 8 13 "<abc>" 2: <abc>def<abc>pqr - The first number on each callout line is the count of matches. The + The first number on each callout line is the count of matches. The parenthesized number is the number of pairs that are set in the ovector - (that is, one more than the number of capturing groups that were set). + (that is, one more than the number of capturing groups that were set). Then are listed the offsets of the old substring, its contents, and the same for the replacement. - By default, the substitution callout function returns zero, which ac- - cepts the replacement and causes matching to continue if /g was used. - Two further modifiers can be used to test other return values.
If sub- - stitute_skip is set to a value greater than zero the callout function - returns +1 for the match of that number, and similarly substitute_stop - returns -1. These cause the replacement to be rejected, and -1 causes - no further matching to take place. If either of them are set, substi- + By default, the substitution callout function returns zero, which ac- + cepts the replacement and causes matching to continue if /g was used. + Two further modifiers can be used to test other return values. If sub- + stitute_skip is set to a value greater than zero the callout function + returns +1 for the match of that number, and similarly substitute_stop + returns -1. These cause the replacement to be rejected, and -1 causes + no further matching to take place. If either of them are set, substi- tute_callout is assumed. For example: /abc/g,replace=<$0>,substitute_skip=1 @@ -1438,6 +1480,18 @@ SUBJECT MODIFIERS gle skip or stop is supported, which is sufficient for testing that the feature works. + Testing substitute case callouts + + If the substitute_case_callout modifier is set, a substitution case + callout function is set up. The callout function is called for each + substituted chunk which is to be case-transformed. + + The callout function passed is a fixed function with implementation for + certain behaviours: inputs which shrink when case-transformed; inputs + which grow; inputs with distinct upper/lower/titlecase forms. The char- + acters which are not special-cased for testing purposes are left unmod- + ified, as if they are caseless characters. + Setting the JIT stack size The jitstack modifier provides a way of setting the maximum stack size @@ -2007,8 +2061,8 @@ AUTHOR REVISION - Last updated: 24 April 2024 + Last updated: 26 December 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE 10.44 24 April 2024 PCRE2TEST(1) +PCRE2 10.45 26 December 2024 PCRE2TEST(1) diff --git a/usr/share/doc/pcre2/LICENCE b/mingw32/share/licenses/pcre2/LICENCE.md similarity index 55% rename from usr/share/doc/pcre2/LICENCE rename to mingw32/share/licenses/pcre2/LICENCE.md index 3c1ef032dec..f58ceb75a63 100644 --- a/usr/share/doc/pcre2/LICENCE +++ b/mingw32/share/licenses/pcre2/LICENCE.md @@ -1,5 +1,8 @@ -PCRE2 LICENCE -------------- +PCRE2 License +============= + +| SPDX-License-Identifier: | BSD-3-Clause WITH PCRE2-exception | +|---------|-------| PCRE2 is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. @@ -16,40 +19,46 @@ optimize pattern matching. This is an optional feature that can be omitted when the library is built. -THE BASIC LIBRARY FUNCTIONS ---------------------------- +COPYRIGHT +--------- + +### The basic library functions -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com + Written by: Philip Hazel + Email local part: Philip.Hazel + Email domain: gmail.com -Retired from University of Cambridge Computing Service, -Cambridge, England. + Retired from University of Cambridge Computing Service, + Cambridge, England. -Copyright (c) 1997-2024 University of Cambridge -All rights reserved. + Copyright (c) 1997-2007 University of Cambridge + Copyright (c) 2007-2024 Philip Hazel + All rights reserved. +### PCRE2 Just-In-Time compilation support -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- + Written by: Zoltan Herczeg + Email local part: hzmester + Email domain: freemail.hu -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu + Copyright (c) 2010-2024 Zoltan Herczeg + All rights reserved. -Copyright(c) 2010-2024 Zoltan Herczeg -All rights reserved. 
+### Stack-less Just-In-Time compiler + Written by: Zoltan Herczeg + Email local part: hzmester + Email domain: freemail.hu -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- + Copyright (c) 2009-2024 Zoltan Herczeg + All rights reserved. -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu +### All other contributions -Copyright(c) 2009-2024 Zoltan Herczeg -All rights reserved. +Many other contributors have participated in the authorship of PCRE2. As PCRE2 +has never required a Contributor Licensing Agreement, or other copyright +assignment agreement, all contributions have copyright retained by each +original contributor or their employer. THE "BSD" LICENCE @@ -58,16 +67,16 @@ THE "BSD" LICENCE Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notices, - this list of conditions and the following disclaimer. +* Redistributions of source code must retain the above copyright notices, + this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notices, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. +* Redistributions in binary form must reproduce the above copyright + notices, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. - * Neither the name of the University of Cambridge nor the names of any - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. +* Neither the name of the University of Cambridge nor the names of any + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/mingw32/share/man/man1/pcre2-config.1.gz b/mingw32/share/man/man1/pcre2-config.1.gz index bd6a7fd31d0..822df9aded3 100644 Binary files a/mingw32/share/man/man1/pcre2-config.1.gz and b/mingw32/share/man/man1/pcre2-config.1.gz differ diff --git a/mingw32/share/man/man1/pcre2grep.1.gz b/mingw32/share/man/man1/pcre2grep.1.gz index 4986ba33e42..30bcc1cf285 100644 Binary files a/mingw32/share/man/man1/pcre2grep.1.gz and b/mingw32/share/man/man1/pcre2grep.1.gz differ diff --git a/mingw32/share/man/man1/pcre2test.1.gz b/mingw32/share/man/man1/pcre2test.1.gz index 6f7f0a4ded2..2ad05f1e7ca 100644 Binary files a/mingw32/share/man/man1/pcre2test.1.gz and b/mingw32/share/man/man1/pcre2test.1.gz differ diff --git a/mingw32/share/man/man3/pcre2.3.gz b/mingw32/share/man/man3/pcre2.3.gz index 8f8de13a685..0557798fa74 100644 Binary files a/mingw32/share/man/man3/pcre2.3.gz and b/mingw32/share/man/man3/pcre2.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_callout_enumerate.3.gz b/mingw32/share/man/man3/pcre2_callout_enumerate.3.gz index 132edbb1a5e..a808e956bd6 100644 Binary files a/mingw32/share/man/man3/pcre2_callout_enumerate.3.gz and b/mingw32/share/man/man3/pcre2_callout_enumerate.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_code_copy.3.gz b/mingw32/share/man/man3/pcre2_code_copy.3.gz index 0c748480430..cc75c1145df 100644 Binary files a/mingw32/share/man/man3/pcre2_code_copy.3.gz and b/mingw32/share/man/man3/pcre2_code_copy.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_code_copy_with_tables.3.gz b/mingw32/share/man/man3/pcre2_code_copy_with_tables.3.gz index 5ece33f7768..d308117ce38 100644 Binary files a/mingw32/share/man/man3/pcre2_code_copy_with_tables.3.gz and b/mingw32/share/man/man3/pcre2_code_copy_with_tables.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_code_free.3.gz 
b/mingw32/share/man/man3/pcre2_code_free.3.gz index 2fc6eccaa38..f43574a3931 100644 Binary files a/mingw32/share/man/man3/pcre2_code_free.3.gz and b/mingw32/share/man/man3/pcre2_code_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_compile.3.gz b/mingw32/share/man/man3/pcre2_compile.3.gz index f7d39bd6671..038e31a7280 100644 Binary files a/mingw32/share/man/man3/pcre2_compile.3.gz and b/mingw32/share/man/man3/pcre2_compile.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_compile_context_copy.3.gz b/mingw32/share/man/man3/pcre2_compile_context_copy.3.gz index bd414744c62..e601ecd8051 100644 Binary files a/mingw32/share/man/man3/pcre2_compile_context_copy.3.gz and b/mingw32/share/man/man3/pcre2_compile_context_copy.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_compile_context_create.3.gz b/mingw32/share/man/man3/pcre2_compile_context_create.3.gz index 670392ebb3b..736809aa152 100644 Binary files a/mingw32/share/man/man3/pcre2_compile_context_create.3.gz and b/mingw32/share/man/man3/pcre2_compile_context_create.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_compile_context_free.3.gz b/mingw32/share/man/man3/pcre2_compile_context_free.3.gz index 3aa875dd01c..6af45001ad6 100644 Binary files a/mingw32/share/man/man3/pcre2_compile_context_free.3.gz and b/mingw32/share/man/man3/pcre2_compile_context_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_config.3.gz b/mingw32/share/man/man3/pcre2_config.3.gz index 6858fdeb764..0dc22637f28 100644 Binary files a/mingw32/share/man/man3/pcre2_config.3.gz and b/mingw32/share/man/man3/pcre2_config.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_convert_context_copy.3.gz b/mingw32/share/man/man3/pcre2_convert_context_copy.3.gz index e8c877a0388..4dcbd5f2637 100644 Binary files a/mingw32/share/man/man3/pcre2_convert_context_copy.3.gz and b/mingw32/share/man/man3/pcre2_convert_context_copy.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_convert_context_create.3.gz 
b/mingw32/share/man/man3/pcre2_convert_context_create.3.gz index 47212499bb1..872ffff8130 100644 Binary files a/mingw32/share/man/man3/pcre2_convert_context_create.3.gz and b/mingw32/share/man/man3/pcre2_convert_context_create.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_convert_context_free.3.gz b/mingw32/share/man/man3/pcre2_convert_context_free.3.gz index a8d1fa23847..344effb030f 100644 Binary files a/mingw32/share/man/man3/pcre2_convert_context_free.3.gz and b/mingw32/share/man/man3/pcre2_convert_context_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_converted_pattern_free.3.gz b/mingw32/share/man/man3/pcre2_converted_pattern_free.3.gz index b2a1dfc7175..37f2c7cda61 100644 Binary files a/mingw32/share/man/man3/pcre2_converted_pattern_free.3.gz and b/mingw32/share/man/man3/pcre2_converted_pattern_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_dfa_match.3.gz b/mingw32/share/man/man3/pcre2_dfa_match.3.gz index 7a1cc9b4bfa..6beb8b69df4 100644 Binary files a/mingw32/share/man/man3/pcre2_dfa_match.3.gz and b/mingw32/share/man/man3/pcre2_dfa_match.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_general_context_copy.3.gz b/mingw32/share/man/man3/pcre2_general_context_copy.3.gz index c1854da2b24..9f4653d1e1a 100644 Binary files a/mingw32/share/man/man3/pcre2_general_context_copy.3.gz and b/mingw32/share/man/man3/pcre2_general_context_copy.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_general_context_create.3.gz b/mingw32/share/man/man3/pcre2_general_context_create.3.gz index 559c4ad603f..05d60da320e 100644 Binary files a/mingw32/share/man/man3/pcre2_general_context_create.3.gz and b/mingw32/share/man/man3/pcre2_general_context_create.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_general_context_free.3.gz b/mingw32/share/man/man3/pcre2_general_context_free.3.gz index 42b9a03c19f..f1133f8a242 100644 Binary files a/mingw32/share/man/man3/pcre2_general_context_free.3.gz and 
b/mingw32/share/man/man3/pcre2_general_context_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_get_error_message.3.gz b/mingw32/share/man/man3/pcre2_get_error_message.3.gz index 31a4fe00d0c..5c1dd580e8b 100644 Binary files a/mingw32/share/man/man3/pcre2_get_error_message.3.gz and b/mingw32/share/man/man3/pcre2_get_error_message.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_get_mark.3.gz b/mingw32/share/man/man3/pcre2_get_mark.3.gz index d189a862d6e..0422987109c 100644 Binary files a/mingw32/share/man/man3/pcre2_get_mark.3.gz and b/mingw32/share/man/man3/pcre2_get_mark.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz b/mingw32/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz index 41e2b42f079..ed3807ee91a 100644 Binary files a/mingw32/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz and b/mingw32/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_get_match_data_size.3.gz b/mingw32/share/man/man3/pcre2_get_match_data_size.3.gz index 1493a8239d7..b8b90d8b992 100644 Binary files a/mingw32/share/man/man3/pcre2_get_match_data_size.3.gz and b/mingw32/share/man/man3/pcre2_get_match_data_size.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_get_ovector_count.3.gz b/mingw32/share/man/man3/pcre2_get_ovector_count.3.gz index 3e13f2457bb..cbd0b74f6d1 100644 Binary files a/mingw32/share/man/man3/pcre2_get_ovector_count.3.gz and b/mingw32/share/man/man3/pcre2_get_ovector_count.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_get_ovector_pointer.3.gz b/mingw32/share/man/man3/pcre2_get_ovector_pointer.3.gz index 46c73d389e6..b638a40fb4f 100644 Binary files a/mingw32/share/man/man3/pcre2_get_ovector_pointer.3.gz and b/mingw32/share/man/man3/pcre2_get_ovector_pointer.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_get_startchar.3.gz b/mingw32/share/man/man3/pcre2_get_startchar.3.gz index 8770d3f18e2..bbd6dc3d8b0 100644 Binary files 
a/mingw32/share/man/man3/pcre2_get_startchar.3.gz and b/mingw32/share/man/man3/pcre2_get_startchar.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_jit_compile.3.gz b/mingw32/share/man/man3/pcre2_jit_compile.3.gz index 37a905af1c9..4b334032e0f 100644 Binary files a/mingw32/share/man/man3/pcre2_jit_compile.3.gz and b/mingw32/share/man/man3/pcre2_jit_compile.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_jit_free_unused_memory.3.gz b/mingw32/share/man/man3/pcre2_jit_free_unused_memory.3.gz index b854e9f15f1..636b2f299d2 100644 Binary files a/mingw32/share/man/man3/pcre2_jit_free_unused_memory.3.gz and b/mingw32/share/man/man3/pcre2_jit_free_unused_memory.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_jit_match.3.gz b/mingw32/share/man/man3/pcre2_jit_match.3.gz index 1f60dbd220c..aea0f320a12 100644 Binary files a/mingw32/share/man/man3/pcre2_jit_match.3.gz and b/mingw32/share/man/man3/pcre2_jit_match.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_jit_stack_assign.3.gz b/mingw32/share/man/man3/pcre2_jit_stack_assign.3.gz index cd29425f7db..3d21b33376d 100644 Binary files a/mingw32/share/man/man3/pcre2_jit_stack_assign.3.gz and b/mingw32/share/man/man3/pcre2_jit_stack_assign.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_jit_stack_create.3.gz b/mingw32/share/man/man3/pcre2_jit_stack_create.3.gz index ca3326ae5c3..f5135aa0038 100644 Binary files a/mingw32/share/man/man3/pcre2_jit_stack_create.3.gz and b/mingw32/share/man/man3/pcre2_jit_stack_create.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_jit_stack_free.3.gz b/mingw32/share/man/man3/pcre2_jit_stack_free.3.gz index 143e3c8005b..6e880ae0669 100644 Binary files a/mingw32/share/man/man3/pcre2_jit_stack_free.3.gz and b/mingw32/share/man/man3/pcre2_jit_stack_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_maketables.3.gz b/mingw32/share/man/man3/pcre2_maketables.3.gz index 3df5fa94517..964e90286ea 100644 Binary files a/mingw32/share/man/man3/pcre2_maketables.3.gz and 
b/mingw32/share/man/man3/pcre2_maketables.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_maketables_free.3.gz b/mingw32/share/man/man3/pcre2_maketables_free.3.gz index 7e4c91a7b5e..6c0ff04dfb9 100644 Binary files a/mingw32/share/man/man3/pcre2_maketables_free.3.gz and b/mingw32/share/man/man3/pcre2_maketables_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_match.3.gz b/mingw32/share/man/man3/pcre2_match.3.gz index 737e8dd5ad0..de2d9e66605 100644 Binary files a/mingw32/share/man/man3/pcre2_match.3.gz and b/mingw32/share/man/man3/pcre2_match.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_match_context_copy.3.gz b/mingw32/share/man/man3/pcre2_match_context_copy.3.gz index 82317151de4..d971b5f740e 100644 Binary files a/mingw32/share/man/man3/pcre2_match_context_copy.3.gz and b/mingw32/share/man/man3/pcre2_match_context_copy.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_match_context_create.3.gz b/mingw32/share/man/man3/pcre2_match_context_create.3.gz index 1607514981c..134d340d0f5 100644 Binary files a/mingw32/share/man/man3/pcre2_match_context_create.3.gz and b/mingw32/share/man/man3/pcre2_match_context_create.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_match_context_free.3.gz b/mingw32/share/man/man3/pcre2_match_context_free.3.gz index d54a68d4b39..0a7f3016c50 100644 Binary files a/mingw32/share/man/man3/pcre2_match_context_free.3.gz and b/mingw32/share/man/man3/pcre2_match_context_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_match_data_create.3.gz b/mingw32/share/man/man3/pcre2_match_data_create.3.gz index 79faeb6e407..dd522dda1a5 100644 Binary files a/mingw32/share/man/man3/pcre2_match_data_create.3.gz and b/mingw32/share/man/man3/pcre2_match_data_create.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_match_data_create_from_pattern.3.gz b/mingw32/share/man/man3/pcre2_match_data_create_from_pattern.3.gz index 0649219da29..99b6388d286 100644 Binary files 
a/mingw32/share/man/man3/pcre2_match_data_create_from_pattern.3.gz and b/mingw32/share/man/man3/pcre2_match_data_create_from_pattern.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_match_data_free.3.gz b/mingw32/share/man/man3/pcre2_match_data_free.3.gz index 0d5769b50d3..d16b50815c1 100644 Binary files a/mingw32/share/man/man3/pcre2_match_data_free.3.gz and b/mingw32/share/man/man3/pcre2_match_data_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_pattern_convert.3.gz b/mingw32/share/man/man3/pcre2_pattern_convert.3.gz index 51cec7d92eb..fa281abaa5a 100644 Binary files a/mingw32/share/man/man3/pcre2_pattern_convert.3.gz and b/mingw32/share/man/man3/pcre2_pattern_convert.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_pattern_info.3.gz b/mingw32/share/man/man3/pcre2_pattern_info.3.gz index 591aadd37d8..77adf99495b 100644 Binary files a/mingw32/share/man/man3/pcre2_pattern_info.3.gz and b/mingw32/share/man/man3/pcre2_pattern_info.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_serialize_decode.3.gz b/mingw32/share/man/man3/pcre2_serialize_decode.3.gz index ee99586dc5b..94dc8db34ac 100644 Binary files a/mingw32/share/man/man3/pcre2_serialize_decode.3.gz and b/mingw32/share/man/man3/pcre2_serialize_decode.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_serialize_encode.3.gz b/mingw32/share/man/man3/pcre2_serialize_encode.3.gz index 0aebb74aeba..ef83c5158ba 100644 Binary files a/mingw32/share/man/man3/pcre2_serialize_encode.3.gz and b/mingw32/share/man/man3/pcre2_serialize_encode.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_serialize_free.3.gz b/mingw32/share/man/man3/pcre2_serialize_free.3.gz index 35e72657a5c..1b893458bf1 100644 Binary files a/mingw32/share/man/man3/pcre2_serialize_free.3.gz and b/mingw32/share/man/man3/pcre2_serialize_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz b/mingw32/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz index 2f5e616a614..d0f6e346a7f 
100644 Binary files a/mingw32/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz and b/mingw32/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_bsr.3.gz b/mingw32/share/man/man3/pcre2_set_bsr.3.gz index edc42c33a2e..f886a47c949 100644 Binary files a/mingw32/share/man/man3/pcre2_set_bsr.3.gz and b/mingw32/share/man/man3/pcre2_set_bsr.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_callout.3.gz b/mingw32/share/man/man3/pcre2_set_callout.3.gz index 10c85f78b1c..a28a8719d86 100644 Binary files a/mingw32/share/man/man3/pcre2_set_callout.3.gz and b/mingw32/share/man/man3/pcre2_set_callout.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_character_tables.3.gz b/mingw32/share/man/man3/pcre2_set_character_tables.3.gz index ef705eabf76..7b5c22c7b08 100644 Binary files a/mingw32/share/man/man3/pcre2_set_character_tables.3.gz and b/mingw32/share/man/man3/pcre2_set_character_tables.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_compile_extra_options.3.gz b/mingw32/share/man/man3/pcre2_set_compile_extra_options.3.gz index f0b6532da11..c5b68d4fffe 100644 Binary files a/mingw32/share/man/man3/pcre2_set_compile_extra_options.3.gz and b/mingw32/share/man/man3/pcre2_set_compile_extra_options.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_compile_recursion_guard.3.gz b/mingw32/share/man/man3/pcre2_set_compile_recursion_guard.3.gz index be84b852dfa..0a0f31d308c 100644 Binary files a/mingw32/share/man/man3/pcre2_set_compile_recursion_guard.3.gz and b/mingw32/share/man/man3/pcre2_set_compile_recursion_guard.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_depth_limit.3.gz b/mingw32/share/man/man3/pcre2_set_depth_limit.3.gz index 23431f3bee0..63f86a671ce 100644 Binary files a/mingw32/share/man/man3/pcre2_set_depth_limit.3.gz and b/mingw32/share/man/man3/pcre2_set_depth_limit.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_glob_escape.3.gz 
b/mingw32/share/man/man3/pcre2_set_glob_escape.3.gz index 7f633e4d483..b9686a7d52c 100644 Binary files a/mingw32/share/man/man3/pcre2_set_glob_escape.3.gz and b/mingw32/share/man/man3/pcre2_set_glob_escape.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_glob_separator.3.gz b/mingw32/share/man/man3/pcre2_set_glob_separator.3.gz index 5b40b6d42c9..ef3c380e2f5 100644 Binary files a/mingw32/share/man/man3/pcre2_set_glob_separator.3.gz and b/mingw32/share/man/man3/pcre2_set_glob_separator.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_heap_limit.3.gz b/mingw32/share/man/man3/pcre2_set_heap_limit.3.gz index 059d9882713..b1f60913032 100644 Binary files a/mingw32/share/man/man3/pcre2_set_heap_limit.3.gz and b/mingw32/share/man/man3/pcre2_set_heap_limit.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_match_limit.3.gz b/mingw32/share/man/man3/pcre2_set_match_limit.3.gz index 2949258558d..9a4487ee5f4 100644 Binary files a/mingw32/share/man/man3/pcre2_set_match_limit.3.gz and b/mingw32/share/man/man3/pcre2_set_match_limit.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz b/mingw32/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz index 22b23a35e17..e6b0799660f 100644 Binary files a/mingw32/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz and b/mingw32/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_max_pattern_length.3.gz b/mingw32/share/man/man3/pcre2_set_max_pattern_length.3.gz index 2c7cc625081..73b8724f64d 100644 Binary files a/mingw32/share/man/man3/pcre2_set_max_pattern_length.3.gz and b/mingw32/share/man/man3/pcre2_set_max_pattern_length.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_max_varlookbehind.3.gz b/mingw32/share/man/man3/pcre2_set_max_varlookbehind.3.gz index b18f39edaf3..7da16360947 100644 Binary files a/mingw32/share/man/man3/pcre2_set_max_varlookbehind.3.gz and 
b/mingw32/share/man/man3/pcre2_set_max_varlookbehind.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_newline.3.gz b/mingw32/share/man/man3/pcre2_set_newline.3.gz index c7b6911659e..779465b9227 100644 Binary files a/mingw32/share/man/man3/pcre2_set_newline.3.gz and b/mingw32/share/man/man3/pcre2_set_newline.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_offset_limit.3.gz b/mingw32/share/man/man3/pcre2_set_offset_limit.3.gz index 84f80d2cf97..d3202861471 100644 Binary files a/mingw32/share/man/man3/pcre2_set_offset_limit.3.gz and b/mingw32/share/man/man3/pcre2_set_offset_limit.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_optimize.3.gz b/mingw32/share/man/man3/pcre2_set_optimize.3.gz new file mode 100644 index 00000000000..3edd2ad258d Binary files /dev/null and b/mingw32/share/man/man3/pcre2_set_optimize.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_parens_nest_limit.3.gz b/mingw32/share/man/man3/pcre2_set_parens_nest_limit.3.gz index 45222f17ed4..8135f86d958 100644 Binary files a/mingw32/share/man/man3/pcre2_set_parens_nest_limit.3.gz and b/mingw32/share/man/man3/pcre2_set_parens_nest_limit.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_recursion_limit.3.gz b/mingw32/share/man/man3/pcre2_set_recursion_limit.3.gz index 1b36c8a9930..30af4aed7bb 100644 Binary files a/mingw32/share/man/man3/pcre2_set_recursion_limit.3.gz and b/mingw32/share/man/man3/pcre2_set_recursion_limit.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_recursion_memory_management.3.gz b/mingw32/share/man/man3/pcre2_set_recursion_memory_management.3.gz index abf99ac387f..675b3c11476 100644 Binary files a/mingw32/share/man/man3/pcre2_set_recursion_memory_management.3.gz and b/mingw32/share/man/man3/pcre2_set_recursion_memory_management.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_substitute_callout.3.gz b/mingw32/share/man/man3/pcre2_set_substitute_callout.3.gz index 894bcb508a1..355e05fa15c 100644 Binary files 
a/mingw32/share/man/man3/pcre2_set_substitute_callout.3.gz and b/mingw32/share/man/man3/pcre2_set_substitute_callout.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_set_substitute_case_callout.3.gz b/mingw32/share/man/man3/pcre2_set_substitute_case_callout.3.gz new file mode 100644 index 00000000000..337b0e12190 Binary files /dev/null and b/mingw32/share/man/man3/pcre2_set_substitute_case_callout.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substitute.3.gz b/mingw32/share/man/man3/pcre2_substitute.3.gz index db19673b270..22df41e3e5b 100644 Binary files a/mingw32/share/man/man3/pcre2_substitute.3.gz and b/mingw32/share/man/man3/pcre2_substitute.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_copy_byname.3.gz b/mingw32/share/man/man3/pcre2_substring_copy_byname.3.gz index 610aa32f5c2..2ceb33380ab 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_copy_byname.3.gz and b/mingw32/share/man/man3/pcre2_substring_copy_byname.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_copy_bynumber.3.gz b/mingw32/share/man/man3/pcre2_substring_copy_bynumber.3.gz index ba365db4ebd..692696a4e5b 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_copy_bynumber.3.gz and b/mingw32/share/man/man3/pcre2_substring_copy_bynumber.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_free.3.gz b/mingw32/share/man/man3/pcre2_substring_free.3.gz index 142293639aa..f3ce3fbe38f 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_free.3.gz and b/mingw32/share/man/man3/pcre2_substring_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_get_byname.3.gz b/mingw32/share/man/man3/pcre2_substring_get_byname.3.gz index f07e76be580..2b84e593061 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_get_byname.3.gz and b/mingw32/share/man/man3/pcre2_substring_get_byname.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_get_bynumber.3.gz 
b/mingw32/share/man/man3/pcre2_substring_get_bynumber.3.gz index d49ab61f7e0..48da30b75af 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_get_bynumber.3.gz and b/mingw32/share/man/man3/pcre2_substring_get_bynumber.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_length_byname.3.gz b/mingw32/share/man/man3/pcre2_substring_length_byname.3.gz index e44474c9fb1..5d0beaff8af 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_length_byname.3.gz and b/mingw32/share/man/man3/pcre2_substring_length_byname.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_length_bynumber.3.gz b/mingw32/share/man/man3/pcre2_substring_length_bynumber.3.gz index aea0ec85261..cb79fb1dc33 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_length_bynumber.3.gz and b/mingw32/share/man/man3/pcre2_substring_length_bynumber.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_list_free.3.gz b/mingw32/share/man/man3/pcre2_substring_list_free.3.gz index 34f3227dfcb..3d84e95e4d5 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_list_free.3.gz and b/mingw32/share/man/man3/pcre2_substring_list_free.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_list_get.3.gz b/mingw32/share/man/man3/pcre2_substring_list_get.3.gz index afa4dd67106..f14794a0266 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_list_get.3.gz and b/mingw32/share/man/man3/pcre2_substring_list_get.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_nametable_scan.3.gz b/mingw32/share/man/man3/pcre2_substring_nametable_scan.3.gz index 31bc29d7964..ac2e7891cb8 100644 Binary files a/mingw32/share/man/man3/pcre2_substring_nametable_scan.3.gz and b/mingw32/share/man/man3/pcre2_substring_nametable_scan.3.gz differ diff --git a/mingw32/share/man/man3/pcre2_substring_number_from_name.3.gz b/mingw32/share/man/man3/pcre2_substring_number_from_name.3.gz index ed0ce752243..77987e8f067 100644 Binary files 
a/mingw32/share/man/man3/pcre2_substring_number_from_name.3.gz and b/mingw32/share/man/man3/pcre2_substring_number_from_name.3.gz differ diff --git a/mingw32/share/man/man3/pcre2api.3.gz b/mingw32/share/man/man3/pcre2api.3.gz index 6cebacbacd8..a5b2415b524 100644 Binary files a/mingw32/share/man/man3/pcre2api.3.gz and b/mingw32/share/man/man3/pcre2api.3.gz differ diff --git a/mingw32/share/man/man3/pcre2build.3.gz b/mingw32/share/man/man3/pcre2build.3.gz index 8b06eaee96c..c6ffcd91b64 100644 Binary files a/mingw32/share/man/man3/pcre2build.3.gz and b/mingw32/share/man/man3/pcre2build.3.gz differ diff --git a/mingw32/share/man/man3/pcre2callout.3.gz b/mingw32/share/man/man3/pcre2callout.3.gz index a8f9c712c8a..0063d728758 100644 Binary files a/mingw32/share/man/man3/pcre2callout.3.gz and b/mingw32/share/man/man3/pcre2callout.3.gz differ diff --git a/mingw32/share/man/man3/pcre2compat.3.gz b/mingw32/share/man/man3/pcre2compat.3.gz index dc51b2f2eeb..f6fed552b6c 100644 Binary files a/mingw32/share/man/man3/pcre2compat.3.gz and b/mingw32/share/man/man3/pcre2compat.3.gz differ diff --git a/mingw32/share/man/man3/pcre2convert.3.gz b/mingw32/share/man/man3/pcre2convert.3.gz index 346ffa364a4..f654786e3ba 100644 Binary files a/mingw32/share/man/man3/pcre2convert.3.gz and b/mingw32/share/man/man3/pcre2convert.3.gz differ diff --git a/mingw32/share/man/man3/pcre2demo.3.gz b/mingw32/share/man/man3/pcre2demo.3.gz index de857078189..109e0ef15c0 100644 Binary files a/mingw32/share/man/man3/pcre2demo.3.gz and b/mingw32/share/man/man3/pcre2demo.3.gz differ diff --git a/mingw32/share/man/man3/pcre2jit.3.gz b/mingw32/share/man/man3/pcre2jit.3.gz index 3a5cecc7368..5774b3e549e 100644 Binary files a/mingw32/share/man/man3/pcre2jit.3.gz and b/mingw32/share/man/man3/pcre2jit.3.gz differ diff --git a/mingw32/share/man/man3/pcre2limits.3.gz b/mingw32/share/man/man3/pcre2limits.3.gz index dd7f0bde3fe..7fe735473f7 100644 Binary files a/mingw32/share/man/man3/pcre2limits.3.gz and 
b/mingw32/share/man/man3/pcre2limits.3.gz differ diff --git a/mingw32/share/man/man3/pcre2matching.3.gz b/mingw32/share/man/man3/pcre2matching.3.gz index e14317961ce..40e98bed373 100644 Binary files a/mingw32/share/man/man3/pcre2matching.3.gz and b/mingw32/share/man/man3/pcre2matching.3.gz differ diff --git a/mingw32/share/man/man3/pcre2partial.3.gz b/mingw32/share/man/man3/pcre2partial.3.gz index 14768729058..cbb76c1ce1b 100644 Binary files a/mingw32/share/man/man3/pcre2partial.3.gz and b/mingw32/share/man/man3/pcre2partial.3.gz differ diff --git a/mingw32/share/man/man3/pcre2pattern.3.gz b/mingw32/share/man/man3/pcre2pattern.3.gz index 46ad89b46b1..e24291911eb 100644 Binary files a/mingw32/share/man/man3/pcre2pattern.3.gz and b/mingw32/share/man/man3/pcre2pattern.3.gz differ diff --git a/mingw32/share/man/man3/pcre2perform.3.gz b/mingw32/share/man/man3/pcre2perform.3.gz index 745c1a602a1..9e11a4bf132 100644 Binary files a/mingw32/share/man/man3/pcre2perform.3.gz and b/mingw32/share/man/man3/pcre2perform.3.gz differ diff --git a/mingw32/share/man/man3/pcre2posix.3.gz b/mingw32/share/man/man3/pcre2posix.3.gz index c108ca4e276..41e335f1370 100644 Binary files a/mingw32/share/man/man3/pcre2posix.3.gz and b/mingw32/share/man/man3/pcre2posix.3.gz differ diff --git a/mingw32/share/man/man3/pcre2sample.3.gz b/mingw32/share/man/man3/pcre2sample.3.gz index 87b280f9a50..669f77cd328 100644 Binary files a/mingw32/share/man/man3/pcre2sample.3.gz and b/mingw32/share/man/man3/pcre2sample.3.gz differ diff --git a/mingw32/share/man/man3/pcre2serialize.3.gz b/mingw32/share/man/man3/pcre2serialize.3.gz index 709a1656f41..3069e1b3bc9 100644 Binary files a/mingw32/share/man/man3/pcre2serialize.3.gz and b/mingw32/share/man/man3/pcre2serialize.3.gz differ diff --git a/mingw32/share/man/man3/pcre2syntax.3.gz b/mingw32/share/man/man3/pcre2syntax.3.gz index 6c1976014f3..c9d5d878ed0 100644 Binary files a/mingw32/share/man/man3/pcre2syntax.3.gz and b/mingw32/share/man/man3/pcre2syntax.3.gz 
differ diff --git a/mingw32/share/man/man3/pcre2unicode.3.gz b/mingw32/share/man/man3/pcre2unicode.3.gz index 8d153f0dc78..ade2ccaa32d 100644 Binary files a/mingw32/share/man/man3/pcre2unicode.3.gz and b/mingw32/share/man/man3/pcre2unicode.3.gz differ diff --git a/mingw64/bin/libpcre2-16-0.dll b/mingw64/bin/libpcre2-16-0.dll index 1b03823b500..25772c2957c 100644 Binary files a/mingw64/bin/libpcre2-16-0.dll and b/mingw64/bin/libpcre2-16-0.dll differ diff --git a/mingw64/bin/libpcre2-32-0.dll b/mingw64/bin/libpcre2-32-0.dll index bcd2e4a1cce..cffacefc370 100644 Binary files a/mingw64/bin/libpcre2-32-0.dll and b/mingw64/bin/libpcre2-32-0.dll differ diff --git a/mingw64/bin/libpcre2-8-0.dll b/mingw64/bin/libpcre2-8-0.dll index a0a648dc5c2..d91db173757 100644 Binary files a/mingw64/bin/libpcre2-8-0.dll and b/mingw64/bin/libpcre2-8-0.dll differ diff --git a/mingw64/bin/libpcre2-posix-3.dll b/mingw64/bin/libpcre2-posix-3.dll index 631060d2de8..6c9a106ebab 100644 Binary files a/mingw64/bin/libpcre2-posix-3.dll and b/mingw64/bin/libpcre2-posix-3.dll differ diff --git a/mingw64/bin/pcre2-config b/mingw64/bin/pcre2-config index 75262e5dc3c..c7b81066222 100644 --- a/mingw64/bin/pcre2-config +++ b/mingw64/bin/pcre2-config @@ -66,7 +66,7 @@ while test $# -gt 0; do echo $exec_prefix ;; --version) - echo 10.44 + echo 10.45 ;; --cflags) if test ${prefix}/include != /usr/include ; then diff --git a/mingw64/bin/pcre2grep.exe b/mingw64/bin/pcre2grep.exe index e247d02c120..79988c94c12 100644 Binary files a/mingw64/bin/pcre2grep.exe and b/mingw64/bin/pcre2grep.exe differ diff --git a/mingw64/bin/pcre2test.exe b/mingw64/bin/pcre2test.exe index 2c6cc19f6e3..b0f9ebab631 100644 Binary files a/mingw64/bin/pcre2test.exe and b/mingw64/bin/pcre2test.exe differ diff --git a/mingw64/include/pcre2.h b/mingw64/include/pcre2.h index a322d9f2d56..061f3db0a76 100644 --- a/mingw64/include/pcre2.h +++ b/mingw64/include/pcre2.h @@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE. 
/* The current PCRE version information. */ #define PCRE2_MAJOR 10 -#define PCRE2_MINOR 44 +#define PCRE2_MINOR 45 #define PCRE2_PRERELEASE -#define PCRE2_DATE 2024-06-07 +#define PCRE2_DATE 2025-02-05 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. When building PCRE2, the appropriate @@ -143,6 +143,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTENDED_MORE 0x01000000u /* C */ #define PCRE2_LITERAL 0x02000000u /* C */ #define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */ +#define PCRE2_ALT_EXTENDED_CLASS 0x08000000u /* C */ /* An additional compile options word is available in the compile context. */ @@ -159,6 +160,10 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */ #define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ #define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */ +#define PCRE2_EXTRA_PYTHON_OCTAL 0x00002000u /* C */ +#define PCRE2_EXTRA_NO_BS0 0x00004000u /* C */ +#define PCRE2_EXTRA_NEVER_CALLOUT 0x00008000u /* C */ +#define PCRE2_EXTRA_TURKISH_CASING 0x00010000u /* C */ /* These are for pcre2_jit_compile(). */ @@ -166,6 +171,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_JIT_PARTIAL_SOFT 0x00000002u #define PCRE2_JIT_PARTIAL_HARD 0x00000004u #define PCRE2_JIT_INVALID_UTF 0x00000100u +#define PCRE2_JIT_TEST_ALLOC 0x00000200u /* These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and pcre2_substitute(). Some are allowed only for one of the functions, and in @@ -318,9 +324,25 @@ pcre2_pattern_convert(). 
*/ #define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN 195 #define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196 #define PCRE2_ERROR_TOO_MANY_CAPTURES 197 -#define PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED 198 +#define PCRE2_ERROR_MISSING_OCTAL_DIGIT 198 #define PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND 199 - +#define PCRE2_ERROR_MAX_VAR_LOOKBEHIND_EXCEEDED 200 +#define PCRE2_ERROR_PATTERN_COMPILED_SIZE_TOO_BIG 201 +#define PCRE2_ERROR_OVERSIZE_PYTHON_OCTAL 202 +#define PCRE2_ERROR_CALLOUT_CALLER_DISABLED 203 +#define PCRE2_ERROR_EXTRA_CASING_REQUIRES_UNICODE 204 +#define PCRE2_ERROR_TURKISH_CASING_REQUIRES_UTF 205 +#define PCRE2_ERROR_EXTRA_CASING_INCOMPATIBLE 206 +#define PCRE2_ERROR_ECLASS_NEST_TOO_DEEP 207 +#define PCRE2_ERROR_ECLASS_INVALID_OPERATOR 208 +#define PCRE2_ERROR_ECLASS_UNEXPECTED_OPERATOR 209 +#define PCRE2_ERROR_ECLASS_EXPECTED_OPERAND 210 +#define PCRE2_ERROR_ECLASS_MIXED_OPERATORS 211 +#define PCRE2_ERROR_ECLASS_HINT_SQUARE_BRACKET 212 +#define PCRE2_ERROR_PERL_ECLASS_UNEXPECTED_EXPR 213 +#define PCRE2_ERROR_PERL_ECLASS_EMPTY_EXPR 214 +#define PCRE2_ERROR_PERL_ECLASS_MISSING_CLOSE 215 +#define PCRE2_ERROR_PERL_ECLASS_UNEXPECTED_CHAR 216 /* "Expected" matching error codes: no match and partial match. */ @@ -407,6 +429,9 @@ released, the numbers must not be changed. */ #define PCRE2_ERROR_INTERNAL_DUPMATCH (-65) #define PCRE2_ERROR_DFA_UINVALID_UTF (-66) #define PCRE2_ERROR_INVALIDOFFSET (-67) +#define PCRE2_ERROR_JIT_UNSUPPORTED (-68) +#define PCRE2_ERROR_REPLACECASE (-69) +#define PCRE2_ERROR_TOOLARGEREPLACE (-70) /* Request types for pcre2_pattern_info() */ @@ -460,6 +485,30 @@ released, the numbers must not be changed. */ #define PCRE2_CONFIG_COMPILED_WIDTHS 14 #define PCRE2_CONFIG_TABLES_LENGTH 15 +/* Optimization directives for pcre2_set_optimize(). +For binary compatibility, only add to this list; do not renumber. 
*/ + +#define PCRE2_OPTIMIZATION_NONE 0 +#define PCRE2_OPTIMIZATION_FULL 1 + +#define PCRE2_AUTO_POSSESS 64 +#define PCRE2_AUTO_POSSESS_OFF 65 +#define PCRE2_DOTSTAR_ANCHOR 66 +#define PCRE2_DOTSTAR_ANCHOR_OFF 67 +#define PCRE2_START_OPTIMIZE 68 +#define PCRE2_START_OPTIMIZE_OFF 69 + +/* Types used in pcre2_set_substitute_case_callout(). + +PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the +callout to indicate that the case of the entire callout input should be +case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that +only the first character or glyph should be transformed to Unicode titlecase, +and the rest to lowercase. */ + +#define PCRE2_SUBSTITUTE_CASE_LOWER 1 +#define PCRE2_SUBSTITUTE_CASE_UPPER 2 +#define PCRE2_SUBSTITUTE_CASE_TITLE_FIRST 3 /* Types for code units in patterns and subject strings. */ @@ -613,7 +662,9 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ - int (*)(uint32_t, void *), void *); + int (*)(uint32_t, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_optimize(pcre2_compile_context *, uint32_t); #define PCRE2_MATCH_CONTEXT_FUNCTIONS \ PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \ @@ -628,6 +679,11 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_substitute_callout(pcre2_match_context *, \ int (*)(pcre2_substitute_callout_block *, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_substitute_case_callout(pcre2_match_context *, \ + PCRE2_SIZE (*)(PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE, int, \ + void *), \ + void *); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ @@ -740,6 +796,7 @@ PCRE2_EXP_DECL void 
PCRE2_CALL_CONVENTION \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **); + /* Functions for serializing / deserializing compiled patterns. */ #define PCRE2_SERIALIZE_FUNCTIONS \ @@ -907,7 +964,9 @@ pcre2_compile are called by application code. */ #define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) #define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) #define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) +#define pcre2_set_optimize PCRE2_SUFFIX(pcre2_set_optimize_) #define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_) +#define pcre2_set_substitute_case_callout PCRE2_SUFFIX(pcre2_set_substitute_case_callout_) #define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) #define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_) #define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_) diff --git a/mingw64/lib/libpcre2-16.a b/mingw64/lib/libpcre2-16.a index e82d717fbfc..54141065a68 100644 Binary files a/mingw64/lib/libpcre2-16.a and b/mingw64/lib/libpcre2-16.a differ diff --git a/mingw64/lib/libpcre2-16.dll.a b/mingw64/lib/libpcre2-16.dll.a index a62035d3ea2..b3d9d7b4803 100644 Binary files a/mingw64/lib/libpcre2-16.dll.a and b/mingw64/lib/libpcre2-16.dll.a differ diff --git a/mingw64/lib/libpcre2-32.a b/mingw64/lib/libpcre2-32.a index c74fdea7e11..38ad4522ea3 100644 Binary files a/mingw64/lib/libpcre2-32.a and b/mingw64/lib/libpcre2-32.a differ diff --git a/mingw64/lib/libpcre2-32.dll.a b/mingw64/lib/libpcre2-32.dll.a index 58bbbe6d956..d5575ac3a58 100644 Binary files a/mingw64/lib/libpcre2-32.dll.a and b/mingw64/lib/libpcre2-32.dll.a differ diff --git a/mingw64/lib/libpcre2-8.a b/mingw64/lib/libpcre2-8.a index ddd8f731c18..8ca1e5253c2 100644 Binary files a/mingw64/lib/libpcre2-8.a and b/mingw64/lib/libpcre2-8.a differ diff --git a/mingw64/lib/libpcre2-8.dll.a 
b/mingw64/lib/libpcre2-8.dll.a index 3c6511269df..6ee197190ae 100644 Binary files a/mingw64/lib/libpcre2-8.dll.a and b/mingw64/lib/libpcre2-8.dll.a differ diff --git a/mingw64/lib/libpcre2-posix.a b/mingw64/lib/libpcre2-posix.a index 94306676e6a..6d433a94549 100644 Binary files a/mingw64/lib/libpcre2-posix.a and b/mingw64/lib/libpcre2-posix.a differ diff --git a/mingw64/lib/libpcre2-posix.dll.a b/mingw64/lib/libpcre2-posix.dll.a index 03e4b09d7fe..c4f8afa71c6 100644 Binary files a/mingw64/lib/libpcre2-posix.dll.a and b/mingw64/lib/libpcre2-posix.dll.a differ diff --git a/mingw64/lib/pkgconfig/libpcre2-16.pc b/mingw64/lib/pkgconfig/libpcre2-16.pc index 677c6c14f8f..95266377fcc 100644 --- a/mingw64/lib/pkgconfig/libpcre2-16.pc +++ b/mingw64/lib/pkgconfig/libpcre2-16.pc @@ -7,7 +7,7 @@ includedir=${prefix}/include Name: libpcre2-16 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 16 bit character support -Version: 10.44 +Version: 10.45 Libs: -L${libdir} -lpcre2-16 Libs.private: Cflags: -I${includedir} diff --git a/mingw64/lib/pkgconfig/libpcre2-32.pc b/mingw64/lib/pkgconfig/libpcre2-32.pc index c99b4fc9c34..0061b3bbd6c 100644 --- a/mingw64/lib/pkgconfig/libpcre2-32.pc +++ b/mingw64/lib/pkgconfig/libpcre2-32.pc @@ -7,7 +7,7 @@ includedir=${prefix}/include Name: libpcre2-32 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 32 bit character support -Version: 10.44 +Version: 10.45 Libs: -L${libdir} -lpcre2-32 Libs.private: Cflags: -I${includedir} diff --git a/mingw64/lib/pkgconfig/libpcre2-8.pc b/mingw64/lib/pkgconfig/libpcre2-8.pc index 7d498889e9a..e50c4c1c310 100644 --- a/mingw64/lib/pkgconfig/libpcre2-8.pc +++ b/mingw64/lib/pkgconfig/libpcre2-8.pc @@ -7,7 +7,7 @@ includedir=${prefix}/include Name: libpcre2-8 Description: PCRE2 - Perl compatible regular expressions C library (2nd API) with 8 bit character support -Version: 10.44 +Version: 10.45 Libs: -L${libdir} -lpcre2-8 Libs.private: Cflags: 
-I${includedir} diff --git a/mingw64/lib/pkgconfig/libpcre2-posix.pc b/mingw64/lib/pkgconfig/libpcre2-posix.pc index b44574690b2..75705fb8eeb 100644 --- a/mingw64/lib/pkgconfig/libpcre2-posix.pc +++ b/mingw64/lib/pkgconfig/libpcre2-posix.pc @@ -7,7 +7,7 @@ includedir=${prefix}/include Name: libpcre2-posix Description: Posix compatible interface to libpcre2-8 -Version: 10.44 +Version: 10.45 Libs: -L${libdir} -lpcre2-posix Cflags: -I${includedir} -DPCRE2POSIX_SHARED Requires.private: libpcre2-8 diff --git a/mingw64/share/doc/pcre2/AUTHORS b/mingw64/share/doc/pcre2/AUTHORS deleted file mode 100644 index 9669f7755ad..00000000000 --- a/mingw64/share/doc/pcre2/AUTHORS +++ /dev/null @@ -1,36 +0,0 @@ -THE MAIN PCRE2 LIBRARY CODE ---------------------------- - -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com - -Retired from University of Cambridge Computing Service, -Cambridge, England. - -Copyright (c) 1997-2024 University of Cambridge -All rights reserved - - -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Emain domain: freemail.hu - -Copyright(c) 2010-2024 Zoltan Herczeg -All rights reserved. - - -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Emain domain: freemail.hu - -Copyright(c) 2009-2024 Zoltan Herczeg -All rights reserved. - -#### diff --git a/mingw64/share/doc/pcre2/AUTHORS.md b/mingw64/share/doc/pcre2/AUTHORS.md new file mode 100644 index 00000000000..708fc2325ce --- /dev/null +++ b/mingw64/share/doc/pcre2/AUTHORS.md @@ -0,0 +1,200 @@ +PCRE2 Authorship and Contributors +================================= + +COPYRIGHT +--------- + +Please see the file [LICENCE](./LICENCE.md) in the PCRE2 distribution for +copyright details. + + +MAINTAINERS +----------- + +The PCRE and PCRE2 libraries were authored and maintained by Philip Hazel. 
+ +Since 2024, the contributors with administrator access to the project are now +Nicholas Wilson and Zoltán Herczeg. See the file [SECURITY](./SECURITY.md) for +GPG keys. + +Both administrators are volunteers acting in a personal capacity. + + + + + + + + + + + + + + + + + + +
    NameRole
    + + Nicholas Wilson
    + `nicholas@nicholaswilson.me.uk`
    + Currently of Microsoft Research Cambridge, UK + +
    + + * General project administration & maintenance + * Release management + * Code maintenance + +
    + + Zoltán Herczeg
    + `hzmester@freemail.hu`
    + Currently of the University of Szeged, Hungary + +
    + + * Code maintenance + * Ownership of `sljit` and PCRE2's JIT + +
    + + +CONTRIBUTORS +------------ + +Many others have participated and contributed to PCRE2 over its history. + +The maintainers are grateful for all contributions and participation over the +years. We apologise for any names we have forgotten. + +We are especially grateful to Philip Hazel, creator of PCRE and PCRE2, and +maintainer from 1997 to 2024. + +All names listed alphabetically. + +### Contributors to PCRE2 + +This list includes names up until the PCRE2 10.44 release. New names will be +added from the Git history on each release. + + Scott Bell + Carlo Marcelo Arenas Belón + Edward Betts + Jan-Willem Blokland + Ross Burton + Dmitry Cherniachenko + Alexey Chupahin + Jessica Clarke + Alejandro Colomar + Jeremie Courreges-Anglas + Addison Crump + Alex Dowad + Daniel Engberg + Daniel Richard G + David Gaussmann + Andrey Gorbachev + Jordan Griege + Jason Hood + Bumsu Hyeon + Roy Ivy + Martin Joerg + Guillem Jover + Ralf Junker + Ayesh Karunaratne + Michael Kaufmann + Yunho Kim + Joshua Kinard + David Korczynski + Uwe Korn + Jonas Kvinge + Kristian Larsson + Kai Lu + Behzod Mansurov + B. Scott Michel + Nathan Moinvaziri + Mike Munday + Marc Mutz + Fabio Pagani + Christian Persch + Tristan Ross + William A Rowe Jr + David Seifert + Yaakov Selkowitz + Rich Siegel + Karl Skomski + Maciej Sroczyński + Wolfgang Stöggl + Thomas Tempelmann + Greg Thain + Lucas Trzesniewski + Theodore Tsirpanis + Matthew Vernon + Rémi Verschelde + Thomas Voss + Ezekiel Warren + Carl Weaver + Chris Wilson + Amin Yahyaabadi + Joe Zhang + +### Contributors to PCRE1 + +These people contributed either by sending patches or reporting serious issues. 
+ + Irfan Adilovic + Alexander Barkov + Daniel Bergström + David Burgess + Ross Burton + David Byron + Fred Cox + Christian Ehrlicher + Tom Fortmann + Lionel Fourquaux + Mike Frysinger + Daniel Richard G + Dair Gran + "Graycode" (Red Hat Product Security) + Viktor Griph + Wen Guanxing + Robin Houston + Martin Jerabek + Peter Kankowski + Stephen Kelly + Yunho Kim + Joshua Kinard + Carsten Klein + Evgeny Kotkov + Ronald Landheer-Cieslak + Alan Lehotsky + Dmitry V. Levin + Nuno Lopes + Kai Lu + Giuseppe Maxia + Dan Mooney + Marc Mutz + Markus Oberhumer + Sheri Pierce + Petr Pisar + Ari Pollak + Bob Rossi + Ruiger Rill + Michael Shigorin + Rich Siegel + Craig Silverstein (C++ wrapper) + Karl Skomski + Paul Sokolovsky + Stan Switzer + Ian Taylor + Mark Tetrode + Jeff Trawick + Steven Van Ingelgem + Lawrence Velazquez + Jiong Wang + Stefan Weber + Chris Wilson + +Thanks go to Jeffrey Friedl for testing and debugging assistance. diff --git a/mingw64/share/doc/pcre2/ChangeLog b/mingw64/share/doc/pcre2/ChangeLog index ea228c193f7..5217d078599 100644 --- a/mingw64/share/doc/pcre2/ChangeLog +++ b/mingw64/share/doc/pcre2/ChangeLog @@ -4,6 +4,194 @@ Change Log for PCRE2 Before the move to GitHub, this was the only record of changes to PCRE2. Now there is also the log of commit messages. +Internal changes which are not visible to clients of the library are mostly not +listed here. + +Version 10.45 05-February-2025 +------------------------------ + +1. (#418) Change 6 of 10.44 broke 32-bit tests because pcre2test's reporting of +memory size was changed to the entire compiled data block, instead of just the +pattern and tables data, so as to align with the new length restriction. +Because the block's header contains pointers, this meant the pcre2test output +was different in 32-bit mode. A patch by Carlo reverts to the previous state +and makes sure that any limit set by pcre2_set_max_pattern_compiled_length() +also avoids the internal struct overhead. + +2. 
(#416, #622) Updates to build.zig. + +3. (#427, et al.) Various fixes to pacify static analyzers. + +4. (#428) Add --posix-pattern-file to pcre2grep to allow processing of empty +patterns through the -f option, as well as patterns that end in space +characters, for compatibility with other grep tools. + +5. (4fa5b8bd) Fix a bug in the fuzz support quantifier-limiting code. It ignores +strings of more than 5 digits because they are necessarily numbers greater than +65535, the largest legal quantifier. However, it wasn't ignoring non-significant +leading zeros. + +6. (6d82f0cd) The case-independent processing of the letter-matching Unicode +properties Ll, Lt, and Lu has been changed to match Perl (which changed a while +ago). When caseless matching is in force, all three of these properties are now +treated as Lc (cased letter). + +7. (#433) The pcre2_jit_compile() function was updated by the addition of a new +option PCRE2_JIT_TEST_ALLOC which, if called with a NULL first argument, tests +not only the availability of JIT, but also its ability to allocate executable +memory. Update pcre2test to use this support to extend the -C option. + +8. (75b1025a) The code for parsing Unicode property descriptions for \p and \P +has been changed as follows: + + . White space etc. before ^ in a negated value such as \p{ ^L } was not being + ignored. + + . The code wouldn't have worked if PCRE2 was compiled for UTF-8 support + within an EBCDIC environment. Possibly nobody does this any more, but it + should now work. + + . The documentation of the syntax of what can follow \p and \P has been + updated. + +9. (1c24ba01) There was an error in the table of lengths for parsed items for +the OPTIONS item, but fortuitously it could never have actually bitten. While +fixing this, some other code that could never be obeyed was discovered and +removed. + +10. 
(674b6640) Removed some incorrect optimization code from DFA matching that +has been there since PCRE1, but has just been found to cause a no match return +instead of a partial match in some cases. It involves partial matching when (*F) +is present so is unlikely to have actually affected anyone. + +11. (b0f4ac17) Tidy the wording and formatting of some pcre2test error messages +concerned with bad modifiers. Also restrict single-letter modifier sequences to +the first item in a modifier list, as documented and always intended. + +12. (1415565c) An iterator at the end of many assertions can always be +auto-possessified, but not at the end of variable-length lookbehinds. There was +a bug in the code that checks for such a lookbehind; it was looking only at the +first branch, which is wrong because some branches can be fixed length when +others are not, for example (?<=AB|CD?). Now all branches are checked for +variability. + +13. (ead08288) Matching with pcre2_match() could give an incorrect result if a +variable-length lookbehind was used as the condition in a conditional group. +The condition could erroneously be treated as true if a branch matched but +overran the current position. This bug was in the interpreter only; matching +with JIT was correct. + +14. (#443) Split out the sljit sub-project into a "Git submodule". Git users +must now run `git submodule init; git submodule update` after a Git checkout, or +the build will fail due to missing files in deps/sljit. + +15. (#441) Add a new error code (PCRE2_ERROR_JIT_UNSUPPORTED) which is yielded +for unsupported jit features. + +16. (#444) Fix bug in 'first code unit' and 'last code unit' optimization +combined with lookahead assertions. + +17. (#445, #447, #449, #451, #452, #459, #563) Add a new feature called scan +substring. This feature is a new type of assertion which matches the content of +a capturing block to a sub-pattern. + +18. 
(#450) Improvements to 'first code unit' / 'starting code units' +optimisation. + +19. (#455) Many, many improvements to the JIT compiler. + +20. Item 43 of 10.43 was incomplete because it addressed only \z and not \Z, +which was still misbehaving when matching fragments inside invalid UTF strings. + +21. (d29e7290) Octal escapes of the form \045 or \111 were not being recognized +in substitution strings, and if encountered gave an error, though the \o{...} +form was recognized. This bug is now fixed. + +22. (#463, #487) Fix 1 byte out-of-bounds read when parsing malformed limits +(e.g. LIMIT_HEAP) + +23. Many improvements to test infrastructure. Many more platforms and +configurations are now run in Continuous Integration, and all the platforms now +run the full test suite, rather than a partial subset. + +24. (#475) Implement title casing in substitution strings using Perl syntax. + +25. (#478, #504) Disallow \x if not followed by { or a hex digit. + +26. (#473) Implements Python-style backrefs in substitutions. + +27. (#472) Fix error reporting for certain over-large octal escapes. + +28. (#482) Fix parsing of named captures in replacement strings, allowing +non-ASCII capture names to be used. + +29. (#477, #474, #488, #494, #496, #506, #508, #511, #518, #524, #540) Many +improvements to parsing and optimising of character classes. + +30. (#483, #498) Add support for \g and $ to replacement strings. + +31. (#470) Add option flags PCRE2_EXTRA_NO_BS0 and PCRE2_EXTRA_PYTHON_OCTAL. + +32. (#471) Add new API function pcre2_set_optimize() for controlling which +optimizations are enabled. + +33. (#491) Adds $& $` $' and $_ to substitution replacements, as well as +interpreting \b and \v as characters. + +34. (#499) Add option PCRE2_EXTRA_NEVER_CALLOUT to disable callouts. + +35. (#503, #513) Update Unicode support to UCD 16. + +36. 
(#512, #618, #638) Add new function pcre2_set_substitute_case_callout() to +allow clients to provide a custom callback with locale-aware case +transformation. + +37. (#516) Fix case-insensitive matching of backreferences when using the +PCRE2_EXTRA_CASELESS_RESTRICT option. + +38. (#519) In pcre2grep, add $& as an alias for $0 + +39. (c9bf8339, #534) Updated perltest.sh to enable locale setting. + +40. (#521) Add support for Turkish I casefolding, using new options +PCRE2_EXTRA_TURKISH_CASING, and added pre-pattern flags (*TURKISH_CASING) and +(*CASELESS_RESTRICT). + +41. (#523, #546, #547) Add support for UTS#18 compatible character classes, +using the new option PCRE2_ALT_EXTENDED_CLASS. This adds '[' as a metacharacter +within character classes and the operators '&&', '--' and '~~', allowing +subtractions and intersections of character classes to be easily expressed. + +42. (#553, #586, #596, #597) Add support for Perl-style extended character +classes, using the syntax (?[...]). This also allows expressing subtractions and +intersections of character classes, but using a different syntax to UTS#18. + +43. (#554) Fixed a bug in JIT affecting greedy bounded repeats. The upper limit +of repeats inside a repeated bracket might be incorrectly checked. + +44. (#556) Fixed a bug in JIT affecting caseful matching of backreferences. When +utf is disabled, and dupnames is enabled, caseless matching was used even +if caseful matching was needed. + +45. (f34fc0a3) Fixed a bug in pcre2grep reported by Alejandro Colomar + (GitHub issue #577). In certain cases, when lines of above and +below context were contiguous, a separator line was incorrectly being inserted. + +46. (#594) Fix a small (one/two byte) out-of-bounds read on invalid UTF-8 input +in pcre2grep. + +47. (#370) Fix the INSTALL_MSVC_PDB CMake flag. + +48. (#366) Install cmake files in prefix/lib/cmake/pcre2 rather than +prefix/cmake. The new CMake flag PCRE2_INSTALL_CMAKEDIR allows customising this +location. 
+ +49. (#624, #626, #628, #632, #639, #641) Reduce code size of generated JIT code +for repeated character classes. + +50. (#623) Update the Bazel build files. + + Version 10.44 07-June-2024 -------------------------- diff --git a/mingw64/share/doc/pcre2/NEWS b/mingw64/share/doc/pcre2/NEWS index 5f8dde35406..4b5ec1e5103 100644 --- a/mingw64/share/doc/pcre2/NEWS +++ b/mingw64/share/doc/pcre2/NEWS @@ -1,6 +1,92 @@ News about PCRE2 releases ------------------------- +Version 10.45 05-February-2025 +------------------------------ + +This is a comparatively large release, incorporating new features, some +bugfixes, and a few changes with slight backwards compatibility implications. +Please see the ChangeLog and Git log for further details. + +Only changes to behaviour, changes to the API, and major changes to the pattern +syntax are described here. + +This release is the first to be available as a (signed) Git tag, or +alternatively as a (signed) tarball of the Git tag. + +This is also the first release to be made by the new maintainers of PCRE2, and +we would like to thank Philip Hazel, creator and maintainer of PCRE and PCRE2. + +* (Git change) The sljit project has been split out into a separate Git + repository. Git users must now run `git submodule init; git submodule update` + after a Git checkout. + +* (Behaviour change) Update Unicode support to UCD 16. + +* (Match behaviour change) Case-insensitive matching of Unicode properties + Ll, Lt, and Lu has been changed to match Perl. Previously, /\p{Ll}/i would + match only lower-case characters (even though case-insensitive matching was + specified). This also affects case-insensitive matching of POSIX classes such + as [:lower:]. + +* (Minor match behaviour change) Case-insensitive matching of backreferences now + respects the PCRE2_EXTRA_CASELESS_RESTRICT option. 
+ +* (Minor pattern syntax change) Parsing of the \x escape is stricter, and is + no longer parsed as an escape for the NUL character if not followed by '{' or + a hexadecimal digit. Use \x00 instead. + +* (Major new feature) Add a new feature called scan substring. This is a new + type of assertion which matches the content of a capturing block to a + sub-pattern. + + Example: to find a word that contains the rare (in English) sequence of + letters "rh" not at the start: + + \b(\w++)(*scan_substring:(1).+rh) + + The first group captures a word which is then scanned by the + (*scan_substring:(1) ... ) assertion, which tests whether the pattern ".+rh" + matches the capture group "(1)". + +* (Major new feature) Add support for UTS#18 compatible character classes, + using the new option PCRE2_ALT_EXTENDED_CLASS. This adds '[' as a + metacharacter within character classes and the operators '&&', '--' and '~~', + allowing subtractions and intersections of character classes to be easily + expressed. + + Example: to match Thai or Greek letters (but not letters or other characters + in those scripts), use [\p{L}&&[\p{Thai}||\p{Greek}]]. + +* (Major new feature) Add support for Perl-style extended character classes, + using the syntax (?[...]). This also allows expressing subtractions and + intersections of character classes, but using a different syntax to UTS#18. + + Example: to match Thai or Greek letters (but not letters or other characters + in those scripts), use (?[\p{L} & (\p{Thai} + \p{Greek})]). + +* (Minor feature) Significant improvements to the character class match engine. + Compiled character classes are now more compact, and have faster matching + for large or complex character sets, using binary search through the set. + +* JIT compilation now fails with the new error code PCRE2_ERROR_JIT_UNSUPPORTED + for patterns which use features not supported by the JIT compiler. 
+ +* (Minor feature) New options PCRE2_EXTRA_NO_BS0 (disallow \0 as an escape for + the NUL character); PCRE2_EXTRA_PYTHON_OCTAL (use Python disambiguation rules + for deciding whether \12 is a backreference or an octal escape); + PCRE2_EXTRA_NEVER_CALLOUT (disable callout syntax entirely); + PCRE2_EXTRA_TURKISH_CASING (use Turkish rules for case-insensitive matching). + +* (Minor feature) Add new API function pcre2_set_optimize() for controlling + which optimizations are enabled. + +* (Minor new features) A variety of extensions have been made to + pcre2_substitute() and its syntax for replacement strings. These now support: + \123 octal escapes; titlecasing \u\L; \1 backreferences; \g<1> and $ + backreferences; $& $` $' and $_; new function + pcre2_set_substitute_case_callout() to allow locale-aware case transformation. + Version 10.44 07-June-2024 -------------------------- @@ -13,7 +99,7 @@ increased to 128. Some auxiliary files for building under VMS are added. Version 10.43 16-February-2024 ------------------------------ -There are quite a lot of changes in this release (see ChangeLog and git log for +There are quite a lot of changes in this release (see ChangeLog and Git log for a list). Those that are not bugfixes or code tidies are: * The JIT code no longer supports ARMv5 architecture. @@ -52,7 +138,7 @@ a list). Those that are not bugfixes or code tidies are: matches the "fullwidth" versions of hex digits. PCRE2_EXTRA_ASCII_DIGIT can be used to keep it ASCII only. -* Make PCRE2_UCP the default in UTF mode in pcre2grep and add -no_ucp, +* Make PCRE2_UCP the default in UTF mode in pcre2grep and add --no-ucp, --case-restrict and --posix-digit. * Add --group-separator and --no-group-separator to pcre2grep. diff --git a/mingw64/share/doc/pcre2/README b/mingw64/share/doc/pcre2/README index dab5e94210b..5a50f7f11b5 100644 --- a/mingw64/share/doc/pcre2/README +++ b/mingw64/share/doc/pcre2/README @@ -385,7 +385,7 @@ library. 
They are also documented in the pcre2build man page. If this is done, when pcre2test's input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. - Note that libreadline is GPL-licenced, so if you distribute a binary of + Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking with libedit (which has a BSD licence) instead. @@ -411,20 +411,19 @@ library. They are also documented in the pcre2build man page. Instead of %td or %zu, %lu is used, with a cast for size_t values. . There is a special option called --enable-fuzz-support for use by people who - want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit - library. If set, it causes an extra library called libpcre2-fuzzsupport.a to - be built, but not installed. This contains a single function called - LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the - length of the string. When called, this function tries to compile the string - as a pattern, and if that succeeds, to match it. This is done both with no - options and with some random options bits that are generated from the string. - Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to - be created. This is normally run under valgrind or used when PCRE2 is - compiled with address sanitizing enabled. It calls the fuzzing function and - outputs information about what it is doing. The input strings are specified - by arguments: if an argument starts with "=" the rest of it is a literal - input string. Otherwise, it is assumed to be a file name, and the contents - of the file are the test string. + want to run fuzzing tests on PCRE2. If set, it causes an extra library + called libpcre2-fuzzsupport.a to be built, but not installed. 
This contains + a single function called LLVMFuzzerTestOneInput() whose arguments are a + pointer to a string and the length of the string. When called, this function + tries to compile the string as a pattern, and if that succeeds, to match + it. This is done both with no options and with some random options bits that + are generated from the string. Setting --enable-fuzz-support also causes an + executable called pcre2fuzzcheck-{8,16,32} to be created. This is normally + run under valgrind or used when PCRE2 is compiled with address sanitizing + enabled. It calls the fuzzing function and outputs information about what it + is doing. The input strings are specified by arguments: if an argument + starts with "=" the rest of it is a literal input string. Otherwise, it is + assumed to be a file name, and the contents of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for @@ -510,6 +509,7 @@ system. The following are installed (file names are all relative to the LICENCE NEWS README + SECURITY pcre2.txt (a concatenation of the man(3) pages) pcre2test.txt the pcre2test man page pcre2grep.txt the pcre2grep man page @@ -607,8 +607,9 @@ zip formats. The command "make distcheck" does the same, but then does a trial build of the new distribution to ensure that it works. If you have modified any of the man page sources in the doc directory, you -should first run the PrepareRelease script before making a distribution. This -script creates the .txt and HTML forms of the documentation from the man pages. +should first run the maint/PrepareRelease script before making a distribution. +This script creates the .txt and HTML forms of the documentation from the man +pages. Testing PCRE2 @@ -822,37 +823,38 @@ The distribution should contain the files listed below. 
ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_chkdint.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_ucptables.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) + src/pcre2posix.c ) + src/pcre2_auto_possess.c ) + src/pcre2_chkdint.c ) + src/pcre2_compile.c ) + src/pcre2_compile_class.c ) + src/pcre2_config.c ) + src/pcre2_context.c ) + src/pcre2_convert.c ) + src/pcre2_dfa_match.c ) + src/pcre2_error.c ) + src/pcre2_extuni.c ) + src/pcre2_find_bracket.c ) + src/pcre2_jit_compile.c ) + src/pcre2_jit_match.c ) sources for the functions in the library, + src/pcre2_jit_misc.c ) and some internal functions that they use + src/pcre2_maketables.c ) + src/pcre2_match.c ) + src/pcre2_match_data.c ) + src/pcre2_newline.c ) + src/pcre2_ord2utf.c ) + src/pcre2_pattern_info.c ) + src/pcre2_script_run.c ) + src/pcre2_serialize.c ) + src/pcre2_string_utils.c ) + src/pcre2_study.c ) + src/pcre2_substitute.c ) + src/pcre2_substring.c ) + src/pcre2_tables.c ) + src/pcre2_ucd.c ) + src/pcre2_ucptables.c ) + src/pcre2_valid_utf.c ) + src/pcre2_xclass.c ) src/pcre2_printint.c debugging function that is used by pcre2test, src/pcre2_fuzzsupport.c function for (optional) fuzzing support @@ -860,13 
+862,16 @@ The distribution should contain the files listed below. src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API + src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header + src/pcre2_jit_char_inc.h header used by JIT src/pcre2_jit_neon_inc.h header used by JIT src/pcre2_jit_simd_inc.h header used by JIT src/pcre2_ucp.h header for Unicode property handling + src/pcre2_util.h header for internal utils - sljit/* source files for the JIT compiler + deps/sljit/sljit_src/* source files for the JIT compiler (B) Source files for programs that use PCRE2: @@ -878,48 +883,49 @@ The distribution should contain the files listed below. (C) Auxiliary files: - 132html script to turn "man" pages into HTML - AUTHORS information about the author of PCRE2 + AUTHORS.md information about the authors of PCRE2 ChangeLog log of changes to the code - CleanTxt script to clean nroff output for txt man pages - Detrail script to remove trailing spaces HACKING some notes about the internals of PCRE2 INSTALL generic installation instructions - LICENCE conditions for the use of PCRE2 + LICENCE.md conditions for the use of PCRE2 COPYING the same, using GNU's standard name + SECURITY.md information on reporting vulnerabilities Makefile.in ) template for Unix Makefile, which is built by ) "configure" Makefile.am ) the automake input that was used to create ) Makefile.in NEWS important changes in this release NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools - PrepareRelease script to make preparations for "make dist" README this file RunTest a Unix shell script for running tests RunGrepTest a Unix shell script for pcre2grep tests + RunTest.bat a Windows batch file for running tests + RunGrepTest.bat a Windows batch file for pcre2grep tests aclocal.m4 m4 
macros (generated by "aclocal") - config.guess ) files used by libtool, - config.sub ) used only when building a shared library + m4/* m4 macros (used by autoconf) configure a configuring shell script (built by autoconf) configure.ac ) the autoconf input that was used to build ) "configure" and config.h - depcomp ) script to find program dependencies, generated by - ) automake doc/*.3 man page sources for PCRE2 doc/*.1 man page sources for pcre2grep and pcre2test - doc/index.html.src the base HTML page doc/html/* HTML documentation doc/pcre2.txt plain text version of the man pages + doc/pcre2-config.txt plain text documentation of pcre2-config script + doc/pcre2grep.txt plain text documentation of grep utility program doc/pcre2test.txt plain text documentation of test program - install-sh a shell script for installing files libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config - ltmain.sh file used to build a libtool script - missing ) common stub for a few missing GNU programs while - ) installing, generated by automake - mkinstalldirs script for making install directories + ar-lib ) + config.guess ) + config.sub ) + depcomp ) helper tools generated by libtool and + compile ) automake, used internally by ./configure + install-sh ) + ltmain.sh ) + missing ) + test-driver ) perltest.sh Script for running a Perl test program pcre2-config.in source of script which retains PCRE2 information testdata/testinput* test data for main library tests @@ -927,12 +933,13 @@ The distribution should contain the files listed below. 
testdata/grep* input and output for pcre2grep tests testdata/* other supporting test files -(D) Auxiliary files for cmake support +(D) Auxiliary files for CMake support cmake/COPYING-CMAKE-SCRIPTS - cmake/FindPackageHandleStandardArgs.cmake cmake/FindEditline.cmake cmake/FindReadline.cmake + cmake/pcre2-config-version.cmake.in + cmake/pcre2-config.cmake.in CMakeLists.txt config-cmake.h.in @@ -943,14 +950,21 @@ The distribution should contain the files listed below. src/config.h.generic ) a version of config.h for use in non-"configure" ) environments -(F) Auxiliary files for building PCRE2 under OpenVMS +(F) Auxiliary files for building PCRE2 using other build systems + + BUILD.bazel ) + MODULE.bazel ) files used by the Bazel build system + WORKSPACE.bazel ) + build.zig file used by zig's build system + +(G) Auxiliary files for building PCRE2 under OpenVMS vms/configure.com ) vms/openvms_readme.txt ) These files were contributed by a PCRE2 user. vms/pcre2.h_patch ) vms/stdint.h ) -Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com -Last updated: 15 April 2024 +============================== +Last updated: 18 December 2024 +============================== + diff --git a/mingw64/share/doc/pcre2/SECURITY.md b/mingw64/share/doc/pcre2/SECURITY.md new file mode 100644 index 00000000000..1e3a05b9aef --- /dev/null +++ b/mingw64/share/doc/pcre2/SECURITY.md @@ -0,0 +1,44 @@ +# Security policies + +## Release security + +The PCRE2 project provides source-only releases, with no binaries. + +These source releases can be downloaded from the +[GitHub Releases](https://github.com/PCRE2Project/pcre2/releases) page. Each +release file is GPG-signed. 
+ +* Releases up to and including 10.44 are signed by Philip Hazel (GPG key: + 45F68D54BBE23FB3039B46E59766E084FB0F43D8) +* Releases from 10.45 onwards will be signed by Nicholas Wilson (GPG key: + A95536204A3BB489715231282A98E77EB6F24CA8, cross-signed by Philip + Hazel's key for release continuity) + +From releases 10.45 onwards, the source code will additionally be provided via +Git checkout of the (GPG-signed) release tag. + +Please contact the maintainers for any queries about release integrity or the +project's supply-chain. + +## Reporting vulnerabilities + +The PCRE2 project prioritises security. We appreciate third-party testing and +security research, and would be grateful if you could responsibly disclose your +findings to us. We will make every effort to acknowledge your contributions. + +To report a security issue, please use the GitHub Security Advisory +["Report a Vulnerability"](https://github.com/PCRE2Project/pcre2/security/advisories/new) +tab. (Alternatively, if you prefer you may send a GPG-encrypted email to one of +the maintainers.) + +### Timeline + +As a very small volunteer team, we cannot guarantee rapid response, but would +aim to respond within 1 week, or perhaps 2 during holidays. + +### Response procedure + +PCRE2 has never previously made a rapid or embargoed release in response to a +security incident. We would work with security managers from trusted downstream +distributors, such as major Linux distributions, before disclosing the +vulnerability publicly. diff --git a/mingw64/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt b/mingw64/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt index 851976ae238..bb687f7d040 100644 --- a/mingw64/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt +++ b/mingw64/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt @@ -105,6 +105,7 @@ example. pcre2_chkdint.c pcre2_chartables.c pcre2_compile.c + pcre2_compile_class.c pcre2_config.c pcre2_context.c pcre2_convert.c @@ -138,7 +139,7 @@ example. 
Note that you must compile pcre2_jit_compile.c, even if you have not defined SUPPORT_JIT in src/config.h, because when JIT support is not configured, dummy functions are compiled. When JIT support IS configured, - pcre2_jit_compile.c #includes other files from the sljit subdirectory, + pcre2_jit_compile.c #includes other files from the sljit dependency, all of whose names begin with "sljit". It also #includes src/pcre2_jit_match.c and src/pcre2_jit_misc.c, so you should not compile those yourself. @@ -301,56 +302,66 @@ Borland, Msys, MinGW, NMake, and Unix. If possible, use short paths with no spaces in the names for your CMake installation and your PCRE2 source and build directories. -The following instructions were contributed by a PCRE1 user, but they should -also work for PCRE2. If they are not followed exactly, errors may occur. In the -event that errors do occur, it is recommended that you delete the CMake cache -before attempting to repeat the CMake build process. In the CMake GUI, the -cache can be deleted by selecting "File > Delete Cache". +If you are using CMake and encounter errors, deleting the CMake cache and +restarting from a fresh build may fix the error. In the CMake GUI, the cache can +be deleted by selecting "File > Delete Cache"; or the file "CMakeCache.txt" in +the build directory can be deleted. -1. Install the latest CMake version available from http://www.cmake.org/, and - ensure that cmake\bin is on your path. +1. Install the latest CMake version available from http://www.cmake.org/, and + ensure that cmake\bin is on your path. -2. Unzip (retaining folder structure) the PCRE2 source tree into a source - directory such as C:\pcre2. You should ensure your local date and time - is not earlier than the file dates in your source dir if the release is - very new. +2. Unzip (retaining folder structure) the PCRE2 source tree into a source + directory such as C:\pcre2.
You should ensure your local date and time + is not earlier than the file dates in your source dir if the release is + very new. -3. Create a new, empty build directory, preferably a subdirectory of the - source dir. For example, C:\pcre2\pcre2-xx\build. +3. Create a new, empty build directory, preferably a subdirectory of the + source dir. For example, C:\pcre2\pcre2-xx\build. -4. Run cmake-gui from the Shell environment of your build tool, for example, - Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try - to start Cmake from the Windows Start menu, as this can lead to errors. +4. Run CMake. -5. Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and - build directories, respectively. + - Using the CLI, simply run `cmake ..` inside the `build/` directory. You can + use the `ccmake` ncurses GUI to select and configure PCRE2 features. -6. Hit the "Configure" button. + - Using the CMake GUI: -7. Select the particular IDE / build tool that you are using (Visual - Studio, MSYS makefiles, MinGW makefiles, etc.) + a) Run cmake-gui from the Shell environment of your build tool, for + example, Msys for Msys/MinGW or Visual Studio Command Prompt for + VC/VC++. -8. The GUI will then list several configuration options. This is where - you can disable Unicode support or select other PCRE2 optional features. + b) Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and + build directories, respectively. -9. Hit "Configure" again. The adjacent "Generate" button should now be - active. + c) Press the "Configure" button. -10. Hit "Generate". + d) Select the particular IDE / build tool that you are using (Visual + Studio, MSYS makefiles, MinGW makefiles, etc.) -11. The build directory should now contain a usable build system, be it a - solution file for Visual Studio, makefiles for MinGW, etc. Exit from - cmake-gui and use the generated build system with your compiler or IDE. 
- E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 - solution, select the desired configuration (Debug, or Release, etc.) and - build the ALL_BUILD project. + e) The GUI will then list several configuration options. This is where + you can disable Unicode support or select other PCRE2 optional features. -12. If during configuration with cmake-gui you've elected to build the test - programs, you can execute them by building the test project. E.g., for - MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The - most recent build configuration is targeted by the tests. A summary of - test results is presented. Complete test output is subsequently - available for review in Testing\Temporary under your build dir. + f) Press "Configure" again. The adjacent "Generate" button should now be + active. + + g) Press "Generate". + +5. The build directory should now contain a usable build system, be it a + solution file for Visual Studio, makefiles for MinGW, etc. Exit from + cmake-gui and use the generated build system with your compiler or IDE. + E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 + solution, select the desired configuration (Debug, or Release, etc.) and + build the ALL_BUILD project. + + Regardless of build system used, `cmake --build .` will build it. + +6. If during configuration with cmake-gui you've elected to build the test + programs, you can execute them by building the test project. E.g., for + MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The + most recent build configuration is targeted by the tests. A summary of + test results is presented. Complete test output is subsequently + available for review in Testing\Temporary under your build dir. + + Regardless of build system used, `ctest` will run the tests. BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO @@ -425,6 +436,7 @@ OpenVMS. They are in the "vms" directory in the distribution tarball. 
Please read the file called vms/openvms_readme.txt. The pcre2test and pcre2grep programs contain some VMS-specific code. -=========================== -Last Updated: 16 April 2024 -=========================== +============================== +Last updated: 26 December 2024 +============================== + diff --git a/mingw64/share/doc/pcre2/html/README.txt b/mingw64/share/doc/pcre2/html/README.txt index dab5e94210b..5a50f7f11b5 100644 --- a/mingw64/share/doc/pcre2/html/README.txt +++ b/mingw64/share/doc/pcre2/html/README.txt @@ -385,7 +385,7 @@ library. They are also documented in the pcre2build man page. If this is done, when pcre2test's input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. - Note that libreadline is GPL-licenced, so if you distribute a binary of + Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking with libedit (which has a BSD licence) instead. @@ -411,20 +411,19 @@ library. They are also documented in the pcre2build man page. Instead of %td or %zu, %lu is used, with a cast for size_t values. . There is a special option called --enable-fuzz-support for use by people who - want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit - library. If set, it causes an extra library called libpcre2-fuzzsupport.a to - be built, but not installed. This contains a single function called - LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the - length of the string. When called, this function tries to compile the string - as a pattern, and if that succeeds, to match it. This is done both with no - options and with some random options bits that are generated from the string. - Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to - be created. 
This is normally run under valgrind or used when PCRE2 is - compiled with address sanitizing enabled. It calls the fuzzing function and - outputs information about what it is doing. The input strings are specified - by arguments: if an argument starts with "=" the rest of it is a literal - input string. Otherwise, it is assumed to be a file name, and the contents - of the file are the test string. + want to run fuzzing tests on PCRE2. If set, it causes an extra library + called libpcre2-fuzzsupport.a to be built, but not installed. This contains + a single function called LLVMFuzzerTestOneInput() whose arguments are a + pointer to a string and the length of the string. When called, this function + tries to compile the string as a pattern, and if that succeeds, to match + it. This is done both with no options and with some random options bits that + are generated from the string. Setting --enable-fuzz-support also causes an + executable called pcre2fuzzcheck-{8,16,32} to be created. This is normally + run under valgrind or used when PCRE2 is compiled with address sanitizing + enabled. It calls the fuzzing function and outputs information about what it + is doing. The input strings are specified by arguments: if an argument + starts with "=" the rest of it is a literal input string. Otherwise, it is + assumed to be a file name, and the contents of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for @@ -510,6 +509,7 @@ system. The following are installed (file names are all relative to the LICENCE NEWS README + SECURITY pcre2.txt (a concatenation of the man(3) pages) pcre2test.txt the pcre2test man page pcre2grep.txt the pcre2grep man page @@ -607,8 +607,9 @@ zip formats. The command "make distcheck" does the same, but then does a trial build of the new distribution to ensure that it works. 
If you have modified any of the man page sources in the doc directory, you -should first run the PrepareRelease script before making a distribution. This -script creates the .txt and HTML forms of the documentation from the man pages. +should first run the maint/PrepareRelease script before making a distribution. +This script creates the .txt and HTML forms of the documentation from the man +pages. Testing PCRE2 @@ -822,37 +823,38 @@ The distribution should contain the files listed below. ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_chkdint.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_ucptables.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) + src/pcre2posix.c ) + src/pcre2_auto_possess.c ) + src/pcre2_chkdint.c ) + src/pcre2_compile.c ) + src/pcre2_compile_class.c ) + src/pcre2_config.c ) + src/pcre2_context.c ) + src/pcre2_convert.c ) + src/pcre2_dfa_match.c ) + src/pcre2_error.c ) + src/pcre2_extuni.c ) + src/pcre2_find_bracket.c ) + src/pcre2_jit_compile.c ) + src/pcre2_jit_match.c ) sources for the functions in the library, + src/pcre2_jit_misc.c ) and some internal functions that they use + src/pcre2_maketables.c ) + src/pcre2_match.c ) + src/pcre2_match_data.c ) + 
src/pcre2_newline.c ) + src/pcre2_ord2utf.c ) + src/pcre2_pattern_info.c ) + src/pcre2_script_run.c ) + src/pcre2_serialize.c ) + src/pcre2_string_utils.c ) + src/pcre2_study.c ) + src/pcre2_substitute.c ) + src/pcre2_substring.c ) + src/pcre2_tables.c ) + src/pcre2_ucd.c ) + src/pcre2_ucptables.c ) + src/pcre2_valid_utf.c ) + src/pcre2_xclass.c ) src/pcre2_printint.c debugging function that is used by pcre2test, src/pcre2_fuzzsupport.c function for (optional) fuzzing support @@ -860,13 +862,16 @@ The distribution should contain the files listed below. src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API + src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header + src/pcre2_jit_char_inc.h header used by JIT src/pcre2_jit_neon_inc.h header used by JIT src/pcre2_jit_simd_inc.h header used by JIT src/pcre2_ucp.h header for Unicode property handling + src/pcre2_util.h header for internal utils - sljit/* source files for the JIT compiler + deps/sljit/sljit_src/* source files for the JIT compiler (B) Source files for programs that use PCRE2: @@ -878,48 +883,49 @@ The distribution should contain the files listed below. 
(C) Auxiliary files: - 132html script to turn "man" pages into HTML - AUTHORS information about the author of PCRE2 + AUTHORS.md information about the authors of PCRE2 ChangeLog log of changes to the code - CleanTxt script to clean nroff output for txt man pages - Detrail script to remove trailing spaces HACKING some notes about the internals of PCRE2 INSTALL generic installation instructions - LICENCE conditions for the use of PCRE2 + LICENCE.md conditions for the use of PCRE2 COPYING the same, using GNU's standard name + SECURITY.md information on reporting vulnerabilities Makefile.in ) template for Unix Makefile, which is built by ) "configure" Makefile.am ) the automake input that was used to create ) Makefile.in NEWS important changes in this release NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools - PrepareRelease script to make preparations for "make dist" README this file RunTest a Unix shell script for running tests RunGrepTest a Unix shell script for pcre2grep tests + RunTest.bat a Windows batch file for running tests + RunGrepTest.bat a Windows batch file for pcre2grep tests aclocal.m4 m4 macros (generated by "aclocal") - config.guess ) files used by libtool, - config.sub ) used only when building a shared library + m4/* m4 macros (used by autoconf) configure a configuring shell script (built by autoconf) configure.ac ) the autoconf input that was used to build ) "configure" and config.h - depcomp ) script to find program dependencies, generated by - ) automake doc/*.3 man page sources for PCRE2 doc/*.1 man page sources for pcre2grep and pcre2test - doc/index.html.src the base HTML page doc/html/* HTML documentation doc/pcre2.txt plain text version of the man pages + doc/pcre2-config.txt plain text documentation of pcre2-config script + doc/pcre2grep.txt plain text documentation of grep utility program doc/pcre2test.txt plain text documentation of test program - install-sh a shell script for installing files libpcre2-8.pc.in template 
for libpcre2-8.pc for pkg-config libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config - ltmain.sh file used to build a libtool script - missing ) common stub for a few missing GNU programs while - ) installing, generated by automake - mkinstalldirs script for making install directories + ar-lib ) + config.guess ) + config.sub ) + depcomp ) helper tools generated by libtool and + compile ) automake, used internally by ./configure + install-sh ) + ltmain.sh ) + missing ) + test-driver ) perltest.sh Script for running a Perl test program pcre2-config.in source of script which retains PCRE2 information testdata/testinput* test data for main library tests @@ -927,12 +933,13 @@ The distribution should contain the files listed below. testdata/grep* input and output for pcre2grep tests testdata/* other supporting test files -(D) Auxiliary files for cmake support +(D) Auxiliary files for CMake support cmake/COPYING-CMAKE-SCRIPTS - cmake/FindPackageHandleStandardArgs.cmake cmake/FindEditline.cmake cmake/FindReadline.cmake + cmake/pcre2-config-version.cmake.in + cmake/pcre2-config.cmake.in CMakeLists.txt config-cmake.h.in @@ -943,14 +950,21 @@ The distribution should contain the files listed below. src/config.h.generic ) a version of config.h for use in non-"configure" ) environments -(F) Auxiliary files for building PCRE2 under OpenVMS +(F) Auxiliary files for building PCRE2 using other build systems + + BUILD.bazel ) + MODULE.bazel ) files used by the Bazel build system + WORKSPACE.bazel ) + build.zig file used by zig's build system + +(G) Auxiliary files for building PCRE2 under OpenVMS vms/configure.com ) vms/openvms_readme.txt ) These files were contributed by a PCRE2 user. 
vms/pcre2.h_patch ) vms/stdint.h ) -Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com -Last updated: 15 April 2024 +============================== +Last updated: 18 December 2024 +============================== + diff --git a/mingw64/share/doc/pcre2/html/index.html b/mingw64/share/doc/pcre2/html/index.html index e4dc78620fd..2d81b678fef 100644 --- a/mingw64/share/doc/pcre2/html/index.html +++ b/mingw64/share/doc/pcre2/html/index.html @@ -267,6 +267,9 @@

    Perl-compatible Regular Expressions (revised API: PCRE2)

    pcre2_set_offset_limit   Set the offset limit +pcre2_set_optimize +   Set an optimization directive + pcre2_set_parens_nest_limit   Set the parentheses nesting limit @@ -276,6 +279,12 @@

    Perl-compatible Regular Expressions (revised API: PCRE2)

    pcre2_set_recursion_memory_management   Obsolete function that (from 10.30 onwards) does nothing +pcre2_set_substitute_callout +   Set a substitution callout function + +pcre2_set_substitute_case_callout +   Set a substitution case callout function + pcre2_substitute   Match a compiled pattern to a subject string and do substitutions diff --git a/mingw64/share/doc/pcre2/html/pcre2.html b/mingw64/share/doc/pcre2/html/pcre2.html index 4cb83dc184b..e72b6b1cb1d 100644 --- a/mingw64/share/doc/pcre2/html/pcre2.html +++ b/mingw64/share/doc/pcre2/html/pcre2.html @@ -16,7 +16,7 @@

    pcre2 man page

  • INTRODUCTION
  • SECURITY CONSIDERATIONS
  • USER DOCUMENTATION -
  • AUTHOR +
  • AUTHORS
  • REVISION
    INTRODUCTION
    @@ -190,22 +190,22 @@

    pcre2 man page

    In the "man" and HTML formats, there is also a short page for each C library function, listing its arguments and results.

    -
    AUTHOR
    +
    AUTHORS

    -Philip Hazel -
    -Retired from University Computing Service -
    -Cambridge, England. -
    +The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Herczeg. +

    +

    +PCRE2 was written by Philip Hazel, of the University Computing Service, +Cambridge, England. Many others have also contributed.

    -Putting an actual email address here is a spam magnet. If you want to email me, -use my two names separated by a dot at gmail.com. +To contact the maintainers, please use the GitHub issues tracker or PCRE2 +mailing list, as described at the project page: +https://github.com/PCRE2Project/pcre2


    REVISION

    -Last updated: 27 August 2021 +Last updated: 18 December 2024
    Copyright © 1997-2021 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2_compile.html b/mingw64/share/doc/pcre2/html/pcre2_compile.html index f0080eabe45..ee933f38983 100644 --- a/mingw64/share/doc/pcre2/html/pcre2_compile.html +++ b/mingw64/share/doc/pcre2/html/pcre2_compile.html @@ -57,6 +57,7 @@

    pcre2_compile man page

    PCRE2_ALLOW_EMPTY_CLASS Allow empty classes PCRE2_ALT_BSUX Alternative handling of \u, \U, and \x PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode + PCRE2_ALT_EXTENDED_CLASS Alternative extended character class syntax PCRE2_ALT_VERBNAMES Process backslashes in verb names PCRE2_AUTO_CALLOUT Compile automatic callouts PCRE2_CASELESS Do caseless matching diff --git a/mingw64/share/doc/pcre2/html/pcre2_jit_compile.html b/mingw64/share/doc/pcre2/html/pcre2_jit_compile.html index 873d0ddefc6..791dd0c3d78 100644 --- a/mingw64/share/doc/pcre2/html/pcre2_jit_compile.html +++ b/mingw64/share/doc/pcre2/html/pcre2_jit_compile.html @@ -33,9 +33,18 @@

    pcre2_jit_compile man page

    documentation.

    -The first argument is a pointer that was returned by a successful call to -pcre2_compile(), and the second must contain one or more of the following -bits: +The availability of JIT support can be tested by calling +pcre2_jit_compile() with a single option PCRE2_JIT_TEST_ALLOC (the +code argument is ignored, so a NULL value is accepted). Such a call +returns zero if JIT is available and has a working allocator. Otherwise +it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate +executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not +compiled. +

    +

    +Otherwise, the first argument must be a pointer that was returned by a +successful call to pcre2_compile(), and the second must contain one or +more of the following bits:

       PCRE2_JIT_COMPLETE      compile code for full matching
       PCRE2_JIT_PARTIAL_SOFT  compile code for soft partial matching
    @@ -46,11 +55,13 @@ 

    pcre2_jit_compile man page

    option is deprecated and may be removed in the future.

    -The yield of the function is 0 for success, or a negative error code otherwise. -In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or -if an unknown bit is set in options. The function can also return -PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the -compiler, even if it was because of a system security restriction. +The yield of the function when called with any of the three options above is 0 +for success, or a negative error code otherwise. In particular, +PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or if an unknown +bit is set in options. The function can also return PCRE2_ERROR_NOMEMORY +if JIT is unable to allocate executable memory for the compiler, even if it was +because of a system security restriction. In a few cases, the function may +return with PCRE2_ERROR_JIT_UNSUPPORTED for unsupported features.

    There is a complete description of the PCRE2 native API in the diff --git a/mingw64/share/doc/pcre2/html/pcre2_set_compile_extra_options.html b/mingw64/share/doc/pcre2/html/pcre2_set_compile_extra_options.html index 4924ed79b5e..cb62022a22e 100644 --- a/mingw64/share/doc/pcre2/html/pcre2_set_compile_extra_options.html +++ b/mingw64/share/doc/pcre2/html/pcre2_set_compile_extra_options.html @@ -43,6 +43,10 @@

    pcre2_set_compile_extra_options man page

    PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines PCRE2_EXTRA_MATCH_WORD Pattern matches "words" + PCRE2_EXTRA_NEVER_CALLOUT Disallow callouts in pattern + PCRE2_EXTRA_NO_BS0 Disallow \0 (but not \00 or \000) + PCRE2_EXTRA_PYTHON_OCTAL Use Python rules for octal + PCRE2_EXTRA_TURKISH_CASING Use Turkish I case folding
    There is a complete description of the PCRE2 native API in the pcre2api diff --git a/mingw64/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html b/mingw64/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html index ab570cf60d1..a40f41e450c 100644 --- a/mingw64/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html +++ b/mingw64/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html @@ -27,9 +27,9 @@

    pcre2_set_max_pattern_compiled_length man page


    This function sets, in a compile context, the maximum size (in bytes) for the -memory needed to hold the compiled version of a pattern that is compiled with -this context. The result is always zero. If a pattern that is passed to -pcre2_compile() with this context needs more memory, an error is +memory needed to hold the compiled version of a pattern that is using this +context. The result is always zero. If a pattern that is passed to +pcre2_compile() referencing this context needs more memory, an error is generated. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited.

    diff --git a/mingw64/share/doc/pcre2/html/pcre2_set_optimize.html b/mingw64/share/doc/pcre2/html/pcre2_set_optimize.html new file mode 100644 index 00000000000..47caeb267ae --- /dev/null +++ b/mingw64/share/doc/pcre2/html/pcre2_set_optimize.html @@ -0,0 +1,57 @@ + + +pcre2_set_optimize specification + + +

    pcre2_set_optimize man page

    +

    +Return to the PCRE2 index page. +

    +

    +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
    +
    +SYNOPSIS +
    +

    +#include <pcre2.h> +

    +

    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); +

    +
    +DESCRIPTION +
    +

    +This function controls which performance optimizations will be applied +by pcre2_compile(). It can be called multiple times with the same compile +context; the effects are cumulative, with the effects of later calls taking +precedence over earlier ones. +

    +

    +The result is zero for success, PCRE2_ERROR_NULL if ccontext is NULL, +or PCRE2_ERROR_BADOPTION if directive is unknown. The latter could be +useful to detect if a certain optimization is available. +

    +

    +The possible values for the directive parameter are: +

    +  PCRE2_OPTIMIZATION_FULL   Enable all optimizations (default)
    +  PCRE2_OPTIMIZATION_NONE   Disable all optimizations
    +  PCRE2_AUTO_POSSESS        Enable auto-possessification
    +  PCRE2_AUTO_POSSESS_OFF    Disable auto-possessification
    +  PCRE2_DOTSTAR_ANCHOR      Enable implicit dotstar anchoring
    +  PCRE2_DOTSTAR_ANCHOR_OFF  Disable implicit dotstar anchoring
    +  PCRE2_START_OPTIMIZE      Enable start-up optimizations at match time
    +  PCRE2_START_OPTIMIZE_OFF  Disable start-up optimizations at match time
    +
    +There is a complete description of the PCRE2 native API, including detailed +descriptions of the directive parameter values, in the +pcre2api +page. +

    +Return to the PCRE2 index page. +

    diff --git a/mingw64/share/doc/pcre2/html/pcre2_set_substitute_callout.html b/mingw64/share/doc/pcre2/html/pcre2_set_substitute_callout.html index 7ae3a398d79..8640728fdc4 100644 --- a/mingw64/share/doc/pcre2/html/pcre2_set_substitute_callout.html +++ b/mingw64/share/doc/pcre2/html/pcre2_set_substitute_callout.html @@ -20,7 +20,7 @@

    pcre2_set_substitute_callout man page

    int pcre2_set_substitute_callout(pcre2_match_context *mcontext, - int (*callout_function)(pcre2_substitute_callout_block *), + int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data);


    diff --git a/mingw64/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html b/mingw64/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html new file mode 100644 index 00000000000..ab506879f1f --- /dev/null +++ b/mingw64/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html @@ -0,0 +1,45 @@ + + +pcre2_set_substitute_case_callout specification + + +

    pcre2_set_substitute_case_callout man page

    +

    +Return to the PCRE2 index page. +

    +

    +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
    +
    +SYNOPSIS +
    +

    +#include <pcre2.h> +

    +

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +

    +
    +DESCRIPTION +
    +

    +This function sets the substitute case callout fields in a match context (the +first argument). The second argument specifies a callout function, and the third +argument is an opaque data item that is passed to it. The result of this +function is always zero. +

    +

    +There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +

    +Return to the PCRE2 index page. +

    diff --git a/mingw64/share/doc/pcre2/html/pcre2api.html b/mingw64/share/doc/pcre2/html/pcre2api.html index 6b60ee9fa7a..079cf176daa 100644 --- a/mingw64/share/doc/pcre2/html/pcre2api.html +++ b/mingw64/share/doc/pcre2/html/pcre2api.html @@ -179,6 +179,10 @@

    pcre2api man page


    int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); +
    +
    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive);


    PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

    @@ -203,6 +207,13 @@

    pcre2api man page

    void *callout_data);

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);
    @@ -808,6 +819,7 @@

    pcre2api man page

    The compile time nested parentheses limit The maximum length of the pattern string The extra options bits (none set by default) + Which performance optimizations the compiler should apply
  • A compile context is also required if you are using custom memory management. If none of these apply, just pass NULL as the context argument of @@ -952,6 +964,110 @@

    pcre2api man page

    nesting, and the second is user data that is set up by the last argument of pcre2_set_compile_recursion_guard(). The callout function should return zero if all is well, or non-zero to force an error. +
    +
    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); +
    +
    +PCRE2 can apply various performance optimizations during compilation, in order +to make matching faster. For example, the compiler might convert some regex +constructs into an equivalent construct which pcre2_match() can execute +faster. By default, all available optimizations are enabled. However, in rare +cases, one might wish to disable specific optimizations. For example, if it is +known that some optimizations cannot benefit a certain regex, it might be +desirable to disable them, in order to speed up compilation. +

    +

    +The permitted values of directive are as follows: +

    +  PCRE2_OPTIMIZATION_FULL
    +
    +Enable all optional performance optimizations. This is the default value. +
    +  PCRE2_OPTIMIZATION_NONE
    +
    +Disable all optional performance optimizations. +
    +  PCRE2_AUTO_POSSESS
    +  PCRE2_AUTO_POSSESS_OFF
    +
    +Enable/disable "auto-possessification" of variable quantifiers such as * and +. +This optimization, for example, turns a+b into a++b in order to avoid +backtracks into a+ that can never be successful. However, if callouts are in +use, auto-possessification means that some callouts are never taken. You can +disable this optimization if you want the matching functions to do a full, +unoptimized search and run all the callouts. +
    +  PCRE2_DOTSTAR_ANCHOR
    +  PCRE2_DOTSTAR_ANCHOR_OFF
    +
    +Enable/disable an optimization that is applied when .* is the first significant +item in a top-level branch of a pattern, and all the other branches also start +with .* or with \A or \G or ^. Such a pattern is automatically anchored if +PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any +^ items. Otherwise, the fact that any match must start either at the start of +the subject or following a newline is remembered. Like other optimizations, +this can cause callouts to be skipped. +

    +

    +Dotstar anchor optimization is automatically disabled for .* if it is inside an +atomic group or a capture group that is the subject of a backreference, or if +the pattern contains (*PRUNE) or (*SKIP). +

    +  PCRE2_START_OPTIMIZE
    +  PCRE2_START_OPTIMIZE_OFF
    +
    +Enable/disable optimizations which cause matching functions to scan the subject +string for specific code unit values before attempting a match. For example, if +it is known that an unanchored match must start with a specific value, the +matching code searches the subject for that value, and fails immediately if it +cannot find it, without actually running the main matching function. This means +that a special item such as (*COMMIT) at the start of a pattern is not +considered until after a suitable starting point for the match has been found. +Also, when callouts or (*MARK) items are in use, these "start-up" optimizations +can cause them to be skipped if the pattern is never actually used. The start-up +optimizations are in effect a pre-scan of the subject that takes place before +the pattern is run. +

    +

    +Disabling start-up optimizations ensures that in cases where the result is "no +match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) are +considered at every possible starting position in the subject string. +

    +

    +Disabling start-up optimizations may change the outcome of a matching operation. +Consider the pattern +

    +  (*COMMIT)ABC
    +
    +When this is compiled, PCRE2 records the fact that a match must start with the +character "A". Suppose the subject string is "DEFABC". The start-up +optimization scans along the subject, finds "A" and runs the first match +attempt from there. The (*COMMIT) item means that the pattern must match the +current starting position, which in this case, it does. However, if the same +match is run without start-up optimizations, the initial scan along the subject +string does not happen. The first match attempt is run starting from "D" and +when this fails, (*COMMIT) prevents any further matches being tried, so the +overall result is "no match". +

    +

    +Another start-up optimization makes use of a minimum length for a matching +subject, which is recorded when possible. Consider the pattern +

    +  (*MARK:1)B(*MARK:2)(X|Y)
    +
    +The minimum length for a match is two characters. If the subject is "XXBB", the +"starting character" optimization skips "XX", then tries to match "BB", which +is long enough. In the process, (*MARK:2) is encountered and remembered. When +the match attempt fails, the next "B" is found, but there is only one character +left, so there are no more attempts, and "no match" is returned with the "last +mark seen" set to "2". Without start-up optimizations, however, matches are +tried at every possible starting position, including at the end of the subject, +where (*MARK:1) is encountered, but there is no "B", so the "last mark seen" +that is returned is "1". In this case, the optimizations do not affect the +overall match result, which is still "no match", but they do affect the +auxiliary information that is returned.


    The match context @@ -1011,6 +1127,19 @@

    pcre2api man page

    below.

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    +This sets up a callout function for PCRE2 to call when performing case +transformations inside pcre2_substitute(). Details are given in the +section entitled "Creating a new string with substitutions" +below. +
    +
    int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);
    @@ -1228,7 +1357,10 @@

    pcre2api man page

    The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee that JIT will be used for -any given match. See the +any given match, and neither does it guarantee that JIT will actually be able +to function, because it may not be able to allocate executable memory in some +environments. There is a special call to pcre2_jit_compile() that can be +used to check this. See the pcre2jit documentation for more details.
    @@ -1431,7 +1563,7 @@ 

    pcre2api man page

    error has occurred.

    -There are nearly 100 positive error codes that pcre2_compile() may return +There are over 100 positive error codes that pcre2_compile() may return if it finds an error in the pattern. There are also some negative error codes that are used for invalid UTF strings when validity checking is in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and @@ -1539,6 +1671,16 @@

    pcre2api man page

    end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a terminating newline, you must set PCRE2_ALT_CIRCUMFLEX. +
    +  PCRE2_ALT_EXTENDED_CLASS
    +
    +Alters the parsing of character classes to follow the extended syntax +described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact +on the behaviour of the Perl-specific "(?[...])" syntax for extended classes, +but instead enables the alternative syntax of extended class behaviour inside +ordinary "[...]" character classes. See the +pcre2pattern +documentation for details of the character classes supported.
       PCRE2_ALT_VERBNAMES
     
    @@ -1569,16 +1711,31 @@

    pcre2api man page

    changed within a pattern by a (?i) option setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all characters with more than one other case, and for all characters whose code points are greater than -U+007F. Note that there are two ASCII characters, K and S, that, in addition to +U+007F. +

    +

    +Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long S) respectively. If you do not want this case equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT.

    +One language family, Turkish and Azeri, has its own case-insensitivity rules, +which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the +behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 +(small dotless i) characters. +

    +

    For lower valued characters with only one other case, a lookup table is used for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used for all code points less than 256, and higher code points (available only in 16-bit or 32-bit mode) are treated as not having another case. +

    +

    +From release 10.45 PCRE2_CASELESS also affects what some of the letter-related +Unicode property escapes (\p and \P) match. The properties Lu (upper case +letter), Ll (lower case letter), and Lt (title case letter) are all treated as +LC (cased letter) when PCRE2_CASELESS is set.

       PCRE2_DOLLAR_ENDONLY
     
    @@ -1775,7 +1932,7 @@

    pcre2api man page

    for the PCRE2_UCP option below. In particular, it prevents the creator of the pattern from enabling this facility by starting the pattern with (*UCP). This option may be useful in applications that process patterns from external -sources. The option combination PCRE_UCP and PCRE_NEVER_UCP causes an error. +sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error.
       PCRE2_NEVER_UTF
     
    @@ -1798,85 +1955,57 @@

    pcre2api man page

       PCRE2_NO_AUTO_POSSESS
     
    -If this option is set, it disables "auto-possessification", which is an -optimization that, for example, turns a+b into a++b in order to avoid +If this (deprecated) option is set, it disables "auto-possessification", which +is an optimization that, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can set this option if you want the matching functions to do a full unoptimized search and run all the callouts, but it is mainly provided for testing purposes. +

    +

    +If a compile context is available, it is recommended to use +pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather +than the compile option PCRE2_NO_AUTO_POSSESS. Note that PCRE2_NO_AUTO_POSSESS +takes precedence over the pcre2_set_optimize() optimization directives +PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF.

       PCRE2_NO_DOTSTAR_ANCHOR
     
    -If this option is set, it disables an optimization that is applied when .* is -the first significant item in a top-level branch of a pattern, and all the -other branches also start with .* or with \A or \G or ^. The optimization is -automatically disabled for .* if it is inside an atomic group or a capture -group that is the subject of a backreference, or if the pattern contains -(*PRUNE) or (*SKIP). When the optimization is not disabled, such a pattern is -automatically anchored if PCRE2_DOTALL is set for all the .* items and -PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match -must start either at the start of the subject or following a newline is +If this (deprecated) option is set, it disables an optimization that is applied +when .* is the first significant item in a top-level branch of a pattern, and +all the other branches also start with .* or with \A or \G or ^. The +optimization is automatically disabled for .* if it is inside an atomic group +or a capture group that is the subject of a backreference, or if the pattern +contains (*PRUNE) or (*SKIP). When the optimization is not disabled, such a +pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items +and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any +match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. +(If a compile context is available, it is recommended to use +pcre2_set_optimize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF +instead.)
       PCRE2_NO_START_OPTIMIZE
     
    This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of the JIT -compiler. +compiler. Setting this option is equivalent to calling pcre2_set_optimize() +with the directive parameter set to PCRE2_START_OPTIMIZE_OFF.

    There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without -actually running the main matching function. This means that a special item -such as (*COMMIT) at the start of a pattern is not considered until after a -suitable starting point for the match has been found. Also, when callouts or -(*MARK) items are in use, these "start-up" optimizations can cause them to be -skipped if the pattern is never actually used. The start-up optimizations are +actually running the main matching function. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run.

    -The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, -possibly causing performance to suffer, but ensuring that in cases where the -result is "no match", the callouts do occur, and that items such as (*COMMIT) -and (*MARK) are considered at every possible starting position in the subject -string. -

    -

    -Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation. -Consider the pattern -

    -  (*COMMIT)ABC
    -
    -When this is compiled, PCRE2 records the fact that a match must start with the -character "A". Suppose the subject string is "DEFABC". The start-up -optimization scans along the subject, finds "A" and runs the first match -attempt from there. The (*COMMIT) item means that the pattern must match the -current starting position, which in this case, it does. However, if the same -match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the -subject string does not happen. The first match attempt is run starting from -"D" and when this fails, (*COMMIT) prevents any further matches being tried, so -the overall result is "no match". -

    -

    -As another start-up optimization makes use of a minimum length for a matching -subject, which is recorded when possible. Consider the pattern -

    -  (*MARK:1)B(*MARK:2)(X|Y)
    -
    -The minimum length for a match is two characters. If the subject is "XXBB", the -"starting character" optimization skips "XX", then tries to match "BB", which -is long enough. In the process, (*MARK:2) is encountered and remembered. When -the match attempt fails, the next "B" is found, but there is only one character -left, so there are no more attempts, and "no match" is returned with the "last -mark seen" set to "2". If NO_START_OPTIMIZE is set, however, matches are tried -at every possible starting position, including at the end of the subject, where -(*MARK:1) is encountered, but there is no "B", so the "last mark seen" that is -returned is "1". In this case, the optimizations do not affect the overall -match result, which is still "no match", but they do affect the auxiliary -information that is returned. +Disabling the start-up optimizations may cause performance to suffer. However, +this may be desirable for patterns which contain callouts or items such as +(*COMMIT) and (*MARK). See the above description of PCRE2_START_OPTIMIZE_OFF +for further details.
       PCRE2_NO_UTF_CHECK
     
    @@ -1931,9 +2060,16 @@

    pcre2api man page

    upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode support (which is the default). -The PCRE2_EXTRA_CASELESS_RESTRICT option (see below) restricts caseless +

    +

    +The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless matching such that ASCII characters match only ASCII characters and non-ASCII -characters match only non-ASCII characters. +characters match only non-ASCII characters. The PCRE2_EXTRA_TURKISH_CASING option +(see above) alters the matching of the 'i' characters to follow their behaviour +in Turkish and Azeri languages. For further details on +PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the +pcre2unicode +page.

       PCRE2_UNGREEDY
     
    @@ -2070,7 +2206,8 @@

    pcre2api man page

    ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must either be ASCII or non-ASCII. The option -can be changed with a pattern by the (?r) option setting. +can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option +settings.
       PCRE2_EXTRA_ESCAPED_CR_IS_LF
     
    @@ -2097,6 +2234,34 @@

    pcre2api man page

    at the start of the compiled pattern and ")\b" at the end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. +
    +  PCRE2_EXTRA_NO_BS0
    +
    +If this option is set (note that its final character is the digit 0) it locks +out the use of the sequence \0 unless at least one more octal digit follows. +
    +  PCRE2_EXTRA_PYTHON_OCTAL
    +
    +If this option is set, PCRE2 follows Python's rules for interpreting octal +escape sequences. The rules for handling sequences such as \14, which could +be an octal number or a back reference, are different. Details are given in the +pcre2pattern +documentation. +
    +  PCRE2_EXTRA_NEVER_CALLOUT
    +
    +If this option is set, PCRE2 treats callouts in the pattern as a syntax error, +returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application +knows that a callout will not be provided to pcre2_match(), so that +callouts in the pattern are not silently ignored. +
    +  PCRE2_EXTRA_TURKISH_CASING
    +
    +This option alters case-equivalence of the 'i' letters to follow the +alphabet used by Turkish and Azeri languages. The option can be changed within +a pattern by the (*TURKISH_CASING) start-of-pattern setting. Either the UTF or +UCP options must be set. In the 8-bit library, UTF must be set. This option +cannot be combined with PCRE2_EXTRA_CASELESS_RESTRICT.


    JUST-IN-TIME (JIT) COMPILATION

    @@ -2303,6 +2468,7 @@

    pcre2api man page

    PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set + Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF
    For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. @@ -3646,9 +3812,10 @@

    pcre2api man page

    too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (without, of course, writing anything) -in order to compute the size of buffer that is needed. This value is passed -back via the outlengthptr variable, with the result of the function still -being PCRE2_ERROR_NOMEMORY. +in order to compute the size of buffer that is needed, which will include the +extra space for the terminating NUL. This value is passed back via the +outlengthptr variable, with the result of the function still being +PCRE2_ERROR_NOMEMORY.

    Passing a buffer size of zero is a permitted way of finding out how much memory @@ -3667,18 +3834,26 @@

    pcre2api man page

    in any way. By default, however, a dollar character is an escape character that can specify the insertion of characters from capture groups and names from (*MARK) or other control verbs in the pattern. Dollar is the only escape -character (backslash is treated as literal). The following forms are always +character (backslash is treated as literal). The following forms are recognized:
       $$                  insert a dollar character
    -  $<n> or ${<n>}      insert the contents of group <n>
    +  $n or ${n}          insert the contents of group n
    +  $0 or $&            insert the entire matched substring
    +  $`                  insert the substring that precedes the match
    +  $'                  insert the substring that follows the match
    +  $_                  insert the entire input string
       $*MARK or ${*MARK}  insert a control verb name
     
    -Either a group number or a group name can be given for <n>. Curly brackets are -required only if the following character would be interpreted as part of the -number or name. The number may be zero to include the entire matched string. -For example, if the pattern a(b)c is matched with "=abc=" and the replacement -string "+$1$0$1+", the result is "=+babcb+=". +Either a group number or a group name can be given for n, for example $2 or +$NAME. Curly brackets are required only if the following character would be +interpreted as part of the number or name. The number may be zero to include +the entire matched string. For example, if the pattern a(b)c is matched with +"=abc=" and the replacement string "+$1$0$1+", the result is "=+babcb+=". +

    +

    +The JavaScript form $<name>, where the angle brackets are part of the syntax, +is also recognized for group names, but not for group numbers or *MARK.

    $*MARK inserts the name from the last encountered backtracking control verb on @@ -3732,28 +3907,53 @@

    pcre2api man page

    PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the replacement string. Without this option, only the dollar character is special, and only the group insertion forms listed above are valid. When -PCRE2_SUBSTITUTE_EXTENDED is set, two things change: +PCRE2_SUBSTITUTE_EXTENDED is set, several things change:

    Firstly, backslash in a replacement string is interpreted as an escape -character. The usual forms such as \n or \x{ddd} can be used to specify -particular character codes, and backslash followed by any non-alphanumeric -character quotes that character. Extended quoting can be coded using \Q...\E, -exactly as in pattern strings. +character. The usual forms such as \x{ddd} can be used to specify particular +character codes, and backslash followed by any non-alphanumeric character +quotes that character. Extended quoting can be coded using \Q...\E, exactly +as in pattern strings. The escapes \b and \v are interpreted as the +characters backspace and vertical tab, respectively. +

    +

    +The interpretation of backslash followed by one or more digits is the same as +in a pattern, which in Perl has some ambiguities. Details are given in the +pcre2pattern +page. +

    +

    +The Python form \g<n>, where the angle brackets are part of the syntax and n +is either a group name or number, is recognized as an alternative way of +inserting the contents of a group, for example \g<3>.

    There are also four escape sequences for forcing the case of inserted letters. -The insertion mechanism has three states: no case forcing, force upper case, -and force lower case. The escape sequences change the current state: \U and -\L change to upper or lower case forcing, respectively, and \E (when not -terminating a \Q quoted sequence) reverts to no case forcing. The sequences -\u and \l force the next character (if it is a letter) to upper or lower -case, respectively, and then the state automatically reverts to no case -forcing. Case forcing applies to all inserted characters, including those from -capture groups and letters within \Q...\E quoted sequences. If either -PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode +Case forcing applies to all inserted characters, including those from capture +groups and letters within \Q...\E quoted sequences. The insertion mechanism +has three states: no case forcing, force upper case, and force lower case. The +escape sequences change the current state: \U and \L change to upper or lower +case forcing, respectively, and \E (when not terminating a \Q quoted +sequence) reverts to no case forcing. The sequences \u and \l force the next +character (if it is a letter) to upper or lower case, respectively, and then +the state automatically reverts to no case forcing. +

    +

    +However, if \u is immediately followed by \L or \l is immediately followed +by \U, the next character's case is forced by the first escape sequence, and +subsequent characters by the second. This provides a "title casing" facility +that can be applied to group captures. For example, if group 1 has captured +"heLLo", the replacement string "\u\L$1" becomes "Hello". +

    +

    +If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater -than 127. +than 127. However, only simple case folding, as determined by the Unicode file +CaseFolding.txt, is supported. PCRE2 does not support language-specific +special casing rules such as using different lower case Greek sigmas in the +middle and ends of words (as defined in the Unicode file +SpecialCasing.txt).

    Note that case forcing sequences such as \U...\E do not nest. For example, @@ -3762,20 +3962,20 @@

    pcre2api man page

    not apply to replacement strings.

    -The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more +The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash:

    -  ${<n>:-<string>}
    -  ${<n>:+<string1>:<string2>}
    +  ${n:-string}
    +  ${n:+string1:string2}
     
    -As before, <n> may be a group number or a name. The first form specifies a -default value. If group <n> is set, its value is inserted; if not, <string> is -expanded and the result inserted. The second form specifies strings that are -expanded and inserted when group <n> is set or unset, respectively. The first -form is just a convenient shorthand for +As in the simple case, n may be a group number or a name. The first form +specifies a default value. If group n is set, its value is inserted; if +not, the string is expanded and the result inserted. The second form specifies +strings that are expanded and inserted when group n is set or unset, +respectively. The first form is just a convenient shorthand for
    -  ${<n>:+${<n>}:<string>}
    +  ${n:+${n}:string}
     
    Backslash can be used to escape colons and closing curly brackets in the replacement strings. A change of the case forcing state within a replacement @@ -3852,9 +4052,18 @@

    pcre2api man page

    The pcre2_set_substitution_callout() function can be used to specify a callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution has -been processed, but it can cause the replacement not to happen. The callout -function is not called for simulated substitutions that happen as a result of -the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. +been processed, but it can cause the replacement not to happen. +

    +

    +The callout function is not called for simulated substitutions that happen as a +result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In this mode, when +substitution processing exceeds the buffer space provided by the caller, +processing continues by counting code units. The simulation is unable to +populate the callout block, and so the simulation is pessimistic about the +required buffer size. Whichever is larger of accepted or rejected substitution +is reported as the required size. Therefore, the returned buffer length may be +an overestimate (without a substitution callout, it is normally an exact +measurement).

    The first argument of the callout function is a pointer to a substitute callout @@ -3903,6 +4112,107 @@

    pcre2api man page

    output and the call to pcre2_substitute() exits, returning the number of matches so far.

    +
    +Substitution case callouts +
    +

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    +The pcre2_set_substitute_case_callout() function can be used to specify +a callout function for pcre2_substitute() to use when performing case +transformations. This does not affect any case insensitivity behaviour when +performing a match, but only the user-visible transformations performed when +processing a substitution such as: +

    +    pcre2_substitute(..., "\\U$1", ...)
    +
    +

    +

    +The default case transformations applied by PCRE2 are reasonably complete, and, +in UTF or UCP mode, perform the simple locale-invariant case transformations as +specified by Unicode. This is suitable for the internal (invisible) +case-equivalence procedures used during pattern matching, but an application +may wish to use more sophisticated locale-aware processing for the user-visible +substitution transformations. +

    +

    +One example implementation of the callout_function using the ICU +library would be: +
    +
    +

    +    PCRE2_SIZE
    +    icu_case_callout(
    +      PCRE2_SPTR input, PCRE2_SIZE input_len,
    +      PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
    +      int to_case, void *data_ptr)
    +    {
    +      UErrorCode err = U_ZERO_ERROR;
    +      int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER
    +        ? u_strToLower(output, output_cap, input, input_len, NULL, &err)
    +        : to_case == PCRE2_SUBSTITUTE_CASE_UPPER
    +        ? u_strToUpper(output, output_cap, input, input_len, NULL, &err)
    +        : u_strToTitle(output, output_cap, input, input_len, &first_char_only,
    +                       NULL, &err);
    +      if (U_FAILURE(err)) return (~(PCRE2_SIZE)0);
    +      return r;
    +    }
    +
    +

    +

    +The first and second arguments of the case callout function are the Unicode +string to transform. +

    +

    +The third and fourth arguments are the output buffer and its capacity. +

    +

    +The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, +PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. +PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the +callout to indicate that the case of the entire callout input should be +case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that +only the first character or glyph should be transformed to Unicode titlecase +and the rest to Unicode lowercase (note that titlecasing sometimes uses Unicode +properties to titlecase each word in a string; but PCRE2 is requesting that only +the single leading character is to be titlecased). +

    +

    +The sixth argument is the callout_data supplied to +pcre2_set_substitute_case_callout(). +

    +

    +The resulting string in the destination buffer may be larger or smaller than the +input, if the casing rules merge or split characters. The return value is the +length required for the output string. If a buffer of sufficient size was +provided to the callout, then the result must be written to the buffer and the +number of code units returned. If the result does not fit in the provided +buffer, then the required capacity must be returned and PCRE2 will not make use +of the output buffer. PCRE2 provides input and output buffers which overlap, so +the callout must support this by suitable internal buffering. +

    +

    +Alternatively, if the callout wishes to indicate an error, then it may return +(~(PCRE2_SIZE)0). In this case pcre2_substitute() will immediately fail with +error PCRE2_ERROR_REPLACECASE. +

    +

    +When a case callout is combined with the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH +option, there are situations when pcre2_substitute() will return an +underestimate of the required buffer size. If you call pcre2_substitute() once +with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the input buffer is too small for +the replacement string to be constructed, then instead of calling the case +callout, pcre2_substitute() will make an estimate of the required buffer size. +The second call should also pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that +second call is not guaranteed to succeed either, if the case callout requires +more buffer space than expected. The caller must make repeated attempts in a +loop. +


    DUPLICATE CAPTURE GROUP NAMES

    int pcre2_substring_nametable_scan(const pcre2_code *code, @@ -4177,7 +4487,7 @@

    pcre2api man page


    REVISION

    -Last updated: 24 April 2024 +Last updated: 26 December 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2build.html b/mingw64/share/doc/pcre2/html/pcre2build.html index d4b0d336b08..f4e127f14ca 100644 --- a/mingw64/share/doc/pcre2/html/pcre2build.html +++ b/mingw64/share/doc/pcre2/html/pcre2build.html @@ -643,7 +643,7 @@

    pcre2build man page


    REVISION

    -Last updated: 15 April 2024 +Last updated: 16 April 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2compat.html b/mingw64/share/doc/pcre2/html/pcre2compat.html index d60182ed48a..5f7e280d34f 100644 --- a/mingw64/share/doc/pcre2/html/pcre2compat.html +++ b/mingw64/share/doc/pcre2/html/pcre2compat.html @@ -71,7 +71,7 @@

    pcre2compat man page

    7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties such as Lu and -Nd, the derived properties Any and LC (synonym L&), script names such as Greek +Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use is limited. See the @@ -99,7 +99,12 @@

    pcre2compat man page

    \Q\\E \ \\E
    The \Q...\E sequence is recognized both inside and outside character classes -by both PCRE2 and Perl. +by both PCRE2 and Perl. Another difference from Perl is that any appearance of +\Q or \E inside what might otherwise be a quantifier causes PCRE2 not to +recognize the sequence as a quantifier. Perl recognizes a quantifier if +(redundantly) either of the numbers is inside \Q...\E, but not if the +separating comma is. When not recognized as a quantifier a sequence such as +{\Q1\E,2} is treated as the literal string "{1,2}".

    9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) @@ -120,7 +125,9 @@

    pcre2compat man page

    not always the case in Perl. In particular, if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any | characters. Note that such groups are -processed as anchored at the point where they are tested. +processed as anchored at the point where they are tested. PCRE2 also confines +all control verbs within atomic assertions, again including (*THEN) in +assertions with only one branch.

    12. If a pattern contains more than one backtracking control verb, the first @@ -159,11 +166,11 @@

    pcre2compat man page

    certainly user mistakes.

    -17. In PCRE2, the upper/lower case character properties Lu and Ll are not -affected when case-independent matching is specified. For example, \p{Lu} -always matches an upper case letter. I think Perl has changed in this respect; -in the release at the time of writing (5.38), \p{Lu} and \p{Ll} match all -letters, regardless of case, when case independence is specified. +17. In PCRE2, until release 10.45, the upper/lower case character properties Lu +and Ll were not affected when case-independent matching was specified. Perl has +changed in this respect, and PCRE2 has now changed to match. When caseless +matching is in force, Lu, Ll, and Lt (title case) are all treated as Lc (cased +letter).

    18. From release 5.32.0, Perl locks out the use of \K in lookaround @@ -231,6 +238,10 @@

    pcre2compat man page

    numbers such as +2 and -4 in all three cases. Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions. +
    +
    +(m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 extension +that is not available in Perl.

    20. Perl has different limits than PCRE2. See the @@ -252,6 +263,18 @@

    pcre2compat man page

    /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject.

    +

    +23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl tries to +recover and prints a warning if the problem was that an invalid hexadecimal +digit was found; since PCRE2 doesn't have warnings, it returns an error instead. +Additionally, Perl accepts \x{} and generates NUL, unlike PCRE2. +

    +

    +24. From release 10.45, PCRE2 gives an error if \x is not followed by a +hexadecimal digit or a curly bracket. It used to interpret this as the NUL +character. Perl still generates NUL, but warns when in warning mode in most +cases. +


    AUTHOR
    @@ -267,9 +290,9 @@

    pcre2compat man page

    REVISION

    -Last updated: 30 November 2023 +Last updated: 02 October 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/mingw64/share/doc/pcre2/html/pcre2convert.html b/mingw64/share/doc/pcre2/html/pcre2convert.html index 6b9fea5575e..57e8989fb4a 100644 --- a/mingw64/share/doc/pcre2/html/pcre2convert.html +++ b/mingw64/share/doc/pcre2/html/pcre2convert.html @@ -182,7 +182,7 @@

    pcre2convert man page


    REVISION

    -Last updated: 28 June 2018 +Last updated: 14 November 2023
    Copyright © 1997-2018 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2grep.html b/mingw64/share/doc/pcre2/html/pcre2grep.html index bd12246ae99..66c56029698 100644 --- a/mingw64/share/doc/pcre2/html/pcre2grep.html +++ b/mingw64/share/doc/pcre2/html/pcre2grep.html @@ -391,9 +391,10 @@

    pcre2grep man page

    command line, no delimiters should be used. What constitutes a newline when reading the file is the operating system's default interpretation of \n. The --newline option has no effect on this option. Trailing white space is -removed from each line, and blank lines are ignored. An empty file contains no +removed from each line, and blank lines are ignored unless the +--posix-pattern-file option is also provided. An empty file contains no patterns and therefore matches nothing. Patterns read from a file in this way -may contain binary zeros, which are treated as ordinary data characters. +may contain binary zeros, which are treated as ordinary character literals.

    If this option is given more than once, all the specified files are read. A @@ -723,9 +724,9 @@

    pcre2grep man page



    $<digits> or ${<digits>} is replaced by the captured substring of the given -decimal number; zero substitutes the whole match. If the number is greater than -the number of capturing substrings, or if the capture is unset, the replacement -is empty. +decimal number; $& (or the legacy $0) substitutes the whole match. If the +number is greater than the number of capturing substrings, or if the capture +is unset, the replacement is empty.

    $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by @@ -808,6 +809,15 @@

    pcre2grep man page

    allowing \w to match Unicode letters and digits.

    +--posix-pattern-file +When patterns are provided with the -f option, do not trim trailing +spaces or ignore empty lines in a similar way to other grep tools. To keep +the behaviour consistent with older versions, if the pattern read was +terminated with CRLF (as character literals) then both characters won't be +included as part of it, so if you really need to have a pattern ending in '\r', +use an escape sequence or provide it by a different method. +

    +

    -q, --quiet Work quietly, that is, display nothing except error messages. The exit status indicates whether or not any matches were found. @@ -993,7 +1003,7 @@

    pcre2grep man page

    callout facility. However, this support can be completely or partially disabled when pcre2grep is built. You can find out whether your binary has support for callouts by running it with the --help option. If callout support is -completely disabled, all callouts in patterns are ignored by pcre2grep. +completely disabled, callouts in patterns are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored.

    @@ -1015,9 +1025,9 @@

    pcre2grep man page

    zero-terminated string, which means it should not contain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the --output (-O) option (see -above). However, $0 cannot be used to insert a matched substring because the -match is still in progress. Instead, the single character '0' is inserted. Any -syntax errors in the string (for example, a dollar not followed by another +above). However, $0 or $& cannot be used to insert a matched substring because +the match is still in progress. Instead, the single character '0' is inserted. +Any syntax errors in the string (for example, a dollar not followed by another character) causes the callout to be ignored. No terminator is added to the output string, so if you want a newline, you must include it explicitly using the escape $n. For example: @@ -1047,9 +1057,9 @@

    pcre2grep man page

    Any substring (including the executable name) may contain escape sequences started by a dollar character. These are the same as for the --output -(-O) option documented above, except that $0 cannot insert the matched -string because the match is still in progress. Instead, the character '0' -is inserted. If you need a literal dollar or pipe character in any +(-O) option documented above, except that $0 or $& cannot insert the +matched string because the match is still in progress. Instead, the character +'0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. Here is an example:
       echo -e "abcde\n12345" | pcre2grep \
    @@ -1116,7 +1126,7 @@ 

    pcre2grep man page


    REVISION

    -Last updated: 22 December 2023 +Last updated: 04 February 2025
    Copyright © 1997-2023 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2jit.html b/mingw64/share/doc/pcre2/html/pcre2jit.html index d97d8003ccb..6835cd8898a 100644 --- a/mingw64/share/doc/pcre2/html/pcre2jit.html +++ b/mingw64/share/doc/pcre2/html/pcre2jit.html @@ -64,7 +64,7 @@

    pcre2jit man page

    If --enable-jit is set on an unsupported platform, compilation fails.

    -A client program can tell if JIT support is available by calling +A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular @@ -72,11 +72,19 @@

    pcre2jit man page

    items that are not supported by JIT (see below). Another reason is that in some environments JIT is unable to get -memory in which to build its compiled code. The only guarantee from +executable memory in which to build its compiled code. The only guarantee from pcre2_config() is that if it returns zero, JIT will definitely not be used.

    +As of release 10.45 there is a more informative way to test for JIT support. If +pcre2_jit_compile() is called with the single option PCRE2_JIT_TEST_ALLOC +it returns zero if JIT is available and has a working allocator. Otherwise it +returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable +memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled. The +code argument is ignored, so it can be a NULL value. +

    +

    A simple program does not need to check availability in order to use JIT when possible. The API is implemented in a way that falls back to the interpretive code if JIT is not available or cannot be used for a given match. For programs @@ -126,7 +134,8 @@

    pcre2jit man page

    PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial matching. If pcre2_jit_compile() is called with no option bits set, it immediately -returns zero. This is an alternative way of testing whether JIT is available. +returns zero. This is an alternative way of testing whether JIT support has +been compiled.

    At present, it is not possible to free JIT compiled code except when the entire @@ -487,7 +496,7 @@

    pcre2jit man page


    REVISION

    -Last updated: 21 February 2024 +Last updated: 22 August 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2limits.html b/mingw64/share/doc/pcre2/html/pcre2limits.html index 8152ed22d71..514c50b2396 100644 --- a/mingw64/share/doc/pcre2/html/pcre2limits.html +++ b/mingw64/share/doc/pcre2/html/pcre2limits.html @@ -96,7 +96,7 @@

    pcre2limits man page

    REVISION

    -Last updated: August 2023 +Last updated: 16 August 2023
    Copyright © 1997-2023 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2matching.html b/mingw64/share/doc/pcre2/html/pcre2matching.html index 3b8b629380c..4d0232507b6 100644 --- a/mingw64/share/doc/pcre2/html/pcre2matching.html +++ b/mingw64/share/doc/pcre2/html/pcre2matching.html @@ -27,7 +27,7 @@

    pcre2matching man page

    This document describes the two different algorithms that are available in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() -function. This works in the same as Perl's matching function, and provide a +function. This works in the same way as Perl's matching function, and provides a Perl-compatible matching operation. The just-in-time (JIT) optimization that is described in the pcre2jit @@ -42,7 +42,7 @@

    pcre2matching man page

    When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, -when there are multiple possibilities. For example, if the pattern +when there are multiple possibilities. For example, if the anchored pattern

       ^<.*>
     
    @@ -115,9 +115,9 @@

    pcre2matching man page

    Note that the size of vector needed to contain all the results depends on the -number of simultaneous matches, not on the number of parentheses in the -pattern. Using pcre2_match_data_create_from_pattern() to create the match -data block is therefore not advisable when doing DFA matching. +number of simultaneous matches, not on the number of capturing parentheses in +the pattern. Using pcre2_match_data_create_from_pattern() to create the +match data block is therefore not advisable when doing DFA matching.

    Note also that all the matches that are found start at the same point in the @@ -166,37 +166,43 @@

    pcre2matching man page

    do this. This means that no captured substrings are available.

    -3. Because no substrings are captured, backreferences within the pattern are -not supported. -

    -

    -4. For the same reason, conditional expressions that use a backreference as the -condition or test for a specific group recursion are not supported. -

    -

    -5. Again for the same reason, script runs are not supported. +3. Because no substrings are captured, a number of related features are not +available: +
    +
    +(a) Backreferences; +
    +
    +(b) Conditional expressions that use a backreference as the condition or test +for a specific group recursion; +
    +
    +(c) Script runs; +
    +
    +(d) Scan substring assertions.

    -6. Because many paths through the tree may be active, the \K escape sequence, +4. Because many paths through the tree may be active, the \K escape sequence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported.

    -7. Callouts are supported, but the value of the capture_top field is +5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0.

    -8. The \C escape sequence, which (in the standard algorithm) always matches a -single code unit, even in a UTF mode, is not supported in these modes, because +6. The \C escape sequence, which (in the standard algorithm) always matches a +single code unit, even in a UTF mode, is not supported in UTF modes because the alternative algorithm moves through the subject string one character (not code unit) at a time, for all active paths through the tree.

    -9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not +7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion.

    -10. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not +8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not supported by pcre2_dfa_match().


    ADVANTAGES OF THE ALTERNATIVE ALGORITHM
    @@ -223,15 +229,18 @@

    pcre2matching man page

    less susceptible to optimization.

    -2. Capturing parentheses, backreferences, script runs, and matching within -invalid UTF string are not supported. +2. Capturing parentheses and other features such as backreferences that rely on +them are not supported. +

    +

    +3. Matching within invalid UTF strings is not supported.

    -3. Although atomic groups are supported, their use does not provide the +4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm.

    -4. JIT optimization is not supported. +5. JIT optimization is not supported.


    AUTHOR

    @@ -244,7 +253,7 @@

    pcre2matching man page


    REVISION

    -Last updated: 19 January 2024 +Last updated: 30 August 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2partial.html b/mingw64/share/doc/pcre2/html/pcre2partial.html index 64116c4f20f..067064d90a1 100644 --- a/mingw64/share/doc/pcre2/html/pcre2partial.html +++ b/mingw64/share/doc/pcre2/html/pcre2partial.html @@ -399,7 +399,7 @@

    pcre2partial man page


    REVISION

    -Last updated: 04 September 2019 +Last updated: 27 November 2024
    Copyright © 1997-2019 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2pattern.html b/mingw64/share/doc/pcre2/html/pcre2pattern.html index cf50c1a1095..84eb0aa17c5 100644 --- a/mingw64/share/doc/pcre2/html/pcre2pattern.html +++ b/mingw64/share/doc/pcre2/html/pcre2pattern.html @@ -14,37 +14,41 @@

    pcre2pattern man page



    PCRE2 REGULAR EXPRESSION DETAILS

    @@ -52,9 +56,11 @@

    pcre2pattern man page

    are described in detail below. There is a quick-reference syntax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. -PCRE2 also supports some alternative regular expression syntax (which does not -conflict with the Perl syntax) in order to provide some compatibility with -regular expressions in Python, .NET, and Oniguruma. +PCRE2 also supports some alternative regular expression syntax that does not +conflict with the Perl syntax in order to provide some compatibility with +regular expressions in Python, .NET, and Oniguruma. There are in addition some +options that enable alternative syntax and semantics that are not the same as +in Perl.

    Perl's regular expressions are described in its own documentation, and regular @@ -74,7 +80,19 @@

    pcre2pattern man page

    pcre2matching page.

    -
    SPECIAL START-OF-PATTERN ITEMS
    +
    EBCDIC CHARACTER CODES
    +

    +Most computers use ASCII or Unicode for encoding characters, and PCRE2 assumes +this by default. However, it can be compiled to run in an environment that uses +the EBCDIC code, which is the case for some IBM mainframe operating systems. In +the sections below, character code values are ASCII or Unicode; in an EBCDIC +environment these characters may have different code values, and there are no +code points greater than 255. Differences in behaviour when PCRE2 is running in +an EBCDIC environment are described in the section +"EBCDIC environments" +below, which you can ignore unless you really are in an EBCDIC environment. +

    +
    SPECIAL START-OF-PATTERN ITEMS

    A number of options that can be passed to pcre2_compile() can also be set by special items at the start of a pattern. These are not Perl-compatible, but @@ -141,7 +159,8 @@

    pcre2pattern man page


    If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting -the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making quantifiers +the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_optimize() with +a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from making quantifiers possessive when what follows cannot match the repeated item. For example, by default a+b is treated as a++b. For more details, see the pcre2api @@ -152,8 +171,9 @@

    pcre2pattern man page


    If a pattern starts with (*NO_START_OPT), it has the same effect as setting the -PCRE2_NO_START_OPTIMIZE option. This disables several optimizations for quickly -reaching "no match" results. For more details, see the +PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_optimize() with +a PCRE2_START_OPTIMIZE_OFF directive. This disables several optimizations for +quickly reaching "no match" results. For more details, see the pcre2api documentation.

    @@ -162,7 +182,8 @@

    pcre2pattern man page


    If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect as -setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimizations that +setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_optimize() +with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables optimizations that apply to patterns whose top-level branches all start with .* (match any number of arbitrary characters). For more details, see the pcre2api @@ -275,14 +296,6 @@

    pcre2pattern man page

    (*BSR_ANYCRLF). For completeness, (*BSR_UNICODE) is also recognized, corresponding to PCRE2_BSR_UNICODE.

    -
    EBCDIC CHARACTER CODES
    -

    -PCRE2 can be compiled to run in an environment that uses EBCDIC as its -character code instead of ASCII or Unicode (typically a mainframe system). In -the sections below, character code values are ASCII or Unicode; in an EBCDIC -environment these characters may have different code values, and there are no -code points greater than 255. -


    CHARACTERS AND METACHARACTERS

    A regular expression is a pattern that is matched against a subject string from @@ -298,7 +311,10 @@

    pcre2pattern man page

    equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to -pcre2_compile() or set by (?r) within the pattern). +pcre2_compile() or set by (*CASELESS_RESTRICT) or (?r) within the +pattern). If the PCRE2_EXTRA_TURKISH_CASING option is in force (either passed +to pcre2_compile() or set by (*TURKISH_CASING) within the pattern), then +the 'i' letters are matched according to the rules of the Turkish and Azeri languages.

    The power of regular expressions comes from the ability to include wild cards, @@ -346,7 +362,7 @@

    pcre2pattern man page

    If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or -between a # outside a character class and the next newline, inclusive, are +between a # outside a character class and the next newline, inclusive, is ignored. An escaping backslash can be used to include a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are @@ -404,6 +420,14 @@

    pcre2pattern man page

    the pattern (that is, \E is assumed at the end). If the isolated \Q is inside a character class, this causes an error, because the character class is then not terminated by a closing square bracket. +

    +

    +Another difference from Perl is that any appearance of \Q or \E inside what +might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a +quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers +is inside \Q...\E, but not if the separating comma is. When not recognized as +a quantifier a sequence such as {\Q1\E,2} is treated as the literal string +"{1,2}".


    Non-printing characters @@ -424,17 +448,28 @@

    pcre2pattern man page

    \r carriage return (hex 0D) (but see below) \t tab (hex 09) \0dd character with octal code 0dd - \ddd character with octal code ddd, or backreference + \ddd character with octal code ddd, or back reference \o{ddd..} character with octal code ddd.. \xhh character with hex code hh \x{hhh..} character with hex code hhh.. \N{U+hhh..} character with Unicode hex code point hhh..
    -By default, after \x that is not followed by {, from zero to two hexadecimal -digits are read (letters can be in upper or lower case). Any number of -hexadecimal digits may appear between \x{ and }. If a character other than a -hexadecimal digit appears between \x{ and }, or if there is no terminating }, -an error occurs. +A description of how back references work is given +later, +following the discussion of +parenthesized groups. +

    +

    +By default, after \x that is not followed by {, one or two hexadecimal +digits are read (letters can be in upper or lower case). If the character that +follows \x is neither { nor a hexadecimal digit, an error occurs. This is +different from Perl's default behaviour, which generates a NUL character, but +is in line with the behaviour of Perl's 'strict' mode in re. +

    +

    +Any number of hexadecimal digits may appear between \x{ and }. If a character +other than a hexadecimal digit appears between \x{ and }, or if there is no +terminating }, an error occurs.

    Characters whose code points are less than 256 can be defined by either of the @@ -481,69 +516,54 @@

    pcre2pattern man page

    a compile-time error occurs.

    -When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, -\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c -escape is processed as specified for Perl in the perlebcdic document. The -only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], -^, _, or ?. Any other character provokes a compile-time error. The sequence -\c@ encodes character code 0; after \c the letters (in either case) encode -characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 -(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +For differences in the way some escapes behave in EBCDIC environments, +see section +"EBCDIC environments" +below.

    +
    +Octal escapes and back references +

    -Thus, apart from \c?, these escapes generate the same character code values as -they do in an ASCII environment, though the meanings of the values mostly -differ. For example, \cG always generates code value 7, which is BEL in ASCII -but DEL in EBCDIC. +The escape \o must be followed by a sequence of octal digits, enclosed in +braces. An error occurs if this is not the case. This escape provides a way of +specifying character code points as octal numbers greater than 0777, and it +also allows octal numbers and backreferences to be unambiguously distinguished.

    -The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but -because 127 is not a control character in EBCDIC, Perl makes it generate the -APC character. Unfortunately, there are several variants of EBCDIC. In most of -them the APC character has the value 255 (hex FF), but in the one Perl calls -POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC -values, PCRE2 makes \c? generate 95; otherwise it generates 255. +If braces are not used, after \0 up to two further octal digits are read. +However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one more octal digit +must follow \0 (use \00 to generate a NUL character). Make sure you supply +two digits after the initial zero if the pattern character that follows is +itself an octal digit.

    -After \0 up to two further octal digits are read. If there are fewer than two -digits, just those that are present are used. Thus the sequence \0\x\015 -specifies two binary zeros followed by a CR character (code value 13). Make -sure you supply two digits after the initial zero if the pattern character that -follows is itself an octal digit. +Inside a character class, when a backslash is followed by any octal digit, up +to three octal digits are read to generate a code point. Any subsequent digits +stand for themselves. The sequences \8 and \9 are treated as the literal +characters "8" and "9".

    -The escape \o must be followed by a sequence of octal digits, enclosed in -braces. An error occurs if this is not the case. This escape is a recent -addition to Perl; it provides way of specifying character code points as octal -numbers greater than 0777, and it also allows octal numbers and backreferences -to be unambiguously specified. +Outside a character class, Perl's handling of a backslash followed by a digit +other than 0 is complicated by ambiguity, and Perl has changed over time, +causing PCRE2 also to change. From PCRE2 release 10.45 there is an option +called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use Python's unambiguous +rules. The next two subsections describe the two sets of rules.

    For greater clarity and unambiguity, it is best to avoid following \ by a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical -character code points, and \g{...} to specify backreferences. The following -paragraphs describe the old, ambiguous syntax. -

    -

    -The handling of a backslash followed by a digit other than 0 is complicated, -and Perl has changed over time, causing PCRE2 also to change. -

    -

    -Outside a character class, PCRE2 reads the digit and any following digits as a -decimal number. If the number is less than 10, begins with the digit 8 or 9, or -if there are at least that many previous capture groups in the expression, the -entire sequence is taken as a backreference. A description of how this -works is given -later, -following the discussion of -parenthesized groups. -Otherwise, up to three octal digits are read to form a character code. +character code points, and \g{...} to specify backreferences.

    +
    +Perl rules for non-class backslash 1-9 +

    -Inside a character class, PCRE2 handles \8 and \9 as the literal characters -"8" and "9", and otherwise reads up to three octal digits following the -backslash, using them to generate a data character. Any subsequent digits stand -for themselves. For example, outside a character class: +All the digits that follow the backslash are read as a decimal number. If the +number is less than 10, begins with the digit 8 or 9, or if there are at least +that many previous capture groups in the expression, the entire sequence is +taken as a back reference. Otherwise, up to three octal digits are read to form +a character code. For example:

       \040   is another way of writing an ASCII space
       \40    is the same, provided there are fewer than 40 previous capture groups
    @@ -560,6 +580,19 @@ 

    pcre2pattern man page

    digits are ever read.


    +Python rules for non-class backslash 1-9 +
    +

    +If there are at least three octal digits after the backslash, exactly three are +read as an octal code point number, but the value must be no greater than +\377, even in modes where higher code point values are supported. Any +subsequent digits stand for themselves. If there are fewer than three octal +digits, the sequence is taken as a decimal back reference. Thus, for example, +\12 is always a back reference, independent of how many captures there are in +the pattern. An error is generated for a reference to a non-existent capturing +group. +

    +
    Constraints on character values

    @@ -805,7 +838,7 @@

    pcre2pattern man page

    sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing characters whose code points are -less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points +less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Unknown script and with an unassigned type.

    @@ -823,12 +856,33 @@

    pcre2pattern man page

    \P{xx} a character without the xx property \X a Unicode extended grapheme cluster
    -The property names represented by xx above are not case-sensitive, and in -accordance with Unicode's "loose matching" rules, spaces, hyphens, and -underscores are ignored. There is support for Unicode script names, Unicode -general category properties, "Any", which matches any character (including -newline), Bidi_Class, a number of binary (yes/no) properties, and some special -PCRE2 properties (described +For compatibility with Perl, negation can be specified by including a +circumflex between the opening brace and the property. For example, \p{^Lu} is +the same as \P{Lu}. +

    +

    +In accordance with Unicode's "loose matching" rules, ASCII white space +characters, hyphens, and underscores are ignored in the properties represented +by xx above. As well as the space character, ASCII white space can be +tab, linefeed, vertical tab, formfeed, or carriage return. +

    +

    +Some properties are specified as a name only; others as a name and a value, +separated by a colon or an equals sign. The names and values consist of ASCII +letters and digits (with one Perl-specific exception, see below). They are not +case sensitive. Note, however, that the escapes themselves, \p and \P, +are case sensitive. There are abbreviations for many names. The following +examples are all equivalent: +

    +  \p{bidiclass=al}
    +  \p{BC=al}
    +  \p{ Bidi_Class : AL }
    +  \p{ Bi-di class = Al }
    +  \P{ ^ Bi-di class = Al }
    +
    +There is support for Unicode script names, Unicode general category properties, +"Any", which matches any character (including newline), Bidi_Class, a number of +binary (yes/no) properties, and some special PCRE2 properties (described below). Certain other Perl properties such as "InMusicalSymbols" are not supported by PCRE2. Note that \P{Any} does not match any characters, so always causes a @@ -844,10 +898,11 @@

    pcre2pattern man page

    example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and "script extensions" for the -property types are recognized, and a equals sign is an alternative to the -colon. If a script name is given without a property type, for example, -\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this -interpretation at release 5.26 and PCRE2 changed at release 10.40. +property types are recognized and, as for all property specifications, an +equals sign is an alternative to the colon. If a script name is given without a +property type, for example, \p{Adlam}, it is treated as \p{scx:Adlam}. Perl +changed to this interpretation at release 5.26 and PCRE2 changed at release +10.40.

    Unassigned characters (and in non-UTF 32-bit mode, characters with code points @@ -865,15 +920,10 @@

    pcre2pattern man page


    Each character has exactly one Unicode general category property, specified by -a two-letter abbreviation. For compatibility with Perl, negation can be -specified by including a circumflex between the opening brace and the property -name. For example, \p{^Lu} is the same as \P{Lu}. -

    -

    -If only one letter is specified with \p or \P, it includes all the general -category properties that start with that letter. In this case, in the absence -of negation, the curly brackets in the escape sequence are optional; these two -examples have the same effect: +a two-letter abbreviation. If only one letter is specified with \p or \P, it +includes all the general category properties that start with that letter. In +this case, in the absence of negation, the curly brackets in the escape +sequence are optional; these two examples have the same effect:

       \p{L}
       \pL
    @@ -888,6 +938,7 @@ 

    pcre2pattern man page

    Cs Surrogate L Letter + Lc Cased letter Ll Lower case letter Lm Modifier letter Lo Other letter @@ -924,9 +975,13 @@

    pcre2pattern man page

    Zp Paragraph separator Zs Space separator
    -The special property LC, which has the synonym L&, is also supported: it -matches a character that has the Lu, Ll, or Lt property, in other words, a -letter that is not classified as a modifier or "other". +Perl originally used the name L& for the Lc property. This is still supported +by Perl, but discouraged. PCRE2 also still supports it. This property matches +any character that has the Lu, Ll, or Lt property, in other words, any letter +that is not classified as a modifier or "other". From release 10.45 of PCRE2 +the properties Lu, Ll, and Lt are all treated as Lc when case-independent +matching is set by the PCRE2_CASELESS option or (?i) within the pattern. The +other properties are not affected by caseless matching.

    The Cs (Surrogate) property applies only to characters whose code points are in @@ -948,11 +1003,6 @@

    pcre2pattern man page

    Instead, this property is assumed for any code point that is not in the Unicode table.

    -

    -Specifying caseless matching does not affect these escape sequences. For -example, \p{Lu} always matches only upper case letters. This is different from -the behaviour of current versions of Perl. -


    Binary (yes/no) properties for \p and \P
    @@ -997,10 +1047,11 @@

    pcre2pattern man page

    RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space
    -An equals sign may be used instead of a colon. The class names are -case-insensitive; only the short names listed above are recognized. +As in all property specifications, an equals sign may be used instead of a +colon and the class names are case-insensitive. Only the short names listed +above are recognized; PCRE2 does not at present support any long alternatives.


    Extended grapheme clusters @@ -1073,11 +1124,11 @@

    pcre2pattern man page

    Xan matches characters that have either the L (letter) or the N (number) property. Xps matches the characters tab, linefeed, vertical tab, form feed, or -carriage return, and any other character that has the Z (separator) property. -Xsp is the same as Xps; in PCRE1 it used to exclude vertical tab, for Perl -compatibility, but Perl changed. Xwd matches the same characters as Xan, plus -those that match Mn (non-spacing mark) or Pc (connector punctuation, which -includes underscore). +carriage return, and any other character that has the Z (separator) property +(this includes the space character). Xsp is the same as Xps; in PCRE1 it used +to exclude vertical tab, for Perl compatibility, but Perl changed. Xwd matches +the same characters as Xan, plus those that match Mn (non-spacing mark) or Pc +(connector punctuation, which includes underscore).

    There is another non-standard property, Xuc, which matches any character that @@ -1389,13 +1440,12 @@

    pcre2pattern man page

    character, or escape it with a backslash.

    -For example, the character class [aeiou] matches any lower case vowel, while -[^aeiou] matches any character that is not a lower case vowel. Note that a -circumflex is just a convenient notation for specifying the characters that -are in the class by enumerating those that are not. A class that starts with a -circumflex is not an assertion; it still consumes a character from the subject -string, and therefore it fails if the current pointer is at the end of the -string. +For example, the character class [aeiou] matches any lower case English vowel, +whereas [^aeiou] matches all other characters. Note that a circumflex is just a +convenient notation for specifying the characters that are in the class by +enumerating those that are not. A class that starts with a circumflex is not an +assertion; it still consumes a character from the subject string, and therefore +it fails to match if the current pointer is at the end of the string.

    Characters in a class may be specified by their code points using \o, \x, or @@ -1405,7 +1455,10 @@

    pcre2pattern man page

    match "A", whereas a caseful version would. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) -respectively when either PCRE2_UTF or PCRE2_UCP is set. +respectively when either PCRE2_UTF or PCRE2_UCP is set. If you do not want +these ASCII/non-ASCII case equivalences, you can suppress them by setting +PCRE2_EXTRA_CASELESS_RESTRICT, either as an option in a compile context, or by +including (*CASELESS_RESTRICT) or (?r) within a pattern.

    Characters that might indicate line breaks are never treated in any special way @@ -1437,6 +1490,12 @@

    pcre2pattern man page

    b to d, a hyphen character, or z.

    +There is some special treatment for alphabetic ranges in EBCDIC environments; +see the section +"EBCDIC environments" +below. +

    +

    Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d or \H. However, unless the hyphen is the last character in the class, Perl outputs a @@ -1448,9 +1507,9 @@

    pcre2pattern man page

    range. A pattern such as [W-]46] is interpreted as a class of two characters ("W" and "-") followed by a literal string "46]", so it would match "W46]" or "-46]". However, if the "]" is escaped with a backslash it is interpreted as -the end of range, so [W-\]46] is interpreted as a class containing a range -followed by two other characters. The octal or hexadecimal representation of -"]" can also be used to end a range. +the end of a range, so [W-\]46] is interpreted as a class containing a range +and two other characters. The octal or hexadecimal representation of "]" can +also be used to end a range.

    Ranges normally include all code points between the start and end characters, @@ -1463,15 +1522,6 @@

    pcre2pattern man page

    surrogates, are always permitted.

    -There is a special case in EBCDIC environments for ranges whose end points are -both specified as literal letters in the same case. For compatibility with -Perl, EBCDIC code points within the range that are not letters are omitted. For -example, [h-k] matches only four characters, even though the codes for h and k -are 0x88 and 0x92, a range of 11 code points. However, if the range is -specified numerically, for example, [\x88-\x92] or [h-\x92], all code points -are included. -

    -

    If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character @@ -1487,18 +1537,132 @@

    pcre2pattern man page

    something AND NOT ...".

    -The only metacharacters that are recognized in character classes are backslash, -hyphen (only where it can be interpreted as specifying a range), circumflex -(only at the start), opening square bracket (only when it can be interpreted as -introducing a POSIX class name, or for a special compatibility feature - see -the next two sections), and the terminating closing square bracket. However, -escaping other non-alphanumeric characters does no harm. +The metacharacters that are recognized in character classes are backslash, +hyphen (when it can be interpreted as specifying a range), circumflex +(only at the start), and the terminating closing square bracket. An opening +square bracket is also special when it can be interpreted as introducing a +POSIX class (see +"POSIX character classes" +below), or a special compatibility feature (see +"Compatibility feature for word boundaries" +below). Escaping any non-alphanumeric character in a class turns it into a +literal, whether or not it would otherwise be a metacharacter. +

    +
    PERL EXTENDED CHARACTER CLASSES
    +

    +From release 10.45 PCRE2 supports Perl's (?[...]) extended character class +syntax. This can be used to perform set operations such as intersection on +character classes. +

    +

    +The syntax permitted within (?[...]) is quite different to ordinary character +classes. Inside the extended class, there is an expression syntax consisting of +"atoms", operators, and ordinary parentheses "()" used for grouping. Such +classes always have the Perl /xx modifier (PCRE2 option PCRE2_EXTENDED_MORE) +turned on within them. This means that literal space and tab characters are +ignored everywhere in the class. +

    +

    +The allowed atoms are individual characters specified by escape sequences such +as \n or \x{123}, character types such as \d, POSIX classes such as +[:alpha:], and nested ordinary (non-extended) character classes. For example, +in (?[\d & [...]]) the nested class [...] follows the usual rules for ordinary +character classes, in which parentheses are not metacharacters, and character +literals and ranges are permitted. +

    +

    +Character literals and ranges may not appear outside a nested ordinary +character class because they are not atoms in the extended syntax. The extended +syntax does not introduce any additional escape sequences, so (?[\y]) is an +unknown escape, as it would be in [\y]. +

    +

    +In the extended syntax, ^ does not negate a class (except within an +ordinary class nested inside an extended class); it is instead a binary +operator. +

    +

    +The binary operators are "&" (intersection), "|" or "+" (union), "-" +(subtraction) and "^" (symmetric difference). These are left-associative and +"&" has higher (tighter) precedence, while the others have equal lower +precedence. The one prefix unary operator is "!" (complement), with highest +precedence. +

    +
    UTS#18 EXTENDED CHARACTER CLASSES
    +

    +The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's (?[...]) +syntax, allowing instead extended class behaviour inside ordinary [...] +character classes. This altered syntax for [...] classes is loosely described +by the Unicode standard UTS#18. The PCRE2_ALT_EXTENDED_CLASS option does not +prevent use of (?[...]) classes; it just changes the meaning of all +[...] classes that are not nested inside a Perl (?[...]) class. +

    +

    +Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is a +character class with two literal characters "a" and "[", but in UTS#18 extended +classes the "[" character becomes an additional metacharacter within classes, +denoting the start of a nested class, so a literal "[" must be escaped as "\[". +

    +

    +Secondly, within the UTS#18 extended syntax, there are operators "||", "&&", +"--" and "~~" which denote character class union, intersection, subtraction, +and symmetric difference respectively. In standard Perl syntax, these would +simply be needlessly-repeated literals (except for "--" which could be the +start or end of a range). In UTS#18 extended classes these operators can be used +in constructs such as [\p{L}--[QW]] for "Unicode letters, other than Q and W". +A literal "-" at the start or end of a range must be escaped, so while "[--1]" +in Perl syntax is the range from hyphen to "1", it must be escaped as "[\--1]" +in UTS#18 extended classes. +

    +

    +Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option to +ignore space and tab characters is not automatically enabled for UTS#18 +extended classes, but it is honoured if set. +

    +

    +Extended UTS#18 classes can be nested, and nested classes are themselves +extended classes (unlike Perl, where nested classes must be simple classes). +For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any letter that is in +the Thai or Greek scripts. Note that this means that no special grouping +characters (such as the parentheses used in Perl's (?[...]) class syntax) are +needed. +

    +

    +Individual class items (literal characters, literal ranges, properties such as +\d or \p{...}, and nested classes) can be combined by juxtaposition or by an +operator. Juxtaposition is the implicit union operator, and binds more tightly +than any explicit operator. Thus a sequence of literals and/or ranges behaves +as if it is enclosed in square brackets. For example, [A-Z0-9&&[^E8]] is the +same as [[A-Z0-9]&&[^E8]], which matches any upper case alphanumeric character +except "E" or "8". +

    +

    +Precedence between the explicit operators is not defined, so mixing operators +is a syntax error. For example, [A&&B--C] is an error, but [A&&[B--C]] is +valid.

    -
    POSIX CHARACTER CLASSES
    +

    +This is an emerging syntax which is being adopted gradually across the regex +ecosystem: for example JavaScript adopted the "/v" flag in ECMAScript 2024; +Python's "re" module reserves the syntax for future use with a FutureWarning +for unescaped use of "[" as a literal within character classes. Due to UTS#18 +providing insufficient guidance, engines interpret the syntax differently. +Rust's "regex" crate and Python's "regex" PyPI module both implement UTS#18 +extended classes, but with slight incompatibilities ([A||B&&C] is parsed as +[A||[B&&C]] in Python's "regex" but as [[A||B]&&C] in Rust's "regex"). +

    +

    +PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v flag, so +that all the UTS#18 extended classes accepted as valid by PCRE2 have the +property that they are interpreted either with the same behaviour, or as +invalid, by all other major engines. Please file an issue if you are aware of +cross-engine differences in behaviour between PCRE2 and another major engine. +

    +
    POSIX CHARACTER CLASSES

    Perl supports the POSIX notation for character classes. This uses names enclosed by [: and :] within the enclosing square brackets. PCRE2 also supports -this notation. For example, +this notation, in both ordinary and extended classes. For example,

       [01[:alpha:]%]
     
    @@ -1584,7 +1748,7 @@

    pcre2pattern man page

    [:xdigit:] In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This -is a change that was made in PCRE release 10.43 for Perl compatibility. +is a change that was made in PCRE2 release 10.43 for Perl compatibility.

    The other POSIX classes are unchanged by PCRE2_UCP, and match only characters @@ -1597,8 +1761,8 @@

    pcre2pattern man page

    (?aT) and (?-aT). The PCRE2_EXTRA_ASCII_POSIX option disables UCP processing for all POSIX classes, including [:digit:] and [:xdigit:]. Within a pattern, (?aP) and (?-aP) set and unset both these options for consistency. -

    -
    COMPATIBILITY FEATURE FOR WORD BOUNDARIES
    +

    +
    COMPATIBILITY FEATURE FOR WORD BOUNDARIES

    In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of @@ -1619,7 +1783,7 @@

    pcre2pattern man page

    PCRE2_UCP option changes the meaning of \w (and therefore \b) by default, so it also affects these POSIX sequences.

    -
    VERTICAL BAR
    +
    VERTICAL BAR

    Vertical bar characters are used to separate alternative patterns. For example, the pattern @@ -1634,7 +1798,7 @@

    pcre2pattern man page

    "succeeds" means matching the rest of the main pattern as well as the alternative in the group.

    -
    INTERNAL OPTION SETTING
    +
    INTERNAL OPTION SETTING

    The settings of several options can be changed within a pattern by a sequence of letters enclosed between "(?" and ")". The following are Perl-compatible, @@ -1732,7 +1896,7 @@

    pcre2pattern man page

    the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, which lock out the use of the (*UTF) and (*UCP) sequences.

    -
    GROUPS
    +
    GROUPS

    Groups are delimited by parentheses (round brackets), which can be nested. Turning part of a pattern into a group does two things: @@ -1788,7 +1952,7 @@

    pcre2pattern man page

    reached, an option setting in one branch does affect subsequent branches, so the above patterns match "SUNDAY" as well as "Saturday".

    -
    DUPLICATE GROUP NUMBERS
    +
    DUPLICATE GROUP NUMBERS

    Perl 5.10 introduced a feature whereby each alternative in a group uses the same numbers for its capturing parentheses. Such a group starts with (?| and is @@ -1834,7 +1998,7 @@

    pcre2pattern man page

    An alternative approach to using this "branch reset" feature is to use duplicate named groups, as described in the next section.

    -
    NAMED CAPTURE GROUPS
    +
    NAMED CAPTURE GROUPS

    Identifying capture groups by number is simple, but it can be very hard to keep track of the numbers in complicated patterns. Furthermore, if an expression is @@ -1954,7 +2118,7 @@

    pcre2pattern man page

    pcre2api documentation.

    -
    REPETITION
    +
    REPETITION

    Repetition is specified by quantifiers, which may follow any one of these items: @@ -2118,8 +2282,9 @@

    pcre2pattern man page

    (?>.*?a)b It matches "ab" in the subject "aab". The use of the backtracking control verbs -(*PRUNE) and (*SKIP) also disable this optimization, and there is an option, -PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. +(*PRUNE) and (*SKIP) also disable this optimization. To do so explicitly, +either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, or call +pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive.

    When a capture group is repeated, the value captured is the substring that @@ -2135,7 +2300,7 @@

    pcre2pattern man page

    matches "aba" the value of the second captured substring is "b".

    -
    ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
    +
    ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS

    With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") repetition, failure of what follows normally causes the repeated item to be @@ -2216,8 +2381,9 @@

    pcre2pattern man page

    PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. -This feature can be disabled by the PCRE2_NO_AUTOPOSSESS option, or starting -the pattern with (*NO_AUTO_POSSESS). +This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, by calling +pcre2_set_optimize() with a PCRE2_AUTO_POSSESS_OFF directive, or by +starting the pattern with (*NO_AUTO_POSSESS).

    When a pattern contains an unlimited repeat inside a group that can itself be @@ -2245,7 +2411,7 @@

    pcre2pattern man page

    sequences of non-digits cannot be broken, and failure happens quickly.

    -
    BACKREFERENCES
    +
    BACKREFERENCES

    Outside a character class, a backslash followed by a digit greater than 0 (and possibly further digits) is a backreference to a capture group earlier (that @@ -2383,23 +2549,32 @@

    pcre2pattern man page

    This restriction no longer applies, and backtracking into such groups can occur as normal.

    -
    ASSERTIONS
    +
    ASSERTIONS

    -An assertion is a test on the characters following or preceding the current -matching point that does not consume any characters. The simple assertions -coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described +An assertion is a test that does not consume any characters. The test must +succeed for the match to continue. The simple assertions coded as \b, \B, +\A, \G, \Z, \z, ^ and $ are described above.

    -More complicated assertions are coded as parenthesized groups. There are two -kinds: those that look ahead of the current position in the subject string, and -those that look behind it, and in each case an assertion may be positive (must -match for the assertion to be true) or negative (must not match for the -assertion to be true). An assertion group is matched in the normal way, -and if it is true, matching continues after it, but with the matching position +More complicated assertions are coded as parenthesized groups. If matching such +a group succeeds, matching continues after it, but with the matching position in the subject string reset to what it was before the assertion was processed.

    +A special kind of assertion, called a "scan substring" assertion, matches a +subpattern against a previously captured substring. This is described in the +section entitled +"Scan substring assertions" +below. It is a PCRE2 extension, not compatible with Perl. +

    +

    +The other group-based assertions are of two kinds: those that look ahead of the +current position in the subject string, and those that look behind it, and in +each case an assertion may be positive (must match for the assertion to be +true) or negative (must not match for the assertion to be true). +

    +

    The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no backtracking into the assertion. However, there are some cases where non-atomic assertions can be @@ -2624,7 +2799,7 @@

    pcre2pattern man page

    is another pattern that matches "foo" preceded by three digits and any three characters that are not "999".

    -
    NON-ATOMIC ASSERTIONS
    +
    NON-ATOMIC ASSERTIONS

    Traditional lookaround assertions are atomic. That is, if an assertion is true, but there is a subsequent matching failure, there is no backtracking into the @@ -2683,8 +2858,67 @@

    pcre2pattern man page

    that assertions that appear as conditions for conditional groups (see below) must be atomic. +

    +
    SCAN SUBSTRING ASSERTIONS
    +

    +A special kind of assertion, not compatible with Perl, makes it possible to +check the contents of a captured substring by matching it with a subpattern. +Because this involves capturing, this feature is not supported by +pcre2_dfa_match(). +

    +

    +A scan substring assertion starts with the sequence (*scan_substring: or +(*scs: which is followed by a list of substring numbers (absolute or relative) +and/or substring names enclosed in single quotes or angle brackets, all within +parentheses. The rest of the item is the subpattern that is applied to the +substring, as shown in these examples: +

    +  (*scan_substring:(1)...)
    +  (*scs:(-2)...)
    +  (*scs:('AB')...)
    +  (*scs:(1,'AB',-2)...)
    +
    +The list of groups is checked in the order they are given, and it is the +contents of the first one that is found to be set that are scanned. When +PCRE2_DUPNAMES is set and there are ambiguous group names, all groups with the +same name are checked in numerical order. A scan substring assertion fails if +none of the groups it references have been set.

    -
    SCRIPT RUNS
    +

    +The pattern match on the substring is always anchored, that is, it must match +from the start of the substring. There is no "bumpalong" if it does not match +at the start. The end of the subject is temporarily reset to be the end of the +substring, so \Z, \z, and $ will match there. However, the start of the +subject is not reset. This means that ^ matches only if the substring is +actually at the start of the main subject, but it also means that lookbehind +assertions into what precedes the substring are possible. +

    +

    +Here is a very simple example: find a word that contains the rare (in English) +sequence of letters "rh" not at the start: +

    +  \b(\w++)(*scs:(1).+rh)
    +
    +The first group captures a word which is then scanned by the second group. +This example does not actually need this heavyweight feature; the same match +can be achieved with: +
    +  \b\w+?rh\w*\b
    +
    +When things are more complicated, however, scanning a captured substring can be +a useful way to describe the required match. For example, there is a rather +complicated pattern in the PCRE2 test data that checks an entire subject string +for a palindrome, that is, the sequence of letters is the same in both +directions. Suppose you want to search for individual words of two or more +characters such as "level" that are palindromes: +
    +  (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...)
    +
    +Within a substring scanning subpattern, references to other groups work as +normal. Capturing groups may appear, and will retain their values during +ongoing matching if the assertion succeeds. +

    +
    SCRIPT RUNS

    In concept, a script run is a sequence of characters that are all from the same Unicode script such as Latin or Greek. However, because some scripts are @@ -2746,7 +2980,7 @@

    pcre2pattern man page

    should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking.

    -
    CONDITIONAL GROUPS
    +
    CONDITIONAL GROUPS

    It is possible to cause the matching process to obey a pattern fragment conditionally or to choose between two alternative fragments, depending on @@ -2947,13 +3181,13 @@

    pcre2pattern man page

    assertion, whether it succeeds or fails. (Compare non-conditional assertions, for which captures are retained only for positive assertions that succeed.)

    -
    COMMENTS
    +
    COMMENTS

    There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related characters such as -(?: or a group name or number. The characters that make up a comment play -no part in the pattern matching. +(?: or a group name or number or a Unicode property name. The characters that +make up a comment play no part in the pattern matching.

    The sequence (?# marks the start of a comment that continues up to the next @@ -2977,7 +3211,7 @@

    pcre2pattern man page

    it does not terminate the comment. Only an actual character with the code value 0x0a (the default newline) does so.

    -
    RECURSIVE PATTERNS
    +
    RECURSIVE PATTERNS

    Consider the problem of matching a string in parentheses, allowing for unlimited nested parentheses. Without the use of recursion, the best that can @@ -3165,7 +3399,7 @@

    pcre2pattern man page

    "b" and so the whole match succeeds. This match used to fail in Perl, but in later versions (I tried 5.024) it now works.

    -
    GROUPS AS SUBROUTINES
    +
    GROUPS AS SUBROUTINES

    If the syntax for a recursive group call (either by number or by name) is used outside the parentheses to which it refers, it operates a bit like a subroutine @@ -3213,7 +3447,7 @@

    pcre2pattern man page

    "Backtracking verbs in subroutines" below.

    -
    ONIGURUMA SUBROUTINE SYNTAX
    +
    ONIGURUMA SUBROUTINE SYNTAX

    For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative @@ -3231,7 +3465,7 @@

    pcre2pattern man page

    Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not synonymous. The former is a backreference; the latter is a subroutine call.

    -
    CALLOUTS
    +
    CALLOUTS

    Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl code to be obeyed in the middle of matching a regular expression. This makes it @@ -3244,7 +3478,9 @@

    pcre2pattern man page

    function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). If no match context is passed, or if the callout -entry point is set to NULL, callouts are disabled. +entry point is set to NULL, callout points will be passed over silently during +matching. To disallow callouts in the pattern syntax, you may use the +PCRE2_EXTRA_NEVER_CALLOUT option.

    Within a regular expression, (?C<arg>) indicates a point at which the external @@ -3307,7 +3543,7 @@

    pcre2pattern man page

    The doubling is removed before the string is passed to the callout function.

    -
    BACKTRACKING CONTROL
    +
    BACKTRACKING CONTROL

    There are a number of special "Backtracking Control Verbs" (to use Perl's terminology) that modify the behaviour of backtracking during matching. They @@ -3347,8 +3583,8 @@

    pcre2pattern man page

    Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the traditional matching -function, because that uses a backtracking algorithm. With the exception of -(*FAIL), which behaves like a failing negative assertion, the backtracking +function or JIT, because they use backtracking algorithms. With the exception +of (*FAIL), which behaves like a failing negative assertion, the backtracking control verbs cause an error if encountered by the DFA matching function.

    @@ -3369,7 +3605,8 @@

    pcre2pattern man page

    present. When one of these optimizations bypasses the running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option -when calling pcre2_compile(), or by starting the pattern with +when calling pcre2_compile(), by calling pcre2_set_optimize() with a +PCRE2_START_OPTIMIZE_OFF directive, or by starting the pattern with (*NO_START_OPT). There is more discussion of this option in the section entitled "Compiling a pattern" @@ -3502,7 +3739,8 @@

    pcre2pattern man page

    If you are interested in (*MARK) values after failed matches, you should -probably set the PCRE2_NO_START_OPTIMIZE option +probably either set the PCRE2_NO_START_OPTIMIZE option or call +pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see above) to ensure that the match is always attempted.

    @@ -3514,9 +3752,9 @@

    pcre2pattern man page

    with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, backtracking cannot pass to the left of the verb. However, when one of these verbs appears inside an -atomic group or in a lookaround assertion that is true, its effect is confined -to that group, because once the group has been matched, there is never any -backtracking into it. Backtracking from beyond an assertion or an atomic group +atomic group or in an atomic lookaround assertion that is true, its effect is +confined to that group, because once the group has been matched, there is never +any backtracking into it. Backtracking from beyond an atomic assertion or group ignores the entire group, and seeks a preceding backtracking point.

    @@ -3782,9 +4020,11 @@

    pcre2pattern man page

    assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.

    -PCRE2 now supports non-atomic positive assertions, as described in the section -entitled +PCRE2 now supports non-atomic positive assertions and also "scan substring" +assertions, as described in the sections entitled "Non-atomic assertions" +and +"Scan substring assertions" above. These assertions must be standalone (not used as conditions). They are not Perl-compatible. For these assertions, a later backtrack does jump back into the assertion, and therefore verbs such as (*COMMIT) can be triggered by @@ -3793,7 +4033,8 @@

    pcre2pattern man page

    The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion to be false, -and a negative assertion to be true. +and a negative assertion to be true. This behaviour differs from Perl when the +assertion has only one branch.

    The other backtracking verbs are not treated specially if they appear in a @@ -3829,13 +4070,57 @@

    pcre2pattern man page

    enclosing group that has alternatives (its normal behaviour). However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. +

    +
    EBCDIC ENVIRONMENTS
    +

    +Differences in the way PCRE2 behaves when it is running in an EBCDIC environment +are covered in this section. +

    +
    +Escape sequences +
    +

    +When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, +\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c +escape is processed as specified for Perl in the perlebcdic document. The +only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], +^, _, or ?. Any other character provokes a compile-time error. The sequence +\c@ encodes character code 0; after \c the letters (in either case) encode +characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 +(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +

    +

    +Thus, apart from \c?, these escapes generate the same character code values as +they do in an ASCII or Unicode environment, though the meanings of the values +mostly differ. For example, \cG always generates code value 7, which is BEL in +ASCII but DEL in EBCDIC. +

    +

    +The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but +because 127 is not a control character in EBCDIC, Perl makes it generate the +APC character. Unfortunately, there are several variants of EBCDIC. In most of +them the APC character has the value 255 (hex FF), but in the one Perl calls +POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC +values, PCRE2 makes \c? generate 95; otherwise it generates 255. +

    +
    +Character classes +
    +

    +In character classes there is a special case in EBCDIC environments for ranges +whose end points are both specified as literal letters in the same case. For +compatibility with Perl, EBCDIC code points within the range that are not +letters are omitted. For example, [h-k] matches only four characters, even +though the EBCDIC codes for h and k are 0x88 and 0x92, a range of 11 code +points. However, if the range is specified numerically, for example, +[\x88-\x92] or [h-\x92], all code points are included.

    -
    SEE ALSO
    +
    SEE ALSO

    pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3).

    -
    AUTHOR
    +
    AUTHOR

    Philip Hazel
    @@ -3844,9 +4129,9 @@

    pcre2pattern man page

    Cambridge, England.

    -
    REVISION
    +
    REVISION

    -Last updated: 04 June 2024 +Last updated: 27 November 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2perform.html b/mingw64/share/doc/pcre2/html/pcre2perform.html index 55fdf202fc4..b595119ba88 100644 --- a/mingw64/share/doc/pcre2/html/pcre2perform.html +++ b/mingw64/share/doc/pcre2/html/pcre2perform.html @@ -271,7 +271,7 @@

    pcre2perform man page


    REVISION

    -Last updated: 27 July 2022 +Last updated: 06 December 2022
    Copyright © 1997-2022 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2posix.html b/mingw64/share/doc/pcre2/html/pcre2posix.html index 6e7abd932ab..bc60c3b798c 100644 --- a/mingw64/share/doc/pcre2/html/pcre2posix.html +++ b/mingw64/share/doc/pcre2/html/pcre2posix.html @@ -171,7 +171,7 @@

    pcre2posix man page

    When a pattern that is compiled with this flag is passed to pcre2_regexec() for matching, the nmatch and pmatch arguments -are ignored, and no captured strings are returned. Versions of the PCRE library +are ignored, and no captured strings are returned. Versions of the PCRE2 library prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens because it disables the use of backreferences.
    @@ -370,7 +370,7 @@ 

    pcre2posix man page


    REVISION

    -Last updated: 19 January 2024 +Last updated: 27 November 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2sample.html b/mingw64/share/doc/pcre2/html/pcre2sample.html index 345df031131..0903f04f99b 100644 --- a/mingw64/share/doc/pcre2/html/pcre2sample.html +++ b/mingw64/share/doc/pcre2/html/pcre2sample.html @@ -101,7 +101,7 @@

    pcre2sample man page

    REVISION

    -Last updated: 02 February 2016 +Last updated: 14 November 2023
    Copyright © 1997-2016 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2serialize.html b/mingw64/share/doc/pcre2/html/pcre2serialize.html index 19418a83b21..d189bde2b63 100644 --- a/mingw64/share/doc/pcre2/html/pcre2serialize.html +++ b/mingw64/share/doc/pcre2/html/pcre2serialize.html @@ -203,7 +203,7 @@

    pcre2serialize man page


    REVISION

    -Last updated: 27 June 2018 +Last updated: 19 January 2024
    Copyright © 1997-2018 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2syntax.html b/mingw64/share/doc/pcre2/html/pcre2syntax.html index 1c0ccb003e2..46da3d71fcc 100644 --- a/mingw64/share/doc/pcre2/html/pcre2syntax.html +++ b/mingw64/share/doc/pcre2/html/pcre2syntax.html @@ -24,34 +24,41 @@

    pcre2syntax man page

  • SCRIPT MATCHING WITH \p AND \P
  • THE BIDI_CLASS PROPERTY FOR \p AND \P
  • CHARACTER CLASSES -
  • QUANTIFIERS -
  • ANCHORS AND SIMPLE ASSERTIONS -
  • REPORTED MATCH POINT SETTING -
  • ALTERNATION -
  • CAPTURING -
  • ATOMIC GROUPS -
  • COMMENT -
  • OPTION SETTING -
  • NEWLINE CONVENTION -
  • WHAT \R MATCHES -
  • LOOKAHEAD AND LOOKBEHIND ASSERTIONS -
  • NON-ATOMIC LOOKAROUND ASSERTIONS -
  • SCRIPT RUNS -
  • BACKREFERENCES -
  • SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) -
  • CONDITIONAL PATTERNS -
  • BACKTRACKING CONTROL -
  • CALLOUTS -
  • SEE ALSO -
  • AUTHOR -
  • REVISION +
  • PERL EXTENDED CHARACTER CLASSES +
  • QUANTIFIERS +
  • ANCHORS AND SIMPLE ASSERTIONS +
  • REPORTED MATCH POINT SETTING +
  • ALTERNATION +
  • CAPTURING +
  • ATOMIC GROUPS +
  • COMMENT +
  • OPTION SETTING +
  • NEWLINE CONVENTION +
  • WHAT \R MATCHES +
  • LOOKAHEAD AND LOOKBEHIND ASSERTIONS +
  • NON-ATOMIC LOOKAROUND ASSERTIONS +
  • SUBSTRING SCAN ASSERTION +
  • SCRIPT RUNS +
  • BACKREFERENCES +
  • SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) +
  • CONDITIONAL PATTERNS +
  • BACKTRACKING CONTROL +
  • CALLOUTS +
  • REPLACEMENT STRINGS +
  • SEE ALSO +
  • AUTHOR +
  • REVISION
    PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY

    -The full syntax and semantics of the regular expressions that are supported by -PCRE2 are described in the +The full syntax and semantics of the regular expression patterns that are +supported by PCRE2 are described in the pcre2pattern -documentation. This document contains a quick-reference summary of the syntax. +documentation. This document contains a quick-reference summary of the pattern +syntax followed by the syntax of replacement strings in the substitution function. +The full description of the latter is in the +pcre2api +documentation.


    QUOTING

    @@ -60,7 +67,10 @@

    pcre2syntax man page

    \Q...\E treat enclosed characters as literal
  • Note that white space inside \Q...\E is always treated as literal, even if -PCRE2_EXTENDED is set, causing most other white space to be ignored. +PCRE2_EXTENDED is set, causing most other white space to be ignored. Note also +that PCRE2's handling of \Q...\E has some differences from Perl's. See the +pcre2pattern +documentation for details.


    BRACED ITEMS

    @@ -91,6 +101,11 @@

    pcre2syntax man page

    \xhh character with hex code hh \x{hh..} character with hex code hh.. +\N{U+hh..} is synonymous with \x{hh..} but is not supported in environments +that use EBCDIC code (mainly IBM mainframes). Note that \N not followed by an +opening curly bracket has a different meaning (see below). +

    +

    If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized:

    @@ -98,7 +113,7 @@ 

    pcre2syntax man page

    \uhhhh character with hex code hhhh \u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX
    -When \x is not followed by {, from zero to two hexadecimal digits are read, +When \x is not followed by {, one or two hexadecimal digits are read, but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise it matches a literal "x". Likewise, if \u (in ALT_BSUX mode) is not followed by four hexadecimal digits @@ -112,9 +127,7 @@

    pcre2syntax man page

    in the pcre2pattern documentation, where details of escape processing in EBCDIC environments are -also given. \N{U+hh..} is synonymous with \x{hh..} in PCRE2 but is not -supported in EBCDIC environments. Note that \N not followed by an opening -curly bracket has a different meaning (see below). +also given.


    CHARACTER TYPES

    @@ -154,8 +167,9 @@

    pcre2syntax man page

    Property descriptions in \p and \P are matched caselessly; hyphens, -underscores, and white space are ignored, in accordance with Unicode's "loose -matching" rules. +underscores, and ASCII white space characters are ignored, in accordance with +Unicode's "loose matching" rules. For example, \p{Bidi_Class=al} is the same +as \p{ bidi class = AL }.


    GENERAL CATEGORY PROPERTIES FOR \p and \P

    @@ -168,13 +182,13 @@

    pcre2syntax man page

    Cs Surrogate L Letter + Lc Cased letter, the union of Ll, Lu, and Lt + L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter - Lc Ll, Lu, or Lt - L& Ll, Lu, or Lt M Mark Mc Spacing mark @@ -205,7 +219,9 @@

    pcre2syntax man page

    Zl Line separator Zp Paragraph separator Zs Space separator - + +From release 10.45, when caseless matching is set, Ll, Lu, and Lt are all +equivalent to Lc.


    PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P

    @@ -268,7 +284,7 @@

    pcre2syntax man page

    RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space


    CHARACTER CLASSES
    @@ -299,7 +315,45 @@

    pcre2syntax man page

    but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class.

    -
    QUANTIFIERS
    +

    +When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes may be +used, allowing nested character classes, combined using set operators. +

    +  [x&&[^y]]   UTS#18 extended character class
    +
    +  x||y        set union (OR)
    +  x&&y        set intersection (AND)
    +  x--y        set difference (AND NOT)
    +  x~~y        set symmetric difference (XOR)
    +
    +
    +

    +
    PERL EXTENDED CHARACTER CLASSES
    +

    +

    +  (?[...])                Perl extended character class
    +  (?[\p{Thai} & \p{Nd}])  operators; whitespace ignored
    +  (?[(x - y) & z])        parentheses for grouping
    +
    +  (?[ [^3] & \p{Nd} ])    [...] is a nested ordinary class
    +  (?[ [:alpha:] - [z] ])  POSIX set is allowed outside [...]
    +  (?[ \d - [3] ])         backslash-escaped set is allowed outside [...]
    +  (?[ !\n & [:ascii:] ])  backslash-escaped character is allowed outside [...]
    +                      all other characters or ranges must be enclosed in [...]
    +
    +  x|y, x+y                set union (OR)
    +  x&y                     set intersection (AND)
    +  x-y                     set difference (AND NOT)
    +  x^y                     set symmetric difference (XOR)
    +  !x                      set complement (NOT)
    +
    +Inside a Perl extended character class, [...] switches mode to be interpreted +as an ordinary character class. Outside of a nested [...], the only items +permitted are backslash-escapes, POSIX sets, operators, and parentheses. Inside +a nested ordinary class, ^ has its usual meaning (inverts the class when used +as the first character); outside of a nested class, ^ is the XOR operator. +

    +
    QUANTIFIERS

       ?           0 or 1, greedy
    @@ -323,7 +377,7 @@ 

    pcre2syntax man page

    {,m}? zero up to m, lazy

    -
    ANCHORS AND SIMPLE ASSERTIONS
    +
    ANCHORS AND SIMPLE ASSERTIONS

       \b          word boundary
    @@ -341,7 +395,7 @@ 

    pcre2syntax man page

    \G first matching position in subject

    -
    REPORTED MATCH POINT SETTING
    +
    REPORTED MATCH POINT SETTING

       \K          set reported start of match
    @@ -351,13 +405,13 @@ 

    pcre2syntax man page

    option is set, the previous behaviour is re-enabled. When this option is set, \K is honoured in positive assertions, but ignored in negative ones.

    -
    ALTERNATION
    +
    ALTERNATION

       expr|expr|expr...
     

    -
    CAPTURING
    +
    CAPTURING

       (...)           capture group
    @@ -372,20 +426,20 @@ 

    pcre2syntax man page

    in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In both cases, a name must not start with a digit.

    -
    ATOMIC GROUPS
    +
    ATOMIC GROUPS

       (?>...)         atomic non-capture group
       (*atomic:...)   atomic non-capture group
     

    -
    COMMENT
    +
    COMMENT

       (?#....)        comment (not nestable)
     

    -
    OPTION SETTING
    +
    OPTION SETTING

    Changes of these options within a group are automatically cancelled at the end of the group. @@ -409,7 +463,7 @@

    pcre2syntax man page

    (?^) unset imnrsx options
    (?aP) implies (?aT) as well, though this has no additional effect. However, it -means that (?-aP) is really (?-PT) which disables all ASCII restrictions for +means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes.

    @@ -421,20 +475,22 @@

    pcre2syntax man page

    The following are recognized only at the very start of a pattern or after one -of the newline or \R options with similar syntax. More than one of them may -appear. For the first three, d is a decimal number. -

    -  (*LIMIT_DEPTH=d) set the backtracking limit to d
    -  (*LIMIT_HEAP=d)  set the heap size limit to d * 1024 bytes
    -  (*LIMIT_MATCH=d) set the match limit to d
    -  (*NOTEMPTY)      set PCRE2_NOTEMPTY when matching
    -  (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching
    -  (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS)
    +of the newline or \R sequences or options with similar syntax. More than one
    +of them may appear. For the first three, d is a decimal number.
    +
    +  (*LIMIT_DEPTH=d)     set the backtracking limit to d
    +  (*LIMIT_HEAP=d)      set the heap size limit to d * 1024 bytes
    +  (*LIMIT_MATCH=d)     set the match limit to d
    +  (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching
    +  (*NOTEMPTY)          set PCRE2_NOTEMPTY when matching
    +  (*NOTEMPTY_ATSTART)  set PCRE2_NOTEMPTY_ATSTART when matching
    +  (*NO_AUTO_POSSESS)   no auto-possessification (PCRE2_NO_AUTO_POSSESS)
       (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
    -  (*NO_JIT)       disable JIT optimization
    -  (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE)
    -  (*UTF)          set appropriate UTF mode for the library in use
    -  (*UCP)          set PCRE2_UCP (use Unicode properties for \d etc)
    +  (*NO_JIT)            disable JIT optimization
    +  (*NO_START_OPT)      no start-match optimization (PCRE2_NO_START_OPTIMIZE)
    +  (*TURKISH_CASING)    set PCRE2_EXTRA_TURKISH_CASING when matching
    +  (*UTF)               set appropriate UTF mode for the library in use
    +  (*UCP)               set PCRE2_UCP (use Unicode properties for \d etc)
     
    Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or pcre2_dfa_match(), @@ -442,7 +498,7 @@

    pcre2syntax man page

    application can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.

    -
    NEWLINE CONVENTION
    +
    NEWLINE CONVENTION

    These are recognized only at the very start of the pattern or after option settings with a similar syntax. @@ -455,7 +511,7 @@

    pcre2syntax man page

    (*NUL) the NUL character (binary zero)

    -
    WHAT \R MATCHES
    +
    WHAT \R MATCHES

    These are recognized only at the very start of the pattern or after option setting with a similar syntax. @@ -464,7 +520,7 @@

    pcre2syntax man page

    (*BSR_UNICODE) any Unicode newline sequence

    -
    LOOKAHEAD AND LOOKBEHIND ASSERTIONS
    +
    LOOKAHEAD AND LOOKBEHIND ASSERTIONS

       (?=...)                     )
    @@ -490,7 +546,7 @@ 

    pcre2syntax man page

    (ultimate default 255). If every branch matches a fixed number of characters, the limit for each branch is 65535 characters.

    -
    NON-ATOMIC LOOKAROUND ASSERTIONS
    +
    NON-ATOMIC LOOKAROUND ASSERTIONS

    These assertions are specific to PCRE2 and are not Perl-compatible.

    @@ -503,7 +559,24 @@ 

    pcre2syntax man page

    (*non_atomic_positive_lookbehind:...) )

    -
    SCRIPT RUNS
    +
    SUBSTRING SCAN ASSERTION
    +

    +This feature is not Perl-compatible. +

    +  (*scan_substring:(grouplist)...)  scan captured substring
    +  (*scs:(grouplist)...)             scan captured substring
    +
    +The comma-separated list may identify groups in any of the following ways: +
    +  n       absolute reference
    +  +n      relative reference
    +  -n      relative reference
    +  <name>  name
    +  'name'  name
    +
    +
    +

    +
    SCRIPT RUNS

       (*script_run:...)           ) script run, can be backtracked into
    @@ -513,7 +586,7 @@ 

    pcre2syntax man page

    (*asr:...) )

    -
    BACKREFERENCES
    +
    BACKREFERENCES

       \n              reference by number (can be ambiguous)
    @@ -530,7 +603,7 @@ 

    pcre2syntax man page

    (?P=name) reference by name (Python)

    -
    SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
    +
    SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)

       (?R)            recurse whole pattern
    @@ -549,7 +622,7 @@ 

    pcre2syntax man page

    \g'-n' call subroutine by relative number (PCRE2 extension)

    -
    CONDITIONAL PATTERNS
    +
    CONDITIONAL PATTERNS

       (?(condition)yes-pattern)
    @@ -572,7 +645,7 @@ 

    pcre2syntax man page

    conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists.

    -
    BACKTRACKING CONTROL
    +
    BACKTRACKING CONTROL

    All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the name is mandatory, for the others it is optional. (*SKIP) changes its behaviour @@ -599,7 +672,7 @@

    pcre2syntax man page

    The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call.

    -
    CALLOUTS
    +
    CALLOUTS

       (?C)            callout (assumed number 0)
    @@ -610,12 +683,58 @@ 

    pcre2syntax man page

    start and the end), and the starting delimiter { matched with the ending delimiter }. To encode the ending delimiter within the string, double it.

    -
    SEE ALSO
    +
    REPLACEMENT STRINGS
    +

    +If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for +pcre2_substitute() is not interpreted. Otherwise, by default, the only +special character is the dollar character in one of the following forms: +

    +  $$                  insert a dollar character
    +  $n or ${n}          insert the contents of group n
    +  $<name>             insert the contents of named group
    +  $0 or $&            insert the entire matched substring
    +  $`                  insert the substring that precedes the match
    +  $'                  insert the substring that follows the match
    +  $_                  insert the entire input string
    +  $*MARK or ${*MARK}  insert a control verb name
    +
    +For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is set, +there is additional interpretation: +

    +

    +1. Backslash is an escape character, and the forms described in "ESCAPED +CHARACTERS" above are recognized. Also: +

    +  \Q...\E   can be used to suppress interpretation
    +  \l        force the next character to lower case
    +  \u        force the next character to upper case
    +  \L        force subsequent characters to lower case
    +  \U        force subsequent characters to upper case
    +  \u\L      force next character to upper case, then all lower
    +  \l\U      force next character to lower case, then all upper
    +  \E        end \L or \U case forcing
    +  \b        backspace character (note: as in character class in pattern)
    +  \v        vertical tab character (note: not the same as in a pattern)
    +
    +2. The Python form \g<n>, where the angle brackets are part of the syntax and +n is either a group name or a number, is recognized as an alternative way +of inserting the contents of a group, for example \g<3>. +

    +

    +3. Capture substitution supports the following additional forms: +

    +  ${n:-string}             default for unset group
    +  ${n:+string1:string2}    values for set/unset group
    +
    +The substitution strings themselves are expanded. Backslash can be used to +escape colons and closing curly brackets. +

    +
    SEE ALSO

    pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3).

    -
    AUTHOR
    +
    AUTHOR

    Philip Hazel
    @@ -624,11 +743,11 @@

    pcre2syntax man page

    Cambridge, England.

    -
    REVISION
    +
    REVISION

    -Last updated: 12 October 2023 +Last updated: 27 November 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/mingw64/share/doc/pcre2/html/pcre2test.html b/mingw64/share/doc/pcre2/html/pcre2test.html index 6cc3cc317ff..db9073f0e60 100644 --- a/mingw64/share/doc/pcre2/html/pcre2test.html +++ b/mingw64/share/doc/pcre2/html/pcre2test.html @@ -105,8 +105,8 @@

    pcre2test man page

    When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that are passed -to the library. For subject lines, backslash escapes can be used. In addition, -when the utf modifier (see +to the library. For subject lines and some patterns, backslash escapes can be +used. In addition, when the utf modifier (see "Setting compilation options" below) is set, the pattern and any following subject lines are interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. @@ -125,9 +125,8 @@

    pcre2test man page

    than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte in UTF-8) -0x80000000 is added to the character's value. This is the only way of passing -such code points in a pattern string. For subject strings, using an escape -sequence is preferable. +0x80000000 is added to the character's value. For subject strings, using an +escape sequence is preferable.


    COMMAND LINE OPTIONS

    @@ -178,8 +177,8 @@

    pcre2test man page

    following options output the value and set the exit code as indicated:
       ebcdic-nl  the code for LF (= NL) in an EBCDIC environment:
    -               0x15 or 0x25
    -               0 if used in an ASCII environment
    +               either 0x15 or 0x25
    +               0 if used in an ASCII/Unicode environment
                    exit code is always 0
       linksize   the configured internal link size (2, 3, or 4)
                    exit code is set to the link size
    @@ -201,6 +200,16 @@ 

    pcre2test man page

    pcre2-8 the 8-bit library was built unicode Unicode support is available
    +Note that the availability of JIT support in the library does not guarantee +that it can actually be used because in some environments it is unable to +allocate executable memory. The option "jitusable" gives more detailed +information. It returns one of the following values: +
    +  0  JIT is available and usable
    +  1  JIT is available but cannot allocate executable memory
    +  2  JIT is not available
    +  3  Unexpected return from test call to pcre2_jit_compile()
    +
    If an unknown option is given, an error message is output; the exit code is 0.

    @@ -527,39 +536,48 @@

    pcre2test man page

    subject_literal modifier was set for the pattern. The following provide a means of encoding non-printing characters in a visible way:
    -  \a         alarm (BEL, \x07)
    -  \b         backspace (\x08)
    -  \e         escape (\x27)
    -  \f         form feed (\x0c)
    -  \n         newline (\x0a)
    -  \r         carriage return (\x0d)
    -  \t         tab (\x09)
    -  \v         vertical tab (\x0b)
    -  \nnn       octal character (up to 3 octal digits); always
    -               a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode
    -  \o{dd...}  octal character (any number of octal digits}
    -  \xhh       hexadecimal byte (up to 2 hex digits)
    -  \x{hh...}  hexadecimal character (any number of hex digits)
    +  \a          alarm (BEL, \x07)
    +  \b          backspace (\x08)
    +  \e          escape (\x1b)
    +  \f          form feed (\x0c)
    +  \n          newline (\x0a)
    +  \N{U+hh...} Unicode character (any number of hex digits)
    +  \r          carriage return (\x0d)
    +  \t          tab (\x09)
    +  \v          vertical tab (\x0b)
    +  \ddd        octal number (up to 3 octal digits); represent a single
    +                code point unless larger than 255 with the 8-bit library
    +  \o{dd...}   octal number (any number of octal digits) representing a
    +                character in UTF mode or a code point
    +  \xhh        hexadecimal byte (up to 2 hex digits)
    +  \x{hh...}   hexadecimal number (up to 8 hex digits) representing a
    +                character in UTF mode or a code point
     
    -The use of \x{hh...} is not dependent on the use of the utf modifier on -the pattern. It is recognized always. There may be any number of hexadecimal -digits inside the braces; invalid values provoke error messages. +Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf +modifier on the pattern. They are always recognized. There may be any number of +hexadecimal digits inside the braces; invalid values provoke error messages, +but when using \N{U+hh...} with some invalid Unicode characters, they are +accepted with a warning instead.

    -Note that \xhh specifies one byte rather than one character in UTF-8 mode; -this makes it possible to construct invalid UTF-8 sequences for testing -purposes. On the other hand, \x{hh} is interpreted as a UTF-8 character in -UTF-8 mode, generating more than one byte if the value is greater than 127. -When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte -for values less than 256, and causes an error for greater values. +Note that even in UTF-8 mode, \xhh (and depending on how large, \ddd) +describe one byte rather than one character; this makes it possible to +construct invalid UTF-8 sequences for testing purposes. On the other hand, +\x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating +more than one byte if the value is greater than 127. To avoid the ambiguity +it is preferred to use \N{U+hh...} when describing characters. When testing +the 8-bit library not in UTF-8 mode, \x{hh} generates one byte for values +that can fit in it, and causes an error for greater values.

    -In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it -possible to construct invalid UTF-16 sequences for testing purposes. +When testing the 16-bit library, not in UTF-16 mode, all 4-digit \x{hhhh} +values are accepted. This makes it possible to construct invalid UTF-16 +sequences for testing purposes.

    -In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This makes it -possible to construct invalid UTF-32 sequences for testing purposes. +When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \x{...} +values are accepted. This makes it possible to construct invalid UTF-32 +sequences for testing purposes.

    There is a special backslash sequence that specifies replication of one or more @@ -625,6 +643,7 @@

    pcre2test man page

    allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options @@ -653,13 +672,17 @@

    pcre2test man page

    match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C + never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK + python_octal set PCRE2_EXTRA_PYTHON_OCTAL + turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT @@ -671,6 +694,23 @@

    pcre2test man page

    brackets. Setting utf in 16-bit or 32-bit mode also causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. +
    +
    +The following modifiers enable or disable performance optimizations by +calling pcre2_set_optimize() before invoking the regex compiler. +
    +      optimization_full      enable all optional optimizations
    +      optimization_none      disable all optional optimizations
    +      auto_possess           auto-possessify variable quantifiers
    +      auto_possess_off       don't auto-possessify variable quantifiers
    +      dotstar_anchor         anchor patterns starting with .*
    +      dotstar_anchor_off     don't anchor patterns starting with .*
    +      start_optimize         enable pre-scan of subject string
    +      start_optimize_off     disable pre-scan of subject string
    +
    +See the +pcre2_set_optimize +documentation for details on these optimizations.


    Setting compilation controls @@ -680,14 +720,15 @@

    pcre2test man page

    about the pattern. There are single-letter abbreviations for some that are heavily used in the test files.
    -      bsr=[anycrlf|unicode]     specify \R handling
       /B  bincode                   show binary code without lengths
    +      bsr=[anycrlf|unicode]     specify \R handling
           callout_info              show callout information
           convert=<options>         request foreign pattern conversion
           convert_glob_escape=c     set glob escape character
           convert_glob_separator=c  set glob separator character
           convert_length            set convert buffer length
           debug                     same as info,fullbincode
    +      expand                    expand repetition syntax in pattern
           framesize                 show matching frame size
           fullbincode               show binary code with lengths
       /I  info                      show info about compiled pattern
    @@ -709,6 +750,7 @@ 

    pcre2test man page

    posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack + pushtablescopy push a copy with tables onto the stack stackguard=<number> test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables @@ -1128,6 +1170,7 @@

    pcre2test man page

    replace=<string> specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts + substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED @@ -1217,10 +1260,11 @@

    pcre2test man page

    The following modifiers set options for pcre2_match() or pcre2_dfa_match(). See -pcreapi +pcre2api for a description of their effects.

           anchored                   set PCRE2_ANCHORED
    +      copy_matched_subject       set PCRE2_COPY_MATCHED_SUBJECT
           endanchored                set PCRE2_ENDANCHORED
           dfa_restart                set PCRE2_DFA_RESTART
           dfa_shortest               set PCRE2_DFA_SHORTEST
    @@ -1271,8 +1315,8 @@ 

    pcre2test man page

    aftertext show text after match allaftertext show text after captures allcaptures show all captures - allvector show the entire ovector allusedtext show all consulted text (non-JIT only) + allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data=<n> set a value to pass via callouts @@ -1306,7 +1350,8 @@

    pcre2test man page

    startchar show startchar when relevant startoffset=<n> same as offset=<n> substitute_callout use substitution callouts - substitute_extedded use PCRE2_SUBSTITUTE_EXTENDED + substitute_case_callout use substitution case callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH @@ -1592,6 +1637,21 @@

    pcre2test man page

    or stop is supported, which is sufficient for testing that the feature works.


    +Testing substitute case callouts +
    +

    +If the substitute_case_callout modifier is set, a substitution +case callout function is set up. The callout function is called for each +substituted chunk which is to be case-transformed. +

    +

    +The callout function passed is a fixed function with implementation for certain +behaviours: inputs which shrink when case-transformed; inputs which grow; inputs +with distinct upper/lower/titlecase forms. The characters which are not +special-cased for testing purposes are left unmodified, as if they are caseless +characters. +

    +
    Setting the JIT stack size

    @@ -2204,7 +2264,7 @@

    pcre2test man page


    REVISION

    -Last updated: 24 April 2024 +Last updated: 26 December 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/mingw64/share/doc/pcre2/html/pcre2unicode.html b/mingw64/share/doc/pcre2/html/pcre2unicode.html index 6f0972e706a..5b425329fac 100644 --- a/mingw64/share/doc/pcre2/html/pcre2unicode.html +++ b/mingw64/share/doc/pcre2/html/pcre2unicode.html @@ -53,7 +53,7 @@

    pcre2unicode man page

    The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal number, the derived properties -Any and LC (synonym L&), the Unicode script names such as Arabic or Han, +Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties.

    @@ -157,6 +157,40 @@

    pcre2unicode man page

    counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT option. When this is set, all characters in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. +
    +    Without PCRE2_EXTRA_CASELESS_RESTRICT:
    +      'k' = 'K' = U+212A (Kelvin sign)
    +      's' = 'S' = U+017F (long S)
    +    With PCRE2_EXTRA_CASELESS_RESTRICT:
    +      'k' = 'K'
    +      U+212A (Kelvin sign)  only case-equivalent to itself
    +      's' = 'S'
    +      U+017F (long S)       only case-equivalent to itself
    +
    +

    +

    +One language family, Turkish and Azeri, has its own case-insensitivity rules, +which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the +behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 +(small dotless i) characters. +

    +    Without PCRE2_EXTRA_TURKISH_CASING:
    +      'i' = 'I'
    +      U+0130 (capital I with dot above)  only case-equivalent to itself
    +      U+0131 (small dotless i)           only case-equivalent to itself
    +    With PCRE2_EXTRA_TURKISH_CASING:
    +      'i' = U+0130 (capital I with dot above)
    +      U+0131 (small dotless i) = 'I'
    +
    +

    +

    +It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and +PCRE2_EXTRA_TURKISH_CASING together. +

    +

    +From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower +case), and Lt (title case) are all treated as Lc (cased letter) when caseless +matching is set by the PCRE2_CASELESS option or (?i) within the pattern.


    SCRIPT RUNS @@ -513,9 +547,9 @@

    pcre2unicode man page

    REVISION

    -Last updated: 12 October 2023 +Last updated: 27 November 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/mingw64/share/doc/pcre2/pcre2-config.txt b/mingw64/share/doc/pcre2/pcre2-config.txt index dc8cf8f7ed4..c87de589db7 100644 --- a/mingw64/share/doc/pcre2/pcre2-config.txt +++ b/mingw64/share/doc/pcre2/pcre2-config.txt @@ -1,4 +1,3 @@ - PCRE2-CONFIG(1) General Commands Manual PCRE2-CONFIG(1) @@ -82,4 +81,4 @@ REVISION Last updated: 28 September 2014 -PCRE2 10.00 28 September 2014 PCRE2-CONFIG(1) +PCRE2 10.45 28 September 2014 PCRE2-CONFIG(1) diff --git a/mingw64/share/doc/pcre2/pcre2.txt b/mingw64/share/doc/pcre2/pcre2.txt index 85eead6e61f..38e86d6e6a3 100644 --- a/mingw64/share/doc/pcre2/pcre2.txt +++ b/mingw64/share/doc/pcre2/pcre2.txt @@ -8,7 +8,6 @@ pcre2test commands. ----------------------------------------------------------------------------- - PCRE2(3) Library Functions Manual PCRE2(3) @@ -171,27 +170,29 @@ USER DOCUMENTATION library function, listing its arguments and results. -AUTHOR +AUTHORS - Philip Hazel - Retired from University Computing Service - Cambridge, England. + The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Her- + czeg. + + PCRE2 was written by Philip Hazel, of the University Computing Service, + Cambridge, England. Many others have also contributed. - Putting an actual email address here is a spam magnet. If you want to - email me, use my two names separated by a dot at gmail.com. + To contact the maintainers, please use the GitHub issues tracker or + PCRE2 mailing list, as described at the project page: + https://github.com/PCRE2Project/pcre2 REVISION - Last updated: 27 August 2021 + Last updated: 18 December 2024 Copyright (c) 1997-2021 University of Cambridge. 
-PCRE2 10.38 27 August 2021 PCRE2(3) +PCRE2 10.45 18 December 2024 PCRE2(3) ------------------------------------------------------------------------------ - PCRE2API(3) Library Functions Manual PCRE2API(3) @@ -298,6 +299,9 @@ PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); + int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); + PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS @@ -317,6 +321,12 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value); @@ -858,6 +868,7 @@ PCRE2 CONTEXTS The compile time nested parentheses limit The maximum length of the pattern string The extra options bits (none set by default) + Which performance optimizations the compiler should apply A compile context is also required if you are using custom memory man- agement. If none of these apply, just pass NULL as the context argu- @@ -980,6 +991,110 @@ PCRE2 CONTEXTS ment of pcre2_set_compile_recursion_guard(). The callout function should return zero if all is well, or non-zero to force an error. + int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); + + PCRE2 can apply various performance optimizations during compilation, + in order to make matching faster. For example, the compiler might con- + vert some regex constructs into an equivalent construct which + pcre2_match() can execute faster. By default, all available optimiza- + tions are enabled. However, in rare cases, one might wish to disable + specific optimizations. 
For example, if it is known that some optimiza- + tions cannot benefit a certain regex, it might be desirable to disable + them, in order to speed up compilation. + + The permitted values of directive are as follows: + + PCRE2_OPTIMIZATION_FULL + + Enable all optional performance optimizations. This is the default + value. + + PCRE2_OPTIMIZATION_NONE + + Disable all optional performance optimizations. + + PCRE2_AUTO_POSSESS + PCRE2_AUTO_POSSESS_OFF + + Enable/disable "auto-possessification" of variable quantifiers such as + * and +. This optimization, for example, turns a+b into a++b in order + to avoid backtracks into a+ that can never be successful. However, if + callouts are in use, auto-possessification means that some callouts are + never taken. You can disable this optimization if you want the matching + functions to do a full, unoptimized search and run all the callouts. + + PCRE2_DOTSTAR_ANCHOR + PCRE2_DOTSTAR_ANCHOR_OFF + + Enable/disable an optimization that is applied when .* is the first + significant item in a top-level branch of a pattern, and all the other + branches also start with .* or with \A or \G or ^. Such a pattern is + automatically anchored if PCRE2_DOTALL is set for all the .* items and + PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that + any match must start either at the start of the subject or following a + newline is remembered. Like other optimizations, this can cause call- + outs to be skipped. + + Dotstar anchor optimization is automatically disabled for .* if it is + inside an atomic group or a capture group that is the subject of a + backreference, or if the pattern contains (*PRUNE) or (*SKIP). + + PCRE2_START_OPTIMIZE + PCRE2_START_OPTIMIZE_OFF + + Enable/disable optimizations which cause matching functions to scan the + subject string for specific code unit values before attempting a match. 
+ For example, if it is known that an unanchored match must start with a + specific value, the matching code searches the subject for that value, + and fails immediately if it cannot find it, without actually running + the main matching function. This means that a special item such as + (*COMMIT) at the start of a pattern is not considered until after a + suitable starting point for the match has been found. Also, when call- + outs or (*MARK) items are in use, these "start-up" optimizations can + cause them to be skipped if the pattern is never actually used. The + start-up optimizations are in effect a pre-scan of the subject that + takes place before the pattern is run. + + Disabling start-up optimizations ensures that in cases where the result + is "no match", the callouts do occur, and that items such as (*COMMIT) + and (*MARK) are considered at every possible starting position in the + subject string. + + Disabling start-up optimizations may change the outcome of a matching + operation. Consider the pattern + + (*COMMIT)ABC + + When this is compiled, PCRE2 records the fact that a match must start + with the character "A". Suppose the subject string is "DEFABC". The + start-up optimization scans along the subject, finds "A" and runs the + first match attempt from there. The (*COMMIT) item means that the pat- + tern must match the current starting position, which in this case, it + does. However, if the same match is run without start-up optimizations, + the initial scan along the subject string does not happen. The first + match attempt is run starting from "D" and when this fails, (*COMMIT) + prevents any further matches being tried, so the overall result is "no + match". + + Another start-up optimization makes use of a minimum length for a + matching subject, which is recorded when possible. Consider the pattern + + (*MARK:1)B(*MARK:2)(X|Y) + + The minimum length for a match is two characters. 
If the subject is + "XXBB", the "starting character" optimization skips "XX", then tries to + match "BB", which is long enough. In the process, (*MARK:2) is encoun- + tered and remembered. When the match attempt fails, the next "B" is + found, but there is only one character left, so there are no more at- + tempts, and "no match" is returned with the "last mark seen" set to + "2". Without start-up optimizations, however, matches are tried at + every possible starting position, including at the end of the subject, + where (*MARK:1) is encountered, but there is no "B", so the "last mark + seen" that is returned is "1". In this case, the optimizations do not + affect the overall match result, which is still "no match", but they do + affect the auxiliary information that is returned. + The match context A match context is required if you want to: @@ -1025,6 +1140,16 @@ PCRE2 CONTEXTS tion made by pcre2_substitute(). Details are given in the section enti- tled "Creating a new string with substitutions" below. + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + + This sets up a callout function for PCRE2 to call when performing case + transformations inside pcre2_substitute(). Details are given in the + section entitled "Creating a new string with substitutions" below. + int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value); @@ -1224,8 +1349,11 @@ CHECKING BUILD-TIME OPTIONS The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee - that JIT will be used for any given match. See the pcre2jit documenta- - tion for more details. 
+ that JIT will be used for any given match, and neither does it guaran- + tee that JIT will actually be able to function, because it may not be + able to allocate executable memory in some environments. There is a + special call to pcre2_jit_compile() that can be used to check this. See + the pcre2jit documentation for more details. PCRE2_CONFIG_JITTARGET @@ -1413,10 +1541,10 @@ COMPILING A PATTERN spectively, when pcre2_compile() returns NULL because a compilation er- ror has occurred. - There are nearly 100 positive error codes that pcre2_compile() may re- - turn if it finds an error in the pattern. There are also some negative - error codes that are used for invalid UTF strings when validity check- - ing is in force. These are the same as given by pcre2_match() and + There are over 100 positive error codes that pcre2_compile() may return + if it finds an error in the pattern. There are also some negative error + codes that are used for invalid UTF strings when validity checking is + in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and are described in the pcre2unicode documentation. There is no separate documentation for the positive error codes, be- cause the textual error messages that are obtained by calling the @@ -1511,39 +1639,56 @@ COMPILING A PATTERN Perl. If you want a multiline circumflex also to match after a termi- nating newline, you must set PCRE2_ALT_CIRCUMFLEX. + PCRE2_ALT_EXTENDED_CLASS + + Alters the parsing of character classes to follow the extended syntax + described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no + impact on the behaviour of the Perl-specific "(?[...])" syntax for ex- + tended classes, but instead enables the alternative syntax of extended + class behaviour inside ordinary "[...]" character classes. See the + pcre2pattern documentation for details of the character classes sup- + ported. 
+ PCRE2_ALT_VERBNAMES - By default, for compatibility with Perl, the name in any verb sequence - such as (*MARK:NAME) is any sequence of characters that does not in- - clude a closing parenthesis. The name is not processed in any way, and - it is not possible to include a closing parenthesis in the name. How- - ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash pro- - cessing is applied to verb names and only an unescaped closing paren- - thesis terminates the name. A closing parenthesis can be included in a - name either as \) or between \Q and \E. If the PCRE2_EXTENDED or - PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped - whitespace in verb names is skipped and #-comments are recognized, ex- + By default, for compatibility with Perl, the name in any verb sequence + such as (*MARK:NAME) is any sequence of characters that does not in- + clude a closing parenthesis. The name is not processed in any way, and + it is not possible to include a closing parenthesis in the name. How- + ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash pro- + cessing is applied to verb names and only an unescaped closing paren- + thesis terminates the name. A closing parenthesis can be included in a + name either as \) or between \Q and \E. If the PCRE2_EXTENDED or + PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped + whitespace in verb names is skipped and #-comments are recognized, ex- actly as in the rest of the pattern. PCRE2_AUTO_CALLOUT - If this bit is set, pcre2_compile() automatically inserts callout - items, all with number 255, before each pattern item, except immedi- - ately before or after an explicit callout in the pattern. For discus- + If this bit is set, pcre2_compile() automatically inserts callout + items, all with number 255, before each pattern item, except immedi- + ately before or after an explicit callout in the pattern. 
For discus- sion of the callout facility, see the pcre2callout documentation. PCRE2_CASELESS - If this bit is set, letters in the pattern match both upper and lower - case letters in the subject. It is equivalent to Perl's /i option, and - it can be changed within a pattern by a (?i) option setting. If either - PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all - characters with more than one other case, and for all characters whose - code points are greater than U+007F. Note that there are two ASCII - characters, K and S, that, in addition to their lower case ASCII equiv- - alents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long - S) respectively. If you do not want this case equivalence, you can sup- - press it by setting PCRE2_EXTRA_CASELESS_RESTRICT. + If this bit is set, letters in the pattern match both upper and lower + case letters in the subject. It is equivalent to Perl's /i option, and + it can be changed within a pattern by a (?i) option setting. If either + PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all + characters with more than one other case, and for all characters whose + code points are greater than U+007F. + + Note that there are two ASCII characters, K and S, that, in addition to + their lower case ASCII equivalents, are case-equivalent with U+212A + (Kelvin sign) and U+017F (long S) respectively. If you do not want this + case equivalence, you can suppress it by setting PCRE2_EXTRA_CASE- + LESS_RESTRICT. + + One language family, Turkish and Azeri, has its own case-insensitivity + rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. + This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot + above), and U+0131 (small dotless i) characters. For lower valued characters with only one other case, a lookup table is used for speed. 
When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup @@ -1551,201 +1696,206 @@ COMPILING A PATTERN (available only in 16-bit or 32-bit mode) are treated as not having an- other case. + From release 10.45 PCRE2_CASELESS also affects what some of the letter- + related Unicode property escapes (\p and \P) match. The properties Lu + (upper case letter), Ll (lower case letter), and Lt (title case letter) + are all treated as LC (cased letter) when PCRE2_CASELESS is set. + PCRE2_DOLLAR_ENDONLY - If this bit is set, a dollar metacharacter in the pattern matches only - at the end of the subject string. Without this option, a dollar also - matches immediately before a newline at the end of the string (but not - before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored - if PCRE2_MULTILINE is set. There is no equivalent to this option in + If this bit is set, a dollar metacharacter in the pattern matches only + at the end of the subject string. Without this option, a dollar also + matches immediately before a newline at the end of the string (but not + before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored + if PCRE2_MULTILINE is set. There is no equivalent to this option in Perl, and no way to set it within a pattern. PCRE2_DOTALL - If this bit is set, a dot metacharacter in the pattern matches any - character, including one that indicates a newline. However, it only + If this bit is set, a dot metacharacter in the pattern matches any + character, including one that indicates a newline. However, it only ever matches one character, even if newlines are coded as CRLF. Without this option, a dot does not match when the current position in the sub- - ject is at a newline. This option is equivalent to Perl's /s option, + ject is at a newline. This option is equivalent to Perl's /s option, and it can be changed within a pattern by a (?s) option setting. 
A neg- - ative class such as [^a] always matches newline characters, and the \N - escape sequence always matches a non-newline character, independent of + ative class such as [^a] always matches newline characters, and the \N + escape sequence always matches a non-newline character, independent of the setting of PCRE2_DOTALL. PCRE2_DUPNAMES - If this bit is set, names used to identify capture groups need not be - unique. This can be helpful for certain types of pattern when it is - known that only one instance of the named group can ever be matched. - There are more details of named capture groups below; see also the + If this bit is set, names used to identify capture groups need not be + unique. This can be helpful for certain types of pattern when it is + known that only one instance of the named group can ever be matched. + There are more details of named capture groups below; see also the pcre2pattern documentation. PCRE2_ENDANCHORED - If this bit is set, the end of any pattern match must be right at the + If this bit is set, the end of any pattern match must be right at the end of the string being searched (the "subject string"). If the pattern match succeeds by reaching (*ACCEPT), but does not reach the end of the - subject, the match fails at the current starting point. For unanchored - patterns, a new match is then tried at the next starting point. How- + subject, the match fails at the current starting point. For unanchored + patterns, a new match is then tried at the next starting point. How- ever, if the match succeeds by reaching the end of the pattern, but not - the end of the subject, backtracking occurs and an alternative match + the end of the subject, backtracking occurs and an alternative match may be found. Consider these two patterns: .(*ACCEPT)|.. .|.. - If matched against "abc" with PCRE2_ENDANCHORED set, the first matches - "c" whereas the second matches "bc". 
The effect of PCRE2_ENDANCHORED - can also be achieved by appropriate constructs in the pattern itself, + If matched against "abc" with PCRE2_ENDANCHORED set, the first matches + "c" whereas the second matches "bc". The effect of PCRE2_ENDANCHORED + can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl. For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only - to the first (that is, the longest) matched string. Other parallel - matches, which are necessarily substrings of the first one, must obvi- + to the first (that is, the longest) matched string. Other parallel + matches, which are necessarily substrings of the first one, must obvi- ously end before the end of the subject. PCRE2_EXTENDED - If this bit is set, most white space characters in the pattern are to- - tally ignored except when escaped, inside a character class, or inside - a \Q...\E sequence. However, white space is not allowed within se- - quences such as (?> that introduce various parenthesized groups, nor - within numerical quantifiers such as {1,3}. Ignorable white space is - permitted between an item and a following quantifier and between a - quantifier and a following + that indicates possessiveness. PCRE2_EX- - TENDED is equivalent to Perl's /x option, and it can be changed within + If this bit is set, most white space characters in the pattern are to- + tally ignored except when escaped, inside a character class, or inside + a \Q...\E sequence. However, white space is not allowed within se- + quences such as (?> that introduce various parenthesized groups, nor + within numerical quantifiers such as {1,3}. Ignorable white space is + permitted between an item and a following quantifier and between a + quantifier and a following + that indicates possessiveness. PCRE2_EX- + TENDED is equivalent to Perl's /x option, and it can be changed within a pattern by a (?x) option setting. 
- When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recog- - nizes as white space only those characters with code points less than + When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recog- + nizes as white space only those characters with code points less than 256 that are flagged as white space in its low-character table. The ta- ble is normally created by pcre2_maketables(), which uses the isspace() - function to identify space characters. In most ASCII environments, the - relevant characters are those with code points 0x0009 (tab), 0x000A - (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage + function to identify space characters. In most ASCII environments, the + relevant characters are those with code points 0x0009 (tab), 0x000A + (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage return), and 0x0020 (space). When PCRE2 is compiled with Unicode support, in addition to these char- - acters, five more Unicode "Pattern White Space" characters are recog- + acters, five more Unicode "Pattern White Space" characters are recog- nized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to- - right mark), U+200F (right-to-left mark), U+2028 (line separator), and - U+2029 (paragraph separator). This set of characters is the same as - recognized by Perl's /x option. Note that the horizontal and vertical - space characters that are matched by the \h and \v escapes in patterns + right mark), U+200F (right-to-left mark), U+2028 (line separator), and + U+2029 (paragraph separator). This set of characters is the same as + recognized by Perl's /x option. Note that the horizontal and vertical + space characters that are matched by the \h and \v escapes in patterns are a much bigger set. 
- As well as ignoring most white space, PCRE2_EXTENDED also causes char- - acters between an unescaped # outside a character class and the next - newline, inclusive, to be ignored, which makes it possible to include + As well as ignoring most white space, PCRE2_EXTENDED also causes char- + acters between an unescaped # outside a character class and the next + newline, inclusive, to be ignored, which makes it possible to include comments inside complicated patterns. Note that the end of this type of - comment is a literal newline sequence in the pattern; escape sequences + comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count. Which characters are interpreted as newlines can be specified by a set- - ting in the compile context that is passed to pcre2_compile() or by a - special sequence at the start of the pattern, as described in the sec- - tion entitled "Newline conventions" in the pcre2pattern documentation. + ting in the compile context that is passed to pcre2_compile() or by a + special sequence at the start of the pattern, as described in the sec- + tion entitled "Newline conventions" in the pcre2pattern documentation. A default is defined when PCRE2 is built. PCRE2_EXTENDED_MORE - This option has the effect of PCRE2_EXTENDED, but, in addition, un- - escaped space and horizontal tab characters are ignored inside a char- - acter class. Note: only these two characters are ignored, not the full - set of pattern white space characters that are ignored outside a char- - acter class. PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, + This option has the effect of PCRE2_EXTENDED, but, in addition, un- + escaped space and horizontal tab characters are ignored inside a char- + acter class. Note: only these two characters are ignored, not the full + set of pattern white space characters that are ignored outside a char- + acter class. 
PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, and it can be changed within a pattern by a (?xx) option setting. PCRE2_FIRSTLINE If this option is set, the start of an unanchored pattern match must be - before or at the first newline in the subject string following the - start of matching, though the matched text may continue over the new- + before or at the first newline in the subject string following the + start of matching, though the matched text may continue over the new- line. If startoffset is non-zero, the limiting newline is not necessar- - ily the first newline in the subject. For example, if the subject + ily the first newline in the subject. For example, if the subject string is "abc\nxyz" (where \n represents a single-character newline) a - pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is - greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more - general limiting facility. If PCRE2_FIRSTLINE is set with an offset - limit, a match must occur in the first line and also within the offset + pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is + greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more + general limiting facility. If PCRE2_FIRSTLINE is set with an offset + limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used. This option has no effect for anchored patterns. PCRE2_LITERAL If this option is set, all meta-characters in the pattern are disabled, - and it is treated as a literal string. Matching literal strings with a + and it is treated as a literal string. Matching literal strings with a regular expression engine is not the most efficient way of doing it. If - you are doing a lot of literal matching and are worried about effi- + you are doing a lot of literal matching and are worried about effi- ciency, you should consider using other approaches. 
The only other main options that are allowed with PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_MATCH_INVALID_UTF, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, - PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EX- + PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EX- TRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an error. PCRE2_MATCH_INVALID_UTF - This option forces PCRE2_UTF (see below) and also enables support for - matching by pcre2_match() in subject strings that contain invalid UTF - sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries - process strings as sequences of uint16_t or uint32_t code points. They + This option forces PCRE2_UTF (see below) and also enables support for + matching by pcre2_match() in subject strings that contain invalid UTF + sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries + process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes un- - less such sequences are suitably aligned. This facility is not sup- - ported for DFA matching. For details, see the pcre2unicode documenta- + less such sequences are suitably aligned. This facility is not sup- + ported for DFA matching. For details, see the pcre2unicode documenta- tion. PCRE2_MATCH_UNSET_BACKREF - If this option is set, a backreference to an unset capture group - matches an empty string (by default this causes the current matching + If this option is set, a backreference to an unset capture group + matches an empty string (by default this causes the current matching alternative to fail). A pattern such as (\1)(a) succeeds when this op- - tion is set (assuming it can find an "a" in the subject), whereas it - fails by default, for Perl compatibility. 
Setting this option makes + tion is set (assuming it can find an "a" in the subject), whereas it + fails by default, for Perl compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka JavaScript). PCRE2_MULTILINE - By default, for the purposes of matching "start of line" and "end of - line", PCRE2 treats the subject string as consisting of a single line - of characters, even if it actually contains newlines. The "start of - line" metacharacter (^) matches only at the start of the string, and - the "end of line" metacharacter ($) matches only at the end of the - string, or before a terminating newline (except when PCRE2_DOLLAR_EN- + By default, for the purposes of matching "start of line" and "end of + line", PCRE2 treats the subject string as consisting of a single line + of characters, even if it actually contains newlines. The "start of + line" metacharacter (^) matches only at the start of the string, and + the "end of line" metacharacter ($) matches only at the end of the + string, or before a terminating newline (except when PCRE2_DOLLAR_EN- DONLY is set). Note, however, that unless PCRE2_DOTALL is set, the "any - character" metacharacter (.) does not match at a newline. This behav- + character" metacharacter (.) does not match at a newline. This behav- iour (for ^, $, and dot) is the same as Perl. - When PCRE2_MULTILINE it is set, the "start of line" and "end of line" - constructs match immediately following or immediately before internal - newlines in the subject string, respectively, as well as at the very - start and end. This is equivalent to Perl's /m option, and it can be + When PCRE2_MULTILINE is set, the "start of line" and "end of line" + constructs match immediately following or immediately before internal + newlines in the subject string, respectively, as well as at the very + start and end. This is equivalent to Perl's /m option, and it can be changed within a pattern by a (?m) option setting.
Note that the "start of line" metacharacter does not match after a newline at the end of the - subject, for compatibility with Perl. However, you can change this by - setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a - subject string, or no occurrences of ^ or $ in a pattern, setting + subject, for compatibility with Perl. However, you can change this by + setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a + subject string, or no occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect. PCRE2_NEVER_BACKSLASH_C - This option locks out the use of \C in the pattern that is being com- - piled. This escape can cause unpredictable behaviour in UTF-8 or - UTF-16 modes, because it may leave the current matching point in the + This option locks out the use of \C in the pattern that is being com- + piled. This escape can cause unpredictable behaviour in UTF-8 or + UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in ap- plications that process patterns from external sources. Note that there is also a build-time option that permanently locks out the use of \C. PCRE2_NEVER_UCP - This option locks out the use of Unicode properties for handling \B, + This option locks out the use of Unicode properties for handling \B, \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as - described for the PCRE2_UCP option below. In particular, it prevents - the creator of the pattern from enabling this facility by starting the - pattern with (*UCP). This option may be useful in applications that - process patterns from external sources. The option combination PCRE_UCP - and PCRE_NEVER_UCP causes an error. + described for the PCRE2_UCP option below. In particular, it prevents + the creator of the pattern from enabling this facility by starting the + pattern with (*UCP). 
This option may be useful in applications that + process patterns from external sources. The option combination + PCRE2_UCP and PCRE2_NEVER_UCP causes an error. PCRE2_NEVER_UTF @@ -1769,86 +1919,56 @@ COMPILING A PATTERN PCRE2_NO_AUTO_POSSESS - If this option is set, it disables "auto-possessification", which is an - optimization that, for example, turns a+b into a++b in order to avoid - backtracks into a+ that can never be successful. However, if callouts - are in use, auto-possessification means that some callouts are never - taken. You can set this option if you want the matching functions to do - a full unoptimized search and run all the callouts, but it is mainly - provided for testing purposes. + If this (deprecated) option is set, it disables "auto-possessifica- + tion", which is an optimization that, for example, turns a+b into a++b + in order to avoid backtracks into a+ that can never be successful. How- + ever, if callouts are in use, auto-possessification means that some + callouts are never taken. You can set this option if you want the + matching functions to do a full unoptimized search and run all the + callouts, but it is mainly provided for testing purposes. + + If a compile context is available, it is recommended to use + pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather + than the compile option PCRE2_NO_AUTO_POSSESS. Note that + PCRE2_NO_AUTO_POSSESS takes precedence over the pcre2_set_optimize() + optimization directives PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF. PCRE2_NO_DOTSTAR_ANCHOR - If this option is set, it disables an optimization that is applied when - .* is the first significant item in a top-level branch of a pattern, - and all the other branches also start with .* or with \A or \G or ^. - The optimization is automatically disabled for .* if it is inside an - atomic group or a capture group that is the subject of a backreference, - or if the pattern contains (*PRUNE) or (*SKIP). 
When the optimization - is not disabled, such a pattern is automatically anchored if + If this (deprecated) option is set, it disables an optimization that is + applied when .* is the first significant item in a top-level branch of + a pattern, and all the other branches also start with .* or with \A or + \G or ^. The optimization is automatically disabled for .* if it is in- + side an atomic group or a capture group that is the subject of a back- + reference, or if the pattern contains (*PRUNE) or (*SKIP). When the op- + timization is not disabled, such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set - for any ^ items. Otherwise, the fact that any match must start either - at the start of the subject or following a newline is remembered. Like - other optimizations, this can cause callouts to be skipped. + for any ^ items. Otherwise, the fact that any match must start either + at the start of the subject or following a newline is remembered. Like + other optimizations, this can cause callouts to be skipped. (If a com- + pile context is available, it is recommended to use pcre2_set_opti- + mize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF instead.) PCRE2_NO_START_OPTIMIZE This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of - the JIT compiler. + the JIT compiler. Setting this option is equivalent to calling + pcre2_set_optimize() with the directive parameter set to + PCRE2_START_OPTIMIZE_OFF. There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails imme- diately if it cannot find it, without actually running the main match- - ing function. 
This means that a special item such as (*COMMIT) at the - start of a pattern is not considered until after a suitable starting - point for the match has been found. Also, when callouts or (*MARK) - items are in use, these "start-up" optimizations can cause them to be - skipped if the pattern is never actually used. The start-up optimiza- - tions are in effect a pre-scan of the subject that takes place before - the pattern is run. - - The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, - possibly causing performance to suffer, but ensuring that in cases - where the result is "no match", the callouts do occur, and that items - such as (*COMMIT) and (*MARK) are considered at every possible starting - position in the subject string. - - Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching - operation. Consider the pattern + ing function. The start-up optimizations are in effect a pre-scan of + the subject that takes place before the pattern is run. - (*COMMIT)ABC - - When this is compiled, PCRE2 records the fact that a match must start - with the character "A". Suppose the subject string is "DEFABC". The - start-up optimization scans along the subject, finds "A" and runs the - first match attempt from there. The (*COMMIT) item means that the pat- - tern must match the current starting position, which in this case, it - does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE - set, the initial scan along the subject string does not happen. The - first match attempt is run starting from "D" and when this fails, - (*COMMIT) prevents any further matches being tried, so the overall re- - sult is "no match". - - As another start-up optimization makes use of a minimum length for a - matching subject, which is recorded when possible. Consider the pattern - - (*MARK:1)B(*MARK:2)(X|Y) - - The minimum length for a match is two characters. 
If the subject is - "XXBB", the "starting character" optimization skips "XX", then tries to - match "BB", which is long enough. In the process, (*MARK:2) is encoun- - tered and remembered. When the match attempt fails, the next "B" is - found, but there is only one character left, so there are no more at- - tempts, and "no match" is returned with the "last mark seen" set to - "2". If NO_START_OPTIMIZE is set, however, matches are tried at every - possible starting position, including at the end of the subject, where - (*MARK:1) is encountered, but there is no "B", so the "last mark seen" - that is returned is "1". In this case, the optimizations do not affect - the overall match result, which is still "no match", but they do affect - the auxiliary information that is returned. + Disabling the start-up optimizations may cause performance to suffer. + However, this may be desirable for patterns which contain callouts or + items such as (*COMMIT) and (*MARK). See the above description of + PCRE2_START_OPTIMIZE_OFF for further details. PCRE2_NO_UTF_CHECK @@ -1892,41 +2012,46 @@ COMPILING A PATTERN ties for upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode - support (which is the default). The PCRE2_EXTRA_CASELESS_RESTRICT op- - tion (see below) restricts caseless matching such that ASCII characters - match only ASCII characters and non-ASCII characters match only non- - ASCII characters. + support (which is the default). + + The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless + matching such that ASCII characters match only ASCII characters and + non-ASCII characters match only non-ASCII characters. The PCRE2_EX- + TRA_TURKISH_CASING option (see above) alters the matching of the 'i' + characters to follow their behaviour in Turkish and Azeri languages. 
+ For further details on PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EX- + TRA_TURKISH_CASING, see the pcre2unicode page. PCRE2_UNGREEDY - This option inverts the "greediness" of the quantifiers so that they - are not greedy by default, but become greedy if followed by "?". It is - not compatible with Perl. It can also be set by a (?U) option setting + This option inverts the "greediness" of the quantifiers so that they + are not greedy by default, but become greedy if followed by "?". It is + not compatible with Perl. It can also be set by a (?U) option setting within the pattern. PCRE2_USE_OFFSET_LIMIT This option must be set for pcre2_compile() if pcre2_set_offset_limit() - is going to be used to set a non-default offset limit in a match con- - text for matches that use this pattern. An error is generated if an - offset limit is set without this option. For more details, see the de- - scription of pcre2_set_offset_limit() in the section that describes + is going to be used to set a non-default offset limit in a match con- + text for matches that use this pattern. An error is generated if an + offset limit is set without this option. For more details, see the de- + scription of pcre2_set_offset_limit() in the section that describes match contexts. See also the PCRE2_FIRSTLINE option above. PCRE2_UTF - This option causes PCRE2 to regard both the pattern and the subject - strings that are subsequently processed as strings of UTF characters - instead of single-code-unit strings. It is available when PCRE2 is - built to include Unicode support (which is the default). If Unicode + This option causes PCRE2 to regard both the pattern and the subject + strings that are subsequently processed as strings of UTF characters + instead of single-code-unit strings. It is available when PCRE2 is + built to include Unicode support (which is the default). If Unicode support is not available, the use of this option provokes an error. 
De- - tails of how PCRE2_UTF changes the behaviour of PCRE2 are given in the + tails of how PCRE2_UTF changes the behaviour of PCRE2 are given in the pcre2unicode page. In particular, note that it changes the way PCRE2_CASELESS works. Extra compile options - The option bits that can be set in a compile context by calling the + The option bits that can be set in a compile context by calling the pcre2_set_compile_extra_options() function are as follows: PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK @@ -1938,102 +2063,102 @@ COMPILING A PATTERN PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES - This option applies when compiling a pattern in UTF-8 or UTF-32 mode. - It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode + This option applies when compiling a pattern in UTF-8 or UTF-32 mode. + It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs - in UTF-16 to encode code points with values in the range 0x10000 to - 0x10ffff. The surrogates cannot therefore be represented in UTF-16. + in UTF-16 to encode code points with values in the range 0x10000 to + 0x10ffff. The surrogates cannot therefore be represented in UTF-16. They can be represented in UTF-8 and UTF-32, but are defined as invalid - code points, and cause errors if encountered in a UTF-8 or UTF-32 + code points, and cause errors if encountered in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2. - These values also cause errors if encountered in escape sequences such + These values also cause errors if encountered in escape sequences such as \x{d912} within a pattern. However, it seems that some applications, when using PCRE2 to check for unwanted characters in UTF-8 strings, ex- - plicitly test for the surrogates using escape sequences. The - PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be- + plicitly test for the surrogates using escape sequences. 
The + PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be- cause it applies only to the testing of input strings for UTF validity. - If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro- - gate code point values in UTF-8 and UTF-32 patterns no longer provoke - errors and are incorporated in the compiled pattern. However, they can - only match subject characters if the matching function is called with + If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro- + gate code point values in UTF-8 and UTF-32 patterns no longer provoke + errors and are incorporated in the compiled pattern. However, they can + only match subject characters if the matching function is called with PCRE2_NO_UTF_CHECK set. PCRE2_EXTRA_ALT_BSUX - The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and - \x in the way that ECMAscript (aka JavaScript) does. Additional func- + The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and + \x in the way that ECMAscript (aka JavaScript) does. Additional func- tionality was defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has - the effect of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} + the effect of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal character code, where hhh.. is any number of hexadeci- mal digits. PCRE2_EXTRA_ASCII_BSD - This option forces \d to match only ASCII digits, even when PCRE2_UCP - is set. It can be changed within a pattern by means of the (?aD) op- + This option forces \d to match only ASCII digits, even when PCRE2_UCP + is set. It can be changed within a pattern by means of the (?aD) op- tion setting. PCRE2_EXTRA_ASCII_BSS - This option forces \s to match only ASCII space characters, even when - PCRE2_UCP is set. It can be changed within a pattern by means of the + This option forces \s to match only ASCII space characters, even when + PCRE2_UCP is set. 
It can be changed within a pattern by means of the (?aS) option setting. PCRE2_EXTRA_ASCII_BSW - This option forces \w to match only ASCII word characters, even when - PCRE2_UCP is set. It can be changed within a pattern by means of the + This option forces \w to match only ASCII word characters, even when + PCRE2_UCP is set. It can be changed within a pattern by means of the (?aW) option setting. PCRE2_EXTRA_ASCII_DIGIT This option forces the POSIX character classes [:digit:] and [:xdigit:] - to match only ASCII digits, even when PCRE2_UCP is set. It can be + to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aT) option setting. PCRE2_EXTRA_ASCII_POSIX This option forces all the POSIX character classes, including [:digit:] - and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is - set. It can be changed within a pattern by means of the (?aP) option - setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order + and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is + set. It can be changed within a pattern by means of the (?aP) option + setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets all ASCII restrictions for POSIX classes. PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL - This is a dangerous option. Use with care. By default, an unrecognized - escape such as \j or a malformed one such as \x{2z} causes a compile- + This is a dangerous option. Use with care. By default, an unrecognized + escape such as \j or a malformed one such as \x{2z} causes a compile- time error when detected by pcre2_compile(). Perl is somewhat inconsis- - tent in handling such items: for example, \j is treated as a literal - "j", and non-hexadecimal digits in \x{} are just ignored, though warn- - ings are given in both cases if Perl's warning switch is enabled. 
How- - ever, a malformed octal number after \o{ always causes an error in + tent in handling such items: for example, \j is treated as a literal + "j", and non-hexadecimal digits in \x{} are just ignored, though warn- + ings are given in both cases if Perl's warning switch is enabled. How- + ever, a malformed octal number after \o{ always causes an error in Perl. - If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to - pcre2_compile(), all unrecognized or malformed escape sequences are - treated as single-character escapes. For example, \j is a literal "j" - and \x{2z} is treated as the literal string "x{2z}". Setting this op- + If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to + pcre2_compile(), all unrecognized or malformed escape sequences are + treated as single-character escapes. For example, \j is a literal "j" + and \x{2z} is treated as the literal string "x{2z}". Setting this op- tion means that typos in patterns may go undetected and have unexpected - results. Also note that a sequence such as [\N{] is interpreted as a - malformed attempt at [\N{...}] and so is treated as [N{] whereas [\N] + results. Also note that a sequence such as [\N{] is interpreted as a + malformed attempt at [\N{...}] and so is treated as [N{] whereas [\N] gives an error because an unqualified \N is a valid escape sequence but - is not supported in a character class. To reiterate: this is a danger- + is not supported in a character class. To reiterate: this is a danger- ous option. Use with great care. PCRE2_EXTRA_CASELESS_RESTRICT - When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows + When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode rules, which allow for more than two cases per character. There are two case-equivalent character sets that contain both ASCII and non- ASCII characters. 
The ASCII letter S is case-equivalent to U+017f (long - S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). - This option disables recognition of case-equivalences that cross the + S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). + This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must ei- - ther be ASCII or non-ASCII. The option can be changed with a pattern by - the (?r) option setting. + ther be ASCII or non-ASCII. The option can be changed within a pattern + by the (*CASELESS_RESTRICT) or (?r) option settings. PCRE2_EXTRA_ESCAPED_CR_IS_LF @@ -2062,6 +2187,36 @@ COMPILING A PATTERN end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. + PCRE2_EXTRA_NO_BS0 + + If this option is set (note that its final character is the digit 0) it + locks out the use of the sequence \0 unless at least one more octal + digit follows. + + PCRE2_EXTRA_PYTHON_OCTAL + + If this option is set, PCRE2 follows Python's rules for interpreting + octal escape sequences. The rules for handling sequences such as \14, + which could be an octal number or a back reference are different. De- + tails are given in the pcre2pattern documentation. + + PCRE2_EXTRA_NEVER_CALLOUT + + If this option is set, PCRE2 treats callouts in the pattern as a syntax + error, returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if + the application knows that a callout will not be provided to + pcre2_match(), so that callouts in the pattern are not silently ig- + nored. + + PCRE2_EXTRA_TURKISH_CASING + + This option alters case-equivalence of the 'i' letters to follow the + alphabet used by Turkish and Azeri languages. The option can be changed + within a pattern by the (*TURKISH_CASING) start-of-pattern setting. Ei- + ther the UTF or UCP options must be set. In the 8-bit library, UTF must + be set. 
This option cannot be combined with PCRE2_EXTRA_CASELESS_RE- + STRICT. + JUST-IN-TIME (JIT) COMPILATION @@ -2255,6 +2410,7 @@ INFORMATION ABOUT A COMPILED PATTERN PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set + Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. @@ -3520,9 +3676,9 @@ CREATING A NEW STRING WITH SUBSTITUTIONS ORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (with- out, of course, writing anything) in order to compute the size of - buffer that is needed. This value is passed back via the outlengthptr - variable, with the result of the function still being PCRE2_ER- - ROR_NOMEMORY. + buffer that is needed, which will include the extra space for the ter- + minating NUL. This value is passed back via the outlengthptr variable, + with the result of the function still being PCRE2_ERROR_NOMEMORY. Passing a buffer size of zero is a permitted way of finding out how much memory is needed for given substitution. However, this does mean @@ -3541,24 +3697,32 @@ CREATING A NEW STRING WITH SUBSTITUTIONS cape character that can specify the insertion of characters from cap- ture groups and names from (*MARK) or other control verbs in the pat- tern. Dollar is the only escape character (backslash is treated as lit- - eral). The following forms are always recognized: + eral). 
The following forms are recognized: $$ insert a dollar character - $<n> or ${<n>} insert the contents of group <n> + $n or ${n} insert the contents of group n + $0 or $& insert the entire matched substring + $` insert the substring that precedes the match + $' insert the substring that follows the match + $_ insert the entire input string $*MARK or ${*MARK} insert a control verb name - Either a group number or a group name can be given for <n>. Curly - brackets are required only if the following character would be inter- - preted as part of the number or name. The number may be zero to include - the entire matched string. For example, if the pattern a(b)c is - matched with "=abc=" and the replacement string "+$1$0$1+", the result - is "=+babcb+=". + Either a group number or a group name can be given for n, for example + $2 or $NAME. Curly brackets are required only if the following charac- + ter would be interpreted as part of the number or name. The number may + be zero to include the entire matched string. For example, if the pat- + tern a(b)c is matched with "=abc=" and the replacement string + "+$1$0$1+", the result is "=+babcb+=". + + The JavaScript form $<n>, where the angle brackets are part of the + syntax, is also recognized for group names, but not for group numbers + or *MARK. - $*MARK inserts the name from the last encountered backtracking control - verb on the matching path that has a name. (*MARK) must always include - a name, but the other verbs need not. For example, in the case of + $*MARK inserts the name from the last encountered backtracking control + verb on the matching path that has a name. (*MARK) must always include + a name, but the other verbs need not. For example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B) - the relevant name is "B". This facility can be used to perform simple + the relevant name is "B". 
This facility can be used to perform simple simultaneous substitutions, as this pcre2test example shows: /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} @@ -3566,15 +3730,15 @@ CREATING A NEW STRING WITH SUBSTITUTIONS 2: pear orange PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject - string, replacing every matching substring. If this option is not set, - only the first matching substring is replaced. The search for matches - takes place in the original subject string (that is, previous replace- - ments do not affect it). Iteration is implemented by advancing the - startoffset value for each search, which is always passed the entire + string, replacing every matching substring. If this option is not set, + only the first matching substring is replaced. The search for matches + takes place in the original subject string (that is, previous replace- + ments do not affect it). Iteration is implemented by advancing the + startoffset value for each search, which is always passed the entire subject string. If an offset limit is set in the match context, search- ing stops when that limit is reached. - You can restrict the effect of a global substitution to a portion of + You can restrict the effect of a global substitution to a portion of the subject string by setting either or both of startoffset and an off- set limit. Here is a pcre2test example: @@ -3582,73 +3746,95 @@ CREATING A NEW STRING WITH SUBSTITUTIONS ABC ABC ABC ABC\=offset=3,offset_limit=12 2: ABC A!C A!C ABC - When continuing with global substitutions after matching a substring + When continuing with global substitutions after matching a substring with zero length, an attempt to find a non-empty match at the same off- set is performed. If this is not successful, the offset is advanced by one character except when CRLF is a valid newline sequence and the next - two characters are CR, LF. In this case, the offset is advanced by two + two characters are CR, LF. 
In this case, the offset is advanced by two characters. PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do not appear in the pattern to be treated as unset groups. This option - should be used with care, because it means that a typo in a group name + should be used with care, because it means that a typo in a group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING error. PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (including un- - known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated - as empty strings when inserted as described above. If this option is + known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated + as empty strings when inserted as described above. If this option is not set, an attempt to insert an unset group causes the PCRE2_ERROR_UN- - SET error. This option does not influence the extended substitution + SET error. This option does not influence the extended substitution syntax described below. - PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the - replacement string. Without this option, only the dollar character is - special, and only the group insertion forms listed above are valid. - When PCRE2_SUBSTITUTE_EXTENDED is set, two things change: + PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the + replacement string. Without this option, only the dollar character is + special, and only the group insertion forms listed above are valid. + When PCRE2_SUBSTITUTE_EXTENDED is set, several things change: + + Firstly, backslash in a replacement string is interpreted as an escape + character. The usual forms such as \x{ddd} can be used to specify par- + ticular character codes, and backslash followed by any non-alphanumeric + character quotes that character. Extended quoting can be coded using + \Q...\E, exactly as in pattern strings. The escapes \b and \v are in- + terpreted as the characters backspace and vertical tab, respectively. 
- Firstly, backslash in a replacement string is interpreted as an escape - character. The usual forms such as \n or \x{ddd} can be used to specify - particular character codes, and backslash followed by any non-alphanu- - meric character quotes that character. Extended quoting can be coded - using \Q...\E, exactly as in pattern strings. + The interpretation of backslash followed by one or more digits is the + same as in a pattern, which in Perl has some ambiguities. Details are + given in the pcre2pattern page. + + The Python form \g<n>, where the angle brackets are part of the syntax + and n is either a group name or number, is recognized as an alternative + way of inserting the contents of a group, for example \g<3>. There are also four escape sequences for forcing the case of inserted - letters. The insertion mechanism has three states: no case forcing, - force upper case, and force lower case. The escape sequences change the - current state: \U and \L change to upper or lower case forcing, respec- - tively, and \E (when not terminating a \Q quoted sequence) reverts to - no case forcing. The sequences \u and \l force the next character (if - it is a letter) to upper or lower case, respectively, and then the - state automatically reverts to no case forcing. Case forcing applies to - all inserted characters, including those from capture groups and let- - ters within \Q...\E quoted sequences. If either PCRE2_UTF or PCRE2_UCP - was set when the pattern was compiled, Unicode properties are used for - case forcing characters whose code points are greater than 127. + letters. Case forcing applies to all inserted characters, including + those from capture groups and letters within \Q...\E quoted sequences. + The insertion mechanism has three states: no case forcing, force upper + case, and force lower case. 
The escape sequences change the current + state: \U and \L change to upper or lower case forcing, respectively, + and \E (when not terminating a \Q quoted sequence) reverts to no case + forcing. The sequences \u and \l force the next character (if it is a + letter) to upper or lower case, respectively, and then the state auto- + matically reverts to no case forcing. + + However, if \u is immediately followed by \L or \l is immediately fol- + lowed by \U, the next character's case is forced by the first escape + sequence, and subsequent characters by the second. This provides a "ti- + tle casing" facility that can be applied to group captures. For exam- + ple, if group 1 has captured "heLLo", the replacement string "\u\L$1" + becomes "Hello". + + If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, + Unicode properties are used for case forcing characters whose code + points are greater than 127. However, only simple case folding, as de- + termined by the Unicode file CaseFolding.txt is supported. PCRE2 does + not support language-specific special casing rules such as using dif- + ferent lower case Greek sigmas in the middle and ends of words (as de- + fined in the Unicode file SpecialCasing.txt). Note that case forcing sequences such as \U...\E do not nest. For exam- ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EX- TRA_ALT_BSUX options do not apply to replacement strings. - The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more + The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash: - ${<n>:-<string>} - ${<n>:+<string1>:<string2>} + ${n:-string} + ${n:+string1:string2} - As before, <n> may be a group number or a name. The first form speci- - fies a default value. If group <n> is set, its value is inserted; if - not, <string> is expanded and the result inserted. 
The second form - specifies strings that are expanded and inserted when group <n> is set - or unset, respectively. The first form is just a convenient shorthand - for + As in the simple case, n may be a group number or a name. The first + form specifies a default value. If group n is set, its value is in- + serted; if not, the string is expanded and the result inserted. The + second form specifies strings that are expanded and inserted when group + n is set or unset, respectively. The first form is just a convenient + shorthand for - ${<n>:+${<n>}:<string>} + ${n:+${n}:string} - Backslash can be used to escape colons and closing curly brackets in - the replacement strings. A change of the case forcing state within a - replacement string remains in force afterwards, as shown in this + Backslash can be used to escape colons and closing curly brackets in + the replacement strings. A change of the case forcing state within a + replacement string remains in force afterwards, as shown in this pcre2test example: /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo @@ -3657,8 +3843,8 @@ CREATING A NEW STRING WITH SUBSTITUTIONS somebody 1: HELLO - The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended - substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- + The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended + substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- known groups in the extended syntax forms to be treated as unset. If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, @@ -3667,39 +3853,39 @@ CREATING A NEW STRING WITH SUBSTITUTIONS Substitution errors - In the event of an error, pcre2_substitute() returns a negative error - code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors + In the event of an error, pcre2_substitute() returns a negative error + code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from pcre2_match() are passed straight back. 
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser- tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ- - ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) - when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- + ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) + when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- SET_EMPTY is not set. - PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big + PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size - of buffer that is needed is returned via outlengthptr. Note that this + of buffer that is needed is returned via outlengthptr. Note that this does not happen by default. PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the - match_data argument is NULL or if the subject or replacement arguments - are NULL. For backward compatibility reasons an exception is made for + match_data argument is NULL or if the subject or replacement arguments + are NULL. For backward compatibility reasons an exception is made for the replacement argument if the rlength argument is also 0. 
- PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in - the replacement string, with more particular errors being PCRE2_ER- + PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in + the replacement string, with more particular errors being PCRE2_ER- ROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE - (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax - error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN + (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax + error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started or the match started earlier - than the current position in the subject, which can happen if \K is + than the current position in the subject, which can happen if \K is used in an assertion). As for all PCRE2 errors, a text message that describes the error can be - obtained by calling the pcre2_get_error_message() function (see "Ob- + obtained by calling the pcre2_get_error_message() function (see "Ob- taining a textual error message" above). Substitution callouts @@ -3708,12 +3894,20 @@ CREATING A NEW STRING WITH SUBSTITUTIONS int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); - The pcre2_set_substitution_callout() function can be used to specify a - callout function for pcre2_substitute(). This information is passed in + The pcre2_set_substitution_callout() function can be used to specify a + callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution - has been processed, but it can cause the replacement not to happen. The - callout function is not called for simulated substitutions that happen - as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. + has been processed, but it can cause the replacement not to happen. 
+ + The callout function is not called for simulated substitutions that + happen as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In + this mode, when substitution processing exceeds the buffer space pro- + vided by the caller, processing continues by counting code units. The + simulation is unable to populate the callout block, and so the simula- + tion is pessimistic about the required buffer size. Whichever is larger + of accepted or rejected substitution is reported as the required size. + Therefore, the returned buffer length may be an overestimate (without a + substitution callout, it is normally an exact measurement). The first argument of the callout function is a pointer to a substitute callout block structure, which contains the following fields, not nec- @@ -3757,62 +3951,149 @@ CREATING A NEW STRING WITH SUBSTITUTIONS to the output and the call to pcre2_substitute() exits, returning the number of matches so far. + Substitution case callouts + + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + + The pcre2_set_substitute_case_callout() function can be used to spec- + ify a callout function for pcre2_substitute() to use when performing + case transformations. This does not affect any case insensitivity be- + haviour when performing a match, but only the user-visible transforma- + tions performed when processing a substitution such as: + + pcre2_substitute(..., "\\U$1", ...) + + The default case transformations applied by PCRE2 are reasonably com- + plete, and, in UTF or UCP mode, perform the simple locale-invariant + case transformations as specified by Unicode. This is suitable for the + internal (invisible) case-equivalence procedures used during pattern + matching, but an application may wish to use more sophisticated locale- + aware processing for the user-visible substitution transformations. 
+ + One example implementation of the callout_function using the ICU li- + brary would be: + + PCRE2_SIZE + icu_case_callout( + PCRE2_SPTR input, PCRE2_SIZE input_len, + PCRE2_UCHAR *output, PCRE2_SIZE output_cap, + int to_case, void *data_ptr) + { + UErrorCode err = U_ZERO_ERROR; + int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER + ? u_strToLower(output, output_cap, input, input_len, NULL, &err) + : to_case == PCRE2_SUBSTITUTE_CASE_UPPER + ? u_strToUpper(output, output_cap, input, input_len, NULL, &err) + : u_strToTitle(output, output_cap, input, input_len, &first_char_only, + NULL, &err); + if (U_FAILURE(err)) return (~(PCRE2_SIZE)0); + return r; + } + + The first and second arguments of the case callout function are the + Unicode string to transform. + + The third and fourth arguments are the output buffer and its capacity. + + The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, + PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. + PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed + to the callout to indicate that the case of the entire callout input + should be case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed + to indicate that only the first character or glyph should be trans- + formed to Unicode titlecase and the rest to Unicode lowercase (note + that titlecasing sometimes uses Unicode properties to titlecase each + word in a string; but PCRE2 is requesting that only the single leading + character is to be titlecased). + + The sixth argument is the callout_data supplied to pcre2_set_substi- + tute_case_callout(). + + The resulting string in the destination buffer may be larger or smaller + than the input, if the casing rules merge or split characters. The re- + turn value is the length required for the output string. If a buffer of + sufficient size was provided to the callout, then the result must be + written to the buffer and the number of code units returned. 
If the re- + sult does not fit in the provided buffer, then the required capacity + must be returned and PCRE2 will not make use of the output buffer. + PCRE2 provides input and output buffers which overlap, so the callout + must support this by suitable internal buffering. + + Alternatively, if the callout wishes to indicate an error, then it may + return (~(PCRE2_SIZE)0). In this case pcre2_substitute() will immedi- + ately fail with error PCRE2_ERROR_REPLACECASE. + + When a case callout is combined with the PCRE2_SUBSTITUTE_OVER- + FLOW_LENGTH option, there are situations when pcre2_substitute() will + return an underestimate of the required buffer size. If you call + pcre2_substitute() once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the + input buffer is too small for the replacement string to be constructed, + then instead of calling the case callout, pcre2_substitute() will make + an estimate of the required buffer size. The second call should also + pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that second call is not + guaranteed to succeed either, if the case callout requires more buffer + space than expected. The caller must make repeated attempts in a loop. + DUPLICATE CAPTURE GROUP NAMES int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); - When a pattern is compiled with the PCRE2_DUPNAMES option, names for - capture groups are not required to be unique. Duplicate names are al- - ways allowed for groups with the same number, created by using the (?| + When a pattern is compiled with the PCRE2_DUPNAMES option, names for + capture groups are not required to be unique. Duplicate names are al- + ways allowed for groups with the same number, created by using the (?| feature. Indeed, if such groups are named, they are required to use the same names. - Normally, patterns that use duplicate names are such that in any one - match, only one of each set of identically-named groups participates. 
+ Normally, patterns that use duplicate names are such that in any one + match, only one of each set of identically-named groups participates. An example is shown in the pcre2pattern documentation. - When duplicates are present, pcre2_substring_copy_byname() and - pcre2_substring_get_byname() return the first substring corresponding - to the given name that is set. Only if none are set is PCRE2_ERROR_UN- - SET is returned. The pcre2_substring_number_from_name() function re- - turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate + When duplicates are present, pcre2_substring_copy_byname() and + pcre2_substring_get_byname() return the first substring corresponding + to the given name that is set. Only if none are set is PCRE2_ERROR_UN- + SET is returned. The pcre2_substring_number_from_name() function re- + turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names. - If you want to get full details of all captured substrings for a given - name, you must use the pcre2_substring_nametable_scan() function. The - first argument is the compiled pattern, and the second is the name. If - the third and fourth arguments are NULL, the function returns a group + If you want to get full details of all captured substrings for a given + name, you must use the pcre2_substring_nametable_scan() function. The + first argument is the compiled pattern, and the second is the name. If + the third and fourth arguments are NULL, the function returns a group number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise. When the third and fourth arguments are not NULL, they must be pointers - to variables that are updated by the function. After it has run, they + to variables that are updated by the function. After it has run, they point to the first and last entries in the name-to-number table for the - given name, and the function returns the length of each entry in code - units. 
In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are + given name, and the function returns the length of each entry in code + units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. The format of the name table is described above in the section entitled - Information about a pattern. Given all the relevant entries for the - name, you can extract each of their numbers, and hence the captured + Information about a pattern. Given all the relevant entries for the + name, you can extract each of their numbers, and hence the captured data. FINDING ALL POSSIBLE MATCHES AT ONE POSITION - The traditional matching function uses a similar algorithm to Perl, - which stops when it finds the first match at a given point in the sub- + The traditional matching function uses a similar algorithm to Perl, + which stops when it finds the first match at a given point in the sub- ject. If you want to find all possible matches, or the longest possible - match at a given position, consider using the alternative matching - function (see below) instead. If you cannot use the alternative func- + match at a given position, consider using the alternative matching + function (see below) instead. If you cannot use the alternative func- tion, you can kludge it up by making use of the callout facility, which is described in the pcre2callout documentation. What you have to do is to insert a callout right at the end of the pat- - tern. When your callout function is called, extract and save the cur- - rent matched substring. Then return 1, which forces pcre2_match() to - backtrack and try other alternatives. Ultimately, when it runs out of + tern. When your callout function is called, extract and save the cur- + rent matched substring. Then return 1, which forces pcre2_match() to + backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH. 
@@ -3824,27 +4105,27 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount); - The function pcre2_dfa_match() is called to match a subject string - against a compiled pattern, using a matching algorithm that scans the + The function pcre2_dfa_match() is called to match a subject string + against a compiled pattern, using a matching algorithm that scans the subject string just once (not counting lookaround assertions), and does - not backtrack (except when processing lookaround assertions). This has - different characteristics to the normal algorithm, and is not compati- - ble with Perl. Some of the features of PCRE2 patterns are not sup- + not backtrack (except when processing lookaround assertions). This has + different characteristics to the normal algorithm, and is not compati- + ble with Perl. Some of the features of PCRE2 patterns are not sup- ported. Nevertheless, there are times when this kind of matching can be - useful. For a discussion of the two matching algorithms, and a list of + useful. For a discussion of the two matching algorithms, and a list of features that pcre2_dfa_match() does not support, see the pcre2matching documentation. - The arguments for the pcre2_dfa_match() function are the same as for + The arguments for the pcre2_dfa_match() function are the same as for pcre2_match(), plus two extras. The ovector within the match data block is used in a different way, and this is described below. The other com- - mon arguments are used in the same way as for pcre2_match(), so their + mon arguments are used in the same way as for pcre2_match(), so their description is not repeated here. - The two additional arguments provide workspace for the function. The - workspace vector should contain at least 20 elements. It is used for - keeping track of multiple paths through the pattern tree. 
More work- - space is needed for patterns and subjects where there are a lot of po- + The two additional arguments provide workspace for the function. The + workspace vector should contain at least 20 elements. It is used for + keeping track of multiple paths through the pattern tree. More work- + space is needed for patterns and subjects where there are a lot of po- tential matches. Here is an example of a simple call to pcre2_dfa_match(): @@ -3864,45 +4145,45 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION Option bits for pcre2_dfa_match() - The unused bits of the options argument for pcre2_dfa_match() must be - zero. The only bits that may be set are PCRE2_ANCHORED, - PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NO- + The unused bits of the options argument for pcre2_dfa_match() must be + zero. The only bits that may be set are PCRE2_ANCHORED, + PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NO- TEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, - PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and - PCRE2_DFA_RESTART. All but the last four of these are exactly the same + PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and + PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for pcre2_match(), so their description is not repeated here. PCRE2_PARTIAL_HARD PCRE2_PARTIAL_SOFT - These have the same general effect as they do for pcre2_match(), but - the details are slightly different. When PCRE2_PARTIAL_HARD is set for - pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the + These have the same general effect as they do for pcre2_match(), but + the details are slightly different. When PCRE2_PARTIAL_HARD is set for + pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the subject is reached and there is still at least one matching possibility that requires additional characters. 
This happens even if some complete - matches have already been found. When PCRE2_PARTIAL_SOFT is set, the - return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL - if the end of the subject is reached, there have been no complete + matches have already been found. When PCRE2_PARTIAL_SOFT is set, the + return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL + if the end of the subject is reached, there have been no complete matches, but there is still at least one matching possibility. The por- - tion of the string that was inspected when the longest partial match + tion of the string that was inspected when the longest partial match was found is set as the first matching string in both cases. There is a - more detailed discussion of partial and multi-segment matching, with + more detailed discussion of partial and multi-segment matching, with examples, in the pcre2partial documentation. PCRE2_DFA_SHORTEST - Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to + Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as soon as it has found one match. Because of the way the alterna- - tive algorithm works, this is necessarily the shortest possible match + tive algorithm works, this is necessarily the shortest possible match at the first possible matching point in the subject string. PCRE2_DFA_RESTART - When pcre2_dfa_match() returns a partial match, it is possible to call + When pcre2_dfa_match() returns a partial match, it is possible to call it again, with additional subject characters, and have it continue with the same match. The PCRE2_DFA_RESTART option requests this action; when - it is set, the workspace and wscount options must reference the same - vector as before because data about the match so far is left in them + it is set, the workspace and wscount options must reference the same + vector as before because data about the match so far is left in them after a partial match. 
There is more discussion of this facility in the pcre2partial documentation. @@ -3910,8 +4191,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION When pcre2_dfa_match() succeeds, it may have matched more than one sub- string in the subject. Note, however, that all the matches from one run - of the function start at the same point in the subject. The shorter - matches are all initial substrings of the longer matches. For example, + of the function start at the same point in the subject. The shorter + matches are all initial substrings of the longer matches. For example, if the pattern <.*> @@ -3926,80 +4207,80 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION - On success, the yield of the function is a number greater than zero, - which is the number of matched substrings. The offsets of the sub- - strings are returned in the ovector, and can be extracted by number in - the same way as for pcre2_match(), but the numbers bear no relation to - any capture groups that may exist in the pattern, because DFA matching + On success, the yield of the function is a number greater than zero, + which is the number of matched substrings. The offsets of the sub- + strings are returned in the ovector, and can be extracted by number in + the same way as for pcre2_match(), but the numbers bear no relation to + any capture groups that may exist in the pattern, because DFA matching does not support capturing. - Calls to the convenience functions that extract substrings by name re- + Calls to the convenience functions that extract substrings by name re- turn the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used af- - ter a DFA match. The convenience functions that extract substrings by + ter a DFA match. The convenience functions that extract substrings by number never return PCRE2_ERROR_NOSUBSTRING. - The matched strings are stored in the ovector in reverse order of - length; that is, the longest matching string is first. 
If there were - too many matches to fit into the ovector, the yield of the function is + The matched strings are stored in the ovector in reverse order of + length; that is, the longest matching string is first. If there were + too many matches to fit into the ovector, the yield of the function is zero, and the vector is filled with the longest matches. - NOTE: PCRE2's "auto-possessification" optimization usually applies to - character repeats at the end of a pattern (as well as internally). For - example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA - matching, this means that only one possible match is found. If you re- + NOTE: PCRE2's "auto-possessification" optimization usually applies to + character repeats at the end of a pattern (as well as internally). For + example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA + matching, this means that only one possible match is found. If you re- ally do want multiple matches in such cases, either use an ungreedy re- - peat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when com- + peat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when com- piling. Error returns from pcre2_dfa_match() The pcre2_dfa_match() function returns a negative number when it fails. - Many of the errors are the same as for pcre2_match(), as described + Many of the errors are the same as for pcre2_match(), as described above. There are in addition the following errors that are specific to pcre2_dfa_match(): PCRE2_ERROR_DFA_UITEM - This return is given if pcre2_dfa_match() encounters an item in the - pattern that it does not support, for instance, the use of \C in a UTF + This return is given if pcre2_dfa_match() encounters an item in the + pattern that it does not support, for instance, the use of \C in a UTF mode or a backreference. 
PCRE2_ERROR_DFA_UCOND - This return is given if pcre2_dfa_match() encounters a condition item + This return is given if pcre2_dfa_match() encounters a condition item that uses a backreference for the condition, or a test for recursion in a specific capture group. These are not supported. PCRE2_ERROR_DFA_UINVALID_UTF - This return is given if pcre2_dfa_match() is called for a pattern that - was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for + This return is given if pcre2_dfa_match() is called for a pattern that + was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for DFA matching. PCRE2_ERROR_DFA_WSSIZE - This return is given if pcre2_dfa_match() runs out of space in the + This return is given if pcre2_dfa_match() runs out of space in the workspace vector. PCRE2_ERROR_DFA_RECURSE When a recursion or subroutine call is processed, the matching function - calls itself recursively, using private memory for the ovector and - workspace. This error is given if the internal ovector is not large - enough. This should be extremely rare, as a vector of size 1000 is + calls itself recursively, using private memory for the ovector and + workspace. This error is given if the internal ovector is not large + enough. This should be extremely rare, as a vector of size 1000 is used. PCRE2_ERROR_DFA_BADRESTART - When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, - some plausibility checks are made on the contents of the workspace, - which should contain data about the previous partial match. If any of + When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, + some plausibility checks are made on the contents of the workspace, + which should contain data about the previous partial match. If any of these checks fail, this error is given. 
SEE ALSO - pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), + pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3). @@ -4012,15 +4293,14 @@ AUTHOR REVISION - Last updated: 24 April 2024 + Last updated: 26 December 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 24 April 2024 PCRE2API(3) +PCRE2 10.45 26 December 2024 PCRE2API(3) ------------------------------------------------------------------------------ - PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3) @@ -4639,15 +4919,14 @@ AUTHOR REVISION - Last updated: 15 April 2024 + Last updated: 16 April 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 15 April 2024 PCRE2BUILD(3) +PCRE2 10.45 16 April 2024 PCRE2BUILD(3) ------------------------------------------------------------------------------ - PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3) @@ -5077,11 +5356,10 @@ REVISION Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 19 January 2024 PCRE2CALLOUT(3) +PCRE2 10.45 19 January 2024 PCRE2CALLOUT(3) ------------------------------------------------------------------------------ - PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3) @@ -5140,7 +5418,7 @@ DIFFERENCES BETWEEN PCRE2 AND PERL 7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties - such as Lu and Nd, the derived properties Any and LC (synonym L&), + such as Lu and Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) prop- erty, but in PCRE2 its use is limited. 
See the pcre2pattern documenta- @@ -5167,118 +5445,128 @@ DIFFERENCES BETWEEN PCRE2 AND PERL \Q\\E \ \\E The \Q...\E sequence is recognized both inside and outside character - classes by both PCRE2 and Perl. - - 9. Fairly obviously, PCRE2 does not support the (?{code}) and + classes by both PCRE2 and Perl. Another difference from Perl is that + any appearance of \Q or \E inside what might otherwise be a quantifier + causes PCRE2 not to recognize the sequence as a quantifier. Perl recog- + nizes a quantifier if (redundantly) either of the numbers is inside + \Q...\E, but not if the separating comma is. When not recognized as a + quantifier a sequence such as {\Q1\E,2} is treated as the literal + string "{1,2}". + + 9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. See the pcre2callout documentation for details. - 10. Subroutine calls (whether recursive or not) were treated as atomic - groups up to PCRE2 release 10.23, but from release 10.30 this changed, + 10. Subroutine calls (whether recursive or not) were treated as atomic + groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl. - 11. In PCRE2, if any of the backtracking control verbs are used in a - group that is called as a subroutine (whether or not recursively), - their effect is confined to that group; it does not extend to the sur- - rounding pattern. This is not always the case in Perl. In particular, - if (*THEN) is present in a group that is called as a subroutine, its + 11. In PCRE2, if any of the backtracking control verbs are used in a + group that is called as a subroutine (whether or not recursively), + their effect is confined to that group; it does not extend to the sur- + rounding pattern. This is not always the case in Perl. 
In particular, + if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any - | characters. Note that such groups are processed as anchored at the - point where they are tested. - - 12. If a pattern contains more than one backtracking control verb, the - first one that is backtracked onto acts. For example, in the pattern - A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure + | characters. Note that such groups are processed as anchored at the + point where they are tested. PCRE2 also confines all control verbs + within atomic assertions, again including (*THEN) in assertions with + only one branch. + + 12. If a pattern contains more than one backtracking control verb, the + first one that is backtracked onto acts. For example, in the pattern + A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs. - 13. There are some differences that are concerned with the settings of - captured strings when part of a pattern is repeated. For example, - matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- + 13. There are some differences that are concerned with the settings of + captured strings when part of a pattern is repeated. For example, + matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- set, but in PCRE2 it is set to "b". - 14. PCRE2's handling of duplicate capture group numbers and names is - not as general as Perl's. This is a consequence of the fact the PCRE2 - works internally just with numbers, using an external table to trans- - late between numbers and names. In particular, a pattern such as - (?|(?A)|(?B)), where the two capture groups have the same number - but different names, is not supported, and causes an error at compile + 14. 
PCRE2's handling of duplicate capture group numbers and names is + not as general as Perl's. This is a consequence of the fact that PCRE2 + works internally just with numbers, using an external table to trans- + late between numbers and names. In particular, a pattern such as + (?|(?<a>A)|(?<b>B)), where the two capture groups have the same number + but different names, is not supported, and causes an error at compile time. If it were allowed, it would not be possible to distinguish which - group matched, because both names map to capture group number 1. To + group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time. 15. Perl used to recognize comments in some places that PCRE2 does not, - for example, between the ( and ? at the start of a group. If the /x - modifier is set, Perl allowed white space between ( and ? though the - latest Perls give an error (for a while it was just deprecated). There + for example, between the ( and ? at the start of a group. If the /x + modifier is set, Perl allowed white space between ( and ? though the + latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently. - 16. Perl, when in warning mode, gives warnings for character classes - such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- + 16. Perl, when in warning mode, gives warnings for character classes + such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- als. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes. - 17. In PCRE2, the upper/lower case character properties Lu and Ll are - not affected when case-independent matching is specified. For example, - \p{Lu} always matches an upper case letter. 
I think Perl has changed in - this respect; in the release at the time of writing (5.38), \p{Lu} and - \p{Ll} match all letters, regardless of case, when case independence is - specified. + 17. In PCRE2, until release 10.45, the upper/lower case character prop- + erties Lu and Ll were not affected when case-independent matching was + specified. Perl has changed in this respect, and PCRE2 has now changed + to match. When caseless matching is in force, Lu, Ll, and Lt (title + case) are all treated as Lc (cased letter). 18. From release 5.32.0, Perl locks out the use of \K in lookaround as- - sertions. From release 10.38 PCRE2 does the same by default. However, - there is an option for re-enabling the previous behaviour. When this - option is set, \K is acted on when it occurs in positive assertions, + sertions. From release 10.38 PCRE2 does the same by default. However, + there is an option for re-enabling the previous behaviour. When this + option is set, \K is acted on when it occurs in positive assertions, but is ignored in negative assertions. - 19. PCRE2 provides some extensions to the Perl regular expression fa- - cilities. Perl 5.10 included new features that were not in earlier - versions of Perl, some of which (such as named parentheses) were in + 19. PCRE2 provides some extensions to the Perl regular expression fa- + cilities. Perl 5.10 included new features that were not in earlier + versions of Perl, some of which (such as named parentheses) were in PCRE2 for some time before. This list is with respect to Perl 5.38: - (a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the + (a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ meta-character matches only at the very end of the string. - (b) A backslash followed by a letter with no special meaning is + (b) A backslash followed by a letter with no special meaning is faulted. (Perl can be made to issue a warning.) 
- (c) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- + (c) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- fiers is inverted, that is, by default they are not greedy, but if fol- lowed by a question mark they are. - (d) PCRE2_ANCHORED can be used at matching time to force a pattern to + (d) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string. - (e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and + (e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART options have no Perl equivalents. - (f) The \R escape sequence can be restricted to match only CR, LF, or + (f) The \R escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE2_BSR_ANYCRLF option. - (g) The callout facility is PCRE2-specific. Perl supports codeblocks + (g) The callout facility is PCRE2-specific. Perl supports codeblocks and variable interpolation, but not general hooks on every match. (h) The partial matching facility is PCRE2-specific. - (i) The alternative matching function (pcre2_dfa_match() matches in a + (i) The alternative matching function (pcre2_dfa_match()) matches in a different way and is not Perl-compatible. - (j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) - at the start of a pattern. These set overall options that cannot be + (j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) + at the start of a pattern. These set overall options that cannot be changed within the pattern. - (k) PCRE2 supports non-atomic positive lookaround assertions. This is + (k) PCRE2 supports non-atomic positive lookaround assertions. This is an extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic. 
- (l) There are three syntactical items in patterns that can refer to a - capturing group by number: back references such as \g{2}, subroutine - calls such as (?3), and condition references such as (?(4)...). PCRE2 - supports relative group numbers such as +2 and -4 in all three cases. - Perl supports both plus and minus for subroutine calls, but only minus + (l) There are three syntactical items in patterns that can refer to a + capturing group by number: back references such as \g{2}, subroutine + calls such as (?3), and condition references such as (?(4)...). PCRE2 + supports relative group numbers such as +2 and -4 in all three cases. + Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions. + (m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 ex- + tension that is not available in Perl. + 20. Perl has different limits than PCRE2. See the pcre2limit documenta- tion for details. Perl went with 5.10 from recursion to iteration keep- ing the intermediate matches on the heap, which is ~10% slower but does @@ -5297,6 +5585,17 @@ DIFFERENCES BETWEEN PCRE2 AND PERL ple is /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject. + 23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl + tries to recover and prints a warning if the problem was that an in- + valid hexadecimal digit was found. Since PCRE2 doesn't have warnings, it + returns an error instead. Additionally, Perl accepts \x{} and gener- + ates NUL, unlike PCRE2. + + 24. From release 10.45, PCRE2 gives an error if \x is not followed by a + hexadecimal digit or a curly bracket. It used to interpret this as the + NUL character. Perl still generates NUL, but warns when in warning mode + in most cases. + AUTHOR @@ -5307,15 +5606,14 @@ AUTHOR REVISION - Last updated: 30 November 2023 - Copyright (c) 1997-2023 University of Cambridge. 
+ Last updated: 02 October 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 30 November 2023 PCRE2COMPAT(3) +PCRE2 10.45 02 October 2024 PCRE2COMPAT(3) ------------------------------------------------------------------------------ - PCRE2JIT(3) Library Functions Manual PCRE2JIT(3) @@ -5359,146 +5657,155 @@ AVAILABILITY OF JIT SUPPORT If --enable-jit is set on an unsupported platform, compilation fails. - A client program can tell if JIT support is available by calling + A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular match. One reason for this is that there are a number of op- tions and pattern items that are not supported by JIT (see below). An- - other reason is that in some environments JIT is unable to get memory - in which to build its compiled code. The only guarantee from pcre2_con- - fig() is that if it returns zero, JIT will definitely not be used. - - A simple program does not need to check availability in order to use - JIT when possible. The API is implemented in a way that falls back to - the interpretive code if JIT is not available or cannot be used for a - given match. For programs that need the best possible performance, + other reason is that in some environments JIT is unable to get exe- + cutable memory in which to build its compiled code. The only guarantee + from pcre2_config() is that if it returns zero, JIT will definitely not + be used. + + As of release 10.45 there is a more informative way to test for JIT + support. If pcre2_jit_compile() is called with the single option + PCRE2_JIT_TEST_ALLOC it returns zero if JIT is available and has a + working allocator. 
Otherwise it returns PCRE2_ERROR_NOMEMORY if JIT is + available but cannot allocate executable memory, or PCRE2_ERROR_JIT_UN- + SUPPORTED if JIT support is not compiled. The code argument is ignored, + so it can be a NULL value. + + A simple program does not need to check availability in order to use + JIT when possible. The API is implemented in a way that falls back to + the interpretive code if JIT is not available or cannot be used for a + given match. For programs that need the best possible performance, there is a "fast path" API that is JIT-specific. SIMPLE USE OF JIT - To make use of the JIT support in the simplest way, all you have to do - is to call pcre2_jit_compile() after successfully compiling a pattern + To make use of the JIT support in the simplest way, all you have to do + is to call pcre2_jit_compile() after successfully compiling a pattern with pcre2_compile(). This function has two arguments: the first is the - compiled pattern pointer that was returned by pcre2_compile(), and the - second is zero or more of the following option bits: PCRE2_JIT_COM- + compiled pattern pointer that was returned by pcre2_compile(), and the + second is zero or more of the following option bits: PCRE2_JIT_COM- PLETE, PCRE2_JIT_PARTIAL_HARD, or PCRE2_JIT_PARTIAL_SOFT. - If JIT support is not available, a call to pcre2_jit_compile() does - nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled + If JIT support is not available, a call to pcre2_jit_compile() does + nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled pattern is passed to the JIT compiler, which turns it into machine code that executes much faster than the normal interpretive code, but yields - exactly the same results. The returned value from pcre2_jit_compile() + exactly the same results. The returned value from pcre2_jit_compile() is zero on success, or a negative error code. 
- There is a limit to the size of pattern that JIT supports, imposed by - the size of machine stack that it uses. The exact rules are not docu- + There is a limit to the size of pattern that JIT supports, imposed by + the size of machine stack that it uses. The exact rules are not docu- mented because they may change at any time, in particular, when new op- - timizations are introduced. If a pattern is too big, a call to + timizations are introduced. If a pattern is too big, a call to pcre2_jit_compile() returns PCRE2_ERROR_NOMEMORY. - PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for com- - plete matches. If you want to run partial matches using the PCRE2_PAR- - TIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should - set one or both of the other options as well as, or instead of + PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for com- + plete matches. If you want to run partial matches using the PCRE2_PAR- + TIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should + set one or both of the other options as well as, or instead of PCRE2_JIT_COMPLETE. The JIT compiler generates different optimized code - for each of the three modes (normal, soft partial, hard partial). When - pcre2_match() is called, the appropriate code is run if it is avail- + for each of the three modes (normal, soft partial, hard partial). When + pcre2_match() is called, the appropriate code is run if it is avail- able. Otherwise, the pattern is matched using interpretive code. - You can call pcre2_jit_compile() multiple times for the same compiled - pattern. It does nothing if it has previously compiled code for any of - the option bits. For example, you can call it once with PCRE2_JIT_COM- - PLETE and (perhaps later, when you find you need partial matching) - again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it + You can call pcre2_jit_compile() multiple times for the same compiled + pattern. 
It does nothing if it has previously compiled code for any of + the option bits. For example, you can call it once with PCRE2_JIT_COM- + PLETE and (perhaps later, when you find you need partial matching) + again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial match- ing. If pcre2_jit_compile() is called with no option bits set, it imme- diately returns zero. This is an alternative way of testing whether JIT - is available. + support has been compiled. - At present, it is not possible to free JIT compiled code except when + At present, it is not possible to free JIT compiled code except when the entire compiled pattern is freed by calling pcre2_code_free(). - In some circumstances you may need to call additional functions. These - are described in the section entitled "Controlling the JIT stack" be- + In some circumstances you may need to call additional functions. These + are described in the section entitled "Controlling the JIT stack" be- low. There are some pcre2_match() options that are not supported by JIT, and - there are also some pattern items that JIT cannot handle. Details are - given below. In both cases, matching automatically falls back to the - interpretive code. If you want to know whether JIT was actually used - for a particular match, you should arrange for a JIT callback function - to be set up as described in the section entitled "Controlling the JIT - stack" below, even if you do not need to supply a non-default JIT + there are also some pattern items that JIT cannot handle. Details are + given below. In both cases, matching automatically falls back to the + interpretive code. If you want to know whether JIT was actually used + for a particular match, you should arrange for a JIT callback function + to be set up as described in the section entitled "Controlling the JIT + stack" below, even if you do not need to supply a non-default JIT stack. 
Such a callback function is called whenever JIT code is about to - be obeyed. If the match-time options are not right for JIT execution, + be obeyed. If the match-time options are not right for JIT execution, the callback function is not obeyed. - If the JIT compiler finds an unsupported item, no JIT data is gener- + If the JIT compiler finds an unsupported item, no JIT data is gener- ated. You can find out if JIT compilation was successful for a compiled pattern by calling pcre2_pattern_info() with the PCRE2_INFO_JITSIZE op- - tion. A non-zero result means that JIT compilation was successful. A + tion. A non-zero result means that JIT compilation was successful. A result of 0 means that JIT support is not available, or the pattern was - not processed by pcre2_jit_compile(), or the JIT compiler was not able - to handle the pattern. Successful JIT compilation does not, however, - guarantee the use of JIT at match time because there are some match + not processed by pcre2_jit_compile(), or the JIT compiler was not able + to handle the pattern. Successful JIT compilation does not, however, + guarantee the use of JIT at match time because there are some match time options that are not supported by JIT. MATCHING SUBJECTS CONTAINING INVALID UTF - When a pattern is compiled with the PCRE2_UTF option, subject strings - are normally expected to be a valid sequence of UTF code units. By de- - fault, this is checked at the start of matching and an error is gener- - ated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be + When a pattern is compiled with the PCRE2_UTF option, subject strings + are normally expected to be a valid sequence of UTF code units. By de- + fault, this is checked at the start of matching and an error is gener- + ated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be passed to pcre2_match() to skip the check (for improved performance) if - you are sure that a subject string is valid. 
If this option is used - with an invalid string, the result is undefined. The calling program + you are sure that a subject string is valid. If this option is used + with an invalid string, the result is undefined. The calling program may crash or loop or otherwise misbehave. - However, a way of running matches on strings that may contain invalid - UTF sequences is available. Calling pcre2_compile() with the - PCRE2_MATCH_INVALID_UTF option has two effects: it tells the inter- - preter in pcre2_match() to support invalid UTF, and, if pcre2_jit_com- - pile() is subsequently called, the compiled JIT code also supports in- - valid UTF. Details of how this support works, in both the JIT and the + However, a way of running matches on strings that may contain invalid + UTF sequences is available. Calling pcre2_compile() with the + PCRE2_MATCH_INVALID_UTF option has two effects: it tells the inter- + preter in pcre2_match() to support invalid UTF, and, if pcre2_jit_com- + pile() is subsequently called, the compiled JIT code also supports in- + valid UTF. Details of how this support works, in both the JIT and the interpretive cases, are given in the pcre2unicode documentation. There is also an obsolete option for pcre2_jit_compile() called PCRE2_JIT_INVALID_UTF, which currently exists only for backward compat- - ibility. It is superseded by the pcre2_compile() option + ibility. It is superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF and should no longer be used. It may be removed in future. UNSUPPORTED OPTIONS AND PATTERN ITEMS - The pcre2_match() options that are supported for JIT matching are + The pcre2_match() options that are supported for JIT matching are PCRE2_COPY_MATCHED_SUBJECT, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, - PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and - PCRE2_PARTIAL_SOFT.
The PCRE2_ANCHORED and PCRE2_ENDANCHORED options + PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and + PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED and PCRE2_ENDANCHORED options are not supported at match time. - If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the + If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the use of JIT, forcing matching by the interpreter code. - The only unsupported pattern items are \C (match a single data unit) - when running in a UTF mode, and a callout immediately before an asser- + The only unsupported pattern items are \C (match a single data unit) + when running in a UTF mode, and a callout immediately before an asser- tion condition in a conditional group. RETURN VALUES FROM JIT MATCHING - When a pattern is matched using JIT, the return values are the same as - those given by the interpretive pcre2_match() code, with the addition - of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the - memory used for the JIT stack was insufficient. See "Controlling the + When a pattern is matched using JIT, the return values are the same as + those given by the interpretive pcre2_match() code, with the addition + of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the + memory used for the JIT stack was insufficient. See "Controlling the JIT stack" below for a discussion of JIT stack usage. - The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if - searching a very large pattern tree goes on for too long, as it is in - the same circumstance when JIT is not used, but the details of exactly + The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if + searching a very large pattern tree goes on for too long, as it is in + the same circumstance when JIT is not used, but the details of exactly what is counted are not the same. The PCRE2_ERROR_DEPTHLIMIT error code is never returned when JIT matching is used. 
@@ -5506,25 +5813,25 @@ RETURN VALUES FROM JIT MATCHING CONTROLLING THE JIT STACK When the compiled JIT code runs, it needs a block of memory to use as a - stack. By default, it uses 32KiB on the machine stack. However, some - large or complicated patterns need more than this. The error PCRE2_ER- + stack. By default, it uses 32KiB on the machine stack. However, some + large or complicated patterns need more than this. The error PCRE2_ER- ROR_JIT_STACKLIMIT is given when there is not enough stack. Three func- tions are provided for managing blocks of memory for use as JIT stacks. - There is further discussion about the use of JIT stacks in the section + There is further discussion about the use of JIT stacks in the section entitled "JIT stack FAQ" below. - The pcre2_jit_stack_create() function creates a JIT stack. Its argu- - ments are a starting size, a maximum size, and a general context (for - memory allocation functions, or NULL for standard memory allocation). + The pcre2_jit_stack_create() function creates a JIT stack. Its argu- + ments are a starting size, a maximum size, and a general context (for + memory allocation functions, or NULL for standard memory allocation). It returns a pointer to an opaque structure of type pcre2_jit_stack, or - NULL if there is an error. The pcre2_jit_stack_free() function is used + NULL if there is an error. The pcre2_jit_stack_free() function is used to free a stack that is no longer needed. If its argument is NULL, this - function returns immediately, without doing anything. (For the techni- - cally minded: the address space is allocated by mmap or VirtualAlloc.) - A maximum stack size of 512KiB to 1MiB should be more than enough for + function returns immediately, without doing anything. (For the techni- + cally minded: the address space is allocated by mmap or VirtualAlloc.) + A maximum stack size of 512KiB to 1MiB should be more than enough for any pattern. 
- The pcre2_jit_stack_assign() function specifies which stack JIT code + The pcre2_jit_stack_assign() function specifies which stack JIT code should use. Its arguments are as follows: pcre2_match_context *mcontext @@ -5534,7 +5841,7 @@ CONTROLLING THE JIT STACK The first argument is a pointer to a match context. When this is subse- quently passed to a matching function, its information determines which JIT stack is used. If this argument is NULL, the function returns imme- - diately, without doing anything. There are three cases for the values + diately, without doing anything. There are three cases for the values of the other two options: (1) If callback is NULL and data is NULL, an internal 32KiB block @@ -5552,34 +5859,34 @@ CONTROLLING THE JIT STACK return value must be a valid JIT stack, the result of calling pcre2_jit_stack_create(). - A callback function is obeyed whenever JIT code is about to be run; it + A callback function is obeyed whenever JIT code is about to be run; it is not obeyed when pcre2_match() is called with options that are incom- - patible for JIT matching. A callback function can therefore be used to - determine whether a match operation was executed by JIT or by the in- + patible for JIT matching. A callback function can therefore be used to + determine whether a match operation was executed by JIT or by the in- terpreter. You may safely use the same JIT stack for more than one pattern (either - by assigning directly or by callback), as long as the patterns are + by assigning directly or by callback), as long as the patterns are matched sequentially in the same thread. 
Currently, the only way to set - up non-sequential matches in one thread is to use callouts: if a call- - out function starts another match, that match must use a different JIT + up non-sequential matches in one thread is to use callouts: if a call- + out function starts another match, that match must use a different JIT stack to the one used for currently suspended match(es). - In a multithread application, if you do not specify a JIT stack, or if - you assign or pass back NULL from a callback, that is thread-safe, be- - cause each thread has its own machine stack. However, if you assign or + In a multithread application, if you do not specify a JIT stack, or if + you assign or pass back NULL from a callback, that is thread-safe, be- + cause each thread has its own machine stack. However, if you assign or pass back a non-NULL JIT stack, this must be a different stack for each thread so that the application is thread-safe. - Strictly speaking, even more is allowed. You can assign the same non- - NULL stack to a match context that is used by any number of patterns, - as long as they are not used for matching by multiple threads at the - same time. For example, you could use the same stack in all compiled - patterns, with a global mutex in the callback to wait until the stack + Strictly speaking, even more is allowed. You can assign the same non- + NULL stack to a match context that is used by any number of patterns, + as long as they are not used for matching by multiple threads at the + same time. For example, you could use the same stack in all compiled + patterns, with a global mutex in the callback to wait until the stack is available for use. However, this is an inefficient solution, and not recommended. 
- This is a suggestion for how a multithreaded program that needs to set + This is a suggestion for how a multithreaded program that needs to set up non-default JIT stacks might operate: During thread initialization @@ -5591,7 +5898,7 @@ CONTROLLING THE JIT STACK Use a one-line callback function return thread_local_var - All the functions described in this section do nothing if JIT is not + All the functions described in this section do nothing if JIT is not available. @@ -5600,20 +5907,20 @@ JIT STACK FAQ (1) Why do we need JIT stacks? PCRE2 (and JIT) is a recursive, depth-first engine, so it needs a stack - where the local data of the current node is pushed before checking its + where the local data of the current node is pushed before checking its child nodes. Allocating real machine stack on some platforms is diffi- cult. For example, the stack chain needs to be updated every time if we - extend the stack on PowerPC. Although it is possible, its updating + extend the stack on PowerPC. Although it is possible, its updating time overhead decreases performance. So we do the recursion in memory. (2) Why don't we simply allocate blocks of memory with malloc()? - Modern operating systems have a nice feature: they can reserve an ad- + Modern operating systems have a nice feature: they can reserve an ad- dress space instead of allocating memory. We can safely allocate memory pages inside this address space, so the stack could grow without moving - memory data (this is important because of pointers). Thus we can allo- - cate 1MiB address space, and use only a single memory page (usually - 4KiB) if that is enough. However, we can still grow up to 1MiB anytime + memory data (this is important because of pointers). Thus we can allo- + cate 1MiB address space, and use only a single memory page (usually + 4KiB) if that is enough. However, we can still grow up to 1MiB anytime if needed. (3) Who "owns" a JIT stack? 
@@ -5621,8 +5928,8 @@ JIT STACK FAQ The owner of the stack is the user program, not the JIT studied pattern or anything else. The user program must ensure that if a stack is being used by pcre2_match(), (that is, it is assigned to a match context that - is passed to the pattern currently running), that stack must not be - used by any other threads (to avoid overwriting the same memory area). + is passed to the pattern currently running), that stack must not be + used by any other threads (to avoid overwriting the same memory area). The best practice for multithreaded programs is to allocate a stack for each thread, and return this stack through the JIT callback function. @@ -5630,36 +5937,36 @@ JIT STACK FAQ You can free a JIT stack at any time, as long as it will not be used by pcre2_match() again. When you assign the stack to a match context, only - a pointer is set. There is no reference counting or any other magic. + a pointer is set. There is no reference counting or any other magic. You can free compiled patterns, contexts, and stacks in any order, any- - time. Just do not call pcre2_match() with a match context pointing to + time. Just do not call pcre2_match() with a match context pointing to an already freed stack, as that will cause SEGFAULT. (Also, do not free - a stack currently used by pcre2_match() in another thread). You can - also replace the stack in a context at any time when it is not in use. + a stack currently used by pcre2_match() in another thread). You can + also replace the stack in a context at any time when it is not in use. You should free the previous stack before assigning a replacement. - (5) Should I allocate/free a stack every time before/after calling + (5) Should I allocate/free a stack every time before/after calling pcre2_match()? - No, because this is too costly in terms of resources. However, you - could implement some clever idea which release the stack if it is not - used in let's say two minutes. 
The JIT callback can help to achieve + No, because this is too costly in terms of resources. However, you + could implement some clever idea which releases the stack if it is not + used in let's say two minutes. The JIT callback can help to achieve this without keeping a list of patterns. - (6) OK, the stack is for long term memory allocation. But what happens - if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB + (6) OK, the stack is for long term memory allocation. But what happens + if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB kept until the stack is freed? Especially on embedded systems, it might be a good idea to release mem- - ory sometimes without freeing the stack. There is no API for this at - the moment. Probably a function call which returns with the currently - allocated memory for any stack and another which allows releasing mem- + ory sometimes without freeing the stack. There is no API for this at + the moment. Probably a function call which returns with the currently + allocated memory for any stack and another which allows releasing mem- ory (shrinking the stack) would be a good idea if someone needs this. (7) This is too much of a headache. Isn't there any better solution for JIT stack handling? - No, thanks to Windows. If POSIX threads were used everywhere, we could + No, thanks to Windows. If POSIX threads were used everywhere, we could throw out this complicated API. @@ -5668,18 +5975,18 @@ FREEING JIT SPECULATIVE MEMORY void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); The JIT executable allocator does not free all memory when it is possi- - ble.
It expects new allocations, and keeps some free memory around to + improve allocation speed. However, in low memory conditions, it might + be better to free all possible memory. You can cause this to happen by + calling pcre2_jit_free_unused_memory(). Its argument is a general con- text, for custom memory management, or NULL for standard memory manage- ment. EXAMPLE CODE - This is a single-threaded example that specifies a JIT stack without - using a callback. A real program should include error checking after + This is a single-threaded example that specifies a JIT stack without + using a callback. A real program should include error checking after all the function calls. int rc; @@ -5707,36 +6014,36 @@ EXAMPLE CODE JIT FAST PATH API Because the API described above falls back to interpreted matching when - JIT is not available, it is convenient for programs that are written + JIT is not available, it is convenient for programs that are written for general use in many environments. However, calling JIT via pcre2_match() does have a performance impact. Programs that are written - for use where JIT is known to be available, and which need the best - possible performance, can instead use a "fast path" API to call JIT - matching directly instead of calling pcre2_match() (obviously only for + for use where JIT is known to be available, and which need the best + possible performance, can instead use a "fast path" API to call JIT + matching directly instead of calling pcre2_match() (obviously only for patterns that have been successfully processed by pcre2_jit_compile()). - The fast path function is called pcre2_jit_match(), and it takes ex- - actly the same arguments as pcre2_match(). However, the subject string - must be specified with a length; PCRE2_ZERO_TERMINATED is not sup- + The fast path function is called pcre2_jit_match(), and it takes ex- + actly the same arguments as pcre2_match(). 
However, the subject string + must be specified with a length; PCRE2_ZERO_TERMINATED is not sup- ported. Unsupported option bits (for example, PCRE2_ANCHORED and - PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The re- - turn values are also the same as for pcre2_match(), plus PCRE2_ER- + PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The re- + turn values are also the same as for pcre2_match(), plus PCRE2_ER- ROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. - When you call pcre2_match(), as well as testing for invalid options, a + When you call pcre2_match(), as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For exam- - ple, if the subject pointer is NULL but the length is non-zero, an im- - mediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF + ple, if the subject pointer is NULL but the length is non-zero, an im- + mediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the interests of speed, these - checks do not happen on the JIT fast path. If invalid UTF data is - passed when PCRE2_MATCH_INVALID_UTF was not set for pcre2_compile(), - the result is undefined. The program may crash or loop or give wrong - results. In the absence of PCRE2_MATCH_INVALID_UTF you should call - pcre2_jit_match() in UTF mode only if you are sure the subject is + checks do not happen on the JIT fast path. If invalid UTF data is + passed when PCRE2_MATCH_INVALID_UTF was not set for pcre2_compile(), + the result is undefined. The program may crash or loop or give wrong + results. In the absence of PCRE2_MATCH_INVALID_UTF you should call + pcre2_jit_match() in UTF mode only if you are sure the subject is valid. - Bypassing the sanity checks and the pcre2_match() wrapping can give + Bypassing the sanity checks and the pcre2_match() wrapping can give speedups of more than 10%. 
@@ -5754,15 +6061,14 @@ AUTHOR REVISION - Last updated: 21 February 2024 + Last updated: 22 August 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 21 February 2024 PCRE2JIT(3) +PCRE2 10.45 22 August 2024 PCRE2JIT(3) ------------------------------------------------------------------------------ - PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3) @@ -5838,15 +6144,14 @@ AUTHOR REVISION - Last updated: August 2023 + Last updated: 16 August 2023 Copyright (c) 1997-2023 University of Cambridge. -PCRE2 10.43 1 August 2023 PCRE2LIMITS(3) +PCRE2 10.45 16 August 2023 PCRE2LIMITS(3) ------------------------------------------------------------------------------ - PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3) @@ -5860,7 +6165,7 @@ PCRE2 MATCHING ALGORITHMS in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() function. This works in the same way as Perl's matching func- - tion, and provide a Perl-compatible matching operation. The just-in- + tion, and provides a Perl-compatible matching operation. The just-in- time (JIT) optimization that is described in the pcre2jit documentation is compatible with this function. @@ -5872,7 +6177,7 @@ PCRE2 MATCHING ALGORITHMS When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, when there are multiple possibilities. For example, if - the pattern + the anchored pattern ^<.*> @@ -5948,83 +6253,86 @@ THE ALTERNATIVE MATCHING ALGORITHM first match (which is necessarily the shortest) is found. Note that the size of vector needed to contain all the results depends - on the number of simultaneous matches, not on the number of parentheses - in the pattern. Using pcre2_match_data_create_from_pattern() to create - the match data block is therefore not advisable when doing DFA match- - ing.
+ on the number of simultaneous matches, not on the number of capturing + parentheses in the pattern. Using pcre2_match_data_create_from_pat- + tern() to create the match data block is therefore not advisable when + doing DFA matching. - Note also that all the matches that are found start at the same point + Note also that all the matches that are found start at the same point in the subject. If the pattern cat(er(pillar)?)? - is matched against the string "the caterpillar catchment", the result - is the three strings "caterpillar", "cater", and "cat" that start at - the fifth character of the subject. The algorithm does not automati- + is matched against the string "the caterpillar catchment", the result + is the three strings "caterpillar", "cater", and "cat" that start at + the fifth character of the subject. The algorithm does not automati- cally move on to find matches that start at later positions. PCRE2's "auto-possessification" optimization usually applies to charac- - ter repeats at the end of a pattern (as well as internally). For exam- + ter repeats at the end of a pattern (as well as internally). For exam- ple, the pattern "a\d+" is compiled as if it were "a\d++" because there - is no point even considering the possibility of backtracking into the - repeated digits. For DFA matching, this means that only one possible - match is found. If you really do want multiple matches in such cases, - either use an ungreedy repeat ("a\d+?") or set the PCRE2_NO_AUTO_POS- + is no point even considering the possibility of backtracking into the + repeated digits. For DFA matching, this means that only one possible + match is found. If you really do want multiple matches in such cases, + either use an ungreedy repeat ("a\d+?") or set the PCRE2_NO_AUTO_POS- SESS option when compiling. 
- There are a number of features of PCRE2 regular expressions that are - not supported or behave differently in the alternative matching func- + There are a number of features of PCRE2 regular expressions that are + not supported or behave differently in the alternative matching func- tion. Those that are not supported cause an error if encountered. - 1. Because the algorithm finds all possible matches, the greedy or un- - greedy nature of repetition quantifiers is not relevant (though it may - affect auto-possessification, as just described). During matching, - greedy and ungreedy quantifiers are treated in exactly the same way. + 1. Because the algorithm finds all possible matches, the greedy or un- + greedy nature of repetition quantifiers is not relevant (though it may + affect auto-possessification, as just described). During matching, + greedy and ungreedy quantifiers are treated in exactly the same way. However, possessive quantifiers can make a difference when what follows - could also match what is quantified, for example in a pattern like + could also match what is quantified, for example in a pattern like this: ^a++\w! - This pattern matches "aaab!" but not "aaa!", which would be matched by - a non-possessive quantifier. Similarly, if an atomic group is present, - it is matched as if it were a standalone pattern at the current point, - and the longest match is then "locked in" for the rest of the overall + This pattern matches "aaab!" but not "aaa!", which would be matched by + a non-possessive quantifier. Similarly, if an atomic group is present, + it is matched as if it were a standalone pattern at the current point, + and the longest match is then "locked in" for the rest of the overall pattern. 2. 
When dealing with multiple paths through the tree simultaneously, it - is not straightforward to keep track of captured substrings for the - different matching possibilities, and PCRE2's implementation of this + is not straightforward to keep track of captured substrings for the + different matching possibilities, and PCRE2's implementation of this algorithm does not attempt to do this. This means that no captured sub- strings are available. - 3. Because no substrings are captured, backreferences within the pat- - tern are not supported. + 3. Because no substrings are captured, a number of related features are + not available: - 4. For the same reason, conditional expressions that use a backrefer- - ence as the condition or test for a specific group recursion are not - supported. + (a) Backreferences; - 5. Again for the same reason, script runs are not supported. + (b) Conditional expressions that use a backreference as the condition + or test for a specific group recursion; - 6. Because many paths through the tree may be active, the \K escape se- - quence, which resets the start of the match when encountered (but may + (c) Script runs; + + (d) Scan substring assertions. + + 4. Because many paths through the tree may be active, the \K escape se- + quence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported. - 7. Callouts are supported, but the value of the capture_top field is + 5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0. - 8. The \C escape sequence, which (in the standard algorithm) always - matches a single code unit, even in a UTF mode, is not supported in - these modes, because the alternative algorithm moves through the sub- - ject string one character (not code unit) at a time, for all active - paths through the tree. + 6. 
The \C escape sequence, which (in the standard algorithm) always + matches a single code unit, even in a UTF mode, is not supported in UTF + modes because the alternative algorithm moves through the subject + string one character (not code unit) at a time, for all active paths + through the tree. - 9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) + 7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion. - 10. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not sup- + 8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not sup- ported by pcre2_dfa_match(). @@ -6049,13 +6357,15 @@ DISADVANTAGES OF THE ALTERNATIVE ALGORITHM partly because it has to search for all possible matches, but is also because it is less susceptible to optimization. - 2. Capturing parentheses, backreferences, script runs, and matching - within invalid UTF string are not supported. + 2. Capturing parentheses and other features such as backreferences that + rely on them are not supported. - 3. Although atomic groups are supported, their use does not provide the + 3. Matching within invalid UTF strings is not supported. + + 4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm. - 4. JIT optimization is not supported. + 5. JIT optimization is not supported. AUTHOR @@ -6067,20 +6377,19 @@ AUTHOR REVISION - Last updated: 19 January 2024 + Last updated: 30 August 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE2 10.43 19 January 2024 PCRE2MATCHING(3) +PCRE2 10.45 30 August 2024 PCRE2MATCHING(3) ------------------------------------------------------------------------------ - PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3) NAME - PCRE2 - Perl-compatible regular expressions + PCRE2 - Perl-compatible regular expressions (revised API) PARTIAL MATCHING IN PCRE2 @@ -6451,15 +6760,14 @@ AUTHOR REVISION - Last updated: 04 September 2019 + Last updated: 27 November 2024 Copyright (c) 1997-2019 University of Cambridge. -PCRE2 10.34 04 September 2019 PCRE2PARTIAL(3) +PCRE2 10.45 27 November 2024 PCRE2PARTIAL(3) ------------------------------------------------------------------------------ - PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3) @@ -6473,9 +6781,11 @@ PCRE2 REGULAR EXPRESSION DETAILS by PCRE2 are described in detail below. There is a quick-reference syn- tax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. PCRE2 also supports some alterna- - tive regular expression syntax (which does not conflict with the Perl - syntax) in order to provide some compatibility with regular expressions - in Python, .NET, and Oniguruma. + tive regular expression syntax that does not conflict with the Perl + syntax in order to provide some compatibility with regular expressions + in Python, .NET, and Oniguruma. There are in addition some options that + enable alternative syntax and semantics that are not the same as in + Perl. Perl's regular expressions are described in its own documentation, and regular expressions in general are covered in a number of books, some @@ -6494,82 +6804,98 @@ PCRE2 REGULAR EXPRESSION DETAILS tion, are discussed in the pcre2matching page. +EBCDIC CHARACTER CODES + + Most computers use ASCII or Unicode for encoding characters, and PCRE2 + assumes this by default. 
However, it can be compiled to run in an envi- + ronment that uses the EBCDIC code, which is the case for some IBM main- + frame operating systems. In the sections below, character code values + are ASCII or Unicode; in an EBCDIC environment these characters may + have different code values, and there are no code points greater than + 255. Differences in behaviour when PCRE2 is running in an EBCDIC envi- + ronment are described in the section "EBCDIC environments" below, which + you can ignore unless you really are in an EBCDIC environment. + + SPECIAL START-OF-PATTERN ITEMS - A number of options that can be passed to pcre2_compile() can also be + A number of options that can be passed to pcre2_compile() can also be set by special items at the start of a pattern. These are not Perl-com- - patible, but are provided to make these options accessible to pattern - writers who are not able to change the program that processes the pat- - tern. Any number of these items may appear, but they must all be to- - gether right at the start of the pattern string, and the letters must + patible, but are provided to make these options accessible to pattern + writers who are not able to change the program that processes the pat- + tern. Any number of these items may appear, but they must all be to- + gether right at the start of the pattern string, and the letters must be in upper case. UTF support In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 - can be specified for the 32-bit library, in which case it constrains - the character values to valid Unicode code points. To process UTF - strings, PCRE2 must be built to include Unicode support (which is the - default). 
When using UTF strings you must either call the compiling - function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF - options, or the pattern must start with the special sequence (*UTF), - which is equivalent to setting the relevant PCRE2_UTF. How setting a + can be specified for the 32-bit library, in which case it constrains + the character values to valid Unicode code points. To process UTF + strings, PCRE2 must be built to include Unicode support (which is the + default). When using UTF strings you must either call the compiling + function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF + options, or the pattern must start with the special sequence (*UTF), + which is equivalent to setting the relevant PCRE2_UTF. How setting a UTF mode affects pattern matching is mentioned in several places below. There is also a summary of features in the pcre2unicode page. Some applications that allow their users to supply patterns may wish to - restrict them to non-UTF data for security reasons. If the - PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not al- + restrict them to non-UTF data for security reasons. If the + PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not al- lowed, and its appearance in a pattern causes an error. Unicode property support - Another special sequence that may appear at the start of a pattern is - (*UCP). This has the same effect as setting the PCRE2_UCP option: it - causes sequences such as \d and \w to use Unicode properties to deter- + Another special sequence that may appear at the start of a pattern is + (*UCP). This has the same effect as setting the PCRE2_UCP option: it + causes sequences such as \d and \w to use Unicode properties to deter- mine character types, instead of recognizing only characters with codes less than 256 via a lookup table. 
It also causes upper/lower casing op- - erations to use Unicode properties for characters with code points - greater than 127, even when UTF is not set. These behaviours can be - changed within the pattern; see the section entitled "Internal Option + erations to use Unicode properties for characters with code points + greater than 127, even when UTF is not set. These behaviours can be + changed within the pattern; see the section entitled "Internal Option Setting" below. Some applications that allow their users to supply patterns may wish to - restrict them for security reasons. If the PCRE2_NEVER_UCP option is + restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to pcre2_compile(), (*UCP) is not allowed, and its appearance in a pattern causes an error. Locking out empty string matching Starting a pattern with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) has the same - effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option + effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option to whichever matching function is subsequently called to match the pat- - tern. These options lock out the matching of empty strings, either en- + tern. These options lock out the matching of empty strings, either en- tirely, or only at the start of the subject. Disabling auto-possessification - If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as - setting the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making - quantifiers possessive when what follows cannot match the repeated - item. For example, by default a+b is treated as a++b. For more details, - see the pcre2api documentation. + If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as + setting the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_opti- + mize() with a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from + making quantifiers possessive when what follows cannot match the re- + peated item.
For example, by default a+b is treated as a++b. For more + details, see the pcre2api documentation. Disabling start-up optimizations - If a pattern starts with (*NO_START_OPT), it has the same effect as - setting the PCRE2_NO_START_OPTIMIZE option. This disables several opti- - mizations for quickly reaching "no match" results. For more details, - see the pcre2api documentation. + If a pattern starts with (*NO_START_OPT), it has the same effect as + setting the PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_opti- + mize() with a PCRE2_START_OPTIMIZE_OFF directive. This disables several + optimizations for quickly reaching "no match" results. For more de- + tails, see the pcre2api documentation. Disabling automatic anchoring If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect - as setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimiza- - tions that apply to patterns whose top-level branches all start with .* - (match any number of arbitrary characters). For more details, see the - pcre2api documentation. + as setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_op- + timize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables opti- + mizations that apply to patterns whose top-level branches all start + with .* (match any number of arbitrary characters). For more details, + see the pcre2api documentation. Disabling JIT compilation @@ -6666,33 +6992,27 @@ SPECIAL START-OF-PATTERN ITEMS CODE) is also recognized, corresponding to PCRE2_BSR_UNICODE. -EBCDIC CHARACTER CODES - - PCRE2 can be compiled to run in an environment that uses EBCDIC as its - character code instead of ASCII or Unicode (typically a mainframe sys- - tem). In the sections below, character code values are ASCII or Uni- - code; in an EBCDIC environment these characters may have different code - values, and there are no code points greater than 255. 
- - CHARACTERS AND METACHARACTERS - A regular expression is a pattern that is matched against a subject - string from left to right. Most characters stand for themselves in a - pattern, and match the corresponding characters in the subject. As a + A regular expression is a pattern that is matched against a subject + string from left to right. Most characters stand for themselves in a + pattern, and match the corresponding characters in the subject. As a trivial example, the pattern The quick brown fox matches a portion of a subject string that is identical to itself. When - caseless matching is specified (the PCRE2_CASELESS option or (?i) - within the pattern), letters are matched independently of case. Note - that there are two ASCII characters, K and S, that, in addition to - their lower case ASCII equivalents, are case-equivalent with Unicode - U+212A (Kelvin sign) and U+017F (long S) respectively when either + caseless matching is specified (the PCRE2_CASELESS option or (?i) + within the pattern), letters are matched independently of case. Note + that there are two ASCII characters, K and S, that, in addition to + their lower case ASCII equivalents, are case-equivalent with Unicode + U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT - option is in force (either passed to pcre2_compile() or set by (?r) - within the pattern). + option is in force (either passed to pcre2_compile() or set by (*CASE- + LESS_RESTRICT) or (?r) within the pattern). If the PCRE2_EXTRA_TURK- + ISH_CASING option is in force (either passed to pcre2_compile() or set + by (*TURKISH_CASING) within the pattern), then the 'i' letters are + matched according to Turkish and Azeri languages. The power of regular expressions comes from the ability to include wild cards, character classes, alternatives, and repetitions in the pattern. 
@@ -6739,7 +7059,7 @@ CHARACTERS AND METACHARACTERS If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or between a # outside a character class and the next new- - line, inclusive, are ignored. An escaping backslash can be used to in- + line, inclusive, is ignored. An escaping backslash can be used to in- clude a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are ignored inside a @@ -6797,6 +7117,13 @@ BACKSLASH error, because the character class is then not terminated by a closing square bracket. + Another difference from Perl is that any appearance of \Q or \E inside + what might otherwise be a quantifier causes PCRE2 not to recognize the + sequence as a quantifier. Perl recognizes a quantifier if (redundantly) + either of the numbers is inside \Q...\E, but not if the separating + comma is. When not recognized as a quantifier a sequence such as + {\Q1\E,2} is treated as the literal string "{1,2}". + Non-printing characters A second use of backslash provides a way of encoding non-printing char- @@ -6815,115 +7142,107 @@ BACKSLASH \r carriage return (hex 0D) (but see below) \t tab (hex 09) \0dd character with octal code 0dd - \ddd character with octal code ddd, or backreference + \ddd character with octal code ddd, or back reference \o{ddd..} character with octal code ddd.. \xhh character with hex code hh \x{hhh..} character with hex code hhh.. \N{U+hhh..} character with Unicode hex code point hhh.. - By default, after \x that is not followed by {, from zero to two hexa- - decimal digits are read (letters can be in upper or lower case). Any - number of hexadecimal digits may appear between \x{ and }. If a charac- - ter other than a hexadecimal digit appears between \x{ and }, or if - there is no terminating }, an error occurs. 
+ A description of how back references work is given later, following the + discussion of parenthesized groups. + + By default, after \x that is not followed by {, one or two hexadecimal + digits are read (letters can be in upper or lower case). If the charac- + ter that follows \x is neither { nor a hexadecimal digit, an error oc- + curs. This is different from Perl's default behaviour, which generates + a NUL character, but is in line with the behaviour of Perl's 'strict' + mode in re. + + Any number of hexadecimal digits may appear between \x{ and }. If a + character other than a hexadecimal digit appears between \x{ and }, or + if there is no terminating }, an error occurs. Characters whose code points are less than 256 can be defined by either of the two syntaxes for \x or by an octal sequence. There is no differ- ence in the way they are handled. For example, \xdc is exactly the same - as \x{dc} or \334. However, using the braced versions does make such + as \x{dc} or \334. However, using the braced versions does make such sequences easier to read. - Support is available for some ECMAScript (aka JavaScript) escape se- + Support is available for some ECMAScript (aka JavaScript) escape se- quences via two compile-time options. If PCRE2_ALT_BSUX is set, the se- - quence \x followed by { is not recognized. Only if \x is followed by - two hexadecimal digits is it recognized as a character escape. Other- - wise it is interpreted as a literal "x" character. In this mode, sup- - port for code points greater than 256 is provided by \u, which must be - followed by four hexadecimal digits; otherwise it is interpreted as a + quence \x followed by { is not recognized. Only if \x is followed by + two hexadecimal digits is it recognized as a character escape. Other- + wise it is interpreted as a literal "x" character. 
In this mode, sup- + port for code points greater than 256 is provided by \u, which must be + followed by four hexadecimal digits; otherwise it is interpreted as a literal "u" character. - PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in ad- + PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in ad- dition, \u{hhh..} is recognized as the character specified by hexadeci- mal code point. There may be any number of hexadecimal digits, but un- - like other places that also use curly brackets, spaces are not allowed - and would result in the string being interpreted as a literal. This + like other places that also use curly brackets, spaces are not allowed + and would result in the string being interpreted as a literal. This syntax is from ECMAScript 6. - The \N{U+hhh..} escape sequence is recognized only when PCRE2 is oper- - ating in UTF mode. Perl also uses \N{name} to specify characters by - Unicode name; PCRE2 does not support this. Note that when \N is not + The \N{U+hhh..} escape sequence is recognized only when PCRE2 is oper- + ating in UTF mode. Perl also uses \N{name} to specify characters by + Unicode name; PCRE2 does not support this. Note that when \N is not followed by an opening brace (curly bracket) it has an entirely differ- ent meaning, matching any character that is not a newline. - There are some legacy applications where the escape sequence \r is ex- - pected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option - is set, \r in a pattern is converted to \n so that it matches a LF + There are some legacy applications where the escape sequence \r is ex- + pected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option + is set, \r in a pattern is converted to \n so that it matches a LF (linefeed) instead of a CR (carriage return) character. - An error occurs if \c is not followed by a character whose ASCII code - point is in the range 32 to 126. 
The precise effect of \cx is as fol- - lows: if x is a lower case letter, it is converted to upper case. Then + An error occurs if \c is not followed by a character whose ASCII code + point is in the range 32 to 126. The precise effect of \cx is as fol- + lows: if x is a lower case letter, it is converted to upper case. Then bit 6 of the character (hex 40) is inverted. Thus \cA to \cZ become hex - 01 to hex 1A (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and - \c; becomes hex 7B (; is 3B). If the code unit following \c has a code + 01 to hex 1A (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and + \c; becomes hex 7B (; is 3B). If the code unit following \c has a code point less than 32 or greater than 126, a compile-time error occurs. - When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. - \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. - The \c escape is processed as specified for Perl in the perlebcdic doc- - ument. The only characters that are allowed after \c are A-Z, a-z, or - one of @, [, \, ], ^, _, or ?. Any other character provokes a compile- - time error. The sequence \c@ encodes character code 0; after \c the - letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, - \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \c? be- - comes either 255 (hex FF) or 95 (hex 5F). + For differences in the way some escapes behave in EBCDIC environments, + see section "EBCDIC environments" below. - Thus, apart from \c?, these escapes generate the same character code - values as they do in an ASCII environment, though the meanings of the - values mostly differ. For example, \cG always generates code value 7, - which is BEL in ASCII but DEL in EBCDIC. + Octal escapes and back references - The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, - but because 127 is not a control character in EBCDIC, Perl makes it - generate the APC character. 
Unfortunately, there are several variants - of EBCDIC. In most of them the APC character has the value 255 (hex - FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If - certain other characters have POSIX-BC values, PCRE2 makes \c? generate - 95; otherwise it generates 255. + The escape \o must be followed by a sequence of octal digits, enclosed + in braces. An error occurs if this is not the case. This escape pro- + vides a way of specifying character code points as octal numbers + greater than 0777, and it also allows octal numbers and backreferences + to be unambiguously distinguished. - After \0 up to two further octal digits are read. If there are fewer - than two digits, just those that are present are used. Thus the se- - quence \0\x\015 specifies two binary zeros followed by a CR character - (code value 13). Make sure you supply two digits after the initial zero - if the pattern character that follows is itself an octal digit. + If braces are not used, after \0 up to two further octal digits are + read. However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one + more octal digit must follow \0 (use \00 to generate a NUL character). + Make sure you supply two digits after the initial zero if the pattern + character that follows is itself an octal digit. - The escape \o must be followed by a sequence of octal digits, enclosed - in braces. An error occurs if this is not the case. This escape is a - recent addition to Perl; it provides way of specifying character code - points as octal numbers greater than 0777, and it also allows octal - numbers and backreferences to be unambiguously specified. + Inside a character class, when a backslash is followed by any octal + digit, up to three octal digits are read to generate a code point. Any + subsequent digits stand for themselves. The sequences \8 and \9 are + treated as the literal characters "8" and "9". 
+ + Outside a character class, Perl's handling of a backslash followed by a + digit other than 0 is complicated by ambiguity, and Perl has changed + over time, causing PCRE2 also to change. From PCRE2 release 10.45 there + is an option called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use + Python's unambiguous rules. The next two subsections describe the two + sets of rules. For greater clarity and unambiguity, it is best to avoid following \ by - a digit greater than zero. Instead, use \o{...} or \x{...} to specify + a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical character code points, and \g{...} to specify backreferences. - The following paragraphs describe the old, ambiguous syntax. - - The handling of a backslash followed by a digit other than 0 is compli- - cated, and Perl has changed over time, causing PCRE2 also to change. - Outside a character class, PCRE2 reads the digit and any following dig- - its as a decimal number. If the number is less than 10, begins with the - digit 8 or 9, or if there are at least that many previous capture - groups in the expression, the entire sequence is taken as a backrefer- - ence. A description of how this works is given later, following the - discussion of parenthesized groups. Otherwise, up to three octal dig- - its are read to form a character code. + Perl rules for non-class backslash 1-9 - Inside a character class, PCRE2 handles \8 and \9 as the literal char- - acters "8" and "9", and otherwise reads up to three octal digits fol- - lowing the backslash, using them to generate a data character. Any sub- - sequent digits stand for themselves. For example, outside a character - class: + All the digits that follow the backslash are read as a decimal number. + If the number is less than 10, begins with the digit 8 or 9, or if + there are at least that many previous capture groups in the expression, + the entire sequence is taken as a back reference. 
Otherwise, up to + three octal digits are read to form a character code. For example: \040 is another way of writing an ASCII space \40 is the same, provided there are fewer than 40 @@ -6939,10 +7258,21 @@ BACKSLASH the value 255 (decimal) \81 is always a backreference - Note that octal values of 100 or greater that are specified using this - syntax must not be introduced by a leading zero, because no more than + Note that octal values of 100 or greater that are specified using this + syntax must not be introduced by a leading zero, because no more than three octal digits are ever read. + Python rules for non-class backslash 1-9 + + If there are at least three octal digits after the backslash, exactly + three are read as an octal code point number, but the value must be no + greater than \377, even in modes where higher code point values are + supported. Any subsequent digits stand for themselves. If there are + fewer than three octal digits, the sequence is taken as a decimal back + reference. Thus, for example, \12 is always a back reference, indepen- + dent of how many captures there are in the pattern. An error is gener- + ated for a reference to a non-existent capturing group. + Constraints on character values Characters that are specified using octal or hexadecimal numbers are @@ -7161,7 +7491,7 @@ BACKSLASH tional escape sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing charac- - ters whose code points are less than U+0100 and U+10000, respectively. + ters whose code points are less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Un- known script and with an unassigned type.
@@ -7179,15 +7509,34 @@ BACKSLASH \P{xx} a character without the xx property \X a Unicode extended grapheme cluster - The property names represented by xx above are not case-sensitive, and - in accordance with Unicode's "loose matching" rules, spaces, hyphens, - and underscores are ignored. There is support for Unicode script names, - Unicode general category properties, "Any", which matches any character - (including newline), Bidi_Class, a number of binary (yes/no) proper- - ties, and some special PCRE2 properties (described below). Certain - other Perl properties such as "InMusicalSymbols" are not supported by - PCRE2. Note that \P{Any} does not match any characters, so always - causes a match failure. + For compatibility with Perl, negation can be specified by including a + circumflex between the opening brace and the property. For example, + \p{^Lu} is the same as \P{Lu}. + + In accordance with Unicode's "loose matching" rules, ASCII white space + characters, hyphens, and underscores are ignored in the properties rep- + resented by xx above. As well as the space character, ASCII white space + can be tab, linefeed, vertical tab, formfeed, or carriage return. + + Some properties are specified as a name only; others as a name and a + value, separated by a colon or an equals sign. The names and values + consist of ASCII letters and digits (with one Perl-specific exception, + see below). They are not case sensitive. Note, however, that the es- + capes themselves, \p and \P, are case sensitive. There are abbrevia- + tions for many names. The following examples are all equivalent: + + \p{bidiclass=al} + \p{BC=al} + \p{ Bidi_Class : AL } + \p{ Bi-di class = Al } + \P{ ^ Bi-di class = Al } + + There is support for Unicode script names, Unicode general category + properties, "Any", which matches any character (including newline), + Bidi_Class, a number of binary (yes/no) properties, and some special + PCRE2 properties (described below). 
Certain other Perl properties such + as "InMusicalSymbols" are not supported by PCRE2. Note that \P{Any} + does not match any characters, so always causes a match failure. Script properties for \p and \P @@ -7197,15 +7546,15 @@ BACKSLASH Adlam script as an example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and - "script extensions" for the property types are recognized, and a equals - sign is an alternative to the colon. If a script name is given without - a property type, for example, \p{Adlam}, it is treated as \p{scx:Ad- - lam}. Perl changed to this interpretation at release 5.26 and PCRE2 - changed at release 10.40. + "script extensions" for the property types are recognized and, as for + all property specifications, an equals sign is an alternative to the + colon. If a script name is given without a property type, for example, + \p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this inter- + pretation at release 5.26 and PCRE2 changed at release 10.40. Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others - that are not part of an identified script are lumped together as "Com- + that are not part of an identified script are lumped together as "Com- mon". The current list of recognized script names and their 4-character abbreviations can be obtained by running this command: @@ -7215,15 +7564,11 @@ BACKSLASH The general category property for \p and \P Each character has exactly one Unicode general category property, spec- - ified by a two-letter abbreviation. For compatibility with Perl, nega- - tion can be specified by including a circumflex between the opening - brace and the property name. For example, \p{^Lu} is the same as - \P{Lu}. 
- - If only one letter is specified with \p or \P, it includes all the gen- - eral category properties that start with that letter. In this case, in - the absence of negation, the curly brackets in the escape sequence are - optional; these two examples have the same effect: + ified by a two-letter abbreviation. If only one letter is specified + with \p or \P, it includes all the general category properties that + start with that letter. In this case, in the absence of negation, the + curly brackets in the escape sequence are optional; these two examples + have the same effect: \p{L} \pL @@ -7238,6 +7583,7 @@ BACKSLASH Cs Surrogate L Letter + Lc Cased letter Ll Lower case letter Lm Modifier letter Lo Other letter @@ -7274,35 +7620,36 @@ BACKSLASH Zp Paragraph separator Zs Space separator - The special property LC, which has the synonym L&, is also supported: - it matches a character that has the Lu, Ll, or Lt property, in other - words, a letter that is not classified as a modifier or "other". - - The Cs (Surrogate) property applies only to characters whose code - points are in the range U+D800 to U+DFFF. These characters are no dif- - ferent to any other character when PCRE2 is not in UTF mode (using the - 16-bit or 32-bit library). However, they are not valid in Unicode + Perl originally used the name L& for the Lc property. This is still + supported by Perl, but discouraged. PCRE2 also still supports it. This + property matches any character that has the Lu, Ll, or Lt property, in + other words, any letter that is not classified as a modifier or + "other". From release 10.45 of PCRE2 the properties Lu, Ll, and Lt are + all treated as Lc when case-independent matching is set by the + PCRE2_CASELESS option or (?i) within the pattern. The other properties + are not affected by caseless matching. + + The Cs (Surrogate) property applies only to characters whose code + points are in the range U+D800 to U+DFFF. 
These characters are no dif- + ferent to any other character when PCRE2 is not in UTF mode (using the + 16-bit or 32-bit library). However, they are not valid in Unicode strings and so cannot be tested by PCRE2 in UTF mode, unless UTF valid- - ity checking has been turned off (see the discussion of + ity checking has been turned off (see the discussion of PCRE2_NO_UTF_CHECK in the pcre2api page). - The long synonyms for property names that Perl supports (such as - \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix + The long synonyms for property names that Perl supports (such as + \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is". No character that is in the Unicode table has the Cn (unassigned) prop- erty. Instead, this property is assumed for any code point that is not in the Unicode table. - Specifying caseless matching does not affect these escape sequences. - For example, \p{Lu} always matches only upper case letters. This is - different from the behaviour of current versions of Perl. - Binary (yes/no) properties for \p and \P - Unicode defines a number of binary properties, that is, properties - whose only values are true or false. You can obtain a list of those - that are recognized by \p and \P, along with their abbreviations, by + Unicode defines a number of binary properties, that is, properties + whose only values are true or false. You can obtain a list of those + that are recognized by \p and \P, along with their abbreviations, by running this command: pcre2test -LP @@ -7337,63 +7684,65 @@ BACKSLASH RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space - An equals sign may be used instead of a colon. The class names are - case-insensitive; only the short names listed above are recognized. + As in all property specifications, an equals sign may be used instead + of a colon and the class names are case-insensitive. 
Only the short + names listed above are recognized; PCRE2 does not at present support + any long alternatives. Extended grapheme clusters - The \X escape matches any number of Unicode characters that form an + The \X escape matches any number of Unicode characters that form an "extended grapheme cluster", and treats the sequence as an atomic group - (see below). Unicode supports various kinds of composite character by - giving each character a grapheme breaking property, and having rules + (see below). Unicode supports various kinds of composite character by + giving each character a grapheme breaking property, and having rules that use these properties to define the boundaries of extended grapheme - clusters. The rules are defined in Unicode Standard Annex 29, "Unicode - Text Segmentation". Unicode 11.0.0 abandoned the use of some previous - properties that had been used for emojis. Instead it introduced vari- - ous emoji-specific properties. PCRE2 uses only the Extended Picto- + clusters. The rules are defined in Unicode Standard Annex 29, "Unicode + Text Segmentation". Unicode 11.0.0 abandoned the use of some previous + properties that had been used for emojis. Instead it introduced vari- + ous emoji-specific properties. PCRE2 uses only the Extended Picto- graphic property. - \X always matches at least one character. Then it decides whether to + \X always matches at least one character. Then it decides whether to add additional characters according to the following rules for ending a cluster: 1. End at the end of the subject string. - 2. Do not end between CR and LF; otherwise end after any control char- + 2. Do not end between CR and LF; otherwise end after any control char- acter. - 3. Do not break Hangul (a Korean script) syllable sequences. Hangul - characters are of five types: L, V, T, LV, and LVT. 
An L character may - be followed by an L, V, LV, or LVT character; an LV or V character may - be followed by a V or T character; an LVT or T character may be fol- + 3. Do not break Hangul (a Korean script) syllable sequences. Hangul + characters are of five types: L, V, T, LV, and LVT. An L character may + be followed by an L, V, LV, or LVT character; an LV or V character may + be followed by a V or T character; an LVT or T character may be fol- lowed only by a T character. 4. Do not end before extending characters or spacing marks or the zero- - width joiner (ZWJ) character. Characters with the "mark" property al- + width joiner (ZWJ) character. Characters with the "mark" property al- ways have the "extend" grapheme breaking property. 5. Do not end after prepend characters. - 6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width - joiner) sequences. An emoji ZWJ sequence consists of a character with - the Extended_Pictographic property, optionally followed by one or more - characters with the Extend property, followed by the ZWJ character, + 6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width + joiner) sequences. An emoji ZWJ sequence consists of a character with + the Extended_Pictographic property, optionally followed by one or more + characters with the Extend property, followed by the ZWJ character, followed by another Extended_Pictographic character. - 7. Do not break within emoji flag sequences. That is, do not break be- - tween regional indicator (RI) characters if there are an odd number of + 7. Do not break within emoji flag sequences. That is, do not break be- + tween regional indicator (RI) characters if there are an odd number of RI characters before the break point. 8. Otherwise, end the cluster. 
PCRE2's additional properties - As well as the standard Unicode properties described above, PCRE2 sup- + As well as the standard Unicode properties described above, PCRE2 sup- ports four more that make it possible to convert traditional escape se- - quences such as \w and \s to use Unicode properties. PCRE2 uses these - non-standard, non-Perl properties internally when PCRE2_UCP is set. + quences such as \w and \s to use Unicode properties. PCRE2 uses these + non-standard, non-Perl properties internally when PCRE2_UCP is set. However, they may also be used explicitly. These properties are: Xan Any alphanumeric character @@ -7401,73 +7750,74 @@ BACKSLASH Xsp Any Perl space character Xwd Any Perl "word" character - Xan matches characters that have either the L (letter) or the N (num- - ber) property. Xps matches the characters tab, linefeed, vertical tab, - form feed, or carriage return, and any other character that has the Z - (separator) property. Xsp is the same as Xps; in PCRE1 it used to ex- - clude vertical tab, for Perl compatibility, but Perl changed. Xwd - matches the same characters as Xan, plus those that match Mn (non-spac- - ing mark) or Pc (connector punctuation, which includes underscore). - - There is another non-standard property, Xuc, which matches any charac- - ter that can be represented by a Universal Character Name in C++ and - other programming languages. These are the characters $, @, ` (grave - accent), and all characters with Unicode code points greater than or - equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that - most base (ASCII) characters are excluded. (Universal Character Names - are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. + Xan matches characters that have either the L (letter) or the N (num- + ber) property. 
Xps matches the characters tab, linefeed, vertical tab, + form feed, or carriage return, and any other character that has the Z + (separator) property (this includes the space character). Xsp is the + same as Xps; in PCRE1 it used to exclude vertical tab, for Perl compat- + ibility, but Perl changed. Xwd matches the same characters as Xan, plus + those that match Mn (non-spacing mark) or Pc (connector punctuation, + which includes underscore). + + There is another non-standard property, Xuc, which matches any charac- + ter that can be represented by a Universal Character Name in C++ and + other programming languages. These are the characters $, @, ` (grave + accent), and all characters with Unicode code points greater than or + equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that + most base (ASCII) characters are excluded. (Universal Character Names + are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. Note that the Xuc property does not match these sequences but the char- acters that they represent.) Resetting the match start - In normal use, the escape sequence \K causes any previously matched + In normal use, the escape sequence \K causes any previously matched characters not to be included in the final matched sequence that is re- turned. For example, the pattern: foo\Kbar - matches "foobar", but reports that it has matched "bar". \K does not + matches "foobar", but reports that it has matched "bar". \K does not interact with anchoring in any way. The pattern: ^foo\Kbar - matches only when the subject begins with "foobar" (in single line - mode), though it again reports the matched string as "bar". This fea- - ture is similar to a lookbehind assertion (described below), but the + matches only when the subject begins with "foobar" (in single line + mode), though it again reports the matched string as "bar". 
This fea- + ture is similar to a lookbehind assertion (described below), but the part of the pattern that precedes \K is not constrained to match a lim- - ited number of characters, as is required for a lookbehind assertion. - The use of \K does not interfere with the setting of captured sub- + ited number of characters, as is required for a lookbehind assertion. + The use of \K does not interfere with the setting of captured sub- strings. For example, when the pattern (foo)\Kbar matches "foobar", the first substring is still set to "foo". - From version 5.32.0 Perl forbids the use of \K in lookaround asser- - tions. From release 10.38 PCRE2 also forbids this by default. However, - the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling - pcre2_compile() to re-enable the previous behaviour. When this option + From version 5.32.0 Perl forbids the use of \K in lookaround asser- + tions. From release 10.38 PCRE2 also forbids this by default. However, + the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling + pcre2_compile() to re-enable the previous behaviour. When this option is set, \K is acted upon when it occurs inside positive assertions, but - is ignored in negative assertions. Note that when a pattern such as - (?=ab\K) matches, the reported start of the match can be greater than - the end of the match. Using \K in a lookbehind assertion at the start - of a pattern can also lead to odd effects. For example, consider this + is ignored in negative assertions. Note that when a pattern such as + (?=ab\K) matches, the reported start of the match can be greater than + the end of the match. Using \K in a lookbehind assertion at the start + of a pattern can also lead to odd effects. 
For example, consider this pattern: (?<=\Kfoo)bar - If the subject is "foobar", a call to pcre2_match() with a starting - offset of 3 succeeds and reports the matching string as "foobar", that - is, the start of the reported match is earlier than where the match + If the subject is "foobar", a call to pcre2_match() with a starting + offset of 3 succeeds and reports the matching string as "foobar", that + is, the start of the reported match is earlier than where the match started. Simple assertions - The final use of backslash is for certain simple assertions. An asser- - tion specifies a condition that has to be met at a particular point in - a match, without consuming any characters from the subject string. The - use of groups for more complicated assertions is described below. The + The final use of backslash is for certain simple assertions. An asser- + tion specifies a condition that has to be met at a particular point in + a match, without consuming any characters from the subject string. The + use of groups for more complicated assertions is described below. The backslashed assertions are: \b matches at a word boundary @@ -7478,193 +7828,193 @@ BACKSLASH \z matches only at the end of the subject \G matches at the first matching position in the subject - Inside a character class, \b has a different meaning; it matches the - backspace character. If any other of these assertions appears in a + Inside a character class, \b has a different meaning; it matches the + backspace character. If any other of these assertions appears in a character class, an "invalid escape sequence" error is generated. - A word boundary is a position in the subject string where the current - character and the previous character do not both match \w or \W (i.e. - one matches \w and the other matches \W), or the start or end of the - string if the first or last character matches \w, respectively. 
When - PCRE2 is built with Unicode support, the meanings of \w and \W can be + A word boundary is a position in the subject string where the current + character and the previous character do not both match \w or \W (i.e. + one matches \w and the other matches \W), or the start or end of the + string if the first or last character matches \w, respectively. When + PCRE2 is built with Unicode support, the meanings of \w and \W can be changed by setting the PCRE2_UCP option. When this is done, it also af- - fects \b and \B. Neither PCRE2 nor Perl has a separate "start of word" - or "end of word" metasequence. However, whatever follows \b normally - determines which it is. For example, the fragment \ba matches "a" at + fects \b and \B. Neither PCRE2 nor Perl has a separate "start of word" + or "end of word" metasequence. However, whatever follows \b normally + determines which it is. For example, the fragment \ba matches "a" at the start of a word. - The \A, \Z, and \z assertions differ from the traditional circumflex + The \A, \Z, and \z assertions differ from the traditional circumflex and dollar (described in the next section) in that they only ever match - at the very start and end of the subject string, whatever options are - set. Thus, they are independent of multiline mode. These three asser- - tions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, - which affect only the behaviour of the circumflex and dollar metachar- - acters. However, if the startoffset argument of pcre2_match() is non- - zero, indicating that matching is to start at a point other than the - beginning of the subject, \A can never match. The difference between - \Z and \z is that \Z matches before a newline at the end of the string + at the very start and end of the subject string, whatever options are + set. Thus, they are independent of multiline mode. 
These three asser- + tions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, + which affect only the behaviour of the circumflex and dollar metachar- + acters. However, if the startoffset argument of pcre2_match() is non- + zero, indicating that matching is to start at a point other than the + beginning of the subject, \A can never match. The difference between + \Z and \z is that \Z matches before a newline at the end of the string as well as at the very end, whereas \z matches only at the end. - The \G assertion is true only when the current matching position is at - the start point of the matching process, as specified by the startoff- - set argument of pcre2_match(). It differs from \A when the value of - startoffset is non-zero. By calling pcre2_match() multiple times with - appropriate arguments, you can mimic Perl's /g option, and it is in + The \G assertion is true only when the current matching position is at + the start point of the matching process, as specified by the startoff- + set argument of pcre2_match(). It differs from \A when the value of + startoffset is non-zero. By calling pcre2_match() multiple times with + appropriate arguments, you can mimic Perl's /g option, and it is in this kind of implementation where \G can be useful. - Note, however, that PCRE2's implementation of \G, being true at the - starting character of the matching process, is subtly different from - Perl's, which defines it as true at the end of the previous match. In - Perl, these can be different when the previously matched string was + Note, however, that PCRE2's implementation of \G, being true at the + starting character of the matching process, is subtly different from + Perl's, which defines it as true at the end of the previous match. In + Perl, these can be different when the previously matched string was empty. Because PCRE2 does just one match at a time, it cannot reproduce this behaviour. 
- If all the alternatives of a pattern begin with \G, the expression is + If all the alternatives of a pattern begin with \G, the expression is anchored to the starting match position, and the "anchored" flag is set in the compiled regular expression. CIRCUMFLEX AND DOLLAR - The circumflex and dollar metacharacters are zero-width assertions. - That is, they test for a particular condition being true without con- + The circumflex and dollar metacharacters are zero-width assertions. + That is, they test for a particular condition being true without con- suming any characters from the subject string. These two metacharacters - are concerned with matching the starts and ends of lines. If the new- - line convention is set so that only the two-character sequence CRLF is - recognized as a newline, isolated CR and LF characters are treated as + are concerned with matching the starts and ends of lines. If the new- + line convention is set so that only the two-character sequence CRLF is + recognized as a newline, isolated CR and LF characters are treated as ordinary data characters, and are not recognized as newlines. Outside a character class, in the default matching mode, the circumflex - character is an assertion that is true only if the current matching - point is at the start of the subject string. If the startoffset argu- - ment of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circum- - flex can never match if the PCRE2_MULTILINE option is unset. Inside a - character class, circumflex has an entirely different meaning (see be- + character is an assertion that is true only if the current matching + point is at the start of the subject string. If the startoffset argu- + ment of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circum- + flex can never match if the PCRE2_MULTILINE option is unset. Inside a + character class, circumflex has an entirely different meaning (see be- low). 
- Circumflex need not be the first character of the pattern if a number - of alternatives are involved, but it should be the first thing in each - alternative in which it appears if the pattern is ever to match that - branch. If all possible alternatives start with a circumflex, that is, - if the pattern is constrained to match only at the start of the sub- - ject, it is said to be an "anchored" pattern. (There are also other + Circumflex need not be the first character of the pattern if a number + of alternatives are involved, but it should be the first thing in each + alternative in which it appears if the pattern is ever to match that + branch. If all possible alternatives start with a circumflex, that is, + if the pattern is constrained to match only at the start of the sub- + ject, it is said to be an "anchored" pattern. (There are also other constructs that can cause a pattern to be anchored.) - The dollar character is an assertion that is true only if the current - matching point is at the end of the subject string, or immediately be- - fore a newline at the end of the string (by default), unless PCRE2_NO- - TEOL is set. Note, however, that it does not actually match the new- - line. Dollar need not be the last character of the pattern if a number - of alternatives are involved, but it should be the last item in any - branch in which it appears. Dollar has no special meaning in a charac- + The dollar character is an assertion that is true only if the current + matching point is at the end of the subject string, or immediately be- + fore a newline at the end of the string (by default), unless PCRE2_NO- + TEOL is set. Note, however, that it does not actually match the new- + line. Dollar need not be the last character of the pattern if a number + of alternatives are involved, but it should be the last item in any + branch in which it appears. Dollar has no special meaning in a charac- ter class. 
- The meaning of dollar can be changed so that it matches only at the - very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at + The meaning of dollar can be changed so that it matches only at the + very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This does not affect the \Z assertion. The meanings of the circumflex and dollar metacharacters are changed if - the PCRE2_MULTILINE option is set. When this is the case, a dollar - character matches before any newlines in the string, as well as at the - very end, and a circumflex matches immediately after internal newlines - as well as at the start of the subject string. It does not match after - a newline that ends the string, for compatibility with Perl. However, + the PCRE2_MULTILINE option is set. When this is the case, a dollar + character matches before any newlines in the string, as well as at the + very end, and a circumflex matches immediately after internal newlines + as well as at the start of the subject string. It does not match after + a newline that ends the string, for compatibility with Perl. However, this can be changed by setting the PCRE2_ALT_CIRCUMFLEX option. - For example, the pattern /^abc$/ matches the subject string "def\nabc" - (where \n represents a newline) in multiline mode, but not otherwise. - Consequently, patterns that are anchored in single line mode because - all branches start with ^ are not anchored in multiline mode, and a - match for circumflex is possible when the startoffset argument of - pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored + For example, the pattern /^abc$/ matches the subject string "def\nabc" + (where \n represents a newline) in multiline mode, but not otherwise. 
+ Consequently, patterns that are anchored in single line mode because + all branches start with ^ are not anchored in multiline mode, and a + match for circumflex is possible when the startoffset argument of + pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. - When the newline convention (see "Newline conventions" below) recog- - nizes the two-character sequence CRLF as a newline, this is preferred, - even if the single characters CR and LF are also recognized as new- - lines. For example, if the newline convention is "any", a multiline - mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather - than after CR, even though CR on its own is a valid newline. (It also + When the newline convention (see "Newline conventions" below) recog- + nizes the two-character sequence CRLF as a newline, this is preferred, + even if the single characters CR and LF are also recognized as new- + lines. For example, if the newline convention is "any", a multiline + mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather + than after CR, even though CR on its own is a valid newline. (It also matches at the very start of the string, of course.) - Note that the sequences \A, \Z, and \z can be used to match the start - and end of the subject in both modes, and if all branches of a pattern - start with \A it is always anchored, whether or not PCRE2_MULTILINE is + Note that the sequences \A, \Z, and \z can be used to match the start + and end of the subject in both modes, and if all branches of a pattern + start with \A it is always anchored, whether or not PCRE2_MULTILINE is set. FULL STOP (PERIOD, DOT) AND \N Outside a character class, a dot in the pattern matches any one charac- - ter in the subject string except (by default) a character that signi- + ter in the subject string except (by default) a character that signi- fies the end of a line. 
One or more characters may be specified as line terminators (see "Newline conventions" above). - Dot never matches a single line-ending character. When the two-charac- - ter sequence CRLF is the only line ending, dot does not match CR if it - is immediately followed by LF, but otherwise it matches all characters - (including isolated CRs and LFs). When ANYCRLF is selected for line - endings, no occurrences of CR of LF match dot. When all Unicode line + Dot never matches a single line-ending character. When the two-charac- + ter sequence CRLF is the only line ending, dot does not match CR if it + is immediately followed by LF, but otherwise it matches all characters + (including isolated CRs and LFs). When ANYCRLF is selected for line + endings, no occurrences of CR or LF match dot. When all Unicode line endings are being recognized, dot does not match CR or LF or any of the other line ending characters. - The behaviour of dot with regard to newlines can be changed. If the - PCRE2_DOTALL option is set, a dot matches any one character, without - exception. If the two-character sequence CRLF is present in the sub- + The behaviour of dot with regard to newlines can be changed. If the + PCRE2_DOTALL option is set, a dot matches any one character, without + exception. If the two-character sequence CRLF is present in the sub- ject string, it takes two dots to match it. - The handling of dot is entirely independent of the handling of circum- - flex and dollar, the only relationship being that they both involve + The handling of dot is entirely independent of the handling of circum- + flex and dollar, the only relationship being that they both involve newlines. Dot has no special meaning in a character class. - The escape sequence \N when not followed by an opening brace behaves - like a dot, except that it is not affected by the PCRE2_DOTALL option.
- In other words, it matches any character except one that signifies the + The escape sequence \N when not followed by an opening brace behaves + like a dot, except that it is not affected by the PCRE2_DOTALL option. + In other words, it matches any character except one that signifies the end of a line. When \N is followed by an opening brace it has a different meaning. See - the section entitled "Non-printing characters" above for details. Perl - also uses \N{name} to specify characters by Unicode name; PCRE2 does + the section entitled "Non-printing characters" above for details. Perl + also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this. MATCHING A SINGLE CODE UNIT - Outside a character class, the escape sequence \C matches any one code - unit, whether or not a UTF mode is set. In the 8-bit library, one code - unit is one byte; in the 16-bit library it is a 16-bit unit; in the - 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches - line-ending characters. The feature is provided in Perl in order to + Outside a character class, the escape sequence \C matches any one code + unit, whether or not a UTF mode is set. In the 8-bit library, one code + unit is one byte; in the 16-bit library it is a 16-bit unit; in the + 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches + line-ending characters. The feature is provided in Perl in order to match individual bytes in UTF-8 mode, but it is unclear how it can use- fully be used. - Because \C breaks up characters into individual code units, matching - one unit with \C in UTF-8 or UTF-16 mode means that the rest of the + Because \C breaks up characters into individual code units, matching + one unit with \C in UTF-8 or UTF-16 mode means that the rest of the string may start with a malformed UTF character. 
This has undefined re- sults, because PCRE2 assumes that it is matching character by character in a valid UTF string (by default it checks the subject string's valid- - ity at the start of processing unless the PCRE2_NO_UTF_CHECK or + ity at the start of processing unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used). - An application can lock out the use of \C by setting the - PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also + An application can lock out the use of \C by setting the + PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to build PCRE2 with the use of \C permanently disabled. - PCRE2 does not allow \C to appear in lookbehind assertions (described - below) in UTF-8 or UTF-16 modes, because this would make it impossible - to calculate the length of the lookbehind. Neither the alternative + PCRE2 does not allow \C to appear in lookbehind assertions (described + below) in UTF-8 or UTF-16 modes, because this would make it impossible + to calculate the length of the lookbehind. Neither the alternative matching function pcre2_dfa_match() nor the JIT optimizer support \C in these UTF modes. The former gives a match-time error; the latter fails to optimize and so the match is always run using the interpreter. - In the 32-bit library, however, \C is always supported (when not ex- - plicitly locked out) because it always matches a single code unit, + In the 32-bit library, however, \C is always supported (when not ex- + plicitly locked out) because it always matches a single code unit, whether or not UTF-32 is specified. In general, the \C escape sequence is best avoided. 
However, one way of - using it that avoids the problem of malformed UTF-8 or UTF-16 charac- - ters is to use a lookahead to check the length of the next character, - as in this pattern, which could be used with a UTF-8 string (ignore + using it that avoids the problem of malformed UTF-8 or UTF-16 charac- + ters is to use a lookahead to check the length of the next character, + as in this pattern, which could be used with a UTF-8 string (ignore white space and line breaks): (?| (?=[\x00-\x7f])(\C) | @@ -7672,11 +8022,11 @@ MATCHING A SINGLE CODE UNIT (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) | (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C)) - In this example, a group that starts with (?| resets the capturing - parentheses numbers in each alternative (see "Duplicate Group Numbers" + In this example, a group that starts with (?| resets the capturing + parentheses numbers in each alternative (see "Duplicate Group Numbers" below). The assertions at the start of each branch check the next UTF-8 - character for values whose encoding uses 1, 2, 3, or 4 bytes, respec- - tively. The character's individual bytes are then captured by the ap- + character for values whose encoding uses 1, 2, 3, or 4 bytes, respec- + tively. The character's individual bytes are then captured by the ap- propriate number of \C groups. @@ -7684,27 +8034,27 @@ SQUARE BRACKETS AND CHARACTER CLASSES An opening square bracket introduces a character class, terminated by a closing square bracket. A closing square bracket on its own is not spe- - cial by default. If a closing square bracket is required as a member + cial by default. If a closing square bracket is required as a member of the class, it should be the first data character in the class (after - an initial circumflex, if present) or escaped with a backslash. This - means that, by default, an empty class cannot be defined. 
However, if - the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at + an initial circumflex, if present) or escaped with a backslash. This + means that, by default, an empty class cannot be defined. However, if + the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at the start does end the (empty) class. - A character class matches a single character in the subject. A matched + A character class matches a single character in the subject. A matched character must be in the set of characters defined by the class, unless - the first character in the class definition is a circumflex, in which + the first character in the class definition is a circumflex, in which case the subject character must not be in the set defined by the class. - If a circumflex is actually required as a member of the class, ensure + If a circumflex is actually required as a member of the class, ensure it is not the first character, or escape it with a backslash. - For example, the character class [aeiou] matches any lower case vowel, - while [^aeiou] matches any character that is not a lower case vowel. - Note that a circumflex is just a convenient notation for specifying the - characters that are in the class by enumerating those that are not. A - class that starts with a circumflex is not an assertion; it still con- - sumes a character from the subject string, and therefore it fails if - the current pointer is at the end of the string. + For example, the character class [aeiou] matches any lower case English + vowel, whereas [^aeiou] matches all other characters. Note that a cir- + cumflex is just a convenient notation for specifying the characters + that are in the class by enumerating those that are not. A class that + starts with a circumflex is not an assertion; it still consumes a char- + acter from the subject string, and therefore it fails to match if the + current pointer is at the end of the string. 
Characters in a class may be specified by their code points using \o, \x, or \N{U+hh..} in the usual way. When caseless matching is set, any @@ -7714,7 +8064,10 @@ SQUARE BRACKETS AND CHARACTER CLASSES would. Note that there are two ASCII characters, K and S, that, in ad- dition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when ei- - ther PCRE2_UTF or PCRE2_UCP is set. + ther PCRE2_UTF or PCRE2_UCP is set. If you do not want these ASCII/non- + ASCII case equivalences, you can suppress them by setting PCRE2_EX- + TRA_CASELESS_RESTRICT, either as an option in a compile context, or by + including (*CASELESS_RESTRICT) or (?r) within a pattern. Characters that might indicate line breaks are never treated in any special way when matching character classes, whatever line-ending se- @@ -7743,67 +8096,171 @@ SQUARE BRACKETS AND CHARACTER CLASSES last character in the class, or immediately after a range. For example, [b-d-z] matches letters in the range b to d, a hyphen character, or z. + There is some special treatment for alphabetic ranges in EBCDIC envi- + ronments; see the section "EBCDIC environments" below. + Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d - or \H. However, unless the hyphen is the last character in the class, - Perl outputs a warning in its warning mode, as this is most likely a - user error. As PCRE2 has no facility for warning, an error is given in + or \H. However, unless the hyphen is the last character in the class, + Perl outputs a warning in its warning mode, as this is most likely a + user error. As PCRE2 has no facility for warning, an error is given in these cases. It is not possible to have the literal character "]" as the end charac- - ter of a range. 
A pattern such as [W-]46] is interpreted as a class of - two characters ("W" and "-") followed by a literal string "46]", so it - would match "W46]" or "-46]". However, if the "]" is escaped with a - backslash it is interpreted as the end of range, so [W-\]46] is inter- - preted as a class containing a range followed by two other characters. - The octal or hexadecimal representation of "]" can also be used to end - a range. + ter of a range. A pattern such as [W-]46] is interpreted as a class of + two characters ("W" and "-") followed by a literal string "46]", so it + would match "W46]" or "-46]". However, if the "]" is escaped with a + backslash it is interpreted as the end of a range, so [W-\]46] is in- + terpreted as a class containing a range and two other characters. The + octal or hexadecimal representation of "]" can also be used to end a + range. Ranges normally include all code points between the start and end char- - acters, inclusive. They can also be used for code points specified nu- - merically, for example [\000-\037]. Ranges can include any characters - that are valid for the current mode. In any UTF mode, the so-called - "surrogate" characters (those whose code points lie between 0xd800 and - 0xdfff inclusive) may not be specified explicitly by default (the - PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). How- + acters, inclusive. They can also be used for code points specified nu- + merically, for example [\000-\037]. Ranges can include any characters + that are valid for the current mode. In any UTF mode, the so-called + "surrogate" characters (those whose code points lie between 0xd800 and + 0xdfff inclusive) may not be specified explicitly by default (the + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). How- ever, ranges such as [\x{d7ff}-\x{e000}], which include the surrogates, are always permitted. 
- There is a special case in EBCDIC environments for ranges whose end - points are both specified as literal letters in the same case. For com- - patibility with Perl, EBCDIC code points within the range that are not - letters are omitted. For example, [h-k] matches only four characters, - even though the codes for h and k are 0x88 and 0x92, a range of 11 code - points. However, if the range is specified numerically, for example, - [\x88-\x92] or [h-\x92], all code points are included. - If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent - to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if - character tables for a French locale are in use, [\xc8-\xcb] matches + to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if + character tables for a French locale are in use, [\xc8-\xcb] matches accented E characters in both cases. - A circumflex can conveniently be used with the upper case character - types to specify a more restricted set of characters than the matching - lower case type. For example, the class [^\W_] matches any letter or + A circumflex can conveniently be used with the upper case character + types to specify a more restricted set of characters than the matching + lower case type. For example, the class [^\W_] matches any letter or digit, but not underscore, whereas [\w] includes underscore. A positive character class should be read as "something OR something OR ..." and a negative class as "NOT something AND NOT something AND NOT ...". - The only metacharacters that are recognized in character classes are - backslash, hyphen (only where it can be interpreted as specifying a - range), circumflex (only at the start), opening square bracket (only - when it can be interpreted as introducing a POSIX class name, or for a - special compatibility feature - see the next two sections), and the - terminating closing square bracket. 
However, escaping other non-al- - phanumeric characters does no harm. + The metacharacters that are recognized in character classes are back- + slash, hyphen (when it can be interpreted as specifying a range), cir- + cumflex (only at the start), and the terminating closing square + bracket. An opening square bracket is also special when it can be in- + terpreted as introducing a POSIX class (see "POSIX character classes" + below), or a special compatibility feature (see "Compatibility feature + for word boundaries" below). Escaping any non-alphanumeric character in + a class turns it into a literal, whether or not it would otherwise be a + metacharacter. + + +PERL EXTENDED CHARACTER CLASSES + + From release 10.45 PCRE2 supports Perl's (?[...]) extended character + class syntax. This can be used to perform set operations such as inter- + section on character classes. + + The syntax permitted within (?[...]) is quite different to ordinary + character classes. Inside the extended class, there is an expression + syntax consisting of "atoms", operators, and ordinary parentheses "()" + used for grouping. Such classes always have the Perl /xx modifier + (PCRE2 option PCRE2_EXTENDED_MORE) turned on within them. This means + that literal space and tab characters are ignored everywhere in the + class. + + The allowed atoms are individual characters specified by escape se- + quences such as \n or \x{123}, character types such as \d, POSIX + classes such as [:alpha:], and nested ordinary (non-extended) character + classes. For example, in (?[\d & [...]]) the nested class [...] follows + the usual rules for ordinary character classes, in which parentheses + are not metacharacters, and character literals and ranges are permit- + ted. + + Character literals and ranges may not appear outside a nested ordinary + character class because they are not atoms in the extended syntax.
The + extended syntax does not introduce any additional escape sequences, so + (?[\y]) is an unknown escape, as it would be in [\y]. + + In the extended syntax, ^ does not negate a class (except within an or- + dinary class nested inside an extended class); it is instead a binary + operator. + + The binary operators are "&" (intersection), "|" or "+" (union), "-" + (subtraction) and "^" (symmetric difference). These are left-associa- + tive and "&" has higher (tighter) precedence, while the others have + equal lower precedence. The one prefix unary operator is "!" (comple- + ment), with highest precedence. + + +UTS#18 EXTENDED CHARACTER CLASSES + + The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's + (?[...]) syntax, allowing instead extended class behaviour inside or- + dinary [...] character classes. This altered syntax for [...] classes + is loosely described by the Unicode standard UTS#18. The PCRE2_ALT_EX- + TENDED_CLASS option does not prevent use of (?[...]) classes; it just + changes the meaning of all [...] classes that are not nested inside a + Perl (?[...]) class. + + Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is + a character class with two literal characters "a" and "[", but in + UTS#18 extended classes the "[" character becomes an additional + metacharacter within classes, denoting the start of a nested class, so + a literal "[" must be escaped as "\[". + + Secondly, within the UTS#18 extended syntax, there are operators "||", + "&&", "--" and "~~" which denote character class union, intersection, + subtraction, and symmetric difference respectively. In standard Perl + syntax, these would simply be needlessly-repeated literals (except for + "--" which could be the start or end of a range). In UTS#18 extended + classes these operators can be used in constructs such as [\p{L}--[QW]] + for "Unicode letters, other than Q and W". 
A literal "-" at the start + or end of a range must be escaped, so while "[--1]" in Perl syntax is + the range from hyphen to "1", it must be escaped as "[\--1]" in UTS#18 + extended classes. + + Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option + to ignore space and tab characters is not automatically enabled for + UTS#18 extended classes, but it is honoured if set. + + Extended UTS#18 classes can be nested, and nested classes are them- + selves extended classes (unlike Perl, where nested classes must be sim- + ple classes). For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any + letter that is in the Thai or Greek scripts. Note that this means that + no special grouping characters (such as the parentheses used in Perl's + (?[...]) class syntax) are needed. + + Individual class items (literal characters, literal ranges, properties + such as \d or \p{...}, and nested classes) can be combined by juxtapo- + sition or by an operator. Juxtaposition is the implicit union operator, + and binds more tightly than any explicit operator. Thus a sequence of + literals and/or ranges behaves as if it is enclosed in square brackets. + For example, [A-Z0-9&&[^E8]] is the same as [[A-Z0-9]&&[^E8]], which + matches any upper case alphanumeric character except "E" or "8". + + Precedence between the explicit operators is not defined, so mixing op- + erators is a syntax error. For example, [A&&B--C] is an error, but + [A&&[B--C]] is valid. + + This is an emerging syntax which is being adopted gradually across the + regex ecosystem: for example JavaScript adopted the "/v" flag in EC- + MAScript 2024; Python's "re" module reserves the syntax for future use + with a FutureWarning for unescaped use of "[" as a literal within char- + acter classes. Due to UTS#18 providing insufficient guidance, engines + interpret the syntax differently. 
Rust's "regex" crate and Python's + "regex" PyPI module both implement UTS#18 extended classes, but with + slight incompatibilities ([A||B&&C] is parsed as [A||[B&&C]] in + Python's "regex" but as [[A||B]&&C] in Rust's "regex"). + + PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v + flag, so that all the UTS#18 extended classes accepted as valid by + PCRE2 have the property that they are interpreted either with the same + behaviour, or as invalid, by all other major engines. Please file an + issue if you are aware of cross-engine differences in behaviour between + PCRE2 and another major engine. POSIX CHARACTER CLASSES Perl supports the POSIX notation for character classes. This uses names - enclosed by [: and :] within the enclosing square brackets. PCRE2 also - supports this notation. For example, + enclosed by [: and :] within the enclosing square brackets. PCRE2 also + supports this notation, in both ordinary and extended classes. For ex- + ample, [01[:alpha:]%] @@ -7883,7 +8340,7 @@ POSIX CHARACTER CLASSES In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This is a change that - was made in PCRE release 10.43 for Perl compatibility. + was made in PCRE2 release 10.43 for Perl compatibility. The other POSIX classes are unchanged by PCRE2_UCP, and match only characters with code points less than 256. @@ -8391,17 +8848,18 @@ REPETITION (?>.*?a)b It matches "ab" in the subject "aab". The use of the backtracking con- - trol verbs (*PRUNE) and (*SKIP) also disable this optimization, and - there is an option, PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. + trol verbs (*PRUNE) and (*SKIP) also disable this optimization. To do + so explicitly, either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, + or call pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive.
- When a capture group is repeated, the value captured is the substring + When a capture group is repeated, the value captured is the substring that matched the final iteration. For example, after (tweedle[dume]{3}\s*)+ has matched "tweedledum tweedledee" the value of the captured substring - is "tweedledee". However, if there are nested capture groups, the cor- - responding captured values may have been set in previous iterations. + is "tweedledee". However, if there are nested capture groups, the cor- + responding captured values may have been set in previous iterations. For example, after (a|(b))+ @@ -8411,57 +8869,57 @@ REPETITION ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS - With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") - repetition, failure of what follows normally causes the repeated item - to be re-evaluated to see if a different number of repeats allows the - rest of the pattern to match. Sometimes it is useful to prevent this, - either to change the nature of the match, or to cause it fail earlier - than it otherwise might, when the author of the pattern knows there is + With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") + repetition, failure of what follows normally causes the repeated item + to be re-evaluated to see if a different number of repeats allows the + rest of the pattern to match. Sometimes it is useful to prevent this, + either to change the nature of the match, or to cause it fail earlier + than it otherwise might, when the author of the pattern knows there is no point in carrying on. - Consider, for example, the pattern \d+foo when applied to the subject + Consider, for example, the pattern \d+foo when applied to the subject line 123456bar After matching all 6 digits and then failing to match "foo", the normal - action of the matcher is to try again with only 5 digits matching the - \d+ item, and then with 4, and so on, before ultimately failing. 
- "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides + action of the matcher is to try again with only 5 digits matching the + \d+ item, and then with 4, and so on, before ultimately failing. + "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides the means for specifying that once a group has matched, it is not to be re-evaluated in this way. - If we use atomic grouping for the previous example, the matcher gives - up immediately on failing to match "foo" the first time. The notation + If we use atomic grouping for the previous example, the matcher gives + up immediately on failing to match "foo" the first time. The notation is a kind of special parenthesis, starting with (?> as in this example: (?>\d+)foo - Perl 5.28 introduced an experimental alphabetic form starting with (* + Perl 5.28 introduced an experimental alphabetic form starting with (* which may be easier to remember: (*atomic:\d+)foo - This kind of parenthesized group "locks up" the part of the pattern it + This kind of parenthesized group "locks up" the part of the pattern it contains once it has matched, and a failure further into the pattern is - prevented from backtracking into it. Backtracking past it to previous + prevented from backtracking into it. Backtracking past it to previous items, however, works as normal. An alternative description is that a group of this type matches exactly - the string of characters that an identical standalone pattern would + the string of characters that an identical standalone pattern would match, if anchored at the current point in the subject string. - Atomic groups are not capture groups. Simple cases such as the above - example can be thought of as a maximizing repeat that must swallow - everything it can. So, while both \d+ and \d+? are prepared to adjust - the number of digits they match in order to make the rest of the pat- + Atomic groups are not capture groups. 
Simple cases such as the above + example can be thought of as a maximizing repeat that must swallow + everything it can. So, while both \d+ and \d+? are prepared to adjust + the number of digits they match in order to make the rest of the pat- tern match, (?>\d+) can only match an entire sequence of digits. - Atomic groups in general can of course contain arbitrarily complicated + Atomic groups in general can of course contain arbitrarily complicated expressions, and can be nested. However, when the contents of an atomic - group is just a single repeated item, as in the example above, a sim- - pler notation, called a "possessive quantifier" can be used. This con- - sists of an additional + character following a quantifier. Using this + group is just a single repeated item, as in the example above, a sim- + pler notation, called a "possessive quantifier" can be used. This con- + sists of an additional + character following a quantifier. Using this notation, the previous example can be rewritten as \d++foo @@ -8471,24 +8929,26 @@ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS (abc|xyz){2,3}+ - Possessive quantifiers are always greedy; the setting of the PCRE2_UN- - GREEDY option is ignored. They are a convenient notation for the sim- - pler forms of atomic group. However, there is no difference in the - meaning of a possessive quantifier and the equivalent atomic group, - though there may be a performance difference; possessive quantifiers + Possessive quantifiers are always greedy; the setting of the PCRE2_UN- + GREEDY option is ignored. They are a convenient notation for the sim- + pler forms of atomic group. However, there is no difference in the + meaning of a possessive quantifier and the equivalent atomic group, + though there may be a performance difference; possessive quantifiers should be slightly faster. - The possessive quantifier syntax is an extension to the Perl 5.8 syn- - tax. 
Jeffrey Friedl originated the idea (and the name) in the first + The possessive quantifier syntax is an extension to the Perl 5.8 syn- + tax. Jeffrey Friedl originated the idea (and the name) in the first edition of his book. Mike McCloskey liked it, so implemented it when he - built Sun's Java package, and PCRE1 copied it from there. It found its + built Sun's Java package, and PCRE1 copied it from there. It found its way into Perl at release 5.10. - PCRE2 has an optimization that automatically "possessifies" certain - simple pattern constructs. For example, the sequence A+B is treated as - A++B because there is no point in backtracking into a sequence of A's - when B must follow. This feature can be disabled by the PCRE2_NO_AUTO- - POSSESS option, or starting the pattern with (*NO_AUTO_POSSESS). + PCRE2 has an optimization that automatically "possessifies" certain + simple pattern constructs. For example, the sequence A+B is treated as + A++B because there is no point in backtracking into a sequence of A's + when B must follow. This feature can be disabled by the + PCRE2_NO_AUTO_POSSESS option, by calling pcre2_set_optimize() with a + PCRE2_AUTO_POSSESS_OFF directive, or by starting the pattern with + (*NO_AUTO_POSSESS). When a pattern contains an unlimited repeat inside a group that can it- self be repeated an unlimited number of times, the use of an atomic @@ -8649,19 +9109,25 @@ BACKREFERENCES ASSERTIONS - An assertion is a test on the characters following or preceding the - current matching point that does not consume any characters. The simple - assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described - above. + An assertion is a test that does not consume any characters. The test + must succeed for the match to continue. The simple assertions coded as + \b, \B, \A, \G, \Z, \z, ^ and $ are described above. + + More complicated assertions are coded as parenthesized groups. 
If + matching such a group succeeds, matching continues after it, but with + the matching position in the subject string reset to what it was before + the assertion was processed. + + A special kind of assertion, called a "scan substring" assertion, + matches a subpattern against a previously captured substring. This is + described in the section entitled "Scan substring assertions" below. It + is a PCRE2 extension, not compatible with Perl. - More complicated assertions are coded as parenthesized groups. There - are two kinds: those that look ahead of the current position in the - subject string, and those that look behind it, and in each case an as- - sertion may be positive (must match for the assertion to be true) or - negative (must not match for the assertion to be true). An assertion - group is matched in the normal way, and if it is true, matching contin- - ues after it, but with the matching position in the subject string re- - set to what it was before the assertion was processed. + The other group-based assertions are of two kinds: those that look ahead + of the current position in the subject string, and those that look be- + hind it, and in each case an assertion may be positive (must match for + the assertion to be true) or negative (must not match for the assertion + to be true). The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no back- @@ -8928,6 +9394,66 @@ NON-ATOMIC ASSERTIONS groups (see below) must be atomic. +SCAN SUBSTRING ASSERTIONS + + A special kind of assertion, not compatible with Perl, makes it possi- + ble to check the contents of a captured substring by matching it with a + subpattern. Because this involves capturing, this feature is not sup- + ported by pcre2_dfa_match().
+ + A scan substring assertion starts with the sequence (*scan_substring: + or (*scs: which is followed by a list of substring numbers (absolute or + relative) and/or substring names enclosed in single quotes or angle + brackets, all within parentheses. The rest of the item is the subpat- + tern that is applied to the substring, as shown in these examples: + + (*scan_substring:(1)...) + (*scs:(-2)...) + (*scs:('AB')...) + (*scs:(1,'AB',-2)...) + + The list of groups is checked in the order they are given, and it is + the contents of the first one that is found to be set that are scanned. + When PCRE2_DUPNAMES is set and there are ambiguous group names, all + groups with the same name are checked in numerical order. A scan sub- + string assertion fails if none of the groups it references have been + set. + + The pattern match on the substring is always anchored, that is, it must + match from the start of the substring. There is no "bumpalong" if it + does not match at the start. The end of the subject is temporarily re- + set to be the end of the substring, so \Z, \z, and $ will match there. + However, the start of the subject is not reset. This means that ^ + matches only if the substring is actually at the start of the main sub- + ject, but it also means that lookbehind assertions into what precedes + the substring are possible. + + Here is a very simple example: find a word that contains the rare (in + English) sequence of letters "rh" not at the start: + + \b(\w++)(*scs:(1).+rh) + + The first group captures a word which is then scanned by the second + group. This example does not actually need this heavyweight feature; + the same match can be achieved with: + + \b\w+?rh\w*\b + + When things are more complicated, however, scanning a captured sub- + string can be a useful way to describe the required match. 
For example, + there is a rather complicated pattern in the PCRE2 test data that + checks an entire subject string for a palindrome, that is, the sequence + of letters is the same in both directions. Suppose you want to search + for individual words of two or more characters such as "level" that are + palindromes: + + (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...) + + Within a substring scanning subpattern, references to other groups work + as normal. Capturing groups may appear, and will retain their values + during ongoing matching if the assertion succeeds. + + SCRIPT RUNS In concept, a script run is a sequence of characters that are all from @@ -9175,8 +9701,9 @@ COMMENTS There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related - characters such as (?: or a group name or number. The characters that - make up a comment play no part in the pattern matching. + characters such as (?: or a group name or number or a Unicode property + name. The characters that make up a comment play no part in the pattern + matching. The sequence (?# marks the start of a comment that continues up to the next closing parenthesis. Nested parentheses are not permitted. If the @@ -9459,8 +9986,9 @@ CALLOUTS provides an external function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). If no match context is - passed, or if the callout entry point is set to NULL, callouts are dis- - abled. + passed, or if the callout entry point is set to NULL, callout points + will be passed over silently during matching. To disallow callouts in + the pattern syntax, you may use the PCRE2_EXTRA_NEVER_CALLOUT option. Within a regular expression, (?C) indicates a point at which the external function is to be called.
There are two kinds of callout: @@ -9555,10 +10083,10 @@ BACKTRACKING CONTROL Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the tra- - ditional matching function, because that uses a backtracking algorithm. - With the exception of (*FAIL), which behaves like a failing negative - assertion, the backtracking control verbs cause an error if encountered - by the DFA matching function. + ditional matching function or JIT, because they use backtracking algo- + rithms. With the exception of (*FAIL), which behaves like a failing + negative assertion, the backtracking control verbs cause an error if + encountered by the DFA matching function. The behaviour of these verbs in repeated groups, assertions, and in capture groups called as subroutines (whether or not recursively) is @@ -9573,11 +10101,12 @@ BACKTRACKING CONTROL running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option when calling pcre2_com- - pile(), or by starting the pattern with (*NO_START_OPT). There is more - discussion of this option in the section entitled "Compiling a pattern" - in the pcre2api documentation. + pile(), by calling pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF + directive, or by starting the pattern with (*NO_START_OPT). There is + more discussion of this option in the section entitled "Compiling a + pattern" in the pcre2api documentation. - Experiments with Perl suggest that it too has similar optimizations, + Experiments with Perl suggest that it too has similar optimizations, and like PCRE2, turning them off can change the result of a match. Verbs that act immediately @@ -9586,77 +10115,77 @@ BACKTRACKING CONTROL (*ACCEPT) or (*ACCEPT:NAME) - This verb causes the match to end successfully, skipping the remainder - of the pattern. 
However, when it is inside a capture group that is + This verb causes the match to end successfully, skipping the remainder + of the pattern. However, when it is inside a capture group that is called as a subroutine, only that group is ended successfully. Matching then continues at the outer level. If (*ACCEPT) in triggered in a posi- - tive assertion, the assertion succeeds; in a negative assertion, the + tive assertion, the assertion succeeds; in a negative assertion, the assertion fails. - If (*ACCEPT) is inside capturing parentheses, the data so far is cap- + If (*ACCEPT) is inside capturing parentheses, the data so far is cap- tured. For example: A((?:A|B(*ACCEPT)|C)D) - This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- + This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- tured by the outer parentheses. - (*ACCEPT) is the only backtracking verb that is allowed to be quanti- - fied because an ungreedy quantification with a minimum of zero acts + (*ACCEPT) is the only backtracking verb that is allowed to be quanti- + fied because an ungreedy quantification with a minimum of zero acts only when a backtrack happens. Consider, for example, (A(*ACCEPT)??B)C - where A, B, and C may be complex expressions. After matching "A", the - matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) - is triggered and the match succeeds. In both cases, all but C is cap- - tured. Whereas (*COMMIT) (see below) means "fail on backtrack", a re- + where A, B, and C may be complex expressions. After matching "A", the + matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) + is triggered and the match succeeds. In both cases, all but C is cap- + tured. Whereas (*COMMIT) (see below) means "fail on backtrack", a re- peated (*ACCEPT) of this type means "succeed on backtrack". 
- Warning: (*ACCEPT) should not be used within a script run group, be- - cause it causes an immediate exit from the group, bypassing the script + Warning: (*ACCEPT) should not be used within a script run group, be- + cause it causes an immediate exit from the group, bypassing the script run checking. (*FAIL) or (*FAIL:NAME) - This verb causes a matching failure, forcing backtracking to occur. It - may be abbreviated to (*F). It is equivalent to (?!) but easier to + This verb causes a matching failure, forcing backtracking to occur. It + may be abbreviated to (*F). It is equivalent to (?!) but easier to read. The Perl documentation notes that it is probably useful only when combined with (?{}) or (??{}). Those are, of course, Perl features that - are not present in PCRE2. The nearest equivalent is the callout fea- + are not present in PCRE2. The nearest equivalent is the callout fea- ture, as for example in this pattern: a+(?C)(*FAIL) - A match with the string "aaaa" always fails, but the callout is taken + A match with the string "aaaa" always fails, but the callout is taken before each backtrack happens (in this example, 10 times). - (*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*AC- - CEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is + (*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*AC- + CEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before the verb acts. Recording which path was taken - There is one verb whose main purpose is to track how a match was ar- - rived at, though it also has a secondary use in conjunction with ad- + There is one verb whose main purpose is to track how a match was ar- + rived at, though it also has a secondary use in conjunction with ad- vancing the match starting point (see (*SKIP) below). (*MARK:NAME) or (*:NAME) - A name is always required with this verb. For all the other backtrack- + A name is always required with this verb. 
For all the other backtrack- ing control verbs, a NAME argument is optional. - When a match succeeds, the name of the last-encountered mark name on + When a match succeeds, the name of the last-encountered mark name on the matching path is passed back to the caller as described in the sec- tion entitled "Other information about the match" in the pcre2api docu- - mentation. This applies to all instances of (*MARK) and other verbs, + mentation. This applies to all instances of (*MARK) and other verbs, including those inside assertions and atomic groups. However, there are - differences in those cases when (*MARK) is used in conjunction with + differences in those cases when (*MARK) is used in conjunction with (*SKIP) as described below. - The mark name that was last encountered on the matching path is passed - back. A verb without a NAME argument is ignored for this purpose. Here - is an example of pcre2test output, where the "mark" modifier requests + The mark name that was last encountered on the matching path is passed + back. A verb without a NAME argument is ignored for this purpose. Here + is an example of pcre2test output, where the "mark" modifier requests the retrieval and outputting of (*MARK) data: re> /X(*MARK:A)Y|X(*MARK:B)Z/mark @@ -9668,30 +10197,31 @@ BACKTRACKING CONTROL MK: B The (*MARK) name is tagged with "MK:" in this output, and in this exam- - ple it indicates which of the two alternatives matched. This is a more - efficient way of obtaining this information than putting each alterna- + ple it indicates which of the two alternatives matched. This is a more + efficient way of obtaining this information than putting each alterna- tive in its own capturing parentheses. 
- If a verb with a name is encountered in a positive assertion that is - true, the name is recorded and passed back if it is the last-encoun- + If a verb with a name is encountered in a positive assertion that is + true, the name is recorded and passed back if it is the last-encoun- tered. This does not happen for negative assertions or failing positive assertions. - After a partial match or a failed match, the last encountered name in + After a partial match or a failed match, the last encountered name in the entire match process is returned. For example: re> /X(*MARK:A)Y|X(*MARK:B)Z/mark data> XP No match, mark = B - Note that in this unanchored example the mark is retained from the + Note that in this unanchored example the mark is retained from the match attempt that started at the letter "X" in the subject. Subsequent match attempts starting at "P" and then with an empty string do not get as far as the (*MARK) item, but nevertheless do not reset it. - If you are interested in (*MARK) values after failed matches, you - should probably set the PCRE2_NO_START_OPTIMIZE option (see above) to - ensure that the match is always attempted. + If you are interested in (*MARK) values after failed matches, you + should probably either set the PCRE2_NO_START_OPTIMIZE option or call + pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see + above) to ensure that the match is always attempted. Verbs that act after backtracking @@ -9699,11 +10229,11 @@ BACKTRACKING CONTROL tinues with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, back- tracking cannot pass to the left of the verb. However, when one of - these verbs appears inside an atomic group or in a lookaround assertion - that is true, its effect is confined to that group, because once the - group has been matched, there is never any backtracking into it. 
Back- - tracking from beyond an assertion or an atomic group ignores the entire - group, and seeks a preceding backtracking point. + these verbs appears inside an atomic group or in an atomic lookaround + assertion that is true, its effect is confined to that group, because + once the group has been matched, there is never any backtracking into + it. Backtracking from beyond an atomic assertion or group ignores the + entire group, and seeks a preceding backtracking point. These verbs differ in exactly what kind of failure occurs when back- tracking reaches them. The behaviour described below is what happens @@ -9960,21 +10490,23 @@ BACKTRACKING CONTROL (*MARK) name that is set in an assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern. - PCRE2 now supports non-atomic positive assertions, as described in the - section entitled "Non-atomic assertions" above. These assertions must - be standalone (not used as conditions). They are not Perl-compatible. - For these assertions, a later backtrack does jump back into the asser- - tion, and therefore verbs such as (*COMMIT) can be triggered by back- - tracks from later in the pattern. + PCRE2 now supports non-atomic positive assertions and also "scan sub- + string" assertions, as described in the sections entitled "Non-atomic + assertions" and "Scan substring assertions" above. These assertions + must be standalone (not used as conditions). They are not Perl-compati- + ble. For these assertions, a later backtrack does jump back into the + assertion, and therefore verbs such as (*COMMIT) can be triggered by + backtracks from later in the pattern. The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion - to be false, and a negative assertion to be true. + to be false, and a negative assertion to be true. This behaviour dif- + fers from Perl when the assertion has only one branch. 
- The other backtracking verbs are not treated specially if they appear - in a standalone positive assertion. In a conditional positive asser- + The other backtracking verbs are not treated specially if they appear + in a standalone positive assertion. In a conditional positive asser- tion, backtracking (from within the assertion) into (*COMMIT), (*SKIP), - or (*PRUNE) causes the condition to be false. However, for both stand- + or (*PRUNE) causes the condition to be false. However, for both stand- alone and conditional negative assertions, backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes the assertion to be true, without consider- ing any further alternative branches. @@ -9984,26 +10516,68 @@ BACKTRACKING CONTROL These behaviours occur whether or not the group is called recursively. (*ACCEPT) in a group called as a subroutine causes the subroutine match - to succeed without any further processing. Matching then continues af- - ter the subroutine call. Perl documents this behaviour. Perl's treat- + to succeed without any further processing. Matching then continues af- + ter the subroutine call. Perl documents this behaviour. Perl's treat- ment of the other verbs in subroutines is different in some cases. - (*FAIL) in a group called as a subroutine has its normal effect: it + (*FAIL) in a group called as a subroutine has its normal effect: it forces an immediate backtrack. - (*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail - when triggered by being backtracked to in a group called as a subrou- + (*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail + when triggered by being backtracked to in a group called as a subrou- tine. There is then a backtrack at the outer level. (*THEN), when triggered, skips to the next alternative in the innermost - enclosing group that has alternatives (its normal behaviour). However, + enclosing group that has alternatives (its normal behaviour). 
However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. +EBCDIC ENVIRONMENTS + + Differences in the way PCRE behaves when it is running in an EBCDIC en- + vironment are covered in this section. + + Escape sequences + + When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. + \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. + The \c escape is processed as specified for Perl in the perlebcdic doc- + ument. The only characters that are allowed after \c are A-Z, a-z, or + one of @, [, \, ], ^, _, or ?. Any other character provokes a compile- + time error. The sequence \c@ encodes character code 0; after \c the + letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, + \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \c? be- + comes either 255 (hex FF) or 95 (hex 5F). + + Thus, apart from \c?, these escapes generate the same character code + values as they do in an ASCII or Unicode environment, though the mean- + ings of the values mostly differ. For example, \cG always generates + code value 7, which is BEL in ASCII but DEL in EBCDIC. + + The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, + but because 127 is not a control character in EBCDIC, Perl makes it + generate the APC character. Unfortunately, there are several variants + of EBCDIC. In most of them the APC character has the value 255 (hex + FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If + certain other characters have POSIX-BC values, PCRE2 makes \c? generate + 95; otherwise it generates 255. + + Character classes + + In character classes there is a special case in EBCDIC environments for + ranges whose end points are both specified as literal letters in the + same case. For compatibility with Perl, EBCDIC code points within the + range that are not letters are omitted. 
For example, [h-k] matches only + four characters, even though the EBCDIC codes for h and k are 0x88 and + 0x92, a range of 11 code points. However, if the range is specified nu- + merically, for example, [\x88-\x92] or [h-\x92], all code points are + included. + + SEE ALSO - pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), + pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3). @@ -10016,15 +10590,14 @@ AUTHOR REVISION - Last updated: 04 June 2024 + Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 04 June 2024 PCRE2PATTERN(3) +PCRE2 10.45 27 November 2024 PCRE2PATTERN(3) ------------------------------------------------------------------------------ - PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3) @@ -10272,15 +10845,14 @@ AUTHOR REVISION - Last updated: 27 July 2022 + Last updated: 06 December 2022 Copyright (c) 1997-2022 University of Cambridge. -PCRE2 10.41 27 July 2022 PCRE2PERFORM(3) +PCRE2 10.45 06 December 2022 PCRE2PERFORM(3) ------------------------------------------------------------------------------ - PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3) @@ -10431,7 +11003,7 @@ COMPILING A PATTERN When a pattern that is compiled with this flag is passed to pcre2_regexec() for matching, the nmatch and pmatch arguments are ig- - nored, and no captured strings are returned. Versions of the PCRE li- + nored, and no captured strings are returned. Versions of the PCRE2 li- brary prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile op- tion, but this no longer happens because it disables the use of back- references. @@ -10631,15 +11203,14 @@ AUTHOR REVISION - Last updated: 19 January 2024 + Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE2 10.43 19 January 2024 PCRE2POSIX(3) +PCRE2 10.45 27 November 2024 PCRE2POSIX(3) ------------------------------------------------------------------------------ - PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3) @@ -10725,13 +11296,12 @@ AUTHOR REVISION - Last updated: 02 February 2016 + Last updated: 14 November 2023 Copyright (c) 1997-2016 University of Cambridge. -PCRE2 10.22 02 February 2016 PCRE2SAMPLE(3) +PCRE2 10.45 14 November 2023 PCRE2SAMPLE(3) ------------------------------------------------------------------------------ - PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3) @@ -10917,15 +11487,14 @@ AUTHOR REVISION - Last updated: 27 June 2018 + Last updated: 19 January 2024 Copyright (c) 1997-2018 University of Cambridge. -PCRE2 10.32 27 June 2018 PCRE2SERIALIZE(3) +PCRE2 10.45 19 January 2024 PCRE2SERIALIZE(3) ------------------------------------------------------------------------------ - PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3) @@ -10935,9 +11504,11 @@ NAME PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY - The full syntax and semantics of the regular expressions that are sup- - ported by PCRE2 are described in the pcre2pattern documentation. This - document contains a quick-reference summary of the syntax. + The full syntax and semantics of the regular expression patterns that + are supported by PCRE2 are described in the pcre2pattern documentation. + This document contains a quick-reference summary of the pattern syntax + followed by the syntax of replacement strings in substitution function. + The full description of the latter is in the pcre2api documentation. QUOTING @@ -10947,22 +11518,24 @@ QUOTING Note that white space inside \Q...\E is always treated as literal, even if PCRE2_EXTENDED is set, causing most other white space to be ignored. + Note also that PCRE2's handling of \Q...\E has some differences from + Perl's. See the pcre2pattern documentation for details. 
BRACED ITEMS - With one exception, wherever brace characters { and } are required to - enclose data for constructions such as \g{2} or \k{name}, space and/or - horizontal tab characters that follow { or precede } are allowed and + With one exception, wherever brace characters { and } are required to + enclose data for constructions such as \g{2} or \k{name}, space and/or + horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear before or - after the comma. The exception is \u{...} which is not Perl-compatible + after the comma. The exception is \u{...} which is not Perl-compatible and is recognized only when PCRE2_EXTRA_ALT_BSUX is set. This is an EC- MAScript compatibility feature, and follows ECMAScript's behaviour. ESCAPED CHARACTERS - This table applies to ASCII and Unicode environments. An unrecognized + This table applies to ASCII and Unicode environments. An unrecognized escape sequence causes an error. \a alarm, that is, the BEL character (hex 07) @@ -10979,6 +11552,11 @@ ESCAPED CHARACTERS \xhh character with hex code hh \x{hh..} character with hex code hh.. + \N{U+hh..} is synonymous with \x{hh..} but is not supported in environ- + ments that use EBCDIC code (mainly IBM mainframes). Note that \N not + followed by an opening curly bracket has a different meaning (see be- + low). + If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized: @@ -10986,20 +11564,17 @@ ESCAPED CHARACTERS \uhhhh character with hex code hhhh \u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX - When \x is not followed by {, from zero to two hexadecimal digits are - read, but in ALT_BSUX mode \x must be followed by two hexadecimal dig- - its to be recognized as a hexadecimal escape; otherwise it matches a - literal "x". 
Likewise, if \u (in ALT_BSUX mode) is not followed by - four hexadecimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex - digits in curly brackets, it matches a literal "u". + When \x is not followed by {, one or two hexadecimal digits are read, + but in ALT_BSUX mode \x must be followed by two hexadecimal digits to + be recognized as a hexadecimal escape; otherwise it matches a literal + "x". Likewise, if \u (in ALT_BSUX mode) is not followed by four hexa- + decimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in + curly brackets, it matches a literal "u". Note that \0dd is always an octal code. The treatment of backslash fol- - lowed by a non-zero digit is complicated; for details see the section - "Non-printing characters" in the pcre2pattern documentation, where de- - tails of escape processing in EBCDIC environments are also given. - \N{U+hh..} is synonymous with \x{hh..} in PCRE2 but is not supported in - EBCDIC environments. Note that \N not followed by an opening curly - bracket has a different meaning (see below). + lowed by a non-zero digit is complicated; for details see the section + "Non-printing characters" in the pcre2pattern documentation, where de- + tails of escape processing in EBCDIC environments are also given. CHARACTER TYPES @@ -11023,23 +11598,24 @@ CHARACTER TYPES \W a "non-word" character \X a Unicode extended grapheme cluster - \C is dangerous because it may leave the current matching point in the + \C is dangerous because it may leave the current matching point in the middle of a UTF-8 or UTF-16 character. The application can lock out the - use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also + use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2 with the use of \C permanently disabled. 
- By default, \d, \s, and \w match only ASCII characters, even in UTF-8 + By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit libraries. However, if locale-specific - matching is happening, \s and \w may also match characters with code + matching is happening, \s and \w may also match characters with code points in the range 128-255. If the PCRE2_UCP option is set, the behav- iour of these escape sequences is changed to use Unicode properties and - they match many more characters, but there are some option settings - that can restrict individual sequences to matching only ASCII charac- + they match many more characters, but there are some option settings + that can restrict individual sequences to matching only ASCII charac- ters. Property descriptions in \p and \P are matched caselessly; hyphens, un- - derscores, and white space are ignored, in accordance with Unicode's - "loose matching" rules. + derscores, and ASCII white space characters are ignored, in accordance + with Unicode's "loose matching" rules. For example, \p{Bidi_Class=al} + is the same as \p{ bidi class = AL }. GENERAL CATEGORY PROPERTIES FOR \p and \P @@ -11052,13 +11628,13 @@ GENERAL CATEGORY PROPERTIES FOR \p and \P Cs Surrogate L Letter + Lc Cased letter, the union of Ll, Lu, and Lt + L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter - Lc Ll, Lu, or Lt - L& Ll, Lu, or Lt M Mark Mc Spacing mark @@ -11090,6 +11666,9 @@ GENERAL CATEGORY PROPERTIES FOR \p and \P Zp Paragraph separator Zs Space separator + From release 10.45, when caseless matching is set, Ll, Lu, and Lt are + all equivalent to Lc. + PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P @@ -11106,9 +11685,9 @@ PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P BINARY PROPERTIES FOR \p AND \P - Unicode defines a number of binary properties, that is, properties - whose only values are true or false. 
You can obtain a list of those - that are recognized by \p and \P, along with their abbreviations, by + Unicode defines a number of binary properties, that is, properties + whose only values are true or false. You can obtain a list of those + that are recognized by \p and \P, along with their abbreviations, by running this command: pcre2test -LP @@ -11116,8 +11695,8 @@ BINARY PROPERTIES FOR \p AND \P SCRIPT MATCHING WITH \p AND \P - Many script names and their 4-letter abbreviations are recognized in - \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P + Many script names and their 4-letter abbreviations are recognized in + \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of course). You can obtain a list of these scripts by running this com- mand: @@ -11153,7 +11732,7 @@ THE BIDI_CLASS PROPERTY FOR \p AND \P RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space CHARACTER CLASSES @@ -11179,10 +11758,50 @@ CHARACTER CLASSES word same as \w xdigit hexadecimal digit - In PCRE2, POSIX character set names recognize only ASCII characters by - default, but some of them use Unicode properties if PCRE2_UCP is set. + In PCRE2, POSIX character set names recognize only ASCII characters by + default, but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class. + When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes + may be used, allowing nested character classes, combined using set op- + erators. + + [x&&[^y]] UTS#18 extended character class + + x||y set union (OR) + x&&y set intersection (AND) + x--y set difference (AND NOT) + x~~y set symmetric difference (XOR) + + +PERL EXTENDED CHARACTER CLASSES + + (?[...]) Perl extended character class + (?[\p{Thai} & \p{Nd}]) operators; whitespace ignored + (?[(x - y) & z]) parentheses for grouping + + (?[ [^3] & \p{Nd} ]) [...] 
is a nested ordinary class + (?[ [:alpha:] - [z] ]) POSIX set is allowed outside [...] + (?[ \d - [3] ]) backslash-escaped set is allowed outside + [...] + (?[ !\n & [:ascii:] ]) backslash-escaped character is allowed out- + side [...] + all other characters or ranges must be enclosed + in [...] + + x|y, x+y set union (OR) + x&y set intersection (AND) + x-y set difference (AND NOT) + x^y set symmetric difference (XOR) + !x set complement (NOT) + + Inside a Perl extended character class, [...] switches mode to be in- + terpreted as an ordinary character class. Outside of a nested [...], + the only items permitted are backslash-escapes, POSIX sets, operators, + and parentheses. Inside a nested ordinary class, ^ has its usual mean- + ing (inverts the class when used as the first character); outside of a + nested class, ^ is the XOR operator. + QUANTIFIERS @@ -11289,7 +11908,7 @@ OPTION SETTING (?^) unset imnrsx options (?aP) implies (?aT) as well, though this has no additional effect. How- - ever, it means that (?-aP) is really (?-PT) which disables all ASCII + ever, it means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes. Unsetting x or xx unsets both. Several options may be set at once, and @@ -11299,20 +11918,25 @@ OPTION SETTING capture group, for example (?i:...). The following are recognized only at the very start of a pattern or af- - ter one of the newline or \R options with similar syntax. More than one - of them may appear. For the first three, d is a decimal number. - - (*LIMIT_DEPTH=d) set the backtracking limit to d - (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes - (*LIMIT_MATCH=d) set the match limit to d - (*NOTEMPTY) set PCRE2_NOTEMPTY when matching - (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching - (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) + ter one of the newline or \R sequences or options with similar syntax. + More than one of them may appear. 
For the first three, d is a decimal + number. + + (*LIMIT_DEPTH=d) set the backtracking limit to d + (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes + (*LIMIT_MATCH=d) set the match limit to d + (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching + (*NOTEMPTY) set PCRE2_NOTEMPTY when matching + (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching + (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) - (*NO_JIT) disable JIT optimization - (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) - (*UTF) set appropriate UTF mode for the library in use - (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) + (*NO_JIT) disable JIT optimization + (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OP- + TIMIZE) + (*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching + (*UTF) set appropriate UTF mode for the library in use + (*UCP) set PCRE2_UCP (use Unicode properties for \d + etc) Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or @@ -11383,6 +12007,22 @@ NON-ATOMIC LOOKAROUND ASSERTIONS (*non_atomic_positive_lookbehind:...) ) +SUBSTRING SCAN ASSERTION + This feature is not Perl-compatible. + + (*scan_substring:(grouplist)...) scan captured substring + (*scs:(grouplist)...) scan captured substring + + The comma-separated list may identify groups in any of the following + ways: + + n absolute reference + +n relative reference + -n relative reference + <name> name + 'name' name + + SCRIPT RUNS (*script_run:...) ) script run, can be backtracked into @@ -11444,16 +12084,16 @@ CONDITIONAL PATTERNS (?(VERSION[>]=n.m) test PCRE2 version (?(assert) assertion condition - Note the ambiguity of (?(R) and (?(Rn) which might be named reference - conditions or recursion tests.
Such a condition is interpreted as a + Note the ambiguity of (?(R) and (?(Rn) which might be named reference + conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists. BACKTRACKING CONTROL - All backtracking control verbs may be in the form (*VERB:NAME). For - (*MARK) the name is mandatory, for the others it is optional. (*SKIP) - changes its behaviour if :NAME is present. The others just set a name + All backtracking control verbs may be in the form (*VERB:NAME). For + (*MARK) the name is mandatory, for the others it is optional. (*SKIP) + changes its behaviour if :NAME is present. The others just set a name for passing back to the caller, but this is not a name that (*SKIP) can see. The following act immediately they are reached: @@ -11461,7 +12101,7 @@ BACKTRACKING CONTROL (*FAIL) force backtrack; synonym (*F) (*MARK:NAME) set name to be passed back; synonym (*:NAME) - The following act only when a subsequent match failure causes a back- + The following act only when a subsequent match failure causes a back- track to reach them. They all force a match failure, but they differ in what happens afterwards. Those that advance the start-of-match point do so only if the pattern is not anchored. @@ -11473,7 +12113,7 @@ BACKTRACKING CONTROL (*MARK:NAME); if not found, the (*SKIP) is ignored (*THEN) local failure, backtrack to next alternation - The effect of one of these verbs in a group called as a subroutine is + The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call. @@ -11484,14 +12124,61 @@ CALLOUTS (?C"text") callout with string data The allowed string delimiters are ` ' " ^ % # $ (which are the same for - the start and the end), and the starting delimiter { matched with the - ending delimiter }. To encode the ending delimiter within the string, + the start and the end), and the starting delimiter { matched with the + ending delimiter }. 
To encode the ending delimiter within the string, double it. +REPLACEMENT STRINGS + + If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for + pcre2_substitute() is not interpreted. Otherwise, by default, the only + special character is the dollar character in one of the following + forms: + + $$ insert a dollar character + $n or ${n} insert the contents of group n + $<name> insert the contents of named group + $0 or $& insert the entire matched substring + $` insert the substring that precedes the match + $' insert the substring that follows the match + $_ insert the entire input string + $*MARK or ${*MARK} insert a control verb name + + For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is + set, there is additional interpretation: + + 1. Backslash is an escape character, and the forms described in "ES- + CAPED CHARACTERS" above are recognized. Also: + + \Q...\E can be used to suppress interpretation + \l force the next character to lower case + \u force the next character to upper case + \L force subsequent characters to lower case + \U force subsequent characters to upper case + \u\L force next character to upper case, then all lower + \l\U force next character to lower case, then all upper + \E end \L or \U case forcing + \b backspace character (note: as in character class in pat- + tern) + \v vertical tab character (note: not the same as in a pattern) + + 2. The Python form \g<n>, where the angle brackets are part of the syn- + tax and n is either a group name or a number, is recognized as an al- + ternative way of inserting the contents of a group, for example \g<3>. + + 3. Capture substitution supports the following additional forms: + + ${n:-string} default for unset group + ${n:+string1:string2} values for set/unset group + + The substitution strings themselves are expanded. Backslash can be used + to escape colons and closing curly brackets.
+ + SEE ALSO - pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), + pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3). @@ -11504,20 +12191,19 @@ AUTHOR REVISION - Last updated: 12 October 2023 - Copyright (c) 1997-2023 University of Cambridge. + Last updated: 27 November 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 12 October 2023 PCRE2SYNTAX(3) +PCRE2 10.45 27 November 2024 PCRE2SYNTAX(3) ------------------------------------------------------------------------------ - PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3) NAME - PCRE - Perl-compatible regular expressions (revised API) + PCRE2 - Perl-compatible regular expressions (revised API) UNICODE AND UTF SUPPORT @@ -11554,7 +12240,7 @@ UNICODE PROPERTY SUPPORT ting. The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal num- - ber, the derived properties Any and LC (synonym L&), the Unicode script + ber, the derived properties Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties. @@ -11647,173 +12333,203 @@ UNICODE CASE-EQUIVALENCE in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. + Without PCRE2_EXTRA_CASELESS_RESTRICT: + 'k' = 'K' = U+212A (Kelvin sign) + 's' = 'S' = U+017F (long S) + With PCRE2_EXTRA_CASELESS_RESTRICT: + 'k' = 'K' + U+212A (Kelvin sign) only case-equivalent to itself + 's' = 'S' + U+017F (long S) only case-equivalent to itself + + One language family, Turkish and Azeri, has its own case-insensitivity + rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. + This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot + above), and U+0131 (small dotless i) characters. 
+ + Without PCRE2_EXTRA_TURKISH_CASING: + 'i' = 'I' + U+0130 (capital I with dot above) only case-equivalent to itself + U+0131 (small dotless i) only case-equivalent to itself + With PCRE2_EXTRA_TURKISH_CASING: + 'i' = U+0130 (capital I with dot above) + U+0131 (small dotless i) = 'I' + + It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and + PCRE2_EXTRA_TURKISH_CASING together. + + From release 10.45 the Unicode letter properties Lu (upper case), Ll + (lower case), and Lt (title case) are all treated as Lc (cased letter) + when caseless matching is set by the PCRE2_CASELESS option or (?i) + within the pattern. + SCRIPT RUNS - The pattern constructs (*script_run:...) and (*atomic_script_run:...), - with synonyms (*sr:...) and (*asr:...), verify that the string matched - within the parentheses is a script run. In concept, a script run is a - sequence of characters that are all from the same Unicode script. How- + The pattern constructs (*script_run:...) and (*atomic_script_run:...), + with synonyms (*sr:...) and (*asr:...), verify that the string matched + within the parentheses is a script run. In concept, a script run is a + sequence of characters that are all from the same Unicode script. How- ever, because some scripts are commonly used together, and because some - diacritical and other marks are used with multiple scripts, it is not + diacritical and other marks are used with multiple scripts, it is not that simple. Every Unicode character has a Script property, mostly with a value cor- - responding to the name of a script, such as Latin, Greek, or Cyrillic. + responding to the name of a script, such as Latin, Greek, or Cyrillic. There are also three special values: "Unknown" is used for code points that have not been assigned, and also - for the surrogate code points. 
In the PCRE2 32-bit library, characters - whose code points are greater than the Unicode maximum (U+10FFFF), - which are accessible only in non-UTF mode, are assigned the Unknown + for the surrogate code points. In the PCRE2 32-bit library, characters + whose code points are greater than the Unicode maximum (U+10FFFF), + which are accessible only in non-UTF mode, are assigned the Unknown script. - "Common" is used for characters that are used with many scripts. These - include punctuation, emoji, mathematical, musical, and currency sym- + "Common" is used for characters that are used with many scripts. These + include punctuation, emoji, mathematical, musical, and currency sym- bols, and the ASCII digits 0 to 9. - "Inherited" is used for characters such as diacritical marks that mod- + "Inherited" is used for characters such as diacritical marks that mod- ify a previous character. These are considered to take on the script of the character that they modify. - Some Inherited characters are used with many scripts, but many of them - are only normally used with a small number of scripts. For example, + Some Inherited characters are used with many scripts, but many of them + are only normally used with a small number of scripts. For example, U+102E0 (Coptic Epact thousands mark) is used only with Arabic and Cop- - tic. In order to make it possible to check this, a Unicode property + tic. In order to make it possible to check this, a Unicode property called Script Extension exists. Its value is a list of scripts that ap- ply to the character. For the majority of characters, the list contains - just one script, the same one as the Script property. However, for - characters such as U+102E0 more than one Script is listed. There are - also some Common characters that have a single, non-Common script in + just one script, the same one as the Script property. However, for + characters such as U+102E0 more than one Script is listed. 
There are + also some Common characters that have a single, non-Common script in their Script Extension list. The next section describes the basic rules for deciding whether a given - string of characters is a script run. Note, however, that there are - some special cases involving the Chinese Han script, and an additional - constraint for decimal digits. These are covered in subsequent sec- + string of characters is a script run. Note, however, that there are + some special cases involving the Chinese Han script, and an additional + constraint for decimal digits. These are covered in subsequent sec- tions. Basic script run rules A string that is less than two characters long is a script run. This is - the only case in which an Unknown character can be part of a script - run. Longer strings are checked using only the Script Extensions prop- + the only case in which an Unknown character can be part of a script + run. Longer strings are checked using only the Script Extensions prop- erty, not the basic Script property. - If a character's Script Extension property is the single value "Inher- + If a character's Script Extension property is the single value "Inher- ited", it is always accepted as part of a script run. This is also true - for the property "Common", subject to the checking of decimal digits + for the property "Common", subject to the checking of decimal digits described below. All the remaining characters in a script run must have - at least one script in common in their Script Extension lists. In set- + at least one script in common in their Script Extension lists. In set- theoretic terminology, the intersection of all the sets of scripts must not be empty. - A simple example is an Internet name such as "google.com". The letters + A simple example is an Internet name such as "google.com". The letters are all in the Latin script, and the dot is Common, so this string is a script run. 
However, the Cyrillic letter "o" looks exactly the same as - the Latin "o"; a string that looks the same, but with Cyrillic "o"s is + the Latin "o"; a string that looks the same, but with Cyrillic "o"s is not a script run. - More interesting examples involve characters with more than one script + More interesting examples involve characters with more than one script in their Script Extension. Consider the following characters: U+060C Arabic comma U+06D4 Arabic full stop - The first has the Script Extension list Arabic, Hanifi Rohingya, Syr- - iac, and Thaana; the second has just Arabic and Hanifi Rohingya. Both - of them could appear in script runs of either Arabic or Hanifi Ro- - hingya. The first could also appear in Syriac or Thaana script runs, + The first has the Script Extension list Arabic, Hanifi Rohingya, Syr- + iac, and Thaana; the second has just Arabic and Hanifi Rohingya. Both + of them could appear in script runs of either Arabic or Hanifi Ro- + hingya. The first could also appear in Syriac or Thaana script runs, but the second could not. The Chinese Han script - The Chinese Han script is commonly used in conjunction with other - scripts for writing certain languages. Japanese uses the Hiragana and - Katakana scripts together with Han; Korean uses Hangul and Han; Tai- - wanese Mandarin uses Bopomofo and Han. These three combinations are - treated as special cases when checking script runs and are, in effect, - "virtual scripts". Thus, a script run may contain a mixture of Hira- - gana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture - of Bopomofo and Han, but not, for example, a mixture of Hangul and - Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Stan- - dard 39 ("Unicode Security Mechanisms", http://unicode.org/re- + The Chinese Han script is commonly used in conjunction with other + scripts for writing certain languages. 
Japanese uses the Hiragana and + Katakana scripts together with Han; Korean uses Hangul and Han; Tai- + wanese Mandarin uses Bopomofo and Han. These three combinations are + treated as special cases when checking script runs and are, in effect, + "virtual scripts". Thus, a script run may contain a mixture of Hira- + gana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture + of Bopomofo and Han, but not, for example, a mixture of Hangul and + Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Stan- + dard 39 ("Unicode Security Mechanisms", http://unicode.org/re- ports/tr39/) in allowing such mixtures. Decimal digits - Unicode contains many sets of 10 decimal digits in different scripts, - and some scripts (including the Common script) contain more than one - set. Some of these decimal digits them are visually indistinguishable - from the common ASCII digits. In addition to the script checking de- - scribed above, if a script run contains any decimal digits, they must + Unicode contains many sets of 10 decimal digits in different scripts, + and some scripts (including the Common script) contain more than one + set. Some of these decimal digits are visually indistinguishable + from the common ASCII digits. In addition to the script checking de- + scribed above, if a script run contains any decimal digits, they must all come from the same set of 10 adjacent characters. VALIDITY OF UTF STRINGS - When the PCRE2_UTF option is set, the strings passed as patterns and + When the PCRE2_UTF option is set, the strings passed as patterns and subjects are (by default) checked for validity on entry to the relevant functions. If an invalid UTF string is passed, a negative error code is - returned. The code unit offset to the offending character can be ex- - tracted from the match data block by calling pcre2_get_startchar(), + returned.
The code unit offset to the offending character can be ex- + tracted from the match data block by calling pcre2_get_startchar(), which is used for this purpose after a UTF error. - In some situations, you may already know that your strings are valid, - and therefore want to skip these checks in order to improve perfor- - mance, for example in the case of a long subject string that is being - scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com- - pile time or at match time, PCRE2 assumes that the pattern or subject + In some situations, you may already know that your strings are valid, + and therefore want to skip these checks in order to improve perfor- + mance, for example in the case of a long subject string that is being + scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com- + pile time or at match time, PCRE2 assumes that the pattern or subject it is given (respectively) contains only valid UTF code unit sequences. - If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the - result is undefined and your program may crash or loop indefinitely or - give incorrect results. There is, however, one mode of matching that - can handle invalid UTF subject strings. This is enabled by passing - PCRE2_MATCH_INVALID_UTF to pcre2_compile() and is discussed below in - the next section. The rest of this section covers the case when + If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the + result is undefined and your program may crash or loop indefinitely or + give incorrect results. There is, however, one mode of matching that + can handle invalid UTF subject strings. This is enabled by passing + PCRE2_MATCH_INVALID_UTF to pcre2_compile() and is discussed below in + the next section. The rest of this section covers the case when PCRE2_MATCH_INVALID_UTF is not set. - Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF - check for the pattern; it does not also apply to subject strings. 
If - you want to disable the check for a subject string you must pass this + Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF + check for the pattern; it does not also apply to subject strings. If + you want to disable the check for a subject string you must pass this same option to pcre2_match() or pcre2_dfa_match(). UTF-16 and UTF-32 strings can indicate their endianness by special code - knows as a byte-order mark (BOM). The PCRE2 functions do not handle + known as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order. - Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any + Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any other processing takes place. In the case of pcre2_match() and - pcre2_dfa_match() calls with a non-zero starting offset, the check is + pcre2_dfa_match() calls with a non-zero starting offset, the check is applied only to that part of the subject that could be inspected during - matching, and there is a check that the starting offset points to the - first code unit of a character or to the end of the subject. If there - are no lookbehind assertions in the pattern, the check starts at the - starting offset. Otherwise, it starts at the length of the longest - lookbehind before the starting offset, or at the start of the subject - if there are not that many characters before the starting offset. Note + matching, and there is a check that the starting offset points to the + first code unit of a character or to the end of the subject. If there + are no lookbehind assertions in the pattern, the check starts at the + starting offset. Otherwise, it starts at the length of the longest + lookbehind before the starting offset, or at the start of the subject + if there are not that many characters before the starting offset. Note that the sequences \b and \B are one-character lookbehinds.
- In addition to checking the format of the string, there is a check to + In addition to checking the format of the string, there is a check to ensure that all code points lie in the range U+0 to U+10FFFF, excluding - the surrogate area. The so-called "non-character" code points are not + the surrogate area. The so-called "non-character" code points are not excluded because Unicode corrigendum #9 makes it clear that they should not be. - Characters in the "Surrogate Area" of Unicode are reserved for use by - UTF-16, where they are used in pairs to encode code points with values - greater than 0xFFFF. The code points that are encoded by UTF-16 pairs - are available independently in the UTF-8 and UTF-32 encodings. (In - other words, the whole surrogate thing is a fudge for UTF-16 which un- + Characters in the "Surrogate Area" of Unicode are reserved for use by + UTF-16, where they are used in pairs to encode code points with values + greater than 0xFFFF. The code points that are encoded by UTF-16 pairs + are available independently in the UTF-8 and UTF-32 encodings. (In + other words, the whole surrogate thing is a fudge for UTF-16 which un- fortunately messes up UTF-8 and UTF-32.) - Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error - that is given if an escape sequence for an invalid Unicode code point - is encountered in the pattern. If you want to allow escape sequences - such as \x{d800} (a surrogate code point) you can set the PCRE2_EX- - TRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible - only in UTF-8 and UTF-32 modes, because these values are not repre- + Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error + that is given if an escape sequence for an invalid Unicode code point + is encountered in the pattern. If you want to allow escape sequences + such as \x{d800} (a surrogate code point) you can set the PCRE2_EX- + TRA_ALLOW_SURROGATE_ESCAPES extra option. 
However, this is possible + only in UTF-8 and UTF-32 modes, because these values are not repre- sentable in UTF-16. Errors in UTF-8 strings @@ -11826,10 +12542,10 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR4 PCRE2_ERROR_UTF8_ERR5 - The string ends with a truncated UTF-8 character; the code specifies - how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 - characters to be no longer than 4 bytes, the encoding scheme (origi- - nally defined by RFC 2279) allows for up to 6 bytes, and this is + The string ends with a truncated UTF-8 character; the code specifies + how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 + characters to be no longer than 4 bytes, the encoding scheme (origi- + nally defined by RFC 2279) allows for up to 6 bytes, and this is checked first; hence the possibility of 4 or 5 missing bytes. PCRE2_ERROR_UTF8_ERR6 @@ -11839,13 +12555,13 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR10 The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of - the character do not have the binary value 0b10 (that is, either the + the character do not have the binary value 0b10 (that is, either the most significant bit is 0, or the next bit is 1). PCRE2_ERROR_UTF8_ERR11 PCRE2_ERROR_UTF8_ERR12 - A character that is valid by the RFC 2279 rules is either 5 or 6 bytes + A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; these code points are excluded by RFC 3629. PCRE2_ERROR_UTF8_ERR13 @@ -11855,8 +12571,8 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR14 - A 3-byte character has a value in the range 0xd800 to 0xdfff; this - range of code points are reserved by RFC 3629 for use with UTF-16, and + A 3-byte character has a value in the range 0xd800 to 0xdfff; this + range of code points are reserved by RFC 3629 for use with UTF-16, and so are excluded from UTF-8. 
PCRE2_ERROR_UTF8_ERR15 @@ -11865,26 +12581,26 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR18 PCRE2_ERROR_UTF8_ERR19 - A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes - for a value that can be represented by fewer bytes, which is invalid. - For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- + A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes + for a value that can be represented by fewer bytes, which is invalid. + For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- rect coding uses just one byte. PCRE2_ERROR_UTF8_ERR20 The two most significant bits of the first byte of a character have the - binary value 0b10 (that is, the most significant bit is 1 and the sec- - ond is 0). Such a byte can only validly occur as the second or subse- + binary value 0b10 (that is, the most significant bit is 1 and the sec- + ond is 0). Such a byte can only validly occur as the second or subse- quent byte of a multi-byte character. PCRE2_ERROR_UTF8_ERR21 - The first byte of a character has the value 0xfe or 0xff. These values + The first byte of a character has the value 0xfe or 0xff. These values can never occur in a valid UTF-8 string. Errors in UTF-16 strings - The following negative error codes are given for invalid UTF-16 + The following negative error codes are given for invalid UTF-16 strings: PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string @@ -11894,7 +12610,7 @@ VALIDITY OF UTF STRINGS Errors in UTF-32 strings - The following negative error codes are given for invalid UTF-32 + The following negative error codes are given for invalid UTF-32 strings: PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) @@ -11904,60 +12620,60 @@ VALIDITY OF UTF STRINGS MATCHING IN INVALID UTF STRINGS You can run pattern matches on subject strings that may contain invalid - UTF sequences if you call pcre2_compile() with the PCRE2_MATCH_IN- - VALID_UTF option. 
This is supported by pcre2_match(), including JIT + UTF sequences if you call pcre2_compile() with the PCRE2_MATCH_IN- + VALID_UTF option. This is supported by pcre2_match(), including JIT matching, but not by pcre2_dfa_match(). When PCRE2_MATCH_INVALID_UTF is - set, it forces PCRE2_UTF to be set as well. Note, however, that the + set, it forces PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a valid UTF string. - If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile, - and you are not certain that your subject strings are valid UTF se- - quences, you should not make use of the JIT "fast path" function - pcre2_jit_match() because it bypasses sanity checks, including the one - for UTF validity. An invalid string may cause undefined behaviour, in- + If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile, + and you are not certain that your subject strings are valid UTF se- + quences, you should not make use of the JIT "fast path" function + pcre2_jit_match() because it bypasses sanity checks, including the one + for UTF validity. An invalid string may cause undefined behaviour, in- cluding looping, crashing, or giving the wrong answer. - Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() - generates, but if pcre2_jit_compile() is subsequently called, it does + Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() + generates, but if pcre2_jit_compile() is subsequently called, it does generate different code. If JIT is not used, the option affects the be- haviour of the interpretive code in pcre2_match(). When PCRE2_MATCH_IN- - VALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at + VALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time. - In this mode, an invalid code unit sequence in the subject never - matches any pattern item. It does not match dot, it does not match - \p{Any}, it does not even match negative items such as [^X]. 
A lookbe- - hind assertion fails if it encounters an invalid sequence while moving - the current point backwards. In other words, an invalid UTF code unit + In this mode, an invalid code unit sequence in the subject never + matches any pattern item. It does not match dot, it does not match + \p{Any}, it does not even match negative items such as [^X]. A lookbe- + hind assertion fails if it encounters an invalid sequence while moving + the current point backwards. In other words, an invalid UTF code unit sequence acts as a barrier which no match can cross. You can also think of this as the subject being split up into fragments - of valid UTF, delimited internally by invalid code unit sequences. The - pattern is matched fragment by fragment. The result of a successful - match, however, is given as code unit offsets in the entire subject + of valid UTF, delimited internally by invalid code unit sequences. The + pattern is matched fragment by fragment. The result of a successful + match, however, is given as code unit offsets in the entire subject string in the usual way. There are a few points to consider: - The internal boundaries are not interpreted as the beginnings or ends - of lines and so do not match circumflex or dollar characters in the + The internal boundaries are not interpreted as the beginnings or ends + of lines and so do not match circumflex or dollar characters in the pattern. - If pcre2_match() is called with an offset that points to an invalid - UTF-sequence, that sequence is skipped, and the match starts at the + If pcre2_match() is called with an offset that points to an invalid + UTF-sequence, that sequence is skipped, and the match starts at the next valid UTF character, or the end of the subject. At internal fragment boundaries, \b and \B behave in the same way as at - the beginning and end of the subject. 
For example, a sequence such as - \bWORD\b would match an instance of WORD that is surrounded by invalid + the beginning and end of the subject. For example, a sequence such as + \bWORD\b would match an instance of WORD that is surrounded by invalid UTF code units. - Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbi- - trary data, knowing that any matched strings that are returned are + Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbi- + trary data, knowing that any matched strings that are returned are valid UTF. This can be useful when searching for UTF text in executable or other binary files. - Note, however, that the 16-bit and 32-bit PCRE2 libraries process - strings as sequences of uint16_t or uint32_t code points. They cannot - find valid UTF sequences within an arbitrary string of bytes unless + Note, however, that the 16-bit and 32-bit PCRE2 libraries process + strings as sequences of uint16_t or uint32_t code points. They cannot + find valid UTF sequences within an arbitrary string of bytes unless such sequences are suitably aligned. @@ -11970,11 +12686,11 @@ AUTHOR REVISION - Last updated: 12 October 2023 - Copyright (c) 1997-2023 University of Cambridge. + Last updated: 27 November 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 04 February 2023 PCRE2UNICODE(3) +PCRE2 10.45 27 November 2024 PCRE2UNICODE(3) ------------------------------------------------------------------------------ diff --git a/mingw64/share/doc/pcre2/pcre2grep.txt b/mingw64/share/doc/pcre2/pcre2grep.txt index 7914c450fcb..9e07a5a7dac 100644 --- a/mingw64/share/doc/pcre2/pcre2grep.txt +++ b/mingw64/share/doc/pcre2/pcre2grep.txt @@ -1,4 +1,3 @@ - PCRE2GREP(1) General Commands Manual PCRE2GREP(1) @@ -366,139 +365,140 @@ OPTIONS used. What constitutes a newline when reading the file is the operating system's default interpretation of \n. The --new- line option has no effect on this option. 
Trailing white - space is removed from each line, and blank lines are ignored. - An empty file contains no patterns and therefore matches - nothing. Patterns read from a file in this way may contain - binary zeros, which are treated as ordinary data characters. - - If this option is given more than once, all the specified - files are read. A data line is output if any of the patterns - match it. A file name can be given as "-" to refer to the - standard input. When -f is used, patterns specified on the - command line using -e may also be present; they are matched + space is removed from each line, and blank lines are ignored + unless the --posix-pattern-file option is also provided. An + empty file contains no patterns and therefore matches noth- + ing. Patterns read from a file in this way may contain binary + zeros, which are treated as ordinary character literals. + + If this option is given more than once, all the specified + files are read. A data line is output if any of the patterns + match it. A file name can be given as "-" to refer to the + standard input. When -f is used, patterns specified on the + command line using -e may also be present; they are matched before the file's patterns. However, no pattern is taken from - the command line; all arguments are treated as the names of + the command line; all arguments are treated as the names of paths to be searched. --file-list=filename - Read a list of files and/or directories that are to be + Read a list of files and/or directories that are to be scanned from the given file, one per line. What constitutes a - newline when reading the file is the operating system's de- - fault. Trailing white space is removed from each line, and + newline when reading the file is the operating system's de- + fault. Trailing white space is removed from each line, and blank lines are ignored. These paths are processed before any - that are listed on the command line. 
The file name can be - given as "-" to refer to the standard input. If --file and - --file-list are both specified as "-", patterns are read - first. This is useful only when the standard input is a ter- - minal, from which further lines (the list of files) can be + that are listed on the command line. The file name can be + given as "-" to refer to the standard input. If --file and + --file-list are both specified as "-", patterns are read + first. This is useful only when the standard input is a ter- + minal, from which further lines (the list of files) can be read after an end-of-file indication. If this option is given more than once, all the specified files are read. --file-offsets - Instead of showing lines or parts of lines that match, show - each match as an offset from the start of the file and a - length, separated by a comma. In this mode, --colour has no - effect, and no context is shown. That is, the -A, -B, and -C - options are ignored. If there is more than one match in a - line, each of them is shown separately. This option is mutu- - ally exclusive with --output, --line-offsets, and --only- + Instead of showing lines or parts of lines that match, show + each match as an offset from the start of the file and a + length, separated by a comma. In this mode, --colour has no + effect, and no context is shown. That is, the -A, -B, and -C + options are ignored. If there is more than one match in a + line, each of them is shown separately. This option is mutu- + ally exclusive with --output, --line-offsets, and --only- matching. --group-separator=text Output this text string instead of two hyphens between groups - of lines when -A, -B, or -C is in use. See also --no-group- + of lines when -A, -B, or -C is in use. See also --no-group- separator. -H, --with-filename - Force the inclusion of the file name at the start of output + Force the inclusion of the file name at the start of output lines when searching a single file. 
The file name is not nor- - mally shown in this case. By default, for matching lines, - the file name is followed by a colon; for context lines, a + mally shown in this case. By default, for matching lines, + the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name. When the -M option causes a - pattern to match more than one line, only the first is pre- - ceded by the file name. This option overrides any previous + pattern to match more than one line, only the first is pre- + ceded by the file name. This option overrides any previous -h, -l, or -L options. -h, --no-filename Suppress the output file names when searching multiple files. File names are normally shown when multiple files are - searched. By default, for matching lines, the file name is + searched. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a - zero byte. If a line number is also being output, it follows + zero byte. If a line number is also being output, it follows the file name. This option overrides any previous -H, -L, or -l options. --heap-limit=number See --match-limit below. - --help Output a help message, giving brief details of the command - options and file type support, and then exit. Anything else + --help Output a help message, giving brief details of the command + options and file type support, and then exit. Anything else on the command line is ignored. - -I Ignore binary files. This is equivalent to --binary- + -I Ignore binary files. This is equivalent to --binary- files=without-match. -i, --ignore-case - Ignore upper/lower case distinctions when pattern matching. + Ignore upper/lower case distinctions when pattern matching. 
This applies when matching path names for inclusion or exclu- sion as well as when matching lines in files. --include=pattern - If any --include patterns are specified, the only files that + If any --include patterns are specified, the only files that are processed are those whose names match one of the patterns - and do not match an --exclude pattern. This option does not - affect directories, but it applies to all files, whether - listed on the command line, obtained from --file-list, or by - scanning a directory. The pattern is a PCRE2 regular expres- - sion, and is matched against the final component of the file - name, not the entire path. The -F, -w, and -x options do not - apply to this pattern. The option may be given any number of - times. If a file name matches both an --include and an --ex- - clude pattern, it is excluded. There is no short form for + and do not match an --exclude pattern. This option does not + affect directories, but it applies to all files, whether + listed on the command line, obtained from --file-list, or by + scanning a directory. The pattern is a PCRE2 regular expres- + sion, and is matched against the final component of the file + name, not the entire path. The -F, -w, and -x options do not + apply to this pattern. The option may be given any number of + times. If a file name matches both an --include and an --ex- + clude pattern, it is excluded. There is no short form for this option. --include-from=filename - Treat each non-empty line of the file as the data for an + Treat each non-empty line of the file as the data for an --include option. What constitutes a newline for this purpose - is the operating system's default. The --newline option has + is the operating system's default. The --newline option has no effect on this option. This option may be given any number of times; all the files are read. 
--include-dir=pattern - If any --include-dir patterns are specified, the only direc- - tories that are processed are those whose names match one of - the patterns and do not match an --exclude-dir pattern. This - applies to all directories, whether listed on the command - line, obtained from --file-list, or by scanning a parent di- - rectory. The pattern is a PCRE2 regular expression, and is - matched against the final component of the directory name, - not the entire path. The -F, -w, and -x options do not apply + If any --include-dir patterns are specified, the only direc- + tories that are processed are those whose names match one of + the patterns and do not match an --exclude-dir pattern. This + applies to all directories, whether listed on the command + line, obtained from --file-list, or by scanning a parent di- + rectory. The pattern is a PCRE2 regular expression, and is + matched against the final component of the directory name, + not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times. - If a directory matches both --include-dir and --exclude-dir, + If a directory matches both --include-dir and --exclude-dir, it is excluded. There is no short form for this option. -L, --files-without-match - Instead of outputting lines from the files, just output the - names of the files that do not contain any lines that would - have been output. Each file name is output once, on a sepa- - rate line by default, but if the -Z option is set, they are - separated by zero bytes instead of newlines. This option + Instead of outputting lines from the files, just output the + names of the files that do not contain any lines that would + have been output. Each file name is output once, on a sepa- + rate line by default, but if the -Z option is set, they are + separated by zero bytes instead of newlines. This option overrides any previous -H, -h, or -l options. 
-l, --files-with-matches - Instead of outputting lines from the files, just output the + Instead of outputting lines from the files, just output the names of the files containing lines that would have been out- - put. Each file name is output once, on a separate line, but + put. Each file name is output once, on a separate line, but if the -Z option is set, they are separated by zero bytes in- - stead of newlines. Searching normally stops as soon as a - matching line is found in a file. However, if the -c (count) - option is also used, matching continues in order to obtain - the correct count, and those files that have at least one - match are listed along with their counts. Using this option - with -c is a way of suppressing the listing of files with no + stead of newlines. Searching normally stops as soon as a + matching line is found in a file. However, if the -c (count) + option is also used, matching continues in order to obtain + the correct count, and those files that have at least one + match are listed along with their counts. Using this option + with -c is a way of suppressing the listing of files with no matches that occurs with -c on its own. This option overrides any previous -H, -h, or -L options. @@ -508,130 +508,130 @@ OPTIONS input)" is used. There is no short form for this option. --line-buffered - When this option is given, non-compressed input is read and - processed line by line, and the output is flushed after each - write. By default, input is read in large chunks, unless - pcre2grep can determine that it is reading from a terminal, + When this option is given, non-compressed input is read and + processed line by line, and the output is flushed after each + write. By default, input is read in large chunks, unless + pcre2grep can determine that it is reading from a terminal, which is currently possible only in Unix-like environments or Windows. Output to terminal is normally automatically flushed - by the operating system. 
This option can be useful when the - input or output is attached to a pipe and you do not want - pcre2grep to buffer up large amounts of data. However, its - use will affect performance, and the -M (multiline) option - ceases to work. When input is from a compressed .gz or .bz2 + by the operating system. This option can be useful when the + input or output is attached to a pipe and you do not want + pcre2grep to buffer up large amounts of data. However, its + use will affect performance, and the -M (multiline) option + ceases to work. When input is from a compressed .gz or .bz2 file, --line-buffered is ignored. --line-offsets - Instead of showing lines or parts of lines that match, show + Instead of showing lines or parts of lines that match, show each match as a line number, the offset from the start of the - line, and a length. The line number is terminated by a colon - (as usual; see the -n option), and the offset and length are - separated by a comma. In this mode, --colour has no effect, - and no context is shown. That is, the -A, -B, and -C options - are ignored. If there is more than one match in a line, each - of them is shown separately. This option is mutually exclu- + line, and a length. The line number is terminated by a colon + (as usual; see the -n option), and the offset and length are + separated by a comma. In this mode, --colour has no effect, + and no context is shown. That is, the -A, -B, and -C options + are ignored. If there is more than one match in a line, each + of them is shown separately. This option is mutually exclu- sive with --output, --file-offsets, and --only-matching. --locale=locale-name - This option specifies a locale to be used for pattern match- - ing. It overrides the value in the LC_ALL or LC_CTYPE envi- - ronment variables. If no locale is specified, the PCRE2 li- + This option specifies a locale to be used for pattern match- + ing. It overrides the value in the LC_ALL or LC_CTYPE envi- + ronment variables. 
If no locale is specified, the PCRE2 li- brary's default (usually the "C" locale) is used. There is no short form for this option. -M, --multiline - Allow patterns to match more than one line. When this option - is set, the PCRE2 library is called in "multiline" mode, and - a match is allowed to continue past the end of the initial + Allow patterns to match more than one line. When this option + is set, the PCRE2 library is called in "multiline" mode, and + a match is allowed to continue past the end of the initial line and onto one or more subsequent lines. - Patterns used with -M may usefully contain literal newline - characters and internal occurrences of ^ and $ characters, - because in multiline mode these can match at internal new- - lines. Because pcre2grep is scanning multiple lines, the \Z - and \z assertions match only at the end of the last line in + Patterns used with -M may usefully contain literal newline + characters and internal occurrences of ^ and $ characters, + because in multiline mode these can match at internal new- + lines. Because pcre2grep is scanning multiple lines, the \Z + and \z assertions match only at the end of the last line in the file. The \A assertion matches at the start of the first - line of a match. This can be any line in the file; it is not + line of a match. This can be any line in the file; it is not anchored to the first line. - The output for a successful match may consist of more than - one line. The first line is the line in which the match - started, and the last line is the line in which the match - ended. If the matched string ends with a newline sequence, - the output ends at the end of that line. If -v is set, none - of the lines in a multi-line match are output. Once a match - has been handled, scanning restarts at the beginning of the + The output for a successful match may consist of more than + one line. 
The first line is the line in which the match + started, and the last line is the line in which the match + ended. If the matched string ends with a newline sequence, + the output ends at the end of that line. If -v is set, none + of the lines in a multi-line match are output. Once a match + has been handled, scanning restarts at the beginning of the line after the one in which the match ended. - The newline sequence that separates multiple lines must be - matched as part of the pattern. For example, to find the - phrase "regular expression" in a file where "regular" might - be at the end of a line and "expression" at the start of the + The newline sequence that separates multiple lines must be + matched as part of the pattern. For example, to find the + phrase "regular expression" in a file where "regular" might + be at the end of a line and "expression" at the start of the next line, you could use this command: pcre2grep -M 'regular\s+expression' The \s escape sequence matches any white space character, in- - cluding newlines, and is followed by + so as to match trail- - ing white space on the first line as well as possibly han- + cluding newlines, and is followed by + so as to match trail- + ing white space on the first line as well as possibly han- dling a two-character newline sequence. - There is a limit to the number of lines that can be matched, - imposed by the way that pcre2grep buffers the input file as - it scans it. With a sufficiently large processing buffer, + There is a limit to the number of lines that can be matched, + imposed by the way that pcre2grep buffers the input file as + it scans it. With a sufficiently large processing buffer, this should not be a problem. - The -M option does not work when input is read line by line + The -M option does not work when input is read line by line (see --line-buffered.) -m number, --max-count=number - Stop processing after finding number matching lines, or non- - matching lines if -v is also set. 
Any trailing context lines - are output after the final match. In multiline mode, each - multiline match counts as just one line for this purpose. If - this limit is reached when reading the standard input from a + Stop processing after finding number matching lines, or non- + matching lines if -v is also set. Any trailing context lines + are output after the final match. In multiline mode, each + multiline match counts as just one line for this purpose. If + this limit is reached when reading the standard input from a regular file, the file is left positioned just after the last - matching line. If -c is also set, the count that is output - is never greater than number. This option has no effect if + matching line. If -c is also set, the count that is output + is never greater than number. This option has no effect if used with -L, -l, or -q, or when just checking for a match in a binary file. --match-limit=number - Processing some regular expression patterns may take a very + Processing some regular expression patterns may take a very long time to search for all possible matching strings. Others - may require a very large amount of memory. There are three + may require a very large amount of memory. There are three options that set resource limits for matching. The --match-limit option provides a means of limiting comput- - ing resource usage when processing patterns that are not go- + ing resource usage when processing patterns that are not go- ing to match, but which have a very large number of possibil- ities in their search trees. The classic example is a pattern - that uses nested unlimited repeats. Internally, PCRE2 has a - counter that is incremented each time around its main pro- - cessing loop. If the value set by --match-limit is reached, + that uses nested unlimited repeats. Internally, PCRE2 has a + counter that is incremented each time around its main pro- + cessing loop. If the value set by --match-limit is reached, an error occurs. 
- The --heap-limit option specifies, as a number of kibibytes + The --heap-limit option specifies, as a number of kibibytes (units of 1024 bytes), the maximum amount of heap memory that may be used for matching. - The --depth-limit option limits the depth of nested back- + The --depth-limit option limits the depth of nested back- tracking points, which indirectly limits the amount of memory that is used. The amount of memory needed for each backtrack- - ing point depends on the number of capturing parentheses in + ing point depends on the number of capturing parentheses in the pattern, so the amount of memory that is used before this - limit acts varies from pattern to pattern. This limit is of + limit acts varies from pattern to pattern. This limit is of use only if it is set smaller than --match-limit. - There are no short forms for these options. The default lim- - its can be set when the PCRE2 library is compiled; if they - are not specified, the defaults are very large and so effec- + There are no short forms for these options. The default lim- + its can be set when the PCRE2 library is compiled; if they + are not specified, the defaults are very large and so effec- tively unlimited. --max-buffer-size=number - This limits the expansion of the processing buffer, whose - initial size can be set by --buffer-size. The maximum buffer - size is silently forced to be no smaller than the starting + This limits the expansion of the processing buffer, whose + initial size can be set by --buffer-size. The maximum buffer + size is silently forced to be no smaller than the starting buffer size. -N newline-type, --newline=newline-type @@ -640,72 +640,72 @@ OPTIONS pcre2grep -N CRLF 'some pattern' - The newline type may be specified in upper, lower, or mixed - case. If the newline type is NUL, lines are separated by bi- - nary zero characters. 
The other types are the single-charac- - ter sequences CR (carriage return) and LF (linefeed), the - two-character sequence CRLF, an "anycrlf" type, which recog- - nizes any of the preceding three types, and an "any" type, - for which any Unicode line ending sequence is assumed to end - a line. The Unicode sequences are the three just mentioned, - plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL - (next line, U+0085), LS (line separator, U+2028), and PS + The newline type may be specified in upper, lower, or mixed + case. If the newline type is NUL, lines are separated by bi- + nary zero characters. The other types are the single-charac- + ter sequences CR (carriage return) and LF (linefeed), the + two-character sequence CRLF, an "anycrlf" type, which recog- + nizes any of the preceding three types, and an "any" type, + for which any Unicode line ending sequence is assumed to end + a line. The Unicode sequences are the three just mentioned, + plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL + (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). - When the PCRE2 library is built, a default line-ending se- - quence is specified. This is normally the standard sequence - for the operating system. Unless otherwise specified by this + When the PCRE2 library is built, a default line-ending se- + quence is specified. This is normally the standard sequence + for the operating system. Unless otherwise specified by this option, pcre2grep uses the library's default. - This option makes it possible to use pcre2grep to scan files + This option makes it possible to use pcre2grep to scan files that have come from other environments without having to mod- - ify their line endings. If the data that is being scanned - does not agree with the convention set by this option, - pcre2grep may behave in strange ways. 
Note that this option - does not apply to files specified by the -f, --exclude-from, - or --include-from options, which are expected to use the op- + ify their line endings. If the data that is being scanned + does not agree with the convention set by this option, + pcre2grep may behave in strange ways. Note that this option + does not apply to files specified by the -f, --exclude-from, + or --include-from options, which are expected to use the op- erating system's standard newline sequence. -n, --line-number Precede each output line by its line number in the file, fol- - lowed by a colon for matching lines or a hyphen for context + lowed by a colon for matching lines or a hyphen for context lines. If the file name is also being output, it precedes the - line number. When the -M option causes a pattern to match - more than one line, only the first is preceded by its line + line number. When the -M option causes a pattern to match + more than one line, only the first is preceded by its line number. This option is forced if --line-offsets is used. --no-group-separator - Do not output a separator between groups of lines when -A, + Do not output a separator between groups of lines when -A, -B, or -C is in use. The default is to output a line contain- ing two hyphens. See also --group-separator. - --no-jit If the PCRE2 library is built with support for just-in-time + --no-jit If the PCRE2 library is built with support for just-in-time compiling (which speeds up matching), pcre2grep automatically makes use of this, unless it was explicitly disabled at build - time. This option can be used to disable the use of JIT at + time. This option can be used to disable the use of JIT at run time. It is provided for testing and working around prob- lems. It should never be needed in normal use. 
-O text, --output=text - When there is a match, instead of outputting the line that - matched, output just the text specified in this option, fol- - lowed by an operating-system standard newline. In this mode, - --colour has no effect, and no context is shown. That is, - the -A, -B, and -C options are ignored. The --newline option - has no effect on this option, which is mutually exclusive + When there is a match, instead of outputting the line that + matched, output just the text specified in this option, fol- + lowed by an operating-system standard newline. In this mode, + --colour has no effect, and no context is shown. That is, + the -A, -B, and -C options are ignored. The --newline option + has no effect on this option, which is mutually exclusive with --only-matching, --file-offsets, and --line-offsets. - However, like --only-matching, if there is more than one + However, like --only-matching, if there is more than one match in a line, each of them causes a line of output. Escape sequences starting with a dollar character may be used to insert the contents of the matched part of the line and/or captured substrings into the text. - $<digits> or ${<digits>} is replaced by the captured sub- - string of the given decimal number; zero substitutes the - whole match. If the number is greater than the number of cap- - turing substrings, or if the capture is unset, the replace- - ment is empty. + $<digits> or ${<digits>} is replaced by the captured sub- + string of the given decimal number; $& (or the legacy $0) + substitutes the whole match. If the number is greater than + the number of capturing substrings, or if the capture is un- + set, the replacement is empty. $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by newline; $r by carriage return; $t by tab; @@ -787,93 +787,103 @@ OPTIONS mode, the sequence (?aP) restricts [:word:] to ASCII letters, while allowing \w to match Unicode letters and digits. 
+ --posix-pattern-file + When patterns are provided with the -f option, do not trim + trailing spaces or ignore empty lines in a similar way to + other grep tools. To keep the behaviour consistent with older + versions, if the pattern read was terminated with CRLF (as + character literals) then both characters won't be included as + part of it, so if you really need to have a pattern ending in + '\r', use an escape sequence or provide it by a different + method. + -q, --quiet Work quietly, that is, display nothing except error messages. - The exit status indicates whether or not any matches were + The exit status indicates whether or not any matches were found. -r, --recursive - If any given path is a directory, recursively scan the files - it contains, taking note of any --include and --exclude set- - tings. By default, a directory is read as a normal file; in - some operating systems this gives an immediate end-of-file. - This option is a shorthand for setting the -d option to "re- + If any given path is a directory, recursively scan the files + it contains, taking note of any --include and --exclude set- + tings. By default, a directory is read as a normal file; in + some operating systems this gives an immediate end-of-file. + This option is a shorthand for setting the -d option to "re- curse". --recursion-limit=number - This is an obsolete synonym for --depth-limit. See --match- + This is an obsolete synonym for --depth-limit. See --match- limit above for details. -s, --no-messages - Suppress error messages about non-existent or unreadable - files. Such files are quietly skipped. However, the return + Suppress error messages about non-existent or unreadable + files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. -t, --total-count - This option is useful when scanning more than one file. 
If - used on its own, -t suppresses all output except for a grand - total number of matching lines (or non-matching lines if -v + This option is useful when scanning more than one file. If + used on its own, -t suppresses all output except for a grand + total number of matching lines (or non-matching lines if -v is used) in all the files. If -t is used with -c, a grand to- - tal is output except when the previous output is just one - line. In other words, it is not output when just one file's - count is listed. If file names are being output, the grand - total is preceded by "TOTAL:". Otherwise, it appears as just - another number. The -t option is ignored when used with -L - (list files without matches), because the grand total would + tal is output except when the previous output is just one + line. In other words, it is not output when just one file's + count is listed. If file names are being output, the grand + total is preceded by "TOTAL:". Otherwise, it appears as just + another number. The -t option is ignored when used with -L + (list files without matches), because the grand total would always be zero. -u, --utf Operate in UTF/Unicode mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (in- - cluding those for any --exclude and --include options) and - all lines that are scanned must be valid strings of UTF-8 + cluding those for any --exclude and --include options) and + all lines that are scanned must be valid strings of UTF-8 characters. If an invalid UTF-8 string is encountered, an er- ror occurs. -U, --utf-allow-invalid - As --utf, but in addition subject lines may contain invalid - UTF-8 code unit sequences. These can never form part of any - pattern match. Patterns themselves, however, must still be + As --utf, but in addition subject lines may contain invalid + UTF-8 code unit sequences. These can never form part of any + pattern match. Patterns themselves, however, must still be valid UTF-8 strings. 
This facility allows valid UTF-8 strings to be sought within arbitrary byte sequences in executable or - other binary files. For more details about matching in non- + other binary files. For more details about matching in non- valid UTF-8 strings, see the pcre2unicode(3) documentation. -V, --version - Write the version numbers of pcre2grep and the PCRE2 library - to the standard output and then exit. Anything else on the + Write the version numbers of pcre2grep and the PCRE2 library + to the standard output and then exit. Anything else on the command line is ignored. -v, --invert-match - Invert the sense of the match, so that lines which do not - match any of the patterns are the ones that are found. When - this option is set, options such as --only-matching and - --output, which specify parts of a match that are to be out- + Invert the sense of the match, so that lines which do not + match any of the patterns are the ones that are found. When + this option is set, options such as --only-matching and + --output, which specify parts of a match that are to be out- put, are ignored. -w, --word-regex, --word-regexp Force the patterns only to match "words". That is, there must - be a word boundary at the start and end of each matched - string. This is equivalent to having "\b(?:" at the start of - each pattern, and ")\b" at the end. This option applies only - to the patterns that are matched against the contents of - files; it does not apply to patterns specified by any of the + be a word boundary at the start and end of each matched + string. This is equivalent to having "\b(?:" at the start of + each pattern, and ")\b" at the end. This option applies only + to the patterns that are matched against the contents of + files; it does not apply to patterns specified by any of the --include or --exclude options. 
-x, --line-regex, --line-regexp - Force the patterns to start matching only at the beginnings - of lines, and in addition, require them to match entire + Force the patterns to start matching only at the beginnings + of lines, and in addition, require them to match entire lines. In multiline mode the match may be more than one line. This is equivalent to having "^(?:" at the start of each pat- - tern and ")$" at the end. This option applies only to the - patterns that are matched against the contents of files; it - does not apply to patterns specified by any of the --include + tern and ")$" at the end. This option applies only to the + patterns that are matched against the contents of files; it + does not apply to patterns specified by any of the --include or --exclude options. -Z, --null - Terminate file names in the regular output with a zero byte - (the NUL character) instead of what would normally appear. - This is useful when file names contain unusual characters - such as colons, hyphens, or even newlines. The option does + Terminate file names in the regular output with a zero byte + (the NUL character) instead of what would normally appear. + This is useful when file names contain unusual characters + such as colons, hyphens, or even newlines. The option does not apply to file names in error messages. @@ -887,90 +897,90 @@ ENVIRONMENT VARIABLES NEWLINES - The -N (--newline) option allows pcre2grep to scan files with newline - conventions that differ from the default. This option affects only the - way scanned files are processed. It does not affect the interpretation - of files specified by the -f, --file-list, --exclude-from, or --in- + The -N (--newline) option allows pcre2grep to scan files with newline + conventions that differ from the default. This option affects only the + way scanned files are processed. It does not affect the interpretation + of files specified by the -f, --file-list, --exclude-from, or --in- clude-from options. 
- Any parts of the scanned input files that are written to the standard - output are copied with whatever newline sequences they have in the in- - put. However, if the final line of a file is output, and it does not - end with a newline sequence, a newline sequence is added. If the new- - line setting is CR, LF, CRLF or NUL, that line ending is output; for + Any parts of the scanned input files that are written to the standard + output are copied with whatever newline sequences they have in the in- + put. However, if the final line of a file is output, and it does not + end with a newline sequence, a newline sequence is added. If the new- + line setting is CR, LF, CRLF or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a single NL is used. - The newline setting does not affect the way in which pcre2grep writes - newlines in informational messages to the standard output and error - streams. Under Windows, the standard output is set to be binary, so - that "\r\n" at the ends of output lines that are copied from the input - is not converted to "\r\r\n" by the C I/O library. This means that any - messages written to the standard output must end with "\r\n". For all - other operating systems, and for all messages to the standard error + The newline setting does not affect the way in which pcre2grep writes + newlines in informational messages to the standard output and error + streams. Under Windows, the standard output is set to be binary, so + that "\r\n" at the ends of output lines that are copied from the input + is not converted to "\r\r\n" by the C I/O library. This means that any + messages written to the standard output must end with "\r\n". For all + other operating systems, and for all messages to the standard error stream, "\n" is used. OPTIONS COMPATIBILITY WITH GNU GREP Many of the short and long forms of pcre2grep's options are the same as - in the GNU grep program. 
Any long option of the form --xxx-regexp (GNU - terminology) is also available as --xxx-regex (PCRE2 terminology). - However, the --case-restrict, --depth-limit, -E, --file-list, --file- + in the GNU grep program. Any long option of the form --xxx-regexp (GNU + terminology) is also available as --xxx-regex (PCRE2 terminology). + However, the --case-restrict, --depth-limit, -E, --file-list, --file- offsets, --heap-limit, --include-dir, --line-offsets, --locale, - --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- - tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are + --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- + tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are specific to pcre2grep, as is the use of the --only-matching option with a capturing parentheses number. - Although most of the common options work the same way, a few are dif- - ferent in pcre2grep. For example, the --include option's argument is a + Although most of the common options work the same way, a few are dif- + ferent in pcre2grep. For example, the --include option's argument is a glob for GNU grep, but in pcre2grep it is a regular expression to which - the -i option applies. If both the -c and -l options are given, GNU - grep lists only file names, without counts, but pcre2grep gives the + the -i option applies. If both the -c and -l options are given, GNU + grep lists only file names, without counts, but pcre2grep gives the counts as well. OPTIONS WITH DATA There are four different ways in which an option with data can be spec- - ified. If a short form option is used, the data may follow immedi- + ified. If a short form option is used, the data may follow immedi- ately, or (with one exception) in the next command line item. For exam- ple: -f/some/file -f /some/file - The exception is the -o option, which may appear with or without data. 
- Because of this, if data is present, it must follow immediately in the + The exception is the -o option, which may appear with or without data. + Because of this, if data is present, it must follow immediately in the same item, for example -o3. - If a long form option is used, the data may appear in the same command - line item, separated by an equals character, or (with two exceptions) + If a long form option is used, the data may appear in the same command + line item, separated by an equals character, or (with two exceptions) it may appear in the next command line item. For example: --file=/some/file --file /some/file - Note, however, that if you want to supply a file name beginning with ~ - as data in a shell command, and have the shell expand ~ to a home di- - rectory, you must separate the file name from the option, because the + Note, however, that if you want to supply a file name beginning with ~ + as data in a shell command, and have the shell expand ~ to a home di- + rectory, you must separate the file name from the option, because the shell does not treat ~ specially unless it is at the start of an item. - The exceptions to the above are the --colour (or --color) and --only- - matching options, for which the data is optional. If one of these op- - tions does have data, it must be given in the first form, using an + The exceptions to the above are the --colour (or --color) and --only- + matching options, for which the data is optional. If one of these op- + tions does have data, it must be given in the first form, using an equals character. Otherwise pcre2grep will assume that it has no data. USING PCRE2'S CALLOUT FACILITY - pcre2grep has, by default, support for calling external programs or - scripts or echoing specific strings during matching by making use of - PCRE2's callout facility. However, this support can be completely or - partially disabled when pcre2grep is built. 
You can find out whether - your binary has support for callouts by running it with the --help op- - tion. If callout support is completely disabled, all callouts in pat- - terns are ignored by pcre2grep. If the facility is partially disabled, + pcre2grep has, by default, support for calling external programs or + scripts or echoing specific strings during matching by making use of + PCRE2's callout facility. However, this support can be completely or + partially disabled when pcre2grep is built. You can find out whether + your binary has support for callouts by running it with the --help op- + tion. If callout support is completely disabled, callouts in patterns + are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored. @@ -988,13 +998,13 @@ USING PCRE2'S CALLOUT FACILITY processed as a zero-terminated string, which means it should not con- tain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the - --output (-O) option (see above). However, $0 cannot be used to insert - a matched substring because the match is still in progress. Instead, - the single character '0' is inserted. Any syntax errors in the string - (for example, a dollar not followed by another character) causes the - callout to be ignored. No terminator is added to the output string, so - if you want a newline, you must include it explicitly using the escape - $n. For example: + --output (-O) option (see above). However, $0 or $& cannot be used to + insert a matched substring because the match is still in progress. In- + stead, the single character '0' is inserted. Any syntax errors in the + string (for example, a dollar not followed by another character) causes + the callout to be ignored. No terminator is added to the output string, + so if you want a newline, you must include it explicitly using the es- + cape $n. 
For example: pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' @@ -1018,10 +1028,11 @@ USING PCRE2'S CALLOUT FACILITY Any substring (including the executable name) may contain escape se- quences started by a dollar character. These are the same as for the - --output (-O) option documented above, except that $0 cannot insert the - matched string because the match is still in progress. Instead, the - character '0' is inserted. If you need a literal dollar or pipe charac- - ter in any substring, use $$ or $| respectively. Here is an example: + --output (-O) option documented above, except that $0 or $& cannot in- + sert the matched string because the match is still in progress. In- + stead, the character '0' is inserted. If you need a literal dollar or + pipe character in any substring, use $$ or $| respectively. Here is an + example: echo -e "abcde\n12345" | pcre2grep \ '(?x)(.)(..(.)) @@ -1034,43 +1045,43 @@ USING PCRE2'S CALLOUT FACILITY Arg1: [1] [234] [4] Arg2: |1| () 12345 - The parameters for the system call that is used to run the program or + The parameters for the system call that is used to run the program or script are zero-terminated strings. This means that binary zero charac- - ters in the callout argument will cause premature termination of their - substrings, and therefore should not be present. Any syntax errors in - the string (for example, a dollar not followed by another character) + ters in the callout argument will cause premature termination of their + substrings, and therefore should not be present. Any syntax errors in + the string (for example, a dollar not followed by another character) cause the callout to be ignored. If running the program fails for any - reason (including the non-existence of the executable), a local match- + reason (including the non-existence of the executable), a local match- ing failure occurs and the matcher backtracks in the normal way.
MATCHING ERRORS - It is possible to supply a regular expression that takes a very long - time to fail to match certain lines. Such patterns normally involve - nested indefinite repeats, for example: (a+)*\d when matched against a - line of a's with no final digit. The PCRE2 matching function has a re- - source limit that causes it to abort in these circumstances. If this - happens, pcre2grep outputs an error message and the line that caused - the problem to the standard error stream. If there are more than 20 + It is possible to supply a regular expression that takes a very long + time to fail to match certain lines. Such patterns normally involve + nested indefinite repeats, for example: (a+)*\d when matched against a + line of a's with no final digit. The PCRE2 matching function has a re- + source limit that causes it to abort in these circumstances. If this + happens, pcre2grep outputs an error message and the line that caused + the problem to the standard error stream. If there are more than 20 such errors, pcre2grep gives up. - The --match-limit option of pcre2grep can be used to set the overall - resource limit. There are also other limits that affect the amount of - memory used during matching; see the discussion of --heap-limit and + The --match-limit option of pcre2grep can be used to set the overall + resource limit. There are also other limits that affect the amount of + memory used during matching; see the discussion of --heap-limit and --depth-limit above. DIAGNOSTICS Exit status is 0 if any matches were found, 1 if no matches were found, - and 2 for syntax errors, overlong lines, non-existent or inaccessible - files (even if matches were found in other files) or too many matching + and 2 for syntax errors, overlong lines, non-existent or inaccessible + files (even if matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessi- ble files does not affect the return code. 
- When run under VMS, the return code is placed in the symbol - PCRE2GREP_RC because VMS does not distinguish between exit(0) and + When run under VMS, the return code is placed in the symbol + PCRE2GREP_RC because VMS does not distinguish between exit(0) and exit(1). @@ -1088,8 +1099,8 @@ AUTHOR REVISION - Last updated: 22 December 2023 + Last updated: 04 February 2025 Copyright (c) 1997-2023 University of Cambridge. -PCRE2 10.43 22 December 2023 PCRE2GREP(1) +PCRE2 10.45 04 February 2025 PCRE2GREP(1) diff --git a/mingw64/share/doc/pcre2/pcre2test.txt b/mingw64/share/doc/pcre2/pcre2test.txt index ddb491d7e7c..b6574b2ea1b 100644 --- a/mingw64/share/doc/pcre2/pcre2test.txt +++ b/mingw64/share/doc/pcre2/pcre2test.txt @@ -1,4 +1,3 @@ - PCRE2TEST(1) General Commands Manual PCRE2TEST(1) @@ -72,26 +71,25 @@ INPUT ENCODING When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that - are passed to the library. For subject lines, backslash escapes can be - used. In addition, when the utf modifier (see "Setting compilation op- - tions" below) is set, the pattern and any following subject lines are - interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as ap- - propriate. - - For non-UTF testing of wide characters, the utf8_input modifier can be - used. This is mutually exclusive with utf, and is allowed only in - 16-bit or 32-bit mode. It causes the pattern and following subject - lines to be treated as UTF-8 according to the original definition (RFC + are passed to the library. For subject lines and some patterns, back- + slash escapes can be used. In addition, when the utf modifier (see + "Setting compilation options" below) is set, the pattern and any fol- + lowing subject lines are interpreted as UTF-8 strings and translated to + UTF-16 or UTF-32 as appropriate. + + For non-UTF testing of wide characters, the utf8_input modifier can be + used. 
This is mutually exclusive with utf, and is allowed only in + 16-bit or 32-bit mode. It causes the pattern and following subject + lines to be treated as UTF-8 according to the original definition (RFC 2279), which allows for character values up to 0x7fffffff. Each charac- - ter is placed in one 16-bit or 32-bit code unit (in the 16-bit case, + ter is placed in one 16-bit or 32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error to occur). - UTF-8 (in its original definition) is not capable of encoding values - greater than 0x7fffffff, but such values can be handled by the 32-bit + UTF-8 (in its original definition) is not capable of encoding values + greater than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte - in UTF-8) 0x80000000 is added to the character's value. This is the - only way of passing such code points in a pattern string. For subject + in UTF-8) 0x80000000 is added to the character's value. For subject strings, using an escape sequence is preferable. @@ -135,8 +133,8 @@ COMMAND LINE OPTIONS the exit code as indicated: ebcdic-nl the code for LF (= NL) in an EBCDIC environment: - 0x15 or 0x25 - 0 if used in an ASCII environment + either 0x15 or 0x25 + 0 if used in an ASCII/Unicode environment exit code is always 0 linksize the configured internal link size (2, 3, or 4) exit code is set to the link size @@ -158,56 +156,67 @@ COMMAND LINE OPTIONS pcre2-8 the 8-bit library was built unicode Unicode support is available - If an unknown option is given, an error message is output; + Note that the availability of JIT support in the library does + not guarantee that it can actually be used because in some + environments it is unable to allocate executable memory. The + option "jitusable" gives more detailed information. 
It re- + turns one of the following values: + + 0 JIT is available and usable + 1 JIT is available but cannot allocate executable memory + 2 JIT is not available + 3 Unexpected return from test call to pcre2_jit_compile() + + If an unknown option is given, an error message is output; the exit code is 0. - -d Behave as if each pattern has the debug modifier; the inter- + -d Behave as if each pattern has the debug modifier; the inter- nal form and information about the compiled pattern is output after compilation; -d is equivalent to -b -i. -dfa Behave as if each subject line has the dfa modifier; matching - is done using the pcre2_dfa_match() function instead of the + is done using the pcre2_dfa_match() function instead of the default pcre2_match(). -error number[,number,...] - Call pcre2_get_error_message() for each of the error numbers - in the comma-separated list, display the resulting messages - on the standard output, then exit with zero exit code. The - numbers may be positive or negative. This is a convenience + Call pcre2_get_error_message() for each of the error numbers + in the comma-separated list, display the resulting messages + on the standard output, then exit with zero exit code. The + numbers may be positive or negative. This is a convenience facility for PCRE2 maintainers. -help Output a brief summary of these options and then exit. - -i Behave as if each pattern has the info modifier; information + -i Behave as if each pattern has the info modifier; information about the compiled pattern is given after compilation. - -jit Behave as if each pattern line has the jit modifier; after - successful compilation, each pattern is passed to the just- + -jit Behave as if each pattern line has the jit modifier; after + successful compilation, each pattern is passed to the just- in-time compiler, if available.
- -jitfast Behave as if each pattern line has the jitfast modifier; af- - ter successful compilation, each pattern is passed to the + -jitfast Behave as if each pattern line has the jitfast modifier; af- + ter successful compilation, each pattern is passed to the just-in-time compiler, if available, and each subject line is passed directly to the JIT matcher via its "fast path". -jitverify - Behave as if each pattern line has the jitverify modifier; - after successful compilation, each pattern is passed to the - just-in-time compiler, if available, and the use of JIT for + Behave as if each pattern line has the jitverify modifier; + after successful compilation, each pattern is passed to the + just-in-time compiler, if available, and the use of JIT for matching is verified. -LM List modifiers: write a list of available pattern and subject - modifiers to the standard output, then exit with zero exit - code. All other options are ignored. If both -C and any -Lx + modifiers to the standard output, then exit with zero exit + code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. - -LP List properties: write a list of recognized Unicode proper- - ties to the standard output, then exit with zero exit code. + -LP List properties: write a list of recognized Unicode proper- + ties to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. -LS List scripts: write a list of recognized Unicode script names - to the standard output, then exit with zero exit code. All + to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. @@ -217,25 +226,25 @@ COMMAND LINE OPTIONS -q Do not output the version number of pcre2test at the start of execution. 
- -S size On Unix-like systems, set the size of the run-time stack to + -S size On Unix-like systems, set the size of the run-time stack to size mebibytes (units of 1024*1024 bytes). -subject modifier-list Behave as if each subject line contains the given modifiers. - -t Run each compile and match many times with a timer, and out- - put the resulting times per compile or match. When JIT is - used, separate times are given for the initial compile and - the JIT compile. You can control the number of iterations - that are used for timing by following -t with a number (as a - separate item on the command line). For example, "-t 1000" + -t Run each compile and match many times with a timer, and out- + put the resulting times per compile or match. When JIT is + used, separate times are given for the initial compile and + the JIT compile. You can control the number of iterations + that are used for timing by following -t with a number (as a + separate item on the command line). For example, "-t 1000" iterates 1000 times. The default is to iterate 500,000 times. -tm This is like -t except that it times only the matching phase, not the compile phase. - -T -TM These behave like -t and -tm, but in addition, at the end of - a run, the total times for all compiles and matches are out- + -T -TM These behave like -t and -tm, but in addition, at the end of + a run, the total times for all compiles and matches are out- put. -version Output the PCRE2 version number and then exit. @@ -243,153 +252,153 @@ COMMAND LINE OPTIONS DESCRIPTION - If pcre2test is given two filename arguments, it reads from the first + If pcre2test is given two filename arguments, it reads from the first and writes to the second. If the first name is "-", input is taken from - the standard input. If pcre2test is given only one argument, it reads + the standard input. If pcre2test is given only one argument, it reads from that file and writes to stdout. Otherwise, it reads from stdin and writes to stdout. 
- When pcre2test is built, a configuration option can specify that it - should be linked with the libreadline or libedit library. When this is - done, if the input is from a terminal, it is read using the readline() + When pcre2test is built, a configuration option can specify that it + should be linked with the libreadline or libedit library. When this is + done, if the input is from a terminal, it is read using the readline() function. This provides line-editing and history facilities. The output from the -help option states whether or not readline() will be used. - The program handles any number of tests, each of which consists of a - set of input lines. Each set starts with a regular expression pattern, + The program handles any number of tests, each of which consists of a + set of input lines. Each set starts with a regular expression pattern, followed by any number of subject lines to be matched against that pat- tern. In between sets of test data, command lines that begin with # may appear. This file format, with some restrictions, can also be processed - by the perltest.sh script that is distributed with PCRE2 as a means of + by the perltest.sh script that is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 and Perl is the same. For a speci- - fication of perltest.sh, see the comments near its beginning. See also + fication of perltest.sh, see the comments near its beginning. See also the #perltest command below. When the input is a terminal, pcre2test prompts for each line of input, - using "re>" to prompt for regular expression patterns, and "data>" to - prompt for subject lines. Command lines starting with # can be entered + using "re>" to prompt for regular expression patterns, and "data>" to + prompt for subject lines. Command lines starting with # can be entered only in response to the "re>" prompt. - Each subject line is matched separately and independently. 
If you want + Each subject line is matched separately and independently. If you want to do multi-line matches, you have to use the \n escape sequence (or \r - or \r\n, etc., depending on the newline setting) in a single line of - input to encode the newline sequences. There is no limit on the length - of subject lines; the input buffer is automatically extended if it is - too small. There are replication features that makes it possible to - generate long repetitive pattern or subject lines without having to + or \r\n, etc., depending on the newline setting) in a single line of + input to encode the newline sequences. There is no limit on the length + of subject lines; the input buffer is automatically extended if it is + too small. There are replication features that make it possible to + generate long repetitive pattern or subject lines without having to supply them explicitly. - An empty line or the end of the file signals the end of the subject - lines for a test, at which point a new pattern or command line is ex- + An empty line or the end of the file signals the end of the subject + lines for a test, at which point a new pattern or command line is ex- pected if there is still input to be read. COMMAND LINES - In between sets of test data, a line that begins with # is interpreted + In between sets of test data, a line that begins with # is interpreted as a command line. If the first character is followed by white space or - an exclamation mark, the line is treated as a comment, and ignored. + an exclamation mark, the line is treated as a comment, and ignored. Otherwise, the following commands are recognized: #forbid_utf - Subsequent patterns automatically have the PCRE2_NEVER_UTF and - PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF - and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of - patterns.
This command also forces an error if a subsequent pattern - contains any occurrences of \P, \p, or \X, which are still supported - when PCRE2_UTF is not set, but which require Unicode property support + Subsequent patterns automatically have the PCRE2_NEVER_UTF and + PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF + and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of + patterns. This command also forces an error if a subsequent pattern + contains any occurrences of \P, \p, or \X, which are still supported + when PCRE2_UTF is not set, but which require Unicode property support to be included in the library. - This is a trigger guard that is used in test files to ensure that UTF - or Unicode property tests are not accidentally added to files that are - used when Unicode support is not included in the library. Setting - PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained - by the use of #pattern; the difference is that #forbid_utf cannot be - unset, and the automatic options are not displayed in pattern informa- + This is a trigger guard that is used in test files to ensure that UTF + or Unicode property tests are not accidentally added to files that are + used when Unicode support is not included in the library. Setting + PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained + by the use of #pattern; the difference is that #forbid_utf cannot be + unset, and the automatic options are not displayed in pattern informa- tion, to avoid cluttering up test output. #load This command is used to load a set of precompiled patterns from a file, - as described in the section entitled "Saving and restoring compiled + as described in the section entitled "Saving and restoring compiled patterns" below. #loadtables - This command is used to load a set of binary character tables that can - be accessed by the tables=3 qualifier. 
Such tables can be created by + This command is used to load a set of binary character tables that can + be accessed by the tables=3 qualifier. Such tables can be created by the pcre2_dftables program with the -b option. #newline_default [] - When PCRE2 is built, a default newline convention can be specified. - This determines which characters and/or character pairs are recognized + When PCRE2 is built, a default newline convention can be specified. + This determines which characters and/or character pairs are recognized as indicating a newline in a pattern or subject string. The default can - be overridden when a pattern is compiled. The standard test files con- - tain tests of various newline conventions, but the majority of the - tests expect a single linefeed to be recognized as a newline by de- - fault. Without special action the tests would fail when PCRE2 is com- + be overridden when a pattern is compiled. The standard test files con- + tain tests of various newline conventions, but the majority of the + tests expect a single linefeed to be recognized as a newline by de- + fault. Without special action the tests would fail when PCRE2 is com- piled with either CR or CRLF as the default newline. The #newline_default command specifies a list of newline types that are - acceptable as the default. The types must be one of CR, LF, CRLF, ANY- + acceptable as the default. The types must be one of CR, LF, CRLF, ANY- CRLF, ANY, or NUL (in upper or lower case), for example: #newline_default LF Any anyCRLF If the default newline is in the list, this command has no effect. Oth- - erwise, except when testing the POSIX API, a newline modifier that + erwise, except when testing the POSIX API, a newline modifier that specifies the first newline convention in the list (LF in the above ex- - ample) is added to any pattern that does not already have a newline + ample) is added to any pattern that does not already have a newline modifier. 
If the newline list is empty, the feature is turned off. This command is present in a number of the standard test input files. - When the POSIX API is being tested there is no way to override the de- + When the POSIX API is being tested there is no way to override the de- fault newline convention, though it is possible to set the newline con- - vention from within the pattern. A warning is given if the posix or - posix_nosub modifier is used when #newline_default would set a default + vention from within the pattern. A warning is given if the posix or + posix_nosub modifier is used when #newline_default would set a default for the non-POSIX API. #pattern - This command sets a default modifier list that applies to all subse- + This command sets a default modifier list that applies to all subse- quent patterns. Modifiers on a pattern can change these settings. #perltest - This line is used in test files that can also be processed by perl- - test.sh to confirm that Perl gives the same results as PCRE2. Subse- - quent tests are checked for the use of pcre2test features that are in- + This line is used in test files that can also be processed by perl- + test.sh to confirm that Perl gives the same results as PCRE2. Subse- + quent tests are checked for the use of pcre2test features that are in- compatible with the perltest.sh script. - Patterns must use '/' as their delimiter, and only certain modifiers - are supported. Comment lines, #pattern commands, and #subject commands - that set or unset "mark" are recognized and acted on. The #perltest, - #forbid_utf, and #newline_default commands, which are needed in the + Patterns must use '/' as their delimiter, and only certain modifiers + are supported. Comment lines, #pattern commands, and #subject commands + that set or unset "mark" are recognized and acted on. The #perltest, + #forbid_utf, and #newline_default commands, which are needed in the relevant pcre2test files, are silently ignored. 
All other command lines - are ignored, but give a warning message. The #perltest command helps - detect tests that are accidentally put in the wrong file or use the - wrong delimiter. For more details of the perltest.sh script see the + are ignored, but give a warning message. The #perltest command helps + detect tests that are accidentally put in the wrong file or use the + wrong delimiter. For more details of the perltest.sh script see the comments it contains. #pop [] #popcopy [] - These commands are used to manipulate the stack of compiled patterns, - as described in the section entitled "Saving and restoring compiled + These commands are used to manipulate the stack of compiled patterns, + as described in the section entitled "Saving and restoring compiled patterns" below. #save - This command is used to save a set of compiled patterns to a file, as - described in the section entitled "Saving and restoring compiled pat- + This command is used to save a set of compiled patterns to a file, as + described in the section entitled "Saving and restoring compiled pat- terns" below. #subject - This command sets a default modifier list that applies to all subse- - quent subject lines. Modifiers on a subject line can change these set- + This command sets a default modifier list that applies to all subse- + quent subject lines. Modifiers on a subject line can change these set- tings. @@ -397,47 +406,47 @@ MODIFIER SYNTAX Modifier lists are used with both pattern and subject lines. Items in a list are separated by commas followed by optional white space. Trailing - whitespace in a modifier list is ignored. Some modifiers may be given - for both patterns and subject lines, whereas others are valid only for - one or the other. Each modifier has a long name, for example "an- - chored", and some of them must be followed by an equals sign and a - value, for example, "offset=12". Values cannot contain comma charac- - ters, but may contain spaces. 
Modifiers that do not take values may be + whitespace in a modifier list is ignored. Some modifiers may be given + for both patterns and subject lines, whereas others are valid only for + one or the other. Each modifier has a long name, for example "an- + chored", and some of them must be followed by an equals sign and a + value, for example, "offset=12". Values cannot contain comma charac- + ters, but may contain spaces. Modifiers that do not take values may be preceded by a minus sign to turn off a previous setting. A few of the more common modifiers can also be specified as single let- - ters, for example "i" for "caseless". In documentation, following the + ters, for example "i" for "caseless". In documentation, following the Perl convention, these are written with a slash ("the /i modifier") for - clarity. Abbreviated modifiers must all be concatenated in the first - item of a modifier list. If the first item is not recognized as a long - modifier name, it is interpreted as a sequence of these abbreviations. + clarity. Abbreviated modifiers must all be concatenated in the first + item of a modifier list. If the first item is not recognized as a long + modifier name, it is interpreted as a sequence of these abbreviations. For example: /abc/ig,newline=cr,jit=3 - This is a pattern line whose modifier list starts with two one-letter - modifiers (/i and /g). The lower-case abbreviated modifiers are the + This is a pattern line whose modifier list starts with two one-letter + modifiers (/i and /g). The lower-case abbreviated modifiers are the same as used in Perl. PATTERN SYNTAX - A pattern line must start with one of the following characters (common + A pattern line must start with one of the following characters (common symbols, excluding pattern meta-characters): / ! " ' ` - = _ : ; , % & @ ~ - This is interpreted as the pattern's delimiter. 
A regular expression - may be continued over several input lines, in which case the newline + This is interpreted as the pattern's delimiter. A regular expression + may be continued over several input lines, in which case the newline characters are included within it. It is possible to include the delim- - iter as a literal within the pattern by escaping it with a backslash, + iter as a literal within the pattern by escaping it with a backslash, for example /abc\/def/ - If you do this, the escape and the delimiter form part of the pattern, + If you do this, the escape and the delimiter form part of the pattern, but since the delimiters are all non-alphanumeric, the inclusion of the - backslash does not affect the pattern's interpretation. Note, however, + backslash does not affect the pattern's interpretation. Note, however, that this trick does not work within \Q...\E literal bracketing because the backslash will itself be interpreted as a literal. If the terminat- ing delimiter is immediately followed by a backslash, for example, @@ -445,13 +454,13 @@ PATTERN SYNTAX /abc/\ a backslash is added to the end of the pattern. This is done to provide - a way of testing the error condition that arises if a pattern finishes + a way of testing the error condition that arises if a pattern finishes with a backslash, because /abc\/ - is interpreted as the first line of a pattern that starts with "abc/", - causing pcre2test to read the next line as a continuation of the regu- + is interpreted as the first line of a pattern that starts with "abc/", + causing pcre2test to read the next line as a continuation of the regu- lar expression. A pattern can be followed by a modifier list (details below). 
@@ -460,44 +469,52 @@ PATTERN SYNTAX SUBJECT LINE SYNTAX Before each subject line is passed to pcre2_match(), pcre2_dfa_match(), - or pcre2_jit_match(), leading and trailing white space is removed, and - the line is scanned for backslash escapes, unless the subject_literal - modifier was set for the pattern. The following provide a means of en- + or pcre2_jit_match(), leading and trailing white space is removed, and + the line is scanned for backslash escapes, unless the subject_literal + modifier was set for the pattern. The following provide a means of en- coding non-printing characters in a visible way: - \a alarm (BEL, \x07) - \b backspace (\x08) - \e escape (\x27) - \f form feed (\x0c) - \n newline (\x0a) - \r carriage return (\x0d) - \t tab (\x09) - \v vertical tab (\x0b) - \nnn octal character (up to 3 octal digits); always - a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode - \o{dd...} octal character (any number of octal digits} - \xhh hexadecimal byte (up to 2 hex digits) - \x{hh...} hexadecimal character (any number of hex digits) - - The use of \x{hh...} is not dependent on the use of the utf modifier on - the pattern. It is recognized always. There may be any number of hexa- - decimal digits inside the braces; invalid values provoke error mes- - sages. - - Note that \xhh specifies one byte rather than one character in UTF-8 - mode; this makes it possible to construct invalid UTF-8 sequences for - testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8 - character in UTF-8 mode, generating more than one byte if the value is - greater than 127. 
When testing the 8-bit library not in UTF-8 mode, - \x{hh} generates one byte for values less than 256, and causes an error + \a alarm (BEL, \x07) + \b backspace (\x08) + \e escape (\x1b) + \f form feed (\x0c) + \n newline (\x0a) + \N{U+hh...} unicode character (any number of hex digits) + \r carriage return (\x0d) + \t tab (\x09) + \v vertical tab (\x0b) + \ddd octal number (up to 3 octal digits); represent a single + code point unless larger than 255 with the 8-bit li- + brary + \o{dd...} octal number (any number of octal digits) representing a + character in UTF mode or a code point + \xhh hexadecimal byte (up to 2 hex digits) + \x{hh...} hexadecimal number (up to 8 hex digits) representing a + character in UTF mode or a code point + + Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf + modifier on the pattern. It is always recognized. There may be any num- + ber of hexadecimal digits inside the braces; invalid values provoke er- + ror messages but when using \N{U+hh...} with some invalid unicode char- + acters they will be accepted with a warning instead. + + Note that even in UTF-8 mode, \xhh (and depending on how large, \ddd) + describe one byte rather than one character; this makes it possible to + construct invalid UTF-8 sequences for testing purposes. On the other + hand, \x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only + generating more than one byte if the value is greater than 127. To + avoid the ambiguity it is preferred to use \N{U+hh...} when describing + characters. When testing the 8-bit library not in UTF-8 mode, \x{hh} + generates one byte for values that could fit on it, and causes an error for greater values. - In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it - possible to construct invalid UTF-16 sequences for testing purposes. + When testing the 16-bit library, not in UTF-16 mode, all 4-digit + \x{hhhh} values are accepted.
This makes it possible to construct in- + valid UTF-16 sequences for testing purposes. - In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This - makes it possible to construct invalid UTF-32 sequences for testing - purposes. + When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit + \x{...} values are accepted. This makes it possible to construct in- + valid UTF-32 sequences for testing purposes. There is a special backslash sequence that specifies replication of one or more characters: @@ -561,6 +578,7 @@ PATTERN MODIFIERS allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options @@ -589,13 +607,17 @@ PATTERN MODIFIERS match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C + never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK + python_octal set PCRE2_EXTRA_PYTHON_OCTAL + turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT @@ -608,20 +630,36 @@ PATTERN MODIFIERS causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. + The following modifiers enable or disable performance optimizations by + calling pcre2_set_optimize() before invoking the regex compiler. 
+ + optimization_full enable all optional optimizations + optimization_none disable all optional optimizations + auto_possess auto-possessify variable quantifiers + auto_possess_off don't auto-possessify variable quantifiers + dotstar_anchor anchor patterns starting with .* + dotstar_anchor_off don't anchor patterns starting with .* + start_optimize enable pre-scan of subject string + start_optimize_off disable pre-scan of subject string + + See the pcre2_set_optimize documentation for details on these optimiza- + tions. + Setting compilation controls - The following modifiers affect the compilation process or request in- - formation about the pattern. There are single-letter abbreviations for + The following modifiers affect the compilation process or request in- + formation about the pattern. There are single-letter abbreviations for some that are heavily used in the test files. - bsr=[anycrlf|unicode] specify \R handling /B bincode show binary code without lengths + bsr=[anycrlf|unicode] specify \R handling callout_info show callout information convert= request foreign pattern conversion convert_glob_escape=c set glob escape character convert_glob_separator=c set glob separator character convert_length set convert buffer length debug same as info,fullbincode + expand expand repetition syntax in pattern framesize show matching frame size fullbincode show binary code with lengths /I info show info about compiled pattern @@ -643,6 +681,7 @@ PATTERN MODIFIERS posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack + pushtablescopy push a copy with tables onto the stack stackguard= test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables @@ -653,35 +692,35 @@ PATTERN MODIFIERS Newline and \R handling - The bsr modifier specifies what \R in a pattern should match. If it is - set to "anycrlf", \R matches CR, LF, or CRLF only. 
If it is set to - "unicode", \R matches any Unicode newline sequence. The default can be + The bsr modifier specifies what \R in a pattern should match. If it is + set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to + "unicode", \R matches any Unicode newline sequence. The default can be specified when PCRE2 is built; if it is not, the default is set to Uni- code. - The newline modifier specifies which characters are to be interpreted + The newline modifier specifies which characters are to be interpreted as newlines, both in the pattern and in subject lines. The type must be one of CR, LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case). Information about a pattern - The debug modifier is a shorthand for info,fullbincode, requesting all + The debug modifier is a shorthand for info,fullbincode, requesting all available information. The bincode modifier causes a representation of the compiled code to be - output after compilation. This information does not contain length and + output after compilation. This information does not contain length and offset values, which ensures that the same output is generated for dif- - ferent internal link sizes and different code unit widths. By using - bincode, the same regression tests can be used in different environ- + ferent internal link sizes and different code unit widths. By using + bincode, the same regression tests can be used in different environ- ments. - The fullbincode modifier, by contrast, does include length and offset - values. This is used in a few special tests that run only for specific + The fullbincode modifier, by contrast, does include length and offset + values. This is used in a few special tests that run only for specific code unit widths and link sizes, and is also useful for one-off tests. - The info modifier requests information about the compiled pattern - (whether it is anchored, has a fixed first character, and so on). 
The - information is obtained from the pcre2_pattern_info() function. Here + The info modifier requests information about the compiled pattern + (whether it is anchored, has a fixed first character, and so on). The + information is obtained from the pcre2_pattern_info() function. Here are some typical examples: re> /(?i)(^a|^b)/m,info @@ -699,136 +738,136 @@ PATTERN MODIFIERS Last code unit = 'c' (caseless) Subject length lower bound = 3 - "Compile options" are those specified by modifiers; "overall options" - have added options that are taken or deduced from the pattern. If both - sets of options are the same, just a single "options" line is output; - if there are no options, the line is omitted. "First code unit" is - where any match must start; if there is more than one they are listed - as "starting code units". "Last code unit" is the last literal code - unit that must be present in any match. This is not necessarily the - last character. These lines are omitted if no starting or ending code - units are recorded. The subject length line is omitted when - no_start_optimize is set because the minimum length is not calculated + "Compile options" are those specified by modifiers; "overall options" + have added options that are taken or deduced from the pattern. If both + sets of options are the same, just a single "options" line is output; + if there are no options, the line is omitted. "First code unit" is + where any match must start; if there is more than one they are listed + as "starting code units". "Last code unit" is the last literal code + unit that must be present in any match. This is not necessarily the + last character. These lines are omitted if no starting or ending code + units are recorded. The subject length line is omitted when + no_start_optimize is set because the minimum length is not calculated when it can never be used. 
- The framesize modifier shows the size, in bytes, of each storage frame - used by pcre2_match() for handling backtracking. The size depends on - the number of capturing parentheses in the pattern. A vector of these - frames is used at matching time; its overall size is shown when the + The framesize modifier shows the size, in bytes, of each storage frame + used by pcre2_match() for handling backtracking. The size depends on + the number of capturing parentheses in the pattern. A vector of these + frames is used at matching time; its overall size is shown when the heapframes_size subject modifier is set. - The callout_info modifier requests information about all the callouts + The callout_info modifier requests information about all the callouts in the pattern. A list of them is output at the end of any other infor- mation that is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern. Passing a NULL context - Normally, pcre2test passes a context block to pcre2_compile(). If the - null_context modifier is set, however, NULL is passed. This is for - testing that pcre2_compile() behaves correctly in this case (it uses + Normally, pcre2test passes a context block to pcre2_compile(). If the + null_context modifier is set, however, NULL is passed. This is for + testing that pcre2_compile() behaves correctly in this case (it uses default values). Passing a NULL pattern - The null_pattern modifier is for testing the behaviour of pcre2_com- - pile() when the pattern argument is NULL. The length value passed is + The null_pattern modifier is for testing the behaviour of pcre2_com- + pile() when the pattern argument is NULL. The length value passed is the default PCRE2_ZERO_TERMINATED unless use_length is set. Any length other than zero causes an error.
Specifying pattern characters in hexadecimal - The hex modifier specifies that the characters of the pattern, except - for substrings enclosed in single or double quotes, are to be inter- - preted as pairs of hexadecimal digits. This feature is provided as a + The hex modifier specifies that the characters of the pattern, except + for substrings enclosed in single or double quotes, are to be inter- + preted as pairs of hexadecimal digits. This feature is provided as a way of creating patterns that contain binary zeros and other non-print- - ing characters. White space is permitted between pairs of digits. For + ing characters. White space is permitted between pairs of digits. For example, this pattern contains three characters: /ab 32 59/hex - Parts of such a pattern are taken literally if quoted. This pattern - contains nine characters, only two of which are specified in hexadeci- + Parts of such a pattern are taken literally if quoted. This pattern + contains nine characters, only two of which are specified in hexadeci- mal: /ab "literal" 32/hex - Either single or double quotes may be used. There is no way of includ- - ing the delimiter within a substring. The hex and expand modifiers are + Either single or double quotes may be used. There is no way of includ- + ing the delimiter within a substring. The hex and expand modifiers are mutually exclusive. Specifying the pattern's length By default, patterns are passed to the compiling functions as zero-ter- - minated strings but can be passed by length instead of being zero-ter- - minated. The use_length modifier causes this to happen. Using a length - happens automatically (whether or not use_length is set) when hex is - set, because patterns specified in hexadecimal may contain binary ze- + minated strings but can be passed by length instead of being zero-ter- + minated. The use_length modifier causes this to happen. 
Using a length + happens automatically (whether or not use_length is set) when hex is + set, because patterns specified in hexadecimal may contain binary ze- ros. If hex or use_length is used with the POSIX wrapper API (see "Using the - POSIX wrapper API" below), the REG_PEND extension is used to pass the + POSIX wrapper API" below), the REG_PEND extension is used to pass the pattern's length. Specifying a maximum for variable lookbehinds - Variable lookbehind assertions are supported only if, for each one, + Variable lookbehind assertions are supported only if, for each one, there is a maximum length (in characters) that it can match. There is a limit on this, whose default can be set at build time, with an ultimate - default of 255. The max_varlookbehind modifier uses the + default of 255. The max_varlookbehind modifier uses the pcre2_set_max_varlookbehind() function to change the limit. Lookbehinds - whose branches each match a fixed length are limited to 65535 charac- + whose branches each match a fixed length are limited to 65535 charac- ters per branch. Specifying wide characters in 16-bit and 32-bit modes In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 - and translated to UTF-16 or UTF-32 when the utf modifier is set. For + and translated to UTF-16 or UTF-32 when the utf modifier is set. For testing the 16-bit and 32-bit libraries in non-UTF mode, the utf8_input - modifier can be used. It is mutually exclusive with utf. Input lines + modifier can be used. It is mutually exclusive with utf. Input lines are interpreted as UTF-8 as a means of specifying wide characters. More details are given in "Input encoding" above. Generating long repetitive patterns - Some tests use long patterns that are very repetitive. Instead of cre- - ating a very long input line for such a pattern, you can use a special - repetition feature, similar to the one described for subject lines - above. 
If the expand modifier is present on a pattern, parts of the + Some tests use long patterns that are very repetitive. Instead of cre- + ating a very long input line for such a pattern, you can use a special + repetition feature, similar to the one described for subject lines + above. If the expand modifier is present on a pattern, parts of the pattern that have the form \[]{} are expanded before the pattern is passed to pcre2_compile(). For exam- ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction - cannot be nested. An initial "\[" sequence is recognized only if "]{" - followed by decimal digits and "}" is found later in the pattern. If + cannot be nested. An initial "\[" sequence is recognized only if "]{" + followed by decimal digits and "}" is found later in the pattern. If not, the characters remain in the pattern unaltered. The expand and hex modifiers are mutually exclusive. - If part of an expanded pattern looks like an expansion, but is really + If part of an expanded pattern looks like an expansion, but is really part of the actual pattern, unwanted expansion can be avoided by giving two values in the quantifier. For example, \[AB]{6000,6000} is not rec- ognized as an expansion item. - If the info modifier is set on an expanded pattern, the result of the + If the info modifier is set on an expanded pattern, the result of the expansion is included in the information that is output. JIT compilation - Just-in-time (JIT) compiling is a heavyweight optimization that can - greatly speed up pattern matching. See the pcre2jit documentation for - details. JIT compiling happens, optionally, after a pattern has been - successfully compiled into an internal form. The JIT compiler converts + Just-in-time (JIT) compiling is a heavyweight optimization that can + greatly speed up pattern matching. See the pcre2jit documentation for + details. JIT compiling happens, optionally, after a pattern has been + successfully compiled into an internal form. 
The JIT compiler converts this to optimized machine code. It needs to know whether the match-time options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, - because different code is generated for the different cases. See the - partial modifier in "Subject Modifiers" below for details of how these + because different code is generated for the different cases. See the + partial modifier in "Subject Modifiers" below for details of how these options are specified for each match attempt. JIT compilation is requested by the jit pattern modifier, which may op- - tionally be followed by an equals sign and a number in the range 0 to - 7. The three bits that make up the number specify which of the three + tionally be followed by an equals sign and a number in the range 0 to + 7. The three bits that make up the number specify which of the three JIT operating modes are to be compiled: 1 compile JIT code for non-partial matching @@ -845,31 +884,31 @@ PATTERN MODIFIERS 6 soft and hard partial matching only 7 all three modes - If no number is given, 7 is assumed. The phrase "partial matching" + If no number is given, 7 is assumed. The phrase "partial matching" means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the - PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- + PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- plete match; the options enable the possibility of a partial match, but - do not require it. Note also that if you request JIT compilation only - for partial matching (for example, jit=2) but do not set the partial - modifier on a subject line, that match will not use JIT code because + do not require it. Note also that if you request JIT compilation only + for partial matching (for example, jit=2) but do not set the partial + modifier on a subject line, that match will not use JIT code because none was compiled for non-partial matching. 
- If JIT compilation is successful, the compiled JIT code will automati- + If JIT compilation is successful, the compiled JIT code will automati- cally be used when an appropriate type of match is run, except when in- - compatible run-time options are specified. For more details, see the - pcre2jit documentation. See also the jitstack modifier below for a way + compatible run-time options are specified. For more details, see the + pcre2jit documentation. See also the jitstack modifier below for a way of setting the size of the JIT stack. - If the jitfast modifier is specified, matching is done using the JIT - "fast path" interface, pcre2_jit_match(), which skips some of the san- - ity checks that are done by pcre2_match(), and of course does not work - when JIT is not supported. If jitfast is specified without jit, jit=7 + If the jitfast modifier is specified, matching is done using the JIT + "fast path" interface, pcre2_jit_match(), which skips some of the san- + ity checks that are done by pcre2_match(), and of course does not work + when JIT is not supported. If jitfast is specified without jit, jit=7 is assumed. - If the jitverify modifier is specified, information about the compiled - pattern shows whether JIT compilation was or was not successful. If - jitverify is specified without jit, jit=7 is assumed. If JIT compila- - tion is successful when jitverify is set, the text "(JIT)" is added to + If the jitverify modifier is specified, information about the compiled + pattern shows whether JIT compilation was or was not successful. If + jitverify is specified without jit, jit=7 is assumed. If JIT compila- + tion is successful when jitverify is set, the text "(JIT)" is added to the first output line after a match or non match when JIT-compiled code was actually used in the match. 
@@ -880,19 +919,19 @@ PATTERN MODIFIERS /pattern/locale=fr_FR The given locale is set, pcre2_maketables() is called to build a set of - character tables for the locale, and this is then passed to pcre2_com- - pile() when compiling the regular expression. The same tables are used - when matching the following subject lines. The locale modifier applies + character tables for the locale, and this is then passed to pcre2_com- + pile() when compiling the regular expression. The same tables are used + when matching the following subject lines. The locale modifier applies only to the pattern on which it appears, but can be given in a #pattern - command if a default is needed. Setting a locale and alternate charac- + command if a default is needed. Setting a locale and alternate charac- ter tables are mutually exclusive. Showing pattern memory The memory modifier causes the size in bytes of the memory used to hold - the compiled pattern to be output. This does not include the size of - the pcre2_code block; it is just the actual compiled data. If the pat- - tern is subsequently passed to the JIT compiler, the size of the JIT + the compiled pattern to be output. This does not include the size of + the pcre2_code block; it is just the actual compiled data. If the pat- + tern is subsequently passed to the JIT compiler, the size of the JIT compiled code is also output. Here is an example: re> /a(b)c/jit,memory @@ -902,34 +941,34 @@ PATTERN MODIFIERS Limiting nested parentheses - The parens_nest_limit modifier sets a limit on the depth of nested - parentheses in a pattern. Breaching the limit causes a compilation er- - ror. The default for the library is set when PCRE2 is built, but - pcre2test sets its own default of 220, which is required for running + The parens_nest_limit modifier sets a limit on the depth of nested + parentheses in a pattern. Breaching the limit causes a compilation er- + ror. 
The default for the library is set when PCRE2 is built, but + pcre2test sets its own default of 220, which is required for running the standard test suite. Limiting the pattern length - The max_pattern_length modifier sets a limit, in code units, to the + The max_pattern_length modifier sets a limit, in code units, to the length of pattern that pcre2_compile() will accept. Breaching the limit - causes a compilation error. The default is the largest number a + causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). Limiting the size of a compiled pattern The max_pattern_compiled_length modifier sets a limit, in bytes, to the amount of memory used by a compiled pattern. Breaching the limit causes - a compilation error. The default is the largest number a PCRE2_SIZE + a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). Using the POSIX wrapper API - The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via - the POSIX wrapper API rather than its native API. When posix_nosub is - used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX - wrapper supports only the 8-bit library. Note that it does not imply + The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via + the POSIX wrapper API rather than its native API. When posix_nosub is + used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX + wrapper supports only the 8-bit library. Note that it does not imply POSIX matching semantics; for more detail see the pcre2posix documenta- - tion. The following pattern modifiers set options for the regcomp() + tion. 
The following pattern modifiers set options for the regcomp() function: caseless REG_ICASE @@ -939,42 +978,42 @@ PATTERN MODIFIERS ucp REG_UCP ) the POSIX standard utf REG_UTF8 ) - The regerror_buffsize modifier specifies a size for the error buffer - that is passed to regerror() in the event of a compilation error. For + The regerror_buffsize modifier specifies a size for the error buffer + that is passed to regerror() in the event of a compilation error. For example: /abc/posix,regerror_buffsize=20 - This provides a means of testing the behaviour of regerror() when the - buffer is too small for the error message. If this modifier has not + This provides a means of testing the behaviour of regerror() when the + buffer is too small for the error message. If this modifier has not been set, a large buffer is used. - The aftertext and allaftertext subject modifiers work as described be- + The aftertext and allaftertext subject modifiers work as described be- low. All other modifiers are either ignored, with a warning message, or cause an error. - The pattern is passed to regcomp() as a zero-terminated string by de- + The pattern is passed to regcomp() as a zero-terminated string by de- fault, but if the use_length or hex modifiers are set, the REG_PEND ex- tension is used to pass it by length. Testing the stack guard feature - The stackguard modifier is used to test the use of pcre2_set_com- - pile_recursion_guard(), a function that is provided to enable stack - availability to be checked during compilation (see the pcre2api docu- - mentation for details). If the number specified by the modifier is + The stackguard modifier is used to test the use of pcre2_set_com- + pile_recursion_guard(), a function that is provided to enable stack + availability to be checked during compilation (see the pcre2api docu- + mentation for details). 
If the number specified by the modifier is greater than zero, pcre2_set_compile_recursion_guard() is called to set - up callback from pcre2_compile() to a local function. The argument it - receives is the current nesting parenthesis depth; if this is greater + up callback from pcre2_compile() to a local function. The argument it + receives is the current nesting parenthesis depth; if this is greater than the value given by the modifier, non-zero is returned, causing the compilation to be aborted. Using alternative character tables - The value specified for the tables modifier must be one of the digits + The value specified for the tables modifier must be one of the digits 0, 1, 2, or 3. It causes a specific set of built-in character tables to - be passed to pcre2_compile(). This is used in the PCRE2 tests to check - behaviour with different character tables. The digit specifies the ta- + be passed to pcre2_compile(). This is used in the PCRE2 tests to check + behaviour with different character tables. The digit specifies the ta- bles as follows: 0 do not pass any special character tables @@ -985,15 +1024,15 @@ PATTERN MODIFIERS In tables 2, some characters whose codes are greater than 128 are iden- tified as letters, digits, spaces, etc. Tables 3 can be used only after - a #loadtables command has loaded them from a binary file. Setting al- + a #loadtables command has loaded them from a binary file. Setting al- ternate character tables and a locale are mutually exclusive. Setting certain match controls The following modifiers are really subject modifiers, and are described - under "Subject Modifiers" below. However, they may be included in a - pattern's modifier list, in which case they are applied to every sub- - ject line that is processed with that pattern. These modifiers do not + under "Subject Modifiers" below. 
However, they may be included in a + pattern's modifier list, in which case they are applied to every sub- + ject line that is processed with that pattern. These modifiers do not affect the compilation process. aftertext show text after match @@ -1009,6 +1048,7 @@ PATTERN MODIFIERS replace= specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts + substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED @@ -1019,39 +1059,39 @@ PATTERN MODIFIERS substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY - These modifiers may not appear in a #pattern command. If you want them + These modifiers may not appear in a #pattern command. If you want them as defaults, set them in a #subject command. Specifying literal subject lines - If the subject_literal modifier is present on a pattern, all the sub- + If the subject_literal modifier is present on a pattern, all the sub- ject lines that it matches are taken as literal strings, with no inter- - pretation of backslashes. It is not possible to set subject modifiers - on such lines, but any that are set as defaults by a #subject command + pretation of backslashes. It is not possible to set subject modifiers + on such lines, but any that are set as defaults by a #subject command are recognized. Saving a compiled pattern - When a pattern with the push modifier is successfully compiled, it is - pushed onto a stack of compiled patterns, and pcre2test expects the - next line to contain a new pattern (or a command) instead of a subject + When a pattern with the push modifier is successfully compiled, it is + pushed onto a stack of compiled patterns, and pcre2test expects the + next line to contain a new pattern (or a command) instead of a subject line. 
This facility is used when saving compiled patterns to a file, as - described in the section entitled "Saving and restoring compiled pat- - terns" below. If pushcopy is used instead of push, a copy of the com- - piled pattern is stacked, leaving the original as current, ready to - match the following input lines. This provides a way of testing the - pcre2_code_copy() function. The push and pushcopy modifiers are in- - compatible with compilation modifiers such as global that act at match + described in the section entitled "Saving and restoring compiled pat- + terns" below. If pushcopy is used instead of push, a copy of the com- + piled pattern is stacked, leaving the original as current, ready to + match the following input lines. This provides a way of testing the + pcre2_code_copy() function. The push and pushcopy modifiers are in- + compatible with compilation modifiers such as global that act at match time. Any that are specified are ignored (for the stacked copy), with a - warning message, except for replace, which causes an error. Note that - jitverify, which is allowed, does not carry through to any subsequent + warning message, except for replace, which causes an error. Note that + jitverify, which is allowed, does not carry through to any subsequent matching that uses a stacked pattern. Testing foreign pattern conversion - The experimental foreign pattern conversion functions in PCRE2 can be - tested by setting the convert modifier. Its argument is a colon-sepa- - rated list of options, which set the equivalent option for the + The experimental foreign pattern conversion functions in PCRE2 can be + tested by setting the convert modifier. Its argument is a colon-sepa- + rated list of options, which set the equivalent option for the pcre2_pattern_convert() function: glob PCRE2_CONVERT_GLOB @@ -1063,19 +1103,19 @@ PATTERN MODIFIERS The "unset" value is useful for turning off a default that has been set by a #pattern command. 
When one of these options is set, the input pat- - tern is passed to pcre2_pattern_convert(). If the conversion is suc- - cessful, the result is reflected in the output and then passed to + tern is passed to pcre2_pattern_convert(). If the conversion is suc- + cessful, the result is reflected in the output and then passed to pcre2_compile(). The normal utf and no_utf_check options, if set, cause - the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be + the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to pcre2_pattern_convert(). By default, the conversion function is allowed to allocate a buffer for - its output. However, if the convert_length modifier is set to a value - greater than zero, pcre2test passes a buffer of the given length. This + its output. However, if the convert_length modifier is set to a value + greater than zero, pcre2test passes a buffer of the given length. This makes it possible to test the length check. - The convert_glob_escape and convert_glob_separator modifiers can be - used to specify the escape and separator characters for glob process- + The convert_glob_escape and convert_glob_separator modifiers can be + used to specify the escape and separator characters for glob process- ing, overriding the defaults, which are operating-system dependent. @@ -1086,10 +1126,11 @@ SUBJECT MODIFIERS Setting match options - The following modifiers set options for pcre2_match() or - pcre2_dfa_match(). See pcreapi for a description of their effects. + The following modifiers set options for pcre2_match() or + pcre2_dfa_match(). See pcre2api for a description of their effects. 
anchored set PCRE2_ANCHORED + copy_matched_subject set PCRE2_COPY_MATCHED_SUBJECT endanchored set PCRE2_ENDANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST @@ -1103,42 +1144,42 @@ SUBJECT MODIFIERS partial_hard (or ph) set PCRE2_PARTIAL_HARD partial_soft (or ps) set PCRE2_PARTIAL_SOFT - The partial matching modifiers are provided with abbreviations because + The partial matching modifiers are provided with abbreviations because they appear frequently in tests. - If the posix or posix_nosub modifier was present on the pattern, caus- + If the posix or posix_nosub modifier was present on the pattern, caus- ing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are notbol, notempty, and noteol, causing REG_NOT- - BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to + BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). The other modifiers are ignored, with a warning message. - There is one additional modifier that can be used with the POSIX wrap- + There is one additional modifier that can be used with the POSIX wrap- per. It is ignored (with a warning) if used for non-POSIX matching. posix_startend=<n>[:<m>] - This causes the subject string to be passed to regexec() using the - REG_STARTEND option, which uses offsets to specify which part of the - string is searched. If only one number is given, the end offset is - passed as the end of the subject string. For more detail of REG_STAR- - TEND, see the pcre2posix documentation. If the subject string contains - binary zeros (coded as escapes such as \x{00} because pcre2test does + This causes the subject string to be passed to regexec() using the + REG_STARTEND option, which uses offsets to specify which part of the + string is searched. If only one number is given, the end offset is + passed as the end of the subject string. For more detail of REG_STAR- + TEND, see the pcre2posix documentation.
If the subject string contains + binary zeros (coded as escapes such as \x{00} because pcre2test does not support actual binary zeros in its input), you must use posix_star- tend to specify its length. Setting match controls - The following modifiers affect the matching process or request addi- - tional information. Some of them may also be specified on a pattern - line (see above), in which case they apply to every subject line that - is matched against that pattern, but can be overridden by modifiers on + The following modifiers affect the matching process or request addi- + tional information. Some of them may also be specified on a pattern + line (see above), in which case they apply to every subject line that + is matched against that pattern, but can be overridden by modifiers on the subject. aftertext show text after match allaftertext show text after captures allcaptures show all captures - allvector show the entire ovector allusedtext show all consulted text (non-JIT only) + allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data= set a value to pass via callouts @@ -1172,7 +1213,8 @@ SUBJECT MODIFIERS startchar show startchar when relevant startoffset= same as offset= substitute_callout use substitution callouts - substitute_extedded use PCRE2_SUBSTITUTE_EXTENDED + substitute_case_callout use substitution case callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH @@ -1184,29 +1226,29 @@ SUBJECT MODIFIERS zero_terminate pass the subject as zero-terminated The effects of these modifiers are described in the following sections. - When matching via the POSIX wrapper API, the aftertext, allaftertext, - and ovector subject modifiers work as described below. 
All other modi- + When matching via the POSIX wrapper API, the aftertext, allaftertext, + and ovector subject modifiers work as described below. All other modi- fiers are either ignored, with a warning message, or cause an error. Showing more text - The aftertext modifier requests that as well as outputting the part of + The aftertext modifier requests that as well as outputting the part of the subject string that matched the entire pattern, pcre2test should in addition output the remainder of the subject string. This is useful for tests where the subject contains multiple copies of the same substring. - The allaftertext modifier requests the same action for captured sub- + The allaftertext modifier requests the same action for captured sub- strings as well as the main matched substring. In each case the remain- der is output on the following line with a plus character following the capture number. - The allusedtext modifier requests that all the text that was consulted - during a successful pattern match by the interpreter should be shown, - for both full and partial matches. This feature is not supported for - JIT matching, and if requested with JIT it is ignored (with a warning - message). Setting this modifier affects the output if there is a look- - behind at the start of a match, or, for a complete match, a lookahead + The allusedtext modifier requests that all the text that was consulted + during a successful pattern match by the interpreter should be shown, + for both full and partial matches. This feature is not supported for + JIT matching, and if requested with JIT it is ignored (with a warning + message). Setting this modifier affects the output if there is a look- + behind at the start of a match, or, for a complete match, a lookahead at the end, or if \K is used in the pattern. 
Characters that precede or - follow the start and end of the actual match are indicated in the out- + follow the start and end of the actual match are indicated in the out- put by '<' or '>' characters underneath them. Here is an example: re> /(?<=pqr)abc(?=xyz)/ @@ -1217,16 +1259,16 @@ SUBJECT MODIFIERS Partial match: pqrabcxy <<< - The first, complete match shows that the matched string is "abc", with - the preceding and following strings "pqr" and "xyz" having been con- - sulted during the match (when processing the assertions). The partial + The first, complete match shows that the matched string is "abc", with + the preceding and following strings "pqr" and "xyz" having been con- + sulted during the match (when processing the assertions). The partial match can indicate only the preceding string. - The startchar modifier requests that the starting character for the - match be indicated, if it is different to the start of the matched + The startchar modifier requests that the starting character for the + match be indicated, if it is different to the start of the matched string. The only time when this occurs is when \K has been processed as part of the match. In this situation, the output for the matched string - is displayed from the starting character instead of from the match + is displayed from the starting character instead of from the match point, with circumflex characters under the earlier characters. For ex- ample: @@ -1235,7 +1277,7 @@ SUBJECT MODIFIERS 0: abcxyz ^^^ - Unlike allusedtext, the startchar modifier can be used with JIT. How- + Unlike allusedtext, the startchar modifier can be used with JIT. How- ever, these two modifiers are mutually exclusive. Showing the value of all capture groups @@ -1243,104 +1285,104 @@ SUBJECT MODIFIERS The allcaptures modifier requests that the values of all potential cap- tured parentheses be output after a match. 
By default, only those up to the highest one actually used in the match are output (corresponding to - the return code from pcre2_match()). Groups that did not take part in - the match are output as "<unset>". This modifier is not relevant for - DFA matching (which does no capturing) and does not apply when replace + the return code from pcre2_match()). Groups that did not take part in + the match are output as "<unset>". This modifier is not relevant for + DFA matching (which does no capturing) and does not apply when replace is specified; it is ignored, with a warning message, if present. Showing the entire ovector, for all outcomes The allvector modifier requests that the entire ovector be shown, what- ever the outcome of the match. Compare allcaptures, which shows only up - to the maximum number of capture groups for the pattern, and then only - for a successful complete non-DFA match. This modifier, which acts af- - ter any match result, and also for DFA matching, provides a means of - checking that there are no unexpected modifications to ovector fields. - Before each match attempt, the ovector is filled with a special value, - and if this is found in both elements of a capturing pair, "<unchanged>" is output. After a successful match, this applies to all - groups after the maximum capture group for the pattern. In other cases - it applies to the entire ovector. After a partial match, the first two - elements are the only ones that should be set. After a DFA match, the - amount of ovector that is used depends on the number of matches that + to the maximum number of capture groups for the pattern, and then only + for a successful complete non-DFA match. This modifier, which acts af- + ter any match result, and also for DFA matching, provides a means of + checking that there are no unexpected modifications to ovector fields. + Before each match attempt, the ovector is filled with a special value, + and if this is found in both elements of a capturing pair, "<unchanged>" is output.
After a successful match, this applies to all + groups after the maximum capture group for the pattern. In other cases + it applies to the entire ovector. After a partial match, the first two + elements are the only ones that should be set. After a DFA match, the + amount of ovector that is used depends on the number of matches that were found. Testing pattern callouts - A callout function is supplied when pcre2test calls the library match- - ing functions, unless callout_none is specified. Its behaviour can be - controlled by various modifiers listed above whose names begin with - callout_. Details are given in the section entitled "Callouts" below. - Testing callouts from pcre2_substitute() is described separately in + A callout function is supplied when pcre2test calls the library match- + ing functions, unless callout_none is specified. Its behaviour can be + controlled by various modifiers listed above whose names begin with + callout_. Details are given in the section entitled "Callouts" below. + Testing callouts from pcre2_substitute() is described separately in "Testing the substitution function" below. Finding all matches in a string Searching for all possible matches within a subject can be requested by - the global or altglobal modifier. After finding a match, the matching - function is called again to search the remainder of the subject. The - difference between global and altglobal is that the former uses the - start_offset argument to pcre2_match() or pcre2_dfa_match() to start - searching at a new point within the entire string (which is what Perl + the global or altglobal modifier. After finding a match, the matching + function is called again to search the remainder of the subject. 
The + difference between global and altglobal is that the former uses the + start_offset argument to pcre2_match() or pcre2_dfa_match() to start + searching at a new point within the entire string (which is what Perl does), whereas the latter passes over a shortened subject. This makes a difference to the matching process if the pattern begins with a lookbe- hind assertion (including \b or \B). - If an empty string is matched, the next match is done with the + If an empty string is matched, the next match is done with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for another, non-empty, match at the same point in the subject. If this - match fails, the start offset is advanced, and the normal match is re- - tried. This imitates the way Perl handles such cases when using the /g - modifier or the split() function. Normally, the start offset is ad- - vanced by one character, but if the newline convention recognizes CRLF - as a newline, and the current character is CR followed by LF, an ad- + match fails, the start offset is advanced, and the normal match is re- + tried. This imitates the way Perl handles such cases when using the /g + modifier or the split() function. Normally, the start offset is ad- + vanced by one character, but if the newline convention recognizes CRLF + as a newline, and the current character is CR followed by LF, an ad- vance of two characters occurs. Testing substring extraction functions - The copy and get modifiers can be used to test the pcre2_sub- + The copy and get modifiers can be used to test the pcre2_sub- string_copy_xxx() and pcre2_substring_get_xxx() functions. 
They can be given more than once, and each can specify a capture group name or num- ber, for example: abcd\=copy=1,copy=3,get=G1 - If the #subject command is used to set default copy and/or get lists, - these can be unset by specifying a negative number to cancel all num- + If the #subject command is used to set default copy and/or get lists, + these can be unset by specifying a negative number to cancel all num- bered groups and an empty name to cancel all named groups. - The getall modifier tests pcre2_substring_list_get(), which extracts + The getall modifier tests pcre2_substring_list_get(), which extracts all captured substrings. - If the subject line is successfully matched, the substrings extracted - by the convenience functions are output with C, G, or L after the - string number instead of a colon. This is in addition to the normal - full list. The string length (that is, the return from the extraction + If the subject line is successfully matched, the substrings extracted + by the convenience functions are output with C, G, or L after the + string number instead of a colon. This is in addition to the normal + full list. The string length (that is, the return from the extraction function) is given in parentheses after each substring, followed by the name when the extraction was by name. Testing the substitution function - If the replace modifier is set, the pcre2_substitute() function is - called instead of one of the matching functions (or after one call of - pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). Note that re- - placement strings cannot contain commas, because a comma signifies the - end of a modifier. This is not thought to be an issue in a test pro- + If the replace modifier is set, the pcre2_substitute() function is + called instead of one of the matching functions (or after one call of + pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). 
Note that re- + placement strings cannot contain commas, because a comma signifies the + end of a modifier. This is not thought to be an issue in a test pro- gram. - Specifying a completely empty replacement string disables this modi- - fier. However, it is possible to specify an empty replacement by pro- - viding a buffer length, as described below, for an otherwise empty re- + Specifying a completely empty replacement string disables this modi- + fier. However, it is possible to specify an empty replacement by pro- + viding a buffer length, as described below, for an otherwise empty re- placement. - Unlike subject strings, pcre2test does not process replacement strings - for escape sequences. In UTF mode, a replacement string is checked to - see if it is a valid UTF-8 string. If so, it is correctly converted to - a UTF string of the appropriate code unit width. If it is not a valid - UTF-8 string, the individual code units are copied directly. This pro- + Unlike subject strings, pcre2test does not process replacement strings + for escape sequences. In UTF mode, a replacement string is checked to + see if it is a valid UTF-8 string. If so, it is correctly converted to + a UTF string of the appropriate code unit width. If it is not a valid + UTF-8 string, the individual code units are copied directly. This pro- vides a means of passing an invalid UTF-8 string for testing purposes. - The following modifiers set options (in addition to the normal match + The following modifiers set options (in addition to the normal match options) for pcre2_substitute(): global PCRE2_SUBSTITUTE_GLOBAL @@ -1354,8 +1396,8 @@ SUBJECT MODIFIERS See the pcre2api documentation for details of these options. - After a successful substitution, the modified string is output, pre- - ceded by the number of replacements. This may be zero if there were no + After a successful substitution, the modified string is output, pre- + ceded by the number of replacements.
This may be zero if there were no matches. Here is a simple example of a substitution test: /abc/replace=xxx @@ -1364,12 +1406,12 @@ SUBJECT MODIFIERS =abc=abc=\=global 2: =xxx=xxx= - Subject and replacement strings should be kept relatively short (fewer - than 256 characters) for substitution tests, as fixed-size buffers are - used. To make it easy to test for buffer overflow, if the replacement - string starts with a number in square brackets, that number is passed - to pcre2_substitute() as the size of the output buffer, with the re- - placement string starting at the next character. Here is an example + Subject and replacement strings should be kept relatively short (fewer + than 256 characters) for substitution tests, as fixed-size buffers are + used. To make it easy to test for buffer overflow, if the replacement + string starts with a number in square brackets, that number is passed + to pcre2_substitute() as the size of the output buffer, with the re- + placement string starting at the next character. Here is an example that tests the edge case: /abc/ @@ -1379,12 +1421,12 @@ SUBJECT MODIFIERS Failed: error -47: no more memory The default action of pcre2_substitute() is to return PCRE2_ER- - ROR_NOMEMORY when the output buffer is too small. However, if the - PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi- + ROR_NOMEMORY when the output buffer is too small. However, if the + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi- tute_overflow_length modifier), pcre2_substitute() continues to go - through the motions of matching and substituting (but not doing any - callouts), in order to compute the size of buffer that is required. - When this happens, pcre2test shows the required buffer length (which + through the motions of matching and substituting (but not doing any + callouts), in order to compute the size of buffer that is required. 
+ When this happens, pcre2test shows the required buffer length (which includes space for the trailing zero) as part of the error message. For example: @@ -1393,15 +1435,15 @@ SUBJECT MODIFIERS Failed: error -47: no more memory: 10 code units are needed A replacement string is ignored with POSIX and DFA matching. Specifying - partial matching provokes an error return ("bad option value") from + partial matching provokes an error return ("bad option value") from pcre2_substitute(). Testing substitute callouts If the substitute_callout modifier is set, a substitution callout func- - tion is set up. The null_context modifier must not be set, because the - address of the callout function is passed in a match context. When the - callout function is called (after each substitution), details of the + tion is set up. The null_context modifier must not be set, because the + address of the callout function is passed in a match context. When the + callout function is called (after each substitution), details of the input and output strings are output. For example: /abc/g,replace=<$0>,substitute_callout @@ -1410,19 +1452,19 @@ SUBJECT MODIFIERS 2(1) Old 6 9 "abc" New 8 13 "<abc>" 2: <abc>def<abc>pqr - The first number on each callout line is the count of matches. The + The first number on each callout line is the count of matches. The parenthesized number is the number of pairs that are set in the ovector - (that is, one more than the number of capturing groups that were set). + (that is, one more than the number of capturing groups that were set). Then are listed the offsets of the old substring, its contents, and the same for the replacement. - By default, the substitution callout function returns zero, which ac- - cepts the replacement and causes matching to continue if /g was used. - Two further modifiers can be used to test other return values.
If sub- - stitute_skip is set to a value greater than zero the callout function - returns +1 for the match of that number, and similarly substitute_stop - returns -1. These cause the replacement to be rejected, and -1 causes - no further matching to take place. If either of them are set, substi- + By default, the substitution callout function returns zero, which ac- + cepts the replacement and causes matching to continue if /g was used. + Two further modifiers can be used to test other return values. If sub- + stitute_skip is set to a value greater than zero the callout function + returns +1 for the match of that number, and similarly substitute_stop + returns -1. These cause the replacement to be rejected, and -1 causes + no further matching to take place. If either of them are set, substi- tute_callout is assumed. For example: /abc/g,replace=<$0>,substitute_skip=1 @@ -1438,6 +1480,18 @@ SUBJECT MODIFIERS gle skip or stop is supported, which is sufficient for testing that the feature works. + Testing substitute case callouts + + If the substitute_case_callout modifier is set, a substitution case + callout function is set up. The callout function is called for each + substituted chunk which is to be case-transformed. + + The callout function passed is a fixed function with implementation for + certain behaviours: inputs which shrink when case-transformed; inputs + which grow; inputs with distinct upper/lower/titlecase forms. The char- + acters which are not special-cased for testing purposes are left unmod- + ified, as if they are caseless characters. + Setting the JIT stack size The jitstack modifier provides a way of setting the maximum stack size @@ -2007,8 +2061,8 @@ AUTHOR REVISION - Last updated: 24 April 2024 + Last updated: 26 December 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE 10.44 24 April 2024 PCRE2TEST(1) +PCRE2 10.45 26 December 2024 PCRE2TEST(1) diff --git a/mingw32/share/licenses/pcre2/LICENCE b/mingw64/share/licenses/pcre2/LICENCE.md similarity index 55% rename from mingw32/share/licenses/pcre2/LICENCE rename to mingw64/share/licenses/pcre2/LICENCE.md index 3c1ef032dec..f58ceb75a63 100644 --- a/mingw32/share/licenses/pcre2/LICENCE +++ b/mingw64/share/licenses/pcre2/LICENCE.md @@ -1,5 +1,8 @@ -PCRE2 LICENCE -------------- +PCRE2 License +============= + +| SPDX-License-Identifier: | BSD-3-Clause WITH PCRE2-exception | +|---------|-------| PCRE2 is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. @@ -16,40 +19,46 @@ optimize pattern matching. This is an optional feature that can be omitted when the library is built. -THE BASIC LIBRARY FUNCTIONS ---------------------------- +COPYRIGHT +--------- + +### The basic library functions -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com + Written by: Philip Hazel + Email local part: Philip.Hazel + Email domain: gmail.com -Retired from University of Cambridge Computing Service, -Cambridge, England. + Retired from University of Cambridge Computing Service, + Cambridge, England. -Copyright (c) 1997-2024 University of Cambridge -All rights reserved. + Copyright (c) 1997-2007 University of Cambridge + Copyright (c) 2007-2024 Philip Hazel + All rights reserved. +### PCRE2 Just-In-Time compilation support -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- + Written by: Zoltan Herczeg + Email local part: hzmester + Email domain: freemail.hu -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu + Copyright (c) 2010-2024 Zoltan Herczeg + All rights reserved. -Copyright(c) 2010-2024 Zoltan Herczeg -All rights reserved. 
+### Stack-less Just-In-Time compiler + Written by: Zoltan Herczeg + Email local part: hzmester + Email domain: freemail.hu -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- + Copyright (c) 2009-2024 Zoltan Herczeg + All rights reserved. -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu +### All other contributions -Copyright(c) 2009-2024 Zoltan Herczeg -All rights reserved. +Many other contributors have participated in the authorship of PCRE2. As PCRE2 +has never required a Contributor Licensing Agreement, or other copyright +assignment agreement, all contributions have copyright retained by each +original contributor or their employer. THE "BSD" LICENCE @@ -58,16 +67,16 @@ THE "BSD" LICENCE Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notices, - this list of conditions and the following disclaimer. +* Redistributions of source code must retain the above copyright notices, + this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notices, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. +* Redistributions in binary form must reproduce the above copyright + notices, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. - * Neither the name of the University of Cambridge nor the names of any - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. +* Neither the name of the University of Cambridge nor the names of any + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/mingw64/share/man/man1/pcre2-config.1.gz b/mingw64/share/man/man1/pcre2-config.1.gz index bd6a7fd31d0..822df9aded3 100644 Binary files a/mingw64/share/man/man1/pcre2-config.1.gz and b/mingw64/share/man/man1/pcre2-config.1.gz differ diff --git a/mingw64/share/man/man1/pcre2grep.1.gz b/mingw64/share/man/man1/pcre2grep.1.gz index 4986ba33e42..30bcc1cf285 100644 Binary files a/mingw64/share/man/man1/pcre2grep.1.gz and b/mingw64/share/man/man1/pcre2grep.1.gz differ diff --git a/mingw64/share/man/man1/pcre2test.1.gz b/mingw64/share/man/man1/pcre2test.1.gz index 6f7f0a4ded2..2ad05f1e7ca 100644 Binary files a/mingw64/share/man/man1/pcre2test.1.gz and b/mingw64/share/man/man1/pcre2test.1.gz differ diff --git a/mingw64/share/man/man3/pcre2.3.gz b/mingw64/share/man/man3/pcre2.3.gz index 8f8de13a685..0557798fa74 100644 Binary files a/mingw64/share/man/man3/pcre2.3.gz and b/mingw64/share/man/man3/pcre2.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_callout_enumerate.3.gz b/mingw64/share/man/man3/pcre2_callout_enumerate.3.gz index 132edbb1a5e..a808e956bd6 100644 Binary files a/mingw64/share/man/man3/pcre2_callout_enumerate.3.gz and b/mingw64/share/man/man3/pcre2_callout_enumerate.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_code_copy.3.gz b/mingw64/share/man/man3/pcre2_code_copy.3.gz index 0c748480430..cc75c1145df 100644 Binary files a/mingw64/share/man/man3/pcre2_code_copy.3.gz and b/mingw64/share/man/man3/pcre2_code_copy.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_code_copy_with_tables.3.gz b/mingw64/share/man/man3/pcre2_code_copy_with_tables.3.gz index 5ece33f7768..d308117ce38 100644 Binary files a/mingw64/share/man/man3/pcre2_code_copy_with_tables.3.gz and b/mingw64/share/man/man3/pcre2_code_copy_with_tables.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_code_free.3.gz 
b/mingw64/share/man/man3/pcre2_code_free.3.gz index 2fc6eccaa38..f43574a3931 100644 Binary files a/mingw64/share/man/man3/pcre2_code_free.3.gz and b/mingw64/share/man/man3/pcre2_code_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_compile.3.gz b/mingw64/share/man/man3/pcre2_compile.3.gz index f7d39bd6671..038e31a7280 100644 Binary files a/mingw64/share/man/man3/pcre2_compile.3.gz and b/mingw64/share/man/man3/pcre2_compile.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_compile_context_copy.3.gz b/mingw64/share/man/man3/pcre2_compile_context_copy.3.gz index bd414744c62..e601ecd8051 100644 Binary files a/mingw64/share/man/man3/pcre2_compile_context_copy.3.gz and b/mingw64/share/man/man3/pcre2_compile_context_copy.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_compile_context_create.3.gz b/mingw64/share/man/man3/pcre2_compile_context_create.3.gz index 670392ebb3b..736809aa152 100644 Binary files a/mingw64/share/man/man3/pcre2_compile_context_create.3.gz and b/mingw64/share/man/man3/pcre2_compile_context_create.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_compile_context_free.3.gz b/mingw64/share/man/man3/pcre2_compile_context_free.3.gz index 3aa875dd01c..6af45001ad6 100644 Binary files a/mingw64/share/man/man3/pcre2_compile_context_free.3.gz and b/mingw64/share/man/man3/pcre2_compile_context_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_config.3.gz b/mingw64/share/man/man3/pcre2_config.3.gz index 6858fdeb764..0dc22637f28 100644 Binary files a/mingw64/share/man/man3/pcre2_config.3.gz and b/mingw64/share/man/man3/pcre2_config.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_convert_context_copy.3.gz b/mingw64/share/man/man3/pcre2_convert_context_copy.3.gz index e8c877a0388..4dcbd5f2637 100644 Binary files a/mingw64/share/man/man3/pcre2_convert_context_copy.3.gz and b/mingw64/share/man/man3/pcre2_convert_context_copy.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_convert_context_create.3.gz 
b/mingw64/share/man/man3/pcre2_convert_context_create.3.gz index 47212499bb1..872ffff8130 100644 Binary files a/mingw64/share/man/man3/pcre2_convert_context_create.3.gz and b/mingw64/share/man/man3/pcre2_convert_context_create.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_convert_context_free.3.gz b/mingw64/share/man/man3/pcre2_convert_context_free.3.gz index a8d1fa23847..344effb030f 100644 Binary files a/mingw64/share/man/man3/pcre2_convert_context_free.3.gz and b/mingw64/share/man/man3/pcre2_convert_context_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_converted_pattern_free.3.gz b/mingw64/share/man/man3/pcre2_converted_pattern_free.3.gz index b2a1dfc7175..37f2c7cda61 100644 Binary files a/mingw64/share/man/man3/pcre2_converted_pattern_free.3.gz and b/mingw64/share/man/man3/pcre2_converted_pattern_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_dfa_match.3.gz b/mingw64/share/man/man3/pcre2_dfa_match.3.gz index 7a1cc9b4bfa..6beb8b69df4 100644 Binary files a/mingw64/share/man/man3/pcre2_dfa_match.3.gz and b/mingw64/share/man/man3/pcre2_dfa_match.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_general_context_copy.3.gz b/mingw64/share/man/man3/pcre2_general_context_copy.3.gz index c1854da2b24..9f4653d1e1a 100644 Binary files a/mingw64/share/man/man3/pcre2_general_context_copy.3.gz and b/mingw64/share/man/man3/pcre2_general_context_copy.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_general_context_create.3.gz b/mingw64/share/man/man3/pcre2_general_context_create.3.gz index 559c4ad603f..05d60da320e 100644 Binary files a/mingw64/share/man/man3/pcre2_general_context_create.3.gz and b/mingw64/share/man/man3/pcre2_general_context_create.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_general_context_free.3.gz b/mingw64/share/man/man3/pcre2_general_context_free.3.gz index 42b9a03c19f..f1133f8a242 100644 Binary files a/mingw64/share/man/man3/pcre2_general_context_free.3.gz and 
b/mingw64/share/man/man3/pcre2_general_context_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_get_error_message.3.gz b/mingw64/share/man/man3/pcre2_get_error_message.3.gz index 31a4fe00d0c..5c1dd580e8b 100644 Binary files a/mingw64/share/man/man3/pcre2_get_error_message.3.gz and b/mingw64/share/man/man3/pcre2_get_error_message.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_get_mark.3.gz b/mingw64/share/man/man3/pcre2_get_mark.3.gz index d189a862d6e..0422987109c 100644 Binary files a/mingw64/share/man/man3/pcre2_get_mark.3.gz and b/mingw64/share/man/man3/pcre2_get_mark.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz b/mingw64/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz index 41e2b42f079..ed3807ee91a 100644 Binary files a/mingw64/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz and b/mingw64/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_get_match_data_size.3.gz b/mingw64/share/man/man3/pcre2_get_match_data_size.3.gz index 1493a8239d7..b8b90d8b992 100644 Binary files a/mingw64/share/man/man3/pcre2_get_match_data_size.3.gz and b/mingw64/share/man/man3/pcre2_get_match_data_size.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_get_ovector_count.3.gz b/mingw64/share/man/man3/pcre2_get_ovector_count.3.gz index 3e13f2457bb..cbd0b74f6d1 100644 Binary files a/mingw64/share/man/man3/pcre2_get_ovector_count.3.gz and b/mingw64/share/man/man3/pcre2_get_ovector_count.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_get_ovector_pointer.3.gz b/mingw64/share/man/man3/pcre2_get_ovector_pointer.3.gz index 46c73d389e6..b638a40fb4f 100644 Binary files a/mingw64/share/man/man3/pcre2_get_ovector_pointer.3.gz and b/mingw64/share/man/man3/pcre2_get_ovector_pointer.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_get_startchar.3.gz b/mingw64/share/man/man3/pcre2_get_startchar.3.gz index 8770d3f18e2..bbd6dc3d8b0 100644 Binary files 
a/mingw64/share/man/man3/pcre2_get_startchar.3.gz and b/mingw64/share/man/man3/pcre2_get_startchar.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_jit_compile.3.gz b/mingw64/share/man/man3/pcre2_jit_compile.3.gz index 37a905af1c9..4b334032e0f 100644 Binary files a/mingw64/share/man/man3/pcre2_jit_compile.3.gz and b/mingw64/share/man/man3/pcre2_jit_compile.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_jit_free_unused_memory.3.gz b/mingw64/share/man/man3/pcre2_jit_free_unused_memory.3.gz index b854e9f15f1..636b2f299d2 100644 Binary files a/mingw64/share/man/man3/pcre2_jit_free_unused_memory.3.gz and b/mingw64/share/man/man3/pcre2_jit_free_unused_memory.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_jit_match.3.gz b/mingw64/share/man/man3/pcre2_jit_match.3.gz index 1f60dbd220c..aea0f320a12 100644 Binary files a/mingw64/share/man/man3/pcre2_jit_match.3.gz and b/mingw64/share/man/man3/pcre2_jit_match.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_jit_stack_assign.3.gz b/mingw64/share/man/man3/pcre2_jit_stack_assign.3.gz index cd29425f7db..3d21b33376d 100644 Binary files a/mingw64/share/man/man3/pcre2_jit_stack_assign.3.gz and b/mingw64/share/man/man3/pcre2_jit_stack_assign.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_jit_stack_create.3.gz b/mingw64/share/man/man3/pcre2_jit_stack_create.3.gz index ca3326ae5c3..f5135aa0038 100644 Binary files a/mingw64/share/man/man3/pcre2_jit_stack_create.3.gz and b/mingw64/share/man/man3/pcre2_jit_stack_create.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_jit_stack_free.3.gz b/mingw64/share/man/man3/pcre2_jit_stack_free.3.gz index 143e3c8005b..6e880ae0669 100644 Binary files a/mingw64/share/man/man3/pcre2_jit_stack_free.3.gz and b/mingw64/share/man/man3/pcre2_jit_stack_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_maketables.3.gz b/mingw64/share/man/man3/pcre2_maketables.3.gz index 3df5fa94517..964e90286ea 100644 Binary files a/mingw64/share/man/man3/pcre2_maketables.3.gz and 
b/mingw64/share/man/man3/pcre2_maketables.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_maketables_free.3.gz b/mingw64/share/man/man3/pcre2_maketables_free.3.gz index 7e4c91a7b5e..6c0ff04dfb9 100644 Binary files a/mingw64/share/man/man3/pcre2_maketables_free.3.gz and b/mingw64/share/man/man3/pcre2_maketables_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_match.3.gz b/mingw64/share/man/man3/pcre2_match.3.gz index 737e8dd5ad0..de2d9e66605 100644 Binary files a/mingw64/share/man/man3/pcre2_match.3.gz and b/mingw64/share/man/man3/pcre2_match.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_match_context_copy.3.gz b/mingw64/share/man/man3/pcre2_match_context_copy.3.gz index 82317151de4..d971b5f740e 100644 Binary files a/mingw64/share/man/man3/pcre2_match_context_copy.3.gz and b/mingw64/share/man/man3/pcre2_match_context_copy.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_match_context_create.3.gz b/mingw64/share/man/man3/pcre2_match_context_create.3.gz index 1607514981c..134d340d0f5 100644 Binary files a/mingw64/share/man/man3/pcre2_match_context_create.3.gz and b/mingw64/share/man/man3/pcre2_match_context_create.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_match_context_free.3.gz b/mingw64/share/man/man3/pcre2_match_context_free.3.gz index d54a68d4b39..0a7f3016c50 100644 Binary files a/mingw64/share/man/man3/pcre2_match_context_free.3.gz and b/mingw64/share/man/man3/pcre2_match_context_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_match_data_create.3.gz b/mingw64/share/man/man3/pcre2_match_data_create.3.gz index 79faeb6e407..dd522dda1a5 100644 Binary files a/mingw64/share/man/man3/pcre2_match_data_create.3.gz and b/mingw64/share/man/man3/pcre2_match_data_create.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_match_data_create_from_pattern.3.gz b/mingw64/share/man/man3/pcre2_match_data_create_from_pattern.3.gz index 0649219da29..99b6388d286 100644 Binary files 
a/mingw64/share/man/man3/pcre2_match_data_create_from_pattern.3.gz and b/mingw64/share/man/man3/pcre2_match_data_create_from_pattern.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_match_data_free.3.gz b/mingw64/share/man/man3/pcre2_match_data_free.3.gz index 0d5769b50d3..d16b50815c1 100644 Binary files a/mingw64/share/man/man3/pcre2_match_data_free.3.gz and b/mingw64/share/man/man3/pcre2_match_data_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_pattern_convert.3.gz b/mingw64/share/man/man3/pcre2_pattern_convert.3.gz index 51cec7d92eb..fa281abaa5a 100644 Binary files a/mingw64/share/man/man3/pcre2_pattern_convert.3.gz and b/mingw64/share/man/man3/pcre2_pattern_convert.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_pattern_info.3.gz b/mingw64/share/man/man3/pcre2_pattern_info.3.gz index 591aadd37d8..77adf99495b 100644 Binary files a/mingw64/share/man/man3/pcre2_pattern_info.3.gz and b/mingw64/share/man/man3/pcre2_pattern_info.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_serialize_decode.3.gz b/mingw64/share/man/man3/pcre2_serialize_decode.3.gz index ee99586dc5b..94dc8db34ac 100644 Binary files a/mingw64/share/man/man3/pcre2_serialize_decode.3.gz and b/mingw64/share/man/man3/pcre2_serialize_decode.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_serialize_encode.3.gz b/mingw64/share/man/man3/pcre2_serialize_encode.3.gz index 0aebb74aeba..ef83c5158ba 100644 Binary files a/mingw64/share/man/man3/pcre2_serialize_encode.3.gz and b/mingw64/share/man/man3/pcre2_serialize_encode.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_serialize_free.3.gz b/mingw64/share/man/man3/pcre2_serialize_free.3.gz index 35e72657a5c..1b893458bf1 100644 Binary files a/mingw64/share/man/man3/pcre2_serialize_free.3.gz and b/mingw64/share/man/man3/pcre2_serialize_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz b/mingw64/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz index 2f5e616a614..d0f6e346a7f 
100644 Binary files a/mingw64/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz and b/mingw64/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_bsr.3.gz b/mingw64/share/man/man3/pcre2_set_bsr.3.gz index edc42c33a2e..f886a47c949 100644 Binary files a/mingw64/share/man/man3/pcre2_set_bsr.3.gz and b/mingw64/share/man/man3/pcre2_set_bsr.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_callout.3.gz b/mingw64/share/man/man3/pcre2_set_callout.3.gz index 10c85f78b1c..a28a8719d86 100644 Binary files a/mingw64/share/man/man3/pcre2_set_callout.3.gz and b/mingw64/share/man/man3/pcre2_set_callout.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_character_tables.3.gz b/mingw64/share/man/man3/pcre2_set_character_tables.3.gz index ef705eabf76..7b5c22c7b08 100644 Binary files a/mingw64/share/man/man3/pcre2_set_character_tables.3.gz and b/mingw64/share/man/man3/pcre2_set_character_tables.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_compile_extra_options.3.gz b/mingw64/share/man/man3/pcre2_set_compile_extra_options.3.gz index f0b6532da11..c5b68d4fffe 100644 Binary files a/mingw64/share/man/man3/pcre2_set_compile_extra_options.3.gz and b/mingw64/share/man/man3/pcre2_set_compile_extra_options.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_compile_recursion_guard.3.gz b/mingw64/share/man/man3/pcre2_set_compile_recursion_guard.3.gz index be84b852dfa..0a0f31d308c 100644 Binary files a/mingw64/share/man/man3/pcre2_set_compile_recursion_guard.3.gz and b/mingw64/share/man/man3/pcre2_set_compile_recursion_guard.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_depth_limit.3.gz b/mingw64/share/man/man3/pcre2_set_depth_limit.3.gz index 23431f3bee0..63f86a671ce 100644 Binary files a/mingw64/share/man/man3/pcre2_set_depth_limit.3.gz and b/mingw64/share/man/man3/pcre2_set_depth_limit.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_glob_escape.3.gz 
b/mingw64/share/man/man3/pcre2_set_glob_escape.3.gz index 7f633e4d483..b9686a7d52c 100644 Binary files a/mingw64/share/man/man3/pcre2_set_glob_escape.3.gz and b/mingw64/share/man/man3/pcre2_set_glob_escape.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_glob_separator.3.gz b/mingw64/share/man/man3/pcre2_set_glob_separator.3.gz index 5b40b6d42c9..ef3c380e2f5 100644 Binary files a/mingw64/share/man/man3/pcre2_set_glob_separator.3.gz and b/mingw64/share/man/man3/pcre2_set_glob_separator.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_heap_limit.3.gz b/mingw64/share/man/man3/pcre2_set_heap_limit.3.gz index 059d9882713..b1f60913032 100644 Binary files a/mingw64/share/man/man3/pcre2_set_heap_limit.3.gz and b/mingw64/share/man/man3/pcre2_set_heap_limit.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_match_limit.3.gz b/mingw64/share/man/man3/pcre2_set_match_limit.3.gz index 2949258558d..9a4487ee5f4 100644 Binary files a/mingw64/share/man/man3/pcre2_set_match_limit.3.gz and b/mingw64/share/man/man3/pcre2_set_match_limit.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz b/mingw64/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz index 22b23a35e17..e6b0799660f 100644 Binary files a/mingw64/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz and b/mingw64/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_max_pattern_length.3.gz b/mingw64/share/man/man3/pcre2_set_max_pattern_length.3.gz index 2c7cc625081..73b8724f64d 100644 Binary files a/mingw64/share/man/man3/pcre2_set_max_pattern_length.3.gz and b/mingw64/share/man/man3/pcre2_set_max_pattern_length.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_max_varlookbehind.3.gz b/mingw64/share/man/man3/pcre2_set_max_varlookbehind.3.gz index b18f39edaf3..7da16360947 100644 Binary files a/mingw64/share/man/man3/pcre2_set_max_varlookbehind.3.gz and 
b/mingw64/share/man/man3/pcre2_set_max_varlookbehind.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_newline.3.gz b/mingw64/share/man/man3/pcre2_set_newline.3.gz index c7b6911659e..779465b9227 100644 Binary files a/mingw64/share/man/man3/pcre2_set_newline.3.gz and b/mingw64/share/man/man3/pcre2_set_newline.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_offset_limit.3.gz b/mingw64/share/man/man3/pcre2_set_offset_limit.3.gz index 84f80d2cf97..d3202861471 100644 Binary files a/mingw64/share/man/man3/pcre2_set_offset_limit.3.gz and b/mingw64/share/man/man3/pcre2_set_offset_limit.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_optimize.3.gz b/mingw64/share/man/man3/pcre2_set_optimize.3.gz new file mode 100644 index 00000000000..3edd2ad258d Binary files /dev/null and b/mingw64/share/man/man3/pcre2_set_optimize.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_parens_nest_limit.3.gz b/mingw64/share/man/man3/pcre2_set_parens_nest_limit.3.gz index 45222f17ed4..8135f86d958 100644 Binary files a/mingw64/share/man/man3/pcre2_set_parens_nest_limit.3.gz and b/mingw64/share/man/man3/pcre2_set_parens_nest_limit.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_recursion_limit.3.gz b/mingw64/share/man/man3/pcre2_set_recursion_limit.3.gz index 1b36c8a9930..30af4aed7bb 100644 Binary files a/mingw64/share/man/man3/pcre2_set_recursion_limit.3.gz and b/mingw64/share/man/man3/pcre2_set_recursion_limit.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_recursion_memory_management.3.gz b/mingw64/share/man/man3/pcre2_set_recursion_memory_management.3.gz index abf99ac387f..675b3c11476 100644 Binary files a/mingw64/share/man/man3/pcre2_set_recursion_memory_management.3.gz and b/mingw64/share/man/man3/pcre2_set_recursion_memory_management.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_substitute_callout.3.gz b/mingw64/share/man/man3/pcre2_set_substitute_callout.3.gz index 894bcb508a1..355e05fa15c 100644 Binary files 
a/mingw64/share/man/man3/pcre2_set_substitute_callout.3.gz and b/mingw64/share/man/man3/pcre2_set_substitute_callout.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_set_substitute_case_callout.3.gz b/mingw64/share/man/man3/pcre2_set_substitute_case_callout.3.gz new file mode 100644 index 00000000000..337b0e12190 Binary files /dev/null and b/mingw64/share/man/man3/pcre2_set_substitute_case_callout.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substitute.3.gz b/mingw64/share/man/man3/pcre2_substitute.3.gz index db19673b270..22df41e3e5b 100644 Binary files a/mingw64/share/man/man3/pcre2_substitute.3.gz and b/mingw64/share/man/man3/pcre2_substitute.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_copy_byname.3.gz b/mingw64/share/man/man3/pcre2_substring_copy_byname.3.gz index 610aa32f5c2..2ceb33380ab 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_copy_byname.3.gz and b/mingw64/share/man/man3/pcre2_substring_copy_byname.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_copy_bynumber.3.gz b/mingw64/share/man/man3/pcre2_substring_copy_bynumber.3.gz index ba365db4ebd..692696a4e5b 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_copy_bynumber.3.gz and b/mingw64/share/man/man3/pcre2_substring_copy_bynumber.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_free.3.gz b/mingw64/share/man/man3/pcre2_substring_free.3.gz index 142293639aa..f3ce3fbe38f 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_free.3.gz and b/mingw64/share/man/man3/pcre2_substring_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_get_byname.3.gz b/mingw64/share/man/man3/pcre2_substring_get_byname.3.gz index f07e76be580..2b84e593061 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_get_byname.3.gz and b/mingw64/share/man/man3/pcre2_substring_get_byname.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_get_bynumber.3.gz 
b/mingw64/share/man/man3/pcre2_substring_get_bynumber.3.gz index d49ab61f7e0..48da30b75af 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_get_bynumber.3.gz and b/mingw64/share/man/man3/pcre2_substring_get_bynumber.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_length_byname.3.gz b/mingw64/share/man/man3/pcre2_substring_length_byname.3.gz index e44474c9fb1..5d0beaff8af 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_length_byname.3.gz and b/mingw64/share/man/man3/pcre2_substring_length_byname.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_length_bynumber.3.gz b/mingw64/share/man/man3/pcre2_substring_length_bynumber.3.gz index aea0ec85261..cb79fb1dc33 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_length_bynumber.3.gz and b/mingw64/share/man/man3/pcre2_substring_length_bynumber.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_list_free.3.gz b/mingw64/share/man/man3/pcre2_substring_list_free.3.gz index 34f3227dfcb..3d84e95e4d5 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_list_free.3.gz and b/mingw64/share/man/man3/pcre2_substring_list_free.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_list_get.3.gz b/mingw64/share/man/man3/pcre2_substring_list_get.3.gz index afa4dd67106..f14794a0266 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_list_get.3.gz and b/mingw64/share/man/man3/pcre2_substring_list_get.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_nametable_scan.3.gz b/mingw64/share/man/man3/pcre2_substring_nametable_scan.3.gz index 31bc29d7964..ac2e7891cb8 100644 Binary files a/mingw64/share/man/man3/pcre2_substring_nametable_scan.3.gz and b/mingw64/share/man/man3/pcre2_substring_nametable_scan.3.gz differ diff --git a/mingw64/share/man/man3/pcre2_substring_number_from_name.3.gz b/mingw64/share/man/man3/pcre2_substring_number_from_name.3.gz index ed0ce752243..77987e8f067 100644 Binary files 
a/mingw64/share/man/man3/pcre2_substring_number_from_name.3.gz and b/mingw64/share/man/man3/pcre2_substring_number_from_name.3.gz differ diff --git a/mingw64/share/man/man3/pcre2api.3.gz b/mingw64/share/man/man3/pcre2api.3.gz index 6cebacbacd8..a5b2415b524 100644 Binary files a/mingw64/share/man/man3/pcre2api.3.gz and b/mingw64/share/man/man3/pcre2api.3.gz differ diff --git a/mingw64/share/man/man3/pcre2build.3.gz b/mingw64/share/man/man3/pcre2build.3.gz index 8b06eaee96c..c6ffcd91b64 100644 Binary files a/mingw64/share/man/man3/pcre2build.3.gz and b/mingw64/share/man/man3/pcre2build.3.gz differ diff --git a/mingw64/share/man/man3/pcre2callout.3.gz b/mingw64/share/man/man3/pcre2callout.3.gz index a8f9c712c8a..0063d728758 100644 Binary files a/mingw64/share/man/man3/pcre2callout.3.gz and b/mingw64/share/man/man3/pcre2callout.3.gz differ diff --git a/mingw64/share/man/man3/pcre2compat.3.gz b/mingw64/share/man/man3/pcre2compat.3.gz index dc51b2f2eeb..f6fed552b6c 100644 Binary files a/mingw64/share/man/man3/pcre2compat.3.gz and b/mingw64/share/man/man3/pcre2compat.3.gz differ diff --git a/mingw64/share/man/man3/pcre2convert.3.gz b/mingw64/share/man/man3/pcre2convert.3.gz index 346ffa364a4..f654786e3ba 100644 Binary files a/mingw64/share/man/man3/pcre2convert.3.gz and b/mingw64/share/man/man3/pcre2convert.3.gz differ diff --git a/mingw64/share/man/man3/pcre2demo.3.gz b/mingw64/share/man/man3/pcre2demo.3.gz index de857078189..109e0ef15c0 100644 Binary files a/mingw64/share/man/man3/pcre2demo.3.gz and b/mingw64/share/man/man3/pcre2demo.3.gz differ diff --git a/mingw64/share/man/man3/pcre2jit.3.gz b/mingw64/share/man/man3/pcre2jit.3.gz index 3a5cecc7368..5774b3e549e 100644 Binary files a/mingw64/share/man/man3/pcre2jit.3.gz and b/mingw64/share/man/man3/pcre2jit.3.gz differ diff --git a/mingw64/share/man/man3/pcre2limits.3.gz b/mingw64/share/man/man3/pcre2limits.3.gz index dd7f0bde3fe..7fe735473f7 100644 Binary files a/mingw64/share/man/man3/pcre2limits.3.gz and 
b/mingw64/share/man/man3/pcre2limits.3.gz differ diff --git a/mingw64/share/man/man3/pcre2matching.3.gz b/mingw64/share/man/man3/pcre2matching.3.gz index e14317961ce..40e98bed373 100644 Binary files a/mingw64/share/man/man3/pcre2matching.3.gz and b/mingw64/share/man/man3/pcre2matching.3.gz differ diff --git a/mingw64/share/man/man3/pcre2partial.3.gz b/mingw64/share/man/man3/pcre2partial.3.gz index 14768729058..cbb76c1ce1b 100644 Binary files a/mingw64/share/man/man3/pcre2partial.3.gz and b/mingw64/share/man/man3/pcre2partial.3.gz differ diff --git a/mingw64/share/man/man3/pcre2pattern.3.gz b/mingw64/share/man/man3/pcre2pattern.3.gz index 46ad89b46b1..e24291911eb 100644 Binary files a/mingw64/share/man/man3/pcre2pattern.3.gz and b/mingw64/share/man/man3/pcre2pattern.3.gz differ diff --git a/mingw64/share/man/man3/pcre2perform.3.gz b/mingw64/share/man/man3/pcre2perform.3.gz index 745c1a602a1..9e11a4bf132 100644 Binary files a/mingw64/share/man/man3/pcre2perform.3.gz and b/mingw64/share/man/man3/pcre2perform.3.gz differ diff --git a/mingw64/share/man/man3/pcre2posix.3.gz b/mingw64/share/man/man3/pcre2posix.3.gz index c108ca4e276..41e335f1370 100644 Binary files a/mingw64/share/man/man3/pcre2posix.3.gz and b/mingw64/share/man/man3/pcre2posix.3.gz differ diff --git a/mingw64/share/man/man3/pcre2sample.3.gz b/mingw64/share/man/man3/pcre2sample.3.gz index 87b280f9a50..669f77cd328 100644 Binary files a/mingw64/share/man/man3/pcre2sample.3.gz and b/mingw64/share/man/man3/pcre2sample.3.gz differ diff --git a/mingw64/share/man/man3/pcre2serialize.3.gz b/mingw64/share/man/man3/pcre2serialize.3.gz index 709a1656f41..3069e1b3bc9 100644 Binary files a/mingw64/share/man/man3/pcre2serialize.3.gz and b/mingw64/share/man/man3/pcre2serialize.3.gz differ diff --git a/mingw64/share/man/man3/pcre2syntax.3.gz b/mingw64/share/man/man3/pcre2syntax.3.gz index 6c1976014f3..c9d5d878ed0 100644 Binary files a/mingw64/share/man/man3/pcre2syntax.3.gz and b/mingw64/share/man/man3/pcre2syntax.3.gz 
differ diff --git a/mingw64/share/man/man3/pcre2unicode.3.gz b/mingw64/share/man/man3/pcre2unicode.3.gz index 8d153f0dc78..ade2ccaa32d 100644 Binary files a/mingw64/share/man/man3/pcre2unicode.3.gz and b/mingw64/share/man/man3/pcre2unicode.3.gz differ diff --git a/usr/bin/msys-pcre2-16-0.dll b/usr/bin/msys-pcre2-16-0.dll index 263c1a22426..ad83726e46d 100644 Binary files a/usr/bin/msys-pcre2-16-0.dll and b/usr/bin/msys-pcre2-16-0.dll differ diff --git a/usr/bin/msys-pcre2-32-0.dll b/usr/bin/msys-pcre2-32-0.dll index 1815c8e9e9d..b56d822133b 100644 Binary files a/usr/bin/msys-pcre2-32-0.dll and b/usr/bin/msys-pcre2-32-0.dll differ diff --git a/usr/bin/msys-pcre2-8-0.dll b/usr/bin/msys-pcre2-8-0.dll index 3cf89e609fd..595777ad6ea 100644 Binary files a/usr/bin/msys-pcre2-8-0.dll and b/usr/bin/msys-pcre2-8-0.dll differ diff --git a/usr/bin/msys-pcre2-posix-3.dll b/usr/bin/msys-pcre2-posix-3.dll index e02e6dc3eb3..06412e755e3 100644 Binary files a/usr/bin/msys-pcre2-posix-3.dll and b/usr/bin/msys-pcre2-posix-3.dll differ diff --git a/usr/bin/pcre2grep.exe b/usr/bin/pcre2grep.exe index 2994afbf385..36e3bbb1f5d 100644 Binary files a/usr/bin/pcre2grep.exe and b/usr/bin/pcre2grep.exe differ diff --git a/usr/bin/pcre2test.exe b/usr/bin/pcre2test.exe index d6de057d2a8..c0576e804c8 100644 Binary files a/usr/bin/pcre2test.exe and b/usr/bin/pcre2test.exe differ diff --git a/usr/share/doc/pcre2/AUTHORS b/usr/share/doc/pcre2/AUTHORS deleted file mode 100644 index 9669f7755ad..00000000000 --- a/usr/share/doc/pcre2/AUTHORS +++ /dev/null @@ -1,36 +0,0 @@ -THE MAIN PCRE2 LIBRARY CODE ---------------------------- - -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com - -Retired from University of Cambridge Computing Service, -Cambridge, England. 
- -Copyright (c) 1997-2024 University of Cambridge -All rights reserved - - -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Emain domain: freemail.hu - -Copyright(c) 2010-2024 Zoltan Herczeg -All rights reserved. - - -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- - -Written by: Zoltan Herczeg -Email local part: hzmester -Emain domain: freemail.hu - -Copyright(c) 2009-2024 Zoltan Herczeg -All rights reserved. - -#### diff --git a/usr/share/doc/pcre2/AUTHORS.md b/usr/share/doc/pcre2/AUTHORS.md new file mode 100644 index 00000000000..708fc2325ce --- /dev/null +++ b/usr/share/doc/pcre2/AUTHORS.md @@ -0,0 +1,200 @@ +PCRE2 Authorship and Contributors +================================= + +COPYRIGHT +--------- + +Please see the file [LICENCE](./LICENCE.md) in the PCRE2 distribution for +copyright details. + + +MAINTAINERS +----------- + +The PCRE and PCRE2 libraries were authored and maintained by Philip Hazel. + +Since 2024, the contributors with administrator access to the project are now +Nicholas Wilson and Zoltán Herczeg. See the file [SECURITY](./SECURITY.md) for +GPG keys. + +Both administrators are volunteers acting in a personal capacity. + + + + + + + + + + + + + + + + + + +
    NameRole
    + + Nicholas Wilson
    + `nicholas@nicholaswilson.me.uk`
    + Currently of Microsoft Research Cambridge, UK + +
    + + * General project administration & maintenance + * Release management + * Code maintenance + +
    + + Zoltán Herczeg
    + `hzmester@freemail.hu`
    + Currently of the University of Szeged, Hungary + +
    + + * Code maintenance + * Ownership of `sljit` and PCRE2's JIT + +
    + + +CONTRIBUTORS +------------ + +Many others have participated and contributed to PCRE2 over its history. + +The maintainers are grateful for all contributions and participation over the +years. We apologise for any names we have forgotten. + +We are especially grateful to Philip Hazel, creator of PCRE and PCRE2, and +maintainer from 1997 to 2024. + +All names listed alphabetically. + +### Contributors to PCRE2 + +This list includes names up until the PCRE2 10.44 release. New names will be +added from the Git history on each release. + + Scott Bell + Carlo Marcelo Arenas Belón + Edward Betts + Jan-Willem Blokland + Ross Burton + Dmitry Cherniachenko + Alexey Chupahin + Jessica Clarke + Alejandro Colomar + Jeremie Courreges-Anglas + Addison Crump + Alex Dowad + Daniel Engberg + Daniel Richard G + David Gaussmann + Andrey Gorbachev + Jordan Griege + Jason Hood + Bumsu Hyeon + Roy Ivy + Martin Joerg + Guillem Jover + Ralf Junker + Ayesh Karunaratne + Michael Kaufmann + Yunho Kim + Joshua Kinard + David Korczynski + Uwe Korn + Jonas Kvinge + Kristian Larsson + Kai Lu + Behzod Mansurov + B. Scott Michel + Nathan Moinvaziri + Mike Munday + Marc Mutz + Fabio Pagani + Christian Persch + Tristan Ross + William A Rowe Jr + David Seifert + Yaakov Selkowitz + Rich Siegel + Karl Skomski + Maciej Sroczyński + Wolfgang Stöggl + Thomas Tempelmann + Greg Thain + Lucas Trzesniewski + Theodore Tsirpanis + Matthew Vernon + Rémi Verschelde + Thomas Voss + Ezekiel Warren + Carl Weaver + Chris Wilson + Amin Yahyaabadi + Joe Zhang + +### Contributors to PCRE1 + +These people contributed either by sending patches or reporting serious issues. 
+ + Irfan Adilovic + Alexander Barkov + Daniel Bergström + David Burgess + Ross Burton + David Byron + Fred Cox + Christian Ehrlicher + Tom Fortmann + Lionel Fourquaux + Mike Frysinger + Daniel Richard G + Dair Gran + "Graycode" (Red Hat Product Security) + Viktor Griph + Wen Guanxing + Robin Houston + Martin Jerabek + Peter Kankowski + Stephen Kelly + Yunho Kim + Joshua Kinard + Carsten Klein + Evgeny Kotkov + Ronald Landheer-Cieslak + Alan Lehotsky + Dmitry V. Levin + Nuno Lopes + Kai Lu + Giuseppe Maxia + Dan Mooney + Marc Mutz + Markus Oberhumer + Sheri Pierce + Petr Pisar + Ari Pollak + Bob Rossi + Ruiger Rill + Michael Shigorin + Rich Siegel + Craig Silverstein (C++ wrapper) + Karl Skomski + Paul Sokolovsky + Stan Switzer + Ian Taylor + Mark Tetrode + Jeff Trawick + Steven Van Ingelgem + Lawrence Velazquez + Jiong Wang + Stefan Weber + Chris Wilson + +Thanks go to Jeffrey Friedl for testing and debugging assistance. diff --git a/usr/share/doc/pcre2/ChangeLog b/usr/share/doc/pcre2/ChangeLog index ea228c193f7..5217d078599 100644 --- a/usr/share/doc/pcre2/ChangeLog +++ b/usr/share/doc/pcre2/ChangeLog @@ -4,6 +4,194 @@ Change Log for PCRE2 Before the move to GitHub, this was the only record of changes to PCRE2. Now there is also the log of commit messages. +Internal changes which are not visible to clients of the library are mostly not +listed here. + +Version 10.45 05-February-2025 +------------------------------ + +1. (#418) Change 6 of 10.44 broke 32-bit tests because pcre2test's reporting of +memory size was changed to the entire compiled data block, instead of just the +pattern and tables data, so as to align with the new length restriction. +Because the block's header contains pointers, this meant the pcre2test output +was different in 32-bit mode. A patch by Carlo reverts to the previous state +and makes sure that any limit set by pcre2_set_max_pattern_compiled_length() +also avoids the internal struct overhead. + +2. (#416, #622) Updates to build.zig. 
+ +3. (#427, et al.) Various fixes to pacify static analyzers. + +4. (#428) Add --posix-pattern-file to pcre2grep to allow processing of empty +patterns through the -f option, as well as patterns that end in space +characters, for compatibility with other grep tools. + +5. (4fa5b8bd) Fix a bug in the fuzz support quantifier-limiting code. It ignores +strings of more than 5 digits because they are necessarily numbers greater than +65535, the largest legal quantifier. However, it wasn't ignoring non-significant +leading zeros. + +6. (6d82f0cd) The case-independent processing of the letter-matching Unicode +properties Ll, Lt, and Lu has been changed to match Perl (which changed a while +ago). When caseless matching is in force, all three of these properties are now +treated as Lc (cased letter). + +7. (#433) The pcre2_jit_compile() function was updated by the addition of a new +option PCRE2_JIT_TEST_ALLOC which, if called with a NULL first argument, tests +not only the availability of JIT, but also its ability to allocate executable +memory. Update pcre2test to use this support to extend the -C option. + +8. (75b1025a) The code for parsing Unicode property descriptions for \p and \P +has been changed as follows: + + . White space etc. before ^ in a negated value such as \p{ ^L } was not being + ignored. + + . The code wouldn't have worked if PCRE2 was compiled for UTF-8 support + within an EBCDIC environment. Possibly nobody does this any more, but it + should now work. + + . The documentation of the syntax of what can follow \p and \P has been + updated. + +9. (1c24ba01) There was an error in the table of lengths for parsed items for +the OPTIONS item, but fortuitously it could never have actually bitten. While +fixing this, some other code that could never be obeyed was discovered and +removed. + +10.
(674b6640) Removed some incorrect optimization code from DFA matching that +has been there since PCRE1, but has just been found to cause a no match return +instead of a partial match in some cases. It involves partial matching when (*F) +is present so is unlikely to have actually affected anyone. + +11. (b0f4ac17) Tidy the wording and formatting of some pcre2test error messages +concerned with bad modifiers. Also restrict single-letter modifier sequences to +the first item in a modifier list, as documented and always intended. + +12. (1415565c) An iterator at the end of many assertions can always be +auto-possessified, but not at the end of variable-length lookbehinds. There was +a bug in the code that checks for such a lookbehind; it was looking only at the +first branch, which is wrong because some branches can be fixed length when +others are not, for example (?<=AB|CD?). Now all branches are checked for +variability. + +13. (ead08288) Matching with pcre2_match() could give an incorrect result if a +variable-length lookbehind was used as the condition in a conditional group. +The condition could erroneously be treated as true if a branch matched but +overran the current position. This bug was in the interpreter only; matching +with JIT was correct. + +14. (#443) Split out the sljit sub-project into a "Git submodule". Git users +must now run `git submodule init; git submodule update` after a Git checkout, or +the build will fail due to missing files in deps/sljit. + +15. (#441) Add a new error code (PCRE2_ERROR_JIT_UNSUPPORTED) which is yielded +for unsupported jit features. + +16. (#444) Fix bug in 'first code unit' and 'last code unit' optimization +combined with lookahead assertions. + +17. (#445, #447, #449, #451, #452, #459, #563) Add a new feature called scan +substring. This feature is a new type of assertion which matches the content of +a capturing block to a sub-pattern. + +18.
(#450) Improvements to 'first code unit' / 'starting code units' +optimisation. + +19. (#455) Many, many improvements to the JIT compiler. + +20. Item 43 of 10.43 was incomplete because it addressed only \z and not \Z, +which was still misbehaving when matching fragments inside invalid UTF strings. + +21. (d29e7290) Octal escapes of the form \045 or \111 were not being recognized +in substitution strings, and if encountered gave an error, though the \o{...} +form was recognized. This bug is now fixed. + +22. (#463, #487) Fix 1 byte out-of-bounds read when parsing malformed limits +(e.g. LIMIT_HEAP) + +23. Many improvements to test infrastructure. Many more platforms and +configurations are now run in Continuous Integration, and all the platforms now +run the full test suite, rather than a partial subset. + +24. (#475) Implement title casing in substitution strings using Perl syntax. + +25. (#478, #504) Disallow \x if not followed by { or a hex digit. + +26. (#473) Implements Python-style backrefs in substitutions. + +27. (#472) Fix error reporting for certain over-large octal escapes. + +28. (#482) Fix parsing of named captures in replacement strings, allowing +non-ASCII capture names to be used. + +29. (#477, #474, #488, #494, #496, #506, #508, #511, #518, #524, #540) Many +improvements to parsing and optimising of character classes. + +30. (#483, #498) Add support for \g<NAME> and $<NAME> to replacement strings. + +31. (#470) Add option flags PCRE2_EXTRA_NO_BS0 and PCRE2_EXTRA_PYTHON_OCTAL. + +32. (#471) Add new API function pcre2_set_optimize() for controlling which +optimizations are enabled. + +33. (#491) Adds $& $` $' and $_ to substitution replacements, as well as +interpreting \b and \v as characters. + +34. (#499) Add option PCRE2_EXTRA_NEVER_CALLOUT to disable callouts. + +35. (#503, #513) Update Unicode support to UCD 16. + +36.
(#512, #618, #638) Add new function pcre2_set_substitute_case_callout() to +allow clients to provide a custom callback with locale-aware case +transformation. + +37. (#516) Fix case-insensitive matching of backreferences when using the +PCRE2_EXTRA_CASELESS_RESTRICT option. + +38. (#519) In pcre2grep, add $& as an alias for $0 + +39. (c9bf8339, #534) Updated perltest.sh to enable locale setting. + +40. (#521) Add support for Turkish I casefolding, using new options +PCRE2_EXTRA_TURKISH_CASING, and added pre-pattern flags (*TURKISH_CASING) and +(*CASELESS_RESTRICT). + +41. (#523, #546, #547) Add support for UTS#18 compatible character classes, +using the new option PCRE2_ALT_EXTENDED_CLASS. This adds '[' as a metacharacter +within character classes and the operators '&&', '--' and '~~', allowing +subtractions and intersections of character classes to be easily expressed. + +42. (#553, #586, #596, #597) Add support for Perl-style extended character +classes, using the syntax (?[...]). This also allows expressing subtractions and +intersections of character classes, but using a different syntax to UTS#18. + +43. (#554) Fixed a bug in JIT affecting greedy bounded repeats. The upper limit +of repeats inside a repeated bracket might be incorrectly checked. + +44. (#556) Fixed a bug in JIT affecting caseful matching of backreferences. When +utf is disabled, and dupnames is enabled, caseless matching was used even +if caseful matching was needed. + +45. (f34fc0a3) Fixed a bug in pcre2grep reported by Alejandro Colomar + (GitHub issue #577). In certain cases, when lines of above and +below context were contiguous, a separator line was incorrectly being inserted. + +46. (#594) Fix a small (one/two byte) out-of-bounds read on invalid UTF-8 input +in pcre2grep. + +47. (#370) Fix the INSTALL_MSVC_PDB CMake flag. + +48. (#366) Install cmake files in prefix/lib/cmake/pcre2 rather than +prefix/cmake. The new CMake flag PCRE2_INSTALL_CMAKEDIR allows customising this +location. 
+ +49. (#624, #626, #628, #632, #639, #641) Reduce code size of generated JIT code +for repeated character classes. + +50. (#623) Update the Bazel build files. + + Version 10.44 07-June-2024 -------------------------- diff --git a/usr/share/licenses/pcre2/LICENSE b/usr/share/doc/pcre2/LICENCE.md similarity index 55% rename from usr/share/licenses/pcre2/LICENSE rename to usr/share/doc/pcre2/LICENCE.md index 3c1ef032dec..f58ceb75a63 100644 --- a/usr/share/licenses/pcre2/LICENSE +++ b/usr/share/doc/pcre2/LICENCE.md @@ -1,5 +1,8 @@ -PCRE2 LICENCE -------------- +PCRE2 License +============= + +| SPDX-License-Identifier: | BSD-3-Clause WITH PCRE2-exception | +|---------|-------| PCRE2 is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. @@ -16,40 +19,46 @@ optimize pattern matching. This is an optional feature that can be omitted when the library is built. -THE BASIC LIBRARY FUNCTIONS ---------------------------- +COPYRIGHT +--------- + +### The basic library functions -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com + Written by: Philip Hazel + Email local part: Philip.Hazel + Email domain: gmail.com -Retired from University of Cambridge Computing Service, -Cambridge, England. + Retired from University of Cambridge Computing Service, + Cambridge, England. -Copyright (c) 1997-2024 University of Cambridge -All rights reserved. + Copyright (c) 1997-2007 University of Cambridge + Copyright (c) 2007-2024 Philip Hazel + All rights reserved. +### PCRE2 Just-In-Time compilation support -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- + Written by: Zoltan Herczeg + Email local part: hzmester + Email domain: freemail.hu -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu + Copyright (c) 2010-2024 Zoltan Herczeg + All rights reserved. -Copyright(c) 2010-2024 Zoltan Herczeg -All rights reserved. 
+### Stack-less Just-In-Time compiler + Written by: Zoltan Herczeg + Email local part: hzmester + Email domain: freemail.hu -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- + Copyright (c) 2009-2024 Zoltan Herczeg + All rights reserved. -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu +### All other contributions -Copyright(c) 2009-2024 Zoltan Herczeg -All rights reserved. +Many other contributors have participated in the authorship of PCRE2. As PCRE2 +has never required a Contributor Licensing Agreement, or other copyright +assignment agreement, all contributions have copyright retained by each +original contributor or their employer. THE "BSD" LICENCE @@ -58,16 +67,16 @@ THE "BSD" LICENCE Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notices, - this list of conditions and the following disclaimer. +* Redistributions of source code must retain the above copyright notices, + this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notices, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. +* Redistributions in binary form must reproduce the above copyright + notices, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. - * Neither the name of the University of Cambridge nor the names of any - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. +* Neither the name of the University of Cambridge nor the names of any + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/usr/share/doc/pcre2/NEWS b/usr/share/doc/pcre2/NEWS index 5f8dde35406..4b5ec1e5103 100644 --- a/usr/share/doc/pcre2/NEWS +++ b/usr/share/doc/pcre2/NEWS @@ -1,6 +1,92 @@ News about PCRE2 releases ------------------------- +Version 10.45 05-February-2025 +------------------------------ + +This is a comparatively large release, incorporating new features, some +bugfixes, and a few changes with slight backwards compatibility implications. +Please see the ChangeLog and Git log for further details. + +Only changes to behaviour, changes to the API, and major changes to the pattern +syntax are described here. + +This release is the first to be available as a (signed) Git tag, or +alternatively as a (signed) tarball of the Git tag. + +This is also the first release to be made by the new maintainers of PCRE2, and +we would like to thank Philip Hazel, creator and maintainer of PCRE and PCRE2. + +* (Git change) The sljit project has been split out into a separate Git + repository. Git users must now run `git submodule init; git submodule update` + after a Git checkout. + +* (Behaviour change) Update Unicode support to UCD 16. + +* (Match behaviour change) Case-insensitive matching of Unicode properties + Ll, Lt, and Lu has been changed to match Perl. Previously, /\p{Ll}/i would + match only lower-case characters (even though case-insensitive matching was + specified). This also affects case-insensitive matching of POSIX classes such + as [:lower:]. + +* (Minor match behaviour change) Case-insensitive matching of backreferences now + respects the PCRE2_EXTRA_CASELESS_RESTRICT option. + +* (Minor pattern syntax change) Parsing of the \x escape is stricter, and is + no longer parsed as an escape for the NUL character if not followed by '{' or + a hexadecimal digit. Use \x00 instead. 
+ +* (Major new feature) Add a new feature called scan substring. This is a new + type of assertion which matches the content of a capturing block to a + sub-pattern. + + Example: to find a word that contains the rare (in English) sequence of + letters "rh" not at the start: + + \b(\w++)(*scan_substring:(1).+rh) + + The first group captures a word which is then scanned by the + (*scan_substring:(1) ... ) assertion, which tests whether the pattern ".+rh" + matches the capture group "(1)". + +* (Major new feature) Add support for UTS#18 compatible character classes, + using the new option PCRE2_ALT_EXTENDED_CLASS. This adds '[' as a + metacharacter within character classes and the operators '&&', '--' and '~~', + allowing subtractions and intersections of character classes to be easily + expressed. + + Example: to match Thai or Greek letters (but not letters or other characters + in those scripts), use [\p{L}&&[\p{Thai}||\p{Greek}]]. + +* (Major new feature) Add support for Perl-style extended character classes, + using the syntax (?[...]). This also allows expressing subtractions and + intersections of character classes, but using a different syntax to UTS#18. + + Example: to match Thai or Greek letters (but not letters or other characters + in those scripts), use (?[\p{L} & (\p{Thai} + \p{Greek})]). + +* (Minor feature) Significant improvements to the character class match engine. + Compiled character classes are now more compact, and have faster matching + for large or complex character sets, using binary search through the set. + +* JIT compilation now fails with the new error code PCRE2_ERROR_JIT_UNSUPPORTED + for patterns which use features not supported by the JIT compiler. 
+ +* (Minor feature) New options PCRE2_EXTRA_NO_BS0 (disallow \0 as an escape for + the NUL character); PCRE2_EXTRA_PYTHON_OCTAL (use Python disambiguation rules + for deciding whether \12 is a backreference or an octal escape); + PCRE2_EXTRA_NEVER_CALLOUT (disable callout syntax entirely); + PCRE2_EXTRA_TURKISH_CASING (use Turkish rules for case-insensitive matching). + +* (Minor feature) Add new API function pcre2_set_optimize() for controlling + which optimizations are enabled. + +* (Minor new features) A variety of extensions have been made to + pcre2_substitute() and its syntax for replacement strings. These now support: + \123 octal escapes; titlecasing \u\L; \1 backreferences; \g<1> and $<NAME> + backreferences; $& $` $' and $_; new function + pcre2_set_substitute_case_callout() to allow locale-aware case transformation. + Version 10.44 07-June-2024 -------------------------- @@ -13,7 +99,7 @@ increased to 128. Some auxiliary files for building under VMS are added. Version 10.43 16-February-2024 ------------------------------ -There are quite a lot of changes in this release (see ChangeLog and git log for +There are quite a lot of changes in this release (see ChangeLog and Git log for a list). Those that are not bugfixes or code tidies are: * The JIT code no longer supports ARMv5 architecture. @@ -52,7 +138,7 @@ a list). Those that are not bugfixes or code tidies are: matches the "fullwidth" versions of hex digits. PCRE2_EXTRA_ASCII_DIGIT can be used to keep it ASCII only. -* Make PCRE2_UCP the default in UTF mode in pcre2grep and add -no_ucp, +* Make PCRE2_UCP the default in UTF mode in pcre2grep and add --no-ucp, --case-restrict and --posix-digit. * Add --group-separator and --no-group-separator to pcre2grep. diff --git a/usr/share/doc/pcre2/README b/usr/share/doc/pcre2/README index dab5e94210b..5a50f7f11b5 100644 --- a/usr/share/doc/pcre2/README +++ b/usr/share/doc/pcre2/README @@ -385,7 +385,7 @@ library. They are also documented in the pcre2build man page.
If this is done, when pcre2test's input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. - Note that libreadline is GPL-licenced, so if you distribute a binary of + Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking with libedit (which has a BSD licence) instead. @@ -411,20 +411,19 @@ library. They are also documented in the pcre2build man page. Instead of %td or %zu, %lu is used, with a cast for size_t values. . There is a special option called --enable-fuzz-support for use by people who - want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit - library. If set, it causes an extra library called libpcre2-fuzzsupport.a to - be built, but not installed. This contains a single function called - LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the - length of the string. When called, this function tries to compile the string - as a pattern, and if that succeeds, to match it. This is done both with no - options and with some random options bits that are generated from the string. - Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to - be created. This is normally run under valgrind or used when PCRE2 is - compiled with address sanitizing enabled. It calls the fuzzing function and - outputs information about what it is doing. The input strings are specified - by arguments: if an argument starts with "=" the rest of it is a literal - input string. Otherwise, it is assumed to be a file name, and the contents - of the file are the test string. + want to run fuzzing tests on PCRE2. If set, it causes an extra library + called libpcre2-fuzzsupport.a to be built, but not installed. This contains + a single function called LLVMFuzzerTestOneInput() whose arguments are a + pointer to a string and the length of the string. 
When called, this function + tries to compile the string as a pattern, and if that succeeds, to match + it. This is done both with no options and with some random options bits that + are generated from the string. Setting --enable-fuzz-support also causes an + executable called pcre2fuzzcheck-{8,16,32} to be created. This is normally + run under valgrind or used when PCRE2 is compiled with address sanitizing + enabled. It calls the fuzzing function and outputs information about what it + is doing. The input strings are specified by arguments: if an argument + starts with "=" the rest of it is a literal input string. Otherwise, it is + assumed to be a file name, and the contents of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for @@ -510,6 +509,7 @@ system. The following are installed (file names are all relative to the LICENCE NEWS README + SECURITY pcre2.txt (a concatenation of the man(3) pages) pcre2test.txt the pcre2test man page pcre2grep.txt the pcre2grep man page @@ -607,8 +607,9 @@ zip formats. The command "make distcheck" does the same, but then does a trial build of the new distribution to ensure that it works. If you have modified any of the man page sources in the doc directory, you -should first run the PrepareRelease script before making a distribution. This -script creates the .txt and HTML forms of the documentation from the man pages. +should first run the maint/PrepareRelease script before making a distribution. +This script creates the .txt and HTML forms of the documentation from the man +pages. Testing PCRE2 @@ -822,37 +823,38 @@ The distribution should contain the files listed below. 
ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_chkdint.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_ucptables.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) + src/pcre2posix.c ) + src/pcre2_auto_possess.c ) + src/pcre2_chkdint.c ) + src/pcre2_compile.c ) + src/pcre2_compile_class.c ) + src/pcre2_config.c ) + src/pcre2_context.c ) + src/pcre2_convert.c ) + src/pcre2_dfa_match.c ) + src/pcre2_error.c ) + src/pcre2_extuni.c ) + src/pcre2_find_bracket.c ) + src/pcre2_jit_compile.c ) + src/pcre2_jit_match.c ) sources for the functions in the library, + src/pcre2_jit_misc.c ) and some internal functions that they use + src/pcre2_maketables.c ) + src/pcre2_match.c ) + src/pcre2_match_data.c ) + src/pcre2_newline.c ) + src/pcre2_ord2utf.c ) + src/pcre2_pattern_info.c ) + src/pcre2_script_run.c ) + src/pcre2_serialize.c ) + src/pcre2_string_utils.c ) + src/pcre2_study.c ) + src/pcre2_substitute.c ) + src/pcre2_substring.c ) + src/pcre2_tables.c ) + src/pcre2_ucd.c ) + src/pcre2_ucptables.c ) + src/pcre2_valid_utf.c ) + src/pcre2_xclass.c ) src/pcre2_printint.c debugging function that is used by pcre2test, src/pcre2_fuzzsupport.c function for (optional) fuzzing support @@ -860,13 
+862,16 @@ The distribution should contain the files listed below. src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API + src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header + src/pcre2_jit_char_inc.h header used by JIT src/pcre2_jit_neon_inc.h header used by JIT src/pcre2_jit_simd_inc.h header used by JIT src/pcre2_ucp.h header for Unicode property handling + src/pcre2_util.h header for internal utils - sljit/* source files for the JIT compiler + deps/sljit/sljit_src/* source files for the JIT compiler (B) Source files for programs that use PCRE2: @@ -878,48 +883,49 @@ The distribution should contain the files listed below. (C) Auxiliary files: - 132html script to turn "man" pages into HTML - AUTHORS information about the author of PCRE2 + AUTHORS.md information about the authors of PCRE2 ChangeLog log of changes to the code - CleanTxt script to clean nroff output for txt man pages - Detrail script to remove trailing spaces HACKING some notes about the internals of PCRE2 INSTALL generic installation instructions - LICENCE conditions for the use of PCRE2 + LICENCE.md conditions for the use of PCRE2 COPYING the same, using GNU's standard name + SECURITY.md information on reporting vulnerabilities Makefile.in ) template for Unix Makefile, which is built by ) "configure" Makefile.am ) the automake input that was used to create ) Makefile.in NEWS important changes in this release NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools - PrepareRelease script to make preparations for "make dist" README this file RunTest a Unix shell script for running tests RunGrepTest a Unix shell script for pcre2grep tests + RunTest.bat a Windows batch file for running tests + RunGrepTest.bat a Windows batch file for pcre2grep tests aclocal.m4 m4 
macros (generated by "aclocal") - config.guess ) files used by libtool, - config.sub ) used only when building a shared library + m4/* m4 macros (used by autoconf) configure a configuring shell script (built by autoconf) configure.ac ) the autoconf input that was used to build ) "configure" and config.h - depcomp ) script to find program dependencies, generated by - ) automake doc/*.3 man page sources for PCRE2 doc/*.1 man page sources for pcre2grep and pcre2test - doc/index.html.src the base HTML page doc/html/* HTML documentation doc/pcre2.txt plain text version of the man pages + doc/pcre2-config.txt plain text documentation of pcre2-config script + doc/pcre2grep.txt plain text documentation of grep utility program doc/pcre2test.txt plain text documentation of test program - install-sh a shell script for installing files libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config - ltmain.sh file used to build a libtool script - missing ) common stub for a few missing GNU programs while - ) installing, generated by automake - mkinstalldirs script for making install directories + ar-lib ) + config.guess ) + config.sub ) + depcomp ) helper tools generated by libtool and + compile ) automake, used internally by ./configure + install-sh ) + ltmain.sh ) + missing ) + test-driver ) perltest.sh Script for running a Perl test program pcre2-config.in source of script which retains PCRE2 information testdata/testinput* test data for main library tests @@ -927,12 +933,13 @@ The distribution should contain the files listed below. 
testdata/grep* input and output for pcre2grep tests testdata/* other supporting test files -(D) Auxiliary files for cmake support +(D) Auxiliary files for CMake support cmake/COPYING-CMAKE-SCRIPTS - cmake/FindPackageHandleStandardArgs.cmake cmake/FindEditline.cmake cmake/FindReadline.cmake + cmake/pcre2-config-version.cmake.in + cmake/pcre2-config.cmake.in CMakeLists.txt config-cmake.h.in @@ -943,14 +950,21 @@ The distribution should contain the files listed below. src/config.h.generic ) a version of config.h for use in non-"configure" ) environments -(F) Auxiliary files for building PCRE2 under OpenVMS +(F) Auxiliary files for building PCRE2 using other build systems + + BUILD.bazel ) + MODULE.bazel ) files used by the Bazel build system + WORKSPACE.bazel ) + build.zig file used by zig's build system + +(G) Auxiliary files for building PCRE2 under OpenVMS vms/configure.com ) vms/openvms_readme.txt ) These files were contributed by a PCRE2 user. vms/pcre2.h_patch ) vms/stdint.h ) -Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com -Last updated: 15 April 2024 +============================== +Last updated: 18 December 2024 +============================== + diff --git a/usr/share/doc/pcre2/SECURITY.md b/usr/share/doc/pcre2/SECURITY.md new file mode 100644 index 00000000000..1e3a05b9aef --- /dev/null +++ b/usr/share/doc/pcre2/SECURITY.md @@ -0,0 +1,44 @@ +# Security policies + +## Release security + +The PCRE2 project provides source-only releases, with no binaries. + +These source releases can be downloaded from the +[GitHub Releases](https://github.com/PCRE2Project/pcre2/releases) page. Each +release file is GPG-signed. 
+ +* Releases up to and including 10.44 are signed by Philip Hazel (GPG key: + 45F68D54BBE23FB3039B46E59766E084FB0F43D8) +* Releases from 10.45 onwards will be signed by Nicholas Wilson (GPG key: + A95536204A3BB489715231282A98E77EB6F24CA8, cross-signed by Philip + Hazel's key for release continuity) + +From releases 10.45 onwards, the source code will additionally be provided via +Git checkout of the (GPG-signed) release tag. + +Please contact the maintainers for any queries about release integrity or the +project's supply-chain. + +## Reporting vulnerabilities + +The PCRE2 project prioritises security. We appreciate third-party testing and +security research, and would be grateful if you could responsibly disclose your +findings to us. We will make every effort to acknowledge your contributions. + +To report a security issue, please use the GitHub Security Advisory +["Report a Vulnerability"](https://github.com/PCRE2Project/pcre2/security/advisories/new) +tab. (Alternatively, if you prefer you may send a GPG-encrypted email to one of +the maintainers.) + +### Timeline + +As a very small volunteer team, we cannot guarantee rapid response, but would +aim to respond within 1 week, or perhaps 2 during holidays. + +### Response procedure + +PCRE2 has never previously made a rapid or embargoed release in response to a +security incident. We would work with security managers from trusted downstream +distributors, such as major Linux distributions, before disclosing the +vulnerability publicly. diff --git a/usr/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt b/usr/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt index 851976ae238..bb687f7d040 100644 --- a/usr/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt +++ b/usr/share/doc/pcre2/html/NON-AUTOTOOLS-BUILD.txt @@ -105,6 +105,7 @@ example. pcre2_chkdint.c pcre2_chartables.c pcre2_compile.c + pcre2_compile_class.c pcre2_config.c pcre2_context.c pcre2_convert.c @@ -138,7 +139,7 @@ example. 
Note that you must compile pcre2_jit_compile.c, even if you have not defined SUPPORT_JIT in src/config.h, because when JIT support is not configured, dummy functions are compiled. When JIT support IS configured, - pcre2_jit_compile.c #includes other files from the sljit subdirectory, + pcre2_jit_compile.c #includes other files from the sljit dependency, all of whose names begin with "sljit". It also #includes src/pcre2_jit_match.c and src/pcre2_jit_misc.c, so you should not compile those yourself. @@ -301,56 +302,66 @@ Borland, Msys, MinGW, NMake, and Unix. If possible, use short paths with no spaces in the names for your CMake installation and your PCRE2 source and build directories. -The following instructions were contributed by a PCRE1 user, but they should -also work for PCRE2. If they are not followed exactly, errors may occur. In the -event that errors do occur, it is recommended that you delete the CMake cache -before attempting to repeat the CMake build process. In the CMake GUI, the -cache can be deleted by selecting "File > Delete Cache". +If you are using CMake and encounter errors, deleting the CMake cache and +restarting from a fresh build may fix the error. In the CMake GUI, the cache can +be deleted by selecting "File > Delete Cache"; or the file "CMakeCache.txt" can +be deleted. -1. Install the latest CMake version available from http://www.cmake.org/, and - ensure that cmake\bin is on your path. +1. Install the latest CMake version available from http://www.cmake.org/, and + ensure that cmake\bin is on your path. -2. Unzip (retaining folder structure) the PCRE2 source tree into a source - directory such as C:\pcre2. You should ensure your local date and time - is not earlier than the file dates in your source dir if the release is - very new. +2. Unzip (retaining folder structure) the PCRE2 source tree into a source + directory such as C:\pcre2.
You should ensure your local date and time + is not earlier than the file dates in your source dir if the release is + very new. -3. Create a new, empty build directory, preferably a subdirectory of the - source dir. For example, C:\pcre2\pcre2-xx\build. +3. Create a new, empty build directory, preferably a subdirectory of the + source dir. For example, C:\pcre2\pcre2-xx\build. -4. Run cmake-gui from the Shell environment of your build tool, for example, - Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try - to start Cmake from the Windows Start menu, as this can lead to errors. +4. Run CMake. -5. Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and - build directories, respectively. + - Using the CLI, simply run `cmake ..` inside the `build/` directory. You can + use the `ccmake` ncurses GUI to select and configure PCRE2 features. -6. Hit the "Configure" button. + - Using the CMake GUI: -7. Select the particular IDE / build tool that you are using (Visual - Studio, MSYS makefiles, MinGW makefiles, etc.) + a) Run cmake-gui from the Shell environment of your build tool, for + example, Msys for Msys/MinGW or Visual Studio Command Prompt for + VC/VC++. -8. The GUI will then list several configuration options. This is where - you can disable Unicode support or select other PCRE2 optional features. + b) Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and + build directories, respectively. -9. Hit "Configure" again. The adjacent "Generate" button should now be - active. + c) Press the "Configure" button. -10. Hit "Generate". + d) Select the particular IDE / build tool that you are using (Visual + Studio, MSYS makefiles, MinGW makefiles, etc.) -11. The build directory should now contain a usable build system, be it a - solution file for Visual Studio, makefiles for MinGW, etc. Exit from - cmake-gui and use the generated build system with your compiler or IDE. 
- E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 - solution, select the desired configuration (Debug, or Release, etc.) and - build the ALL_BUILD project. + e) The GUI will then list several configuration options. This is where + you can disable Unicode support or select other PCRE2 optional features. -12. If during configuration with cmake-gui you've elected to build the test - programs, you can execute them by building the test project. E.g., for - MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The - most recent build configuration is targeted by the tests. A summary of - test results is presented. Complete test output is subsequently - available for review in Testing\Temporary under your build dir. + f) Press "Configure" again. The adjacent "Generate" button should now be + active. + + g) Press "Generate". + +5. The build directory should now contain a usable build system, be it a + solution file for Visual Studio, makefiles for MinGW, etc. Exit from + cmake-gui and use the generated build system with your compiler or IDE. + E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2 + solution, select the desired configuration (Debug, or Release, etc.) and + build the ALL_BUILD project. + + Regardless of build system used, `cmake --build .` will build it. + +6. If during configuration with cmake-gui you've elected to build the test + programs, you can execute them by building the test project. E.g., for + MinGW: "make test"; for Visual Studio build the RUN_TESTS project. The + most recent build configuration is targeted by the tests. A summary of + test results is presented. Complete test output is subsequently + available for review in Testing\Temporary under your build dir. + + Regardless of build system used, `ctest` will run the tests. BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO @@ -425,6 +436,7 @@ OpenVMS. They are in the "vms" directory in the distribution tarball. 
Please read the file called vms/openvms_readme.txt. The pcre2test and pcre2grep programs contain some VMS-specific code. -=========================== -Last Updated: 16 April 2024 -=========================== +============================== +Last updated: 26 December 2024 +============================== + diff --git a/usr/share/doc/pcre2/html/README.txt b/usr/share/doc/pcre2/html/README.txt index dab5e94210b..5a50f7f11b5 100644 --- a/usr/share/doc/pcre2/html/README.txt +++ b/usr/share/doc/pcre2/html/README.txt @@ -385,7 +385,7 @@ library. They are also documented in the pcre2build man page. If this is done, when pcre2test's input is from a terminal, it reads it using the readline() function. This provides line-editing and history facilities. - Note that libreadline is GPL-licenced, so if you distribute a binary of + Note that libreadline is GPL-licensed, so if you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking with libedit (which has a BSD licence) instead. @@ -411,20 +411,19 @@ library. They are also documented in the pcre2build man page. Instead of %td or %zu, %lu is used, with a cast for size_t values. . There is a special option called --enable-fuzz-support for use by people who - want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit - library. If set, it causes an extra library called libpcre2-fuzzsupport.a to - be built, but not installed. This contains a single function called - LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the - length of the string. When called, this function tries to compile the string - as a pattern, and if that succeeds, to match it. This is done both with no - options and with some random options bits that are generated from the string. - Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to - be created. 
This is normally run under valgrind or used when PCRE2 is - compiled with address sanitizing enabled. It calls the fuzzing function and - outputs information about what it is doing. The input strings are specified - by arguments: if an argument starts with "=" the rest of it is a literal - input string. Otherwise, it is assumed to be a file name, and the contents - of the file are the test string. + want to run fuzzing tests on PCRE2. If set, it causes an extra library + called libpcre2-fuzzsupport.a to be built, but not installed. This contains + a single function called LLVMFuzzerTestOneInput() whose arguments are a + pointer to a string and the length of the string. When called, this function + tries to compile the string as a pattern, and if that succeeds, to match + it. This is done both with no options and with some random options bits that + are generated from the string. Setting --enable-fuzz-support also causes an + executable called pcre2fuzzcheck-{8,16,32} to be created. This is normally + run under valgrind or used when PCRE2 is compiled with address sanitizing + enabled. It calls the fuzzing function and outputs information about what it + is doing. The input strings are specified by arguments: if an argument + starts with "=" the rest of it is a literal input string. Otherwise, it is + assumed to be a file name, and the contents of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for @@ -510,6 +509,7 @@ system. The following are installed (file names are all relative to the LICENCE NEWS README + SECURITY pcre2.txt (a concatenation of the man(3) pages) pcre2test.txt the pcre2test man page pcre2grep.txt the pcre2grep man page @@ -607,8 +607,9 @@ zip formats. The command "make distcheck" does the same, but then does a trial build of the new distribution to ensure that it works. 
If you have modified any of the man page sources in the doc directory, you -should first run the PrepareRelease script before making a distribution. This -script creates the .txt and HTML forms of the documentation from the man pages. +should first run the maint/PrepareRelease script before making a distribution. +This script creates the .txt and HTML forms of the documentation from the man +pages. Testing PCRE2 @@ -822,37 +823,38 @@ The distribution should contain the files listed below. ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_chkdint.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_ucptables.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) + src/pcre2posix.c ) + src/pcre2_auto_possess.c ) + src/pcre2_chkdint.c ) + src/pcre2_compile.c ) + src/pcre2_compile_class.c ) + src/pcre2_config.c ) + src/pcre2_context.c ) + src/pcre2_convert.c ) + src/pcre2_dfa_match.c ) + src/pcre2_error.c ) + src/pcre2_extuni.c ) + src/pcre2_find_bracket.c ) + src/pcre2_jit_compile.c ) + src/pcre2_jit_match.c ) sources for the functions in the library, + src/pcre2_jit_misc.c ) and some internal functions that they use + src/pcre2_maketables.c ) + src/pcre2_match.c ) + src/pcre2_match_data.c ) + 
src/pcre2_newline.c ) + src/pcre2_ord2utf.c ) + src/pcre2_pattern_info.c ) + src/pcre2_script_run.c ) + src/pcre2_serialize.c ) + src/pcre2_string_utils.c ) + src/pcre2_study.c ) + src/pcre2_substitute.c ) + src/pcre2_substring.c ) + src/pcre2_tables.c ) + src/pcre2_ucd.c ) + src/pcre2_ucptables.c ) + src/pcre2_valid_utf.c ) + src/pcre2_xclass.c ) src/pcre2_printint.c debugging function that is used by pcre2test, src/pcre2_fuzzsupport.c function for (optional) fuzzing support @@ -860,13 +862,16 @@ The distribution should contain the files listed below. src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API + src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header + src/pcre2_jit_char_inc.h header used by JIT src/pcre2_jit_neon_inc.h header used by JIT src/pcre2_jit_simd_inc.h header used by JIT src/pcre2_ucp.h header for Unicode property handling + src/pcre2_util.h header for internal utils - sljit/* source files for the JIT compiler + deps/sljit/sljit_src/* source files for the JIT compiler (B) Source files for programs that use PCRE2: @@ -878,48 +883,49 @@ The distribution should contain the files listed below. 
(C) Auxiliary files: - 132html script to turn "man" pages into HTML - AUTHORS information about the author of PCRE2 + AUTHORS.md information about the authors of PCRE2 ChangeLog log of changes to the code - CleanTxt script to clean nroff output for txt man pages - Detrail script to remove trailing spaces HACKING some notes about the internals of PCRE2 INSTALL generic installation instructions - LICENCE conditions for the use of PCRE2 + LICENCE.md conditions for the use of PCRE2 COPYING the same, using GNU's standard name + SECURITY.md information on reporting vulnerabilities Makefile.in ) template for Unix Makefile, which is built by ) "configure" Makefile.am ) the automake input that was used to create ) Makefile.in NEWS important changes in this release NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools - PrepareRelease script to make preparations for "make dist" README this file RunTest a Unix shell script for running tests RunGrepTest a Unix shell script for pcre2grep tests + RunTest.bat a Windows batch file for running tests + RunGrepTest.bat a Windows batch file for pcre2grep tests aclocal.m4 m4 macros (generated by "aclocal") - config.guess ) files used by libtool, - config.sub ) used only when building a shared library + m4/* m4 macros (used by autoconf) configure a configuring shell script (built by autoconf) configure.ac ) the autoconf input that was used to build ) "configure" and config.h - depcomp ) script to find program dependencies, generated by - ) automake doc/*.3 man page sources for PCRE2 doc/*.1 man page sources for pcre2grep and pcre2test - doc/index.html.src the base HTML page doc/html/* HTML documentation doc/pcre2.txt plain text version of the man pages + doc/pcre2-config.txt plain text documentation of pcre2-config script + doc/pcre2grep.txt plain text documentation of grep utility program doc/pcre2test.txt plain text documentation of test program - install-sh a shell script for installing files libpcre2-8.pc.in template 
for libpcre2-8.pc for pkg-config libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config - ltmain.sh file used to build a libtool script - missing ) common stub for a few missing GNU programs while - ) installing, generated by automake - mkinstalldirs script for making install directories + ar-lib ) + config.guess ) + config.sub ) + depcomp ) helper tools generated by libtool and + compile ) automake, used internally by ./configure + install-sh ) + ltmain.sh ) + missing ) + test-driver ) perltest.sh Script for running a Perl test program pcre2-config.in source of script which retains PCRE2 information testdata/testinput* test data for main library tests @@ -927,12 +933,13 @@ The distribution should contain the files listed below. testdata/grep* input and output for pcre2grep tests testdata/* other supporting test files -(D) Auxiliary files for cmake support +(D) Auxiliary files for CMake support cmake/COPYING-CMAKE-SCRIPTS - cmake/FindPackageHandleStandardArgs.cmake cmake/FindEditline.cmake cmake/FindReadline.cmake + cmake/pcre2-config-version.cmake.in + cmake/pcre2-config.cmake.in CMakeLists.txt config-cmake.h.in @@ -943,14 +950,21 @@ The distribution should contain the files listed below. src/config.h.generic ) a version of config.h for use in non-"configure" ) environments -(F) Auxiliary files for building PCRE2 under OpenVMS +(F) Auxiliary files for building PCRE2 using other build systems + + BUILD.bazel ) + MODULE.bazel ) files used by the Bazel build system + WORKSPACE.bazel ) + build.zig file used by zig's build system + +(G) Auxiliary files for building PCRE2 under OpenVMS vms/configure.com ) vms/openvms_readme.txt ) These files were contributed by a PCRE2 user. 
vms/pcre2.h_patch ) vms/stdint.h ) -Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com -Last updated: 15 April 2024 +============================== +Last updated: 18 December 2024 +============================== + diff --git a/usr/share/doc/pcre2/html/index.html b/usr/share/doc/pcre2/html/index.html index e4dc78620fd..2d81b678fef 100644 --- a/usr/share/doc/pcre2/html/index.html +++ b/usr/share/doc/pcre2/html/index.html @@ -267,6 +267,9 @@

    Perl-compatible Regular Expressions (revised API: PCRE2)

    pcre2_set_offset_limit   Set the offset limit +pcre2_set_optimize +   Set an optimization directive + pcre2_set_parens_nest_limit   Set the parentheses nesting limit @@ -276,6 +279,12 @@

    Perl-compatible Regular Expressions (revised API: PCRE2)

    pcre2_set_recursion_memory_management   Obsolete function that (from 10.30 onwards) does nothing +pcre2_set_substitute_callout +   Set a substitution callout function + +pcre2_set_substitute_case_callout +   Set a substitution case callout function + pcre2_substitute   Match a compiled pattern to a subject string and do substitutions diff --git a/usr/share/doc/pcre2/html/pcre2.html b/usr/share/doc/pcre2/html/pcre2.html index 4cb83dc184b..e72b6b1cb1d 100644 --- a/usr/share/doc/pcre2/html/pcre2.html +++ b/usr/share/doc/pcre2/html/pcre2.html @@ -16,7 +16,7 @@

    pcre2 man page

  • INTRODUCTION
  • SECURITY CONSIDERATIONS
  • USER DOCUMENTATION -
  • AUTHOR +
  • AUTHORS
  • REVISION
    INTRODUCTION
    @@ -190,22 +190,22 @@

    pcre2 man page

    In the "man" and HTML formats, there is also a short page for each C library function, listing its arguments and results.

    -
    AUTHOR
    +
    AUTHORS

    -Philip Hazel -
    -Retired from University Computing Service -
    -Cambridge, England. -
    +The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Herczeg. +

    +

    +PCRE2 was written by Philip Hazel, of the University Computing Service, +Cambridge, England. Many others have also contributed.

    -Putting an actual email address here is a spam magnet. If you want to email me, -use my two names separated by a dot at gmail.com. +To contact the maintainers, please use the GitHub issues tracker or PCRE2 +mailing list, as described at the project page: +https://github.com/PCRE2Project/pcre2


    REVISION

    -Last updated: 27 August 2021 +Last updated: 18 December 2024
    Copyright © 1997-2021 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2_compile.html b/usr/share/doc/pcre2/html/pcre2_compile.html index f0080eabe45..ee933f38983 100644 --- a/usr/share/doc/pcre2/html/pcre2_compile.html +++ b/usr/share/doc/pcre2/html/pcre2_compile.html @@ -57,6 +57,7 @@

    pcre2_compile man page

    PCRE2_ALLOW_EMPTY_CLASS Allow empty classes PCRE2_ALT_BSUX Alternative handling of \u, \U, and \x PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode + PCRE2_ALT_EXTENDED_CLASS Alternative extended character class syntax PCRE2_ALT_VERBNAMES Process backslashes in verb names PCRE2_AUTO_CALLOUT Compile automatic callouts PCRE2_CASELESS Do caseless matching diff --git a/usr/share/doc/pcre2/html/pcre2_jit_compile.html b/usr/share/doc/pcre2/html/pcre2_jit_compile.html index 873d0ddefc6..791dd0c3d78 100644 --- a/usr/share/doc/pcre2/html/pcre2_jit_compile.html +++ b/usr/share/doc/pcre2/html/pcre2_jit_compile.html @@ -33,9 +33,18 @@

    pcre2_jit_compile man page

    documentation.

    -The first argument is a pointer that was returned by a successful call to -pcre2_compile(), and the second must contain one or more of the following -bits: +The availability of JIT support can be tested by calling +pcre2_jit_compile() with a single option PCRE2_JIT_TEST_ALLOC (the +code argument is ignored, so a NULL value is accepted). Such a call +returns zero if JIT is available and has a working allocator. Otherwise +it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate +executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not +compiled. +

    +

    +Otherwise, the first argument must be a pointer that was returned by a +successful call to pcre2_compile(), and the second must contain one or +more of the following bits:

       PCRE2_JIT_COMPLETE      compile code for full matching
       PCRE2_JIT_PARTIAL_SOFT  compile code for soft partial matching
    @@ -46,11 +55,13 @@ 

    pcre2_jit_compile man page

    option is deprecated and may be removed in the future.

    -The yield of the function is 0 for success, or a negative error code otherwise. -In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or -if an unknown bit is set in options. The function can also return -PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the -compiler, even if it was because of a system security restriction. +The yield of the function when called with any of the three options above is 0 +for success, or a negative error code otherwise. In particular, +PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or if an unknown +bit is set in options. The function can also return PCRE2_ERROR_NOMEMORY +if JIT is unable to allocate executable memory for the compiler, even if it was +because of a system security restriction. In a few cases, the function may +return with PCRE2_ERROR_JIT_UNSUPPORTED for unsupported features.

    There is a complete description of the PCRE2 native API in the diff --git a/usr/share/doc/pcre2/html/pcre2_set_compile_extra_options.html b/usr/share/doc/pcre2/html/pcre2_set_compile_extra_options.html index 4924ed79b5e..cb62022a22e 100644 --- a/usr/share/doc/pcre2/html/pcre2_set_compile_extra_options.html +++ b/usr/share/doc/pcre2/html/pcre2_set_compile_extra_options.html @@ -43,6 +43,10 @@

    pcre2_set_compile_extra_options man page

    PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines PCRE2_EXTRA_MATCH_WORD Pattern matches "words" + PCRE2_EXTRA_NEVER_CALLOUT Disallow callouts in pattern + PCRE2_EXTRA_NO_BS0 Disallow \0 (but not \00 or \000) + PCRE2_EXTRA_PYTHON_OCTAL Use Python rules for octal + PCRE2_EXTRA_TURKISH_CASING Use Turkish I case folding
    There is a complete description of the PCRE2 native API in the pcre2api diff --git a/usr/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html b/usr/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html index ab570cf60d1..a40f41e450c 100644 --- a/usr/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html +++ b/usr/share/doc/pcre2/html/pcre2_set_max_pattern_compiled_length.html @@ -27,9 +27,9 @@

    pcre2_set_max_pattern_compiled_length man page


    This function sets, in a compile context, the maximum size (in bytes) for the -memory needed to hold the compiled version of a pattern that is compiled with -this context. The result is always zero. If a pattern that is passed to -pcre2_compile() with this context needs more memory, an error is +memory needed to hold the compiled version of a pattern that is using this +context. The result is always zero. If a pattern that is passed to +pcre2_compile() referencing this context needs more memory, an error is generated. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited.

    diff --git a/usr/share/doc/pcre2/html/pcre2_set_optimize.html b/usr/share/doc/pcre2/html/pcre2_set_optimize.html new file mode 100644 index 00000000000..47caeb267ae --- /dev/null +++ b/usr/share/doc/pcre2/html/pcre2_set_optimize.html @@ -0,0 +1,57 @@ + + +pcre2_set_optimize specification + + +

    pcre2_set_optimize man page

    +

    +Return to the PCRE2 index page. +

    +

    +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
    +
    +SYNOPSIS +
    +

    +#include <pcre2.h> +

    +

    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); +

    +
    +DESCRIPTION +
    +

    +This function controls which performance optimizations will be applied +by pcre2_compile(). It can be called multiple times with the same compile +context; the effects are cumulative, with the effects of later calls taking +precedence over earlier ones. +

    +

    +The result is zero for success, PCRE2_ERROR_NULL if ccontext is NULL, +or PCRE2_ERROR_BADOPTION if directive is unknown. The latter could be +useful to detect if a certain optimization is available. +

    +

    +The possible values for the directive parameter are: +

    +  PCRE2_OPTIMIZATION_FULL   Enable all optimizations (default)
    +  PCRE2_OPTIMIZATION_NONE   Disable all optimizations
    +  PCRE2_AUTO_POSSESS        Enable auto-possessification
    +  PCRE2_AUTO_POSSESS_OFF    Disable auto-possessification
    +  PCRE2_DOTSTAR_ANCHOR      Enable implicit dotstar anchoring
    +  PCRE2_DOTSTAR_ANCHOR_OFF  Disable implicit dotstar anchoring
    +  PCRE2_START_OPTIMIZE      Enable start-up optimizations at match time
    +  PCRE2_START_OPTIMIZE_OFF  Disable start-up optimizations at match time
    +
    +There is a complete description of the PCRE2 native API, including detailed +descriptions directive parameter values in the +pcre2api +page. +

    +Return to the PCRE2 index page. +

    diff --git a/usr/share/doc/pcre2/html/pcre2_set_substitute_callout.html b/usr/share/doc/pcre2/html/pcre2_set_substitute_callout.html index 7ae3a398d79..8640728fdc4 100644 --- a/usr/share/doc/pcre2/html/pcre2_set_substitute_callout.html +++ b/usr/share/doc/pcre2/html/pcre2_set_substitute_callout.html @@ -20,7 +20,7 @@

    pcre2_set_substitute_callout man page

    int pcre2_set_substitute_callout(pcre2_match_context *mcontext, - int (*callout_function)(pcre2_substitute_callout_block *), + int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data);


    diff --git a/usr/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html b/usr/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html new file mode 100644 index 00000000000..ab506879f1f --- /dev/null +++ b/usr/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html @@ -0,0 +1,45 @@ + + +pcre2_set_substitute_case_callout specification + + +

    pcre2_set_substitute_case_callout man page

    +

    +Return to the PCRE2 index page. +

    +

    +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
    +
    +SYNOPSIS +
    +

    +#include <pcre2.h> +

    +

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +

    +
    +DESCRIPTION +
    +

    +This function sets the substitute case callout fields in a match context (the +first argument). The second argument specifies a callout function, and the third +argument is an opaque data item that is passed to it. The result of this +function is always zero. +

    +

    +There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +

    +Return to the PCRE2 index page. +

    diff --git a/usr/share/doc/pcre2/html/pcre2api.html b/usr/share/doc/pcre2/html/pcre2api.html index 6b60ee9fa7a..079cf176daa 100644 --- a/usr/share/doc/pcre2/html/pcre2api.html +++ b/usr/share/doc/pcre2/html/pcre2api.html @@ -179,6 +179,10 @@

    pcre2api man page


    int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); +
    +
    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive);


    PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

    @@ -203,6 +207,13 @@

    pcre2api man page

    void *callout_data);

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);
    @@ -808,6 +819,7 @@

    pcre2api man page

    The compile time nested parentheses limit The maximum length of the pattern string The extra options bits (none set by default) + Which performance optimizations the compiler should apply
  • A compile context is also required if you are using custom memory management. If none of these apply, just pass NULL as the context argument of @@ -952,6 +964,110 @@

    pcre2api man page

    nesting, and the second is user data that is set up by the last argument of pcre2_set_compile_recursion_guard(). The callout function should return zero if all is well, or non-zero to force an error. +
    +
    +int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); +
    +
    +PCRE2 can apply various performance optimizations during compilation, in order +to make matching faster. For example, the compiler might convert some regex +constructs into an equivalent construct which pcre2_match() can execute +faster. By default, all available optimizations are enabled. However, in rare +cases, one might wish to disable specific optimizations. For example, if it is +known that some optimizations cannot benefit a certain regex, it might be +desirable to disable them, in order to speed up compilation. +

    +

    +The permitted values of directive are as follows: +

    +  PCRE2_OPTIMIZATION_FULL
    +
    +Enable all optional performance optimizations. This is the default value. +
    +  PCRE2_OPTIMIZATION_NONE
    +
    +Disable all optional performance optimizations. +
    +  PCRE2_AUTO_POSSESS
    +  PCRE2_AUTO_POSSESS_OFF
    +
    +Enable/disable "auto-possessification" of variable quantifiers such as * and +. +This optimization, for example, turns a+b into a++b in order to avoid +backtracks into a+ that can never be successful. However, if callouts are in +use, auto-possessification means that some callouts are never taken. You can +disable this optimization if you want the matching functions to do a full, +unoptimized search and run all the callouts. +
    +  PCRE2_DOTSTAR_ANCHOR
    +  PCRE2_DOTSTAR_ANCHOR_OFF
    +
    +Enable/disable an optimization that is applied when .* is the first significant +item in a top-level branch of a pattern, and all the other branches also start +with .* or with \A or \G or ^. Such a pattern is automatically anchored if +PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any +^ items. Otherwise, the fact that any match must start either at the start of +the subject or following a newline is remembered. Like other optimizations, +this can cause callouts to be skipped. +

    +

    +Dotstar anchor optimization is automatically disabled for .* if it is inside an +atomic group or a capture group that is the subject of a backreference, or if +the pattern contains (*PRUNE) or (*SKIP). +

    +  PCRE2_START_OPTIMIZE
    +  PCRE2_START_OPTIMIZE_OFF
    +
    +Enable/disable optimizations which cause matching functions to scan the subject +string for specific code unit values before attempting a match. For example, if +it is known that an unanchored match must start with a specific value, the +matching code searches the subject for that value, and fails immediately if it +cannot find it, without actually running the main matching function. This means +that a special item such as (*COMMIT) at the start of a pattern is not +considered until after a suitable starting point for the match has been found. +Also, when callouts or (*MARK) items are in use, these "start-up" optimizations +can cause them to be skipped if the pattern is never actually used. The start-up +optimizations are in effect a pre-scan of the subject that takes place before +the pattern is run. +

    +

    +Disabling start-up optimizations ensures that in cases where the result is "no +match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) are +considered at every possible starting position in the subject string. +

    +

    +Disabling start-up optimizations may change the outcome of a matching operation. +Consider the pattern +

    +  (*COMMIT)ABC
    +
    +When this is compiled, PCRE2 records the fact that a match must start with the +character "A". Suppose the subject string is "DEFABC". The start-up +optimization scans along the subject, finds "A" and runs the first match +attempt from there. The (*COMMIT) item means that the pattern must match the +current starting position, which in this case, it does. However, if the same +match is run without start-up optimizations, the initial scan along the subject +string does not happen. The first match attempt is run starting from "D" and +when this fails, (*COMMIT) prevents any further matches being tried, so the +overall result is "no match". +

    +

    +Another start-up optimization makes use of a minimum length for a matching +subject, which is recorded when possible. Consider the pattern +

    +  (*MARK:1)B(*MARK:2)(X|Y)
    +
    +The minimum length for a match is two characters. If the subject is "XXBB", the +"starting character" optimization skips "XX", then tries to match "BB", which +is long enough. In the process, (*MARK:2) is encountered and remembered. When +the match attempt fails, the next "B" is found, but there is only one character +left, so there are no more attempts, and "no match" is returned with the "last +mark seen" set to "2". Without start-up optimizations, however, matches are +tried at every possible starting position, including at the end of the subject, +where (*MARK:1) is encountered, but there is no "B", so the "last mark seen" +that is returned is "1". In this case, the optimizations do not affect the +overall match result, which is still "no match", but they do affect the +auxiliary information that is returned.


    The match context @@ -1011,6 +1127,19 @@

    pcre2api man page

    below.

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    +This sets up a callout function for PCRE2 to call when performing case +transformations inside pcre2_substitute(). Details are given in the +section entitled "Creating a new string with substitutions" +below. +
    +
    int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value);
    @@ -1228,7 +1357,10 @@

    pcre2api man page

    The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee that JIT will be used for -any given match. See the +any given match, and neither does it guarantee that JIT will actually be able +to function, because it may not be able to allocate executable memory in some +environments. There is a special call to pcre2_jit_compile() that can be +used to check this. See the pcre2jit documentation for more details.
    @@ -1431,7 +1563,7 @@ 

    pcre2api man page

    error has occurred.

    -There are nearly 100 positive error codes that pcre2_compile() may return +There are over 100 positive error codes that pcre2_compile() may return if it finds an error in the pattern. There are also some negative error codes that are used for invalid UTF strings when validity checking is in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and @@ -1539,6 +1671,16 @@

    pcre2api man page

    end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a terminating newline, you must set PCRE2_ALT_CIRCUMFLEX. +
    +  PCRE2_ALT_EXTENDED_CLASS
    +
    +Alters the parsing of character classes to follow the extended syntax +described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact +on the behaviour of the Perl-specific "(?[...])" syntax for extended classes, +but instead enables the alternative syntax of extended class behaviour inside +ordinary "[...]" character classes. See the +pcre2pattern +documentation for details of the character classes supported.
       PCRE2_ALT_VERBNAMES
     
    @@ -1569,16 +1711,31 @@

    pcre2api man page

    changed within a pattern by a (?i) option setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all characters with more than one other case, and for all characters whose code points are greater than -U+007F. Note that there are two ASCII characters, K and S, that, in addition to +U+007F. +

    +

    +Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long S) respectively. If you do not want this case equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT.

    +One language family, Turkish and Azeri, has its own case-insensitivity rules, +which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the +behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 +(small dotless i) characters. +

    +

    For lower valued characters with only one other case, a lookup table is used for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used for all code points less than 256, and higher code points (available only in 16-bit or 32-bit mode) are treated as not having another case. +

    +

    +From release 10.45 PCRE2_CASELESS also affects what some of the letter-related +Unicode property escapes (\p and \P) match. The properties Lu (upper case +letter), Ll (lower case letter), and Lt (title case letter) are all treated as +LC (cased letter) when PCRE2_CASELESS is set.

       PCRE2_DOLLAR_ENDONLY
     
    @@ -1775,7 +1932,7 @@

    pcre2api man page

    for the PCRE2_UCP option below. In particular, it prevents the creator of the pattern from enabling this facility by starting the pattern with (*UCP). This option may be useful in applications that process patterns from external -sources. The option combination PCRE_UCP and PCRE_NEVER_UCP causes an error. +sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error.
       PCRE2_NEVER_UTF
     
    @@ -1798,85 +1955,57 @@

    pcre2api man page

       PCRE2_NO_AUTO_POSSESS
     
    -If this option is set, it disables "auto-possessification", which is an -optimization that, for example, turns a+b into a++b in order to avoid +If this (deprecated) option is set, it disables "auto-possessification", which +is an optimization that, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can set this option if you want the matching functions to do a full unoptimized search and run all the callouts, but it is mainly provided for testing purposes. +

    +

    +If a compile context is available, it is recommended to use +pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather +than the compile option PCRE2_NO_AUTO_POSSESS. Note that PCRE2_NO_AUTO_POSSESS +takes precedence over the pcre2_set_optimize() optimization directives +PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF.

       PCRE2_NO_DOTSTAR_ANCHOR
     
    -If this option is set, it disables an optimization that is applied when .* is -the first significant item in a top-level branch of a pattern, and all the -other branches also start with .* or with \A or \G or ^. The optimization is -automatically disabled for .* if it is inside an atomic group or a capture -group that is the subject of a backreference, or if the pattern contains -(*PRUNE) or (*SKIP). When the optimization is not disabled, such a pattern is -automatically anchored if PCRE2_DOTALL is set for all the .* items and -PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match -must start either at the start of the subject or following a newline is +If this (deprecated) option is set, it disables an optimization that is applied +when .* is the first significant item in a top-level branch of a pattern, and +all the other branches also start with .* or with \A or \G or ^. The +optimization is automatically disabled for .* if it is inside an atomic group +or a capture group that is the subject of a backreference, or if the pattern +contains (*PRUNE) or (*SKIP). When the optimization is not disabled, such a +pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items +and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any +match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. +(If a compile context is available, it is recommended to use +pcre2_set_optimize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF +instead.)
       PCRE2_NO_START_OPTIMIZE
     
    This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of the JIT -compiler. +compiler. Setting this option is equivalent to calling pcre2_set_optimize() +with the directive parameter set to PCRE2_START_OPTIMIZE_OFF.

    There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without -actually running the main matching function. This means that a special item -such as (*COMMIT) at the start of a pattern is not considered until after a -suitable starting point for the match has been found. Also, when callouts or -(*MARK) items are in use, these "start-up" optimizations can cause them to be -skipped if the pattern is never actually used. The start-up optimizations are +actually running the main matching function. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run.

    -The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, -possibly causing performance to suffer, but ensuring that in cases where the -result is "no match", the callouts do occur, and that items such as (*COMMIT) -and (*MARK) are considered at every possible starting position in the subject -string. -

    -

    -Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation. -Consider the pattern -

    -  (*COMMIT)ABC
    -
    -When this is compiled, PCRE2 records the fact that a match must start with the -character "A". Suppose the subject string is "DEFABC". The start-up -optimization scans along the subject, finds "A" and runs the first match -attempt from there. The (*COMMIT) item means that the pattern must match the -current starting position, which in this case, it does. However, if the same -match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the -subject string does not happen. The first match attempt is run starting from -"D" and when this fails, (*COMMIT) prevents any further matches being tried, so -the overall result is "no match". -

    -

    -As another start-up optimization makes use of a minimum length for a matching -subject, which is recorded when possible. Consider the pattern -

    -  (*MARK:1)B(*MARK:2)(X|Y)
    -
    -The minimum length for a match is two characters. If the subject is "XXBB", the -"starting character" optimization skips "XX", then tries to match "BB", which -is long enough. In the process, (*MARK:2) is encountered and remembered. When -the match attempt fails, the next "B" is found, but there is only one character -left, so there are no more attempts, and "no match" is returned with the "last -mark seen" set to "2". If NO_START_OPTIMIZE is set, however, matches are tried -at every possible starting position, including at the end of the subject, where -(*MARK:1) is encountered, but there is no "B", so the "last mark seen" that is -returned is "1". In this case, the optimizations do not affect the overall -match result, which is still "no match", but they do affect the auxiliary -information that is returned. +Disabling the start-up optimizations may cause performance to suffer. However, +this may be desirable for patterns which contain callouts or items such as +(*COMMIT) and (*MARK). See the above description of PCRE2_START_OPTIMIZE_OFF +for further details.
       PCRE2_NO_UTF_CHECK
     
    @@ -1931,9 +2060,16 @@

    pcre2api man page

    upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode support (which is the default). -The PCRE2_EXTRA_CASELESS_RESTRICT option (see below) restricts caseless +

    +

    +The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless matching such that ASCII characters match only ASCII characters and non-ASCII -characters match only non-ASCII characters. +characters match only non-ASCII characters. The PCRE2_EXTRA_TURKISH_CASING option +(see above) alters the matching of the 'i' characters to follow their behaviour +in Turkish and Azeri languages. For further details on +PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the +pcre2unicode +page.

       PCRE2_UNGREEDY
     
    @@ -2070,7 +2206,8 @@

    pcre2api man page

    ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must either be ASCII or non-ASCII. The option -can be changed with a pattern by the (?r) option setting. +can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option +settings.
       PCRE2_EXTRA_ESCAPED_CR_IS_LF
     
    @@ -2097,6 +2234,34 @@

    pcre2api man page

    at the start of the compiled pattern and ")\b" at the end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. +
    +  PCRE2_EXTRA_NO_BS0
    +
    +If this option is set (note that its final character is the digit 0) it locks +out the use of the sequence \0 unless at least one more octal digit follows. +
    +  PCRE2_EXTRA_PYTHON_OCTAL
    +
    +If this option is set, PCRE2 follows Python's rules for interpreting octal +escape sequences. The rules for handling sequences such as \14, which could +be an octal number or a back reference, are different. Details are given in the +pcre2pattern +documentation. +
    +  PCRE2_EXTRA_NEVER_CALLOUT
    +
    +If this option is set, PCRE2 treats callouts in the pattern as a syntax error, +returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application +knows that a callout will not be provided to pcre2_match(), so that +callouts in the pattern are not silently ignored. +
    +  PCRE2_EXTRA_TURKISH_CASING
    +
    +This option alters case-equivalence of the 'i' letters to follow the +alphabet used by Turkish and Azeri languages. The option can be changed within +a pattern by the (*TURKISH_CASING) start-of-pattern setting. Either the UTF or +UCP options must be set. In the 8-bit library, UTF must be set. This option +cannot be combined with PCRE2_EXTRA_CASELESS_RESTRICT.


    JUST-IN-TIME (JIT) COMPILATION

    @@ -2303,6 +2468,7 @@

    pcre2api man page

    PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set + Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF
    For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. @@ -3646,9 +3812,10 @@

    pcre2api man page

    too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (without, of course, writing anything) -in order to compute the size of buffer that is needed. This value is passed -back via the outlengthptr variable, with the result of the function still -being PCRE2_ERROR_NOMEMORY. +in order to compute the size of buffer that is needed, which will include the +extra space for the terminating NUL. This value is passed back via the +outlengthptr variable, with the result of the function still being +PCRE2_ERROR_NOMEMORY.

    Passing a buffer size of zero is a permitted way of finding out how much memory @@ -3667,18 +3834,26 @@

    pcre2api man page

    in any way. By default, however, a dollar character is an escape character that can specify the insertion of characters from capture groups and names from (*MARK) or other control verbs in the pattern. Dollar is the only escape -character (backslash is treated as literal). The following forms are always +character (backslash is treated as literal). The following forms are recognized:
       $$                  insert a dollar character
    -  $<n> or ${<n>}      insert the contents of group <n>
    +  $n or ${n}          insert the contents of group n
    +  $0 or $&            insert the entire matched substring
    +  $`                  insert the substring that precedes the match
    +  $'                  insert the substring that follows the match
    +  $_                  insert the entire input string
       $*MARK or ${*MARK}  insert a control verb name
     
    -Either a group number or a group name can be given for <n>. Curly brackets are -required only if the following character would be interpreted as part of the -number or name. The number may be zero to include the entire matched string. -For example, if the pattern a(b)c is matched with "=abc=" and the replacement -string "+$1$0$1+", the result is "=+babcb+=". +Either a group number or a group name can be given for n, for example $2 or +$NAME. Curly brackets are required only if the following character would be +interpreted as part of the number or name. The number may be zero to include +the entire matched string. For example, if the pattern a(b)c is matched with +"=abc=" and the replacement string "+$1$0$1+", the result is "=+babcb+=". +

    +

    +The JavaScript form $<name>, where the angle brackets are part of the syntax, +is also recognized for group names, but not for group numbers or *MARK.

    $*MARK inserts the name from the last encountered backtracking control verb on @@ -3732,28 +3907,53 @@

    pcre2api man page

    PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the replacement string. Without this option, only the dollar character is special, and only the group insertion forms listed above are valid. When -PCRE2_SUBSTITUTE_EXTENDED is set, two things change: +PCRE2_SUBSTITUTE_EXTENDED is set, several things change:

    Firstly, backslash in a replacement string is interpreted as an escape -character. The usual forms such as \n or \x{ddd} can be used to specify -particular character codes, and backslash followed by any non-alphanumeric -character quotes that character. Extended quoting can be coded using \Q...\E, -exactly as in pattern strings. +character. The usual forms such as \x{ddd} can be used to specify particular +character codes, and backslash followed by any non-alphanumeric character +quotes that character. Extended quoting can be coded using \Q...\E, exactly +as in pattern strings. The escapes \b and \v are interpreted as the +characters backspace and vertical tab, respectively. +

    +

    +The interpretation of backslash followed by one or more digits is the same as +in a pattern, which in Perl has some ambiguities. Details are given in the +pcre2pattern +page. +

    +

    +The Python form \g<n>, where the angle brackets are part of the syntax and n +is either a group name or number, is recognized as an alternative way of +inserting the contents of a group, for example \g<3>.

    There are also four escape sequences for forcing the case of inserted letters. -The insertion mechanism has three states: no case forcing, force upper case, -and force lower case. The escape sequences change the current state: \U and -\L change to upper or lower case forcing, respectively, and \E (when not -terminating a \Q quoted sequence) reverts to no case forcing. The sequences -\u and \l force the next character (if it is a letter) to upper or lower -case, respectively, and then the state automatically reverts to no case -forcing. Case forcing applies to all inserted characters, including those from -capture groups and letters within \Q...\E quoted sequences. If either -PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode +Case forcing applies to all inserted characters, including those from capture +groups and letters within \Q...\E quoted sequences. The insertion mechanism +has three states: no case forcing, force upper case, and force lower case. The +escape sequences change the current state: \U and \L change to upper or lower +case forcing, respectively, and \E (when not terminating a \Q quoted +sequence) reverts to no case forcing. The sequences \u and \l force the next +character (if it is a letter) to upper or lower case, respectively, and then +the state automatically reverts to no case forcing. +

    +

    +However, if \u is immediately followed by \L or \l is immediately followed +by \U, the next character's case is forced by the first escape sequence, and +subsequent characters by the second. This provides a "title casing" facility +that can be applied to group captures. For example, if group 1 has captured +"heLLo", the replacement string "\u\L$1" becomes "Hello". +

    +

    +If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater -than 127. +than 127. However, only simple case folding, as determined by the Unicode file +CaseFolding.txt is supported. PCRE2 does not support language-specific +special casing rules such as using different lower case Greek sigmas in the +middle and ends of words (as defined in the Unicode file +SpecialCasing.txt).

    Note that case forcing sequences such as \U...\E do not nest. For example, @@ -3762,20 +3962,20 @@

    pcre2api man page

    not apply to replacement strings.

    -The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more +The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash:

    -  ${<n>:-<string>}
    -  ${<n>:+<string1>:<string2>}
    +  ${n:-string}
    +  ${n:+string1:string2}
     
    -As before, <n> may be a group number or a name. The first form specifies a -default value. If group <n> is set, its value is inserted; if not, <string> is -expanded and the result inserted. The second form specifies strings that are -expanded and inserted when group <n> is set or unset, respectively. The first -form is just a convenient shorthand for +As in the simple case, n may be a group number or a name. The first form +specifies a default value. If group n is set, its value is inserted; if +not, the string is expanded and the result inserted. The second form specifies +strings that are expanded and inserted when group n is set or unset, +respectively. The first form is just a convenient shorthand for
    -  ${<n>:+${<n>}:<string>}
    +  ${n:+${n}:string}
     
    Backslash can be used to escape colons and closing curly brackets in the replacement strings. A change of the case forcing state within a replacement @@ -3852,9 +4052,18 @@

    pcre2api man page

    The pcre2_set_substitution_callout() function can be used to specify a callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution has -been processed, but it can cause the replacement not to happen. The callout -function is not called for simulated substitutions that happen as a result of -the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. +been processed, but it can cause the replacement not to happen. +

    +

    +The callout function is not called for simulated substitutions that happen as a +result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In this mode, when +substitution processing exceeds the buffer space provided by the caller, +processing continues by counting code units. The simulation is unable to +populate the callout block, and so the simulation is pessimistic about the +required buffer size. Whichever is larger, the accepted or the rejected substitution, +is reported as the required size. Therefore, the returned buffer length may be +an overestimate (without a substitution callout, it is normally an exact +measurement).

    The first argument of the callout function is a pointer to a substitute callout @@ -3903,6 +4112,107 @@

    pcre2api man page

    output and the call to pcre2_substitute() exits, returning the number of matches so far.

    +
    +Substitution case callouts +
    +

    +int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
    +
    +The pcre2_set_substitute_case_callout() function can be used to specify +a callout function for pcre2_substitute() to use when performing case +transformations. This does not affect any case insensitivity behaviour when +performing a match, but only the user-visible transformations performed when +processing a substitution such as: +

    +    pcre2_substitute(..., "\\U$1", ...)
    +
    +

    +

    +The default case transformations applied by PCRE2 are reasonably complete, and, +in UTF or UCP mode, perform the simple locale-invariant case transformations as +specified by Unicode. This is suitable for the internal (invisible) +case-equivalence procedures used during pattern matching, but an application +may wish to use more sophisticated locale-aware processing for the user-visible +substitution transformations. +

    +

    +One example implementation of the callout_function using the ICU +library would be: +
    +
    +

    +    PCRE2_SIZE
    +    icu_case_callout(
    +      PCRE2_SPTR input, PCRE2_SIZE input_len,
    +      PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
    +      int to_case, void *data_ptr)
    +    {
    +      UErrorCode err = U_ZERO_ERROR;
    +      int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER
    +        ? u_strToLower(output, output_cap, input, input_len, NULL, &err)
    +        : to_case == PCRE2_SUBSTITUTE_CASE_UPPER
    +        ? u_strToUpper(output, output_cap, input, input_len, NULL, &err)
    +        : u_strToTitle(output, output_cap, input, input_len, &first_char_only,
    +                       NULL, &err);
    +      if (U_FAILURE(err)) return (~(PCRE2_SIZE)0);
    +      return r;
    +    }
    +
    +

    +

    +The first and second arguments of the case callout function are the Unicode +string to transform. +

    +

    +The third and fourth arguments are the output buffer and its capacity. +

    +

    +The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, +PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. +PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the +callout to indicate that the case of the entire callout input should be +case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that +only the first character or glyph should be transformed to Unicode titlecase +and the rest to Unicode lowercase (note that titlecasing sometimes uses Unicode +properties to titlecase each word in a string; but PCRE2 is requesting that only +the single leading character is to be titlecased). +

    +

    +The sixth argument is the callout_data supplied to +pcre2_set_substitute_case_callout(). +

    +

    +The resulting string in the destination buffer may be larger or smaller than the +input, if the casing rules merge or split characters. The return value is the +length required for the output string. If a buffer of sufficient size was +provided to the callout, then the result must be written to the buffer and the +number of code units returned. If the result does not fit in the provided +buffer, then the required capacity must be returned and PCRE2 will not make use +of the output buffer. PCRE2 provides input and output buffers which overlap, so +the callout must support this by suitable internal buffering. +

    +

    +Alternatively, if the callout wishes to indicate an error, then it may return +(~(PCRE2_SIZE)0). In this case pcre2_substitute() will immediately fail with +error PCRE2_ERROR_REPLACECASE. +

    +

    +When a case callout is combined with the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH +option, there are situations when pcre2_substitute() will return an +underestimate of the required buffer size. If you call pcre2_substitute() once +with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the input buffer is too small for +the replacement string to be constructed, then instead of calling the case +callout, pcre2_substitute() will make an estimate of the required buffer size. +The second call should also pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that +second call is not guaranteed to succeed either, if the case callout requires +more buffer space than expected. The caller must make repeated attempts in a +loop. +


    DUPLICATE CAPTURE GROUP NAMES

    int pcre2_substring_nametable_scan(const pcre2_code *code, @@ -4177,7 +4487,7 @@

    pcre2api man page


    REVISION

    -Last updated: 24 April 2024 +Last updated: 26 December 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2build.html b/usr/share/doc/pcre2/html/pcre2build.html index d4b0d336b08..f4e127f14ca 100644 --- a/usr/share/doc/pcre2/html/pcre2build.html +++ b/usr/share/doc/pcre2/html/pcre2build.html @@ -643,7 +643,7 @@

    pcre2build man page


    REVISION

    -Last updated: 15 April 2024 +Last updated: 16 April 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2compat.html b/usr/share/doc/pcre2/html/pcre2compat.html index d60182ed48a..5f7e280d34f 100644 --- a/usr/share/doc/pcre2/html/pcre2compat.html +++ b/usr/share/doc/pcre2/html/pcre2compat.html @@ -71,7 +71,7 @@

    pcre2compat man page

    7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties such as Lu and -Nd, the derived properties Any and LC (synonym L&), script names such as Greek +Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use is limited. See the @@ -99,7 +99,12 @@

    pcre2compat man page

    \Q\\E \ \\E
    The \Q...\E sequence is recognized both inside and outside character classes -by both PCRE2 and Perl. +by both PCRE2 and Perl. Another difference from Perl is that any appearance of +\Q or \E inside what might otherwise be a quantifier causes PCRE2 not to +recognize the sequence as a quantifier. Perl recognizes a quantifier if +(redundantly) either of the numbers is inside \Q...\E, but not if the +separating comma is. When not recognized as a quantifier a sequence such as +{\Q1\E,2} is treated as the literal string "{1,2}".

    9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) @@ -120,7 +125,9 @@

    pcre2compat man page

    not always the case in Perl. In particular, if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any | characters. Note that such groups are -processed as anchored at the point where they are tested. +processed as anchored at the point where they are tested. PCRE2 also confines +all control verbs within atomic assertions, again including (*THEN) in +assertions with only one branch.

    12. If a pattern contains more than one backtracking control verb, the first @@ -159,11 +166,11 @@

    pcre2compat man page

    certainly user mistakes.

    -17. In PCRE2, the upper/lower case character properties Lu and Ll are not -affected when case-independent matching is specified. For example, \p{Lu} -always matches an upper case letter. I think Perl has changed in this respect; -in the release at the time of writing (5.38), \p{Lu} and \p{Ll} match all -letters, regardless of case, when case independence is specified. +17. In PCRE2, until release 10.45, the upper/lower case character properties Lu +and Ll were not affected when case-independent matching was specified. Perl has +changed in this respect, and PCRE2 has now changed to match. When caseless +matching is in force, Lu, Ll, and Lt (title case) are all treated as Lc (cased +letter).

    18. From release 5.32.0, Perl locks out the use of \K in lookaround @@ -231,6 +238,10 @@

    pcre2compat man page

    numbers such as +2 and -4 in all three cases. Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions. +
    +
    +(m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 extension +that is not available in Perl.

    20. Perl has different limits than PCRE2. See the @@ -252,6 +263,18 @@

    pcre2compat man page

    /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject.

    +

    +23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl tries to +recover and prints a warning if the problem was that an invalid hexadecimal +digit was found. Since PCRE2 doesn't have warnings, it returns an error instead. +Additionally, Perl accepts \x{} and generates NUL, unlike PCRE2. +

    +

    +24. From release 10.45, PCRE2 gives an error if \x is not followed by a +hexadecimal digit or a curly bracket. It used to interpret this as the NUL +character. Perl still generates NUL, but warns when in warning mode in most +cases. +


    AUTHOR
    @@ -267,9 +290,9 @@

    pcre2compat man page

    REVISION

    -Last updated: 30 November 2023 +Last updated: 02 October 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/usr/share/doc/pcre2/html/pcre2convert.html b/usr/share/doc/pcre2/html/pcre2convert.html index 6b9fea5575e..57e8989fb4a 100644 --- a/usr/share/doc/pcre2/html/pcre2convert.html +++ b/usr/share/doc/pcre2/html/pcre2convert.html @@ -182,7 +182,7 @@

    pcre2convert man page


    REVISION

    -Last updated: 28 June 2018 +Last updated: 14 November 2023
    Copyright © 1997-2018 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2grep.html b/usr/share/doc/pcre2/html/pcre2grep.html index bd12246ae99..66c56029698 100644 --- a/usr/share/doc/pcre2/html/pcre2grep.html +++ b/usr/share/doc/pcre2/html/pcre2grep.html @@ -391,9 +391,10 @@

    pcre2grep man page

    command line, no delimiters should be used. What constitutes a newline when reading the file is the operating system's default interpretation of \n. The --newline option has no effect on this option. Trailing white space is -removed from each line, and blank lines are ignored. An empty file contains no +removed from each line, and blank lines are ignored unless the +--posix-pattern-file option is also provided. An empty file contains no patterns and therefore matches nothing. Patterns read from a file in this way -may contain binary zeros, which are treated as ordinary data characters. +may contain binary zeros, which are treated as ordinary character literals.

    If this option is given more than once, all the specified files are read. A @@ -723,9 +724,9 @@

    pcre2grep man page



    $<digits> or ${<digits>} is replaced by the captured substring of the given -decimal number; zero substitutes the whole match. If the number is greater than -the number of capturing substrings, or if the capture is unset, the replacement -is empty. +decimal number; $& (or the legacy $0) substitutes the whole match. If the +number is greater than the number of capturing substrings, or if the capture +is unset, the replacement is empty.

    $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by @@ -808,6 +809,15 @@

    pcre2grep man page

    allowing \w to match Unicode letters and digits.

    +--posix-pattern-file +When patterns are provided with the -f option, do not trim trailing +spaces or ignore empty lines in a similar way to other grep tools. To keep +the behaviour consistent with older versions, if the pattern read was +terminated with CRLF (as character literals) then both characters won't be +included as part of it, so if you really need to have a pattern ending in '\r', +use an escape sequence or provide it by a different method. +

    +

    -q, --quiet Work quietly, that is, display nothing except error messages. The exit status indicates whether or not any matches were found. @@ -993,7 +1003,7 @@

    pcre2grep man page

    callout facility. However, this support can be completely or partially disabled when pcre2grep is built. You can find out whether your binary has support for callouts by running it with the --help option. If callout support is -completely disabled, all callouts in patterns are ignored by pcre2grep. +completely disabled, callouts in patterns are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored.

    @@ -1015,9 +1025,9 @@

    pcre2grep man page

    zero-terminated string, which means it should not contain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the --output (-O) option (see -above). However, $0 cannot be used to insert a matched substring because the -match is still in progress. Instead, the single character '0' is inserted. Any -syntax errors in the string (for example, a dollar not followed by another +above). However, $0 or $& cannot be used to insert a matched substring because +the match is still in progress. Instead, the single character '0' is inserted. +Any syntax errors in the string (for example, a dollar not followed by another character) causes the callout to be ignored. No terminator is added to the output string, so if you want a newline, you must include it explicitly using the escape $n. For example: @@ -1047,9 +1057,9 @@

    pcre2grep man page

    Any substring (including the executable name) may contain escape sequences started by a dollar character. These are the same as for the --output -(-O) option documented above, except that $0 cannot insert the matched -string because the match is still in progress. Instead, the character '0' -is inserted. If you need a literal dollar or pipe character in any +(-O) option documented above, except that $0 or $& cannot insert the +matched string because the match is still in progress. Instead, the character +'0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. Here is an example:
       echo -e "abcde\n12345" | pcre2grep \
    @@ -1116,7 +1126,7 @@ 

    pcre2grep man page


    REVISION

    -Last updated: 22 December 2023 +Last updated: 04 February 2025
    Copyright © 1997-2023 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2jit.html b/usr/share/doc/pcre2/html/pcre2jit.html index d97d8003ccb..6835cd8898a 100644 --- a/usr/share/doc/pcre2/html/pcre2jit.html +++ b/usr/share/doc/pcre2/html/pcre2jit.html @@ -64,7 +64,7 @@

    pcre2jit man page

    If --enable-jit is set on an unsupported platform, compilation fails.

    -A client program can tell if JIT support is available by calling +A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular @@ -72,11 +72,19 @@

    pcre2jit man page

    items that are not supported by JIT (see below). Another reason is that in some environments JIT is unable to get -memory in which to build its compiled code. The only guarantee from +executable memory in which to build its compiled code. The only guarantee from pcre2_config() is that if it returns zero, JIT will definitely not be used.

    +As of release 10.45 there is a more informative way to test for JIT support. If +pcre2_jit_compile() is called with the single option PCRE2_JIT_TEST_ALLOC +it returns zero if JIT is available and has a working allocator. Otherwise it +returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable +memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled. The +code argument is ignored, so it can be a NULL value. +

    +

    A simple program does not need to check availability in order to use JIT when possible. The API is implemented in a way that falls back to the interpretive code if JIT is not available or cannot be used for a given match. For programs @@ -126,7 +134,8 @@

    pcre2jit man page

    PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial matching. If pcre2_jit_compile() is called with no option bits set, it immediately -returns zero. This is an alternative way of testing whether JIT is available. +returns zero. This is an alternative way of testing whether JIT support has +been compiled.

    At present, it is not possible to free JIT compiled code except when the entire @@ -487,7 +496,7 @@

    pcre2jit man page


    REVISION

    -Last updated: 21 February 2024 +Last updated: 22 August 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2limits.html b/usr/share/doc/pcre2/html/pcre2limits.html index 8152ed22d71..514c50b2396 100644 --- a/usr/share/doc/pcre2/html/pcre2limits.html +++ b/usr/share/doc/pcre2/html/pcre2limits.html @@ -96,7 +96,7 @@

    pcre2limits man page

    REVISION

    -Last updated: August 2023 +Last updated: 16 August 2023
    Copyright © 1997-2023 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2matching.html b/usr/share/doc/pcre2/html/pcre2matching.html index 3b8b629380c..4d0232507b6 100644 --- a/usr/share/doc/pcre2/html/pcre2matching.html +++ b/usr/share/doc/pcre2/html/pcre2matching.html @@ -27,7 +27,7 @@

    pcre2matching man page

    This document describes the two different algorithms that are available in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() -function. This works in the same as Perl's matching function, and provide a +function. This works in the same way as Perl's matching function, and provides a Perl-compatible matching operation. The just-in-time (JIT) optimization that is described in the pcre2jit @@ -42,7 +42,7 @@

    pcre2matching man page

    When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, -when there are multiple possibilities. For example, if the pattern +when there are multiple possibilities. For example, if the anchored pattern

       ^<.*>
     
    @@ -115,9 +115,9 @@

    pcre2matching man page

    Note that the size of vector needed to contain all the results depends on the -number of simultaneous matches, not on the number of parentheses in the -pattern. Using pcre2_match_data_create_from_pattern() to create the match -data block is therefore not advisable when doing DFA matching. +number of simultaneous matches, not on the number of capturing parentheses in +the pattern. Using pcre2_match_data_create_from_pattern() to create the +match data block is therefore not advisable when doing DFA matching.

    Note also that all the matches that are found start at the same point in the @@ -166,37 +166,43 @@

    pcre2matching man page

    do this. This means that no captured substrings are available.

    -3. Because no substrings are captured, backreferences within the pattern are -not supported. -

    -

    -4. For the same reason, conditional expressions that use a backreference as the -condition or test for a specific group recursion are not supported. -

    -

    -5. Again for the same reason, script runs are not supported. +3. Because no substrings are captured, a number of related features are not +available: +
    +
    +(a) Backreferences; +
    +
    +(b) Conditional expressions that use a backreference as the condition or test +for a specific group recursion; +
    +
    +(c) Script runs; +
    +
    +(d) Scan substring assertions.

    -6. Because many paths through the tree may be active, the \K escape sequence, +4. Because many paths through the tree may be active, the \K escape sequence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported.

    -7. Callouts are supported, but the value of the capture_top field is +5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0.

    -8. The \C escape sequence, which (in the standard algorithm) always matches a -single code unit, even in a UTF mode, is not supported in these modes, because +6. The \C escape sequence, which (in the standard algorithm) always matches a +single code unit, even in a UTF mode, is not supported in UTF modes because the alternative algorithm moves through the subject string one character (not code unit) at a time, for all active paths through the tree.

    -9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not +7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion.

    -10. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not +8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not supported by pcre2_dfa_match().


    ADVANTAGES OF THE ALTERNATIVE ALGORITHM
    @@ -223,15 +229,18 @@

    pcre2matching man page

    less susceptible to optimization.

    -2. Capturing parentheses, backreferences, script runs, and matching within -invalid UTF string are not supported. +2. Capturing parentheses and other features such as backreferences that rely on +them are not supported. +

    +

    +3. Matching within invalid UTF strings is not supported.

    -3. Although atomic groups are supported, their use does not provide the +4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm.

    -4. JIT optimization is not supported. +5. JIT optimization is not supported.


    AUTHOR

    @@ -244,7 +253,7 @@

    pcre2matching man page


    REVISION

    -Last updated: 19 January 2024 +Last updated: 30 August 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2partial.html b/usr/share/doc/pcre2/html/pcre2partial.html index 64116c4f20f..067064d90a1 100644 --- a/usr/share/doc/pcre2/html/pcre2partial.html +++ b/usr/share/doc/pcre2/html/pcre2partial.html @@ -399,7 +399,7 @@

    pcre2partial man page


    REVISION

    -Last updated: 04 September 2019 +Last updated: 27 November 2024
    Copyright © 1997-2019 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2pattern.html b/usr/share/doc/pcre2/html/pcre2pattern.html index cf50c1a1095..84eb0aa17c5 100644 --- a/usr/share/doc/pcre2/html/pcre2pattern.html +++ b/usr/share/doc/pcre2/html/pcre2pattern.html @@ -14,37 +14,41 @@

    pcre2pattern man page



    PCRE2 REGULAR EXPRESSION DETAILS

    @@ -52,9 +56,11 @@

    pcre2pattern man page

    are described in detail below. There is a quick-reference syntax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. -PCRE2 also supports some alternative regular expression syntax (which does not -conflict with the Perl syntax) in order to provide some compatibility with -regular expressions in Python, .NET, and Oniguruma. +PCRE2 also supports some alternative regular expression syntax that does not +conflict with the Perl syntax in order to provide some compatibility with +regular expressions in Python, .NET, and Oniguruma. There are in addition some +options that enable alternative syntax and semantics that are not the same as +in Perl.

    Perl's regular expressions are described in its own documentation, and regular @@ -74,7 +80,19 @@

    pcre2pattern man page

    pcre2matching page.

    -
    SPECIAL START-OF-PATTERN ITEMS
    +
    EBCDIC CHARACTER CODES
    +

    +Most computers use ASCII or Unicode for encoding characters, and PCRE2 assumes +this by default. However, it can be compiled to run in an environment that uses +the EBCDIC code, which is the case for some IBM mainframe operating systems. In +the sections below, character code values are ASCII or Unicode; in an EBCDIC +environment these characters may have different code values, and there are no +code points greater than 255. Differences in behaviour when PCRE2 is running in +an EBCDIC environment are described in the section +"EBCDIC environments" +below, which you can ignore unless you really are in an EBCDIC environment. +

    +
    SPECIAL START-OF-PATTERN ITEMS

    A number of options that can be passed to pcre2_compile() can also be set by special items at the start of a pattern. These are not Perl-compatible, but @@ -141,7 +159,8 @@

    pcre2pattern man page


    If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting -the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making quantifiers +the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_optimize() with +a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from making quantifiers possessive when what follows cannot match the repeated item. For example, by default a+b is treated as a++b. For more details, see the pcre2api @@ -152,8 +171,9 @@

    pcre2pattern man page


    If a pattern starts with (*NO_START_OPT), it has the same effect as setting the -PCRE2_NO_START_OPTIMIZE option. This disables several optimizations for quickly -reaching "no match" results. For more details, see the +PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_optimize() with +a PCRE2_START_OPTIMIZE_OFF directive. This disables several optimizations for +quickly reaching "no match" results. For more details, see the pcre2api documentation.

    @@ -162,7 +182,8 @@

    pcre2pattern man page


    If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect as -setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimizations that +setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_optimize() +with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables optimizations that apply to patterns whose top-level branches all start with .* (match any number of arbitrary characters). For more details, see the pcre2api @@ -275,14 +296,6 @@

    pcre2pattern man page

    (*BSR_ANYCRLF). For completeness, (*BSR_UNICODE) is also recognized, corresponding to PCRE2_BSR_UNICODE.

    -
    EBCDIC CHARACTER CODES
    -

    -PCRE2 can be compiled to run in an environment that uses EBCDIC as its -character code instead of ASCII or Unicode (typically a mainframe system). In -the sections below, character code values are ASCII or Unicode; in an EBCDIC -environment these characters may have different code values, and there are no -code points greater than 255. -


    CHARACTERS AND METACHARACTERS

    A regular expression is a pattern that is matched against a subject string from @@ -298,7 +311,10 @@

    pcre2pattern man page

    equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to -pcre2_compile() or set by (?r) within the pattern). +pcre2_compile() or set by (*CASELESS_RESTRICT) or (?r) within the +pattern). If the PCRE2_EXTRA_TURKISH_CASING option is in force (either passed +to pcre2_compile() or set by (*TURKISH_CASING) within the pattern), then +the 'i' letters are matched according to the rules of the Turkish and Azeri languages.

    The power of regular expressions comes from the ability to include wild cards, @@ -346,7 +362,7 @@

    pcre2pattern man page

    If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or -between a # outside a character class and the next newline, inclusive, are +between a # outside a character class and the next newline, inclusive, is ignored. An escaping backslash can be used to include a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are @@ -404,6 +420,14 @@

    pcre2pattern man page

    the pattern (that is, \E is assumed at the end). If the isolated \Q is inside a character class, this causes an error, because the character class is then not terminated by a closing square bracket. +

    +

    +Another difference from Perl is that any appearance of \Q or \E inside what +might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a +quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers +is inside \Q...\E, but not if the separating comma is. When not recognized as +a quantifier a sequence such as {\Q1\E,2} is treated as the literal string +"{1,2}".


    Non-printing characters @@ -424,17 +448,28 @@

    pcre2pattern man page

    \r carriage return (hex 0D) (but see below) \t tab (hex 09) \0dd character with octal code 0dd - \ddd character with octal code ddd, or backreference + \ddd character with octal code ddd, or back reference \o{ddd..} character with octal code ddd.. \xhh character with hex code hh \x{hhh..} character with hex code hhh.. \N{U+hhh..} character with Unicode hex code point hhh..
    -By default, after \x that is not followed by {, from zero to two hexadecimal -digits are read (letters can be in upper or lower case). Any number of -hexadecimal digits may appear between \x{ and }. If a character other than a -hexadecimal digit appears between \x{ and }, or if there is no terminating }, -an error occurs. +A description of how back references work is given +later, +following the discussion of +parenthesized groups. +

    +

    +By default, after \x that is not followed by {, one or two hexadecimal +digits are read (letters can be in upper or lower case). If the character that +follows \x is neither { nor a hexadecimal digit, an error occurs. This is +different from Perl's default behaviour, which generates a NUL character, but +is in line with the behaviour of Perl's 'strict' mode in re. +

    +

    +Any number of hexadecimal digits may appear between \x{ and }. If a character +other than a hexadecimal digit appears between \x{ and }, or if there is no +terminating }, an error occurs.

    Characters whose code points are less than 256 can be defined by either of the @@ -481,69 +516,54 @@

    pcre2pattern man page

    a compile-time error occurs.

    -When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, -\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c -escape is processed as specified for Perl in the perlebcdic document. The -only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], -^, _, or ?. Any other character provokes a compile-time error. The sequence -\c@ encodes character code 0; after \c the letters (in either case) encode -characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 -(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +For differences in the way some escapes behave in EBCDIC environments, +see section +"EBCDIC environments" +below.

    +
    +Octal escapes and back references +

    -Thus, apart from \c?, these escapes generate the same character code values as -they do in an ASCII environment, though the meanings of the values mostly -differ. For example, \cG always generates code value 7, which is BEL in ASCII -but DEL in EBCDIC. +The escape \o must be followed by a sequence of octal digits, enclosed in +braces. An error occurs if this is not the case. This escape provides a way of +specifying character code points as octal numbers greater than 0777, and it +also allows octal numbers and backreferences to be unambiguously distinguished.

    -The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but -because 127 is not a control character in EBCDIC, Perl makes it generate the -APC character. Unfortunately, there are several variants of EBCDIC. In most of -them the APC character has the value 255 (hex FF), but in the one Perl calls -POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC -values, PCRE2 makes \c? generate 95; otherwise it generates 255. +If braces are not used, after \0 up to two further octal digits are read. +However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one more octal digit +must follow \0 (use \00 to generate a NUL character). Make sure you supply +two digits after the initial zero if the pattern character that follows is +itself an octal digit.

    -After \0 up to two further octal digits are read. If there are fewer than two -digits, just those that are present are used. Thus the sequence \0\x\015 -specifies two binary zeros followed by a CR character (code value 13). Make -sure you supply two digits after the initial zero if the pattern character that -follows is itself an octal digit. +Inside a character class, when a backslash is followed by any octal digit, up +to three octal digits are read to generate a code point. Any subsequent digits +stand for themselves. The sequences \8 and \9 are treated as the literal +characters "8" and "9".

    -The escape \o must be followed by a sequence of octal digits, enclosed in -braces. An error occurs if this is not the case. This escape is a recent -addition to Perl; it provides way of specifying character code points as octal -numbers greater than 0777, and it also allows octal numbers and backreferences -to be unambiguously specified. +Outside a character class, Perl's handling of a backslash followed by a digit +other than 0 is complicated by ambiguity, and Perl has changed over time, +causing PCRE2 also to change. From PCRE2 release 10.45 there is an option +called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use Python's unambiguous +rules. The next two subsections describe the two sets of rules.

    For greater clarity and unambiguity, it is best to avoid following \ by a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical -character code points, and \g{...} to specify backreferences. The following -paragraphs describe the old, ambiguous syntax. -

    -

    -The handling of a backslash followed by a digit other than 0 is complicated, -and Perl has changed over time, causing PCRE2 also to change. -

    -

    -Outside a character class, PCRE2 reads the digit and any following digits as a -decimal number. If the number is less than 10, begins with the digit 8 or 9, or -if there are at least that many previous capture groups in the expression, the -entire sequence is taken as a backreference. A description of how this -works is given -later, -following the discussion of -parenthesized groups. -Otherwise, up to three octal digits are read to form a character code. +character code points, and \g{...} to specify backreferences.

    +
    +Perl rules for non-class backslash 1-9 +

    -Inside a character class, PCRE2 handles \8 and \9 as the literal characters -"8" and "9", and otherwise reads up to three octal digits following the -backslash, using them to generate a data character. Any subsequent digits stand -for themselves. For example, outside a character class: +All the digits that follow the backslash are read as a decimal number. If the +number is less than 10, begins with the digit 8 or 9, or if there are at least +that many previous capture groups in the expression, the entire sequence is +taken as a back reference. Otherwise, up to three octal digits are read to form +a character code. For example:

       \040   is another way of writing an ASCII space
       \40    is the same, provided there are fewer than 40 previous capture groups
    @@ -560,6 +580,19 @@ 

    pcre2pattern man page

    digits are ever read.


    +Python rules for non-class backslash 1-9 +
    +

    +If there are at least three octal digits after the backslash, exactly three are +read as an octal code point number, but the value must be no greater than +\377, even in modes where higher code point values are supported. Any +subsequent digits stand for themselves. If there are fewer than three octal +digits, the sequence is taken as a decimal back reference. Thus, for example, +\12 is always a back reference, independent of how many captures there are in +the pattern. An error is generated for a reference to a non-existent capturing +group. +

    +
    Constraints on character values

    @@ -805,7 +838,7 @@

    pcre2pattern man page

    sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing characters whose code points are -less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points +less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Unknown script and with an unassigned type.

    @@ -823,12 +856,33 @@

    pcre2pattern man page

    \P{xx} a character without the xx property \X a Unicode extended grapheme cluster
    -The property names represented by xx above are not case-sensitive, and in -accordance with Unicode's "loose matching" rules, spaces, hyphens, and -underscores are ignored. There is support for Unicode script names, Unicode -general category properties, "Any", which matches any character (including -newline), Bidi_Class, a number of binary (yes/no) properties, and some special -PCRE2 properties (described +For compatibility with Perl, negation can be specified by including a +circumflex between the opening brace and the property. For example, \p{^Lu} is +the same as \P{Lu}. +

    +

    +In accordance with Unicode's "loose matching" rules, ASCII white space +characters, hyphens, and underscores are ignored in the properties represented +by xx above. As well as the space character, ASCII white space can be +tab, linefeed, vertical tab, formfeed, or carriage return. +

    +

    +Some properties are specified as a name only; others as a name and a value, +separated by a colon or an equals sign. The names and values consist of ASCII +letters and digits (with one Perl-specific exception, see below). They are not +case sensitive. Note, however, that the escapes themselves, \p and \P, +are case sensitive. There are abbreviations for many names. The following +examples are all equivalent: +

    +  \p{bidiclass=al}
    +  \p{BC=al}
    +  \p{ Bidi_Class : AL }
    +  \p{ Bi-di class = Al }
    +  \P{ ^ Bi-di class = Al }
    +
    +There is support for Unicode script names, Unicode general category properties, +"Any", which matches any character (including newline), Bidi_Class, a number of +binary (yes/no) properties, and some special PCRE2 properties (described below). Certain other Perl properties such as "InMusicalSymbols" are not supported by PCRE2. Note that \P{Any} does not match any characters, so always causes a @@ -844,10 +898,11 @@

    pcre2pattern man page

    example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and "script extensions" for the -property types are recognized, and a equals sign is an alternative to the -colon. If a script name is given without a property type, for example, -\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this -interpretation at release 5.26 and PCRE2 changed at release 10.40. +property types are recognized and, as for all property specifications, an +equals sign is an alternative to the colon. If a script name is given without a +property type, for example, \p{Adlam}, it is treated as \p{scx:Adlam}. Perl +changed to this interpretation at release 5.26 and PCRE2 changed at release +10.40.

    Unassigned characters (and in non-UTF 32-bit mode, characters with code points @@ -865,15 +920,10 @@

    pcre2pattern man page


    Each character has exactly one Unicode general category property, specified by -a two-letter abbreviation. For compatibility with Perl, negation can be -specified by including a circumflex between the opening brace and the property -name. For example, \p{^Lu} is the same as \P{Lu}. -

    -

    -If only one letter is specified with \p or \P, it includes all the general -category properties that start with that letter. In this case, in the absence -of negation, the curly brackets in the escape sequence are optional; these two -examples have the same effect: +a two-letter abbreviation. If only one letter is specified with \p or \P, it +includes all the general category properties that start with that letter. In +this case, in the absence of negation, the curly brackets in the escape +sequence are optional; these two examples have the same effect:

       \p{L}
       \pL
    @@ -888,6 +938,7 @@ 

    pcre2pattern man page

    Cs Surrogate L Letter + Lc Cased letter Ll Lower case letter Lm Modifier letter Lo Other letter @@ -924,9 +975,13 @@

    pcre2pattern man page

    Zp Paragraph separator Zs Space separator
    -The special property LC, which has the synonym L&, is also supported: it -matches a character that has the Lu, Ll, or Lt property, in other words, a -letter that is not classified as a modifier or "other". +Perl originally used the name L& for the Lc property. This is still supported +by Perl, but discouraged. PCRE2 also still supports it. This property matches +any character that has the Lu, Ll, or Lt property, in other words, any letter +that is not classified as a modifier or "other". From release 10.45 of PCRE2 +the properties Lu, Ll, and Lt are all treated as Lc when case-independent +matching is set by the PCRE2_CASELESS option or (?i) within the pattern. The +other properties are not affected by caseless matching.

    The Cs (Surrogate) property applies only to characters whose code points are in @@ -948,11 +1003,6 @@

    pcre2pattern man page

    Instead, this property is assumed for any code point that is not in the Unicode table.

    -

    -Specifying caseless matching does not affect these escape sequences. For -example, \p{Lu} always matches only upper case letters. This is different from -the behaviour of current versions of Perl. -


    Binary (yes/no) properties for \p and \P
    @@ -997,10 +1047,11 @@

    pcre2pattern man page

    RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space
    -An equals sign may be used instead of a colon. The class names are -case-insensitive; only the short names listed above are recognized. +As in all property specifications, an equals sign may be used instead of a +colon and the class names are case-insensitive. Only the short names listed +above are recognized; PCRE2 does not at present support any long alternatives.


    Extended grapheme clusters @@ -1073,11 +1124,11 @@

    pcre2pattern man page

    Xan matches characters that have either the L (letter) or the N (number) property. Xps matches the characters tab, linefeed, vertical tab, form feed, or -carriage return, and any other character that has the Z (separator) property. -Xsp is the same as Xps; in PCRE1 it used to exclude vertical tab, for Perl -compatibility, but Perl changed. Xwd matches the same characters as Xan, plus -those that match Mn (non-spacing mark) or Pc (connector punctuation, which -includes underscore). +carriage return, and any other character that has the Z (separator) property +(this includes the space character). Xsp is the same as Xps; in PCRE1 it used +to exclude vertical tab, for Perl compatibility, but Perl changed. Xwd matches +the same characters as Xan, plus those that match Mn (non-spacing mark) or Pc +(connector punctuation, which includes underscore).

    There is another non-standard property, Xuc, which matches any character that @@ -1389,13 +1440,12 @@

    pcre2pattern man page

    character, or escape it with a backslash.

    -For example, the character class [aeiou] matches any lower case vowel, while -[^aeiou] matches any character that is not a lower case vowel. Note that a -circumflex is just a convenient notation for specifying the characters that -are in the class by enumerating those that are not. A class that starts with a -circumflex is not an assertion; it still consumes a character from the subject -string, and therefore it fails if the current pointer is at the end of the -string. +For example, the character class [aeiou] matches any lower case English vowel, +whereas [^aeiou] matches all other characters. Note that a circumflex is just a +convenient notation for specifying the characters that are in the class by +enumerating those that are not. A class that starts with a circumflex is not an +assertion; it still consumes a character from the subject string, and therefore +it fails to match if the current pointer is at the end of the string.

    Characters in a class may be specified by their code points using \o, \x, or @@ -1405,7 +1455,10 @@

    pcre2pattern man page

    match "A", whereas a caseful version would. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) -respectively when either PCRE2_UTF or PCRE2_UCP is set. +respectively when either PCRE2_UTF or PCRE2_UCP is set. If you do not want +these ASCII/non-ASCII case equivalences, you can suppress them by setting +PCRE2_EXTRA_CASELESS_RESTRICT, either as an option in a compile context, or by +including (*CASELESS_RESTRICT) or (?r) within a pattern.

    Characters that might indicate line breaks are never treated in any special way @@ -1437,6 +1490,12 @@

    pcre2pattern man page

    b to d, a hyphen character, or z.

    +There is some special treatment for alphabetic ranges in EBCDIC environments; +see the section +"EBCDIC environments" +below. +

    +

    Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d or \H. However, unless the hyphen is the last character in the class, Perl outputs a @@ -1448,9 +1507,9 @@

    pcre2pattern man page

    range. A pattern such as [W-]46] is interpreted as a class of two characters ("W" and "-") followed by a literal string "46]", so it would match "W46]" or "-46]". However, if the "]" is escaped with a backslash it is interpreted as -the end of range, so [W-\]46] is interpreted as a class containing a range -followed by two other characters. The octal or hexadecimal representation of -"]" can also be used to end a range. +the end of a range, so [W-\]46] is interpreted as a class containing a range +and two other characters. The octal or hexadecimal representation of "]" can +also be used to end a range.

    Ranges normally include all code points between the start and end characters, @@ -1463,15 +1522,6 @@

    pcre2pattern man page

    surrogates, are always permitted.

    -There is a special case in EBCDIC environments for ranges whose end points are -both specified as literal letters in the same case. For compatibility with -Perl, EBCDIC code points within the range that are not letters are omitted. For -example, [h-k] matches only four characters, even though the codes for h and k -are 0x88 and 0x92, a range of 11 code points. However, if the range is -specified numerically, for example, [\x88-\x92] or [h-\x92], all code points -are included. -

    -

    If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character @@ -1487,18 +1537,132 @@

    pcre2pattern man page

    something AND NOT ...".

    -The only metacharacters that are recognized in character classes are backslash, -hyphen (only where it can be interpreted as specifying a range), circumflex -(only at the start), opening square bracket (only when it can be interpreted as -introducing a POSIX class name, or for a special compatibility feature - see -the next two sections), and the terminating closing square bracket. However, -escaping other non-alphanumeric characters does no harm. +The metacharacters that are recognized in character classes are backslash, +hyphen (when it can be interpreted as specifying a range), circumflex +(only at the start), and the terminating closing square bracket. An opening +square bracket is also special when it can be interpreted as introducing a +POSIX class (see +"Posix character classes" +below), or a special compatibility feature (see +"Compatibility feature for word boundaries" +below). Escaping any non-alphanumeric character in a class turns it into a +literal, whether or not it would otherwise be a metacharacter. +

    +
    PERL EXTENDED CHARACTER CLASSES
    +

    +From release 10.45 PCRE2 supports Perl's (?[...]) extended character class +syntax. This can be used to perform set operations such as intersection on +character classes. +

    +

    +The syntax permitted within (?[...]) is quite different to ordinary character +classes. Inside the extended class, there is an expression syntax consisting of +"atoms", operators, and ordinary parentheses "()" used for grouping. Such +classes always have the Perl /xx modifier (PCRE2 option PCRE2_EXTENDED_MORE) +turned on within them. This means that literal space and tab characters are +ignored everywhere in the class. +

    +

    +The allowed atoms are individual characters specified by escape sequences such +as \n or \x{123}, character types such as \d, POSIX classes such as +[:alpha:], and nested ordinary (non-extended) character classes. For example, +in (?[\d & [...]]) the nested class [...] follows the usual rules for ordinary +character classes, in which parentheses are not metacharacters, and character +literals and ranges are permitted. +

    +

    +Character literals and ranges may not appear outside a nested ordinary +character class because they are not atoms in the extended syntax. The extended +syntax does not introduce any additional escape sequences, so (?[\y]) is an +unknown escape, as it would be in [\y]. +

    +

    +In the extended syntax, ^ does not negate a class (except within an +ordinary class nested inside an extended class); it is instead a binary +operator. +

    +

    +The binary operators are "&" (intersection), "|" or "+" (union), "-" +(subtraction) and "^" (symmetric difference). These are left-associative and +"&" has higher (tighter) precedence, while the others have equal lower +precedence. The one prefix unary operator is "!" (complement), with highest +precedence. +

    +
    UTS#18 EXTENDED CHARACTER CLASSES
    +

    +The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's (?[...]) +syntax, allowing instead extended class behaviour inside ordinary [...] +character classes. This altered syntax for [...] classes is loosely described +by the Unicode standard UTS#18. The PCRE2_ALT_EXTENDED_CLASS option does not +prevent use of (?[...]) classes; it just changes the meaning of all +[...] classes that are not nested inside a Perl (?[...]) class. +

    +

    +Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is a +character class with two literal characters "a" and "[", but in UTS#18 extended +classes the "[" character becomes an additional metacharacter within classes, +denoting the start of a nested class, so a literal "[" must be escaped as "\[". +

    +

    +Secondly, within the UTS#18 extended syntax, there are operators "||", "&&", +"--" and "~~" which denote character class union, intersection, subtraction, +and symmetric difference respectively. In standard Perl syntax, these would +simply be needlessly-repeated literals (except for "--" which could be the +start or end of a range). In UTS#18 extended classes these operators can be used +in constructs such as [\p{L}--[QW]] for "Unicode letters, other than Q and W". +A literal "-" at the start or end of a range must be escaped, so while "[--1]" +in Perl syntax is the range from hyphen to "1", it must be escaped as "[\--1]" +in UTS#18 extended classes. +

    +

    +Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option to +ignore space and tab characters is not automatically enabled for UTS#18 +extended classes, but it is honoured if set. +

    +

    +Extended UTS#18 classes can be nested, and nested classes are themselves +extended classes (unlike Perl, where nested classes must be simple classes). +For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any letter that is in +the Thai or Greek scripts. Note that this means that no special grouping +characters (such as the parentheses used in Perl's (?[...]) class syntax) are +needed. +

    +

    +Individual class items (literal characters, literal ranges, properties such as +\d or \p{...}, and nested classes) can be combined by juxtaposition or by an +operator. Juxtaposition is the implicit union operator, and binds more tightly +than any explicit operator. Thus a sequence of literals and/or ranges behaves +as if it is enclosed in square brackets. For example, [A-Z0-9&&[^E8]] is the +same as [[A-Z0-9]&&[^E8]], which matches any upper case alphanumeric character +except "E" or "8". +

    +

    +Precedence between the explicit operators is not defined, so mixing operators +is a syntax error. For example, [A&&B--C] is an error, but [A&&[B--C]] is +valid.

    -
    POSIX CHARACTER CLASSES
    +

    +This is an emerging syntax which is being adopted gradually across the regex +ecosystem: for example JavaScript adopted the "/v" flag in ECMAScript 2024; +Python's "re" module reserves the syntax for future use with a FutureWarning +for unescaped use of "[" as a literal within character classes. Due to UTS#18 +providing insufficient guidance, engines interpret the syntax differently. +Rust's "regex" crate and Python's "regex" PyPI module both implement UTS#18 +extended classes, but with slight incompatibilities ([A||B&&C] is parsed as +[A||[B&&C]] in Python's "regex" but as [[A||B]&&C] in Rust's "regex"). +

    +

    +PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v flag, so +that all the UTS#18 extended classes accepted as valid by PCRE2 have the +property that they are interpreted either with the same behaviour, or as +invalid, by all other major engines. Please file an issue if you are aware of +cross-engine differences in behaviour between PCRE2 and another major engine. +

    +
    POSIX CHARACTER CLASSES

    Perl supports the POSIX notation for character classes. This uses names enclosed by [: and :] within the enclosing square brackets. PCRE2 also supports -this notation. For example, +this notation, in both ordinary and extended classes. For example,

       [01[:alpha:]%]
     
    @@ -1584,7 +1748,7 @@

    pcre2pattern man page

    [:xdigit:] In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This -is a change that was made in PCRE release 10.43 for Perl compatibility. +is a change that was made in PCRE2 release 10.43 for Perl compatibility.

    The other POSIX classes are unchanged by PCRE2_UCP, and match only characters @@ -1597,8 +1761,8 @@

    pcre2pattern man page

    (?aT) and (?-aT). The PCRE2_EXTRA_ASCII_POSIX option disables UCP processing for all POSIX classes, including [:digit:] and [:xdigit:]. Within a pattern, (?aP) and (?-aP) set and unset both these options for consistency. -

    -
    COMPATIBILITY FEATURE FOR WORD BOUNDARIES
    +

    +
    COMPATIBILITY FEATURE FOR WORD BOUNDARIES

    In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of @@ -1619,7 +1783,7 @@

    pcre2pattern man page

    PCRE2_UCP option changes the meaning of \w (and therefore \b) by default, so it also affects these POSIX sequences.

    -
    VERTICAL BAR
    +
    VERTICAL BAR

    Vertical bar characters are used to separate alternative patterns. For example, the pattern @@ -1634,7 +1798,7 @@

    pcre2pattern man page

    "succeeds" means matching the rest of the main pattern as well as the alternative in the group.

    -
    INTERNAL OPTION SETTING
    +
    INTERNAL OPTION SETTING

    The settings of several options can be changed within a pattern by a sequence of letters enclosed between "(?" and ")". The following are Perl-compatible, @@ -1732,7 +1896,7 @@

    pcre2pattern man page

    the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, which lock out the use of the (*UTF) and (*UCP) sequences.

    -
    GROUPS
    +
    GROUPS

    Groups are delimited by parentheses (round brackets), which can be nested. Turning part of a pattern into a group does two things: @@ -1788,7 +1952,7 @@

    pcre2pattern man page

    reached, an option setting in one branch does affect subsequent branches, so the above patterns match "SUNDAY" as well as "Saturday".

    -
    DUPLICATE GROUP NUMBERS
    +
    DUPLICATE GROUP NUMBERS

    Perl 5.10 introduced a feature whereby each alternative in a group uses the same numbers for its capturing parentheses. Such a group starts with (?| and is @@ -1834,7 +1998,7 @@

    pcre2pattern man page

    An alternative approach to using this "branch reset" feature is to use duplicate named groups, as described in the next section.

    -
    NAMED CAPTURE GROUPS
    +
    NAMED CAPTURE GROUPS

    Identifying capture groups by number is simple, but it can be very hard to keep track of the numbers in complicated patterns. Furthermore, if an expression is @@ -1954,7 +2118,7 @@

    pcre2pattern man page

    pcre2api documentation.

    -
    REPETITION
    +
    REPETITION

    Repetition is specified by quantifiers, which may follow any one of these items: @@ -2118,8 +2282,9 @@

    pcre2pattern man page

    (?>.*?a)b It matches "ab" in the subject "aab". The use of the backtracking control verbs -(*PRUNE) and (*SKIP) also disable this optimization, and there is an option, -PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. +(*PRUNE) and (*SKIP) also disable this optimization. To do so explicitly, +either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, or call +pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive.

    When a capture group is repeated, the value captured is the substring that @@ -2135,7 +2300,7 @@

    pcre2pattern man page

    matches "aba" the value of the second captured substring is "b".

    -
    ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS
    +
    ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS

    With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") repetition, failure of what follows normally causes the repeated item to be @@ -2216,8 +2381,9 @@

    pcre2pattern man page

    PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. -This feature can be disabled by the PCRE2_NO_AUTOPOSSESS option, or starting -the pattern with (*NO_AUTO_POSSESS). +This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, by calling +pcre2_set_optimize() with a PCRE2_AUTO_POSSESS_OFF directive, or by +starting the pattern with (*NO_AUTO_POSSESS).

    When a pattern contains an unlimited repeat inside a group that can itself be @@ -2245,7 +2411,7 @@

    pcre2pattern man page

    sequences of non-digits cannot be broken, and failure happens quickly.

    -
    BACKREFERENCES
    +
    BACKREFERENCES

    Outside a character class, a backslash followed by a digit greater than 0 (and possibly further digits) is a backreference to a capture group earlier (that @@ -2383,23 +2549,32 @@

    pcre2pattern man page

    This restriction no longer applies, and backtracking into such groups can occur as normal.

    -
    ASSERTIONS
    +
    ASSERTIONS

    -An assertion is a test on the characters following or preceding the current -matching point that does not consume any characters. The simple assertions -coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described +An assertion is a test that does not consume any characters. The test must +succeed for the match to continue. The simple assertions coded as \b, \B, +\A, \G, \Z, \z, ^ and $ are described above.

    -More complicated assertions are coded as parenthesized groups. There are two -kinds: those that look ahead of the current position in the subject string, and -those that look behind it, and in each case an assertion may be positive (must -match for the assertion to be true) or negative (must not match for the -assertion to be true). An assertion group is matched in the normal way, -and if it is true, matching continues after it, but with the matching position +More complicated assertions are coded as parenthesized groups. If matching such +a group succeeds, matching continues after it, but with the matching position in the subject string reset to what it was before the assertion was processed.

    +A special kind of assertion, called a "scan substring" assertion, matches a +subpattern against a previously captured substring. This is described in the +section entitled +"Scan substring assertions" +below. It is a PCRE2 extension, not compatible with Perl. +

    +

    +The other group-based assertions are of two kinds: those that look ahead of the +current position in the subject string, and those that look behind it, and in +each case an assertion may be positive (must match for the assertion to be +true) or negative (must not match for the assertion to be true). +

    +

    The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no backtracking into the assertion. However, there are some cases where non-atomic assertions can be @@ -2624,7 +2799,7 @@

    pcre2pattern man page

    is another pattern that matches "foo" preceded by three digits and any three characters that are not "999".

    -
    NON-ATOMIC ASSERTIONS
    +
    NON-ATOMIC ASSERTIONS

    Traditional lookaround assertions are atomic. That is, if an assertion is true, but there is a subsequent matching failure, there is no backtracking into the @@ -2683,8 +2858,67 @@

    pcre2pattern man page

    that assertions that appear as conditions for conditional groups (see below) must be atomic. +

    +
    SCAN SUBSTRING ASSERTIONS
    +

    +A special kind of assertion, not compatible with Perl, makes it possible to +check the contents of a captured substring by matching it with a subpattern. +Because this involves capturing, this feature is not supported by +pcre2_dfa_match(). +

    +

    +A scan substring assertion starts with the sequence (*scan_substring: or +(*scs: which is followed by a list of substring numbers (absolute or relative) +and/or substring names enclosed in single quotes or angle brackets, all within +parentheses. The rest of the item is the subpattern that is applied to the +substring, as shown in these examples: +

    +  (*scan_substring:(1)...)
    +  (*scs:(-2)...)
    +  (*scs:('AB')...)
    +  (*scs:(1,'AB',-2)...)
    +
    +The list of groups is checked in the order they are given, and it is the +contents of the first one that is found to be set that are scanned. When +PCRE2_DUPNAMES is set and there are ambiguous group names, all groups with the +same name are checked in numerical order. A scan substring assertion fails if +none of the groups it references have been set.

    -
    SCRIPT RUNS
    +

    +The pattern match on the substring is always anchored, that is, it must match +from the start of the substring. There is no "bumpalong" if it does not match +at the start. The end of the subject is temporarily reset to be the end of the +substring, so \Z, \z, and $ will match there. However, the start of the +subject is not reset. This means that ^ matches only if the substring is +actually at the start of the main subject, but it also means that lookbehind +assertions into what precedes the substring are possible. +

    +

    +Here is a very simple example: find a word that contains the rare (in English) +sequence of letters "rh" not at the start: +

    +  \b(\w++)(*scs:(1).+rh)
    +
    +The first group captures a word which is then scanned by the second group. +This example does not actually need this heavyweight feature; the same match +can be achieved with: +
    +  \b\w+?rh\w*\b
    +
    +When things are more complicated, however, scanning a captured substring can be +a useful way to describe the required match. For example, there is a rather +complicated pattern in the PCRE2 test data that checks an entire subject string +for a palindrome, that is, the sequence of letters is the same in both +directions. Suppose you want to search for individual words of two or more +characters such as "level" that are palindromes: +
    +  (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...)
    +
    +Within a substring scanning subpattern, references to other groups work as +normal. Capturing groups may appear, and will retain their values during +ongoing matching if the assertion succeeds. +

    +
    SCRIPT RUNS

    In concept, a script run is a sequence of characters that are all from the same Unicode script such as Latin or Greek. However, because some scripts are @@ -2746,7 +2980,7 @@

    pcre2pattern man page

    should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking.

    -
    CONDITIONAL GROUPS
    +
    CONDITIONAL GROUPS

    It is possible to cause the matching process to obey a pattern fragment conditionally or to choose between two alternative fragments, depending on @@ -2947,13 +3181,13 @@

    pcre2pattern man page

    assertion, whether it succeeds or fails. (Compare non-conditional assertions, for which captures are retained only for positive assertions that succeed.)

    -
    COMMENTS
    +
    COMMENTS

    There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related characters such as -(?: or a group name or number. The characters that make up a comment play -no part in the pattern matching. +(?: or a group name or number or a Unicode property name. The characters that +make up a comment play no part in the pattern matching.

    The sequence (?# marks the start of a comment that continues up to the next @@ -2977,7 +3211,7 @@

    pcre2pattern man page

    it does not terminate the comment. Only an actual character with the code value 0x0a (the default newline) does so.

    -
    RECURSIVE PATTERNS
    +
    RECURSIVE PATTERNS

    Consider the problem of matching a string in parentheses, allowing for unlimited nested parentheses. Without the use of recursion, the best that can @@ -3165,7 +3399,7 @@

    pcre2pattern man page

    "b" and so the whole match succeeds. This match used to fail in Perl, but in later versions (I tried 5.024) it now works.

    -
    GROUPS AS SUBROUTINES
    +
    GROUPS AS SUBROUTINES

    If the syntax for a recursive group call (either by number or by name) is used outside the parentheses to which it refers, it operates a bit like a subroutine @@ -3213,7 +3447,7 @@

    pcre2pattern man page

    "Backtracking verbs in subroutines" below.

    -
    ONIGURUMA SUBROUTINE SYNTAX
    +
    ONIGURUMA SUBROUTINE SYNTAX

    For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative @@ -3231,7 +3465,7 @@

    pcre2pattern man page

    Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not synonymous. The former is a backreference; the latter is a subroutine call.

    -
    CALLOUTS
    +
    CALLOUTS

    Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl code to be obeyed in the middle of matching a regular expression. This makes it @@ -3244,7 +3478,9 @@

    pcre2pattern man page

    function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). If no match context is passed, or if the callout -entry point is set to NULL, callouts are disabled. +entry point is set to NULL, callout points will be passed over silently during +matching. To disallow callouts in the pattern syntax, you may use the +PCRE2_EXTRA_NEVER_CALLOUT option.

    Within a regular expression, (?C<arg>) indicates a point at which the external @@ -3307,7 +3543,7 @@

    pcre2pattern man page

    The doubling is removed before the string is passed to the callout function.

    -
    BACKTRACKING CONTROL
    +
    BACKTRACKING CONTROL

    There are a number of special "Backtracking Control Verbs" (to use Perl's terminology) that modify the behaviour of backtracking during matching. They @@ -3347,8 +3583,8 @@

    pcre2pattern man page

    Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the traditional matching -function, because that uses a backtracking algorithm. With the exception of -(*FAIL), which behaves like a failing negative assertion, the backtracking +function or JIT, because they use backtracking algorithms. With the exception +of (*FAIL), which behaves like a failing negative assertion, the backtracking control verbs cause an error if encountered by the DFA matching function.

    @@ -3369,7 +3605,8 @@

    pcre2pattern man page

    present. When one of these optimizations bypasses the running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option -when calling pcre2_compile(), or by starting the pattern with +when calling pcre2_compile(), by calling pcre2_set_optimize() with a +PCRE2_START_OPTIMIZE_OFF directive, or by starting the pattern with (*NO_START_OPT). There is more discussion of this option in the section entitled "Compiling a pattern" @@ -3502,7 +3739,8 @@

    pcre2pattern man page

    If you are interested in (*MARK) values after failed matches, you should -probably set the PCRE2_NO_START_OPTIMIZE option +probably either set the PCRE2_NO_START_OPTIMIZE option or call +pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see above) to ensure that the match is always attempted.

    @@ -3514,9 +3752,9 @@

    pcre2pattern man page

    with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, backtracking cannot pass to the left of the verb. However, when one of these verbs appears inside an -atomic group or in a lookaround assertion that is true, its effect is confined -to that group, because once the group has been matched, there is never any -backtracking into it. Backtracking from beyond an assertion or an atomic group +atomic group or in an atomic lookaround assertion that is true, its effect is +confined to that group, because once the group has been matched, there is never +any backtracking into it. Backtracking from beyond an atomic assertion or group ignores the entire group, and seeks a preceding backtracking point.

    @@ -3782,9 +4020,11 @@

    pcre2pattern man page

    assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.

    -PCRE2 now supports non-atomic positive assertions, as described in the section -entitled +PCRE2 now supports non-atomic positive assertions and also "scan substring" +assertions, as described in the sections entitled "Non-atomic assertions" +and +"Scan substring assertions" above. These assertions must be standalone (not used as conditions). They are not Perl-compatible. For these assertions, a later backtrack does jump back into the assertion, and therefore verbs such as (*COMMIT) can be triggered by @@ -3793,7 +4033,8 @@

    pcre2pattern man page

    The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion to be false, -and a negative assertion to be true. +and a negative assertion to be true. This behaviour differs from Perl when the +assertion has only one branch.

    The other backtracking verbs are not treated specially if they appear in a @@ -3829,13 +4070,57 @@

    pcre2pattern man page

    enclosing group that has alternatives (its normal behaviour). However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. +

    +
    EBCDIC ENVIRONMENTS
    +

    +Differences in the way PCRE2 behaves when it is running in an EBCDIC environment +are covered in this section. +

    +
    +Escape sequences +
    +

    +When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, +\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c +escape is processed as specified for Perl in the perlebcdic document. The +only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], +^, _, or ?. Any other character provokes a compile-time error. The sequence +\c@ encodes character code 0; after \c the letters (in either case) encode +characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 +(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +

    +

    +Thus, apart from \c?, these escapes generate the same character code values as +they do in an ASCII or Unicode environment, though the meanings of the values +mostly differ. For example, \cG always generates code value 7, which is BEL in +ASCII but DEL in EBCDIC. +

    +

    +The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but +because 127 is not a control character in EBCDIC, Perl makes it generate the +APC character. Unfortunately, there are several variants of EBCDIC. In most of +them the APC character has the value 255 (hex FF), but in the one Perl calls +POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC +values, PCRE2 makes \c? generate 95; otherwise it generates 255. +

    +
    +Character classes +
    +

    +In character classes there is a special case in EBCDIC environments for ranges +whose end points are both specified as literal letters in the same case. For +compatibility with Perl, EBCDIC code points within the range that are not +letters are omitted. For example, [h-k] matches only four characters, even +though the EBCDIC codes for h and k are 0x88 and 0x92, a range of 11 code +points. However, if the range is specified numerically, for example, +[\x88-\x92] or [h-\x92], all code points are included.

    -
    SEE ALSO
    +
    SEE ALSO

    pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3).

    -
    AUTHOR
    +
    AUTHOR

    Philip Hazel
    @@ -3844,9 +4129,9 @@

    pcre2pattern man page

    Cambridge, England.

    -
    REVISION
    +
    REVISION

    -Last updated: 04 June 2024 +Last updated: 27 November 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2perform.html b/usr/share/doc/pcre2/html/pcre2perform.html index 55fdf202fc4..b595119ba88 100644 --- a/usr/share/doc/pcre2/html/pcre2perform.html +++ b/usr/share/doc/pcre2/html/pcre2perform.html @@ -271,7 +271,7 @@

    pcre2perform man page


    REVISION

    -Last updated: 27 July 2022 +Last updated: 06 December 2022
    Copyright © 1997-2022 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2posix.html b/usr/share/doc/pcre2/html/pcre2posix.html index 6e7abd932ab..bc60c3b798c 100644 --- a/usr/share/doc/pcre2/html/pcre2posix.html +++ b/usr/share/doc/pcre2/html/pcre2posix.html @@ -171,7 +171,7 @@

    pcre2posix man page

    When a pattern that is compiled with this flag is passed to pcre2_regexec() for matching, the nmatch and pmatch arguments -are ignored, and no captured strings are returned. Versions of the PCRE library +are ignored, and no captured strings are returned. Versions of the PCRE2 library prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens because it disables the use of backreferences.
    @@ -370,7 +370,7 @@ 

    pcre2posix man page


    REVISION

    -Last updated: 19 January 2024 +Last updated: 27 November 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2sample.html b/usr/share/doc/pcre2/html/pcre2sample.html index 345df031131..0903f04f99b 100644 --- a/usr/share/doc/pcre2/html/pcre2sample.html +++ b/usr/share/doc/pcre2/html/pcre2sample.html @@ -101,7 +101,7 @@

    pcre2sample man page

    REVISION

    -Last updated: 02 February 2016 +Last updated: 14 November 2023
    Copyright © 1997-2016 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2serialize.html b/usr/share/doc/pcre2/html/pcre2serialize.html index 19418a83b21..d189bde2b63 100644 --- a/usr/share/doc/pcre2/html/pcre2serialize.html +++ b/usr/share/doc/pcre2/html/pcre2serialize.html @@ -203,7 +203,7 @@

    pcre2serialize man page


    REVISION

    -Last updated: 27 June 2018 +Last updated: 19 January 2024
    Copyright © 1997-2018 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2syntax.html b/usr/share/doc/pcre2/html/pcre2syntax.html index 1c0ccb003e2..46da3d71fcc 100644 --- a/usr/share/doc/pcre2/html/pcre2syntax.html +++ b/usr/share/doc/pcre2/html/pcre2syntax.html @@ -24,34 +24,41 @@

    pcre2syntax man page

  • SCRIPT MATCHING WITH \p AND \P
  • THE BIDI_CLASS PROPERTY FOR \p AND \P
  • CHARACTER CLASSES -
  • QUANTIFIERS -
  • ANCHORS AND SIMPLE ASSERTIONS -
  • REPORTED MATCH POINT SETTING -
  • ALTERNATION -
  • CAPTURING -
  • ATOMIC GROUPS -
  • COMMENT -
  • OPTION SETTING -
  • NEWLINE CONVENTION -
  • WHAT \R MATCHES -
  • LOOKAHEAD AND LOOKBEHIND ASSERTIONS -
  • NON-ATOMIC LOOKAROUND ASSERTIONS -
  • SCRIPT RUNS -
  • BACKREFERENCES -
  • SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) -
  • CONDITIONAL PATTERNS -
  • BACKTRACKING CONTROL -
  • CALLOUTS -
  • SEE ALSO -
  • AUTHOR -
  • REVISION +
  • PERL EXTENDED CHARACTER CLASSES +
  • QUANTIFIERS +
  • ANCHORS AND SIMPLE ASSERTIONS +
  • REPORTED MATCH POINT SETTING +
  • ALTERNATION +
  • CAPTURING +
  • ATOMIC GROUPS +
  • COMMENT +
  • OPTION SETTING +
  • NEWLINE CONVENTION +
  • WHAT \R MATCHES +
  • LOOKAHEAD AND LOOKBEHIND ASSERTIONS +
  • NON-ATOMIC LOOKAROUND ASSERTIONS +
  • SUBSTRING SCAN ASSERTION +
  • SCRIPT RUNS +
  • BACKREFERENCES +
  • SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) +
  • CONDITIONAL PATTERNS +
  • BACKTRACKING CONTROL +
  • CALLOUTS +
  • REPLACEMENT STRINGS +
  • SEE ALSO +
  • AUTHOR +
  • REVISION
    PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY

    -The full syntax and semantics of the regular expressions that are supported by -PCRE2 are described in the +The full syntax and semantics of the regular expression patterns that are +supported by PCRE2 are described in the pcre2pattern -documentation. This document contains a quick-reference summary of the syntax. +documentation. This document contains a quick-reference summary of the pattern +syntax followed by the syntax of replacement strings in the substitution function. +The full description of the latter is in the +pcre2api +documentation.


    QUOTING

    @@ -60,7 +67,10 @@

    pcre2syntax man page

    \Q...\E treat enclosed characters as literal
  • Note that white space inside \Q...\E is always treated as literal, even if -PCRE2_EXTENDED is set, causing most other white space to be ignored. +PCRE2_EXTENDED is set, causing most other white space to be ignored. Note also +that PCRE2's handling of \Q...\E has some differences from Perl's. See the +pcre2pattern +documentation for details.


    BRACED ITEMS

    @@ -91,6 +101,11 @@

    pcre2syntax man page

    \xhh character with hex code hh \x{hh..} character with hex code hh.. +\N{U+hh..} is synonymous with \x{hh..} but is not supported in environments +that use EBCDIC code (mainly IBM mainframes). Note that \N not followed by an +opening curly bracket has a different meaning (see below). +

    +

    If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized:

    @@ -98,7 +113,7 @@ 

    pcre2syntax man page

    \uhhhh character with hex code hhhh \u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX
    -When \x is not followed by {, from zero to two hexadecimal digits are read, +When \x is not followed by {, one or two hexadecimal digits are read, but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise it matches a literal "x". Likewise, if \u (in ALT_BSUX mode) is not followed by four hexadecimal digits @@ -112,9 +127,7 @@

    pcre2syntax man page

    in the pcre2pattern documentation, where details of escape processing in EBCDIC environments are -also given. \N{U+hh..} is synonymous with \x{hh..} in PCRE2 but is not -supported in EBCDIC environments. Note that \N not followed by an opening -curly bracket has a different meaning (see below). +also given.


    CHARACTER TYPES

    @@ -154,8 +167,9 @@

    pcre2syntax man page

    Property descriptions in \p and \P are matched caselessly; hyphens, -underscores, and white space are ignored, in accordance with Unicode's "loose -matching" rules. +underscores, and ASCII white space characters are ignored, in accordance with +Unicode's "loose matching" rules. For example, \p{Bidi_Class=al} is the same +as \p{ bidi class = AL }.


    GENERAL CATEGORY PROPERTIES FOR \p and \P

    @@ -168,13 +182,13 @@

    pcre2syntax man page

    Cs Surrogate L Letter + Lc Cased letter, the union of Ll, Lu, and Lt + L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter - Lc Ll, Lu, or Lt - L& Ll, Lu, or Lt M Mark Mc Spacing mark @@ -205,7 +219,9 @@

    pcre2syntax man page

    Zl Line separator Zp Paragraph separator Zs Space separator - + +From release 10.45, when caseless matching is set, Ll, Lu, and Lt are all +equivalent to Lc.


    PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P

    @@ -268,7 +284,7 @@

    pcre2syntax man page

    RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space


    CHARACTER CLASSES
    @@ -299,7 +315,45 @@

    pcre2syntax man page

    but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class.

    -
    QUANTIFIERS
    +

    +When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes may be +used, allowing nested character classes, combined using set operators. +

    +  [x&&[^y]]   UTS#18 extended character class
    +
    +  x||y        set union (OR)
    +  x&&y        set intersection (AND)
    +  x--y        set difference (AND NOT)
    +  x~~y        set symmetric difference (XOR)
    +
    +
    +

    +
    PERL EXTENDED CHARACTER CLASSES
    +

    +

    +  (?[...])                Perl extended character class
    +  (?[\p{Thai} & \p{Nd}])  operators; whitespace ignored
    +  (?[(x - y) & z])        parentheses for grouping
    +
    +  (?[ [^3] & \p{Nd} ])    [...] is a nested ordinary class
    +  (?[ [:alpha:] - [z] ])  POSIX set is allowed outside [...]
    +  (?[ \d - [3] ])         backslash-escaped set is allowed outside [...]
    +  (?[ !\n & [:ascii:] ])  backslash-escaped character is allowed outside [...]
    +                      all other characters or ranges must be enclosed in [...]
    +
    +  x|y, x+y                set union (OR)
    +  x&y                     set intersection (AND)
    +  x-y                     set difference (AND NOT)
    +  x^y                     set symmetric difference (XOR)
    +  !x                      set complement (NOT)
    +
    +Inside a Perl extended character class, [...] switches mode to be interpreted +as an ordinary character class. Outside of a nested [...], the only items +permitted are backslash-escapes, POSIX sets, operators, and parentheses. Inside +a nested ordinary class, ^ has its usual meaning (inverts the class when used +as the first character); outside of a nested class, ^ is the XOR operator. +

    +
    QUANTIFIERS

       ?           0 or 1, greedy
    @@ -323,7 +377,7 @@ 

    pcre2syntax man page

    {,m}? zero up to m, lazy

    -
    ANCHORS AND SIMPLE ASSERTIONS
    +
    ANCHORS AND SIMPLE ASSERTIONS

       \b          word boundary
    @@ -341,7 +395,7 @@ 

    pcre2syntax man page

    \G first matching position in subject

    -
    REPORTED MATCH POINT SETTING
    +
    REPORTED MATCH POINT SETTING

       \K          set reported start of match
    @@ -351,13 +405,13 @@ 

    pcre2syntax man page

    option is set, the previous behaviour is re-enabled. When this option is set, \K is honoured in positive assertions, but ignored in negative ones.

    -
    ALTERNATION
    +
    ALTERNATION

       expr|expr|expr...
     

    -
    CAPTURING
    +
    CAPTURING

       (...)           capture group
    @@ -372,20 +426,20 @@ 

    pcre2syntax man page

    in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In both cases, a name must not start with a digit.

    -
    ATOMIC GROUPS
    +
    ATOMIC GROUPS

       (?>...)         atomic non-capture group
       (*atomic:...)   atomic non-capture group
     

    -
    COMMENT
    +
    COMMENT

       (?#....)        comment (not nestable)
     

    -
    OPTION SETTING
    +
    OPTION SETTING

    Changes of these options within a group are automatically cancelled at the end of the group. @@ -409,7 +463,7 @@

    pcre2syntax man page

    (?^) unset imnrsx options
    (?aP) implies (?aT) as well, though this has no additional effect. However, it -means that (?-aP) is really (?-PT) which disables all ASCII restrictions for +means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes.

    @@ -421,20 +475,22 @@

    pcre2syntax man page

    The following are recognized only at the very start of a pattern or after one -of the newline or \R options with similar syntax. More than one of them may -appear. For the first three, d is a decimal number. -

    -  (*LIMIT_DEPTH=d) set the backtracking limit to d
    -  (*LIMIT_HEAP=d)  set the heap size limit to d * 1024 bytes
    -  (*LIMIT_MATCH=d) set the match limit to d
    -  (*NOTEMPTY)      set PCRE2_NOTEMPTY when matching
    -  (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching
    -  (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS)
    +of the newline or \R sequences or options with similar syntax. More than one
    +of them may appear. For the first three, d is a decimal number.
    +
    +  (*LIMIT_DEPTH=d)     set the backtracking limit to d
    +  (*LIMIT_HEAP=d)      set the heap size limit to d * 1024 bytes
    +  (*LIMIT_MATCH=d)     set the match limit to d
    +  (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching
    +  (*NOTEMPTY)          set PCRE2_NOTEMPTY when matching
    +  (*NOTEMPTY_ATSTART)  set PCRE2_NOTEMPTY_ATSTART when matching
    +  (*NO_AUTO_POSSESS)   no auto-possessification (PCRE2_NO_AUTO_POSSESS)
       (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
    -  (*NO_JIT)       disable JIT optimization
    -  (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE)
    -  (*UTF)          set appropriate UTF mode for the library in use
    -  (*UCP)          set PCRE2_UCP (use Unicode properties for \d etc)
    +  (*NO_JIT)            disable JIT optimization
    +  (*NO_START_OPT)      no start-match optimization (PCRE2_NO_START_OPTIMIZE)
    +  (*TURKISH_CASING)    set PCRE2_EXTRA_TURKISH_CASING when matching
    +  (*UTF)               set appropriate UTF mode for the library in use
    +  (*UCP)               set PCRE2_UCP (use Unicode properties for \d etc)
     
    Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or pcre2_dfa_match(), @@ -442,7 +498,7 @@

    pcre2syntax man page

    application can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.

    -
    NEWLINE CONVENTION
    +
    NEWLINE CONVENTION

    These are recognized only at the very start of the pattern or after option settings with a similar syntax. @@ -455,7 +511,7 @@

    pcre2syntax man page

    (*NUL) the NUL character (binary zero)

    -
    WHAT \R MATCHES
    +
    WHAT \R MATCHES

    These are recognized only at the very start of the pattern or after option setting with a similar syntax. @@ -464,7 +520,7 @@

    pcre2syntax man page

    (*BSR_UNICODE) any Unicode newline sequence

    -
    LOOKAHEAD AND LOOKBEHIND ASSERTIONS
    +
    LOOKAHEAD AND LOOKBEHIND ASSERTIONS

       (?=...)                     )
    @@ -490,7 +546,7 @@ 

    pcre2syntax man page

    (ultimate default 255). If every branch matches a fixed number of characters, the limit for each branch is 65535 characters.

    -
    NON-ATOMIC LOOKAROUND ASSERTIONS
    +
    NON-ATOMIC LOOKAROUND ASSERTIONS

    These assertions are specific to PCRE2 and are not Perl-compatible.

    @@ -503,7 +559,24 @@ 

    pcre2syntax man page

    (*non_atomic_positive_lookbehind:...) )

    -
    SCRIPT RUNS
    +
    SUBSTRING SCAN ASSERTION
    +

    +This feature is not Perl-compatible. +

    +  (*scan_substring:(grouplist)...)  scan captured substring
    +  (*scs:(grouplist)...)             scan captured substring
    +
    +The comma-separated list may identify groups in any of the following ways: +
    +  n       absolute reference
    +  +n      relative reference
    +  -n      relative reference
    +  <name>  name
    +  'name'  name
    +
    +
    +

    +
    SCRIPT RUNS

       (*script_run:...)           ) script run, can be backtracked into
    @@ -513,7 +586,7 @@ 

    pcre2syntax man page

    (*asr:...) )

    -
    BACKREFERENCES
    +
    BACKREFERENCES

       \n              reference by number (can be ambiguous)
    @@ -530,7 +603,7 @@ 

    pcre2syntax man page

    (?P=name) reference by name (Python)

    -
    SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
    +
    SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)

       (?R)            recurse whole pattern
    @@ -549,7 +622,7 @@ 

    pcre2syntax man page

    \g'-n' call subroutine by relative number (PCRE2 extension)

    -
    CONDITIONAL PATTERNS
    +
    CONDITIONAL PATTERNS

       (?(condition)yes-pattern)
    @@ -572,7 +645,7 @@ 

    pcre2syntax man page

    conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists.

    -
    BACKTRACKING CONTROL
    +
    BACKTRACKING CONTROL

    All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the name is mandatory, for the others it is optional. (*SKIP) changes its behaviour @@ -599,7 +672,7 @@

    pcre2syntax man page

    The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call.

    -
    CALLOUTS
    +
    CALLOUTS

       (?C)            callout (assumed number 0)
    @@ -610,12 +683,58 @@ 

    pcre2syntax man page

    start and the end), and the starting delimiter { matched with the ending delimiter }. To encode the ending delimiter within the string, double it.

    -
    SEE ALSO
    +
    REPLACEMENT STRINGS
    +

    +If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for +pcre2_substitute() is not interpreted. Otherwise, by default, the only +special character is the dollar character in one of the following forms: +

    +  $$                  insert a dollar character
    +  $n or ${n}          insert the contents of group n
    +  $<name>             insert the contents of named group
    +  $0 or $&            insert the entire matched substring
    +  $`                  insert the substring that precedes the match
    +  $'                  insert the substring that follows the match
    +  $_                  insert the entire input string
    +  $*MARK or ${*MARK}  insert a control verb name
    +
    +For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is set, +there is additional interpretation: +

    +

    +1. Backslash is an escape character, and the forms described in "ESCAPED +CHARACTERS" above are recognized. Also: +

    +  \Q...\E   can be used to suppress interpretation
    +  \l        force the next character to lower case
    +  \u        force the next character to upper case
    +  \L        force subsequent characters to lower case
    +  \U        force subsequent characters to upper case
    +  \u\L      force next character to upper case, then all lower
    +  \l\U      force next character to lower case, then all upper
    +  \E        end \L or \U case forcing
    +  \b        backspace character (note: as in character class in pattern)
    +  \v        vertical tab character (note: not the same as in a pattern)
    +
    +2. The Python form \g<n>, where the angle brackets are part of the syntax and +n is either a group name or a number, is recognized as an alternative way +of inserting the contents of a group, for example \g<3>. +

    +

    +3. Capture substitution supports the following additional forms: +

    +  ${n:-string}             default for unset group
    +  ${n:+string1:string2}    values for set/unset group
    +
    +The substitution strings themselves are expanded. Backslash can be used to +escape colons and closing curly brackets. +

    +
    SEE ALSO

    pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3).

    -
    AUTHOR
    +
    AUTHOR

    Philip Hazel
    @@ -624,11 +743,11 @@

    pcre2syntax man page

    Cambridge, England.

    -
    REVISION
    +
    REVISION

    -Last updated: 12 October 2023 +Last updated: 27 November 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/usr/share/doc/pcre2/html/pcre2test.html b/usr/share/doc/pcre2/html/pcre2test.html index 6cc3cc317ff..db9073f0e60 100644 --- a/usr/share/doc/pcre2/html/pcre2test.html +++ b/usr/share/doc/pcre2/html/pcre2test.html @@ -105,8 +105,8 @@

    pcre2test man page

    When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that are passed -to the library. For subject lines, backslash escapes can be used. In addition, -when the utf modifier (see +to the library. For subject lines and some patterns, backslash escapes can be +used. In addition, when the utf modifier (see "Setting compilation options" below) is set, the pattern and any following subject lines are interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. @@ -125,9 +125,8 @@

    pcre2test man page

    than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte in UTF-8) -0x80000000 is added to the character's value. This is the only way of passing -such code points in a pattern string. For subject strings, using an escape -sequence is preferable. +0x80000000 is added to the character's value. For subject strings, using an +escape sequence is preferable.


    COMMAND LINE OPTIONS

    @@ -178,8 +177,8 @@

    pcre2test man page

    following options output the value and set the exit code as indicated:
       ebcdic-nl  the code for LF (= NL) in an EBCDIC environment:
    -               0x15 or 0x25
    -               0 if used in an ASCII environment
    +               either 0x15 or 0x25
    +               0 if used in an ASCII/Unicode environment
                    exit code is always 0
       linksize   the configured internal link size (2, 3, or 4)
                    exit code is set to the link size
    @@ -201,6 +200,16 @@ 

    pcre2test man page

    pcre2-8 the 8-bit library was built unicode Unicode support is available
    +Note that the availability of JIT support in the library does not guarantee +that it can actually be used because in some environments it is unable to +allocate executable memory. The option "jitusable" gives more detailed +information. It returns one of the following values: +
    +  0  JIT is available and usable
    +  1  JIT is available but cannot allocate executable memory
    +  2  JIT is not available
    +  3  Unexpected return from test call to pcre2_jit_compile()
    +
    If an unknown option is given, an error message is output; the exit code is 0.

    @@ -527,39 +536,48 @@

    pcre2test man page

    subject_literal modifier was set for the pattern. The following provide a means of encoding non-printing characters in a visible way:
    -  \a         alarm (BEL, \x07)
    -  \b         backspace (\x08)
    -  \e         escape (\x27)
    -  \f         form feed (\x0c)
    -  \n         newline (\x0a)
    -  \r         carriage return (\x0d)
    -  \t         tab (\x09)
    -  \v         vertical tab (\x0b)
    -  \nnn       octal character (up to 3 octal digits); always
    -               a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode
    -  \o{dd...}  octal character (any number of octal digits}
    -  \xhh       hexadecimal byte (up to 2 hex digits)
    -  \x{hh...}  hexadecimal character (any number of hex digits)
    +  \a          alarm (BEL, \x07)
    +  \b          backspace (\x08)
    +  \e          escape (\x1b)
    +  \f          form feed (\x0c)
    +  \n          newline (\x0a)
    +  \N{U+hh...} unicode character (any number of hex digits)
    +  \r          carriage return (\x0d)
    +  \t          tab (\x09)
    +  \v          vertical tab (\x0b)
    +  \ddd        octal number (up to 3 octal digits); represent a single
    +                code point unless larger than 255 with the 8-bit library
    +  \o{dd...}   octal number (any number of octal digits) representing a
    +                character in UTF mode or a code point
    +  \xhh        hexadecimal byte (up to 2 hex digits)
    +  \x{hh...}   hexadecimal number (up to 8 hex digits) representing a
    +                character in UTF mode or a code point
     
    -The use of \x{hh...} is not dependent on the use of the utf modifier on -the pattern. It is recognized always. There may be any number of hexadecimal -digits inside the braces; invalid values provoke error messages. +Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf +modifier on the pattern. It is always recognized. There may be any number of +hexadecimal digits inside the braces; invalid values provoke error messages +but when using \N{U+hh...} with some invalid unicode characters they will +be accepted with a warning instead.

    -Note that \xhh specifies one byte rather than one character in UTF-8 mode; -this makes it possible to construct invalid UTF-8 sequences for testing -purposes. On the other hand, \x{hh} is interpreted as a UTF-8 character in -UTF-8 mode, generating more than one byte if the value is greater than 127. -When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte -for values less than 256, and causes an error for greater values. +Note that even in UTF-8 mode, \xhh (and depending on how large, \ddd) +describe one byte rather than one character; this makes it possible to +construct invalid UTF-8 sequences for testing purposes. On the other hand, +\x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating +more than one byte if the value is greater than 127. To avoid the ambiguity +it is preferred to use \N{U+hh...} when describing characters. When testing +the 8-bit library not in UTF-8 mode, \x{hh} generates one byte for values +that could fit in it, and causes an error for greater values.

    -In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it -possible to construct invalid UTF-16 sequences for testing purposes. +When testing the 16-bit library, not in UTF-16 mode, all 4-digit \x{hhhh} +values are accepted. This makes it possible to construct invalid UTF-16 +sequences for testing purposes.

    -In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This makes it -possible to construct invalid UTF-32 sequences for testing purposes. +When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \x{...} +values are accepted. This makes it possible to construct invalid UTF-32 +sequences for testing purposes.

    There is a special backslash sequence that specifies replication of one or more @@ -625,6 +643,7 @@

    pcre2test man page

    allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options @@ -653,13 +672,17 @@

    pcre2test man page

    match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C + never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK + python_octal set PCRE2_EXTRA_PYTHON_OCTAL + turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT @@ -671,6 +694,23 @@

    pcre2test man page

    brackets. Setting utf in 16-bit or 32-bit mode also causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. +
    +
    +The following modifiers enable or disable performance optimizations by +calling pcre2_set_optimize() before invoking the regex compiler. +
    +      optimization_full      enable all optional optimizations
    +      optimization_none      disable all optional optimizations
    +      auto_possess           auto-possessify variable quantifiers
    +      auto_possess_off       don't auto-possessify variable quantifiers
    +      dotstar_anchor         anchor patterns starting with .*
    +      dotstar_anchor_off     don't anchor patterns starting with .*
    +      start_optimize         enable pre-scan of subject string
    +      start_optimize_off     disable pre-scan of subject string
    +
    +See the +pcre2_set_optimize +documentation for details on these optimizations.


    Setting compilation controls @@ -680,14 +720,15 @@

    pcre2test man page

    about the pattern. There are single-letter abbreviations for some that are heavily used in the test files.
    -      bsr=[anycrlf|unicode]     specify \R handling
       /B  bincode                   show binary code without lengths
    +      bsr=[anycrlf|unicode]     specify \R handling
           callout_info              show callout information
           convert=<options>         request foreign pattern conversion
           convert_glob_escape=c     set glob escape character
           convert_glob_separator=c  set glob separator character
           convert_length            set convert buffer length
           debug                     same as info,fullbincode
    +      expand                    expand repetition syntax in pattern
           framesize                 show matching frame size
           fullbincode               show binary code with lengths
       /I  info                      show info about compiled pattern
    @@ -709,6 +750,7 @@ 

    pcre2test man page

    posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack + pushtablescopy push a copy with tables onto the stack stackguard=<number> test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables @@ -1128,6 +1170,7 @@

    pcre2test man page

    replace=<string> specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts + substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED @@ -1217,10 +1260,11 @@

    pcre2test man page

    The following modifiers set options for pcre2_match() or pcre2_dfa_match(). See -pcreapi +pcre2api for a description of their effects.

           anchored                   set PCRE2_ANCHORED
    +      copy_matched_subject       set PCRE2_COPY_MATCHED_SUBJECT
           endanchored                set PCRE2_ENDANCHORED
           dfa_restart                set PCRE2_DFA_RESTART
           dfa_shortest               set PCRE2_DFA_SHORTEST
    @@ -1271,8 +1315,8 @@ 

    pcre2test man page

    aftertext show text after match allaftertext show text after captures allcaptures show all captures - allvector show the entire ovector allusedtext show all consulted text (non-JIT only) + allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data=<n> set a value to pass via callouts @@ -1306,7 +1350,8 @@

    pcre2test man page

    startchar show startchar when relevant startoffset=<n> same as offset=<n> substitute_callout use substitution callouts - substitute_extedded use PCRE2_SUBSTITUTE_EXTENDED + substitute_case_callout use substitution case callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH @@ -1592,6 +1637,21 @@

    pcre2test man page

    or stop is supported, which is sufficient for testing that the feature works.


    +Testing substitute case callouts +
    +

    +If the substitute_case_callout modifier is set, a substitution +case callout function is set up. The callout function is called for each +substituted chunk which is to be case-transformed. +

    +

    +The callout function passed is a fixed function with implementation for certain +behaviours: inputs which shrink when case-transformed; inputs which grow; inputs +with distinct upper/lower/titlecase forms. The characters which are not +special-cased for testing purposes are left unmodified, as if they are caseless +characters. +

    +
    Setting the JIT stack size

    @@ -2204,7 +2264,7 @@

    pcre2test man page


    REVISION

    -Last updated: 24 April 2024 +Last updated: 26 December 2024
    Copyright © 1997-2024 University of Cambridge.
    diff --git a/usr/share/doc/pcre2/html/pcre2unicode.html b/usr/share/doc/pcre2/html/pcre2unicode.html index 6f0972e706a..5b425329fac 100644 --- a/usr/share/doc/pcre2/html/pcre2unicode.html +++ b/usr/share/doc/pcre2/html/pcre2unicode.html @@ -53,7 +53,7 @@

    pcre2unicode man page

    The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal number, the derived properties -Any and LC (synonym L&), the Unicode script names such as Arabic or Han, +Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties.

    @@ -157,6 +157,40 @@

    pcre2unicode man page

    counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT option. When this is set, all characters in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. +
    +    Without PCRE2_EXTRA_CASELESS_RESTRICT:
    +      'k' = 'K' = U+212A (Kelvin sign)
    +      's' = 'S' = U+017F (long S)
    +    With PCRE2_EXTRA_CASELESS_RESTRICT:
    +      'k' = 'K'
    +      U+212A (Kelvin sign)  only case-equivalent to itself
    +      's' = 'S'
    +      U+017F (long S)       only case-equivalent to itself
    +
    +

    +

    +One language family, Turkish and Azeri, has its own case-insensitivity rules, +which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the +behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 +(small dotless i) characters. +

    +    Without PCRE2_EXTRA_TURKISH_CASING:
    +      'i' = 'I'
    +      U+0130 (capital I with dot above)  only case-equivalent to itself
    +      U+0131 (small dotless i)           only case-equivalent to itself
    +    With PCRE2_EXTRA_TURKISH_CASING:
    +      'i' = U+0130 (capital I with dot above)
    +      U+0131 (small dotless i) = 'I'
    +
    +

    +

    +It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and +PCRE2_EXTRA_TURKISH_CASING together. +

    +

    +From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower +case), and Lt (title case) are all treated as Lc (cased letter) when caseless +matching is set by the PCRE2_CASELESS option or (?i) within the pattern.


    SCRIPT RUNS @@ -513,9 +547,9 @@

    pcre2unicode man page

    REVISION

    -Last updated: 12 October 2023 +Last updated: 27 November 2024
    -Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.

    Return to the PCRE2 index page. diff --git a/usr/share/doc/pcre2/pcre2-config.txt b/usr/share/doc/pcre2/pcre2-config.txt index dc8cf8f7ed4..c87de589db7 100644 --- a/usr/share/doc/pcre2/pcre2-config.txt +++ b/usr/share/doc/pcre2/pcre2-config.txt @@ -1,4 +1,3 @@ - PCRE2-CONFIG(1) General Commands Manual PCRE2-CONFIG(1) @@ -82,4 +81,4 @@ REVISION Last updated: 28 September 2014 -PCRE2 10.00 28 September 2014 PCRE2-CONFIG(1) +PCRE2 10.45 28 September 2014 PCRE2-CONFIG(1) diff --git a/usr/share/doc/pcre2/pcre2.txt b/usr/share/doc/pcre2/pcre2.txt index 85eead6e61f..38e86d6e6a3 100644 --- a/usr/share/doc/pcre2/pcre2.txt +++ b/usr/share/doc/pcre2/pcre2.txt @@ -8,7 +8,6 @@ pcre2test commands. ----------------------------------------------------------------------------- - PCRE2(3) Library Functions Manual PCRE2(3) @@ -171,27 +170,29 @@ USER DOCUMENTATION library function, listing its arguments and results. -AUTHOR +AUTHORS - Philip Hazel - Retired from University Computing Service - Cambridge, England. + The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Her- + czeg. + + PCRE2 was written by Philip Hazel, of the University Computing Service, + Cambridge, England. Many others have also contributed. - Putting an actual email address here is a spam magnet. If you want to - email me, use my two names separated by a dot at gmail.com. + To contact the maintainers, please use the GitHub issues tracker or + PCRE2 mailing list, as described at the project page: + https://github.com/PCRE2Project/pcre2 REVISION - Last updated: 27 August 2021 + Last updated: 18 December 2024 Copyright (c) 1997-2021 University of Cambridge. 
-PCRE2 10.38 27 August 2021 PCRE2(3) +PCRE2 10.45 18 December 2024 PCRE2(3) ------------------------------------------------------------------------------ - PCRE2API(3) Library Functions Manual PCRE2API(3) @@ -298,6 +299,9 @@ PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); + int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); + PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS @@ -317,6 +321,12 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value); @@ -858,6 +868,7 @@ PCRE2 CONTEXTS The compile time nested parentheses limit The maximum length of the pattern string The extra options bits (none set by default) + Which performance optimizations the compiler should apply A compile context is also required if you are using custom memory man- agement. If none of these apply, just pass NULL as the context argu- @@ -980,6 +991,110 @@ PCRE2 CONTEXTS ment of pcre2_set_compile_recursion_guard(). The callout function should return zero if all is well, or non-zero to force an error. + int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); + + PCRE2 can apply various performance optimizations during compilation, + in order to make matching faster. For example, the compiler might con- + vert some regex constructs into an equivalent construct which + pcre2_match() can execute faster. By default, all available optimiza- + tions are enabled. However, in rare cases, one might wish to disable + specific optimizations. 
For example, if it is known that some optimiza- + tions cannot benefit a certain regex, it might be desirable to disable + them, in order to speed up compilation. + + The permitted values of directive are as follows: + + PCRE2_OPTIMIZATION_FULL + + Enable all optional performance optimizations. This is the default + value. + + PCRE2_OPTIMIZATION_NONE + + Disable all optional performance optimizations. + + PCRE2_AUTO_POSSESS + PCRE2_AUTO_POSSESS_OFF + + Enable/disable "auto-possessification" of variable quantifiers such as + * and +. This optimization, for example, turns a+b into a++b in order + to avoid backtracks into a+ that can never be successful. However, if + callouts are in use, auto-possessification means that some callouts are + never taken. You can disable this optimization if you want the matching + functions to do a full, unoptimized search and run all the callouts. + + PCRE2_DOTSTAR_ANCHOR + PCRE2_DOTSTAR_ANCHOR_OFF + + Enable/disable an optimization that is applied when .* is the first + significant item in a top-level branch of a pattern, and all the other + branches also start with .* or with \A or \G or ^. Such a pattern is + automatically anchored if PCRE2_DOTALL is set for all the .* items and + PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that + any match must start either at the start of the subject or following a + newline is remembered. Like other optimizations, this can cause call- + outs to be skipped. + + Dotstar anchor optimization is automatically disabled for .* if it is + inside an atomic group or a capture group that is the subject of a + backreference, or if the pattern contains (*PRUNE) or (*SKIP). + + PCRE2_START_OPTIMIZE + PCRE2_START_OPTIMIZE_OFF + + Enable/disable optimizations which cause matching functions to scan the + subject string for specific code unit values before attempting a match. 
+ For example, if it is known that an unanchored match must start with a + specific value, the matching code searches the subject for that value, + and fails immediately if it cannot find it, without actually running + the main matching function. This means that a special item such as + (*COMMIT) at the start of a pattern is not considered until after a + suitable starting point for the match has been found. Also, when call- + outs or (*MARK) items are in use, these "start-up" optimizations can + cause them to be skipped if the pattern is never actually used. The + start-up optimizations are in effect a pre-scan of the subject that + takes place before the pattern is run. + + Disabling start-up optimizations ensures that in cases where the result + is "no match", the callouts do occur, and that items such as (*COMMIT) + and (*MARK) are considered at every possible starting position in the + subject string. + + Disabling start-up optimizations may change the outcome of a matching + operation. Consider the pattern + + (*COMMIT)ABC + + When this is compiled, PCRE2 records the fact that a match must start + with the character "A". Suppose the subject string is "DEFABC". The + start-up optimization scans along the subject, finds "A" and runs the + first match attempt from there. The (*COMMIT) item means that the pat- + tern must match the current starting position, which in this case, it + does. However, if the same match is run without start-up optimizations, + the initial scan along the subject string does not happen. The first + match attempt is run starting from "D" and when this fails, (*COMMIT) + prevents any further matches being tried, so the overall result is "no + match". + + Another start-up optimization makes use of a minimum length for a + matching subject, which is recorded when possible. Consider the pattern + + (*MARK:1)B(*MARK:2)(X|Y) + + The minimum length for a match is two characters. 
If the subject is + "XXBB", the "starting character" optimization skips "XX", then tries to + match "BB", which is long enough. In the process, (*MARK:2) is encoun- + tered and remembered. When the match attempt fails, the next "B" is + found, but there is only one character left, so there are no more at- + tempts, and "no match" is returned with the "last mark seen" set to + "2". Without start-up optimizations, however, matches are tried at + every possible starting position, including at the end of the subject, + where (*MARK:1) is encountered, but there is no "B", so the "last mark + seen" that is returned is "1". In this case, the optimizations do not + affect the overall match result, which is still "no match", but they do + affect the auxiliary information that is returned. + The match context A match context is required if you want to: @@ -1025,6 +1140,16 @@ PCRE2 CONTEXTS tion made by pcre2_substitute(). Details are given in the section enti- tled "Creating a new string with substitutions" below. + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + + This sets up a callout function for PCRE2 to call when performing case + transformations inside pcre2_substitute(). Details are given in the + section entitled "Creating a new string with substitutions" below. + int pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE value); @@ -1224,8 +1349,11 @@ CHECKING BUILD-TIME OPTIONS The output is a uint32_t integer that is set to one if support for just-in-time compiling is included in the library; otherwise it is set to zero. Note that having the support in the library does not guarantee - that JIT will be used for any given match. See the pcre2jit documenta- - tion for more details. 
+ that JIT will be used for any given match, and neither does it guaran- + tee that JIT will actually be able to function, because it may not be + able to allocate executable memory in some environments. There is a + special call to pcre2_jit_compile() that can be used to check this. See + the pcre2jit documentation for more details. PCRE2_CONFIG_JITTARGET @@ -1413,10 +1541,10 @@ COMPILING A PATTERN spectively, when pcre2_compile() returns NULL because a compilation er- ror has occurred. - There are nearly 100 positive error codes that pcre2_compile() may re- - turn if it finds an error in the pattern. There are also some negative - error codes that are used for invalid UTF strings when validity check- - ing is in force. These are the same as given by pcre2_match() and + There are over 100 positive error codes that pcre2_compile() may return + if it finds an error in the pattern. There are also some negative error + codes that are used for invalid UTF strings when validity checking is + in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and are described in the pcre2unicode documentation. There is no separate documentation for the positive error codes, be- cause the textual error messages that are obtained by calling the @@ -1511,39 +1639,56 @@ COMPILING A PATTERN Perl. If you want a multiline circumflex also to match after a termi- nating newline, you must set PCRE2_ALT_CIRCUMFLEX. + PCRE2_ALT_EXTENDED_CLASS + + Alters the parsing of character classes to follow the extended syntax + described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no + impact on the behaviour of the Perl-specific "(?[...])" syntax for ex- + tended classes, but instead enables the alternative syntax of extended + class behaviour inside ordinary "[...]" character classes. See the + pcre2pattern documentation for details of the character classes sup- + ported. 
+ PCRE2_ALT_VERBNAMES - By default, for compatibility with Perl, the name in any verb sequence - such as (*MARK:NAME) is any sequence of characters that does not in- - clude a closing parenthesis. The name is not processed in any way, and - it is not possible to include a closing parenthesis in the name. How- - ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash pro- - cessing is applied to verb names and only an unescaped closing paren- - thesis terminates the name. A closing parenthesis can be included in a - name either as \) or between \Q and \E. If the PCRE2_EXTENDED or - PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped - whitespace in verb names is skipped and #-comments are recognized, ex- + By default, for compatibility with Perl, the name in any verb sequence + such as (*MARK:NAME) is any sequence of characters that does not in- + clude a closing parenthesis. The name is not processed in any way, and + it is not possible to include a closing parenthesis in the name. How- + ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash pro- + cessing is applied to verb names and only an unescaped closing paren- + thesis terminates the name. A closing parenthesis can be included in a + name either as \) or between \Q and \E. If the PCRE2_EXTENDED or + PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped + whitespace in verb names is skipped and #-comments are recognized, ex- actly as in the rest of the pattern. PCRE2_AUTO_CALLOUT - If this bit is set, pcre2_compile() automatically inserts callout - items, all with number 255, before each pattern item, except immedi- - ately before or after an explicit callout in the pattern. For discus- + If this bit is set, pcre2_compile() automatically inserts callout + items, all with number 255, before each pattern item, except immedi- + ately before or after an explicit callout in the pattern. 
For discus- sion of the callout facility, see the pcre2callout documentation. PCRE2_CASELESS - If this bit is set, letters in the pattern match both upper and lower - case letters in the subject. It is equivalent to Perl's /i option, and - it can be changed within a pattern by a (?i) option setting. If either - PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all - characters with more than one other case, and for all characters whose - code points are greater than U+007F. Note that there are two ASCII - characters, K and S, that, in addition to their lower case ASCII equiv- - alents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long - S) respectively. If you do not want this case equivalence, you can sup- - press it by setting PCRE2_EXTRA_CASELESS_RESTRICT. + If this bit is set, letters in the pattern match both upper and lower + case letters in the subject. It is equivalent to Perl's /i option, and + it can be changed within a pattern by a (?i) option setting. If either + PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all + characters with more than one other case, and for all characters whose + code points are greater than U+007F. + + Note that there are two ASCII characters, K and S, that, in addition to + their lower case ASCII equivalents, are case-equivalent with U+212A + (Kelvin sign) and U+017F (long S) respectively. If you do not want this + case equivalence, you can suppress it by setting PCRE2_EXTRA_CASE- + LESS_RESTRICT. + + One language family, Turkish and Azeri, has its own case-insensitivity + rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. + This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot + above), and U+0131 (small dotless i) characters. For lower valued characters with only one other case, a lookup table is used for speed. 
When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup @@ -1551,201 +1696,206 @@ COMPILING A PATTERN (available only in 16-bit or 32-bit mode) are treated as not having an- other case. + From release 10.45 PCRE2_CASELESS also affects what some of the letter- + related Unicode property escapes (\p and \P) match. The properties Lu + (upper case letter), Ll (lower case letter), and Lt (title case letter) + are all treated as LC (cased letter) when PCRE2_CASELESS is set. + PCRE2_DOLLAR_ENDONLY - If this bit is set, a dollar metacharacter in the pattern matches only - at the end of the subject string. Without this option, a dollar also - matches immediately before a newline at the end of the string (but not - before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored - if PCRE2_MULTILINE is set. There is no equivalent to this option in + If this bit is set, a dollar metacharacter in the pattern matches only + at the end of the subject string. Without this option, a dollar also + matches immediately before a newline at the end of the string (but not + before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored + if PCRE2_MULTILINE is set. There is no equivalent to this option in Perl, and no way to set it within a pattern. PCRE2_DOTALL - If this bit is set, a dot metacharacter in the pattern matches any - character, including one that indicates a newline. However, it only + If this bit is set, a dot metacharacter in the pattern matches any + character, including one that indicates a newline. However, it only ever matches one character, even if newlines are coded as CRLF. Without this option, a dot does not match when the current position in the sub- - ject is at a newline. This option is equivalent to Perl's /s option, + ject is at a newline. This option is equivalent to Perl's /s option, and it can be changed within a pattern by a (?s) option setting. 
A neg- - ative class such as [^a] always matches newline characters, and the \N - escape sequence always matches a non-newline character, independent of + ative class such as [^a] always matches newline characters, and the \N + escape sequence always matches a non-newline character, independent of the setting of PCRE2_DOTALL. PCRE2_DUPNAMES - If this bit is set, names used to identify capture groups need not be - unique. This can be helpful for certain types of pattern when it is - known that only one instance of the named group can ever be matched. - There are more details of named capture groups below; see also the + If this bit is set, names used to identify capture groups need not be + unique. This can be helpful for certain types of pattern when it is + known that only one instance of the named group can ever be matched. + There are more details of named capture groups below; see also the pcre2pattern documentation. PCRE2_ENDANCHORED - If this bit is set, the end of any pattern match must be right at the + If this bit is set, the end of any pattern match must be right at the end of the string being searched (the "subject string"). If the pattern match succeeds by reaching (*ACCEPT), but does not reach the end of the - subject, the match fails at the current starting point. For unanchored - patterns, a new match is then tried at the next starting point. How- + subject, the match fails at the current starting point. For unanchored + patterns, a new match is then tried at the next starting point. How- ever, if the match succeeds by reaching the end of the pattern, but not - the end of the subject, backtracking occurs and an alternative match + the end of the subject, backtracking occurs and an alternative match may be found. Consider these two patterns: .(*ACCEPT)|.. .|.. - If matched against "abc" with PCRE2_ENDANCHORED set, the first matches - "c" whereas the second matches "bc". 
The effect of PCRE2_ENDANCHORED - can also be achieved by appropriate constructs in the pattern itself, + If matched against "abc" with PCRE2_ENDANCHORED set, the first matches + "c" whereas the second matches "bc". The effect of PCRE2_ENDANCHORED + can also be achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl. For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only - to the first (that is, the longest) matched string. Other parallel - matches, which are necessarily substrings of the first one, must obvi- + to the first (that is, the longest) matched string. Other parallel + matches, which are necessarily substrings of the first one, must obvi- ously end before the end of the subject. PCRE2_EXTENDED - If this bit is set, most white space characters in the pattern are to- - tally ignored except when escaped, inside a character class, or inside - a \Q...\E sequence. However, white space is not allowed within se- - quences such as (?> that introduce various parenthesized groups, nor - within numerical quantifiers such as {1,3}. Ignorable white space is - permitted between an item and a following quantifier and between a - quantifier and a following + that indicates possessiveness. PCRE2_EX- - TENDED is equivalent to Perl's /x option, and it can be changed within + If this bit is set, most white space characters in the pattern are to- + tally ignored except when escaped, inside a character class, or inside + a \Q...\E sequence. However, white space is not allowed within se- + quences such as (?> that introduce various parenthesized groups, nor + within numerical quantifiers such as {1,3}. Ignorable white space is + permitted between an item and a following quantifier and between a + quantifier and a following + that indicates possessiveness. PCRE2_EX- + TENDED is equivalent to Perl's /x option, and it can be changed within a pattern by a (?x) option setting. 
- When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recog- - nizes as white space only those characters with code points less than + When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recog- + nizes as white space only those characters with code points less than 256 that are flagged as white space in its low-character table. The ta- ble is normally created by pcre2_maketables(), which uses the isspace() - function to identify space characters. In most ASCII environments, the - relevant characters are those with code points 0x0009 (tab), 0x000A - (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage + function to identify space characters. In most ASCII environments, the + relevant characters are those with code points 0x0009 (tab), 0x000A + (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage return), and 0x0020 (space). When PCRE2 is compiled with Unicode support, in addition to these char- - acters, five more Unicode "Pattern White Space" characters are recog- + acters, five more Unicode "Pattern White Space" characters are recog- nized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to- - right mark), U+200F (right-to-left mark), U+2028 (line separator), and - U+2029 (paragraph separator). This set of characters is the same as - recognized by Perl's /x option. Note that the horizontal and vertical - space characters that are matched by the \h and \v escapes in patterns + right mark), U+200F (right-to-left mark), U+2028 (line separator), and + U+2029 (paragraph separator). This set of characters is the same as + recognized by Perl's /x option. Note that the horizontal and vertical + space characters that are matched by the \h and \v escapes in patterns are a much bigger set. 
- As well as ignoring most white space, PCRE2_EXTENDED also causes char- - acters between an unescaped # outside a character class and the next - newline, inclusive, to be ignored, which makes it possible to include + As well as ignoring most white space, PCRE2_EXTENDED also causes char- + acters between an unescaped # outside a character class and the next + newline, inclusive, to be ignored, which makes it possible to include comments inside complicated patterns. Note that the end of this type of - comment is a literal newline sequence in the pattern; escape sequences + comment is a literal newline sequence in the pattern; escape sequences that happen to represent a newline do not count. Which characters are interpreted as newlines can be specified by a set- - ting in the compile context that is passed to pcre2_compile() or by a - special sequence at the start of the pattern, as described in the sec- - tion entitled "Newline conventions" in the pcre2pattern documentation. + ting in the compile context that is passed to pcre2_compile() or by a + special sequence at the start of the pattern, as described in the sec- + tion entitled "Newline conventions" in the pcre2pattern documentation. A default is defined when PCRE2 is built. PCRE2_EXTENDED_MORE - This option has the effect of PCRE2_EXTENDED, but, in addition, un- - escaped space and horizontal tab characters are ignored inside a char- - acter class. Note: only these two characters are ignored, not the full - set of pattern white space characters that are ignored outside a char- - acter class. PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, + This option has the effect of PCRE2_EXTENDED, but, in addition, un- + escaped space and horizontal tab characters are ignored inside a char- + acter class. Note: only these two characters are ignored, not the full + set of pattern white space characters that are ignored outside a char- + acter class. 
PCRE2_EXTENDED_MORE is equivalent to Perl's /xx option, and it can be changed within a pattern by a (?xx) option setting. PCRE2_FIRSTLINE If this option is set, the start of an unanchored pattern match must be - before or at the first newline in the subject string following the - start of matching, though the matched text may continue over the new- + before or at the first newline in the subject string following the + start of matching, though the matched text may continue over the new- line. If startoffset is non-zero, the limiting newline is not necessar- - ily the first newline in the subject. For example, if the subject + ily the first newline in the subject. For example, if the subject string is "abc\nxyz" (where \n represents a single-character newline) a - pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is - greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more - general limiting facility. If PCRE2_FIRSTLINE is set with an offset - limit, a match must occur in the first line and also within the offset + pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is + greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more + general limiting facility. If PCRE2_FIRSTLINE is set with an offset + limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes first is used. This option has no effect for anchored patterns. PCRE2_LITERAL If this option is set, all meta-characters in the pattern are disabled, - and it is treated as a literal string. Matching literal strings with a + and it is treated as a literal string. Matching literal strings with a regular expression engine is not the most efficient way of doing it. If - you are doing a lot of literal matching and are worried about effi- + you are doing a lot of literal matching and are worried about effi- ciency, you should consider using other approaches. 
The only other main options that are allowed with PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_MATCH_INVALID_UTF, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, - PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EX- + PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EX- TRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an error. PCRE2_MATCH_INVALID_UTF - This option forces PCRE2_UTF (see below) and also enables support for - matching by pcre2_match() in subject strings that contain invalid UTF - sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries - process strings as sequences of uint16_t or uint32_t code points. They + This option forces PCRE2_UTF (see below) and also enables support for + matching by pcre2_match() in subject strings that contain invalid UTF + sequences. Note, however, that the 16-bit and 32-bit PCRE2 libraries + process strings as sequences of uint16_t or uint32_t code points. They cannot find valid UTF sequences within an arbitrary string of bytes un- - less such sequences are suitably aligned. This facility is not sup- - ported for DFA matching. For details, see the pcre2unicode documenta- + less such sequences are suitably aligned. This facility is not sup- + ported for DFA matching. For details, see the pcre2unicode documenta- tion. PCRE2_MATCH_UNSET_BACKREF - If this option is set, a backreference to an unset capture group - matches an empty string (by default this causes the current matching + If this option is set, a backreference to an unset capture group + matches an empty string (by default this causes the current matching alternative to fail). A pattern such as (\1)(a) succeeds when this op- - tion is set (assuming it can find an "a" in the subject), whereas it - fails by default, for Perl compatibility. 
Setting this option makes + tion is set (assuming it can find an "a" in the subject), whereas it + fails by default, for Perl compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka JavaScript). PCRE2_MULTILINE - By default, for the purposes of matching "start of line" and "end of - line", PCRE2 treats the subject string as consisting of a single line - of characters, even if it actually contains newlines. The "start of - line" metacharacter (^) matches only at the start of the string, and - the "end of line" metacharacter ($) matches only at the end of the - string, or before a terminating newline (except when PCRE2_DOLLAR_EN- + By default, for the purposes of matching "start of line" and "end of + line", PCRE2 treats the subject string as consisting of a single line + of characters, even if it actually contains newlines. The "start of + line" metacharacter (^) matches only at the start of the string, and + the "end of line" metacharacter ($) matches only at the end of the + string, or before a terminating newline (except when PCRE2_DOLLAR_EN- DONLY is set). Note, however, that unless PCRE2_DOTALL is set, the "any - character" metacharacter (.) does not match at a newline. This behav- + character" metacharacter (.) does not match at a newline. This behav- iour (for ^, $, and dot) is the same as Perl. - When PCRE2_MULTILINE it is set, the "start of line" and "end of line" - constructs match immediately following or immediately before internal - newlines in the subject string, respectively, as well as at the very - start and end. This is equivalent to Perl's /m option, and it can be + When PCRE2_MULTILINE is set, the "start of line" and "end of line" + constructs match immediately following or immediately before internal + newlines in the subject string, respectively, as well as at the very + start and end. This is equivalent to Perl's /m option, and it can be changed within a pattern by a (?m) option setting.
Note that the "start of line" metacharacter does not match after a newline at the end of the - subject, for compatibility with Perl. However, you can change this by - setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a - subject string, or no occurrences of ^ or $ in a pattern, setting + subject, for compatibility with Perl. However, you can change this by + setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a + subject string, or no occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect. PCRE2_NEVER_BACKSLASH_C - This option locks out the use of \C in the pattern that is being com- - piled. This escape can cause unpredictable behaviour in UTF-8 or - UTF-16 modes, because it may leave the current matching point in the + This option locks out the use of \C in the pattern that is being com- + piled. This escape can cause unpredictable behaviour in UTF-8 or + UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in ap- plications that process patterns from external sources. Note that there is also a build-time option that permanently locks out the use of \C. PCRE2_NEVER_UCP - This option locks out the use of Unicode properties for handling \B, + This option locks out the use of Unicode properties for handling \B, \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as - described for the PCRE2_UCP option below. In particular, it prevents - the creator of the pattern from enabling this facility by starting the - pattern with (*UCP). This option may be useful in applications that - process patterns from external sources. The option combination PCRE_UCP - and PCRE_NEVER_UCP causes an error. + described for the PCRE2_UCP option below. In particular, it prevents + the creator of the pattern from enabling this facility by starting the + pattern with (*UCP). 
This option may be useful in applications that + process patterns from external sources. The option combination + PCRE2_UCP and PCRE2_NEVER_UCP causes an error. PCRE2_NEVER_UTF @@ -1769,86 +1919,56 @@ COMPILING A PATTERN PCRE2_NO_AUTO_POSSESS - If this option is set, it disables "auto-possessification", which is an - optimization that, for example, turns a+b into a++b in order to avoid - backtracks into a+ that can never be successful. However, if callouts - are in use, auto-possessification means that some callouts are never - taken. You can set this option if you want the matching functions to do - a full unoptimized search and run all the callouts, but it is mainly - provided for testing purposes. + If this (deprecated) option is set, it disables "auto-possessifica- + tion", which is an optimization that, for example, turns a+b into a++b + in order to avoid backtracks into a+ that can never be successful. How- + ever, if callouts are in use, auto-possessification means that some + callouts are never taken. You can set this option if you want the + matching functions to do a full unoptimized search and run all the + callouts, but it is mainly provided for testing purposes. + + If a compile context is available, it is recommended to use + pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather + than the compile option PCRE2_NO_AUTO_POSSESS. Note that + PCRE2_NO_AUTO_POSSESS takes precedence over the pcre2_set_optimize() + optimization directives PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF. PCRE2_NO_DOTSTAR_ANCHOR - If this option is set, it disables an optimization that is applied when - .* is the first significant item in a top-level branch of a pattern, - and all the other branches also start with .* or with \A or \G or ^. - The optimization is automatically disabled for .* if it is inside an - atomic group or a capture group that is the subject of a backreference, - or if the pattern contains (*PRUNE) or (*SKIP). 
When the optimization - is not disabled, such a pattern is automatically anchored if + If this (deprecated) option is set, it disables an optimization that is + applied when .* is the first significant item in a top-level branch of + a pattern, and all the other branches also start with .* or with \A or + \G or ^. The optimization is automatically disabled for .* if it is in- + side an atomic group or a capture group that is the subject of a back- + reference, or if the pattern contains (*PRUNE) or (*SKIP). When the op- + timization is not disabled, such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set - for any ^ items. Otherwise, the fact that any match must start either - at the start of the subject or following a newline is remembered. Like - other optimizations, this can cause callouts to be skipped. + for any ^ items. Otherwise, the fact that any match must start either + at the start of the subject or following a newline is remembered. Like + other optimizations, this can cause callouts to be skipped. (If a com- + pile context is available, it is recommended to use pcre2_set_opti- + mize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF instead.) PCRE2_NO_START_OPTIMIZE This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of - the JIT compiler. + the JIT compiler. Setting this option is equivalent to calling + pcre2_set_optimize() with the directive parameter set to + PCRE2_START_OPTIMIZE_OFF. There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails imme- diately if it cannot find it, without actually running the main match- - ing function. 
This means that a special item such as (*COMMIT) at the - start of a pattern is not considered until after a suitable starting - point for the match has been found. Also, when callouts or (*MARK) - items are in use, these "start-up" optimizations can cause them to be - skipped if the pattern is never actually used. The start-up optimiza- - tions are in effect a pre-scan of the subject that takes place before - the pattern is run. - - The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, - possibly causing performance to suffer, but ensuring that in cases - where the result is "no match", the callouts do occur, and that items - such as (*COMMIT) and (*MARK) are considered at every possible starting - position in the subject string. - - Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching - operation. Consider the pattern + ing function. The start-up optimizations are in effect a pre-scan of + the subject that takes place before the pattern is run. - (*COMMIT)ABC - - When this is compiled, PCRE2 records the fact that a match must start - with the character "A". Suppose the subject string is "DEFABC". The - start-up optimization scans along the subject, finds "A" and runs the - first match attempt from there. The (*COMMIT) item means that the pat- - tern must match the current starting position, which in this case, it - does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE - set, the initial scan along the subject string does not happen. The - first match attempt is run starting from "D" and when this fails, - (*COMMIT) prevents any further matches being tried, so the overall re- - sult is "no match". - - As another start-up optimization makes use of a minimum length for a - matching subject, which is recorded when possible. Consider the pattern - - (*MARK:1)B(*MARK:2)(X|Y) - - The minimum length for a match is two characters. 
If the subject is - "XXBB", the "starting character" optimization skips "XX", then tries to - match "BB", which is long enough. In the process, (*MARK:2) is encoun- - tered and remembered. When the match attempt fails, the next "B" is - found, but there is only one character left, so there are no more at- - tempts, and "no match" is returned with the "last mark seen" set to - "2". If NO_START_OPTIMIZE is set, however, matches are tried at every - possible starting position, including at the end of the subject, where - (*MARK:1) is encountered, but there is no "B", so the "last mark seen" - that is returned is "1". In this case, the optimizations do not affect - the overall match result, which is still "no match", but they do affect - the auxiliary information that is returned. + Disabling the start-up optimizations may cause performance to suffer. + However, this may be desirable for patterns which contain callouts or + items such as (*COMMIT) and (*MARK). See the above description of + PCRE2_START_OPTIMIZE_OFF for further details. PCRE2_NO_UTF_CHECK @@ -1892,41 +2012,46 @@ COMPILING A PATTERN ties for upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode - support (which is the default). The PCRE2_EXTRA_CASELESS_RESTRICT op- - tion (see below) restricts caseless matching such that ASCII characters - match only ASCII characters and non-ASCII characters match only non- - ASCII characters. + support (which is the default). + + The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless + matching such that ASCII characters match only ASCII characters and + non-ASCII characters match only non-ASCII characters. The PCRE2_EX- + TRA_TURKISH_CASING option (see above) alters the matching of the 'i' + characters to follow their behaviour in Turkish and Azeri languages. 
+ For further details on PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EX- + TRA_TURKISH_CASING, see the pcre2unicode page. PCRE2_UNGREEDY - This option inverts the "greediness" of the quantifiers so that they - are not greedy by default, but become greedy if followed by "?". It is - not compatible with Perl. It can also be set by a (?U) option setting + This option inverts the "greediness" of the quantifiers so that they + are not greedy by default, but become greedy if followed by "?". It is + not compatible with Perl. It can also be set by a (?U) option setting within the pattern. PCRE2_USE_OFFSET_LIMIT This option must be set for pcre2_compile() if pcre2_set_offset_limit() - is going to be used to set a non-default offset limit in a match con- - text for matches that use this pattern. An error is generated if an - offset limit is set without this option. For more details, see the de- - scription of pcre2_set_offset_limit() in the section that describes + is going to be used to set a non-default offset limit in a match con- + text for matches that use this pattern. An error is generated if an + offset limit is set without this option. For more details, see the de- + scription of pcre2_set_offset_limit() in the section that describes match contexts. See also the PCRE2_FIRSTLINE option above. PCRE2_UTF - This option causes PCRE2 to regard both the pattern and the subject - strings that are subsequently processed as strings of UTF characters - instead of single-code-unit strings. It is available when PCRE2 is - built to include Unicode support (which is the default). If Unicode + This option causes PCRE2 to regard both the pattern and the subject + strings that are subsequently processed as strings of UTF characters + instead of single-code-unit strings. It is available when PCRE2 is + built to include Unicode support (which is the default). If Unicode support is not available, the use of this option provokes an error. 
De- - tails of how PCRE2_UTF changes the behaviour of PCRE2 are given in the + tails of how PCRE2_UTF changes the behaviour of PCRE2 are given in the pcre2unicode page. In particular, note that it changes the way PCRE2_CASELESS works. Extra compile options - The option bits that can be set in a compile context by calling the + The option bits that can be set in a compile context by calling the pcre2_set_compile_extra_options() function are as follows: PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK @@ -1938,102 +2063,102 @@ COMPILING A PATTERN PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES - This option applies when compiling a pattern in UTF-8 or UTF-32 mode. - It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode + This option applies when compiling a pattern in UTF-8 or UTF-32 mode. + It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs - in UTF-16 to encode code points with values in the range 0x10000 to - 0x10ffff. The surrogates cannot therefore be represented in UTF-16. + in UTF-16 to encode code points with values in the range 0x10000 to + 0x10ffff. The surrogates cannot therefore be represented in UTF-16. They can be represented in UTF-8 and UTF-32, but are defined as invalid - code points, and cause errors if encountered in a UTF-8 or UTF-32 + code points, and cause errors if encountered in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2. - These values also cause errors if encountered in escape sequences such + These values also cause errors if encountered in escape sequences such as \x{d912} within a pattern. However, it seems that some applications, when using PCRE2 to check for unwanted characters in UTF-8 strings, ex- - plicitly test for the surrogates using escape sequences. The - PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be- + plicitly test for the surrogates using escape sequences. 
The + PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be- cause it applies only to the testing of input strings for UTF validity. - If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro- - gate code point values in UTF-8 and UTF-32 patterns no longer provoke - errors and are incorporated in the compiled pattern. However, they can - only match subject characters if the matching function is called with + If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro- + gate code point values in UTF-8 and UTF-32 patterns no longer provoke + errors and are incorporated in the compiled pattern. However, they can + only match subject characters if the matching function is called with PCRE2_NO_UTF_CHECK set. PCRE2_EXTRA_ALT_BSUX - The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and - \x in the way that ECMAscript (aka JavaScript) does. Additional func- + The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and + \x in the way that ECMAscript (aka JavaScript) does. Additional func- tionality was defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has - the effect of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} + the effect of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal character code, where hhh.. is any number of hexadeci- mal digits. PCRE2_EXTRA_ASCII_BSD - This option forces \d to match only ASCII digits, even when PCRE2_UCP - is set. It can be changed within a pattern by means of the (?aD) op- + This option forces \d to match only ASCII digits, even when PCRE2_UCP + is set. It can be changed within a pattern by means of the (?aD) op- tion setting. PCRE2_EXTRA_ASCII_BSS - This option forces \s to match only ASCII space characters, even when - PCRE2_UCP is set. It can be changed within a pattern by means of the + This option forces \s to match only ASCII space characters, even when + PCRE2_UCP is set. 
It can be changed within a pattern by means of the (?aS) option setting. PCRE2_EXTRA_ASCII_BSW - This option forces \w to match only ASCII word characters, even when - PCRE2_UCP is set. It can be changed within a pattern by means of the + This option forces \w to match only ASCII word characters, even when + PCRE2_UCP is set. It can be changed within a pattern by means of the (?aW) option setting. PCRE2_EXTRA_ASCII_DIGIT This option forces the POSIX character classes [:digit:] and [:xdigit:] - to match only ASCII digits, even when PCRE2_UCP is set. It can be + to match only ASCII digits, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aT) option setting. PCRE2_EXTRA_ASCII_POSIX This option forces all the POSIX character classes, including [:digit:] - and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is - set. It can be changed within a pattern by means of the (?aP) option - setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order + and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is + set. It can be changed within a pattern by means of the (?aP) option + setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets all ASCII restrictions for POSIX classes. PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL - This is a dangerous option. Use with care. By default, an unrecognized - escape such as \j or a malformed one such as \x{2z} causes a compile- + This is a dangerous option. Use with care. By default, an unrecognized + escape such as \j or a malformed one such as \x{2z} causes a compile- time error when detected by pcre2_compile(). Perl is somewhat inconsis- - tent in handling such items: for example, \j is treated as a literal - "j", and non-hexadecimal digits in \x{} are just ignored, though warn- - ings are given in both cases if Perl's warning switch is enabled. 
How- - ever, a malformed octal number after \o{ always causes an error in + tent in handling such items: for example, \j is treated as a literal + "j", and non-hexadecimal digits in \x{} are just ignored, though warn- + ings are given in both cases if Perl's warning switch is enabled. How- + ever, a malformed octal number after \o{ always causes an error in Perl. - If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to - pcre2_compile(), all unrecognized or malformed escape sequences are - treated as single-character escapes. For example, \j is a literal "j" - and \x{2z} is treated as the literal string "x{2z}". Setting this op- + If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to + pcre2_compile(), all unrecognized or malformed escape sequences are + treated as single-character escapes. For example, \j is a literal "j" + and \x{2z} is treated as the literal string "x{2z}". Setting this op- tion means that typos in patterns may go undetected and have unexpected - results. Also note that a sequence such as [\N{] is interpreted as a - malformed attempt at [\N{...}] and so is treated as [N{] whereas [\N] + results. Also note that a sequence such as [\N{] is interpreted as a + malformed attempt at [\N{...}] and so is treated as [N{] whereas [\N] gives an error because an unqualified \N is a valid escape sequence but - is not supported in a character class. To reiterate: this is a danger- + is not supported in a character class. To reiterate: this is a danger- ous option. Use with great care. PCRE2_EXTRA_CASELESS_RESTRICT - When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows + When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode rules, which allow for more than two cases per character. There are two case-equivalent character sets that contain both ASCII and non- ASCII characters. 
The ASCII letter S is case-equivalent to U+017f (long - S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). - This option disables recognition of case-equivalences that cross the + S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). + This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must ei- - ther be ASCII or non-ASCII. The option can be changed with a pattern by - the (?r) option setting. + ther be ASCII or non-ASCII. The option can be changed within a pattern + by the (*CASELESS_RESTRICT) or (?r) option settings. PCRE2_EXTRA_ESCAPED_CR_IS_LF @@ -2062,6 +2187,36 @@ COMPILING A PATTERN end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. + PCRE2_EXTRA_NO_BS0 + + If this option is set (note that its final character is the digit 0) it + locks out the use of the sequence \0 unless at least one more octal + digit follows. + + PCRE2_EXTRA_PYTHON_OCTAL + + If this option is set, PCRE2 follows Python's rules for interpreting + octal escape sequences. The rules for handling sequences such as \14, + which could be an octal number or a back reference are different. De- + tails are given in the pcre2pattern documentation. + + PCRE2_EXTRA_NEVER_CALLOUT + + If this option is set, PCRE2 treats callouts in the pattern as a syntax + error, returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if + the application knows that a callout will not be provided to + pcre2_match(), so that callouts in the pattern are not silently ig- + nored. + + PCRE2_EXTRA_TURKISH_CASING + + This option alters case-equivalence of the 'i' letters to follow the + alphabet used by Turkish and Azeri languages. The option can be changed + within a pattern by the (*TURKISH_CASING) start-of-pattern setting. Ei- + ther the UTF or UCP options must be set. In the 8-bit library, UTF must + be set. 
This option cannot be combined with PCRE2_EXTRA_CASELESS_RE- + STRICT. + JUST-IN-TIME (JIT) COMPILATION @@ -2255,6 +2410,7 @@ INFORMATION ABOUT A COMPILED PATTERN PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set + Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. @@ -3520,9 +3676,9 @@ CREATING A NEW STRING WITH SUBSTITUTIONS ORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (with- out, of course, writing anything) in order to compute the size of - buffer that is needed. This value is passed back via the outlengthptr - variable, with the result of the function still being PCRE2_ER- - ROR_NOMEMORY. + buffer that is needed, which will include the extra space for the ter- + minating NUL. This value is passed back via the outlengthptr variable, + with the result of the function still being PCRE2_ERROR_NOMEMORY. Passing a buffer size of zero is a permitted way of finding out how much memory is needed for given substitution. However, this does mean @@ -3541,24 +3697,32 @@ CREATING A NEW STRING WITH SUBSTITUTIONS cape character that can specify the insertion of characters from cap- ture groups and names from (*MARK) or other control verbs in the pat- tern. Dollar is the only escape character (backslash is treated as lit- - eral). The following forms are always recognized: + eral). 
The following forms are recognized: $$ insert a dollar character - $<n> or ${<n>} insert the contents of group <n> + $n or ${n} insert the contents of group n + $0 or $& insert the entire matched substring + $` insert the substring that precedes the match + $' insert the substring that follows the match + $_ insert the entire input string $*MARK or ${*MARK} insert a control verb name - Either a group number or a group name can be given for <n>. Curly - brackets are required only if the following character would be inter- - preted as part of the number or name. The number may be zero to include - the entire matched string. For example, if the pattern a(b)c is - matched with "=abc=" and the replacement string "+$1$0$1+", the result - is "=+babcb+=". + Either a group number or a group name can be given for n, for example + $2 or $NAME. Curly brackets are required only if the following charac- + ter would be interpreted as part of the number or name. The number may + be zero to include the entire matched string. For example, if the pat- + tern a(b)c is matched with "=abc=" and the replacement string + "+$1$0$1+", the result is "=+babcb+=". + + The JavaScript form $<name>, where the angle brackets are part of the + syntax, is also recognized for group names, but not for group numbers + or *MARK. - $*MARK inserts the name from the last encountered backtracking control - verb on the matching path that has a name. (*MARK) must always include - a name, but the other verbs need not. For example, in the case of + $*MARK inserts the name from the last encountered backtracking control + verb on the matching path that has a name. (*MARK) must always include + a name, but the other verbs need not. For example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B) - the relevant name is "B". + the relevant name is "B". 
This facility can be used to perform simple simultaneous substitutions, as this pcre2test example shows: /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} @@ -3566,15 +3730,15 @@ CREATING A NEW STRING WITH SUBSTITUTIONS 2: pear orange PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject - string, replacing every matching substring. If this option is not set, - only the first matching substring is replaced. The search for matches - takes place in the original subject string (that is, previous replace- - ments do not affect it). Iteration is implemented by advancing the - startoffset value for each search, which is always passed the entire + string, replacing every matching substring. If this option is not set, + only the first matching substring is replaced. The search for matches + takes place in the original subject string (that is, previous replace- + ments do not affect it). Iteration is implemented by advancing the + startoffset value for each search, which is always passed the entire subject string. If an offset limit is set in the match context, search- ing stops when that limit is reached. - You can restrict the effect of a global substitution to a portion of + You can restrict the effect of a global substitution to a portion of the subject string by setting either or both of startoffset and an off- set limit. Here is a pcre2test example: @@ -3582,73 +3746,95 @@ CREATING A NEW STRING WITH SUBSTITUTIONS ABC ABC ABC ABC\=offset=3,offset_limit=12 2: ABC A!C A!C ABC - When continuing with global substitutions after matching a substring + When continuing with global substitutions after matching a substring with zero length, an attempt to find a non-empty match at the same off- set is performed. If this is not successful, the offset is advanced by one character except when CRLF is a valid newline sequence and the next - two characters are CR, LF. In this case, the offset is advanced by two + two characters are CR, LF. 
In this case, the offset is advanced by two characters. PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do not appear in the pattern to be treated as unset groups. This option - should be used with care, because it means that a typo in a group name + should be used with care, because it means that a typo in a group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING error. PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (including un- - known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated - as empty strings when inserted as described above. If this option is + known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated + as empty strings when inserted as described above. If this option is not set, an attempt to insert an unset group causes the PCRE2_ERROR_UN- - SET error. This option does not influence the extended substitution + SET error. This option does not influence the extended substitution syntax described below. - PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the - replacement string. Without this option, only the dollar character is - special, and only the group insertion forms listed above are valid. - When PCRE2_SUBSTITUTE_EXTENDED is set, two things change: + PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the + replacement string. Without this option, only the dollar character is + special, and only the group insertion forms listed above are valid. + When PCRE2_SUBSTITUTE_EXTENDED is set, several things change: + + Firstly, backslash in a replacement string is interpreted as an escape + character. The usual forms such as \x{ddd} can be used to specify par- + ticular character codes, and backslash followed by any non-alphanumeric + character quotes that character. Extended quoting can be coded using + \Q...\E, exactly as in pattern strings. The escapes \b and \v are in- + terpreted as the characters backspace and vertical tab, respectively. 
- Firstly, backslash in a replacement string is interpreted as an escape - character. The usual forms such as \n or \x{ddd} can be used to specify - particular character codes, and backslash followed by any non-alphanu- - meric character quotes that character. Extended quoting can be coded - using \Q...\E, exactly as in pattern strings. + The interpretation of backslash followed by one or more digits is the + same as in a pattern, which in Perl has some ambiguities. Details are + given in the pcre2pattern page. + + The Python form \g<n>, where the angle brackets are part of the syntax + and n is either a group name or number, is recognized as an alternative + way of inserting the contents of a group, for example \g<3>. There are also four escape sequences for forcing the case of inserted - letters. The insertion mechanism has three states: no case forcing, - force upper case, and force lower case. The escape sequences change the - current state: \U and \L change to upper or lower case forcing, respec- - tively, and \E (when not terminating a \Q quoted sequence) reverts to - no case forcing. The sequences \u and \l force the next character (if - it is a letter) to upper or lower case, respectively, and then the - state automatically reverts to no case forcing. Case forcing applies to - all inserted characters, including those from capture groups and let- - ters within \Q...\E quoted sequences. If either PCRE2_UTF or PCRE2_UCP - was set when the pattern was compiled, Unicode properties are used for - case forcing characters whose code points are greater than 127. + letters. Case forcing applies to all inserted characters, including + those from capture groups and letters within \Q...\E quoted sequences. + The insertion mechanism has three states: no case forcing, force upper + case, and force lower case. 
The escape sequences change the current + state: \U and \L change to upper or lower case forcing, respectively, + and \E (when not terminating a \Q quoted sequence) reverts to no case + forcing. The sequences \u and \l force the next character (if it is a + letter) to upper or lower case, respectively, and then the state auto- + matically reverts to no case forcing. + + However, if \u is immediately followed by \L or \l is immediately fol- + lowed by \U, the next character's case is forced by the first escape + sequence, and subsequent characters by the second. This provides a "ti- + tle casing" facility that can be applied to group captures. For exam- + ple, if group 1 has captured "heLLo", the replacement string "\u\L$1" + becomes "Hello". + + If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, + Unicode properties are used for case forcing characters whose code + points are greater than 127. However, only simple case folding, as de- + termined by the Unicode file CaseFolding.txt is supported. PCRE2 does + not support language-specific special casing rules such as using dif- + ferent lower case Greek sigmas in the middle and ends of words (as de- + fined in the Unicode file SpecialCasing.txt). Note that case forcing sequences such as \U...\E do not nest. For exam- ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EX- TRA_ALT_BSUX options do not apply to replacement strings. - The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more + The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash: - ${<n>:-<string>} - ${<n>:+<string1>:<string2>} + ${n:-string} + ${n:+string1:string2} - As before, <n> may be a group number or a name. The first form speci- - fies a default value. If group <n> is set, its value is inserted; if - not, <string> is expanded and the result inserted. 
The second form - specifies strings that are expanded and inserted when group <n> is set - or unset, respectively. The first form is just a convenient shorthand - for + As in the simple case, n may be a group number or a name. The first + form specifies a default value. If group n is set, its value is in- + serted; if not, the string is expanded and the result inserted. The + second form specifies strings that are expanded and inserted when group + n is set or unset, respectively. The first form is just a convenient + shorthand for - ${<n>:+${<n>}:<string>} + ${n:+${n}:string} - Backslash can be used to escape colons and closing curly brackets in - the replacement strings. A change of the case forcing state within a - replacement string remains in force afterwards, as shown in this + Backslash can be used to escape colons and closing curly brackets in + the replacement strings. A change of the case forcing state within a + replacement string remains in force afterwards, as shown in this pcre2test example: /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo @@ -3657,8 +3843,8 @@ CREATING A NEW STRING WITH SUBSTITUTIONS somebody 1: HELLO - The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended - substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- + The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended + substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- known groups in the extended syntax forms to be treated as unset. If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, @@ -3667,39 +3853,39 @@ CREATING A NEW STRING WITH SUBSTITUTIONS Substitution errors - In the event of an error, pcre2_substitute() returns a negative error - code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors + In the event of an error, pcre2_substitute() returns a negative error + code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from pcre2_match() are passed straight back. 
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser- tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ- - ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) - when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- + ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) + when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- SET_EMPTY is not set. - PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big + PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size - of buffer that is needed is returned via outlengthptr. Note that this + of buffer that is needed is returned via outlengthptr. Note that this does not happen by default. PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the - match_data argument is NULL or if the subject or replacement arguments - are NULL. For backward compatibility reasons an exception is made for + match_data argument is NULL or if the subject or replacement arguments + are NULL. For backward compatibility reasons an exception is made for the replacement argument if the rlength argument is also 0. 
- PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in - the replacement string, with more particular errors being PCRE2_ER- + PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in + the replacement string, with more particular errors being PCRE2_ER- ROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE - (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax - error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN + (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax + error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started or the match started earlier - than the current position in the subject, which can happen if \K is + than the current position in the subject, which can happen if \K is used in an assertion). As for all PCRE2 errors, a text message that describes the error can be - obtained by calling the pcre2_get_error_message() function (see "Ob- + obtained by calling the pcre2_get_error_message() function (see "Ob- taining a textual error message" above). Substitution callouts @@ -3708,12 +3894,20 @@ CREATING A NEW STRING WITH SUBSTITUTIONS int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); - The pcre2_set_substitution_callout() function can be used to specify a - callout function for pcre2_substitute(). This information is passed in + The pcre2_set_substitution_callout() function can be used to specify a + callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution - has been processed, but it can cause the replacement not to happen. The - callout function is not called for simulated substitutions that happen - as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. + has been processed, but it can cause the replacement not to happen. 
+ + The callout function is not called for simulated substitutions that + happen as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In + this mode, when substitution processing exceeds the buffer space pro- + vided by the caller, processing continues by counting code units. The + simulation is unable to populate the callout block, and so the simula- + tion is pessimistic about the required buffer size. Whichever is larger + of accepted or rejected substitution is reported as the required size. + Therefore, the returned buffer length may be an overestimate (without a + substitution callout, it is normally an exact measurement). The first argument of the callout function is a pointer to a substitute callout block structure, which contains the following fields, not nec- @@ -3757,62 +3951,149 @@ CREATING A NEW STRING WITH SUBSTITUTIONS to the output and the call to pcre2_substitute() exits, returning the number of matches so far. + Substitution case callouts + + int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); + + The pcre2_set_substitute_case_callout() function can be used to spec- + ify a callout function for pcre2_substitute() to use when performing + case transformations. This does not affect any case insensitivity be- + haviour when performing a match, but only the user-visible transforma- + tions performed when processing a substitution such as: + + pcre2_substitute(..., "\\U$1", ...) + + The default case transformations applied by PCRE2 are reasonably com- + plete, and, in UTF or UCP mode, perform the simple locale-invariant + case transformations as specified by Unicode. This is suitable for the + internal (invisible) case-equivalence procedures used during pattern + matching, but an application may wish to use more sophisticated locale- + aware processing for the user-visible substitution transformations. 
+ + One example implementation of the callout_function using the ICU li- + brary would be: + + PCRE2_SIZE + icu_case_callout( + PCRE2_SPTR input, PCRE2_SIZE input_len, + PCRE2_UCHAR *output, PCRE2_SIZE output_cap, + int to_case, void *data_ptr) + { + UErrorCode err = U_ZERO_ERROR; + int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER + ? u_strToLower(output, output_cap, input, input_len, NULL, &err) + : to_case == PCRE2_SUBSTITUTE_CASE_UPPER + ? u_strToUpper(output, output_cap, input, input_len, NULL, &err) + : u_strToTitle(output, output_cap, input, input_len, &first_char_only, + NULL, &err); + if (U_FAILURE(err)) return (~(PCRE2_SIZE)0); + return r; + } + + The first and second arguments of the case callout function are the + Unicode string to transform. + + The third and fourth arguments are the output buffer and its capacity. + + The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, + PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. + PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed + to the callout to indicate that the case of the entire callout input + should be case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed + to indicate that only the first character or glyph should be trans- + formed to Unicode titlecase and the rest to Unicode lowercase (note + that titlecasing sometimes uses Unicode properties to titlecase each + word in a string; but PCRE2 is requesting that only the single leading + character is to be titlecased). + + The sixth argument is the callout_data supplied to pcre2_set_substi- + tute_case_callout(). + + The resulting string in the destination buffer may be larger or smaller + than the input, if the casing rules merge or split characters. The re- + turn value is the length required for the output string. If a buffer of + sufficient size was provided to the callout, then the result must be + written to the buffer and the number of code units returned. 
If the re- + sult does not fit in the provided buffer, then the required capacity + must be returned and PCRE2 will not make use of the output buffer. + PCRE2 provides input and output buffers which overlap, so the callout + must support this by suitable internal buffering. + + Alternatively, if the callout wishes to indicate an error, then it may + return (~(PCRE2_SIZE)0). In this case pcre2_substitute() will immedi- + ately fail with error PCRE2_ERROR_REPLACECASE. + + When a case callout is combined with the PCRE2_SUBSTITUTE_OVER- + FLOW_LENGTH option, there are situations when pcre2_substitute() will + return an underestimate of the required buffer size. If you call + pcre2_substitute() once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the + input buffer is too small for the replacement string to be constructed, + then instead of calling the case callout, pcre2_substitute() will make + an estimate of the required buffer size. The second call should also + pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that second call is not + guaranteed to succeed either, if the case callout requires more buffer + space than expected. The caller must make repeated attempts in a loop. + DUPLICATE CAPTURE GROUP NAMES int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); - When a pattern is compiled with the PCRE2_DUPNAMES option, names for - capture groups are not required to be unique. Duplicate names are al- - ways allowed for groups with the same number, created by using the (?| + When a pattern is compiled with the PCRE2_DUPNAMES option, names for + capture groups are not required to be unique. Duplicate names are al- + ways allowed for groups with the same number, created by using the (?| feature. Indeed, if such groups are named, they are required to use the same names. - Normally, patterns that use duplicate names are such that in any one - match, only one of each set of identically-named groups participates. 
+ Normally, patterns that use duplicate names are such that in any one + match, only one of each set of identically-named groups participates. An example is shown in the pcre2pattern documentation. - When duplicates are present, pcre2_substring_copy_byname() and - pcre2_substring_get_byname() return the first substring corresponding - to the given name that is set. Only if none are set is PCRE2_ERROR_UN- - SET returned. The pcre2_substring_number_from_name() function re- - turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate + When duplicates are present, pcre2_substring_copy_byname() and + pcre2_substring_get_byname() return the first substring corresponding + to the given name that is set. Only if none are set is PCRE2_ERROR_UN- + SET returned. The pcre2_substring_number_from_name() function re- + turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names. - If you want to get full details of all captured substrings for a given - name, you must use the pcre2_substring_nametable_scan() function. The - first argument is the compiled pattern, and the second is the name. If - the third and fourth arguments are NULL, the function returns a group + If you want to get full details of all captured substrings for a given + name, you must use the pcre2_substring_nametable_scan() function. The + first argument is the compiled pattern, and the second is the name. If + the third and fourth arguments are NULL, the function returns a group number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise. When the third and fourth arguments are not NULL, they must be pointers - to variables that are updated by the function. After it has run, they + to variables that are updated by the function. After it has run, they point to the first and last entries in the name-to-number table for the - given name, and the function returns the length of each entry in code - units. + given name, and the function returns the length of each entry in code + units. 
In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are + given name, and the function returns the length of each entry in code + units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. The format of the name table is described above in the section entitled - Information about a pattern. Given all the relevant entries for the - name, you can extract each of their numbers, and hence the captured + Information about a pattern. Given all the relevant entries for the + name, you can extract each of their numbers, and hence the captured data. FINDING ALL POSSIBLE MATCHES AT ONE POSITION - The traditional matching function uses a similar algorithm to Perl, - which stops when it finds the first match at a given point in the sub- + The traditional matching function uses a similar algorithm to Perl, + which stops when it finds the first match at a given point in the sub- ject. If you want to find all possible matches, or the longest possible - match at a given position, consider using the alternative matching - function (see below) instead. If you cannot use the alternative func- + match at a given position, consider using the alternative matching + function (see below) instead. If you cannot use the alternative func- tion, you can kludge it up by making use of the callout facility, which is described in the pcre2callout documentation. What you have to do is to insert a callout right at the end of the pat- - tern. When your callout function is called, extract and save the cur- - rent matched substring. Then return 1, which forces pcre2_match() to - backtrack and try other alternatives. Ultimately, when it runs out of + tern. When your callout function is called, extract and save the cur- + rent matched substring. Then return 1, which forces pcre2_match() to + backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH. 
@@ -3824,27 +4105,27 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount); - The function pcre2_dfa_match() is called to match a subject string - against a compiled pattern, using a matching algorithm that scans the + The function pcre2_dfa_match() is called to match a subject string + against a compiled pattern, using a matching algorithm that scans the subject string just once (not counting lookaround assertions), and does - not backtrack (except when processing lookaround assertions). This has - different characteristics to the normal algorithm, and is not compati- - ble with Perl. Some of the features of PCRE2 patterns are not sup- + not backtrack (except when processing lookaround assertions). This has + different characteristics to the normal algorithm, and is not compati- + ble with Perl. Some of the features of PCRE2 patterns are not sup- ported. Nevertheless, there are times when this kind of matching can be - useful. For a discussion of the two matching algorithms, and a list of + useful. For a discussion of the two matching algorithms, and a list of features that pcre2_dfa_match() does not support, see the pcre2matching documentation. - The arguments for the pcre2_dfa_match() function are the same as for + The arguments for the pcre2_dfa_match() function are the same as for pcre2_match(), plus two extras. The ovector within the match data block is used in a different way, and this is described below. The other com- - mon arguments are used in the same way as for pcre2_match(), so their + mon arguments are used in the same way as for pcre2_match(), so their description is not repeated here. - The two additional arguments provide workspace for the function. The - workspace vector should contain at least 20 elements. It is used for - keeping track of multiple paths through the pattern tree. 
More work- - space is needed for patterns and subjects where there are a lot of po- + The two additional arguments provide workspace for the function. The + workspace vector should contain at least 20 elements. It is used for + keeping track of multiple paths through the pattern tree. More work- + space is needed for patterns and subjects where there are a lot of po- tential matches. Here is an example of a simple call to pcre2_dfa_match(): @@ -3864,45 +4145,45 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION Option bits for pcre2_dfa_match() - The unused bits of the options argument for pcre2_dfa_match() must be - zero. The only bits that may be set are PCRE2_ANCHORED, - PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NO- + The unused bits of the options argument for pcre2_dfa_match() must be + zero. The only bits that may be set are PCRE2_ANCHORED, + PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NO- TEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, - PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and - PCRE2_DFA_RESTART. All but the last four of these are exactly the same + PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and + PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for pcre2_match(), so their description is not repeated here. PCRE2_PARTIAL_HARD PCRE2_PARTIAL_SOFT - These have the same general effect as they do for pcre2_match(), but - the details are slightly different. When PCRE2_PARTIAL_HARD is set for - pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the + These have the same general effect as they do for pcre2_match(), but + the details are slightly different. When PCRE2_PARTIAL_HARD is set for + pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the subject is reached and there is still at least one matching possibility that requires additional characters. 
This happens even if some complete - matches have already been found. When PCRE2_PARTIAL_SOFT is set, the - return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL - if the end of the subject is reached, there have been no complete + matches have already been found. When PCRE2_PARTIAL_SOFT is set, the + return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL + if the end of the subject is reached, there have been no complete matches, but there is still at least one matching possibility. The por- - tion of the string that was inspected when the longest partial match + tion of the string that was inspected when the longest partial match was found is set as the first matching string in both cases. There is a - more detailed discussion of partial and multi-segment matching, with + more detailed discussion of partial and multi-segment matching, with examples, in the pcre2partial documentation. PCRE2_DFA_SHORTEST - Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to + Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as soon as it has found one match. Because of the way the alterna- - tive algorithm works, this is necessarily the shortest possible match + tive algorithm works, this is necessarily the shortest possible match at the first possible matching point in the subject string. PCRE2_DFA_RESTART - When pcre2_dfa_match() returns a partial match, it is possible to call + When pcre2_dfa_match() returns a partial match, it is possible to call it again, with additional subject characters, and have it continue with the same match. The PCRE2_DFA_RESTART option requests this action; when - it is set, the workspace and wscount options must reference the same - vector as before because data about the match so far is left in them + it is set, the workspace and wscount options must reference the same + vector as before because data about the match so far is left in them after a partial match. 
There is more discussion of this facility in the pcre2partial documentation. @@ -3910,8 +4191,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION When pcre2_dfa_match() succeeds, it may have matched more than one sub- string in the subject. Note, however, that all the matches from one run - of the function start at the same point in the subject. The shorter - matches are all initial substrings of the longer matches. For example, + of the function start at the same point in the subject. The shorter + matches are all initial substrings of the longer matches. For example, if the pattern <.*> @@ -3926,80 +4207,80 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION - On success, the yield of the function is a number greater than zero, - which is the number of matched substrings. The offsets of the sub- - strings are returned in the ovector, and can be extracted by number in - the same way as for pcre2_match(), but the numbers bear no relation to - any capture groups that may exist in the pattern, because DFA matching + On success, the yield of the function is a number greater than zero, + which is the number of matched substrings. The offsets of the sub- + strings are returned in the ovector, and can be extracted by number in + the same way as for pcre2_match(), but the numbers bear no relation to + any capture groups that may exist in the pattern, because DFA matching does not support capturing. - Calls to the convenience functions that extract substrings by name re- + Calls to the convenience functions that extract substrings by name re- turn the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used af- - ter a DFA match. The convenience functions that extract substrings by + ter a DFA match. The convenience functions that extract substrings by number never return PCRE2_ERROR_NOSUBSTRING. - The matched strings are stored in the ovector in reverse order of - length; that is, the longest matching string is first. 
If there were - too many matches to fit into the ovector, the yield of the function is + The matched strings are stored in the ovector in reverse order of + length; that is, the longest matching string is first. If there were + too many matches to fit into the ovector, the yield of the function is zero, and the vector is filled with the longest matches. - NOTE: PCRE2's "auto-possessification" optimization usually applies to - character repeats at the end of a pattern (as well as internally). For - example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA - matching, this means that only one possible match is found. If you re- + NOTE: PCRE2's "auto-possessification" optimization usually applies to + character repeats at the end of a pattern (as well as internally). For + example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA + matching, this means that only one possible match is found. If you re- ally do want multiple matches in such cases, either use an ungreedy re- - peat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when com- + peat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when com- piling. Error returns from pcre2_dfa_match() The pcre2_dfa_match() function returns a negative number when it fails. - Many of the errors are the same as for pcre2_match(), as described + Many of the errors are the same as for pcre2_match(), as described above. There are in addition the following errors that are specific to pcre2_dfa_match(): PCRE2_ERROR_DFA_UITEM - This return is given if pcre2_dfa_match() encounters an item in the - pattern that it does not support, for instance, the use of \C in a UTF + This return is given if pcre2_dfa_match() encounters an item in the + pattern that it does not support, for instance, the use of \C in a UTF mode or a backreference. 
PCRE2_ERROR_DFA_UCOND - This return is given if pcre2_dfa_match() encounters a condition item + This return is given if pcre2_dfa_match() encounters a condition item that uses a backreference for the condition, or a test for recursion in a specific capture group. These are not supported. PCRE2_ERROR_DFA_UINVALID_UTF - This return is given if pcre2_dfa_match() is called for a pattern that - was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for + This return is given if pcre2_dfa_match() is called for a pattern that + was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for DFA matching. PCRE2_ERROR_DFA_WSSIZE - This return is given if pcre2_dfa_match() runs out of space in the + This return is given if pcre2_dfa_match() runs out of space in the workspace vector. PCRE2_ERROR_DFA_RECURSE When a recursion or subroutine call is processed, the matching function - calls itself recursively, using private memory for the ovector and - workspace. This error is given if the internal ovector is not large - enough. This should be extremely rare, as a vector of size 1000 is + calls itself recursively, using private memory for the ovector and + workspace. This error is given if the internal ovector is not large + enough. This should be extremely rare, as a vector of size 1000 is used. PCRE2_ERROR_DFA_BADRESTART - When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, - some plausibility checks are made on the contents of the workspace, - which should contain data about the previous partial match. If any of + When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, + some plausibility checks are made on the contents of the workspace, + which should contain data about the previous partial match. If any of these checks fail, this error is given. 
SEE ALSO - pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), + pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3). @@ -4012,15 +4293,14 @@ AUTHOR REVISION - Last updated: 24 April 2024 + Last updated: 26 December 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 24 April 2024 PCRE2API(3) +PCRE2 10.45 26 December 2024 PCRE2API(3) ------------------------------------------------------------------------------ - PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3) @@ -4639,15 +4919,14 @@ AUTHOR REVISION - Last updated: 15 April 2024 + Last updated: 16 April 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 15 April 2024 PCRE2BUILD(3) +PCRE2 10.45 16 April 2024 PCRE2BUILD(3) ------------------------------------------------------------------------------ - PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3) @@ -5077,11 +5356,10 @@ REVISION Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 19 January 2024 PCRE2CALLOUT(3) +PCRE2 10.45 19 January 2024 PCRE2CALLOUT(3) ------------------------------------------------------------------------------ - PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3) @@ -5140,7 +5418,7 @@ DIFFERENCES BETWEEN PCRE2 AND PERL 7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties - such as Lu and Nd, the derived properties Any and LC (synonym L&), + such as Lu and Nd, the derived properties Any and Lc (synonym L&), script names such as Greek or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and Perl support the Cs (surrogate) prop- erty, but in PCRE2 its use is limited. 
See the pcre2pattern documenta- @@ -5167,118 +5445,128 @@ DIFFERENCES BETWEEN PCRE2 AND PERL \Q\\E \ \\E The \Q...\E sequence is recognized both inside and outside character - classes by both PCRE2 and Perl. - - 9. Fairly obviously, PCRE2 does not support the (?{code}) and + classes by both PCRE2 and Perl. Another difference from Perl is that + any appearance of \Q or \E inside what might otherwise be a quantifier + causes PCRE2 not to recognize the sequence as a quantifier. Perl recog- + nizes a quantifier if (redundantly) either of the numbers is inside + \Q...\E, but not if the separating comma is. When not recognized as a + quantifier a sequence such as {\Q1\E,2} is treated as the literal + string "{1,2}". + + 9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. See the pcre2callout documentation for details. - 10. Subroutine calls (whether recursive or not) were treated as atomic - groups up to PCRE2 release 10.23, but from release 10.30 this changed, + 10. Subroutine calls (whether recursive or not) were treated as atomic + groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl. - 11. In PCRE2, if any of the backtracking control verbs are used in a - group that is called as a subroutine (whether or not recursively), - their effect is confined to that group; it does not extend to the sur- - rounding pattern. This is not always the case in Perl. In particular, - if (*THEN) is present in a group that is called as a subroutine, its + 11. In PCRE2, if any of the backtracking control verbs are used in a + group that is called as a subroutine (whether or not recursively), + their effect is confined to that group; it does not extend to the sur- + rounding pattern. This is not always the case in Perl. 
In particular, + if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any - | characters. Note that such groups are processed as anchored at the - point where they are tested. - - 12. If a pattern contains more than one backtracking control verb, the - first one that is backtracked onto acts. For example, in the pattern - A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure + | characters. Note that such groups are processed as anchored at the + point where they are tested. PCRE2 also confines all control verbs + within atomic assertions, again including (*THEN) in assertions with + only one branch. + + 12. If a pattern contains more than one backtracking control verb, the + first one that is backtracked onto acts. For example, in the pattern + A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs. - 13. There are some differences that are concerned with the settings of - captured strings when part of a pattern is repeated. For example, - matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- + 13. There are some differences that are concerned with the settings of + captured strings when part of a pattern is repeated. For example, + matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- set, but in PCRE2 it is set to "b". - 14. PCRE2's handling of duplicate capture group numbers and names is - not as general as Perl's. This is a consequence of the fact the PCRE2 - works internally just with numbers, using an external table to trans- - late between numbers and names. In particular, a pattern such as - (?|(?A)|(?B)), where the two capture groups have the same number - but different names, is not supported, and causes an error at compile + 14. 
PCRE2's handling of duplicate capture group numbers and names is + not as general as Perl's. This is a consequence of the fact that PCRE2 + works internally just with numbers, using an external table to trans- + late between numbers and names. In particular, a pattern such as + (?|(?<a>A)|(?<b>B)), where the two capture groups have the same number + but different names, is not supported, and causes an error at compile time. If it were allowed, it would not be possible to distinguish which - group matched, because both names map to capture group number 1. To + group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time. 15. Perl used to recognize comments in some places that PCRE2 does not, - for example, between the ( and ? at the start of a group. If the /x - modifier is set, Perl allowed white space between ( and ? though the - latest Perls give an error (for a while it was just deprecated). There + for example, between the ( and ? at the start of a group. If the /x + modifier is set, Perl allowed white space between ( and ? though the + latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently. - 16. Perl, when in warning mode, gives warnings for character classes - such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- + 16. Perl, when in warning mode, gives warnings for character classes + such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- als. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes. - 17. In PCRE2, the upper/lower case character properties Lu and Ll are - not affected when case-independent matching is specified. For example, - \p{Lu} always matches an upper case letter.
I think Perl has changed in - this respect; in the release at the time of writing (5.38), \p{Lu} and - \p{Ll} match all letters, regardless of case, when case independence is - specified. + 17. In PCRE2, until release 10.45, the upper/lower case character prop- + erties Lu and Ll were not affected when case-independent matching was + specified. Perl has changed in this respect, and PCRE2 has now changed + to match. When caseless matching is in force, Lu, Ll, and Lt (title + case) are all treated as Lc (cased letter). 18. From release 5.32.0, Perl locks out the use of \K in lookaround as- - sertions. From release 10.38 PCRE2 does the same by default. However, - there is an option for re-enabling the previous behaviour. When this - option is set, \K is acted on when it occurs in positive assertions, + sertions. From release 10.38 PCRE2 does the same by default. However, + there is an option for re-enabling the previous behaviour. When this + option is set, \K is acted on when it occurs in positive assertions, but is ignored in negative assertions. - 19. PCRE2 provides some extensions to the Perl regular expression fa- - cilities. Perl 5.10 included new features that were not in earlier - versions of Perl, some of which (such as named parentheses) were in + 19. PCRE2 provides some extensions to the Perl regular expression fa- + cilities. Perl 5.10 included new features that were not in earlier + versions of Perl, some of which (such as named parentheses) were in PCRE2 for some time before. This list is with respect to Perl 5.38: - (a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the + (a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ meta-character matches only at the very end of the string. - (b) A backslash followed by a letter with no special meaning is + (b) A backslash followed by a letter with no special meaning is faulted. (Perl can be made to issue a warning.) 
- (c) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- + (c) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- fiers is inverted, that is, by default they are not greedy, but if fol- lowed by a question mark they are. - (d) PCRE2_ANCHORED can be used at matching time to force a pattern to + (d) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string. - (e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and + (e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART options have no Perl equivalents. - (f) The \R escape sequence can be restricted to match only CR, LF, or + (f) The \R escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE2_BSR_ANYCRLF option. - (g) The callout facility is PCRE2-specific. Perl supports codeblocks + (g) The callout facility is PCRE2-specific. Perl supports codeblocks and variable interpolation, but not general hooks on every match. (h) The partial matching facility is PCRE2-specific. - (i) The alternative matching function (pcre2_dfa_match() matches in a + (i) The alternative matching function (pcre2_dfa_match()) matches in a different way and is not Perl-compatible. - (j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) - at the start of a pattern. These set overall options that cannot be + (j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) + at the start of a pattern. These set overall options that cannot be changed within the pattern. - (k) PCRE2 supports non-atomic positive lookaround assertions. This is + (k) PCRE2 supports non-atomic positive lookaround assertions. This is an extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic.
- (l) There are three syntactical items in patterns that can refer to a - capturing group by number: back references such as \g{2}, subroutine - calls such as (?3), and condition references such as (?(4)...). PCRE2 - supports relative group numbers such as +2 and -4 in all three cases. - Perl supports both plus and minus for subroutine calls, but only minus + (l) There are three syntactical items in patterns that can refer to a + capturing group by number: back references such as \g{2}, subroutine + calls such as (?3), and condition references such as (?(4)...). PCRE2 + supports relative group numbers such as +2 and -4 in all three cases. + Perl supports both plus and minus for subroutine calls, but only minus for back references, and no relative numbering at all for conditions. + (m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 ex- + tension that is not available in Perl. + 20. Perl has different limits than PCRE2. See the pcre2limit documenta- tion for details. Perl went with 5.10 from recursion to iteration keep- ing the intermediate matches on the heap, which is ~10% slower but does @@ -5297,6 +5585,17 @@ DIFFERENCES BETWEEN PCRE2 AND PERL ple is /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject. + 23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl + tries to recover and prints a warning if the problem was that an in- + valid hexadecimal digit was found. Since PCRE2 doesn't have warnings, it + returns an error instead. Additionally, Perl accepts \x{} and gener- + ates NUL, unlike PCRE2. + + 24. From release 10.45, PCRE2 gives an error if \x is not followed by a + hexadecimal digit or a curly bracket. It used to interpret this as the + NUL character. Perl still generates NUL, but warns when in warning mode + in most cases. + AUTHOR @@ -5307,15 +5606,14 @@ AUTHOR REVISION - Last updated: 30 November 2023 - Copyright (c) 1997-2023 University of Cambridge.
+ Last updated: 02 October 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 30 November 2023 PCRE2COMPAT(3) +PCRE2 10.45 02 October 2024 PCRE2COMPAT(3) ------------------------------------------------------------------------------ - PCRE2JIT(3) Library Functions Manual PCRE2JIT(3) @@ -5359,146 +5657,155 @@ AVAILABILITY OF JIT SUPPORT If --enable-jit is set on an unsupported platform, compilation fails. - A client program can tell if JIT support is available by calling + A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular match. One reason for this is that there are a number of op- tions and pattern items that are not supported by JIT (see below). An- - other reason is that in some environments JIT is unable to get memory - in which to build its compiled code. The only guarantee from pcre2_con- - fig() is that if it returns zero, JIT will definitely not be used. - - A simple program does not need to check availability in order to use - JIT when possible. The API is implemented in a way that falls back to - the interpretive code if JIT is not available or cannot be used for a - given match. For programs that need the best possible performance, + other reason is that in some environments JIT is unable to get exe- + cutable memory in which to build its compiled code. The only guarantee + from pcre2_config() is that if it returns zero, JIT will definitely not + be used. + + As of release 10.45 there is a more informative way to test for JIT + support. If pcre2_jit_compile() is called with the single option + PCRE2_JIT_TEST_ALLOC it returns zero if JIT is available and has a + working allocator.
Otherwise it returns PCRE2_ERROR_NOMEMORY if JIT is + available but cannot allocate executable memory, or PCRE2_ERROR_JIT_UN- + SUPPORTED if JIT support is not compiled. The code argument is ignored, + so it can be a NULL value. + + A simple program does not need to check availability in order to use + JIT when possible. The API is implemented in a way that falls back to + the interpretive code if JIT is not available or cannot be used for a + given match. For programs that need the best possible performance, there is a "fast path" API that is JIT-specific. SIMPLE USE OF JIT - To make use of the JIT support in the simplest way, all you have to do - is to call pcre2_jit_compile() after successfully compiling a pattern + To make use of the JIT support in the simplest way, all you have to do + is to call pcre2_jit_compile() after successfully compiling a pattern with pcre2_compile(). This function has two arguments: the first is the - compiled pattern pointer that was returned by pcre2_compile(), and the - second is zero or more of the following option bits: PCRE2_JIT_COM- + compiled pattern pointer that was returned by pcre2_compile(), and the + second is zero or more of the following option bits: PCRE2_JIT_COM- PLETE, PCRE2_JIT_PARTIAL_HARD, or PCRE2_JIT_PARTIAL_SOFT. - If JIT support is not available, a call to pcre2_jit_compile() does - nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled + If JIT support is not available, a call to pcre2_jit_compile() does + nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled pattern is passed to the JIT compiler, which turns it into machine code that executes much faster than the normal interpretive code, but yields - exactly the same results. The returned value from pcre2_jit_compile() + exactly the same results. The returned value from pcre2_jit_compile() is zero on success, or a negative error code. 
- There is a limit to the size of pattern that JIT supports, imposed by - the size of machine stack that it uses. The exact rules are not docu- + There is a limit to the size of pattern that JIT supports, imposed by + the size of machine stack that it uses. The exact rules are not docu- mented because they may change at any time, in particular, when new op- - timizations are introduced. If a pattern is too big, a call to + timizations are introduced. If a pattern is too big, a call to pcre2_jit_compile() returns PCRE2_ERROR_NOMEMORY. - PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for com- - plete matches. If you want to run partial matches using the PCRE2_PAR- - TIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should - set one or both of the other options as well as, or instead of + PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for com- + plete matches. If you want to run partial matches using the PCRE2_PAR- + TIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should + set one or both of the other options as well as, or instead of PCRE2_JIT_COMPLETE. The JIT compiler generates different optimized code - for each of the three modes (normal, soft partial, hard partial). When - pcre2_match() is called, the appropriate code is run if it is avail- + for each of the three modes (normal, soft partial, hard partial). When + pcre2_match() is called, the appropriate code is run if it is avail- able. Otherwise, the pattern is matched using interpretive code. - You can call pcre2_jit_compile() multiple times for the same compiled - pattern. It does nothing if it has previously compiled code for any of - the option bits. For example, you can call it once with PCRE2_JIT_COM- - PLETE and (perhaps later, when you find you need partial matching) - again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it + You can call pcre2_jit_compile() multiple times for the same compiled + pattern. 
It does nothing if it has previously compiled code for any of + the option bits. For example, you can call it once with PCRE2_JIT_COM- + PLETE and (perhaps later, when you find you need partial matching) + again with PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial match- ing. If pcre2_jit_compile() is called with no option bits set, it imme- diately returns zero. This is an alternative way of testing whether JIT - is available. + support has been compiled. - At present, it is not possible to free JIT compiled code except when + At present, it is not possible to free JIT compiled code except when the entire compiled pattern is freed by calling pcre2_code_free(). - In some circumstances you may need to call additional functions. These - are described in the section entitled "Controlling the JIT stack" be- + In some circumstances you may need to call additional functions. These + are described in the section entitled "Controlling the JIT stack" be- low. There are some pcre2_match() options that are not supported by JIT, and - there are also some pattern items that JIT cannot handle. Details are - given below. In both cases, matching automatically falls back to the - interpretive code. If you want to know whether JIT was actually used - for a particular match, you should arrange for a JIT callback function - to be set up as described in the section entitled "Controlling the JIT - stack" below, even if you do not need to supply a non-default JIT + there are also some pattern items that JIT cannot handle. Details are + given below. In both cases, matching automatically falls back to the + interpretive code. If you want to know whether JIT was actually used + for a particular match, you should arrange for a JIT callback function + to be set up as described in the section entitled "Controlling the JIT + stack" below, even if you do not need to supply a non-default JIT stack. 
Such a callback function is called whenever JIT code is about to - be obeyed. If the match-time options are not right for JIT execution, + be obeyed. If the match-time options are not right for JIT execution, the callback function is not obeyed. - If the JIT compiler finds an unsupported item, no JIT data is gener- + If the JIT compiler finds an unsupported item, no JIT data is gener- ated. You can find out if JIT compilation was successful for a compiled pattern by calling pcre2_pattern_info() with the PCRE2_INFO_JITSIZE op- - tion. A non-zero result means that JIT compilation was successful. A + tion. A non-zero result means that JIT compilation was successful. A result of 0 means that JIT support is not available, or the pattern was - not processed by pcre2_jit_compile(), or the JIT compiler was not able - to handle the pattern. Successful JIT compilation does not, however, - guarantee the use of JIT at match time because there are some match + not processed by pcre2_jit_compile(), or the JIT compiler was not able + to handle the pattern. Successful JIT compilation does not, however, + guarantee the use of JIT at match time because there are some match time options that are not supported by JIT. MATCHING SUBJECTS CONTAINING INVALID UTF - When a pattern is compiled with the PCRE2_UTF option, subject strings - are normally expected to be a valid sequence of UTF code units. By de- - fault, this is checked at the start of matching and an error is gener- - ated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be + When a pattern is compiled with the PCRE2_UTF option, subject strings + are normally expected to be a valid sequence of UTF code units. By de- + fault, this is checked at the start of matching and an error is gener- + ated if invalid UTF is detected. The PCRE2_NO_UTF_CHECK option can be passed to pcre2_match() to skip the check (for improved performance) if - you are sure that a subject string is valid. 
If this option is used - with an invalid string, the result is undefined. The calling program + you are sure that a subject string is valid. If this option is used + with an invalid string, the result is undefined. The calling program may crash or loop or otherwise misbehave. - However, a way of running matches on strings that may contain invalid - UTF sequences is available. Calling pcre2_compile() with the - PCRE2_MATCH_INVALID_UTF option has two effects: it tells the inter- - preter in pcre2_match() to support invalid UTF, and, if pcre2_jit_com- - pile() is subsequently called, the compiled JIT code also supports in- - valid UTF. Details of how this support works, in both the JIT and the + However, a way of running matches on strings that may contain invalid + UTF sequences is available. Calling pcre2_compile() with the + PCRE2_MATCH_INVALID_UTF option has two effects: it tells the inter- + preter in pcre2_match() to support invalid UTF, and, if pcre2_jit_com- + pile() is subsequently called, the compiled JIT code also supports in- + valid UTF. Details of how this support works, in both the JIT and the interpretive cases, is given in the pcre2unicode documentation. There is also an obsolete option for pcre2_jit_compile() called PCRE2_JIT_INVALID_UTF, which currently exists only for backward compat- - ibility. It is superseded by the pcre2_compile() option + ibility. It is superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF and should no longer be used. It may be removed in future. UNSUPPORTED OPTIONS AND PATTERN ITEMS - The pcre2_match() options that are supported for JIT matching are + The pcre2_match() options that are supported for JIT matching are PCRE2_COPY_MATCHED_SUBJECT, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, - PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and - PCRE2_PARTIAL_SOFT. 
The PCRE2_ANCHORED and PCRE2_ENDANCHORED options + PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and + PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED and PCRE2_ENDANCHORED options are not supported at match time. - If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the + If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the use of JIT, forcing matching by the interpreter code. - The only unsupported pattern items are \C (match a single data unit) - when running in a UTF mode, and a callout immediately before an asser- + The only unsupported pattern items are \C (match a single data unit) + when running in a UTF mode, and a callout immediately before an asser- tion condition in a conditional group. RETURN VALUES FROM JIT MATCHING - When a pattern is matched using JIT, the return values are the same as - those given by the interpretive pcre2_match() code, with the addition - of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the - memory used for the JIT stack was insufficient. See "Controlling the + When a pattern is matched using JIT, the return values are the same as + those given by the interpretive pcre2_match() code, with the addition + of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the + memory used for the JIT stack was insufficient. See "Controlling the JIT stack" below for a discussion of JIT stack usage. - The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if - searching a very large pattern tree goes on for too long, as it is in - the same circumstance when JIT is not used, but the details of exactly + The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if + searching a very large pattern tree goes on for too long, as it is in + the same circumstance when JIT is not used, but the details of exactly what is counted are not the same. The PCRE2_ERROR_DEPTHLIMIT error code is never returned when JIT matching is used. 
@@ -5506,25 +5813,25 @@ RETURN VALUES FROM JIT MATCHING CONTROLLING THE JIT STACK When the compiled JIT code runs, it needs a block of memory to use as a - stack. By default, it uses 32KiB on the machine stack. However, some - large or complicated patterns need more than this. The error PCRE2_ER- + stack. By default, it uses 32KiB on the machine stack. However, some + large or complicated patterns need more than this. The error PCRE2_ER- ROR_JIT_STACKLIMIT is given when there is not enough stack. Three func- tions are provided for managing blocks of memory for use as JIT stacks. - There is further discussion about the use of JIT stacks in the section + There is further discussion about the use of JIT stacks in the section entitled "JIT stack FAQ" below. - The pcre2_jit_stack_create() function creates a JIT stack. Its argu- - ments are a starting size, a maximum size, and a general context (for - memory allocation functions, or NULL for standard memory allocation). + The pcre2_jit_stack_create() function creates a JIT stack. Its argu- + ments are a starting size, a maximum size, and a general context (for + memory allocation functions, or NULL for standard memory allocation). It returns a pointer to an opaque structure of type pcre2_jit_stack, or - NULL if there is an error. The pcre2_jit_stack_free() function is used + NULL if there is an error. The pcre2_jit_stack_free() function is used to free a stack that is no longer needed. If its argument is NULL, this - function returns immediately, without doing anything. (For the techni- - cally minded: the address space is allocated by mmap or VirtualAlloc.) - A maximum stack size of 512KiB to 1MiB should be more than enough for + function returns immediately, without doing anything. (For the techni- + cally minded: the address space is allocated by mmap or VirtualAlloc.) + A maximum stack size of 512KiB to 1MiB should be more than enough for any pattern. 
- The pcre2_jit_stack_assign() function specifies which stack JIT code + The pcre2_jit_stack_assign() function specifies which stack JIT code should use. Its arguments are as follows: pcre2_match_context *mcontext @@ -5534,7 +5841,7 @@ CONTROLLING THE JIT STACK The first argument is a pointer to a match context. When this is subse- quently passed to a matching function, its information determines which JIT stack is used. If this argument is NULL, the function returns imme- - diately, without doing anything. There are three cases for the values + diately, without doing anything. There are three cases for the values of the other two options: (1) If callback is NULL and data is NULL, an internal 32KiB block @@ -5552,34 +5859,34 @@ CONTROLLING THE JIT STACK return value must be a valid JIT stack, the result of calling pcre2_jit_stack_create(). - A callback function is obeyed whenever JIT code is about to be run; it + A callback function is obeyed whenever JIT code is about to be run; it is not obeyed when pcre2_match() is called with options that are incom- - patible for JIT matching. A callback function can therefore be used to - determine whether a match operation was executed by JIT or by the in- + patible for JIT matching. A callback function can therefore be used to + determine whether a match operation was executed by JIT or by the in- terpreter. You may safely use the same JIT stack for more than one pattern (either - by assigning directly or by callback), as long as the patterns are + by assigning directly or by callback), as long as the patterns are matched sequentially in the same thread. 
Currently, the only way to set - up non-sequential matches in one thread is to use callouts: if a call- - out function starts another match, that match must use a different JIT + up non-sequential matches in one thread is to use callouts: if a call- + out function starts another match, that match must use a different JIT stack to the one used for currently suspended match(es). - In a multithread application, if you do not specify a JIT stack, or if - you assign or pass back NULL from a callback, that is thread-safe, be- - cause each thread has its own machine stack. However, if you assign or + In a multithread application, if you do not specify a JIT stack, or if + you assign or pass back NULL from a callback, that is thread-safe, be- + cause each thread has its own machine stack. However, if you assign or pass back a non-NULL JIT stack, this must be a different stack for each thread so that the application is thread-safe. - Strictly speaking, even more is allowed. You can assign the same non- - NULL stack to a match context that is used by any number of patterns, - as long as they are not used for matching by multiple threads at the - same time. For example, you could use the same stack in all compiled - patterns, with a global mutex in the callback to wait until the stack + Strictly speaking, even more is allowed. You can assign the same non- + NULL stack to a match context that is used by any number of patterns, + as long as they are not used for matching by multiple threads at the + same time. For example, you could use the same stack in all compiled + patterns, with a global mutex in the callback to wait until the stack is available for use. However, this is an inefficient solution, and not recommended. 
- This is a suggestion for how a multithreaded program that needs to set + This is a suggestion for how a multithreaded program that needs to set up non-default JIT stacks might operate: During thread initialization @@ -5591,7 +5898,7 @@ CONTROLLING THE JIT STACK Use a one-line callback function return thread_local_var - All the functions described in this section do nothing if JIT is not + All the functions described in this section do nothing if JIT is not available. @@ -5600,20 +5907,20 @@ JIT STACK FAQ (1) Why do we need JIT stacks? PCRE2 (and JIT) is a recursive, depth-first engine, so it needs a stack - where the local data of the current node is pushed before checking its + where the local data of the current node is pushed before checking its child nodes. Allocating real machine stack on some platforms is diffi- cult. For example, the stack chain needs to be updated every time if we - extend the stack on PowerPC. Although it is possible, its updating + extend the stack on PowerPC. Although it is possible, its updating time overhead decreases performance. So we do the recursion in memory. (2) Why don't we simply allocate blocks of memory with malloc()? - Modern operating systems have a nice feature: they can reserve an ad- + Modern operating systems have a nice feature: they can reserve an ad- dress space instead of allocating memory. We can safely allocate memory pages inside this address space, so the stack could grow without moving - memory data (this is important because of pointers). Thus we can allo- - cate 1MiB address space, and use only a single memory page (usually - 4KiB) if that is enough. However, we can still grow up to 1MiB anytime + memory data (this is important because of pointers). Thus we can allo- + cate 1MiB address space, and use only a single memory page (usually + 4KiB) if that is enough. However, we can still grow up to 1MiB anytime if needed. (3) Who "owns" a JIT stack? 
@@ -5621,8 +5928,8 @@ JIT STACK FAQ The owner of the stack is the user program, not the JIT studied pattern or anything else. The user program must ensure that if a stack is being used by pcre2_match(), (that is, it is assigned to a match context that - is passed to the pattern currently running), that stack must not be - used by any other threads (to avoid overwriting the same memory area). + is passed to the pattern currently running), that stack must not be + used by any other threads (to avoid overwriting the same memory area). The best practice for multithreaded programs is to allocate a stack for each thread, and return this stack through the JIT callback function. @@ -5630,36 +5937,36 @@ JIT STACK FAQ You can free a JIT stack at any time, as long as it will not be used by pcre2_match() again. When you assign the stack to a match context, only - a pointer is set. There is no reference counting or any other magic. + a pointer is set. There is no reference counting or any other magic. You can free compiled patterns, contexts, and stacks in any order, any- - time. Just do not call pcre2_match() with a match context pointing to + time. Just do not call pcre2_match() with a match context pointing to an already freed stack, as that will cause SEGFAULT. (Also, do not free - a stack currently used by pcre2_match() in another thread). You can - also replace the stack in a context at any time when it is not in use. + a stack currently used by pcre2_match() in another thread). You can + also replace the stack in a context at any time when it is not in use. You should free the previous stack before assigning a replacement. - (5) Should I allocate/free a stack every time before/after calling + (5) Should I allocate/free a stack every time before/after calling pcre2_match()? - No, because this is too costly in terms of resources. However, you - could implement some clever idea which release the stack if it is not - used in let's say two minutes. 
The JIT callback can help to achieve + No, because this is too costly in terms of resources. However, you + could implement some clever idea which release the stack if it is not + used in let's say two minutes. The JIT callback can help to achieve this without keeping a list of patterns. - (6) OK, the stack is for long term memory allocation. But what happens - if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB + (6) OK, the stack is for long term memory allocation. But what happens + if a pattern causes stack overflow with a stack of 1MiB? Is that 1MiB kept until the stack is freed? Especially on embedded systems, it might be a good idea to release mem- - ory sometimes without freeing the stack. There is no API for this at - the moment. Probably a function call which returns with the currently - allocated memory for any stack and another which allows releasing mem- + ory sometimes without freeing the stack. There is no API for this at + the moment. Probably a function call which returns with the currently + allocated memory for any stack and another which allows releasing mem- ory (shrinking the stack) would be a good idea if someone needs this. (7) This is too much of a headache. Isn't there any better solution for JIT stack handling? - No, thanks to Windows. If POSIX threads were used everywhere, we could + No, thanks to Windows. If POSIX threads were used everywhere, we could throw out this complicated API. @@ -5668,18 +5975,18 @@ FREEING JIT SPECULATIVE MEMORY void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); The JIT executable allocator does not free all memory when it is possi- - ble. It expects new allocations, and keeps some free memory around to - improve allocation speed. However, in low memory conditions, it might - be better to free all possible memory. You can cause this to happen by - calling pcre2_jit_free_unused_memory(). Its argument is a general con- + ble. 
It expects new allocations, and keeps some free memory around to + improve allocation speed. However, in low memory conditions, it might + be better to free all possible memory. You can cause this to happen by + calling pcre2_jit_free_unused_memory(). Its argument is a general con- text, for custom memory management, or NULL for standard memory manage- ment. EXAMPLE CODE - This is a single-threaded example that specifies a JIT stack without - using a callback. A real program should include error checking after + This is a single-threaded example that specifies a JIT stack without + using a callback. A real program should include error checking after all the function calls. int rc; @@ -5707,36 +6014,36 @@ EXAMPLE CODE JIT FAST PATH API Because the API described above falls back to interpreted matching when - JIT is not available, it is convenient for programs that are written + JIT is not available, it is convenient for programs that are written for general use in many environments. However, calling JIT via pcre2_match() does have a performance impact. Programs that are written - for use where JIT is known to be available, and which need the best - possible performance, can instead use a "fast path" API to call JIT - matching directly instead of calling pcre2_match() (obviously only for + for use where JIT is known to be available, and which need the best + possible performance, can instead use a "fast path" API to call JIT + matching directly instead of calling pcre2_match() (obviously only for patterns that have been successfully processed by pcre2_jit_compile()). - The fast path function is called pcre2_jit_match(), and it takes ex- - actly the same arguments as pcre2_match(). However, the subject string - must be specified with a length; PCRE2_ZERO_TERMINATED is not sup- + The fast path function is called pcre2_jit_match(), and it takes ex- + actly the same arguments as pcre2_match(). 
However, the subject string + must be specified with a length; PCRE2_ZERO_TERMINATED is not sup- ported. Unsupported option bits (for example, PCRE2_ANCHORED and - PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The re- - turn values are also the same as for pcre2_match(), plus PCRE2_ER- + PCRE2_ENDANCHORED) are ignored, as is the PCRE2_NO_JIT option. The re- + turn values are also the same as for pcre2_match(), plus PCRE2_ER- ROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. - When you call pcre2_match(), as well as testing for invalid options, a + When you call pcre2_match(), as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For exam- - ple, if the subject pointer is NULL but the length is non-zero, an im- - mediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF + ple, if the subject pointer is NULL but the length is non-zero, an im- + mediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the interests of speed, these - checks do not happen on the JIT fast path. If invalid UTF data is - passed when PCRE2_MATCH_INVALID_UTF was not set for pcre2_compile(), - the result is undefined. The program may crash or loop or give wrong - results. In the absence of PCRE2_MATCH_INVALID_UTF you should call - pcre2_jit_match() in UTF mode only if you are sure the subject is + checks do not happen on the JIT fast path. If invalid UTF data is + passed when PCRE2_MATCH_INVALID_UTF was not set for pcre2_compile(), + the result is undefined. The program may crash or loop or give wrong + results. In the absence of PCRE2_MATCH_INVALID_UTF you should call + pcre2_jit_match() in UTF mode only if you are sure the subject is valid. - Bypassing the sanity checks and the pcre2_match() wrapping can give + Bypassing the sanity checks and the pcre2_match() wrapping can give speedups of more than 10%. 
@@ -5754,15 +6061,14 @@ AUTHOR REVISION - Last updated: 21 February 2024 + Last updated: 22 August 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 21 February 2024 PCRE2JIT(3) +PCRE2 10.45 22 August 2024 PCRE2JIT(3) ------------------------------------------------------------------------------ - PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3) @@ -5838,15 +6144,14 @@ AUTHOR REVISION - Last updated: August 2023 + Last updated: 16 August 2023 Copyright (c) 1997-2023 University of Cambridge. -PCRE2 10.43 1 August 2023 PCRE2LIMITS(3) +PCRE2 10.45 16 August 2023 PCRE2LIMITS(3) ------------------------------------------------------------------------------ - PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3) @@ -5860,7 +6165,7 @@ PCRE2 MATCHING ALGORITHMS in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() function. This works in the same as Perl's matching func- - tion, and provide a Perl-compatible matching operation. The just-in- + tion, and provides a Perl-compatible matching operation. The just-in- time (JIT) optimization that is described in the pcre2jit documentation is compatible with this function. @@ -5872,7 +6177,7 @@ PCRE2 MATCHING ALGORITHMS When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, when there are multiple possibilities. For example, if - the pattern + the anchored pattern ^<.*> @@ -5948,83 +6253,86 @@ THE ALTERNATIVE MATCHING ALGORITHM first match (which is necessarily the shortest) is found. Note that the size of vector needed to contain all the results depends - on the number of simultaneous matches, not on the number of parentheses - in the pattern. Using pcre2_match_data_create_from_pattern() to create - the match data block is therefore not advisable when doing DFA match- - ing. 
+ on the number of simultaneous matches, not on the number of capturing + parentheses in the pattern. Using pcre2_match_data_create_from_pat- + tern() to create the match data block is therefore not advisable when + doing DFA matching. - Note also that all the matches that are found start at the same point + Note also that all the matches that are found start at the same point in the subject. If the pattern cat(er(pillar)?)? - is matched against the string "the caterpillar catchment", the result - is the three strings "caterpillar", "cater", and "cat" that start at - the fifth character of the subject. The algorithm does not automati- + is matched against the string "the caterpillar catchment", the result + is the three strings "caterpillar", "cater", and "cat" that start at + the fifth character of the subject. The algorithm does not automati- cally move on to find matches that start at later positions. PCRE2's "auto-possessification" optimization usually applies to charac- - ter repeats at the end of a pattern (as well as internally). For exam- + ter repeats at the end of a pattern (as well as internally). For exam- ple, the pattern "a\d+" is compiled as if it were "a\d++" because there - is no point even considering the possibility of backtracking into the - repeated digits. For DFA matching, this means that only one possible - match is found. If you really do want multiple matches in such cases, - either use an ungreedy repeat ("a\d+?") or set the PCRE2_NO_AUTO_POS- + is no point even considering the possibility of backtracking into the + repeated digits. For DFA matching, this means that only one possible + match is found. If you really do want multiple matches in such cases, + either use an ungreedy repeat ("a\d+?") or set the PCRE2_NO_AUTO_POS- SESS option when compiling. 
- There are a number of features of PCRE2 regular expressions that are - not supported or behave differently in the alternative matching func- + There are a number of features of PCRE2 regular expressions that are + not supported or behave differently in the alternative matching func- tion. Those that are not supported cause an error if encountered. - 1. Because the algorithm finds all possible matches, the greedy or un- - greedy nature of repetition quantifiers is not relevant (though it may - affect auto-possessification, as just described). During matching, - greedy and ungreedy quantifiers are treated in exactly the same way. + 1. Because the algorithm finds all possible matches, the greedy or un- + greedy nature of repetition quantifiers is not relevant (though it may + affect auto-possessification, as just described). During matching, + greedy and ungreedy quantifiers are treated in exactly the same way. However, possessive quantifiers can make a difference when what follows - could also match what is quantified, for example in a pattern like + could also match what is quantified, for example in a pattern like this: ^a++\w! - This pattern matches "aaab!" but not "aaa!", which would be matched by - a non-possessive quantifier. Similarly, if an atomic group is present, - it is matched as if it were a standalone pattern at the current point, - and the longest match is then "locked in" for the rest of the overall + This pattern matches "aaab!" but not "aaa!", which would be matched by + a non-possessive quantifier. Similarly, if an atomic group is present, + it is matched as if it were a standalone pattern at the current point, + and the longest match is then "locked in" for the rest of the overall pattern. 2. 
When dealing with multiple paths through the tree simultaneously, it - is not straightforward to keep track of captured substrings for the - different matching possibilities, and PCRE2's implementation of this + is not straightforward to keep track of captured substrings for the + different matching possibilities, and PCRE2's implementation of this algorithm does not attempt to do this. This means that no captured sub- strings are available. - 3. Because no substrings are captured, backreferences within the pat- - tern are not supported. + 3. Because no substrings are captured, a number of related features are + not available: - 4. For the same reason, conditional expressions that use a backrefer- - ence as the condition or test for a specific group recursion are not - supported. + (a) Backreferences; - 5. Again for the same reason, script runs are not supported. + (b) Conditional expressions that use a backreference as the condition + or test for a specific group recursion; - 6. Because many paths through the tree may be active, the \K escape se- - quence, which resets the start of the match when encountered (but may + (c) Script runs; + + (d) Scan substring assertions. + + 4. Because many paths through the tree may be active, the \K escape se- + quence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported. - 7. Callouts are supported, but the value of the capture_top field is + 5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0. - 8. The \C escape sequence, which (in the standard algorithm) always - matches a single code unit, even in a UTF mode, is not supported in - these modes, because the alternative algorithm moves through the sub- - ject string one character (not code unit) at a time, for all active - paths through the tree. + 6. 
The \C escape sequence, which (in the standard algorithm) always + matches a single code unit, even in a UTF mode, is not supported in UTF + modes because the alternative algorithm moves through the subject + string one character (not code unit) at a time, for all active paths + through the tree. - 9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) + 7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion. - 10. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not sup- + 8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not sup- ported by pcre2_dfa_match(). @@ -6049,13 +6357,15 @@ DISADVANTAGES OF THE ALTERNATIVE ALGORITHM partly because it has to search for all possible matches, but is also because it is less susceptible to optimization. - 2. Capturing parentheses, backreferences, script runs, and matching - within invalid UTF string are not supported. + 2. Capturing parentheses and other features such as backreferences that + rely on them are not supported. - 3. Although atomic groups are supported, their use does not provide the + 3. Matching within invalid UTF strings is not supported. + + 4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm. - 4. JIT optimization is not supported. + 5. JIT optimization is not supported. AUTHOR @@ -6067,20 +6377,19 @@ AUTHOR REVISION - Last updated: 19 January 2024 + Last updated: 30 August 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE2 10.43 19 January 2024 PCRE2MATCHING(3) +PCRE2 10.45 30 August 2024 PCRE2MATCHING(3) ------------------------------------------------------------------------------ - PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3) NAME - PCRE2 - Perl-compatible regular expressions + PCRE2 - Perl-compatible regular expressions (revised API) PARTIAL MATCHING IN PCRE2 @@ -6451,15 +6760,14 @@ AUTHOR REVISION - Last updated: 04 September 2019 + Last updated: 27 November 2024 Copyright (c) 1997-2019 University of Cambridge. -PCRE2 10.34 04 September 2019 PCRE2PARTIAL(3) +PCRE2 10.45 27 November 2024 PCRE2PARTIAL(3) ------------------------------------------------------------------------------ - PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3) @@ -6473,9 +6781,11 @@ PCRE2 REGULAR EXPRESSION DETAILS by PCRE2 are described in detail below. There is a quick-reference syn- tax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. PCRE2 also supports some alterna- - tive regular expression syntax (which does not conflict with the Perl - syntax) in order to provide some compatibility with regular expressions - in Python, .NET, and Oniguruma. + tive regular expression syntax that does not conflict with the Perl + syntax in order to provide some compatibility with regular expressions + in Python, .NET, and Oniguruma. There are in addition some options that + enable alternative syntax and semantics that are not the same as in + Perl. Perl's regular expressions are described in its own documentation, and regular expressions in general are covered in a number of books, some @@ -6494,82 +6804,98 @@ PCRE2 REGULAR EXPRESSION DETAILS tion, are discussed in the pcre2matching page. +EBCDIC CHARACTER CODES + + Most computers use ASCII or Unicode for encoding characters, and PCRE2 + assumes this by default. 
However, it can be compiled to run in an envi- + ronment that uses the EBCDIC code, which is the case for some IBM main- + frame operating systems. In the sections below, character code values + are ASCII or Unicode; in an EBCDIC environment these characters may + have different code values, and there are no code points greater than + 255. Differences in behaviour when PCRE2 is running in an EBCDIC envi- + ronment are described in the section "EBCDIC environments" below, which + you can ignore unless you really are in an EBCDIC environment. + + SPECIAL START-OF-PATTERN ITEMS - A number of options that can be passed to pcre2_compile() can also be + A number of options that can be passed to pcre2_compile() can also be set by special items at the start of a pattern. These are not Perl-com- - patible, but are provided to make these options accessible to pattern - writers who are not able to change the program that processes the pat- - tern. Any number of these items may appear, but they must all be to- - gether right at the start of the pattern string, and the letters must + patible, but are provided to make these options accessible to pattern + writers who are not able to change the program that processes the pat- + tern. Any number of these items may appear, but they must all be to- + gether right at the start of the pattern string, and the letters must be in upper case. UTF support In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 - can be specified for the 32-bit library, in which case it constrains - the character values to valid Unicode code points. To process UTF - strings, PCRE2 must be built to include Unicode support (which is the - default). 
When using UTF strings you must either call the compiling - function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF - options, or the pattern must start with the special sequence (*UTF), - which is equivalent to setting the relevant PCRE2_UTF. How setting a + can be specified for the 32-bit library, in which case it constrains + the character values to valid Unicode code points. To process UTF + strings, PCRE2 must be built to include Unicode support (which is the + default). When using UTF strings you must either call the compiling + function with one or both of the PCRE2_UTF or PCRE2_MATCH_INVALID_UTF + options, or the pattern must start with the special sequence (*UTF), + which is equivalent to setting the relevant PCRE2_UTF. How setting a UTF mode affects pattern matching is mentioned in several places below. There is also a summary of features in the pcre2unicode page. Some applications that allow their users to supply patterns may wish to - restrict them to non-UTF data for security reasons. If the - PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not al- + restrict them to non-UTF data for security reasons. If the + PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not al- lowed, and its appearance in a pattern causes an error. Unicode property support - Another special sequence that may appear at the start of a pattern is - (*UCP). This has the same effect as setting the PCRE2_UCP option: it - causes sequences such as \d and \w to use Unicode properties to deter- + Another special sequence that may appear at the start of a pattern is + (*UCP). This has the same effect as setting the PCRE2_UCP option: it + causes sequences such as \d and \w to use Unicode properties to deter- mine character types, instead of recognizing only characters with codes less than 256 via a lookup table. 
If also causes upper/lower casing op- - erations to use Unicode properties for characters with code points - greater than 127, even when UTF is not set. These behaviours can be - changed within the pattern; see the section entitled "Internal Option + erations to use Unicode properties for characters with code points + greater than 127, even when UTF is not set. These behaviours can be + changed within the pattern; see the section entitled "Internal Option Setting" below. Some applications that allow their users to supply patterns may wish to - restrict them for security reasons. If the PCRE2_NEVER_UCP option is + restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to pcre2_compile(), (*UCP) is not allowed, and its appearance in a pattern causes an error. Locking out empty string matching Starting a pattern with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) has the same - effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option + effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option to whichever matching function is subsequently called to match the pat- - tern. These options lock out the matching of empty strings, either en- + tern. These options lock out the matching of empty strings, either en- tirely, or only at the start of the subject. Disabling auto-possessification - If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as - setting the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making - quantifiers possessive when what follows cannot match the repeated - item. For example, by default a+b is treated as a++b. For more details, - see the pcre2api documentation. + If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as + setting the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_opti- + mize() with a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from + making quantifiers possessive when what follows cannot match the re- + peated item. 
For example, by default a+b is treated as a++b. For more + details, see the pcre2api documentation. Disabling start-up optimizations - If a pattern starts with (*NO_START_OPT), it has the same effect as - setting the PCRE2_NO_START_OPTIMIZE option. This disables several opti- - mizations for quickly reaching "no match" results. For more details, - see the pcre2api documentation. + If a pattern starts with (*NO_START_OPT), it has the same effect as + setting the PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_opti- + mize() with a PCRE2_START_OPTIMIZE_OFF directive. This disables several + optimizations for quickly reaching "no match" results. For more de- + tails, see the pcre2api documentation. Disabling automatic anchoring If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect - as setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimiza- - tions that apply to patterns whose top-level branches all start with .* - (match any number of arbitrary characters). For more details, see the - pcre2api documentation. + as setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_op- + timize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables opti- + mizations that apply to patterns whose top-level branches all start + with .* (match any number of arbitrary characters). For more details, + see the pcre2api documentation. Disabling JIT compilation @@ -6666,33 +6992,27 @@ SPECIAL START-OF-PATTERN ITEMS CODE) is also recognized, corresponding to PCRE2_BSR_UNICODE. -EBCDIC CHARACTER CODES - - PCRE2 can be compiled to run in an environment that uses EBCDIC as its - character code instead of ASCII or Unicode (typically a mainframe sys- - tem). In the sections below, character code values are ASCII or Uni- - code; in an EBCDIC environment these characters may have different code - values, and there are no code points greater than 255. 
- - CHARACTERS AND METACHARACTERS - A regular expression is a pattern that is matched against a subject - string from left to right. Most characters stand for themselves in a - pattern, and match the corresponding characters in the subject. As a + A regular expression is a pattern that is matched against a subject + string from left to right. Most characters stand for themselves in a + pattern, and match the corresponding characters in the subject. As a trivial example, the pattern The quick brown fox matches a portion of a subject string that is identical to itself. When - caseless matching is specified (the PCRE2_CASELESS option or (?i) - within the pattern), letters are matched independently of case. Note - that there are two ASCII characters, K and S, that, in addition to - their lower case ASCII equivalents, are case-equivalent with Unicode - U+212A (Kelvin sign) and U+017F (long S) respectively when either + caseless matching is specified (the PCRE2_CASELESS option or (?i) + within the pattern), letters are matched independently of case. Note + that there are two ASCII characters, K and S, that, in addition to + their lower case ASCII equivalents, are case-equivalent with Unicode + U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT - option is in force (either passed to pcre2_compile() or set by (?r) - within the pattern). + option is in force (either passed to pcre2_compile() or set by (*CASE- + LESS_RESTRICT) or (?r) within the pattern). If the PCRE2_EXTRA_TURK- + ISH_CASING option is in force (either passed to pcre2_compile() or set + by (*TURKISH_CASING) within the pattern), then the 'i' letters are + matched according to Turkish and Azeri languages. The power of regular expressions comes from the ability to include wild cards, character classes, alternatives, and repetitions in the pattern. 
@@ -6739,7 +7059,7 @@ CHARACTERS AND METACHARACTERS If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or between a # outside a character class and the next new- - line, inclusive, are ignored. An escaping backslash can be used to in- + line, inclusive, is ignored. An escaping backslash can be used to in- clude a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are ignored inside a @@ -6797,6 +7117,13 @@ BACKSLASH error, because the character class is then not terminated by a closing square bracket. + Another difference from Perl is that any appearance of \Q or \E inside + what might otherwise be a quantifier causes PCRE2 not to recognize the + sequence as a quantifier. Perl recognizes a quantifier if (redundantly) + either of the numbers is inside \Q...\E, but not if the separating + comma is. When not recognized as a quantifier a sequence such as + {\Q1\E,2} is treated as the literal string "{1,2}". + Non-printing characters A second use of backslash provides a way of encoding non-printing char- @@ -6815,115 +7142,107 @@ BACKSLASH \r carriage return (hex 0D) (but see below) \t tab (hex 09) \0dd character with octal code 0dd - \ddd character with octal code ddd, or backreference + \ddd character with octal code ddd, or back reference \o{ddd..} character with octal code ddd.. \xhh character with hex code hh \x{hhh..} character with hex code hhh.. \N{U+hhh..} character with Unicode hex code point hhh.. - By default, after \x that is not followed by {, from zero to two hexa- - decimal digits are read (letters can be in upper or lower case). Any - number of hexadecimal digits may appear between \x{ and }. If a charac- - ter other than a hexadecimal digit appears between \x{ and }, or if - there is no terminating }, an error occurs. 
+ A description of how back references work is given later, following the + discussion of parenthesized groups. + + By default, after \x that is not followed by {, one or two hexadecimal + digits are read (letters can be in upper or lower case). If the charac- + ter that follows \x is neither { nor a hexadecimal digit, an error oc- + curs. This is different from Perl's default behaviour, which generates + a NUL character, but is in line with the behaviour of Perl's 'strict' + mode in re. + + Any number of hexadecimal digits may appear between \x{ and }. If a + character other than a hexadecimal digit appears between \x{ and }, or + if there is no terminating }, an error occurs. Characters whose code points are less than 256 can be defined by either of the two syntaxes for \x or by an octal sequence. There is no differ- ence in the way they are handled. For example, \xdc is exactly the same - as \x{dc} or \334. However, using the braced versions does make such + as \x{dc} or \334. However, using the braced versions does make such sequences easier to read. - Support is available for some ECMAScript (aka JavaScript) escape se- + Support is available for some ECMAScript (aka JavaScript) escape se- quences via two compile-time options. If PCRE2_ALT_BSUX is set, the se- - quence \x followed by { is not recognized. Only if \x is followed by - two hexadecimal digits is it recognized as a character escape. Other- - wise it is interpreted as a literal "x" character. In this mode, sup- - port for code points greater than 256 is provided by \u, which must be - followed by four hexadecimal digits; otherwise it is interpreted as a + quence \x followed by { is not recognized. Only if \x is followed by + two hexadecimal digits is it recognized as a character escape. Other- + wise it is interpreted as a literal "x" character. 
In this mode, sup- + port for code points greater than 256 is provided by \u, which must be + followed by four hexadecimal digits; otherwise it is interpreted as a literal "u" character. - PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in ad- + PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in ad- dition, \u{hhh..} is recognized as the character specified by hexadeci- mal code point. There may be any number of hexadecimal digits, but un- - like other places that also use curly brackets, spaces are not allowed - and would result in the string being interpreted as a literal. This + like other places that also use curly brackets, spaces are not allowed + and would result in the string being interpreted as a literal. This syntax is from ECMAScript 6. - The \N{U+hhh..} escape sequence is recognized only when PCRE2 is oper- - ating in UTF mode. Perl also uses \N{name} to specify characters by - Unicode name; PCRE2 does not support this. Note that when \N is not + The \N{U+hhh..} escape sequence is recognized only when PCRE2 is oper- + ating in UTF mode. Perl also uses \N{name} to specify characters by + Unicode name; PCRE2 does not support this. Note that when \N is not followed by an opening brace (curly bracket) it has an entirely differ- ent meaning, matching any character that is not a newline. - There are some legacy applications where the escape sequence \r is ex- - pected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option - is set, \r in a pattern is converted to \n so that it matches a LF + There are some legacy applications where the escape sequence \r is ex- + pected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option + is set, \r in a pattern is converted to \n so that it matches a LF (linefeed) instead of a CR (carriage return) character. - An error occurs if \c is not followed by a character whose ASCII code - point is in the range 32 to 126. 
The precise effect of \cx is as fol- - lows: if x is a lower case letter, it is converted to upper case. Then + An error occurs if \c is not followed by a character whose ASCII code + point is in the range 32 to 126. The precise effect of \cx is as fol- + lows: if x is a lower case letter, it is converted to upper case. Then bit 6 of the character (hex 40) is inverted. Thus \cA to \cZ become hex - 01 to hex 1A (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and - \c; becomes hex 7B (; is 3B). If the code unit following \c has a code + 01 to hex 1A (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and + \c; becomes hex 7B (; is 3B). If the code unit following \c has a code point less than 32 or greater than 126, a compile-time error occurs. - When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. - \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. - The \c escape is processed as specified for Perl in the perlebcdic doc- - ument. The only characters that are allowed after \c are A-Z, a-z, or - one of @, [, \, ], ^, _, or ?. Any other character provokes a compile- - time error. The sequence \c@ encodes character code 0; after \c the - letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, - \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \c? be- - comes either 255 (hex FF) or 95 (hex 5F). + For differences in the way some escapes behave in EBCDIC environments, + see section "EBCDIC environments" below. - Thus, apart from \c?, these escapes generate the same character code - values as they do in an ASCII environment, though the meanings of the - values mostly differ. For example, \cG always generates code value 7, - which is BEL in ASCII but DEL in EBCDIC. + Octal escapes and back references - The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, - but because 127 is not a control character in EBCDIC, Perl makes it - generate the APC character. 
Unfortunately, there are several variants - of EBCDIC. In most of them the APC character has the value 255 (hex - FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If - certain other characters have POSIX-BC values, PCRE2 makes \c? generate - 95; otherwise it generates 255. + The escape \o must be followed by a sequence of octal digits, enclosed + in braces. An error occurs if this is not the case. This escape pro- + vides a way of specifying character code points as octal numbers + greater than 0777, and it also allows octal numbers and backreferences + to be unambiguously distinguished. - After \0 up to two further octal digits are read. If there are fewer - than two digits, just those that are present are used. Thus the se- - quence \0\x\015 specifies two binary zeros followed by a CR character - (code value 13). Make sure you supply two digits after the initial zero - if the pattern character that follows is itself an octal digit. + If braces are not used, after \0 up to two further octal digits are + read. However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one + more octal digit must follow \0 (use \00 to generate a NUL character). + Make sure you supply two digits after the initial zero if the pattern + character that follows is itself an octal digit. - The escape \o must be followed by a sequence of octal digits, enclosed - in braces. An error occurs if this is not the case. This escape is a - recent addition to Perl; it provides way of specifying character code - points as octal numbers greater than 0777, and it also allows octal - numbers and backreferences to be unambiguously specified. + Inside a character class, when a backslash is followed by any octal + digit, up to three octal digits are read to generate a code point. Any + subsequent digits stand for themselves. The sequences \8 and \9 are + treated as the literal characters "8" and "9". 
+ + Outside a character class, Perl's handling of a backslash followed by a + digit other than 0 is complicated by ambiguity, and Perl has changed + over time, causing PCRE2 also to change. From PCRE2 release 10.45 there + is an option called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use + Python's unambiguous rules. The next two subsections describe the two + sets of rules. For greater clarity and unambiguity, it is best to avoid following \ by - a digit greater than zero. Instead, use \o{...} or \x{...} to specify + a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical character code points, and \g{...} to specify backreferences. - The following paragraphs describe the old, ambiguous syntax. - - The handling of a backslash followed by a digit other than 0 is compli- - cated, and Perl has changed over time, causing PCRE2 also to change. - Outside a character class, PCRE2 reads the digit and any following dig- - its as a decimal number. If the number is less than 10, begins with the - digit 8 or 9, or if there are at least that many previous capture - groups in the expression, the entire sequence is taken as a backrefer- - ence. A description of how this works is given later, following the - discussion of parenthesized groups. Otherwise, up to three octal dig- - its are read to form a character code. + Perl rules for non-class backslash 1-9 - Inside a character class, PCRE2 handles \8 and \9 as the literal char- - acters "8" and "9", and otherwise reads up to three octal digits fol- - lowing the backslash, using them to generate a data character. Any sub- - sequent digits stand for themselves. For example, outside a character - class: + All the digits that follow the backslash are read as a decimal number. + If the number is less than 10, begins with the digit 8 or 9, or if + there are at least that many previous capture groups in the expression, + the entire sequence is taken as a back reference. 
Otherwise, up to + three octal digits are read to form a character code. For example: \040 is another way of writing an ASCII space \40 is the same, provided there are fewer than 40 @@ -6939,10 +7258,21 @@ BACKSLASH the value 255 (decimal) \81 is always a backreference - Note that octal values of 100 or greater that are specified using this - syntax must not be introduced by a leading zero, because no more than + Note that octal values of 100 or greater that are specified using this + syntax must not be introduced by a leading zero, because no more than three octal digits are ever read. + Python rules for non-class backslash 1-9 + + If there are at least three octal digits after the backslash, exactly + three are read as an octal code point number, but the value must be no + greater than \377, even in modes where higher code point values are + supported. Any subsequent digits stand for themselves. If there are + fewer than three octal digits, the sequence is taken as a decimal back + reference. Thus, for example, \12 is always a back reference, indepen- + dent of how many captures there are in the pattern. An error is gener- + ated for a reference to a non-existent capturing group. + Constraints on character values Characters that are specified using octal or hexadecimal numbers are @@ -7161,7 +7491,7 @@ BACKSLASH tional escape sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing charac- - ters whose code points are less than U+0100 and U+10000, respectively. + ters whose code points are less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Un- known script and with an unassigned type.
@@ -7179,15 +7509,34 @@ BACKSLASH \P{xx} a character without the xx property \X a Unicode extended grapheme cluster - The property names represented by xx above are not case-sensitive, and - in accordance with Unicode's "loose matching" rules, spaces, hyphens, - and underscores are ignored. There is support for Unicode script names, - Unicode general category properties, "Any", which matches any character - (including newline), Bidi_Class, a number of binary (yes/no) proper- - ties, and some special PCRE2 properties (described below). Certain - other Perl properties such as "InMusicalSymbols" are not supported by - PCRE2. Note that \P{Any} does not match any characters, so always - causes a match failure. + For compatibility with Perl, negation can be specified by including a + circumflex between the opening brace and the property. For example, + \p{^Lu} is the same as \P{Lu}. + + In accordance with Unicode's "loose matching" rules, ASCII white space + characters, hyphens, and underscores are ignored in the properties rep- + resented by xx above. As well as the space character, ASCII white space + can be tab, linefeed, vertical tab, formfeed, or carriage return. + + Some properties are specified as a name only; others as a name and a + value, separated by a colon or an equals sign. The names and values + consist of ASCII letters and digits (with one Perl-specific exception, + see below). They are not case sensitive. Note, however, that the es- + capes themselves, \p and \P, are case sensitive. There are abbrevia- + tions for many names. The following examples are all equivalent: + + \p{bidiclass=al} + \p{BC=al} + \p{ Bidi_Class : AL } + \p{ Bi-di class = Al } + \P{ ^ Bi-di class = Al } + + There is support for Unicode script names, Unicode general category + properties, "Any", which matches any character (including newline), + Bidi_Class, a number of binary (yes/no) properties, and some special + PCRE2 properties (described below). 
Certain other Perl properties such + as "InMusicalSymbols" are not supported by PCRE2. Note that \P{Any} + does not match any characters, so always causes a match failure. Script properties for \p and \P @@ -7197,15 +7546,15 @@ BACKSLASH Adlam script as an example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters that have Adlam in their extensions list. The full names "script" and - "script extensions" for the property types are recognized, and a equals - sign is an alternative to the colon. If a script name is given without - a property type, for example, \p{Adlam}, it is treated as \p{scx:Ad- - lam}. Perl changed to this interpretation at release 5.26 and PCRE2 - changed at release 10.40. + "script extensions" for the property types are recognized and, as for + all property specifications, an equals sign is an alternative to the + colon. If a script name is given without a property type, for example, + \p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this inter- + pretation at release 5.26 and PCRE2 changed at release 10.40. Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others - that are not part of an identified script are lumped together as "Com- + that are not part of an identified script are lumped together as "Com- mon". The current list of recognized script names and their 4-character abbreviations can be obtained by running this command: @@ -7215,15 +7564,11 @@ BACKSLASH The general category property for \p and \P Each character has exactly one Unicode general category property, spec- - ified by a two-letter abbreviation. For compatibility with Perl, nega- - tion can be specified by including a circumflex between the opening - brace and the property name. For example, \p{^Lu} is the same as - \P{Lu}. 
- - If only one letter is specified with \p or \P, it includes all the gen- - eral category properties that start with that letter. In this case, in - the absence of negation, the curly brackets in the escape sequence are - optional; these two examples have the same effect: + ified by a two-letter abbreviation. If only one letter is specified + with \p or \P, it includes all the general category properties that + start with that letter. In this case, in the absence of negation, the + curly brackets in the escape sequence are optional; these two examples + have the same effect: \p{L} \pL @@ -7238,6 +7583,7 @@ BACKSLASH Cs Surrogate L Letter + Lc Cased letter Ll Lower case letter Lm Modifier letter Lo Other letter @@ -7274,35 +7620,36 @@ BACKSLASH Zp Paragraph separator Zs Space separator - The special property LC, which has the synonym L&, is also supported: - it matches a character that has the Lu, Ll, or Lt property, in other - words, a letter that is not classified as a modifier or "other". - - The Cs (Surrogate) property applies only to characters whose code - points are in the range U+D800 to U+DFFF. These characters are no dif- - ferent to any other character when PCRE2 is not in UTF mode (using the - 16-bit or 32-bit library). However, they are not valid in Unicode + Perl originally used the name L& for the Lc property. This is still + supported by Perl, but discouraged. PCRE2 also still supports it. This + property matches any character that has the Lu, Ll, or Lt property, in + other words, any letter that is not classified as a modifier or + "other". From release 10.45 of PCRE2 the properties Lu, Ll, and Lt are + all treated as Lc when case-independent matching is set by the + PCRE2_CASELESS option or (?i) within the pattern. The other properties + are not affected by caseless matching. + + The Cs (Surrogate) property applies only to characters whose code + points are in the range U+D800 to U+DFFF. 
These characters are no dif- + ferent to any other character when PCRE2 is not in UTF mode (using the + 16-bit or 32-bit library). However, they are not valid in Unicode strings and so cannot be tested by PCRE2 in UTF mode, unless UTF valid- - ity checking has been turned off (see the discussion of + ity checking has been turned off (see the discussion of PCRE2_NO_UTF_CHECK in the pcre2api page). - The long synonyms for property names that Perl supports (such as - \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix + The long synonyms for property names that Perl supports (such as + \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is". No character that is in the Unicode table has the Cn (unassigned) prop- erty. Instead, this property is assumed for any code point that is not in the Unicode table. - Specifying caseless matching does not affect these escape sequences. - For example, \p{Lu} always matches only upper case letters. This is - different from the behaviour of current versions of Perl. - Binary (yes/no) properties for \p and \P - Unicode defines a number of binary properties, that is, properties - whose only values are true or false. You can obtain a list of those - that are recognized by \p and \P, along with their abbreviations, by + Unicode defines a number of binary properties, that is, properties + whose only values are true or false. You can obtain a list of those + that are recognized by \p and \P, along with their abbreviations, by running this command: pcre2test -LP @@ -7337,63 +7684,65 @@ BACKSLASH RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space - An equals sign may be used instead of a colon. The class names are - case-insensitive; only the short names listed above are recognized. + As in all property specifications, an equals sign may be used instead + of a colon and the class names are case-insensitive. 
Only the short + names listed above are recognized; PCRE2 does not at present support + any long alternatives. Extended grapheme clusters - The \X escape matches any number of Unicode characters that form an + The \X escape matches any number of Unicode characters that form an "extended grapheme cluster", and treats the sequence as an atomic group - (see below). Unicode supports various kinds of composite character by - giving each character a grapheme breaking property, and having rules + (see below). Unicode supports various kinds of composite character by + giving each character a grapheme breaking property, and having rules that use these properties to define the boundaries of extended grapheme - clusters. The rules are defined in Unicode Standard Annex 29, "Unicode - Text Segmentation". Unicode 11.0.0 abandoned the use of some previous - properties that had been used for emojis. Instead it introduced vari- - ous emoji-specific properties. PCRE2 uses only the Extended Picto- + clusters. The rules are defined in Unicode Standard Annex 29, "Unicode + Text Segmentation". Unicode 11.0.0 abandoned the use of some previous + properties that had been used for emojis. Instead it introduced vari- + ous emoji-specific properties. PCRE2 uses only the Extended Picto- graphic property. - \X always matches at least one character. Then it decides whether to + \X always matches at least one character. Then it decides whether to add additional characters according to the following rules for ending a cluster: 1. End at the end of the subject string. - 2. Do not end between CR and LF; otherwise end after any control char- + 2. Do not end between CR and LF; otherwise end after any control char- acter. - 3. Do not break Hangul (a Korean script) syllable sequences. Hangul - characters are of five types: L, V, T, LV, and LVT. 
An L character may - be followed by an L, V, LV, or LVT character; an LV or V character may - be followed by a V or T character; an LVT or T character may be fol- + 3. Do not break Hangul (a Korean script) syllable sequences. Hangul + characters are of five types: L, V, T, LV, and LVT. An L character may + be followed by an L, V, LV, or LVT character; an LV or V character may + be followed by a V or T character; an LVT or T character may be fol- lowed only by a T character. 4. Do not end before extending characters or spacing marks or the zero- - width joiner (ZWJ) character. Characters with the "mark" property al- + width joiner (ZWJ) character. Characters with the "mark" property al- ways have the "extend" grapheme breaking property. 5. Do not end after prepend characters. - 6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width - joiner) sequences. An emoji ZWJ sequence consists of a character with - the Extended_Pictographic property, optionally followed by one or more - characters with the Extend property, followed by the ZWJ character, + 6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width + joiner) sequences. An emoji ZWJ sequence consists of a character with + the Extended_Pictographic property, optionally followed by one or more + characters with the Extend property, followed by the ZWJ character, followed by another Extended_Pictographic character. - 7. Do not break within emoji flag sequences. That is, do not break be- - tween regional indicator (RI) characters if there are an odd number of + 7. Do not break within emoji flag sequences. That is, do not break be- + tween regional indicator (RI) characters if there are an odd number of RI characters before the break point. 8. Otherwise, end the cluster. 
PCRE2's additional properties - As well as the standard Unicode properties described above, PCRE2 sup- + As well as the standard Unicode properties described above, PCRE2 sup- ports four more that make it possible to convert traditional escape se- - quences such as \w and \s to use Unicode properties. PCRE2 uses these - non-standard, non-Perl properties internally when PCRE2_UCP is set. + quences such as \w and \s to use Unicode properties. PCRE2 uses these + non-standard, non-Perl properties internally when PCRE2_UCP is set. However, they may also be used explicitly. These properties are: Xan Any alphanumeric character @@ -7401,73 +7750,74 @@ BACKSLASH Xsp Any Perl space character Xwd Any Perl "word" character - Xan matches characters that have either the L (letter) or the N (num- - ber) property. Xps matches the characters tab, linefeed, vertical tab, - form feed, or carriage return, and any other character that has the Z - (separator) property. Xsp is the same as Xps; in PCRE1 it used to ex- - clude vertical tab, for Perl compatibility, but Perl changed. Xwd - matches the same characters as Xan, plus those that match Mn (non-spac- - ing mark) or Pc (connector punctuation, which includes underscore). - - There is another non-standard property, Xuc, which matches any charac- - ter that can be represented by a Universal Character Name in C++ and - other programming languages. These are the characters $, @, ` (grave - accent), and all characters with Unicode code points greater than or - equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that - most base (ASCII) characters are excluded. (Universal Character Names - are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. + Xan matches characters that have either the L (letter) or the N (num- + ber) property. 
Xps matches the characters tab, linefeed, vertical tab, + form feed, or carriage return, and any other character that has the Z + (separator) property (this includes the space character). Xsp is the + same as Xps; in PCRE1 it used to exclude vertical tab, for Perl compat- + ibility, but Perl changed. Xwd matches the same characters as Xan, plus + those that match Mn (non-spacing mark) or Pc (connector punctuation, + which includes underscore). + + There is another non-standard property, Xuc, which matches any charac- + ter that can be represented by a Universal Character Name in C++ and + other programming languages. These are the characters $, @, ` (grave + accent), and all characters with Unicode code points greater than or + equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that + most base (ASCII) characters are excluded. (Universal Character Names + are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. Note that the Xuc property does not match these sequences but the char- acters that they represent.) Resetting the match start - In normal use, the escape sequence \K causes any previously matched + In normal use, the escape sequence \K causes any previously matched characters not to be included in the final matched sequence that is re- turned. For example, the pattern: foo\Kbar - matches "foobar", but reports that it has matched "bar". \K does not + matches "foobar", but reports that it has matched "bar". \K does not interact with anchoring in any way. The pattern: ^foo\Kbar - matches only when the subject begins with "foobar" (in single line - mode), though it again reports the matched string as "bar". This fea- - ture is similar to a lookbehind assertion (described below), but the + matches only when the subject begins with "foobar" (in single line + mode), though it again reports the matched string as "bar". 
This fea- + ture is similar to a lookbehind assertion (described below), but the part of the pattern that precedes \K is not constrained to match a lim- - ited number of characters, as is required for a lookbehind assertion. - The use of \K does not interfere with the setting of captured sub- + ited number of characters, as is required for a lookbehind assertion. + The use of \K does not interfere with the setting of captured sub- strings. For example, when the pattern (foo)\Kbar matches "foobar", the first substring is still set to "foo". - From version 5.32.0 Perl forbids the use of \K in lookaround asser- - tions. From release 10.38 PCRE2 also forbids this by default. However, - the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling - pcre2_compile() to re-enable the previous behaviour. When this option + From version 5.32.0 Perl forbids the use of \K in lookaround asser- + tions. From release 10.38 PCRE2 also forbids this by default. However, + the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling + pcre2_compile() to re-enable the previous behaviour. When this option is set, \K is acted upon when it occurs inside positive assertions, but - is ignored in negative assertions. Note that when a pattern such as - (?=ab\K) matches, the reported start of the match can be greater than - the end of the match. Using \K in a lookbehind assertion at the start - of a pattern can also lead to odd effects. For example, consider this + is ignored in negative assertions. Note that when a pattern such as + (?=ab\K) matches, the reported start of the match can be greater than + the end of the match. Using \K in a lookbehind assertion at the start + of a pattern can also lead to odd effects. 
For example, consider this pattern: (?<=\Kfoo)bar - If the subject is "foobar", a call to pcre2_match() with a starting - offset of 3 succeeds and reports the matching string as "foobar", that - is, the start of the reported match is earlier than where the match + If the subject is "foobar", a call to pcre2_match() with a starting + offset of 3 succeeds and reports the matching string as "foobar", that + is, the start of the reported match is earlier than where the match started. Simple assertions - The final use of backslash is for certain simple assertions. An asser- - tion specifies a condition that has to be met at a particular point in - a match, without consuming any characters from the subject string. The - use of groups for more complicated assertions is described below. The + The final use of backslash is for certain simple assertions. An asser- + tion specifies a condition that has to be met at a particular point in + a match, without consuming any characters from the subject string. The + use of groups for more complicated assertions is described below. The backslashed assertions are: \b matches at a word boundary @@ -7478,193 +7828,193 @@ BACKSLASH \z matches only at the end of the subject \G matches at the first matching position in the subject - Inside a character class, \b has a different meaning; it matches the - backspace character. If any other of these assertions appears in a + Inside a character class, \b has a different meaning; it matches the + backspace character. If any other of these assertions appears in a character class, an "invalid escape sequence" error is generated. - A word boundary is a position in the subject string where the current - character and the previous character do not both match \w or \W (i.e. - one matches \w and the other matches \W), or the start or end of the - string if the first or last character matches \w, respectively. 
When - PCRE2 is built with Unicode support, the meanings of \w and \W can be + A word boundary is a position in the subject string where the current + character and the previous character do not both match \w or \W (i.e. + one matches \w and the other matches \W), or the start or end of the + string if the first or last character matches \w, respectively. When + PCRE2 is built with Unicode support, the meanings of \w and \W can be changed by setting the PCRE2_UCP option. When this is done, it also af- - fects \b and \B. Neither PCRE2 nor Perl has a separate "start of word" - or "end of word" metasequence. However, whatever follows \b normally - determines which it is. For example, the fragment \ba matches "a" at + fects \b and \B. Neither PCRE2 nor Perl has a separate "start of word" + or "end of word" metasequence. However, whatever follows \b normally + determines which it is. For example, the fragment \ba matches "a" at the start of a word. - The \A, \Z, and \z assertions differ from the traditional circumflex + The \A, \Z, and \z assertions differ from the traditional circumflex and dollar (described in the next section) in that they only ever match - at the very start and end of the subject string, whatever options are - set. Thus, they are independent of multiline mode. These three asser- - tions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, - which affect only the behaviour of the circumflex and dollar metachar- - acters. However, if the startoffset argument of pcre2_match() is non- - zero, indicating that matching is to start at a point other than the - beginning of the subject, \A can never match. The difference between - \Z and \z is that \Z matches before a newline at the end of the string + at the very start and end of the subject string, whatever options are + set. Thus, they are independent of multiline mode. 
These three asser- + tions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, + which affect only the behaviour of the circumflex and dollar metachar- + acters. However, if the startoffset argument of pcre2_match() is non- + zero, indicating that matching is to start at a point other than the + beginning of the subject, \A can never match. The difference between + \Z and \z is that \Z matches before a newline at the end of the string as well as at the very end, whereas \z matches only at the end. - The \G assertion is true only when the current matching position is at - the start point of the matching process, as specified by the startoff- - set argument of pcre2_match(). It differs from \A when the value of - startoffset is non-zero. By calling pcre2_match() multiple times with - appropriate arguments, you can mimic Perl's /g option, and it is in + The \G assertion is true only when the current matching position is at + the start point of the matching process, as specified by the startoff- + set argument of pcre2_match(). It differs from \A when the value of + startoffset is non-zero. By calling pcre2_match() multiple times with + appropriate arguments, you can mimic Perl's /g option, and it is in this kind of implementation where \G can be useful. - Note, however, that PCRE2's implementation of \G, being true at the - starting character of the matching process, is subtly different from - Perl's, which defines it as true at the end of the previous match. In - Perl, these can be different when the previously matched string was + Note, however, that PCRE2's implementation of \G, being true at the + starting character of the matching process, is subtly different from + Perl's, which defines it as true at the end of the previous match. In + Perl, these can be different when the previously matched string was empty. Because PCRE2 does just one match at a time, it cannot reproduce this behaviour. 
- If all the alternatives of a pattern begin with \G, the expression is + If all the alternatives of a pattern begin with \G, the expression is anchored to the starting match position, and the "anchored" flag is set in the compiled regular expression. CIRCUMFLEX AND DOLLAR - The circumflex and dollar metacharacters are zero-width assertions. - That is, they test for a particular condition being true without con- + The circumflex and dollar metacharacters are zero-width assertions. + That is, they test for a particular condition being true without con- suming any characters from the subject string. These two metacharacters - are concerned with matching the starts and ends of lines. If the new- - line convention is set so that only the two-character sequence CRLF is - recognized as a newline, isolated CR and LF characters are treated as + are concerned with matching the starts and ends of lines. If the new- + line convention is set so that only the two-character sequence CRLF is + recognized as a newline, isolated CR and LF characters are treated as ordinary data characters, and are not recognized as newlines. Outside a character class, in the default matching mode, the circumflex - character is an assertion that is true only if the current matching - point is at the start of the subject string. If the startoffset argu- - ment of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circum- - flex can never match if the PCRE2_MULTILINE option is unset. Inside a - character class, circumflex has an entirely different meaning (see be- + character is an assertion that is true only if the current matching + point is at the start of the subject string. If the startoffset argu- + ment of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circum- + flex can never match if the PCRE2_MULTILINE option is unset. Inside a + character class, circumflex has an entirely different meaning (see be- low). 
- Circumflex need not be the first character of the pattern if a number - of alternatives are involved, but it should be the first thing in each - alternative in which it appears if the pattern is ever to match that - branch. If all possible alternatives start with a circumflex, that is, - if the pattern is constrained to match only at the start of the sub- - ject, it is said to be an "anchored" pattern. (There are also other + Circumflex need not be the first character of the pattern if a number + of alternatives are involved, but it should be the first thing in each + alternative in which it appears if the pattern is ever to match that + branch. If all possible alternatives start with a circumflex, that is, + if the pattern is constrained to match only at the start of the sub- + ject, it is said to be an "anchored" pattern. (There are also other constructs that can cause a pattern to be anchored.) - The dollar character is an assertion that is true only if the current - matching point is at the end of the subject string, or immediately be- - fore a newline at the end of the string (by default), unless PCRE2_NO- - TEOL is set. Note, however, that it does not actually match the new- - line. Dollar need not be the last character of the pattern if a number - of alternatives are involved, but it should be the last item in any - branch in which it appears. Dollar has no special meaning in a charac- + The dollar character is an assertion that is true only if the current + matching point is at the end of the subject string, or immediately be- + fore a newline at the end of the string (by default), unless PCRE2_NO- + TEOL is set. Note, however, that it does not actually match the new- + line. Dollar need not be the last character of the pattern if a number + of alternatives are involved, but it should be the last item in any + branch in which it appears. Dollar has no special meaning in a charac- ter class. 
- The meaning of dollar can be changed so that it matches only at the - very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at + The meaning of dollar can be changed so that it matches only at the + very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This does not affect the \Z assertion. The meanings of the circumflex and dollar metacharacters are changed if - the PCRE2_MULTILINE option is set. When this is the case, a dollar - character matches before any newlines in the string, as well as at the - very end, and a circumflex matches immediately after internal newlines - as well as at the start of the subject string. It does not match after - a newline that ends the string, for compatibility with Perl. However, + the PCRE2_MULTILINE option is set. When this is the case, a dollar + character matches before any newlines in the string, as well as at the + very end, and a circumflex matches immediately after internal newlines + as well as at the start of the subject string. It does not match after + a newline that ends the string, for compatibility with Perl. However, this can be changed by setting the PCRE2_ALT_CIRCUMFLEX option. - For example, the pattern /^abc$/ matches the subject string "def\nabc" - (where \n represents a newline) in multiline mode, but not otherwise. - Consequently, patterns that are anchored in single line mode because - all branches start with ^ are not anchored in multiline mode, and a - match for circumflex is possible when the startoffset argument of - pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored + For example, the pattern /^abc$/ matches the subject string "def\nabc" + (where \n represents a newline) in multiline mode, but not otherwise. 
+ Consequently, patterns that are anchored in single line mode because + all branches start with ^ are not anchored in multiline mode, and a + match for circumflex is possible when the startoffset argument of + pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. - When the newline convention (see "Newline conventions" below) recog- - nizes the two-character sequence CRLF as a newline, this is preferred, - even if the single characters CR and LF are also recognized as new- - lines. For example, if the newline convention is "any", a multiline - mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather - than after CR, even though CR on its own is a valid newline. (It also + When the newline convention (see "Newline conventions" below) recog- + nizes the two-character sequence CRLF as a newline, this is preferred, + even if the single characters CR and LF are also recognized as new- + lines. For example, if the newline convention is "any", a multiline + mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather + than after CR, even though CR on its own is a valid newline. (It also matches at the very start of the string, of course.) - Note that the sequences \A, \Z, and \z can be used to match the start - and end of the subject in both modes, and if all branches of a pattern - start with \A it is always anchored, whether or not PCRE2_MULTILINE is + Note that the sequences \A, \Z, and \z can be used to match the start + and end of the subject in both modes, and if all branches of a pattern + start with \A it is always anchored, whether or not PCRE2_MULTILINE is set. FULL STOP (PERIOD, DOT) AND \N Outside a character class, a dot in the pattern matches any one charac- - ter in the subject string except (by default) a character that signi- + ter in the subject string except (by default) a character that signi- fies the end of a line. 
One or more characters may be specified as line terminators (see "Newline conventions" above). - Dot never matches a single line-ending character. When the two-charac- - ter sequence CRLF is the only line ending, dot does not match CR if it - is immediately followed by LF, but otherwise it matches all characters - (including isolated CRs and LFs). When ANYCRLF is selected for line - endings, no occurrences of CR of LF match dot. When all Unicode line + Dot never matches a single line-ending character. When the two-charac- + ter sequence CRLF is the only line ending, dot does not match CR if it + is immediately followed by LF, but otherwise it matches all characters + (including isolated CRs and LFs). When ANYCRLF is selected for line + endings, no occurrences of CR or LF match dot. When all Unicode line endings are being recognized, dot does not match CR or LF or any of the other line ending characters. - The behaviour of dot with regard to newlines can be changed. If the - PCRE2_DOTALL option is set, a dot matches any one character, without - exception. If the two-character sequence CRLF is present in the sub- + The behaviour of dot with regard to newlines can be changed. If the + PCRE2_DOTALL option is set, a dot matches any one character, without + exception. If the two-character sequence CRLF is present in the sub- ject string, it takes two dots to match it. - The handling of dot is entirely independent of the handling of circum- - flex and dollar, the only relationship being that they both involve + The handling of dot is entirely independent of the handling of circum- + flex and dollar, the only relationship being that they both involve newlines. Dot has no special meaning in a character class. - The escape sequence \N when not followed by an opening brace behaves - like a dot, except that it is not affected by the PCRE2_DOTALL option.
- In other words, it matches any character except one that signifies the + The escape sequence \N when not followed by an opening brace behaves + like a dot, except that it is not affected by the PCRE2_DOTALL option. + In other words, it matches any character except one that signifies the end of a line. When \N is followed by an opening brace it has a different meaning. See - the section entitled "Non-printing characters" above for details. Perl - also uses \N{name} to specify characters by Unicode name; PCRE2 does + the section entitled "Non-printing characters" above for details. Perl + also uses \N{name} to specify characters by Unicode name; PCRE2 does not support this. MATCHING A SINGLE CODE UNIT - Outside a character class, the escape sequence \C matches any one code - unit, whether or not a UTF mode is set. In the 8-bit library, one code - unit is one byte; in the 16-bit library it is a 16-bit unit; in the - 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches - line-ending characters. The feature is provided in Perl in order to + Outside a character class, the escape sequence \C matches any one code + unit, whether or not a UTF mode is set. In the 8-bit library, one code + unit is one byte; in the 16-bit library it is a 16-bit unit; in the + 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches + line-ending characters. The feature is provided in Perl in order to match individual bytes in UTF-8 mode, but it is unclear how it can use- fully be used. - Because \C breaks up characters into individual code units, matching - one unit with \C in UTF-8 or UTF-16 mode means that the rest of the + Because \C breaks up characters into individual code units, matching + one unit with \C in UTF-8 or UTF-16 mode means that the rest of the string may start with a malformed UTF character. 
This has undefined re- sults, because PCRE2 assumes that it is matching character by character in a valid UTF string (by default it checks the subject string's valid- - ity at the start of processing unless the PCRE2_NO_UTF_CHECK or + ity at the start of processing unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used). - An application can lock out the use of \C by setting the - PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also + An application can lock out the use of \C by setting the + PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to build PCRE2 with the use of \C permanently disabled. - PCRE2 does not allow \C to appear in lookbehind assertions (described - below) in UTF-8 or UTF-16 modes, because this would make it impossible - to calculate the length of the lookbehind. Neither the alternative + PCRE2 does not allow \C to appear in lookbehind assertions (described + below) in UTF-8 or UTF-16 modes, because this would make it impossible + to calculate the length of the lookbehind. Neither the alternative matching function pcre2_dfa_match() nor the JIT optimizer support \C in these UTF modes. The former gives a match-time error; the latter fails to optimize and so the match is always run using the interpreter. - In the 32-bit library, however, \C is always supported (when not ex- - plicitly locked out) because it always matches a single code unit, + In the 32-bit library, however, \C is always supported (when not ex- + plicitly locked out) because it always matches a single code unit, whether or not UTF-32 is specified. In general, the \C escape sequence is best avoided. 
However, one way of - using it that avoids the problem of malformed UTF-8 or UTF-16 charac- - ters is to use a lookahead to check the length of the next character, - as in this pattern, which could be used with a UTF-8 string (ignore + using it that avoids the problem of malformed UTF-8 or UTF-16 charac- + ters is to use a lookahead to check the length of the next character, + as in this pattern, which could be used with a UTF-8 string (ignore white space and line breaks): (?| (?=[\x00-\x7f])(\C) | @@ -7672,11 +8022,11 @@ MATCHING A SINGLE CODE UNIT (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) | (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C)) - In this example, a group that starts with (?| resets the capturing - parentheses numbers in each alternative (see "Duplicate Group Numbers" + In this example, a group that starts with (?| resets the capturing + parentheses numbers in each alternative (see "Duplicate Group Numbers" below). The assertions at the start of each branch check the next UTF-8 - character for values whose encoding uses 1, 2, 3, or 4 bytes, respec- - tively. The character's individual bytes are then captured by the ap- + character for values whose encoding uses 1, 2, 3, or 4 bytes, respec- + tively. The character's individual bytes are then captured by the ap- propriate number of \C groups. @@ -7684,27 +8034,27 @@ SQUARE BRACKETS AND CHARACTER CLASSES An opening square bracket introduces a character class, terminated by a closing square bracket. A closing square bracket on its own is not spe- - cial by default. If a closing square bracket is required as a member + cial by default. If a closing square bracket is required as a member of the class, it should be the first data character in the class (after - an initial circumflex, if present) or escaped with a backslash. This - means that, by default, an empty class cannot be defined. 
However, if - the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at + an initial circumflex, if present) or escaped with a backslash. This + means that, by default, an empty class cannot be defined. However, if + the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at the start does end the (empty) class. - A character class matches a single character in the subject. A matched + A character class matches a single character in the subject. A matched character must be in the set of characters defined by the class, unless - the first character in the class definition is a circumflex, in which + the first character in the class definition is a circumflex, in which case the subject character must not be in the set defined by the class. - If a circumflex is actually required as a member of the class, ensure + If a circumflex is actually required as a member of the class, ensure it is not the first character, or escape it with a backslash. - For example, the character class [aeiou] matches any lower case vowel, - while [^aeiou] matches any character that is not a lower case vowel. - Note that a circumflex is just a convenient notation for specifying the - characters that are in the class by enumerating those that are not. A - class that starts with a circumflex is not an assertion; it still con- - sumes a character from the subject string, and therefore it fails if - the current pointer is at the end of the string. + For example, the character class [aeiou] matches any lower case English + vowel, whereas [^aeiou] matches all other characters. Note that a cir- + cumflex is just a convenient notation for specifying the characters + that are in the class by enumerating those that are not. A class that + starts with a circumflex is not an assertion; it still consumes a char- + acter from the subject string, and therefore it fails to match if the + current pointer is at the end of the string. 
Characters in a class may be specified by their code points using \o, \x, or \N{U+hh..} in the usual way. When caseless matching is set, any @@ -7714,7 +8064,10 @@ SQUARE BRACKETS AND CHARACTER CLASSES would. Note that there are two ASCII characters, K and S, that, in ad- dition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when ei- - ther PCRE2_UTF or PCRE2_UCP is set. + ther PCRE2_UTF or PCRE2_UCP is set. If you do not want these ASCII/non- + ASCII case equivalences, you can suppress them by setting PCRE2_EX- + TRA_CASELESS_RESTRICT, either as an option in a compile context, or by + including (*CASELESS_RESTRICT) or (?r) within a pattern. Characters that might indicate line breaks are never treated in any special way when matching character classes, whatever line-ending se- @@ -7743,67 +8096,171 @@ SQUARE BRACKETS AND CHARACTER CLASSES last character in the class, or immediately after a range. For example, [b-d-z] matches letters in the range b to d, a hyphen character, or z. + There is some special treatment for alphabetic ranges in EBCDIC envi- + ronments; see the section "EBCDIC environments" below. + Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d - or \H. However, unless the hyphen is the last character in the class, - Perl outputs a warning in its warning mode, as this is most likely a - user error. As PCRE2 has no facility for warning, an error is given in + or \H. However, unless the hyphen is the last character in the class, + Perl outputs a warning in its warning mode, as this is most likely a + user error. As PCRE2 has no facility for warning, an error is given in these cases. It is not possible to have the literal character "]" as the end charac- - ter of a range. 
A pattern such as [W-]46] is interpreted as a class of - two characters ("W" and "-") followed by a literal string "46]", so it - would match "W46]" or "-46]". However, if the "]" is escaped with a - backslash it is interpreted as the end of range, so [W-\]46] is inter- - preted as a class containing a range followed by two other characters. - The octal or hexadecimal representation of "]" can also be used to end - a range. + ter of a range. A pattern such as [W-]46] is interpreted as a class of + two characters ("W" and "-") followed by a literal string "46]", so it + would match "W46]" or "-46]". However, if the "]" is escaped with a + backslash it is interpreted as the end of a range, so [W-\]46] is in- + terpreted as a class containing a range and two other characters. The + octal or hexadecimal representation of "]" can also be used to end a + range. Ranges normally include all code points between the start and end char- - acters, inclusive. They can also be used for code points specified nu- - merically, for example [\000-\037]. Ranges can include any characters - that are valid for the current mode. In any UTF mode, the so-called - "surrogate" characters (those whose code points lie between 0xd800 and - 0xdfff inclusive) may not be specified explicitly by default (the - PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). How- + acters, inclusive. They can also be used for code points specified nu- + merically, for example [\000-\037]. Ranges can include any characters + that are valid for the current mode. In any UTF mode, the so-called + "surrogate" characters (those whose code points lie between 0xd800 and + 0xdfff inclusive) may not be specified explicitly by default (the + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables this check). How- ever, ranges such as [\x{d7ff}-\x{e000}], which include the surrogates, are always permitted. 
- There is a special case in EBCDIC environments for ranges whose end - points are both specified as literal letters in the same case. For com- - patibility with Perl, EBCDIC code points within the range that are not - letters are omitted. For example, [h-k] matches only four characters, - even though the codes for h and k are 0x88 and 0x92, a range of 11 code - points. However, if the range is specified numerically, for example, - [\x88-\x92] or [h-\x92], all code points are included. - If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent - to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if - character tables for a French locale are in use, [\xc8-\xcb] matches + to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if + character tables for a French locale are in use, [\xc8-\xcb] matches accented E characters in both cases. - A circumflex can conveniently be used with the upper case character - types to specify a more restricted set of characters than the matching - lower case type. For example, the class [^\W_] matches any letter or + A circumflex can conveniently be used with the upper case character + types to specify a more restricted set of characters than the matching + lower case type. For example, the class [^\W_] matches any letter or digit, but not underscore, whereas [\w] includes underscore. A positive character class should be read as "something OR something OR ..." and a negative class as "NOT something AND NOT something AND NOT ...". - The only metacharacters that are recognized in character classes are - backslash, hyphen (only where it can be interpreted as specifying a - range), circumflex (only at the start), opening square bracket (only - when it can be interpreted as introducing a POSIX class name, or for a - special compatibility feature - see the next two sections), and the - terminating closing square bracket. 
However, escaping other non-al- - phanumeric characters does no harm. + The metacharacters that are recognized in character classes are back- + slash, hyphen (when it can be interpreted as specifying a range), cir- + cumflex (only at the start), and the terminating closing square + bracket. An opening square bracket is also special when it can be in- + terpreted as introducing a POSIX class (see "POSIX character classes" + below), or a special compatibility feature (see "Compatibility feature + for word boundaries" below). Escaping any non-alphanumeric character in + a class turns it into a literal, whether or not it would otherwise be a + metacharacter. + + +PERL EXTENDED CHARACTER CLASSES + + From release 10.45 PCRE2 supports Perl's (?[...]) extended character + class syntax. This can be used to perform set operations such as inter- + section on character classes. + + The syntax permitted within (?[...]) is quite different to ordinary + character classes. Inside the extended class, there is an expression + syntax consisting of "atoms", operators, and ordinary parentheses "()" + used for grouping. Such classes always have the Perl /xx modifier + (PCRE2 option PCRE2_EXTENDED_MORE) turned on within them. This means + that literal space and tab characters are ignored everywhere in the + class. + + The allowed atoms are individual characters specified by escape se- + quences such as \n or \x{123}, character types such as \d, POSIX + classes such as [:alpha:], and nested ordinary (non-extended) character + classes. For example, in (?[\d & [...]]) the nested class [...] follows + the usual rules for ordinary character classes, in which parentheses + are not metacharacters, and character literals and ranges are permit- + ted. + + Character literals and ranges may not appear outside a nested ordinary + character class because they are not atoms in the extended syntax.
The + extended syntax does not introduce any additional escape sequences, so + (?[\y]) is an unknown escape, as it would be in [\y]. + + In the extended syntax, ^ does not negate a class (except within an or- + dinary class nested inside an extended class); it is instead a binary + operator. + + The binary operators are "&" (intersection), "|" or "+" (union), "-" + (subtraction) and "^" (symmetric difference). These are left-associa- + tive and "&" has higher (tighter) precedence, while the others have + equal lower precedence. The one prefix unary operator is "!" (comple- + ment), with highest precedence. + + +UTS#18 EXTENDED CHARACTER CLASSES + + The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's + (?[...]) syntax, allowing instead extended class behaviour inside or- + dinary [...] character classes. This altered syntax for [...] classes + is loosely described by the Unicode standard UTS#18. The PCRE2_ALT_EX- + TENDED_CLASS option does not prevent use of (?[...]) classes; it just + changes the meaning of all [...] classes that are not nested inside a + Perl (?[...]) class. + + Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is + a character class with two literal characters "a" and "[", but in + UTS#18 extended classes the "[" character becomes an additional + metacharacter within classes, denoting the start of a nested class, so + a literal "[" must be escaped as "\[". + + Secondly, within the UTS#18 extended syntax, there are operators "||", + "&&", "--" and "~~" which denote character class union, intersection, + subtraction, and symmetric difference respectively. In standard Perl + syntax, these would simply be needlessly-repeated literals (except for + "--" which could be the start or end of a range). In UTS#18 extended + classes these operators can be used in constructs such as [\p{L}--[QW]] + for "Unicode letters, other than Q and W". 
A literal "-" at the start + or end of a range must be escaped, so while "[--1]" in Perl syntax is + the range from hyphen to "1", it must be escaped as "[\--1]" in UTS#18 + extended classes. + + Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option + to ignore space and tab characters is not automatically enabled for + UTS#18 extended classes, but it is honoured if set. + + Extended UTS#18 classes can be nested, and nested classes are them- + selves extended classes (unlike Perl, where nested classes must be sim- + ple classes). For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any + letter that is in the Thai or Greek scripts. Note that this means that + no special grouping characters (such as the parentheses used in Perl's + (?[...]) class syntax) are needed. + + Individual class items (literal characters, literal ranges, properties + such as \d or \p{...}, and nested classes) can be combined by juxtapo- + sition or by an operator. Juxtaposition is the implicit union operator, + and binds more tightly than any explicit operator. Thus a sequence of + literals and/or ranges behaves as if it is enclosed in square brackets. + For example, [A-Z0-9&&[^E8]] is the same as [[A-Z0-9]&&[^E8]], which + matches any upper case alphanumeric character except "E" or "8". + + Precedence between the explicit operators is not defined, so mixing op- + erators is a syntax error. For example, [A&&B--C] is an error, but + [A&&[B--C]] is valid. + + This is an emerging syntax which is being adopted gradually across the + regex ecosystem: for example JavaScript adopted the "/v" flag in EC- + MAScript 2024; Python's "re" module reserves the syntax for future use + with a FutureWarning for unescaped use of "[" as a literal within char- + acter classes. Due to UTS#18 providing insufficient guidance, engines + interpret the syntax differently. 
Rust's "regex" crate and Python's + "regex" PyPI module both implement UTS#18 extended classes, but with + slight incompatibilities ([A||B&&C] is parsed as [A||[B&&C]] in + Python's "regex" but as [[A||B]&&C] in Rust's "regex"). + + PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v + flag, so that all the UTS#18 extended classes accepted as valid by + PCRE2 have the property that they are interpreted either with the same + behaviour, or as invalid, by all other major engines. Please file an + issue if you are aware of cross-engine differences in behaviour between + PCRE2 and another major engine. POSIX CHARACTER CLASSES Perl supports the POSIX notation for character classes. This uses names - enclosed by [: and :] within the enclosing square brackets. PCRE2 also - supports this notation. For example, + enclosed by [: and :] within the enclosing square brackets. PCRE2 also + supports this notation, in both ordinary and extended classes. For ex- + ample, [01[:alpha:]%] @@ -7883,7 +8340,7 @@ POSIX CHARACTER CLASSES In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This is a change that - was made in PCRE release 10.43 for Perl compatibility. + was made in PCRE2 release 10.43 for Perl compatibility. The other POSIX classes are unchanged by PCRE2_UCP, and match only characters with code points less than 256. @@ -8391,17 +8848,18 @@ REPETITION (?>.*?a)b It matches "ab" in the subject "aab". The use of the backtracking con- - trol verbs (*PRUNE) and (*SKIP) also disable this optimization, and - there is an option, PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. + trol verbs (*PRUNE) and (*SKIP) also disable this optimization. To do + so explicitly, either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, + or call pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive.
- When a capture group is repeated, the value captured is the substring + When a capture group is repeated, the value captured is the substring that matched the final iteration. For example, after (tweedle[dume]{3}\s*)+ has matched "tweedledum tweedledee" the value of the captured substring - is "tweedledee". However, if there are nested capture groups, the cor- - responding captured values may have been set in previous iterations. + is "tweedledee". However, if there are nested capture groups, the cor- + responding captured values may have been set in previous iterations. For example, after (a|(b))+ @@ -8411,57 +8869,57 @@ REPETITION ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS - With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") - repetition, failure of what follows normally causes the repeated item - to be re-evaluated to see if a different number of repeats allows the - rest of the pattern to match. Sometimes it is useful to prevent this, - either to change the nature of the match, or to cause it fail earlier - than it otherwise might, when the author of the pattern knows there is + With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") + repetition, failure of what follows normally causes the repeated item + to be re-evaluated to see if a different number of repeats allows the + rest of the pattern to match. Sometimes it is useful to prevent this, + either to change the nature of the match, or to cause it to fail earlier + than it otherwise might, when the author of the pattern knows there is no point in carrying on. - Consider, for example, the pattern \d+foo when applied to the subject + Consider, for example, the pattern \d+foo when applied to the subject line 123456bar After matching all 6 digits and then failing to match "foo", the normal - action of the matcher is to try again with only 5 digits matching the - \d+ item, and then with 4, and so on, before ultimately failing.
- "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides + action of the matcher is to try again with only 5 digits matching the + \d+ item, and then with 4, and so on, before ultimately failing. + "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides the means for specifying that once a group has matched, it is not to be re-evaluated in this way. - If we use atomic grouping for the previous example, the matcher gives - up immediately on failing to match "foo" the first time. The notation + If we use atomic grouping for the previous example, the matcher gives + up immediately on failing to match "foo" the first time. The notation is a kind of special parenthesis, starting with (?> as in this example: (?>\d+)foo - Perl 5.28 introduced an experimental alphabetic form starting with (* + Perl 5.28 introduced an experimental alphabetic form starting with (* which may be easier to remember: (*atomic:\d+)foo - This kind of parenthesized group "locks up" the part of the pattern it + This kind of parenthesized group "locks up" the part of the pattern it contains once it has matched, and a failure further into the pattern is - prevented from backtracking into it. Backtracking past it to previous + prevented from backtracking into it. Backtracking past it to previous items, however, works as normal. An alternative description is that a group of this type matches exactly - the string of characters that an identical standalone pattern would + the string of characters that an identical standalone pattern would match, if anchored at the current point in the subject string. - Atomic groups are not capture groups. Simple cases such as the above - example can be thought of as a maximizing repeat that must swallow - everything it can. So, while both \d+ and \d+? are prepared to adjust - the number of digits they match in order to make the rest of the pat- + Atomic groups are not capture groups. 
Simple cases such as the above + example can be thought of as a maximizing repeat that must swallow + everything it can. So, while both \d+ and \d+? are prepared to adjust + the number of digits they match in order to make the rest of the pat- tern match, (?>\d+) can only match an entire sequence of digits. - Atomic groups in general can of course contain arbitrarily complicated + Atomic groups in general can of course contain arbitrarily complicated expressions, and can be nested. However, when the contents of an atomic - group is just a single repeated item, as in the example above, a sim- - pler notation, called a "possessive quantifier" can be used. This con- - sists of an additional + character following a quantifier. Using this + group is just a single repeated item, as in the example above, a sim- + pler notation, called a "possessive quantifier" can be used. This con- + sists of an additional + character following a quantifier. Using this notation, the previous example can be rewritten as \d++foo @@ -8471,24 +8929,26 @@ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS (abc|xyz){2,3}+ - Possessive quantifiers are always greedy; the setting of the PCRE2_UN- - GREEDY option is ignored. They are a convenient notation for the sim- - pler forms of atomic group. However, there is no difference in the - meaning of a possessive quantifier and the equivalent atomic group, - though there may be a performance difference; possessive quantifiers + Possessive quantifiers are always greedy; the setting of the PCRE2_UN- + GREEDY option is ignored. They are a convenient notation for the sim- + pler forms of atomic group. However, there is no difference in the + meaning of a possessive quantifier and the equivalent atomic group, + though there may be a performance difference; possessive quantifiers should be slightly faster. - The possessive quantifier syntax is an extension to the Perl 5.8 syn- - tax. 
Jeffrey Friedl originated the idea (and the name) in the first + The possessive quantifier syntax is an extension to the Perl 5.8 syn- + tax. Jeffrey Friedl originated the idea (and the name) in the first edition of his book. Mike McCloskey liked it, so implemented it when he - built Sun's Java package, and PCRE1 copied it from there. It found its + built Sun's Java package, and PCRE1 copied it from there. It found its way into Perl at release 5.10. - PCRE2 has an optimization that automatically "possessifies" certain - simple pattern constructs. For example, the sequence A+B is treated as - A++B because there is no point in backtracking into a sequence of A's - when B must follow. This feature can be disabled by the PCRE2_NO_AUTO- - POSSESS option, or starting the pattern with (*NO_AUTO_POSSESS). + PCRE2 has an optimization that automatically "possessifies" certain + simple pattern constructs. For example, the sequence A+B is treated as + A++B because there is no point in backtracking into a sequence of A's + when B must follow. This feature can be disabled by the + PCRE2_NO_AUTO_POSSESS option, by calling pcre2_set_optimize() with a + PCRE2_AUTO_POSSESS_OFF directive, or by starting the pattern with + (*NO_AUTO_POSSESS). When a pattern contains an unlimited repeat inside a group that can it- self be repeated an unlimited number of times, the use of an atomic @@ -8649,19 +9109,25 @@ BACKREFERENCES ASSERTIONS - An assertion is a test on the characters following or preceding the - current matching point that does not consume any characters. The simple - assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described - above. + An assertion is a test that does not consume any characters. The test + must succeed for the match to continue. The simple assertions coded as + \b, \B, \A, \G, \Z, \z, ^ and $ are described above. + + More complicated assertions are coded as parenthesized groups. 
If + matching such a group succeeds, matching continues after it, but with + the matching position in the subject string reset to what it was before + the assertion was processed. + + A special kind of assertion, called a "scan substring" assertion, + matches a subpattern against a previously captured substring. This is + described in the section entitled "Scan substring assertions" below. It + is a PCRE2 extension, not compatible with Perl. - More complicated assertions are coded as parenthesized groups. There - are two kinds: those that look ahead of the current position in the - subject string, and those that look behind it, and in each case an as- - sertion may be positive (must match for the assertion to be true) or - negative (must not match for the assertion to be true). An assertion - group is matched in the normal way, and if it is true, matching contin- - ues after it, but with the matching position in the subject string re- - set to what it was before the assertion was processed. + The other group-based assertions are of two kinds: those that look ahead + of the current position in the subject string, and those that look be- + hind it, and in each case an assertion may be positive (must match for + the assertion to be true) or negative (must not match for the assertion + to be true). The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no back- @@ -8928,6 +9394,66 @@ NON-ATOMIC ASSERTIONS groups (see below) must be atomic. +SCAN SUBSTRING ASSERTIONS + + A special kind of assertion, not compatible with Perl, makes it possi- + ble to check the contents of a captured substring by matching it with a + subpattern. Because this involves capturing, this feature is not sup- + ported by pcre2_dfa_match().
+ + A scan substring assertion starts with the sequence (*scan_substring: + or (*scs: which is followed by a list of substring numbers (absolute or + relative) and/or substring names enclosed in single quotes or angle + brackets, all within parentheses. The rest of the item is the subpat- + tern that is applied to the substring, as shown in these examples: + + (*scan_substring:(1)...) + (*scs:(-2)...) + (*scs:('AB')...) + (*scs:(1,'AB',-2)...) + + The list of groups is checked in the order they are given, and it is + the contents of the first one that is found to be set that are scanned. + When PCRE2_DUPNAMES is set and there are ambiguous group names, all + groups with the same name are checked in numerical order. A scan sub- + string assertion fails if none of the groups it references have been + set. + + The pattern match on the substring is always anchored, that is, it must + match from the start of the substring. There is no "bumpalong" if it + does not match at the start. The end of the subject is temporarily re- + set to be the end of the substring, so \Z, \z, and $ will match there. + However, the start of the subject is not reset. This means that ^ + matches only if the substring is actually at the start of the main sub- + ject, but it also means that lookbehind assertions into what precedes + the substring are possible. + + Here is a very simple example: find a word that contains the rare (in + English) sequence of letters "rh" not at the start: + + \b(\w++)(*scs:(1).+rh) + + The first group captures a word which is then scanned by the second + group. This example does not actually need this heavyweight feature; + the same match can be achieved with: + + \b\w+?rh\w*\b + + When things are more complicated, however, scanning a captured sub- + string can be a useful way to describe the required match. 
For example, + there is a rather complicated pattern in the PCRE2 test data that + checks an entire subject string for a palindrome, that is, the sequence + of letters is the same in both directions. Suppose you want to search + for individual words of two or more characters such as "level" that are + palindromes: + + (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...) + + Within a substring scanning subpattern, references to other groups work + as normal. Capturing groups may appear, and will retain their values + during ongoing matching if the assertion succeeds. + + SCRIPT RUNS In concept, a script run is a sequence of characters that are all from @@ -9175,8 +9701,9 @@ COMMENTS There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related - characters such as (?: or a group name or number. The characters that - make up a comment play no part in the pattern matching. + characters such as (?: or a group name or number or a Unicode property + name. The characters that make up a comment play no part in the pattern + matching. The sequence (?# marks the start of a comment that continues up to the next closing parenthesis. Nested parentheses are not permitted. If the @@ -9459,8 +9986,9 @@ CALLOUTS provides an external function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). If no match context is - passed, or if the callout entry point is set to NULL, callouts are dis- - abled. + passed, or if the callout entry point is set to NULL, callout points + will be passed over silently during matching. To disallow callouts in + the pattern syntax, you may use the PCRE2_EXTRA_NEVER_CALLOUT option. Within a regular expression, (?C) indicates a point at which the external function is to be called.
There are two kinds of callout: @@ -9555,10 +10083,10 @@ BACKTRACKING CONTROL Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the tra- - ditional matching function, because that uses a backtracking algorithm. - With the exception of (*FAIL), which behaves like a failing negative - assertion, the backtracking control verbs cause an error if encountered - by the DFA matching function. + ditional matching function or JIT, because they use backtracking algo- + rithms. With the exception of (*FAIL), which behaves like a failing + negative assertion, the backtracking control verbs cause an error if + encountered by the DFA matching function. The behaviour of these verbs in repeated groups, assertions, and in capture groups called as subroutines (whether or not recursively) is @@ -9573,11 +10101,12 @@ BACKTRACKING CONTROL running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option when calling pcre2_com- - pile(), or by starting the pattern with (*NO_START_OPT). There is more - discussion of this option in the section entitled "Compiling a pattern" - in the pcre2api documentation. + pile(), by calling pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF + directive, or by starting the pattern with (*NO_START_OPT). There is + more discussion of this option in the section entitled "Compiling a + pattern" in the pcre2api documentation. - Experiments with Perl suggest that it too has similar optimizations, + Experiments with Perl suggest that it too has similar optimizations, and like PCRE2, turning them off can change the result of a match. Verbs that act immediately @@ -9586,77 +10115,77 @@ BACKTRACKING CONTROL (*ACCEPT) or (*ACCEPT:NAME) - This verb causes the match to end successfully, skipping the remainder - of the pattern. 
However, when it is inside a capture group that is + This verb causes the match to end successfully, skipping the remainder + of the pattern. However, when it is inside a capture group that is called as a subroutine, only that group is ended successfully. Matching then continues at the outer level. If (*ACCEPT) is triggered in a posi- - tive assertion, the assertion succeeds; in a negative assertion, the + tive assertion, the assertion succeeds; in a negative assertion, the assertion fails. - If (*ACCEPT) is inside capturing parentheses, the data so far is cap- + If (*ACCEPT) is inside capturing parentheses, the data so far is cap- tured. For example: A((?:A|B(*ACCEPT)|C)D) - This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- + This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- tured by the outer parentheses. - (*ACCEPT) is the only backtracking verb that is allowed to be quanti- - fied because an ungreedy quantification with a minimum of zero acts + (*ACCEPT) is the only backtracking verb that is allowed to be quanti- + fied because an ungreedy quantification with a minimum of zero acts only when a backtrack happens. Consider, for example, (A(*ACCEPT)??B)C - where A, B, and C may be complex expressions. After matching "A", the - matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) - is triggered and the match succeeds. In both cases, all but C is cap- - tured. Whereas (*COMMIT) (see below) means "fail on backtrack", a re- + where A, B, and C may be complex expressions. After matching "A", the + matcher processes "BC"; if that fails, causing a backtrack, (*ACCEPT) + is triggered and the match succeeds. In both cases, all but C is cap- + tured. Whereas (*COMMIT) (see below) means "fail on backtrack", a re- peated (*ACCEPT) of this type means "succeed on backtrack".
- Warning: (*ACCEPT) should not be used within a script run group, be- - cause it causes an immediate exit from the group, bypassing the script + Warning: (*ACCEPT) should not be used within a script run group, be- + cause it causes an immediate exit from the group, bypassing the script run checking. (*FAIL) or (*FAIL:NAME) - This verb causes a matching failure, forcing backtracking to occur. It - may be abbreviated to (*F). It is equivalent to (?!) but easier to + This verb causes a matching failure, forcing backtracking to occur. It + may be abbreviated to (*F). It is equivalent to (?!) but easier to read. The Perl documentation notes that it is probably useful only when combined with (?{}) or (??{}). Those are, of course, Perl features that - are not present in PCRE2. The nearest equivalent is the callout fea- + are not present in PCRE2. The nearest equivalent is the callout fea- ture, as for example in this pattern: a+(?C)(*FAIL) - A match with the string "aaaa" always fails, but the callout is taken + A match with the string "aaaa" always fails, but the callout is taken before each backtrack happens (in this example, 10 times). - (*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*AC- - CEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is + (*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*AC- + CEPT) and (*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before the verb acts. Recording which path was taken - There is one verb whose main purpose is to track how a match was ar- - rived at, though it also has a secondary use in conjunction with ad- + There is one verb whose main purpose is to track how a match was ar- + rived at, though it also has a secondary use in conjunction with ad- vancing the match starting point (see (*SKIP) below). (*MARK:NAME) or (*:NAME) - A name is always required with this verb. For all the other backtrack- + A name is always required with this verb. 
For all the other backtrack- ing control verbs, a NAME argument is optional. - When a match succeeds, the name of the last-encountered mark name on + When a match succeeds, the name of the last-encountered mark name on the matching path is passed back to the caller as described in the sec- tion entitled "Other information about the match" in the pcre2api docu- - mentation. This applies to all instances of (*MARK) and other verbs, + mentation. This applies to all instances of (*MARK) and other verbs, including those inside assertions and atomic groups. However, there are - differences in those cases when (*MARK) is used in conjunction with + differences in those cases when (*MARK) is used in conjunction with (*SKIP) as described below. - The mark name that was last encountered on the matching path is passed - back. A verb without a NAME argument is ignored for this purpose. Here - is an example of pcre2test output, where the "mark" modifier requests + The mark name that was last encountered on the matching path is passed + back. A verb without a NAME argument is ignored for this purpose. Here + is an example of pcre2test output, where the "mark" modifier requests the retrieval and outputting of (*MARK) data: re> /X(*MARK:A)Y|X(*MARK:B)Z/mark @@ -9668,30 +10197,31 @@ BACKTRACKING CONTROL MK: B The (*MARK) name is tagged with "MK:" in this output, and in this exam- - ple it indicates which of the two alternatives matched. This is a more - efficient way of obtaining this information than putting each alterna- + ple it indicates which of the two alternatives matched. This is a more + efficient way of obtaining this information than putting each alterna- tive in its own capturing parentheses. 
- If a verb with a name is encountered in a positive assertion that is - true, the name is recorded and passed back if it is the last-encoun- + If a verb with a name is encountered in a positive assertion that is + true, the name is recorded and passed back if it is the last-encoun- tered. This does not happen for negative assertions or failing positive assertions. - After a partial match or a failed match, the last encountered name in + After a partial match or a failed match, the last encountered name in the entire match process is returned. For example: re> /X(*MARK:A)Y|X(*MARK:B)Z/mark data> XP No match, mark = B - Note that in this unanchored example the mark is retained from the + Note that in this unanchored example the mark is retained from the match attempt that started at the letter "X" in the subject. Subsequent match attempts starting at "P" and then with an empty string do not get as far as the (*MARK) item, but nevertheless do not reset it. - If you are interested in (*MARK) values after failed matches, you - should probably set the PCRE2_NO_START_OPTIMIZE option (see above) to - ensure that the match is always attempted. + If you are interested in (*MARK) values after failed matches, you + should probably either set the PCRE2_NO_START_OPTIMIZE option or call + pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see + above) to ensure that the match is always attempted. Verbs that act after backtracking @@ -9699,11 +10229,11 @@ BACKTRACKING CONTROL tinues with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, back- tracking cannot pass to the left of the verb. However, when one of - these verbs appears inside an atomic group or in a lookaround assertion - that is true, its effect is confined to that group, because once the - group has been matched, there is never any backtracking into it. 
Back- - tracking from beyond an assertion or an atomic group ignores the entire - group, and seeks a preceding backtracking point. + these verbs appears inside an atomic group or in an atomic lookaround + assertion that is true, its effect is confined to that group, because + once the group has been matched, there is never any backtracking into + it. Backtracking from beyond an atomic assertion or group ignores the + entire group, and seeks a preceding backtracking point. These verbs differ in exactly what kind of failure occurs when back- tracking reaches them. The behaviour described below is what happens @@ -9960,21 +10490,23 @@ BACKTRACKING CONTROL (*MARK) name that is set in an assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern. - PCRE2 now supports non-atomic positive assertions, as described in the - section entitled "Non-atomic assertions" above. These assertions must - be standalone (not used as conditions). They are not Perl-compatible. - For these assertions, a later backtrack does jump back into the asser- - tion, and therefore verbs such as (*COMMIT) can be triggered by back- - tracks from later in the pattern. + PCRE2 now supports non-atomic positive assertions and also "scan sub- + string" assertions, as described in the sections entitled "Non-atomic + assertions" and "Scan substring assertions" above. These assertions + must be standalone (not used as conditions). They are not Perl-compati- + ble. For these assertions, a later backtrack does jump back into the + assertion, and therefore verbs such as (*COMMIT) can be triggered by + backtracks from later in the pattern. The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion - to be false, and a negative assertion to be true. + to be false, and a negative assertion to be true. This behaviour dif- + fers from Perl when the assertion has only one branch. 
- The other backtracking verbs are not treated specially if they appear - in a standalone positive assertion. In a conditional positive asser- + The other backtracking verbs are not treated specially if they appear + in a standalone positive assertion. In a conditional positive asser- tion, backtracking (from within the assertion) into (*COMMIT), (*SKIP), - or (*PRUNE) causes the condition to be false. However, for both stand- + or (*PRUNE) causes the condition to be false. However, for both stand- alone and conditional negative assertions, backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes the assertion to be true, without consider- ing any further alternative branches. @@ -9984,26 +10516,68 @@ BACKTRACKING CONTROL These behaviours occur whether or not the group is called recursively. (*ACCEPT) in a group called as a subroutine causes the subroutine match - to succeed without any further processing. Matching then continues af- - ter the subroutine call. Perl documents this behaviour. Perl's treat- + to succeed without any further processing. Matching then continues af- + ter the subroutine call. Perl documents this behaviour. Perl's treat- ment of the other verbs in subroutines is different in some cases. - (*FAIL) in a group called as a subroutine has its normal effect: it + (*FAIL) in a group called as a subroutine has its normal effect: it forces an immediate backtrack. - (*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail - when triggered by being backtracked to in a group called as a subrou- + (*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail + when triggered by being backtracked to in a group called as a subrou- tine. There is then a backtrack at the outer level. (*THEN), when triggered, skips to the next alternative in the innermost - enclosing group that has alternatives (its normal behaviour). However, + enclosing group that has alternatives (its normal behaviour). 
However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. +EBCDIC ENVIRONMENTS + + Differences in the way PCRE behaves when it is running in an EBCDIC en- + vironment are covered in this section. + + Escape sequences + + When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. + \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. + The \c escape is processed as specified for Perl in the perlebcdic doc- + ument. The only characters that are allowed after \c are A-Z, a-z, or + one of @, [, \, ], ^, _, or ?. Any other character provokes a compile- + time error. The sequence \c@ encodes character code 0; after \c the + letters (in either case) encode characters 1-26 (hex 01 to hex 1A); [, + \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and \c? be- + comes either 255 (hex FF) or 95 (hex 5F). + + Thus, apart from \c?, these escapes generate the same character code + values as they do in an ASCII or Unicode environment, though the mean- + ings of the values mostly differ. For example, \cG always generates + code value 7, which is BEL in ASCII but DEL in EBCDIC. + + The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, + but because 127 is not a control character in EBCDIC, Perl makes it + generate the APC character. Unfortunately, there are several variants + of EBCDIC. In most of them the APC character has the value 255 (hex + FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If + certain other characters have POSIX-BC values, PCRE2 makes \c? generate + 95; otherwise it generates 255. + + Character classes + + In character classes there is a special case in EBCDIC environments for + ranges whose end points are both specified as literal letters in the + same case. For compatibility with Perl, EBCDIC code points within the + range that are not letters are omitted. 
For example, [h-k] matches only + four characters, even though the EBCDIC codes for h and k are 0x88 and + 0x92, a range of 11 code points. However, if the range is specified nu- + merically, for example, [\x88-\x92] or [h-\x92], all code points are + included. + + SEE ALSO - pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), + pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3). @@ -10016,15 +10590,14 @@ AUTHOR REVISION - Last updated: 04 June 2024 + Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.44 04 June 2024 PCRE2PATTERN(3) +PCRE2 10.45 27 November 2024 PCRE2PATTERN(3) ------------------------------------------------------------------------------ - PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3) @@ -10272,15 +10845,14 @@ AUTHOR REVISION - Last updated: 27 July 2022 + Last updated: 06 December 2022 Copyright (c) 1997-2022 University of Cambridge. -PCRE2 10.41 27 July 2022 PCRE2PERFORM(3) +PCRE2 10.45 06 December 2022 PCRE2PERFORM(3) ------------------------------------------------------------------------------ - PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3) @@ -10431,7 +11003,7 @@ COMPILING A PATTERN When a pattern that is compiled with this flag is passed to pcre2_regexec() for matching, the nmatch and pmatch arguments are ig- - nored, and no captured strings are returned. Versions of the PCRE li- + nored, and no captured strings are returned. Versions of the PCRE2 li- brary prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile op- tion, but this no longer happens because it disables the use of back- references. @@ -10631,15 +11203,14 @@ AUTHOR REVISION - Last updated: 19 January 2024 + Last updated: 27 November 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE2 10.43 19 January 2024 PCRE2POSIX(3) +PCRE2 10.45 27 November 2024 PCRE2POSIX(3) ------------------------------------------------------------------------------ - PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3) @@ -10725,13 +11296,12 @@ AUTHOR REVISION - Last updated: 02 February 2016 + Last updated: 14 November 2023 Copyright (c) 1997-2016 University of Cambridge. -PCRE2 10.22 02 February 2016 PCRE2SAMPLE(3) +PCRE2 10.45 14 November 2023 PCRE2SAMPLE(3) ------------------------------------------------------------------------------ - PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3) @@ -10917,15 +11487,14 @@ AUTHOR REVISION - Last updated: 27 June 2018 + Last updated: 19 January 2024 Copyright (c) 1997-2018 University of Cambridge. -PCRE2 10.32 27 June 2018 PCRE2SERIALIZE(3) +PCRE2 10.45 19 January 2024 PCRE2SERIALIZE(3) ------------------------------------------------------------------------------ - PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3) @@ -10935,9 +11504,11 @@ NAME PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY - The full syntax and semantics of the regular expressions that are sup- - ported by PCRE2 are described in the pcre2pattern documentation. This - document contains a quick-reference summary of the syntax. + The full syntax and semantics of the regular expression patterns that + are supported by PCRE2 are described in the pcre2pattern documentation. + This document contains a quick-reference summary of the pattern syntax + followed by the syntax of replacement strings in substitution function. + The full description of the latter is in the pcre2api documentation. QUOTING @@ -10947,22 +11518,24 @@ QUOTING Note that white space inside \Q...\E is always treated as literal, even if PCRE2_EXTENDED is set, causing most other white space to be ignored. + Note also that PCRE2's handling of \Q...\E has some differences from + Perl's. See the pcre2pattern documentation for details. 
BRACED ITEMS - With one exception, wherever brace characters { and } are required to - enclose data for constructions such as \g{2} or \k{name}, space and/or - horizontal tab characters that follow { or precede } are allowed and + With one exception, wherever brace characters { and } are required to + enclose data for constructions such as \g{2} or \k{name}, space and/or + horizontal tab characters that follow { or precede } are allowed and are ignored. In the case of quantifiers, they may also appear before or - after the comma. The exception is \u{...} which is not Perl-compatible + after the comma. The exception is \u{...} which is not Perl-compatible and is recognized only when PCRE2_EXTRA_ALT_BSUX is set. This is an EC- MAScript compatibility feature, and follows ECMAScript's behaviour. ESCAPED CHARACTERS - This table applies to ASCII and Unicode environments. An unrecognized + This table applies to ASCII and Unicode environments. An unrecognized escape sequence causes an error. \a alarm, that is, the BEL character (hex 07) @@ -10979,6 +11552,11 @@ ESCAPED CHARACTERS \xhh character with hex code hh \x{hh..} character with hex code hh.. + \N{U+hh..} is synonymous with \x{hh..} but is not supported in environ- + ments that use EBCDIC code (mainly IBM mainframes). Note that \N not + followed by an opening curly bracket has a different meaning (see be- + low). + If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized: @@ -10986,20 +11564,17 @@ ESCAPED CHARACTERS \uhhhh character with hex code hhhh \u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX - When \x is not followed by {, from zero to two hexadecimal digits are - read, but in ALT_BSUX mode \x must be followed by two hexadecimal dig- - its to be recognized as a hexadecimal escape; otherwise it matches a - literal "x". 
Likewise, if \u (in ALT_BSUX mode) is not followed by - four hexadecimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex - digits in curly brackets, it matches a literal "u". + When \x is not followed by {, one or two hexadecimal digits are read, + but in ALT_BSUX mode \x must be followed by two hexadecimal digits to + be recognized as a hexadecimal escape; otherwise it matches a literal + "x". Likewise, if \u (in ALT_BSUX mode) is not followed by four hexa- + decimal digits or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in + curly brackets, it matches a literal "u". Note that \0dd is always an octal code. The treatment of backslash fol- - lowed by a non-zero digit is complicated; for details see the section - "Non-printing characters" in the pcre2pattern documentation, where de- - tails of escape processing in EBCDIC environments are also given. - \N{U+hh..} is synonymous with \x{hh..} in PCRE2 but is not supported in - EBCDIC environments. Note that \N not followed by an opening curly - bracket has a different meaning (see below). + lowed by a non-zero digit is complicated; for details see the section + "Non-printing characters" in the pcre2pattern documentation, where de- + tails of escape processing in EBCDIC environments are also given. CHARACTER TYPES @@ -11023,23 +11598,24 @@ CHARACTER TYPES \W a "non-word" character \X a Unicode extended grapheme cluster - \C is dangerous because it may leave the current matching point in the + \C is dangerous because it may leave the current matching point in the middle of a UTF-8 or UTF-16 character. The application can lock out the - use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also + use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2 with the use of \C permanently disabled. 
- By default, \d, \s, and \w match only ASCII characters, even in UTF-8 + By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit libraries. However, if locale-specific - matching is happening, \s and \w may also match characters with code + matching is happening, \s and \w may also match characters with code points in the range 128-255. If the PCRE2_UCP option is set, the behav- iour of these escape sequences is changed to use Unicode properties and - they match many more characters, but there are some option settings - that can restrict individual sequences to matching only ASCII charac- + they match many more characters, but there are some option settings + that can restrict individual sequences to matching only ASCII charac- ters. Property descriptions in \p and \P are matched caselessly; hyphens, un- - derscores, and white space are ignored, in accordance with Unicode's - "loose matching" rules. + derscores, and ASCII white space characters are ignored, in accordance + with Unicode's "loose matching" rules. For example, \p{Bidi_Class=al} + is the same as \p{ bidi class = AL }. GENERAL CATEGORY PROPERTIES FOR \p and \P @@ -11052,13 +11628,13 @@ GENERAL CATEGORY PROPERTIES FOR \p and \P Cs Surrogate L Letter + Lc Cased letter, the union of Ll, Lu, and Lt + L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter - Lc Ll, Lu, or Lt - L& Ll, Lu, or Lt M Mark Mc Spacing mark @@ -11090,6 +11666,9 @@ GENERAL CATEGORY PROPERTIES FOR \p and \P Zp Paragraph separator Zs Space separator + From release 10.45, when caseless matching is set, Ll, Lu, and Lt are + all equivalent to Lc. + PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P @@ -11106,9 +11685,9 @@ PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P BINARY PROPERTIES FOR \p AND \P - Unicode defines a number of binary properties, that is, properties - whose only values are true or false. 
You can obtain a list of those - that are recognized by \p and \P, along with their abbreviations, by + Unicode defines a number of binary properties, that is, properties + whose only values are true or false. You can obtain a list of those + that are recognized by \p and \P, along with their abbreviations, by running this command: pcre2test -LP @@ -11116,8 +11695,8 @@ BINARY PROPERTIES FOR \p AND \P SCRIPT MATCHING WITH \p AND \P - Many script names and their 4-letter abbreviations are recognized in - \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P + Many script names and their 4-letter abbreviations are recognized in + \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of course). You can obtain a list of these scripts by running this com- mand: @@ -11153,7 +11732,7 @@ THE BIDI_CLASS PROPERTY FOR \p AND \P RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space CHARACTER CLASSES @@ -11179,10 +11758,50 @@ CHARACTER CLASSES word same as \w xdigit hexadecimal digit - In PCRE2, POSIX character set names recognize only ASCII characters by - default, but some of them use Unicode properties if PCRE2_UCP is set. + In PCRE2, POSIX character set names recognize only ASCII characters by + default, but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class. + When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes + may be used, allowing nested character classes, combined using set op- + erators. + + [x&&[^y]] UTS#18 extended character class + + x||y set union (OR) + x&&y set intersection (AND) + x--y set difference (AND NOT) + x~~y set symmetric difference (XOR) + + +PERL EXTENDED CHARACTER CLASSES + + (?[...]) Perl extended character class + (?[\p{Thai} & \p{Nd}]) operators; whitespace ignored + (?[(x - y) & z]) parentheses for grouping + + (?[ [^3] & \p{Nd} ]) [...] 
is a nested ordinary class + (?[ [:alpha:] - [z] ]) POSIX set is allowed outside [...] + (?[ \d - [3] ]) backslash-escaped set is allowed outside + [...] + (?[ !\n & [:ascii:] ]) backslash-escaped character is allowed out- + side [...] + all other characters or ranges must be enclosed + in [...] + + x|y, x+y set union (OR) + x&y set intersection (AND) + x-y set difference (AND NOT) + x^y set symmetric difference (XOR) + !x set complement (NOT) + + Inside a Perl extended character class, [...] switches mode to be in- + terpreted as an ordinary character class. Outside of a nested [...], + the only items permitted are backslash-escapes, POSIX sets, operators, + and parentheses. Inside a nested ordinary class, ^ has its usual mean- + ing (inverts the class when used as the first character); outside of a + nested class, ^ is the XOR operator. + QUANTIFIERS @@ -11289,7 +11908,7 @@ OPTION SETTING (?^) unset imnrsx options (?aP) implies (?aT) as well, though this has no additional effect. How- - ever, it means that (?-aP) is really (?-PT) which disables all ASCII + ever, it means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes. Unsetting x or xx unsets both. Several options may be set at once, and @@ -11299,20 +11918,25 @@ OPTION SETTING capture group, for example (?i:...). The following are recognized only at the very start of a pattern or af- - ter one of the newline or \R options with similar syntax. More than one - of them may appear. For the first three, d is a decimal number. - - (*LIMIT_DEPTH=d) set the backtracking limit to d - (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes - (*LIMIT_MATCH=d) set the match limit to d - (*NOTEMPTY) set PCRE2_NOTEMPTY when matching - (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching - (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) + ter one of the newline or \R sequences or options with similar syntax. + More than one of them may appear. 
For the first three, d is a decimal + number. + + (*LIMIT_DEPTH=d) set the backtracking limit to d + (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes + (*LIMIT_MATCH=d) set the match limit to d + (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching + (*NOTEMPTY) set PCRE2_NOTEMPTY when matching + (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching + (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) - (*NO_JIT) disable JIT optimization - (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) - (*UTF) set appropriate UTF mode for the library in use - (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) + (*NO_JIT) disable JIT optimization + (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OP- + TIMIZE) + (*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching + (*UTF) set appropriate UTF mode for the library in use + (*UCP) set PCRE2_UCP (use Unicode properties for \d + etc) Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or @@ -11383,6 +12007,22 @@ NON-ATOMIC LOOKAROUND ASSERTIONS (*non_atomic_positive_lookbehind:...) ) +SUBSTRING SCAN ASSERTION + This feature is not Perl-compatible. + + (*scan_substring:(grouplist)...) scan captured substring + (*scs:(grouplist)...) scan captured substring + + The comma-separated list may identify groups in any of the following + ways: + + n absolute reference + +n relative reference + -n relative reference + <name> name + 'name' name + + SCRIPT RUNS (*script_run:...) ) script run, can be backtracked into @@ -11444,16 +12084,16 @@ CONDITIONAL PATTERNS (?(VERSION[>]=n.m) test PCRE2 version (?(assert) assertion condition - Note the ambiguity of (?(R) and (?(Rn) which might be named reference - conditions or recursion tests. + Note the ambiguity of (?(R) and (?(Rn) which might be named reference + conditions or recursion tests.
Such a condition is interpreted as a + Note the ambiguity of (?(R) and (?(Rn) which might be named reference + conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists. BACKTRACKING CONTROL - All backtracking control verbs may be in the form (*VERB:NAME). For - (*MARK) the name is mandatory, for the others it is optional. (*SKIP) - changes its behaviour if :NAME is present. The others just set a name + All backtracking control verbs may be in the form (*VERB:NAME). For + (*MARK) the name is mandatory, for the others it is optional. (*SKIP) + changes its behaviour if :NAME is present. The others just set a name for passing back to the caller, but this is not a name that (*SKIP) can see. The following act immediately they are reached: @@ -11461,7 +12101,7 @@ BACKTRACKING CONTROL (*FAIL) force backtrack; synonym (*F) (*MARK:NAME) set name to be passed back; synonym (*:NAME) - The following act only when a subsequent match failure causes a back- + The following act only when a subsequent match failure causes a back- track to reach them. They all force a match failure, but they differ in what happens afterwards. Those that advance the start-of-match point do so only if the pattern is not anchored. @@ -11473,7 +12113,7 @@ BACKTRACKING CONTROL (*MARK:NAME); if not found, the (*SKIP) is ignored (*THEN) local failure, backtrack to next alternation - The effect of one of these verbs in a group called as a subroutine is + The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call. @@ -11484,14 +12124,61 @@ CALLOUTS (?C"text") callout with string data The allowed string delimiters are ` ' " ^ % # $ (which are the same for - the start and the end), and the starting delimiter { matched with the - ending delimiter }. To encode the ending delimiter within the string, + the start and the end), and the starting delimiter { matched with the + ending delimiter }. 
To encode the ending delimiter within the string, double it. +REPLACEMENT STRINGS + + If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for + pcre2_substitute() is not interpreted. Otherwise, by default, the only + special character is the dollar character in one of the following + forms: + + $$ insert a dollar character + $n or ${n} insert the contents of group n + $ insert the contents of named group + $0 or $& insert the entire matched substring + $` insert the substring that precedes the match + $' insert the substring that follows the match + $_ insert the entire input string + $*MARK or ${*MARK} insert a control verb name + + For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is + set, there is additional interpretation: + + 1. Backslash is an escape character, and the forms described in "ES- + CAPED CHARACTERS" above are recognized. Also: + + \Q...\E can be used to suppress interpretation + \l force the next character to lower case + \u force the next character to upper case + \L force subsequent characters to lower case + \U force subsequent characters to upper case + \u\L force next character to upper case, then all lower + \l\U force next character to lower case, then all upper + \E end \L or \U case forcing + \b backspace character (note: as in character class in pat- + tern) + \v vertical tab character (note: not the same as in a pattern) + + 2. The Python form \g, where the angle brackets are part of the syn- + tax and n is either a group name or a number, is recognized as an al- + ternative way of inserting the contents of a group, for example \g<3>. + + 3. Capture substitution supports the following additional forms: + + ${n:-string} default for unset group + ${n:+string1:string2} values for set/unset group + + The substitution strings themselves are expanded. Backslash can be used + to escape colons and closing curly brackets. 
+ + SEE ALSO - pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), + pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3). @@ -11504,20 +12191,19 @@ AUTHOR REVISION - Last updated: 12 October 2023 - Copyright (c) 1997-2023 University of Cambridge. + Last updated: 27 November 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 12 October 2023 PCRE2SYNTAX(3) +PCRE2 10.45 27 November 2024 PCRE2SYNTAX(3) ------------------------------------------------------------------------------ - PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3) NAME - PCRE - Perl-compatible regular expressions (revised API) + PCRE2 - Perl-compatible regular expressions (revised API) UNICODE AND UTF SUPPORT @@ -11554,7 +12240,7 @@ UNICODE PROPERTY SUPPORT ting. The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal num- - ber, the derived properties Any and LC (synonym L&), the Unicode script + ber, the derived properties Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties. @@ -11647,173 +12333,203 @@ UNICODE CASE-EQUIVALENCE in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. + Without PCRE2_EXTRA_CASELESS_RESTRICT: + 'k' = 'K' = U+212A (Kelvin sign) + 's' = 'S' = U+017F (long S) + With PCRE2_EXTRA_CASELESS_RESTRICT: + 'k' = 'K' + U+212A (Kelvin sign) only case-equivalent to itself + 's' = 'S' + U+017F (long S) only case-equivalent to itself + + One language family, Turkish and Azeri, has its own case-insensitivity + rules, which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. + This alters the behaviour of the 'i', 'I', U+0130 (capital I with dot + above), and U+0131 (small dotless i) characters. 
+ + Without PCRE2_EXTRA_TURKISH_CASING: + 'i' = 'I' + U+0130 (capital I with dot above) only case-equivalent to itself + U+0131 (small dotless i) only case-equivalent to itself + With PCRE2_EXTRA_TURKISH_CASING: + 'i' = U+0130 (capital I with dot above) + U+0131 (small dotless i) = 'I' + + It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and + PCRE2_EXTRA_TURKISH_CASING together. + + From release 10.45 the Unicode letter properties Lu (upper case), Ll + (lower case), and Lt (title case) are all treated as Lc (cased letter) + when caseless matching is set by the PCRE2_CASELESS option or (?i) + within the pattern. + SCRIPT RUNS - The pattern constructs (*script_run:...) and (*atomic_script_run:...), - with synonyms (*sr:...) and (*asr:...), verify that the string matched - within the parentheses is a script run. In concept, a script run is a - sequence of characters that are all from the same Unicode script. How- + The pattern constructs (*script_run:...) and (*atomic_script_run:...), + with synonyms (*sr:...) and (*asr:...), verify that the string matched + within the parentheses is a script run. In concept, a script run is a + sequence of characters that are all from the same Unicode script. How- ever, because some scripts are commonly used together, and because some - diacritical and other marks are used with multiple scripts, it is not + diacritical and other marks are used with multiple scripts, it is not that simple. Every Unicode character has a Script property, mostly with a value cor- - responding to the name of a script, such as Latin, Greek, or Cyrillic. + responding to the name of a script, such as Latin, Greek, or Cyrillic. There are also three special values: "Unknown" is used for code points that have not been assigned, and also - for the surrogate code points. 
In the PCRE2 32-bit library, characters - whose code points are greater than the Unicode maximum (U+10FFFF), - which are accessible only in non-UTF mode, are assigned the Unknown + for the surrogate code points. In the PCRE2 32-bit library, characters + whose code points are greater than the Unicode maximum (U+10FFFF), + which are accessible only in non-UTF mode, are assigned the Unknown script. - "Common" is used for characters that are used with many scripts. These - include punctuation, emoji, mathematical, musical, and currency sym- + "Common" is used for characters that are used with many scripts. These + include punctuation, emoji, mathematical, musical, and currency sym- bols, and the ASCII digits 0 to 9. - "Inherited" is used for characters such as diacritical marks that mod- + "Inherited" is used for characters such as diacritical marks that mod- ify a previous character. These are considered to take on the script of the character that they modify. - Some Inherited characters are used with many scripts, but many of them - are only normally used with a small number of scripts. For example, + Some Inherited characters are used with many scripts, but many of them + are only normally used with a small number of scripts. For example, U+102E0 (Coptic Epact thousands mark) is used only with Arabic and Cop- - tic. In order to make it possible to check this, a Unicode property + tic. In order to make it possible to check this, a Unicode property called Script Extension exists. Its value is a list of scripts that ap- ply to the character. For the majority of characters, the list contains - just one script, the same one as the Script property. However, for - characters such as U+102E0 more than one Script is listed. There are - also some Common characters that have a single, non-Common script in + just one script, the same one as the Script property. However, for + characters such as U+102E0 more than one Script is listed. 
There are + also some Common characters that have a single, non-Common script in their Script Extension list. The next section describes the basic rules for deciding whether a given - string of characters is a script run. Note, however, that there are - some special cases involving the Chinese Han script, and an additional - constraint for decimal digits. These are covered in subsequent sec- + string of characters is a script run. Note, however, that there are + some special cases involving the Chinese Han script, and an additional + constraint for decimal digits. These are covered in subsequent sec- tions. Basic script run rules A string that is less than two characters long is a script run. This is - the only case in which an Unknown character can be part of a script - run. Longer strings are checked using only the Script Extensions prop- + the only case in which an Unknown character can be part of a script + run. Longer strings are checked using only the Script Extensions prop- erty, not the basic Script property. - If a character's Script Extension property is the single value "Inher- + If a character's Script Extension property is the single value "Inher- ited", it is always accepted as part of a script run. This is also true - for the property "Common", subject to the checking of decimal digits + for the property "Common", subject to the checking of decimal digits described below. All the remaining characters in a script run must have - at least one script in common in their Script Extension lists. In set- + at least one script in common in their Script Extension lists. In set- theoretic terminology, the intersection of all the sets of scripts must not be empty. - A simple example is an Internet name such as "google.com". The letters + A simple example is an Internet name such as "google.com". The letters are all in the Latin script, and the dot is Common, so this string is a script run. 
However, the Cyrillic letter "o" looks exactly the same as - the Latin "o"; a string that looks the same, but with Cyrillic "o"s is + the Latin "o"; a string that looks the same, but with Cyrillic "o"s is not a script run. - More interesting examples involve characters with more than one script + More interesting examples involve characters with more than one script in their Script Extension. Consider the following characters: U+060C Arabic comma U+06D4 Arabic full stop - The first has the Script Extension list Arabic, Hanifi Rohingya, Syr- - iac, and Thaana; the second has just Arabic and Hanifi Rohingya. Both - of them could appear in script runs of either Arabic or Hanifi Ro- - hingya. The first could also appear in Syriac or Thaana script runs, + The first has the Script Extension list Arabic, Hanifi Rohingya, Syr- + iac, and Thaana; the second has just Arabic and Hanifi Rohingya. Both + of them could appear in script runs of either Arabic or Hanifi Ro- + hingya. The first could also appear in Syriac or Thaana script runs, but the second could not. The Chinese Han script - The Chinese Han script is commonly used in conjunction with other - scripts for writing certain languages. Japanese uses the Hiragana and - Katakana scripts together with Han; Korean uses Hangul and Han; Tai- - wanese Mandarin uses Bopomofo and Han. These three combinations are - treated as special cases when checking script runs and are, in effect, - "virtual scripts". Thus, a script run may contain a mixture of Hira- - gana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture - of Bopomofo and Han, but not, for example, a mixture of Hangul and - Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Stan- - dard 39 ("Unicode Security Mechanisms", http://unicode.org/re- + The Chinese Han script is commonly used in conjunction with other + scripts for writing certain languages. 
Japanese uses the Hiragana and + Katakana scripts together with Han; Korean uses Hangul and Han; Tai- + wanese Mandarin uses Bopomofo and Han. These three combinations are + treated as special cases when checking script runs and are, in effect, + "virtual scripts". Thus, a script run may contain a mixture of Hira- + gana, Katakana, and Han, or a mixture of Hangul and Han, or a mixture + of Bopomofo and Han, but not, for example, a mixture of Hangul and + Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical Stan- + dard 39 ("Unicode Security Mechanisms", http://unicode.org/re- ports/tr39/) in allowing such mixtures. Decimal digits - Unicode contains many sets of 10 decimal digits in different scripts, - and some scripts (including the Common script) contain more than one - set. Some of these decimal digits them are visually indistinguishable - from the common ASCII digits. In addition to the script checking de- - scribed above, if a script run contains any decimal digits, they must + Unicode contains many sets of 10 decimal digits in different scripts, + and some scripts (including the Common script) contain more than one + set. Some of these decimal digits them are visually indistinguishable + from the common ASCII digits. In addition to the script checking de- + scribed above, if a script run contains any decimal digits, they must all come from the same set of 10 adjacent characters. VALIDITY OF UTF STRINGS - When the PCRE2_UTF option is set, the strings passed as patterns and + When the PCRE2_UTF option is set, the strings passed as patterns and subjects are (by default) checked for validity on entry to the relevant functions. If an invalid UTF string is passed, a negative error code is - returned. The code unit offset to the offending character can be ex- - tracted from the match data block by calling pcre2_get_startchar(), + returned. 
The code unit offset to the offending character can be ex- + tracted from the match data block by calling pcre2_get_startchar(), which is used for this purpose after a UTF error. - In some situations, you may already know that your strings are valid, - and therefore want to skip these checks in order to improve perfor- - mance, for example in the case of a long subject string that is being - scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com- - pile time or at match time, PCRE2 assumes that the pattern or subject + In some situations, you may already know that your strings are valid, + and therefore want to skip these checks in order to improve perfor- + mance, for example in the case of a long subject string that is being + scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com- + pile time or at match time, PCRE2 assumes that the pattern or subject it is given (respectively) contains only valid UTF code unit sequences. - If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the - result is undefined and your program may crash or loop indefinitely or - give incorrect results. There is, however, one mode of matching that - can handle invalid UTF subject strings. This is enabled by passing - PCRE2_MATCH_INVALID_UTF to pcre2_compile() and is discussed below in - the next section. The rest of this section covers the case when + If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the + result is undefined and your program may crash or loop indefinitely or + give incorrect results. There is, however, one mode of matching that + can handle invalid UTF subject strings. This is enabled by passing + PCRE2_MATCH_INVALID_UTF to pcre2_compile() and is discussed below in + the next section. The rest of this section covers the case when PCRE2_MATCH_INVALID_UTF is not set. - Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF - check for the pattern; it does not also apply to subject strings. 
If - you want to disable the check for a subject string you must pass this + Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF + check for the pattern; it does not also apply to subject strings. If + you want to disable the check for a subject string you must pass this same option to pcre2_match() or pcre2_dfa_match(). UTF-16 and UTF-32 strings can indicate their endianness by special code - knows as a byte-order mark (BOM). The PCRE2 functions do not handle + knows as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order. - Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any + Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any other processing takes place. In the case of pcre2_match() and - pcre2_dfa_match() calls with a non-zero starting offset, the check is + pcre2_dfa_match() calls with a non-zero starting offset, the check is applied only to that part of the subject that could be inspected during - matching, and there is a check that the starting offset points to the - first code unit of a character or to the end of the subject. If there - are no lookbehind assertions in the pattern, the check starts at the - starting offset. Otherwise, it starts at the length of the longest - lookbehind before the starting offset, or at the start of the subject - if there are not that many characters before the starting offset. Note + matching, and there is a check that the starting offset points to the + first code unit of a character or to the end of the subject. If there + are no lookbehind assertions in the pattern, the check starts at the + starting offset. Otherwise, it starts at the length of the longest + lookbehind before the starting offset, or at the start of the subject + if there are not that many characters before the starting offset. Note that the sequences \b and \B are one-character lookbehinds. 
- In addition to checking the format of the string, there is a check to + In addition to checking the format of the string, there is a check to ensure that all code points lie in the range U+0 to U+10FFFF, excluding - the surrogate area. The so-called "non-character" code points are not + the surrogate area. The so-called "non-character" code points are not excluded because Unicode corrigendum #9 makes it clear that they should not be. - Characters in the "Surrogate Area" of Unicode are reserved for use by - UTF-16, where they are used in pairs to encode code points with values - greater than 0xFFFF. The code points that are encoded by UTF-16 pairs - are available independently in the UTF-8 and UTF-32 encodings. (In - other words, the whole surrogate thing is a fudge for UTF-16 which un- + Characters in the "Surrogate Area" of Unicode are reserved for use by + UTF-16, where they are used in pairs to encode code points with values + greater than 0xFFFF. The code points that are encoded by UTF-16 pairs + are available independently in the UTF-8 and UTF-32 encodings. (In + other words, the whole surrogate thing is a fudge for UTF-16 which un- fortunately messes up UTF-8 and UTF-32.) - Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error - that is given if an escape sequence for an invalid Unicode code point - is encountered in the pattern. If you want to allow escape sequences - such as \x{d800} (a surrogate code point) you can set the PCRE2_EX- - TRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible - only in UTF-8 and UTF-32 modes, because these values are not repre- + Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error + that is given if an escape sequence for an invalid Unicode code point + is encountered in the pattern. If you want to allow escape sequences + such as \x{d800} (a surrogate code point) you can set the PCRE2_EX- + TRA_ALLOW_SURROGATE_ESCAPES extra option. 
However, this is possible + only in UTF-8 and UTF-32 modes, because these values are not repre- sentable in UTF-16. Errors in UTF-8 strings @@ -11826,10 +12542,10 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR4 PCRE2_ERROR_UTF8_ERR5 - The string ends with a truncated UTF-8 character; the code specifies - how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 - characters to be no longer than 4 bytes, the encoding scheme (origi- - nally defined by RFC 2279) allows for up to 6 bytes, and this is + The string ends with a truncated UTF-8 character; the code specifies + how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 + characters to be no longer than 4 bytes, the encoding scheme (origi- + nally defined by RFC 2279) allows for up to 6 bytes, and this is checked first; hence the possibility of 4 or 5 missing bytes. PCRE2_ERROR_UTF8_ERR6 @@ -11839,13 +12555,13 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR10 The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of - the character do not have the binary value 0b10 (that is, either the + the character do not have the binary value 0b10 (that is, either the most significant bit is 0, or the next bit is 1). PCRE2_ERROR_UTF8_ERR11 PCRE2_ERROR_UTF8_ERR12 - A character that is valid by the RFC 2279 rules is either 5 or 6 bytes + A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; these code points are excluded by RFC 3629. PCRE2_ERROR_UTF8_ERR13 @@ -11855,8 +12571,8 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR14 - A 3-byte character has a value in the range 0xd800 to 0xdfff; this - range of code points are reserved by RFC 3629 for use with UTF-16, and + A 3-byte character has a value in the range 0xd800 to 0xdfff; this + range of code points are reserved by RFC 3629 for use with UTF-16, and so are excluded from UTF-8. 
PCRE2_ERROR_UTF8_ERR15 @@ -11865,26 +12581,26 @@ VALIDITY OF UTF STRINGS PCRE2_ERROR_UTF8_ERR18 PCRE2_ERROR_UTF8_ERR19 - A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes - for a value that can be represented by fewer bytes, which is invalid. - For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- + A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes + for a value that can be represented by fewer bytes, which is invalid. + For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- rect coding uses just one byte. PCRE2_ERROR_UTF8_ERR20 The two most significant bits of the first byte of a character have the - binary value 0b10 (that is, the most significant bit is 1 and the sec- - ond is 0). Such a byte can only validly occur as the second or subse- + binary value 0b10 (that is, the most significant bit is 1 and the sec- + ond is 0). Such a byte can only validly occur as the second or subse- quent byte of a multi-byte character. PCRE2_ERROR_UTF8_ERR21 - The first byte of a character has the value 0xfe or 0xff. These values + The first byte of a character has the value 0xfe or 0xff. These values can never occur in a valid UTF-8 string. Errors in UTF-16 strings - The following negative error codes are given for invalid UTF-16 + The following negative error codes are given for invalid UTF-16 strings: PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string @@ -11894,7 +12610,7 @@ VALIDITY OF UTF STRINGS Errors in UTF-32 strings - The following negative error codes are given for invalid UTF-32 + The following negative error codes are given for invalid UTF-32 strings: PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) @@ -11904,60 +12620,60 @@ VALIDITY OF UTF STRINGS MATCHING IN INVALID UTF STRINGS You can run pattern matches on subject strings that may contain invalid - UTF sequences if you call pcre2_compile() with the PCRE2_MATCH_IN- - VALID_UTF option. 
This is supported by pcre2_match(), including JIT + UTF sequences if you call pcre2_compile() with the PCRE2_MATCH_IN- + VALID_UTF option. This is supported by pcre2_match(), including JIT matching, but not by pcre2_dfa_match(). When PCRE2_MATCH_INVALID_UTF is - set, it forces PCRE2_UTF to be set as well. Note, however, that the + set, it forces PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a valid UTF string. - If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile, - and you are not certain that your subject strings are valid UTF se- - quences, you should not make use of the JIT "fast path" function - pcre2_jit_match() because it bypasses sanity checks, including the one - for UTF validity. An invalid string may cause undefined behaviour, in- + If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile, + and you are not certain that your subject strings are valid UTF se- + quences, you should not make use of the JIT "fast path" function + pcre2_jit_match() because it bypasses sanity checks, including the one + for UTF validity. An invalid string may cause undefined behaviour, in- cluding looping, crashing, or giving the wrong answer. - Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() - generates, but if pcre2_jit_compile() is subsequently called, it does + Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() + generates, but if pcre2_jit_compile() is subsequently called, it does generate different code. If JIT is not used, the option affects the be- haviour of the interpretive code in pcre2_match(). When PCRE2_MATCH_IN- - VALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at + VALID_UTF is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time. - In this mode, an invalid code unit sequence in the subject never - matches any pattern item. It does not match dot, it does not match - \p{Any}, it does not even match negative items such as [^X]. 
A lookbe- - hind assertion fails if it encounters an invalid sequence while moving - the current point backwards. In other words, an invalid UTF code unit + In this mode, an invalid code unit sequence in the subject never + matches any pattern item. It does not match dot, it does not match + \p{Any}, it does not even match negative items such as [^X]. A lookbe- + hind assertion fails if it encounters an invalid sequence while moving + the current point backwards. In other words, an invalid UTF code unit sequence acts as a barrier which no match can cross. You can also think of this as the subject being split up into fragments - of valid UTF, delimited internally by invalid code unit sequences. The - pattern is matched fragment by fragment. The result of a successful - match, however, is given as code unit offsets in the entire subject + of valid UTF, delimited internally by invalid code unit sequences. The + pattern is matched fragment by fragment. The result of a successful + match, however, is given as code unit offsets in the entire subject string in the usual way. There are a few points to consider: - The internal boundaries are not interpreted as the beginnings or ends - of lines and so do not match circumflex or dollar characters in the + The internal boundaries are not interpreted as the beginnings or ends + of lines and so do not match circumflex or dollar characters in the pattern. - If pcre2_match() is called with an offset that points to an invalid - UTF-sequence, that sequence is skipped, and the match starts at the + If pcre2_match() is called with an offset that points to an invalid + UTF-sequence, that sequence is skipped, and the match starts at the next valid UTF character, or the end of the subject. At internal fragment boundaries, \b and \B behave in the same way as at - the beginning and end of the subject. 
For example, a sequence such as - \bWORD\b would match an instance of WORD that is surrounded by invalid + the beginning and end of the subject. For example, a sequence such as + \bWORD\b would match an instance of WORD that is surrounded by invalid UTF code units. - Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbi- - trary data, knowing that any matched strings that are returned are + Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbi- + trary data, knowing that any matched strings that are returned are valid UTF. This can be useful when searching for UTF text in executable or other binary files. - Note, however, that the 16-bit and 32-bit PCRE2 libraries process - strings as sequences of uint16_t or uint32_t code points. They cannot - find valid UTF sequences within an arbitrary string of bytes unless + Note, however, that the 16-bit and 32-bit PCRE2 libraries process + strings as sequences of uint16_t or uint32_t code points. They cannot + find valid UTF sequences within an arbitrary string of bytes unless such sequences are suitably aligned. @@ -11970,11 +12686,11 @@ AUTHOR REVISION - Last updated: 12 October 2023 - Copyright (c) 1997-2023 University of Cambridge. + Last updated: 27 November 2024 + Copyright (c) 1997-2024 University of Cambridge. -PCRE2 10.43 04 February 2023 PCRE2UNICODE(3) +PCRE2 10.45 27 November 2024 PCRE2UNICODE(3) ------------------------------------------------------------------------------ diff --git a/usr/share/doc/pcre2/pcre2grep.txt b/usr/share/doc/pcre2/pcre2grep.txt index 7914c450fcb..9e07a5a7dac 100644 --- a/usr/share/doc/pcre2/pcre2grep.txt +++ b/usr/share/doc/pcre2/pcre2grep.txt @@ -1,4 +1,3 @@ - PCRE2GREP(1) General Commands Manual PCRE2GREP(1) @@ -366,139 +365,140 @@ OPTIONS used. What constitutes a newline when reading the file is the operating system's default interpretation of \n. The --new- line option has no effect on this option. 
Trailing white - space is removed from each line, and blank lines are ignored. - An empty file contains no patterns and therefore matches - nothing. Patterns read from a file in this way may contain - binary zeros, which are treated as ordinary data characters. - - If this option is given more than once, all the specified - files are read. A data line is output if any of the patterns - match it. A file name can be given as "-" to refer to the - standard input. When -f is used, patterns specified on the - command line using -e may also be present; they are matched + space is removed from each line, and blank lines are ignored + unless the --posix-pattern-file option is also provided. An + empty file contains no patterns and therefore matches noth- + ing. Patterns read from a file in this way may contain binary + zeros, which are treated as ordinary character literals. + + If this option is given more than once, all the specified + files are read. A data line is output if any of the patterns + match it. A file name can be given as "-" to refer to the + standard input. When -f is used, patterns specified on the + command line using -e may also be present; they are matched before the file's patterns. However, no pattern is taken from - the command line; all arguments are treated as the names of + the command line; all arguments are treated as the names of paths to be searched. --file-list=filename - Read a list of files and/or directories that are to be + Read a list of files and/or directories that are to be scanned from the given file, one per line. What constitutes a - newline when reading the file is the operating system's de- - fault. Trailing white space is removed from each line, and + newline when reading the file is the operating system's de- + fault. Trailing white space is removed from each line, and blank lines are ignored. These paths are processed before any - that are listed on the command line. 
The file name can be - given as "-" to refer to the standard input. If --file and - --file-list are both specified as "-", patterns are read - first. This is useful only when the standard input is a ter- - minal, from which further lines (the list of files) can be + that are listed on the command line. The file name can be + given as "-" to refer to the standard input. If --file and + --file-list are both specified as "-", patterns are read + first. This is useful only when the standard input is a ter- + minal, from which further lines (the list of files) can be read after an end-of-file indication. If this option is given more than once, all the specified files are read. --file-offsets - Instead of showing lines or parts of lines that match, show - each match as an offset from the start of the file and a - length, separated by a comma. In this mode, --colour has no - effect, and no context is shown. That is, the -A, -B, and -C - options are ignored. If there is more than one match in a - line, each of them is shown separately. This option is mutu- - ally exclusive with --output, --line-offsets, and --only- + Instead of showing lines or parts of lines that match, show + each match as an offset from the start of the file and a + length, separated by a comma. In this mode, --colour has no + effect, and no context is shown. That is, the -A, -B, and -C + options are ignored. If there is more than one match in a + line, each of them is shown separately. This option is mutu- + ally exclusive with --output, --line-offsets, and --only- matching. --group-separator=text Output this text string instead of two hyphens between groups - of lines when -A, -B, or -C is in use. See also --no-group- + of lines when -A, -B, or -C is in use. See also --no-group- separator. -H, --with-filename - Force the inclusion of the file name at the start of output + Force the inclusion of the file name at the start of output lines when searching a single file. 
The file name is not nor- - mally shown in this case. By default, for matching lines, - the file name is followed by a colon; for context lines, a + mally shown in this case. By default, for matching lines, + the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a zero byte. If a line number is also being output, it follows the file name. When the -M option causes a - pattern to match more than one line, only the first is pre- - ceded by the file name. This option overrides any previous + pattern to match more than one line, only the first is pre- + ceded by the file name. This option overrides any previous -h, -l, or -L options. -h, --no-filename Suppress the output file names when searching multiple files. File names are normally shown when multiple files are - searched. By default, for matching lines, the file name is + searched. By default, for matching lines, the file name is followed by a colon; for context lines, a hyphen separator is used. The -Z option can be used to change the terminator to a - zero byte. If a line number is also being output, it follows + zero byte. If a line number is also being output, it follows the file name. This option overrides any previous -H, -L, or -l options. --heap-limit=number See --match-limit below. - --help Output a help message, giving brief details of the command - options and file type support, and then exit. Anything else + --help Output a help message, giving brief details of the command + options and file type support, and then exit. Anything else on the command line is ignored. - -I Ignore binary files. This is equivalent to --binary- + -I Ignore binary files. This is equivalent to --binary- files=without-match. -i, --ignore-case - Ignore upper/lower case distinctions when pattern matching. + Ignore upper/lower case distinctions when pattern matching. 
This applies when matching path names for inclusion or exclu- sion as well as when matching lines in files. --include=pattern - If any --include patterns are specified, the only files that + If any --include patterns are specified, the only files that are processed are those whose names match one of the patterns - and do not match an --exclude pattern. This option does not - affect directories, but it applies to all files, whether - listed on the command line, obtained from --file-list, or by - scanning a directory. The pattern is a PCRE2 regular expres- - sion, and is matched against the final component of the file - name, not the entire path. The -F, -w, and -x options do not - apply to this pattern. The option may be given any number of - times. If a file name matches both an --include and an --ex- - clude pattern, it is excluded. There is no short form for + and do not match an --exclude pattern. This option does not + affect directories, but it applies to all files, whether + listed on the command line, obtained from --file-list, or by + scanning a directory. The pattern is a PCRE2 regular expres- + sion, and is matched against the final component of the file + name, not the entire path. The -F, -w, and -x options do not + apply to this pattern. The option may be given any number of + times. If a file name matches both an --include and an --ex- + clude pattern, it is excluded. There is no short form for this option. --include-from=filename - Treat each non-empty line of the file as the data for an + Treat each non-empty line of the file as the data for an --include option. What constitutes a newline for this purpose - is the operating system's default. The --newline option has + is the operating system's default. The --newline option has no effect on this option. This option may be given any number of times; all the files are read. 
--include-dir=pattern - If any --include-dir patterns are specified, the only direc- - tories that are processed are those whose names match one of - the patterns and do not match an --exclude-dir pattern. This - applies to all directories, whether listed on the command - line, obtained from --file-list, or by scanning a parent di- - rectory. The pattern is a PCRE2 regular expression, and is - matched against the final component of the directory name, - not the entire path. The -F, -w, and -x options do not apply + If any --include-dir patterns are specified, the only direc- + tories that are processed are those whose names match one of + the patterns and do not match an --exclude-dir pattern. This + applies to all directories, whether listed on the command + line, obtained from --file-list, or by scanning a parent di- + rectory. The pattern is a PCRE2 regular expression, and is + matched against the final component of the directory name, + not the entire path. The -F, -w, and -x options do not apply to this pattern. The option may be given any number of times. - If a directory matches both --include-dir and --exclude-dir, + If a directory matches both --include-dir and --exclude-dir, it is excluded. There is no short form for this option. -L, --files-without-match - Instead of outputting lines from the files, just output the - names of the files that do not contain any lines that would - have been output. Each file name is output once, on a sepa- - rate line by default, but if the -Z option is set, they are - separated by zero bytes instead of newlines. This option + Instead of outputting lines from the files, just output the + names of the files that do not contain any lines that would + have been output. Each file name is output once, on a sepa- + rate line by default, but if the -Z option is set, they are + separated by zero bytes instead of newlines. This option overrides any previous -H, -h, or -l options. 
-l, --files-with-matches - Instead of outputting lines from the files, just output the + Instead of outputting lines from the files, just output the names of the files containing lines that would have been out- - put. Each file name is output once, on a separate line, but + put. Each file name is output once, on a separate line, but if the -Z option is set, they are separated by zero bytes in- - stead of newlines. Searching normally stops as soon as a - matching line is found in a file. However, if the -c (count) - option is also used, matching continues in order to obtain - the correct count, and those files that have at least one - match are listed along with their counts. Using this option - with -c is a way of suppressing the listing of files with no + stead of newlines. Searching normally stops as soon as a + matching line is found in a file. However, if the -c (count) + option is also used, matching continues in order to obtain + the correct count, and those files that have at least one + match are listed along with their counts. Using this option + with -c is a way of suppressing the listing of files with no matches that occurs with -c on its own. This option overrides any previous -H, -h, or -L options. @@ -508,130 +508,130 @@ OPTIONS input)" is used. There is no short form for this option. --line-buffered - When this option is given, non-compressed input is read and - processed line by line, and the output is flushed after each - write. By default, input is read in large chunks, unless - pcre2grep can determine that it is reading from a terminal, + When this option is given, non-compressed input is read and + processed line by line, and the output is flushed after each + write. By default, input is read in large chunks, unless + pcre2grep can determine that it is reading from a terminal, which is currently possible only in Unix-like environments or Windows. Output to terminal is normally automatically flushed - by the operating system. 
This option can be useful when the - input or output is attached to a pipe and you do not want - pcre2grep to buffer up large amounts of data. However, its - use will affect performance, and the -M (multiline) option - ceases to work. When input is from a compressed .gz or .bz2 + by the operating system. This option can be useful when the + input or output is attached to a pipe and you do not want + pcre2grep to buffer up large amounts of data. However, its + use will affect performance, and the -M (multiline) option + ceases to work. When input is from a compressed .gz or .bz2 file, --line-buffered is ignored. --line-offsets - Instead of showing lines or parts of lines that match, show + Instead of showing lines or parts of lines that match, show each match as a line number, the offset from the start of the - line, and a length. The line number is terminated by a colon - (as usual; see the -n option), and the offset and length are - separated by a comma. In this mode, --colour has no effect, - and no context is shown. That is, the -A, -B, and -C options - are ignored. If there is more than one match in a line, each - of them is shown separately. This option is mutually exclu- + line, and a length. The line number is terminated by a colon + (as usual; see the -n option), and the offset and length are + separated by a comma. In this mode, --colour has no effect, + and no context is shown. That is, the -A, -B, and -C options + are ignored. If there is more than one match in a line, each + of them is shown separately. This option is mutually exclu- sive with --output, --file-offsets, and --only-matching. --locale=locale-name - This option specifies a locale to be used for pattern match- - ing. It overrides the value in the LC_ALL or LC_CTYPE envi- - ronment variables. If no locale is specified, the PCRE2 li- + This option specifies a locale to be used for pattern match- + ing. It overrides the value in the LC_ALL or LC_CTYPE envi- + ronment variables. 
If no locale is specified, the PCRE2 li- brary's default (usually the "C" locale) is used. There is no short form for this option. -M, --multiline - Allow patterns to match more than one line. When this option - is set, the PCRE2 library is called in "multiline" mode, and - a match is allowed to continue past the end of the initial + Allow patterns to match more than one line. When this option + is set, the PCRE2 library is called in "multiline" mode, and + a match is allowed to continue past the end of the initial line and onto one or more subsequent lines. - Patterns used with -M may usefully contain literal newline - characters and internal occurrences of ^ and $ characters, - because in multiline mode these can match at internal new- - lines. Because pcre2grep is scanning multiple lines, the \Z - and \z assertions match only at the end of the last line in + Patterns used with -M may usefully contain literal newline + characters and internal occurrences of ^ and $ characters, + because in multiline mode these can match at internal new- + lines. Because pcre2grep is scanning multiple lines, the \Z + and \z assertions match only at the end of the last line in the file. The \A assertion matches at the start of the first - line of a match. This can be any line in the file; it is not + line of a match. This can be any line in the file; it is not anchored to the first line. - The output for a successful match may consist of more than - one line. The first line is the line in which the match - started, and the last line is the line in which the match - ended. If the matched string ends with a newline sequence, - the output ends at the end of that line. If -v is set, none - of the lines in a multi-line match are output. Once a match - has been handled, scanning restarts at the beginning of the + The output for a successful match may consist of more than + one line. 
The first line is the line in which the match + started, and the last line is the line in which the match + ended. If the matched string ends with a newline sequence, + the output ends at the end of that line. If -v is set, none + of the lines in a multi-line match are output. Once a match + has been handled, scanning restarts at the beginning of the line after the one in which the match ended. - The newline sequence that separates multiple lines must be - matched as part of the pattern. For example, to find the - phrase "regular expression" in a file where "regular" might - be at the end of a line and "expression" at the start of the + The newline sequence that separates multiple lines must be + matched as part of the pattern. For example, to find the + phrase "regular expression" in a file where "regular" might + be at the end of a line and "expression" at the start of the next line, you could use this command: pcre2grep -M 'regular\s+expression' The \s escape sequence matches any white space character, in- - cluding newlines, and is followed by + so as to match trail- - ing white space on the first line as well as possibly han- + cluding newlines, and is followed by + so as to match trail- + ing white space on the first line as well as possibly han- dling a two-character newline sequence. - There is a limit to the number of lines that can be matched, - imposed by the way that pcre2grep buffers the input file as - it scans it. With a sufficiently large processing buffer, + There is a limit to the number of lines that can be matched, + imposed by the way that pcre2grep buffers the input file as + it scans it. With a sufficiently large processing buffer, this should not be a problem. - The -M option does not work when input is read line by line + The -M option does not work when input is read line by line (see --line-buffered.) -m number, --max-count=number - Stop processing after finding number matching lines, or non- - matching lines if -v is also set. 
Any trailing context lines - are output after the final match. In multiline mode, each - multiline match counts as just one line for this purpose. If - this limit is reached when reading the standard input from a + Stop processing after finding number matching lines, or non- + matching lines if -v is also set. Any trailing context lines + are output after the final match. In multiline mode, each + multiline match counts as just one line for this purpose. If + this limit is reached when reading the standard input from a regular file, the file is left positioned just after the last - matching line. If -c is also set, the count that is output - is never greater than number. This option has no effect if + matching line. If -c is also set, the count that is output + is never greater than number. This option has no effect if used with -L, -l, or -q, or when just checking for a match in a binary file. --match-limit=number - Processing some regular expression patterns may take a very + Processing some regular expression patterns may take a very long time to search for all possible matching strings. Others - may require a very large amount of memory. There are three + may require a very large amount of memory. There are three options that set resource limits for matching. The --match-limit option provides a means of limiting comput- - ing resource usage when processing patterns that are not go- + ing resource usage when processing patterns that are not go- ing to match, but which have a very large number of possibil- ities in their search trees. The classic example is a pattern - that uses nested unlimited repeats. Internally, PCRE2 has a - counter that is incremented each time around its main pro- - cessing loop. If the value set by --match-limit is reached, + that uses nested unlimited repeats. Internally, PCRE2 has a + counter that is incremented each time around its main pro- + cessing loop. If the value set by --match-limit is reached, an error occurs. 
- The --heap-limit option specifies, as a number of kibibytes + The --heap-limit option specifies, as a number of kibibytes (units of 1024 bytes), the maximum amount of heap memory that may be used for matching. - The --depth-limit option limits the depth of nested back- + The --depth-limit option limits the depth of nested back- tracking points, which indirectly limits the amount of memory that is used. The amount of memory needed for each backtrack- - ing point depends on the number of capturing parentheses in + ing point depends on the number of capturing parentheses in the pattern, so the amount of memory that is used before this - limit acts varies from pattern to pattern. This limit is of + limit acts varies from pattern to pattern. This limit is of use only if it is set smaller than --match-limit. - There are no short forms for these options. The default lim- - its can be set when the PCRE2 library is compiled; if they - are not specified, the defaults are very large and so effec- + There are no short forms for these options. The default lim- + its can be set when the PCRE2 library is compiled; if they + are not specified, the defaults are very large and so effec- tively unlimited. --max-buffer-size=number - This limits the expansion of the processing buffer, whose - initial size can be set by --buffer-size. The maximum buffer - size is silently forced to be no smaller than the starting + This limits the expansion of the processing buffer, whose + initial size can be set by --buffer-size. The maximum buffer + size is silently forced to be no smaller than the starting buffer size. -N newline-type, --newline=newline-type @@ -640,72 +640,72 @@ OPTIONS pcre2grep -N CRLF 'some pattern' - The newline type may be specified in upper, lower, or mixed - case. If the newline type is NUL, lines are separated by bi- - nary zero characters. 
The other types are the single-charac- - ter sequences CR (carriage return) and LF (linefeed), the - two-character sequence CRLF, an "anycrlf" type, which recog- - nizes any of the preceding three types, and an "any" type, - for which any Unicode line ending sequence is assumed to end - a line. The Unicode sequences are the three just mentioned, - plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL - (next line, U+0085), LS (line separator, U+2028), and PS + The newline type may be specified in upper, lower, or mixed + case. If the newline type is NUL, lines are separated by bi- + nary zero characters. The other types are the single-charac- + ter sequences CR (carriage return) and LF (linefeed), the + two-character sequence CRLF, an "anycrlf" type, which recog- + nizes any of the preceding three types, and an "any" type, + for which any Unicode line ending sequence is assumed to end + a line. The Unicode sequences are the three just mentioned, + plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL + (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). - When the PCRE2 library is built, a default line-ending se- - quence is specified. This is normally the standard sequence - for the operating system. Unless otherwise specified by this + When the PCRE2 library is built, a default line-ending se- + quence is specified. This is normally the standard sequence + for the operating system. Unless otherwise specified by this option, pcre2grep uses the library's default. - This option makes it possible to use pcre2grep to scan files + This option makes it possible to use pcre2grep to scan files that have come from other environments without having to mod- - ify their line endings. If the data that is being scanned - does not agree with the convention set by this option, - pcre2grep may behave in strange ways. 
Note that this option - does not apply to files specified by the -f, --exclude-from, - or --include-from options, which are expected to use the op- + ify their line endings. If the data that is being scanned + does not agree with the convention set by this option, + pcre2grep may behave in strange ways. Note that this option + does not apply to files specified by the -f, --exclude-from, + or --include-from options, which are expected to use the op- erating system's standard newline sequence. -n, --line-number Precede each output line by its line number in the file, fol- - lowed by a colon for matching lines or a hyphen for context + lowed by a colon for matching lines or a hyphen for context lines. If the file name is also being output, it precedes the - line number. When the -M option causes a pattern to match - more than one line, only the first is preceded by its line + line number. When the -M option causes a pattern to match + more than one line, only the first is preceded by its line number. This option is forced if --line-offsets is used. --no-group-separator - Do not output a separator between groups of lines when -A, + Do not output a separator between groups of lines when -A, -B, or -C is in use. The default is to output a line contain- ing two hyphens. See also --group-separator. - --no-jit If the PCRE2 library is built with support for just-in-time + --no-jit If the PCRE2 library is built with support for just-in-time compiling (which speeds up matching), pcre2grep automatically makes use of this, unless it was explicitly disabled at build - time. This option can be used to disable the use of JIT at + time. This option can be used to disable the use of JIT at run time. It is provided for testing and working around prob- lems. It should never be needed in normal use. 
-O text, --output=text - When there is a match, instead of outputting the line that - matched, output just the text specified in this option, fol- - lowed by an operating-system standard newline. In this mode, - --colour has no effect, and no context is shown. That is, - the -A, -B, and -C options are ignored. The --newline option - has no effect on this option, which is mutually exclusive + When there is a match, instead of outputting the line that + matched, output just the text specified in this option, fol- + lowed by an operating-system standard newline. In this mode, + --colour has no effect, and no context is shown. That is, + the -A, -B, and -C options are ignored. The --newline option + has no effect on this option, which is mutually exclusive with --only-matching, --file-offsets, and --line-offsets. - However, like --only-matching, if there is more than one + However, like --only-matching, if there is more than one match in a line, each of them causes a line of output. Escape sequences starting with a dollar character may be used to insert the contents of the matched part of the line and/or captured substrings into the text. - $<digits> or ${<digits>} is replaced by the captured sub- - string of the given decimal number; zero substitutes the - whole match. If the number is greater than the number of cap- - turing substrings, or if the capture is unset, the replace- - ment is empty. + $<digits> or ${<digits>} is replaced by the captured sub- + string of the given decimal number; $& (or the legacy $0) + substitutes the whole match. If the number is greater than + the number of capturing substrings, or if the capture is un- + set, the replacement is empty. $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by newline; $r by carriage return; $t by tab; @@ -787,93 +787,103 @@ OPTIONS mode, the sequence (?aP) restricts [:word:] to ASCII letters, while allowing \w to match Unicode letters and digits.
+ --posix-pattern-file + When patterns are provided with the -f option, do not trim + trailing spaces or ignore empty lines, in a similar way to + other grep tools. To keep the behaviour consistent with older + versions, if the pattern read was terminated with CRLF (as + character literals) then both characters won't be included as + part of it, so if you really need to have a pattern ending in + '\r', use an escape sequence or provide it by a different + method. + -q, --quiet Work quietly, that is, display nothing except error messages. - The exit status indicates whether or not any matches were + The exit status indicates whether or not any matches were found. -r, --recursive - If any given path is a directory, recursively scan the files - it contains, taking note of any --include and --exclude set- - tings. By default, a directory is read as a normal file; in - some operating systems this gives an immediate end-of-file. - This option is a shorthand for setting the -d option to "re- + If any given path is a directory, recursively scan the files + it contains, taking note of any --include and --exclude set- + tings. By default, a directory is read as a normal file; in + some operating systems this gives an immediate end-of-file. + This option is a shorthand for setting the -d option to "re- curse". --recursion-limit=number - This is an obsolete synonym for --depth-limit. See --match- + This is an obsolete synonym for --depth-limit. See --match- limit above for details. -s, --no-messages - Suppress error messages about non-existent or unreadable - files. Such files are quietly skipped. However, the return + Suppress error messages about non-existent or unreadable + files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. -t, --total-count - This option is useful when scanning more than one file.
If - used on its own, -t suppresses all output except for a grand - total number of matching lines (or non-matching lines if -v + This option is useful when scanning more than one file. If + used on its own, -t suppresses all output except for a grand + total number of matching lines (or non-matching lines if -v is used) in all the files. If -t is used with -c, a grand to- - tal is output except when the previous output is just one - line. In other words, it is not output when just one file's - count is listed. If file names are being output, the grand - total is preceded by "TOTAL:". Otherwise, it appears as just - another number. The -t option is ignored when used with -L - (list files without matches), because the grand total would + tal is output except when the previous output is just one + line. In other words, it is not output when just one file's + count is listed. If file names are being output, the grand + total is preceded by "TOTAL:". Otherwise, it appears as just + another number. The -t option is ignored when used with -L + (list files without matches), because the grand total would always be zero. -u, --utf Operate in UTF/Unicode mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (in- - cluding those for any --exclude and --include options) and - all lines that are scanned must be valid strings of UTF-8 + cluding those for any --exclude and --include options) and + all lines that are scanned must be valid strings of UTF-8 characters. If an invalid UTF-8 string is encountered, an er- ror occurs. -U, --utf-allow-invalid - As --utf, but in addition subject lines may contain invalid - UTF-8 code unit sequences. These can never form part of any - pattern match. Patterns themselves, however, must still be + As --utf, but in addition subject lines may contain invalid + UTF-8 code unit sequences. These can never form part of any + pattern match. Patterns themselves, however, must still be valid UTF-8 strings. 
This facility allows valid UTF-8 strings to be sought within arbitrary byte sequences in executable or - other binary files. For more details about matching in non- + other binary files. For more details about matching in non- valid UTF-8 strings, see the pcre2unicode(3) documentation. -V, --version - Write the version numbers of pcre2grep and the PCRE2 library - to the standard output and then exit. Anything else on the + Write the version numbers of pcre2grep and the PCRE2 library + to the standard output and then exit. Anything else on the command line is ignored. -v, --invert-match - Invert the sense of the match, so that lines which do not - match any of the patterns are the ones that are found. When - this option is set, options such as --only-matching and - --output, which specify parts of a match that are to be out- + Invert the sense of the match, so that lines which do not + match any of the patterns are the ones that are found. When + this option is set, options such as --only-matching and + --output, which specify parts of a match that are to be out- put, are ignored. -w, --word-regex, --word-regexp Force the patterns only to match "words". That is, there must - be a word boundary at the start and end of each matched - string. This is equivalent to having "\b(?:" at the start of - each pattern, and ")\b" at the end. This option applies only - to the patterns that are matched against the contents of - files; it does not apply to patterns specified by any of the + be a word boundary at the start and end of each matched + string. This is equivalent to having "\b(?:" at the start of + each pattern, and ")\b" at the end. This option applies only + to the patterns that are matched against the contents of + files; it does not apply to patterns specified by any of the --include or --exclude options. 
-x, --line-regex, --line-regexp - Force the patterns to start matching only at the beginnings - of lines, and in addition, require them to match entire + Force the patterns to start matching only at the beginnings + of lines, and in addition, require them to match entire lines. In multiline mode the match may be more than one line. This is equivalent to having "^(?:" at the start of each pat- - tern and ")$" at the end. This option applies only to the - patterns that are matched against the contents of files; it - does not apply to patterns specified by any of the --include + tern and ")$" at the end. This option applies only to the + patterns that are matched against the contents of files; it + does not apply to patterns specified by any of the --include or --exclude options. -Z, --null - Terminate files names in the regular output with a zero byte - (the NUL character) instead of what would normally appear. - This is useful when file names contain unusual characters - such as colons, hyphens, or even newlines. The option does + Terminate files names in the regular output with a zero byte + (the NUL character) instead of what would normally appear. + This is useful when file names contain unusual characters + such as colons, hyphens, or even newlines. The option does not apply to file names in error messages. @@ -887,90 +897,90 @@ ENVIRONMENT VARIABLES NEWLINES - The -N (--newline) option allows pcre2grep to scan files with newline - conventions that differ from the default. This option affects only the - way scanned files are processed. It does not affect the interpretation - of files specified by the -f, --file-list, --exclude-from, or --in- + The -N (--newline) option allows pcre2grep to scan files with newline + conventions that differ from the default. This option affects only the + way scanned files are processed. It does not affect the interpretation + of files specified by the -f, --file-list, --exclude-from, or --in- clude-from options. 
- Any parts of the scanned input files that are written to the standard - output are copied with whatever newline sequences they have in the in- - put. However, if the final line of a file is output, and it does not - end with a newline sequence, a newline sequence is added. If the new- - line setting is CR, LF, CRLF or NUL, that line ending is output; for + Any parts of the scanned input files that are written to the standard + output are copied with whatever newline sequences they have in the in- + put. However, if the final line of a file is output, and it does not + end with a newline sequence, a newline sequence is added. If the new- + line setting is CR, LF, CRLF or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a single NL is used. - The newline setting does not affect the way in which pcre2grep writes - newlines in informational messages to the standard output and error - streams. Under Windows, the standard output is set to be binary, so - that "\r\n" at the ends of output lines that are copied from the input - is not converted to "\r\r\n" by the C I/O library. This means that any - messages written to the standard output must end with "\r\n". For all - other operating systems, and for all messages to the standard error + The newline setting does not affect the way in which pcre2grep writes + newlines in informational messages to the standard output and error + streams. Under Windows, the standard output is set to be binary, so + that "\r\n" at the ends of output lines that are copied from the input + is not converted to "\r\r\n" by the C I/O library. This means that any + messages written to the standard output must end with "\r\n". For all + other operating systems, and for all messages to the standard error stream, "\n" is used. OPTIONS COMPATIBILITY WITH GNU GREP Many of the short and long forms of pcre2grep's options are the same as - in the GNU grep program. 
Any long option of the form --xxx-regexp (GNU - terminology) is also available as --xxx-regex (PCRE2 terminology). - However, the --case-restrict, --depth-limit, -E, --file-list, --file- + in the GNU grep program. Any long option of the form --xxx-regexp (GNU + terminology) is also available as --xxx-regex (PCRE2 terminology). + However, the --case-restrict, --depth-limit, -E, --file-list, --file- offsets, --heap-limit, --include-dir, --line-offsets, --locale, - --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- - tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are + --match-limit, -M, --multiline, -N, --newline, --no-ucp, --om-separa- + tor, --output, -P, -u, --utf, -U, and --utf-allow-invalid options are specific to pcre2grep, as is the use of the --only-matching option with a capturing parentheses number. - Although most of the common options work the same way, a few are dif- - ferent in pcre2grep. For example, the --include option's argument is a + Although most of the common options work the same way, a few are dif- + ferent in pcre2grep. For example, the --include option's argument is a glob for GNU grep, but in pcre2grep it is a regular expression to which - the -i option applies. If both the -c and -l options are given, GNU - grep lists only file names, without counts, but pcre2grep gives the + the -i option applies. If both the -c and -l options are given, GNU + grep lists only file names, without counts, but pcre2grep gives the counts as well. OPTIONS WITH DATA There are four different ways in which an option with data can be spec- - ified. If a short form option is used, the data may follow immedi- + ified. If a short form option is used, the data may follow immedi- ately, or (with one exception) in the next command line item. For exam- ple: -f/some/file -f /some/file - The exception is the -o option, which may appear with or without data. 
- Because of this, if data is present, it must follow immediately in the + The exception is the -o option, which may appear with or without data. + Because of this, if data is present, it must follow immediately in the same item, for example -o3. - If a long form option is used, the data may appear in the same command - line item, separated by an equals character, or (with two exceptions) + If a long form option is used, the data may appear in the same command + line item, separated by an equals character, or (with two exceptions) it may appear in the next command line item. For example: --file=/some/file --file /some/file - Note, however, that if you want to supply a file name beginning with ~ - as data in a shell command, and have the shell expand ~ to a home di- - rectory, you must separate the file name from the option, because the + Note, however, that if you want to supply a file name beginning with ~ + as data in a shell command, and have the shell expand ~ to a home di- + rectory, you must separate the file name from the option, because the shell does not treat ~ specially unless it is at the start of an item. - The exceptions to the above are the --colour (or --color) and --only- - matching options, for which the data is optional. If one of these op- - tions does have data, it must be given in the first form, using an + The exceptions to the above are the --colour (or --color) and --only- + matching options, for which the data is optional. If one of these op- + tions does have data, it must be given in the first form, using an equals character. Otherwise pcre2grep will assume that it has no data. USING PCRE2'S CALLOUT FACILITY - pcre2grep has, by default, support for calling external programs or - scripts or echoing specific strings during matching by making use of - PCRE2's callout facility. However, this support can be completely or - partially disabled when pcre2grep is built. 
You can find out whether - your binary has support for callouts by running it with the --help op- - tion. If callout support is completely disabled, all callouts in pat- - terns are ignored by pcre2grep. If the facility is partially disabled, + pcre2grep has, by default, support for calling external programs or + scripts or echoing specific strings during matching by making use of + PCRE2's callout facility. However, this support can be completely or + partially disabled when pcre2grep is built. You can find out whether + your binary has support for callouts by running it with the --help op- + tion. If callout support is completely disabled, callouts in patterns + are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored. @@ -988,13 +998,13 @@ USING PCRE2'S CALLOUT FACILITY processed as a zero-terminated string, which means it should not con- tain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the - --output (-O) option (see above). However, $0 cannot be used to insert - a matched substring because the match is still in progress. Instead, - the single character '0' is inserted. Any syntax errors in the string - (for example, a dollar not followed by another character) causes the - callout to be ignored. No terminator is added to the output string, so - if you want a newline, you must include it explicitly using the escape - $n. For example: + --output (-O) option (see above). However, $0 or $& cannot be used to + insert a matched substring because the match is still in progress. In- + stead, the single character '0' is inserted. Any syntax errors in the + string (for example, a dollar not followed by another character) causes + the callout to be ignored. No terminator is added to the output string, + so if you want a newline, you must include it explicitly using the es- + cape $n. 
For example: pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' @@ -1018,10 +1028,11 @@ USING PCRE2'S CALLOUT FACILITY Any substring (including the executable name) may contain escape se- quences started by a dollar character. These are the same as for the - --output (-O) option documented above, except that $0 cannot insert the - matched string because the match is still in progress. Instead, the - character '0' is inserted. If you need a literal dollar or pipe charac- - ter in any substring, use $$ or $| respectively. Here is an example: + --output (-O) option documented above, except that $0 or $& cannot in- + sert the matched string because the match is still in progress. In- + stead, the character '0' is inserted. If you need a literal dollar or + pipe character in any substring, use $$ or $| respectively. Here is an + example: echo -e "abcde\n12345" | pcre2grep \ '(?x)(.)(..(.)) @@ -1034,43 +1045,43 @@ USING PCRE2'S CALLOUT FACILITY Arg1: [1] [234] [4] Arg2: |1| () 12345 - The parameters for the system call that is used to run the program or + The parameters for the system call that is used to run the program or script are zero-terminated strings. This means that binary zero charac- - ters in the callout argument will cause premature termination of their - substrings, and therefore should not be present. Any syntax errors in - the string (for example, a dollar not followed by another character) + ters in the callout argument will cause premature termination of their + substrings, and therefore should not be present. Any syntax errors in + the string (for example, a dollar not followed by another character) causes the callout to be ignored. If running the program fails for any - reason (including the non-existence of the executable), a local match- + reason (including the non-existence of the executable), a local match- ing failure occurs and the matcher backtracks in the normal way. 
MATCHING ERRORS - It is possible to supply a regular expression that takes a very long - time to fail to match certain lines. Such patterns normally involve - nested indefinite repeats, for example: (a+)*\d when matched against a - line of a's with no final digit. The PCRE2 matching function has a re- - source limit that causes it to abort in these circumstances. If this - happens, pcre2grep outputs an error message and the line that caused - the problem to the standard error stream. If there are more than 20 + It is possible to supply a regular expression that takes a very long + time to fail to match certain lines. Such patterns normally involve + nested indefinite repeats, for example: (a+)*\d when matched against a + line of a's with no final digit. The PCRE2 matching function has a re- + source limit that causes it to abort in these circumstances. If this + happens, pcre2grep outputs an error message and the line that caused + the problem to the standard error stream. If there are more than 20 such errors, pcre2grep gives up. - The --match-limit option of pcre2grep can be used to set the overall - resource limit. There are also other limits that affect the amount of - memory used during matching; see the discussion of --heap-limit and + The --match-limit option of pcre2grep can be used to set the overall + resource limit. There are also other limits that affect the amount of + memory used during matching; see the discussion of --heap-limit and --depth-limit above. DIAGNOSTICS Exit status is 0 if any matches were found, 1 if no matches were found, - and 2 for syntax errors, overlong lines, non-existent or inaccessible - files (even if matches were found in other files) or too many matching + and 2 for syntax errors, overlong lines, non-existent or inaccessible + files (even if matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessi- ble files does not affect the return code. 
- When run under VMS, the return code is placed in the symbol - PCRE2GREP_RC because VMS does not distinguish between exit(0) and + When run under VMS, the return code is placed in the symbol + PCRE2GREP_RC because VMS does not distinguish between exit(0) and exit(1). @@ -1088,8 +1099,8 @@ AUTHOR REVISION - Last updated: 22 December 2023 + Last updated: 04 February 2025 Copyright (c) 1997-2023 University of Cambridge. -PCRE2 10.43 22 December 2023 PCRE2GREP(1) +PCRE2 10.45 04 February 2025 PCRE2GREP(1) diff --git a/usr/share/doc/pcre2/pcre2test.txt b/usr/share/doc/pcre2/pcre2test.txt index ddb491d7e7c..b6574b2ea1b 100644 --- a/usr/share/doc/pcre2/pcre2test.txt +++ b/usr/share/doc/pcre2/pcre2test.txt @@ -1,4 +1,3 @@ - PCRE2TEST(1) General Commands Manual PCRE2TEST(1) @@ -72,26 +71,25 @@ INPUT ENCODING When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that - are passed to the library. For subject lines, backslash escapes can be - used. In addition, when the utf modifier (see "Setting compilation op- - tions" below) is set, the pattern and any following subject lines are - interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as ap- - propriate. - - For non-UTF testing of wide characters, the utf8_input modifier can be - used. This is mutually exclusive with utf, and is allowed only in - 16-bit or 32-bit mode. It causes the pattern and following subject - lines to be treated as UTF-8 according to the original definition (RFC + are passed to the library. For subject lines and some patterns, back- + slash escapes can be used. In addition, when the utf modifier (see + "Setting compilation options" below) is set, the pattern and any fol- + lowing subject lines are interpreted as UTF-8 strings and translated to + UTF-16 or UTF-32 as appropriate. + + For non-UTF testing of wide characters, the utf8_input modifier can be + used. 
This is mutually exclusive with utf, and is allowed only in + 16-bit or 32-bit mode. It causes the pattern and following subject + lines to be treated as UTF-8 according to the original definition (RFC 2279), which allows for character values up to 0x7fffffff. Each charac- - ter is placed in one 16-bit or 32-bit code unit (in the 16-bit case, + ter is placed in one 16-bit or 32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error to occur). - UTF-8 (in its original definition) is not capable of encoding values - greater than 0x7fffffff, but such values can be handled by the 32-bit + UTF-8 (in its original definition) is not capable of encoding values + greater than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte - in UTF-8) 0x80000000 is added to the character's value. This is the - only way of passing such code points in a pattern string. For subject + in UTF-8) 0x80000000 is added to the character's value. For subject strings, using an escape sequence is preferable. @@ -135,8 +133,8 @@ COMMAND LINE OPTIONS the exit code as indicated: ebcdic-nl the code for LF (= NL) in an EBCDIC environment: - 0x15 or 0x25 - 0 if used in an ASCII environment + either 0x15 or 0x25 + 0 if used in an ASCII/Unicode environment exit code is always 0 linksize the configured internal link size (2, 3, or 4) exit code is set to the link size @@ -158,56 +156,67 @@ COMMAND LINE OPTIONS pcre2-8 the 8-bit library was built unicode Unicode support is available - If an unknown option is given, an error message is output; + Note that the availability of JIT support in the library does + not guarantee that it can actually be used because in some + environments it is unable to allocate executable memory. The + option "jitusable" gives more detailed information. 
It re- + turns one of the following values: + + 0 JIT is available and usable + 1 JIT is available but cannot allocate executable memory + 2 JIT is not available + 3 Unexpected return from test call to pcre2_jit_compile() + + If an unknown option is given, an error message is output; the exit code is 0. - -d Behave as if each pattern has the debug modifier; the inter- + -d Behave as if each pattern has the debug modifier; the inter- nal form and information about the compiled pattern is output after compilation; -d is equivalent to -b -i. -dfa Behave as if each subject line has the dfa modifier; matching - is done using the pcre2_dfa_match() function instead of the + is done using the pcre2_dfa_match() function instead of the default pcre2_match(). -error number[,number,...] - Call pcre2_get_error_message() for each of the error numbers - in the comma-separated list, display the resulting messages - on the standard output, then exit with zero exit code. The - numbers may be positive or negative. This is a convenience + Call pcre2_get_error_message() for each of the error numbers + in the comma-separated list, display the resulting messages + on the standard output, then exit with zero exit code. The + numbers may be positive or negative. This is a convenience facility for PCRE2 maintainers. -help Output a brief summary these options and then exit. - -i Behave as if each pattern has the info modifier; information + -i Behave as if each pattern has the info modifier; information about the compiled pattern is given after compilation. - -jit Behave as if each pattern line has the jit modifier; after - successful compilation, each pattern is passed to the just- + -jit Behave as if each pattern line has the jit modifier; after + successful compilation, each pattern is passed to the just- in-time compiler, if available. 
- -jitfast Behave as if each pattern line has the jitfast modifier; af- - ter successful compilation, each pattern is passed to the + -jitfast Behave as if each pattern line has the jitfast modifier; af- + ter successful compilation, each pattern is passed to the just-in-time compiler, if available, and each subject line is passed directly to the JIT matcher via its "fast path". -jitverify - Behave as if each pattern line has the jitverify modifier; - after successful compilation, each pattern is passed to the - just-in-time compiler, if available, and the use of JIT for + Behave as if each pattern line has the jitverify modifier; + after successful compilation, each pattern is passed to the + just-in-time compiler, if available, and the use of JIT for matching is verified. -LM List modifiers: write a list of available pattern and subject - modifiers to the standard output, then exit with zero exit - code. All other options are ignored. If both -C and any -Lx + modifiers to the standard output, then exit with zero exit + code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. - -LP List properties: write a list of recognized Unicode proper- - ties to the standard output, then exit with zero exit code. + -LP List properties: write a list of recognized Unicode proper- + ties to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. -LS List scripts: write a list of recognized Unicode script names - to the standard output, then exit with zero exit code. All + to the standard output, then exit with zero exit code. All other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized. @@ -217,25 +226,25 @@ COMMAND LINE OPTIONS -q Do not output the version number of pcre2test at the start of execution. 
- -S size On Unix-like systems, set the size of the run-time stack to + -S size On Unix-like systems, set the size of the run-time stack to size mebibytes (units of 1024*1024 bytes). -subject modifier-list Behave as if each subject line contains the given modifiers. - -t Run each compile and match many times with a timer, and out- - put the resulting times per compile or match. When JIT is - used, separate times are given for the initial compile and - the JIT compile. You can control the number of iterations - that are used for timing by following -t with a number (as a - separate item on the command line). For example, "-t 1000" + -t Run each compile and match many times with a timer, and out- + put the resulting times per compile or match. When JIT is + used, separate times are given for the initial compile and + the JIT compile. You can control the number of iterations + that are used for timing by following -t with a number (as a + separate item on the command line). For example, "-t 1000" iterates 1000 times. The default is to iterate 500,000 times. -tm This is like -t except that it times only the matching phase, not the compile phase. - -T -TM These behave like -t and -tm, but in addition, at the end of - a run, the total times for all compiles and matches are out- + -T -TM These behave like -t and -tm, but in addition, at the end of + a run, the total times for all compiles and matches are out- put. -version Output the PCRE2 version number and then exit. @@ -243,153 +252,153 @@ COMMAND LINE OPTIONS DESCRIPTION - If pcre2test is given two filename arguments, it reads from the first + If pcre2test is given two filename arguments, it reads from the first and writes to the second. If the first name is "-", input is taken from - the standard input. If pcre2test is given only one argument, it reads + the standard input. If pcre2test is given only one argument, it reads from that file and writes to stdout. Otherwise, it reads from stdin and writes to stdout. 
- When pcre2test is built, a configuration option can specify that it - should be linked with the libreadline or libedit library. When this is - done, if the input is from a terminal, it is read using the readline() + When pcre2test is built, a configuration option can specify that it + should be linked with the libreadline or libedit library. When this is + done, if the input is from a terminal, it is read using the readline() function. This provides line-editing and history facilities. The output from the -help option states whether or not readline() will be used. - The program handles any number of tests, each of which consists of a - set of input lines. Each set starts with a regular expression pattern, + The program handles any number of tests, each of which consists of a + set of input lines. Each set starts with a regular expression pattern, followed by any number of subject lines to be matched against that pat- tern. In between sets of test data, command lines that begin with # may appear. This file format, with some restrictions, can also be processed - by the perltest.sh script that is distributed with PCRE2 as a means of + by the perltest.sh script that is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 and Perl is the same. For a speci- - fication of perltest.sh, see the comments near its beginning. See also + fication of perltest.sh, see the comments near its beginning. See also the #perltest command below. When the input is a terminal, pcre2test prompts for each line of input, - using "re>" to prompt for regular expression patterns, and "data>" to - prompt for subject lines. Command lines starting with # can be entered + using "re>" to prompt for regular expression patterns, and "data>" to + prompt for subject lines. Command lines starting with # can be entered only in response to the "re>" prompt. - Each subject line is matched separately and independently. 
If you want + Each subject line is matched separately and independently. If you want to do multi-line matches, you have to use the \n escape sequence (or \r - or \r\n, etc., depending on the newline setting) in a single line of - input to encode the newline sequences. There is no limit on the length - of subject lines; the input buffer is automatically extended if it is - too small. There are replication features that makes it possible to - generate long repetitive pattern or subject lines without having to + or \r\n, etc., depending on the newline setting) in a single line of + input to encode the newline sequences. There is no limit on the length + of subject lines; the input buffer is automatically extended if it is + too small. There are replication features that makes it possible to + generate long repetitive pattern or subject lines without having to supply them explicitly. - An empty line or the end of the file signals the end of the subject - lines for a test, at which point a new pattern or command line is ex- + An empty line or the end of the file signals the end of the subject + lines for a test, at which point a new pattern or command line is ex- pected if there is still input to be read. COMMAND LINES - In between sets of test data, a line that begins with # is interpreted + In between sets of test data, a line that begins with # is interpreted as a command line. If the first character is followed by white space or - an exclamation mark, the line is treated as a comment, and ignored. + an exclamation mark, the line is treated as a comment, and ignored. Otherwise, the following commands are recognized: #forbid_utf - Subsequent patterns automatically have the PCRE2_NEVER_UTF and - PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF - and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of - patterns. 
This command also forces an error if a subsequent pattern - contains any occurrences of \P, \p, or \X, which are still supported - when PCRE2_UTF is not set, but which require Unicode property support + Subsequent patterns automatically have the PCRE2_NEVER_UTF and + PCRE2_NEVER_UCP options set, which locks out the use of the PCRE2_UTF + and PCRE2_UCP options and the use of (*UTF) and (*UCP) at the start of + patterns. This command also forces an error if a subsequent pattern + contains any occurrences of \P, \p, or \X, which are still supported + when PCRE2_UTF is not set, but which require Unicode property support to be included in the library. - This is a trigger guard that is used in test files to ensure that UTF - or Unicode property tests are not accidentally added to files that are - used when Unicode support is not included in the library. Setting - PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained - by the use of #pattern; the difference is that #forbid_utf cannot be - unset, and the automatic options are not displayed in pattern informa- + This is a trigger guard that is used in test files to ensure that UTF + or Unicode property tests are not accidentally added to files that are + used when Unicode support is not included in the library. Setting + PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as a default can also be obtained + by the use of #pattern; the difference is that #forbid_utf cannot be + unset, and the automatic options are not displayed in pattern informa- tion, to avoid cluttering up test output. #load This command is used to load a set of precompiled patterns from a file, - as described in the section entitled "Saving and restoring compiled + as described in the section entitled "Saving and restoring compiled patterns" below. #loadtables - This command is used to load a set of binary character tables that can - be accessed by the tables=3 qualifier. 
Such tables can be created by + This command is used to load a set of binary character tables that can + be accessed by the tables=3 qualifier. Such tables can be created by the pcre2_dftables program with the -b option. #newline_default [] - When PCRE2 is built, a default newline convention can be specified. - This determines which characters and/or character pairs are recognized + When PCRE2 is built, a default newline convention can be specified. + This determines which characters and/or character pairs are recognized as indicating a newline in a pattern or subject string. The default can - be overridden when a pattern is compiled. The standard test files con- - tain tests of various newline conventions, but the majority of the - tests expect a single linefeed to be recognized as a newline by de- - fault. Without special action the tests would fail when PCRE2 is com- + be overridden when a pattern is compiled. The standard test files con- + tain tests of various newline conventions, but the majority of the + tests expect a single linefeed to be recognized as a newline by de- + fault. Without special action the tests would fail when PCRE2 is com- piled with either CR or CRLF as the default newline. The #newline_default command specifies a list of newline types that are - acceptable as the default. The types must be one of CR, LF, CRLF, ANY- + acceptable as the default. The types must be one of CR, LF, CRLF, ANY- CRLF, ANY, or NUL (in upper or lower case), for example: #newline_default LF Any anyCRLF If the default newline is in the list, this command has no effect. Oth- - erwise, except when testing the POSIX API, a newline modifier that + erwise, except when testing the POSIX API, a newline modifier that specifies the first newline convention in the list (LF in the above ex- - ample) is added to any pattern that does not already have a newline + ample) is added to any pattern that does not already have a newline modifier. 
If the newline list is empty, the feature is turned off. This command is present in a number of the standard test input files. - When the POSIX API is being tested there is no way to override the de- + When the POSIX API is being tested there is no way to override the de- fault newline convention, though it is possible to set the newline con- - vention from within the pattern. A warning is given if the posix or - posix_nosub modifier is used when #newline_default would set a default + vention from within the pattern. A warning is given if the posix or + posix_nosub modifier is used when #newline_default would set a default for the non-POSIX API. #pattern - This command sets a default modifier list that applies to all subse- + This command sets a default modifier list that applies to all subse- quent patterns. Modifiers on a pattern can change these settings. #perltest - This line is used in test files that can also be processed by perl- - test.sh to confirm that Perl gives the same results as PCRE2. Subse- - quent tests are checked for the use of pcre2test features that are in- + This line is used in test files that can also be processed by perl- + test.sh to confirm that Perl gives the same results as PCRE2. Subse- + quent tests are checked for the use of pcre2test features that are in- compatible with the perltest.sh script. - Patterns must use '/' as their delimiter, and only certain modifiers - are supported. Comment lines, #pattern commands, and #subject commands - that set or unset "mark" are recognized and acted on. The #perltest, - #forbid_utf, and #newline_default commands, which are needed in the + Patterns must use '/' as their delimiter, and only certain modifiers + are supported. Comment lines, #pattern commands, and #subject commands + that set or unset "mark" are recognized and acted on. The #perltest, + #forbid_utf, and #newline_default commands, which are needed in the relevant pcre2test files, are silently ignored. 
All other command lines - are ignored, but give a warning message. The #perltest command helps - detect tests that are accidentally put in the wrong file or use the - wrong delimiter. For more details of the perltest.sh script see the + are ignored, but give a warning message. The #perltest command helps + detect tests that are accidentally put in the wrong file or use the + wrong delimiter. For more details of the perltest.sh script see the comments it contains. #pop [] #popcopy [] - These commands are used to manipulate the stack of compiled patterns, - as described in the section entitled "Saving and restoring compiled + These commands are used to manipulate the stack of compiled patterns, + as described in the section entitled "Saving and restoring compiled patterns" below. #save - This command is used to save a set of compiled patterns to a file, as - described in the section entitled "Saving and restoring compiled pat- + This command is used to save a set of compiled patterns to a file, as + described in the section entitled "Saving and restoring compiled pat- terns" below. #subject - This command sets a default modifier list that applies to all subse- - quent subject lines. Modifiers on a subject line can change these set- + This command sets a default modifier list that applies to all subse- + quent subject lines. Modifiers on a subject line can change these set- tings. @@ -397,47 +406,47 @@ MODIFIER SYNTAX Modifier lists are used with both pattern and subject lines. Items in a list are separated by commas followed by optional white space. Trailing - whitespace in a modifier list is ignored. Some modifiers may be given - for both patterns and subject lines, whereas others are valid only for - one or the other. Each modifier has a long name, for example "an- - chored", and some of them must be followed by an equals sign and a - value, for example, "offset=12". Values cannot contain comma charac- - ters, but may contain spaces. 
Modifiers that do not take values may be + whitespace in a modifier list is ignored. Some modifiers may be given + for both patterns and subject lines, whereas others are valid only for + one or the other. Each modifier has a long name, for example "an- + chored", and some of them must be followed by an equals sign and a + value, for example, "offset=12". Values cannot contain comma charac- + ters, but may contain spaces. Modifiers that do not take values may be preceded by a minus sign to turn off a previous setting. A few of the more common modifiers can also be specified as single let- - ters, for example "i" for "caseless". In documentation, following the + ters, for example "i" for "caseless". In documentation, following the Perl convention, these are written with a slash ("the /i modifier") for - clarity. Abbreviated modifiers must all be concatenated in the first - item of a modifier list. If the first item is not recognized as a long - modifier name, it is interpreted as a sequence of these abbreviations. + clarity. Abbreviated modifiers must all be concatenated in the first + item of a modifier list. If the first item is not recognized as a long + modifier name, it is interpreted as a sequence of these abbreviations. For example: /abc/ig,newline=cr,jit=3 - This is a pattern line whose modifier list starts with two one-letter - modifiers (/i and /g). The lower-case abbreviated modifiers are the + This is a pattern line whose modifier list starts with two one-letter + modifiers (/i and /g). The lower-case abbreviated modifiers are the same as used in Perl. PATTERN SYNTAX - A pattern line must start with one of the following characters (common + A pattern line must start with one of the following characters (common symbols, excluding pattern meta-characters): / ! " ' ` - = _ : ; , % & @ ~ - This is interpreted as the pattern's delimiter. 
A regular expression - may be continued over several input lines, in which case the newline + This is interpreted as the pattern's delimiter. A regular expression + may be continued over several input lines, in which case the newline characters are included within it. It is possible to include the delim- - iter as a literal within the pattern by escaping it with a backslash, + iter as a literal within the pattern by escaping it with a backslash, for example /abc\/def/ - If you do this, the escape and the delimiter form part of the pattern, + If you do this, the escape and the delimiter form part of the pattern, but since the delimiters are all non-alphanumeric, the inclusion of the - backslash does not affect the pattern's interpretation. Note, however, + backslash does not affect the pattern's interpretation. Note, however, that this trick does not work within \Q...\E literal bracketing because the backslash will itself be interpreted as a literal. If the terminat- ing delimiter is immediately followed by a backslash, for example, @@ -445,13 +454,13 @@ PATTERN SYNTAX /abc/\ a backslash is added to the end of the pattern. This is done to provide - a way of testing the error condition that arises if a pattern finishes + a way of testing the error condition that arises if a pattern finishes with a backslash, because /abc\/ - is interpreted as the first line of a pattern that starts with "abc/", - causing pcre2test to read the next line as a continuation of the regu- + is interpreted as the first line of a pattern that starts with "abc/", + causing pcre2test to read the next line as a continuation of the regu- lar expression. A pattern can be followed by a modifier list (details below). 
@@ -460,44 +469,52 @@ PATTERN SYNTAX SUBJECT LINE SYNTAX Before each subject line is passed to pcre2_match(), pcre2_dfa_match(), - or pcre2_jit_match(), leading and trailing white space is removed, and - the line is scanned for backslash escapes, unless the subject_literal - modifier was set for the pattern. The following provide a means of en- + or pcre2_jit_match(), leading and trailing white space is removed, and + the line is scanned for backslash escapes, unless the subject_literal + modifier was set for the pattern. The following provide a means of en- coding non-printing characters in a visible way: - \a alarm (BEL, \x07) - \b backspace (\x08) - \e escape (\x27) - \f form feed (\x0c) - \n newline (\x0a) - \r carriage return (\x0d) - \t tab (\x09) - \v vertical tab (\x0b) - \nnn octal character (up to 3 octal digits); always - a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode - \o{dd...} octal character (any number of octal digits} - \xhh hexadecimal byte (up to 2 hex digits) - \x{hh...} hexadecimal character (any number of hex digits) - - The use of \x{hh...} is not dependent on the use of the utf modifier on - the pattern. It is recognized always. There may be any number of hexa- - decimal digits inside the braces; invalid values provoke error mes- - sages. - - Note that \xhh specifies one byte rather than one character in UTF-8 - mode; this makes it possible to construct invalid UTF-8 sequences for - testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8 - character in UTF-8 mode, generating more than one byte if the value is - greater than 127. 
When testing the 8-bit library not in UTF-8 mode, - \x{hh} generates one byte for values less than 256, and causes an error + \a alarm (BEL, \x07) + \b backspace (\x08) + \e escape (\x1b) + \f form feed (\x0c) + \n newline (\x0a) + \N{U+hh...} unicode character (any number of hex digits) + \r carriage return (\x0d) + \t tab (\x09) + \v vertical tab (\x0b) + \ddd octal number (up to 3 octal digits); represent a single + code point unless larger than 255 with the 8-bit li- + brary + \o{dd...} octal number (any number of octal digits) representing a + character in UTF mode or a code point + \xhh hexadecimal byte (up to 2 hex digits) + \x{hh...} hexadecimal number (up to 8 hex digits) representing a + character in UTF mode or a code point + + Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf + modifier on the pattern. It is always recognized. There may be any num- + ber of hexadecimal digits inside the braces; invalid values provoke er- + ror messages but when using \N{U+hh...} with some invalid unicode char- + acters they will be accepted with a warning instead. + + Note that even in UTF-8 mode, \xhh (and depending on how large, \ddd) + describe one byte rather than one character; this makes it possible to + construct invalid UTF-8 sequences for testing purposes. On the other + hand, \x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only + generating more than one byte if the value is greater than 127. To + avoid the ambiguity it is preferred to use \N{U+hh...} when describing + characters. When testing the 8-bit library not in UTF-8 mode, \x{hh} + generates one byte for values that could fit in it, and causes an error for greater values. - In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it - possible to construct invalid UTF-16 sequences for testing purposes. + When testing the 16-bit library, not in UTF-16 mode, all 4-digit + \x{hhhh} values are accepted.
This makes it possible to construct in- + valid UTF-16 sequences for testing purposes. - In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This - makes it possible to construct invalid UTF-32 sequences for testing - purposes. + When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit + \x{...} values are accepted. This makes it possible to construct in- + valid UTF-32 sequences for testing purposes. There is a special backslash sequence that specifies replication of one or more characters: @@ -561,6 +578,7 @@ PATTERN MODIFIERS allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options @@ -589,13 +607,17 @@ PATTERN MODIFIERS match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C + never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK + python_octal set PCRE2_EXTRA_PYTHON_OCTAL + turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT @@ -608,20 +630,36 @@ PATTERN MODIFIERS causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. + The following modifiers enable or disable performance optimizations by + calling pcre2_set_optimize() before invoking the regex compiler. 
+ + optimization_full enable all optional optimizations + optimization_none disable all optional optimizations + auto_possess auto-possessify variable quantifiers + auto_possess_off don't auto-possessify variable quantifiers + dotstar_anchor anchor patterns starting with .* + dotstar_anchor_off don't anchor patterns starting with .* + start_optimize enable pre-scan of subject string + start_optimize_off disable pre-scan of subject string + + See the pcre2_set_optimize documentation for details on these optimiza- + tions. + Setting compilation controls - The following modifiers affect the compilation process or request in- - formation about the pattern. There are single-letter abbreviations for + The following modifiers affect the compilation process or request in- + formation about the pattern. There are single-letter abbreviations for some that are heavily used in the test files. - bsr=[anycrlf|unicode] specify \R handling /B bincode show binary code without lengths + bsr=[anycrlf|unicode] specify \R handling callout_info show callout information convert= request foreign pattern conversion convert_glob_escape=c set glob escape character convert_glob_separator=c set glob separator character convert_length set convert buffer length debug same as info,fullbincode + expand expand repetition syntax in pattern framesize show matching frame size fullbincode show binary code with lengths /I info show info about compiled pattern @@ -643,6 +681,7 @@ PATTERN MODIFIERS posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack + pushtablescopy push a copy with tables onto the stack stackguard= test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables @@ -653,35 +692,35 @@ PATTERN MODIFIERS Newline and \R handling - The bsr modifier specifies what \R in a pattern should match. If it is - set to "anycrlf", \R matches CR, LF, or CRLF only. 
If it is set to - "unicode", \R matches any Unicode newline sequence. The default can be + The bsr modifier specifies what \R in a pattern should match. If it is + set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to + "unicode", \R matches any Unicode newline sequence. The default can be specified when PCRE2 is built; if it is not, the default is set to Uni- code. - The newline modifier specifies which characters are to be interpreted + The newline modifier specifies which characters are to be interpreted as newlines, both in the pattern and in subject lines. The type must be one of CR, LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case). Information about a pattern - The debug modifier is a shorthand for info,fullbincode, requesting all + The debug modifier is a shorthand for info,fullbincode, requesting all available information. The bincode modifier causes a representation of the compiled code to be - output after compilation. This information does not contain length and + output after compilation. This information does not contain length and offset values, which ensures that the same output is generated for dif- - ferent internal link sizes and different code unit widths. By using - bincode, the same regression tests can be used in different environ- + ferent internal link sizes and different code unit widths. By using + bincode, the same regression tests can be used in different environ- ments. - The fullbincode modifier, by contrast, does include length and offset - values. This is used in a few special tests that run only for specific + The fullbincode modifier, by contrast, does include length and offset + values. This is used in a few special tests that run only for specific code unit widths and link sizes, and is also useful for one-off tests. - The info modifier requests information about the compiled pattern - (whether it is anchored, has a fixed first character, and so on). 
The - information is obtained from the pcre2_pattern_info() function. Here + The info modifier requests information about the compiled pattern + (whether it is anchored, has a fixed first character, and so on). The + information is obtained from the pcre2_pattern_info() function. Here are some typical examples: re> /(?i)(^a|^b)/m,info @@ -699,136 +738,136 @@ PATTERN MODIFIERS Last code unit = 'c' (caseless) Subject length lower bound = 3 - "Compile options" are those specified by modifiers; "overall options" - have added options that are taken or deduced from the pattern. If both - sets of options are the same, just a single "options" line is output; - if there are no options, the line is omitted. "First code unit" is - where any match must start; if there is more than one they are listed - as "starting code units". "Last code unit" is the last literal code - unit that must be present in any match. This is not necessarily the - last character. These lines are omitted if no starting or ending code - units are recorded. The subject length line is omitted when - no_start_optimize is set because the minimum length is not calculated + "Compile options" are those specified by modifiers; "overall options" + have added options that are taken or deduced from the pattern. If both + sets of options are the same, just a single "options" line is output; + if there are no options, the line is omitted. "First code unit" is + where any match must start; if there is more than one they are listed + as "starting code units". "Last code unit" is the last literal code + unit that must be present in any match. This is not necessarily the + last character. These lines are omitted if no starting or ending code + units are recorded. The subject length line is omitted when + no_start_optimize is set because the minimum length is not calculated when it can never be used. 
- The framesize modifier shows the size, in bytes, of each storage frame - used by pcre2_match() for handling backtracking. The size depends on - the number of capturing parentheses in the pattern. A vector of these - frames is used at matching time; its overall size is shown when the + The framesize modifier shows the size, in bytes, of each storage frame + used by pcre2_match() for handling backtracking. The size depends on + the number of capturing parentheses in the pattern. A vector of these + frames is used at matching time; its overall size is shown when the heaframes_size subject modifier is set. - The callout_info modifier requests information about all the callouts + The callout_info modifier requests information about all the callouts in the pattern. A list of them is output at the end of any other infor- mation that is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern. Passing a NULL context - Normally, pcre2test passes a context block to pcre2_compile(). If the - null_context modifier is set, however, NULL is passed. This is for - testing that pcre2_compile() behaves correctly in this case (it uses + Normally, pcre2test passes a context block to pcre2_compile(). If the + null_context modifier is set, however, NULL is passed. This is for + testing that pcre2_compile() behaves correctly in this case (it uses default values). Passing a NULL pattern - The null_pattern modifier is for testing the behaviour of pcre2_com- - pile() when the pattern argument is NULL. The length value passed is + The null_pattern modifier is for testing the behaviour of pcre2_com- + pile() when the pattern argument is NULL. The length value passed is the default PCRE2_ZERO_TERMINATED unless use_length is set. Any length other than zero causes an error. 
Specifying pattern characters in hexadecimal - The hex modifier specifies that the characters of the pattern, except - for substrings enclosed in single or double quotes, are to be inter- - preted as pairs of hexadecimal digits. This feature is provided as a + The hex modifier specifies that the characters of the pattern, except + for substrings enclosed in single or double quotes, are to be inter- + preted as pairs of hexadecimal digits. This feature is provided as a way of creating patterns that contain binary zeros and other non-print- - ing characters. White space is permitted between pairs of digits. For + ing characters. White space is permitted between pairs of digits. For example, this pattern contains three characters: /ab 32 59/hex - Parts of such a pattern are taken literally if quoted. This pattern - contains nine characters, only two of which are specified in hexadeci- + Parts of such a pattern are taken literally if quoted. This pattern + contains nine characters, only two of which are specified in hexadeci- mal: /ab "literal" 32/hex - Either single or double quotes may be used. There is no way of includ- - ing the delimiter within a substring. The hex and expand modifiers are + Either single or double quotes may be used. There is no way of includ- + ing the delimiter within a substring. The hex and expand modifiers are mutually exclusive. Specifying the pattern's length By default, patterns are passed to the compiling functions as zero-ter- - minated strings but can be passed by length instead of being zero-ter- - minated. The use_length modifier causes this to happen. Using a length - happens automatically (whether or not use_length is set) when hex is - set, because patterns specified in hexadecimal may contain binary ze- + minated strings but can be passed by length instead of being zero-ter- + minated. The use_length modifier causes this to happen. 
Using a length + happens automatically (whether or not use_length is set) when hex is + set, because patterns specified in hexadecimal may contain binary ze- ros. If hex or use_length is used with the POSIX wrapper API (see "Using the - POSIX wrapper API" below), the REG_PEND extension is used to pass the + POSIX wrapper API" below), the REG_PEND extension is used to pass the pattern's length. Specifying a maximum for variable lookbehinds - Variable lookbehind assertions are supported only if, for each one, + Variable lookbehind assertions are supported only if, for each one, there is a maximum length (in characters) that it can match. There is a limit on this, whose default can be set at build time, with an ultimate - default of 255. The max_varlookbehind modifier uses the + default of 255. The max_varlookbehind modifier uses the pcre2_set_max_varlookbehind() function to change the limit. Lookbehinds - whose branches each match a fixed length are limited to 65535 charac- + whose branches each match a fixed length are limited to 65535 charac- ters per branch. Specifying wide characters in 16-bit and 32-bit modes In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 - and translated to UTF-16 or UTF-32 when the utf modifier is set. For + and translated to UTF-16 or UTF-32 when the utf modifier is set. For testing the 16-bit and 32-bit libraries in non-UTF mode, the utf8_input - modifier can be used. It is mutually exclusive with utf. Input lines + modifier can be used. It is mutually exclusive with utf. Input lines are interpreted as UTF-8 as a means of specifying wide characters. More details are given in "Input encoding" above. Generating long repetitive patterns - Some tests use long patterns that are very repetitive. Instead of cre- - ating a very long input line for such a pattern, you can use a special - repetition feature, similar to the one described for subject lines - above. 
If the expand modifier is present on a pattern, parts of the + Some tests use long patterns that are very repetitive. Instead of cre- + ating a very long input line for such a pattern, you can use a special + repetition feature, similar to the one described for subject lines + above. If the expand modifier is present on a pattern, parts of the pattern that have the form \[]{} are expanded before the pattern is passed to pcre2_compile(). For exam- ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction - cannot be nested. An initial "\[" sequence is recognized only if "]{" - followed by decimal digits and "}" is found later in the pattern. If + cannot be nested. An initial "\[" sequence is recognized only if "]{" + followed by decimal digits and "}" is found later in the pattern. If not, the characters remain in the pattern unaltered. The expand and hex modifiers are mutually exclusive. - If part of an expanded pattern looks like an expansion, but is really + If part of an expanded pattern looks like an expansion, but is really part of the actual pattern, unwanted expansion can be avoided by giving two values in the quantifier. For example, \[AB]{6000,6000} is not rec- ognized as an expansion item. - If the info modifier is set on an expanded pattern, the result of the + If the info modifier is set on an expanded pattern, the result of the expansion is included in the information that is output. JIT compilation - Just-in-time (JIT) compiling is a heavyweight optimization that can - greatly speed up pattern matching. See the pcre2jit documentation for - details. JIT compiling happens, optionally, after a pattern has been - successfully compiled into an internal form. The JIT compiler converts + Just-in-time (JIT) compiling is a heavyweight optimization that can + greatly speed up pattern matching. See the pcre2jit documentation for + details. JIT compiling happens, optionally, after a pattern has been + successfully compiled into an internal form. 
The JIT compiler converts this to optimized machine code. It needs to know whether the match-time options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, - because different code is generated for the different cases. See the - partial modifier in "Subject Modifiers" below for details of how these + because different code is generated for the different cases. See the + partial modifier in "Subject Modifiers" below for details of how these options are specified for each match attempt. JIT compilation is requested by the jit pattern modifier, which may op- - tionally be followed by an equals sign and a number in the range 0 to - 7. The three bits that make up the number specify which of the three + tionally be followed by an equals sign and a number in the range 0 to + 7. The three bits that make up the number specify which of the three JIT operating modes are to be compiled: 1 compile JIT code for non-partial matching @@ -845,31 +884,31 @@ PATTERN MODIFIERS 6 soft and hard partial matching only 7 all three modes - If no number is given, 7 is assumed. The phrase "partial matching" + If no number is given, 7 is assumed. The phrase "partial matching" means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the - PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- + PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- plete match; the options enable the possibility of a partial match, but - do not require it. Note also that if you request JIT compilation only - for partial matching (for example, jit=2) but do not set the partial - modifier on a subject line, that match will not use JIT code because + do not require it. Note also that if you request JIT compilation only + for partial matching (for example, jit=2) but do not set the partial + modifier on a subject line, that match will not use JIT code because none was compiled for non-partial matching. 
- If JIT compilation is successful, the compiled JIT code will automati- + If JIT compilation is successful, the compiled JIT code will automati- cally be used when an appropriate type of match is run, except when in- - compatible run-time options are specified. For more details, see the - pcre2jit documentation. See also the jitstack modifier below for a way + compatible run-time options are specified. For more details, see the + pcre2jit documentation. See also the jitstack modifier below for a way of setting the size of the JIT stack. - If the jitfast modifier is specified, matching is done using the JIT - "fast path" interface, pcre2_jit_match(), which skips some of the san- - ity checks that are done by pcre2_match(), and of course does not work - when JIT is not supported. If jitfast is specified without jit, jit=7 + If the jitfast modifier is specified, matching is done using the JIT + "fast path" interface, pcre2_jit_match(), which skips some of the san- + ity checks that are done by pcre2_match(), and of course does not work + when JIT is not supported. If jitfast is specified without jit, jit=7 is assumed. - If the jitverify modifier is specified, information about the compiled - pattern shows whether JIT compilation was or was not successful. If - jitverify is specified without jit, jit=7 is assumed. If JIT compila- - tion is successful when jitverify is set, the text "(JIT)" is added to + If the jitverify modifier is specified, information about the compiled + pattern shows whether JIT compilation was or was not successful. If + jitverify is specified without jit, jit=7 is assumed. If JIT compila- + tion is successful when jitverify is set, the text "(JIT)" is added to the first output line after a match or non match when JIT-compiled code was actually used in the match. 
@@ -880,19 +919,19 @@ PATTERN MODIFIERS /pattern/locale=fr_FR The given locale is set, pcre2_maketables() is called to build a set of - character tables for the locale, and this is then passed to pcre2_com- - pile() when compiling the regular expression. The same tables are used - when matching the following subject lines. The locale modifier applies + character tables for the locale, and this is then passed to pcre2_com- + pile() when compiling the regular expression. The same tables are used + when matching the following subject lines. The locale modifier applies only to the pattern on which it appears, but can be given in a #pattern - command if a default is needed. Setting a locale and alternate charac- + command if a default is needed. Setting a locale and alternate charac- ter tables are mutually exclusive. Showing pattern memory The memory modifier causes the size in bytes of the memory used to hold - the compiled pattern to be output. This does not include the size of - the pcre2_code block; it is just the actual compiled data. If the pat- - tern is subsequently passed to the JIT compiler, the size of the JIT + the compiled pattern to be output. This does not include the size of + the pcre2_code block; it is just the actual compiled data. If the pat- + tern is subsequently passed to the JIT compiler, the size of the JIT compiled code is also output. Here is an example: re> /a(b)c/jit,memory @@ -902,34 +941,34 @@ PATTERN MODIFIERS Limiting nested parentheses - The parens_nest_limit modifier sets a limit on the depth of nested - parentheses in a pattern. Breaching the limit causes a compilation er- - ror. The default for the library is set when PCRE2 is built, but - pcre2test sets its own default of 220, which is required for running + The parens_nest_limit modifier sets a limit on the depth of nested + parentheses in a pattern. Breaching the limit causes a compilation er- + ror. 
The default for the library is set when PCRE2 is built, but + pcre2test sets its own default of 220, which is required for running the standard test suite. Limiting the pattern length - The max_pattern_length modifier sets a limit, in code units, to the + The max_pattern_length modifier sets a limit, in code units, to the length of pattern that pcre2_compile() will accept. Breaching the limit - causes a compilation error. The default is the largest number a + causes a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). Limiting the size of a compiled pattern The max_pattern_compiled_length modifier sets a limit, in bytes, to the amount of memory used by a compiled pattern. Breaching the limit causes - a compilation error. The default is the largest number a PCRE2_SIZE + a compilation error. The default is the largest number a PCRE2_SIZE variable can hold (essentially unlimited). Using the POSIX wrapper API - The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via - the POSIX wrapper API rather than its native API. When posix_nosub is - used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX - wrapper supports only the 8-bit library. Note that it does not imply + The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via + the POSIX wrapper API rather than its native API. When posix_nosub is + used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX + wrapper supports only the 8-bit library. Note that it does not imply POSIX matching semantics; for more detail see the pcre2posix documenta- - tion. The following pattern modifiers set options for the regcomp() + tion. 
The following pattern modifiers set options for the regcomp() function: caseless REG_ICASE @@ -939,42 +978,42 @@ PATTERN MODIFIERS ucp REG_UCP ) the POSIX standard utf REG_UTF8 ) - The regerror_buffsize modifier specifies a size for the error buffer - that is passed to regerror() in the event of a compilation error. For + The regerror_buffsize modifier specifies a size for the error buffer + that is passed to regerror() in the event of a compilation error. For example: /abc/posix,regerror_buffsize=20 - This provides a means of testing the behaviour of regerror() when the - buffer is too small for the error message. If this modifier has not + This provides a means of testing the behaviour of regerror() when the + buffer is too small for the error message. If this modifier has not been set, a large buffer is used. - The aftertext and allaftertext subject modifiers work as described be- + The aftertext and allaftertext subject modifiers work as described be- low. All other modifiers are either ignored, with a warning message, or cause an error. - The pattern is passed to regcomp() as a zero-terminated string by de- + The pattern is passed to regcomp() as a zero-terminated string by de- fault, but if the use_length or hex modifiers are set, the REG_PEND ex- tension is used to pass it by length. Testing the stack guard feature - The stackguard modifier is used to test the use of pcre2_set_com- - pile_recursion_guard(), a function that is provided to enable stack - availability to be checked during compilation (see the pcre2api docu- - mentation for details). If the number specified by the modifier is + The stackguard modifier is used to test the use of pcre2_set_com- + pile_recursion_guard(), a function that is provided to enable stack + availability to be checked during compilation (see the pcre2api docu- + mentation for details). 
If the number specified by the modifier is greater than zero, pcre2_set_compile_recursion_guard() is called to set - up callback from pcre2_compile() to a local function. The argument it - receives is the current nesting parenthesis depth; if this is greater + up callback from pcre2_compile() to a local function. The argument it + receives is the current nesting parenthesis depth; if this is greater than the value given by the modifier, non-zero is returned, causing the compilation to be aborted. Using alternative character tables - The value specified for the tables modifier must be one of the digits + The value specified for the tables modifier must be one of the digits 0, 1, 2, or 3. It causes a specific set of built-in character tables to - be passed to pcre2_compile(). This is used in the PCRE2 tests to check - behaviour with different character tables. The digit specifies the ta- + be passed to pcre2_compile(). This is used in the PCRE2 tests to check + behaviour with different character tables. The digit specifies the ta- bles as follows: 0 do not pass any special character tables @@ -985,15 +1024,15 @@ PATTERN MODIFIERS In tables 2, some characters whose codes are greater than 128 are iden- tified as letters, digits, spaces, etc. Tables 3 can be used only after - a #loadtables command has loaded them from a binary file. Setting al- + a #loadtables command has loaded them from a binary file. Setting al- ternate character tables and a locale are mutually exclusive. Setting certain match controls The following modifiers are really subject modifiers, and are described - under "Subject Modifiers" below. However, they may be included in a - pattern's modifier list, in which case they are applied to every sub- - ject line that is processed with that pattern. These modifiers do not + under "Subject Modifiers" below. 
However, they may be included in a + pattern's modifier list, in which case they are applied to every sub- + ject line that is processed with that pattern. These modifiers do not affect the compilation process. aftertext show text after match @@ -1009,6 +1048,7 @@ PATTERN MODIFIERS replace= specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts + substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED @@ -1019,39 +1059,39 @@ PATTERN MODIFIERS substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY - These modifiers may not appear in a #pattern command. If you want them + These modifiers may not appear in a #pattern command. If you want them as defaults, set them in a #subject command. Specifying literal subject lines - If the subject_literal modifier is present on a pattern, all the sub- + If the subject_literal modifier is present on a pattern, all the sub- ject lines that it matches are taken as literal strings, with no inter- - pretation of backslashes. It is not possible to set subject modifiers - on such lines, but any that are set as defaults by a #subject command + pretation of backslashes. It is not possible to set subject modifiers + on such lines, but any that are set as defaults by a #subject command are recognized. Saving a compiled pattern - When a pattern with the push modifier is successfully compiled, it is - pushed onto a stack of compiled patterns, and pcre2test expects the - next line to contain a new pattern (or a command) instead of a subject + When a pattern with the push modifier is successfully compiled, it is + pushed onto a stack of compiled patterns, and pcre2test expects the + next line to contain a new pattern (or a command) instead of a subject line. 
This facility is used when saving compiled patterns to a file, as - described in the section entitled "Saving and restoring compiled pat- - terns" below. If pushcopy is used instead of push, a copy of the com- - piled pattern is stacked, leaving the original as current, ready to - match the following input lines. This provides a way of testing the - pcre2_code_copy() function. The push and pushcopy modifiers are in- - compatible with compilation modifiers such as global that act at match + described in the section entitled "Saving and restoring compiled pat- + terns" below. If pushcopy is used instead of push, a copy of the com- + piled pattern is stacked, leaving the original as current, ready to + match the following input lines. This provides a way of testing the + pcre2_code_copy() function. The push and pushcopy modifiers are in- + compatible with compilation modifiers such as global that act at match time. Any that are specified are ignored (for the stacked copy), with a - warning message, except for replace, which causes an error. Note that - jitverify, which is allowed, does not carry through to any subsequent + warning message, except for replace, which causes an error. Note that + jitverify, which is allowed, does not carry through to any subsequent matching that uses a stacked pattern. Testing foreign pattern conversion - The experimental foreign pattern conversion functions in PCRE2 can be - tested by setting the convert modifier. Its argument is a colon-sepa- - rated list of options, which set the equivalent option for the + The experimental foreign pattern conversion functions in PCRE2 can be + tested by setting the convert modifier. Its argument is a colon-sepa- + rated list of options, which set the equivalent option for the pcre2_pattern_convert() function: glob PCRE2_CONVERT_GLOB @@ -1063,19 +1103,19 @@ PATTERN MODIFIERS The "unset" value is useful for turning off a default that has been set by a #pattern command. 
When one of these options is set, the input pat- - tern is passed to pcre2_pattern_convert(). If the conversion is suc- - cessful, the result is reflected in the output and then passed to + tern is passed to pcre2_pattern_convert(). If the conversion is suc- + cessful, the result is reflected in the output and then passed to pcre2_compile(). The normal utf and no_utf_check options, if set, cause - the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be + the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to pcre2_pattern_convert(). By default, the conversion function is allowed to allocate a buffer for - its output. However, if the convert_length modifier is set to a value - greater than zero, pcre2test passes a buffer of the given length. This + its output. However, if the convert_length modifier is set to a value + greater than zero, pcre2test passes a buffer of the given length. This makes it possible to test the length check. - The convert_glob_escape and convert_glob_separator modifiers can be - used to specify the escape and separator characters for glob process- + The convert_glob_escape and convert_glob_separator modifiers can be + used to specify the escape and separator characters for glob process- ing, overriding the defaults, which are operating-system dependent. @@ -1086,10 +1126,11 @@ SUBJECT MODIFIERS Setting match options - The following modifiers set options for pcre2_match() or - pcre2_dfa_match(). See pcreapi for a description of their effects. + The following modifiers set options for pcre2_match() or + pcre2_dfa_match(). See pcre2api for a description of their effects. 
anchored set PCRE2_ANCHORED + copy_matched_subject set PCRE2_COPY_MATCHED_SUBJECT endanchored set PCRE2_ENDANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST @@ -1103,42 +1144,42 @@ SUBJECT MODIFIERS partial_hard (or ph) set PCRE2_PARTIAL_HARD partial_soft (or ps) set PCRE2_PARTIAL_SOFT - The partial matching modifiers are provided with abbreviations because + The partial matching modifiers are provided with abbreviations because they appear frequently in tests. - If the posix or posix_nosub modifier was present on the pattern, caus- + If the posix or posix_nosub modifier was present on the pattern, caus- ing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are notbol, notempty, and noteol, causing REG_NOT- - BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to + BOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). The other modifiers are ignored, with a warning message. - There is one additional modifier that can be used with the POSIX wrap- + There is one additional modifier that can be used with the POSIX wrap- per. It is ignored (with a warning) if used for non-POSIX matching. posix_startend=[:] - This causes the subject string to be passed to regexec() using the - REG_STARTEND option, which uses offsets to specify which part of the - string is searched. If only one number is given, the end offset is - passed as the end of the subject string. For more detail of REG_STAR- - TEND, see the pcre2posix documentation. If the subject string contains - binary zeros (coded as escapes such as \x{00} because pcre2test does + This causes the subject string to be passed to regexec() using the + REG_STARTEND option, which uses offsets to specify which part of the + string is searched. If only one number is given, the end offset is + passed as the end of the subject string. For more detail of REG_STAR- + TEND, see the pcre2posix documentation. 
If the subject string contains + binary zeros (coded as escapes such as \x{00} because pcre2test does not support actual binary zeros in its input), you must use posix_star- tend to specify its length. Setting match controls - The following modifiers affect the matching process or request addi- - tional information. Some of them may also be specified on a pattern - line (see above), in which case they apply to every subject line that - is matched against that pattern, but can be overridden by modifiers on + The following modifiers affect the matching process or request addi- + tional information. Some of them may also be specified on a pattern + line (see above), in which case they apply to every subject line that + is matched against that pattern, but can be overridden by modifiers on the subject. aftertext show text after match allaftertext show text after captures allcaptures show all captures - allvector show the entire ovector allusedtext show all consulted text (non-JIT only) + allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data= set a value to pass via callouts @@ -1172,7 +1213,8 @@ SUBJECT MODIFIERS startchar show startchar when relevant startoffset= same as offset= substitute_callout use substitution callouts - substitute_extedded use PCRE2_SUBSTITUTE_EXTENDED + substitute_case_callout use substitution case callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH @@ -1184,29 +1226,29 @@ SUBJECT MODIFIERS zero_terminate pass the subject as zero-terminated The effects of these modifiers are described in the following sections. - When matching via the POSIX wrapper API, the aftertext, allaftertext, - and ovector subject modifiers work as described below. 
All other modi- + When matching via the POSIX wrapper API, the aftertext, allaftertext, + and ovector subject modifiers work as described below. All other modi- fiers are either ignored, with a warning message, or cause an error. Showing more text - The aftertext modifier requests that as well as outputting the part of + The aftertext modifier requests that as well as outputting the part of the subject string that matched the entire pattern, pcre2test should in addition output the remainder of the subject string. This is useful for tests where the subject contains multiple copies of the same substring. - The allaftertext modifier requests the same action for captured sub- + The allaftertext modifier requests the same action for captured sub- strings as well as the main matched substring. In each case the remain- der is output on the following line with a plus character following the capture number. - The allusedtext modifier requests that all the text that was consulted - during a successful pattern match by the interpreter should be shown, - for both full and partial matches. This feature is not supported for - JIT matching, and if requested with JIT it is ignored (with a warning - message). Setting this modifier affects the output if there is a look- - behind at the start of a match, or, for a complete match, a lookahead + The allusedtext modifier requests that all the text that was consulted + during a successful pattern match by the interpreter should be shown, + for both full and partial matches. This feature is not supported for + JIT matching, and if requested with JIT it is ignored (with a warning + message). Setting this modifier affects the output if there is a look- + behind at the start of a match, or, for a complete match, a lookahead at the end, or if \K is used in the pattern. 
Characters that precede or - follow the start and end of the actual match are indicated in the out- + follow the start and end of the actual match are indicated in the out- put by '<' or '>' characters underneath them. Here is an example: re> /(?<=pqr)abc(?=xyz)/ @@ -1217,16 +1259,16 @@ SUBJECT MODIFIERS Partial match: pqrabcxy <<< - The first, complete match shows that the matched string is "abc", with - the preceding and following strings "pqr" and "xyz" having been con- - sulted during the match (when processing the assertions). The partial + The first, complete match shows that the matched string is "abc", with + the preceding and following strings "pqr" and "xyz" having been con- + sulted during the match (when processing the assertions). The partial match can indicate only the preceding string. - The startchar modifier requests that the starting character for the - match be indicated, if it is different to the start of the matched + The startchar modifier requests that the starting character for the + match be indicated, if it is different to the start of the matched string. The only time when this occurs is when \K has been processed as part of the match. In this situation, the output for the matched string - is displayed from the starting character instead of from the match + is displayed from the starting character instead of from the match point, with circumflex characters under the earlier characters. For ex- ample: @@ -1235,7 +1277,7 @@ SUBJECT MODIFIERS 0: abcxyz ^^^ - Unlike allusedtext, the startchar modifier can be used with JIT. How- + Unlike allusedtext, the startchar modifier can be used with JIT. How- ever, these two modifiers are mutually exclusive. Showing the value of all capture groups @@ -1243,104 +1285,104 @@ SUBJECT MODIFIERS The allcaptures modifier requests that the values of all potential cap- tured parentheses be output after a match. 
By default, only those up to the highest one actually used in the match are output (corresponding to - the return code from pcre2_match()). Groups that did not take part in - the match are output as "". This modifier is not relevant for - DFA matching (which does no capturing) and does not apply when replace + the return code from pcre2_match()). Groups that did not take part in + the match are output as "". This modifier is not relevant for + DFA matching (which does no capturing) and does not apply when replace is specified; it is ignored, with a warning message, if present. Showing the entire ovector, for all outcomes The allvector modifier requests that the entire ovector be shown, what- ever the outcome of the match. Compare allcaptures, which shows only up - to the maximum number of capture groups for the pattern, and then only - for a successful complete non-DFA match. This modifier, which acts af- - ter any match result, and also for DFA matching, provides a means of - checking that there are no unexpected modifications to ovector fields. - Before each match attempt, the ovector is filled with a special value, - and if this is found in both elements of a capturing pair, "" is output. After a successful match, this applies to all - groups after the maximum capture group for the pattern. In other cases - it applies to the entire ovector. After a partial match, the first two - elements are the only ones that should be set. After a DFA match, the - amount of ovector that is used depends on the number of matches that + to the maximum number of capture groups for the pattern, and then only + for a successful complete non-DFA match. This modifier, which acts af- + ter any match result, and also for DFA matching, provides a means of + checking that there are no unexpected modifications to ovector fields. + Before each match attempt, the ovector is filled with a special value, + and if this is found in both elements of a capturing pair, "" is output. 
After a successful match, this applies to all + groups after the maximum capture group for the pattern. In other cases + it applies to the entire ovector. After a partial match, the first two + elements are the only ones that should be set. After a DFA match, the + amount of ovector that is used depends on the number of matches that were found. Testing pattern callouts - A callout function is supplied when pcre2test calls the library match- - ing functions, unless callout_none is specified. Its behaviour can be - controlled by various modifiers listed above whose names begin with - callout_. Details are given in the section entitled "Callouts" below. - Testing callouts from pcre2_substitute() is described separately in + A callout function is supplied when pcre2test calls the library match- + ing functions, unless callout_none is specified. Its behaviour can be + controlled by various modifiers listed above whose names begin with + callout_. Details are given in the section entitled "Callouts" below. + Testing callouts from pcre2_substitute() is described separately in "Testing the substitution function" below. Finding all matches in a string Searching for all possible matches within a subject can be requested by - the global or altglobal modifier. After finding a match, the matching - function is called again to search the remainder of the subject. The - difference between global and altglobal is that the former uses the - start_offset argument to pcre2_match() or pcre2_dfa_match() to start - searching at a new point within the entire string (which is what Perl + the global or altglobal modifier. After finding a match, the matching + function is called again to search the remainder of the subject. 
The + difference between global and altglobal is that the former uses the + start_offset argument to pcre2_match() or pcre2_dfa_match() to start + searching at a new point within the entire string (which is what Perl does), whereas the latter passes over a shortened subject. This makes a difference to the matching process if the pattern begins with a lookbe- hind assertion (including \b or \B). - If an empty string is matched, the next match is done with the + If an empty string is matched, the next match is done with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for another, non-empty, match at the same point in the subject. If this - match fails, the start offset is advanced, and the normal match is re- - tried. This imitates the way Perl handles such cases when using the /g - modifier or the split() function. Normally, the start offset is ad- - vanced by one character, but if the newline convention recognizes CRLF - as a newline, and the current character is CR followed by LF, an ad- + match fails, the start offset is advanced, and the normal match is re- + tried. This imitates the way Perl handles such cases when using the /g + modifier or the split() function. Normally, the start offset is ad- + vanced by one character, but if the newline convention recognizes CRLF + as a newline, and the current character is CR followed by LF, an ad- vance of two characters occurs. Testing substring extraction functions - The copy and get modifiers can be used to test the pcre2_sub- + The copy and get modifiers can be used to test the pcre2_sub- string_copy_xxx() and pcre2_substring_get_xxx() functions. 
They can be given more than once, and each can specify a capture group name or num- ber, for example: abcd\=copy=1,copy=3,get=G1 - If the #subject command is used to set default copy and/or get lists, - these can be unset by specifying a negative number to cancel all num- + If the #subject command is used to set default copy and/or get lists, + these can be unset by specifying a negative number to cancel all num- bered groups and an empty name to cancel all named groups. - The getall modifier tests pcre2_substring_list_get(), which extracts + The getall modifier tests pcre2_substring_list_get(), which extracts all captured substrings. - If the subject line is successfully matched, the substrings extracted - by the convenience functions are output with C, G, or L after the - string number instead of a colon. This is in addition to the normal - full list. The string length (that is, the return from the extraction + If the subject line is successfully matched, the substrings extracted + by the convenience functions are output with C, G, or L after the + string number instead of a colon. This is in addition to the normal + full list. The string length (that is, the return from the extraction function) is given in parentheses after each substring, followed by the name when the extraction was by name. Testing the substitution function - If the replace modifier is set, the pcre2_substitute() function is - called instead of one of the matching functions (or after one call of - pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). Note that re- - placement strings cannot contain commas, because a comma signifies the - end of a modifier. This is not thought to be an issue in a test pro- + If the replace modifier is set, the pcre2_substitute() function is + called instead of one of the matching functions (or after one call of + pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). 
Note that re- + placement strings cannot contain commas, because a comma signifies the + end of a modifier. This is not thought to be an issue in a test pro- gram. - Specifying a completely empty replacement string disables this modi- - fier. However, it is possible to specify an empty replacement by pro- - viding a buffer length, as described below, for an otherwise empty re- + Specifying a completely empty replacement string disables this modi- + fier. However, it is possible to specify an empty replacement by pro- + viding a buffer length, as described below, for an otherwise empty re- placement. - Unlike subject strings, pcre2test does not process replacement strings - for escape sequences. In UTF mode, a replacement string is checked to - see if it is a valid UTF-8 string. If so, it is correctly converted to - a UTF string of the appropriate code unit width. If it is not a valid - UTF-8 string, the individual code units are copied directly. This pro- + Unlike subject strings, pcre2test does not process replacement strings + for escape sequences. In UTF mode, a replacement string is checked to + see if it is a valid UTF-8 string. If so, it is correctly converted to + a UTF string of the appropriate code unit width. If it is not a valid + UTF-8 string, the individual code units are copied directly. This pro- vides a means of passing an invalid UTF-8 string for testing purposes. - The following modifiers set options (in additional to the normal match + The following modifiers set options (in additional to the normal match options) for pcre2_substitute(): global PCRE2_SUBSTITUTE_GLOBAL @@ -1354,8 +1396,8 @@ SUBJECT MODIFIERS See the pcre2api documentation for details of these options. - After a successful substitution, the modified string is output, pre- - ceded by the number of replacements. This may be zero if there were no + After a successful substitution, the modified string is output, pre- + ceded by the number of replacements. 
This may be zero if there were no matches. Here is a simple example of a substitution test: /abc/replace=xxx @@ -1364,12 +1406,12 @@ SUBJECT MODIFIERS =abc=abc=\=global 2: =xxx=xxx= - Subject and replacement strings should be kept relatively short (fewer - than 256 characters) for substitution tests, as fixed-size buffers are - used. To make it easy to test for buffer overflow, if the replacement - string starts with a number in square brackets, that number is passed - to pcre2_substitute() as the size of the output buffer, with the re- - placement string starting at the next character. Here is an example + Subject and replacement strings should be kept relatively short (fewer + than 256 characters) for substitution tests, as fixed-size buffers are + used. To make it easy to test for buffer overflow, if the replacement + string starts with a number in square brackets, that number is passed + to pcre2_substitute() as the size of the output buffer, with the re- + placement string starting at the next character. Here is an example that tests the edge case: /abc/ @@ -1379,12 +1421,12 @@ SUBJECT MODIFIERS Failed: error -47: no more memory The default action of pcre2_substitute() is to return PCRE2_ER- - ROR_NOMEMORY when the output buffer is too small. However, if the - PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi- + ROR_NOMEMORY when the output buffer is too small. However, if the + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the substi- tute_overflow_length modifier), pcre2_substitute() continues to go - through the motions of matching and substituting (but not doing any - callouts), in order to compute the size of buffer that is required. - When this happens, pcre2test shows the required buffer length (which + through the motions of matching and substituting (but not doing any + callouts), in order to compute the size of buffer that is required. 
+ When this happens, pcre2test shows the required buffer length (which includes space for the trailing zero) as part of the error message. For example: @@ -1393,15 +1435,15 @@ SUBJECT MODIFIERS Failed: error -47: no more memory: 10 code units are needed A replacement string is ignored with POSIX and DFA matching. Specifying - partial matching provokes an error return ("bad option value") from + partial matching provokes an error return ("bad option value") from pcre2_substitute(). Testing substitute callouts If the substitute_callout modifier is set, a substitution callout func- - tion is set up. The null_context modifier must not be set, because the - address of the callout function is passed in a match context. When the - callout function is called (after each substitution), details of the + tion is set up. The null_context modifier must not be set, because the + address of the callout function is passed in a match context. When the + callout function is called (after each substitution), details of the input and output strings are output. For example: /abc/g,replace=<$0>,substitute_callout @@ -1410,19 +1452,19 @@ SUBJECT MODIFIERS 2(1) Old 6 9 "abc" New 8 13 "" 2: defpqr - The first number on each callout line is the count of matches. The + The first number on each callout line is the count of matches. The parenthesized number is the number of pairs that are set in the ovector - (that is, one more than the number of capturing groups that were set). + (that is, one more than the number of capturing groups that were set). Then are listed the offsets of the old substring, its contents, and the same for the replacement. - By default, the substitution callout function returns zero, which ac- - cepts the replacement and causes matching to continue if /g was used. - Two further modifiers can be used to test other return values. 
If sub- - stitute_skip is set to a value greater than zero the callout function - returns +1 for the match of that number, and similarly substitute_stop - returns -1. These cause the replacement to be rejected, and -1 causes - no further matching to take place. If either of them are set, substi- + By default, the substitution callout function returns zero, which ac- + cepts the replacement and causes matching to continue if /g was used. + Two further modifiers can be used to test other return values. If sub- + stitute_skip is set to a value greater than zero the callout function + returns +1 for the match of that number, and similarly substitute_stop + returns -1. These cause the replacement to be rejected, and -1 causes + no further matching to take place. If either of them are set, substi- tute_callout is assumed. For example: /abc/g,replace=<$0>,substitute_skip=1 @@ -1438,6 +1480,18 @@ SUBJECT MODIFIERS gle skip or stop is supported, which is sufficient for testing that the feature works. + Testing substitute case callouts + + If the substitute_case_callout modifier is set, a substitution case + callout function is set up. The callout function is called for each + substituted chunk which is to be case-transformed. + + The callout function passed is a fixed function with implementation for + certain behaviours: inputs which shrink when case-transformed; inputs + which grow; inputs with distinct upper/lower/titlecase forms. The char- + acters which are not special-cased for testing purposes are left unmod- + ified, as if they are caseless characters. + Setting the JIT stack size The jitstack modifier provides a way of setting the maximum stack size @@ -2007,8 +2061,8 @@ AUTHOR REVISION - Last updated: 24 April 2024 + Last updated: 26 December 2024 Copyright (c) 1997-2024 University of Cambridge. 
-PCRE 10.44 24 April 2024 PCRE2TEST(1) +PCRE2 10.45 26 December 2024 PCRE2TEST(1) diff --git a/mingw64/share/licenses/pcre2/LICENCE b/usr/share/licenses/pcre2/LICENSE.md similarity index 55% rename from mingw64/share/licenses/pcre2/LICENCE rename to usr/share/licenses/pcre2/LICENSE.md index 3c1ef032dec..f58ceb75a63 100644 --- a/mingw64/share/licenses/pcre2/LICENCE +++ b/usr/share/licenses/pcre2/LICENSE.md @@ -1,5 +1,8 @@ -PCRE2 LICENCE -------------- +PCRE2 License +============= + +| SPDX-License-Identifier: | BSD-3-Clause WITH PCRE2-exception | +|---------|-------| PCRE2 is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. @@ -16,40 +19,46 @@ optimize pattern matching. This is an optional feature that can be omitted when the library is built. -THE BASIC LIBRARY FUNCTIONS ---------------------------- +COPYRIGHT +--------- + +### The basic library functions -Written by: Philip Hazel -Email local part: Philip.Hazel -Email domain: gmail.com + Written by: Philip Hazel + Email local part: Philip.Hazel + Email domain: gmail.com -Retired from University of Cambridge Computing Service, -Cambridge, England. + Retired from University of Cambridge Computing Service, + Cambridge, England. -Copyright (c) 1997-2024 University of Cambridge -All rights reserved. + Copyright (c) 1997-2007 University of Cambridge + Copyright (c) 2007-2024 Philip Hazel + All rights reserved. +### PCRE2 Just-In-Time compilation support -PCRE2 JUST-IN-TIME COMPILATION SUPPORT --------------------------------------- + Written by: Zoltan Herczeg + Email local part: hzmester + Email domain: freemail.hu -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu + Copyright (c) 2010-2024 Zoltan Herczeg + All rights reserved. -Copyright(c) 2010-2024 Zoltan Herczeg -All rights reserved. 
+### Stack-less Just-In-Time compiler + Written by: Zoltan Herczeg + Email local part: hzmester + Email domain: freemail.hu -STACK-LESS JUST-IN-TIME COMPILER --------------------------------- + Copyright (c) 2009-2024 Zoltan Herczeg + All rights reserved. -Written by: Zoltan Herczeg -Email local part: hzmester -Email domain: freemail.hu +### All other contributions -Copyright(c) 2009-2024 Zoltan Herczeg -All rights reserved. +Many other contributors have participated in the authorship of PCRE2. As PCRE2 +has never required a Contributor Licensing Agreement, or other copyright +assignment agreement, all contributions have copyright retained by each +original contributor or their employer. THE "BSD" LICENCE @@ -58,16 +67,16 @@ THE "BSD" LICENCE Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright notices, - this list of conditions and the following disclaimer. +* Redistributions of source code must retain the above copyright notices, + this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notices, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. +* Redistributions in binary form must reproduce the above copyright + notices, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. - * Neither the name of the University of Cambridge nor the names of any - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. +* Neither the name of the University of Cambridge nor the names of any + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/usr/share/man/man1/pcre2-config.1.gz b/usr/share/man/man1/pcre2-config.1.gz index bd6a7fd31d0..822df9aded3 100644 Binary files a/usr/share/man/man1/pcre2-config.1.gz and b/usr/share/man/man1/pcre2-config.1.gz differ diff --git a/usr/share/man/man1/pcre2grep.1.gz b/usr/share/man/man1/pcre2grep.1.gz index 4986ba33e42..30bcc1cf285 100644 Binary files a/usr/share/man/man1/pcre2grep.1.gz and b/usr/share/man/man1/pcre2grep.1.gz differ diff --git a/usr/share/man/man1/pcre2test.1.gz b/usr/share/man/man1/pcre2test.1.gz index 6f7f0a4ded2..2ad05f1e7ca 100644 Binary files a/usr/share/man/man1/pcre2test.1.gz and b/usr/share/man/man1/pcre2test.1.gz differ diff --git a/usr/share/man/man3/pcre2.3.gz b/usr/share/man/man3/pcre2.3.gz index 8f8de13a685..0557798fa74 100644 Binary files a/usr/share/man/man3/pcre2.3.gz and b/usr/share/man/man3/pcre2.3.gz differ diff --git a/usr/share/man/man3/pcre2_callout_enumerate.3.gz b/usr/share/man/man3/pcre2_callout_enumerate.3.gz index 132edbb1a5e..a808e956bd6 100644 Binary files a/usr/share/man/man3/pcre2_callout_enumerate.3.gz and b/usr/share/man/man3/pcre2_callout_enumerate.3.gz differ diff --git a/usr/share/man/man3/pcre2_code_copy.3.gz b/usr/share/man/man3/pcre2_code_copy.3.gz index 0c748480430..cc75c1145df 100644 Binary files a/usr/share/man/man3/pcre2_code_copy.3.gz and b/usr/share/man/man3/pcre2_code_copy.3.gz differ diff --git a/usr/share/man/man3/pcre2_code_copy_with_tables.3.gz b/usr/share/man/man3/pcre2_code_copy_with_tables.3.gz index 5ece33f7768..d308117ce38 100644 Binary files a/usr/share/man/man3/pcre2_code_copy_with_tables.3.gz and b/usr/share/man/man3/pcre2_code_copy_with_tables.3.gz differ diff --git a/usr/share/man/man3/pcre2_code_free.3.gz b/usr/share/man/man3/pcre2_code_free.3.gz index 2fc6eccaa38..f43574a3931 100644 Binary files 
a/usr/share/man/man3/pcre2_code_free.3.gz and b/usr/share/man/man3/pcre2_code_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_compile.3.gz b/usr/share/man/man3/pcre2_compile.3.gz index f7d39bd6671..038e31a7280 100644 Binary files a/usr/share/man/man3/pcre2_compile.3.gz and b/usr/share/man/man3/pcre2_compile.3.gz differ diff --git a/usr/share/man/man3/pcre2_compile_context_copy.3.gz b/usr/share/man/man3/pcre2_compile_context_copy.3.gz index bd414744c62..e601ecd8051 100644 Binary files a/usr/share/man/man3/pcre2_compile_context_copy.3.gz and b/usr/share/man/man3/pcre2_compile_context_copy.3.gz differ diff --git a/usr/share/man/man3/pcre2_compile_context_create.3.gz b/usr/share/man/man3/pcre2_compile_context_create.3.gz index 670392ebb3b..736809aa152 100644 Binary files a/usr/share/man/man3/pcre2_compile_context_create.3.gz and b/usr/share/man/man3/pcre2_compile_context_create.3.gz differ diff --git a/usr/share/man/man3/pcre2_compile_context_free.3.gz b/usr/share/man/man3/pcre2_compile_context_free.3.gz index 3aa875dd01c..6af45001ad6 100644 Binary files a/usr/share/man/man3/pcre2_compile_context_free.3.gz and b/usr/share/man/man3/pcre2_compile_context_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_config.3.gz b/usr/share/man/man3/pcre2_config.3.gz index 6858fdeb764..0dc22637f28 100644 Binary files a/usr/share/man/man3/pcre2_config.3.gz and b/usr/share/man/man3/pcre2_config.3.gz differ diff --git a/usr/share/man/man3/pcre2_convert_context_copy.3.gz b/usr/share/man/man3/pcre2_convert_context_copy.3.gz index e8c877a0388..4dcbd5f2637 100644 Binary files a/usr/share/man/man3/pcre2_convert_context_copy.3.gz and b/usr/share/man/man3/pcre2_convert_context_copy.3.gz differ diff --git a/usr/share/man/man3/pcre2_convert_context_create.3.gz b/usr/share/man/man3/pcre2_convert_context_create.3.gz index 47212499bb1..872ffff8130 100644 Binary files a/usr/share/man/man3/pcre2_convert_context_create.3.gz and b/usr/share/man/man3/pcre2_convert_context_create.3.gz differ 
diff --git a/usr/share/man/man3/pcre2_convert_context_free.3.gz b/usr/share/man/man3/pcre2_convert_context_free.3.gz index a8d1fa23847..344effb030f 100644 Binary files a/usr/share/man/man3/pcre2_convert_context_free.3.gz and b/usr/share/man/man3/pcre2_convert_context_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_converted_pattern_free.3.gz b/usr/share/man/man3/pcre2_converted_pattern_free.3.gz index b2a1dfc7175..37f2c7cda61 100644 Binary files a/usr/share/man/man3/pcre2_converted_pattern_free.3.gz and b/usr/share/man/man3/pcre2_converted_pattern_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_dfa_match.3.gz b/usr/share/man/man3/pcre2_dfa_match.3.gz index 7a1cc9b4bfa..6beb8b69df4 100644 Binary files a/usr/share/man/man3/pcre2_dfa_match.3.gz and b/usr/share/man/man3/pcre2_dfa_match.3.gz differ diff --git a/usr/share/man/man3/pcre2_general_context_copy.3.gz b/usr/share/man/man3/pcre2_general_context_copy.3.gz index c1854da2b24..9f4653d1e1a 100644 Binary files a/usr/share/man/man3/pcre2_general_context_copy.3.gz and b/usr/share/man/man3/pcre2_general_context_copy.3.gz differ diff --git a/usr/share/man/man3/pcre2_general_context_create.3.gz b/usr/share/man/man3/pcre2_general_context_create.3.gz index 559c4ad603f..05d60da320e 100644 Binary files a/usr/share/man/man3/pcre2_general_context_create.3.gz and b/usr/share/man/man3/pcre2_general_context_create.3.gz differ diff --git a/usr/share/man/man3/pcre2_general_context_free.3.gz b/usr/share/man/man3/pcre2_general_context_free.3.gz index 42b9a03c19f..f1133f8a242 100644 Binary files a/usr/share/man/man3/pcre2_general_context_free.3.gz and b/usr/share/man/man3/pcre2_general_context_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_get_error_message.3.gz b/usr/share/man/man3/pcre2_get_error_message.3.gz index 31a4fe00d0c..5c1dd580e8b 100644 Binary files a/usr/share/man/man3/pcre2_get_error_message.3.gz and b/usr/share/man/man3/pcre2_get_error_message.3.gz differ diff --git 
a/usr/share/man/man3/pcre2_get_mark.3.gz b/usr/share/man/man3/pcre2_get_mark.3.gz index d189a862d6e..0422987109c 100644 Binary files a/usr/share/man/man3/pcre2_get_mark.3.gz and b/usr/share/man/man3/pcre2_get_mark.3.gz differ diff --git a/usr/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz b/usr/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz index 41e2b42f079..ed3807ee91a 100644 Binary files a/usr/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz and b/usr/share/man/man3/pcre2_get_match_data_heapframes_size.3.gz differ diff --git a/usr/share/man/man3/pcre2_get_match_data_size.3.gz b/usr/share/man/man3/pcre2_get_match_data_size.3.gz index 1493a8239d7..b8b90d8b992 100644 Binary files a/usr/share/man/man3/pcre2_get_match_data_size.3.gz and b/usr/share/man/man3/pcre2_get_match_data_size.3.gz differ diff --git a/usr/share/man/man3/pcre2_get_ovector_count.3.gz b/usr/share/man/man3/pcre2_get_ovector_count.3.gz index 3e13f2457bb..cbd0b74f6d1 100644 Binary files a/usr/share/man/man3/pcre2_get_ovector_count.3.gz and b/usr/share/man/man3/pcre2_get_ovector_count.3.gz differ diff --git a/usr/share/man/man3/pcre2_get_ovector_pointer.3.gz b/usr/share/man/man3/pcre2_get_ovector_pointer.3.gz index 46c73d389e6..b638a40fb4f 100644 Binary files a/usr/share/man/man3/pcre2_get_ovector_pointer.3.gz and b/usr/share/man/man3/pcre2_get_ovector_pointer.3.gz differ diff --git a/usr/share/man/man3/pcre2_get_startchar.3.gz b/usr/share/man/man3/pcre2_get_startchar.3.gz index 8770d3f18e2..bbd6dc3d8b0 100644 Binary files a/usr/share/man/man3/pcre2_get_startchar.3.gz and b/usr/share/man/man3/pcre2_get_startchar.3.gz differ diff --git a/usr/share/man/man3/pcre2_jit_compile.3.gz b/usr/share/man/man3/pcre2_jit_compile.3.gz index 37a905af1c9..4b334032e0f 100644 Binary files a/usr/share/man/man3/pcre2_jit_compile.3.gz and b/usr/share/man/man3/pcre2_jit_compile.3.gz differ diff --git a/usr/share/man/man3/pcre2_jit_free_unused_memory.3.gz 
b/usr/share/man/man3/pcre2_jit_free_unused_memory.3.gz index b854e9f15f1..636b2f299d2 100644 Binary files a/usr/share/man/man3/pcre2_jit_free_unused_memory.3.gz and b/usr/share/man/man3/pcre2_jit_free_unused_memory.3.gz differ diff --git a/usr/share/man/man3/pcre2_jit_match.3.gz b/usr/share/man/man3/pcre2_jit_match.3.gz index 1f60dbd220c..aea0f320a12 100644 Binary files a/usr/share/man/man3/pcre2_jit_match.3.gz and b/usr/share/man/man3/pcre2_jit_match.3.gz differ diff --git a/usr/share/man/man3/pcre2_jit_stack_assign.3.gz b/usr/share/man/man3/pcre2_jit_stack_assign.3.gz index cd29425f7db..3d21b33376d 100644 Binary files a/usr/share/man/man3/pcre2_jit_stack_assign.3.gz and b/usr/share/man/man3/pcre2_jit_stack_assign.3.gz differ diff --git a/usr/share/man/man3/pcre2_jit_stack_create.3.gz b/usr/share/man/man3/pcre2_jit_stack_create.3.gz index ca3326ae5c3..f5135aa0038 100644 Binary files a/usr/share/man/man3/pcre2_jit_stack_create.3.gz and b/usr/share/man/man3/pcre2_jit_stack_create.3.gz differ diff --git a/usr/share/man/man3/pcre2_jit_stack_free.3.gz b/usr/share/man/man3/pcre2_jit_stack_free.3.gz index 143e3c8005b..6e880ae0669 100644 Binary files a/usr/share/man/man3/pcre2_jit_stack_free.3.gz and b/usr/share/man/man3/pcre2_jit_stack_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_maketables.3.gz b/usr/share/man/man3/pcre2_maketables.3.gz index 3df5fa94517..964e90286ea 100644 Binary files a/usr/share/man/man3/pcre2_maketables.3.gz and b/usr/share/man/man3/pcre2_maketables.3.gz differ diff --git a/usr/share/man/man3/pcre2_maketables_free.3.gz b/usr/share/man/man3/pcre2_maketables_free.3.gz index 7e4c91a7b5e..6c0ff04dfb9 100644 Binary files a/usr/share/man/man3/pcre2_maketables_free.3.gz and b/usr/share/man/man3/pcre2_maketables_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_match.3.gz b/usr/share/man/man3/pcre2_match.3.gz index 737e8dd5ad0..de2d9e66605 100644 Binary files a/usr/share/man/man3/pcre2_match.3.gz and b/usr/share/man/man3/pcre2_match.3.gz 
differ diff --git a/usr/share/man/man3/pcre2_match_context_copy.3.gz b/usr/share/man/man3/pcre2_match_context_copy.3.gz index 82317151de4..d971b5f740e 100644 Binary files a/usr/share/man/man3/pcre2_match_context_copy.3.gz and b/usr/share/man/man3/pcre2_match_context_copy.3.gz differ diff --git a/usr/share/man/man3/pcre2_match_context_create.3.gz b/usr/share/man/man3/pcre2_match_context_create.3.gz index 1607514981c..134d340d0f5 100644 Binary files a/usr/share/man/man3/pcre2_match_context_create.3.gz and b/usr/share/man/man3/pcre2_match_context_create.3.gz differ diff --git a/usr/share/man/man3/pcre2_match_context_free.3.gz b/usr/share/man/man3/pcre2_match_context_free.3.gz index d54a68d4b39..0a7f3016c50 100644 Binary files a/usr/share/man/man3/pcre2_match_context_free.3.gz and b/usr/share/man/man3/pcre2_match_context_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_match_data_create.3.gz b/usr/share/man/man3/pcre2_match_data_create.3.gz index 79faeb6e407..dd522dda1a5 100644 Binary files a/usr/share/man/man3/pcre2_match_data_create.3.gz and b/usr/share/man/man3/pcre2_match_data_create.3.gz differ diff --git a/usr/share/man/man3/pcre2_match_data_create_from_pattern.3.gz b/usr/share/man/man3/pcre2_match_data_create_from_pattern.3.gz index 0649219da29..99b6388d286 100644 Binary files a/usr/share/man/man3/pcre2_match_data_create_from_pattern.3.gz and b/usr/share/man/man3/pcre2_match_data_create_from_pattern.3.gz differ diff --git a/usr/share/man/man3/pcre2_match_data_free.3.gz b/usr/share/man/man3/pcre2_match_data_free.3.gz index 0d5769b50d3..d16b50815c1 100644 Binary files a/usr/share/man/man3/pcre2_match_data_free.3.gz and b/usr/share/man/man3/pcre2_match_data_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_pattern_convert.3.gz b/usr/share/man/man3/pcre2_pattern_convert.3.gz index 51cec7d92eb..fa281abaa5a 100644 Binary files a/usr/share/man/man3/pcre2_pattern_convert.3.gz and b/usr/share/man/man3/pcre2_pattern_convert.3.gz differ diff --git 
a/usr/share/man/man3/pcre2_pattern_info.3.gz b/usr/share/man/man3/pcre2_pattern_info.3.gz index 591aadd37d8..77adf99495b 100644 Binary files a/usr/share/man/man3/pcre2_pattern_info.3.gz and b/usr/share/man/man3/pcre2_pattern_info.3.gz differ diff --git a/usr/share/man/man3/pcre2_serialize_decode.3.gz b/usr/share/man/man3/pcre2_serialize_decode.3.gz index ee99586dc5b..94dc8db34ac 100644 Binary files a/usr/share/man/man3/pcre2_serialize_decode.3.gz and b/usr/share/man/man3/pcre2_serialize_decode.3.gz differ diff --git a/usr/share/man/man3/pcre2_serialize_encode.3.gz b/usr/share/man/man3/pcre2_serialize_encode.3.gz index 0aebb74aeba..ef83c5158ba 100644 Binary files a/usr/share/man/man3/pcre2_serialize_encode.3.gz and b/usr/share/man/man3/pcre2_serialize_encode.3.gz differ diff --git a/usr/share/man/man3/pcre2_serialize_free.3.gz b/usr/share/man/man3/pcre2_serialize_free.3.gz index 35e72657a5c..1b893458bf1 100644 Binary files a/usr/share/man/man3/pcre2_serialize_free.3.gz and b/usr/share/man/man3/pcre2_serialize_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz b/usr/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz index 2f5e616a614..d0f6e346a7f 100644 Binary files a/usr/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz and b/usr/share/man/man3/pcre2_serialize_get_number_of_codes.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_bsr.3.gz b/usr/share/man/man3/pcre2_set_bsr.3.gz index edc42c33a2e..f886a47c949 100644 Binary files a/usr/share/man/man3/pcre2_set_bsr.3.gz and b/usr/share/man/man3/pcre2_set_bsr.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_callout.3.gz b/usr/share/man/man3/pcre2_set_callout.3.gz index 10c85f78b1c..a28a8719d86 100644 Binary files a/usr/share/man/man3/pcre2_set_callout.3.gz and b/usr/share/man/man3/pcre2_set_callout.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_character_tables.3.gz b/usr/share/man/man3/pcre2_set_character_tables.3.gz index ef705eabf76..7b5c22c7b08 100644 
Binary files a/usr/share/man/man3/pcre2_set_character_tables.3.gz and b/usr/share/man/man3/pcre2_set_character_tables.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_compile_extra_options.3.gz b/usr/share/man/man3/pcre2_set_compile_extra_options.3.gz index f0b6532da11..c5b68d4fffe 100644 Binary files a/usr/share/man/man3/pcre2_set_compile_extra_options.3.gz and b/usr/share/man/man3/pcre2_set_compile_extra_options.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_compile_recursion_guard.3.gz b/usr/share/man/man3/pcre2_set_compile_recursion_guard.3.gz index be84b852dfa..0a0f31d308c 100644 Binary files a/usr/share/man/man3/pcre2_set_compile_recursion_guard.3.gz and b/usr/share/man/man3/pcre2_set_compile_recursion_guard.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_depth_limit.3.gz b/usr/share/man/man3/pcre2_set_depth_limit.3.gz index 23431f3bee0..63f86a671ce 100644 Binary files a/usr/share/man/man3/pcre2_set_depth_limit.3.gz and b/usr/share/man/man3/pcre2_set_depth_limit.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_glob_escape.3.gz b/usr/share/man/man3/pcre2_set_glob_escape.3.gz index 7f633e4d483..b9686a7d52c 100644 Binary files a/usr/share/man/man3/pcre2_set_glob_escape.3.gz and b/usr/share/man/man3/pcre2_set_glob_escape.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_glob_separator.3.gz b/usr/share/man/man3/pcre2_set_glob_separator.3.gz index 5b40b6d42c9..ef3c380e2f5 100644 Binary files a/usr/share/man/man3/pcre2_set_glob_separator.3.gz and b/usr/share/man/man3/pcre2_set_glob_separator.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_heap_limit.3.gz b/usr/share/man/man3/pcre2_set_heap_limit.3.gz index 059d9882713..b1f60913032 100644 Binary files a/usr/share/man/man3/pcre2_set_heap_limit.3.gz and b/usr/share/man/man3/pcre2_set_heap_limit.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_match_limit.3.gz b/usr/share/man/man3/pcre2_set_match_limit.3.gz index 2949258558d..9a4487ee5f4 100644 Binary files 
a/usr/share/man/man3/pcre2_set_match_limit.3.gz and b/usr/share/man/man3/pcre2_set_match_limit.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz b/usr/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz index 22b23a35e17..e6b0799660f 100644 Binary files a/usr/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz and b/usr/share/man/man3/pcre2_set_max_pattern_compiled_length.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_max_pattern_length.3.gz b/usr/share/man/man3/pcre2_set_max_pattern_length.3.gz index 2c7cc625081..73b8724f64d 100644 Binary files a/usr/share/man/man3/pcre2_set_max_pattern_length.3.gz and b/usr/share/man/man3/pcre2_set_max_pattern_length.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_max_varlookbehind.3.gz b/usr/share/man/man3/pcre2_set_max_varlookbehind.3.gz index b18f39edaf3..7da16360947 100644 Binary files a/usr/share/man/man3/pcre2_set_max_varlookbehind.3.gz and b/usr/share/man/man3/pcre2_set_max_varlookbehind.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_newline.3.gz b/usr/share/man/man3/pcre2_set_newline.3.gz index c7b6911659e..779465b9227 100644 Binary files a/usr/share/man/man3/pcre2_set_newline.3.gz and b/usr/share/man/man3/pcre2_set_newline.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_offset_limit.3.gz b/usr/share/man/man3/pcre2_set_offset_limit.3.gz index 84f80d2cf97..d3202861471 100644 Binary files a/usr/share/man/man3/pcre2_set_offset_limit.3.gz and b/usr/share/man/man3/pcre2_set_offset_limit.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_optimize.3.gz b/usr/share/man/man3/pcre2_set_optimize.3.gz new file mode 100644 index 00000000000..3edd2ad258d Binary files /dev/null and b/usr/share/man/man3/pcre2_set_optimize.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_parens_nest_limit.3.gz b/usr/share/man/man3/pcre2_set_parens_nest_limit.3.gz index 45222f17ed4..8135f86d958 100644 Binary files a/usr/share/man/man3/pcre2_set_parens_nest_limit.3.gz and 
b/usr/share/man/man3/pcre2_set_parens_nest_limit.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_recursion_limit.3.gz b/usr/share/man/man3/pcre2_set_recursion_limit.3.gz index 1b36c8a9930..30af4aed7bb 100644 Binary files a/usr/share/man/man3/pcre2_set_recursion_limit.3.gz and b/usr/share/man/man3/pcre2_set_recursion_limit.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_recursion_memory_management.3.gz b/usr/share/man/man3/pcre2_set_recursion_memory_management.3.gz index abf99ac387f..675b3c11476 100644 Binary files a/usr/share/man/man3/pcre2_set_recursion_memory_management.3.gz and b/usr/share/man/man3/pcre2_set_recursion_memory_management.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_substitute_callout.3.gz b/usr/share/man/man3/pcre2_set_substitute_callout.3.gz index 894bcb508a1..355e05fa15c 100644 Binary files a/usr/share/man/man3/pcre2_set_substitute_callout.3.gz and b/usr/share/man/man3/pcre2_set_substitute_callout.3.gz differ diff --git a/usr/share/man/man3/pcre2_set_substitute_case_callout.3.gz b/usr/share/man/man3/pcre2_set_substitute_case_callout.3.gz new file mode 100644 index 00000000000..337b0e12190 Binary files /dev/null and b/usr/share/man/man3/pcre2_set_substitute_case_callout.3.gz differ diff --git a/usr/share/man/man3/pcre2_substitute.3.gz b/usr/share/man/man3/pcre2_substitute.3.gz index db19673b270..22df41e3e5b 100644 Binary files a/usr/share/man/man3/pcre2_substitute.3.gz and b/usr/share/man/man3/pcre2_substitute.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_copy_byname.3.gz b/usr/share/man/man3/pcre2_substring_copy_byname.3.gz index 610aa32f5c2..2ceb33380ab 100644 Binary files a/usr/share/man/man3/pcre2_substring_copy_byname.3.gz and b/usr/share/man/man3/pcre2_substring_copy_byname.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_copy_bynumber.3.gz b/usr/share/man/man3/pcre2_substring_copy_bynumber.3.gz index ba365db4ebd..692696a4e5b 100644 Binary files 
a/usr/share/man/man3/pcre2_substring_copy_bynumber.3.gz and b/usr/share/man/man3/pcre2_substring_copy_bynumber.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_free.3.gz b/usr/share/man/man3/pcre2_substring_free.3.gz index 142293639aa..f3ce3fbe38f 100644 Binary files a/usr/share/man/man3/pcre2_substring_free.3.gz and b/usr/share/man/man3/pcre2_substring_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_get_byname.3.gz b/usr/share/man/man3/pcre2_substring_get_byname.3.gz index f07e76be580..2b84e593061 100644 Binary files a/usr/share/man/man3/pcre2_substring_get_byname.3.gz and b/usr/share/man/man3/pcre2_substring_get_byname.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_get_bynumber.3.gz b/usr/share/man/man3/pcre2_substring_get_bynumber.3.gz index d49ab61f7e0..48da30b75af 100644 Binary files a/usr/share/man/man3/pcre2_substring_get_bynumber.3.gz and b/usr/share/man/man3/pcre2_substring_get_bynumber.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_length_byname.3.gz b/usr/share/man/man3/pcre2_substring_length_byname.3.gz index e44474c9fb1..5d0beaff8af 100644 Binary files a/usr/share/man/man3/pcre2_substring_length_byname.3.gz and b/usr/share/man/man3/pcre2_substring_length_byname.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_length_bynumber.3.gz b/usr/share/man/man3/pcre2_substring_length_bynumber.3.gz index aea0ec85261..cb79fb1dc33 100644 Binary files a/usr/share/man/man3/pcre2_substring_length_bynumber.3.gz and b/usr/share/man/man3/pcre2_substring_length_bynumber.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_list_free.3.gz b/usr/share/man/man3/pcre2_substring_list_free.3.gz index 34f3227dfcb..3d84e95e4d5 100644 Binary files a/usr/share/man/man3/pcre2_substring_list_free.3.gz and b/usr/share/man/man3/pcre2_substring_list_free.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_list_get.3.gz b/usr/share/man/man3/pcre2_substring_list_get.3.gz index afa4dd67106..f14794a0266 100644 
Binary files a/usr/share/man/man3/pcre2_substring_list_get.3.gz and b/usr/share/man/man3/pcre2_substring_list_get.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_nametable_scan.3.gz b/usr/share/man/man3/pcre2_substring_nametable_scan.3.gz index 31bc29d7964..ac2e7891cb8 100644 Binary files a/usr/share/man/man3/pcre2_substring_nametable_scan.3.gz and b/usr/share/man/man3/pcre2_substring_nametable_scan.3.gz differ diff --git a/usr/share/man/man3/pcre2_substring_number_from_name.3.gz b/usr/share/man/man3/pcre2_substring_number_from_name.3.gz index ed0ce752243..77987e8f067 100644 Binary files a/usr/share/man/man3/pcre2_substring_number_from_name.3.gz and b/usr/share/man/man3/pcre2_substring_number_from_name.3.gz differ diff --git a/usr/share/man/man3/pcre2api.3.gz b/usr/share/man/man3/pcre2api.3.gz index 6cebacbacd8..a5b2415b524 100644 Binary files a/usr/share/man/man3/pcre2api.3.gz and b/usr/share/man/man3/pcre2api.3.gz differ diff --git a/usr/share/man/man3/pcre2build.3.gz b/usr/share/man/man3/pcre2build.3.gz index 8b06eaee96c..c6ffcd91b64 100644 Binary files a/usr/share/man/man3/pcre2build.3.gz and b/usr/share/man/man3/pcre2build.3.gz differ diff --git a/usr/share/man/man3/pcre2callout.3.gz b/usr/share/man/man3/pcre2callout.3.gz index a8f9c712c8a..0063d728758 100644 Binary files a/usr/share/man/man3/pcre2callout.3.gz and b/usr/share/man/man3/pcre2callout.3.gz differ diff --git a/usr/share/man/man3/pcre2compat.3.gz b/usr/share/man/man3/pcre2compat.3.gz index dc51b2f2eeb..f6fed552b6c 100644 Binary files a/usr/share/man/man3/pcre2compat.3.gz and b/usr/share/man/man3/pcre2compat.3.gz differ diff --git a/usr/share/man/man3/pcre2convert.3.gz b/usr/share/man/man3/pcre2convert.3.gz index 346ffa364a4..f654786e3ba 100644 Binary files a/usr/share/man/man3/pcre2convert.3.gz and b/usr/share/man/man3/pcre2convert.3.gz differ diff --git a/usr/share/man/man3/pcre2demo.3.gz b/usr/share/man/man3/pcre2demo.3.gz index de857078189..109e0ef15c0 100644 Binary files 
a/usr/share/man/man3/pcre2demo.3.gz and b/usr/share/man/man3/pcre2demo.3.gz differ diff --git a/usr/share/man/man3/pcre2jit.3.gz b/usr/share/man/man3/pcre2jit.3.gz index 3a5cecc7368..5774b3e549e 100644 Binary files a/usr/share/man/man3/pcre2jit.3.gz and b/usr/share/man/man3/pcre2jit.3.gz differ diff --git a/usr/share/man/man3/pcre2limits.3.gz b/usr/share/man/man3/pcre2limits.3.gz index dd7f0bde3fe..7fe735473f7 100644 Binary files a/usr/share/man/man3/pcre2limits.3.gz and b/usr/share/man/man3/pcre2limits.3.gz differ diff --git a/usr/share/man/man3/pcre2matching.3.gz b/usr/share/man/man3/pcre2matching.3.gz index e14317961ce..40e98bed373 100644 Binary files a/usr/share/man/man3/pcre2matching.3.gz and b/usr/share/man/man3/pcre2matching.3.gz differ diff --git a/usr/share/man/man3/pcre2partial.3.gz b/usr/share/man/man3/pcre2partial.3.gz index 14768729058..cbb76c1ce1b 100644 Binary files a/usr/share/man/man3/pcre2partial.3.gz and b/usr/share/man/man3/pcre2partial.3.gz differ diff --git a/usr/share/man/man3/pcre2pattern.3.gz b/usr/share/man/man3/pcre2pattern.3.gz index 46ad89b46b1..e24291911eb 100644 Binary files a/usr/share/man/man3/pcre2pattern.3.gz and b/usr/share/man/man3/pcre2pattern.3.gz differ diff --git a/usr/share/man/man3/pcre2perform.3.gz b/usr/share/man/man3/pcre2perform.3.gz index 745c1a602a1..9e11a4bf132 100644 Binary files a/usr/share/man/man3/pcre2perform.3.gz and b/usr/share/man/man3/pcre2perform.3.gz differ diff --git a/usr/share/man/man3/pcre2posix.3.gz b/usr/share/man/man3/pcre2posix.3.gz index c108ca4e276..41e335f1370 100644 Binary files a/usr/share/man/man3/pcre2posix.3.gz and b/usr/share/man/man3/pcre2posix.3.gz differ diff --git a/usr/share/man/man3/pcre2sample.3.gz b/usr/share/man/man3/pcre2sample.3.gz index 87b280f9a50..669f77cd328 100644 Binary files a/usr/share/man/man3/pcre2sample.3.gz and b/usr/share/man/man3/pcre2sample.3.gz differ diff --git a/usr/share/man/man3/pcre2serialize.3.gz b/usr/share/man/man3/pcre2serialize.3.gz index 
709a1656f41..3069e1b3bc9 100644 Binary files a/usr/share/man/man3/pcre2serialize.3.gz and b/usr/share/man/man3/pcre2serialize.3.gz differ diff --git a/usr/share/man/man3/pcre2syntax.3.gz b/usr/share/man/man3/pcre2syntax.3.gz index 6c1976014f3..c9d5d878ed0 100644 Binary files a/usr/share/man/man3/pcre2syntax.3.gz and b/usr/share/man/man3/pcre2syntax.3.gz differ diff --git a/usr/share/man/man3/pcre2unicode.3.gz b/usr/share/man/man3/pcre2unicode.3.gz index 8d153f0dc78..ade2ccaa32d 100644 Binary files a/usr/share/man/man3/pcre2unicode.3.gz and b/usr/share/man/man3/pcre2unicode.3.gz differ diff --git a/var/lib/pacman/local/libpcre2_16-10.44-1/mtree b/var/lib/pacman/local/libpcre2_16-10.44-1/mtree deleted file mode 100644 index bd8d6056484..00000000000 Binary files a/var/lib/pacman/local/libpcre2_16-10.44-1/mtree and /dev/null differ diff --git a/var/lib/pacman/local/libpcre2_16-10.44-1/desc b/var/lib/pacman/local/libpcre2_16-10.45-1/desc similarity index 90% rename from var/lib/pacman/local/libpcre2_16-10.44-1/desc rename to var/lib/pacman/local/libpcre2_16-10.45-1/desc index 5821f6996e6..ebc0c14ed8a 100644 --- a/var/lib/pacman/local/libpcre2_16-10.44-1/desc +++ b/var/lib/pacman/local/libpcre2_16-10.45-1/desc @@ -2,7 +2,7 @@ libpcre2_16 %VERSION% -10.44-1 +10.45-1 %BASE% pcre2 @@ -17,16 +17,16 @@ https://www.pcre.org/ x86_64 %BUILDDATE% -1717799151 +1740130102 %INSTALLDATE% -1717815585 +1740193825 %PACKAGER% Johannes Schindelin %SIZE% -586385 +629905 %REASON% 1 diff --git a/var/lib/pacman/local/libpcre2_16-10.44-1/files b/var/lib/pacman/local/libpcre2_16-10.45-1/files similarity index 100% rename from var/lib/pacman/local/libpcre2_16-10.44-1/files rename to var/lib/pacman/local/libpcre2_16-10.45-1/files diff --git a/var/lib/pacman/local/libpcre2_16-10.45-1/mtree b/var/lib/pacman/local/libpcre2_16-10.45-1/mtree new file mode 100644 index 00000000000..f4bdb61a092 Binary files /dev/null and b/var/lib/pacman/local/libpcre2_16-10.45-1/mtree differ diff --git 
a/var/lib/pacman/local/libpcre2_32-10.44-1/mtree b/var/lib/pacman/local/libpcre2_32-10.44-1/mtree deleted file mode 100644 index 1237aa7fb9a..00000000000 Binary files a/var/lib/pacman/local/libpcre2_32-10.44-1/mtree and /dev/null differ diff --git a/var/lib/pacman/local/libpcre2_32-10.44-1/desc b/var/lib/pacman/local/libpcre2_32-10.45-1/desc similarity index 90% rename from var/lib/pacman/local/libpcre2_32-10.44-1/desc rename to var/lib/pacman/local/libpcre2_32-10.45-1/desc index 044126c0a30..9c39e521c92 100644 --- a/var/lib/pacman/local/libpcre2_32-10.44-1/desc +++ b/var/lib/pacman/local/libpcre2_32-10.45-1/desc @@ -2,7 +2,7 @@ libpcre2_32 %VERSION% -10.44-1 +10.45-1 %BASE% pcre2 @@ -17,16 +17,16 @@ https://www.pcre.org/ x86_64 %BUILDDATE% -1717799151 +1740130102 %INSTALLDATE% -1717815585 +1740193825 %PACKAGER% Johannes Schindelin %SIZE% -555665 +599185 %REASON% 1 diff --git a/var/lib/pacman/local/libpcre2_32-10.44-1/files b/var/lib/pacman/local/libpcre2_32-10.45-1/files similarity index 100% rename from var/lib/pacman/local/libpcre2_32-10.44-1/files rename to var/lib/pacman/local/libpcre2_32-10.45-1/files diff --git a/var/lib/pacman/local/libpcre2_32-10.45-1/mtree b/var/lib/pacman/local/libpcre2_32-10.45-1/mtree new file mode 100644 index 00000000000..38014739c03 Binary files /dev/null and b/var/lib/pacman/local/libpcre2_32-10.45-1/mtree differ diff --git a/var/lib/pacman/local/libpcre2_8-10.44-1/mtree b/var/lib/pacman/local/libpcre2_8-10.44-1/mtree deleted file mode 100644 index bb90463c71c..00000000000 Binary files a/var/lib/pacman/local/libpcre2_8-10.44-1/mtree and /dev/null differ diff --git a/var/lib/pacman/local/libpcre2_8-10.44-1/desc b/var/lib/pacman/local/libpcre2_8-10.45-1/desc similarity index 90% rename from var/lib/pacman/local/libpcre2_8-10.44-1/desc rename to var/lib/pacman/local/libpcre2_8-10.45-1/desc index aae22b9332f..ca777ac88ab 100644 --- a/var/lib/pacman/local/libpcre2_8-10.44-1/desc +++ b/var/lib/pacman/local/libpcre2_8-10.45-1/desc @@ -2,7 
+2,7 @@ libpcre2_8 %VERSION% -10.44-1 +10.45-1 %BASE% pcre2 @@ -17,16 +17,16 @@ https://www.pcre.org/ x86_64 %BUILDDATE% -1717799151 +1740130102 %INSTALLDATE% -1717815585 +1740193825 %PACKAGER% Johannes Schindelin %SIZE% -642224 +686768 %REASON% 1 diff --git a/var/lib/pacman/local/libpcre2_8-10.44-1/files b/var/lib/pacman/local/libpcre2_8-10.45-1/files similarity index 100% rename from var/lib/pacman/local/libpcre2_8-10.44-1/files rename to var/lib/pacman/local/libpcre2_8-10.45-1/files diff --git a/var/lib/pacman/local/libpcre2_8-10.45-1/mtree b/var/lib/pacman/local/libpcre2_8-10.45-1/mtree new file mode 100644 index 00000000000..26442d89239 Binary files /dev/null and b/var/lib/pacman/local/libpcre2_8-10.45-1/mtree differ diff --git a/var/lib/pacman/local/libpcre2posix-10.44-1/mtree b/var/lib/pacman/local/libpcre2posix-10.44-1/mtree deleted file mode 100644 index db3e05365d1..00000000000 Binary files a/var/lib/pacman/local/libpcre2posix-10.44-1/mtree and /dev/null differ diff --git a/var/lib/pacman/local/libpcre2posix-10.44-1/desc b/var/lib/pacman/local/libpcre2posix-10.45-1/desc similarity index 87% rename from var/lib/pacman/local/libpcre2posix-10.44-1/desc rename to var/lib/pacman/local/libpcre2posix-10.45-1/desc index ec7f794ed5e..29a0f03f544 100644 --- a/var/lib/pacman/local/libpcre2posix-10.44-1/desc +++ b/var/lib/pacman/local/libpcre2posix-10.45-1/desc @@ -2,7 +2,7 @@ libpcre2posix %VERSION% -10.44-1 +10.45-1 %BASE% pcre2 @@ -17,16 +17,16 @@ https://www.pcre.org/ x86_64 %BUILDDATE% -1717799151 +1740130102 %INSTALLDATE% -1717815585 +1740193825 %PACKAGER% Johannes Schindelin %SIZE% -11394 +10882 %REASON% 1 @@ -42,7 +42,7 @@ sha256 pgp %DEPENDS% -libpcre2_8=10.44 +libpcre2_8=10.45 %XDATA% pkgtype=split diff --git a/var/lib/pacman/local/libpcre2posix-10.44-1/files b/var/lib/pacman/local/libpcre2posix-10.45-1/files similarity index 100% rename from var/lib/pacman/local/libpcre2posix-10.44-1/files rename to var/lib/pacman/local/libpcre2posix-10.45-1/files diff 
--git a/var/lib/pacman/local/libpcre2posix-10.45-1/mtree b/var/lib/pacman/local/libpcre2posix-10.45-1/mtree new file mode 100644 index 00000000000..f034ca50e20 Binary files /dev/null and b/var/lib/pacman/local/libpcre2posix-10.45-1/mtree differ diff --git a/var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/mtree b/var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/mtree deleted file mode 100644 index 843c3b158c3..00000000000 Binary files a/var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/mtree and /dev/null differ diff --git a/var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/desc b/var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/desc similarity index 92% rename from var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/desc rename to var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/desc index 7eec062886d..9e1aa4f1868 100644 --- a/var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/desc +++ b/var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/desc @@ -2,7 +2,7 @@ mingw-w64-i686-pcre2 %VERSION% -10.44-1 +10.45-1 %BASE% mingw-w64-pcre2 @@ -17,16 +17,16 @@ https://pcre.org/ any %BUILDDATE% -1717797774 +1740130174 %INSTALLDATE% -1717815585 +1740193825 %PACKAGER% Johannes Schindelin %SIZE% -6800793 +7248956 %LICENSE% BSD diff --git a/var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/files b/var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/files similarity index 97% rename from var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/files rename to var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/files index a4398937cfb..72edfee3961 100644 --- a/var/lib/pacman/local/mingw-w64-i686-pcre2-10.44-1/files +++ b/var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/files @@ -28,7 +28,7 @@ mingw32/lib/pkgconfig/libpcre2-posix.pc mingw32/share/ mingw32/share/doc/ mingw32/share/doc/pcre2/ -mingw32/share/doc/pcre2/AUTHORS +mingw32/share/doc/pcre2/AUTHORS.md mingw32/share/doc/pcre2/ChangeLog mingw32/share/doc/pcre2/html/ mingw32/share/doc/pcre2/html/index.html @@ -95,10 +95,12 @@ 
mingw32/share/doc/pcre2/html/pcre2_set_max_pattern_length.html mingw32/share/doc/pcre2/html/pcre2_set_max_varlookbehind.html mingw32/share/doc/pcre2/html/pcre2_set_newline.html mingw32/share/doc/pcre2/html/pcre2_set_offset_limit.html +mingw32/share/doc/pcre2/html/pcre2_set_optimize.html mingw32/share/doc/pcre2/html/pcre2_set_parens_nest_limit.html mingw32/share/doc/pcre2/html/pcre2_set_recursion_limit.html mingw32/share/doc/pcre2/html/pcre2_set_recursion_memory_management.html mingw32/share/doc/pcre2/html/pcre2_set_substitute_callout.html +mingw32/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html mingw32/share/doc/pcre2/html/pcre2_substitute.html mingw32/share/doc/pcre2/html/pcre2_substring_copy_byname.html mingw32/share/doc/pcre2/html/pcre2_substring_copy_bynumber.html @@ -137,10 +139,11 @@ mingw32/share/doc/pcre2/pcre2.txt mingw32/share/doc/pcre2/pcre2grep.txt mingw32/share/doc/pcre2/pcre2test.txt mingw32/share/doc/pcre2/README +mingw32/share/doc/pcre2/SECURITY.md mingw32/share/licenses/ mingw32/share/licenses/pcre2/ mingw32/share/licenses/pcre2/COPYING -mingw32/share/licenses/pcre2/LICENCE +mingw32/share/licenses/pcre2/LICENCE.md mingw32/share/man/ mingw32/share/man/man1/ mingw32/share/man/man1/pcre2-config.1.gz @@ -208,10 +211,12 @@ mingw32/share/man/man3/pcre2_set_max_pattern_length.3.gz mingw32/share/man/man3/pcre2_set_max_varlookbehind.3.gz mingw32/share/man/man3/pcre2_set_newline.3.gz mingw32/share/man/man3/pcre2_set_offset_limit.3.gz +mingw32/share/man/man3/pcre2_set_optimize.3.gz mingw32/share/man/man3/pcre2_set_parens_nest_limit.3.gz mingw32/share/man/man3/pcre2_set_recursion_limit.3.gz mingw32/share/man/man3/pcre2_set_recursion_memory_management.3.gz mingw32/share/man/man3/pcre2_set_substitute_callout.3.gz +mingw32/share/man/man3/pcre2_set_substitute_case_callout.3.gz mingw32/share/man/man3/pcre2_substitute.3.gz mingw32/share/man/man3/pcre2_substring_copy_byname.3.gz mingw32/share/man/man3/pcre2_substring_copy_bynumber.3.gz diff --git 
a/var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/mtree b/var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/mtree new file mode 100644 index 00000000000..f08efc0ddc7 Binary files /dev/null and b/var/lib/pacman/local/mingw-w64-i686-pcre2-10.45-1/mtree differ diff --git a/var/lib/pacman/local/mingw-w64-x86_64-git-extra-1.1.649.af5925057-1/desc b/var/lib/pacman/local/mingw-w64-x86_64-git-extra-1.1.649.af5925057-1/desc index 480f767efc8..6e68353b916 100644 --- a/var/lib/pacman/local/mingw-w64-x86_64-git-extra-1.1.649.af5925057-1/desc +++ b/var/lib/pacman/local/mingw-w64-x86_64-git-extra-1.1.649.af5925057-1/desc @@ -20,7 +20,7 @@ any 1739811804 %INSTALLDATE% -1740107689 +1740193828 %PACKAGER% Johannes Schindelin diff --git a/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/mtree b/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/mtree deleted file mode 100644 index 7b9ec725711..00000000000 Binary files a/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/mtree and /dev/null differ diff --git a/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/desc b/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/desc similarity index 92% rename from var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/desc rename to var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/desc index 3e8144e6946..d406bf7a9a7 100644 --- a/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/desc +++ b/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/desc @@ -2,7 +2,7 @@ mingw-w64-x86_64-pcre2 %VERSION% -10.44-1 +10.45-1 %BASE% mingw-w64-pcre2 @@ -17,16 +17,16 @@ https://pcre.org/ any %BUILDDATE% -1717797894 +1740130318 %INSTALLDATE% -1717815586 +1740193826 %PACKAGER% Johannes Schindelin %SIZE% -6804571 +7251012 %LICENSE% BSD diff --git a/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/files b/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/files similarity index 97% rename from var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/files rename to 
var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/files index 73531e8f1dc..7e67e088217 100644 --- a/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.44-1/files +++ b/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/files @@ -28,7 +28,7 @@ mingw64/lib/pkgconfig/libpcre2-posix.pc mingw64/share/ mingw64/share/doc/ mingw64/share/doc/pcre2/ -mingw64/share/doc/pcre2/AUTHORS +mingw64/share/doc/pcre2/AUTHORS.md mingw64/share/doc/pcre2/ChangeLog mingw64/share/doc/pcre2/html/ mingw64/share/doc/pcre2/html/index.html @@ -95,10 +95,12 @@ mingw64/share/doc/pcre2/html/pcre2_set_max_pattern_length.html mingw64/share/doc/pcre2/html/pcre2_set_max_varlookbehind.html mingw64/share/doc/pcre2/html/pcre2_set_newline.html mingw64/share/doc/pcre2/html/pcre2_set_offset_limit.html +mingw64/share/doc/pcre2/html/pcre2_set_optimize.html mingw64/share/doc/pcre2/html/pcre2_set_parens_nest_limit.html mingw64/share/doc/pcre2/html/pcre2_set_recursion_limit.html mingw64/share/doc/pcre2/html/pcre2_set_recursion_memory_management.html mingw64/share/doc/pcre2/html/pcre2_set_substitute_callout.html +mingw64/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html mingw64/share/doc/pcre2/html/pcre2_substitute.html mingw64/share/doc/pcre2/html/pcre2_substring_copy_byname.html mingw64/share/doc/pcre2/html/pcre2_substring_copy_bynumber.html @@ -137,10 +139,11 @@ mingw64/share/doc/pcre2/pcre2.txt mingw64/share/doc/pcre2/pcre2grep.txt mingw64/share/doc/pcre2/pcre2test.txt mingw64/share/doc/pcre2/README +mingw64/share/doc/pcre2/SECURITY.md mingw64/share/licenses/ mingw64/share/licenses/pcre2/ mingw64/share/licenses/pcre2/COPYING -mingw64/share/licenses/pcre2/LICENCE +mingw64/share/licenses/pcre2/LICENCE.md mingw64/share/man/ mingw64/share/man/man1/ mingw64/share/man/man1/pcre2-config.1.gz @@ -208,10 +211,12 @@ mingw64/share/man/man3/pcre2_set_max_pattern_length.3.gz mingw64/share/man/man3/pcre2_set_max_varlookbehind.3.gz mingw64/share/man/man3/pcre2_set_newline.3.gz 
mingw64/share/man/man3/pcre2_set_offset_limit.3.gz +mingw64/share/man/man3/pcre2_set_optimize.3.gz mingw64/share/man/man3/pcre2_set_parens_nest_limit.3.gz mingw64/share/man/man3/pcre2_set_recursion_limit.3.gz mingw64/share/man/man3/pcre2_set_recursion_memory_management.3.gz mingw64/share/man/man3/pcre2_set_substitute_callout.3.gz +mingw64/share/man/man3/pcre2_set_substitute_case_callout.3.gz mingw64/share/man/man3/pcre2_substitute.3.gz mingw64/share/man/man3/pcre2_substring_copy_byname.3.gz mingw64/share/man/man3/pcre2_substring_copy_bynumber.3.gz diff --git a/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/mtree b/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/mtree new file mode 100644 index 00000000000..5459195625e Binary files /dev/null and b/var/lib/pacman/local/mingw-w64-x86_64-pcre2-10.45-1/mtree differ diff --git a/var/lib/pacman/local/pacman-mirrors-20241217-1/files b/var/lib/pacman/local/pacman-mirrors-20241217-1/files deleted file mode 100644 index 6965918a9d0..00000000000 --- a/var/lib/pacman/local/pacman-mirrors-20241217-1/files +++ /dev/null @@ -1,18 +0,0 @@ -%FILES% -etc/ -etc/pacman.d/ -etc/pacman.d/mirrorlist.clang64 -etc/pacman.d/mirrorlist.mingw -etc/pacman.d/mirrorlist.mingw32 -etc/pacman.d/mirrorlist.mingw64 -etc/pacman.d/mirrorlist.msys -etc/pacman.d/mirrorlist.ucrt64 - -%BACKUP% -etc/pacman.d/mirrorlist.msys b55f0e8d43d6d16f178959a06984ee34 -etc/pacman.d/mirrorlist.mingw 8762e78f817920c7bb1274e9ac16179a -etc/pacman.d/mirrorlist.mingw32 aeea08a85fd2469e91454ec9e6d7a883 -etc/pacman.d/mirrorlist.mingw64 4289c91b00f3d31f4eeb0d4cfbe7551e -etc/pacman.d/mirrorlist.ucrt64 3dfb700354bf6eec8cf250e7145f802a -etc/pacman.d/mirrorlist.clang64 47d4098b6ca1979477b2109e4b525630 - diff --git a/var/lib/pacman/local/pacman-mirrors-20241217-1/mtree b/var/lib/pacman/local/pacman-mirrors-20241217-1/mtree deleted file mode 100644 index 35731e6b345..00000000000 Binary files a/var/lib/pacman/local/pacman-mirrors-20241217-1/mtree and /dev/null differ diff 
--git a/var/lib/pacman/local/pacman-mirrors-20241217-1/desc b/var/lib/pacman/local/pacman-mirrors-20250220-1/desc similarity index 75% rename from var/lib/pacman/local/pacman-mirrors-20241217-1/desc rename to var/lib/pacman/local/pacman-mirrors-20250220-1/desc index cbca41eb256..940b8bdd096 100644 --- a/var/lib/pacman/local/pacman-mirrors-20241217-1/desc +++ b/var/lib/pacman/local/pacman-mirrors-20250220-1/desc @@ -2,7 +2,7 @@ pacman-mirrors %VERSION% -20241217-1 +20250220-1 %BASE% pacman-mirrors @@ -17,16 +17,16 @@ https://www.msys2.org/dev/mirrors/ any %BUILDDATE% -1734593157 +1740091787 %INSTALLDATE% -1735473382 +1740193820 %PACKAGER% -CI (msys2/msys2-autobuild/657fd895/12408103218) +CI (msys2/msys2-autobuild/a187346d/13446110201) %SIZE% -10642 +10648 %REASON% 1 diff --git a/var/lib/pacman/local/pacman-mirrors-20250220-1/files b/var/lib/pacman/local/pacman-mirrors-20250220-1/files new file mode 100644 index 00000000000..5e51b4e7ec9 --- /dev/null +++ b/var/lib/pacman/local/pacman-mirrors-20250220-1/files @@ -0,0 +1,18 @@ +%FILES% +etc/ +etc/pacman.d/ +etc/pacman.d/mirrorlist.clang64 +etc/pacman.d/mirrorlist.mingw +etc/pacman.d/mirrorlist.mingw32 +etc/pacman.d/mirrorlist.mingw64 +etc/pacman.d/mirrorlist.msys +etc/pacman.d/mirrorlist.ucrt64 + +%BACKUP% +etc/pacman.d/mirrorlist.msys 3c202c953ea6f5cf9bda92fbc427af0a +etc/pacman.d/mirrorlist.mingw e6e355ab62c0b54bc665e1fdd81aeeb6 +etc/pacman.d/mirrorlist.mingw32 a9986116a8074f862c2b393a285ddea3 +etc/pacman.d/mirrorlist.mingw64 5d04334e40ef0dbcc5f74abf14a0a613 +etc/pacman.d/mirrorlist.ucrt64 3b8165d32e6d5d5c6c7dcc15757df7d2 +etc/pacman.d/mirrorlist.clang64 f3dd5b27c2263d95ec02dffcd70a898e + diff --git a/var/lib/pacman/local/pacman-mirrors-20250220-1/mtree b/var/lib/pacman/local/pacman-mirrors-20250220-1/mtree new file mode 100644 index 00000000000..c0b04372205 Binary files /dev/null and b/var/lib/pacman/local/pacman-mirrors-20250220-1/mtree differ diff --git a/var/lib/pacman/local/pcre2-10.44-1/mtree 
b/var/lib/pacman/local/pcre2-10.44-1/mtree deleted file mode 100644 index ab6e4cbf1c7..00000000000 Binary files a/var/lib/pacman/local/pcre2-10.44-1/mtree and /dev/null differ diff --git a/var/lib/pacman/local/pcre2-10.44-1/desc b/var/lib/pacman/local/pcre2-10.45-1/desc similarity index 76% rename from var/lib/pacman/local/pcre2-10.44-1/desc rename to var/lib/pacman/local/pcre2-10.45-1/desc index c8cf5484394..214efe55ccd 100644 --- a/var/lib/pacman/local/pcre2-10.44-1/desc +++ b/var/lib/pacman/local/pcre2-10.45-1/desc @@ -2,7 +2,7 @@ pcre2 %VERSION% -10.44-1 +10.45-1 %BASE% pcre2 @@ -17,16 +17,16 @@ https://www.pcre.org/ x86_64 %BUILDDATE% -1717799151 +1740130102 %INSTALLDATE% -1717815586 +1740193826 %PACKAGER% Johannes Schindelin %SIZE% -2455634 +2588275 %REASON% 1 @@ -42,10 +42,10 @@ pgp libreadline libbz2 zlib -libpcre2_8=10.44 -libpcre2_16=10.44 -libpcre2_32=10.44 -libpcre2posix=10.44 +libpcre2_8=10.45 +libpcre2_16=10.45 +libpcre2_32=10.45 +libpcre2posix=10.45 %XDATA% pkgtype=split diff --git a/var/lib/pacman/local/pcre2-10.44-1/files b/var/lib/pacman/local/pcre2-10.45-1/files similarity index 96% rename from var/lib/pacman/local/pcre2-10.44-1/files rename to var/lib/pacman/local/pcre2-10.45-1/files index 35bf9089d4d..af949fb0369 100644 --- a/var/lib/pacman/local/pcre2-10.44-1/files +++ b/var/lib/pacman/local/pcre2-10.45-1/files @@ -6,7 +6,7 @@ usr/bin/pcre2test.exe usr/share/ usr/share/doc/ usr/share/doc/pcre2/ -usr/share/doc/pcre2/AUTHORS +usr/share/doc/pcre2/AUTHORS.md usr/share/doc/pcre2/ChangeLog usr/share/doc/pcre2/COPYING usr/share/doc/pcre2/html/ @@ -74,10 +74,12 @@ usr/share/doc/pcre2/html/pcre2_set_max_pattern_length.html usr/share/doc/pcre2/html/pcre2_set_max_varlookbehind.html usr/share/doc/pcre2/html/pcre2_set_newline.html usr/share/doc/pcre2/html/pcre2_set_offset_limit.html +usr/share/doc/pcre2/html/pcre2_set_optimize.html usr/share/doc/pcre2/html/pcre2_set_parens_nest_limit.html usr/share/doc/pcre2/html/pcre2_set_recursion_limit.html 
usr/share/doc/pcre2/html/pcre2_set_recursion_memory_management.html usr/share/doc/pcre2/html/pcre2_set_substitute_callout.html +usr/share/doc/pcre2/html/pcre2_set_substitute_case_callout.html usr/share/doc/pcre2/html/pcre2_substitute.html usr/share/doc/pcre2/html/pcre2_substring_copy_byname.html usr/share/doc/pcre2/html/pcre2_substring_copy_bynumber.html @@ -110,16 +112,17 @@ usr/share/doc/pcre2/html/pcre2syntax.html usr/share/doc/pcre2/html/pcre2test.html usr/share/doc/pcre2/html/pcre2unicode.html usr/share/doc/pcre2/html/README.txt -usr/share/doc/pcre2/LICENCE +usr/share/doc/pcre2/LICENCE.md usr/share/doc/pcre2/NEWS usr/share/doc/pcre2/pcre2-config.txt usr/share/doc/pcre2/pcre2.txt usr/share/doc/pcre2/pcre2grep.txt usr/share/doc/pcre2/pcre2test.txt usr/share/doc/pcre2/README +usr/share/doc/pcre2/SECURITY.md usr/share/licenses/ usr/share/licenses/pcre2/ -usr/share/licenses/pcre2/LICENSE +usr/share/licenses/pcre2/LICENSE.md usr/share/man/ usr/share/man/man1/ usr/share/man/man1/pcre2-config.1.gz @@ -187,10 +190,12 @@ usr/share/man/man3/pcre2_set_max_pattern_length.3.gz usr/share/man/man3/pcre2_set_max_varlookbehind.3.gz usr/share/man/man3/pcre2_set_newline.3.gz usr/share/man/man3/pcre2_set_offset_limit.3.gz +usr/share/man/man3/pcre2_set_optimize.3.gz usr/share/man/man3/pcre2_set_parens_nest_limit.3.gz usr/share/man/man3/pcre2_set_recursion_limit.3.gz usr/share/man/man3/pcre2_set_recursion_memory_management.3.gz usr/share/man/man3/pcre2_set_substitute_callout.3.gz +usr/share/man/man3/pcre2_set_substitute_case_callout.3.gz usr/share/man/man3/pcre2_substitute.3.gz usr/share/man/man3/pcre2_substring_copy_byname.3.gz usr/share/man/man3/pcre2_substring_copy_bynumber.3.gz diff --git a/var/lib/pacman/local/pcre2-10.45-1/mtree b/var/lib/pacman/local/pcre2-10.45-1/mtree new file mode 100644 index 00000000000..dea75ab94fa Binary files /dev/null and b/var/lib/pacman/local/pcre2-10.45-1/mtree differ diff --git a/var/lib/pacman/sync/git-for-windows-aarch64.db 
b/var/lib/pacman/sync/git-for-windows-aarch64.db index 742a5ef8cd0..e19282abcac 100644 Binary files a/var/lib/pacman/sync/git-for-windows-aarch64.db and b/var/lib/pacman/sync/git-for-windows-aarch64.db differ diff --git a/var/lib/pacman/sync/git-for-windows-aarch64.db.sig b/var/lib/pacman/sync/git-for-windows-aarch64.db.sig index 76dfa718cf2..2111393fc4b 100644 Binary files a/var/lib/pacman/sync/git-for-windows-aarch64.db.sig and b/var/lib/pacman/sync/git-for-windows-aarch64.db.sig differ diff --git a/var/lib/pacman/sync/git-for-windows-mingw32.db b/var/lib/pacman/sync/git-for-windows-mingw32.db index c34ab483fc7..d77975d62e7 100644 Binary files a/var/lib/pacman/sync/git-for-windows-mingw32.db and b/var/lib/pacman/sync/git-for-windows-mingw32.db differ diff --git a/var/lib/pacman/sync/git-for-windows-mingw32.db.sig b/var/lib/pacman/sync/git-for-windows-mingw32.db.sig index e03062196c0..438b7f5de1c 100644 Binary files a/var/lib/pacman/sync/git-for-windows-mingw32.db.sig and b/var/lib/pacman/sync/git-for-windows-mingw32.db.sig differ diff --git a/var/lib/pacman/sync/git-for-windows.db b/var/lib/pacman/sync/git-for-windows.db index e807a8fc88b..64cd2692a0f 100644 Binary files a/var/lib/pacman/sync/git-for-windows.db and b/var/lib/pacman/sync/git-for-windows.db differ diff --git a/var/lib/pacman/sync/git-for-windows.db.sig b/var/lib/pacman/sync/git-for-windows.db.sig index 283fd065c13..599af1dc278 100644 Binary files a/var/lib/pacman/sync/git-for-windows.db.sig and b/var/lib/pacman/sync/git-for-windows.db.sig differ diff --git a/var/lib/pacman/sync/msys.db b/var/lib/pacman/sync/msys.db index 522dc11eb52..f835d9219a0 100644 Binary files a/var/lib/pacman/sync/msys.db and b/var/lib/pacman/sync/msys.db differ diff --git a/var/lib/pacman/sync/msys.db.sig b/var/lib/pacman/sync/msys.db.sig index 194e006ad6c..83d61eb7cab 100644 Binary files a/var/lib/pacman/sync/msys.db.sig and b/var/lib/pacman/sync/msys.db.sig differ